傅里叶变换和离散余弦变换的几种快速算法与代码实现
完整代码详见Digital-Image-Process/fdt.cpp at main · FouforPast/Digital-Image-Process (github.com)
1. 傅里叶变换 1.1 朴素傅里叶变换算法
利用上述公式计算的DFT如下:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 void dftNaive (Mat& src, Mat& dst, int sx, int sy, bool shift) { Mat input = src.clone (); pad (input, sx, sy); int M = input.rows; int N = input.cols; if (src.type () != 0 ) { throw std::invalid_argument ("only CV_8U is accepted" ); } Mat res = Mat (M, N, CV_32FC2, Scalar::all (0 )); float * ptr_res = (float *)res.data; for (int u = 0 ; u < M; ++u) { for (int v = 0 ; v < N; ++v) { float * ptr_src = (float *)input.data; for (int x = 0 ; x < M; ++x) { for (int y = 0 ; y < N; ++y) { double tmp = ((double )u * x / (double )M + (double )v * y / (double )N) * 2 * M_PI; int factor = shift & (x + y) & 1 ? -1 : 1 ; res.at <Vec2f>(u, v)[0 ] += input.at <uchar>(x, y) * cos (-tmp) * factor; res.at <Vec2f>(u, v)[1 ] += input.at <uchar>(x, y) * sin (-tmp) * factor; } } } } res.copyTo (dst); }
朴素算法的算法复杂度是$O(M^2 N^2)$,复杂度较高,实用性较低。利用DFT的可分离性,可以将二维DFT的计算转化为一维DFT的计算。优化一维DFT就可以达到优化二维DFT的目的。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 void helpFFT (float * real, float * imag, int M, int N, bool inverse) { for (int i = 0 ; i < M; ++i) { dft1d (real + N * i, imag + N * i, N, inverse); } for (int j = 0 ; j < N; ++j) { complex<float >* col = new complex<float >[M]; float * real_ = new float [M]; float * imag_ = new float [M]; for (int i = 0 ; i < M; ++i) { real_[i] = real[i * M + j]; imag_[i] = imag[i * M + j]; } dft1d (real_, imag_, M, inverse); for (int i = 0 ; i < M; ++i) { real[i * M + j] = real_[i]; imag[i * M + j] = imag_[i]; } } }
其中朴素一维DFT如下:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 void dft1dNaive (float * real, float * imag, unsigned L, bool inverse) { if (L == 1 ) { return ; } int fact = inverse ? 1 : -1 ; vector<double > real_ (L, 0 ) , imag_ (L, 0 ) ; for (int i = 0 ; i < L; ++i) { double real0 = cos (2 * M_PI * i / L), imag0 = fact * sin (2 * M_PI * i / L); double real1 = 1 , imag1 = 0 ; for (int j = 0 ; j < L; ++j) { real_[i] += real[j] * real1 - imag[j] * imag1; imag_[i] += real[j] * imag1 + imag[j] * real1; real1 = real1 * real0 - imag1 * imag0; imag1 = real1 * imag0 + imag1 * real0; } } for (int i = 0 ; i < L; ++i) { real[i] = real_[i]; imag[i] = imag_[i]; } }
1.2 Cooley-Tukey算法 Cooley-Turkey算法的前提是信号长度N不是质数,即N可以分解$N=nm$。该算法采用分治的思想,将N个数据分成m组,每组n个(按照下标对m取模的值分组)。根据消去引理,设单位复根$wN^t=\exp(\frac{-j2πt}{N})$,则$w {kN}^{kt}=w_N^t$。所以一维DFT
对每组长度为n的数据进行DFT,得到的结果为$X{a,u}$(表示第a组数据DFT的结果中下标为u的数据),很容易证明$X {a,u}=X_{a,u+n}$,进而
因此,要计算$F(u+kn)$,取出每组得到的DFT结果中下标均为u的数据,将其乘以$w_{nm}^{au}$ ,然后再对这些数据做DFT即可。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 void fft1dCooleyTukey (float * real, float * imag, unsigned L, bool inverse) { if (L == 1 ) { return ; } int fact = inverse ? 1 : -1 ; int num; if (L % 2 == 0 ) { num = 2 ; } else if (L % 3 == 0 ) { num = 3 ; } else { num = 5 ; } unsigned newL = L / num; float ** groupReal = new float * [num]; float ** groupImag = new float * [num]; for (int i = 0 ; i < num; ++i) { groupReal[i] = new float [newL]; groupImag[i] = new float [newL]; } for (int i = 0 ; i < L; ++i) { groupReal[i % num][i / num] = real[i]; groupImag[i % num][i / num] = imag[i]; } for (int i = 0 ; i < num; ++i) { dft1d (groupReal[i], groupImag[i], newL, inverse); } float real0 = cos (2 * M_PI / L), imag0 = fact * sin (2 * M_PI / L); float real1 = 1 , imag1 = 0 ; for (int i = 0 ; i < newL; ++i) { if (num == 2 ) { float tmpReal = real1 * groupReal[1 ][i] - imag1 * groupImag[1 ][i]; float tmpImag = imag1 * groupReal[1 ][i] + real1 * groupImag[1 ][i]; real[i] = groupReal[0 ][i] + tmpReal; imag[i] = groupImag[0 ][i] + tmpImag; real[i + newL] = groupReal[0 ][i] - tmpReal; imag[i + newL] = groupImag[0 ][i] - tmpImag; } else { float * tempReal = new float [num]; float * tempImag = new float [num]; float real2 = 1 , imag2 = 0 ; for (int j = 0 ; j < num; ++j) { tempReal[j] = groupReal[j][i] * real2 - groupImag[j][i] * imag2; tempImag[j] = groupImag[j][i] * real2 + groupReal[j][i] * imag2; auto tmpReal2 = real2; real2 = real2 * real1 - imag2 * imag1; imag2 = tmpReal2 * imag1 + imag2 * real1; } dft1dNaive (tempReal, tempImag, num, inverse); for (int j = 0 ; j < num; ++j) { real[i + newL * j] = tempReal[j]; imag[i + newL * j] = tempImag[j]; } } auto tmpReal1 = real1; real1 = real1 * real0 - imag1 * imag0; imag1 = tmpReal1 * imag0 + imag1 * real0; } }
1.3 基2FFT算法 如果信号的长度N是2的幂次,此时采用Cooley–Tukey算法就是最常见的基2快速傅里叶算法,这种情况下按照下标的奇偶性将对信号分成两组。对于基2的DCT算法,既可以采用递归形式实现,也可以采用迭代形式实现,如果采用递归实现,调用过程中会占用大量系统堆栈,效率不高,因此一般是迭代形式实现。
由下面的递归树可知,只需要事先对原始数据排好序,然后对其进行归并即可。
观察N=8时的例子,排好序后的顺序,实际上是原来下标的二进制表示翻转得到的。
以$a_6$为例,6的二进制表示是110,将其翻转得到011,011就是3,也就是排好序后$a_6$所处的位置。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 size_t reverseBits (size_t val, int width) { size_t result = 0 ; for (int i = 0 ; i < width; i++, val >>= 1 ) result = (result << 1 ) | (val & 1U ); return result; } void fft1dIterRadix2 (float * real, float * imag, unsigned L, bool inverse) { if (L & (L - 1 )) { throw std::invalid_argument ("invalid length, make sure L is positive and a power of 2" ); } int n = log2n (L); int fact = inverse ? 1 : -1 ; for (int i = 0 ; i < L; ++i) { size_t y = reverseBits (i, n); if (y > i) { swap (real[i], real[y]); swap (imag[i], imag[y]); } } float * _real = new float [L >> 1 ]; float * _imag = new float [L >> 1 ]; for (int i = 0 ; i < (L >> 1 ); ++i) { _real[i] = cos (2 * M_PI * i / L); _imag[i] = fact * sin (2 * M_PI * i / L); } for (int i = 2 ; i <= L; i <<= 1 ) { for (int j = 0 ; j < L; j += i) { for (int k = 0 ; k < (i >> 1 ); ++k) { int idx1 = j + k; int idx2 = idx1 + (i >> 1 ); float tmpReal = real[idx2] * _real[k * L / i] - imag[idx2] * _imag[k * L / i]; float tmpImag = imag[idx2] * _real[k * L / i] + real[idx2] * _imag[k * L / i]; real[idx2] = real[idx1] - tmpReal; imag[idx2] = imag[idx1] - tmpImag; real[idx1] += tmpReal; imag[idx1] += tmpImag; } } } }
1.4 Bluestein算法 当信号f(x)的长度N是质数时,CT算法不再适应,此时可以使用Bluestein算法。
其中,$g(x) = f(x) w_N^{x^2/2}$,$h(x) = w_N^{-x^2/2}$,则
然后使用卷积定理,并补零到2的幂次:
其中 pad
表示补零操作,FFT_2
和 IFFT_2
分别表示基2的迭代形式FFT算法和逆FFT算法。这里,h'
是 h
在对称中心进行延拓后得到的信号。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 void fft1dBluestein (float * real, float * imag, complex<float >* table, unsigned L, bool inverse) { int N = 1 << (log2n (L * 2 ) + 1 ); int fact = inverse ? 1 : -1 ; float * realA = new float [N](); float * realB = new float [N](); float * imagA = new float [N](); float * imagB = new float [N](); complex<float >* a = new complex<float >[N]; complex<float >* b = new complex<float >[N]; for (int i = 0 ; i < L; ++i) { realA[i] = real[i] * table[i].real () - imag[i] * table[i].imag (); imagA[i] = imag[i] * table[i].real () + real[i] * table[i].imag (); } for (int i = 1 ; i < L * 2 - 1 ; ++i) { if (i < L) { imagB[i] = -table[L - i - 1 ].imag (); realB[i] = table[L - i - 1 ].real (); } else { imagB[i] = -table[i + 1 - L].imag (); realB[i] = table[i + 1 - L].real (); } } float ** conv = convolve (realA, imagA, realB, imagB, N); for (int i = 0 ; i < L; ++i) { real[i] = conv[0 ][i + L] * table[i].real () - conv[1 ][i + L] * table[i].imag (); imag[i] = conv[1 ][i + L] * table[i].real () + conv[0 ][i + L] * table[i].imag (); } } complex<float >* getBluesteinTable (unsigned L, bool inverse) { int fact = inverse ? 1 : -1 ; complex<float >* table = new complex<float >[L]; for (int i = 0 ; i < L; ++i) { uintmax_t temp = static_cast <uintmax_t >(i) * i; temp %= static_cast <uintmax_t >(L) * 2 ; double angle = M_PI * temp / L * fact; table[i].real (cos (angle)); table[i].imag (sin (angle)); } return table; } float ** convolve (float * real1, float * imag1, float * real2, float * imag2, unsigned L) { fft1dIterRadix2 (real1, imag1, L, false ); fft1dIterRadix2 (real2, imag2, L, false ); float ** dst = new float *[2 ](); dst[0 ] = new float [L]; dst[1 ] = new float [L]; for (int i = 0 ; i < L; ++i) { dst[0 ][i] = real1[i] * real2[i] - imag1[i] * imag2[i]; dst[1 ][i] = imag1[i] * real2[i] + real1[i] * imag2[i]; } fft1dIterRadix2 (dst[0 ], dst[1 ], L, true ); for (int i = 0 ; i < L; ++i) { dst[0 ][i] /= L; dst[1 ][i] /= L; } return dst; }
1.5 实际采用的FFT算法 上面几种FFT算法中,算法性能大致是基2FFT>CT>Bluestein算法,所以实际使用时,根据序列的长度N设计如下算法:
如果N是2的幂次,采用基2FFT算法
否则如果N是2的倍数或5的倍数,采用一般形式的CT算法
否则采用Bluestein算法
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 void dft1d (float * real, float * imag, unsigned L, bool inverse) { if (L == 1 ) { return ; } else if ((L & (L - 1 )) == 0 ) { fft1dIterRadix2 (real, imag, L, inverse); } else { if (L % 5 == 0 || L % 3 == 0 || L % 2 == 0 ) { fft1dCooleyTukey (real, imag, L, inverse); } else { long tmp = inverse ? -1 * L : L; if (mem.count (tmp) == 0 ) { mem[tmp] = getBluesteinTable (L, inverse); } fft1dBluestein (real, imag, mem[tmp], L, inverse); } } }
2. 离散余弦变换(DCT) 二维离散余弦变换同样具备可分离性,因此只需要考虑如果优化一维DCT。
实际上,一维DCT可以利用一维FFT实现。可以构造这样一个信号$g(x)$,它通过在原始信号$f(x)$的末尾补N个0得到。
这样就可以利用FFT算法计算DCT。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 void myfdct (Mat& src, Mat& dst, int sx, int sy) { Mat input = src.clone (); pad (input, sx, sy); int M = input.rows; int N = input.cols; if (src.type () != 5 ) { throw std::invalid_argument ("only CV_32F is accepted" ); } Mat res = Mat (M, N, CV_32F, Scalar::all (0 )); float * ptr_res = (float *)res.data; for (int i = 0 ; i < M; ++i) { dct1d ((float *)src.ptr (i), (float *)res.ptr (i), N); } for (int j = 0 ; j < N; ++j) { float * col = new float [M]; for (int i = 0 ; i < M; ++i) { col[i] = *(ptr_res + N * i + j); } float * tmp = new float [M]; dct1d (col, tmp, M); for (int i = 0 ; i < M; ++i) { *(ptr_res + N * i + j) = tmp[i]; } } res.copyTo (dst); } void dct1d (float * src, float * dst, int N) { dct1dNaive (src, dst, N); } void fdctfft1d (float * src, float * dst, int N) { float * imag = new float [2 * N]; float * real = new float [2 * N]; memset (imag, 0 , N * 2 ); memset (real, 0 , N * 2 ); for (int i = 0 ; i < N; i++) { real[i] = src[i]; } dft1d (real, imag, N * 2 , false ); for (int i = 0 ; i < N; i++) { double tmp = i * M_PI / (2 * N); dst[i] = (real[i] * cos (tmp) + imag[i] * sin (tmp)) * c (i, N); } }
3.性能测试
图像尺寸
opencv DFT+IDFT
朴素 DFT+IDFT
自编 FFT+IFFT
opencv DCT+IDCT
朴素 DCT+IDCT
自编 FDCT+IFDCT
256×256 (2的幂次)
0.97ms
170514ms
29.5ms
0.46ms
83422ms
61ms
257×257 (质数)
3.65ms
170916ms
86.15ms
2.29ms
93993ms
170ms
240×240 (2,3,5的倍数)
0.98ms
133231ms
32.87ms
2.29ms
81342ms
73ms