1 /* ======================================================================= */
2 /* TEXAS INSTRUMENTS, INC. */
3 /* */
4 /* NAME */
5 /* DSP_fft16x32_cn -- Radix-4 FFT with digit reversal */
6 /* */
7 /* USAGE */
8 /* */
9 /* This routine is C-callable and can be called as: */
10 /* */
11 /* void DSP_fft16x32_cn */
12 /* ( */
13 /* short * w, */
14 /* int nx, */
15 /* int * x, */
16 /* int * y */
17 /* ) */
18 /* */
19 /* w[2*nx]: Pointer to vector of Q.15 FFT coefficients of size */
20 /* 2*nx elements. */
21 /* */
22 /* nx: Number of complex elements in vector x. */
23 /* */
24 /* x[2*nx]: Pointer to input vector of size 2*nx elements. */
25 /* */
26 /* y[2*nx]: Pointer to output vector of size 2*nx elements. */
27 /* */
28 /* */
29 /* DESCRIPTION */
30 /* */
31 /* This code performs a Radix-4 FFT with digit reversal. The code */
32 /* uses a special ordering of twiddle factors and memory accesses */
33 /* to improve performance in the presence of cache. It operates */
34 /* largely in-place, but the final digit-reversed output is written */
35 /* out-of-place. */
36 /* */
37 /* This code requires a special sequence of twiddle factors stored */
38 /* in Q1.15 fixed-point format. The following C code illustrates */
39 /* one way to generate the desired twiddle-factor array: */
40 /* */
41 /* #include <math.h> */
42 /* */
43 /* */
44 /* short d2s(double d) */
45 /* { */
46 /* d = floor(0.5 + d); // Explicit rounding to integer // */
47 /* if (d >= 32767.0) return 32767; */
48 /* if (d <= -32768.0) return -32768; */
49 /* return (short)d; */
50 /* } */
51 /* */
52 /* void gen_twiddle_fft_c(short *w, int n) */
53 /* { */
54 /* double M = 32767.5; */
55 /* const double PI = 3.141592654; */
56 /* int i, j, k; */
57 /* */
58 /* for (j = 1, k = 0; j < n >> 2; j = j << 2) { */
59 /* for (i = 0; i < n >> 2; i += j << 1) { */
60 /* w[k + 11] = d2s(M * cos(6.0 * PI * (i + j) / n)); */
61 /* w[k + 10] = d2s(M * sin(6.0 * PI * (i + j) / n)); */
62 /* w[k + 9] = d2s(M * cos(6.0 * PI * (i ) / n)); */
63 /* w[k + 8] = d2s(M * sin(6.0 * PI * (i ) / n)); */
64 /* */
65 /* w[k + 7] = d2s(M * cos(4.0 * PI * (i + j) / n)); */
66 /* w[k + 6] = d2s(M * sin(4.0 * PI * (i + j) / n)); */
67 /* w[k + 5] = d2s(M * cos(4.0 * PI * (i ) / n)); */
68 /* w[k + 4] = d2s(M * sin(4.0 * PI * (i ) / n)); */
69 /* */
70 /* w[k + 3] = d2s(M * cos(2.0 * PI * (i + j) / n)); */
71 /* w[k + 2] = d2s(M * sin(2.0 * PI * (i + j) / n)); */
72 /* w[k + 1] = d2s(M * cos(2.0 * PI * (i ) / n)); */
73 /* w[k + 0] = d2s(M * sin(2.0 * PI * (i ) / n)); */
74 /* */
75 /* k += 12; */
76 /* } */
77 /* } */
78 /* w[2*n - 1] = w[2*n - 3] = w[2*n - 5] = 32767; */
79 /* w[2*n - 2] = w[2*n - 4] = w[2*n - 6] = 0; */
80 /* } */
81 /* */
82 /* */
83 /* TECHNIQUES */
84 /* */
85 /* The following C code represents an implementation of the Cooley */
86 /* Tukey radix 4 DIF FFT. It accepts the inputs in normal order and */
87 /* produces the outputs in digit reversed order. The natural C code */
88 /* shown in this file on the other hand, accepts the inputs in nor- */
89 /* mal order and produces the outputs in normal order. */
90 /* */
91 /* Several transformations have been applied to the original Cooley */
92 /* Tukey code to produce the natural C code description shown here. */
93 /* In order to understand these it would first be educational to */
94 /* understand some of the issues involved in the conventional Cooley */
95 /* Tukey FFT code. */
96 /* */
97 /* void radix4(int n, short x[], short wn[]) */
98 /* { */
99 /* int n1, n2, ie, ia1, ia2, ia3; */
100 /* int i0, i1, i2, i3, i, j, k; */
101 /* short co1, co2, co3, si1, si2, si3; */
102 /* short xt0, yt0, xt1, yt1, xt2, yt2; */
103 /* short xh0, xh1, xh20, xh21, xl0, xl1,xl20,xl21; */
104 /* */
105 /* n2 = n; */
106 /* ie = 1; */
107 /* for (k = n; k > 1; k >>= 2) { */
108 /* n1 = n2; */
109 /* n2 >>= 2; */
110 /* ia1 = 0; */
111 /* */
112 /* for (j = 0; j < n2; j++) { */
113 /* ia2 = ia1 + ia1; */
114 /* ia3 = ia2 + ia1; */
115 /* */
116 /* co1 = wn[2 * ia1 ]; */
117 /* si1 = wn[2 * ia1 + 1]; */
118 /* co2 = wn[2 * ia2 ]; */
119 /* si2 = wn[2 * ia2 + 1]; */
120 /* co3 = wn[2 * ia3 ]; */
121 /* si3 = wn[2 * ia3 + 1]; */
122 /* ia1 = ia1 + ie; */
123 /* */
124 /* for (i0 = j; i0< n; i0 += n1) { */
125 /* i1 = i0 + n2; */
126 /* i2 = i1 + n2; */
127 /* i3 = i2 + n2; */
128 /* */
129 /* xh0 = x[2 * i0 ] + x[2 * i2 ]; */
130 /* xh1 = x[2 * i0 + 1] + x[2 * i2 + 1]; */
131 /* xl0 = x[2 * i0 ] - x[2 * i2 ]; */
132 /* xl1 = x[2 * i0 + 1] - x[2 * i2 + 1]; */
133 /* */
134 /* xh20 = x[2 * i1 ] + x[2 * i3 ]; */
135 /* xh21 = x[2 * i1 + 1] + x[2 * i3 + 1]; */
136 /* xl20 = x[2 * i1 ] - x[2 * i3 ]; */
137 /* xl21 = x[2 * i1 + 1] - x[2 * i3 + 1]; */
138 /* */
139 /* x[2 * i0 ] = xh0 + xh20; */
140 /* x[2 * i0 + 1] = xh1 + xh21; */
141 /* */
142 /* xt0 = xh0 - xh20; */
143 /* yt0 = xh1 - xh21; */
144 /* xt1 = xl0 + xl21; */
145 /* yt2 = xl1 + xl20; */
146 /* xt2 = xl0 - xl21; */
147 /* yt1 = xl1 - xl20; */
148 /* */
149 /* x[2 * i1 ] = (xt1 * co1 + yt1 * si1) >> 15; */
150 /* x[2 * i1 + 1] = (yt1 * co1 - xt1 * si1) >> 15; */
151 /* x[2 * i2 ] = (xt0 * co2 + yt0 * si2) >> 15; */
152 /* x[2 * i2 + 1] = (yt0 * co2 - xt0 * si2) >> 15; */
153 /* x[2 * i3 ] = (xt2 * co3 + yt2 * si3) >> 15; */
154 /* x[2 * i3 + 1] = (yt2 * co3 - xt2 * si3) >> 15; */
155 /* } */
156 /* } */
157 /* ie <<= 2; */
158 /* } */
159 /* } */
160 /* */
161 /* The conventional Cooley Tukey FFT, is written using three loops. */
162 /* The outermost loop "k" cycles through the stages. There are log */
163 /* N to the base 4 stages in all. The loop "j" cycles through the */
164 /* groups of butterflies with different twiddle factors, loop "i" */
165 /* reuses the twiddle factors for the different butterflies within */
166 /* a stage. It is interesting to note the following: */
167 /* */
168 /* ----------------------------------------------------------------------- */
169 /* Stage# #Groups # Butterflies with common #Groups*Bflys */
170 /* twiddle factors */
171 /* ----------------------------------------------------------------------- */
172 /* 1 N/4 1 N/4 */
173 /* 2 N/16 4 N/4 */
174 /* .. */
175 /* logN 1 N/4 N/4 */
176 /* ----------------------------------------------------------------------- */
177 /* */
178 /* The following statements can be made based on above observations: */
179 /* */
180 /* a) Inner loop "i0" iterates a variable number of times. In */
181 /* particular the number of iterations quadruples every time from */
182 /* 1..N/4. Hence software pipelining a loop that iterates a variable */
183 /* number of times is not profitable. */
184 /* */
185 /* b) Outer loop "j" iterates a variable number of times as well. */
186 /* However the number of iterations is quartered every time from */
187 /* N/4 ..1. Hence the behaviour in (a) and (b) are exactly opposite */
188 /* to each other. */
189 /* */
190 /* c) If the two loops "i" and "j" are colaesced together then they */
191 /* will iterate for a fixed number of times namely N/4. This allows */
192 /* us to combine the "i" and "j" loops into 1 loop. Optimized impl- */
193 /* ementations will make use of this fact. */
194 /* */
195 /* In addition the Cooley Tukey FFT accesses three twiddle factors */
196 /* per iteration of the inner loop, as the butterflies that re-use */
197 /* twiddle factors are lumped together. This leads to accessing the */
198 /* twiddle factor array at three points each sepearted by "ie". Note */
199 /* that "ie" is initially 1, and is quadrupled with every iteration. */
200 /* Therfore these three twiddle factors are not even contiguous in */
201 /* the array. */
202 /* */
203 /* In order to vectorize the FFT, it is desirable to access twiddle */
204 /* factor array using double word wide loads and fetch the twiddle */
205 /* factors needed. In order to do this a modified twiddle factor */
206 /* array is created, in which the factors WN/4, WN/2, W3N/4 are */
207 /* arranged to be contiguous. This eliminates the seperation between */
208 /* twiddle factors within a butterfly. However this implies that as */
209 /* the loop is traversed from one stage to another, that we maintain */
210 /* a redundant version of the twiddle factor array. Hence the size */
211 /* of the twiddle factor array increases as compared to the normal */
212 /* Cooley Tukey FFT. The modified twiddle factor array is of size */
213 /* "2 * N" where the conventional Cooley Tukey FFT is of size"3N/4" */
214 /* where N is the number of complex points to be transformed. The */
215 /* routine that generates the modified twiddle factor array was */
216 /* presented earlier. With the above transformation of the FFT, */
217 /* both the input data and the twiddle factor array can be accessed */
218 /* using double-word wide loads to enable packed data processing. */
219 /* */
220 /* The final stage is optimised to remove the multiplication as */
221 /* w0 = 1. This stage also performs digit reversal on the data, */
222 /* so the final output is in natural order. */
223 /* */
224 /* The fft() code shown here performs the bulk of the computation */
225 /* in place. However, because digit-reversal cannot be performed */
226 /* in-place, the final result is written to a separate array, y[]. */
227 /* */
228 /* There is one slight break in the flow of packed processing that */
229 /* needs to be comprehended. The real part of the complex number is */
230 /* in the lower half, and the imaginary part is in the upper half. */
231 /* The flow breaks in case of "xl0" and "xl1" because in this case */
232 /* the real part needs to be combined with the imaginary part because */
233 /* of the multiplication by "j". This requires a packed quantity like */
234 /* "xl21xl20" to be rotated as "xl20xl21" so that it can be combined */
235 /* using add2's and sub2's. Hence the natural version of C code */
236 /* shown below is transformed using packed data processing as shown: */
237 /* */
238 /* xl0 = x[2 * i0 + 0] - x[2 * i2 + 0]; */
239 /* xl1 = x[2 * i0 + 1] - x[2 * i2 + 1]; */
240 /* xl20 = x[2 * i1 + 0] - x[2 * i3 + 0]; */
241 /* xl21 = x[2 * i1 + 1] - x[2 * i3 + 1]; */
242 /* */
243 /* xt1 = xl0 + xl21; */
244 /* yt2 = xl1 + xl20; */
245 /* xt2 = xl0 - xl21; */
246 /* yt1 = xl1 - xl20; */
247 /* */
248 /* xl1_xl0 = _sub2(x21_x20, x21_x20) */
249 /* xl21_xl20 = _sub2(x32_x22, x23_x22) */
250 /* xl20_xl21 = _rotl(xl21_xl20, 16) */
251 /* */
252 /* yt2_xt1 = _add2(xl1_xl0, xl20_xl21) */
253 /* yt1_xt2 = _sub2(xl1_xl0, xl20_xl21) */
254 /* */
255 /* Also notice that xt1, yt1 endup on seperate words, these need to */
256 /* be packed together to take advantage of the packed twiddle fact */
257 /* ors that have been loaded. In order for this to be achieved they */
258 /* are re-aligned as follows: */
259 /* */
260 /* yt1_xt1 = _packhl2(yt1_xt2, yt2_xt1) */
261 /* yt2_xt2 = _packhl2(yt2_xt1, yt1_xt2) */
262 /* */
263 /* The packed words "yt1_xt1" allows the loaded"sc" twiddle factor */
264 /* to be used for the complex multiplies. The real part os the */
265 /* complex multiply is implemented using _dotp2. The imaginary */
266 /* part of the complex multiply is implemented using _dotpn2 */
267 /* after the twiddle factors are swizzled within the half word. */
268 /* */
269 /* (X + jY) ( C + j S) = (XC + YS) + j (YC - XS). */
270 /* */
271 /* The actual twiddle factors for the FFT are cosine, - sine. The */
272 /* twiddle factors stored in the table are csine and sine, hence */
273 /* the sign of the "sine" term is comprehended during multipli- */
274 /* cation as shown above. */
275 /* */
276 /* */
277 /* ASSUMPTIONS */
278 /* */
279 /* The size of the FFT, n, must be a power of 4 and greater than */
280 /* or equal to 16 and less than 32768. */
281 /* */
282 /* The arrays 'x[]', 'y[]', and 'w[]' all must be aligned on a */
283 /* double-word boundary for the "optimized" implementations. */
284 /* */
285 /* The input and output data are complex, with the real/imaginary */
286 /* components stored in adjacent locations in the array. The real */
287 /* components are stored at even array indices, and the imaginary */
288 /* components are stored at odd array indices. */
289 /* */
290 /* Copyright (C) 2011 Texas Instruments Incorporated - http://www.ti.com/ */
291 /* */
292 /* */
293 /* Redistribution and use in source and binary forms, with or without */
294 /* modification, are permitted provided that the following conditions */
295 /* are met: */
296 /* */
297 /* Redistributions of source code must retain the above copyright */
298 /* notice, this list of conditions and the following disclaimer. */
299 /* */
300 /* Redistributions in binary form must reproduce the above copyright */
301 /* notice, this list of conditions and the following disclaimer in the */
302 /* documentation and/or other materials provided with the */
303 /* distribution. */
304 /* */
305 /* Neither the name of Texas Instruments Incorporated nor the names of */
306 /* its contributors may be used to endorse or promote products derived */
307 /* from this software without specific prior written permission. */
308 /* */
309 /* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS */
310 /* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT */
311 /* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR */
312 /* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT */
313 /* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, */
314 /* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT */
315 /* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, */
316 /* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY */
317 /* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
318 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE */
319 /* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
320 /* */
321 /* ======================================================================= */
323 #pragma CODE_SECTION(DSP_fft16x32_cn, ".text:ansi");
325 #include "DSP_fft16x32_cn.h"
327 /* ======================================================================== */
328 /* Perform a signed 16-bit by signed 32-bit multiply, rounding and */
329 /* shifting away the lower 15 bits. */
330 /* ======================================================================== */
332 #define MPY16X32R(x,y) \
333 (((int)((short)(x) * (unsigned short)(y) + 0x4000) >> 15) + \
334 ((int)((short)(x) * (short)((y) >> 16)) << 1))
336 /*--------------------------------------------------------------------------*/
337 /* The following macro is used to obtain a digit reversed index, of a given */
338 /* number i, into j where the number of bits in "i" is "m". For the natural */
339 /* form of C code, this is done by first interchanging every set of "2 bit" */
340 /* pairs, followed by exchanging nibbles, followed by exchanging bytes, and */
341 /* finally halfwords. To give an example, condider the following number: */
342 /* */
343 /* N = FEDCBA9876543210, where each digit represents a bit, the following */
344 /* steps illustrate the changes as the exchanges are performed: */
345 /* M = DCFE98BA54761032 is the number after every "2 bits" are exchanged. */
346 /* O = 98BADCFE10325476 is the number after every nibble is exchanged. */
347 /* P = 1032547698BADCFE is the number after every byte is exchanged. */
348 /* Since only 16 digits were considered this represents the digit reversed */
349 /* index. Since the numbers are represented as 32 bits, there is one more */
350 /* step typically of exchanging the half words as well. */
351 /*--------------------------------------------------------------------------*/
352 #define DIG_REV(i, m, j) \
353 do { \
354 unsigned _ = (i); \
355 _ = ((_ & 0x33333333) << 2) | ((_ & ~0x33333333) >> 2); \
356 _ = ((_ & 0x0F0F0F0F) << 4) | ((_ & ~0x0F0F0F0F) >> 4); \
357 _ = ((_ & 0x00FF00FF) << 8) | ((_ & ~0x00FF00FF) >> 8); \
358 _ = ((_ & 0x0000FFFF) << 16) | ((_ & ~0x0000FFFF) >> 16); \
359 (j) = _ >> (m); \
360 } while (0)
362 void DSP_fft16x32_cn (
363 const short * ptr_w,
364 int npoints,
365 int * ptr_x,
366 int * ptr_y
367 )
368 {
369 int i, j, l1, l2, h2, predj, tw_offset, stride, fft_jmp;
370 int xt0_0, yt0_0, xt1_0, yt1_0, xt2_0, yt2_0;
371 int xt0_1, yt0_1, xt1_1, yt1_1, xt2_1, yt2_1;
372 int xh0_0, xh1_0, xh20_0, xh21_0, xl0_0, xl1_0, xl20_0, xl21_0;
373 int xh0_1, xh1_1, xh20_1, xh21_1, xl0_1, xl1_1, xl20_1, xl21_1;
374 int x_0, x_1, x_2, x_3, x_l1_0, x_l1_1, x_l1_2, x_l1_3, x_l2_0, x_l2_1;
375 int xh0_2, xh1_2, xl0_2, xl1_2, xh0_3, xh1_3, xl0_3, xl1_3;
376 int x_4, x_5, x_6, x_7, x_l2_2, x_l2_3, x_h2_0, x_h2_1, x_h2_2, x_h2_3;
377 int x_8, x_9, x_a, x_b, x_c, x_d, x_e, x_f;
378 short si10, si20, si30, co10, co20, co30;
379 short si11, si21, si31, co11, co21, co31;
380 const short * w;
381 int * x, * x2, * x0;
382 int * y0, * y1, * y2, * y3;
383 int n00, n10, n20, n30, n01, n11, n21, n31;
384 int n02, n12, n22, n32, n03, n13, n23, n33;
385 int n0, j0;
386 int radix, m;
387 int y0r, y0i, y4r, y4i;
388 int norm;
390 /*---------------------------------------------------------------------*/
391 /* Determine the magnitude of the number of points to be transformed. */
392 /* Check whether we can use a radix4 decomposition or a mixed radix */
393 /* transformation, by determining modulo 2. */
394 /*---------------------------------------------------------------------*/
395 for (i = 31, m = 1; (npoints & (1 << i)) == 0; i--, m++)
396 ;
397 radix = m & 1 ? 2 : 4;
398 norm = m - 2;
400 /*----------------------------------------------------------------------*/
401 /* The stride is quartered with every iteration of the outer loop. It */
402 /* denotes the seperation between any two adjacent inputs to the butter */
403 /* -fly. This should start out at N/4, hence stride is initially set to */
404 /* N. For every stride, 6*stride twiddle factors are accessed. The */
405 /* "tw_offset" is the offset within the current twiddle factor sub- */
406 /* table. This is set to zero, at the start of the code and is used to */
407 /* obtain the appropriate sub-table twiddle pointer by offseting it */
408 /* with the base pointer "ptr_w". */
409 /*----------------------------------------------------------------------*/
410 stride = npoints;
411 tw_offset = 0;
412 fft_jmp = 6 * stride;
414 #ifndef NOASSUME
415 _nassert(stride > 4);
416 #pragma MUST_ITERATE(1,,1);
417 #endif
419 while (stride > 4) {
420 /*-----------------------------------------------------------------*/
421 /* At the start of every iteration of the outer loop, "j" is set */
422 /* to zero, as "w" is pointing to the correct location within the */
423 /* twiddle factor array. For every iteration of the inner loop */
424 /* 6 * stride twiddle factors are accessed. For eg, */
425 /* */
426 /* #Iteration of outer loop # twiddle factors #times cycled */
427 /* 1 6 N/4 1 */
428 /* 2 6 N/16 4 */
429 /* ... */
430 /*-----------------------------------------------------------------*/
431 j = 0;
432 fft_jmp >>= 2;
434 /*-----------------------------------------------------------------*/
435 /* Set up offsets to access "N/4", "N/2", "3N/4" complex point or */
436 /* "N/2", "N", "3N/2" half word */
437 /*-----------------------------------------------------------------*/
438 h2 = stride >> 1;
439 l1 = stride;
440 l2 = stride + (stride >> 1);
442 /*-----------------------------------------------------------------*/
443 /* Reset "x" to point to the start of the input data array. */
444 /* "tw_offset" starts off at 0, and increments by "6 * stride" */
445 /* The stride quarters with every iteration of the outer loop */
446 /*-----------------------------------------------------------------*/
447 x = ptr_x;
448 w = ptr_w + tw_offset;
449 tw_offset += fft_jmp;
450 stride >>= 2;
452 /*----------------------------------------------------------------*/
453 /* The following loop iterates through the different butterflies, */
454 /* within a given stage. Recall that there are logN to base 4 */
455 /* stages. Certain butterflies share the twiddle factors. These */
456 /* are grouped together. On the very first stage there are no */
457 /* butterflies that share the twiddle factor, all N/4 butter- */
458 /* flies have different factors. On the next stage two sets of */
459 /* N/8 butterflies share the same twiddle factor. Hence after */
460 /* half the butterflies are performed, j the index into the */
461 /* factor array resets to 0, and the twiddle factors are reused. */
462 /* When this happens, the data pointer 'x' is incremented by the */
463 /* fft_jmp amount. In addition the following code is unrolled to */
464 /* perform "2" radix4 butterflies in parallel. */
465 /*----------------------------------------------------------------*/
466 #ifndef NOASSUME
467 _nassert((int)(w) % 8 == 0);
468 _nassert((int)(x) % 8 == 0);
469 _nassert(h2 % 8 == 0);
470 _nassert(l1 % 8 == 0);
471 _nassert(l2 % 8 == 0);
472 #pragma MUST_ITERATE(1, ,1);
473 #endif
475 for (i = 0; i < (npoints >> 3); i ++) {
476 /*------------------------------------------------------------*/
477 /* Read the first 12 twiddle factors, six of which are used */
478 /* for one radix 4 butterfly and six of which are used for */
479 /* next one. */
480 /*------------------------------------------------------------*/
481 co10 = w[j+1]; si10 = w[j+0];
482 co20 = w[j+3]; si20 = w[j+2];
483 co30 = w[j+5]; si30 = w[j+4];
484 co11 = w[j+7]; si11 = w[j+6];
485 co21 = w[j+9]; si21 = w[j+8];
486 co31 = w[j+11]; si31 = w[j+10];
488 /*------------------------------------------------------------*/
489 /* Read in the first complex input for the butterflies. */
490 /* 1st complex input to 1st butterfly: x[0] + jx[1] */
491 /* 1st complex input to 2nd butterfly: x[2] + jx[3] */
492 /*------------------------------------------------------------*/
493 x_0 = x[0]; x_1 = x[1];
494 x_2 = x[2]; x_3 = x[3];
496 /*------------------------------------------------------------*/
497 /* Read in the complex inputs for the butterflies. Each of the*/
498 /* successive complex inputs of the butterfly are seperated */
499 /* by a fixed amount known as stride. The stride starts out */
500 /* at N/4, and quarters with every stage. */
501 /*------------------------------------------------------------*/
502 x_l1_0 = x[l1+0]; x_l1_1 = x[l1+1];
503 x_l1_2 = x[l1+2]; x_l1_3 = x[l1+3];
505 x_l2_0 = x[l2+0]; x_l2_1 = x[l2+1];
506 x_l2_2 = x[l2+2]; x_l2_3 = x[l2+3];
508 x_h2_0 = x[h2+0]; x_h2_1 = x[h2+1];
509 x_h2_2 = x[h2+2]; x_h2_3 = x[h2+3];
511 /*-----------------------------------------------------------*/
512 /* Two butterflies are evaluated in parallel. The following */
513 /* results will be shown for one butterfly only, although */
514 /* both are being evaluated in parallel. */
515 /* */
516 /* Perform radix2 style DIF butterflies. */
517 /*-----------------------------------------------------------*/
518 xh0_0 = x_0 + x_l1_0; xh1_0 = x_1 + x_l1_1;
519 xh0_1 = x_2 + x_l1_2; xh1_1 = x_3 + x_l1_3;
521 xl0_0 = x_0 - x_l1_0; xl1_0 = x_1 - x_l1_1;
522 xl0_1 = x_2 - x_l1_2; xl1_1 = x_3 - x_l1_3;
524 xh20_0 = x_h2_0 + x_l2_0; xh21_0 = x_h2_1 + x_l2_1;
525 xh20_1 = x_h2_2 + x_l2_2; xh21_1 = x_h2_3 + x_l2_3;
527 xl20_0 = x_h2_0 - x_l2_0; xl21_0 = x_h2_1 - x_l2_1;
528 xl20_1 = x_h2_2 - x_l2_2; xl21_1 = x_h2_3 - x_l2_3;
530 /*-----------------------------------------------------------*/
531 /* Derive output pointers using the input pointer "x" */
532 /*-----------------------------------------------------------*/
533 x0 = x;
534 x2 = x0;
536 /*-----------------------------------------------------------*/
537 /* When the twiddle factors are not to be re-used, j is */
538 /* incremented by 12, to reflect the fact that 12 half words */
539 /* are consumed in every iteration. The input data pointer */
540 /* increments by 4. Note that within a stage, the stride */
541 /* does not change and hence the offsets for the other three */
542 /* legs, 0, h2, l1, l2. */
543 /*-----------------------------------------------------------*/
544 j += 12;
545 x += 4;
547 predj = (j - fft_jmp);
548 if (!predj) x += fft_jmp;
549 if (!predj) j = 0;
551 /*----------------------------------------------------------*/
552 /* These four partial results can be re-written to show */
553 /* the underlying DIF structure similar to radix2 as */
554 /* follows: */
555 /* */
556 /* X(4k) = (x(n)+x(n + N/2)) + (x(n+N/4)+ x(n + 3N/4)) */
557 /* X(4k+1)= (x(n)-x(n + N/2)) -j(x(n+N/4) - x(n + 3N/4)) */
558 /* x(4k+2)= (x(n)+x(n + N/2)) - (x(n+N/4)+ x(n + 3N/4)) */
559 /* X(4k+3)= (x(n)-x(n + N/2)) +j(x(n+N/4) - x(n + 3N/4)) */
560 /* */
561 /* which leads to the real and imaginary values as foll: */
562 /* */
563 /* y0r = x0r + x2r + x1r + x3r = xh0 + xh20 */
564 /* y0i = x0i + x2i + x1i + x3i = xh1 + xh21 */
565 /* y1r = x0r - x2r + (x1i - x3i) = xl0 + xl21 */
566 /* y1i = x0i - x2i - (x1r - x3r) = xl1 - xl20 */
567 /* y2r = x0r + x2r - (x1r + x3r) = xh0 - xh20 */
568 /* y2i = x0i + x2i - (x1i + x3i = xh1 - xh21 */
569 /* y3r = x0r - x2r - (x1i - x3i) = xl0 - xl21 */
570 /* y3i = x0i - x2i + (x1r - x3r) = xl1 + xl20 */
571 /* ---------------------------------------------------------*/
572 y0r = xh0_0 + xh20_0; y0i = xh1_0 + xh21_0;
573 y4r = xh0_1 + xh20_1; y4i = xh1_1 + xh21_1;
576 xt0_0 = xh0_0 - xh20_0; yt0_0 = xh1_0 - xh21_0;
577 xt0_1 = xh0_1 - xh20_1; yt0_1 = xh1_1 - xh21_1;
579 xt1_0 = xl0_0 + xl21_0; yt2_0 = xl1_0 + xl20_0;
580 xt2_0 = xl0_0 - xl21_0; yt1_0 = xl1_0 - xl20_0;
582 xt1_1 = xl0_1 + xl21_1; yt2_1 = xl1_1 + xl20_1;
583 xt2_1 = xl0_1 - xl21_1; yt1_1 = xl1_1 - xl20_1;
585 x2[0] = y0r; x2[1] = y0i;
586 x2[2] = y4r; x2[3] = y4i;
588 /*---------------------------------------------------------*/
589 /* Perform twiddle factor multiplies of three terms,top */
590 /* term does not have any multiplies. Note the twiddle */
591 /* factors for a normal FFT are C + j (-S). Since the */
592 /* factors that are stored are C + j S, this is */
593 /* corrected for in the multiplies. */
594 /* */
595 /* Y1 = (xt1 + jyt1) (c + js) = (xc + ys) + (yc -xs) */
596 /* Perform the multiplies using 16 by 32 multiply macro */
597 /* defined. This treats the twiddle factor as 16 bits */
598 /* and incoming data as 32 bits. */
599 /*---------------------------------------------------------*/
600 x2[h2+0] = MPY16X32R(si10, yt1_0) + MPY16X32R(co10, xt1_0);
601 x2[h2+1] = MPY16X32R(co10, yt1_0) - MPY16X32R(si10, xt1_0);
603 x2[h2+2] = MPY16X32R(si11, yt1_1) + MPY16X32R(co11, xt1_1);
604 x2[h2+3] = MPY16X32R(co11, yt1_1) - MPY16X32R(si11, xt1_1);
606 x2[l1+0] = MPY16X32R(si20, yt0_0) + MPY16X32R(co20, xt0_0);
607 x2[l1+1] = MPY16X32R(co20, yt0_0) - MPY16X32R(si20, xt0_0);
609 x2[l1+2] = MPY16X32R(si21, yt0_1) + MPY16X32R(co21, xt0_1);
610 x2[l1+3] = MPY16X32R(co21, yt0_1) - MPY16X32R(si21, xt0_1);
612 x2[l2+0] = MPY16X32R(si30, yt2_0) + MPY16X32R(co30, xt2_0);
613 x2[l2+1] = MPY16X32R(co30, yt2_0) - MPY16X32R(si30, xt2_0);
615 x2[l2+2] = MPY16X32R(si31, yt2_1) + MPY16X32R(co31, xt2_1);
616 x2[l2+3] = MPY16X32R(co31, yt2_1) - MPY16X32R(si31, xt2_1);
617 }
618 }
620 /*-----------------------------------------------------------------*/
621 /* The following code performs either a standard radix4 pass or a */
622 /* radix2 pass. Two pointers are used to access the input data. */
623 /* The input data is read "N/4" complex samples apart or "N/2" */
624 /* words apart using pointers "x0" and "x2". This produces out- */
625 /* puts that are 0, N/4, N/2, 3N/4 for a radix4 FFT, and 0, N/8 */
626 /* N/2, 3N/8 for radix 2. */
627 /*-----------------------------------------------------------------*/
628 y0 = ptr_y;
629 y2 = ptr_y + (int)npoints;
630 x0 = ptr_x;
631 x2 = ptr_x + (int)(npoints >> 1);
633 if (radix == 2) {
634 /*----------------------------------------------------------------*/
635 /* The pointers are set at the following locations which are half */
636 /* the offsets of a radix4 FFT. */
637 /*----------------------------------------------------------------*/
638 y1 = y0 + (int)(npoints >> 2);
639 y3 = y2 + (int)(npoints >> 2);
640 l1 = norm + 1;
641 j0 = 8;
642 n0 = npoints >> 1;
643 }
644 else {
645 y1 = y0 + (int)(npoints >> 1);
646 y3 = y2 + (int)(npoints >> 1);
647 l1 = norm + 2;
648 j0 = 4;
649 n0 = npoints >> 2;
650 }
652 /*--------------------------------------------------------------------*/
653 /* The following code reads data indentically for either a radix 4 */
654 /* or a radix 2 style decomposition. It writes out at different */
655 /* locations though. It checks if either half the points, or a */
656 /* quarter of the complex points have been exhausted to jump to */
657 /* pervent double reversal. */
658 /*--------------------------------------------------------------------*/
659 j = 0;
661 #ifndef NOASSUME
662 _nassert((int)(n0) % 4 == 0);
663 _nassert((int)(x0) % 8 == 0);
664 _nassert((int)(x2) % 8 == 0);
665 _nassert((int)(y0) % 8 == 0);
666 #pragma MUST_ITERATE(2,,2);
667 #endif
669 for (i = 0; i < npoints; i += 8)
670 {
671 /*----------------------------------------------------------------*/
672 /* Digit reverse the index starting from 0. The increment to "j" */
673 /* is either by 4, or 8. */
674 /*----------------------------------------------------------------*/
675 DIG_REV(j, l1, h2);
677 /*----------------------------------------------------------------*/
678 /* Read in the input data, from the first eight locations. These */
679 /* are transformed either as a radix4 or as a radix 2. */
680 /*----------------------------------------------------------------*/
681 x_0 = x0[0]; x_1 = x0[1];
682 x_2 = x0[2]; x_3 = x0[3];
683 x_4 = x0[4]; x_5 = x0[5];
684 x_6 = x0[6]; x_7 = x0[7];
685 x0 += 8;
687 xh0_0 = x_0 + x_4; xh1_0 = x_1 + x_5;
688 xl0_0 = x_0 - x_4; xl1_0 = x_1 - x_5;
689 xh0_1 = x_2 + x_6; xh1_1 = x_3 + x_7;
690 xl0_1 = x_2 - x_6; xl1_1 = x_3 - x_7;
692 n00 = xh0_0 + xh0_1; n01 = xh1_0 + xh1_1;
693 n10 = xl0_0 + xl1_1; n11 = xl1_0 - xl0_1;
694 n20 = xh0_0 - xh0_1; n21 = xh1_0 - xh1_1;
695 n30 = xl0_0 - xl1_1; n31 = xl1_0 + xl0_1;
697 if (radix == 2) {
698 /*-------------------------------------------------------------*/
699 /* Perform radix2 style decomposition. */
700 /*-------------------------------------------------------------*/
701 n00 = x_0 + x_2; n01 = x_1 + x_3;
702 n20 = x_0 - x_2; n21 = x_1 - x_3;
703 n10 = x_4 + x_6; n11 = x_5 + x_7;
704 n30 = x_4 - x_6; n31 = x_5 - x_7;
705 }
707 y0[2*h2] = n00; y0[2*h2 + 1] = n01;
708 y1[2*h2] = n10; y1[2*h2 + 1] = n11;
709 y2[2*h2] = n20; y2[2*h2 + 1] = n21;
710 y3[2*h2] = n30; y3[2*h2 + 1] = n31;
712 /*----------------------------------------------------------------*/
713 /* Read in ht enext eight inputs, and perform radix4 or radix2 */
714 /* decomposition. */
715 /*----------------------------------------------------------------*/
716 x_8 = x2[0]; x_9 = x2[1];
717 x_a = x2[2]; x_b = x2[3];
718 x_c = x2[4]; x_d = x2[5];
719 x_e = x2[6]; x_f = x2[7];
720 x2 += 8;
722 xh0_2 = x_8 + x_c; xh1_2 = x_9 + x_d;
723 xl0_2 = x_8 - x_c; xl1_2 = x_9 - x_d;
724 xh0_3 = x_a + x_e; xh1_3 = x_b + x_f;
725 xl0_3 = x_a - x_e; xl1_3 = x_b - x_f;
727 n02 = xh0_2 + xh0_3; n03 = xh1_2 + xh1_3;
728 n12 = xl0_2 + xl1_3; n13 = xl1_2 - xl0_3;
729 n22 = xh0_2 - xh0_3; n23 = xh1_2 - xh1_3;
730 n32 = xl0_2 - xl1_3; n33 = xl1_2 + xl0_3;
732 if (radix == 2) {
733 n02 = x_8 + x_a; n03 = x_9 + x_b;
734 n22 = x_8 - x_a; n23 = x_9 - x_b;
735 n12 = x_c + x_e; n13 = x_d + x_f;
736 n32 = x_c - x_e; n33 = x_d - x_f;
737 }
739 /*-----------------------------------------------------------------*/
740 /* Points that are read from succesive locations map to y, y[N/4] */
741 /* y[N/2], y[3N/4] in a radix4 scheme, y, y[N/8], y[N/2],y[5N/8] */
742 /*-----------------------------------------------------------------*/
743 y0[2*h2+2] = n02; y0[2*h2+3] = n03;
744 y1[2*h2+2] = n12; y1[2*h2+3] = n13;
745 y2[2*h2+2] = n22; y2[2*h2+3] = n23;
746 y3[2*h2+2] = n32; y3[2*h2+3] = n33;
748 j += j0;
749 if (j == n0) {
750 j += n0;
751 x0 += (int)npoints >> 1;
752 x2 += (int)npoints >> 1;
753 }
754 }
755 }
757 /* ======================================================================== */
758 /* End of file: DSP_fft16x32_cn.c */
759 /* ------------------------------------------------------------------------ */
760 /* Copyright (C) 2011 Texas Instruments, Incorporated. */
761 /* All Rights Reserved. */
762 /* ======================================================================== */