1 /* ======================================================================= */
2 /* TEXAS INSTRUMENTS, INC. */
3 /* */
4 /* DSPLIB DSP Signal Processing Library */
5 /* */
6 /* This library contains proprietary intellectual property of Texas */
7 /* Instruments, Inc. The library and its source code are protected by */
8 /* various copyrights, and portions may also be protected by patents or */
9 /* other legal protections. */
10 /* */
11 /* This software is licensed for use with Texas Instruments TMS320 */
12 /* family DSPs. This license was provided to you prior to installing */
13 /* the software. You may review this license by consulting the file */
14 /* TI_license.PDF which accompanies the files in this library. */
15 /* */
16 /* ----------------------------------------------------------------------- */
17 /* */
18 /* TEXAS INSTRUMENTS, INC. */
19 /* */
20 /* NAME */
21 /* DSP_fft16x16 -- fft16x16 */
22 /* */
23 /* USAGE */
24 /* This routine is C-callable and can be called as: */
25 /* */
26 /* void DSP_fft16x16_i ( */
27 /* const short * ptr_w, */
28 /* int npoints, */
29 /* short * ptr_x, */
30 /* short * ptr_y */
31 /* ); */
32 /* */
33 /* ptr_w = input twiddle factors */
34 /* npoints = number of points */
35 /* ptr_x = transformed data reversed */
36 /* ptr_y = linear transformed data */
37 /* */
38 /* */
39 /* DESCRIPTION */
40 /* The following code performs a mixed radix FFT for "npoints" which */
41 /* is either a multiple of 4 or 2. It uses logN4 - 1 stages of radix4 */
42 /* transform and performs either a radix2 or radix4 transform on the */
43 /* last stage depending on "npoints". If "npoints" is a multiple of 4, */
44 /* then this last stage is also a radix4 transform, otherwise it is a */
45 /* radix2 transform. */
46 /* */
47 /* */
48 /* int gen_twiddle_fft16x16(short *w, int n) */
49 /* */
50 /* int i, j, k; */
51 /* double M = 32767.5; */
52 /* */
53 /* for (j = 1, k = 0; j < n >> 2; j = j << 2) */
54 /* { */
55 /* for (i = 0; i < n >> 2; i += j << 1) */
56 /* { */
57 /* */
58 /* w[k + 11] = d2s(M * cos(6.0 * PI * (i + j) / n)); */
59 /* w[k + 10] = d2s(M * sin(6.0 * PI * (i + j) / n)); */
60 /* w[k + 9] = d2s(M * cos(6.0 * PI * (i ) / n)); */
61 /* w[k + 8] = d2s(M * sin(6.0 * PI * (i ) / n)); */
62 /* */
63 /* w[k + 7] = d2s(M * cos(4.0 * PI * (i + j) / n)); */
64 /* w[k + 6] = d2s(M * sin(4.0 * PI * (i + j) / n)); */
65 /* w[k + 5] = d2s(M * cos(4.0 * PI * (i ) / n)); */
66 /* w[k + 4] = d2s(M * sin(4.0 * PI * (i ) / n)); */
67 /* */
68 /* w[k + 3] = d2s(M * cos(2.0 * PI * (i + j) / n)); */
69 /* w[k + 2] = d2s(M * sin(2.0 * PI * (i + j) / n)); */
70 /* w[k + 1] = d2s(M * cos(2.0 * PI * (i ) / n)); */
71 /* w[k + 0] = d2s(M * sin(2.0 * PI * (i ) / n)); */
72 /* */
73 /* k += 12; */
74 /* */
75 /* */
76 /* } */
77 /* } */
78 /* */
79 /* return k; */
80 /* */
81 /* */
82 /* ASSUMPTIONS */
83 /* This code works for both "npoints" a multiple of 2 or 4. */
84 /* The arrays 'x[]', 'y[]', and 'w[]' all must be aligned on a */
85 /* double-word boundary for the "optimized" implementations. */
86 /* */
87 /* The input and output data are complex, with the real/imaginary */
88 /* components stored in adjacent locations in the array. The real */
89 /* components are stored at even array indices, and the imaginary */
90 /* components are stored at odd array indices. */
91 /* */
92 /* TECHNIQUES */
93 /* The following C code represents an implementation of the Cooley */
94 /* Tukey radix 4 DIF FFT. It accepts the inputs in normal order and */
95 /* produces the outputs in digit reversed order. The natural C code */
96 /* shown in this file on the other hand, accepts the inputs in nor- */
97 /* mal order and produces the outputs in normal order. */
98 /* */
99 /* Several transformations have been applied to the original Cooley */
100 /* Tukey code to produce the natural C code description shown here. */
101 /* In order to understand these it would first be educational to */
102 /* understand some of the issues involved in the conventional Cooley */
103 /* Tukey FFT code. */
104 /* */
105 /* void radix4(int n, short x[], short wn[]) */
106 /* { */
107 /* int n1, n2, ie, ia1, ia2, ia3; */
108 /* int i0, i1, i2, i3, i, j, k; */
109 /* short co1, co2, co3, si1, si2, si3; */
110 /* short xt0, yt0, xt1, yt1, xt2, yt2; */
111 /* short xh0, xh1, xh20, xh21, xl0, xl1,xl20,xl21; */
112 /* */
113 /* n2 = n; */
114 /* ie = 1; */
115 /* for (k = n; k > 1; k >>= 2) */
116 /* { */
117 /* n1 = n2; */
118 /* n2 >>= 2; */
119 /* ia1 = 0; */
120 /* */
121 /* for (j = 0; j < n2; j++) */
122 /* { */
123 /* ia2 = ia1 + ia1; */
124 /* ia3 = ia2 + ia1; */
125 /* */
126 /* co1 = wn[2 * ia1 ]; */
127 /* si1 = wn[2 * ia1 + 1]; */
128 /* co2 = wn[2 * ia2 ]; */
129 /* si2 = wn[2 * ia2 + 1]; */
130 /* co3 = wn[2 * ia3 ]; */
131 /* si3 = wn[2 * ia3 + 1]; */
132 /* ia1 = ia1 + ie; */
133 /* */
134 /* for (i0 = j; i0< n; i0 += n1) */
135 /* { */
136 /* i1 = i0 + n2; */
137 /* i2 = i1 + n2; */
138 /* i3 = i2 + n2; */
139 /* */
140 /* */
141 /* xh0 = x[2 * i0 ] + x[2 * i2 ]; */
142 /* xh1 = x[2 * i0 + 1] + x[2 * i2 + 1]; */
143 /* xl0 = x[2 * i0 ] - x[2 * i2 ]; */
144 /* xl1 = x[2 * i0 + 1] - x[2 * i2 + 1]; */
145 /* */
146 /* xh20 = x[2 * i1 ] + x[2 * i3 ]; */
147 /* xh21 = x[2 * i1 + 1] + x[2 * i3 + 1]; */
148 /* xl20 = x[2 * i1 ] - x[2 * i3 ]; */
149 /* xl21 = x[2 * i1 + 1] - x[2 * i3 + 1]; */
150 /* */
151 /* x[2 * i0 ] = xh0 + xh20; */
152 /* x[2 * i0 + 1] = xh1 + xh21; */
153 /* */
154 /* xt0 = xh0 - xh20; */
155 /* yt0 = xh1 - xh21; */
156 /* xt1 = xl0 + xl21; */
157 /* yt2 = xl1 + xl20; */
158 /* xt2 = xl0 - xl21; */
159 /* yt1 = xl1 - xl20; */
160 /* */
161 /* x[2 * i1 ] = (xt1 * co1 + yt1 * si1) >> 15; */
162 /* x[2 * i1 + 1] = (yt1 * co1 - xt1 * si1) >> 15; */
163 /* x[2 * i2 ] = (xt0 * co2 + yt0 * si2) >> 15; */
164 /* x[2 * i2 + 1] = (yt0 * co2 - xt0 * si2) >> 15; */
165 /* x[2 * i3 ] = (xt2 * co3 + yt2 * si3) >> 15; */
166 /* x[2 * i3 + 1] = (yt2 * co3 - xt2 * si3) >> 15; */
167 /* } */
168 /* } */
169 /* */
170 /* ie <<= 2; */
171 /* } */
172 /* } */
173 /* */
174 /* The conventional Cooley Tukey FFT, is written using three loops. */
175 /* The outermost loop "k" cycles through the stages. There are log */
176 /* N to the base 4 stages in all. The loop "j" cycles through the */
177 /* groups of butterflies with different twiddle factors, loop "i" */
178 /* reuses the twiddle factors for the different butterflies within */
179 /* a stage. It is interesting to note the following: */
180 /* */
181 /* ----------------------------------------------------------------------- */
182 /* Stage# #Groups # Butterflies with common #Groups*Bflys */
183 /* twiddle factors */
184 /* ----------------------------------------------------------------------- */
185 /* 1 N/4 1 N/4 */
186 /* 2 N/16 4 N/4 */
187 /* .. */
188 /* logN 1 N/4 N/4 */
189 /* ----------------------------------------------------------------------- */
190 /* */
191 /* The following statements can be made based on above observations: */
192 /* */
193 /* a) Inner loop "i0" iterates a veriable number of times. In */
194 /* particular the number of iterations quadruples every time from */
195 /* 1..N/4. Hence software pipelining a loop that iterates a vraiable */
196 /* number of times is not profitable. */
197 /* */
198 /* b) Outer loop "j" iterates a variable number of times as well. */
199 /* However the number of iterations is quartered every time from */
200 /* N/4 ..1. Hence the behaviour in (a) and (b) are exactly opposite */
201 /* to each other. */
202 /* */
203 /* c) If the two loops "i" and "j" are colaesced together then they */
204 /* will iterate for a fixed number of times namely N/4. This allows */
205 /* us to combine the "i" and "j" loops into 1 loop. Optimized impl- */
206 /* ementations will make use of this fact. */
207 /* */
208 /* In addition the Cooley Tukey FFT accesses three twiddle factors */
209 /* per iteration of the inner loop, as the butterflies that re-use */
210 /* twiddle factors are lumped together. This leads to accessing the */
211 /* twiddle factor array at three points each sepearted by "ie". Note */
212 /* that "ie" is initially 1, and is quadrupled with every iteration. */
213 /* Therfore these three twiddle factors are not even contiguous in */
214 /* the array. */
215 /* */
216 /* In order to vectorize the FFT, it is desirable to access twiddle */
217 /* factor array using double word wide loads and fetch the twiddle */
218 /* factors needed. In order to do this a modified twiddle factor */
219 /* array is created, in which the factors WN/4, WN/2, W3N/4 are */
220 /* arranged to be contiguous. This eliminates the seperation between */
221 /* twiddle factors within a butterfly. However this implies that as */
222 /* the loop is traversed from one stage to another, that we maintain */
223 /* a redundant version of the twiddle factor array. Hence the size */
224 /* of the twiddle factor array increases as compared to the normal */
225 /* Cooley Tukey FFT. The modified twiddle factor array is of size */
226 /* "2 * N" where the conventional Cooley Tukey FFT is of size"3N/4" */
227 /* where N is the number of complex points to be transformed. The */
228 /* routine that generates the modified twiddle factor array was */
229 /* presented earlier. With the above transformation of the FFT, */
230 /* both the input data and the twiddle factor array can be accessed */
231 /* using double-word wide loads to enable packed data processing. */
232 /* */
233 /* The final stage is optimised to remove the multiplication as */
234 /* w0 = 1. This stage also performs digit reversal on the data, */
235 /* so the final output is in natural order. */
236 /* */
237 /* The fft() code shown here performs the bulk of the computation */
238 /* in place. However, because digit-reversal cannot be performed */
239 /* in-place, the final result is written to a separate array, y[]. */
240 /* */
241 /* The actual twiddle factors for the FFT are cosine, - sine. The */
242 /* twiddle factors stored in the table are csine and sine, hence */
243 /* the sign of the "sine" term is comprehended during multipli- */
244 /* cation as shown above. */
245 /* */
246 /* MEMORY NOTE */
247 /* The optimized implementations are written for LITTLE ENDIAN. */
248 /* */
249 /* Copyright (C) 2011 Texas Instruments Incorporated - http://www.ti.com/ */
250 /* */
251 /* */
252 /* Redistribution and use in source and binary forms, with or without */
253 /* modification, are permitted provided that the following conditions */
254 /* are met: */
255 /* */
256 /* Redistributions of source code must retain the above copyright */
257 /* notice, this list of conditions and the following disclaimer. */
258 /* */
259 /* Redistributions in binary form must reproduce the above copyright */
260 /* notice, this list of conditions and the following disclaimer in the */
261 /* documentation and/or other materials provided with the */
262 /* distribution. */
263 /* */
264 /* Neither the name of Texas Instruments Incorporated nor the names of */
265 /* its contributors may be used to endorse or promote products derived */
266 /* from this software without specific prior written permission. */
267 /* */
268 /* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS */
269 /* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT */
270 /* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR */
271 /* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT */
272 /* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, */
273 /* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT */
274 /* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, */
275 /* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY */
276 /* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
277 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE */
278 /* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
279 /* */
280 /* ======================================================================= */
282 #pragma CODE_SECTION(DSP_fft16x16_i, ".text:optimized");
284 #include "DSP_fft16x16_i.h"
286 static inline void radix_2(short *restrict ptr_x, short *restrict ptr_y, int npoints);
287 static inline void radix_4(short *restrict ptr_x, short *restrict ptr_y, int npoints);
289 #ifdef _LITTLE_ENDIAN
290 void DSP_fft16x16_i (
291 const short * restrict ptr_w,
292 int npoints,
293 short * restrict ptr_x,
294 short * restrict ptr_y
295 )
296 {
297 const short * restrict w;
298 const short * restrict w2;
299 short * restrict x, * restrict x2, * restrict x0;
301 int j, i, l1, radix;
302 int l2, h2, fft_jmp;
303 int predj, tw_offset, stride;
305 int x_1o_x_0o, x_3o_x_2o;
306 int xl2_1_0, xl2_3_2;
308 unsigned int myt0_0_mxt0_0, myt0_1_mxt0_1;
309 unsigned int xl1_1_0, xl1_3_2, xh2_1_0, xh2_3_2;
310 unsigned int yt1_0_xt1_0, yt1_1_xt1_1, yt2_0_xt2_0, yt2_1_xt2_1;
312 unsigned int xl0_0_xl1_0, xt1_0_yt2_0;
313 unsigned int xt2_0_yt1_0, xl0_1_xl1_1;
314 unsigned int xt1_1_yt2_1, xt2_1_yt1_1;
316 unsigned int xh1_0_xh0_0, xh1_1_xh0_1;
317 unsigned int xl1_0_xl0_0, xl1_1_xl0_1;
318 unsigned int xh21_0_xh20_0, xh21_1_xh20_1;
319 unsigned int xl21_0_xl20_0, xl21_1_xl20_1;
321 double x_3210;
322 double co11si11_co10si10, co21si21_co20si20, co31si31_co30si30;
323 double x_l1_32_x_l1_10, x_l2_32_x_l2_10, x_h2_32_x_h2_10;
325 long long xt1_0_yt2_0_xt2_0_yt1_0, xt1_1_yt2_1_xt2_1_yt1_1;
326 long long yt1_0_xt1_0_yt2_0_xt2_0, yt1_1_xt1_1_yt2_1_xt2_1;
327 long long xh1_0_xh0_0_xl1_0_xl0_0, xh1_1_xh0_1_xl1_1_xl0_1;
328 long long xh21_0_xh20_0_xl21_0_xl20_0, xh21_1_xh20_1_xl21_1_xl20_1;
330 /*---------------------------------------------------------------------*/
331 /* Determine the magnitude od the number of points to be transformed. */
332 /* Check whether we can use a radix4 decomposition or a mixed radix */
333 /* transformation, by determining modulo 2. */
334 /*---------------------------------------------------------------------*/
335 radix = _norm(npoints) & 1 ? 2 : 4;
337 /*----------------------------------------------------------------------*/
338 /* The stride is quartered with every iteration of the outer loop. It */
339 /* denotes the seperation between any two adjacent inputs to the butter */
340 /* -fly. This should start out at N/4, hence stride is initially set to */
341 /* N. For every stride, 6*stride twiddle factors are accessed. The */
342 /* "tw_offset" is the offset within the current twiddle factor sub- */
343 /* table. This is set to zero, at the start of the code and is used to */
344 /* obtain the appropriate sub-table twiddle pointer by offseting it */
345 /* with the base pointer "ptr_w". */
346 /*----------------------------------------------------------------------*/
347 stride = npoints;
348 tw_offset = 0;
349 fft_jmp = 6 * stride;
351 _nassert(stride > 4);
352 #pragma MUST_ITERATE(1,,1);
353 while (stride > radix) {
354 /*-----------------------------------------------------------------*/
355 /* At the start of every iteration of the outer loop, "j" is set */
356 /* to zero, as "w" is pointing to the correct location within the */
357 /* twiddle factor array. For every iteration of the inner loop */
358 /* 6 * stride twiddle factors are accessed. For eg, */
359 /* */
360 /* #Iteration of outer loop # twiddle factors #times cycled */
361 /* 1 6 N/4 1 */
362 /* 2 6 N/16 4 */
363 /* ... */
364 /*-----------------------------------------------------------------*/
365 j = 0;
366 fft_jmp >>= 2;
368 /*-----------------------------------------------------------------*/
369 /* Set up offsets to access "N/4", "N/2", "3N/4" complex point or */
370 /* "N/2", "N", "3N/2" half word */
371 /*-----------------------------------------------------------------*/
372 h2 = stride >> 1;
373 l1 = stride;
374 l2 = stride + (stride >> 1);
376 /*-----------------------------------------------------------------*/
377 /* Reset "x" to point to the start of the input data array. */
378 /* "tw_offset" starts off at 0, and increments by "6 * stride" */
379 /* The stride quarters with every iteration of the outer loop */
380 /*-----------------------------------------------------------------*/
381 x = ptr_x;
382 w = ptr_w + tw_offset;
383 tw_offset += fft_jmp;
384 stride >>= 2;
386 /*----------------------------------------------------------------*/
387 /* The following loop iterates through the different butterflies, */
388 /* within a given stage. Recall that there are logN to base 4 */
389 /* stages. Certain butterflies share the twiddle factors. These */
390 /* are grouped together. On the very first stage there are no */
391 /* butterflies that share the twiddle factor, all N/4 butter- */
392 /* flies have different factors. On the next stage two sets of */
393 /* N/8 butterflies share the same twiddle factor. Hence after */
394 /* half the butterflies are performed, j the index into the */
395 /* factor array resets to 0, and the twiddle factors are reused. */
396 /* When this happens, the data pointer 'x' is incremented by the */
397 /* fft_jmp amount. In addition the following code is unrolled to */
398 /* perform "2" radix4 butterflies in parallel. */
399 /*----------------------------------------------------------------*/
400 _nassert(npoints >= 16);
401 #pragma MUST_ITERATE(2,,2);
402 for (i = 0; i < npoints; i += 8) {
403 /*------------------------------------------------------------*/
404 /* Read the first 12 twiddle factors, six of which are used */
405 /* for one radix 4 butterfly and six of which are used for */
406 /* next one. */
407 /* */
408 /* co10 = w[j+1]; si10 = w[j+0]; */
409 /* co11 = w[j+3]; si11 = w[j+2]; */
410 /* co20 = w[j+5]; si20 = w[j+4]; */
411 /* co21 = w[j+7]; si21 = w[j+6]; */
412 /* co30 = w[j+9]; si30 = w[j+8]; */
413 /* co31 = w[j+11]; si31 = w[j+10]; */
414 /* */
415 /* The code shown above performs half word wide loads. The */
416 /* C64x can perform double word wide loads. This is done by */
417 /* the use of the _amemd8_const intrinsic. The const quali- */
418 /* fies the fact that this array will only be read from and */
419 /* not written to. */
420 /*------------------------------------------------------------*/
421 w2 = w + j;
422 co11si11_co10si10 = _amemd8_const(w2);
423 co21si21_co20si20 = _amemd8_const(&w2[4]);
424 co31si31_co30si30 = _amemd8_const(&w2[8]);
426 /*------------------------------------------------------------*/
427 /* Read in the first complex input for the butterflies. */
428 /* 1st complex input to 1st butterfly: x[0] + jx[1] */
429 /* 1st complex input to 2nd butterfly: x[2] + jx[3] */
430 /* Read in the complex inputs for the butterflies. Each of the*/
431 /* successive complex inputs of the butterfly are seperated */
432 /* by a fixed amount known as stride. The stride starts out */
433 /* at N/4, and quarters with every stage. */
434 /* */
435 /* x_0 = x[0]; x_1 = x[1]; */
436 /* x_2 = x[2]; x_3 = x[3]; */
437 /* x_l1_0 = x[l1 ]; x_l1_1 = x[l1 + 1]; */
438 /* x_l1_2 = x[l1 + 2]; x_l1_3 = x[l1 + 3]; */
439 /* x_l2_0 = x[l2 ]; x_l2_1 = x[l2 + 1]; */
440 /* x_l2_2 = x[l2 + 2]; x_l2_3 = x[l2 + 3]; */
441 /* x_h2_0 = x[h2 ]; x_h2_1 = x[h2 + 1]; */
442 /* x_h2_2 = x[h2 + 2]; x_h2_3 = x[h2 + 3]; */
443 /* */
444 /* These loads are performed using _amemd8. Note that there */
445 /* is no const qualifier in this case as this loop writes */
446 /* back results into the data array "x". The "a" in _amemd8 */
447 /* qualifies the fact that the loads are aligned. */
448 /*------------------------------------------------------------*/
449 x_3210 = _amemd8(&x[0]);
450 x_l1_32_x_l1_10 = _amemd8(&x[l1]);
451 x_l2_32_x_l2_10 = _amemd8(&x[l2]);
452 x_h2_32_x_h2_10 = _amemd8(&x[h2]);
454 /*-----------------------------------------------------------*/
455 /* Two butterflies are evaluated in parallel. The following */
456 /* results will be shown for one butterfly only, although */
457 /* both are being evaluated in parallel. */
458 /* */
459 /* Perform radix2 style DIF butterflies. */
460 /* */
461 /* xh0_0 = x_0 + x_l1_0; xh1_0 = x_1 + x_l1_1; */
462 /* xh0_1 = x_2 + x_l1_2; xh1_1 = x_3 + x_l1_3; */
463 /* xl0_0 = x_0 - x_l1_0; xl1_0 = x_1 - x_l1_1; */
464 /* xl0_1 = x_2 - x_l1_2; xl1_1 = x_3 - x_l1_3; */
465 /* xh20_0 = x_h2_0 + x_l2_0; xh21_0 = x_h2_1 + x_l2_1; */
466 /* xh20_1 = x_h2_2 + x_l2_2; xh21_1 = x_h2_3 + x_l2_3; */
467 /* xl20_0 = x_h2_0 - x_l2_0; xl21_0 = x_h2_1 - x_l2_1; */
468 /* xl20_1 = x_h2_2 - x_l2_2; xl21_1 = x_h2_3 - x_l2_3; */
469 /* */
470 /* These are done using packed data processing, and these */
471 /* are done using _add2, _sub2. */
472 /* For example "xh1_0_xh0_0" has a packed value xh1_0, */
473 /* xh0_0. */
474 /*-----------------------------------------------------------*/
475 xh1_0_xh0_0_xl1_0_xl0_0 = _addsub2((_lo(x_3210)), (_lo(x_l1_32_x_l1_10)));
476 xh1_1_xh0_1_xl1_1_xl0_1 = _addsub2((_hi(x_3210)), (_hi(x_l1_32_x_l1_10)));
478 xh1_0_xh0_0 = (_hill(xh1_0_xh0_0_xl1_0_xl0_0));
479 xh1_1_xh0_1 = (_hill(xh1_1_xh0_1_xl1_1_xl0_1));
480 xl1_0_xl0_0 = (_loll(xh1_0_xh0_0_xl1_0_xl0_0));
481 xl1_1_xl0_1 = (_loll(xh1_1_xh0_1_xl1_1_xl0_1));
483 xh21_0_xh20_0_xl21_0_xl20_0 = _addsub2((_lo(x_h2_32_x_h2_10)), (_lo(x_l2_32_x_l2_10)));
484 xh21_1_xh20_1_xl21_1_xl20_1 = _addsub2((_hi(x_h2_32_x_h2_10)), (_hi(x_l2_32_x_l2_10)));
486 xh21_0_xh20_0 = (_hill(xh21_0_xh20_0_xl21_0_xl20_0));
487 xh21_1_xh20_1 = (_hill(xh21_1_xh20_1_xl21_1_xl20_1));
488 xl21_0_xl20_0 = (_loll(xh21_0_xh20_0_xl21_0_xl20_0));
489 xl21_1_xl20_1 = (_loll(xh21_1_xh20_1_xl21_1_xl20_1));
491 /*----------------------------------------------------------*/
492 /* Derive output pointers using the input pointer "x" */
493 /*----------------------------------------------------------*/
494 x0 = (short *)_mvd((int)x);
495 x2 = (short *)_mvd((int)x0);
497 /*-----------------------------------------------------------*/
498 /* When the twiddle factors are not to be re-used, j is */
499 /* incremented by 12, to reflect the fact that 12 half words */
500 /* are consumed in every iteration. The input data pointer */
501 /* increments by 4. Note that within a stage, the stride */
502 /* does not change and hence the offsets for the other three */
503 /* legs, 0, h2, l1, l2. */
504 /*-----------------------------------------------------------*/
505 j += 12;
506 x += 4;
508 predj = (j - fft_jmp);
509 if (!predj) x += fft_jmp;
510 if (!predj) j = 0;
512 /*----------------------------------------------------------*/
513 /* x0[0] = (xh0_0 + xh20_0 + 1) >> 1; */
514 /* x0[1] = (xh1_0 + xh21_0 + 1) >> 1; */
515 /* x0[2] = (xh0_1 + xh20_1 + 1) >> 1; */
516 /* x0[3] = (xh1_1 + xh21_1 + 1) >> 1; */
517 /* */
518 /* "y0x0" and "y1x1" are the first four outputs of the */
519 /* radix4 butterfly. These are computed on the C64x */
520 /* using _avg2. */
521 /*----------------------------------------------------------*/
522 x_1o_x_0o = _avg2(xh21_0_xh20_0, xh1_0_xh0_0);
523 x_3o_x_2o = _avg2(xh21_1_xh20_1, xh1_1_xh0_1);
525 /*----------------------------------------------------------*/
526 /* Perform one more stage of radix2 decompositions as shown */
527 /* These are performed using packed data processing. */
528 /* */
529 /* xt0_0 = xh0_0 - xh20_0; yt0_0 = xh1_0 - xh21_0; */
530 /* xt0_1 = xh0_1 - xh20_1; yt0_1 = xh1_1 - xh21_1; */
531 /* xt1_0 = xl0_0 + xl21_0; yt2_0 = xl1_0 + xl20_0; */
532 /* xt2_0 = xl0_0 - xl21_0; yt1_0 = xl1_0 - xl20_0; */
533 /* xt1_1 = xl0_1 + xl21_1; yt2_1 = xl1_1 + xl20_1; */
534 /* xt2_1 = xl0_1 - xl21_1; yt1_1 = xl1_1 - xl20_1; */
535 /* */
536 /*----------------------------------------------------------*/
537 myt0_0_mxt0_0 = _sub2(xh21_0_xh20_0, xh1_0_xh0_0);
538 myt0_1_mxt0_1 = _sub2(xh21_1_xh20_1, xh1_1_xh0_1);
540 /*----------------------------------------------------------*/
541 /* Repack coefficients to sustain packed data processing. */
542 /* This requires a swizzle so that intermediate results */
543 /* end up in the right spot. This is needed to compute */
544 /* other intermediate results using add2 and sub2. */
545 /*----------------------------------------------------------*/
546 xl0_0_xl1_0 = _rotl(xl1_0_xl0_0, 16);
547 xl0_1_xl1_1 = _rotl(xl1_1_xl0_1, 16);
549 //;----------------------------------------------------------;
550 //; xt1_0 = xl0_0 + xl21_0 yt2_0 = xl1_0 + xl20_0 ;
551 //; xt1_1 = xl0_1 + xl21_1 yt2_1 = xl1_1 + xl20_1 ;
552 //; xt2_0 = xl0_0 - xl21_0 yt1_0 = xl1_0 - xl20_0 ;
553 //; xt2_1 = xl0_1 - xl21_1 yt1_1 = xl1_1 - xl20_1 ;
554 //;----------------------------------------------------------;
555 xt1_0_yt2_0_xt2_0_yt1_0 = _addsub2(xl0_0_xl1_0, xl21_0_xl20_0);
556 xt1_1_yt2_1_xt2_1_yt1_1 = _addsub2(xl0_1_xl1_1, xl21_1_xl20_1);
558 xt1_0_yt2_0 = (_hill(xt1_0_yt2_0_xt2_0_yt1_0));
559 xt1_1_yt2_1 = (_hill(xt1_1_yt2_1_xt2_1_yt1_1));
560 xt2_0_yt1_0 = (_loll(xt1_0_yt2_0_xt2_0_yt1_0));
561 xt2_1_yt1_1 = (_loll(xt1_1_yt2_1_xt2_1_yt1_1));
563 /*----------------------------------------------------------*/
564 /* Repack results so that xt_XXyt_XX are a packed 32 bit */
565 /* quantity. The operations performed above tend to leave */
566 /* the results as xt_XXyt_YY and xt_YYyt_XX. This is where */
567 /* the packed instructions come in handy. For example */
568 /* xt_XXyt_XX = _packlh2(xt_XXyt_YY, xt_YY_yt_XX) */
569 /*----------------------------------------------------------*/
570 yt1_0_xt1_0_yt2_0_xt2_0 = _dpackx2(xt1_0_yt2_0, xt2_0_yt1_0);
571 yt1_1_xt1_1_yt2_1_xt2_1 = _dpackx2(xt1_1_yt2_1, xt2_1_yt1_1);
573 yt1_0_xt1_0 = (_hill(yt1_0_xt1_0_yt2_0_xt2_0));
574 yt1_1_xt1_1 = (_hill(yt1_1_xt1_1_yt2_1_xt2_1));
575 yt2_0_xt2_0 = (_loll(yt1_0_xt1_0_yt2_0_xt2_0));
576 yt2_1_xt2_1 = (_loll(yt1_1_xt1_1_yt2_1_xt2_1));
578 /*---------------------------------------------------------*/
579 /* Notice that in this version of the code the two middle */
580 /* legs are swapped as indicated by the stores to x[l1].. */
581 /* x[l1 + 3] which preceede the stores to x[h2]..x[h2 + 3] */
582 /* This reversal guarantees that a radix4 DIF butterfly */
583 /* produces results in digit reversed order. Note that in */
584 /* addition to the rounding, the shift is performed by 16, */
585 /* as opposed to 15, to give scaling. */
586 /*---------------------------------------------------------*/
588 /*---------------------------------------------------------*/
589 /* */
590 /* x2[l1 ] = (si10 * yt1_0 + co10 * xt1_0 + 0x8000) >> 16;*/
591 /* x2[l1+1] = (co10 * yt1_0 - si10 * xt1_0 + 0x8000) >> 16;*/
592 /* x2[l1+2] = (si11 * yt1_1 + co11 * xt1_1 + 0x8000) >> 16;*/
593 /* x2[l1+3] = (co11 * yt1_1 - si11 * xt1_1 + 0x8000) >> 16;*/
594 /* */
595 /* These four results are retained in registers and a */
596 /* double word is formed so that it can be stored with */
597 /* one STDW. */
598 /*---------------------------------------------------------*/
599 xh2_1_0 = _cmpyr((_lo(co11si11_co10si10)), yt1_0_xt1_0);
600 xh2_3_2 = _cmpyr((_hi(co11si11_co10si10)), yt1_1_xt1_1);
602 /*---------------------------------------------------------*/
603 /* The following code computes intermediate results for: */
604 /* */
605 /* x2[h2 ] = (si20 * yt0_0 + co20 * xt0_0 + 0x8000) >> 16;*/
606 /* x2[h2+1] = (co20 * yt0_0 - si20 * xt0_0 + 0x8000) >> 16;*/
607 /* x2[h2+2] = (si21 * yt0_1 + co21 * xt0_1 + 0x8000) >> 16;*/
608 /* x2[h2+3] = (co21 * yt0_1 - si21 * xt0_1 + 0x8000) >> 16;*/
609 /*---------------------------------------------------------*/
610 xl1_1_0 = _cmpyr((_lo(co21si21_co20si20)), myt0_0_mxt0_0);
611 xl1_3_2 = _cmpyr((_hi(co21si21_co20si20)), myt0_1_mxt0_1);
613 /*---------------------------------------------------------*/
614 /* The following code computes intermediate results for: */
615 /* */
616 /* x2[l2 ] = (si30 * yt2_0 + co30 * xt2_0 + 0x8000) >> 16;*/
617 /* x2[l2+1] = (co30 * yt2_0 - si30 * xt2_0 + 0x8000) >> 16;*/
618 /* x2[l2+2] = (si31 * yt2_1 + co31 * xt2_1 + 0x8000) >> 16;*/
619 /* x2[l2+3] = (co31 * yt2_1 - si31 * xt2_1 + 0x8000) >> 16;*/
620 /*---------------------------------------------------------*/
621 xl2_1_0 = _cmpyr((_lo(co31si31_co30si30)), yt2_0_xt2_0);
622 xl2_3_2 = _cmpyr((_hi(co31si31_co30si30)), yt2_1_xt2_1);
624 /*---------------------------------------------------------*/
625 /* Prepare the doubleword for each of the four outputs, by */
626 /* the use of the _itod intrinsic that takes integers, and */
627 /* forms a double word wide quantity. */
628 /* Store out the outputs to the four legs of the butterfly */
629 /* using aligned store double words. Notice the use of the */
630 /* indices "l1", "l2", "h2" to derive the pointers for the */
631 /* legs of the butterfly. */
632 /*---------------------------------------------------------*/
633 _amemd8(&x2[0]) = _itod(x_3o_x_2o, x_1o_x_0o);
634 _amemd8(&x2[l1]) = _itod(xl1_3_2, xl1_1_0);
635 _amemd8(&x2[h2]) = _itod(xh2_3_2, xh2_1_0);
636 _amemd8(&x2[l2]) = _itod(xl2_3_2, xl2_1_0);
637 }
638 }
640 if (radix == 2)
641 radix_2(ptr_x, ptr_y, npoints);
642 else if (radix == 4)
643 radix_4(ptr_x, ptr_y, npoints);
645 return;
646 }
648 void radix_2(
649 short *restrict ptr_x,
650 short *restrict ptr_y,
651 int npoints
652 )
653 {
654 short * restrict x2, * restrict x0;
655 short * restrict y0, * restrict y1, * restrict y2, * restrict y3;
656 short n0, j0;
658 int i, j, l1, h2;
659 int y_10, y_32, y_54, y_76, y_98, y_BA, y_DC, y_FE;
661 double x_2301, x_6745, x_AB89, x_EFCD;
663 /*-----------------------------------------------------------------*/
664 /* The following code performs either a standard radix4 pass or a */
665 /* radix2 pass. Two pointers are used to access the input data. */
666 /* The input data is read "N/4" complex samples apart or "N/2" */
667 /* words apart using pointers "x0" and "x2". This produces out- */
668 /* puts that are 0, N/4, N/2, 3N/4 for a radix4 FFT, and 0, N/8 */
669 /* N/2, 3N/8 for radix 2. */
670 /*-----------------------------------------------------------------*/
671 y0 = ptr_y;
672 y2 = ptr_y + (int)npoints;
673 x0 = ptr_x;
674 x2 = ptr_x + (int)(npoints >> 1);
676 /*----------------------------------------------------------------*/
677 /* The pointers are set at the following locations which are half */
678 /* the offsets of a radix4 FFT. */
679 /*----------------------------------------------------------------*/
680 y1 = y0 + (int)(npoints >> 2);
681 y3 = y2 + (int)(npoints >> 2);
682 l1 = _norm(npoints) + 1;
683 j0 = 8;
684 n0 = npoints >> 1;
686 /*--------------------------------------------------------------------*/
687 /* The following code reads data indentically for either a radix 4 */
688 /* or a radix 2 style decomposition. It writes out at different */
689 /* locations though. It checks if either half the points, or a */
690 /* quarter of the complex points have been exhausted to jump to */
691 /* pervent double reversal. */
692 /*--------------------------------------------------------------------*/
693 j = 0;
695 _nassert((int)(n0) % 4 == 0);
696 _nassert((int)(ptr_x) % 8 == 0);
697 _nassert((int)(ptr_y) % 8 == 0);
698 _nassert((int)(x0) % 8 == 0);
699 _nassert((int)(x2) % 8 == 0);
700 _nassert((int)(y0) % 8 == 0);
701 _nassert(npoints >= 16);
702 #pragma MUST_ITERATE(2,,2);
703 for (i = 0; i < npoints; i += 8) {
704 /*----------------------------------------------------------------*/
705 /* Digit reverse the index starting from 0. The increment to "j" */
706 /* is either by 4, or 8. */
707 /*----------------------------------------------------------------*/
708 h2 = _deal(j);
709 h2 = _bitr(h2);
710 h2 = _rotl(h2, 16);
711 h2 = _shfl(h2);
712 h2 >>= l1;
713 /*----------------------------------------------------------------*/
714 /* Read in the input data. These are transformed as a radix2. */
715 /*----------------------------------------------------------------*/
716 x_2301 = _amemd8(&x0[0]);
717 x_6745 = _amemd8(&x0[4]);
718 x0 += 8;
719 x_AB89 = _amemd8(&x2[0]);
720 x_EFCD = _amemd8(&x2[4]);
721 x2 += 8;
723 /*----------------------------------------------------------------------------*/
724 /* Perform radix2 style decomposition. */
725 /*----------------------------------------------------------------------------*/
726 /* n00 = x_0 + x_2; n01 = x_1 + x_3; y_10 = n00_n01 = _add2(x_01, x_23) */
727 /* n20 = x_0 - x_2; n21 = x_1 - x_3; y_98 = n20_n21 = _sub2(x_01, x_23) */
728 /* n10 = x_4 + x_6; n11 = x_5 + x_7; y_54 = n10_n11 = _add2(x_45, x_67) */
729 /* n30 = x_4 - x_6; n31 = x_5 - x_7; y_DC = n30_n31 = _sub2(x_45, x_67) */
730 /* n02 = x_8 + x_a; n03 = x_9 + x_b; y_32 = n03_n02 = _add2(x_89, x_AB) */
731 /* n12 = x_c + x_e; n13 = x_d + x_f; y_76 = n12_n13 = _add2(x_CD, x_EF) */
732 /* n22 = x_8 - x_a; n23 = x_9 - x_b; y_BA = n22_n23 = _sub2(x_89, x_AB) */
733 /* n32 = x_c - x_e; n33 = x_d - x_f; y_FE = n32_n33 = _sub2(x_CD, x_EF) */
735 y_10 = _add2(_lo(x_2301), _hi(x_2301));
736 y_98 = _sub2(_lo(x_2301), _hi(x_2301));
737 y_54 = _add2(_lo(x_6745), _hi(x_6745));
738 y_DC = _sub2(_lo(x_6745), _hi(x_6745));
739 y_32 = _add2(_lo(x_AB89), _hi(x_AB89));
740 y_76 = _add2(_lo(x_EFCD), _hi(x_EFCD));
741 y_BA = _sub2(_lo(x_AB89), _hi(x_AB89));
742 y_FE = _sub2(_lo(x_EFCD), _hi(x_EFCD));
744 /*----Imaginary------------Real-----------*/
745 /* y0[2*h2+0] = n01; y0[2*h2+1] = n00; */
746 /* y0[2*h2+2] = n03; y0[2*h2+3] = n02; */
747 /* y1[2*h2+0] = n11; y1[2*h2+1] = n10; */
748 /* y1[2*h2+2] = n13; y1[2*h2+3] = n12; */
749 /* y2[2*h2+0] = n21; y2[2*h2+1] = n20; */
750 /* y2[2*h2+2] = n23; y2[2*h2+3] = n22; */
751 /* y3[2*h2+0] = n31; y3[2*h2+1] = n30; */
752 /* y3[2*h2+2] = n33; y3[2*h2+3] = n32; */
754 /*------------------------------------------------------------*/
755 /* Store out the results of all four butterflies as double */
756 /* words. */
757 /*------------------------------------------------------------*/
758 _amemd8(&y0[2 * h2]) = _itod(y_32, y_10);
759 _amemd8(&y1[2 * h2]) = _itod(y_76, y_54);
760 _amemd8(&y2[2 * h2]) = _itod(y_BA, y_98);
761 _amemd8(&y3[2 * h2]) = _itod(y_FE, y_DC);
763 j += j0;
764 if (j == n0) {
765 j += n0;
766 x0 += (int)npoints >> 1;
767 x2 += (int)npoints >> 1;
768 }
769 }
770 }
772 void radix_4(
773 short *restrict ptr_x,
774 short *restrict ptr_y,
775 int npoints
776 )
777 {
778 short * restrict x2, * restrict x0;
779 short * restrict y0, * restrict y1, * restrict y2, * restrict y3;
780 short n0, j0;
782 int i, j, l1, h2;
783 int xh0_0_xh1_0, xh0_1_xh1_1, xh0_2_xh1_2, xh0_3_xh1_3;
784 int xl0_0_xl1_0, xl0_1_xl1_1, xl1_1_xl0_1, xl0_2_xl1_2, xl0_3_xl1_3, xl1_3_xl0_3;
785 int n10_n11, n30_n31, n12_n13, n32_n33;
787 double x_2301, x_6745, x_AB89, x_EFCD;
789 long long n00_n01_n20_n21, n10_n31_n30_n11, n02_n03_n22_n23, n12_n33_n32_n13;
791 /*-----------------------------------------------------------------*/
792 /* The following code performs either a standard radix4 pass or a */
793 /* radix2 pass. Two pointers are used to access the input data. */
794 /* The input data is read "N/4" complex samples apart or "N/2" */
795 /* words apart using pointers "x0" and "x2". This produces out- */
796 /* puts that are 0, N/4, N/2, 3N/4 for a radix4 FFT, and 0, N/8 */
797 /* N/2, 3N/8 for radix 2. */
798 /*-----------------------------------------------------------------*/
799 y0 = ptr_y;
800 y2 = ptr_y + (int)npoints;
801 x0 = ptr_x;
802 x2 = ptr_x + (int)(npoints >> 1);
804 /*----------------------------------------------------------------*/
805 /* The pointers are set at the following locations which are half */
806 /* the offsets of a radix4 FFT. */
807 /*----------------------------------------------------------------*/
808 y3 = y2 + (int)(npoints >> 1);
809 y1 = y0 + (int)(npoints >> 1);
810 l1 = _norm(npoints) + 2;
811 j0 = 4;
812 n0 = npoints >> 2;
814 /*--------------------------------------------------------------------*/
815 /* The following code reads data indentically for either a radix 4 */
816 /* or a radix 2 style decomposition. It writes out at different */
817 /* locations though. It checks if either half the points, or a */
818 /* quarter of the complex points have been exhausted to jump to */
819 /* pervent double reversal. */
820 /*--------------------------------------------------------------------*/
821 j = 0;
823 _nassert((int)(n0) % 4 == 0);
824 _nassert((int)(x0) % 8 == 0);
825 _nassert((int)(x2) % 8 == 0);
826 _nassert((int)(y0) % 8 == 0);
827 _nassert(npoints >= 16);
828 #pragma MUST_ITERATE(2,,2);
830 for (i = 0; i < npoints; i += 8) {
831 /*----------------------------------------------------------------*/
832 /* Digit reverse the index starting from 0. The increment to "j" */
833 /* is either by 4, or 8. */
834 /*----------------------------------------------------------------*/
835 h2 = _deal(j);
836 h2 = _bitr(h2);
837 h2 = _rotl(h2, 16);
838 h2 = _shfl(h2);
839 h2 >>= l1;
841 /*----------------------------------------------------------------*/
842 /* Read in the input data. These are transformed as a radix4. */
843 /*----------------------------------------------------------------*/
844 x_2301 = _amemd8(&x0[0]);
845 x_6745 = _amemd8(&x0[4]);
846 x0 += 8;
847 x_AB89 = _amemd8(&x2[0]);
848 x_EFCD = _amemd8(&x2[4]);
849 x2 += 8;
851 /*------------------------------------------*/
852 /* Perform radix4 style decomposition. */
853 /*------------------------------------------*/
854 /* xh0_0 = x_0 + x_4; xh1_0 = x_1 + x_5; */
855 /* xl0_0 = x_0 - x_4; xl1_0 = x_1 - x_5; */
856 /* xh0_1 = x_2 + x_6; xh1_1 = x_3 + x_7; */
857 /* xl0_1 = x_2 - x_6; xl1_1 = x_3 - x_7; */
858 xh0_0_xh1_0 = _add2(_lo(x_2301), _lo(x_6745));
859 xl0_0_xl1_0 = _sub2(_lo(x_2301), _lo(x_6745));
860 xh0_1_xh1_1 = _add2(_hi(x_2301), _hi(x_6745));
861 xl0_1_xl1_1 = _sub2(_hi(x_2301), _hi(x_6745));
863 /* n00 = xh0_0 + xh0_1; n20 = xh0_0 - xh0_1; */
864 /* n01 = xh1_0 + xh1_1; n21 = xh1_0 - xh1_1; */
865 n00_n01_n20_n21 = _addsub2(xh0_0_xh1_0, xh0_1_xh1_1);
867 /* n30 = xl0_0 + xl1_1; n10 = xl0_0 - xl1_1; */
868 /* n11 = xl1_0 + xl0_1; n31 = xl1_0 - xl0_1; */
869 xl1_1_xl0_1 = _rotl(xl0_1_xl1_1, 16);
870 n10_n31_n30_n11 = _addsub2(xl0_0_xl1_0, xl1_1_xl0_1);
872 n30_n31 = _packhl2(_hill(n10_n31_n30_n11), _loll(n10_n31_n30_n11));
873 n10_n11 = _packhl2(_loll(n10_n31_n30_n11), _hill(n10_n31_n30_n11));
875 /*------------------------------------------*/
876 /* Perform radix4 style decomposition. */
877 /*------------------------------------------*/
878 /* xh0_2 = x_8 + x_c; xh1_2 = x_9 + x_d; */
879 /* xl0_2 = x_8 - x_c; xl1_2 = x_9 - x_d; */
880 /* xh0_3 = x_a + x_e; xh1_3 = x_b + x_f; */
881 /* xl0_3 = x_a - x_e; xl1_3 = x_b - x_f; */
882 xh0_2_xh1_2 = _add2(_lo(x_AB89), _lo(x_EFCD));
883 xl0_2_xl1_2 = _sub2(_lo(x_AB89), _lo(x_EFCD));
884 xh0_3_xh1_3 = _add2(_hi(x_AB89), _hi(x_EFCD));
885 xl0_3_xl1_3 = _sub2(_hi(x_AB89), _hi(x_EFCD));
887 /* n02 = xh0_2 + xh0_3; n22 = xh0_2 - xh0_3; */
888 /* n03 = xh1_2 + xh1_3; n23 = xh1_2 - xh1_3; */
889 n02_n03_n22_n23 = _addsub2(xh0_2_xh1_2, xh0_3_xh1_3);
891 /* n32 = xl0_2 + xl1_3; n12 = xl0_2 - xl1_3; */
892 /* n13 = xl1_2 + xl0_3; n33 = xl1_2 - xl0_3; */
893 xl1_3_xl0_3 = _rotl(xl0_3_xl1_3, 16);
894 n12_n33_n32_n13 = _addsub2(xl0_2_xl1_2, xl1_3_xl0_3);
896 n32_n33 = _packhl2(_hill(n12_n33_n32_n13), _loll(n12_n33_n32_n13));
897 n12_n13 = _packhl2(_loll(n12_n33_n32_n13), _hill(n12_n33_n32_n13));
899 /*-----------------------------------------------------------------*/
900 /* Points that are read from succesive locations map to y, y[N/4] */
901 /* y[N/2], y[3N/4] in a radix4 scheme, y, y[N/8], y[N/2],y[5N/8] */
902 /*-----------------------------------------------------------------*/
904 /*----Imaginary----------Real-----------*/
905 /* y0[2*h2+0] = n01; y0[2*h2+1] = n00; */
906 /* y0[2*h2+2] = n03; y0[2*h2+3] = n02; */
907 /* y1[2*h2+0] = n11; y1[2*h2+1] = n10; */
908 /* y1[2*h2+2] = n13; y1[2*h2+3] = n12; */
909 /* y2[2*h2+0] = n21; y2[2*h2+1] = n20; */
910 /* y2[2*h2+2] = n23; y2[2*h2+3] = n22; */
911 /* y3[2*h2+0] = n31; y3[2*h2+1] = n30; */
912 /* y3[2*h2+2] = n33; y3[2*h2+3] = n32; */
914 _amemd8(&y0[2 * h2]) = _itod(_hill(n02_n03_n22_n23), _hill(n00_n01_n20_n21));
915 _amemd8(&y1[2 * h2]) = _itod(n12_n13, n10_n11);
916 _amemd8(&y2[2 * h2]) = _itod(_loll(n02_n03_n22_n23), _loll(n00_n01_n20_n21));
917 _amemd8(&y3[2 * h2]) = _itod(n32_n33, n30_n31);
919 j += j0;
920 if (j == n0) {
921 j += n0;
922 x0 += (int)npoints >> 1;
923 x2 += (int)npoints >> 1;
924 }
925 }
926 }
928 #else
929 void DSP_fft16x16_i (
930 const short * restrict ptr_w,
931 int npoints,
932 short * restrict ptr_x,
933 short * restrict ptr_y
934 )
935 {
936 const short * restrict w;
937 const short * restrict w2;
938 short * restrict x, * restrict x2, * restrict x0;
940 int j, i, l1, radix;
941 int l2, h2, fft_jmp;
942 int predj, tw_offset, stride;
944 int x_0o_x_1o, x_2o_x_3o;
945 int xl2_0_1, xl2_2_3;
947 unsigned int mxt0_0_myt0_0, mxt0_1_myt0_1;
948 unsigned int xl1_0_1, xl1_2_3, xh2_0_1, xh2_2_3;
949 unsigned int xt1_0_yt1_0, xt1_1_yt1_1, xt2_0_yt2_0, xt2_1_yt2_1;
951 unsigned int xl1_0_xl0_0, yt1_0_xt2_0;
952 unsigned int yt2_0_xt1_0, xl1_1_xl0_1;
953 unsigned int yt2_1_xt1_1, yt1_1_xt2_1;
955 unsigned int xh0_0_xh1_0, xh0_1_xh1_1;
956 unsigned int xl0_0_xl1_0, xl0_1_xl1_1;
957 unsigned int xh20_0_xh21_0, xh20_1_xh21_1;
958 unsigned int xl20_0_xl21_0, xl20_1_xl21_1;
960 double x_0123;
961 double co10si10_co11si11, co20si20_co21si21, co30si30_co31si31;
962 double x_l1_01_x_l1_23, x_l2_01_x_l2_23, x_h2_01_x_h2_23;
964 long long xt1_0_yt1_0_xt2_0_yt2_0, xt1_1_yt1_1_xt2_1_yt2_1;
965 long long yt2_0_xt1_0_yt1_0_xt2_0, yt2_1_xt1_1_yt1_1_xt2_1;
966 long long xh0_0_xh1_0_xl0_0_xl1_0, xh0_1_xh1_1_xl0_1_xl1_1;
967 long long xh20_0_xh21_0_xl20_0_xl21_0, xh20_1_xh21_1_xl20_1_xl21_1;
969 /*---------------------------------------------------------------------*/
970 /* Determine the magnitude od the number of points to be transformed. */
971 /* Check whether we can use a radix4 decomposition or a mixed radix */
972 /* transformation, by determining modulo 2. */
973 /*---------------------------------------------------------------------*/
974 radix = _norm(npoints) & 1 ? 2 : 4;
976 /*----------------------------------------------------------------------*/
977 /* The stride is quartered with every iteration of the outer loop. It */
978 /* denotes the seperation between any two adjacent inputs to the butter */
979 /* -fly. This should start out at N/4, hence stride is initially set to */
980 /* N. For every stride, 6*stride twiddle factors are accessed. The */
981 /* "tw_offset" is the offset within the current twiddle factor sub- */
982 /* table. This is set to zero, at the start of the code and is used to */
983 /* obtain the appropriate sub-table twiddle pointer by offseting it */
984 /* with the base pointer "ptr_w". */
985 /*----------------------------------------------------------------------*/
986 stride = npoints;
987 tw_offset = 0;
988 fft_jmp = 6 * stride;
990 _nassert(stride > 4);
991 #pragma MUST_ITERATE(1,,1);
992 while (stride > radix) {
993 /*-----------------------------------------------------------------*/
994 /* At the start of every iteration of the outer loop, "j" is set */
995 /* to zero, as "w" is pointing to the correct location within the */
996 /* twiddle factor array. For every iteration of the inner loop */
997 /* 6 * stride twiddle factors are accessed. For eg, */
998 /* */
999 /* #Iteration of outer loop # twiddle factors #times cycled */
1000 /* 1 6 N/4 1 */
1001 /* 2 6 N/16 4 */
1002 /* ... */
1003 /*-----------------------------------------------------------------*/
1004 j = 0;
1005 fft_jmp >>= 2;
1007 /*-----------------------------------------------------------------*/
1008 /* Set up offsets to access "N/4", "N/2", "3N/4" complex point or */
1009 /* "N/2", "N", "3N/2" half word */
1010 /*-----------------------------------------------------------------*/
1011 h2 = stride >> 1;
1012 l1 = stride;
1013 l2 = stride + (stride >> 1);
1015 /*-----------------------------------------------------------------*/
1016 /* Reset "x" to point to the start of the input data array. */
1017 /* "tw_offset" starts off at 0, and increments by "6 * stride" */
1018 /* The stride quarters with every iteration of the outer loop */
1019 /*-----------------------------------------------------------------*/
1020 x = ptr_x;
1021 w = ptr_w + tw_offset;
1022 tw_offset += fft_jmp;
1023 stride >>= 2;
1025 /*----------------------------------------------------------------*/
1026 /* The following loop iterates through the different butterflies, */
1027 /* within a given stage. Recall that there are logN to base 4 */
1028 /* stages. Certain butterflies share the twiddle factors. These */
1029 /* are grouped together. On the very first stage there are no */
1030 /* butterflies that share the twiddle factor, all N/4 butter- */
1031 /* flies have different factors. On the next stage two sets of */
1032 /* N/8 butterflies share the same twiddle factor. Hence after */
1033 /* half the butterflies are performed, j the index into the */
1034 /* factor array resets to 0, and the twiddle factors are reused. */
1035 /* When this happens, the data pointer 'x' is incremented by the */
1036 /* fft_jmp amount. In addition the following code is unrolled to */
1037 /* perform "2" radix4 butterflies in parallel. */
1038 /*----------------------------------------------------------------*/
1039 _nassert(npoints >= 16);
1040 #pragma MUST_ITERATE(2,,2);
1041 for (i = 0; i < npoints; i += 8) {
1042 /*------------------------------------------------------------*/
1043 /* Read the first 12 twiddle factors, six of which are used */
1044 /* for one radix 4 butterfly and six of which are used for */
1045 /* next one. */
1046 /* */
1047 /* si10 = w[j+1]; co10 = w[j+0]; */
1048 /* si11 = w[j+3]; co11 = w[j+2]; */
1049 /* si20 = w[j+5]; co20 = w[j+4]; */
1050 /* si21 = w[j+7]; co21 = w[j+6]; */
1051 /* si30 = w[j+9]; co30 = w[j+8]; */
1052 /* si31 = w[j+11]; co31 = w[j+10]; */
1053 /* */
1054 /* The code shown above performs half word wide loads. The */
1055 /* C64x can perform double word wide loads. This is done by */
1056 /* the use of the _amemd8_const intrinsic. The const quali- */
1057 /* fies the fact that this array will only be read from and */
1058 /* not written to. */
1059 /*------------------------------------------------------------*/
1060 w2 = w + j;
1061 co10si10_co11si11 = _amemd8_const(w2);
1062 co20si20_co21si21 = _amemd8_const(&w2[4]);
1063 co30si30_co31si31 = _amemd8_const(&w2[8]);
1065 /*------------------------------------------------------------*/
1066 /* Read in the first complex input for the butterflies. */
1067 /* 1st complex input to 1st butterfly: x[0] + jx[1] */
1068 /* 1st complex input to 2nd butterfly: x[2] + jx[3] */
1069 /* Read in the complex inputs for the butterflies. Each of the*/
1070 /* successive complex inputs of the butterfly are seperated */
1071 /* by a fixed amount known as stride. The stride starts out */
1072 /* at N/4, and quarters with every stage. */
1073 /* */
1074 /* x_0 = x[0]; x_1 = x[1]; */
1075 /* x_2 = x[2]; x_3 = x[3]; */
1076 /* x_l1_0 = x[l1 ]; x_l1_1 = x[l1 + 1]; */
1077 /* x_l1_2 = x[l1 + 2]; x_l1_3 = x[l1 + 3]; */
1078 /* x_l2_0 = x[l2 ]; x_l2_1 = x[l2 + 1]; */
1079 /* x_l2_2 = x[l2 + 2]; x_l2_3 = x[l2 + 3]; */
1080 /* x_h2_0 = x[h2 ]; x_h2_1 = x[h2 + 1]; */
1081 /* x_h2_2 = x[h2 + 2]; x_h2_3 = x[h2 + 3]; */
1082 /* */
1083 /* These loads are performed using _amemd8. Note that there */
1084 /* is no const qualifier in this case as this loop writes */
1085 /* back results into the data array "x". The "a" in _amemd8 */
1086 /* qualifies the fact that the loads are aligned. */
1087 /*------------------------------------------------------------*/
1088 x_0123 = _amemd8(&x[0]);
1089 x_l1_01_x_l1_23 = _amemd8(&x[l1]);
1090 x_l2_01_x_l2_23 = _amemd8(&x[l2]);
1091 x_h2_01_x_h2_23 = _amemd8(&x[h2]);
1093 /*-----------------------------------------------------------*/
1094 /* Two butterflies are evaluated in parallel. The following */
1095 /* results will be shown for one butterfly only, although */
1096 /* both are being evaluated in parallel. */
1097 /* */
1098 /* Perform radix2 style DIF butterflies. */
1099 /* */
1100 /* xh0_0 = x_0 + x_l1_0; xh1_0 = x_1 + x_l1_1; */
1101 /* xh0_1 = x_2 + x_l1_2; xh1_1 = x_3 + x_l1_3; */
1102 /* xl0_0 = x_0 - x_l1_0; xl1_0 = x_1 - x_l1_1; */
1103 /* xl0_1 = x_2 - x_l1_2; xl1_1 = x_3 - x_l1_3; */
1104 /* xh20_0 = x_h2_0 + x_l2_0; xh21_0 = x_h2_1 + x_l2_1; */
1105 /* xh20_1 = x_h2_2 + x_l2_2; xh21_1 = x_h2_3 + x_l2_3; */
1106 /* xl20_0 = x_h2_0 - x_l2_0; xl21_0 = x_h2_1 - x_l2_1; */
1107 /* xl20_1 = x_h2_2 - x_l2_2; xl21_1 = x_h2_3 - x_l2_3; */
1108 /* */
1109 /* These are done using packed data processing, and these */
1110 /* are done using _add2, _sub2. */
1111 /* For example "xh1_0_xh0_0" has a packed value xh1_0, */
1112 /* xh0_0. */
1113 /*-----------------------------------------------------------*/
1114 xh0_0_xh1_0_xl0_0_xl1_0 = _addsub2((_hi(x_0123)), (_hi(x_l1_01_x_l1_23)));
1115 xh0_1_xh1_1_xl0_1_xl1_1 = _addsub2((_lo(x_0123)), (_lo(x_l1_01_x_l1_23)));
1117 xh0_0_xh1_0 = (_hill(xh0_0_xh1_0_xl0_0_xl1_0));
1118 xh0_1_xh1_1 = (_hill(xh0_1_xh1_1_xl0_1_xl1_1));
1119 xl0_0_xl1_0 = (_loll(xh0_0_xh1_0_xl0_0_xl1_0));
1120 xl0_1_xl1_1 = (_loll(xh0_1_xh1_1_xl0_1_xl1_1));
1122 xh20_0_xh21_0_xl20_0_xl21_0 = _addsub2((_hi(x_h2_01_x_h2_23)), (_hi(x_l2_01_x_l2_23)));
1123 xh20_1_xh21_1_xl20_1_xl21_1 = _addsub2((_lo(x_h2_01_x_h2_23)), (_lo(x_l2_01_x_l2_23)));
1125 xh20_0_xh21_0 = (_hill(xh20_0_xh21_0_xl20_0_xl21_0));
1126 xh20_1_xh21_1 = (_hill(xh20_1_xh21_1_xl20_1_xl21_1));
1127 xl20_0_xl21_0 = (_loll(xh20_0_xh21_0_xl20_0_xl21_0));
1128 xl20_1_xl21_1 = (_loll(xh20_1_xh21_1_xl20_1_xl21_1));
1130 /*----------------------------------------------------------*/
1131 /* Derive output pointers using the input pointer "x" */
1132 /*----------------------------------------------------------*/
1133 x0 = (short *)_mvd((int)x);
1134 x2 = (short *)_mvd((int)x0);
1136 /*-----------------------------------------------------------*/
1137 /* When the twiddle factors are not to be re-used, j is */
1138 /* incremented by 12, to reflect the fact that 12 half words */
1139 /* are consumed in every iteration. The input data pointer */
1140 /* increments by 4. Note that within a stage, the stride */
1141 /* does not change and hence the offsets for the other three */
1142 /* legs, 0, h2, l1, l2. */
1143 /*-----------------------------------------------------------*/
1144 j += 12;
1145 x += 4;
1147 predj = (j - fft_jmp);
1148 if (!predj) x += fft_jmp;
1149 if (!predj) j = 0;
1151 /*----------------------------------------------------------*/
1152 /* x0[0] = (xh0_0 + xh20_0 + 1) >> 1; */
1153 /* x0[1] = (xh1_0 + xh21_0 + 1) >> 1; */
1154 /* x0[2] = (xh0_1 + xh20_1 + 1) >> 1; */
1155 /* x0[3] = (xh1_1 + xh21_1 + 1) >> 1; */
1156 /* */
1157 /* "y0x0" and "y1x1" are the first four outputs of the */
1158 /* radix4 butterfly. These are computed on the C64x */
1159 /* using _avg2. */
1160 /*----------------------------------------------------------*/
1161 x_0o_x_1o = _avg2(xh20_0_xh21_0, xh0_0_xh1_0);
1162 x_2o_x_3o = _avg2(xh20_1_xh21_1, xh0_1_xh1_1);
1164 /*----------------------------------------------------------*/
1165 /* Perform one more stage of radix2 decompositions as shown */
1166 /* These are performed using packed data processing. */
1167 /* */
1168 /* xt0_0 = xh0_0 - xh20_0; yt0_0 = xh1_0 - xh21_0; */
1169 /* xt0_1 = xh0_1 - xh20_1; yt0_1 = xh1_1 - xh21_1; */
1170 /* xt1_0 = xl0_0 + xl21_0; yt2_0 = xl1_0 + xl20_0; */
1171 /* xt2_0 = xl0_0 - xl21_0; yt1_0 = xl1_0 - xl20_0; */
1172 /* xt1_1 = xl0_1 + xl21_1; yt2_1 = xl1_1 + xl20_1; */
1173 /* xt2_1 = xl0_1 - xl21_1; yt1_1 = xl1_1 - xl20_1; */
1174 /* */
1175 /*----------------------------------------------------------*/
1176 mxt0_0_myt0_0 = _sub2(xh20_0_xh21_0, xh0_0_xh1_0);
1177 mxt0_1_myt0_1 = _sub2(xh20_1_xh21_1, xh0_1_xh1_1);
1178 /*----------------------------------------------------------*/
1179 /* Repack coefficients to sustain packed data processing. */
1180 /* This requires a swizzle so that intermediate results */
1181 /* end up in the right spot. This is needed to compute */
1182 /* other intermediate results using add2 and sub2. */
1183 /*----------------------------------------------------------*/
1184 xl1_0_xl0_0 = _rotl(xl0_0_xl1_0, 16);
1185 xl1_1_xl0_1 = _rotl(xl0_1_xl1_1, 16);
1187 //;----------------------------------------------------------;
1188 //; xt1_0 = xl0_0 + xl21_0 yt2_0 = xl1_0 + xl20_0 ;
1189 //; xt1_1 = xl0_1 + xl21_1 yt2_1 = xl1_1 + xl20_1 ;
1190 //; xt2_0 = xl0_0 - xl21_0 yt1_0 = xl1_0 - xl20_0 ;
1191 //; xt2_1 = xl0_1 - xl21_1 yt1_1 = xl1_1 - xl20_1 ;
1192 //;----------------------------------------------------------;
1193 yt2_0_xt1_0_yt1_0_xt2_0 = _addsub2(xl1_0_xl0_0, xl20_0_xl21_0);
1194 yt2_1_xt1_1_yt1_1_xt2_1 = _addsub2(xl1_1_xl0_1, xl20_1_xl21_1);
1196 yt2_0_xt1_0 = (_hill(yt2_0_xt1_0_yt1_0_xt2_0));
1197 yt2_1_xt1_1 = (_hill(yt2_1_xt1_1_yt1_1_xt2_1));
1198 yt1_0_xt2_0 = (_loll(yt2_0_xt1_0_yt1_0_xt2_0));
1199 yt1_1_xt2_1 = (_loll(yt2_1_xt1_1_yt1_1_xt2_1));
1201 /*----------------------------------------------------------*/
1202 /* Repack results so that xt_XXyt_XX are a packed 32 bit */
1203 /* quantity. The operations performed above tend to leave */
1204 /* the results as xt_XXyt_YY and xt_YYyt_XX. This is where */
1205 /* the packed instructions come in handy. For example */
1206 /* xt_XXyt_XX = _packlh2(xt_XXyt_YY, xt_YY_yt_XX) */
1207 /*----------------------------------------------------------*/
1208 xt1_0_yt1_0_xt2_0_yt2_0 = _dpackx2(yt1_0_xt2_0, yt2_0_xt1_0);
1209 xt1_1_yt1_1_xt2_1_yt2_1 = _dpackx2(yt1_1_xt2_1, yt2_1_xt1_1);
1211 xt1_0_yt1_0 = (_hill(xt1_0_yt1_0_xt2_0_yt2_0));
1212 xt1_1_yt1_1 = (_hill(xt1_1_yt1_1_xt2_1_yt2_1));
1213 xt2_0_yt2_0 = (_loll(xt1_0_yt1_0_xt2_0_yt2_0));
1214 xt2_1_yt2_1 = (_loll(xt1_1_yt1_1_xt2_1_yt2_1));
1216 /*---------------------------------------------------------*/
1217 /* Notice that in this version of the code the two middle */
1218 /* legs are swapped as indicated by the stores to x[l1].. */
1219 /* x[l1 + 3] which preceede the stores to x[h2]..x[h2 + 3] */
1220 /* This reversal guarantees that a radix4 DIF butterfly */
1221 /* produces results in digit reversed order. Note that in */
1222 /* addition to the rounding, the shift is performed by 16, */
1223 /* as opposed to 15, to give scaling. */
1224 /*---------------------------------------------------------*/
1226 /*---------------------------------------------------------*/
1227 /* */
1228 /* x2[l1 ] = (si10 * yt1_0 + co10 * xt1_0 + 0x8000) >> 16;*/
1229 /* x2[l1+1] = (co10 * yt1_0 - si10 * xt1_0 + 0x8000) >> 16;*/
1230 /* x2[l1+2] = (si11 * yt1_1 + co11 * xt1_1 + 0x8000) >> 16;*/
1231 /* x2[l1+3] = (co11 * yt1_1 - si11 * xt1_1 + 0x8000) >> 16;*/
1232 /* */
1233 /* These four results are retained in registers and a */
1234 /* double word is formed so that it can be stored with */
1235 /* one STDW. */
1236 /*---------------------------------------------------------*/
1237 xh2_0_1 = _cmpyr((_hi(co10si10_co11si11)), xt1_0_yt1_0);
1238 xh2_2_3 = _cmpyr((_lo(co10si10_co11si11)), xt1_1_yt1_1);
1240 /*---------------------------------------------------------*/
1241 /* The following code computes intermediate results for: */
1242 /* */
1243 /* x2[h2 ] = (si20 * yt0_0 + co20 * xt0_0 + 0x8000) >> 16;*/
1244 /* x2[h2+1] = (co20 * yt0_0 - si20 * xt0_0 + 0x8000) >> 16;*/
1245 /* x2[h2+2] = (si21 * yt0_1 + co21 * xt0_1 + 0x8000) >> 16;*/
1246 /* x2[h2+3] = (co21 * yt0_1 - si21 * xt0_1 + 0x8000) >> 16;*/
1247 /*---------------------------------------------------------*/
1248 xl1_0_1 = _cmpyr((_hi(co20si20_co21si21)), mxt0_0_myt0_0);
1249 xl1_2_3 = _cmpyr((_lo(co20si20_co21si21)), mxt0_1_myt0_1);
1251 /*---------------------------------------------------------*/
1252 /* The following code computes intermediate results for: */
1253 /* */
1254 /* x2[l2 ] = (si30 * yt2_0 + co30 * xt2_0 + 0x8000) >> 16;*/
1255 /* x2[l2+1] = (co30 * yt2_0 - si30 * xt2_0 + 0x8000) >> 16;*/
1256 /* x2[l2+2] = (si31 * yt2_1 + co31 * xt2_1 + 0x8000) >> 16;*/
1257 /* x2[l2+3] = (co31 * yt2_1 - si31 * xt2_1 + 0x8000) >> 16;*/
1258 /*---------------------------------------------------------*/
1259 xl2_0_1 = _cmpyr((_hi(co30si30_co31si31)), xt2_0_yt2_0);
1260 xl2_2_3 = _cmpyr((_lo(co30si30_co31si31)), xt2_1_yt2_1);
1262 /*---------------------------------------------------------*/
1263 /* Prepare the doubleword for each of the four outputs, by */
1264 /* the use of the _itod intrinsic that takes integers, and */
1265 /* forms a double word wide quantity. */
1266 /* Store out the outputs to the four legs of the butterfly */
1267 /* using aligned store double words. Notice the use of the */
1268 /* indices "l1", "l2", "h2" to derive the pointers for the */
1269 /* legs of the butterfly. */
1270 /*---------------------------------------------------------*/
1271 _amemd8(&x2[0]) = _itod(x_0o_x_1o, x_2o_x_3o);
1272 _amemd8(&x2[l1]) = _itod(xl1_0_1, xl1_2_3);
1273 _amemd8(&x2[h2]) = _itod(xh2_0_1, xh2_2_3);
1274 _amemd8(&x2[l2]) = _itod(xl2_0_1, xl2_2_3);
1275 }
1276 }
1278 if (radix == 2)
1279 radix_2(ptr_x, ptr_y, npoints);
1280 else if (radix == 4)
1281 radix_4(ptr_x, ptr_y, npoints);
1283 return;
1284 }
1286 void radix_2(
1287 short *restrict ptr_x,
1288 short *restrict ptr_y,
1289 int npoints
1290 )
1291 {
1292 short * restrict x2, * restrict x0;
1293 short * restrict y0, * restrict y1, * restrict y2, * restrict y3;
1294 short n0, j0;
1296 int i, j, l1, h2;
1297 int y_01, y_23, y_45, y_67, y_89, y_AB, y_CD, y_EF;
1299 double x_1032, x_5476, x_98BA, x_DCFE;
1301 /*-----------------------------------------------------------------*/
1302 /* The following code performs either a standard radix4 pass or a */
1303 /* radix2 pass. Two pointers are used to access the input data. */
1304 /* The input data is read "N/4" complex samples apart or "N/2" */
1305 /* words apart using pointers "x0" and "x2". This produces out- */
1306 /* puts that are 0, N/4, N/2, 3N/4 for a radix4 FFT, and 0, N/8 */
1307 /* N/2, 3N/8 for radix 2. */
1308 /*-----------------------------------------------------------------*/
1309 y0 = ptr_y;
1310 y2 = ptr_y + (int)npoints;
1311 x0 = ptr_x;
1312 x2 = ptr_x + (int)(npoints >> 1);
1314 /*----------------------------------------------------------------*/
1315 /* The pointers are set at the following locations which are half */
1316 /* the offsets of a radix4 FFT. */
1317 /*----------------------------------------------------------------*/
1318 y1 = y0 + (int)(npoints >> 2);
1319 y3 = y2 + (int)(npoints >> 2);
1320 l1 = _norm(npoints) + 1;
1321 j0 = 8;
1322 n0 = npoints >> 1;
1324 /*--------------------------------------------------------------------*/
1325 /* The following code reads data indentically for either a radix 4 */
1326 /* or a radix 2 style decomposition. It writes out at different */
1327 /* locations though. It checks if either half the points, or a */
1328 /* quarter of the complex points have been exhausted to jump to */
1329 /* pervent double reversal. */
1330 /*--------------------------------------------------------------------*/
1331 j = 0;
1333 _nassert((int)(n0) % 4 == 0);
1334 _nassert((int)(ptr_x) % 8 == 0);
1335 _nassert((int)(ptr_y) % 8 == 0);
1336 _nassert((int)(x0) % 8 == 0);
1337 _nassert((int)(x2) % 8 == 0);
1338 _nassert((int)(y0) % 8 == 0);
1339 _nassert(npoints >= 16);
1340 #pragma MUST_ITERATE(2,,2);
1341 for (i = 0; i < npoints; i += 8) {
1342 /*----------------------------------------------------------------*/
1343 /* Digit reverse the index starting from 0. The increment to "j" */
1344 /* is either by 4, or 8. */
1345 /*----------------------------------------------------------------*/
1346 h2 = _deal(j);
1347 h2 = _bitr(h2);
1348 h2 = _rotl(h2, 16);
1349 h2 = _shfl(h2);
1350 h2 >>= l1;
1351 /*----------------------------------------------------------------*/
1352 /* Read in the input data. These are transformed as a radix2. */
1353 /*----------------------------------------------------------------*/
1354 x_1032 = _amemd8(&x0[0]);
1355 x_5476 = _amemd8(&x0[4]);
1356 x0 += 8;
1357 x_98BA = _amemd8(&x2[0]);
1358 x_DCFE = _amemd8(&x2[4]);
1359 x2 += 8;
1361 /*----------------------------------------------------------------------------*/
1362 /* Perform radix2 style decomposition. */
1363 /*----------------------------------------------------------------------------*/
1364 /* n00 = x_0 + x_2; n01 = x_1 + x_3; y_01 = n00_n01 = _add2(x_10, x_32) */
1365 /* n20 = x_0 - x_2; n21 = x_1 - x_3; y_89 = n20_n21 = _sub2(x_10, x_32) */
1366 /* n10 = x_4 + x_6; n11 = x_5 + x_7; y_45 = n10_n11 = _add2(x_54, x_76) */
1367 /* n30 = x_4 - x_6; n31 = x_5 - x_7; y_CD = n30_n31 = _sub2(x_54, x_76) */
1368 /* n02 = x_8 + x_a; n03 = x_9 + x_b; y_23 = n03_n02 = _add2(x_98, x_BA) */
1369 /* n12 = x_c + x_e; n13 = x_d + x_f; y_67 = n12_n13 = _add2(x_DC, x_FE) */
1370 /* n22 = x_8 - x_a; n23 = x_9 - x_b; y_AB = n22_n23 = _sub2(x_98, x_BA) */
1371 /* n32 = x_c - x_e; n33 = x_d - x_f; y_EF = n32_n33 = _sub2(x_DC, x_FE) */
1373 y_01 = _add2(_hi(x_1032), _lo(x_1032));
1374 y_89 = _sub2(_hi(x_1032), _lo(x_1032));
1375 y_45 = _add2(_hi(x_5476), _lo(x_5476));
1376 y_CD = _sub2(_hi(x_5476), _lo(x_5476));
1377 y_23 = _add2(_hi(x_98BA), _lo(x_98BA));
1378 y_67 = _add2(_hi(x_DCFE), _lo(x_DCFE));
1379 y_AB = _sub2(_hi(x_98BA), _lo(x_98BA));
1380 y_EF = _sub2(_hi(x_DCFE), _lo(x_DCFE));
1382 /*----Imaginary------------Real-----------*/
1383 /* y0[2*h2+0] = n01; y0[2*h2+1] = n00; */
1384 /* y0[2*h2+2] = n03; y0[2*h2+3] = n02; */
1385 /* y1[2*h2+0] = n11; y1[2*h2+1] = n10; */
1386 /* y1[2*h2+2] = n13; y1[2*h2+3] = n12; */
1387 /* y2[2*h2+0] = n21; y2[2*h2+1] = n20; */
1388 /* y2[2*h2+2] = n23; y2[2*h2+3] = n22; */
1389 /* y3[2*h2+0] = n31; y3[2*h2+1] = n30; */
1390 /* y3[2*h2+2] = n33; y3[2*h2+3] = n32; */
1392 /*------------------------------------------------------------*/
1393 /* Store out the results of all four butterflies as double */
1394 /* words. */
1395 /*------------------------------------------------------------*/
1396 _amemd8(&y0[2 * h2]) = _itod(y_01, y_23);
1397 _amemd8(&y1[2 * h2]) = _itod(y_45, y_67);
1398 _amemd8(&y2[2 * h2]) = _itod(y_89, y_AB);
1399 _amemd8(&y3[2 * h2]) = _itod(y_CD, y_EF);
1401 j += j0;
1402 if (j == n0) {
1403 j += n0;
1404 x0 += (int)npoints >> 1;
1405 x2 += (int)npoints >> 1;
1406 }
1407 }
1408 }
1410 void radix_4(
1411 short *restrict ptr_x,
1412 short *restrict ptr_y,
1413 int npoints
1414 )
1415 {
1416 short * restrict x2, * restrict x0;
1417 short * restrict y0, * restrict y1, * restrict y2, * restrict y3;
1418 short n0, j0;
1420 int i, j, l1, h2;
1421 int xh1_0_xh0_0, xh1_1_xh0_1, xh1_2_xh0_2, xh1_3_xh0_3;
1422 int xl1_0_xl0_0, xl1_1_xl0_1, xl0_1_xl1_1, xl1_2_xl0_2, xl1_3_xl0_3, xl0_3_xl1_3;
1423 int n11_n10, n31_n30, n13_n12, n33_n32;
1425 double x_1032, x_5476, x_98BA, x_DCFE;
1427 long long n01_n00_n21_n20, n11_n30_n31_n10, n03_n02_n23_n22, n13_n32_n33_n12;
1429 /*-----------------------------------------------------------------*/
1430 /* The following code performs either a standard radix4 pass or a */
1431 /* radix2 pass. Two pointers are used to access the input data. */
1432 /* The input data is read "N/4" complex samples apart or "N/2" */
1433 /* words apart using pointers "x0" and "x2". This produces out- */
1434 /* puts that are 0, N/4, N/2, 3N/4 for a radix4 FFT, and 0, N/8 */
1435 /* N/2, 3N/8 for radix 2. */
1436 /*-----------------------------------------------------------------*/
1437 y0 = ptr_y;
1438 y2 = ptr_y + (int)npoints;
1439 x0 = ptr_x;
1440 x2 = ptr_x + (int)(npoints >> 1);
1442 /*----------------------------------------------------------------*/
1443 /* The pointers are set at the following locations which are half */
1444 /* the offsets of a radix4 FFT. */
1445 /*----------------------------------------------------------------*/
1446 y3 = y2 + (int)(npoints >> 1);
1447 y1 = y0 + (int)(npoints >> 1);
1448 l1 = _norm(npoints) + 2;
1449 j0 = 4;
1450 n0 = npoints >> 2;
1452 /*--------------------------------------------------------------------*/
1453 /* The following code reads data indentically for either a radix 4 */
1454 /* or a radix 2 style decomposition. It writes out at different */
1455 /* locations though. It checks if either half the points, or a */
1456 /* quarter of the complex points have been exhausted to jump to */
1457 /* pervent double reversal. */
1458 /*--------------------------------------------------------------------*/
1459 j = 0;
1461 _nassert((int)(n0) % 4 == 0);
1462 _nassert((int)(x0) % 8 == 0);
1463 _nassert((int)(x2) % 8 == 0);
1464 _nassert((int)(y0) % 8 == 0);
1465 _nassert(npoints >= 16);
1466 #pragma MUST_ITERATE(2,,2);
1468 for (i = 0; i < npoints; i += 8) {
1469 /*----------------------------------------------------------------*/
1470 /* Digit reverse the index starting from 0. The increment to "j" */
1471 /* is either by 4, or 8. */
1472 /*----------------------------------------------------------------*/
1473 h2 = _deal(j);
1474 h2 = _bitr(h2);
1475 h2 = _rotl(h2, 16);
1476 h2 = _shfl(h2);
1477 h2 >>= l1;
1479 /*----------------------------------------------------------------*/
1480 /* Read in the input data. These are transformed as a radix4. */
1481 /*----------------------------------------------------------------*/
1482 x_1032 = _amemd8(&x0[0]);
1483 x_5476 = _amemd8(&x0[4]);
1484 x0 += 8;
1485 x_98BA = _amemd8(&x2[0]);
1486 x_DCFE = _amemd8(&x2[4]);
1487 x2 += 8;
1489 /*------------------------------------------*/
1490 /* Perform radix4 style decomposition. */
1491 /*------------------------------------------*/
1492 /* xh0_0 = x_0 + x_4; xh1_0 = x_1 + x_5; */
1493 /* xl0_0 = x_0 - x_4; xl1_0 = x_1 - x_5; */
1494 /* xh0_1 = x_2 + x_6; xh1_1 = x_3 + x_7; */
1495 /* xl0_1 = x_2 - x_6; xl1_1 = x_3 - x_7; */
1496 xh1_0_xh0_0 = _add2(_hi(x_1032), _hi(x_5476));
1497 xl1_0_xl0_0 = _sub2(_hi(x_1032), _hi(x_5476));
1498 xh1_1_xh0_1 = _add2(_lo(x_1032), _lo(x_5476));
1499 xl1_1_xl0_1 = _sub2(_lo(x_1032), _lo(x_5476));
1501 /* n00 = xh0_0 + xh0_1; n20 = xh0_0 - xh0_1; */
1502 /* n01 = xh1_0 + xh1_1; n21 = xh1_0 - xh1_1; */
1503 n01_n00_n21_n20 = _addsub2(xh1_0_xh0_0, xh1_1_xh0_1);
1505 /* n30 = xl0_0 + xl1_1; n10 = xl0_0 - xl1_1; */
1506 /* n11 = xl1_0 + xl0_1; n31 = xl1_0 - xl0_1; */
1507 xl0_1_xl1_1 = _rotl(xl1_1_xl0_1, 16);
1508 n11_n30_n31_n10 = _addsub2(xl1_0_xl0_0, xl0_1_xl1_1);
1510 n31_n30 = _packhl2(_loll(n11_n30_n31_n10), _hill(n11_n30_n31_n10));
1511 n11_n10 = _packhl2(_hill(n11_n30_n31_n10), _loll(n11_n30_n31_n10));
1513 /*------------------------------------------*/
1514 /* Perform radix4 style decomposition. */
1515 /*------------------------------------------*/
1516 /* xh0_2 = x_8 + x_c; xh1_2 = x_9 + x_d; */
1517 /* xl0_2 = x_8 - x_c; xl1_2 = x_9 - x_d; */
1518 /* xh0_3 = x_a + x_e; xh1_3 = x_b + x_f; */
1519 /* xl0_3 = x_a - x_e; xl1_3 = x_b - x_f; */
1520 xh1_2_xh0_2 = _add2(_hi(x_98BA), _hi(x_DCFE));
1521 xl1_2_xl0_2 = _sub2(_hi(x_98BA), _hi(x_DCFE));
1522 xh1_3_xh0_3 = _add2(_lo(x_98BA), _lo(x_DCFE));
1523 xl1_3_xl0_3 = _sub2(_lo(x_98BA), _lo(x_DCFE));
1525 /* n02 = xh0_2 + xh0_3; n22 = xh0_2 - xh0_3; */
1526 /* n03 = xh1_2 + xh1_3; n23 = xh1_2 - xh1_3; */
1527 n03_n02_n23_n22 = _addsub2(xh1_2_xh0_2, xh1_3_xh0_3);
1529 /* n32 = xl0_2 + xl1_3; n12 = xl0_2 - xl1_3; */
1530 /* n13 = xl1_2 + xl0_3; n33 = xl1_2 - xl0_3; */
1531 xl0_3_xl1_3 = _rotl(xl1_3_xl0_3, 16);
1532 n13_n32_n33_n12 = _addsub2(xl1_2_xl0_2, xl0_3_xl1_3);
1534 n33_n32 = _packhl2(_loll(n13_n32_n33_n12), _hill(n13_n32_n33_n12));
1535 n13_n12 = _packhl2(_hill(n13_n32_n33_n12), _loll(n13_n32_n33_n12));
1537 /*-----------------------------------------------------------------*/
1538 /* Points that are read from succesive locations map to y, y[N/4] */
1539 /* y[N/2], y[3N/4] in a radix4 scheme, y, y[N/8], y[N/2],y[5N/8] */
1540 /*-----------------------------------------------------------------*/
1542 /*----Imaginary----------Real-----------*/
1543 /* y0[2*h2+0] = n01; y0[2*h2+1] = n00; */
1544 /* y0[2*h2+2] = n03; y0[2*h2+3] = n02; */
1545 /* y1[2*h2+0] = n11; y1[2*h2+1] = n10; */
1546 /* y1[2*h2+2] = n13; y1[2*h2+3] = n12; */
1547 /* y2[2*h2+0] = n21; y2[2*h2+1] = n20; */
1548 /* y2[2*h2+2] = n23; y2[2*h2+3] = n22; */
1549 /* y3[2*h2+0] = n31; y3[2*h2+1] = n30; */
1550 /* y3[2*h2+2] = n33; y3[2*h2+3] = n32; */
1551 _amemd8(&y0[2 * h2]) = _itod(_hill(n01_n00_n21_n20), _hill(n03_n02_n23_n22));
1552 _amemd8(&y1[2 * h2]) = _itod(n11_n10, n13_n12);
1553 _amemd8(&y2[2 * h2]) = _itod(_loll(n01_n00_n21_n20), _loll(n03_n02_n23_n22));
1554 _amemd8(&y3[2 * h2]) = _itod(n31_n30, n33_n32);
1556 j += j0;
1557 if (j == n0) {
1558 j += n0;
1559 x0 += (int)npoints >> 1;
1560 x2 += (int)npoints >> 1;
1561 }
1562 }
1563 }
1564 #endif
1565 /* ======================================================================== */
1566 /* End of file: DSP_fft16x16_i.c */
1567 /* ------------------------------------------------------------------------ */
1568 /* Copyright (C) 2011 Texas Instruments, Incorporated. */
1569 /* All Rights Reserved. */
1570 /* ======================================================================== */