[ep-processor-libraries/dsplib.git] / ti / dsplib / src / DSPF_sp_ifftSPxSP / c66 / DSPF_sp_ifftSPxSP_opt.c
1 /* ======================================================================= */
2 /* DSPF_sp_ifftSPxSP_opt.c -- Inverse FFT with Mixed Radix */
3 /* Intrinsic C Implementation */
4 /* */
5 /* Rev 0.0.1 */
6 /* */
7 /* Copyright (C) 2011 Texas Instruments Incorporated - http://www.ti.com/ */
8 /* */
9 /* */
10 /* Redistribution and use in source and binary forms, with or without */
11 /* modification, are permitted provided that the following conditions */
12 /* are met: */
13 /* */
14 /* Redistributions of source code must retain the above copyright */
15 /* notice, this list of conditions and the following disclaimer. */
16 /* */
17 /* Redistributions in binary form must reproduce the above copyright */
18 /* notice, this list of conditions and the following disclaimer in the */
19 /* documentation and/or other materials provided with the */
20 /* distribution. */
21 /* */
22 /* Neither the name of Texas Instruments Incorporated nor the names of */
23 /* its contributors may be used to endorse or promote products derived */
24 /* from this software without specific prior written permission. */
25 /* */
26 /* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS */
27 /* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT */
28 /* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR */
29 /* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT */
30 /* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, */
31 /* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT */
32 /* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, */
33 /* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY */
34 /* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
35 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE */
36 /* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
37 /* */
38 /* ======================================================================= */
40 #pragma CODE_SECTION(DSPF_sp_ifftSPxSP_opt, ".text:Intrinsic");
42 #include "DSPF_sp_ifftSPxSP_opt.h"
43 #ifdef __TI_COMPILER_VERSION__
44 #include "c6x.h"
45 #endif
47 #ifdef _LITTLE_ENDIAN
48 void DSPF_sp_ifftSPxSP_opt (int N, float *ptr_x, float *ptr_w, float *ptr_y,
49 unsigned char *brev, int n_min, int offset, int n_max)
50 {
52 int i, j, k, l1, l2, h2, predj, tw_offset, stride, fft_jmp, radix;
54 float one_over_nmax;
55 float yt2, yt3, yt6, yt7;
56 float xt1_0, yt2_0, xt1_1, yt2_1, xt2_0, yt1_0, xt2_1, yt1_1;
57 float * restrict x, * restrict x2, * restrict w;
58 float * restrict y0, * restrict y1, * restrict y2, * restrict y3;
60 __float2_t scale;
61 __float2_t xh21_0_xh20_0, xh21_1_xh20_1, xh1_0_xh0_0, xh1_1_xh0_1;
62 __float2_t xl21_0_xl20_0, xl21_1_xl20_1, xl1_0_xl0_0, xl1_1_xl0_1;
63 __float2_t yt0_0_xt0_0, yt0_1_xt0_1, yt1_0_xt1_0;
64 __float2_t yt1_1_xt1_1, yt2_0_xt2_0, yt2_1_xt2_1;
65 __float2_t x_1o_x_0o, xl1_1o_xl1_0o, x_3o_x_2o, xl1_3o_xl1_2o;
66 __float2_t xh2_1o_xh2_0o, xl2_1o_xl2_0o, xh2_3o_xh2_2o, xl2_3o_xl2_2o;
67 __float2_t x_l1_10, x_l1_32, x_l2_10, x_l2_32, x_h2_10, x_h2_32;
68 __float2_t co10_si10, co20_si20, co30_si30, co11_si11, co21_si21, co31_si31;
69 __float2_t x_10, x_32, x_54, x_76, yt_10, yt_32, yt_54, yt_76;
71 /*----------------------------------------------------------------------*/
72 /* The stride is quartered with every iteration of the outer loop. It */
73 /* denotes the seperation between any two adjacent inputs to the butter */
74 /* -fly. This should start out at N/4, hence stride is initially set to */
75 /* N. For every stride, 6*stride twiddle factors are accessed. The */
76 /* "tw_offset" is the offset within the current twiddle factor sub- */
77 /* table. This is set to zero, at the start of the code and is used to */
78 /* obtain the appropriate sub-table twiddle pointer by offseting it */
79 /* with the base pointer "ptr_w". */
80 /*----------------------------------------------------------------------*/
81 radix = n_min;
82 stride = N;
83 tw_offset = 0;
84 fft_jmp = 6 * stride;
86 _nassert(stride > 4);
87 while (stride > radix)
88 {
89 /*-----------------------------------------------------------------*/
90 /* At the start of every iteration of the outer loop, "j" is set */
91 /* to zero, as "w" is pointing to the correct location within the */
92 /* twiddle factor array. For every iteration of the inner loop */
93 /* 6 * stride twiddle factors are accessed. For eg, */
94 /* */
95 /* #Iteration of outer loop # twiddle factors #times cycled */
96 /* 1 6 N/4 1 */
97 /* 2 6 N/16 4 */
98 /* ... */
99 /*-----------------------------------------------------------------*/
100 j = 0;
101 fft_jmp >>= 2;
103 /*-----------------------------------------------------------------*/
104 /* Set up offsets to access "N/4", "N/2", "3N/4" complex point or */
105 /* "N/2", "N", "3N/2" half word */
106 /*-----------------------------------------------------------------*/
107 h2 = stride >> 1;
108 l1 = stride;
109 l2 = stride + (stride >> 1);
111 /*-----------------------------------------------------------------*/
112 /* Reset "x" to point to the start of the input data array. */
113 /* "tw_offset" starts off at 0, and increments by "6 * stride" */
114 /* The stride quarters with every iteration of the outer loop */
115 /*-----------------------------------------------------------------*/
116 x = ptr_x;
117 w = ptr_w + tw_offset;
118 tw_offset += fft_jmp;
120 /*-----------------------------------------------------------------*/
121 /* The following loop iterates through the different butterflies, */
122 /* within a given stage. Recall that there are logN to base 4 */
123 /* stages. Certain butterflies share the twiddle factors. These */
124 /* are grouped together. On the very first stage there are no */
125 /* butterflies that share the twiddle factor, all N/4 butter- */
126 /* flies have different factors. On the next stage two sets of */
127 /* N/8 butterflies share the same twiddle factor. Hence after */
128 /* half the butterflies are performed, j the index into the */
129 /* factor array resets to 0, and the twiddle factors are reused. */
130 /* When this happens, the data pointer 'x' is incremented by the */
131 /* fft_jmp amount. */
132 /*-----------------------------------------------------------------*/
133 _nassert((int)(w) % 8 == 0);
134 _nassert((int)(x) % 8 == 0);
135 _nassert(h2 % 8 == 0);
136 _nassert(l1 % 8 == 0);
137 _nassert(l2 % 8 == 0);
138 _nassert(N >= 8);
140 for (i = 0; i < (N >> 3); i++) {
142 /*-------------------------------------------------------------*/
143 /* Read the first six twiddle factor values. This loop */
144 /* computes two radix 4 butterflies at a time. */
145 /* si10 = w[0] co10 = w[1] si20 = w[2] co20 = w[3] */
146 /* si30 = w[4] co30 = w[5] si11 = w[6] co11 = w[7] */
147 /* si21 = w[8] co21 = w[9] si31 = w[a] co31 = w[b] */
148 /*-------------------------------------------------------------*/
149 co10_si10 = _amem8_f2_const(&w[j]);
150 co20_si20 = _amem8_f2_const(&w[j+2]);
151 co30_si30 = _amem8_f2_const(&w[j+4]);
153 co11_si11 = _amem8_f2_const(&w[j+6]);
154 co21_si21 = _amem8_f2_const(&w[j+8]);
155 co31_si31 = _amem8_f2_const(&w[j+10]);
157 /*-------------------------------------------------------------*/
158 /* Read in the data elements for the eight inputs of two */
159 /* radix4 butterflies. */
160 /* x[0] x[1] x[2] x[3] */
161 /* x[h2+0] x[h2+1] x[h2+2] x[h2+3] */
162 /* x[l1+0] x[l1+1] x[l1+2] x[l1+3] */
163 /* x[l2+0] x[l2+1] x[l2+2] x[l2+3] */
164 /*-------------------------------------------------------------*/
165 x_10 = _amem8_f2(&x[0]);
166 x_32 = _amem8_f2(&x[2]);
167 x_l1_10 = _amem8_f2(&x[l1]);
168 x_l1_32 = _amem8_f2(&x[l1 + 2]);
169 x_l2_10 = _amem8_f2(&x[l2]);
170 x_l2_32 = _amem8_f2(&x[l2 + 2]);
171 x_h2_10 = _amem8_f2(&x[h2]);
172 x_h2_32 = _amem8_f2(&x[h2 + 2]);
174 /*-------------------------------------------------------------*/
175 /* xh0_0 = x[0] + x[l1]; xh1_0 = x[1] + x[l1+1] */
176 /* xh0_1 = x[2] + x[l1+2]; xh1_1 = x[3] + x[l1+3] */
177 /* xl0_0 = x[0] - x[l1]; xl1_0 = x[1] - x[l1+1] */
178 /* xl0_1 = x[2] - x[l1+2]; xl1_1 = x[3] - x[l1+3] */
179 /*-------------------------------------------------------------*/
180 xh1_0_xh0_0 = _daddsp(x_10, x_l1_10);
181 xl1_0_xl0_0 = _dsubsp(x_10, x_l1_10);
182 xh1_1_xh0_1 = _daddsp(x_32, x_l1_32);
183 xl1_1_xl0_1 = _dsubsp(x_32, x_l1_32);
185 /*-------------------------------------------------------------*/
186 /* xh20_0 = x[h2 ] + x[l2 ]; xh21_0 = x[h2+1] + x[l2+1] */
187 /* xh20_1 = x[h2+2] + x[l2+2]; xh21_1 = x[h2+3] + x[l2+3] */
188 /* xl20_0 = x[h2 ] - x[l2 ]; xl21_0 = x[h2+1] - x[l2+1] */
189 /* xl20_1 = x[h2+2] - x[l2+2]; xl21_1 = x[h2+3] - x[l2+3] */
190 /*-------------------------------------------------------------*/
191 xh21_0_xh20_0 = _daddsp(x_h2_10, x_l2_10);
192 xl21_0_xl20_0 = _dsubsp(x_h2_10, x_l2_10);
193 xh21_1_xh20_1 = _daddsp(x_h2_32, x_l2_32);
194 xl21_1_xl20_1 = _dsubsp(x_h2_32, x_l2_32);
196 /*-------------------------------------------------------------*/
197 /* x_0o = xh0_0 + xh20_0; x_1o = xh1_0 + xh21_0; */
198 /* x_2o = xh0_1 + xh20_1; x_3o = xh1_1 + xh21_1; */
199 /* xt0_0 = xh0_0 - xh20_0; yt0_0 = xh1_0 - xh21_0; */
200 /* xt0_1 = xh0_1 - xh20_1; yt0_1 = xh1_1 - xh21_1; */
201 /*-------------------------------------------------------------*/
202 x_1o_x_0o = _daddsp(xh1_0_xh0_0, xh21_0_xh20_0);
203 x_3o_x_2o = _daddsp(xh1_1_xh0_1, xh21_1_xh20_1);
205 yt0_0_xt0_0 = _dsubsp(xh1_0_xh0_0, xh21_0_xh20_0);
206 yt0_1_xt0_1 = _dsubsp(xh1_1_xh0_1, xh21_1_xh20_1);
208 /*-------------------------------------------------------------*/
209 /* xt1_0 = xl0_0 - xl21_0; yt2_0 = xl1_0 - xl20_0; */
210 /* xt1_1 = xl0_1 - xl21_1; yt2_1 = xl1_1 - xl20_1; */
211 /* xt2_0 = xl0_0 + xl21_0; yt1_0 = xl1_0 + xl20_0; */
212 /* xt2_1 = xl0_1 + xl21_1; yt1_1 = xl1_1 + xl20_1; */
213 /*-------------------------------------------------------------*/
214 xt1_0 = _lof2(xl1_0_xl0_0) - _hif2(xl21_0_xl20_0);
215 yt2_0 = _hif2(xl1_0_xl0_0) - _lof2(xl21_0_xl20_0);
216 xt1_1 = _lof2(xl1_1_xl0_1) - _hif2(xl21_1_xl20_1);
217 yt2_1 = _hif2(xl1_1_xl0_1) - _lof2(xl21_1_xl20_1);
218 xt2_0 = _lof2(xl1_0_xl0_0) + _hif2(xl21_0_xl20_0);
219 yt1_0 = _hif2(xl1_0_xl0_0) + _lof2(xl21_0_xl20_0);
220 xt2_1 = _lof2(xl1_1_xl0_1) + _hif2(xl21_1_xl20_1);
221 yt1_1 = _hif2(xl1_1_xl0_1) + _lof2(xl21_1_xl20_1);
223 yt1_0_xt1_0 = _ftof2(yt1_0, xt1_0);
224 yt1_1_xt1_1 = _ftof2(yt1_1, xt1_1);
225 yt2_0_xt2_0 = _ftof2(yt2_0, xt2_0);
226 yt2_1_xt2_1 = _ftof2(yt2_1, xt2_1);
228 /*-------------------------------------------------------------*/
229 /* x2[h2 ] = (si10 * yt1_0 + co10 * xt1_0) */
230 /* x2[h2+1] = (co10 * yt1_0 - si10 * xt1_0) */
231 /* x2[h2+2] = (si11 * yt1_1 + co11 * xt1_1) */
232 /* x2[h2+3] = (co11 * yt1_1 - si11 * xt1_1) */
233 /*-------------------------------------------------------------*/
234 xl1_1o_xl1_0o = _complex_mpysp(co10_si10, yt1_0_xt1_0);
235 xl1_3o_xl1_2o = _complex_mpysp(co11_si11, yt1_1_xt1_1);
237 /*-------------------------------------------------------------*/
238 /* x2[l1 ] = (si20 * yt0_0 + co20 * xt0_0) */
239 /* x2[l1+1] = (co20 * yt0_0 - si20 * xt0_0) */
240 /* x2[l1+2] = (si21 * yt0_1 + co21 * xt0_1) */
241 /* x2[l1+3] = (co21 * yt0_1 - si21 * xt0_1) */
242 /*-------------------------------------------------------------*/
243 xh2_1o_xh2_0o = _complex_mpysp(co20_si20, yt0_0_xt0_0);
244 xh2_3o_xh2_2o = _complex_mpysp(co21_si21, yt0_1_xt0_1);
246 /*-------------------------------------------------------------*/
247 /* x2[l2 ] = (si30 * yt2_0 + co30 * xt2_0) */
248 /* x2[l2+1] = (co30 * yt2_0 - si30 * xt2_0) */
249 /* x2[l2+2] = (si31 * yt2_1 + co31 * xt2_1) */
250 /* x2[l2+3] = (co31 * yt2_1 - si31 * xt2_1) */
251 /*-------------------------------------------------------------*/
252 xl2_1o_xl2_0o = _complex_mpysp(co30_si30, yt2_0_xt2_0);
253 xl2_3o_xl2_2o = _complex_mpysp(co31_si31, yt2_1_xt2_1);
255 /*-------------------------------------------------------------*/
256 /* Derive output pointers using the input pointer "x" */
257 /*-------------------------------------------------------------*/
258 x2 = (float*)_mvd((int)x);
260 /*-------------------------------------------------------------*/
261 /* Store eight outputs - four legs of each butterfly */
262 /*-------------------------------------------------------------*/
263 _amem8_f2(&x2[0]) = x_1o_x_0o;
264 _amem8_f2(&x2[2]) = x_3o_x_2o;
266 _amem8_f2(&x2[l1]) = xl1_1o_xl1_0o;
267 _amem8_f2(&x2[l1+2]) = xl1_3o_xl1_2o;
269 _amem8_f2(&x2[h2]) = xh2_1o_xh2_0o;
270 _amem8_f2(&x2[h2+2]) = xh2_3o_xh2_2o;
272 _amem8_f2(&x2[l2]) = xl2_1o_xl2_0o;
273 _amem8_f2(&x2[l2+2]) = xl2_3o_xl2_2o;
275 /*-------------------------------------------------------------*/
276 /* When the twiddle factors are not to be re-used, j is */
277 /* incremented by 12, to reflect the fact that 6 words are */
278 /* consumed in every iteration. The input data pointer */
279 /* increments by 4. Note that within a stage, the stride does */
280 /* not change and hence the offsets for the other three legs, */
281 /* 0, h2, l1, l2. */
282 /*-------------------------------------------------------------*/
283 j += 12;
284 x += 4;
285 predj = (j - fft_jmp);
286 if (!predj) x += fft_jmp;
287 if (!predj) j = 0;
288 }
289 stride >>= 2;
290 }
292 j = offset >> 2;
293 /*-----------------------------------------------------------------*/
294 /* The following code performs either a standard radix4 pass or a */
295 /* radix2 pass. Two pointers are used to access the input data. */
296 /* The input data is read "N/4" complex samples apart or "N/2" */
297 /* words apart using pointers "x0" and "x2". This produces out- */
298 /* puts that are 0, N/8, N/2, 3N/8 for radix 2. */
299 /*-----------------------------------------------------------------*/
300 y0 = ptr_y;
301 y2 = ptr_y + n_max;
302 x2 = ptr_x;
303 y1 = y0 + (n_max >> 1);
304 y3 = y2 + (n_max >> 1);
305 l1 = _norm(n_max) + 4;
306 one_over_nmax = _rcpsp((float)n_max);
307 scale = _ftof2(one_over_nmax, one_over_nmax);
309 if (radix == 4) {
310 for (i = 0; i < N; i += 4) {
311 /* reversal computation */
312 k = _bitr(j) >> l1;
313 j++;
315 /*-------------------------------------------------------------*/
316 /* Read in the input data. These are transformed as a radix 4. */
317 /*-------------------------------------------------------------*/
318 x_10 = _amem8_f2(&x2[0]);
319 x_32 = _amem8_f2(&x2[2]);
320 x_54 = _amem8_f2(&x2[4]);
321 x_76 = _amem8_f2(&x2[6]);
322 x2 += 8;
324 /*-------------------------------------------------------------*/
325 /* Perform radix4 style decomposition. */
326 /*-------------------------------------------------------------*/
327 xh1_0_xh0_0 = _daddsp(x_10, x_54);
328 xh1_1_xh0_1 = _daddsp(x_32, x_76);
329 xl1_0_xl0_0 = _dsubsp(x_10, x_54);
330 xl1_1_xl0_1 = _dsubsp(x_32, x_76);
332 yt_10 = _daddsp(xh1_0_xh0_0, xh1_1_xh0_1);
333 yt_54 = _dsubsp(xh1_0_xh0_0, xh1_1_xh0_1);
335 yt3 = _hif2(xl1_0_xl0_0) + _lof2(xl1_1_xl0_1);
336 yt2 = _lof2(xl1_0_xl0_0) - _hif2(xl1_1_xl0_1);
337 yt7 = _hif2(xl1_0_xl0_0) - _lof2(xl1_1_xl0_1);
338 yt6 = _lof2(xl1_0_xl0_0) + _hif2(xl1_1_xl0_1);
340 /*-------------------------------------------------------------*/
341 /* Points that are read from succesive locations map to y */
342 /* y[N/8], y[N/2], y[5N/8] in a radix2 scheme. */
343 /*-------------------------------------------------------------*/
344 _amem8_f2(&y0[k*2]) = _dmpysp(yt_10, scale);
345 _amem8_f2(&y1[k*2]) = _dmpysp(_ftof2(yt3, yt2), scale);
346 _amem8_f2(&y2[k*2]) = _dmpysp(yt_54, scale);
347 _amem8_f2(&y3[k*2]) = _dmpysp(_ftof2(yt7, yt6), scale);
348 }
349 }
350 if (radix == 2) {
351 for (i = 0; i < N; i += 4) {
352 /* reversal computation */
353 k = _bitr(j) >> l1;
354 j++;
356 /*-------------------------------------------------------------*/
357 /* Read in the input data. These are transformed as a radix 2. */
358 /*-------------------------------------------------------------*/
359 x_10 = _amem8_f2(&x2[0]);
360 x_32 = _amem8_f2(&x2[2]);
361 x_54 = _amem8_f2(&x2[4]);
362 x_76 = _amem8_f2(&x2[6]);
363 x2 += 8;
365 /*-------------------------------------------------------------*/
366 /* Perform radix2 style decomposition. */
367 /*-------------------------------------------------------------*/
368 yt_10 = _daddsp(x_10, x_32);
369 yt_32 = _daddsp(x_54, x_76);
370 yt_54 = _dsubsp(x_10, x_32);
371 yt_76 = _dsubsp(x_54, x_76);
373 /*-------------------------------------------------------------*/
374 /* Points that are read from succesive locations map to y */
375 /* y[N/8], y[N/2], y[5N/8] in a radix2 scheme. */
376 /*-------------------------------------------------------------*/
377 _amem8_f2(&y0[k*2]) = _dmpysp(yt_10, scale);
378 _amem8_f2(&y1[k*2]) = _dmpysp(yt_32, scale);
379 _amem8_f2(&y2[k*2]) = _dmpysp(yt_54, scale);
380 _amem8_f2(&y3[k*2]) = _dmpysp(yt_76, scale);
381 }
382 }
383 }
384 #else
385 void DSPF_sp_ifftSPxSP_opt (int N, float *ptr_x, float *ptr_w, float *ptr_y,
386 unsigned char *brev, int n_min, int offset, int n_max)
387 {
389 int i, j, k, l1, l2, h2, predj, tw_offset, stride, fft_jmp, radix;
391 float one_over_nmax;
392 float yt2, yt3, yt6, yt7;
393 float xt1_0, yt2_0, xt1_1, yt2_1, xt2_0, yt1_0, xt2_1, yt1_1;
394 float * restrict x, * restrict x2, * restrict w;
395 float * restrict y0, * restrict y1, * restrict y2, * restrict y3;
397 __float2_t scale;
398 __float2_t xh20_0_xh21_0, xh20_1_xh21_1, xh0_0_xh1_0, xh0_1_xh1_1;
399 __float2_t xl20_0_xl21_0, xl20_1_xl21_1, xl0_0_xl1_0, xl0_1_xl1_1;
400 __float2_t xt0_0_yt0_0, xt0_1_yt0_1, xt1_0_yt1_0;
401 __float2_t xt1_1_yt1_1, xt2_0_yt2_0, xt2_1_yt2_1;
402 __float2_t x_0o_x_1o, xl1_0o_xl1_1o, x_2o_x_3o, xl1_2o_xl1_3o;
403 __float2_t xh2_0o_xh2_1o, xl2_0o_xl2_1o, xh2_2o_xh2_3o, xl2_2o_xl2_3o;
404 __float2_t x_l1_01, x_l1_23, x_l2_01, x_l2_23, x_h2_01, x_h2_23;
405 __float2_t co10_si10, co20_si20, co30_si30, co11_si11, co21_si21, co31_si31;
406 __float2_t x_01, x_23, x_45, x_67, yt_01, yt_23, yt_45, yt_67;
408 /*----------------------------------------------------------------------*/
409 /* The stride is quartered with every iteration of the outer loop. It */
410 /* denotes the seperation between any two adjacent inputs to the butter */
411 /* -fly. This should start out at N/4, hence stride is initially set to */
412 /* N. For every stride, 6*stride twiddle factors are accessed. The */
413 /* "tw_offset" is the offset within the current twiddle factor sub- */
414 /* table. This is set to zero, at the start of the code and is used to */
415 /* obtain the appropriate sub-table twiddle pointer by offseting it */
416 /* with the base pointer "ptr_w". */
417 /*----------------------------------------------------------------------*/
418 radix = n_min;
419 stride = N;
420 tw_offset = 0;
421 fft_jmp = 6 * stride;
423 _nassert(stride > 4);
424 while (stride > radix)
425 {
426 /*-----------------------------------------------------------------*/
427 /* At the start of every iteration of the outer loop, "j" is set */
428 /* to zero, as "w" is pointing to the correct location within the */
429 /* twiddle factor array. For every iteration of the inner loop */
430 /* 6 * stride twiddle factors are accessed. For eg, */
431 /* */
432 /* #Iteration of outer loop # twiddle factors #times cycled */
433 /* 1 6 N/4 1 */
434 /* 2 6 N/16 4 */
435 /* ... */
436 /*-----------------------------------------------------------------*/
437 j = 0;
438 fft_jmp >>= 2;
440 /*-----------------------------------------------------------------*/
441 /* Set up offsets to access "N/4", "N/2", "3N/4" complex point or */
442 /* "N/2", "N", "3N/2" half word */
443 /*-----------------------------------------------------------------*/
444 h2 = stride >> 1;
445 l1 = stride;
446 l2 = stride + (stride >> 1);
448 /*-----------------------------------------------------------------*/
449 /* Reset "x" to point to the start of the input data array. */
450 /* "tw_offset" starts off at 0, and increments by "6 * stride" */
451 /* The stride quarters with every iteration of the outer loop */
452 /*-----------------------------------------------------------------*/
453 x = ptr_x;
454 w = ptr_w + tw_offset;
455 tw_offset += fft_jmp;
457 /*-----------------------------------------------------------------*/
458 /* The following loop iterates through the different butterflies, */
459 /* within a given stage. Recall that there are logN to base 4 */
460 /* stages. Certain butterflies share the twiddle factors. These */
461 /* are grouped together. On the very first stage there are no */
462 /* butterflies that share the twiddle factor, all N/4 butter- */
463 /* flies have different factors. On the next stage two sets of */
464 /* N/8 butterflies share the same twiddle factor. Hence after */
465 /* half the butterflies are performed, j the index into the */
466 /* factor array resets to 0, and the twiddle factors are reused. */
467 /* When this happens, the data pointer 'x' is incremented by the */
468 /* fft_jmp amount. */
469 /*-----------------------------------------------------------------*/
470 _nassert((int)(w) % 8 == 0);
471 _nassert((int)(x) % 8 == 0);
472 _nassert(h2 % 8 == 0);
473 _nassert(l1 % 8 == 0);
474 _nassert(l2 % 8 == 0);
475 _nassert(N >= 8);
477 for (i = 0; i < (N >> 3); i++) {
479 /*-------------------------------------------------------------*/
480 /* Read the first six twiddle factor values. This loop */
481 /* computes two radix 4 butterflies at a time. */
482 /* si10 = w[0] co10 = w[1] si20 = w[2] co20 = w[3] */
483 /* si30 = w[4] co30 = w[5] si11 = w[6] co11 = w[7] */
484 /* si21 = w[8] co21 = w[9] si31 = w[a] co31 = w[b] */
485 /*-------------------------------------------------------------*/
486 co10_si10 = _amem8_f2_const(&w[j]);
487 co20_si20 = _amem8_f2_const(&w[j+2]);
488 co30_si30 = _amem8_f2_const(&w[j+4]);
490 co11_si11 = _amem8_f2_const(&w[j+6]);
491 co21_si21 = _amem8_f2_const(&w[j+8]);
492 co31_si31 = _amem8_f2_const(&w[j+10]);
494 /*-------------------------------------------------------------*/
495 /* Read in the data elements for the eight inputs of two */
496 /* radix4 butterflies. */
497 /* x[0] x[1] x[2] x[3] */
498 /* x[h2+0] x[h2+1] x[h2+2] x[h2+3] */
499 /* x[l1+0] x[l1+1] x[l1+2] x[l1+3] */
500 /* x[l2+0] x[l2+1] x[l2+2] x[l2+3] */
501 /*-------------------------------------------------------------*/
502 x_01 = _amem8_f2(&x[0]);
503 x_23 = _amem8_f2(&x[2]);
504 x_l1_01 = _amem8_f2(&x[l1]);
505 x_l1_23 = _amem8_f2(&x[l1 + 2]);
506 x_l2_01 = _amem8_f2(&x[l2]);
507 x_l2_23 = _amem8_f2(&x[l2 + 2]);
508 x_h2_01 = _amem8_f2(&x[h2]);
509 x_h2_23 = _amem8_f2(&x[h2 + 2]);
511 /*-------------------------------------------------------------*/
512 /* xh0_0 = x[0] + x[l1]; xh1_0 = x[1] + x[l1+1] */
513 /* xh0_1 = x[2] + x[l1+2]; xh1_1 = x[3] + x[l1+3] */
514 /* xl0_0 = x[0] - x[l1]; xl1_0 = x[1] - x[l1+1] */
515 /* xl0_1 = x[2] - x[l1+2]; xl1_1 = x[3] - x[l1+3] */
516 /*-------------------------------------------------------------*/
517 xh0_0_xh1_0 = _daddsp(x_01, x_l1_01);
518 xl0_0_xl1_0 = _dsubsp(x_01, x_l1_01);
519 xh0_1_xh1_1 = _daddsp(x_23, x_l1_23);
520 xl0_1_xl1_1 = _dsubsp(x_23, x_l1_23);
522 /*-------------------------------------------------------------*/
523 /* xh20_0 = x[h2 ] + x[l2 ]; xh21_0 = x[h2+1] + x[l2+1] */
524 /* xh20_1 = x[h2+2] + x[l2+2]; xh21_1 = x[h2+3] + x[l2+3] */
525 /* xl20_0 = x[h2 ] - x[l2 ]; xl21_0 = x[h2+1] - x[l2+1] */
526 /* xl20_1 = x[h2+2] - x[l2+2]; xl21_1 = x[h2+3] - x[l2+3] */
527 /*-------------------------------------------------------------*/
528 xh20_0_xh21_0 = _daddsp(x_h2_01, x_l2_01);
529 xl20_0_xl21_0 = _dsubsp(x_h2_01, x_l2_01);
530 xh20_1_xh21_1 = _daddsp(x_h2_23, x_l2_23);
531 xl20_1_xl21_1 = _dsubsp(x_h2_23, x_l2_23);
533 /*-------------------------------------------------------------*/
534 /* x_0o = xh0_0 + xh20_0; x_1o = xh1_0 + xh21_0; */
535 /* x_2o = xh0_1 + xh20_1; x_3o = xh1_1 + xh21_1; */
536 /* xt0_0 = xh0_0 - xh20_0; yt0_0 = xh1_0 - xh21_0; */
537 /* xt0_1 = xh0_1 - xh20_1; yt0_1 = xh1_1 - xh21_1; */
538 /*-------------------------------------------------------------*/
539 x_0o_x_1o = _daddsp(xh0_0_xh1_0, xh20_0_xh21_0);
540 x_2o_x_3o = _daddsp(xh0_1_xh1_1, xh20_1_xh21_1);
542 xt0_0_yt0_0 = _dsubsp(xh0_0_xh1_0, xh20_0_xh21_0);
543 xt0_1_yt0_1 = _dsubsp(xh0_1_xh1_1, xh20_1_xh21_1);
545 /*-------------------------------------------------------------*/
546 /* xt1_0 = xl0_0 - xl21_0; yt2_0 = xl1_0 - xl20_0; */
547 /* xt1_1 = xl0_1 - xl21_1; yt2_1 = xl1_1 - xl20_1; */
548 /* xt2_0 = xl0_0 + xl21_0; yt1_0 = xl1_0 + xl20_0; */
549 /* xt2_1 = xl0_1 + xl21_1; yt1_1 = xl1_1 + xl20_1; */
550 /*-------------------------------------------------------------*/
551 yt1_0 = _lof2(xl0_0_xl1_0) + _hif2(xl20_0_xl21_0);
552 xt2_0 = _hif2(xl0_0_xl1_0) + _lof2(xl20_0_xl21_0);
553 yt1_1 = _lof2(xl0_1_xl1_1) + _hif2(xl20_1_xl21_1);
554 xt2_1 = _hif2(xl0_1_xl1_1) + _lof2(xl20_1_xl21_1);
555 yt2_0 = _lof2(xl0_0_xl1_0) - _hif2(xl20_0_xl21_0);
556 xt1_0 = _hif2(xl0_0_xl1_0) - _lof2(xl20_0_xl21_0);
557 yt2_1 = _lof2(xl0_1_xl1_1) - _hif2(xl20_1_xl21_1);
558 xt1_1 = _hif2(xl0_1_xl1_1) - _lof2(xl20_1_xl21_1);
560 xt1_0_yt1_0 = _ftof2(xt1_0, yt1_0);
561 xt1_1_yt1_1 = _ftof2(xt1_1, yt1_1);
562 xt2_0_yt2_0 = _ftof2(xt2_0, yt2_0);
563 xt2_1_yt2_1 = _ftof2(xt2_1, yt2_1);
565 /*-------------------------------------------------------------*/
566 /* x2[h2 ] = (si10 * yt1_0 + co10 * xt1_0) */
567 /* x2[h2+1] = (co10 * yt1_0 - si10 * xt1_0) */
568 /* x2[h2+2] = (si11 * yt1_1 + co11 * xt1_1) */
569 /* x2[h2+3] = (co11 * yt1_1 - si11 * xt1_1) */
570 /*-------------------------------------------------------------*/
571 xl1_0o_xl1_1o = _complex_mpysp(co10_si10, xt1_0_yt1_0);
572 xl1_2o_xl1_3o = _complex_mpysp(co11_si11, xt1_1_yt1_1);
574 /*-------------------------------------------------------------*/
575 /* x2[l1 ] = (si20 * yt0_0 + co20 * xt0_0) */
576 /* x2[l1+1] = (co20 * yt0_0 - si20 * xt0_0) */
577 /* x2[l1+2] = (si21 * yt0_1 + co21 * xt0_1) */
578 /* x2[l1+3] = (co21 * yt0_1 - si21 * xt0_1) */
579 /*-------------------------------------------------------------*/
580 xh2_0o_xh2_1o = _complex_mpysp(co20_si20, xt0_0_yt0_0);
581 xh2_2o_xh2_3o = _complex_mpysp(co21_si21, xt0_1_yt0_1);
583 /*-------------------------------------------------------------*/
584 /* x2[l2 ] = (si30 * yt2_0 + co30 * xt2_0) */
585 /* x2[l2+1] = (co30 * yt2_0 - si30 * xt2_0) */
586 /* x2[l2+2] = (si31 * yt2_1 + co31 * xt2_1) */
587 /* x2[l2+3] = (co31 * yt2_1 - si31 * xt2_1) */
588 /*-------------------------------------------------------------*/
589 xl2_0o_xl2_1o = _complex_mpysp(co30_si30, xt2_0_yt2_0);
590 xl2_2o_xl2_3o = _complex_mpysp(co31_si31, xt2_1_yt2_1);
592 /*-------------------------------------------------------------*/
593 /* Derive output pointers using the input pointer "x" */
594 /*-------------------------------------------------------------*/
595 x2 = (float*)_mvd((int)x);
597 /*-------------------------------------------------------------*/
598 /* Store eight outputs - four legs of each butterfly */
599 /*-------------------------------------------------------------*/
600 _amem8_f2(&x2[0]) = x_0o_x_1o;
601 _amem8_f2(&x2[2]) = x_2o_x_3o;
603 _amem8_f2(&x2[l1]) = xl1_0o_xl1_1o;
604 _amem8_f2(&x2[l1+2]) = xl1_2o_xl1_3o;
606 _amem8_f2(&x2[h2]) = xh2_0o_xh2_1o;
607 _amem8_f2(&x2[h2+2]) = xh2_2o_xh2_3o;
609 _amem8_f2(&x2[l2]) = xl2_0o_xl2_1o;
610 _amem8_f2(&x2[l2+2]) = xl2_2o_xl2_3o;
612 /*-------------------------------------------------------------*/
613 /* When the twiddle factors are not to be re-used, j is */
614 /* incremented by 12, to reflect the fact that 6 words are */
615 /* consumed in every iteration. The input data pointer */
616 /* increments by 4. Note that within a stage, the stride does */
617 /* not change and hence the offsets for the other three legs, */
618 /* 0, h2, l1, l2. */
619 /*-------------------------------------------------------------*/
620 j += 12;
621 x += 4;
622 predj = (j - fft_jmp);
623 if (!predj) x += fft_jmp;
624 if (!predj) j = 0;
625 }
626 stride >>= 2;
627 }
629 j = offset >> 2;
630 /*-----------------------------------------------------------------*/
631 /* The following code performs either a standard radix4 pass or a */
632 /* radix2 pass. Two pointers are used to access the input data. */
633 /* The input data is read "N/4" complex samples apart or "N/2" */
634 /* words apart using pointers "x0" and "x2". This produces out- */
635 /* puts that are 0, N/8, N/2, 3N/8 for radix 2. */
636 /*-----------------------------------------------------------------*/
637 y0 = ptr_y;
638 y2 = ptr_y + n_max;
639 x2 = ptr_x;
640 y1 = y0 + (n_max >> 1);
641 y3 = y2 + (n_max >> 1);
642 l1 = _norm(n_max) + 4;
643 one_over_nmax = _rcpsp((float)n_max);
644 scale = _ftof2(one_over_nmax, one_over_nmax);
646 if (radix == 4) {
647 for (i = 0; i < N; i += 4) {
648 /* reversal computation */
649 k = _bitr(j) >> l1;
650 j++;
652 /*-------------------------------------------------------------*/
653 /* Read in the input data. These are transformed as a radix 4. */
654 /*-------------------------------------------------------------*/
655 x_01 = _amem8_f2(&x2[0]);
656 x_23 = _amem8_f2(&x2[2]);
657 x_45 = _amem8_f2(&x2[4]);
658 x_67 = _amem8_f2(&x2[6]);
659 x2 += 8;
661 /*-------------------------------------------------------------*/
662 /* Perform radix4 style decomposition. */
663 /*-------------------------------------------------------------*/
664 xh0_0_xh1_0 = _daddsp(x_01, x_45);
665 xh0_1_xh1_1 = _daddsp(x_23, x_67);
666 xl0_0_xl1_0 = _dsubsp(x_01, x_45);
667 xl0_1_xl1_1 = _dsubsp(x_23, x_67);
669 yt_01 = _daddsp(xh0_0_xh1_0, xh0_1_xh1_1);
670 yt_45 = _dsubsp(xh0_0_xh1_0, xh0_1_xh1_1);
672 yt3 = _lof2(xl0_0_xl1_0) + _hif2(xl0_1_xl1_1);
673 yt2 = _hif2(xl0_0_xl1_0) - _lof2(xl0_1_xl1_1);
674 yt7 = _lof2(xl0_0_xl1_0) - _hif2(xl0_1_xl1_1);
675 yt6 = _hif2(xl0_0_xl1_0) + _lof2(xl0_1_xl1_1);
677 /*-------------------------------------------------------------*/
678 /* Points that are read from succesive locations map to y */
679 /* y[N/8], y[N/2], y[5N/8] in a radix2 scheme. */
680 /*-------------------------------------------------------------*/
681 _amem8_f2(&y0[k*2]) = _dmpysp(yt_01, scale);
682 _amem8_f2(&y1[k*2]) = _dmpysp(_ftof2(yt2, yt3), scale);
683 _amem8_f2(&y2[k*2]) = _dmpysp(yt_45, scale);
684 _amem8_f2(&y3[k*2]) = _dmpysp(_ftof2(yt6, yt7), scale);
685 }
686 }
687 if (radix == 2) {
688 for (i = 0; i < N; i += 4) {
689 /* reversal computation */
690 k = _bitr(j) >> l1;
691 j++;
693 /*-------------------------------------------------------------*/
694 /* Read in the input data. These are transformed as a radix 2. */
695 /*-------------------------------------------------------------*/
696 x_01 = _amem8_f2(&x2[0]);
697 x_23 = _amem8_f2(&x2[2]);
698 x_45 = _amem8_f2(&x2[4]);
699 x_67 = _amem8_f2(&x2[6]);
700 x2 += 8;
702 /*-------------------------------------------------------------*/
703 /* Perform radix2 style decomposition. */
704 /*-------------------------------------------------------------*/
705 yt_01 = _daddsp(x_01, x_23);
706 yt_23 = _daddsp(x_45, x_67);
707 yt_45 = _dsubsp(x_01, x_23);
708 yt_67 = _dsubsp(x_45, x_67);
710 /*-------------------------------------------------------------*/
711 /* Points that are read from succesive locations map to y */
712 /* y[N/8], y[N/2], y[5N/8] in a radix2 scheme. */
713 /*-------------------------------------------------------------*/
714 _amem8_f2(&y0[k*2]) = _dmpysp(yt_01, scale);
715 _amem8_f2(&y1[k*2]) = _dmpysp(yt_23, scale);
716 _amem8_f2(&y2[k*2]) = _dmpysp(yt_45, scale);
717 _amem8_f2(&y3[k*2]) = _dmpysp(yt_67, scale);
718 }
719 }
720 }
721 #endif
722 /* ======================================================================= */
723 /* End of file: DSPF_sp_ifftSPxSP_opt.c */
724 /* ----------------------------------------------------------------------- */
725 /* Copyright (c) 2011 Texas Instruments, Incorporated. */
726 /* All Rights Reserved. */
727 /* ======================================================================= */