]> Gitweb @ Texas Instruments - Open Source Git Repositories - git.TI.com/gitweb - ep-processor-libraries/dsplib.git/blob - ti/dsplib/src/DSP_fft16x16/c64P/DSP_fft16x16_i.c
DSPLIB: optimized signal processing functions for TI DSPs
[ep-processor-libraries/dsplib.git] / ti / dsplib / src / DSP_fft16x16 / c64P / DSP_fft16x16_i.c
1 /* ======================================================================= */
2 /*  TEXAS INSTRUMENTS, INC.                                                */
3 /*                                                                         */
4 /*  DSPLIB  DSP Signal Processing Library                                  */
5 /*                                                                         */
6 /*  This library contains proprietary intellectual property of Texas       */
7 /*  Instruments, Inc.  The library and its source code are protected by    */
8 /*  various copyrights, and portions may also be protected by patents or   */
9 /*  other legal protections.                                               */
10 /*                                                                         */
11 /*  This software is licensed for use with Texas Instruments TMS320        */
12 /*  family DSPs.  This license was provided to you prior to installing     */
13 /*  the software.  You may review this license by consulting the file      */
14 /*  TI_license.PDF which accompanies the files in this library.            */
15 /*                                                                         */
16 /* ----------------------------------------------------------------------- */
17 /*                                                                         */
18 /*  TEXAS INSTRUMENTS, INC.                                                */
19 /*                                                                         */
20 /*  NAME                                                                   */
21 /*      DSP_fft16x16 -- fft16x16                                           */
22 /*                                                                         */
23 /*      USAGE                                                              */
24 /*           This routine is C-callable and can be called as:              */
25 /*                                                                         */
26 /*          void DSP_fft16x16_i (                                          */
27 /*              const short * ptr_w,                                       */
28 /*              int npoints,                                               */
29 /*              short * ptr_x,                                             */
30 /*              short * ptr_y                                              */
31 /*          );                                                             */
32 /*                                                                         */
33 /*            ptr_w   =  input twiddle factors                             */
34 /*            npoints =  number of points                                  */
35 /*            ptr_x   =  transformed data reversed                         */
36 /*            ptr_y   =  linear transformed data                           */
37 /*                                                                         */
38 /*                                                                         */
39 /*  DESCRIPTION                                                            */
40 /*    The following code performs a mixed radix FFT for "npoints" which    */
41 /*    is either a multiple of 4 or 2. It uses logN4 - 1 stages of radix4   */
42 /*    transform and performs either a radix2 or radix4 transform on the    */
43 /*    last stage depending on "npoints". If "npoints" is a multiple of 4,  */
44 /*    then this last stage is also a radix4 transform, otherwise it is a   */
45 /*    radix2 transform.                                                    */
46 /*                                                                         */
47 /*                                                                         */
48 /* int gen_twiddle_fft16x16(short *w, int n)                               */
49 /*                                                                         */
50 /*    int i, j, k;                                                         */
51 /*     double M = 32767.5;                                                 */
52 /*                                                                         */
53 /*    for (j = 1, k = 0; j < n >> 2; j = j << 2)                           */
54 /*    {                                                                    */
55 /*        for (i = 0; i < n >> 2; i += j << 1)                             */
56 /*        {                                                                */
57 /*                                                                         */
58 /*          w[k + 11] = d2s(M * cos(6.0 * PI * (i + j) / n));              */
59 /*          w[k + 10] = d2s(M * sin(6.0 * PI * (i + j) / n));              */
60 /*          w[k +  9] = d2s(M * cos(6.0 * PI * (i    ) / n));              */
61 /*          w[k +  8] = d2s(M * sin(6.0 * PI * (i    ) / n));              */
62 /*                                                                         */
63 /*          w[k +  7] = d2s(M * cos(4.0 * PI * (i + j) / n));              */
64 /*          w[k +  6] = d2s(M * sin(4.0 * PI * (i + j) / n));              */
65 /*          w[k +  5] = d2s(M * cos(4.0 * PI * (i    ) / n));              */
66 /*          w[k +  4] = d2s(M * sin(4.0 * PI * (i    ) / n));              */
67 /*                                                                         */
68 /*          w[k +  3] = d2s(M * cos(2.0 * PI * (i + j) / n));              */
69 /*          w[k +  2] = d2s(M * sin(2.0 * PI * (i + j) / n));              */
70 /*          w[k +  1] = d2s(M * cos(2.0 * PI * (i    ) / n));              */
71 /*          w[k +  0] = d2s(M * sin(2.0 * PI * (i    ) / n));              */
72 /*                                                                         */
73 /*          k += 12;                                                       */
74 /*                                                                         */
75 /*                                                                         */
76 /*      }                                                                  */
77 /*    }                                                                    */
78 /*                                                                         */
79 /*    return k;                                                            */
80 /*                                                                         */
81 /*                                                                         */
82 /*  ASSUMPTIONS                                                            */
83 /*      This code works for  both "npoints" a multiple of 2 or 4.          */
84 /*      The arrays 'x[]', 'y[]', and 'w[]' all must be aligned on a        */
85 /*      double-word boundary for the "optimized" implementations.          */
86 /*                                                                         */
87 /*      The input and output data are complex, with the real/imaginary     */
88 /*      components stored in adjacent locations in the array.  The real    */
89 /*      components are stored at even array indices, and the imaginary     */
90 /*      components are stored at odd array indices.                        */
91 /*                                                                         */
92 /*  TECHNIQUES                                                             */
93 /*      The following C code represents an implementation of the Cooley    */
94 /*      Tukey radix 4 DIF FFT. It accepts the inputs in normal order and   */
95 /*      produces the outputs in digit reversed order. The natural C code   */
96 /*      shown in this file on the other hand, accepts the inputs in nor-   */
97 /*      mal order and produces the outputs in normal order.                */
98 /*                                                                         */
99 /*      Several transformations have been applied to the original Cooley   */
100 /*      Tukey code to produce the natural C code description shown here.   */
101 /*      In order to understand these it would first be educational to      */
102 /*      understand some of the issues involved in the conventional Cooley  */
103 /*      Tukey FFT code.                                                    */
104 /*                                                                         */
105 /*      void radix4(int n, short x[], short wn[])                          */
106 /*      {                                                                  */
107 /*          int    n1,  n2,  ie,   ia1,  ia2, ia3;                         */
108 /*          int    i0,  i1,  i2,    i3,    i, j,     k;                    */
109 /*          short  co1, co2, co3,  si1,  si2, si3;                         */
110 /*          short  xt0, yt0, xt1,  yt1,  xt2, yt2;                         */
111 /*          short  xh0, xh1, xh20, xh21, xl0, xl1,xl20,xl21;               */
112 /*                                                                         */
113 /*          n2 = n;                                                        */
114 /*          ie = 1;                                                        */
115 /*          for (k = n; k > 1; k >>= 2)                                    */
116 /*          {                                                              */
117 /*              n1 = n2;                                                   */
118 /*              n2 >>= 2;                                                  */
119 /*              ia1 = 0;                                                   */
120 /*                                                                         */
121 /*              for (j = 0; j < n2; j++)                                   */
122 /*              {                                                          */
123 /*                   ia2 = ia1 + ia1;                                      */
124 /*                   ia3 = ia2 + ia1;                                      */
125 /*                                                                         */
126 /*                   co1 = wn[2 * ia1    ];                                */
127 /*                   si1 = wn[2 * ia1 + 1];                                */
128 /*                   co2 = wn[2 * ia2    ];                                */
129 /*                   si2 = wn[2 * ia2 + 1];                                */
130 /*                   co3 = wn[2 * ia3    ];                                */
131 /*                   si3 = wn[2 * ia3 + 1];                                */
132 /*                   ia1 = ia1 + ie;                                       */
133 /*                                                                         */
134 /*                   for (i0 = j; i0< n; i0 += n1)                         */
135 /*                   {                                                     */
136 /*                       i1 = i0 + n2;                                     */
137 /*                       i2 = i1 + n2;                                     */
138 /*                       i3 = i2 + n2;                                     */
139 /*                                                                         */
140 /*                                                                         */
141 /*                       xh0  = x[2 * i0    ] + x[2 * i2    ];             */
142 /*                       xh1  = x[2 * i0 + 1] + x[2 * i2 + 1];             */
143 /*                       xl0  = x[2 * i0    ] - x[2 * i2    ];             */
144 /*                       xl1  = x[2 * i0 + 1] - x[2 * i2 + 1];             */
145 /*                                                                         */
146 /*                       xh20 = x[2 * i1    ] + x[2 * i3    ];             */
147 /*                       xh21 = x[2 * i1 + 1] + x[2 * i3 + 1];             */
148 /*                       xl20 = x[2 * i1    ] - x[2 * i3    ];             */
149 /*                       xl21 = x[2 * i1 + 1] - x[2 * i3 + 1];             */
150 /*                                                                         */
151 /*                       x[2 * i0    ] = xh0 + xh20;                       */
152 /*                       x[2 * i0 + 1] = xh1 + xh21;                       */
153 /*                                                                         */
154 /*                       xt0  = xh0 - xh20;                                */
155 /*                       yt0  = xh1 - xh21;                                */
156 /*                       xt1  = xl0 + xl21;                                */
157 /*                       yt2  = xl1 + xl20;                                */
158 /*                       xt2  = xl0 - xl21;                                */
159 /*                       yt1  = xl1 - xl20;                                */
160 /*                                                                         */
161 /*                       x[2 * i1    ] = (xt1 * co1 + yt1 * si1) >> 15;    */
162 /*                       x[2 * i1 + 1] = (yt1 * co1 - xt1 * si1) >> 15;    */
163 /*                       x[2 * i2    ] = (xt0 * co2 + yt0 * si2) >> 15;    */
164 /*                       x[2 * i2 + 1] = (yt0 * co2 - xt0 * si2) >> 15;    */
165 /*                       x[2 * i3    ] = (xt2 * co3 + yt2 * si3) >> 15;    */
166 /*                       x[2 * i3 + 1] = (yt2 * co3 - xt2 * si3) >> 15;    */
167 /*                   }                                                     */
168 /*             }                                                           */
169 /*                                                                         */
170 /*             ie <<= 2;                                                   */
171 /*         }                                                               */
172 /*     }                                                                   */
173 /*                                                                         */
174 /*      The conventional Cooley Tukey FFT, is written using three loops.   */
175 /*      The outermost loop "k" cycles through the stages. There are log    */
176 /*      N to the base 4 stages in all. The loop "j" cycles through the     */
177 /*      groups of butterflies with different twiddle factors, loop "i"     */
178 /*      reuses the twiddle factors for the different butterflies within    */
179 /*      a stage. It is interesting to note the following:                  */
180 /*                                                                         */
181 /* ----------------------------------------------------------------------- */
182 /*      Stage#     #Groups     # Butterflies with common     #Groups*Bflys */
183 /*                               twiddle factors                           */
184 /* ----------------------------------------------------------------------- */
185 /*       1         N/4          1                            N/4           */
186 /*       2         N/16         4                            N/4           */
187 /*       ..                                                                */
188 /*       logN      1            N/4                          N/4           */
189 /* ----------------------------------------------------------------------- */
190 /*                                                                         */
191 /*      The following statements can be made based on above observations:  */
192 /*                                                                         */
193 /*      a) Inner loop "i0" iterates a variable number of times. In         */
194 /*      particular the number of iterations quadruples every time from     */
195 /*      1..N/4. Hence software pipelining a loop that iterates a variable  */
196 /*      number of times is not profitable.                                 */
197 /*                                                                         */
198 /*      b) Outer loop "j" iterates a variable number of times as well.     */
199 /*      However the number of iterations is quartered every time from      */
200 /*      N/4 ..1. Hence the behaviour in (a) and (b) are exactly opposite   */
201 /*      to each other.                                                     */
202 /*                                                                         */
203 /*      c) If the two loops "i" and "j" are coalesced together then they   */
204 /*      will iterate for a fixed number of times namely N/4. This allows   */
205 /*      us to combine the "i" and "j" loops into 1 loop. Optimized impl-   */
206 /*      ementations will make use of this fact.                            */
207 /*                                                                         */
208 /*      In addition the Cooley Tukey FFT accesses three twiddle factors    */
209 /*      per iteration of the inner loop, as the butterflies that re-use    */
210 /*      twiddle factors are lumped together. This leads to accessing the   */
211 /*      twiddle factor array at three points each separated by "ie". Note  */
212 /*      that "ie" is initially 1, and is quadrupled with every iteration.  */
213 /*      Therefore these three twiddle factors are not even contiguous in   */
214 /*      the array.                                                         */
215 /*                                                                         */
216 /*      In order to vectorize the FFT, it is desirable to access twiddle   */
217 /*      factor array using double word wide loads and fetch the twiddle    */
218 /*      factors needed. In order to do this a modified twiddle factor      */
219 /*      array is created, in which the factors WN/4, WN/2, W3N/4 are       */
220 /*      arranged to be contiguous. This eliminates the separation between  */
221 /*      twiddle factors within a butterfly. However this implies that as   */
222 /*      the loop is traversed from one stage to another, that we maintain  */
223 /*      a redundant version of the twiddle factor array. Hence the size    */
224 /*      of the twiddle factor array increases as compared to the normal    */
225 /*      Cooley Tukey FFT.  The modified twiddle factor array is of size    */
226 /*      "2 * N" where the conventional Cooley Tukey FFT is of size"3N/4"   */
227 /*      where N is the number of complex points to be transformed. The     */
228 /*      routine that generates the modified twiddle factor array was       */
229 /*      presented earlier. With the above transformation of the FFT,       */
230 /*      both the input data and the twiddle factor array can be accessed   */
231 /*      using double-word wide loads to enable packed data processing.     */
232 /*                                                                         */
233 /*      The final stage is optimised to remove the multiplication as       */
234 /*      w0 = 1.  This stage also performs digit reversal on the data,      */
235 /*      so the final output is in natural order.                           */
236 /*                                                                         */
237 /*      The fft() code shown here performs the bulk of the computation     */
238 /*      in place. However, because digit-reversal cannot be performed      */
239 /*      in-place, the final result is written to a separate array, y[].    */
240 /*                                                                         */
241 /*      The actual twiddle factors for the FFT are cosine, - sine. The     */
242 /*      twiddle factors stored in the table are cosine and sine, hence     */
243 /*      the sign of the "sine" term is comprehended during multipli-       */
244 /*      cation as shown above.                                             */
245 /*                                                                         */
246 /*  MEMORY NOTE                                                            */
247 /*      The optimized implementations are written for LITTLE ENDIAN.       */
248 /*                                                                         */
249 /* Copyright (C) 2011 Texas Instruments Incorporated - http://www.ti.com/  */ 
250 /*                                                                         */
251 /*                                                                         */
252 /*  Redistribution and use in source and binary forms, with or without     */
253 /*  modification, are permitted provided that the following conditions     */
254 /*  are met:                                                               */
255 /*                                                                         */
256 /*    Redistributions of source code must retain the above copyright       */
257 /*    notice, this list of conditions and the following disclaimer.        */
258 /*                                                                         */
259 /*    Redistributions in binary form must reproduce the above copyright    */
260 /*    notice, this list of conditions and the following disclaimer in the  */
261 /*    documentation and/or other materials provided with the               */
262 /*    distribution.                                                        */
263 /*                                                                         */
264 /*    Neither the name of Texas Instruments Incorporated nor the names of  */
265 /*    its contributors may be used to endorse or promote products derived  */
266 /*    from this software without specific prior written permission.        */
267 /*                                                                         */
268 /*  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS    */
269 /*  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT      */
270 /*  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR  */
271 /*  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT   */
272 /*  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,  */
273 /*  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT       */
274 /*  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,  */
275 /*  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY  */
276 /*  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT    */
277 /*  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE  */
278 /*  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.   */
279 /*                                                                         */
280 /* ======================================================================= */
282 #pragma CODE_SECTION(DSP_fft16x16_i, ".text:optimized");
284 #include "DSP_fft16x16_i.h"
286 static inline void radix_2(short *restrict ptr_x, short *restrict ptr_y, int npoints);
287 static inline void radix_4(short *restrict ptr_x, short *restrict ptr_y, int npoints);
289 #ifdef _LITTLE_ENDIAN
290 void DSP_fft16x16_i (
291     const short * restrict ptr_w,
292     int npoints,
293     short * restrict ptr_x,
294     short * restrict ptr_y
297     const short * restrict w;
298     const short * restrict w2;
299     short * restrict x, * restrict x2, * restrict x0;
301     int j, i, l1, radix;
302     int l2, h2, fft_jmp;
303     int predj, tw_offset, stride;
305     int x_1o_x_0o, x_3o_x_2o;
306     int xl2_1_0, xl2_3_2;
308     unsigned int myt0_0_mxt0_0, myt0_1_mxt0_1;
309     unsigned int xl1_1_0, xl1_3_2, xh2_1_0, xh2_3_2;
310     unsigned int yt1_0_xt1_0, yt1_1_xt1_1, yt2_0_xt2_0, yt2_1_xt2_1;
312     unsigned int xl0_0_xl1_0, xt1_0_yt2_0;
313     unsigned int xt2_0_yt1_0, xl0_1_xl1_1;
314     unsigned int xt1_1_yt2_1, xt2_1_yt1_1;
316     unsigned int xh1_0_xh0_0, xh1_1_xh0_1;
317     unsigned int xl1_0_xl0_0, xl1_1_xl0_1;
318     unsigned int xh21_0_xh20_0, xh21_1_xh20_1;
319     unsigned int xl21_0_xl20_0, xl21_1_xl20_1;
321     double x_3210;
322     double co11si11_co10si10, co21si21_co20si20, co31si31_co30si30;
323     double x_l1_32_x_l1_10, x_l2_32_x_l2_10, x_h2_32_x_h2_10; 
325     long long xt1_0_yt2_0_xt2_0_yt1_0, xt1_1_yt2_1_xt2_1_yt1_1;
326     long long yt1_0_xt1_0_yt2_0_xt2_0, yt1_1_xt1_1_yt2_1_xt2_1;
327     long long xh1_0_xh0_0_xl1_0_xl0_0, xh1_1_xh0_1_xl1_1_xl0_1;
328     long long xh21_0_xh20_0_xl21_0_xl20_0, xh21_1_xh20_1_xl21_1_xl20_1;
330     /*---------------------------------------------------------------------*/
331     /* Determine the magnitude of the number of points to be transformed.  */
332     /* Check whether we can use a radix4 decomposition or a mixed radix    */
333     /* transformation, by determining modulo 2.                            */
334     /*---------------------------------------------------------------------*/
335     radix = _norm(npoints) & 1 ? 2 : 4;
337     /*----------------------------------------------------------------------*/
338     /* The stride is quartered with every iteration of the outer loop. It   */
339     /* denotes the separation between any two adjacent inputs to the butter */
340     /* -fly. This should start out at N/4, hence stride is initially set to */
341     /* N. For every stride, 6*stride twiddle factors are accessed. The      */
342     /* "tw_offset" is the offset within the current twiddle factor sub-     */
343     /* table. This is set to zero, at the start of the code and is used to  */
344     /* obtain the appropriate sub-table twiddle pointer by offseting it     */
345     /* with the base pointer "ptr_w".                                       */
346     /*----------------------------------------------------------------------*/
347     stride = npoints;
348     tw_offset = 0;
349     fft_jmp = 6 * stride;
351     _nassert(stride > 4);
352     #pragma MUST_ITERATE(1,,1);
353     while (stride > radix) {
354         /*-----------------------------------------------------------------*/
355         /* At the start of every iteration of the outer loop, "j" is set   */
356         /* to zero, as "w" is pointing to the correct location within the  */
357         /* twiddle factor array. For every iteration of the inner loop     */
358         /* 6 * stride twiddle factors are accessed. For eg,                */
359         /*                                                                 */
360         /* #Iteration of outer loop  # twiddle factors    #times cycled    */
361         /*  1                          6 N/4               1               */
362         /*  2                          6 N/16              4               */
363         /*  ...                                                            */
364         /*-----------------------------------------------------------------*/
365         j = 0;
366         fft_jmp >>= 2;
368         /*-----------------------------------------------------------------*/
369         /* Set up offsets to access "N/4", "N/2", "3N/4" complex point or  */
370         /* "N/2", "N", "3N/2" half word                                    */
371         /*-----------------------------------------------------------------*/
372         h2 = stride >> 1;
373         l1 = stride;
374         l2 = stride + (stride >> 1);
376         /*-----------------------------------------------------------------*/
377         /*  Reset "x" to point to the start of the input data array.       */
378         /* "tw_offset" starts off at 0, and increments by "6 * stride"     */
379         /*  The stride quarters with every iteration of the outer loop     */
380         /*-----------------------------------------------------------------*/
381         x = ptr_x;
382         w = ptr_w + tw_offset;
383         tw_offset += fft_jmp;
384         stride >>= 2;
386         /*----------------------------------------------------------------*/
387         /* The following loop iterates through the different butterflies, */
388         /* within a given stage. Recall that there are logN to base 4     */
389         /* stages. Certain butterflies share the twiddle factors. These   */
390         /* are grouped together. On the very first stage there are no     */
391         /* butterflies that share the twiddle factor, all N/4 butter-     */
392         /* flies have different factors. On the next stage two sets of    */
393         /* N/8 butterflies share the same twiddle factor. Hence after     */
394         /* half the butterflies are performed, j the index into the       */
395         /* factor array resets to 0, and the twiddle factors are reused.  */
396         /* When this happens, the data pointer 'x' is incremented by the  */
397         /* fft_jmp amount. In addition the following code is unrolled to  */
398         /* perform "2" radix4 butterflies in parallel.                    */
399         /*----------------------------------------------------------------*/
400         _nassert(npoints >= 16);
401         #pragma MUST_ITERATE(2,,2);
402         for (i = 0; i < npoints; i += 8) {
403             /*------------------------------------------------------------*/
404             /* Read the first 12 twiddle factors, six of which are used   */
405             /* for one radix 4 butterfly and six of which are used for    */
406             /* next one.                                                  */
407             /*                                                            */
408             /*  co10 = w[j+1];        si10 = w[j+0];                      */
409             /*  co11 = w[j+3];        si11 = w[j+2];                      */
410             /*  co20 = w[j+5];        si20 = w[j+4];                      */
411             /*  co21 = w[j+7];        si21 = w[j+6];                      */
412             /*  co30 = w[j+9];        si30 = w[j+8];                      */
413             /*  co31 = w[j+11];       si31 = w[j+10];                     */
414             /*                                                            */
415             /*  The code shown above performs half word wide loads. The   */
416             /*  C64x can perform double word wide loads. This is done by  */
417             /*  the use of the _amemd8_const intrinsic. The const quali-  */
418             /*  fies the fact that this array will only be read from and  */
419             /*  not written to.                                           */
420             /*------------------------------------------------------------*/
421             w2 = w + j;
422             co11si11_co10si10 = _amemd8_const(w2);
423             co21si21_co20si20 = _amemd8_const(&w2[4]);
424             co31si31_co30si30 = _amemd8_const(&w2[8]);
426             /*------------------------------------------------------------*/
427             /* Read in the first complex input for the butterflies.       */
428             /* 1st complex input to 1st butterfly: x[0] + jx[1]           */
429             /* 1st complex input to 2nd butterfly: x[2] + jx[3]           */
430             /* Read in the complex inputs for the butterflies. Each of the*/
431             /* successive complex inputs of the butterfly are seperated   */
432             /* by a fixed amount known as stride. The stride starts out   */
433             /* at N/4, and quarters with every stage.                     */
434             /*                                                            */
435             /*  x_0    =   x[0];          x_1    =   x[1];                */
436             /*  x_2    =   x[2];          x_3    =   x[3];                */
437             /*  x_l1_0 =   x[l1  ];       x_l1_1 =   x[l1 + 1];           */
438             /*  x_l1_2 =   x[l1 + 2];     x_l1_3 =   x[l1 + 3];           */
439             /*  x_l2_0 =   x[l2  ];       x_l2_1 =   x[l2 + 1];           */
440             /*  x_l2_2 =   x[l2 + 2];     x_l2_3 =   x[l2 + 3];           */
441             /*  x_h2_0 =   x[h2  ];       x_h2_1 =   x[h2 + 1];           */
442             /*  x_h2_2 =   x[h2 + 2];     x_h2_3 =   x[h2 + 3];           */
443             /*                                                            */
444             /* These loads are performed using _amemd8. Note that there   */
445             /* is no const qualifier in this case as this loop writes     */
446             /* back results into the data array "x". The "a" in _amemd8   */
447             /* qualifies the fact that the loads are aligned.             */
448             /*------------------------------------------------------------*/
449             x_3210 = _amemd8(&x[0]);
450             x_l1_32_x_l1_10 = _amemd8(&x[l1]);
451             x_l2_32_x_l2_10 = _amemd8(&x[l2]);
452             x_h2_32_x_h2_10 = _amemd8(&x[h2]);
454             /*-----------------------------------------------------------*/
455             /* Two butterflies are evaluated in parallel. The following  */
456             /* results will be shown for one butterfly only, although    */
457             /* both are being evaluated in parallel.                     */
458             /*                                                           */
459             /* Perform radix2 style DIF butterflies.                     */
460             /*                                                           */
461             /*  xh0_0  = x_0    + x_l1_0;     xh1_0  = x_1    + x_l1_1;  */
462             /*  xh0_1  = x_2    + x_l1_2;     xh1_1  = x_3    + x_l1_3;  */
463             /*  xl0_0  = x_0    - x_l1_0;     xl1_0  = x_1    - x_l1_1;  */
464             /*  xl0_1  = x_2    - x_l1_2;     xl1_1  = x_3    - x_l1_3;  */
465             /*  xh20_0 = x_h2_0 + x_l2_0;     xh21_0 = x_h2_1 + x_l2_1;  */
466             /*  xh20_1 = x_h2_2 + x_l2_2;     xh21_1 = x_h2_3 + x_l2_3;  */
467             /*  xl20_0 = x_h2_0 - x_l2_0;     xl21_0 = x_h2_1 - x_l2_1;  */
468             /*  xl20_1 = x_h2_2 - x_l2_2;     xl21_1 = x_h2_3 - x_l2_3;  */
469             /*                                                           */
470             /*  These are done using packed data processing, and these   */
471             /*  are done using _add2, _sub2.                             */
472             /*  For example "xh1_0_xh0_0" has a packed value xh1_0,      */
473             /*  xh0_0.                                                   */
474             /*-----------------------------------------------------------*/
475             xh1_0_xh0_0_xl1_0_xl0_0 = _addsub2((_lo(x_3210)), (_lo(x_l1_32_x_l1_10)));
476             xh1_1_xh0_1_xl1_1_xl0_1 = _addsub2((_hi(x_3210)), (_hi(x_l1_32_x_l1_10)));
478             xh1_0_xh0_0 = (_hill(xh1_0_xh0_0_xl1_0_xl0_0));
479             xh1_1_xh0_1 = (_hill(xh1_1_xh0_1_xl1_1_xl0_1));
480             xl1_0_xl0_0 = (_loll(xh1_0_xh0_0_xl1_0_xl0_0));
481             xl1_1_xl0_1 = (_loll(xh1_1_xh0_1_xl1_1_xl0_1));
483             xh21_0_xh20_0_xl21_0_xl20_0 = _addsub2((_lo(x_h2_32_x_h2_10)), (_lo(x_l2_32_x_l2_10)));
484             xh21_1_xh20_1_xl21_1_xl20_1 = _addsub2((_hi(x_h2_32_x_h2_10)), (_hi(x_l2_32_x_l2_10)));
486             xh21_0_xh20_0 = (_hill(xh21_0_xh20_0_xl21_0_xl20_0));
487             xh21_1_xh20_1 = (_hill(xh21_1_xh20_1_xl21_1_xl20_1));
488             xl21_0_xl20_0 = (_loll(xh21_0_xh20_0_xl21_0_xl20_0));
489             xl21_1_xl20_1 = (_loll(xh21_1_xh20_1_xl21_1_xl20_1));
491             /*----------------------------------------------------------*/
492             /* Derive output pointers using the input pointer "x"       */
493             /*----------------------------------------------------------*/
494             x0 = (short *)_mvd((int)x);
495             x2 = (short *)_mvd((int)x0);
497             /*-----------------------------------------------------------*/
498             /* When the twiddle factors are not to be re-used, j is      */
499             /* incremented by 12, to reflect the fact that 12 half words */
500             /* are consumed in every iteration. The input data pointer   */
501             /* increments by 4. Note that within a stage, the stride     */
502             /* does not change and hence the offsets for the other three */
503             /* legs, 0, h2, l1, l2.                                      */
504             /*-----------------------------------------------------------*/
505             j += 12;
506             x += 4;
508             predj = (j - fft_jmp);
509             if (!predj) x += fft_jmp;
510             if (!predj) j = 0;
512             /*----------------------------------------------------------*/
513             /* x0[0] = (xh0_0 + xh20_0 + 1) >> 1;                       */
514             /* x0[1] = (xh1_0 + xh21_0 + 1) >> 1;                       */
515             /* x0[2] = (xh0_1 + xh20_1 + 1) >> 1;                       */
516             /* x0[3] = (xh1_1 + xh21_1 + 1) >> 1;                       */
517             /*                                                          */
518             /* "y0x0" and "y1x1" are the first four outputs of the      */
519             /*  radix4 butterfly. These are computed on the C64x        */
520             /* using _avg2.                                             */
521             /*----------------------------------------------------------*/
522             x_1o_x_0o = _avg2(xh21_0_xh20_0, xh1_0_xh0_0);
523             x_3o_x_2o = _avg2(xh21_1_xh20_1, xh1_1_xh0_1);
525             /*----------------------------------------------------------*/
526             /* Perform one more stage of radix2 decompositions as shown */
527             /* These are performed using packed data processing.        */
528             /*                                                          */
529             /* xt0_0 = xh0_0 - xh20_0;  yt0_0 = xh1_0 - xh21_0;         */
530             /* xt0_1 = xh0_1 - xh20_1;  yt0_1 = xh1_1 - xh21_1;         */
531             /* xt1_0 = xl0_0 + xl21_0;  yt2_0 = xl1_0 + xl20_0;         */
532             /* xt2_0 = xl0_0 - xl21_0;  yt1_0 = xl1_0 - xl20_0;         */
533             /* xt1_1 = xl0_1 + xl21_1;  yt2_1 = xl1_1 + xl20_1;         */
534             /* xt2_1 = xl0_1 - xl21_1;  yt1_1 = xl1_1 - xl20_1;         */
535             /*                                                          */
536             /*----------------------------------------------------------*/
537             myt0_0_mxt0_0 = _sub2(xh21_0_xh20_0, xh1_0_xh0_0);
538             myt0_1_mxt0_1 = _sub2(xh21_1_xh20_1, xh1_1_xh0_1);
540             /*----------------------------------------------------------*/
541             /* Repack coefficients to sustain packed data processing.   */
542             /* This requires a swizzle so that intermediate results     */
543             /* end up in the right spot. This is needed to compute      */
544             /* other intermediate results using add2 and sub2.          */
545             /*----------------------------------------------------------*/
546             xl0_0_xl1_0 = _rotl(xl1_0_xl0_0, 16);
547             xl0_1_xl1_1 = _rotl(xl1_1_xl0_1, 16);
549             //;----------------------------------------------------------;
550             //; xt1_0 = xl0_0 + xl21_0   yt2_0 = xl1_0 + xl20_0          ;
551             //; xt1_1 = xl0_1 + xl21_1   yt2_1 = xl1_1 + xl20_1          ;
552             //; xt2_0 = xl0_0 - xl21_0   yt1_0 = xl1_0 - xl20_0          ;
553             //; xt2_1 = xl0_1 - xl21_1   yt1_1 = xl1_1 - xl20_1          ;
554             //;----------------------------------------------------------;
555             xt1_0_yt2_0_xt2_0_yt1_0 = _addsub2(xl0_0_xl1_0, xl21_0_xl20_0);
556             xt1_1_yt2_1_xt2_1_yt1_1 = _addsub2(xl0_1_xl1_1, xl21_1_xl20_1);
558             xt1_0_yt2_0 = (_hill(xt1_0_yt2_0_xt2_0_yt1_0));
559             xt1_1_yt2_1 = (_hill(xt1_1_yt2_1_xt2_1_yt1_1));
560             xt2_0_yt1_0 = (_loll(xt1_0_yt2_0_xt2_0_yt1_0));
561             xt2_1_yt1_1 = (_loll(xt1_1_yt2_1_xt2_1_yt1_1));
563             /*----------------------------------------------------------*/
564             /* Repack results so that xt_XXyt_XX are a packed  32 bit   */
565             /* quantity. The operations performed above tend to leave   */
566             /* the results as xt_XXyt_YY and xt_YYyt_XX. This is where  */
567             /* the packed instructions come in handy. For example       */
568             /* xt_XXyt_XX = _packlh2(xt_XXyt_YY, xt_YY_yt_XX)          */
569             /*----------------------------------------------------------*/
570             yt1_0_xt1_0_yt2_0_xt2_0 = _dpackx2(xt1_0_yt2_0, xt2_0_yt1_0);
571             yt1_1_xt1_1_yt2_1_xt2_1 = _dpackx2(xt1_1_yt2_1, xt2_1_yt1_1);
573             yt1_0_xt1_0 = (_hill(yt1_0_xt1_0_yt2_0_xt2_0));
574             yt1_1_xt1_1 = (_hill(yt1_1_xt1_1_yt2_1_xt2_1));
575             yt2_0_xt2_0 = (_loll(yt1_0_xt1_0_yt2_0_xt2_0));
576             yt2_1_xt2_1 = (_loll(yt1_1_xt1_1_yt2_1_xt2_1));
578             /*---------------------------------------------------------*/
579             /* Notice that in this version of the code the two middle  */
580             /* legs are swapped as indicated by the stores to x[l1]..  */
            /* x[l1 + 3] which precede the stores to x[h2]..x[h2 + 3]  */
582             /* This reversal guarantees that a radix4 DIF butterfly    */
583             /* produces results in digit reversed order. Note that in  */
584             /* addition to the rounding, the shift is performed by 16, */
585             /* as opposed to 15, to give scaling.                      */
586             /*---------------------------------------------------------*/
588             /*---------------------------------------------------------*/
589             /*                                                         */
590             /* x2[l1  ] = (si10 * yt1_0 + co10 * xt1_0 + 0x8000) >> 16;*/
591             /* x2[l1+1] = (co10 * yt1_0 - si10 * xt1_0 + 0x8000) >> 16;*/
592             /* x2[l1+2] = (si11 * yt1_1 + co11 * xt1_1 + 0x8000) >> 16;*/
593             /* x2[l1+3] = (co11 * yt1_1 - si11 * xt1_1 + 0x8000) >> 16;*/
594             /*                                                         */
595             /* These four results are retained in registers and a      */
596             /* double word is formed so that it can be stored with     */
597             /* one STDW.                                               */
598             /*---------------------------------------------------------*/
599             xh2_1_0 = _cmpyr((_lo(co11si11_co10si10)), yt1_0_xt1_0);
600             xh2_3_2 = _cmpyr((_hi(co11si11_co10si10)), yt1_1_xt1_1);
602             /*---------------------------------------------------------*/
603             /* The following code computes intermediate results for:   */
604             /*                                                         */
605             /* x2[h2  ] = (si20 * yt0_0 + co20 * xt0_0 + 0x8000) >> 16;*/
606             /* x2[h2+1] = (co20 * yt0_0 - si20 * xt0_0 + 0x8000) >> 16;*/
607             /* x2[h2+2] = (si21 * yt0_1 + co21 * xt0_1 + 0x8000) >> 16;*/
608             /* x2[h2+3] = (co21 * yt0_1 - si21 * xt0_1 + 0x8000) >> 16;*/
609             /*---------------------------------------------------------*/
610             xl1_1_0 = _cmpyr((_lo(co21si21_co20si20)), myt0_0_mxt0_0);
611             xl1_3_2 = _cmpyr((_hi(co21si21_co20si20)), myt0_1_mxt0_1);
613             /*---------------------------------------------------------*/
614             /* The following code computes intermediate results for:   */
615             /*                                                         */
616             /* x2[l2  ] = (si30 * yt2_0 + co30 * xt2_0 + 0x8000) >> 16;*/
617             /* x2[l2+1] = (co30 * yt2_0 - si30 * xt2_0 + 0x8000) >> 16;*/
618             /* x2[l2+2] = (si31 * yt2_1 + co31 * xt2_1 + 0x8000) >> 16;*/
619             /* x2[l2+3] = (co31 * yt2_1 - si31 * xt2_1 + 0x8000) >> 16;*/
620             /*---------------------------------------------------------*/
621             xl2_1_0 = _cmpyr((_lo(co31si31_co30si30)), yt2_0_xt2_0);
622             xl2_3_2 = _cmpyr((_hi(co31si31_co30si30)), yt2_1_xt2_1);
624             /*---------------------------------------------------------*/
625             /* Prepare the doubleword for each of the four outputs, by */
626             /* the use of the _itod intrinsic that takes integers, and */
627             /* forms a double word wide quantity.                      */
628             /* Store out the outputs to the four legs of the butterfly */
629             /* using aligned store double words. Notice the use of the */
630             /* indices "l1", "l2", "h2" to derive the pointers for the */
631             /* legs of the butterfly.                                  */
632             /*---------------------------------------------------------*/
633             _amemd8(&x2[0]) = _itod(x_3o_x_2o, x_1o_x_0o);
634             _amemd8(&x2[l1]) = _itod(xl1_3_2, xl1_1_0);
635             _amemd8(&x2[h2]) = _itod(xh2_3_2, xh2_1_0);
636             _amemd8(&x2[l2]) = _itod(xl2_3_2, xl2_1_0);
637         }
638     }
640     if (radix == 2)
641         radix_2(ptr_x, ptr_y, npoints);
642     else if (radix == 4)
643         radix_4(ptr_x, ptr_y, npoints);
645     return;
648 void radix_2(
649     short *restrict ptr_x,
650     short *restrict ptr_y,
651     int npoints
654     short * restrict x2, * restrict x0;
655     short * restrict y0, * restrict y1, * restrict y2, * restrict y3;
656     short n0, j0;
658     int i, j, l1, h2;
659     int y_10, y_32, y_54, y_76, y_98, y_BA, y_DC, y_FE;
661     double x_2301, x_6745, x_AB89, x_EFCD;
663     /*-----------------------------------------------------------------*/
664     /* The following code performs either a standard radix4 pass or a  */
665     /* radix2 pass. Two pointers are used to access the input data.    */
666     /* The input data is read "N/4" complex samples apart or "N/2"     */
667     /* words apart using pointers "x0" and "x2". This produces out-    */
668     /* puts that are 0, N/4, N/2, 3N/4 for a radix4 FFT, and 0, N/8    */
669     /* N/2, 3N/8 for radix 2.                                          */
670     /*-----------------------------------------------------------------*/
671     y0 = ptr_y;
672     y2 = ptr_y + (int)npoints;
673     x0 = ptr_x;
674     x2 = ptr_x + (int)(npoints >> 1);
676     /*----------------------------------------------------------------*/
677     /* The pointers are set at the following locations which are half */
678     /* the offsets of a radix4 FFT.                                   */
679     /*----------------------------------------------------------------*/
680     y1 = y0 + (int)(npoints >> 2);
681     y3 = y2 + (int)(npoints >> 2);
682     l1 = _norm(npoints) + 1;
683     j0 = 8;
684     n0 = npoints >> 1;
686     /*--------------------------------------------------------------------*/
687     /* The following code reads data indentically for either a radix 4    */
688     /* or a radix 2 style decomposition. It writes out at different       */
689     /* locations though. It checks if either half the points, or a        */
690     /* quarter of the complex points have been exhausted to jump to       */
691     /* pervent double reversal.                                           */
692     /*--------------------------------------------------------------------*/
693     j = 0;
695     _nassert((int)(n0) % 4 == 0);
696     _nassert((int)(ptr_x) % 8 == 0);
697     _nassert((int)(ptr_y) % 8 == 0);
698     _nassert((int)(x0) % 8 == 0);
699     _nassert((int)(x2) % 8 == 0);
700     _nassert((int)(y0) % 8 == 0);
701     _nassert(npoints >= 16);
702     #pragma MUST_ITERATE(2,,2);
703     for (i = 0; i < npoints; i += 8) {
704         /*----------------------------------------------------------------*/
705         /* Digit reverse the index starting from 0. The increment to "j"  */
706         /* is either by 4, or 8.                                          */
707         /*----------------------------------------------------------------*/
708         h2 = _deal(j);
709         h2 = _bitr(h2);
710         h2 = _rotl(h2, 16);
711         h2 = _shfl(h2);
712         h2 >>= l1;
713         /*----------------------------------------------------------------*/
714         /* Read in the input data. These are transformed as a radix2.     */
715         /*----------------------------------------------------------------*/
716         x_2301 = _amemd8(&x0[0]);
717         x_6745 = _amemd8(&x0[4]);
718         x0 += 8;
719         x_AB89 = _amemd8(&x2[0]);
720         x_EFCD = _amemd8(&x2[4]);
721         x2 += 8;
723         /*----------------------------------------------------------------------------*/
724         /* Perform radix2 style decomposition.                                        */
725         /*----------------------------------------------------------------------------*/
726         /* n00 = x_0 + x_2;    n01 = x_1 + x_3;    y_10 = n00_n01 = _add2(x_01, x_23) */            
727         /* n20 = x_0 - x_2;    n21 = x_1 - x_3;    y_98 = n20_n21 = _sub2(x_01, x_23) */
728         /* n10 = x_4 + x_6;    n11 = x_5 + x_7;    y_54 = n10_n11 = _add2(x_45, x_67) */
729         /* n30 = x_4 - x_6;    n31 = x_5 - x_7;    y_DC = n30_n31 = _sub2(x_45, x_67) */
730         /* n02 = x_8 + x_a;    n03 = x_9 + x_b;    y_32 = n03_n02 = _add2(x_89, x_AB) */
731         /* n12 = x_c + x_e;    n13 = x_d + x_f;    y_76 = n12_n13 = _add2(x_CD, x_EF) */
732         /* n22 = x_8 - x_a;    n23 = x_9 - x_b;    y_BA = n22_n23 = _sub2(x_89, x_AB) */
733         /* n32 = x_c - x_e;    n33 = x_d - x_f;    y_FE = n32_n33 = _sub2(x_CD, x_EF) */
734         
735         y_10 = _add2(_lo(x_2301), _hi(x_2301));
736         y_98 = _sub2(_lo(x_2301), _hi(x_2301));
737         y_54 = _add2(_lo(x_6745), _hi(x_6745));
738         y_DC = _sub2(_lo(x_6745), _hi(x_6745));
739         y_32 = _add2(_lo(x_AB89), _hi(x_AB89));
740         y_76 = _add2(_lo(x_EFCD), _hi(x_EFCD));
741         y_BA = _sub2(_lo(x_AB89), _hi(x_AB89));
742         y_FE = _sub2(_lo(x_EFCD), _hi(x_EFCD));
743     
744         /*----Imaginary------------Real-----------*/ 
745         /* y0[2*h2+0] = n01;    y0[2*h2+1] = n00; */
746         /* y0[2*h2+2] = n03;    y0[2*h2+3] = n02; */
747         /* y1[2*h2+0] = n11;    y1[2*h2+1] = n10; */
748         /* y1[2*h2+2] = n13;    y1[2*h2+3] = n12; */
749         /* y2[2*h2+0] = n21;    y2[2*h2+1] = n20; */
750         /* y2[2*h2+2] = n23;    y2[2*h2+3] = n22; */
751         /* y3[2*h2+0] = n31;    y3[2*h2+1] = n30; */
752         /* y3[2*h2+2] = n33;    y3[2*h2+3] = n32; */
754         /*------------------------------------------------------------*/
755         /*  Store out the results of all four butterflies as double   */
756         /*  words.                                                    */
757         /*------------------------------------------------------------*/
758         _amemd8(&y0[2 * h2]) = _itod(y_32, y_10);
759         _amemd8(&y1[2 * h2]) = _itod(y_76, y_54);
760         _amemd8(&y2[2 * h2]) = _itod(y_BA, y_98);
761         _amemd8(&y3[2 * h2]) = _itod(y_FE, y_DC);
763         j += j0;
764         if (j == n0) {
765             j += n0;
766             x0 += (int)npoints >> 1;
767             x2 += (int)npoints >> 1;
768         }
769     }
772 void radix_4(
773     short *restrict ptr_x,
774     short *restrict ptr_y,
775     int npoints
778     short * restrict x2, * restrict x0;
779     short * restrict y0, * restrict y1, * restrict y2, * restrict y3;
780     short n0, j0;
782     int i, j, l1, h2;
783     int xh0_0_xh1_0, xh0_1_xh1_1, xh0_2_xh1_2, xh0_3_xh1_3;
784     int xl0_0_xl1_0, xl0_1_xl1_1, xl1_1_xl0_1, xl0_2_xl1_2, xl0_3_xl1_3, xl1_3_xl0_3;
785     int n10_n11, n30_n31, n12_n13, n32_n33;
787     double x_2301, x_6745, x_AB89, x_EFCD;
789     long long n00_n01_n20_n21, n10_n31_n30_n11, n02_n03_n22_n23, n12_n33_n32_n13;
791     /*-----------------------------------------------------------------*/
792     /* The following code performs either a standard radix4 pass or a  */
793     /* radix2 pass. Two pointers are used to access the input data.    */
794     /* The input data is read "N/4" complex samples apart or "N/2"     */
795     /* words apart using pointers "x0" and "x2". This produces out-    */
796     /* puts that are 0, N/4, N/2, 3N/4 for a radix4 FFT, and 0, N/8    */
797     /* N/2, 3N/8 for radix 2.                                          */
798     /*-----------------------------------------------------------------*/
799     y0 = ptr_y;
800     y2 = ptr_y + (int)npoints;
801     x0 = ptr_x;
802     x2 = ptr_x + (int)(npoints >> 1);
804     /*----------------------------------------------------------------*/
805     /* The pointers are set at the following locations which are half */
806     /* the offsets of a radix4 FFT.                                   */
807     /*----------------------------------------------------------------*/
808     y3 = y2 + (int)(npoints >> 1);
809     y1 = y0 + (int)(npoints >> 1);
810     l1 = _norm(npoints) + 2;
811     j0 = 4;
812     n0 = npoints >> 2;
814     /*--------------------------------------------------------------------*/
815     /* The following code reads data indentically for either a radix 4    */
816     /* or a radix 2 style decomposition. It writes out at different       */
817     /* locations though. It checks if either half the points, or a        */
818     /* quarter of the complex points have been exhausted to jump to       */
819     /* pervent double reversal.                                           */
820     /*--------------------------------------------------------------------*/
821     j = 0;
823     _nassert((int)(n0) % 4 == 0);
824     _nassert((int)(x0) % 8 == 0);
825     _nassert((int)(x2) % 8 == 0);
826     _nassert((int)(y0) % 8 == 0);
827     _nassert(npoints >= 16);
828     #pragma MUST_ITERATE(2,,2);
830     for (i = 0; i < npoints; i += 8) {
831         /*----------------------------------------------------------------*/
832         /* Digit reverse the index starting from 0. The increment to "j"  */
833         /* is either by 4, or 8.                                          */
834         /*----------------------------------------------------------------*/
835         h2 = _deal(j);
836         h2 = _bitr(h2);
837         h2 = _rotl(h2, 16);
838         h2 = _shfl(h2);
839         h2 >>= l1;
841         /*----------------------------------------------------------------*/
842         /* Read in the input data. These are transformed as a radix4.     */
843         /*----------------------------------------------------------------*/
844         x_2301 = _amemd8(&x0[0]);
845         x_6745 = _amemd8(&x0[4]);
846         x0 += 8;
847         x_AB89 = _amemd8(&x2[0]);
848         x_EFCD = _amemd8(&x2[4]);
849         x2 += 8;
851         /*------------------------------------------*/
852         /* Perform radix4 style decomposition.      */
853         /*------------------------------------------*/
854         /* xh0_0 = x_0 + x_4;    xh1_0 = x_1 + x_5; */
855         /* xl0_0 = x_0 - x_4;    xl1_0 = x_1 - x_5; */
856         /* xh0_1 = x_2 + x_6;    xh1_1 = x_3 + x_7; */
857         /* xl0_1 = x_2 - x_6;    xl1_1 = x_3 - x_7; */
858         xh0_0_xh1_0 = _add2(_lo(x_2301), _lo(x_6745));
859         xl0_0_xl1_0 = _sub2(_lo(x_2301), _lo(x_6745));
860         xh0_1_xh1_1 = _add2(_hi(x_2301), _hi(x_6745));
861         xl0_1_xl1_1 = _sub2(_hi(x_2301), _hi(x_6745));
863         /* n00 = xh0_0 + xh0_1;    n20 = xh0_0 - xh0_1; */
864         /* n01 = xh1_0 + xh1_1;    n21 = xh1_0 - xh1_1; */
865         n00_n01_n20_n21 = _addsub2(xh0_0_xh1_0, xh0_1_xh1_1);
867         /* n30 = xl0_0 + xl1_1;    n10 = xl0_0 - xl1_1; */
868         /* n11 = xl1_0 + xl0_1;    n31 = xl1_0 - xl0_1; */
869         xl1_1_xl0_1 = _rotl(xl0_1_xl1_1, 16);
870         n10_n31_n30_n11 = _addsub2(xl0_0_xl1_0, xl1_1_xl0_1);
872         n30_n31 = _packhl2(_hill(n10_n31_n30_n11), _loll(n10_n31_n30_n11));
873         n10_n11 = _packhl2(_loll(n10_n31_n30_n11), _hill(n10_n31_n30_n11));
875         /*------------------------------------------*/
876         /* Perform radix4 style decomposition.      */
877         /*------------------------------------------*/
878         /* xh0_2 = x_8 + x_c;    xh1_2 = x_9 + x_d; */
879         /* xl0_2 = x_8 - x_c;    xl1_2 = x_9 - x_d; */
880         /* xh0_3 = x_a + x_e;    xh1_3 = x_b + x_f; */
881         /* xl0_3 = x_a - x_e;    xl1_3 = x_b - x_f; */
882         xh0_2_xh1_2 = _add2(_lo(x_AB89), _lo(x_EFCD));
883         xl0_2_xl1_2 = _sub2(_lo(x_AB89), _lo(x_EFCD));
884         xh0_3_xh1_3 = _add2(_hi(x_AB89), _hi(x_EFCD));
885         xl0_3_xl1_3 = _sub2(_hi(x_AB89), _hi(x_EFCD));
887         /* n02 = xh0_2 + xh0_3;    n22 = xh0_2 - xh0_3; */
888         /* n03 = xh1_2 + xh1_3;    n23 = xh1_2 - xh1_3; */
889         n02_n03_n22_n23 = _addsub2(xh0_2_xh1_2, xh0_3_xh1_3);
891         /* n32 = xl0_2 + xl1_3;    n12 = xl0_2 - xl1_3; */
892         /* n13 = xl1_2 + xl0_3;    n33 = xl1_2 - xl0_3; */
893         xl1_3_xl0_3 = _rotl(xl0_3_xl1_3, 16);
894         n12_n33_n32_n13 = _addsub2(xl0_2_xl1_2, xl1_3_xl0_3);
896         n32_n33 = _packhl2(_hill(n12_n33_n32_n13), _loll(n12_n33_n32_n13));
897         n12_n13 = _packhl2(_loll(n12_n33_n32_n13), _hill(n12_n33_n32_n13));
899         /*-----------------------------------------------------------------*/
900         /* Points that are read from succesive locations map to y, y[N/4]  */
901         /* y[N/2], y[3N/4] in a radix4 scheme, y, y[N/8], y[N/2],y[5N/8]   */
902         /*-----------------------------------------------------------------*/
904         /*----Imaginary----------Real-----------*/
905         /* y0[2*h2+0] = n01;  y0[2*h2+1] = n00; */
906         /* y0[2*h2+2] = n03;  y0[2*h2+3] = n02; */
907         /* y1[2*h2+0] = n11;  y1[2*h2+1] = n10; */
908         /* y1[2*h2+2] = n13;  y1[2*h2+3] = n12; */
909         /* y2[2*h2+0] = n21;  y2[2*h2+1] = n20; */
910         /* y2[2*h2+2] = n23;  y2[2*h2+3] = n22; */
911         /* y3[2*h2+0] = n31;  y3[2*h2+1] = n30; */
912         /* y3[2*h2+2] = n33;  y3[2*h2+3] = n32; */
914         _amemd8(&y0[2 * h2]) = _itod(_hill(n02_n03_n22_n23), _hill(n00_n01_n20_n21));
915         _amemd8(&y1[2 * h2]) = _itod(n12_n13, n10_n11);
916         _amemd8(&y2[2 * h2]) = _itod(_loll(n02_n03_n22_n23), _loll(n00_n01_n20_n21));
917         _amemd8(&y3[2 * h2]) = _itod(n32_n33, n30_n31);
919         j += j0;
920         if (j == n0) {
921             j  += n0;
922             x0 += (int)npoints >> 1;
923             x2 += (int)npoints >> 1;
924         }
925     }
928 #else
929 void DSP_fft16x16_i (
930     const short * restrict ptr_w,
931     int npoints,
932     short * restrict ptr_x,
933     short * restrict ptr_y
936     const short * restrict w;
937     const short * restrict w2;
938     short * restrict x, * restrict x2, * restrict x0;
940     int j, i, l1, radix;
941     int l2, h2, fft_jmp;
942     int predj, tw_offset, stride;
944     int x_0o_x_1o, x_2o_x_3o;
945     int xl2_0_1, xl2_2_3;
947     unsigned int mxt0_0_myt0_0, mxt0_1_myt0_1;
948     unsigned int xl1_0_1, xl1_2_3, xh2_0_1, xh2_2_3;
949     unsigned int xt1_0_yt1_0, xt1_1_yt1_1, xt2_0_yt2_0, xt2_1_yt2_1;
951     unsigned int xl1_0_xl0_0, yt1_0_xt2_0;
952     unsigned int yt2_0_xt1_0, xl1_1_xl0_1;
953     unsigned int yt2_1_xt1_1, yt1_1_xt2_1;
955     unsigned int xh0_0_xh1_0, xh0_1_xh1_1;
956     unsigned int xl0_0_xl1_0, xl0_1_xl1_1;
957     unsigned int xh20_0_xh21_0, xh20_1_xh21_1;
958     unsigned int xl20_0_xl21_0, xl20_1_xl21_1;
959     
960     double x_0123;
961     double co10si10_co11si11, co20si20_co21si21, co30si30_co31si31;
962     double x_l1_01_x_l1_23, x_l2_01_x_l2_23, x_h2_01_x_h2_23; 
964     long long xt1_0_yt1_0_xt2_0_yt2_0, xt1_1_yt1_1_xt2_1_yt2_1;
965     long long yt2_0_xt1_0_yt1_0_xt2_0, yt2_1_xt1_1_yt1_1_xt2_1;
966     long long xh0_0_xh1_0_xl0_0_xl1_0, xh0_1_xh1_1_xl0_1_xl1_1;
967     long long xh20_0_xh21_0_xl20_0_xl21_0, xh20_1_xh21_1_xl20_1_xl21_1;
969     /*---------------------------------------------------------------------*/
    /* Determine the magnitude of the number of points to be transformed.  */
971     /* Check whether we can use a radix4 decomposition or a mixed radix    */
972     /* transformation, by determining modulo 2.                            */
973     /*---------------------------------------------------------------------*/
974     radix = _norm(npoints) & 1 ? 2 : 4;
976     /*----------------------------------------------------------------------*/
977     /* The stride is quartered with every iteration of the outer loop. It   */
    /* denotes the separation between any two adjacent inputs to the butter */
979     /* -fly. This should start out at N/4, hence stride is initially set to */
980     /* N. For every stride, 6*stride twiddle factors are accessed. The      */
981     /* "tw_offset" is the offset within the current twiddle factor sub-     */
982     /* table. This is set to zero, at the start of the code and is used to  */
983     /* obtain the appropriate sub-table twiddle pointer by offseting it     */
984     /* with the base pointer "ptr_w".                                       */
985     /*----------------------------------------------------------------------*/
986     stride = npoints;
987     tw_offset = 0;
988     fft_jmp = 6 * stride;
990     _nassert(stride > 4);
991     #pragma MUST_ITERATE(1,,1);
992     while (stride > radix) {
993         /*-----------------------------------------------------------------*/
994         /* At the start of every iteration of the outer loop, "j" is set   */
995         /* to zero, as "w" is pointing to the correct location within the  */
996         /* twiddle factor array. For every iteration of the inner loop     */
997         /* 6 * stride twiddle factors are accessed. For eg,                */
998         /*                                                                 */
999         /* #Iteration of outer loop  # twiddle factors    #times cycled    */
1000         /*  1                          6 N/4               1               */
1001         /*  2                          6 N/16              4               */
1002         /*  ...                                                            */
1003         /*-----------------------------------------------------------------*/
1004         j = 0;
1005         fft_jmp >>= 2;
1007         /*-----------------------------------------------------------------*/
1008         /* Set up offsets to access "N/4", "N/2", "3N/4" complex point or  */
1009         /* "N/2", "N", "3N/2" half word                                    */
1010         /*-----------------------------------------------------------------*/
1011         h2 = stride >> 1;
1012         l1 = stride;
1013         l2 = stride + (stride >> 1);
1015         /*-----------------------------------------------------------------*/
1016         /*  Reset "x" to point to the start of the input data array.       */
1017         /* "tw_offset" starts off at 0, and increments by "6 * stride"     */
1018         /*  The stride quarters with every iteration of the outer loop     */
1019         /*-----------------------------------------------------------------*/
1020         x = ptr_x;
1021         w = ptr_w + tw_offset;
1022         tw_offset += fft_jmp;
1023         stride >>= 2;
1025         /*----------------------------------------------------------------*/
1026         /* The following loop iterates through the different butterflies, */
1027         /* within a given stage. Recall that there are logN to base 4     */
1028         /* stages. Certain butterflies share the twiddle factors. These   */
1029         /* are grouped together. On the very first stage there are no     */
1030         /* butterflies that share the twiddle factor, all N/4 butter-     */
1031         /* flies have different factors. On the next stage two sets of    */
1032         /* N/8 butterflies share the same twiddle factor. Hence after     */
1033         /* half the butterflies are performed, j the index into the       */
1034         /* factor array resets to 0, and the twiddle factors are reused.  */
1035         /* When this happens, the data pointer 'x' is incremented by the  */
1036         /* fft_jmp amount. In addition the following code is unrolled to  */
1037         /* perform "2" radix4 butterflies in parallel.                    */
1038         /*----------------------------------------------------------------*/
1039         _nassert(npoints >= 16);
1040         #pragma MUST_ITERATE(2,,2);
1041         for (i = 0; i < npoints; i += 8) {
1042             /*------------------------------------------------------------*/
1043             /* Read the first 12 twiddle factors, six of which are used   */
1044             /* for one radix 4 butterfly and six of which are used for    */
1045             /* next one.                                                  */
1046             /*                                                            */
1047             /*  si10 = w[j+1];        co10 = w[j+0];                      */
1048             /*  si11 = w[j+3];        co11 = w[j+2];                      */
1049             /*  si20 = w[j+5];        co20 = w[j+4];                      */
1050             /*  si21 = w[j+7];        co21 = w[j+6];                      */
1051             /*  si30 = w[j+9];        co30 = w[j+8];                      */
1052             /*  si31 = w[j+11];       co31 = w[j+10];                     */
1053             /*                                                            */
1054             /*  The code shown above performs half word wide loads. The   */
1055             /*  C64x can perform double word wide loads. This is done by  */
1056             /*  the use of the _amemd8_const intrinsic. The const quali-  */
1057             /*  fies the fact that this array will only be read from and  */
1058             /*  not written to.                                           */
1059             /*------------------------------------------------------------*/
1060             w2 = w + j;
1061             co10si10_co11si11 = _amemd8_const(w2);
1062             co20si20_co21si21 = _amemd8_const(&w2[4]);
1063             co30si30_co31si31 = _amemd8_const(&w2[8]);
1064             
1065             /*------------------------------------------------------------*/
1066             /* Read in the first complex input for the butterflies.       */
1067             /* 1st complex input to 1st butterfly: x[0] + jx[1]           */
1068             /* 1st complex input to 2nd butterfly: x[2] + jx[3]           */
1069             /* Read in the complex inputs for the butterflies. Each of the*/
            /* successive complex inputs of the butterfly are separated   */
1071             /* by a fixed amount known as stride. The stride starts out   */
1072             /* at N/4, and quarters with every stage.                     */
1073             /*                                                            */
1074             /*  x_0    =   x[0];          x_1    =   x[1];                */
1075             /*  x_2    =   x[2];          x_3    =   x[3];                */
1076             /*  x_l1_0 =   x[l1  ];       x_l1_1 =   x[l1 + 1];           */
1077             /*  x_l1_2 =   x[l1 + 2];     x_l1_3 =   x[l1 + 3];           */
1078             /*  x_l2_0 =   x[l2  ];       x_l2_1 =   x[l2 + 1];           */
1079             /*  x_l2_2 =   x[l2 + 2];     x_l2_3 =   x[l2 + 3];           */
1080             /*  x_h2_0 =   x[h2  ];       x_h2_1 =   x[h2 + 1];           */
1081             /*  x_h2_2 =   x[h2 + 2];     x_h2_3 =   x[h2 + 3];           */
1082             /*                                                            */
1083             /* These loads are performed using _amemd8. Note that there   */
1084             /* is no const qualifier in this case as this loop writes     */
1085             /* back results into the data array "x". The "a" in _amemd8   */
1086             /* qualifies the fact that the loads are aligned.             */
1087             /*------------------------------------------------------------*/
1088             x_0123 = _amemd8(&x[0]);
1089             x_l1_01_x_l1_23 = _amemd8(&x[l1]);
1090             x_l2_01_x_l2_23 = _amemd8(&x[l2]);
1091             x_h2_01_x_h2_23 = _amemd8(&x[h2]);
1093             /*-----------------------------------------------------------*/
1094             /* Two butterflies are evaluated in parallel. The following  */
1095             /* results will be shown for one butterfly only, although    */
1096             /* both are being evaluated in parallel.                     */
1097             /*                                                           */
1098             /* Perform radix2 style DIF butterflies.                     */
1099             /*                                                           */
1100             /*  xh0_0  = x_0    + x_l1_0;     xh1_0  = x_1    + x_l1_1;  */
1101             /*  xh0_1  = x_2    + x_l1_2;     xh1_1  = x_3    + x_l1_3;  */
1102             /*  xl0_0  = x_0    - x_l1_0;     xl1_0  = x_1    - x_l1_1;  */
1103             /*  xl0_1  = x_2    - x_l1_2;     xl1_1  = x_3    - x_l1_3;  */
1104             /*  xh20_0 = x_h2_0 + x_l2_0;     xh21_0 = x_h2_1 + x_l2_1;  */
1105             /*  xh20_1 = x_h2_2 + x_l2_2;     xh21_1 = x_h2_3 + x_l2_3;  */
1106             /*  xl20_0 = x_h2_0 - x_l2_0;     xl21_0 = x_h2_1 - x_l2_1;  */
1107             /*  xl20_1 = x_h2_2 - x_l2_2;     xl21_1 = x_h2_3 - x_l2_3;  */
1108             /*                                                           */
1109             /*  These are done using packed data processing, and these   */
1110             /*  are done using _add2, _sub2.                             */
1111             /*  For example "xh1_0_xh0_0" has a packed value xh1_0,      */
1112             /*  xh0_0.                                                   */
1113             /*-----------------------------------------------------------*/
1114             xh0_0_xh1_0_xl0_0_xl1_0 = _addsub2((_hi(x_0123)), (_hi(x_l1_01_x_l1_23)));
1115             xh0_1_xh1_1_xl0_1_xl1_1 = _addsub2((_lo(x_0123)), (_lo(x_l1_01_x_l1_23)));
1117             xh0_0_xh1_0 = (_hill(xh0_0_xh1_0_xl0_0_xl1_0));
1118             xh0_1_xh1_1 = (_hill(xh0_1_xh1_1_xl0_1_xl1_1));
1119             xl0_0_xl1_0 = (_loll(xh0_0_xh1_0_xl0_0_xl1_0));
1120             xl0_1_xl1_1 = (_loll(xh0_1_xh1_1_xl0_1_xl1_1));
1122             xh20_0_xh21_0_xl20_0_xl21_0 = _addsub2((_hi(x_h2_01_x_h2_23)), (_hi(x_l2_01_x_l2_23)));
1123             xh20_1_xh21_1_xl20_1_xl21_1 = _addsub2((_lo(x_h2_01_x_h2_23)), (_lo(x_l2_01_x_l2_23)));
1125             xh20_0_xh21_0 = (_hill(xh20_0_xh21_0_xl20_0_xl21_0));
1126             xh20_1_xh21_1 = (_hill(xh20_1_xh21_1_xl20_1_xl21_1));
1127             xl20_0_xl21_0 = (_loll(xh20_0_xh21_0_xl20_0_xl21_0));
1128             xl20_1_xl21_1 = (_loll(xh20_1_xh21_1_xl20_1_xl21_1));
1130             /*----------------------------------------------------------*/
1131             /* Derive output pointers using the input pointer "x"       */
1132             /*----------------------------------------------------------*/
1133             x0 = (short *)_mvd((int)x);
1134             x2 = (short *)_mvd((int)x0);
1136             /*-----------------------------------------------------------*/
1137             /* When the twiddle factors are not to be re-used, j is      */
1138             /* incremented by 12, to reflect the fact that 12 half words */
1139             /* are consumed in every iteration. The input data pointer   */
1140             /* increments by 4. Note that within a stage, the stride     */
1141             /* does not change and hence the offsets for the other three */
1142             /* legs, 0, h2, l1, l2.                                      */
1143             /*-----------------------------------------------------------*/
1144             j += 12;
1145             x += 4;
1147             predj = (j - fft_jmp);
1148             if (!predj) x += fft_jmp;
1149             if (!predj) j = 0;
1151             /*----------------------------------------------------------*/
1152             /* x0[0] = (xh0_0 + xh20_0 + 1) >> 1;                       */
1153             /* x0[1] = (xh1_0 + xh21_0 + 1) >> 1;                       */
1154             /* x0[2] = (xh0_1 + xh20_1 + 1) >> 1;                       */
1155             /* x0[3] = (xh1_1 + xh21_1 + 1) >> 1;                       */
1156             /*                                                          */
1157             /* "y0x0" and "y1x1" are the first four outputs of the      */
1158             /*  radix4 butterfly. These are computed on the C64x        */
1159             /* using _avg2.                                             */
1160             /*----------------------------------------------------------*/
1161             x_0o_x_1o = _avg2(xh20_0_xh21_0, xh0_0_xh1_0);
1162             x_2o_x_3o = _avg2(xh20_1_xh21_1, xh0_1_xh1_1);
1164             /*----------------------------------------------------------*/
1165             /* Perform one more stage of radix2 decompositions as shown */
1166             /* These are performed using packed data processing.        */
1167             /*                                                          */
1168             /* xt0_0 = xh0_0 - xh20_0;  yt0_0 = xh1_0 - xh21_0;         */
1169             /* xt0_1 = xh0_1 - xh20_1;  yt0_1 = xh1_1 - xh21_1;         */
1170             /* xt1_0 = xl0_0 + xl21_0;  yt2_0 = xl1_0 + xl20_0;         */
1171             /* xt2_0 = xl0_0 - xl21_0;  yt1_0 = xl1_0 - xl20_0;         */
1172             /* xt1_1 = xl0_1 + xl21_1;  yt2_1 = xl1_1 + xl20_1;         */
1173             /* xt2_1 = xl0_1 - xl21_1;  yt1_1 = xl1_1 - xl20_1;         */
1174             /*                                                          */
1175             /*----------------------------------------------------------*/
1176             mxt0_0_myt0_0 = _sub2(xh20_0_xh21_0, xh0_0_xh1_0);
1177             mxt0_1_myt0_1 = _sub2(xh20_1_xh21_1, xh0_1_xh1_1);
1178             /*----------------------------------------------------------*/
1179             /* Repack coefficients to sustain packed data processing.   */
1180             /* This requires a swizzle so that intermediate results     */
1181             /* end up in the right spot. This is needed to compute      */
1182             /* other intermediate results using add2 and sub2.          */
1183             /*----------------------------------------------------------*/
1184             xl1_0_xl0_0 = _rotl(xl0_0_xl1_0, 16);
1185             xl1_1_xl0_1 = _rotl(xl0_1_xl1_1, 16);
1187             //;----------------------------------------------------------;
1188             //; xt1_0 = xl0_0 + xl21_0   yt2_0 = xl1_0 + xl20_0          ;
1189             //; xt1_1 = xl0_1 + xl21_1   yt2_1 = xl1_1 + xl20_1          ;
1190             //; xt2_0 = xl0_0 - xl21_0   yt1_0 = xl1_0 - xl20_0          ;
1191             //; xt2_1 = xl0_1 - xl21_1   yt1_1 = xl1_1 - xl20_1          ;
1192             //;----------------------------------------------------------;
1193             yt2_0_xt1_0_yt1_0_xt2_0 = _addsub2(xl1_0_xl0_0, xl20_0_xl21_0);
1194             yt2_1_xt1_1_yt1_1_xt2_1 = _addsub2(xl1_1_xl0_1, xl20_1_xl21_1);
1196             yt2_0_xt1_0 = (_hill(yt2_0_xt1_0_yt1_0_xt2_0));
1197             yt2_1_xt1_1 = (_hill(yt2_1_xt1_1_yt1_1_xt2_1));
1198             yt1_0_xt2_0 = (_loll(yt2_0_xt1_0_yt1_0_xt2_0));
1199             yt1_1_xt2_1 = (_loll(yt2_1_xt1_1_yt1_1_xt2_1));
1201             /*----------------------------------------------------------*/
1202             /* Repack results so that xt_XXyt_XX are a packed  32 bit   */
1203             /* quantity. The operations performed above tend to leave   */
            /* the results as xt_XXyt_YY and xt_YYyt_XX. This is where  */
            /* the packed instructions come in handy: the _dpackx2      */
            /* intrinsic below swizzles the half words back into the    */
            /* matched xt_XXyt_XX order.                                */
1207             /*----------------------------------------------------------*/
1208             xt1_0_yt1_0_xt2_0_yt2_0 = _dpackx2(yt1_0_xt2_0, yt2_0_xt1_0);
1209             xt1_1_yt1_1_xt2_1_yt2_1 = _dpackx2(yt1_1_xt2_1, yt2_1_xt1_1);
1211             xt1_0_yt1_0 = (_hill(xt1_0_yt1_0_xt2_0_yt2_0));
1212             xt1_1_yt1_1 = (_hill(xt1_1_yt1_1_xt2_1_yt2_1));
1213             xt2_0_yt2_0 = (_loll(xt1_0_yt1_0_xt2_0_yt2_0));
1214             xt2_1_yt2_1 = (_loll(xt1_1_yt1_1_xt2_1_yt2_1));
1215       
1216             /*---------------------------------------------------------*/
1217             /* Notice that in this version of the code the two middle  */
1218             /* legs are swapped as indicated by the stores to x[l1]..  */
            /* x[l1 + 3] which precede the stores to x[h2]..x[h2 + 3]  */
1220             /* This reversal guarantees that a radix4 DIF butterfly    */
1221             /* produces results in digit reversed order. Note that in  */
1222             /* addition to the rounding, the shift is performed by 16, */
1223             /* as opposed to 15, to give scaling.                      */
1224             /*---------------------------------------------------------*/
1226             /*---------------------------------------------------------*/
1227             /*                                                         */
1228             /* x2[l1  ] = (si10 * yt1_0 + co10 * xt1_0 + 0x8000) >> 16;*/
1229             /* x2[l1+1] = (co10 * yt1_0 - si10 * xt1_0 + 0x8000) >> 16;*/
1230             /* x2[l1+2] = (si11 * yt1_1 + co11 * xt1_1 + 0x8000) >> 16;*/
1231             /* x2[l1+3] = (co11 * yt1_1 - si11 * xt1_1 + 0x8000) >> 16;*/
1232             /*                                                         */
1233             /* These four results are retained in registers and a      */
1234             /* double word is formed so that it can be stored with     */
1235             /* one STDW.                                               */
1236             /*---------------------------------------------------------*/
1237             xh2_0_1 = _cmpyr((_hi(co10si10_co11si11)), xt1_0_yt1_0);
1238             xh2_2_3 = _cmpyr((_lo(co10si10_co11si11)), xt1_1_yt1_1);
1239       
1240             /*---------------------------------------------------------*/
1241             /* The following code computes intermediate results for:   */
1242             /*                                                         */
1243             /* x2[h2  ] = (si20 * yt0_0 + co20 * xt0_0 + 0x8000) >> 16;*/
1244             /* x2[h2+1] = (co20 * yt0_0 - si20 * xt0_0 + 0x8000) >> 16;*/
1245             /* x2[h2+2] = (si21 * yt0_1 + co21 * xt0_1 + 0x8000) >> 16;*/
1246             /* x2[h2+3] = (co21 * yt0_1 - si21 * xt0_1 + 0x8000) >> 16;*/
1247             /*---------------------------------------------------------*/
1248             xl1_0_1 = _cmpyr((_hi(co20si20_co21si21)), mxt0_0_myt0_0);
1249             xl1_2_3 = _cmpyr((_lo(co20si20_co21si21)), mxt0_1_myt0_1);
1250     
1251             /*---------------------------------------------------------*/
1252             /* The following code computes intermediate results for:   */
1253             /*                                                         */
1254             /* x2[l2  ] = (si30 * yt2_0 + co30 * xt2_0 + 0x8000) >> 16;*/
1255             /* x2[l2+1] = (co30 * yt2_0 - si30 * xt2_0 + 0x8000) >> 16;*/
1256             /* x2[l2+2] = (si31 * yt2_1 + co31 * xt2_1 + 0x8000) >> 16;*/
1257             /* x2[l2+3] = (co31 * yt2_1 - si31 * xt2_1 + 0x8000) >> 16;*/
1258             /*---------------------------------------------------------*/
1259             xl2_0_1 = _cmpyr((_hi(co30si30_co31si31)), xt2_0_yt2_0);
1260             xl2_2_3 = _cmpyr((_lo(co30si30_co31si31)), xt2_1_yt2_1);
1262             /*---------------------------------------------------------*/
1263             /* Prepare the doubleword for each of the four outputs, by */
1264             /* the use of the _itod intrinsic that takes integers, and */
1265             /* forms a double word wide quantity.                      */
1266             /* Store out the outputs to the four legs of the butterfly */
1267             /* using aligned store double words. Notice the use of the */
1268             /* indices "l1", "l2", "h2" to derive the pointers for the */
1269             /* legs of the butterfly.                                  */
1270             /*---------------------------------------------------------*/
1271             _amemd8(&x2[0]) = _itod(x_0o_x_1o, x_2o_x_3o);
1272             _amemd8(&x2[l1]) = _itod(xl1_0_1, xl1_2_3);
1273             _amemd8(&x2[h2]) = _itod(xh2_0_1, xh2_2_3);
1274             _amemd8(&x2[l2]) = _itod(xl2_0_1, xl2_2_3);
1275         }
1276     }
1278     if (radix == 2)
1279         radix_2(ptr_x, ptr_y, npoints);
1280     else if (radix == 4)
1281         radix_4(ptr_x, ptr_y, npoints);
1283     return;
1286 void radix_2(
1287     short *restrict ptr_x,
1288     short *restrict ptr_y,
1289     int npoints
1292     short * restrict x2, * restrict x0;
1293     short * restrict y0, * restrict y1, * restrict y2, * restrict y3;
1294     short n0, j0;
1296     int i, j, l1, h2;
1297     int y_01, y_23, y_45, y_67, y_89, y_AB, y_CD, y_EF;
1299     double x_1032, x_5476, x_98BA, x_DCFE;
1301     /*-----------------------------------------------------------------*/
1302     /* The following code performs either a standard radix4 pass or a  */
1303     /* radix2 pass. Two pointers are used to access the input data.    */
1304     /* The input data is read "N/4" complex samples apart or "N/2"     */
1305     /* words apart using pointers "x0" and "x2". This produces out-    */
1306     /* puts that are 0, N/4, N/2, 3N/4 for a radix4 FFT, and 0, N/8    */
1307     /* N/2, 3N/8 for radix 2.                                          */
1308     /*-----------------------------------------------------------------*/
1309     y0 = ptr_y;
1310     y2 = ptr_y + (int)npoints;
1311     x0 = ptr_x;
1312     x2 = ptr_x + (int)(npoints >> 1);
1314     /*----------------------------------------------------------------*/
1315     /* The pointers are set at the following locations which are half */
1316     /* the offsets of a radix4 FFT.                                   */
1317     /*----------------------------------------------------------------*/
1318     y1 = y0 + (int)(npoints >> 2);
1319     y3 = y2 + (int)(npoints >> 2);
1320     l1 = _norm(npoints) + 1;
1321     j0 = 8;
1322     n0 = npoints >> 1;
1324     /*--------------------------------------------------------------------*/
1325     /* The following code reads data indentically for either a radix 4    */
1326     /* or a radix 2 style decomposition. It writes out at different       */
1327     /* locations though. It checks if either half the points, or a        */
1328     /* quarter of the complex points have been exhausted to jump to       */
1329     /* pervent double reversal.                                           */
1330     /*--------------------------------------------------------------------*/
1331     j = 0;
1333     _nassert((int)(n0) % 4 == 0);
1334     _nassert((int)(ptr_x) % 8 == 0);
1335     _nassert((int)(ptr_y) % 8 == 0);
1336     _nassert((int)(x0) % 8 == 0);
1337     _nassert((int)(x2) % 8 == 0);
1338     _nassert((int)(y0) % 8 == 0);
1339     _nassert(npoints >= 16);
1340     #pragma MUST_ITERATE(2,,2);
1341     for (i = 0; i < npoints; i += 8) {
1342         /*----------------------------------------------------------------*/
1343         /* Digit reverse the index starting from 0. The increment to "j"  */
1344         /* is either by 4, or 8.                                          */
1345         /*----------------------------------------------------------------*/
1346         h2 = _deal(j);
1347         h2 = _bitr(h2);
1348         h2 = _rotl(h2, 16);
1349         h2 = _shfl(h2);
1350         h2 >>= l1;
1351         /*----------------------------------------------------------------*/
1352         /* Read in the input data. These are transformed as a radix2.     */
1353         /*----------------------------------------------------------------*/
1354         x_1032 = _amemd8(&x0[0]);
1355         x_5476 = _amemd8(&x0[4]);
1356         x0 += 8;
1357         x_98BA = _amemd8(&x2[0]);
1358         x_DCFE = _amemd8(&x2[4]);
1359         x2 += 8;
1361         /*----------------------------------------------------------------------------*/
1362         /* Perform radix2 style decomposition.                                        */
1363         /*----------------------------------------------------------------------------*/
1364         /* n00 = x_0 + x_2;    n01 = x_1 + x_3;    y_01 = n00_n01 = _add2(x_10, x_32) */            
1365         /* n20 = x_0 - x_2;    n21 = x_1 - x_3;    y_89 = n20_n21 = _sub2(x_10, x_32) */
1366         /* n10 = x_4 + x_6;    n11 = x_5 + x_7;    y_45 = n10_n11 = _add2(x_54, x_76) */
1367         /* n30 = x_4 - x_6;    n31 = x_5 - x_7;    y_CD = n30_n31 = _sub2(x_54, x_76) */
1368         /* n02 = x_8 + x_a;    n03 = x_9 + x_b;    y_23 = n03_n02 = _add2(x_98, x_BA) */
1369         /* n12 = x_c + x_e;    n13 = x_d + x_f;    y_67 = n12_n13 = _add2(x_DC, x_FE) */
1370         /* n22 = x_8 - x_a;    n23 = x_9 - x_b;    y_AB = n22_n23 = _sub2(x_98, x_BA) */
1371         /* n32 = x_c - x_e;    n33 = x_d - x_f;    y_EF = n32_n33 = _sub2(x_DC, x_FE) */
1372         
1373         y_01 = _add2(_hi(x_1032), _lo(x_1032));
1374         y_89 = _sub2(_hi(x_1032), _lo(x_1032));
1375         y_45 = _add2(_hi(x_5476), _lo(x_5476));
1376         y_CD = _sub2(_hi(x_5476), _lo(x_5476));
1377         y_23 = _add2(_hi(x_98BA), _lo(x_98BA));
1378         y_67 = _add2(_hi(x_DCFE), _lo(x_DCFE));
1379         y_AB = _sub2(_hi(x_98BA), _lo(x_98BA));
1380         y_EF = _sub2(_hi(x_DCFE), _lo(x_DCFE));
1381     
1382         /*----Imaginary------------Real-----------*/ 
1383         /* y0[2*h2+0] = n01;    y0[2*h2+1] = n00; */
1384         /* y0[2*h2+2] = n03;    y0[2*h2+3] = n02; */
1385         /* y1[2*h2+0] = n11;    y1[2*h2+1] = n10; */
1386         /* y1[2*h2+2] = n13;    y1[2*h2+3] = n12; */
1387         /* y2[2*h2+0] = n21;    y2[2*h2+1] = n20; */
1388         /* y2[2*h2+2] = n23;    y2[2*h2+3] = n22; */
1389         /* y3[2*h2+0] = n31;    y3[2*h2+1] = n30; */
1390         /* y3[2*h2+2] = n33;    y3[2*h2+3] = n32; */
1392         /*------------------------------------------------------------*/
1393         /*  Store out the results of all four butterflies as double   */
1394         /*  words.                                                    */
1395         /*------------------------------------------------------------*/
1396         _amemd8(&y0[2 * h2]) = _itod(y_01, y_23);
1397         _amemd8(&y1[2 * h2]) = _itod(y_45, y_67);
1398         _amemd8(&y2[2 * h2]) = _itod(y_89, y_AB);
1399         _amemd8(&y3[2 * h2]) = _itod(y_CD, y_EF);
1401         j += j0;
1402         if (j == n0) {
1403             j += n0;
1404             x0 += (int)npoints >> 1;
1405             x2 += (int)npoints >> 1;
1406         }
1407     }
/* ----------------------------------------------------------------------- */
/*  radix_4                                                                */
/*                                                                         */
/*  Final output pass of the FFT: performs one radix-4 decomposition on    */
/*  each group of four complex (interleaved 16-bit imag/real) points and   */
/*  writes the results to "ptr_y" in digit reversed order. Invoked from    */
/*  the main FFT routine when the leftover "radix" is 4.                   */
/*                                                                         */
/*  ptr_x   : in,  working buffer of interleaved 16-bit samples            */
/*  ptr_y   : out, digit reversed output buffer                            */
/*  npoints : number of complex points (>= 16; alignment/size              */
/*            preconditions are asserted via _nassert below)               */
/* ----------------------------------------------------------------------- */
void radix_4(
    short *restrict ptr_x,
    short *restrict ptr_y,
    int npoints
    short * restrict x2, * restrict x0;
    short * restrict y0, * restrict y1, * restrict y2, * restrict y3;
    short n0, j0;
    int i, j, l1, h2;
    int xh1_0_xh0_0, xh1_1_xh0_1, xh1_2_xh0_2, xh1_3_xh0_3;
    int xl1_0_xl0_0, xl1_1_xl0_1, xl0_1_xl1_1, xl1_2_xl0_2, xl1_3_xl0_3, xl0_3_xl1_3;
    int n11_n10, n31_n30, n13_n12, n33_n32;
    double x_1032, x_5476, x_98BA, x_DCFE;
    long long n01_n00_n21_n20, n11_n30_n31_n10, n03_n02_n23_n22, n13_n32_n33_n12;
    /*-----------------------------------------------------------------*/
    /* The following code performs either a standard radix4 pass or a  */
    /* radix2 pass. Two pointers are used to access the input data.    */
    /* The input data is read "N/4" complex samples apart or "N/2"     */
    /* words apart using pointers "x0" and "x2". This produces out-    */
    /* puts that are 0, N/4, N/2, 3N/4 for a radix4 FFT, and 0, N/8    */
    /* N/2, 3N/8 for radix 2.                                          */
    /*-----------------------------------------------------------------*/
    y0 = ptr_y;
    y2 = ptr_y + (int)npoints;
    x0 = ptr_x;
    x2 = ptr_x + (int)(npoints >> 1);
    /*----------------------------------------------------------------*/
    /* The pointers are set at the standard radix4 offsets: y1 and y3 */
    /* are "npoints/4" complex points (npoints/2 half words) past y0  */
    /* and y2 respectively.                                           */
    /*----------------------------------------------------------------*/
    y3 = y2 + (int)(npoints >> 1);
    y1 = y0 + (int)(npoints >> 1);

    /* "l1" is the right shift that maps the 32-bit digit-reversed    */
    /* value of "j" back into the valid index range.                  */
    /* NOTE(review): relies on _norm() returning the redundant sign   */
    /* bits of npoints - confirm against the C6000 intrinsics guide.  */
    l1 = _norm(npoints) + 2;
    j0 = 4;
    n0 = npoints >> 2;
    /*--------------------------------------------------------------------*/
    /* The following code reads data identically for either a radix 4     */
    /* or a radix 2 style decomposition. It writes out at different       */
    /* locations though. It checks if either half the points, or a        */
    /* quarter of the complex points have been exhausted to jump to       */
    /* prevent double reversal.                                           */
    /*--------------------------------------------------------------------*/
    j = 0;
    _nassert((int)(n0) % 4 == 0);
    _nassert((int)(x0) % 8 == 0);
    _nassert((int)(x2) % 8 == 0);
    _nassert((int)(y0) % 8 == 0);
    _nassert(npoints >= 16);
    #pragma MUST_ITERATE(2,,2);
    for (i = 0; i < npoints; i += 8) {
        /*----------------------------------------------------------------*/
        /* Digit reverse the index starting from 0. The increment to "j"  */
        /* is either by 4, or 8.                                          */
        /*----------------------------------------------------------------*/
        h2 = _deal(j);
        h2 = _bitr(h2);
        h2 = _rotl(h2, 16);
        h2 = _shfl(h2);
        h2 >>= l1;
        /*----------------------------------------------------------------*/
        /* Read in the input data. These are transformed as a radix4.     */
        /*----------------------------------------------------------------*/
        x_1032 = _amemd8(&x0[0]);
        x_5476 = _amemd8(&x0[4]);
        x0 += 8;
        x_98BA = _amemd8(&x2[0]);
        x_DCFE = _amemd8(&x2[4]);
        x2 += 8;
        /*------------------------------------------*/
        /* Perform radix4 style decomposition.      */
        /*------------------------------------------*/
        /* xh0_0 = x_0 + x_4;    xh1_0 = x_1 + x_5; */
        /* xl0_0 = x_0 - x_4;    xl1_0 = x_1 - x_5; */
        /* xh0_1 = x_2 + x_6;    xh1_1 = x_3 + x_7; */
        /* xl0_1 = x_2 - x_6;    xl1_1 = x_3 - x_7; */
        xh1_0_xh0_0 = _add2(_hi(x_1032), _hi(x_5476));
        xl1_0_xl0_0 = _sub2(_hi(x_1032), _hi(x_5476));
        xh1_1_xh0_1 = _add2(_lo(x_1032), _lo(x_5476));
        xl1_1_xl0_1 = _sub2(_lo(x_1032), _lo(x_5476));
        /* n00 = xh0_0 + xh0_1;    n20 = xh0_0 - xh0_1; */
        /* n01 = xh1_0 + xh1_1;    n21 = xh1_0 - xh1_1; */
        n01_n00_n21_n20 = _addsub2(xh1_0_xh0_0, xh1_1_xh0_1);
        /* n30 = xl0_0 + xl1_1;    n10 = xl0_0 - xl1_1; */
        /* n11 = xl1_0 + xl0_1;    n31 = xl1_0 - xl0_1; */
        /* The _rotl by 16 swaps the two half words so that the       */
        /* cross terms line up for the packed _addsub2 below.         */
        xl0_1_xl1_1 = _rotl(xl1_1_xl0_1, 16);
        n11_n30_n31_n10 = _addsub2(xl1_0_xl0_0, xl0_1_xl1_1);
        n31_n30 = _packhl2(_loll(n11_n30_n31_n10), _hill(n11_n30_n31_n10));
        n11_n10 = _packhl2(_hill(n11_n30_n31_n10), _loll(n11_n30_n31_n10));
        /*------------------------------------------*/
        /* Perform radix4 style decomposition.      */
        /*------------------------------------------*/
        /* xh0_2 = x_8 + x_c;    xh1_2 = x_9 + x_d; */
        /* xl0_2 = x_8 - x_c;    xl1_2 = x_9 - x_d; */
        /* xh0_3 = x_a + x_e;    xh1_3 = x_b + x_f; */
        /* xl0_3 = x_a - x_e;    xl1_3 = x_b - x_f; */
        xh1_2_xh0_2 = _add2(_hi(x_98BA), _hi(x_DCFE));
        xl1_2_xl0_2 = _sub2(_hi(x_98BA), _hi(x_DCFE));
        xh1_3_xh0_3 = _add2(_lo(x_98BA), _lo(x_DCFE));
        xl1_3_xl0_3 = _sub2(_lo(x_98BA), _lo(x_DCFE));
        /* n02 = xh0_2 + xh0_3;    n22 = xh0_2 - xh0_3; */
        /* n03 = xh1_2 + xh1_3;    n23 = xh1_2 - xh1_3; */
        n03_n02_n23_n22 = _addsub2(xh1_2_xh0_2, xh1_3_xh0_3);
        /* n32 = xl0_2 + xl1_3;    n12 = xl0_2 - xl1_3; */
        /* n13 = xl1_2 + xl0_3;    n33 = xl1_2 - xl0_3; */
        xl0_3_xl1_3 = _rotl(xl1_3_xl0_3, 16);
        n13_n32_n33_n12 = _addsub2(xl1_2_xl0_2, xl0_3_xl1_3);
        n33_n32 = _packhl2(_loll(n13_n32_n33_n12), _hill(n13_n32_n33_n12));
        n13_n12 = _packhl2(_hill(n13_n32_n33_n12), _loll(n13_n32_n33_n12));
        /*-----------------------------------------------------------------*/
        /* Points that are read from successive locations map to y, y[N/4] */
        /* y[N/2], y[3N/4] in a radix4 scheme, y, y[N/8], y[N/2],y[5N/8]   */
        /*-----------------------------------------------------------------*/
        /*----Imaginary----------Real-----------*/
        /* y0[2*h2+0] = n01;  y0[2*h2+1] = n00; */
        /* y0[2*h2+2] = n03;  y0[2*h2+3] = n02; */
        /* y1[2*h2+0] = n11;  y1[2*h2+1] = n10; */
        /* y1[2*h2+2] = n13;  y1[2*h2+3] = n12; */
        /* y2[2*h2+0] = n21;  y2[2*h2+1] = n20; */
        /* y2[2*h2+2] = n23;  y2[2*h2+3] = n22; */
        /* y3[2*h2+0] = n31;  y3[2*h2+1] = n30; */
        /* y3[2*h2+2] = n33;  y3[2*h2+3] = n32; */
        _amemd8(&y0[2 * h2]) = _itod(_hill(n01_n00_n21_n20), _hill(n03_n02_n23_n22));
        _amemd8(&y1[2 * h2]) = _itod(n11_n10, n13_n12);
        _amemd8(&y2[2 * h2]) = _itod(_loll(n01_n00_n21_n20), _loll(n03_n02_n23_n22));
        _amemd8(&y3[2 * h2]) = _itod(n31_n30, n33_n32);
        j += j0;
        /* A quarter of the way through, skip past the second half of */
        /* the input: those indices are already covered by the        */
        /* reversed writes, so revisiting them would double-reverse   */
        /* the data.                                                  */
        if (j == n0) {
            j  += n0;
            x0 += (int)npoints >> 1;
            x2 += (int)npoints >> 1;
        }
    }
1564 #endif
1565 /* ======================================================================== */
1566 /*  End of file: DSP_fft16x16_i.c                                           */
1567 /* ------------------------------------------------------------------------ */
1568 /*          Copyright (C) 2011 Texas Instruments, Incorporated.             */
1569 /*                          All Rights Reserved.                            */
1570 /* ======================================================================== */