ti/dsplib/src/DSP_fft16x16/c64P/DSP_fft16x16_i.c

   1 /* ======================================================================= */
   2 /*  TEXAS INSTRUMENTS, INC.                                                */
   3 /*                                                                         */
   4 /*  DSPLIB  DSP Signal Processing Library                                  */
   5 /*                                                                         */
   6 /*  This library contains proprietary intellectual property of Texas       */
   7 /*  Instruments, Inc.  The library and its source code are protected by    */
   8 /*  various copyrights, and portions may also be protected by patents or   */
   9 /*  other legal protections.                                               */
  10 /*                                                                         */
  11 /*  This software is licensed for use with Texas Instruments TMS320        */
  12 /*  family DSPs.  This license was provided to you prior to installing     */
  13 /*  the software.  You may review this license by consulting the file      */
  14 /*  TI_license.PDF which accompanies the files in this library.            */
  15 /*                                                                         */
  16 /* ----------------------------------------------------------------------- */
  17 /*                                                                         */
  18 /*  TEXAS INSTRUMENTS, INC.                                                */
  19 /*                                                                         */
  20 /*  NAME                                                                   */
  21 /*      DSP_fft16x16 -- fft16x16                                           */
  22 /*                                                                         */
  23 /*      USAGE                                                              */
  24 /*           This routine is C-callable and can be called as:              */
  25 /*                                                                         */
  26 /*          void DSP_fft16x16_i (                                          */
  27 /*              const short * ptr_w,                                       */
  28 /*              int npoints,                                               */
  29 /*              short * ptr_x,                                             */
  30 /*              short * ptr_y                                              */
  31 /*          );                                                             */
  32 /*                                                                         */
  33 /*            ptr_w   =  input twiddle factors                             */
  34 /*            npoints =  number of points                                  */
  35 /*            ptr_x   =  transformed data reversed                         */
  36 /*            ptr_y   =  linear transformed data                           */
  37 /*                                                                         */
  38 /*                                                                         */
  39 /*  DESCRIPTION                                                            */
  40 /*    The following code performs a mixed radix FFT for "npoints" which    */
  41 /*    is either a multiple of 4 or 2. It uses logN4 - 1 stages of radix4   */
  42 /*    transform and performs either a radix2 or radix4 transform on the    */
  43 /*    last stage depending on "npoints". If "npoints" is a multiple of 4,  */
  44 /*    then this last stage is also a radix4 transform, otherwise it is a   */
  45 /*    radix2 transform.                                                    */
  46 /*                                                                         */
  47 /*                                                                         */
  48 /* int gen_twiddle_fft16x16(short *w, int n)                               */
  49 /*                                                                         */
  50 /*    int i, j, k;                                                         */
  51 /*     double M = 32767.5;                                                 */
  52 /*                                                                         */
  53 /*    for (j = 1, k = 0; j < n >> 2; j = j << 2)                           */
  54 /*    {                                                                    */
  55 /*        for (i = 0; i < n >> 2; i += j << 1)                             */
  56 /*        {                                                                */
  57 /*                                                                         */
  58 /*          w[k + 11] = d2s(M * cos(6.0 * PI * (i + j) / n));              */
  59 /*          w[k + 10] = d2s(M * sin(6.0 * PI * (i + j) / n));              */
  60 /*          w[k +  9] = d2s(M * cos(6.0 * PI * (i    ) / n));              */
  61 /*          w[k +  8] = d2s(M * sin(6.0 * PI * (i    ) / n));              */
  62 /*                                                                         */
  63 /*          w[k +  7] = d2s(M * cos(4.0 * PI * (i + j) / n));              */
  64 /*          w[k +  6] = d2s(M * sin(4.0 * PI * (i + j) / n));              */
  65 /*          w[k +  5] = d2s(M * cos(4.0 * PI * (i    ) / n));              */
  66 /*          w[k +  4] = d2s(M * sin(4.0 * PI * (i    ) / n));              */
  67 /*                                                                         */
  68 /*          w[k +  3] = d2s(M * cos(2.0 * PI * (i + j) / n));              */
  69 /*          w[k +  2] = d2s(M * sin(2.0 * PI * (i + j) / n));              */
  70 /*          w[k +  1] = d2s(M * cos(2.0 * PI * (i    ) / n));              */
  71 /*          w[k +  0] = d2s(M * sin(2.0 * PI * (i    ) / n));              */
  72 /*                                                                         */
  73 /*          k += 12;                                                       */
  74 /*                                                                         */
  75 /*                                                                         */
  76 /*      }                                                                  */
  77 /*    }                                                                    */
  78 /*                                                                         */
  79 /*    return k;                                                            */
  80 /*                                                                         */
  81 /*                                                                         */
  82 /*  ASSUMPTIONS                                                            */
  83 /*      This code works for  both "npoints" a multiple of 2 or 4.          */
  84 /*      The arrays 'x[]', 'y[]', and 'w[]' all must be aligned on a        */
  85 /*      double-word boundary for the "optimized" implementations.          */
  86 /*                                                                         */
  87 /*      The input and output data are complex, with the real/imaginary     */
  88 /*      components stored in adjacent locations in the array.  The real    */
  89 /*      components are stored at even array indices, and the imaginary     */
  90 /*      components are stored at odd array indices.                        */
  91 /*                                                                         */
  92 /*  TECHNIQUES                                                             */
  93 /*      The following C code represents an implementation of the Cooley    */
  94 /*      Tukey radix 4 DIF FFT. It accepts the inputs in normal order and   */
  95 /*      produces the outputs in digit reversed order. The natural C code   */
  96 /*      shown in this file on the other hand, accepts the inputs in nor-   */
  97 /*      mal order and produces the outputs in normal order.                */
  98 /*                                                                         */
  99 /*      Several transformations have been applied to the original Cooley   */
 100 /*      Tukey code to produce the natural C code description shown here.   */
 101 /*      In order to understand these it would first be educational to      */
 102 /*      understand some of the issues involved in the conventional Cooley  */
 103 /*      Tukey FFT code.                                                    */
 104 /*                                                                         */
 105 /*      void radix4(int n, short x[], short wn[])                          */
 106 /*      {                                                                  */
 107 /*          int    n1,  n2,  ie,   ia1,  ia2, ia3;                         */
 108 /*          int    i0,  i1,  i2,    i3,    i, j,     k;                    */
 109 /*          short  co1, co2, co3,  si1,  si2, si3;                         */
 110 /*          short  xt0, yt0, xt1,  yt1,  xt2, yt2;                         */
 111 /*          short  xh0, xh1, xh20, xh21, xl0, xl1,xl20,xl21;               */
 112 /*                                                                         */
 113 /*          n2 = n;                                                        */
 114 /*          ie = 1;                                                        */
 115 /*          for (k = n; k > 1; k >>= 2)                                    */
 116 /*          {                                                              */
 117 /*              n1 = n2;                                                   */
 118 /*              n2 >>= 2;                                                  */
 119 /*              ia1 = 0;                                                   */
 120 /*                                                                         */
 121 /*              for (j = 0; j < n2; j++)                                   */
 122 /*              {                                                          */
 123 /*                   ia2 = ia1 + ia1;                                      */
 124 /*                   ia3 = ia2 + ia1;                                      */
 125 /*                                                                         */
 126 /*                   co1 = wn[2 * ia1    ];                                */
 127 /*                   si1 = wn[2 * ia1 + 1];                                */
 128 /*                   co2 = wn[2 * ia2    ];                                */
 129 /*                   si2 = wn[2 * ia2 + 1];                                */
 130 /*                   co3 = wn[2 * ia3    ];                                */
 131 /*                   si3 = wn[2 * ia3 + 1];                                */
 132 /*                   ia1 = ia1 + ie;                                       */
 133 /*                                                                         */
 134 /*                   for (i0 = j; i0< n; i0 += n1)                         */
 135 /*                   {                                                     */
 136 /*                       i1 = i0 + n2;                                     */
 137 /*                       i2 = i1 + n2;                                     */
 138 /*                       i3 = i2 + n2;                                     */
 139 /*                                                                         */
 140 /*                                                                         */
 141 /*                       xh0  = x[2 * i0    ] + x[2 * i2    ];             */
 142 /*                       xh1  = x[2 * i0 + 1] + x[2 * i2 + 1];             */
 143 /*                       xl0  = x[2 * i0    ] - x[2 * i2    ];             */
 144 /*                       xl1  = x[2 * i0 + 1] - x[2 * i2 + 1];             */
 145 /*                                                                         */
 146 /*                       xh20 = x[2 * i1    ] + x[2 * i3    ];             */
 147 /*                       xh21 = x[2 * i1 + 1] + x[2 * i3 + 1];             */
 148 /*                       xl20 = x[2 * i1    ] - x[2 * i3    ];             */
 149 /*                       xl21 = x[2 * i1 + 1] - x[2 * i3 + 1];             */
 150 /*                                                                         */
 151 /*                       x[2 * i0    ] = xh0 + xh20;                       */
 152 /*                       x[2 * i0 + 1] = xh1 + xh21;                       */
 153 /*                                                                         */
 154 /*                       xt0  = xh0 - xh20;                                */
 155 /*                       yt0  = xh1 - xh21;                                */
 156 /*                       xt1  = xl0 + xl21;                                */
 157 /*                       yt2  = xl1 + xl20;                                */
 158 /*                       xt2  = xl0 - xl21;                                */
 159 /*                       yt1  = xl1 - xl20;                                */
 160 /*                                                                         */
 161 /*                       x[2 * i1    ] = (xt1 * co1 + yt1 * si1) >> 15;    */
 162 /*                       x[2 * i1 + 1] = (yt1 * co1 - xt1 * si1) >> 15;    */
 163 /*                       x[2 * i2    ] = (xt0 * co2 + yt0 * si2) >> 15;    */
 164 /*                       x[2 * i2 + 1] = (yt0 * co2 - xt0 * si2) >> 15;    */
 165 /*                       x[2 * i3    ] = (xt2 * co3 + yt2 * si3) >> 15;    */
 166 /*                       x[2 * i3 + 1] = (yt2 * co3 - xt2 * si3) >> 15;    */
 167 /*                   }                                                     */
 168 /*             }                                                           */
 169 /*                                                                         */
 170 /*             ie <<= 2;                                                   */
 171 /*         }                                                               */
 172 /*     }                                                                   */
 173 /*                                                                         */
 174 /*      The conventional Cooley Tukey FFT, is written using three loops.   */
 175 /*      The outermost loop "k" cycles through the stages. There are log    */
 176 /*      N to the base 4 stages in all. The loop "j" cycles through the     */
 177 /*      groups of butterflies with different twiddle factors, loop "i"     */
 178 /*      reuses the twiddle factors for the different butterflies within    */
 179 /*      a stage. It is interesting to note the following:                  */
 180 /*                                                                         */
 181 /* ----------------------------------------------------------------------- */
 182 /*      Stage#     #Groups     # Butterflies with common     #Groups*Bflys */
 183 /*                               twiddle factors                           */
 184 /* ----------------------------------------------------------------------- */
 185 /*       1         N/4          1                            N/4           */
 186 /*       2         N/16         4                            N/4           */
 187 /*       ..                                                                */
 188 /*       logN      1            N/4                          N/4           */
 189 /* ----------------------------------------------------------------------- */
 190 /*                                                                         */
 191 /*      The following statements can be made based on above observations:  */
 192 /*                                                                         */
 193 /*      a) Inner loop "i0" iterates a veriable number of times. In         */
 194 /*      particular the number of iterations quadruples every time from     */
 195 /*      1..N/4. Hence software pipelining a loop that iterates a vraiable  */
 196 /*      number of times is not profitable.                                 */
 197 /*                                                                         */
 198 /*      b) Outer loop "j" iterates a variable number of times as well.     */
 199 /*      However the number of iterations is quartered every time from      */
 200 /*      N/4 ..1. Hence the behaviour in (a) and (b) are exactly opposite   */
 201 /*      to each other.                                                     */
 202 /*                                                                         */
 203 /*      c) If the two loops "i" and "j" are colaesced together then they   */
 204 /*      will iterate for a fixed number of times namely N/4. This allows   */
 205 /*      us to combine the "i" and "j" loops into 1 loop. Optimized impl-   */
 206 /*      ementations will make use of this fact.                            */
 207 /*                                                                         */
 208 /*      In addition the Cooley Tukey FFT accesses three twiddle factors    */
 209 /*      per iteration of the inner loop, as the butterflies that re-use    */
 210 /*      twiddle factors are lumped together. This leads to accessing the   */
 211 /*      twiddle factor array at three points each sepearted by "ie". Note  */
 212 /*      that "ie" is initially 1, and is quadrupled with every iteration.  */
 213 /*      Therfore these three twiddle factors are not even contiguous in    */
 214 /*      the array.                                                         */
 215 /*                                                                         */
 216 /*      In order to vectorize the FFT, it is desirable to access twiddle   */
 217 /*      factor array using double word wide loads and fetch the twiddle    */
 218 /*      factors needed. In order to do this a modified twiddle factor      */
 219 /*      array is created, in which the factors WN/4, WN/2, W3N/4 are       */
 220 /*      arranged to be contiguous. This eliminates the seperation between  */
 221 /*      twiddle factors within a butterfly. However this implies that as   */
 222 /*      the loop is traversed from one stage to another, that we maintain  */
 223 /*      a redundant version of the twiddle factor array. Hence the size    */
 224 /*      of the twiddle factor array increases as compared to the normal    */
 225 /*      Cooley Tukey FFT.  The modified twiddle factor array is of size    */
 226 /*      "2 * N" where the conventional Cooley Tukey FFT is of size"3N/4"   */
 227 /*      where N is the number of complex points to be transformed. The     */
 228 /*      routine that generates the modified twiddle factor array was       */
 229 /*      presented earlier. With the above transformation of the FFT,       */
 230 /*      both the input data and the twiddle factor array can be accessed   */
 231 /*      using double-word wide loads to enable packed data processing.     */
 232 /*                                                                         */
 233 /*      The final stage is optimised to remove the multiplication as       */
 234 /*      w0 = 1.  This stage also performs digit reversal on the data,      */
 235 /*      so the final output is in natural order.                           */
 236 /*                                                                         */
 237 /*      The fft() code shown here performs the bulk of the computation     */
 238 /*      in place. However, because digit-reversal cannot be performed      */
 239 /*      in-place, the final result is written to a separate array, y[].    */
 240 /*                                                                         */
 241 /*      The actual twiddle factors for the FFT are cosine, - sine. The     */
 242 /*      twiddle factors stored in the table are csine and sine, hence      */
 243 /*      the sign of the "sine" term is comprehended during multipli-       */
 244 /*      cation as shown above.                                             */
 245 /*                                                                         */
 246 /*  MEMORY NOTE                                                            */
 247 /*      The optimized implementations are written for LITTLE ENDIAN.       */
 248 /*                                                                         */
 249 /* Copyright (C) 2011 Texas Instruments Incorporated - http://www.ti.com/  */
 250 /*                                                                         */
 251 /*                                                                         */
 252 /*  Redistribution and use in source and binary forms, with or without     */
 253 /*  modification, are permitted provided that the following conditions     */
 254 /*  are met:                                                               */
 255 /*                                                                         */
 256 /*    Redistributions of source code must retain the above copyright       */
 257 /*    notice, this list of conditions and the following disclaimer.        */
 258 /*                                                                         */
 259 /*    Redistributions in binary form must reproduce the above copyright    */
 260 /*    notice, this list of conditions and the following disclaimer in the  */
 261 /*    documentation and/or other materials provided with the               */
 262 /*    distribution.                                                        */
 263 /*                                                                         */
 264 /*    Neither the name of Texas Instruments Incorporated nor the names of  */
 265 /*    its contributors may be used to endorse or promote products derived  */
 266 /*    from this software without specific prior written permission.        */
 267 /*                                                                         */
 268 /*  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS    */
 269 /*  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT      */
 270 /*  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR  */
 271 /*  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT   */
 272 /*  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,  */
 273 /*  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT       */
 274 /*  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,  */
 275 /*  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY  */
 276 /*  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT    */
 277 /*  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE  */
 278 /*  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.   */
 279 /*                                                                         */
 280 /* ======================================================================= */
 281
 282 #pragma CODE_SECTION(DSP_fft16x16_i, ".text:optimized");
 283
 284 #include "DSP_fft16x16_i.h"
 285
 286 static inline void radix_2(short *restrict ptr_x, short *restrict ptr_y, int npoints);
 287 static inline void radix_4(short *restrict ptr_x, short *restrict ptr_y, int npoints);
 288
 289 #ifdef _LITTLE_ENDIAN
 290 void DSP_fft16x16_i (
 291     const short * restrict ptr_w,
 292     int npoints,
 293     short * restrict ptr_x,
 294     short * restrict ptr_y
 295 )
 296 {
 297     const short * restrict w;
 298     const short * restrict w2;
 299     short * restrict x, * restrict x2, * restrict x0;
 300
 301     int j, i, l1, radix;
 302     int l2, h2, fft_jmp;
 303     int predj, tw_offset, stride;
 304
 305     int x_1o_x_0o, x_3o_x_2o;
 306     int xl2_1_0, xl2_3_2;
 307
 308     unsigned int myt0_0_mxt0_0, myt0_1_mxt0_1;
 309     unsigned int xl1_1_0, xl1_3_2, xh2_1_0, xh2_3_2;
 310     unsigned int yt1_0_xt1_0, yt1_1_xt1_1, yt2_0_xt2_0, yt2_1_xt2_1;
 311
 312     unsigned int xl0_0_xl1_0, xt1_0_yt2_0;
 313     unsigned int xt2_0_yt1_0, xl0_1_xl1_1;
 314     unsigned int xt1_1_yt2_1, xt2_1_yt1_1;
 315
 316     unsigned int xh1_0_xh0_0, xh1_1_xh0_1;
 317     unsigned int xl1_0_xl0_0, xl1_1_xl0_1;
 318     unsigned int xh21_0_xh20_0, xh21_1_xh20_1;
 319     unsigned int xl21_0_xl20_0, xl21_1_xl20_1;
 320
 321     double x_3210;
 322     double co11si11_co10si10, co21si21_co20si20, co31si31_co30si30;
 323     double x_l1_32_x_l1_10, x_l2_32_x_l2_10, x_h2_32_x_h2_10;
 324
 325     long long xt1_0_yt2_0_xt2_0_yt1_0, xt1_1_yt2_1_xt2_1_yt1_1;
 326     long long yt1_0_xt1_0_yt2_0_xt2_0, yt1_1_xt1_1_yt2_1_xt2_1;
 327     long long xh1_0_xh0_0_xl1_0_xl0_0, xh1_1_xh0_1_xl1_1_xl0_1;
 328     long long xh21_0_xh20_0_xl21_0_xl20_0, xh21_1_xh20_1_xl21_1_xl20_1;
 329
 330     /*---------------------------------------------------------------------*/
 331     /* Determine the magnitude od the number of points to be transformed.  */
 332     /* Check whether we can use a radix4 decomposition or a mixed radix    */
 333     /* transformation, by determining modulo 2.                            */
 334     /*---------------------------------------------------------------------*/
 335     radix = _norm(npoints) & 1 ? 2 : 4;
 336
 337     /*----------------------------------------------------------------------*/
 338     /* The stride is quartered with every iteration of the outer loop. It   */
 339     /* denotes the seperation between any two adjacent inputs to the butter */
 340     /* -fly. This should start out at N/4, hence stride is initially set to */
 341     /* N. For every stride, 6*stride twiddle factors are accessed. The      */
 342     /* "tw_offset" is the offset within the current twiddle factor sub-     */
 343     /* table. This is set to zero, at the start of the code and is used to  */
 344     /* obtain the appropriate sub-table twiddle pointer by offseting it     */
 345     /* with the base pointer "ptr_w".                                       */
 346     /*----------------------------------------------------------------------*/
 347     stride = npoints;
 348     tw_offset = 0;
 349     fft_jmp = 6 * stride;
 350
 351     _nassert(stride > 4);
 352     #pragma MUST_ITERATE(1,,1);
 353     while (stride > radix) {
 354         /*-----------------------------------------------------------------*/
 355         /* At the start of every iteration of the outer loop, "j" is set   */
 356         /* to zero, as "w" is pointing to the correct location within the  */
 357         /* twiddle factor array. For every iteration of the inner loop     */
 358         /* 6 * stride twiddle factors are accessed. For eg,                */
 359         /*                                                                 */
 360         /* #Iteration of outer loop  # twiddle factors    #times cycled    */
 361         /*  1                          6 N/4               1               */
 362         /*  2                          6 N/16              4               */
 363         /*  ...                                                            */
 364         /*-----------------------------------------------------------------*/
 365         j = 0;
 366         fft_jmp >>= 2;
 367
 368         /*-----------------------------------------------------------------*/
 369         /* Set up offsets to access "N/4", "N/2", "3N/4" complex point or  */
 370         /* "N/2", "N", "3N/2" half word                                    */
 371         /*-----------------------------------------------------------------*/
 372         h2 = stride >> 1;
 373         l1 = stride;
 374         l2 = stride + (stride >> 1);
 375
 376         /*-----------------------------------------------------------------*/
 377         /*  Reset "x" to point to the start of the input data array.       */
 378         /* "tw_offset" starts off at 0, and increments by "6 * stride"     */
 379         /*  The stride quarters with every iteration of the outer loop     */
 380         /*-----------------------------------------------------------------*/
 381         x = ptr_x;
 382         w = ptr_w + tw_offset;
 383         tw_offset += fft_jmp;
 384         stride >>= 2;
 385
 386         /*----------------------------------------------------------------*/
 387         /* The following loop iterates through the different butterflies, */
 388         /* within a given stage. Recall that there are logN to base 4     */
 389         /* stages. Certain butterflies share the twiddle factors. These   */
 390         /* are grouped together. On the very first stage there are no     */
 391         /* butterflies that share the twiddle factor, all N/4 butter-     */
 392         /* flies have different factors. On the next stage two sets of    */
 393         /* N/8 butterflies share the same twiddle factor. Hence after     */
 394         /* half the butterflies are performed, j the index into the       */
 395         /* factor array resets to 0, and the twiddle factors are reused.  */
 396         /* When this happens, the data pointer 'x' is incremented by the  */
 397         /* fft_jmp amount. In addition the following code is unrolled to  */
 398         /* perform "2" radix4 butterflies in parallel.                    */
 399         /*----------------------------------------------------------------*/
 400         _nassert(npoints >= 16);
 401         #pragma MUST_ITERATE(2,,2);
 402         for (i = 0; i < npoints; i += 8) {
 403             /*------------------------------------------------------------*/
 404             /* Read the first 12 twiddle factors, six of which are used   */
 405             /* for one radix 4 butterfly and six of which are used for    */
 406             /* next one.                                                  */
 407             /*                                                            */
 408             /*  co10 = w[j+1];        si10 = w[j+0];                      */
 409             /*  co11 = w[j+3];        si11 = w[j+2];                      */
 410             /*  co20 = w[j+5];        si20 = w[j+4];                      */
 411             /*  co21 = w[j+7];        si21 = w[j+6];                      */
 412             /*  co30 = w[j+9];        si30 = w[j+8];                      */
 413             /*  co31 = w[j+11];       si31 = w[j+10];                     */
 414             /*                                                            */
 415             /*  The code shown above performs half word wide loads. The   */
 416             /*  C64x can perform double word wide loads. This is done by  */
 417             /*  the use of the _amemd8_const intrinsic. The const quali-  */
 418             /*  fies the fact that this array will only be read from and  */
 419             /*  not written to.                                           */
 420             /*------------------------------------------------------------*/
 421             w2 = w + j;
 422             co11si11_co10si10 = _amemd8_const(w2);
 423             co21si21_co20si20 = _amemd8_const(&w2[4]);
 424             co31si31_co30si30 = _amemd8_const(&w2[8]);
 425
 426             /*------------------------------------------------------------*/
 427             /* Read in the first complex input for the butterflies.       */
 428             /* 1st complex input to 1st butterfly: x[0] + jx[1]           */
 429             /* 1st complex input to 2nd butterfly: x[2] + jx[3]           */
 430             /* Read in the complex inputs for the butterflies. Each of the*/
 431             /* successive complex inputs of the butterfly are seperated   */
 432             /* by a fixed amount known as stride. The stride starts out   */
 433             /* at N/4, and quarters with every stage.                     */
 434             /*                                                            */
 435             /*  x_0    =   x[0];          x_1    =   x[1];                */
 436             /*  x_2    =   x[2];          x_3    =   x[3];                */
 437             /*  x_l1_0 =   x[l1  ];       x_l1_1 =   x[l1 + 1];           */
 438             /*  x_l1_2 =   x[l1 + 2];     x_l1_3 =   x[l1 + 3];           */
 439             /*  x_l2_0 =   x[l2  ];       x_l2_1 =   x[l2 + 1];           */
 440             /*  x_l2_2 =   x[l2 + 2];     x_l2_3 =   x[l2 + 3];           */
 441             /*  x_h2_0 =   x[h2  ];       x_h2_1 =   x[h2 + 1];           */
 442             /*  x_h2_2 =   x[h2 + 2];     x_h2_3 =   x[h2 + 3];           */
 443             /*                                                            */
 444             /* These loads are performed using _amemd8. Note that there   */
 445             /* is no const qualifier in this case as this loop writes     */
 446             /* back results into the data array "x". The "a" in _amemd8   */
 447             /* qualifies the fact that the loads are aligned.             */
 448             /*------------------------------------------------------------*/
 449             x_3210 = _amemd8(&x[0]);
 450             x_l1_32_x_l1_10 = _amemd8(&x[l1]);
 451             x_l2_32_x_l2_10 = _amemd8(&x[l2]);
 452             x_h2_32_x_h2_10 = _amemd8(&x[h2]);
 453
 454             /*-----------------------------------------------------------*/
 455             /* Two butterflies are evaluated in parallel. The following  */
 456             /* results will be shown for one butterfly only, although    */
 457             /* both are being evaluated in parallel.                     */
 458             /*                                                           */
 459             /* Perform radix2 style DIF butterflies.                     */
 460             /*                                                           */
 461             /*  xh0_0  = x_0    + x_l1_0;     xh1_0  = x_1    + x_l1_1;  */
 462             /*  xh0_1  = x_2    + x_l1_2;     xh1_1  = x_3    + x_l1_3;  */
 463             /*  xl0_0  = x_0    - x_l1_0;     xl1_0  = x_1    - x_l1_1;  */
 464             /*  xl0_1  = x_2    - x_l1_2;     xl1_1  = x_3    - x_l1_3;  */
 465             /*  xh20_0 = x_h2_0 + x_l2_0;     xh21_0 = x_h2_1 + x_l2_1;  */
 466             /*  xh20_1 = x_h2_2 + x_l2_2;     xh21_1 = x_h2_3 + x_l2_3;  */
 467             /*  xl20_0 = x_h2_0 - x_l2_0;     xl21_0 = x_h2_1 - x_l2_1;  */
 468             /*  xl20_1 = x_h2_2 - x_l2_2;     xl21_1 = x_h2_3 - x_l2_3;  */
 469             /*                                                           */
 470             /*  These are done using packed data processing, and these   */
 471             /*  are done using _add2, _sub2.                             */
 472             /*  For example "xh1_0_xh0_0" has a packed value xh1_0,      */
 473             /*  xh0_0.                                                   */
 474             /*-----------------------------------------------------------*/
 475             xh1_0_xh0_0_xl1_0_xl0_0 = _addsub2((_lo(x_3210)), (_lo(x_l1_32_x_l1_10)));
 476             xh1_1_xh0_1_xl1_1_xl0_1 = _addsub2((_hi(x_3210)), (_hi(x_l1_32_x_l1_10)));
 477
 478             xh1_0_xh0_0 = (_hill(xh1_0_xh0_0_xl1_0_xl0_0));
 479             xh1_1_xh0_1 = (_hill(xh1_1_xh0_1_xl1_1_xl0_1));
 480             xl1_0_xl0_0 = (_loll(xh1_0_xh0_0_xl1_0_xl0_0));
 481             xl1_1_xl0_1 = (_loll(xh1_1_xh0_1_xl1_1_xl0_1));
 482
 483             xh21_0_xh20_0_xl21_0_xl20_0 = _addsub2((_lo(x_h2_32_x_h2_10)), (_lo(x_l2_32_x_l2_10)));
 484             xh21_1_xh20_1_xl21_1_xl20_1 = _addsub2((_hi(x_h2_32_x_h2_10)), (_hi(x_l2_32_x_l2_10)));
 485
 486             xh21_0_xh20_0 = (_hill(xh21_0_xh20_0_xl21_0_xl20_0));
 487             xh21_1_xh20_1 = (_hill(xh21_1_xh20_1_xl21_1_xl20_1));
 488             xl21_0_xl20_0 = (_loll(xh21_0_xh20_0_xl21_0_xl20_0));
 489             xl21_1_xl20_1 = (_loll(xh21_1_xh20_1_xl21_1_xl20_1));
 490
 491             /*----------------------------------------------------------*/
 492             /* Derive output pointers using the input pointer "x"       */
 493             /*----------------------------------------------------------*/
 494             x0 = (short *)_mvd((int)x);
 495             x2 = (short *)_mvd((int)x0);
 496
 497             /*-----------------------------------------------------------*/
 498             /* When the twiddle factors are not to be re-used, j is      */
 499             /* incremented by 12, to reflect the fact that 12 half words */
 500             /* are consumed in every iteration. The input data pointer   */
 501             /* increments by 4. Note that within a stage, the stride     */
 502             /* does not change and hence the offsets for the other three */
 503             /* legs, 0, h2, l1, l2.                                      */
 504             /*-----------------------------------------------------------*/
 505             j += 12;
 506             x += 4;
 507
 508             predj = (j - fft_jmp);
 509             if (!predj) x += fft_jmp;
 510             if (!predj) j = 0;
 511
 512             /*----------------------------------------------------------*/
 513             /* x0[0] = (xh0_0 + xh20_0 + 1) >> 1;                       */
 514             /* x0[1] = (xh1_0 + xh21_0 + 1) >> 1;                       */
 515             /* x0[2] = (xh0_1 + xh20_1 + 1) >> 1;                       */
 516             /* x0[3] = (xh1_1 + xh21_1 + 1) >> 1;                       */
 517             /*                                                          */
 518             /* "y0x0" and "y1x1" are the first four outputs of the      */
 519             /*  radix4 butterfly. These are computed on the C64x        */
 520             /* using _avg2.                                             */
 521             /*----------------------------------------------------------*/
 522             x_1o_x_0o = _avg2(xh21_0_xh20_0, xh1_0_xh0_0);
 523             x_3o_x_2o = _avg2(xh21_1_xh20_1, xh1_1_xh0_1);
 524
 525             /*----------------------------------------------------------*/
 526             /* Perform one more stage of radix2 decompositions as shown */
 527             /* These are performed using packed data processing.        */
 528             /*                                                          */
 529             /* xt0_0 = xh0_0 - xh20_0;  yt0_0 = xh1_0 - xh21_0;         */
 530             /* xt0_1 = xh0_1 - xh20_1;  yt0_1 = xh1_1 - xh21_1;         */
 531             /* xt1_0 = xl0_0 + xl21_0;  yt2_0 = xl1_0 + xl20_0;         */
 532             /* xt2_0 = xl0_0 - xl21_0;  yt1_0 = xl1_0 - xl20_0;         */
 533             /* xt1_1 = xl0_1 + xl21_1;  yt2_1 = xl1_1 + xl20_1;         */
 534             /* xt2_1 = xl0_1 - xl21_1;  yt1_1 = xl1_1 - xl20_1;         */
 535             /*                                                          */
 536             /*----------------------------------------------------------*/
 537             myt0_0_mxt0_0 = _sub2(xh21_0_xh20_0, xh1_0_xh0_0);
 538             myt0_1_mxt0_1 = _sub2(xh21_1_xh20_1, xh1_1_xh0_1);
 539
 540             /*----------------------------------------------------------*/
 541             /* Repack coefficients to sustain packed data processing.   */
 542             /* This requires a swizzle so that intermediate results     */
 543             /* end up in the right spot. This is needed to compute      */
 544             /* other intermediate results using add2 and sub2.          */
 545             /*----------------------------------------------------------*/
 546             xl0_0_xl1_0 = _rotl(xl1_0_xl0_0, 16);
 547             xl0_1_xl1_1 = _rotl(xl1_1_xl0_1, 16);
 548
 549             //;----------------------------------------------------------;
 550             //; xt1_0 = xl0_0 + xl21_0   yt2_0 = xl1_0 + xl20_0          ;
 551             //; xt1_1 = xl0_1 + xl21_1   yt2_1 = xl1_1 + xl20_1          ;
 552             //; xt2_0 = xl0_0 - xl21_0   yt1_0 = xl1_0 - xl20_0          ;
 553             //; xt2_1 = xl0_1 - xl21_1   yt1_1 = xl1_1 - xl20_1          ;
 554             //;----------------------------------------------------------;
 555             xt1_0_yt2_0_xt2_0_yt1_0 = _addsub2(xl0_0_xl1_0, xl21_0_xl20_0);
 556             xt1_1_yt2_1_xt2_1_yt1_1 = _addsub2(xl0_1_xl1_1, xl21_1_xl20_1);
 557
 558             xt1_0_yt2_0 = (_hill(xt1_0_yt2_0_xt2_0_yt1_0));
 559             xt1_1_yt2_1 = (_hill(xt1_1_yt2_1_xt2_1_yt1_1));
 560             xt2_0_yt1_0 = (_loll(xt1_0_yt2_0_xt2_0_yt1_0));
 561             xt2_1_yt1_1 = (_loll(xt1_1_yt2_1_xt2_1_yt1_1));
 562
 563             /*----------------------------------------------------------*/
 564             /* Repack results so that xt_XXyt_XX are a packed  32 bit   */
 565             /* quantity. The operations performed above tend to leave   */
 566             /* the results as xt_XXyt_YY and xt_YYyt_XX. This is where  */
 567             /* the packed instructions come in handy. For example       */
 568             /* xt_XXyt_XX = _packlh2(xt_XXyt_YY, xt_YY_yt_XX)          */
 569             /*----------------------------------------------------------*/
 570             yt1_0_xt1_0_yt2_0_xt2_0 = _dpackx2(xt1_0_yt2_0, xt2_0_yt1_0);
 571             yt1_1_xt1_1_yt2_1_xt2_1 = _dpackx2(xt1_1_yt2_1, xt2_1_yt1_1);
 572
 573             yt1_0_xt1_0 = (_hill(yt1_0_xt1_0_yt2_0_xt2_0));
 574             yt1_1_xt1_1 = (_hill(yt1_1_xt1_1_yt2_1_xt2_1));
 575             yt2_0_xt2_0 = (_loll(yt1_0_xt1_0_yt2_0_xt2_0));
 576             yt2_1_xt2_1 = (_loll(yt1_1_xt1_1_yt2_1_xt2_1));
 577
 578             /*---------------------------------------------------------*/
 579             /* Notice that in this version of the code the two middle  */
 580             /* legs are swapped as indicated by the stores to x[l1]..  */
 581             /* x[l1 + 3] which preceede the stores to x[h2]..x[h2 + 3] */
 582             /* This reversal guarantees that a radix4 DIF butterfly    */
 583             /* produces results in digit reversed order. Note that in  */
 584             /* addition to the rounding, the shift is performed by 16, */
 585             /* as opposed to 15, to give scaling.                      */
 586             /*---------------------------------------------------------*/
 587
 588             /*---------------------------------------------------------*/
 589             /*                                                         */
 590             /* x2[l1  ] = (si10 * yt1_0 + co10 * xt1_0 + 0x8000) >> 16;*/
 591             /* x2[l1+1] = (co10 * yt1_0 - si10 * xt1_0 + 0x8000) >> 16;*/
 592             /* x2[l1+2] = (si11 * yt1_1 + co11 * xt1_1 + 0x8000) >> 16;*/
 593             /* x2[l1+3] = (co11 * yt1_1 - si11 * xt1_1 + 0x8000) >> 16;*/
 594             /*                                                         */
 595             /* These four results are retained in registers and a      */
 596             /* double word is formed so that it can be stored with     */
 597             /* one STDW.                                               */
 598             /*---------------------------------------------------------*/
 599             xh2_1_0 = _cmpyr((_lo(co11si11_co10si10)), yt1_0_xt1_0);
 600             xh2_3_2 = _cmpyr((_hi(co11si11_co10si10)), yt1_1_xt1_1);
 601
 602             /*---------------------------------------------------------*/
 603             /* The following code computes intermediate results for:   */
 604             /*                                                         */
 605             /* x2[h2  ] = (si20 * yt0_0 + co20 * xt0_0 + 0x8000) >> 16;*/
 606             /* x2[h2+1] = (co20 * yt0_0 - si20 * xt0_0 + 0x8000) >> 16;*/
 607             /* x2[h2+2] = (si21 * yt0_1 + co21 * xt0_1 + 0x8000) >> 16;*/
 608             /* x2[h2+3] = (co21 * yt0_1 - si21 * xt0_1 + 0x8000) >> 16;*/
 609             /*---------------------------------------------------------*/
 610             xl1_1_0 = _cmpyr((_lo(co21si21_co20si20)), myt0_0_mxt0_0);
 611             xl1_3_2 = _cmpyr((_hi(co21si21_co20si20)), myt0_1_mxt0_1);
 612
 613             /*---------------------------------------------------------*/
 614             /* The following code computes intermediate results for:   */
 615             /*                                                         */
 616             /* x2[l2  ] = (si30 * yt2_0 + co30 * xt2_0 + 0x8000) >> 16;*/
 617             /* x2[l2+1] = (co30 * yt2_0 - si30 * xt2_0 + 0x8000) >> 16;*/
 618             /* x2[l2+2] = (si31 * yt2_1 + co31 * xt2_1 + 0x8000) >> 16;*/
 619             /* x2[l2+3] = (co31 * yt2_1 - si31 * xt2_1 + 0x8000) >> 16;*/
 620             /*---------------------------------------------------------*/
 621             xl2_1_0 = _cmpyr((_lo(co31si31_co30si30)), yt2_0_xt2_0);
 622             xl2_3_2 = _cmpyr((_hi(co31si31_co30si30)), yt2_1_xt2_1);
 623
 624             /*---------------------------------------------------------*/
 625             /* Prepare the doubleword for each of the four outputs, by */
 626             /* the use of the _itod intrinsic that takes integers, and */
 627             /* forms a double word wide quantity.                      */
 628             /* Store out the outputs to the four legs of the butterfly */
 629             /* using aligned store double words. Notice the use of the */
 630             /* indices "l1", "l2", "h2" to derive the pointers for the */
 631             /* legs of the butterfly.                                  */
 632             /*---------------------------------------------------------*/
 633             _amemd8(&x2[0]) = _itod(x_3o_x_2o, x_1o_x_0o);
 634             _amemd8(&x2[l1]) = _itod(xl1_3_2, xl1_1_0);
 635             _amemd8(&x2[h2]) = _itod(xh2_3_2, xh2_1_0);
 636             _amemd8(&x2[l2]) = _itod(xl2_3_2, xl2_1_0);
 637         }
 638     }
 639
 640     if (radix == 2)
 641         radix_2(ptr_x, ptr_y, npoints);
 642     else if (radix == 4)
 643         radix_4(ptr_x, ptr_y, npoints);
 644
 645     return;
 646 }
 647
 648 void radix_2(
 649     short *restrict ptr_x,
 650     short *restrict ptr_y,
 651     int npoints
 652 )
 653 {
 654     short * restrict x2, * restrict x0;
 655     short * restrict y0, * restrict y1, * restrict y2, * restrict y3;
 656     short n0, j0;
 657
 658     int i, j, l1, h2;
 659     int y_10, y_32, y_54, y_76, y_98, y_BA, y_DC, y_FE;
 660
 661     double x_2301, x_6745, x_AB89, x_EFCD;
 662
 663     /*-----------------------------------------------------------------*/
 664     /* The following code performs either a standard radix4 pass or a  */
 665     /* radix2 pass. Two pointers are used to access the input data.    */
 666     /* The input data is read "N/4" complex samples apart or "N/2"     */
 667     /* words apart using pointers "x0" and "x2". This produces out-    */
 668     /* puts that are 0, N/4, N/2, 3N/4 for a radix4 FFT, and 0, N/8    */
 669     /* N/2, 3N/8 for radix 2.                                          */
 670     /*-----------------------------------------------------------------*/
 671     y0 = ptr_y;
 672     y2 = ptr_y + (int)npoints;
 673     x0 = ptr_x;
 674     x2 = ptr_x + (int)(npoints >> 1);
 675
 676     /*----------------------------------------------------------------*/
 677     /* The pointers are set at the following locations which are half */
 678     /* the offsets of a radix4 FFT.                                   */
 679     /*----------------------------------------------------------------*/
 680     y1 = y0 + (int)(npoints >> 2);
 681     y3 = y2 + (int)(npoints >> 2);
 682     l1 = _norm(npoints) + 1;
 683     j0 = 8;
 684     n0 = npoints >> 1;
 685
 686     /*--------------------------------------------------------------------*/
 687     /* The following code reads data indentically for either a radix 4    */
 688     /* or a radix 2 style decomposition. It writes out at different       */
 689     /* locations though. It checks if either half the points, or a        */
 690     /* quarter of the complex points have been exhausted to jump to       */
 691     /* pervent double reversal.                                           */
 692     /*--------------------------------------------------------------------*/
 693     j = 0;
 694
 695     _nassert((int)(n0) % 4 == 0);
 696     _nassert((int)(ptr_x) % 8 == 0);
 697     _nassert((int)(ptr_y) % 8 == 0);
 698     _nassert((int)(x0) % 8 == 0);
 699     _nassert((int)(x2) % 8 == 0);
 700     _nassert((int)(y0) % 8 == 0);
 701     _nassert(npoints >= 16);
 702     #pragma MUST_ITERATE(2,,2);
 703     for (i = 0; i < npoints; i += 8) {
 704         /*----------------------------------------------------------------*/
 705         /* Digit reverse the index starting from 0. The increment to "j"  */
 706         /* is either by 4, or 8.                                          */
 707         /*----------------------------------------------------------------*/
 708         h2 = _deal(j);
 709         h2 = _bitr(h2);
 710         h2 = _rotl(h2, 16);
 711         h2 = _shfl(h2);
 712         h2 >>= l1;
 713         /*----------------------------------------------------------------*/
 714         /* Read in the input data. These are transformed as a radix2.     */
 715         /*----------------------------------------------------------------*/
 716         x_2301 = _amemd8(&x0[0]);
 717         x_6745 = _amemd8(&x0[4]);
 718         x0 += 8;
 719         x_AB89 = _amemd8(&x2[0]);
 720         x_EFCD = _amemd8(&x2[4]);
 721         x2 += 8;
 722
 723         /*----------------------------------------------------------------------------*/
 724         /* Perform radix2 style decomposition.                                        */
 725         /*----------------------------------------------------------------------------*/
 726         /* n00 = x_0 + x_2;    n01 = x_1 + x_3;    y_10 = n00_n01 = _add2(x_01, x_23) */
 727         /* n20 = x_0 - x_2;    n21 = x_1 - x_3;    y_98 = n20_n21 = _sub2(x_01, x_23) */
 728         /* n10 = x_4 + x_6;    n11 = x_5 + x_7;    y_54 = n10_n11 = _add2(x_45, x_67) */
 729         /* n30 = x_4 - x_6;    n31 = x_5 - x_7;    y_DC = n30_n31 = _sub2(x_45, x_67) */
 730         /* n02 = x_8 + x_a;    n03 = x_9 + x_b;    y_32 = n03_n02 = _add2(x_89, x_AB) */
 731         /* n12 = x_c + x_e;    n13 = x_d + x_f;    y_76 = n12_n13 = _add2(x_CD, x_EF) */
 732         /* n22 = x_8 - x_a;    n23 = x_9 - x_b;    y_BA = n22_n23 = _sub2(x_89, x_AB) */
 733         /* n32 = x_c - x_e;    n33 = x_d - x_f;    y_FE = n32_n33 = _sub2(x_CD, x_EF) */
 734
 735         y_10 = _add2(_lo(x_2301), _hi(x_2301));
 736         y_98 = _sub2(_lo(x_2301), _hi(x_2301));
 737         y_54 = _add2(_lo(x_6745), _hi(x_6745));
 738         y_DC = _sub2(_lo(x_6745), _hi(x_6745));
 739         y_32 = _add2(_lo(x_AB89), _hi(x_AB89));
 740         y_76 = _add2(_lo(x_EFCD), _hi(x_EFCD));
 741         y_BA = _sub2(_lo(x_AB89), _hi(x_AB89));
 742         y_FE = _sub2(_lo(x_EFCD), _hi(x_EFCD));
 743
 744         /*----Imaginary------------Real-----------*/
 745         /* y0[2*h2+0] = n01;    y0[2*h2+1] = n00; */
 746         /* y0[2*h2+2] = n03;    y0[2*h2+3] = n02; */
 747         /* y1[2*h2+0] = n11;    y1[2*h2+1] = n10; */
 748         /* y1[2*h2+2] = n13;    y1[2*h2+3] = n12; */
 749         /* y2[2*h2+0] = n21;    y2[2*h2+1] = n20; */
 750         /* y2[2*h2+2] = n23;    y2[2*h2+3] = n22; */
 751         /* y3[2*h2+0] = n31;    y3[2*h2+1] = n30; */
 752         /* y3[2*h2+2] = n33;    y3[2*h2+3] = n32; */
 753
 754         /*------------------------------------------------------------*/
 755         /*  Store out the results of all four butterflies as double   */
 756         /*  words.                                                    */
 757         /*------------------------------------------------------------*/
 758         _amemd8(&y0[2 * h2]) = _itod(y_32, y_10);
 759         _amemd8(&y1[2 * h2]) = _itod(y_76, y_54);
 760         _amemd8(&y2[2 * h2]) = _itod(y_BA, y_98);
 761         _amemd8(&y3[2 * h2]) = _itod(y_FE, y_DC);
 762
 763         j += j0;
 764         if (j == n0) {
 765             j += n0;
 766             x0 += (int)npoints >> 1;
 767             x2 += (int)npoints >> 1;
 768         }
 769     }
 770 }
 771
 772 void radix_4(
 773     short *restrict ptr_x,
 774     short *restrict ptr_y,
 775     int npoints
 776 )
 777 {
 778     short * restrict x2, * restrict x0;
 779     short * restrict y0, * restrict y1, * restrict y2, * restrict y3;
 780     short n0, j0;
 781
 782     int i, j, l1, h2;
 783     int xh0_0_xh1_0, xh0_1_xh1_1, xh0_2_xh1_2, xh0_3_xh1_3;
 784     int xl0_0_xl1_0, xl0_1_xl1_1, xl1_1_xl0_1, xl0_2_xl1_2, xl0_3_xl1_3, xl1_3_xl0_3;
 785     int n10_n11, n30_n31, n12_n13, n32_n33;
 786
 787     double x_2301, x_6745, x_AB89, x_EFCD;
 788
 789     long long n00_n01_n20_n21, n10_n31_n30_n11, n02_n03_n22_n23, n12_n33_n32_n13;
 790
 791     /*-----------------------------------------------------------------*/
 792     /* The following code performs either a standard radix4 pass or a  */
 793     /* radix2 pass. Two pointers are used to access the input data.    */
 794     /* The input data is read "N/4" complex samples apart or "N/2"     */
 795     /* words apart using pointers "x0" and "x2". This produces out-    */
 796     /* puts that are 0, N/4, N/2, 3N/4 for a radix4 FFT, and 0, N/8    */
 797     /* N/2, 3N/8 for radix 2.                                          */
 798     /*-----------------------------------------------------------------*/
 799     y0 = ptr_y;
 800     y2 = ptr_y + (int)npoints;
 801     x0 = ptr_x;
 802     x2 = ptr_x + (int)(npoints >> 1);
 803
 804     /*----------------------------------------------------------------*/
 805     /* The pointers are set at the following locations which are half */
 806     /* the offsets of a radix4 FFT.                                   */
 807     /*----------------------------------------------------------------*/
 808     y3 = y2 + (int)(npoints >> 1);
 809     y1 = y0 + (int)(npoints >> 1);
 810     l1 = _norm(npoints) + 2;
 811     j0 = 4;
 812     n0 = npoints >> 2;
 813
 814     /*--------------------------------------------------------------------*/
 815     /* The following code reads data indentically for either a radix 4    */
 816     /* or a radix 2 style decomposition. It writes out at different       */
 817     /* locations though. It checks if either half the points, or a        */
 818     /* quarter of the complex points have been exhausted to jump to       */
 819     /* pervent double reversal.                                           */
 820     /*--------------------------------------------------------------------*/
 821     j = 0;
 822
 823     _nassert((int)(n0) % 4 == 0);
 824     _nassert((int)(x0) % 8 == 0);
 825     _nassert((int)(x2) % 8 == 0);
 826     _nassert((int)(y0) % 8 == 0);
 827     _nassert(npoints >= 16);
 828     #pragma MUST_ITERATE(2,,2);
 829
 830     for (i = 0; i < npoints; i += 8) {
 831         /*----------------------------------------------------------------*/
 832         /* Digit reverse the index starting from 0. The increment to "j"  */
 833         /* is either by 4, or 8.                                          */
 834         /*----------------------------------------------------------------*/
 835         h2 = _deal(j);
 836         h2 = _bitr(h2);
 837         h2 = _rotl(h2, 16);
 838         h2 = _shfl(h2);
 839         h2 >>= l1;
 840
 841         /*----------------------------------------------------------------*/
 842         /* Read in the input data. These are transformed as a radix4.     */
 843         /*----------------------------------------------------------------*/
 844         x_2301 = _amemd8(&x0[0]);
 845         x_6745 = _amemd8(&x0[4]);
 846         x0 += 8;
 847         x_AB89 = _amemd8(&x2[0]);
 848         x_EFCD = _amemd8(&x2[4]);
 849         x2 += 8;
 850
 851         /*------------------------------------------*/
 852         /* Perform radix4 style decomposition.      */
 853         /*------------------------------------------*/
 854         /* xh0_0 = x_0 + x_4;    xh1_0 = x_1 + x_5; */
 855         /* xl0_0 = x_0 - x_4;    xl1_0 = x_1 - x_5; */
 856         /* xh0_1 = x_2 + x_6;    xh1_1 = x_3 + x_7; */
 857         /* xl0_1 = x_2 - x_6;    xl1_1 = x_3 - x_7; */
 858         xh0_0_xh1_0 = _add2(_lo(x_2301), _lo(x_6745));
 859         xl0_0_xl1_0 = _sub2(_lo(x_2301), _lo(x_6745));
 860         xh0_1_xh1_1 = _add2(_hi(x_2301), _hi(x_6745));
 861         xl0_1_xl1_1 = _sub2(_hi(x_2301), _hi(x_6745));
 862
 863         /* n00 = xh0_0 + xh0_1;    n20 = xh0_0 - xh0_1; */
 864         /* n01 = xh1_0 + xh1_1;    n21 = xh1_0 - xh1_1; */
 865         n00_n01_n20_n21 = _addsub2(xh0_0_xh1_0, xh0_1_xh1_1);
 866
 867         /* n30 = xl0_0 + xl1_1;    n10 = xl0_0 - xl1_1; */
 868         /* n11 = xl1_0 + xl0_1;    n31 = xl1_0 - xl0_1; */
 869         xl1_1_xl0_1 = _rotl(xl0_1_xl1_1, 16);
 870         n10_n31_n30_n11 = _addsub2(xl0_0_xl1_0, xl1_1_xl0_1);
 871
 872         n30_n31 = _packhl2(_hill(n10_n31_n30_n11), _loll(n10_n31_n30_n11));
 873         n10_n11 = _packhl2(_loll(n10_n31_n30_n11), _hill(n10_n31_n30_n11));
 874
 875         /*------------------------------------------*/
 876         /* Perform radix4 style decomposition.      */
 877         /*------------------------------------------*/
 878         /* xh0_2 = x_8 + x_c;    xh1_2 = x_9 + x_d; */
 879         /* xl0_2 = x_8 - x_c;    xl1_2 = x_9 - x_d; */
 880         /* xh0_3 = x_a + x_e;    xh1_3 = x_b + x_f; */
 881         /* xl0_3 = x_a - x_e;    xl1_3 = x_b - x_f; */
 882         xh0_2_xh1_2 = _add2(_lo(x_AB89), _lo(x_EFCD));
 883         xl0_2_xl1_2 = _sub2(_lo(x_AB89), _lo(x_EFCD));
 884         xh0_3_xh1_3 = _add2(_hi(x_AB89), _hi(x_EFCD));
 885         xl0_3_xl1_3 = _sub2(_hi(x_AB89), _hi(x_EFCD));
 886
 887         /* n02 = xh0_2 + xh0_3;    n22 = xh0_2 - xh0_3; */
 888         /* n03 = xh1_2 + xh1_3;    n23 = xh1_2 - xh1_3; */
 889         n02_n03_n22_n23 = _addsub2(xh0_2_xh1_2, xh0_3_xh1_3);
 890
 891         /* n32 = xl0_2 + xl1_3;    n12 = xl0_2 - xl1_3; */
 892         /* n13 = xl1_2 + xl0_3;    n33 = xl1_2 - xl0_3; */
 893         xl1_3_xl0_3 = _rotl(xl0_3_xl1_3, 16);
 894         n12_n33_n32_n13 = _addsub2(xl0_2_xl1_2, xl1_3_xl0_3);
 895
 896         n32_n33 = _packhl2(_hill(n12_n33_n32_n13), _loll(n12_n33_n32_n13));
 897         n12_n13 = _packhl2(_loll(n12_n33_n32_n13), _hill(n12_n33_n32_n13));
 898
 899         /*-----------------------------------------------------------------*/
 900         /* Points that are read from succesive locations map to y, y[N/4]  */
 901         /* y[N/2], y[3N/4] in a radix4 scheme, y, y[N/8], y[N/2],y[5N/8]   */
 902         /*-----------------------------------------------------------------*/
 903
 904         /*----Imaginary----------Real-----------*/
 905         /* y0[2*h2+0] = n01;  y0[2*h2+1] = n00; */
 906         /* y0[2*h2+2] = n03;  y0[2*h2+3] = n02; */
 907         /* y1[2*h2+0] = n11;  y1[2*h2+1] = n10; */
 908         /* y1[2*h2+2] = n13;  y1[2*h2+3] = n12; */
 909         /* y2[2*h2+0] = n21;  y2[2*h2+1] = n20; */
 910         /* y2[2*h2+2] = n23;  y2[2*h2+3] = n22; */
 911         /* y3[2*h2+0] = n31;  y3[2*h2+1] = n30; */
 912         /* y3[2*h2+2] = n33;  y3[2*h2+3] = n32; */
 913
 914         _amemd8(&y0[2 * h2]) = _itod(_hill(n02_n03_n22_n23), _hill(n00_n01_n20_n21));
 915         _amemd8(&y1[2 * h2]) = _itod(n12_n13, n10_n11);
 916         _amemd8(&y2[2 * h2]) = _itod(_loll(n02_n03_n22_n23), _loll(n00_n01_n20_n21));
 917         _amemd8(&y3[2 * h2]) = _itod(n32_n33, n30_n31);
 918
 919         j += j0;
 920         if (j == n0) {
 921             j  += n0;
 922             x0 += (int)npoints >> 1;
 923             x2 += (int)npoints >> 1;
 924         }
 925     }
 926 }
 927
 928 #else
 929 void DSP_fft16x16_i (
 930     const short * restrict ptr_w,
 931     int npoints,
 932     short * restrict ptr_x,
 933     short * restrict ptr_y
 934 )
 935 {
 936     const short * restrict w;
 937     const short * restrict w2;
 938     short * restrict x, * restrict x2, * restrict x0;
 939
 940     int j, i, l1, radix;
 941     int l2, h2, fft_jmp;
 942     int predj, tw_offset, stride;
 943
 944     int x_0o_x_1o, x_2o_x_3o;
 945     int xl2_0_1, xl2_2_3;
 946
 947     unsigned int mxt0_0_myt0_0, mxt0_1_myt0_1;
 948     unsigned int xl1_0_1, xl1_2_3, xh2_0_1, xh2_2_3;
 949     unsigned int xt1_0_yt1_0, xt1_1_yt1_1, xt2_0_yt2_0, xt2_1_yt2_1;
 950
 951     unsigned int xl1_0_xl0_0, yt1_0_xt2_0;
 952     unsigned int yt2_0_xt1_0, xl1_1_xl0_1;
 953     unsigned int yt2_1_xt1_1, yt1_1_xt2_1;
 954
 955     unsigned int xh0_0_xh1_0, xh0_1_xh1_1;
 956     unsigned int xl0_0_xl1_0, xl0_1_xl1_1;
 957     unsigned int xh20_0_xh21_0, xh20_1_xh21_1;
 958     unsigned int xl20_0_xl21_0, xl20_1_xl21_1;
 959
 960     double x_0123;
 961     double co10si10_co11si11, co20si20_co21si21, co30si30_co31si31;
 962     double x_l1_01_x_l1_23, x_l2_01_x_l2_23, x_h2_01_x_h2_23;
 963
 964     long long xt1_0_yt1_0_xt2_0_yt2_0, xt1_1_yt1_1_xt2_1_yt2_1;
 965     long long yt2_0_xt1_0_yt1_0_xt2_0, yt2_1_xt1_1_yt1_1_xt2_1;
 966     long long xh0_0_xh1_0_xl0_0_xl1_0, xh0_1_xh1_1_xl0_1_xl1_1;
 967     long long xh20_0_xh21_0_xl20_0_xl21_0, xh20_1_xh21_1_xl20_1_xl21_1;
 968
 969     /*---------------------------------------------------------------------*/
 970     /* Determine the magnitude od the number of points to be transformed.  */
 971     /* Check whether we can use a radix4 decomposition or a mixed radix    */
 972     /* transformation, by determining modulo 2.                            */
 973     /*---------------------------------------------------------------------*/
 974     radix = _norm(npoints) & 1 ? 2 : 4;
 975
 976     /*----------------------------------------------------------------------*/
 977     /* The stride is quartered with every iteration of the outer loop. It   */
 978     /* denotes the seperation between any two adjacent inputs to the butter */
 979     /* -fly. This should start out at N/4, hence stride is initially set to */
 980     /* N. For every stride, 6*stride twiddle factors are accessed. The      */
 981     /* "tw_offset" is the offset within the current twiddle factor sub-     */
 982     /* table. This is set to zero, at the start of the code and is used to  */
 983     /* obtain the appropriate sub-table twiddle pointer by offseting it     */
 984     /* with the base pointer "ptr_w".                                       */
 985     /*----------------------------------------------------------------------*/
 986     stride = npoints;
 987     tw_offset = 0;
 988     fft_jmp = 6 * stride;
 989
 990     _nassert(stride > 4);
 991     #pragma MUST_ITERATE(1,,1);
 992     while (stride > radix) {
 993         /*-----------------------------------------------------------------*/
 994         /* At the start of every iteration of the outer loop, "j" is set   */
 995         /* to zero, as "w" is pointing to the correct location within the  */
 996         /* twiddle factor array. For every iteration of the inner loop     */
 997         /* 6 * stride twiddle factors are accessed. For eg,                */
 998         /*                                                                 */
 999         /* #Iteration of outer loop  # twiddle factors    #times cycled    */
1000         /*  1                          6 N/4               1               */
1001         /*  2                          6 N/16              4               */
1002         /*  ...                                                            */
1003         /*-----------------------------------------------------------------*/
1004         j = 0;
1005         fft_jmp >>= 2;
1006
1007         /*-----------------------------------------------------------------*/
1008         /* Set up offsets to access "N/4", "N/2", "3N/4" complex point or  */
1009         /* "N/2", "N", "3N/2" half word                                    */
1010         /*-----------------------------------------------------------------*/
1011         h2 = stride >> 1;
1012         l1 = stride;
1013         l2 = stride + (stride >> 1);
1014
1015         /*-----------------------------------------------------------------*/
1016         /*  Reset "x" to point to the start of the input data array.       */
1017         /* "tw_offset" starts off at 0, and increments by "6 * stride"     */
1018         /*  The stride quarters with every iteration of the outer loop     */
1019         /*-----------------------------------------------------------------*/
1020         x = ptr_x;
1021         w = ptr_w + tw_offset;
1022         tw_offset += fft_jmp;
1023         stride >>= 2;
1024
1025         /*----------------------------------------------------------------*/
1026         /* The following loop iterates through the different butterflies, */
1027         /* within a given stage. Recall that there are logN to base 4     */
1028         /* stages. Certain butterflies share the twiddle factors. These   */
1029         /* are grouped together. On the very first stage there are no     */
1030         /* butterflies that share the twiddle factor, all N/4 butter-     */
1031         /* flies have different factors. On the next stage two sets of    */
1032         /* N/8 butterflies share the same twiddle factor. Hence after     */
1033         /* half the butterflies are performed, j the index into the       */
1034         /* factor array resets to 0, and the twiddle factors are reused.  */
1035         /* When this happens, the data pointer 'x' is incremented by the  */
1036         /* fft_jmp amount. In addition the following code is unrolled to  */
1037         /* perform "2" radix4 butterflies in parallel.                    */
1038         /*----------------------------------------------------------------*/
1039         _nassert(npoints >= 16);
1040         #pragma MUST_ITERATE(2,,2);
1041         for (i = 0; i < npoints; i += 8) {
1042             /*------------------------------------------------------------*/
1043             /* Read the first 12 twiddle factors, six of which are used   */
1044             /* for one radix 4 butterfly and six of which are used for    */
1045             /* next one.                                                  */
1046             /*                                                            */
1047             /*  si10 = w[j+1];        co10 = w[j+0];                      */
1048             /*  si11 = w[j+3];        co11 = w[j+2];                      */
1049             /*  si20 = w[j+5];        co20 = w[j+4];                      */
1050             /*  si21 = w[j+7];        co21 = w[j+6];                      */
1051             /*  si30 = w[j+9];        co30 = w[j+8];                      */
1052             /*  si31 = w[j+11];       co31 = w[j+10];                     */
1053             /*                                                            */
1054             /*  The code shown above performs half word wide loads. The   */
1055             /*  C64x can perform double word wide loads. This is done by  */
1056             /*  the use of the _amemd8_const intrinsic. The const quali-  */
1057             /*  fies the fact that this array will only be read from and  */
1058             /*  not written to.                                           */
1059             /*------------------------------------------------------------*/
1060             w2 = w + j;
1061             co10si10_co11si11 = _amemd8_const(w2);
1062             co20si20_co21si21 = _amemd8_const(&w2[4]);
1063             co30si30_co31si31 = _amemd8_const(&w2[8]);
1064
1065             /*------------------------------------------------------------*/
1066             /* Read in the first complex input for the butterflies.       */
1067             /* 1st complex input to 1st butterfly: x[0] + jx[1]           */
1068             /* 1st complex input to 2nd butterfly: x[2] + jx[3]           */
1069             /* Read in the complex inputs for the butterflies. Each of the*/
1070             /* successive complex inputs of the butterfly are seperated   */
1071             /* by a fixed amount known as stride. The stride starts out   */
1072             /* at N/4, and quarters with every stage.                     */
1073             /*                                                            */
1074             /*  x_0    =   x[0];          x_1    =   x[1];                */
1075             /*  x_2    =   x[2];          x_3    =   x[3];                */
1076             /*  x_l1_0 =   x[l1  ];       x_l1_1 =   x[l1 + 1];           */
1077             /*  x_l1_2 =   x[l1 + 2];     x_l1_3 =   x[l1 + 3];           */
1078             /*  x_l2_0 =   x[l2  ];       x_l2_1 =   x[l2 + 1];           */
1079             /*  x_l2_2 =   x[l2 + 2];     x_l2_3 =   x[l2 + 3];           */
1080             /*  x_h2_0 =   x[h2  ];       x_h2_1 =   x[h2 + 1];           */
1081             /*  x_h2_2 =   x[h2 + 2];     x_h2_3 =   x[h2 + 3];           */
1082             /*                                                            */
1083             /* These loads are performed using _amemd8. Note that there   */
1084             /* is no const qualifier in this case as this loop writes     */
1085             /* back results into the data array "x". The "a" in _amemd8   */
1086             /* qualifies the fact that the loads are aligned.             */
1087             /*------------------------------------------------------------*/
1088             x_0123 = _amemd8(&x[0]);
1089             x_l1_01_x_l1_23 = _amemd8(&x[l1]);
1090             x_l2_01_x_l2_23 = _amemd8(&x[l2]);
1091             x_h2_01_x_h2_23 = _amemd8(&x[h2]);
1092
1093             /*-----------------------------------------------------------*/
1094             /* Two butterflies are evaluated in parallel. The following  */
1095             /* results will be shown for one butterfly only, although    */
1096             /* both are being evaluated in parallel.                     */
1097             /*                                                           */
1098             /* Perform radix2 style DIF butterflies.                     */
1099             /*                                                           */
1100             /*  xh0_0  = x_0    + x_l1_0;     xh1_0  = x_1    + x_l1_1;  */
1101             /*  xh0_1  = x_2    + x_l1_2;     xh1_1  = x_3    + x_l1_3;  */
1102             /*  xl0_0  = x_0    - x_l1_0;     xl1_0  = x_1    - x_l1_1;  */
1103             /*  xl0_1  = x_2    - x_l1_2;     xl1_1  = x_3    - x_l1_3;  */
1104             /*  xh20_0 = x_h2_0 + x_l2_0;     xh21_0 = x_h2_1 + x_l2_1;  */
1105             /*  xh20_1 = x_h2_2 + x_l2_2;     xh21_1 = x_h2_3 + x_l2_3;  */
1106             /*  xl20_0 = x_h2_0 - x_l2_0;     xl21_0 = x_h2_1 - x_l2_1;  */
1107             /*  xl20_1 = x_h2_2 - x_l2_2;     xl21_1 = x_h2_3 - x_l2_3;  */
1108             /*                                                           */
1109             /*  These are done using packed data processing, and these   */
1110             /*  are done using _add2, _sub2.                             */
1111             /*  For example "xh1_0_xh0_0" has a packed value xh1_0,      */
1112             /*  xh0_0.                                                   */
1113             /*-----------------------------------------------------------*/
1114             xh0_0_xh1_0_xl0_0_xl1_0 = _addsub2((_hi(x_0123)), (_hi(x_l1_01_x_l1_23)));
1115             xh0_1_xh1_1_xl0_1_xl1_1 = _addsub2((_lo(x_0123)), (_lo(x_l1_01_x_l1_23)));
1116
1117             xh0_0_xh1_0 = (_hill(xh0_0_xh1_0_xl0_0_xl1_0));
1118             xh0_1_xh1_1 = (_hill(xh0_1_xh1_1_xl0_1_xl1_1));
1119             xl0_0_xl1_0 = (_loll(xh0_0_xh1_0_xl0_0_xl1_0));
1120             xl0_1_xl1_1 = (_loll(xh0_1_xh1_1_xl0_1_xl1_1));
1121
1122             xh20_0_xh21_0_xl20_0_xl21_0 = _addsub2((_hi(x_h2_01_x_h2_23)), (_hi(x_l2_01_x_l2_23)));
1123             xh20_1_xh21_1_xl20_1_xl21_1 = _addsub2((_lo(x_h2_01_x_h2_23)), (_lo(x_l2_01_x_l2_23)));
1124
1125             xh20_0_xh21_0 = (_hill(xh20_0_xh21_0_xl20_0_xl21_0));
1126             xh20_1_xh21_1 = (_hill(xh20_1_xh21_1_xl20_1_xl21_1));
1127             xl20_0_xl21_0 = (_loll(xh20_0_xh21_0_xl20_0_xl21_0));
1128             xl20_1_xl21_1 = (_loll(xh20_1_xh21_1_xl20_1_xl21_1));
1129
1130             /*----------------------------------------------------------*/
1131             /* Derive output pointers using the input pointer "x"       */
1132             /*----------------------------------------------------------*/
1133             x0 = (short *)_mvd((int)x);
1134             x2 = (short *)_mvd((int)x0);
1135
1136             /*-----------------------------------------------------------*/
1137             /* When the twiddle factors are not to be re-used, j is      */
1138             /* incremented by 12, to reflect the fact that 12 half words */
1139             /* are consumed in every iteration. The input data pointer   */
1140             /* increments by 4. Note that within a stage, the stride     */
1141             /* does not change and hence the offsets for the other three */
1142             /* legs, 0, h2, l1, l2.                                      */
1143             /*-----------------------------------------------------------*/
1144             j += 12;
1145             x += 4;
1146
1147             predj = (j - fft_jmp);
1148             if (!predj) x += fft_jmp;
1149             if (!predj) j = 0;
1150
1151             /*----------------------------------------------------------*/
1152             /* x0[0] = (xh0_0 + xh20_0 + 1) >> 1;                       */
1153             /* x0[1] = (xh1_0 + xh21_0 + 1) >> 1;                       */
1154             /* x0[2] = (xh0_1 + xh20_1 + 1) >> 1;                       */
1155             /* x0[3] = (xh1_1 + xh21_1 + 1) >> 1;                       */
1156             /*                                                          */
1157             /* "y0x0" and "y1x1" are the first four outputs of the      */
1158             /*  radix4 butterfly. These are computed on the C64x        */
1159             /* using _avg2.                                             */
1160             /*----------------------------------------------------------*/
1161             x_0o_x_1o = _avg2(xh20_0_xh21_0, xh0_0_xh1_0);
1162             x_2o_x_3o = _avg2(xh20_1_xh21_1, xh0_1_xh1_1);
1163
1164             /*----------------------------------------------------------*/
1165             /* Perform one more stage of radix2 decompositions as shown */
1166             /* These are performed using packed data processing.        */
1167             /*                                                          */
1168             /* xt0_0 = xh0_0 - xh20_0;  yt0_0 = xh1_0 - xh21_0;         */
1169             /* xt0_1 = xh0_1 - xh20_1;  yt0_1 = xh1_1 - xh21_1;         */
1170             /* xt1_0 = xl0_0 + xl21_0;  yt2_0 = xl1_0 + xl20_0;         */
1171             /* xt2_0 = xl0_0 - xl21_0;  yt1_0 = xl1_0 - xl20_0;         */
1172             /* xt1_1 = xl0_1 + xl21_1;  yt2_1 = xl1_1 + xl20_1;         */
1173             /* xt2_1 = xl0_1 - xl21_1;  yt1_1 = xl1_1 - xl20_1;         */
1174             /*                                                          */
1175             /*----------------------------------------------------------*/
1176             mxt0_0_myt0_0 = _sub2(xh20_0_xh21_0, xh0_0_xh1_0);
1177             mxt0_1_myt0_1 = _sub2(xh20_1_xh21_1, xh0_1_xh1_1);
1178             /*----------------------------------------------------------*/
1179             /* Repack coefficients to sustain packed data processing.   */
1180             /* This requires a swizzle so that intermediate results     */
1181             /* end up in the right spot. This is needed to compute      */
1182             /* other intermediate results using add2 and sub2.          */
1183             /*----------------------------------------------------------*/
1184             xl1_0_xl0_0 = _rotl(xl0_0_xl1_0, 16);
1185             xl1_1_xl0_1 = _rotl(xl0_1_xl1_1, 16);
1186
1187             //;----------------------------------------------------------;
1188             //; xt1_0 = xl0_0 + xl21_0   yt2_0 = xl1_0 + xl20_0          ;
1189             //; xt1_1 = xl0_1 + xl21_1   yt2_1 = xl1_1 + xl20_1          ;
1190             //; xt2_0 = xl0_0 - xl21_0   yt1_0 = xl1_0 - xl20_0          ;
1191             //; xt2_1 = xl0_1 - xl21_1   yt1_1 = xl1_1 - xl20_1          ;
1192             //;----------------------------------------------------------;
1193             yt2_0_xt1_0_yt1_0_xt2_0 = _addsub2(xl1_0_xl0_0, xl20_0_xl21_0);
1194             yt2_1_xt1_1_yt1_1_xt2_1 = _addsub2(xl1_1_xl0_1, xl20_1_xl21_1);
1195
1196             yt2_0_xt1_0 = (_hill(yt2_0_xt1_0_yt1_0_xt2_0));
1197             yt2_1_xt1_1 = (_hill(yt2_1_xt1_1_yt1_1_xt2_1));
1198             yt1_0_xt2_0 = (_loll(yt2_0_xt1_0_yt1_0_xt2_0));
1199             yt1_1_xt2_1 = (_loll(yt2_1_xt1_1_yt1_1_xt2_1));
1200
1201             /*----------------------------------------------------------*/
1202             /* Repack results so that xt_XXyt_XX are a packed  32 bit   */
1203             /* quantity. The operations performed above tend to leave   */
1204             /* the results as xt_XXyt_YY and xt_YYyt_XX. This is where  */
1205             /* the packed instructions come in handy. For example       */
1206             /* xt_XXyt_XX = _packlh2(xt_XXyt_YY, xt_YY_yt_XX)          */
1207             /*----------------------------------------------------------*/
1208             xt1_0_yt1_0_xt2_0_yt2_0 = _dpackx2(yt1_0_xt2_0, yt2_0_xt1_0);
1209             xt1_1_yt1_1_xt2_1_yt2_1 = _dpackx2(yt1_1_xt2_1, yt2_1_xt1_1);
1210
1211             xt1_0_yt1_0 = (_hill(xt1_0_yt1_0_xt2_0_yt2_0));
1212             xt1_1_yt1_1 = (_hill(xt1_1_yt1_1_xt2_1_yt2_1));
1213             xt2_0_yt2_0 = (_loll(xt1_0_yt1_0_xt2_0_yt2_0));
1214             xt2_1_yt2_1 = (_loll(xt1_1_yt1_1_xt2_1_yt2_1));
1215
1216             /*---------------------------------------------------------*/
1217             /* Notice that in this version of the code the two middle  */
1218             /* legs are swapped as indicated by the stores to x[l1]..  */
1219             /* x[l1 + 3] which preceede the stores to x[h2]..x[h2 + 3] */
1220             /* This reversal guarantees that a radix4 DIF butterfly    */
1221             /* produces results in digit reversed order. Note that in  */
1222             /* addition to the rounding, the shift is performed by 16, */
1223             /* as opposed to 15, to give scaling.                      */
1224             /*---------------------------------------------------------*/
1225
1226             /*---------------------------------------------------------*/
1227             /*                                                         */
1228             /* x2[l1  ] = (si10 * yt1_0 + co10 * xt1_0 + 0x8000) >> 16;*/
1229             /* x2[l1+1] = (co10 * yt1_0 - si10 * xt1_0 + 0x8000) >> 16;*/
1230             /* x2[l1+2] = (si11 * yt1_1 + co11 * xt1_1 + 0x8000) >> 16;*/
1231             /* x2[l1+3] = (co11 * yt1_1 - si11 * xt1_1 + 0x8000) >> 16;*/
1232             /*                                                         */
1233             /* These four results are retained in registers and a      */
1234             /* double word is formed so that it can be stored with     */
1235             /* one STDW.                                               */
1236             /*---------------------------------------------------------*/
1237             xh2_0_1 = _cmpyr((_hi(co10si10_co11si11)), xt1_0_yt1_0);
1238             xh2_2_3 = _cmpyr((_lo(co10si10_co11si11)), xt1_1_yt1_1);
1239
1240             /*---------------------------------------------------------*/
1241             /* The following code computes intermediate results for:   */
1242             /*                                                         */
1243             /* x2[h2  ] = (si20 * yt0_0 + co20 * xt0_0 + 0x8000) >> 16;*/
1244             /* x2[h2+1] = (co20 * yt0_0 - si20 * xt0_0 + 0x8000) >> 16;*/
1245             /* x2[h2+2] = (si21 * yt0_1 + co21 * xt0_1 + 0x8000) >> 16;*/
1246             /* x2[h2+3] = (co21 * yt0_1 - si21 * xt0_1 + 0x8000) >> 16;*/
1247             /*---------------------------------------------------------*/
1248             xl1_0_1 = _cmpyr((_hi(co20si20_co21si21)), mxt0_0_myt0_0);
1249             xl1_2_3 = _cmpyr((_lo(co20si20_co21si21)), mxt0_1_myt0_1);
1250
1251             /*---------------------------------------------------------*/
1252             /* The following code computes intermediate results for:   */
1253             /*                                                         */
1254             /* x2[l2  ] = (si30 * yt2_0 + co30 * xt2_0 + 0x8000) >> 16;*/
1255             /* x2[l2+1] = (co30 * yt2_0 - si30 * xt2_0 + 0x8000) >> 16;*/
1256             /* x2[l2+2] = (si31 * yt2_1 + co31 * xt2_1 + 0x8000) >> 16;*/
1257             /* x2[l2+3] = (co31 * yt2_1 - si31 * xt2_1 + 0x8000) >> 16;*/
1258             /*---------------------------------------------------------*/
1259             xl2_0_1 = _cmpyr((_hi(co30si30_co31si31)), xt2_0_yt2_0);
1260             xl2_2_3 = _cmpyr((_lo(co30si30_co31si31)), xt2_1_yt2_1);
1261
1262             /*---------------------------------------------------------*/
1263             /* Prepare the doubleword for each of the four outputs, by */
1264             /* the use of the _itod intrinsic that takes integers, and */
1265             /* forms a double word wide quantity.                      */
1266             /* Store out the outputs to the four legs of the butterfly */
1267             /* using aligned store double words. Notice the use of the */
1268             /* indices "l1", "l2", "h2" to derive the pointers for the */
1269             /* legs of the butterfly.                                  */
1270             /*---------------------------------------------------------*/
1271             _amemd8(&x2[0]) = _itod(x_0o_x_1o, x_2o_x_3o);
1272             _amemd8(&x2[l1]) = _itod(xl1_0_1, xl1_2_3);
1273             _amemd8(&x2[h2]) = _itod(xh2_0_1, xh2_2_3);
1274             _amemd8(&x2[l2]) = _itod(xl2_0_1, xl2_2_3);
1275         }
1276     }
1277
1278     if (radix == 2)
1279         radix_2(ptr_x, ptr_y, npoints);
1280     else if (radix == 4)
1281         radix_4(ptr_x, ptr_y, npoints);
1282
1283     return;
1284 }
1285
1286 void radix_2(
1287     short *restrict ptr_x,
1288     short *restrict ptr_y,
1289     int npoints
1290 )
1291 {
1292     short * restrict x2, * restrict x0;
1293     short * restrict y0, * restrict y1, * restrict y2, * restrict y3;
1294     short n0, j0;
1295
1296     int i, j, l1, h2;
1297     int y_01, y_23, y_45, y_67, y_89, y_AB, y_CD, y_EF;
1298
1299     double x_1032, x_5476, x_98BA, x_DCFE;
1300
1301     /*-----------------------------------------------------------------*/
1302     /* The following code performs either a standard radix4 pass or a  */
1303     /* radix2 pass. Two pointers are used to access the input data.    */
1304     /* The input data is read "N/4" complex samples apart or "N/2"     */
1305     /* words apart using pointers "x0" and "x2". This produces out-    */
1306     /* puts that are 0, N/4, N/2, 3N/4 for a radix4 FFT, and 0, N/8    */
1307     /* N/2, 3N/8 for radix 2.                                          */
1308     /*-----------------------------------------------------------------*/
1309     y0 = ptr_y;
1310     y2 = ptr_y + (int)npoints;
1311     x0 = ptr_x;
1312     x2 = ptr_x + (int)(npoints >> 1);
1313
1314     /*----------------------------------------------------------------*/
1315     /* The pointers are set at the following locations which are half */
1316     /* the offsets of a radix4 FFT.                                   */
1317     /*----------------------------------------------------------------*/
1318     y1 = y0 + (int)(npoints >> 2);
1319     y3 = y2 + (int)(npoints >> 2);
1320     l1 = _norm(npoints) + 1;
1321     j0 = 8;
1322     n0 = npoints >> 1;
1323
1324     /*--------------------------------------------------------------------*/
1325     /* The following code reads data indentically for either a radix 4    */
1326     /* or a radix 2 style decomposition. It writes out at different       */
1327     /* locations though. It checks if either half the points, or a        */
1328     /* quarter of the complex points have been exhausted to jump to       */
1329     /* pervent double reversal.                                           */
1330     /*--------------------------------------------------------------------*/
1331     j = 0;
1332
1333     _nassert((int)(n0) % 4 == 0);
1334     _nassert((int)(ptr_x) % 8 == 0);
1335     _nassert((int)(ptr_y) % 8 == 0);
1336     _nassert((int)(x0) % 8 == 0);
1337     _nassert((int)(x2) % 8 == 0);
1338     _nassert((int)(y0) % 8 == 0);
1339     _nassert(npoints >= 16);
1340     #pragma MUST_ITERATE(2,,2);
1341     for (i = 0; i < npoints; i += 8) {
1342         /*----------------------------------------------------------------*/
1343         /* Digit reverse the index starting from 0. The increment to "j"  */
1344         /* is either by 4, or 8.                                          */
1345         /*----------------------------------------------------------------*/
1346         h2 = _deal(j);
1347         h2 = _bitr(h2);
1348         h2 = _rotl(h2, 16);
1349         h2 = _shfl(h2);
1350         h2 >>= l1;
1351         /*----------------------------------------------------------------*/
1352         /* Read in the input data. These are transformed as a radix2.     */
1353         /*----------------------------------------------------------------*/
1354         x_1032 = _amemd8(&x0[0]);
1355         x_5476 = _amemd8(&x0[4]);
1356         x0 += 8;
1357         x_98BA = _amemd8(&x2[0]);
1358         x_DCFE = _amemd8(&x2[4]);
1359         x2 += 8;
1360
1361         /*----------------------------------------------------------------------------*/
1362         /* Perform radix2 style decomposition.                                        */
1363         /*----------------------------------------------------------------------------*/
1364         /* n00 = x_0 + x_2;    n01 = x_1 + x_3;    y_01 = n00_n01 = _add2(x_10, x_32) */
1365         /* n20 = x_0 - x_2;    n21 = x_1 - x_3;    y_89 = n20_n21 = _sub2(x_10, x_32) */
1366         /* n10 = x_4 + x_6;    n11 = x_5 + x_7;    y_45 = n10_n11 = _add2(x_54, x_76) */
1367         /* n30 = x_4 - x_6;    n31 = x_5 - x_7;    y_CD = n30_n31 = _sub2(x_54, x_76) */
1368         /* n02 = x_8 + x_a;    n03 = x_9 + x_b;    y_23 = n03_n02 = _add2(x_98, x_BA) */
1369         /* n12 = x_c + x_e;    n13 = x_d + x_f;    y_67 = n12_n13 = _add2(x_DC, x_FE) */
1370         /* n22 = x_8 - x_a;    n23 = x_9 - x_b;    y_AB = n22_n23 = _sub2(x_98, x_BA) */
1371         /* n32 = x_c - x_e;    n33 = x_d - x_f;    y_EF = n32_n33 = _sub2(x_DC, x_FE) */
1372
1373         y_01 = _add2(_hi(x_1032), _lo(x_1032));
1374         y_89 = _sub2(_hi(x_1032), _lo(x_1032));
1375         y_45 = _add2(_hi(x_5476), _lo(x_5476));
1376         y_CD = _sub2(_hi(x_5476), _lo(x_5476));
1377         y_23 = _add2(_hi(x_98BA), _lo(x_98BA));
1378         y_67 = _add2(_hi(x_DCFE), _lo(x_DCFE));
1379         y_AB = _sub2(_hi(x_98BA), _lo(x_98BA));
1380         y_EF = _sub2(_hi(x_DCFE), _lo(x_DCFE));
1381
1382         /*----Imaginary------------Real-----------*/
1383         /* y0[2*h2+0] = n01;    y0[2*h2+1] = n00; */
1384         /* y0[2*h2+2] = n03;    y0[2*h2+3] = n02; */
1385         /* y1[2*h2+0] = n11;    y1[2*h2+1] = n10; */
1386         /* y1[2*h2+2] = n13;    y1[2*h2+3] = n12; */
1387         /* y2[2*h2+0] = n21;    y2[2*h2+1] = n20; */
1388         /* y2[2*h2+2] = n23;    y2[2*h2+3] = n22; */
1389         /* y3[2*h2+0] = n31;    y3[2*h2+1] = n30; */
1390         /* y3[2*h2+2] = n33;    y3[2*h2+3] = n32; */
1391
1392         /*------------------------------------------------------------*/
1393         /*  Store out the results of all four butterflies as double   */
1394         /*  words.                                                    */
1395         /*------------------------------------------------------------*/
1396         _amemd8(&y0[2 * h2]) = _itod(y_01, y_23);
1397         _amemd8(&y1[2 * h2]) = _itod(y_45, y_67);
1398         _amemd8(&y2[2 * h2]) = _itod(y_89, y_AB);
1399         _amemd8(&y3[2 * h2]) = _itod(y_CD, y_EF);
1400
1401         j += j0;
1402         if (j == n0) {
1403             j += n0;
1404             x0 += (int)npoints >> 1;
1405             x2 += (int)npoints >> 1;
1406         }
1407     }
1408 }
1409
1410 void radix_4(
1411     short *restrict ptr_x,
1412     short *restrict ptr_y,
1413     int npoints
1414 )
1415 {
1416     short * restrict x2, * restrict x0;
1417     short * restrict y0, * restrict y1, * restrict y2, * restrict y3;
1418     short n0, j0;
1419
1420     int i, j, l1, h2;
1421     int xh1_0_xh0_0, xh1_1_xh0_1, xh1_2_xh0_2, xh1_3_xh0_3;
1422     int xl1_0_xl0_0, xl1_1_xl0_1, xl0_1_xl1_1, xl1_2_xl0_2, xl1_3_xl0_3, xl0_3_xl1_3;
1423     int n11_n10, n31_n30, n13_n12, n33_n32;
1424
1425     double x_1032, x_5476, x_98BA, x_DCFE;
1426
1427     long long n01_n00_n21_n20, n11_n30_n31_n10, n03_n02_n23_n22, n13_n32_n33_n12;
1428
1429     /*-----------------------------------------------------------------*/
1430     /* The following code performs either a standard radix4 pass or a  */
1431     /* radix2 pass. Two pointers are used to access the input data.    */
1432     /* The input data is read "N/4" complex samples apart or "N/2"     */
1433     /* words apart using pointers "x0" and "x2". This produces out-    */
1434     /* puts that are 0, N/4, N/2, 3N/4 for a radix4 FFT, and 0, N/8    */
1435     /* N/2, 3N/8 for radix 2.                                          */
1436     /*-----------------------------------------------------------------*/
1437     y0 = ptr_y;
1438     y2 = ptr_y + (int)npoints;
1439     x0 = ptr_x;
1440     x2 = ptr_x + (int)(npoints >> 1);
1441
1442     /*----------------------------------------------------------------*/
1443     /* The pointers are set at the following locations which are half */
1444     /* the offsets of a radix4 FFT.                                   */
1445     /*----------------------------------------------------------------*/
1446     y3 = y2 + (int)(npoints >> 1);
1447     y1 = y0 + (int)(npoints >> 1);
1448     l1 = _norm(npoints) + 2;
1449     j0 = 4;
1450     n0 = npoints >> 2;
1451
1452     /*--------------------------------------------------------------------*/
1453     /* The following code reads data indentically for either a radix 4    */
1454     /* or a radix 2 style decomposition. It writes out at different       */
1455     /* locations though. It checks if either half the points, or a        */
1456     /* quarter of the complex points have been exhausted to jump to       */
1457     /* pervent double reversal.                                           */
1458     /*--------------------------------------------------------------------*/
1459     j = 0;
1460
1461     _nassert((int)(n0) % 4 == 0);
1462     _nassert((int)(x0) % 8 == 0);
1463     _nassert((int)(x2) % 8 == 0);
1464     _nassert((int)(y0) % 8 == 0);
1465     _nassert(npoints >= 16);
1466     #pragma MUST_ITERATE(2,,2);
1467
1468     for (i = 0; i < npoints; i += 8) {
1469         /*----------------------------------------------------------------*/
1470         /* Digit reverse the index starting from 0. The increment to "j"  */
1471         /* is either by 4, or 8.                                          */
1472         /*----------------------------------------------------------------*/
1473         h2 = _deal(j);
1474         h2 = _bitr(h2);
1475         h2 = _rotl(h2, 16);
1476         h2 = _shfl(h2);
1477         h2 >>= l1;
1478
1479         /*----------------------------------------------------------------*/
1480         /* Read in the input data. These are transformed as a radix4.     */
1481         /*----------------------------------------------------------------*/
1482         x_1032 = _amemd8(&x0[0]);
1483         x_5476 = _amemd8(&x0[4]);
1484         x0 += 8;
1485         x_98BA = _amemd8(&x2[0]);
1486         x_DCFE = _amemd8(&x2[4]);
1487         x2 += 8;
1488
1489         /*------------------------------------------*/
1490         /* Perform radix4 style decomposition.      */
1491         /*------------------------------------------*/
1492         /* xh0_0 = x_0 + x_4;    xh1_0 = x_1 + x_5; */
1493         /* xl0_0 = x_0 - x_4;    xl1_0 = x_1 - x_5; */
1494         /* xh0_1 = x_2 + x_6;    xh1_1 = x_3 + x_7; */
1495         /* xl0_1 = x_2 - x_6;    xl1_1 = x_3 - x_7; */
1496         xh1_0_xh0_0 = _add2(_hi(x_1032), _hi(x_5476));
1497         xl1_0_xl0_0 = _sub2(_hi(x_1032), _hi(x_5476));
1498         xh1_1_xh0_1 = _add2(_lo(x_1032), _lo(x_5476));
1499         xl1_1_xl0_1 = _sub2(_lo(x_1032), _lo(x_5476));
1500
1501         /* n00 = xh0_0 + xh0_1;    n20 = xh0_0 - xh0_1; */
1502         /* n01 = xh1_0 + xh1_1;    n21 = xh1_0 - xh1_1; */
1503         n01_n00_n21_n20 = _addsub2(xh1_0_xh0_0, xh1_1_xh0_1);
1504
1505         /* n30 = xl0_0 + xl1_1;    n10 = xl0_0 - xl1_1; */
1506         /* n11 = xl1_0 + xl0_1;    n31 = xl1_0 - xl0_1; */
1507         xl0_1_xl1_1 = _rotl(xl1_1_xl0_1, 16);
1508         n11_n30_n31_n10 = _addsub2(xl1_0_xl0_0, xl0_1_xl1_1);
1509
1510         n31_n30 = _packhl2(_loll(n11_n30_n31_n10), _hill(n11_n30_n31_n10));
1511         n11_n10 = _packhl2(_hill(n11_n30_n31_n10), _loll(n11_n30_n31_n10));
1512
1513         /*------------------------------------------*/
1514         /* Perform radix4 style decomposition.      */
1515         /*------------------------------------------*/
1516         /* xh0_2 = x_8 + x_c;    xh1_2 = x_9 + x_d; */
1517         /* xl0_2 = x_8 - x_c;    xl1_2 = x_9 - x_d; */
1518         /* xh0_3 = x_a + x_e;    xh1_3 = x_b + x_f; */
1519         /* xl0_3 = x_a - x_e;    xl1_3 = x_b - x_f; */
1520         xh1_2_xh0_2 = _add2(_hi(x_98BA), _hi(x_DCFE));
1521         xl1_2_xl0_2 = _sub2(_hi(x_98BA), _hi(x_DCFE));
1522         xh1_3_xh0_3 = _add2(_lo(x_98BA), _lo(x_DCFE));
1523         xl1_3_xl0_3 = _sub2(_lo(x_98BA), _lo(x_DCFE));
1524
1525         /* n02 = xh0_2 + xh0_3;    n22 = xh0_2 - xh0_3; */
1526         /* n03 = xh1_2 + xh1_3;    n23 = xh1_2 - xh1_3; */
1527         n03_n02_n23_n22 = _addsub2(xh1_2_xh0_2, xh1_3_xh0_3);
1528
1529         /* n32 = xl0_2 + xl1_3;    n12 = xl0_2 - xl1_3; */
1530         /* n13 = xl1_2 + xl0_3;    n33 = xl1_2 - xl0_3; */
1531         xl0_3_xl1_3 = _rotl(xl1_3_xl0_3, 16);
1532         n13_n32_n33_n12 = _addsub2(xl1_2_xl0_2, xl0_3_xl1_3);
1533
1534         n33_n32 = _packhl2(_loll(n13_n32_n33_n12), _hill(n13_n32_n33_n12));
1535         n13_n12 = _packhl2(_hill(n13_n32_n33_n12), _loll(n13_n32_n33_n12));
1536
1537         /*-----------------------------------------------------------------*/
1538         /* Points that are read from succesive locations map to y, y[N/4]  */
1539         /* y[N/2], y[3N/4] in a radix4 scheme, y, y[N/8], y[N/2],y[5N/8]   */
1540         /*-----------------------------------------------------------------*/
1541
1542         /*----Imaginary----------Real-----------*/
1543         /* y0[2*h2+0] = n01;  y0[2*h2+1] = n00; */
1544         /* y0[2*h2+2] = n03;  y0[2*h2+3] = n02; */
1545         /* y1[2*h2+0] = n11;  y1[2*h2+1] = n10; */
1546         /* y1[2*h2+2] = n13;  y1[2*h2+3] = n12; */
1547         /* y2[2*h2+0] = n21;  y2[2*h2+1] = n20; */
1548         /* y2[2*h2+2] = n23;  y2[2*h2+3] = n22; */
1549         /* y3[2*h2+0] = n31;  y3[2*h2+1] = n30; */
1550         /* y3[2*h2+2] = n33;  y3[2*h2+3] = n32; */
1551         _amemd8(&y0[2 * h2]) = _itod(_hill(n01_n00_n21_n20), _hill(n03_n02_n23_n22));
1552         _amemd8(&y1[2 * h2]) = _itod(n11_n10, n13_n12);
1553         _amemd8(&y2[2 * h2]) = _itod(_loll(n01_n00_n21_n20), _loll(n03_n02_n23_n22));
1554         _amemd8(&y3[2 * h2]) = _itod(n31_n30, n33_n32);
1555
1556         j += j0;
1557         if (j == n0) {
1558             j  += n0;
1559             x0 += (int)npoints >> 1;
1560             x2 += (int)npoints >> 1;
1561         }
1562     }
1563 }
1564 #endif
1565 /* ======================================================================== */
1566 /*  End of file: DSP_fft16x16_i.c                                           */
1567 /* ------------------------------------------------------------------------ */
1568 /*          Copyright (C) 2011 Texas Instruments, Incorporated.             */
1569 /*                          All Rights Reserved.                            */
1570 /* ======================================================================== */
1571