[ep-processor-libraries/dsplib.git] / ti / dsplib / src / DSP_fft16x16r / c64P / DSP_fft16x16r_sa.sa
1 * ======================================================================== *
2 * TEXAS INSTRUMENTS, INC. *
3 * *
4 * NAME *
5 * DSP_fft16x16r_sa -- DSP_fft16x16r *
6 * *
7 * USAGE *
8 * *
9 * void fft16x16r( *
10 * int N, short * ptr_x, const short * ptr_w, unsigned char brev *
11 * short * ptr_y, int n_min, int offset, int n_max); *
12 * *
13 * N = length of fft in complex samples, power of 2 <=16384 *
14 * ptr_x = pointer to complex data input *
15 * ptr_w = pointer to complex twiddle factor (see below) *
16 * brev = pointer to bit reverse table containing 64 entries *
17 * n_min = smallest fft butterfly used in computation *
18 * used for decomposing fft into subffts, see notes *
19 * offset = index in complex samples of sub-fft from start of main f *
20 * n_max = size of main fft in complex samples *
21 * *
22 * (See the C compiler reference guide.) *
23 * *
24 * DESCRIPTION *
25 * The benchmark performs a mixed radix forwards fft using *
26 * a special sequence of coefficients generated in the following *
27 * way: *
28 * *
29 * -* generate vector of twiddle factors for optimized algorithm *- *
30 * void tw_gen(short * w, int N) *
31 * { *
32 * int j, k; *
33 * double x_t, y_t, theta1, theta2, theta3; *
34 * const double PI = 3.141592654, M = 32767.0; *
35 * -* M is 16383 for scale by 4 *- *
36 * *
37 * for (j=1, k=0; j <= N>>2; j = j<<2) *
38 * { *
39 * for (i=0; i < N>>2; i+=j) *
40 * { *
41 * theta1 = 2*PI*i/N; *
42 * x_t = M*cos(theta1); *
43 * y_t = M*sin(theta1); *
44 * w[k] = (short)x_t; *
45 * w[k+1] = (short)y_t; *
46 * *
47 * theta2 = 4*PI*i/N; *
48 * x_t = M*cos(theta2); *
49 * y_t = M*sin(theta2); *
50 * w[k+2] = (short)x_t; *
51 * w[k+3] = (short)y_t; *
52 * *
53 * theta3 = 6*PI*i/N; *
54 * x_t = M*cos(theta3); *
55 * y_t = M*sin(theta3); *
56 * w[k+4] = (short)x_t; *
57 * w[k+5] = (short)y_t; *
58 * k+=6; *
59 * } *
60 * } *
61 * } *
62 * This redundent set of twiddle factors is size 2*N short samples. *
63 * As pointed out later dividing these twiddle factors by 2 will give *
64 * an effective divide by 4 at each stage to guarentee no overflow. *
65 * The function is accurate to about 68dB of signal to noise ratio *
66 * to the DFT function below: *
67 * *
68 * void dft(int n, short x[], short y[]) *
69 * { *
70 * int k,i, index; *
71 * const double PI = 3.14159654; *
72 * short * p_x; *
73 * double arg, fx_0, fx_1, fy_0, fy_1, co, si; *
74 * *
75 * for(k = 0; k<n; k++) *
76 * { *
77 * p_x = x; *
78 * fy_0 = 0; *
79 * fy_1 = 0; *
80 * for(i=0; i<n; i++) *
81 * { *
82 * fx_0 = (double)p_x[0]; *
83 * fx_1 = (double)p_x[1]; *
84 * p_x += 2; *
85 * index = (i*k) % n; *
86 * arg = 2*PI*index/n; *
87 * co = cos(arg); *
88 * si = -sin(arg); *
89 * fy_0 += ((fx_0 * co) - (fx_1 * si)); *
90 * fy_1 += ((fx_1 * co) + (fx_0 * si)); *
91 * } *
92 * y[2*k] = (short)2*fy_0/sqrt(N); *
93 * y[2*k+1] = (short)2*fy_1/sqrt(N); *
94 * } *
95 * } *
96 * Scaling takes place at each stage except the last one. *
97 * This is a divide by 2 to prevent overflow. All shifts are rounded *
98 * reduce truncation noise power by 3dB. *
99 * The function takes the table and input data and calculates the fft *
100 * producing the frequency domain data in the Y array. *
101 * As the fft allows every input point to effect every output point i *
102 * a cache based system such as the c64xx, this causes cache thrashin *
103 * This is mitigated by allowing the main fft of size N to be divided *
104 * into several steps, allowing as much data reuse as possible. *
105 * *
106 * For example the following function: *
107 * *
108 * fft16x16r (1024, &x_asm[0],&w[0],y_asm,brev,4, 0,1024); *
109 * *
110 * is equvalent to: *
111 * *
112 * fft16x16r (1024,&x_asm[2*0], &w[0] ,y_asm,brev,256, 0,1024); *
113 * fft16x16r (256, &x_asm[2*0], &w[2*768],y_asm,brev,4, 0,1024); *
114 * fft16x16r (256, &x_asm[2*256],&w[2*768],y_asm,brev,4, 256,1024); *
115 * fft16x16r (256, &x_asm[2*512],&w[2*768],y_asm,brev,4, 512,1024); *
116 * fft16x16r (256, &x_asm[2*768],&w[2*768],y_asm,brev,4, 768,1024); *
117 * *
118 * Notice how the 1st fft function is called on the entire 1K data se *
119 * it covers the 1st pass of the fft until the butterfly size is 256. *
120 * The following 4 ffts do 256 pt ffts 25% of the size. These continu *
121 * down to the end when the buttefly is of size 4. The use an index t *
122 * the main twiddle factor array of 0.75*2*N. This is because the *
123 * twiddle factor array is composed of successively decimated version *
124 * of the main array. *
125 * *
126 * N not equal to a power of 4 can be used, i.e. 512. In this case to *
127 * decompose the fft the following would be needed : *
128 * *
129 * fft16x16r (512, &x_asm[0],&w[0],y_asm,brev,2, 0,512); *
130 * *
131 * is equvalent to: *
132 * *
133 * fft16x16r (512, &x_asm[0], &w[0], y_asm,brev,128, 0,512); *
134 * fft16x16r (128, &x_asm[2*0], &w[2*384],y_asm,brev,2, 0,512); *
135 * fft16x16r (128, &x_asm[2*128],&w[2*384],y_asm,brev,2, 128,512); *
136 * fft16x16r (128, &x_asm[2*256],&w[2*384],y_asm,brev,2, 256,512); *
137 * fft16x16r (128, &x_asm[2*384],&w[2*384],y_asm,brev,2, 384,512); *
138 * *
139 * The twiddle factor array is composed of log4(N) sets of twiddle *
140 * factors, (3/4)*N, (3/16)*N, (3/64)*N, etc. The index into this *
141 * array for each stage of the fft is calculated by summing these *
142 * indices up appropriately. *
143 * For multiple ffts they can share the same table by calling the sma *
144 * ffts from further down in the twiddle factor array. In the same wa *
145 * as the decomposition works for more data reuse. *
146 * *
147 * Thus, the above decomposition can be summarized for a general N , *
148 * radix "rad" as follows: *
149 * *
150 * fft16x16r(N, &x_cn[0], &w[0], brev, y_cn, N/4, 0, N *
151 * fft16x16r(N/4,&x_cn[0], &w[2*(3*N/4)],brev, y_cn, rad, 0, N *
152 * fft16x16r(N/4,&x_cn[2*(N/4)], &w[2*(3*N/4)],brev, y_cn, rad, N/4, N *
153 * fft16x16r(N/4,&x_cn[2*(N/2)], &w[2*(3*N/4)],brev, y_cn, rad, N/2, N *
154 * fft16x16r(N/4,&x_cn[2*(3*N/4)], &w[2*3*N/4)], brev, y_cn, rad, 3*N/4, N *
155 * *
156 * As discussed previously, N can be either a power of 4 or 2. If N *
157 * N is a power of 4, rad = 4, and if N is a power of 2, and not a *
158 * power of 4, then rad = 2. "rad" is used to control how many stages *
159 * of decomposition are performed. It is also used to dtermine whethe *
160 * a radix4 or radix2 decomposition should be performed at the last *
161 * stage. Hence when "rad" is set to "N/4" the first stage of the *
162 * transform alone is performed and the code exits. To complete the *
163 * FFT four other calls are required to perform N/4 size FFT's. In *
164 * fact the ordering of these 4 FFT's amonst themselves does not *
165 * matter and hence from a cahe perspective it helps to go through *
166 * the remaining 4 FFT's in exactly the opposite order to the first. *
167 * *
168 * This is illustrated as follows: *
169 * *
170 * fft16x16r(N, &x_cn[0], &w[0], brev, y_cn, N/4, 0, N *
171 * fft16x16r(N/4,&x_cn[2*(3*N/4)], &w[2*3*N/4)], brev, y_cn, rad, 3*N/4, N *
172 * fft16x16r(N/4,&x_cn[2*(N/2)], &w[2*(3*N/4)],brev, y_cn, rad, N/2, N *
173 * fft16x16r(N/4,&x_cn[2*(N/4)], &w[2*(3*N/4)],brev, y_cn, rad, N/4, N *
174 * fft16x16r(N/4,&x_cn[0], &w[2*(3*N/4)],brev, y_cn, rad, 0, N *
175 * *
176 * In addition this function can be used to minimize call overhead, b *
177 * completing the FFT with one function call invocation as shown belo *
178 * *
179 * fft16x16r(N, &x_cn[0], &w[0], y_cn, brev, rad, 0, N) *
180 * *
181 * ASSUMPTIONS: *
182 * n must be a power of 2 and n >= 8 n <= 16384 points. *
183 * Complex time data x and twiddle facotrs w are aligned on double *
184 * word boundares. Real values are stored in even word positions and *
185 * imaginary values in odd positions. *
186 * *
187 * All data is in short precision integer fixed point form. The *
188 * complex frequency data will be returned in linear order. *
189 * *
190 * *
191 * MEMORY NOTE: *
192 * Configuration is LITTLE ENDIAN the code will not function if the - *
193 * flag is enabled but it can be modified for BIG ENDIAN usage. *
194 * *
195 * TECHNIQUES *
196 * A special sequence of coeffs. used as generated above *
197 * produces the fft. This collapses the inner 2 loops in the *
198 * taditional Burrus and Parks implementation Fortran Code. *
199 * *
200 * The revised FFT uses a redundant sequence of twiddle factors to *
201 * allow a linear access through the data. This linear access enables *
202 * data and instruction level parallelism. *
203 * The data produced by the fft16x16r fft is in normal form, the *
204 * whole data array is written into a new output buffer. *
205 * *
206 * The fft16x16r butterfly is bit reversed, i.e. the inner 2 points o *
207 * the butterfly are corssed over, this has the effect of making the *
208 * data come out in bit reversed rather than in radix 4 digit reverse *
209 * order. This simplifies the last pass of the loop. It is performed *
210 * using the _bitr instruction on C64x architecture. It is performed *
211 * using a macro BIT_REV instead. *
212 * *
213 * NOTES *
214 * For more aggressive overflow control the shift in the DC term can *
215 * adjusted to 2 and the twiddle factors shifted right by 1. This giv *
216 * a divide by 4 at each stage. For better accuracy the data can be p *
217 * asserted left by so many bits so that as it builds in magnitude th *
218 * divide by 2 prevents too much growth. An optimal point for example *
219 * with an 8192pt fft with input data precision of 8 bits is to asert *
220 * the input 4 bits left to make it 12 bits. This gives an SNR of 68d *
221 * at the output. By trying combinations the optimal can be found. *
222 * If scaling isnot required it is possible to replace the MPY by SMP *
223 * this will give a shift left by 1 so a shift right by 16 gives a *
224 * total 15 bit shift right. The DC term must be adjusted to give a *
225 * zero shift. *
226 * *
227 * C CODE *
228 * The following code is the traditional Burrus and Parks implemen- *
229 * tation, which performs a mixed radix FFT capable of 2^M, 4^M. *
230 * However it does not support multiple calls. It uses a traditional *
231 * twiddle factor array wn, generated as follows: *
232 * *
233 * const double M = 32767.0; *
234 * const double PI = 3.141592654; *
235 * *
236 * for (i=0, k = 0; i < 3*(N>>2); i++) *
237 * { *
238 * theta1 = 2*PI*i/N; *
239 * x_t = M*cos(theta1); *
240 * y_t = M*sin(theta1); *
241 * wn[k] = (short) x_t; *
242 * if (x_t >= M) wn[k ] = 0x7fff; *
243 * wn[k+1] = (short) y_t; *
244 * if (y_t >= M) wn[k+1] = 0x7fff; *
245 * k+=2; *
246 * } *
247 * *
248 * The C code that implements the traditional mixed radix FFT is *
249 * shown below. It has three nested loops, one for the stages, *
250 * one for the groups of butterflies, one for the passes. *
251 * *
252 * void fft16x16r_cn(int n, short x[], short wn[], *
253 * unsigned char brev[], short y[], int radix, int offset, int nmax) *
254 * { *
255 * int n1, n2, ie, ia1, ia2, ia3, i0, i1, i2, i3, i, l0; *
256 * short co1, co2, co3, si1, si2, si3; *
257 * short xt0, yt0, xt1, yt1, xt2, yt2; *
258 * short xh0,xh1,xh20,xh21,xl0,xl1,xl20,xl21; *
259 * short * ptr_x0, * y0; *
260 * unsigned int j0, j1, k0, k1, k, j; *
261 * short x0, x1, x2, x3, x4, x5, x6, x7; *
262 * short xh0_0, xh1_0, xh0_1, xh1_1; *
263 * short xl0_0, xl1_0, xl0_1, xl1_1; *
264 * short yt3, yt4, yt5, yt6, yt7; *
265 * *
266 * n2 = n; *
267 * ie = 1; *
268 * for (k = n; k > radix; k >>= 2) *
269 * { *
270 * n1 = n2; *
271 * n2 >>= 2; *
272 * ia1 = 0; *
273 * for (j = 0; j < n2; j++) *
274 * { *
275 * ia2 = ia1 + ia1; *
276 * ia3 = ia2 + ia1; *
277 * co1 = w[2 * ia1 ]; *
278 * si1 = w[2 * ia1 + 1]; *
279 * co2 = w[2 * ia2 ]; *
280 * si2 = w[2 * ia2 + 1]; *
281 * co3 = w[2 * ia3 ]; *
282 * si3 = w[2 * ia3 + 1]; *
283 * ia1 = ia1 + ie; *
284 * for (i0 = j; i0 < n; i0 += n1) *
285 * { *
286 * i1 = i0 + n2; *
287 * i2 = i1 + n2; *
288 * i3 = i2 + n2; *
289 * *
290 * xh0 = x[2 * i0 ] + x[2 * i2 ]; *
291 * xh1 = x[2 * i0 + 1] + x[2 * i2 + 1]; *
292 * xl0 = x[2 * i0 ] - x[2 * i2 ]; *
293 * xl1 = x[2 * i0 + 1] - x[2 * i2 + 1]; *
294 * *
295 * xh20 = x[2 * i1 ] + x[2 * i3 ]; *
296 * xh21 = x[2 * i1 + 1] + x[2 * i3 + 1]; *
297 * xl20 = x[2 * i1 ] - x[2 * i3 ]; *
298 * xl21 = x[2 * i1 + 1] - x[2 * i3 + 1]; *
299 * *
300 * x[2 * i0 ] = (xh0 + xh20 + 1)>>1; *
301 * x[2 * i0 + 1] = (xh1 + xh21 + 1)>>1; *
302 * *
303 * xt0 = xh0 - xh20; *
304 * yt0 = xh1 - xh21; *
305 * xt1 = xl0 + xl21; *
306 * yt2 = xl1 + xl20; *
307 * xt2 = xl0 - xl21; *
308 * yt1 = xl1 - xl20; *
309 * *
310 * x[2 * i2 ]= (xt1 * co1 + yt1 * si1 + 0x00008000)>> 16*
311 * x[2 * i2 + 1]= (yt1 * co1 - xt1 * si1 + 0x00008000)>> 16*
312 * x[2 * i1 ]= (xt0 * co2 + yt0 * si2 + 0x00008000)>> 16*
313 * x[2 * i1 + 1]= (yt0 * co2 - xt0 * si2 + 0x00008000)>> 16*
314 * x[2 * i3 ]= (xt2 * co3 + yt2 * si3 + 0x00008000)>> 16*
315 * x[2 * i3 + 1]= (yt2 * co3 - xt2 * si3 + 0x00008000)>> 16*
316 * } *
317 * } *
318 * *
319 * ie <<= 2; *
320 * } *
321 * *
322 * j = 0; *
323 * ptr_x0 = x; *
324 * y0 = y; *
325 * l0 = _norm(n) - 17; *
326 * *
327 * if(radix == 2 || radix == 4) for (i = 0; i < n; i += 4) *
328 * { *
329 * j0 = (j ) & 0x3F; *
330 * j1 = (j >> 6) & 0x3F; *
331 * k0 = brev[j0]; *
332 * k1 = brev[j1]; *
333 * k = (k0 << 6) | k1; *
334 * if (l0 < 0) k = k << -l0; *
335 * else k = k >> l0; *
336 * j++; *
337 * *
338 * x0 = ptr_x0[0]; x1 = ptr_x0[1]; *
339 * x2 = ptr_x0[2]; x3 = ptr_x0[3]; *
340 * x4 = ptr_x0[4]; x5 = ptr_x0[5]; *
341 * x6 = ptr_x0[6]; x7 = ptr_x0[7]; *
342 * ptr_x0 += 8; *
343 * *
344 * xh0_0 = x0 + x4; *
345 * xh1_0 = x1 + x5; *
346 * xh0_1 = x2 + x6; *
347 * xh1_1 = x3 + x7; *
348 * *
349 * if (radix == 2) *
350 * { *
351 * xh0_0 = x0; *
352 * xh1_0 = x1; *
353 * xh0_1 = x2; *
354 * xh1_1 = x3; *
355 * } *
356 * *
357 * yt0 = xh0_0 + xh0_1; *
358 * yt1 = xh1_0 + xh1_1; *
359 * yt4 = xh0_0 - xh0_1; *
360 * yt5 = xh1_0 - xh1_1; *
361 * *
362 * xl0_0 = x0 - x4; *
363 * xl1_0 = x1 - x5; *
364 * xl0_1 = x2 - x6; *
365 * xl1_1 = x3 - x7; *
366 * *
367 * if (radix == 2) *
368 * { *
369 * xl0_0 = x4; *
370 * xl1_0 = x5; *
371 * xl1_1 = x6; *
372 * xl0_1 = x7; *
373 * } *
374 * *
375 * yt2 = xl0_0 + xl1_1; *
376 * yt3 = xl1_0 - xl0_1; *
377 * *
378 * yt6 = xl0_0 - xl1_1; *
379 * yt7 = xl1_0 + xl0_1; *
380 * *
381 * if (radix == 2) *
382 * { *
383 * yt7 = xl1_0 - xl0_1; *
384 * yt3 = xl1_0 + xl0_1; *
385 * } *
386 * *
387 * y0[k] = yt0; y0[k+1] = yt1; *
388 * k += n>>1 *
389 * y0[k] = yt2; y0[k+1] = yt3; *
390 * k += n>>1; *
391 * y0[k] = yt4; y0[k+1] = yt5; *
392 * k += n>>1; *
393 * y0[k] = yt6; y0[k+1] = yt7; *
394 * } *
395 * } *
396 * *
397 * Although code shown above is the simplest equivalent way of writin *
398 * this code, it already carries several optimization ideas. It has *
399 * a special last stage to avoid multiplication by 1. In addition it *
400 * was shown by Panos Papamichalis that if the two middle legs of a *
401 * radix 4 butterfly are reversed, the outputs for a radix4 transform *
402 * end up in the bit reversed fashion. The code also carries a linear *
403 * time look up table for bit reversal. This can be used as shown in *
404 * the code to construct a bit reversed index. The last stage perfo- *
405 * rms either a radix4 or radix2 as the case may be. *
406 * *
407 * The code shown below performs loop coalescing as it is realized *
408 * that while the "i" and "j" loop individually iterate for variable *
409 * number of times, together they always iterate for N/4 times. The *
410 * natural C code and the code shown below use a modified twiddle *
411 * factor array to allow for vectorization of the loop. In addition *
412 * bit-reversal is performed by a macro BIT_REV. This makes the bit- *
413 * reversal table redundant. *
414 * *
415 * This is the C equivalent of the assembly code without restrictions *
416 * Note that the assembly code is hand optimized and restrictions may *
417 * apply. *
418 * *
419 * *
420 * void fft16x16r_co(int n, short ptr_x[], short ptr_w[], short ptr_y[ *
421 * unsigned char brev[], int n_min, int offset, int n_max*
422 * { *
423 * int i, j, k, l1, l2, h2, predj; *
424 * int tw_offset, stride, fft_jmp; *
425 * *
426 * short x0, x1, x2, x3,x4,x5,x6,x7; *
427 * short xt0, yt0, xt1, yt1, xt2, yt2, yt3; *
428 * short yt4, yt5, yt6, yt7; *
429 * short si1,si2,si3,co1,co2,co3; *
430 * short xh0,xh1,xh20,xh21,xl0,xl1,xl20,xl21; *
431 * short x_0, x_1, x_l1, x_l1p1, x_h2 , x_h2p1, x_l2, x_l2p1; *
432 * short xl0_0, xl1_0, xl0_1, xl1_1; *
433 * short xh0_0, xh1_0, xh0_1, xh1_1; *
434 * short *x,*w; *
435 * int k0, k1, j0, j1, l0, radix; *
436 * short * y0, * ptr_x0, * ptr_x2; *
437 * *
438 * radix = n_min; *
439 * *
440 * stride = n; -* n is the number of complex samples *- *
441 * tw_offset = 0; *
442 * while (stride > radix) *
443 * { *
444 * j = 0; *
445 * fft_jmp = stride + (stride>>1); *
446 * h2 = stride>>1; *
447 * l1 = stride; *
448 * l2 = stride + (stride>>1); *
449 * x = ptr_x; *
450 * w = ptr_w + tw_offset; *
451 * *
452 * for (i = 0; i < n; i += 4) *
453 * { *
454 * co1 = w[j]; *
455 * si1 = w[j+1]; *
456 * co2 = w[j+2]; *
457 * si2 = w[j+3]; *
458 * co3 = w[j+4]; *
459 * si3 = w[j+5]; *
460 * *
461 * x_0 = x[0]; *
462 * x_1 = x[1]; *
463 * x_h2 = x[h2]; *
464 * x_h2p1 = x[h2+1]; *
465 * x_l1 = x[l1]; *
466 * x_l1p1 = x[l1+1]; *
467 * x_l2 = x[l2]; *
468 * x_l2p1 = x[l2+1]; *
469 * *
470 * xh0 = x_0 + x_l1; *
471 * xh1 = x_1 + x_l1p1; *
472 * xl0 = x_0 - x_l1; *
473 * xl1 = x_1 - x_l1p1; *
474 * *
475 * xh20 = x_h2 + x_l2; *
476 * xh21 = x_h2p1 + x_l2p1; *
477 * xl20 = x_h2 - x_l2; *
478 * xl21 = x_h2p1 - x_l2p1; *
479 * *
480 * ptr_x0 = x; *
481 * ptr_x0[0] = ((short) (xh0 + xh20))>>1; can be changed to*
482 * ptr_x0[1] = ((short) (xh1 + xh21))>>1; can be changed to*
483 * *
484 * ptr_x2 = ptr_x0; *
485 * x += 2; *
486 * j += 6; *
487 * predj = (j - fft_jmp); *
488 * if (!predj) x += fft_jmp; *
489 * if (!predj) j = 0; *
490 * *
491 * xt0 = xh0 - xh20; *
492 * yt0 = xh1 - xh21; *
493 * xt1 = xl0 + xl21; *
494 * yt2 = xl1 + xl20; *
495 * xt2 = xl0 - xl21; *
496 * yt1 = xl1 - xl20; *
497 * *
498 * ptr_x2[l1 ] = (xt1 * co1 + yt1 * si1 + 0x8000)>>16; *
499 * ptr_x2[l1+1] = (yt1 * co1 - xt1 * si1 + 0x8000)>>16; *
500 * ptr_x2[h2 ] = (xt0 * co2 + yt0 * si2 + 0x8000)>>16; *
501 * ptr_x2[h2+1] = (yt0 * co2 - xt0 * si2 + 0x8000)>>16; *
502 * ptr_x2[l2 ] = (xt2 * co3 + yt2 * si3 + 0x8000)>>16; *
503 * ptr_x2[l2+1] = (yt2 * co3 - xt2 * si3 + 0x8000)>>16; *
504 * } *
505 * tw_offset += fft_jmp; *
506 * stride = stride>>2; *
507 * }-* end while *- *
508 * *
509 * j = offset>>2; *
510 * *
511 * ptr_x0 = ptr_x; *
512 * y0 = ptr_y; *
513 * l0 = _norm(nmax) - 17; -* get size of fft *- *
514 * *
515 * if (radix <= 4) for (i = 0; i < n; i += 4) *
516 * { *
517 * -* reversal computation *- *
518 * *
519 * j0 = (j ) & 0x3F; *
520 * j1 = (j >> 6) & 0x3F; *
521 * k0 = brev[j0]; *
522 * k1 = brev[j1]; *
523 * k = (k0 << 6) | k1; *
524 * k = k >> l0; *
525 * j++; -* multiple of 4 index *- *
526 * *
527 * x0 = ptr_x0[0]; x1 = ptr_x0[1]; *
528 * x2 = ptr_x0[2]; x3 = ptr_x0[3]; *
529 * x4 = ptr_x0[4]; x5 = ptr_x0[5]; *
530 * x6 = ptr_x0[6]; x7 = ptr_x0[7]; *
531 * ptr_x0 += 8; *
532 * *
533 * xh0_0 = x0 + x4; *
534 * xh1_0 = x1 + x5; *
535 * xh0_1 = x2 + x6; *
536 * xh1_1 = x3 + x7; *
537 * *
538 * if (radix == 2) { *
539 * xh0_0 = x0; *
540 * xh1_0 = x1; *
541 * xh0_1 = x2; *
542 * xh1_1 = x3; *
543 * } *
544 * *
545 * yt0 = xh0_0 + xh0_1; *
546 * yt1 = xh1_0 + xh1_1; *
547 * yt4 = xh0_0 - xh0_1; *
548 * yt5 = xh1_0 - xh1_1; *
549 * *
550 * xl0_0 = x0 - x4; *
551 * xl1_0 = x1 - x5; *
552 * xl0_1 = x2 - x6; *
553 * xl1_1 = x3 - x7; *
554 * *
555 * if (radix == 2) { *
556 * xl0_0 = x4; *
557 * xl1_0 = x5; *
558 * xl1_1 = x6; *
559 * xl0_1 = x7; *
560 * } *
561 * *
562 * yt2 = xl0_0 + xl1_1; *
563 * yt3 = xl1_0 - xl0_1; *
564 * yt6 = xl0_0 - xl1_1; *
565 * yt7 = xl1_0 + xl0_1; *
566 * *
567 * if (radix == 2) { *
568 * yt7 = xl1_0 - xl0_1; *
569 * yt3 = xl1_0 + xl0_1; *
570 * } *
571 * *
572 * y0[k] = yt0; y0[k+1] = yt1; *
573 * k += n>>1; *
574 * y0[k] = yt2; y0[k+1] = yt3; *
575 * k += n>>1; *
576 * y0[k] = yt4; y0[k+1] = yt5; *
577 * k += n>>1; *
578 * y0[k] = yt6; y0[k+1] = yt7; *
579 * } *
580 * } *
581 * *
582 * Copyright (C) 2011 Texas Instruments Incorporated - http://www.ti.com/ * \r
583 * *\r
584 * *\r
585 * Redistribution and use in source and binary forms, with or without *\r
586 * modification, are permitted provided that the following conditions *\r
587 * are met: *\r
588 * *\r
589 * Redistributions of source code must retain the above copyright *\r
590 * notice, this list of conditions and the following disclaimer. *\r
591 * *\r
592 * Redistributions in binary form must reproduce the above copyright *\r
593 * notice, this list of conditions and the following disclaimer in the *\r
594 * documentation and/or other materials provided with the *\r
595 * distribution. *\r
596 * *\r
597 * Neither the name of Texas Instruments Incorporated nor the names of *\r
598 * its contributors may be used to endorse or promote products derived *\r
599 * from this software without specific prior written permission. *\r
600 * *\r
601 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS *\r
602 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT *\r
603 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR *\r
604 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT *\r
605 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, *\r
606 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT *\r
607 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, *\r
608 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY *\r
609 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT *\r
610 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE *\r
611 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *\r
612 * *\r
613 * ======================================================================= *\r
614 .sect ".text:psa"
615 .if __TI_EABI__
616 .asg DSP_fft16x16r, _DSP_fft16x16r
617 .endif
619 \r
1226 * ======================================================================== *
1227 * End of file: DSP_fft16x16r_sa.sa *
1228 * ------------------------------------------------------------------------ *
1229 * Copyright (C) 2011 Texas Instruments, Incorporated. *
1230 * All Rights Reserved. *
1231 * ======================================================================== *