[ep-processor-libraries/dsplib.git] / ti / dsplib / src / DSP_fir_gen_hM17_rA8X8 / c66 / DSP_fir_gen_hM17_rA8X8.c
1 /* ======================================================================= */
2 /* TEXAS INSTRUMENTS, INC. */
3 /* */
4 /* DSPLIB DSP Signal Processing Library */
5 /* */
6 /* This library contains proprietary intellectual property of Texas */
7 /* Instruments, Inc. The library and its source code are protected by */
8 /* various copyrights, and portions may also be protected by patents or */
9 /* other legal protections. */
10 /* */
11 /* This software is licensed for use with Texas Instruments TMS320 */
12 /* family DSPs. This license was provided to you prior to installing */
13 /* the software. You may review this license by consulting the file */
14 /* TI_license.PDF which accompanies the files in this library. */
15 /* */
16 /* ----------------------------------------------------------------------- */
17 /* */
18 /* DSP_fir_gen_hM17_rA8X8.c -- FIR Filter (Radix 8) */
19 /* Intrinsic C Implementation */
20 /* */
21 /* Rev 0.0.1 */
22 /* */
23 /* Usage */
24 /* This routine is C-callable and can be called as: */
25 /* */
26 /* void DSP_fir_gen_hM17_rA8X8 ( */
27 /* const short *restrict x, */
28 /* const short *restrict h, */
29 /* short *restrict r, */
30 /* int nh, */
31 /* int nr */
32 /* ) */
33 /* */
34 /* Description */
35 /* Computes a real FIR filter (direct-form) using coefficients */
36 /* stored in vector h. The real data input is stored in vector x. */
37 /* The filter output result is stored in vector r. Input data and */
38 /* filter taps are 16-bit, with intermediate values kept at 32-bit */
39 /* precision. Filter taps are expected in Q15 format. */
40 /* */
41 /* Assumptions */
42 /* Arrays x, h, and r do not overlap */
43 /* nr >= 8; nr % 8 == 0 */
44 /* */
45 /* Copyright (C) 2011 Texas Instruments Incorporated - http://www.ti.com/ */
46 /* */
47 /* */
48 /* Redistribution and use in source and binary forms, with or without */
49 /* modification, are permitted provided that the following conditions */
50 /* are met: */
51 /* */
52 /* Redistributions of source code must retain the above copyright */
53 /* notice, this list of conditions and the following disclaimer. */
54 /* */
55 /* Redistributions in binary form must reproduce the above copyright */
56 /* notice, this list of conditions and the following disclaimer in the */
57 /* documentation and/or other materials provided with the */
58 /* distribution. */
59 /* */
60 /* Neither the name of Texas Instruments Incorporated nor the names of */
61 /* its contributors may be used to endorse or promote products derived */
62 /* from this software without specific prior written permission. */
63 /* */
64 /* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS */
65 /* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT */
66 /* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR */
67 /* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT */
68 /* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, */
69 /* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT */
70 /* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, */
71 /* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY */
72 /* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
73 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE */
74 /* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
75 /* */
76 /* ======================================================================= */
78 #pragma CODE_SECTION(DSP_fir_gen_hM17_rA8X8, ".text:optimized");
80 #include "DSP_fir_gen_hM17_rA8X8.h"
82 #ifdef _LITTLE_ENDIAN
83 void DSP_fir_gen_hM17_rA8X8 (
84 const short *restrict x, /* Input array [nr+nh-1 elements] */
85 const short *restrict h, /* Coeff array [nh elements] */
86 short *restrict r, /* Output array [nr elements] */
87 int nh, /* Number of coefficients */
88 int nr /* Number of output samples */
89 )
90 {
91 int i, j, mask;
92 int h_32, h_10;
93 int r_76, r_54, r_32, r_10;
94 int dot_0, dot_1, dot_2, dot_3, dot_4, dot_5, dot_6, dot_7;
95 int sum_0, sum_1, sum_2, sum_3, sum_4, sum_5, sum_6, sum_7;
97 long long x_3210, x_4321, x_5432, x_6543, x_7654, x_8765, x_9876, x_A987, x_BA98;
98 long long h_3210, h_3210_mod;
100 /* Pad the end of the filter tap array with zeros in order make the
101 * array length a multiple of 4. This allows the loop to be optimized. */
102 mask = nh & 3;
103 if (!mask) mask = 4;
105 h_3210 = _amem8_const(&h[nh - mask]);
106 h_10 = _loll(h_3210);
107 h_32 = _hill(h_3210);
109 if (mask == 3) {
110 /* Mask out the last 16 bits (1 short) */
111 h_32 &= 0x0000FFFF;
112 }
113 if (mask == 2) {
114 /* Mask out the last 32 bits (2 shorts) */
115 h_32 = 0;
116 }
117 if (mask == 1) {
118 /* Mask out the last 48 bits (3 shorts) */
119 h_32 = 0;
120 h_10 &= 0x0000FFFF;
121 }
123 /* Modified taps to be used during the the filter tap loop */
124 h_3210_mod = _itoll(h_32, h_10);
126 _nassert(nr % 8 == 0);
127 _nassert(nr >= 8);
128 for (j = 0; j < nr; j += 8) {
129 sum_0 = 0;
130 sum_1 = 0;
131 sum_2 = 0;
132 sum_3 = 0;
133 sum_4 = 0;
134 sum_5 = 0;
135 sum_6 = 0;
136 sum_7 = 0;
138 /* Loop through the number of coefficients, 4 at a time
139 * summing all dot products together to form 8 results */
140 #pragma MUST_ITERATE(1,,1)
141 for (i = 0; i < nh; i += 4)
142 {
143 /* Load the 4 elements of the coefficient array using aligned
144 * double word wide load */
145 h_3210 = _amem8_const(&h[i]);
147 /* Use modified taps during the last iteration of the loop. */
148 if (i >= nh - 4) {
149 h_3210 = h_3210_mod;
150 }
152 /* Load the 12 elements of the data array using aligned
153 * double word wide loads */
154 x_3210 = _amem8_const(&x[i + j]);
155 x_7654 = _amem8_const(&x[i + j + 4]);
156 x_BA98 = _amem8_const(&x[i + j + 8]);
158 /* Form result 2's and 6's filter taps */
159 x_5432 = _dmv(_loll(x_7654),_hill(x_3210));
160 x_9876 = _dmv(_loll(x_BA98),_hill(x_7654));
162 /* Form result 1's and 5's filter taps */
163 x_4321 = _mem8_const(&x[i + j + 1]);
164 x_8765 = _dpacklh2(x_9876,x_7654);
166 /* Load result 3's and 7'3 filter taps */
167 x_6543 = _mem8_const(&x[i + j + 3]);
168 x_A987 = _dpacklh2(x_BA98,x_9876);
170 /* Compute result 3 dot product xAh3 + x9h2 + x8h1 + x7h0 */
171 dot_7 = _dotp4h(x_A987, h_3210);
172 /* Compute result 2 dot product x9h3 + x8h2 + x7h1 + x6h0 */
173 dot_6 = _dotp4h(x_9876, h_3210);
174 /* Compute result 1 dot product x8h3 + x7h2 + x6h1 + x5h0 */
175 dot_5 = _dotp4h(x_8765, h_3210);
176 /* Compute result 0 dot product x7h3 + x6h2 + x5h1 + x4h0 */
177 dot_4 = _dotp4h(x_7654, h_3210);
178 /* Compute result 3 dot product x6h3 + x5h2 + x4h1 + x3h0 */
179 dot_3 = _dotp4h(x_6543, h_3210);
180 /* Compute result 2 dot product x5h3 + x4h2 + x3h1 + x2h0 */
181 dot_2 = _dotp4h(x_5432, h_3210);
182 /* Compute result 1 dot product x4h3 + x3h2 + x2h1 + x1h0 */
183 dot_1 = _dotp4h(x_4321, h_3210);
184 /* Compute result 0 dot product x3h3 + x2h2 + x1h1 + x0h0 */
185 dot_0 = _dotp4h(x_3210, h_3210);
187 /* Sum each dot_X register to form each results complete sum */
188 sum_7 += dot_7; /* Add xAh3 + x9h2 + x8h1 + x7h0 */
189 sum_6 += dot_6; /* Add x9h3 + x8h2 + x7h1 + x6h0 */
190 sum_5 += dot_5; /* Add x8h3 + x7h2 + x6h1 + x5h0 */
191 sum_4 += dot_4; /* Add x7h3 + x6h2 + x5h1 + x4h0 */
192 sum_3 += dot_3; /* Add x6h3 + x5h2 + x4h1 + x3h0 */
193 sum_2 += dot_2; /* Add x5h3 + x4h2 + x3h1 + x2h0 */
194 sum_1 += dot_1; /* Add x4h3 + x3h2 + x2h1 + x1h0 */
195 sum_0 += dot_0; /* Add x3h3 + x2h2 + x1h1 + x0h0 */
196 }
198 /* Shift accumulators up 1 into upper halfword, for Q15 math and
199 * pack results together so that 8 output samples may be stored
200 * as a double word minimizing the number of memory operations. */
201 r_10 = _packh2(sum_1 << 1, sum_0 << 1);
202 r_32 = _packh2(sum_3 << 1, sum_2 << 1);
203 r_54 = _packh2(sum_5 << 1, sum_4 << 1);
204 r_76 = _packh2(sum_7 << 1, sum_6 << 1);
206 /* Store out 4 output samples at a time using STDW */
207 _amem8(&r[j]) = _itoll(r_32, r_10);
208 _amem8(&r[j+4]) = _itoll(r_76, r_54);
209 }
210 }
212 /*-----------------------------------------------------------*/
213 /* Big Endian version */
214 /*-----------------------------------------------------------*/
215 #else
216 void DSP_fir_gen_hM17_rA8X8 (
217 const short *restrict x, /* Input array [nr+nh-1 elements] */
218 const short *restrict h, /* Coeff array [nh elements] */
219 short *restrict r, /* Output array [nr elements] */
220 int nh, /* Number of coefficients */
221 int nr /* Number of output samples */
222 )
223 {
224 int i, j, mask;
225 int h_01, h_23;
226 int r_01, r_23, r_45, r_67;
227 int dot_0, dot_1, dot_2, dot_3, dot_4, dot_5, dot_6, dot_7;
228 int sum_0, sum_1, sum_2, sum_3, sum_4, sum_5, sum_6, sum_7;
230 long long x_0123, x_1234, x_2345, x_3456, x_4567, x_5678, x_6789, x_789A, x_89AB;
231 long long h_0123, h_0123_mod;
233 /* Pad the end of the filter tap array with zeros in order make the
234 * array length a multiple of 4. This allows the loop to be optimized. */
235 mask = nh & 3;
236 if (!mask) mask = 4;
238 h_0123 = _mem8_const(&h[nh - mask]);
239 h_01 = _hill(h_0123);
240 h_23 = _loll(h_0123);
242 if (mask == 3) {
243 /* Mask out the last 16 bits (1 short) */
244 h_23 &= 0xFFFF0000;
245 }
246 if (mask == 2) {
247 /* Mask out the last 32 bits (2 shorts) */
248 h_23 = 0;
249 }
250 if (mask == 1) {
251 /* Mask out the last 48 bits (3 shorts) */
252 h_23 = 0;
253 h_01 &= 0xFFFF0000;
254 }
256 /* Modified taps to be used during the the filter tap loop */
257 h_0123_mod = _itoll(h_01, h_23);
259 _nassert(nr % 8 == 0);
260 _nassert(nr >= 8);
261 for (j = 0; j < nr; j += 8) {
262 sum_0 = 0;
263 sum_1 = 0;
264 sum_2 = 0;
265 sum_3 = 0;
266 sum_4 = 0;
267 sum_5 = 0;
268 sum_6 = 0;
269 sum_7 = 0;
271 /* Loop through the number of coefficients, 4 at a time
272 * summing all dot products together to form 8 results */
273 #pragma MUST_ITERATE(1,,1)
274 for (i = 0; i < nh; i += 4) {
275 /* Load the four elements of the coefficient array using aligned
276 * double word wide load */
277 h_0123 = _amem8_const(&h[i]);
279 /* Use modified taps during the last iteration of the loop. */
280 if (i >= nh - 4) {
281 h_0123 = h_0123_mod;
282 }
284 /* Load the seven elements of the data array using aligned
285 * double word wide loads */
286 x_0123 = _amem8_const(&x[i + j]);
287 x_4567 = _amem8_const(&x[i + j + 4]);
288 x_89AB = _amem8_const(&x[i + j + 8]);
290 /* Form result 2's and 6's filter taps */
291 x_2345 = _dmv(_loll(x_0123),_hill(x_4567));
292 x_6789 = _dmv(_loll(x_4567),_hill(x_89AB));
294 /* Form result 1's and 5's filter taps */
295 x_1234 = _mem8_const(&x[i + j + 1]);
296 x_5678 = _dpacklh2(x_4567, x_6789);
298 /* Load result 3's and 7's filter taps */
299 x_3456 = _mem8_const(&x[i + j + 3]);
300 x_789A = _dpacklh2(x_6789, x_89AB);
302 /* Compute result 3 dot product x7h0 + x8h1 + x9h2 + xAh3 */
303 dot_7 = _dotp4h(x_789A, h_0123);
304 /* Compute result 2 dot product x6h0 + x7h1 + x8h2 + x9h3 */
305 dot_6 = _dotp4h(x_6789, h_0123);
306 /* Compute result 1 dot product x5h0 + x6h1 + x7h2 + x8h3 */
307 dot_5 = _dotp4h(x_5678, h_0123);
308 /* Compute result 0 dot product x4h0 + x5h1 + x6h2 + x7h3 */
309 dot_4 = _dotp4h(x_4567, h_0123);
310 /* Compute result 3 dot product x3h0 + x4h1 + x5h2 + x6h3 */
311 dot_3 = _dotp4h(x_3456, h_0123);
312 /* Compute result 2 dot product x2h0 + x3h1 + x4h2 + x5h3 */
313 dot_2 = _dotp4h(x_2345, h_0123);
314 /* Compute result 1 dot product x1h0 + x2h1 + x3h2 + x4h3 */
315 dot_1 = _dotp4h(x_1234, h_0123);
316 /* Compute result 0 dot product x0h0 + x1h1 + x2h2 + x3h3 */
317 dot_0 = _dotp4h(x_0123, h_0123);
319 /* Sum each ddot_X register to form each results complete sum */
320 sum_7 += dot_7; /* Add x7h0 + x8h1 + x9h2 + xAh3 */
321 sum_6 += dot_6; /* Add x6h0 + x7h1 + x8h2 + x9h3 */
322 sum_5 += dot_5; /* Add x5h0 + x6h1 + x7h2 + x8h3 */
323 sum_4 += dot_4; /* Add x4h0 + x5h1 + x6h2 + x7h3 */
324 sum_3 += dot_3; /* Add x3h0 + x4h1 + x5h2 + x6h3 */
325 sum_2 += dot_2; /* Add x2h0 + x3h1 + x4h2 + x5h3 */
326 sum_1 += dot_1; /* Add x1h0 + x2h1 + x3h2 + x4h3 */
327 sum_0 += dot_0; /* Add x0h0 + x1h1 + x2h2 + x3h3 */
328 }
330 /* Shift accumulators up 1 into upper halfword, for Q15 math and
331 * pack results together so that 8 output samples may be stored
332 * as a double word minimizing the number of memory operations. */
333 r_01 = _packh2(sum_0 << 1, sum_1 << 1);
334 r_23 = _packh2(sum_2 << 1, sum_3 << 1);
335 r_45 = _packh2(sum_4 << 1, sum_5 << 1);
336 r_67 = _packh2(sum_6 << 1, sum_7 << 1);
338 /* Store out 4 output samples at a time using STDW */
339 _amem8(&r[j]) = _itoll(r_01, r_23);
340 _amem8(&r[j+4]) = _itoll(r_45, r_67);
341 }
342 }
343 #endif
345 /* ======================================================================= */
346 /* End of file: DSP_fir_gen_hM17_rA8X8.c */
347 /* ----------------------------------------------------------------------- */
348 /* Copyright (c) 2011 Texas Instruments, Incorporated. */
349 /* All Rights Reserved. */
350 /* ======================================================================= */