Gitweb @ Texas Instruments - Open Source Git Repositories - git.TI.com/gitweb - ep-processor-libraries/dsplib.git/blob - ti/dsplib/src/DSP_fir_cplx_hM4X4/c66/DSP_fir_cplx_hM4X4.c
DSPLIB: optimized signal processing functions for TI DSPs
[ep-processor-libraries/dsplib.git] / ti / dsplib / src / DSP_fir_cplx_hM4X4 / c66 / DSP_fir_cplx_hM4X4.c
1 /*======================================================================= */
2 /*  TEXAS INSTRUMENTS, INC.                                                */
3 /*                                                                         */
4 /*  DSPLIB  DSP Signal Processing Library                                  */
5 /*                                                                         */
6 /*  This library contains proprietary intellectual property of Texas       */
7 /*  Instruments, Inc.  The library and its source code are protected by    */
8 /*  various copyrights, and portions may also be protected by patents or   */
9 /*  other legal protections.                                               */
10 /*                                                                         */
11 /*  This software is licensed for use with Texas Instruments TMS320        */
12 /*  family DSPs.  This license was provided to you prior to installing     */
13 /*  the software.  You may review this license by consulting the file      */
14 /*  TI_license.PDF which accompanies the files in this library.            */
15 /*                                                                         */
16 /* ----------------------------------------------------------------------- */
17 /*                                                                         */
18 /* DSP_fir_cplx_hM4X4.c -- Complex FIR Filter                              */
19 /*                   Optimized C Implementation (w/ Intrinsics)            */
20 /*                                                                         */
21 /*  Usage                                                                  */
22 /*     This routine is C-callable and can be called as:                    */
23 /*                                                                         */
24 /*     void DSP_fir_cplx_hM4X4 (                                           */
25 /*         const short *restrict x,                                        */
26 /*         const short *restrict h,                                        */
27 /*         short *restrict r,                                              */
28 /*         int nh,                                                         */
29 /*         int nr                                                          */
30 /*     )                                                                   */
31 /*                                                                         */
32 /*     x[2*(nr+nh-1)] : Complex input data. x must point to x[2*(nh-1)].   */
33 /*     h[2*nh]        : Complex coefficients (in normal order).            */
34 /*     r[2*nr]        : Complex output data.                               */
35 /*     nh             : Number of complex coefficients.                    */
36 /*     nr             : Number of complex output samples.                  */
37 /*                                                                         */
38 /*  Description                                                            */
39 /*      This complex FIR computes nr complex output samples using nh       */
40 /*      complex coefficients. It operates on 16-bit data with a 32-bit     */
41 /*      accumulate. Each array consists of an even and odd term with even  */
42 /*      terms representing the real part of the element and the odd terms  */
43 /*      the imaginary part. The pointer to input array x must point to the */
44 /*      (nh)th complex sample, i.e. element 2*(nh-1), upon entry to the    */
45 /*      function. The coefficients are expected in normal order.           */
46 /*                                                                         */
47 /*  Assumptions                                                            */
48 /*     Arrays x, h, and r do not overlap                                   */
49 /*     nr >= 8; nr % 4 == 0                                                */
50 /*     nh >= 4; nh % 4 == 0                                                */
51 /*                                                                         */
52 /* Copyright (C) 2011 Texas Instruments Incorporated - http://www.ti.com/  */ 
53 /*                                                                         */
54 /*                                                                         */
55 /*  Redistribution and use in source and binary forms, with or without     */
56 /*  modification, are permitted provided that the following conditions     */
57 /*  are met:                                                               */
58 /*                                                                         */
59 /*    Redistributions of source code must retain the above copyright       */
60 /*    notice, this list of conditions and the following disclaimer.        */
61 /*                                                                         */
62 /*    Redistributions in binary form must reproduce the above copyright    */
63 /*    notice, this list of conditions and the following disclaimer in the  */
64 /*    documentation and/or other materials provided with the               */
65 /*    distribution.                                                        */
66 /*                                                                         */
67 /*    Neither the name of Texas Instruments Incorporated nor the names of  */
68 /*    its contributors may be used to endorse or promote products derived  */
69 /*    from this software without specific prior written permission.        */
70 /*                                                                         */
71 /*  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS    */
72 /*  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT      */
73 /*  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR  */
74 /*  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT   */
75 /*  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,  */
76 /*  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT       */
77 /*  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,  */
78 /*  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY  */
79 /*  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT    */
80 /*  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE  */
81 /*  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.   */
82 /*                                                                         */
83 /* ======================================================================= */
85 #pragma CODE_SECTION(DSP_fir_cplx_hM4X4, ".text:optimized");
87 #include "DSP_fir_cplx_hM4X4.h"
88 #ifdef __TI_COMPILER_VERSION__
89 #include "c6x.h"
90 #endif
92 #ifdef _LITTLE_ENDIAN
93 void DSP_fir_cplx_hM4X4 (
94     const short *restrict x,    /* Input array [nr+nh-1 elements] */
95     const short *restrict h,    /* Coeff array [nh elements]      */
96     short       *restrict r,    /* Output array [nr elements]     */
97     int nh,                     /* Number of coefficients         */
98     int nr                      /* Number of output samples       */
99 )
101     int i, j, imag_real_0, imag_real_1, imag_real_2, imag_real_3;    
102     long long h_3210, x_3210,x_7654,x_ba98;
103     long long real0imag0, real1imag1, real2imag2, real3imag3;  
104     __x128_t  x_54_32_76_54, x_98_76_ba_98, re1im1re0im0, re3im3re2im2;
106     /*--------------------------------------------------------------------*/
107     /* _nasserts are used to inform the compiler that the input, filter,  */
108     /* output arrays are word or double word aligned. In addition the  #  */
109     /* filter taps and output samples is stated to be even.               */
110     /*--------------------------------------------------------------------*/
111     _nassert((int)nr >= 8);
112     _nassert((int)nr % 4 == 0);
113     _nassert((int)nh >= 4);
114     _nassert((int)nh % 4 == 0);
116     /*--------------------------------------------------------------------*/
117     /* Inform the compiler that the following loop will iterate at least  */
118     /* twice and that the # output samples is a multiple of 4.            */
119     /*--------------------------------------------------------------------*/
120     #pragma MUST_ITERATE(2,,1)
121     for (i = 0; i < 2*nr; i += 8) {
122         /*----------------------------------------------------------------*/
123         /* Zero out accumulators for 4 complex output samples             */
124         /*----------------------------------------------------------------*/
125         real0imag0 = real1imag1=0;
126         real2imag2 = real3imag3=0;
127         
128         x_ba98 = _mem8((void *)&x[i+4]);
129         x_7654 = _mem8((void *)&x[i]);    
131         /*----------------------------------------------------------------*/
132         /* Inform compiler that filter taps is at least 4, and a multiple */
133         /* of 4.                                                          */
134         /*----------------------------------------------------------------*/
135         _nassert((int)nr >= 8);
136         _nassert((int)nr % 4 == 0);
137         _nassert((int)nh >= 4);
138         _nassert((int)nh % 4 == 0);
140         #pragma MUST_ITERATE(2,,2)
141         #pragma UNROLL(2)
142         for (j = 0; j < 2*nh; j += 4) {
143             /*------------------------------------------------------------*/
144             /* Perform double word loads using intrinsic                  */   
145             /*------------------------------------------------------------*/
146             h_3210 = _amem8((void *)&h[j]);
148             /*------------------------------------------------------------*/
149             /* Load input data using Double word loads.                   */
150             /*------------------------------------------------------------*/
151             x_3210 = _mem8((void *)&x[i - j - 4]);
152               
153             /*------------------------------------------------------------*/
154             /* Create 2*2 complex matrix for _cmatmpy intrinsic           */
155             /* Perform complex matrix multiply using _cmatmpy             */
156             /*------------------------------------------------------------*/
157             x_54_32_76_54 = _llto128(_dmv(_loll(x_7654),_hill(x_3210)),x_7654); 
158             re1im1re0im0  = _cmatmpy(h_3210,x_54_32_76_54);
160             /*------------------------------------------------------------*/
161             /* Create 2*2 complex matrix for _cmatmpy intrinsic           */
162             /* Perform complex matrix multiply using _cmatmpy             */
163             /*------------------------------------------------------------*/            
164             x_98_76_ba_98 = _llto128(_dmv(_loll(x_ba98),_hill(x_7654)),x_ba98);
165             re3im3re2im2  = _cmatmpy(h_3210,x_98_76_ba_98);
166             
167             /*------------------------------------------------------------*/
168             /* Accumalate 4 complex output using _dadd()                  */
169             /*------------------------------------------------------------*/
170             real0imag0 = _dadd(real0imag0,_lo128(re1im1re0im0));
171             real1imag1 = _dadd(real1imag1,_hi128(re1im1re0im0));
172             real2imag2 = _dadd(real2imag2,_lo128(re3im3re2im2));
173             real3imag3 = _dadd(real3imag3,_hi128(re3im3re2im2));            
174              
175             /*------------------------------------------------------------*/
176             /* Save inputs for the next iteration                         */
177             /*------------------------------------------------------------*/                            
178             x_ba98 = x_7654;        
179             x_7654 = x_3210;
180         }
182         /*----------------------------------------------------------------*/
183         /*  Shift out accumulated sum, pack and store as double words     */
184         /*----------------------------------------------------------------*/
185         real0imag0 = _dshl(real0imag0,1);
186         real1imag1 = _dshl(real1imag1,1);
187         real2imag2 = _dshl(real2imag2,1);
188         real3imag3 = _dshl(real3imag3,1);  
189             
190         imag_real_0 = _packh2(_hill(real0imag0), _loll(real0imag0));
191         imag_real_1 = _packh2(_hill(real1imag1), _loll(real1imag1));
192         imag_real_2 = _packh2(_hill(real2imag2), _loll(real2imag2));
193         imag_real_3 = _packh2(_hill(real3imag3), _loll(real3imag3));
194                 
195         _amem8(&r[i])   = _dcrot270(_itoll(imag_real_1, imag_real_0));
196         _amem8(&r[i+4]) = _dcrot270(_itoll(imag_real_3, imag_real_2));        
197     }
200 /*-----------------------------------------------------------*/
201 /*  Big Endian version                                       */
202 /*-----------------------------------------------------------*/
203 #else
204 void DSP_fir_cplx_hM4X4 (
205     const short *restrict x,    /* Input array [nr+nh-1 elements] */
206     const short *restrict h,    /* Coeff array [nh elements]      */
207     short       *restrict r,    /* Output array [nr elements]     */
208     int nh,                     /* Number of coefficients         */
209     int nr                      /* Number of output samples       */
212     int i, j, real_imag_0, real_imag_1, real_imag_2, real_imag_3;    
213     long long h_0123, x_0123 ,x_4567 ,x_89ab;
214     long long imag0real0, imag1real1, imag2real2, imag3real3;  
215     __x128_t  x_45_67_23_45, x_89_ab_67_89, im0re0im1re1, im2re2im3re3;  
217     /*--------------------------------------------------------------------*/
218     /* _nasserts are used to inform the compiler that the input, filter,  */
219     /* output arrays are word or double word aligned. In addition the  #  */
220     /* filter taps and output samples is stated to be even.               */
221     /*--------------------------------------------------------------------*/
222     _nassert((int)nr >= 8);
223     _nassert((int)nr % 4 == 0);
224     _nassert((int)nh >= 4);
225     _nassert((int)nh % 4 == 0);
227     /*--------------------------------------------------------------------*/
228     /* Inform the compiler that the following loop will iterate at least  */
229     /* twice and that the # output samples is a multiple of 4.            */
230     /*--------------------------------------------------------------------*/
231     #pragma MUST_ITERATE(2,,1)
232     for (i = 0; i < 2*nr; i += 8) {
233         /*----------------------------------------------------------------*/
234         /* Zero out accumulators for 4 complex output samples             */
235         /*----------------------------------------------------------------*/
236         imag0real0 = imag1real1=0;
237         imag2real2 = imag3real3=0;
238         
239         x_89ab = _mem8((void *)&x[i+4]);
240         x_4567 = _mem8((void *)&x[i]);    
242         /*----------------------------------------------------------------*/
243         /* Inform compiler that filter taps is at least 4, and a multiple */
244         /* of 4.                                                          */
245         /*----------------------------------------------------------------*/
246         _nassert((int)nr >= 8);
247         _nassert((int)nr % 4 == 0);
248         _nassert((int)nh >= 4);
249         _nassert((int)nh % 4 == 0);
251         #pragma MUST_ITERATE(2,,2)
252         #pragma UNROLL(2)
253         for (j = 0; j < 2*nh; j += 4) {
254             /*------------------------------------------------------------*/
255             /* Perform double word loads using intrinsic                  */   
256             /*------------------------------------------------------------*/
257             h_0123 = _amem8((void *)&h[j]);
258             /*------------------------------------------------------------*/
259             /* Load input data using Double word loads.                   */
260             /*------------------------------------------------------------*/
261             x_0123 = _mem8((void *)&x[i - j - 4]);
262               
263             /*------------------------------------------------------------*/
264             /* Create 2*2 complex matrix for _cmatmpy intrinsic           */
265             /* Perform complex matrix multiply using _cmatmpy             */
266             /*------------------------------------------------------------*/
267             x_45_67_23_45 = _llto128(x_4567,_dmv(_loll(x_0123),_hill(x_4567))); 
268             im0re0im1re1  = _cmatmpy(h_0123,x_45_67_23_45);
270             /*------------------------------------------------------------*/
271             /* Create 2*2 complex matrix for _cmatmpy intrinsic           */
272             /* Perform complex matrix multiply using _cmatmpy             */
273             /*------------------------------------------------------------*/            
274             x_89_ab_67_89 = _llto128(x_89ab,_dmv(_loll(x_4567),_hill(x_89ab)));
275             im2re2im3re3  = _cmatmpy(h_0123,x_89_ab_67_89);
276             
277             /*------------------------------------------------------------*/
278             /* Accumalate 4 complex output using _dadd()                  */
279             /*------------------------------------------------------------*/
280             imag0real0 = _dadd(imag0real0,_hi128(im0re0im1re1));
281             imag1real1 = _dadd(imag1real1,_lo128(im0re0im1re1));
282             imag2real2 = _dadd(imag2real2,_hi128(im2re2im3re3));
283             imag3real3 = _dadd(imag3real3,_lo128(im2re2im3re3));            
284              
285             /*------------------------------------------------------------*/
286             /* Save inputs for the next iteration                         */
287             /*------------------------------------------------------------*/                            
288             x_89ab = x_4567;        
289             x_4567 = x_0123;
290         }
292         /*----------------------------------------------------------------*/
293         /*  Shift out accumulated sum, pack and store as double words     */
294         /*----------------------------------------------------------------*/
295         imag0real0 = _dshl(imag0real0,1);
296         imag1real1 = _dshl(imag1real1,1);
297         imag2real2 = _dshl(imag2real2,1);
298         imag3real3 = _dshl(imag3real3,1);  
299               
300         real_imag_0 = _packh2(_hill(imag0real0), _loll(imag0real0));
301         real_imag_1 = _packh2(_hill(imag1real1), _loll(imag1real1));
302         real_imag_2 = _packh2(_hill(imag2real2), _loll(imag2real2));
303         real_imag_3 = _packh2(_hill(imag3real3), _loll(imag3real3));
304                                                                      
305         _amem8(&r[i])   = _itoll(real_imag_0, real_imag_1);
306         _amem8(&r[i+4]) = _itoll(real_imag_2, real_imag_3);
307     }
309 #endif
311 /* ======================================================================= */
312 /*  End of file:  DSP_fir_cplx_hM4X4.c                                     */
313 /* ----------------------------------------------------------------------- */
314 /*            Copyright (c) 2011 Texas Instruments, Incorporated.          */
315 /*                           All Rights Reserved.                          */
316 /* ======================================================================= */