ti/dsplib/src/DSP_fir_cplx_hM4X4/c66/DSP_fir_cplx_hM4X4.c

   1 /*======================================================================= */
   2 /*  TEXAS INSTRUMENTS, INC.                                                */
   3 /*                                                                         */
   4 /*  DSPLIB  DSP Signal Processing Library                                  */
   5 /*                                                                         */
   6 /*  This library contains proprietary intellectual property of Texas       */
   7 /*  Instruments, Inc.  The library and its source code are protected by    */
   8 /*  various copyrights, and portions may also be protected by patents or   */
   9 /*  other legal protections.                                               */
  10 /*                                                                         */
  11 /*  This software is licensed for use with Texas Instruments TMS320        */
  12 /*  family DSPs.  This license was provided to you prior to installing     */
  13 /*  the software.  You may review this license by consulting the file      */
  14 /*  TI_license.PDF which accompanies the files in this library.            */
  15 /*                                                                         */
  16 /* ----------------------------------------------------------------------- */
  17 /*                                                                         */
  18 /* DSP_fir_cplx_hM4X4.c -- Complex FIR Filter                              */
  19 /*                   Optimized C Implementation (w/ Intrinsics)            */
  20 /*                                                                         */
  21 /*  Usage                                                                  */
  22 /*     This routine is C-callable and can be called as:                    */
  23 /*                                                                         */
  24 /*     void DSP_fir_cplx_hM4X4 (                                           */
  25 /*         const short *restrict x,                                        */
  26 /*         const short *restrict h,                                        */
  27 /*         short *restrict r,                                              */
  28 /*         int nh,                                                         */
  29 /*         int nr                                                          */
  30 /*     )                                                                   */
  31 /*                                                                         */
  32 /*     x[2*(nr+nh-1)] : Complex input data. x must point to x[2*(nh-1)].   */
  33 /*     h[2*nh]        : Complex coefficients (in normal order).            */
  34 /*     r[2*nr]        : Complex output data.                               */
  35 /*     nh             : Number of complex coefficients.                    */
  36 /*     nr             : Number of complex output samples.                  */
  37 /*                                                                         */
  38 /*  Description                                                            */
  39 /*      This complex FIR computes nr complex output samples using nh       */
  40 /*      complex coefficients. It operates on 16-bit data with a 32-bit     */
  41 /*      accumulate. Each array consists of an even and odd term with even  */
  42 /*      terms representing the real part of the element and the odd terms  */
  43 /*      the imaginary part. The pointer to input array x must point to the */
  44 /*      (nh)th complex sample, i.e. element 2*(nh-1), upon entry to the    */
  45 /*      function. The coefficients are expected in normal order.           */
  46 /*                                                                         */
  47 /*  Assumptions                                                            */
  48 /*     Arrays x, h, and r do not overlap                                   */
  49 /*     nr >= 8; nr % 4 == 0                                                */
  50 /*     nh >= 4; nh % 4 == 0                                                */
  51 /*                                                                         */
  52 /* Copyright (C) 2011 Texas Instruments Incorporated - http://www.ti.com/  */
  53 /*                                                                         */
  54 /*                                                                         */
  55 /*  Redistribution and use in source and binary forms, with or without     */
  56 /*  modification, are permitted provided that the following conditions     */
  57 /*  are met:                                                               */
  58 /*                                                                         */
  59 /*    Redistributions of source code must retain the above copyright       */
  60 /*    notice, this list of conditions and the following disclaimer.        */
  61 /*                                                                         */
  62 /*    Redistributions in binary form must reproduce the above copyright    */
  63 /*    notice, this list of conditions and the following disclaimer in the  */
  64 /*    documentation and/or other materials provided with the               */
  65 /*    distribution.                                                        */
  66 /*                                                                         */
  67 /*    Neither the name of Texas Instruments Incorporated nor the names of  */
  68 /*    its contributors may be used to endorse or promote products derived  */
  69 /*    from this software without specific prior written permission.        */
  70 /*                                                                         */
  71 /*  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS    */
  72 /*  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT      */
  73 /*  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR  */
  74 /*  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT   */
  75 /*  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,  */
  76 /*  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT       */
  77 /*  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,  */
  78 /*  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY  */
  79 /*  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT    */
  80 /*  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE  */
  81 /*  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.   */
  82 /*                                                                         */
  83 /* ======================================================================= */
  84
  85 #pragma CODE_SECTION(DSP_fir_cplx_hM4X4, ".text:optimized");
  86
  87 #include "DSP_fir_cplx_hM4X4.h"
  88 #ifdef __TI_COMPILER_VERSION__
  89 #include "c6x.h"
  90 #endif
  91
  92 #ifdef _LITTLE_ENDIAN
  93 void DSP_fir_cplx_hM4X4 (
  94     const short *restrict x,    /* Input array [nr+nh-1 elements] */
  95     const short *restrict h,    /* Coeff array [nh elements]      */
  96     short       *restrict r,    /* Output array [nr elements]     */
  97     int nh,                     /* Number of coefficients         */
  98     int nr                      /* Number of output samples       */
  99 )
 100 {
 101     int i, j, imag_real_0, imag_real_1, imag_real_2, imag_real_3;
 102     long long h_3210, x_3210,x_7654,x_ba98;
 103     long long real0imag0, real1imag1, real2imag2, real3imag3;
 104     __x128_t  x_54_32_76_54, x_98_76_ba_98, re1im1re0im0, re3im3re2im2;
 105
 106     /*--------------------------------------------------------------------*/
 107     /* _nasserts are used to inform the compiler that the input, filter,  */
 108     /* output arrays are word or double word aligned. In addition the  #  */
 109     /* filter taps and output samples is stated to be even.               */
 110     /*--------------------------------------------------------------------*/
 111     _nassert((int)nr >= 8);
 112     _nassert((int)nr % 4 == 0);
 113     _nassert((int)nh >= 4);
 114     _nassert((int)nh % 4 == 0);
 115
 116     /*--------------------------------------------------------------------*/
 117     /* Inform the compiler that the following loop will iterate at least  */
 118     /* twice and that the # output samples is a multiple of 4.            */
 119     /*--------------------------------------------------------------------*/
 120     #pragma MUST_ITERATE(2,,1)
 121     for (i = 0; i < 2*nr; i += 8) {
 122         /*----------------------------------------------------------------*/
 123         /* Zero out accumulators for 4 complex output samples             */
 124         /*----------------------------------------------------------------*/
 125         real0imag0 = real1imag1=0;
 126         real2imag2 = real3imag3=0;
 127
 128         x_ba98 = _mem8((void *)&x[i+4]);
 129         x_7654 = _mem8((void *)&x[i]);
 130
 131         /*----------------------------------------------------------------*/
 132         /* Inform compiler that filter taps is at least 4, and a multiple */
 133         /* of 4.                                                          */
 134         /*----------------------------------------------------------------*/
 135         _nassert((int)nr >= 8);
 136         _nassert((int)nr % 4 == 0);
 137         _nassert((int)nh >= 4);
 138         _nassert((int)nh % 4 == 0);
 139
 140         #pragma MUST_ITERATE(2,,2)
 141         #pragma UNROLL(2)
 142         for (j = 0; j < 2*nh; j += 4) {
 143             /*------------------------------------------------------------*/
 144             /* Perform double word loads using intrinsic                  */
 145             /*------------------------------------------------------------*/
 146             h_3210 = _amem8((void *)&h[j]);
 147
 148             /*------------------------------------------------------------*/
 149             /* Load input data using Double word loads.                   */
 150             /*------------------------------------------------------------*/
 151             x_3210 = _mem8((void *)&x[i - j - 4]);
 152
 153             /*------------------------------------------------------------*/
 154             /* Create 2*2 complex matrix for _cmatmpy intrinsic           */
 155             /* Perform complex matrix multiply using _cmatmpy             */
 156             /*------------------------------------------------------------*/
 157             x_54_32_76_54 = _llto128(_dmv(_loll(x_7654),_hill(x_3210)),x_7654);
 158             re1im1re0im0  = _cmatmpy(h_3210,x_54_32_76_54);
 159
 160             /*------------------------------------------------------------*/
 161             /* Create 2*2 complex matrix for _cmatmpy intrinsic           */
 162             /* Perform complex matrix multiply using _cmatmpy             */
 163             /*------------------------------------------------------------*/
 164             x_98_76_ba_98 = _llto128(_dmv(_loll(x_ba98),_hill(x_7654)),x_ba98);
 165             re3im3re2im2  = _cmatmpy(h_3210,x_98_76_ba_98);
 166
 167             /*------------------------------------------------------------*/
 168             /* Accumalate 4 complex output using _dadd()                  */
 169             /*------------------------------------------------------------*/
 170             real0imag0 = _dadd(real0imag0,_lo128(re1im1re0im0));
 171             real1imag1 = _dadd(real1imag1,_hi128(re1im1re0im0));
 172             real2imag2 = _dadd(real2imag2,_lo128(re3im3re2im2));
 173             real3imag3 = _dadd(real3imag3,_hi128(re3im3re2im2));
 174
 175             /*------------------------------------------------------------*/
 176             /* Save inputs for the next iteration                         */
 177             /*------------------------------------------------------------*/
 178             x_ba98 = x_7654;
 179             x_7654 = x_3210;
 180         }
 181
 182         /*----------------------------------------------------------------*/
 183         /*  Shift out accumulated sum, pack and store as double words     */
 184         /*----------------------------------------------------------------*/
 185         real0imag0 = _dshl(real0imag0,1);
 186         real1imag1 = _dshl(real1imag1,1);
 187         real2imag2 = _dshl(real2imag2,1);
 188         real3imag3 = _dshl(real3imag3,1);
 189
 190         imag_real_0 = _packh2(_hill(real0imag0), _loll(real0imag0));
 191         imag_real_1 = _packh2(_hill(real1imag1), _loll(real1imag1));
 192         imag_real_2 = _packh2(_hill(real2imag2), _loll(real2imag2));
 193         imag_real_3 = _packh2(_hill(real3imag3), _loll(real3imag3));
 194
 195         _amem8(&r[i])   = _dcrot270(_itoll(imag_real_1, imag_real_0));
 196         _amem8(&r[i+4]) = _dcrot270(_itoll(imag_real_3, imag_real_2));
 197     }
 198 }
 199
 200 /*-----------------------------------------------------------*/
 201 /*  Big Endian version                                       */
 202 /*-----------------------------------------------------------*/
 203 #else
 204 void DSP_fir_cplx_hM4X4 (
 205     const short *restrict x,    /* Input array [nr+nh-1 elements] */
 206     const short *restrict h,    /* Coeff array [nh elements]      */
 207     short       *restrict r,    /* Output array [nr elements]     */
 208     int nh,                     /* Number of coefficients         */
 209     int nr                      /* Number of output samples       */
 210 )
 211 {
 212     int i, j, real_imag_0, real_imag_1, real_imag_2, real_imag_3;
 213     long long h_0123, x_0123 ,x_4567 ,x_89ab;
 214     long long imag0real0, imag1real1, imag2real2, imag3real3;
 215     __x128_t  x_45_67_23_45, x_89_ab_67_89, im0re0im1re1, im2re2im3re3;
 216
 217     /*--------------------------------------------------------------------*/
 218     /* _nasserts are used to inform the compiler that the input, filter,  */
 219     /* output arrays are word or double word aligned. In addition the  #  */
 220     /* filter taps and output samples is stated to be even.               */
 221     /*--------------------------------------------------------------------*/
 222     _nassert((int)nr >= 8);
 223     _nassert((int)nr % 4 == 0);
 224     _nassert((int)nh >= 4);
 225     _nassert((int)nh % 4 == 0);
 226
 227     /*--------------------------------------------------------------------*/
 228     /* Inform the compiler that the following loop will iterate at least  */
 229     /* twice and that the # output samples is a multiple of 4.            */
 230     /*--------------------------------------------------------------------*/
 231     #pragma MUST_ITERATE(2,,1)
 232     for (i = 0; i < 2*nr; i += 8) {
 233         /*----------------------------------------------------------------*/
 234         /* Zero out accumulators for 4 complex output samples             */
 235         /*----------------------------------------------------------------*/
 236         imag0real0 = imag1real1=0;
 237         imag2real2 = imag3real3=0;
 238
 239         x_89ab = _mem8((void *)&x[i+4]);
 240         x_4567 = _mem8((void *)&x[i]);
 241
 242         /*----------------------------------------------------------------*/
 243         /* Inform compiler that filter taps is at least 4, and a multiple */
 244         /* of 4.                                                          */
 245         /*----------------------------------------------------------------*/
 246         _nassert((int)nr >= 8);
 247         _nassert((int)nr % 4 == 0);
 248         _nassert((int)nh >= 4);
 249         _nassert((int)nh % 4 == 0);
 250
 251         #pragma MUST_ITERATE(2,,2)
 252         #pragma UNROLL(2)
 253         for (j = 0; j < 2*nh; j += 4) {
 254             /*------------------------------------------------------------*/
 255             /* Perform double word loads using intrinsic                  */
 256             /*------------------------------------------------------------*/
 257             h_0123 = _amem8((void *)&h[j]);
 258             /*------------------------------------------------------------*/
 259             /* Load input data using Double word loads.                   */
 260             /*------------------------------------------------------------*/
 261             x_0123 = _mem8((void *)&x[i - j - 4]);
 262
 263             /*------------------------------------------------------------*/
 264             /* Create 2*2 complex matrix for _cmatmpy intrinsic           */
 265             /* Perform complex matrix multiply using _cmatmpy             */
 266             /*------------------------------------------------------------*/
 267             x_45_67_23_45 = _llto128(x_4567,_dmv(_loll(x_0123),_hill(x_4567)));
 268             im0re0im1re1  = _cmatmpy(h_0123,x_45_67_23_45);
 269
 270             /*------------------------------------------------------------*/
 271             /* Create 2*2 complex matrix for _cmatmpy intrinsic           */
 272             /* Perform complex matrix multiply using _cmatmpy             */
 273             /*------------------------------------------------------------*/
 274             x_89_ab_67_89 = _llto128(x_89ab,_dmv(_loll(x_4567),_hill(x_89ab)));
 275             im2re2im3re3  = _cmatmpy(h_0123,x_89_ab_67_89);
 276
 277             /*------------------------------------------------------------*/
 278             /* Accumalate 4 complex output using _dadd()                  */
 279             /*------------------------------------------------------------*/
 280             imag0real0 = _dadd(imag0real0,_hi128(im0re0im1re1));
 281             imag1real1 = _dadd(imag1real1,_lo128(im0re0im1re1));
 282             imag2real2 = _dadd(imag2real2,_hi128(im2re2im3re3));
 283             imag3real3 = _dadd(imag3real3,_lo128(im2re2im3re3));
 284
 285             /*------------------------------------------------------------*/
 286             /* Save inputs for the next iteration                         */
 287             /*------------------------------------------------------------*/
 288             x_89ab = x_4567;
 289             x_4567 = x_0123;
 290         }
 291
 292         /*----------------------------------------------------------------*/
 293         /*  Shift out accumulated sum, pack and store as double words     */
 294         /*----------------------------------------------------------------*/
 295         imag0real0 = _dshl(imag0real0,1);
 296         imag1real1 = _dshl(imag1real1,1);
 297         imag2real2 = _dshl(imag2real2,1);
 298         imag3real3 = _dshl(imag3real3,1);
 299
 300         real_imag_0 = _packh2(_hill(imag0real0), _loll(imag0real0));
 301         real_imag_1 = _packh2(_hill(imag1real1), _loll(imag1real1));
 302         real_imag_2 = _packh2(_hill(imag2real2), _loll(imag2real2));
 303         real_imag_3 = _packh2(_hill(imag3real3), _loll(imag3real3));
 304
 305         _amem8(&r[i])   = _itoll(real_imag_0, real_imag_1);
 306         _amem8(&r[i+4]) = _itoll(real_imag_2, real_imag_3);
 307     }
 308 }
 309 #endif
 310
 311 /* ======================================================================= */
 312 /*  End of file:  DSP_fir_cplx_hM4X4.c                                     */
 313 /* ----------------------------------------------------------------------- */
 314 /*            Copyright (c) 2011 Texas Instruments, Incorporated.          */
 315 /*                           All Rights Reserved.                          */
 316 /* ======================================================================= */
 317