blis/kernels/c66x/3/bli_gemm_ukernels.c

   1 /*
   2
   3    BLIS
   4    An object-based framework for developing high-performance BLAS-like
   5    libraries.
   6
   7    Copyright (C) 2014, The University of Texas
   8
   9    Redistribution and use in source and binary forms, with or without
  10    modification, are permitted provided that the following conditions are
  11    met:
  12     - Redistributions of source code must retain the above copyright
  13       notice, this list of conditions and the following disclaimer.
  14     - Redistributions in binary form must reproduce the above copyright
  15       notice, this list of conditions and the following disclaimer in the
  16       documentation and/or other materials provided with the distribution.
  17     - Neither the name of The University of Texas nor the names of its
  18       contributors may be used to endorse or promote products derived
  19       from this software without specific prior written permission.
  20
  21    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  22    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  23    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  24    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  25    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  26    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  27    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  28    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  29    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  30    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  31    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  32
  33 */
  34 #include "blis.h"
  35
  36 //#define BLIS_ENABLE_CYCLE_COUNT
  37
  38 void bli_sgemm_ukernel_4x8(
  39                         dim_t              k,
  40                         float*    restrict alpha,
  41                         float*    restrict a,
  42                         float*    restrict b,
  43                         float*    restrict beta,
  44                         float*    restrict c, inc_t rs_c, inc_t cs_c,
  45                         auxinfo_t*         data
  46                       )
  47 {
  48           __float2_t sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
  49           __float2_t sum8, sum9, suma, sumb, sumc, sumd, sume, sumf;
  50           __float2_t * restrict ptrB     = (__float2_t *) b;
  51           __float2_t * restrict ptrA     = (__float2_t *) a;
  52           __float2_t * restrict ptrC;
  53           __float2_t regA2, regC, regS, regR;
  54           int_least16_t index;
  55           float* restrict c0, * restrict c1;
  56           __float2_t regB2;
  57
  58 #ifdef BLIS_ENABLE_CYCLE_COUNT
  59           volatile int counter_start;
  60           volatile int counter_end;
  61 #endif
  62
  63 #ifdef BLIS_ENABLE_CYCLE_COUNT
  64           TSCL=0;
  65           counter_start = TSCL;
  66 #endif
  67           //touch routine: both a & b
  68           //Length of b = NR*K*size of float;
  69 #ifdef BLIS_ENABLE_PREFETCH
  70           //touch(a, k*BLIS_DEFAULT_MR_S*4);
  71 #endif
  72
  73           // zero out accumulators
  74           sum0 = 0.0;
  75           sum1 = 0.0;
  76           sum2 = 0.0;
  77           sum3 = 0.0;
  78           sum4 = 0.0;
  79           sum5 = 0.0;
  80           sum6 = 0.0;
  81           sum7 = 0.0;
  82           sum8 = 0.0;
  83           sum9 = 0.0;
  84           suma = 0.0;
  85           sumb = 0.0;
  86           sumc = 0.0;
  87           sumd = 0.0;
  88           sume = 0.0;
  89           sumf = 0.0;
  90
  91
  92           for (index = 0; index < k; index++)
  93           { // loop over k;
  94             // each iteration performs rank one update of 4x1 by 1x8
  95             // matrices of A and B respectively; result is
  96             // accumulated over 4x8 matrix
  97                 __float2_t b01, b23, b45, b67, a01, a23;
  98                 __x128_t   reg128;
  99
 100             a01 = *ptrA++;
 101                 a23 = *ptrA++;
 102
 103             b01 = *ptrB++;
 104             b23 = *ptrB++;
 105             b45 = *ptrB++;
 106             b67 = *ptrB++;
 107
 108             reg128 = _cmpysp(a01, b01);
 109             // accumulate a[0]*b[1] and -a[0]*b[0]
 110             sum0 = _daddsp(sum0, _lof2_128(reg128));
 111             // accumulate a[1]*b[0] and a[1]*b[1]
 112             sum1 = _daddsp(sum1, _hif2_128(reg128));
 113
 114             reg128 = _cmpysp(a01, b23);
 115             // accumulate a[0]*b[3] and -a[0]*b[2]
 116             sum2 = _daddsp(sum2, _lof2_128(reg128));
 117             // accumulate a[1]*b[2] and a[1]*b[3]
 118             sum3 = _daddsp(sum3, _hif2_128(reg128));
 119
 120             reg128 = _cmpysp(a01, b45);
 121             // accumulate a[0]*b[5] and -a[0]*b[4]
 122             sum4 = _daddsp(sum4, _lof2_128(reg128));
 123             // accumulate a[1]*b[4] and a[1]*b[5]
 124             sum5 = _daddsp(sum5, _hif2_128(reg128));
 125
 126             reg128 = _cmpysp(a01, b67);
 127             // accumulate a[0]*b[7] and -a[0]*b[6]
 128             sum6 = _daddsp(sum6, _lof2_128(reg128));
 129             // accumulate a[1]*b[6] and a[1]*b[7]
 130             sum7 = _daddsp(sum7, _hif2_128(reg128));
 131
 132             reg128 = _cmpysp(a23, b01);
 133             // accumulate a[2]*b[1] and -a[2]*b[0]
 134             sum8 = _daddsp(sum8, _lof2_128(reg128));
 135             // accumulate a[3]*b[0] and a[3]*b[1]
 136             sum9 = _daddsp(sum9, _hif2_128(reg128));
 137
 138             reg128 = _cmpysp(a23, b23);
 139             // accumulate a[2]*b[3] and -a[2]*b[2]
 140             suma = _daddsp(suma, _lof2_128(reg128));
 141             // accumulate a[3]*b[2] and a[3]*b[3]
 142             sumb = _daddsp(sumb, _hif2_128(reg128));
 143
 144             reg128 = _cmpysp(a23, b45);
 145             // accumulate a[2]*b[5] and -a[2]*b[4]
 146             sumc = _daddsp(sumc, _lof2_128(reg128));
 147             // accumulate a[3]*b[4] and a[3]*b[5]
 148             sumd = _daddsp(sumd, _hif2_128(reg128));
 149
 150             reg128 = _cmpysp(a23, b67);
 151             // accumulate a[2]*b[7] and -a[2]*b[6]
 152             sume = _daddsp(sume, _lof2_128(reg128));
 153             // accumulate a[3]*b[6] and a[3]*b[7]
 154             sumf = _daddsp(sumf, _hif2_128(reg128));
 155
 156           }
 157
 158 #ifdef BLIS_ENABLE_CYCLE_COUNT
 159           counter_end=TSCL;
 160           if (lib_get_coreID () == 0)
 161                   printf("%d\t%d\t",k, counter_end-counter_start);
 162 #endif
 163
 164 #ifdef BLIS_ENABLE_CYCLE_COUNT
 165           TSCL=0;
 166           counter_start = TSCL;
 167 #endif
 168             regA2 = _ftof2(*alpha, *alpha);
 169             regB2 = _ftof2(*beta, *beta);
 170             if (rs_c != 1)
 171             {
 172                 // update c[0,0] and c[1,0]
 173                 c0 = (c + 0*rs_c + 0*cs_c);
 174                 c1 = (c + 1*rs_c + 0*cs_c);
 175                 regC = _ftof2(*c1,*c0);
 176                 regC = _dmpysp(regC, regB2);
 177                 regS = _ftof2(_lof2(sum1),-_hif2(sum0));
 178                 regR = _daddsp(_dmpysp(regA2, regS), regC);
 179                 *c0 = _lof2(regR);
 180                 *c1 = _hif2(regR);
 181
 182                 //update c[0,1] and c[1,1]
 183                 c0 = c0 + 1*cs_c;
 184                 c1 = c1 + 1*cs_c;
 185                 regC = _ftof2(*c1,*c0);
 186                 regC = _dmpysp(regC, regB2);
 187                 regS = _ftof2(_hif2(sum1),_lof2(sum0));
 188                 regR = _daddsp(_dmpysp(regA2, regS), regC);
 189                 *c0 = _lof2(regR);
 190                 *c1 = _hif2(regR);
 191
 192                 //update c[0,2] and c[1,2]
 193                 c0 = c0 + 1*cs_c;
 194                 c1 = c1 + 1*cs_c;
 195                 regC = _ftof2(*c1,*c0);
 196                 regC = _dmpysp(regC, regB2);
 197                 regS = _ftof2(_lof2(sum3),-_hif2(sum2));
 198                 regR = _daddsp(_dmpysp(regA2, regS), regC);
 199                 *c0 = _lof2(regR);
 200                 *c1 = _hif2(regR);
 201
 202                 //update  c[0,3] and c[1,3]
 203                 c0 = c0 + 1*cs_c;
 204                 c1 = c1 + 1*cs_c;
 205                 regC = _ftof2(*c1,*c0);
 206                 regC = _dmpysp(regC, regB2);
 207                 regS = _ftof2(_hif2(sum3),_lof2(sum2));
 208                 regR = _daddsp(_dmpysp(regA2, regS), regC);
 209                 *c0 = _lof2(regR);
 210                 *c1 = _hif2(regR);
 211
 212                 //update c[0,4] and c[1,4]
 213                 c0 = c0 + 1*cs_c;
 214                 c1 = c1 + 1*cs_c;
 215                 regC = _ftof2(*c1,*c0);
 216                 regC = _dmpysp(regC, regB2);
 217                 regS = _ftof2(_lof2(sum5),-_hif2(sum4));
 218                 regR = _daddsp(_dmpysp(regA2, regS), regC);
 219                 *c0 = _lof2(regR);
 220                 *c1 = _hif2(regR);
 221
 222                 //update c[0,5] and c[1,5]
 223                 c0 = c0 + 1*cs_c;
 224                 c1 = c1 + 1*cs_c;
 225                 regC = _ftof2(*c1,*c0);
 226                 regC = _dmpysp(regC, regB2);
 227                 regS = _ftof2(_hif2(sum5),_lof2(sum4));
 228                 regR = _daddsp(_dmpysp(regA2, regS), regC);
 229                 *c0 = _lof2(regR);
 230                 *c1 = _hif2(regR);
 231
 232                 //update c[0,6] and c[1,6]
 233                 c0 = c0 + 1*cs_c;
 234                 c1 = c1 + 1*cs_c;
 235                 regC = _ftof2(*c1,*c0);
 236                 regC = _dmpysp(regC, regB2);
 237                 regS = _ftof2(_lof2(sum7),-_hif2(sum6));
 238                 regR = _daddsp(_dmpysp(regA2, regS), regC);
 239                 *c0 = _lof2(regR);
 240                 *c1 = _hif2(regR);
 241
 242                 //update c[0,7] and c[1,7]
 243                 c0 = c0 + 1*cs_c;
 244                 c1 = c1 + 1*cs_c;
 245                 regC = _ftof2(*c1,*c0);
 246                 regC = _dmpysp(regC, regB2);
 247                 regS = _ftof2(_hif2(sum7),_lof2(sum6));
 248                 regR = _daddsp(_dmpysp(regA2, regS), regC);
 249                 *c0 = _lof2(regR);
 250                 *c1 = _hif2(regR);
 251
 252                 // update c[2,0] and c[3,0]
 253                 c0 = (c + 2*rs_c + 0*cs_c);
 254                 c1 = (c + 3*rs_c + 0*cs_c);
 255                 regC = _ftof2(*c1,*c0);
 256                 regC = _dmpysp(regC, regB2);
 257                 regS = _ftof2(_lof2(sum9),-_hif2(sum8));
 258                 regR = _daddsp(_dmpysp(regA2, regS), regC);
 259                 *c0 = _lof2(regR);
 260                 *c1 = _hif2(regR);
 261
 262                 // update c[2,1] and c[3,1]
 263                 c0 = c0 + 1*cs_c;
 264                 c1 = c1 + 1*cs_c;
 265                 regC = _ftof2(*c1,*c0);
 266                 regC = _dmpysp(regC, regB2);
 267                 regS = _ftof2(_hif2(sum9),_lof2(sum8));
 268                 regR = _daddsp(_dmpysp(regA2, regS), regC);
 269                 *c0 = _lof2(regR);
 270                 *c1 = _hif2(regR);
 271
 272                 //update c[2,2] and c[3,2]
 273                 c0 = c0 + 1*cs_c;
 274                 c1 = c1 + 1*cs_c;
 275                 regC = _ftof2(*c1,*c0);
 276                 regC = _dmpysp(regC, regB2);
 277                 regS = _ftof2(_lof2(sumb),-_hif2(suma));
 278                 regR = _daddsp(_dmpysp(regA2, regS), regC);
 279                 *c0 = _lof2(regR);
 280                 *c1 = _hif2(regR);
 281
 282                 //update  c[2,3] and c[3,3]
 283                 c0 = c0 + 1*cs_c;
 284                 c1 = c1 + 1*cs_c;
 285                 regC = _ftof2(*c1,*c0);
 286                 regC = _dmpysp(regC, regB2);
 287                 regS = _ftof2(_hif2(sumb),_lof2(suma));
 288                 regR = _daddsp(_dmpysp(regA2, regS), regC);
 289                 *c0 = _lof2(regR);
 290                 *c1 = _hif2(regR);
 291
 292                 //update c[2,4] and c[3,4]
 293                 c0 = c0 + 1*cs_c;
 294                 c1 = c1 + 1*cs_c;
 295                 regC = _ftof2(*c1,*c0);
 296                 regC = _dmpysp(regC, regB2);
 297                 regS = _ftof2(_lof2(sumd),-_hif2(sumc));
 298                 regR = _daddsp(_dmpysp(regA2, regS), regC);
 299                 *c0 = _lof2(regR);
 300                 *c1 = _hif2(regR);
 301
 302                 //update c[2,5] and c[3,5]
 303                 c0 = c0 + 1*cs_c;
 304                 c1 = c1 + 1*cs_c;
 305                 regC = _ftof2(*c1,*c0);
 306                 regC = _dmpysp(regC, regB2);
 307                 regS = _ftof2(_hif2(sumd),_lof2(sumc));
 308                 regR = _daddsp(_dmpysp(regA2, regS), regC);
 309                 *c0 = _lof2(regR);
 310                 *c1 = _hif2(regR);
 311
 312                 //update c[2,6] and c[2,6]
 313                 c0 = c0 + 1*cs_c;
 314                 c1 = c1 + 1*cs_c;
 315                 regC = _ftof2(*c1,*c0);
 316                 regC = _dmpysp(regC, regB2);
 317                 regS = _ftof2(_lof2(sumf),-_hif2(sume));
 318                 regR = _daddsp(_dmpysp(regA2, regS), regC);
 319                 *c0 = _lof2(regR);
 320                 *c1 = _hif2(regR);
 321
 322                 //update c[2,7] and c[2,7]
 323                 c0 = c0 + 1*cs_c;
 324                 c1 = c1 + 1*cs_c;
 325                 regC = _ftof2(*c1,*c0);
 326                 regC = _dmpysp(regC, regB2);
 327                 regS = _ftof2(_hif2(sumf),_lof2(sume));
 328                 regR = _daddsp(_dmpysp(regA2, regS), regC);
 329                 *c0 = _lof2(regR);
 330                 *c1 = _hif2(regR);
 331
 332             }
 333             else
 334             {
 335 #if 0
 336                 // update c[0,0] and c[1,0]
 337                 ptrC = (__float2_t *) c;
 338                 regC = *ptrC;
 339                 regC = _dmpysp(regC, regB2);
 340                 regS = _ftof2(_lof2(sum1),-_hif2(sum0));
 341                 regR = _daddsp(_dmpysp(regA2, regS), regC);
 342                 *ptrC = regR;
 343
 344                 //update c[0,1] and c[1,1]
 345                 ptrC += (cs_c>>1);
 346                 regC = *ptrC;
 347                 regC = _dmpysp(regC, regB2);
 348                 regS = _ftof2(_hif2(sum1),_lof2(sum0));
 349                 regR = _daddsp(_dmpysp(regA2, regS), regC);
 350                 *ptrC = regR;
 351
 352                 //update c[0,2] and c[1,2]
 353                 ptrC += (cs_c>>1);
 354                 regC = *ptrC;
 355                 regC = _dmpysp(regC, regB2);
 356                 regS = _ftof2(_lof2(sum3),-_hif2(sum2));
 357                 regR = _daddsp(_dmpysp(regA2, regS), regC);
 358                 *ptrC = regR;
 359
 360                 //update  c[0,3] and c[1,3]
 361                 ptrC += (cs_c>>1);
 362                 regC = *ptrC;
 363                 regC = _dmpysp(regC, regB2);
 364                 regS = _ftof2(_hif2(sum3),_lof2(sum2));
 365                 regR = _daddsp(_dmpysp(regA2, regS), regC);
 366                 *ptrC = regR;
 367
 368                 //update c[0,4] and c[1,4]
 369                 ptrC += (cs_c>>1);
 370                 regC = *ptrC;
 371                 regC = _dmpysp(regC, regB2);
 372                 regS = _ftof2(_lof2(sum5),-_hif2(sum4));
 373                 regR = _daddsp(_dmpysp(regA2, regS), regC);
 374                 *ptrC = regR;
 375
 376                 //update c[0,5] and c[1,5]
 377                 ptrC += (cs_c>>1);
 378                 regC = *ptrC;
 379                 regC = _dmpysp(regC, regB2);
 380                 regS = _ftof2(_hif2(sum5),_lof2(sum4));
 381                 regR = _daddsp(_dmpysp(regA2, regS), regC);
 382                 *ptrC = regR;
 383
 384                 //update c[0,6] and c[1,6]
 385                 ptrC += (cs_c>>1);
 386                 regC = *ptrC;
 387                 regC = _dmpysp(regC, regB2);
 388                 regS = _ftof2(_lof2(sum7),-_hif2(sum6));
 389                 regR = _daddsp(_dmpysp(regA2, regS), regC);
 390                 *ptrC = regR;
 391
 392                 //update c[0,7] and c[1,7]
 393                 ptrC += (cs_c>>1);
 394                 regC = *ptrC;
 395                 regC = _dmpysp(regC, regB2);
 396                 regS = _ftof2(_hif2(sum7),_lof2(sum6));
 397                 regR = _daddsp(_dmpysp(regA2, regS), regC);
 398                 *ptrC = regR;
 399
 400                 // update c[2,0] and c[3,0]
 401                 ptrC = (__float2_t *) (c+2);
 402                 regC = *ptrC;
 403                 regC = _dmpysp(regC, regB2);
 404                 regS = _ftof2(_lof2(sum9),-_hif2(sum8));
 405                 regR = _daddsp(_dmpysp(regA2, regS), regC);
 406                 *ptrC = regR;
 407
 408                 // update c[2,1] and c[3,1]
 409                 ptrC += (cs_c>>1);
 410                 regC = *ptrC;
 411                 regC = _dmpysp(regC, regB2);
 412                 regS = _ftof2(_hif2(sum9),_lof2(sum8));
 413                 regR = _daddsp(_dmpysp(regA2, regS), regC);
 414                 *ptrC = regR;
 415
 416                 //update c[2,2] and c[3,2]
 417                 ptrC += (cs_c>>1);
 418                 regC = *ptrC;
 419                 regC = _dmpysp(regC, regB2);
 420                 regS = _ftof2(_lof2(sumb),-_hif2(suma));
 421                 regR = _daddsp(_dmpysp(regA2, regS), regC);
 422                 *ptrC = regR;
 423
 424                 //update  c[2,3] and c[3,3]
 425                 ptrC += (cs_c>>1);
 426                 regC = *ptrC;
 427                 regC = _dmpysp(regC, regB2);
 428                 regS = _ftof2(_hif2(sumb),_lof2(suma));
 429                 regR = _daddsp(_dmpysp(regA2, regS), regC);
 430                 *ptrC = regR;
 431
 432                 //update c[2,4] and c[3,4]
 433                 ptrC += (cs_c>>1);
 434                 regC = *ptrC;
 435                 regC = _dmpysp(regC, regB2);
 436                 regS = _ftof2(_lof2(sumd),-_hif2(sumc));
 437                 regR = _daddsp(_dmpysp(regA2, regS), regC);
 438                 *ptrC = regR;
 439
 440                 //update c[2,5] and c[3,5]
 441                 ptrC += (cs_c>>1);
 442                 regC = *ptrC;
 443                 regC = _dmpysp(regC, regB2);
 444                 regS = _ftof2(_hif2(sumd),_lof2(sumc));
 445                 regR = _daddsp(_dmpysp(regA2, regS), regC);
 446                 *ptrC = regR;
 447
 448                 //update c[2,6] and c[2,6]
 449                 ptrC += (cs_c>>1);
 450                 regC = *ptrC;
 451                 regC = _dmpysp(regC, regB2);
 452                 regS = _ftof2(_lof2(sumf),-_hif2(sume));
 453                 regR = _daddsp(_dmpysp(regA2, regS), regC);
 454                 *ptrC = regR;
 455
 456                 //update c[2,7] and c[2,7]
 457                 ptrC += (cs_c>>1);
 458                 regC = *ptrC;
 459                 regC = _dmpysp(regC, regB2);
 460                 regS = _ftof2(_hif2(sumf),_lof2(sume));
 461                 regR = _daddsp(_dmpysp(regA2, regS), regC);
 462                 *ptrC = regR;
 463 #else
 464 /*              __float2_t c0, c1, c2, c3, c4, c5, c6, c7;
 465                 __float2_t c8, c9, ca, cb, cc, cd, ce, cf;
 466
 467                 ptrC = (__float2_t *) c;
 468                 c0 = *ptrC++;
 469                 sum0 = _dmpysp(regA2, sum0);
 470                 c1 = *ptrC--;
 471                 sum1 = _dmpysp(regA2, sum1);
 472                 ptrC += (cs_c>>1);
 473                 c2 = *ptrC++;
 474                 sum2 = _dmpysp(regA2, sum2);
 475                 c3 = *ptrC--;
 476                 sum3 = _dmpysp(regA2, sum3);
 477                 ptrC += (cs_c>>1);
 478                 c4 = *ptrC++;
 479                 sum8 = _dmpysp(regA2, sum8);
 480                 c5 = *ptrC--;
 481                 sum9 = _dmpysp(regA2, sum9);
 482                 ptrC += (cs_c>>1);
 483                 c6 = *ptrC++;
 484                 suma = _dmpysp(regA2, suma);
 485                 c7 = *ptrC--;
 486                 sumb = _dmpysp(regA2, sumb);
 487
 488                 c0 = _dmpysp(c0, regB2);
 489                 c1 = _dmpysp(c1, regB2);
 490                 c2 = _dmpysp(c2, regB2);
 491                 c3 = _dmpysp(c3, regB2);
 492                 c4 = _dmpysp(c4, regB2);
 493                 c5 = _dmpysp(c5, regB2);
 494                 c6 = _dmpysp(c6, regB2);
 495                 c7 = _dmpysp(c7, regB2);
 496
 497                 // update c[0,0] and c[1,0]
 498                 c0 = _daddsp(_ftof2(_lof2(sum1),-_hif2(sum0)),c0);
 499                 // update c[2,0] and c[3,0]
 500                 c1 = _daddsp(_ftof2(_lof2(sum9),-_hif2(sum8)),c1);
 501                 //update c[0,1] and c[1,1]
 502                 c2 = _daddsp(_ftof2(_hif2(sum1),_lof2(sum0)),c2);
 503                 // update c[2,1] and c[3,1]
 504                 c3 = _daddsp(_ftof2(_hif2(sum9),_lof2(sum8)),c3);
 505                 //update c[0,2] and c[1,2]
 506                 c4 = _daddsp(_ftof2(_lof2(sum3),-_hif2(sum2)),c4);
 507                 //update c[2,2] and c[3,2]
 508                 c5 = _daddsp(_ftof2(_lof2(sumb),-_hif2(suma)),c5);
 509                 //update  c[0,3] and c[1,3]
 510                 c6 = _daddsp(_ftof2(_hif2(sum3),_lof2(sum2)),c6);
 511                 //update  c[2,3] and c[3,3]
 512                 c7 = _daddsp(_ftof2(_hif2(sumb),_lof2(suma)),c7);
 513                 //update c[0,4] and c[1,4]
 514
 515                 ptrC = (__float2_t *) c;
 516                 *ptrC++ = c0;
 517                 *ptrC-- = c1;
 518                 ptrC += (cs_c>>1);
 519                 *ptrC++ = c2;
 520                 *ptrC-- = c3;
 521                 ptrC += (cs_c>>1);
 522                 *ptrC++ = c4;
 523                 *ptrC-- = c5;
 524                 ptrC += (cs_c>>1);
 525                 *ptrC++ = c6;
 526                 *ptrC-- = c7;
 527
 528                 ptrC = (__float2_t *) (c+(cs_c<<2));
 529                 c8 = *ptrC++;
 530                 c9 = *ptrC--;
 531                 ptrC += (cs_c>>1);
 532                 ca = *ptrC++;
 533                 cb = *ptrC--;
 534                 ptrC += (cs_c>>1);
 535                 cc = *ptrC++;
 536                 cd = *ptrC--;
 537                 ptrC += (cs_c>>1);
 538                 ce = *ptrC++;
 539                 cf = *ptrC;
 540
 541                 sum4 = _dmpysp(regA2, sum4);
 542                 sum5 = _dmpysp(regA2, sum5);
 543                 sum6 = _dmpysp(regA2, sum6);
 544                 sum7 = _dmpysp(regA2, sum7);
 545                 sumc = _dmpysp(regA2, sumc);
 546                 sumd = _dmpysp(regA2, sumd);
 547                 sume = _dmpysp(regA2, sume);
 548                 sumf = _dmpysp(regA2, sumf);
 549
 550                 c8 = _dmpysp(c8, regB2);
 551                 c9 = _dmpysp(c9, regB2);
 552                 ca = _dmpysp(ca, regB2);
 553                 cb = _dmpysp(cb, regB2);
 554                 cc = _dmpysp(cc, regB2);
 555                 cd = _dmpysp(cd, regB2);
 556                 ce = _dmpysp(ce, regB2);
 557                 cf = _dmpysp(cf, regB2);
 558
 559                 c8 = _daddsp(_ftof2(_lof2(sum5),-_hif2(sum4)),c8);
 560                 //update c[2,4] and c[3,4]
 561                 c9 = _daddsp(_ftof2(_lof2(sumd),-_hif2(sumc)),c9);
 562                 //update c[0,5] and c[1,5]
 563                 ca = _daddsp(_ftof2(_hif2(sum5),_lof2(sum4)),ca);
 564                 //update c[2,5] and c[3,5]
 565                 cb = _daddsp(_ftof2(_hif2(sumd),_lof2(sumc)),cb);
 566                 //update c[0,6] and c[1,6]
 567                 cc = _daddsp(_ftof2(_lof2(sum7),-_hif2(sum6)),cc);
 568                 //update c[2,6] and c[3,6]
 569                 cd = _daddsp(_ftof2(_lof2(sumf),-_hif2(sume)),cd);
 570                 //update c[0,7] and c[1,7]
 571                 ce = _daddsp(_ftof2(_hif2(sum7),_lof2(sum6)),ce);
 572                 //update c[2,7] and c[3,7]
 573                 cf = _daddsp(_ftof2(_hif2(sumf),_lof2(sume)),cf);
 574
 575                 ptrC = (__float2_t *) (c+(cs_c<<2));
 576                 *ptrC++ = c8;
 577                 *ptrC-- = c9;
 578                 ptrC += (cs_c>>1);
 579                 *ptrC++ = ca;
 580                 *ptrC-- = cb;
 581                 ptrC += (cs_c>>1);
 582                 *ptrC++ = cc;
 583                 *ptrC-- = cd;
 584                 ptrC += (cs_c>>1);
 585                 *ptrC++ = ce;
 586                 *ptrC = cf;*/
 587
 588                 __float2_t c0, c1, c2, c3, c4, c5, c6, c7;
 589                 __float2_t c8, c9, ca, cb, cc, cd, ce, cf;
 590
 591                 ptrC = (__float2_t *) c;
 592                 c0 = *ptrC++;
 593                 c1 = *ptrC--;
 594                 ptrC += (cs_c>>1);
 595                 c2 = *ptrC++;
 596                 c3 = *ptrC--;
 597                 ptrC += (cs_c>>1);
 598                 c4 = *ptrC++;
 599                 c5 = *ptrC--;
 600                 ptrC += (cs_c>>1);
 601                 c6 = *ptrC++;
 602                 c7 = *ptrC--;
 603                 ptrC += (cs_c>>1);
 604                 c8 = *ptrC++;
 605                 c9 = *ptrC--;
 606                 ptrC += (cs_c>>1);
 607                 ca = *ptrC++;
 608                 cb = *ptrC--;
 609                 ptrC += (cs_c>>1);
 610                 cc = *ptrC++;
 611                 cd = *ptrC--;
 612                 ptrC += (cs_c>>1);
 613                 ce = *ptrC++;
 614                 cf = *ptrC;
 615
 616                 sum0 = _dmpysp(regA2, sum0);
 617                 sum1 = _dmpysp(regA2, sum1);
 618                 sum2 = _dmpysp(regA2, sum2);
 619                 sum3 = _dmpysp(regA2, sum3);
 620                 sum4 = _dmpysp(regA2, sum4);
 621                 sum5 = _dmpysp(regA2, sum5);
 622                 sum6 = _dmpysp(regA2, sum6);
 623                 sum7 = _dmpysp(regA2, sum7);
 624                 sum8 = _dmpysp(regA2, sum8);
 625                 sum9 = _dmpysp(regA2, sum9);
 626                 suma = _dmpysp(regA2, suma);
 627                 sumb = _dmpysp(regA2, sumb);
 628                 sumc = _dmpysp(regA2, sumc);
 629                 sumd = _dmpysp(regA2, sumd);
 630                 sume = _dmpysp(regA2, sume);
 631                 sumf = _dmpysp(regA2, sumf);
 632
 633                 c0 = _dmpysp(c0, regB2);
 634                 c1 = _dmpysp(c1, regB2);
 635                 c2 = _dmpysp(c2, regB2);
 636                 c3 = _dmpysp(c3, regB2);
 637                 c4 = _dmpysp(c4, regB2);
 638                 c5 = _dmpysp(c5, regB2);
 639                 c6 = _dmpysp(c6, regB2);
 640                 c7 = _dmpysp(c7, regB2);
 641                 c8 = _dmpysp(c8, regB2);
 642                 c9 = _dmpysp(c9, regB2);
 643                 ca = _dmpysp(ca, regB2);
 644                 cb = _dmpysp(cb, regB2);
 645                 cc = _dmpysp(cc, regB2);
 646                 cd = _dmpysp(cd, regB2);
 647                 ce = _dmpysp(ce, regB2);
 648                 cf = _dmpysp(cf, regB2);
 649
 650                 // update c[0,0] and c[1,0]
 651                 c0 = _daddsp(_ftof2(_lof2(sum1),-_hif2(sum0)),c0);
 652                 // update c[2,0] and c[3,0]
 653                 c1 = _daddsp(_ftof2(_lof2(sum9),-_hif2(sum8)),c1);
 654                 //update c[0,1] and c[1,1]
 655                 c2 = _daddsp(_ftof2(_hif2(sum1),_lof2(sum0)),c2);
 656                 // update c[2,1] and c[3,1]
 657                 c3 = _daddsp(_ftof2(_hif2(sum9),_lof2(sum8)),c3);
 658                 //update c[0,2] and c[1,2]
 659                 c4 = _daddsp(_ftof2(_lof2(sum3),-_hif2(sum2)),c4);
 660                 //update c[2,2] and c[3,2]
 661                 c5 = _daddsp(_ftof2(_lof2(sumb),-_hif2(suma)),c5);
 662                 //update  c[0,3] and c[1,3]
 663                 c6 = _daddsp(_ftof2(_hif2(sum3),_lof2(sum2)),c6);
 664                 //update  c[2,3] and c[3,3]
 665                 c7 = _daddsp(_ftof2(_hif2(sumb),_lof2(suma)),c7);
 666                 //update c[0,4] and c[1,4]
 667                 c8 = _daddsp(_ftof2(_lof2(sum5),-_hif2(sum4)),c8);
 668                 //update c[2,4] and c[3,4]
 669                 c9 = _daddsp(_ftof2(_lof2(sumd),-_hif2(sumc)),c9);
 670                 //update c[0,5] and c[1,5]
 671                 ca = _daddsp(_ftof2(_hif2(sum5),_lof2(sum4)),ca);
 672                 //update c[2,5] and c[3,5]
 673                 cb = _daddsp(_ftof2(_hif2(sumd),_lof2(sumc)),cb);
 674                 //update c[0,6] and c[1,6]
 675                 cc = _daddsp(_ftof2(_lof2(sum7),-_hif2(sum6)),cc);
 676                 //update c[2,6] and c[3,6]
 677                 cd = _daddsp(_ftof2(_lof2(sumf),-_hif2(sume)),cd);
 678                 //update c[0,7] and c[1,7]
 679                 ce = _daddsp(_ftof2(_hif2(sum7),_lof2(sum6)),ce);
 680                 //update c[2,7] and c[3,7]
 681                 cf = _daddsp(_ftof2(_hif2(sumf),_lof2(sume)),cf);
 682
 683                 ptrC = (__float2_t *) c;
 684                 *ptrC++ = c0;
 685                 *ptrC-- = c1;
 686                 ptrC += (cs_c>>1);
 687                 *ptrC++ = c2;
 688                 *ptrC-- = c3;
 689                 ptrC += (cs_c>>1);
 690                 *ptrC++ = c4;
 691                 *ptrC-- = c5;
 692                 ptrC += (cs_c>>1);
 693                 *ptrC++ = c6;
 694                 *ptrC-- = c7;
 695                 ptrC += (cs_c>>1);
 696                 *ptrC++ = c8;
 697                 *ptrC-- = c9;
 698                 ptrC += (cs_c>>1);
 699                 *ptrC++ = ca;
 700                 *ptrC-- = cb;
 701                 ptrC += (cs_c>>1);
 702                 *ptrC++ = cc;
 703                 *ptrC-- = cd;
 704                 ptrC += (cs_c>>1);
 705                 *ptrC++ = ce;
 706                 *ptrC = cf;
 707
 708
 709 #endif
 710             }
 711 #ifdef BLIS_ENABLE_CYCLE_COUNT
 712           counter_end=TSCL;
 713           if (lib_get_coreID () == 0)
 714                   printf("%d\n", counter_end-counter_start);
 715 #endif
 716
 717 }
 718
 719 void bli_sgemm_ukernel_4x4(
 720                         dim_t              k,
 721                         float*    restrict alpha,
 722                         float*    restrict a,
 723                         float*    restrict b,
 724                         float*    restrict beta,
 725                         float*    restrict c, inc_t rs_c, inc_t cs_c,
 726                         auxinfo_t*         data
 727                       )
 728 {
 729     __float2_t sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
 730         __float2_t sum8, sum9, suma, sumb, sumc, sumd, sume, sumf;
 731         __float2_t * restrict ptrB     = (__float2_t *) b;
 732         __float2_t * restrict ptrA     = (__float2_t *) a;
 733         __float2_t * restrict ptrC;
 734         __float2_t regA2, regB2, regC, regS, regR;
 735         float* restrict c0, * restrict c1;
 736         int_least16_t index;
 737         int kEven, kLeft;
 738
 739 #ifdef BLIS_ENABLE_CYCLE_COUNT
 740           volatile int counter_start;
 741           volatile int counter_end;
 742 #endif
 743
 744 #ifdef BLIS_ENABLE_CYCLE_COUNT
 745           TSCL=0;
 746           counter_start = TSCL;
 747 #endif
 748
 749         // zero out accumulators
 750         sum0 = 0.0;
 751         sum1 = 0.0;
 752         sum2 = 0.0;
 753         sum3 = 0.0;
 754         sum4 = 0.0;
 755         sum5 = 0.0;
 756         sum6 = 0.0;
 757         sum7 = 0.0;
 758         sum8 = 0.0;
 759         sum9 = 0.0;
 760         suma = 0.0;
 761         sumb = 0.0;
 762         sumc = 0.0;
 763         sumd = 0.0;
 764         sume = 0.0;
 765         sumf = 0.0;
 766
 767         kEven=k>>1;
 768         kLeft=k&1;
 769         //TSCL = 0;
 770         //cycles = TSCL;
 771
 772
 773         for (index = 0; index < kEven; index++)
 774         { // loop over k;
 775                 // each iteration performs rank one update of 4x1 by 1x4
 776                 // matrices of A and B respectively; result is
 777                 // accumulated over 4x4 matrix
 778                 __float2_t b01, b23, a01, a23;
 779                 __x128_t   reg128;
 780
 781                 // for even k
 782                 a01 = *ptrA++;
 783                 a23 = *ptrA++;
 784
 785                 b01 = *ptrB++;
 786                 b23 = *ptrB++;
 787
 788                 reg128 = _cmpysp(a01, b01);
 789                 // accumulate a[0]*b[1] and -a[0]*b[0]
 790                 sum0 = _daddsp(sum0, _lof2_128(reg128));
 791                 // accumulate a[1]*b[0] and a[1]*b[1]
 792                 sum1 = _daddsp(sum1, _hif2_128(reg128));
 793
 794                 reg128 = _cmpysp(a01, b23);
 795                 // accumulate a[0]*b[3] and -a[0]*b[2]
 796                 sum2 = _daddsp(sum2, _lof2_128(reg128));
 797                 // accumulate a[1]*b[2] and a[1]*b[3]
 798                 sum3 = _daddsp(sum3, _hif2_128(reg128));
 799
 800                 reg128 = _cmpysp(a23, b01);
 801                 // accumulate a[2]*b[1] and -a[2]*b[0]
 802                 sum4 = _daddsp(sum4, _lof2_128(reg128));
 803                 // accumulate a[3]*b[0] and a[3]*b[1]
 804                 sum5 = _daddsp(sum5, _hif2_128(reg128));
 805
 806                 reg128 = _cmpysp(a23, b23);
 807                 // accumulate a[2]*b[3] and -a[2]*b[2]
 808                 sum6 = _daddsp(sum6, _lof2_128(reg128));
 809                 // accumulate a[3]*b[2] and a[3]*b[3]
 810                 sum7 = _daddsp(sum7, _hif2_128(reg128));
 811
 812
 813
 814                 // for odd k
 815                 a01 = *ptrA++;
 816                 a23 = *ptrA++;
 817
 818                 b01 = *ptrB++;
 819                 b23 = *ptrB++;
 820
 821                 reg128 = _cmpysp(a01, b01);
 822                 // accumulate a[0]*b[1] and -a[0]*b[0]
 823                 sum8 = _daddsp(sum8, _lof2_128(reg128));
 824                 // accumulate a[1]*b[0] and a[1]*b[1]
 825                 sum9 = _daddsp(sum9, _hif2_128(reg128));
 826
 827                 reg128 = _cmpysp(a01, b23);
 828                 // accumulate a[0]*b[3] and -a[0]*b[2]
 829                 suma = _daddsp(suma, _lof2_128(reg128));
 830                 // accumulate a[1]*b[2] and a[1]*b[3]
 831                 sumb = _daddsp(sumb, _hif2_128(reg128));
 832
 833                 reg128 = _cmpysp(a23, b01);
 834                 // accumulate a[2]*b[1] and -a[2]*b[0]
 835                 sumc = _daddsp(sumc, _lof2_128(reg128));
 836                 // accumulate a[3]*b[0] and a[3]*b[1]
 837                 sumd = _daddsp(sumd, _hif2_128(reg128));
 838
 839                 reg128 = _cmpysp(a23, b23);
 840                 // accumulate a[2]*b[3] and -a[2]*b[2]
 841                 sume = _daddsp(sume, _lof2_128(reg128));
 842                 // accumulate a[3]*b[2] and a[3]*b[3]
 843                 sumf = _daddsp(sumf, _hif2_128(reg128));
 844
 845         }
 846         if(kLeft)
 847         { // last k if left;
 848                 __float2_t b01, b23, a01, a23;
 849                 __x128_t   reg128;
 850
 851                 a01 = *ptrA++;
 852                 a23 = *ptrA++;
 853
 854                 b01 = *ptrB++;
 855                 b23 = *ptrB++;
 856
 857                 reg128 = _cmpysp(a01, b01);
 858                 // accumulate a[0]*b[1] and -a[0]*b[0]
 859                 sum0 = _daddsp(sum0, _lof2_128(reg128));
 860                 // accumulate a[1]*b[0] and a[1]*b[1]
 861                 sum1 = _daddsp(sum1, _hif2_128(reg128));
 862
 863                 reg128 = _cmpysp(a01, b23);
 864                 // accumulate a[0]*b[3] and -a[0]*b[2]
 865                 sum2 = _daddsp(sum2, _lof2_128(reg128));
 866                 // accumulate a[1]*b[2] and a[1]*b[3]
 867                 sum3 = _daddsp(sum3, _hif2_128(reg128));
 868
 869                 reg128 = _cmpysp(a23, b01);
 870                 // accumulate a[2]*b[1] and -a[2]*b[0]
 871                 sum4 = _daddsp(sum4, _lof2_128(reg128));
 872                 // accumulate a[3]*b[0] and a[3]*b[1]
 873                 sum5 = _daddsp(sum5, _hif2_128(reg128));
 874
 875                 reg128 = _cmpysp(a23, b23);
 876                 // accumulate a[2]*b[3] and -a[2]*b[2]
 877                 sum6 = _daddsp(sum6, _lof2_128(reg128));
 878                 // accumulate a[3]*b[2] and a[3]*b[3]
 879                 sum7 = _daddsp(sum7, _hif2_128(reg128));
 880
 881         }
 882 #ifdef BLIS_ENABLE_CYCLE_COUNT
 883           counter_end=TSCL;
 884           if (lib_get_coreID () == 0)
 885                   printf("%d\t%d\t",k, counter_end-counter_start);
 886 #endif
 887
 888 #ifdef BLIS_ENABLE_CYCLE_COUNT
 889           TSCL=0;
 890           counter_start = TSCL;
 891 #endif
 892         sum0 = _daddsp(sum0, sum8);
 893         sum1 = _daddsp(sum1, sum9);
 894         sum2 = _daddsp(sum2, suma);
 895         sum3 = _daddsp(sum3, sumb);
 896         sum4 = _daddsp(sum4, sumc);
 897         sum5 = _daddsp(sum5, sumd);
 898         sum6 = _daddsp(sum6, sume);
 899         sum7 = _daddsp(sum7, sumf);
 900
 901
 902         regA2 = _ftof2(*alpha, *alpha);
 903         regB2 = _ftof2(*beta, *beta);
 904         if (rs_c != 1)
 905         {
 906                 // update c[0,0] and c[1,0]
 907                 c0 = (c + 0*rs_c + 0*cs_c);
 908                 c1 = (c + 1*rs_c + 0*cs_c);
 909                 regC = _ftof2(*c1,*c0);
 910                 regC = _dmpysp(regC, regB2);
 911                 regS = _ftof2(_lof2(sum1),-_hif2(sum0));
 912                 regR = _daddsp(_dmpysp(regA2, regS), regC);
 913                 *c0 = _lof2(regR);
 914                 *c1 = _hif2(regR);
 915
 916                 //update c[0,1] and c[1,1]
 917                 c0 = c0 + 1*cs_c;
 918                 c1 = c1 + 1*cs_c;
 919                 regC = _ftof2(*c1,*c0);
 920                 regC = _dmpysp(regC, regB2);
 921                 regS = _ftof2(_hif2(sum1),_lof2(sum0));
 922                 regR = _daddsp(_dmpysp(regA2, regS), regC);
 923                 *c0 = _lof2(regR);
 924                 *c1 = _hif2(regR);
 925
 926                 //update c[0,2] and c[1,2]
 927                 c0 = c0 + 1*cs_c;
 928                 c1 = c1 + 1*cs_c;
 929                 regC = _ftof2(*c1,*c0);
 930                 regC = _dmpysp(regC, regB2);
 931                 regS = _ftof2(_lof2(sum3),-_hif2(sum2));
 932                 regR = _daddsp(_dmpysp(regA2, regS), regC);
 933                 *c0 = _lof2(regR);
 934                 *c1 = _hif2(regR);
 935
 936                 //update  c[0,3] and c[1,3]
 937                 c0 = c0 + 1*cs_c;
 938                 c1 = c1 + 1*cs_c;
 939                 regC = _ftof2(*c1,*c0);
 940                 regC = _dmpysp(regC, regB2);
 941                 regS = _ftof2(_hif2(sum3),_lof2(sum2));
 942                 regR = _daddsp(_dmpysp(regA2, regS), regC);
 943                 *c0 = _lof2(regR);
 944                 *c1 = _hif2(regR);
 945
 946                 // update c[2,0] and c[3,0]
 947                 c0 = (c + 2*rs_c + 0*cs_c);
 948                 c1 = (c + 3*rs_c + 0*cs_c);
 949                 regC = _ftof2(*c1,*c0);
 950                 regC = _dmpysp(regC, regB2);
 951                 regS = _ftof2(_lof2(sum5),-_hif2(sum4));
 952                 regR = _daddsp(_dmpysp(regA2, regS), regC);
 953                 *c0 = _lof2(regR);
 954                 *c1 = _hif2(regR);
 955
 956                 // update c[2,1] and c[3,1]
 957                 c0 = c0 + 1*cs_c;
 958                 c1 = c1 + 1*cs_c;
 959                 regC = _ftof2(*c1,*c0);
 960                 regC = _dmpysp(regC, regB2);
 961                 regS = _ftof2(_hif2(sum5),_lof2(sum4));
 962                 regR = _daddsp(_dmpysp(regA2, regS), regC);
 963                 *c0 = _lof2(regR);
 964                 *c1 = _hif2(regR);
 965
 966                 //update c[2,2] and c[3,2]
 967                 c0 = c0 + 1*cs_c;
 968                 c1 = c1 + 1*cs_c;
 969                 regC = _ftof2(*c1,*c0);
 970                 regC = _dmpysp(regC, regB2);
 971                 regS = _ftof2(_lof2(sum7),-_hif2(sum6));
 972                 regR = _daddsp(_dmpysp(regA2, regS), regC);
 973                 *c0 = _lof2(regR);
 974                 *c1 = _hif2(regR);
 975
 976                 //update  c[2,3] and c[3,3]
 977                 c0 = c0 + 1*cs_c;
 978                 c1 = c1 + 1*cs_c;
 979                 regC = _ftof2(*c1,*c0);
 980                 regC = _dmpysp(regC, regB2);
 981                 regS = _ftof2(_hif2(sum7),_lof2(sum6));
 982                 regR = _daddsp(_dmpysp(regA2, regS), regC);
 983                 *c0 = _lof2(regR);
 984                 *c1 = _hif2(regR);
 985
 986         }
 987         else
 988         {
 989                 __float2_t c0, c1, c2, c3, c4, c5, c6, c7;
 990
 991                 ptrC = (__float2_t *) c;
 992                 c0 = *ptrC++;
 993                 c1 = *ptrC--;
 994                 ptrC += (cs_c>>1);
 995                 c2 = *ptrC++;
 996                 c3 = *ptrC--;
 997                 ptrC += (cs_c>>1);
 998                 c4 = *ptrC++;
 999                 c5 = *ptrC--;
1000                 ptrC += (cs_c>>1);
1001                 c6 = *ptrC++;
1002                 c7 = *ptrC--;
1003
1004                 sum0 = _dmpysp(regA2, sum0);
1005                 sum1 = _dmpysp(regA2, sum1);
1006                 sum2 = _dmpysp(regA2, sum2);
1007                 sum3 = _dmpysp(regA2, sum3);
1008                 sum4 = _dmpysp(regA2, sum4);
1009                 sum5 = _dmpysp(regA2, sum5);
1010                 sum6 = _dmpysp(regA2, sum6);
1011                 sum7 = _dmpysp(regA2, sum7);
1012
1013                 c0 = _dmpysp(c0, regB2);
1014                 c1 = _dmpysp(c1, regB2);
1015                 c2 = _dmpysp(c2, regB2);
1016                 c3 = _dmpysp(c3, regB2);
1017                 c4 = _dmpysp(c4, regB2);
1018                 c5 = _dmpysp(c5, regB2);
1019                 c6 = _dmpysp(c6, regB2);
1020                 c7 = _dmpysp(c7, regB2);
1021
1022                 // update c[0,0] and c[1,0]
1023                 c0 = _daddsp(_ftof2(_lof2(sum1),-_hif2(sum0)),c0);
1024                 // update c[2,0] and c[3,0]
1025                 c1 = _daddsp(_ftof2(_lof2(sum5),-_hif2(sum4)),c1);
1026                 //update c[0,1] and c[1,1]
1027                 c2 = _daddsp(_ftof2(_hif2(sum1),_lof2(sum0)),c2);
1028                 // update c[2,1] and c[3,1]
1029                 c3 = _daddsp(_ftof2(_hif2(sum5),_lof2(sum4)),c3);
1030                 //update c[0,2] and c[1,2]
1031                 c4 = _daddsp(_ftof2(_lof2(sum3),-_hif2(sum2)),c4);
1032                 //update c[2,2] and c[3,2]
1033                 c5 = _daddsp(_ftof2(_lof2(sum7),-_hif2(sum6)),c5);
1034                 //update  c[0,3] and c[1,3]
1035                 c6 = _daddsp(_ftof2(_hif2(sum3),_lof2(sum2)),c6);
1036                 //update  c[2,3] and c[3,3]
1037                 c7 = _daddsp(_ftof2(_hif2(sum7),_lof2(sum6)),c7);
1038
1039                 ptrC = (__float2_t *) c;
1040                 *ptrC++ = c0;
1041                 *ptrC-- = c1;
1042                 ptrC += (cs_c>>1);
1043                 *ptrC++ = c2;
1044                 *ptrC-- = c3;
1045                 ptrC += (cs_c>>1);
1046                 *ptrC++ = c4;
1047                 *ptrC-- = c5;
1048                 ptrC += (cs_c>>1);
1049                 *ptrC++ = c6;
1050                 *ptrC-- = c7;
1051
1052         }
1053 #ifdef BLIS_ENABLE_CYCLE_COUNT
1054           counter_end=TSCL;
1055           if (lib_get_coreID () == 0)
1056                   printf("%d\n", counter_end-counter_start);
1057 #endif
1058 }
1059
1060
1061 //void dgemmKernel(const double *pA, const double *pB, double *pC, const double a, const int k, const int stepC)
1062 void bli_dgemm_ukernel_4x4(
1063                          dim_t              k,
1064                          double*    restrict alpha,
1065                          double*    restrict a,
1066                          double*    restrict b,
1067                          double*    restrict beta,
1068                          double*    restrict c, inc_t rs_c, inc_t cs_c,
1069                          auxinfo_t*         data
1070                           )
1071 {
1072   double sum00, sum01, sum02, sum03;
1073   double sum10, sum11, sum12, sum13;
1074   double sum20, sum21, sum22, sum23;
1075   double sum30, sum31, sum32, sum33;
1076   int index;
1077   double al = *alpha;
1078   double be = *beta;
1079 #ifdef BLIS_ENABLE_CYCLE_COUNT
1080   volatile int counter_start;
1081   volatile int counter_end;
1082 #endif
1083
1084
1085 #ifdef BLIS_ENABLE_CYCLE_COUNT
1086   TSCL = 0;
1087   counter_start = TSCL;
1088 #endif
1089   //touch routine: both a & b
1090   //Length of b = NR*K*size of double;
1091   //Length of a = MR*K*size of double;
1092 #ifdef BLIS_ENABLE_PREFETCH
1093   //touch(b, k*BLIS_DEFAULT_NR_D*8);
1094   //touch(a, k*BLIS_DEFAULT_MR_D*8);
1095 #endif
1096
1097
1098
1099
1100   sum00 = 0.0;
1101   sum01 = 0.0;
1102   sum02 = 0.0;
1103   sum03 = 0.0;
1104   sum10 = 0.0;
1105   sum11 = 0.0;
1106   sum12 = 0.0;
1107   sum13 = 0.0;
1108   sum20 = 0.0;
1109   sum21 = 0.0;
1110   sum22 = 0.0;
1111   sum23 = 0.0;
1112   sum30 = 0.0;
1113   sum31 = 0.0;
1114   sum32 = 0.0;
1115   sum33 = 0.0;
1116
1117
1118   for(index = 0; index < k; index++)
1119   {  // loop over k;
1120          // each iteration performs rank one update of 4x1 by 1x4
1121          // matrices of A and B respectively; result is
1122      // accumulated over 4x4 matrix
1123      register double a0, a1, a2, a3;
1124      register double b0, b1, b2, b3;
1125
1126      a0 = *a++;
1127      a1 = *a++;
1128      a2 = *a++;
1129      a3 = *a++;
1130      b0 = *b++;
1131      b1 = *b++;
1132      b2 = *b++;
1133      b3 = *b++;
1134
1135      // a[0]*b[0]
1136      sum00 += a0*b0;
1137      // a[0]*b[1]
1138      sum01 += a0*b1;
1139      // a[0]*b[2]
1140      sum02 += a0*b2;
1141      // a[0]*b[3]
1142      sum03 += a0*b3;
1143      // a[1]*b[0]
1144      sum10 += a1*b0;
1145      // a[1]*b[1]
1146      sum11 += a1*b1;
1147      // a[1]*b[2]
1148      sum12 += a1*b2;
1149      // a[1]*b[3]
1150      sum13 += a1*b3;
1151      // a[2]*b[0]
1152      sum20 += a2*b0;
1153      // a[2]*b[1]
1154      sum21 += a2*b1;
1155      // a[2]*b[2]
1156      sum22 += a2*b2;
1157      // a[2]*b[3]
1158      sum23 += a2*b3;
1159      // a[3]*b[0]
1160      sum30 += a3*b0;
1161      // a[3]*b[1]
1162      sum31 += a3*b1;
1163      // a[3]*b[2]
1164      sum32 += a3*b2;
1165      // a[3]*b[3]
1166      sum33 += a3*b3;
1167   }
1168 #ifdef BLIS_ENABLE_CYCLE_COUNT
1169           counter_end=TSCL;
1170           if (lib_get_coreID () == 0)
1171                   printf("%d\t%d\t",k, counter_end-counter_start);
1172 #endif
1173 #ifdef BLIS_ENABLE_CYCLE_COUNT
1174   TSCL = 0;
1175   counter_start = TSCL;
1176 #endif
1177
1178         double* restrict cptr;
1179         // 0th Column
1180         // updating C[00]
1181     cptr = c;
1182     *cptr = *cptr * be;
1183     *cptr += sum00 * al;
1184
1185     // updating C[10]
1186     cptr += rs_c;
1187     *cptr = *cptr*be;
1188     *cptr += sum10 * al;
1189
1190     // updating C[20]
1191     cptr += rs_c;
1192     *cptr = *cptr*be;
1193     *cptr += sum20 * al;
1194
1195     // updating C[30]
1196     cptr += rs_c;
1197     *cptr = *cptr*be;
1198     *cptr += sum30 * al;
1199
1200     // 1st column
1201     // updating C[01]
1202     cptr = c + cs_c;
1203     *cptr = *cptr*be;
1204     *cptr += sum01 * al;
1205
1206     // updating C[11]
1207     cptr += rs_c;
1208     *cptr = *cptr*be;
1209     *cptr += sum11 * al;
1210
1211     // updating C[21]
1212     cptr += rs_c;
1213     *cptr = *cptr*be;
1214     *cptr += sum21 * al;
1215
1216     // updating C[31]
1217     cptr += rs_c;
1218     *cptr = *cptr*be;
1219     *cptr += sum31 * al;
1220
1221     // 2nd Column
1222     // updating C[02]
1223     cptr = c + 2*cs_c;
1224     *cptr = *cptr*be;
1225     *cptr += sum02 * al;
1226
1227     // updating C[12]
1228     cptr += rs_c;
1229     *cptr = *cptr*be;
1230     *cptr += sum12 * al;
1231
1232     // updating C[22]
1233     cptr += rs_c;
1234     *cptr = *cptr*be;
1235     *cptr += sum22 * al;
1236
1237     // updating C[32]
1238     cptr += rs_c;
1239     *cptr = *cptr*be;
1240     *cptr += sum32 * al;
1241
1242     // 3rd Column
1243     // updating C[03]
1244     cptr = c + 3*cs_c;
1245     *cptr = *cptr*be;
1246     *cptr += sum03 * al;
1247
1248     // updating C[13]
1249     cptr += rs_c;
1250     *cptr = *cptr*be;
1251     *cptr += sum13 * al;
1252
1253     // updating C[23]
1254     cptr += rs_c;
1255     *cptr = *cptr*be;
1256     *cptr += sum23 * al;
1257
1258     // updating C[33]
1259     cptr += rs_c;
1260     *cptr = *cptr*be;
1261     *cptr += sum33 * al;
1262 #ifdef BLIS_ENABLE_CYCLE_COUNT
1263           counter_end=TSCL;
1264           if (lib_get_coreID () == 0)
1265                   printf("%d\n", counter_end-counter_start);
1266 #endif
1267           return;
1268 }
1269
1270 void bli_cgemm_ukernel_2x4(
1271                                                         dim_t              k,
1272                                                         scomplex* restrict alpha,
1273                                                         scomplex* restrict a,
1274                                                         scomplex* restrict b,
1275                                                         scomplex* restrict beta,
1276                                                         scomplex* restrict c, inc_t rs_c, inc_t cs_c,
1277                                                         auxinfo_t*         data
1278                                                   )
1279 {
1280         __float2_t sum00a, sum10a, sum00b, sum10b;
1281         __float2_t sum01a, sum11a, sum01b, sum11b;
1282         __float2_t sum02a, sum12a, sum02b, sum12b;
1283         __float2_t sum03a, sum13a, sum03b, sum13b;
1284         __float2_t * restrict ptrB     = (__float2_t *) b;
1285         __float2_t * restrict ptrA     = (__float2_t *) a;
1286         __float2_t * restrict ptrC;
1287         __float2_t regA, regB, regC;
1288         int_least16_t index;
1289
1290 #ifdef BLIS_ENABLE_CYCLE_COUNT
1291           volatile int counter_start;
1292           volatile int counter_end;
1293 #endif
1294
1295         // zero out accumulators
1296         sum00a = 0.0;
1297         sum10a = 0.0;
1298         sum01a = 0.0;
1299         sum11a = 0.0;
1300         sum02a = 0.0;
1301         sum12a = 0.0;
1302         sum03a = 0.0;
1303         sum13a = 0.0;
1304         sum00b = 0.0;
1305         sum10b = 0.0;
1306         sum01b = 0.0;
1307         sum11b = 0.0;
1308         sum02b = 0.0;
1309         sum12b = 0.0;
1310         sum03b = 0.0;
1311         sum13b = 0.0;
1312
1313 #ifdef BLIS_ENABLE_CYCLE_COUNT
1314           TSCL=0;
1315           counter_start = TSCL;
1316 #endif
1317
1318         for (index = 0; index < k; index++)
1319         { // loop over k;
1320         // each iteration performs rank one update of 2x1 by 1x4
1321                 // matrices of A and B respectively; result is
1322                 // accumulated over 2x4 matrix
1323                 __float2_t b0, b1, b2, b3, a0, a1;
1324                 __x128_t   reg128;
1325
1326                 a0 = *ptrA++;
1327                 a1 = *ptrA++;
1328
1329                 b0 = *ptrB++;
1330                 b1 = *ptrB++;
1331                 b2 = *ptrB++;
1332                 b3 = *ptrB++;
1333
1334                 // the four partial sums are accumulated independently
1335                 // a[0]*b[0]
1336                 reg128 = _cmpysp(a0, b0);
1337                 sum00a = _daddsp(sum00a, _lof2_128(reg128));
1338                 sum00b = _daddsp(sum00b, _hif2_128(reg128));
1339
1340                 // a[1]*b[0]
1341                 reg128 = _cmpysp(a1, b0);
1342                 sum10a = _daddsp(sum10a, _lof2_128(reg128));
1343                 sum10b = _daddsp(sum10b, _hif2_128(reg128));
1344
1345                 // a[0]*b[1]
1346                 reg128 = _cmpysp(a0, b1);
1347                 sum01a = _daddsp(sum01a, _lof2_128(reg128));
1348                 sum01b = _daddsp(sum01b, _hif2_128(reg128));
1349
1350                 // a[1]*b[1]
1351                 reg128 = _cmpysp(a1, b1);
1352                 sum11a = _daddsp(sum11a, _lof2_128(reg128));
1353                 sum11b = _daddsp(sum11b, _hif2_128(reg128));
1354
1355                 // a[0]*b[2]
1356                 reg128 = _cmpysp(a0, b2);
1357                 sum02a = _daddsp(sum02a, _lof2_128(reg128));
1358                 sum02b = _daddsp(sum02b, _hif2_128(reg128));
1359
1360                 // a[1]*b[2]
1361                 reg128 = _cmpysp(a1, b2);
1362                 sum12a = _daddsp(sum12a, _lof2_128(reg128));
1363                 sum12b = _daddsp(sum12b, _hif2_128(reg128));
1364
1365                 // a[0]*b[3]
1366                 reg128 = _cmpysp(a0, b3);
1367                 sum03a = _daddsp(sum03a, _lof2_128(reg128));
1368                 sum03b = _daddsp(sum03b, _hif2_128(reg128));
1369
1370                 // a[1]*b[3]
1371                 reg128 = _cmpysp(a1, b3);
1372                 sum13a = _daddsp(sum13a, _lof2_128(reg128));
1373                 sum13b = _daddsp(sum13b, _hif2_128(reg128));
1374         }
1375
1376 #ifdef BLIS_ENABLE_CYCLE_COUNT
1377           counter_end=TSCL;
1378           if (lib_get_coreID () == 0)
1379                   printf("%d\t%d\t",k, counter_end-counter_start);
1380 #endif
1381
1382 #ifdef BLIS_ENABLE_CYCLE_COUNT
1383           TSCL=0;
1384           counter_start = TSCL;
1385 #endif
1386         {
1387                 __x128_t   reg128;
1388                 ptrA  = (__float2_t *) alpha;
1389                 ptrB =  (__float2_t *) beta;
1390                 regA  = *ptrA;
1391                 regB = *ptrB;
1392
1393                 // the value of a and the final values need to be
1394                 // rearranged due to the specific way cmpysp assumes
1395                 // data arrangement
1396                 regA  =_ftof2(-_lof(regA), _hif(regA));
1397                 //regB = _ftof2(_lof(regB),_hif(regB));
1398                 ptrC  = (__float2_t *) c;
1399
1400                 // update and save c[0,0]
1401                 sum00a = _daddsp(sum00a, sum00b);
1402                 reg128 = _cmpysp(regA, sum00a);
1403                 sum00a = _daddsp(_hif2_128(reg128),_lof2_128(reg128));
1404                 regC = *ptrC;
1405                 //regC = _ftof2(_lof(regC), _hif(regC));
1406                 reg128 = _cmpysp(regC,regB);
1407                 regC = _daddsp(_lof2_128(reg128),_hif2_128(reg128));
1408                 *ptrC = _daddsp(_ftof2(-_lof(sum00a),_hif(sum00a)),_ftof2(_lof(regC),-_hif(regC)));
1409
1410                 ptrC = (__float2_t *) c + rs_c;
1411
1412                 // update and save c[1,0]
1413                 sum10a = _daddsp(sum10a, sum10b);
1414                 reg128 = _cmpysp(regA, sum10a);
1415                 sum10a = _daddsp(_hif2_128(reg128),_lof2_128(reg128));
1416                 regC = *ptrC;
1417                 //regC = _ftof2(_lof(regC), _hif(regC));
1418                 reg128 = _cmpysp(regC,regB);
1419                 regC = _daddsp(_lof2_128(reg128),_hif2_128(reg128));
1420                 *ptrC = _daddsp(_ftof2(-_lof(sum10a),_hif(sum10a)),_ftof2(_lof(regC),-_hif(regC)));
1421
1422
1423                 ptrC = (__float2_t *) c + cs_c;
1424
1425                 // update and save c[0,1]
1426                 sum01a = _daddsp(sum01a, sum01b);
1427                 reg128 = _cmpysp(regA, sum01a);
1428                 sum01a = _daddsp(_hif2_128(reg128),_lof2_128(reg128));
1429                 regC = *ptrC;
1430                 //regC = _ftof2(_lof(regC), _hif(regC));
1431                 reg128 = _cmpysp(regC,regB);
1432                 regC = _daddsp(_lof2_128(reg128),_hif2_128(reg128));
1433                 *ptrC = _daddsp(_ftof2(-_lof(sum01a),_hif(sum01a)),_ftof2(_lof(regC),-_hif(regC)));
1434
1435                 ptrC = (__float2_t *) c + rs_c + cs_c;
1436
1437                 // update and save c[1,1]
1438                 sum11a = _daddsp(sum11a, sum11b);
1439                 reg128 = _cmpysp(regA, sum11a);
1440                 sum11a = _daddsp(_hif2_128(reg128),_lof2_128(reg128));
1441                 regC = *ptrC;
1442                 //regC = _ftof2(_lof(regC), _hif(regC));
1443                 reg128 = _cmpysp(regC,regB);
1444                 regC = _daddsp(_lof2_128(reg128),_hif2_128(reg128));
1445                 *ptrC = _daddsp(_ftof2(-_lof(sum11a),_hif(sum11a)),_ftof2(_lof(regC),-_hif(regC)));
1446
1447                 ptrC = (__float2_t *) c + 2 * cs_c;
1448
1449                 // update and save c[0,2]
1450                 sum02a = _daddsp(sum02a, sum02b);
1451                 reg128 = _cmpysp(regA, sum02a);
1452                 sum02a = _daddsp(_hif2_128(reg128),_lof2_128(reg128));
1453                 regC = *ptrC;
1454                 //regC = _ftof2(_lof(regC), _hif(regC));
1455                 reg128 = _cmpysp(regC,regB);
1456                 regC = _daddsp(_lof2_128(reg128),_hif2_128(reg128));
1457                 *ptrC = _daddsp(_ftof2(-_lof(sum02a),_hif(sum02a)),_ftof2(_lof(regC),-_hif(regC)));
1458
1459                 ptrC = (__float2_t *) c + rs_c + 2* cs_c;
1460
1461                 // update and save c[1,2]
1462                 sum12a = _daddsp(sum12a, sum12b);
1463                 reg128 = _cmpysp(regA, sum12a);
1464                 sum12a = _daddsp(_hif2_128(reg128),_lof2_128(reg128));
1465                 regC = *ptrC;
1466                 //regC = _ftof2(_lof(regC), _hif(regC));
1467                 reg128 = _cmpysp(regC,regB);
1468                 regC = _daddsp(_lof2_128(reg128),_hif2_128(reg128));
1469                 *ptrC = _daddsp(_ftof2(-_lof(sum12a),_hif(sum12a)),_ftof2(_lof(regC),-_hif(regC)));
1470
1471                 ptrC = (__float2_t *) c + 3 * cs_c;
1472
1473                 // update and save c[0,3]
1474                 sum03a = _daddsp(sum03a, sum03b);
1475                 reg128 = _cmpysp(regA, sum03a);
1476                 sum03a = _daddsp(_hif2_128(reg128),_lof2_128(reg128));
1477                 regC = *ptrC;
1478                 //regC = _ftof2(_lof(regC), _hif(regC));
1479                 reg128 = _cmpysp(regC,regB);
1480                 regC = _daddsp(_lof2_128(reg128),_hif2_128(reg128));
1481                 *ptrC = _daddsp(_ftof2(-_lof(sum03a),_hif(sum03a)),_ftof2(_lof(regC),-_hif(regC)));
1482
1483                 ptrC = (__float2_t *) c + rs_c + 3 * cs_c;
1484
1485                 // update and save c[1,3]
1486                 sum13a = _daddsp(sum13a, sum13b);
1487                 reg128 = _cmpysp(regA, sum13a);
1488                 sum13a = _daddsp(_hif2_128(reg128),_lof2_128(reg128));
1489                 regC = *ptrC;
1490                 //regC = _ftof2(_lof(regC), _hif(regC));
1491                 reg128 = _cmpysp(regC,regB);
1492                 regC = _daddsp(_lof2_128(reg128),_hif2_128(reg128));
1493                 *ptrC = _daddsp(_ftof2(-_lof(sum13a),_hif(sum13a)),_ftof2(_lof(regC),-_hif(regC)));
1494
1495         }
1496
1497 #ifdef BLIS_ENABLE_CYCLE_COUNT
1498         counter_end=TSCL;
1499         if (lib_get_coreID () == 0)
1500                 printf("%d\n", counter_end-counter_start);
1501 #endif
1502         return;
1503 }
1504
1505 void bli_zgemm_ukernel_2x2(
1506                                                         dim_t              k,
1507                                                         dcomplex* restrict alpha,
1508                                                         dcomplex* restrict a,
1509                                                         dcomplex* restrict b,
1510                                                         dcomplex* restrict beta,
1511                                                         dcomplex* restrict c, inc_t rs_c, inc_t cs_c,
1512                                                         auxinfo_t*         data
1513                                                   )
1514 {
1515
1516         double * restrict ptrA = (double *) a;
1517         double * restrict ptrB = (double *) b;
1518         //double * restrict ptrC = (double *) c;
1519         double sum00r, sum00i;
1520         int index;
1521         int kEven = k&0xFFFE;
1522 #ifdef BLIS_ENABLE_CYCLE_COUNT
1523           volatile int counter_start;
1524           volatile int counter_end;
1525 #endif
1526
1527         sum00r = 0.0;
1528         sum00i = 0.0;
1529 #ifdef BLIS_ENABLE_CYCLE_COUNT
1530   TSCL = 0;
1531   counter_start = TSCL;
1532 #endif
1533
1534         if(k>4) // The loop is safe for k > 4
1535         {
1536 #pragma UNROLL(2)
1537                 for(index = 0; index<kEven; index++)
1538                 { // loop over k;
1539                         // each iteration performs rank one update of 1x1 by 1x1
1540                         // matrices of A and B respectively; result is
1541                         // accumulated over 1x1 matrix
1542                         double a0r, a0i;
1543                         double b0r, b0i;
1544
1545                         a0r = *ptrA++;
1546                         a0i = *ptrA++;
1547
1548                         b0r = *ptrB++;
1549                         b0i = *ptrB++;
1550
1551                         sum00r += a0r*b0r;
1552                         sum00r -= a0i*b0i;
1553                         sum00i += a0r*b0i;
1554                         sum00i += a0i*b0r;
1555
1556                 }
1557                 if(k&1) // odd k; one left to do
1558                 {
1559                         double a0r, a0i;
1560                         double b0r, b0i;
1561
1562                         a0r = *ptrA++;
1563                         a0i = *ptrA++;
1564
1565                         b0r = *ptrB++;
1566                         b0i = *ptrB++;
1567
1568                         sum00r += a0r*b0r;
1569                         sum00r -= a0i*b0i;
1570                         sum00i += a0r*b0i;
1571                         sum00i += a0i*b0r;
1572                 }
1573         }
1574         else
1575         {
1576                 if(k>0)
1577                 {
1578                         double a0r, a0i;
1579                         double b0r, b0i;
1580
1581                         a0r = *ptrA++;
1582                         a0i = *ptrA++;
1583
1584                         b0r = *ptrB++;
1585                         b0i = *ptrB++;
1586
1587                         sum00r += a0r*b0r;
1588                         sum00r -= a0i*b0i;
1589                         sum00i += a0r*b0i;
1590                         sum00i += a0i*b0r;
1591                 }
1592                 if(k>1)
1593                 {
1594                         double a0r, a0i;
1595                         double b0r, b0i;
1596
1597                         a0r = *ptrA++;
1598                         a0i = *ptrA++;
1599
1600                         b0r = *ptrB++;
1601                         b0i = *ptrB++;
1602
1603                         sum00r += a0r*b0r;
1604                         sum00r -= a0i*b0i;
1605                         sum00i += a0r*b0i;
1606                         sum00i += a0i*b0r;
1607                 }
1608                 if(k>2)
1609                 {
1610                         double a0r, a0i;
1611                         double b0r, b0i;
1612
1613                         a0r = *ptrA++;
1614                         a0i = *ptrA++;
1615
1616                         b0r = *ptrB++;
1617                         b0i = *ptrB++;
1618
1619                         sum00r += a0r*b0r;
1620                         sum00r -= a0i*b0i;
1621                         sum00i += a0r*b0i;
1622                         sum00i += a0i*b0r;
1623                 }
1624                 if(k>3)
1625                 {
1626                         double a0r, a0i;
1627                         double b0r, b0i;
1628
1629                         a0r = *ptrA++;
1630                         a0i = *ptrA++;
1631
1632                         b0r = *ptrB++;
1633                         b0i = *ptrB++;
1634
1635                         sum00r += a0r*b0r;
1636                         sum00r -= a0i*b0i;
1637                         sum00i += a0r*b0i;
1638                         sum00i += a0i*b0r;
1639                 }
1640
1641         }
1642
1643 #ifdef BLIS_ENABLE_CYCLE_COUNT
1644           counter_end=TSCL;
1645           if (lib_get_coreID () == 0)
1646                   printf("%d\t%d\t",k, counter_end-counter_start);
1647 #endif
1648
1649 #ifdef BLIS_ENABLE_CYCLE_COUNT
1650           TSCL=0;
1651           counter_start = TSCL;
1652 #endif
1653
1654         {  // final saving
1655                 double alphar, alphai, betar, betai, cr, ci;
1656                 alphar = alpha->real;
1657                 alphai = alpha->imag;
1658                 betar  = beta->real;
1659                 betai  = beta->imag;
1660
1661                 cr = c->real;
1662                 ci = c->imag;
1663
1664                 c->imag = (betar * ci + betai * cr);
1665                 c->real = (betar * cr - betai * ci);
1666                 c->real += (alphar * sum00r - alphai * sum00i);
1667                 c->imag += (alphar * sum00i + alphai * sum00r);
1668         }
1669 #ifdef BLIS_ENABLE_CYCLE_COUNT
1670           counter_end=TSCL;
1671           if (lib_get_coreID () == 0)
1672                   printf("%d\n", counter_end-counter_start);
1673 #endif
1674
1675
1676                 return;
1677 }
1678
1679