]> Gitweb @ Texas Instruments - Open Source Git Repositories - git.TI.com/gitweb - dense-linear-algebra-libraries/linalg.git/blob - blis/kernels/c66x/3/bli_gemm_ukernels.c
Refactored BLIS code using LibArch API for memory transfers.
[dense-linear-algebra-libraries/linalg.git] / blis / kernels / c66x / 3 / bli_gemm_ukernels.c
1 /*
3    BLIS
4    An object-based framework for developing high-performance BLAS-like
5    libraries.
7    Copyright (C) 2014, The University of Texas
9    Redistribution and use in source and binary forms, with or without
10    modification, are permitted provided that the following conditions are
11    met:
12     - Redistributions of source code must retain the above copyright
13       notice, this list of conditions and the following disclaimer.
14     - Redistributions in binary form must reproduce the above copyright
15       notice, this list of conditions and the following disclaimer in the
16       documentation and/or other materials provided with the distribution.
17     - Neither the name of The University of Texas nor the names of its
18       contributors may be used to endorse or promote products derived
19       from this software without specific prior written permission.
21    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 */
34 #include "blis.h"
36 //#define BLIS_ENABLE_CYCLE_COUNT
38 void bli_sgemm_ukernel_4x8(
39                         dim_t              k,
40                         float*    restrict alpha,
41                         float*    restrict a,
42                         float*    restrict b,
43                         float*    restrict beta,
44                         float*    restrict c, inc_t rs_c, inc_t cs_c,
45                         auxinfo_t*         data
46                       )
47 {
48           __float2_t sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
49           __float2_t sum8, sum9, suma, sumb, sumc, sumd, sume, sumf;
50           __float2_t * restrict ptrB     = (__float2_t *) b;
51           __float2_t * restrict ptrA     = (__float2_t *) a;
52           __float2_t * restrict ptrC;
53           __float2_t regA2, regC, regS, regR;
54           int_least16_t index;
55           float* restrict c0, * restrict c1;
56           __float2_t regB2;
58 #ifdef BLIS_ENABLE_CYCLE_COUNT
59           volatile int counter_start;
60           volatile int counter_end;
61 #endif
63 #ifdef BLIS_ENABLE_CYCLE_COUNT
64           TSCL=0;
65           counter_start = TSCL;
66 #endif
67           //touch routine: both a & b
68           //Length of b = NR*K*size of float;
69 #ifdef BLIS_ENABLE_PREFETCH
70           //touch(a, k*BLIS_DEFAULT_MR_S*4);
71 #endif
73           // zero out accumulators
74           sum0 = 0.0;
75           sum1 = 0.0;
76           sum2 = 0.0;
77           sum3 = 0.0;
78           sum4 = 0.0;
79           sum5 = 0.0;
80           sum6 = 0.0;
81           sum7 = 0.0;
82           sum8 = 0.0;
83           sum9 = 0.0;
84           suma = 0.0;
85           sumb = 0.0;
86           sumc = 0.0;
87           sumd = 0.0;
88           sume = 0.0;
89           sumf = 0.0;
92           for (index = 0; index < k; index++)
93           { // loop over k;
94             // each iteration performs rank one update of 4x1 by 1x8
95             // matrices of A and B respectively; result is
96             // accumulated over 4x8 matrix
97                 __float2_t b01, b23, b45, b67, a01, a23;
98                 __x128_t   reg128;
100             a01 = *ptrA++;
101                 a23 = *ptrA++;
103             b01 = *ptrB++;
104             b23 = *ptrB++;
105             b45 = *ptrB++;
106             b67 = *ptrB++;
108             reg128 = _cmpysp(a01, b01);
109             // accumulate a[0]*b[1] and -a[0]*b[0]
110             sum0 = _daddsp(sum0, _lof2_128(reg128));
111             // accumulate a[1]*b[0] and a[1]*b[1]
112             sum1 = _daddsp(sum1, _hif2_128(reg128));
114             reg128 = _cmpysp(a01, b23);
115             // accumulate a[0]*b[3] and -a[0]*b[2]
116             sum2 = _daddsp(sum2, _lof2_128(reg128));
117             // accumulate a[1]*b[2] and a[1]*b[3]
118             sum3 = _daddsp(sum3, _hif2_128(reg128));
120             reg128 = _cmpysp(a01, b45);
121             // accumulate a[0]*b[5] and -a[0]*b[4]
122             sum4 = _daddsp(sum4, _lof2_128(reg128));
123             // accumulate a[1]*b[4] and a[1]*b[5]
124             sum5 = _daddsp(sum5, _hif2_128(reg128));
126             reg128 = _cmpysp(a01, b67);
127             // accumulate a[0]*b[7] and -a[0]*b[6]
128             sum6 = _daddsp(sum6, _lof2_128(reg128));
129             // accumulate a[1]*b[6] and a[1]*b[7]
130             sum7 = _daddsp(sum7, _hif2_128(reg128));
132             reg128 = _cmpysp(a23, b01);
133             // accumulate a[2]*b[1] and -a[2]*b[0]
134             sum8 = _daddsp(sum8, _lof2_128(reg128));
135             // accumulate a[3]*b[0] and a[3]*b[1]
136             sum9 = _daddsp(sum9, _hif2_128(reg128));
138             reg128 = _cmpysp(a23, b23);
139             // accumulate a[2]*b[3] and -a[2]*b[2]
140             suma = _daddsp(suma, _lof2_128(reg128));
141             // accumulate a[3]*b[2] and a[3]*b[3]
142             sumb = _daddsp(sumb, _hif2_128(reg128));
144             reg128 = _cmpysp(a23, b45);
145             // accumulate a[2]*b[5] and -a[2]*b[4]
146             sumc = _daddsp(sumc, _lof2_128(reg128));
147             // accumulate a[3]*b[4] and a[3]*b[5]
148             sumd = _daddsp(sumd, _hif2_128(reg128));
150             reg128 = _cmpysp(a23, b67);
151             // accumulate a[2]*b[7] and -a[2]*b[6]
152             sume = _daddsp(sume, _lof2_128(reg128));
153             // accumulate a[3]*b[6] and a[3]*b[7]
154             sumf = _daddsp(sumf, _hif2_128(reg128));
156           }
158 #ifdef BLIS_ENABLE_CYCLE_COUNT
159           counter_end=TSCL;
160           if (lib_get_coreID () == 0)
161                   printf("%d\t%d\t",k, counter_end-counter_start);
162 #endif
164 #ifdef BLIS_ENABLE_CYCLE_COUNT
165           TSCL=0;
166           counter_start = TSCL;
167 #endif
168             regA2 = _ftof2(*alpha, *alpha);
169             regB2 = _ftof2(*beta, *beta);
170             if (rs_c != 1)
171             {
172                 // update c[0,0] and c[1,0]
173                 c0 = (c + 0*rs_c + 0*cs_c);
174                 c1 = (c + 1*rs_c + 0*cs_c);
175                 regC = _ftof2(*c1,*c0);
176                 regC = _dmpysp(regC, regB2);
177                 regS = _ftof2(_lof2(sum1),-_hif2(sum0));
178                 regR = _daddsp(_dmpysp(regA2, regS), regC);
179                 *c0 = _lof2(regR);
180                 *c1 = _hif2(regR);
182                 //update c[0,1] and c[1,1]
183                 c0 = c0 + 1*cs_c;
184                 c1 = c1 + 1*cs_c;
185                 regC = _ftof2(*c1,*c0);
186                 regC = _dmpysp(regC, regB2);
187                 regS = _ftof2(_hif2(sum1),_lof2(sum0));
188                 regR = _daddsp(_dmpysp(regA2, regS), regC);
189                 *c0 = _lof2(regR);
190                 *c1 = _hif2(regR);
192                 //update c[0,2] and c[1,2]
193                 c0 = c0 + 1*cs_c;
194                 c1 = c1 + 1*cs_c;
195                 regC = _ftof2(*c1,*c0);
196                 regC = _dmpysp(regC, regB2);
197                 regS = _ftof2(_lof2(sum3),-_hif2(sum2));
198                 regR = _daddsp(_dmpysp(regA2, regS), regC);
199                 *c0 = _lof2(regR);
200                 *c1 = _hif2(regR);
202                 //update  c[0,3] and c[1,3]
203                 c0 = c0 + 1*cs_c;
204                 c1 = c1 + 1*cs_c;
205                 regC = _ftof2(*c1,*c0);
206                 regC = _dmpysp(regC, regB2);
207                 regS = _ftof2(_hif2(sum3),_lof2(sum2));
208                 regR = _daddsp(_dmpysp(regA2, regS), regC);
209                 *c0 = _lof2(regR);
210                 *c1 = _hif2(regR);
212                 //update c[0,4] and c[1,4]
213                 c0 = c0 + 1*cs_c;
214                 c1 = c1 + 1*cs_c;
215                 regC = _ftof2(*c1,*c0);
216                 regC = _dmpysp(regC, regB2);
217                 regS = _ftof2(_lof2(sum5),-_hif2(sum4));
218                 regR = _daddsp(_dmpysp(regA2, regS), regC);
219                 *c0 = _lof2(regR);
220                 *c1 = _hif2(regR);
222                 //update c[0,5] and c[1,5]
223                 c0 = c0 + 1*cs_c;
224                 c1 = c1 + 1*cs_c;
225                 regC = _ftof2(*c1,*c0);
226                 regC = _dmpysp(regC, regB2);
227                 regS = _ftof2(_hif2(sum5),_lof2(sum4));
228                 regR = _daddsp(_dmpysp(regA2, regS), regC);
229                 *c0 = _lof2(regR);
230                 *c1 = _hif2(regR);
232                 //update c[0,6] and c[1,6]
233                 c0 = c0 + 1*cs_c;
234                 c1 = c1 + 1*cs_c;
235                 regC = _ftof2(*c1,*c0);
236                 regC = _dmpysp(regC, regB2);
237                 regS = _ftof2(_lof2(sum7),-_hif2(sum6));
238                 regR = _daddsp(_dmpysp(regA2, regS), regC);
239                 *c0 = _lof2(regR);
240                 *c1 = _hif2(regR);
242                 //update c[0,7] and c[1,7]
243                 c0 = c0 + 1*cs_c;
244                 c1 = c1 + 1*cs_c;
245                 regC = _ftof2(*c1,*c0);
246                 regC = _dmpysp(regC, regB2);
247                 regS = _ftof2(_hif2(sum7),_lof2(sum6));
248                 regR = _daddsp(_dmpysp(regA2, regS), regC);
249                 *c0 = _lof2(regR);
250                 *c1 = _hif2(regR);
252                 // update c[2,0] and c[3,0]
253                 c0 = (c + 2*rs_c + 0*cs_c);
254                 c1 = (c + 3*rs_c + 0*cs_c);
255                 regC = _ftof2(*c1,*c0);
256                 regC = _dmpysp(regC, regB2);
257                 regS = _ftof2(_lof2(sum9),-_hif2(sum8));
258                 regR = _daddsp(_dmpysp(regA2, regS), regC);
259                 *c0 = _lof2(regR);
260                 *c1 = _hif2(regR);
262                 // update c[2,1] and c[3,1]
263                 c0 = c0 + 1*cs_c;
264                 c1 = c1 + 1*cs_c;
265                 regC = _ftof2(*c1,*c0);
266                 regC = _dmpysp(regC, regB2);
267                 regS = _ftof2(_hif2(sum9),_lof2(sum8));
268                 regR = _daddsp(_dmpysp(regA2, regS), regC);
269                 *c0 = _lof2(regR);
270                 *c1 = _hif2(regR);
272                 //update c[2,2] and c[3,2]
273                 c0 = c0 + 1*cs_c;
274                 c1 = c1 + 1*cs_c;
275                 regC = _ftof2(*c1,*c0);
276                 regC = _dmpysp(regC, regB2);
277                 regS = _ftof2(_lof2(sumb),-_hif2(suma));
278                 regR = _daddsp(_dmpysp(regA2, regS), regC);
279                 *c0 = _lof2(regR);
280                 *c1 = _hif2(regR);
282                 //update  c[2,3] and c[3,3]
283                 c0 = c0 + 1*cs_c;
284                 c1 = c1 + 1*cs_c;
285                 regC = _ftof2(*c1,*c0);
286                 regC = _dmpysp(regC, regB2);
287                 regS = _ftof2(_hif2(sumb),_lof2(suma));
288                 regR = _daddsp(_dmpysp(regA2, regS), regC);
289                 *c0 = _lof2(regR);
290                 *c1 = _hif2(regR);
292                 //update c[2,4] and c[3,4]
293                 c0 = c0 + 1*cs_c;
294                 c1 = c1 + 1*cs_c;
295                 regC = _ftof2(*c1,*c0);
296                 regC = _dmpysp(regC, regB2);
297                 regS = _ftof2(_lof2(sumd),-_hif2(sumc));
298                 regR = _daddsp(_dmpysp(regA2, regS), regC);
299                 *c0 = _lof2(regR);
300                 *c1 = _hif2(regR);
302                 //update c[2,5] and c[3,5]
303                 c0 = c0 + 1*cs_c;
304                 c1 = c1 + 1*cs_c;
305                 regC = _ftof2(*c1,*c0);
306                 regC = _dmpysp(regC, regB2);
307                 regS = _ftof2(_hif2(sumd),_lof2(sumc));
308                 regR = _daddsp(_dmpysp(regA2, regS), regC);
309                 *c0 = _lof2(regR);
310                 *c1 = _hif2(regR);
312                 //update c[2,6] and c[2,6]
313                 c0 = c0 + 1*cs_c;
314                 c1 = c1 + 1*cs_c;
315                 regC = _ftof2(*c1,*c0);
316                 regC = _dmpysp(regC, regB2);
317                 regS = _ftof2(_lof2(sumf),-_hif2(sume));
318                 regR = _daddsp(_dmpysp(regA2, regS), regC);
319                 *c0 = _lof2(regR);
320                 *c1 = _hif2(regR);
322                 //update c[2,7] and c[2,7]
323                 c0 = c0 + 1*cs_c;
324                 c1 = c1 + 1*cs_c;
325                 regC = _ftof2(*c1,*c0);
326                 regC = _dmpysp(regC, regB2);
327                 regS = _ftof2(_hif2(sumf),_lof2(sume));
328                 regR = _daddsp(_dmpysp(regA2, regS), regC);
329                 *c0 = _lof2(regR);
330                 *c1 = _hif2(regR);
332             }
333             else
334             {
335 #if 0
336                 // update c[0,0] and c[1,0]
337                 ptrC = (__float2_t *) c;
338                 regC = *ptrC;
339                 regC = _dmpysp(regC, regB2);
340                 regS = _ftof2(_lof2(sum1),-_hif2(sum0));
341                 regR = _daddsp(_dmpysp(regA2, regS), regC);
342                 *ptrC = regR;
344                 //update c[0,1] and c[1,1]
345                 ptrC += (cs_c>>1);
346                 regC = *ptrC;
347                 regC = _dmpysp(regC, regB2);
348                 regS = _ftof2(_hif2(sum1),_lof2(sum0));
349                 regR = _daddsp(_dmpysp(regA2, regS), regC);
350                 *ptrC = regR;
352                 //update c[0,2] and c[1,2]
353                 ptrC += (cs_c>>1);
354                 regC = *ptrC;
355                 regC = _dmpysp(regC, regB2);
356                 regS = _ftof2(_lof2(sum3),-_hif2(sum2));
357                 regR = _daddsp(_dmpysp(regA2, regS), regC);
358                 *ptrC = regR;
360                 //update  c[0,3] and c[1,3]
361                 ptrC += (cs_c>>1);
362                 regC = *ptrC;
363                 regC = _dmpysp(regC, regB2);
364                 regS = _ftof2(_hif2(sum3),_lof2(sum2));
365                 regR = _daddsp(_dmpysp(regA2, regS), regC);
366                 *ptrC = regR;
368                 //update c[0,4] and c[1,4]
369                 ptrC += (cs_c>>1);
370                 regC = *ptrC;
371                 regC = _dmpysp(regC, regB2);
372                 regS = _ftof2(_lof2(sum5),-_hif2(sum4));
373                 regR = _daddsp(_dmpysp(regA2, regS), regC);
374                 *ptrC = regR;
376                 //update c[0,5] and c[1,5]
377                 ptrC += (cs_c>>1);
378                 regC = *ptrC;
379                 regC = _dmpysp(regC, regB2);
380                 regS = _ftof2(_hif2(sum5),_lof2(sum4));
381                 regR = _daddsp(_dmpysp(regA2, regS), regC);
382                 *ptrC = regR;
384                 //update c[0,6] and c[1,6]
385                 ptrC += (cs_c>>1);
386                 regC = *ptrC;
387                 regC = _dmpysp(regC, regB2);
388                 regS = _ftof2(_lof2(sum7),-_hif2(sum6));
389                 regR = _daddsp(_dmpysp(regA2, regS), regC);
390                 *ptrC = regR;
392                 //update c[0,7] and c[1,7]
393                 ptrC += (cs_c>>1);
394                 regC = *ptrC;
395                 regC = _dmpysp(regC, regB2);
396                 regS = _ftof2(_hif2(sum7),_lof2(sum6));
397                 regR = _daddsp(_dmpysp(regA2, regS), regC);
398                 *ptrC = regR;
400                 // update c[2,0] and c[3,0]
401                 ptrC = (__float2_t *) (c+2);
402                 regC = *ptrC;
403                 regC = _dmpysp(regC, regB2);
404                 regS = _ftof2(_lof2(sum9),-_hif2(sum8));
405                 regR = _daddsp(_dmpysp(regA2, regS), regC);
406                 *ptrC = regR;
408                 // update c[2,1] and c[3,1]
409                 ptrC += (cs_c>>1);
410                 regC = *ptrC;
411                 regC = _dmpysp(regC, regB2);
412                 regS = _ftof2(_hif2(sum9),_lof2(sum8));
413                 regR = _daddsp(_dmpysp(regA2, regS), regC);
414                 *ptrC = regR;
416                 //update c[2,2] and c[3,2]
417                 ptrC += (cs_c>>1);
418                 regC = *ptrC;
419                 regC = _dmpysp(regC, regB2);
420                 regS = _ftof2(_lof2(sumb),-_hif2(suma));
421                 regR = _daddsp(_dmpysp(regA2, regS), regC);
422                 *ptrC = regR;
424                 //update  c[2,3] and c[3,3]
425                 ptrC += (cs_c>>1);
426                 regC = *ptrC;
427                 regC = _dmpysp(regC, regB2);
428                 regS = _ftof2(_hif2(sumb),_lof2(suma));
429                 regR = _daddsp(_dmpysp(regA2, regS), regC);
430                 *ptrC = regR;
432                 //update c[2,4] and c[3,4]
433                 ptrC += (cs_c>>1);
434                 regC = *ptrC;
435                 regC = _dmpysp(regC, regB2);
436                 regS = _ftof2(_lof2(sumd),-_hif2(sumc));
437                 regR = _daddsp(_dmpysp(regA2, regS), regC);
438                 *ptrC = regR;
440                 //update c[2,5] and c[3,5]
441                 ptrC += (cs_c>>1);
442                 regC = *ptrC;
443                 regC = _dmpysp(regC, regB2);
444                 regS = _ftof2(_hif2(sumd),_lof2(sumc));
445                 regR = _daddsp(_dmpysp(regA2, regS), regC);
446                 *ptrC = regR;
448                 //update c[2,6] and c[2,6]
449                 ptrC += (cs_c>>1);
450                 regC = *ptrC;
451                 regC = _dmpysp(regC, regB2);
452                 regS = _ftof2(_lof2(sumf),-_hif2(sume));
453                 regR = _daddsp(_dmpysp(regA2, regS), regC);
454                 *ptrC = regR;
456                 //update c[2,7] and c[2,7]
457                 ptrC += (cs_c>>1);
458                 regC = *ptrC;
459                 regC = _dmpysp(regC, regB2);
460                 regS = _ftof2(_hif2(sumf),_lof2(sume));
461                 regR = _daddsp(_dmpysp(regA2, regS), regC);
462                 *ptrC = regR;
463 #else
464 /*              __float2_t c0, c1, c2, c3, c4, c5, c6, c7;
465                 __float2_t c8, c9, ca, cb, cc, cd, ce, cf;
467                 ptrC = (__float2_t *) c;
468                 c0 = *ptrC++;
469                 sum0 = _dmpysp(regA2, sum0);
470                 c1 = *ptrC--;
471                 sum1 = _dmpysp(regA2, sum1);
472                 ptrC += (cs_c>>1);
473                 c2 = *ptrC++;
474                 sum2 = _dmpysp(regA2, sum2);
475                 c3 = *ptrC--;
476                 sum3 = _dmpysp(regA2, sum3);
477                 ptrC += (cs_c>>1);
478                 c4 = *ptrC++;
479                 sum8 = _dmpysp(regA2, sum8);
480                 c5 = *ptrC--;
481                 sum9 = _dmpysp(regA2, sum9);
482                 ptrC += (cs_c>>1);
483                 c6 = *ptrC++;
484                 suma = _dmpysp(regA2, suma);
485                 c7 = *ptrC--;
486                 sumb = _dmpysp(regA2, sumb);
488                 c0 = _dmpysp(c0, regB2);
489                 c1 = _dmpysp(c1, regB2);
490                 c2 = _dmpysp(c2, regB2);
491                 c3 = _dmpysp(c3, regB2);
492                 c4 = _dmpysp(c4, regB2);
493                 c5 = _dmpysp(c5, regB2);
494                 c6 = _dmpysp(c6, regB2);
495                 c7 = _dmpysp(c7, regB2);
497                 // update c[0,0] and c[1,0]
498                 c0 = _daddsp(_ftof2(_lof2(sum1),-_hif2(sum0)),c0);
499                 // update c[2,0] and c[3,0]
500                 c1 = _daddsp(_ftof2(_lof2(sum9),-_hif2(sum8)),c1);
501                 //update c[0,1] and c[1,1]
502                 c2 = _daddsp(_ftof2(_hif2(sum1),_lof2(sum0)),c2);
503                 // update c[2,1] and c[3,1]
504                 c3 = _daddsp(_ftof2(_hif2(sum9),_lof2(sum8)),c3);
505                 //update c[0,2] and c[1,2]
506                 c4 = _daddsp(_ftof2(_lof2(sum3),-_hif2(sum2)),c4);
507                 //update c[2,2] and c[3,2]
508                 c5 = _daddsp(_ftof2(_lof2(sumb),-_hif2(suma)),c5);
509                 //update  c[0,3] and c[1,3]
510                 c6 = _daddsp(_ftof2(_hif2(sum3),_lof2(sum2)),c6);
511                 //update  c[2,3] and c[3,3]
512                 c7 = _daddsp(_ftof2(_hif2(sumb),_lof2(suma)),c7);
513                 //update c[0,4] and c[1,4]
515                 ptrC = (__float2_t *) c;
516                 *ptrC++ = c0;
517                 *ptrC-- = c1;
518                 ptrC += (cs_c>>1);
519                 *ptrC++ = c2;
520                 *ptrC-- = c3;
521                 ptrC += (cs_c>>1);
522                 *ptrC++ = c4;
523                 *ptrC-- = c5;
524                 ptrC += (cs_c>>1);
525                 *ptrC++ = c6;
526                 *ptrC-- = c7;
528                 ptrC = (__float2_t *) (c+(cs_c<<2));
529                 c8 = *ptrC++;
530                 c9 = *ptrC--;
531                 ptrC += (cs_c>>1);
532                 ca = *ptrC++;
533                 cb = *ptrC--;
534                 ptrC += (cs_c>>1);
535                 cc = *ptrC++;
536                 cd = *ptrC--;
537                 ptrC += (cs_c>>1);
538                 ce = *ptrC++;
539                 cf = *ptrC;
541                 sum4 = _dmpysp(regA2, sum4);
542                 sum5 = _dmpysp(regA2, sum5);
543                 sum6 = _dmpysp(regA2, sum6);
544                 sum7 = _dmpysp(regA2, sum7);
545                 sumc = _dmpysp(regA2, sumc);
546                 sumd = _dmpysp(regA2, sumd);
547                 sume = _dmpysp(regA2, sume);
548                 sumf = _dmpysp(regA2, sumf);
550                 c8 = _dmpysp(c8, regB2);
551                 c9 = _dmpysp(c9, regB2);
552                 ca = _dmpysp(ca, regB2);
553                 cb = _dmpysp(cb, regB2);
554                 cc = _dmpysp(cc, regB2);
555                 cd = _dmpysp(cd, regB2);
556                 ce = _dmpysp(ce, regB2);
557                 cf = _dmpysp(cf, regB2);
559                 c8 = _daddsp(_ftof2(_lof2(sum5),-_hif2(sum4)),c8);
560                 //update c[2,4] and c[3,4]
561                 c9 = _daddsp(_ftof2(_lof2(sumd),-_hif2(sumc)),c9);
562                 //update c[0,5] and c[1,5]
563                 ca = _daddsp(_ftof2(_hif2(sum5),_lof2(sum4)),ca);
564                 //update c[2,5] and c[3,5]
565                 cb = _daddsp(_ftof2(_hif2(sumd),_lof2(sumc)),cb);
566                 //update c[0,6] and c[1,6]
567                 cc = _daddsp(_ftof2(_lof2(sum7),-_hif2(sum6)),cc);
568                 //update c[2,6] and c[3,6]
569                 cd = _daddsp(_ftof2(_lof2(sumf),-_hif2(sume)),cd);
570                 //update c[0,7] and c[1,7]
571                 ce = _daddsp(_ftof2(_hif2(sum7),_lof2(sum6)),ce);
572                 //update c[2,7] and c[3,7]
573                 cf = _daddsp(_ftof2(_hif2(sumf),_lof2(sume)),cf);
575                 ptrC = (__float2_t *) (c+(cs_c<<2));
576                 *ptrC++ = c8;
577                 *ptrC-- = c9;
578                 ptrC += (cs_c>>1);
579                 *ptrC++ = ca;
580                 *ptrC-- = cb;
581                 ptrC += (cs_c>>1);
582                 *ptrC++ = cc;
583                 *ptrC-- = cd;
584                 ptrC += (cs_c>>1);
585                 *ptrC++ = ce;
586                 *ptrC = cf;*/
588                 __float2_t c0, c1, c2, c3, c4, c5, c6, c7;
589                 __float2_t c8, c9, ca, cb, cc, cd, ce, cf;
591                 ptrC = (__float2_t *) c;
592                 c0 = *ptrC++;
593                 c1 = *ptrC--;
594                 ptrC += (cs_c>>1);
595                 c2 = *ptrC++;
596                 c3 = *ptrC--;
597                 ptrC += (cs_c>>1);
598                 c4 = *ptrC++;
599                 c5 = *ptrC--;
600                 ptrC += (cs_c>>1);
601                 c6 = *ptrC++;
602                 c7 = *ptrC--;
603                 ptrC += (cs_c>>1);
604                 c8 = *ptrC++;
605                 c9 = *ptrC--;
606                 ptrC += (cs_c>>1);
607                 ca = *ptrC++;
608                 cb = *ptrC--;
609                 ptrC += (cs_c>>1);
610                 cc = *ptrC++;
611                 cd = *ptrC--;
612                 ptrC += (cs_c>>1);
613                 ce = *ptrC++;
614                 cf = *ptrC;
616                 sum0 = _dmpysp(regA2, sum0);
617                 sum1 = _dmpysp(regA2, sum1);
618                 sum2 = _dmpysp(regA2, sum2);
619                 sum3 = _dmpysp(regA2, sum3);
620                 sum4 = _dmpysp(regA2, sum4);
621                 sum5 = _dmpysp(regA2, sum5);
622                 sum6 = _dmpysp(regA2, sum6);
623                 sum7 = _dmpysp(regA2, sum7);
624                 sum8 = _dmpysp(regA2, sum8);
625                 sum9 = _dmpysp(regA2, sum9);
626                 suma = _dmpysp(regA2, suma);
627                 sumb = _dmpysp(regA2, sumb);
628                 sumc = _dmpysp(regA2, sumc);
629                 sumd = _dmpysp(regA2, sumd);
630                 sume = _dmpysp(regA2, sume);
631                 sumf = _dmpysp(regA2, sumf);
633                 c0 = _dmpysp(c0, regB2);
634                 c1 = _dmpysp(c1, regB2);
635                 c2 = _dmpysp(c2, regB2);
636                 c3 = _dmpysp(c3, regB2);
637                 c4 = _dmpysp(c4, regB2);
638                 c5 = _dmpysp(c5, regB2);
639                 c6 = _dmpysp(c6, regB2);
640                 c7 = _dmpysp(c7, regB2);
641                 c8 = _dmpysp(c8, regB2);
642                 c9 = _dmpysp(c9, regB2);
643                 ca = _dmpysp(ca, regB2);
644                 cb = _dmpysp(cb, regB2);
645                 cc = _dmpysp(cc, regB2);
646                 cd = _dmpysp(cd, regB2);
647                 ce = _dmpysp(ce, regB2);
648                 cf = _dmpysp(cf, regB2);
650                 // update c[0,0] and c[1,0]
651                 c0 = _daddsp(_ftof2(_lof2(sum1),-_hif2(sum0)),c0);
652                 // update c[2,0] and c[3,0]
653                 c1 = _daddsp(_ftof2(_lof2(sum9),-_hif2(sum8)),c1);
654                 //update c[0,1] and c[1,1]
655                 c2 = _daddsp(_ftof2(_hif2(sum1),_lof2(sum0)),c2);
656                 // update c[2,1] and c[3,1]
657                 c3 = _daddsp(_ftof2(_hif2(sum9),_lof2(sum8)),c3);
658                 //update c[0,2] and c[1,2]
659                 c4 = _daddsp(_ftof2(_lof2(sum3),-_hif2(sum2)),c4);
660                 //update c[2,2] and c[3,2]
661                 c5 = _daddsp(_ftof2(_lof2(sumb),-_hif2(suma)),c5);
662                 //update  c[0,3] and c[1,3]
663                 c6 = _daddsp(_ftof2(_hif2(sum3),_lof2(sum2)),c6);
664                 //update  c[2,3] and c[3,3]
665                 c7 = _daddsp(_ftof2(_hif2(sumb),_lof2(suma)),c7);
666                 //update c[0,4] and c[1,4]
667                 c8 = _daddsp(_ftof2(_lof2(sum5),-_hif2(sum4)),c8);
668                 //update c[2,4] and c[3,4]
669                 c9 = _daddsp(_ftof2(_lof2(sumd),-_hif2(sumc)),c9);
670                 //update c[0,5] and c[1,5]
671                 ca = _daddsp(_ftof2(_hif2(sum5),_lof2(sum4)),ca);
672                 //update c[2,5] and c[3,5]
673                 cb = _daddsp(_ftof2(_hif2(sumd),_lof2(sumc)),cb);
674                 //update c[0,6] and c[1,6]
675                 cc = _daddsp(_ftof2(_lof2(sum7),-_hif2(sum6)),cc);
676                 //update c[2,6] and c[3,6]
677                 cd = _daddsp(_ftof2(_lof2(sumf),-_hif2(sume)),cd);
678                 //update c[0,7] and c[1,7]
679                 ce = _daddsp(_ftof2(_hif2(sum7),_lof2(sum6)),ce);
680                 //update c[2,7] and c[3,7]
681                 cf = _daddsp(_ftof2(_hif2(sumf),_lof2(sume)),cf);
683                 ptrC = (__float2_t *) c;
684                 *ptrC++ = c0;
685                 *ptrC-- = c1;
686                 ptrC += (cs_c>>1);
687                 *ptrC++ = c2;
688                 *ptrC-- = c3;
689                 ptrC += (cs_c>>1);
690                 *ptrC++ = c4;
691                 *ptrC-- = c5;
692                 ptrC += (cs_c>>1);
693                 *ptrC++ = c6;
694                 *ptrC-- = c7;
695                 ptrC += (cs_c>>1);
696                 *ptrC++ = c8;
697                 *ptrC-- = c9;
698                 ptrC += (cs_c>>1);
699                 *ptrC++ = ca;
700                 *ptrC-- = cb;
701                 ptrC += (cs_c>>1);
702                 *ptrC++ = cc;
703                 *ptrC-- = cd;
704                 ptrC += (cs_c>>1);
705                 *ptrC++ = ce;
706                 *ptrC = cf;
709 #endif
710             }
711 #ifdef BLIS_ENABLE_CYCLE_COUNT
712           counter_end=TSCL;
713           if (lib_get_coreID () == 0)
714                   printf("%d\n", counter_end-counter_start);
715 #endif
719 void bli_sgemm_ukernel_4x4(
720                         dim_t              k,
721                         float*    restrict alpha,
722                         float*    restrict a,
723                         float*    restrict b,
724                         float*    restrict beta,
725                         float*    restrict c, inc_t rs_c, inc_t cs_c,
726                         auxinfo_t*         data
727                       )
729     __float2_t sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
730         __float2_t sum8, sum9, suma, sumb, sumc, sumd, sume, sumf;
731         __float2_t * restrict ptrB     = (__float2_t *) b;
732         __float2_t * restrict ptrA     = (__float2_t *) a;
733         __float2_t * restrict ptrC;
734         __float2_t regA2, regB2, regC, regS, regR;
735         float* restrict c0, * restrict c1;
736         int_least16_t index;
737         int kEven, kLeft;
739 #ifdef BLIS_ENABLE_CYCLE_COUNT
740           volatile int counter_start;
741           volatile int counter_end;
742 #endif
744 #ifdef BLIS_ENABLE_CYCLE_COUNT
745           TSCL=0;
746           counter_start = TSCL;
747 #endif
749         // zero out accumulators
750         sum0 = 0.0;
751         sum1 = 0.0;
752         sum2 = 0.0;
753         sum3 = 0.0;
754         sum4 = 0.0;
755         sum5 = 0.0;
756         sum6 = 0.0;
757         sum7 = 0.0;
758         sum8 = 0.0;
759         sum9 = 0.0;
760         suma = 0.0;
761         sumb = 0.0;
762         sumc = 0.0;
763         sumd = 0.0;
764         sume = 0.0;
765         sumf = 0.0;
767         kEven=k>>1;
768         kLeft=k&1;
769         //TSCL = 0;
770         //cycles = TSCL;
773         for (index = 0; index < kEven; index++)
774         { // loop over k;
775                 // each iteration performs rank one update of 4x1 by 1x4
776                 // matrices of A and B respectively; result is
777                 // accumulated over 4x4 matrix
778                 __float2_t b01, b23, a01, a23;
779                 __x128_t   reg128;
781                 // for even k
782                 a01 = *ptrA++;
783                 a23 = *ptrA++;
785                 b01 = *ptrB++;
786                 b23 = *ptrB++;
788                 reg128 = _cmpysp(a01, b01);
789                 // accumulate a[0]*b[1] and -a[0]*b[0]
790                 sum0 = _daddsp(sum0, _lof2_128(reg128));
791                 // accumulate a[1]*b[0] and a[1]*b[1]
792                 sum1 = _daddsp(sum1, _hif2_128(reg128));
794                 reg128 = _cmpysp(a01, b23);
795                 // accumulate a[0]*b[3] and -a[0]*b[2]
796                 sum2 = _daddsp(sum2, _lof2_128(reg128));
797                 // accumulate a[1]*b[2] and a[1]*b[3]
798                 sum3 = _daddsp(sum3, _hif2_128(reg128));
800                 reg128 = _cmpysp(a23, b01);
801                 // accumulate a[2]*b[1] and -a[2]*b[0]
802                 sum4 = _daddsp(sum4, _lof2_128(reg128));
803                 // accumulate a[3]*b[0] and a[3]*b[1]
804                 sum5 = _daddsp(sum5, _hif2_128(reg128));
806                 reg128 = _cmpysp(a23, b23);
807                 // accumulate a[2]*b[3] and -a[2]*b[2]
808                 sum6 = _daddsp(sum6, _lof2_128(reg128));
809                 // accumulate a[3]*b[2] and a[3]*b[3]
810                 sum7 = _daddsp(sum7, _hif2_128(reg128));
814                 // for odd k
815                 a01 = *ptrA++;
816                 a23 = *ptrA++;
818                 b01 = *ptrB++;
819                 b23 = *ptrB++;
821                 reg128 = _cmpysp(a01, b01);
822                 // accumulate a[0]*b[1] and -a[0]*b[0]
823                 sum8 = _daddsp(sum8, _lof2_128(reg128));
824                 // accumulate a[1]*b[0] and a[1]*b[1]
825                 sum9 = _daddsp(sum9, _hif2_128(reg128));
827                 reg128 = _cmpysp(a01, b23);
828                 // accumulate a[0]*b[3] and -a[0]*b[2]
829                 suma = _daddsp(suma, _lof2_128(reg128));
830                 // accumulate a[1]*b[2] and a[1]*b[3]
831                 sumb = _daddsp(sumb, _hif2_128(reg128));
833                 reg128 = _cmpysp(a23, b01);
834                 // accumulate a[2]*b[1] and -a[2]*b[0]
835                 sumc = _daddsp(sumc, _lof2_128(reg128));
836                 // accumulate a[3]*b[0] and a[3]*b[1]
837                 sumd = _daddsp(sumd, _hif2_128(reg128));
839                 reg128 = _cmpysp(a23, b23);
840                 // accumulate a[2]*b[3] and -a[2]*b[2]
841                 sume = _daddsp(sume, _lof2_128(reg128));
842                 // accumulate a[3]*b[2] and a[3]*b[3]
843                 sumf = _daddsp(sumf, _hif2_128(reg128));
845         }
846         if(kLeft)
847         { // last k if left;
848                 __float2_t b01, b23, a01, a23;
849                 __x128_t   reg128;
851                 a01 = *ptrA++;
852                 a23 = *ptrA++;
854                 b01 = *ptrB++;
855                 b23 = *ptrB++;
857                 reg128 = _cmpysp(a01, b01);
858                 // accumulate a[0]*b[1] and -a[0]*b[0]
859                 sum0 = _daddsp(sum0, _lof2_128(reg128));
860                 // accumulate a[1]*b[0] and a[1]*b[1]
861                 sum1 = _daddsp(sum1, _hif2_128(reg128));
863                 reg128 = _cmpysp(a01, b23);
864                 // accumulate a[0]*b[3] and -a[0]*b[2]
865                 sum2 = _daddsp(sum2, _lof2_128(reg128));
866                 // accumulate a[1]*b[2] and a[1]*b[3]
867                 sum3 = _daddsp(sum3, _hif2_128(reg128));
869                 reg128 = _cmpysp(a23, b01);
870                 // accumulate a[2]*b[1] and -a[2]*b[0]
871                 sum4 = _daddsp(sum4, _lof2_128(reg128));
872                 // accumulate a[3]*b[0] and a[3]*b[1]
873                 sum5 = _daddsp(sum5, _hif2_128(reg128));
875                 reg128 = _cmpysp(a23, b23);
876                 // accumulate a[2]*b[3] and -a[2]*b[2]
877                 sum6 = _daddsp(sum6, _lof2_128(reg128));
878                 // accumulate a[3]*b[2] and a[3]*b[3]
879                 sum7 = _daddsp(sum7, _hif2_128(reg128));
881         }
882 #ifdef BLIS_ENABLE_CYCLE_COUNT
883           counter_end=TSCL;
884           if (lib_get_coreID () == 0)
885                   printf("%d\t%d\t",k, counter_end-counter_start);
886 #endif
888 #ifdef BLIS_ENABLE_CYCLE_COUNT
889           TSCL=0;
890           counter_start = TSCL;
891 #endif
892         sum0 = _daddsp(sum0, sum8);
893         sum1 = _daddsp(sum1, sum9);
894         sum2 = _daddsp(sum2, suma);
895         sum3 = _daddsp(sum3, sumb);
896         sum4 = _daddsp(sum4, sumc);
897         sum5 = _daddsp(sum5, sumd);
898         sum6 = _daddsp(sum6, sume);
899         sum7 = _daddsp(sum7, sumf);
902         regA2 = _ftof2(*alpha, *alpha);
903         regB2 = _ftof2(*beta, *beta);
904         if (rs_c != 1)
905         {
906                 // update c[0,0] and c[1,0]
907                 c0 = (c + 0*rs_c + 0*cs_c);
908                 c1 = (c + 1*rs_c + 0*cs_c);
909                 regC = _ftof2(*c1,*c0);
910                 regC = _dmpysp(regC, regB2);
911                 regS = _ftof2(_lof2(sum1),-_hif2(sum0));
912                 regR = _daddsp(_dmpysp(regA2, regS), regC);
913                 *c0 = _lof2(regR);
914                 *c1 = _hif2(regR);
916                 //update c[0,1] and c[1,1]
917                 c0 = c0 + 1*cs_c;
918                 c1 = c1 + 1*cs_c;
919                 regC = _ftof2(*c1,*c0);
920                 regC = _dmpysp(regC, regB2);
921                 regS = _ftof2(_hif2(sum1),_lof2(sum0));
922                 regR = _daddsp(_dmpysp(regA2, regS), regC);
923                 *c0 = _lof2(regR);
924                 *c1 = _hif2(regR);
926                 //update c[0,2] and c[1,2]
927                 c0 = c0 + 1*cs_c;
928                 c1 = c1 + 1*cs_c;
929                 regC = _ftof2(*c1,*c0);
930                 regC = _dmpysp(regC, regB2);
931                 regS = _ftof2(_lof2(sum3),-_hif2(sum2));
932                 regR = _daddsp(_dmpysp(regA2, regS), regC);
933                 *c0 = _lof2(regR);
934                 *c1 = _hif2(regR);
936                 //update  c[0,3] and c[1,3]
937                 c0 = c0 + 1*cs_c;
938                 c1 = c1 + 1*cs_c;
939                 regC = _ftof2(*c1,*c0);
940                 regC = _dmpysp(regC, regB2);
941                 regS = _ftof2(_hif2(sum3),_lof2(sum2));
942                 regR = _daddsp(_dmpysp(regA2, regS), regC);
943                 *c0 = _lof2(regR);
944                 *c1 = _hif2(regR);
946                 // update c[2,0] and c[3,0]
947                 c0 = (c + 2*rs_c + 0*cs_c);
948                 c1 = (c + 3*rs_c + 0*cs_c);
949                 regC = _ftof2(*c1,*c0);
950                 regC = _dmpysp(regC, regB2);
951                 regS = _ftof2(_lof2(sum5),-_hif2(sum4));
952                 regR = _daddsp(_dmpysp(regA2, regS), regC);
953                 *c0 = _lof2(regR);
954                 *c1 = _hif2(regR);
956                 // update c[2,1] and c[3,1]
957                 c0 = c0 + 1*cs_c;
958                 c1 = c1 + 1*cs_c;
959                 regC = _ftof2(*c1,*c0);
960                 regC = _dmpysp(regC, regB2);
961                 regS = _ftof2(_hif2(sum5),_lof2(sum4));
962                 regR = _daddsp(_dmpysp(regA2, regS), regC);
963                 *c0 = _lof2(regR);
964                 *c1 = _hif2(regR);
966                 //update c[2,2] and c[3,2]
967                 c0 = c0 + 1*cs_c;
968                 c1 = c1 + 1*cs_c;
969                 regC = _ftof2(*c1,*c0);
970                 regC = _dmpysp(regC, regB2);
971                 regS = _ftof2(_lof2(sum7),-_hif2(sum6));
972                 regR = _daddsp(_dmpysp(regA2, regS), regC);
973                 *c0 = _lof2(regR);
974                 *c1 = _hif2(regR);
976                 //update  c[2,3] and c[3,3]
977                 c0 = c0 + 1*cs_c;
978                 c1 = c1 + 1*cs_c;
979                 regC = _ftof2(*c1,*c0);
980                 regC = _dmpysp(regC, regB2);
981                 regS = _ftof2(_hif2(sum7),_lof2(sum6));
982                 regR = _daddsp(_dmpysp(regA2, regS), regC);
983                 *c0 = _lof2(regR);
984                 *c1 = _hif2(regR);
986         }
987         else
988         {
989                 __float2_t c0, c1, c2, c3, c4, c5, c6, c7;
991                 ptrC = (__float2_t *) c;
992                 c0 = *ptrC++;
993                 c1 = *ptrC--;
994                 ptrC += (cs_c>>1);
995                 c2 = *ptrC++;
996                 c3 = *ptrC--;
997                 ptrC += (cs_c>>1);
998                 c4 = *ptrC++;
999                 c5 = *ptrC--;
1000                 ptrC += (cs_c>>1);
1001                 c6 = *ptrC++;
1002                 c7 = *ptrC--;
1004                 sum0 = _dmpysp(regA2, sum0);
1005                 sum1 = _dmpysp(regA2, sum1);
1006                 sum2 = _dmpysp(regA2, sum2);
1007                 sum3 = _dmpysp(regA2, sum3);
1008                 sum4 = _dmpysp(regA2, sum4);
1009                 sum5 = _dmpysp(regA2, sum5);
1010                 sum6 = _dmpysp(regA2, sum6);
1011                 sum7 = _dmpysp(regA2, sum7);
1013                 c0 = _dmpysp(c0, regB2);
1014                 c1 = _dmpysp(c1, regB2);
1015                 c2 = _dmpysp(c2, regB2);
1016                 c3 = _dmpysp(c3, regB2);
1017                 c4 = _dmpysp(c4, regB2);
1018                 c5 = _dmpysp(c5, regB2);
1019                 c6 = _dmpysp(c6, regB2);
1020                 c7 = _dmpysp(c7, regB2);
1022                 // update c[0,0] and c[1,0]
1023                 c0 = _daddsp(_ftof2(_lof2(sum1),-_hif2(sum0)),c0);
1024                 // update c[2,0] and c[3,0]
1025                 c1 = _daddsp(_ftof2(_lof2(sum5),-_hif2(sum4)),c1);
1026                 //update c[0,1] and c[1,1]
1027                 c2 = _daddsp(_ftof2(_hif2(sum1),_lof2(sum0)),c2);
1028                 // update c[2,1] and c[3,1]
1029                 c3 = _daddsp(_ftof2(_hif2(sum5),_lof2(sum4)),c3);
1030                 //update c[0,2] and c[1,2]
1031                 c4 = _daddsp(_ftof2(_lof2(sum3),-_hif2(sum2)),c4);
1032                 //update c[2,2] and c[3,2]
1033                 c5 = _daddsp(_ftof2(_lof2(sum7),-_hif2(sum6)),c5);
1034                 //update  c[0,3] and c[1,3]
1035                 c6 = _daddsp(_ftof2(_hif2(sum3),_lof2(sum2)),c6);
1036                 //update  c[2,3] and c[3,3]
1037                 c7 = _daddsp(_ftof2(_hif2(sum7),_lof2(sum6)),c7);
1039                 ptrC = (__float2_t *) c;
1040                 *ptrC++ = c0;
1041                 *ptrC-- = c1;
1042                 ptrC += (cs_c>>1);
1043                 *ptrC++ = c2;
1044                 *ptrC-- = c3;
1045                 ptrC += (cs_c>>1);
1046                 *ptrC++ = c4;
1047                 *ptrC-- = c5;
1048                 ptrC += (cs_c>>1);
1049                 *ptrC++ = c6;
1050                 *ptrC-- = c7;
1052         }
1053 #ifdef BLIS_ENABLE_CYCLE_COUNT
1054           counter_end=TSCL;
1055           if (lib_get_coreID () == 0)
1056                   printf("%d\n", counter_end-counter_start);
1057 #endif
1061 //void dgemmKernel(const double *pA, const double *pB, double *pC, const double a, const int k, const int stepC)
1062 void bli_dgemm_ukernel_4x4(
1063                          dim_t              k,
1064                          double*    restrict alpha,
1065                          double*    restrict a,
1066                          double*    restrict b,
1067                          double*    restrict beta,
1068                          double*    restrict c, inc_t rs_c, inc_t cs_c,
1069                          auxinfo_t*         data
1070                           )
1072   double sum00, sum01, sum02, sum03;
1073   double sum10, sum11, sum12, sum13;
1074   double sum20, sum21, sum22, sum23;
1075   double sum30, sum31, sum32, sum33;
1076   int index;
1077   double al = *alpha;
1078   double be = *beta;
1079 #ifdef BLIS_ENABLE_CYCLE_COUNT
1080   volatile int counter_start;
1081   volatile int counter_end;
1082 #endif
1085 #ifdef BLIS_ENABLE_CYCLE_COUNT
1086   TSCL = 0;
1087   counter_start = TSCL;
1088 #endif
1089   //touch routine: both a & b
1090   //Length of b = NR*K*size of double;
1091   //Length of a = MR*K*size of double;
1092 #ifdef BLIS_ENABLE_PREFETCH
1093   //touch(b, k*BLIS_DEFAULT_NR_D*8);
1094   //touch(a, k*BLIS_DEFAULT_MR_D*8);
1095 #endif
1100   sum00 = 0.0;
1101   sum01 = 0.0;
1102   sum02 = 0.0;
1103   sum03 = 0.0;
1104   sum10 = 0.0;
1105   sum11 = 0.0;
1106   sum12 = 0.0;
1107   sum13 = 0.0;
1108   sum20 = 0.0;
1109   sum21 = 0.0;
1110   sum22 = 0.0;
1111   sum23 = 0.0;
1112   sum30 = 0.0;
1113   sum31 = 0.0;
1114   sum32 = 0.0;
1115   sum33 = 0.0;
1118   for(index = 0; index < k; index++)
1119   {  // loop over k;
1120          // each iteration performs rank one update of 4x1 by 1x4
1121          // matrices of A and B respectively; result is
1122      // accumulated over 4x4 matrix
1123      register double a0, a1, a2, a3;
1124      register double b0, b1, b2, b3;
1126      a0 = *a++;
1127      a1 = *a++;
1128      a2 = *a++;
1129      a3 = *a++;
1130      b0 = *b++;
1131      b1 = *b++;
1132      b2 = *b++;
1133      b3 = *b++;
1135      // a[0]*b[0]
1136      sum00 += a0*b0;
1137      // a[0]*b[1]
1138      sum01 += a0*b1;
1139      // a[0]*b[2]
1140      sum02 += a0*b2;
1141      // a[0]*b[3]
1142      sum03 += a0*b3;
1143      // a[1]*b[0]
1144      sum10 += a1*b0;
1145      // a[1]*b[1]
1146      sum11 += a1*b1;
1147      // a[1]*b[2]
1148      sum12 += a1*b2;
1149      // a[1]*b[3]
1150      sum13 += a1*b3;
1151      // a[2]*b[0]
1152      sum20 += a2*b0;
1153      // a[2]*b[1]
1154      sum21 += a2*b1;
1155      // a[2]*b[2]
1156      sum22 += a2*b2;
1157      // a[2]*b[3]
1158      sum23 += a2*b3;
1159      // a[3]*b[0]
1160      sum30 += a3*b0;
1161      // a[3]*b[1]
1162      sum31 += a3*b1;
1163      // a[3]*b[2]
1164      sum32 += a3*b2;
1165      // a[3]*b[3]
1166      sum33 += a3*b3;
1167   }
1168 #ifdef BLIS_ENABLE_CYCLE_COUNT
1169           counter_end=TSCL;
1170           if (lib_get_coreID () == 0)
1171                   printf("%d\t%d\t",k, counter_end-counter_start);
1172 #endif
1173 #ifdef BLIS_ENABLE_CYCLE_COUNT
1174   TSCL = 0;
1175   counter_start = TSCL;
1176 #endif
1178         double* restrict cptr;
1179         // 0th Column
1180         // updating C[00]
1181     cptr = c;
1182     *cptr = *cptr * be;
1183     *cptr += sum00 * al;
1185     // updating C[10]
1186     cptr += rs_c;
1187     *cptr = *cptr*be;
1188     *cptr += sum10 * al;
1190     // updating C[20]
1191     cptr += rs_c;
1192     *cptr = *cptr*be;
1193     *cptr += sum20 * al;
1195     // updating C[30]
1196     cptr += rs_c;
1197     *cptr = *cptr*be;
1198     *cptr += sum30 * al;
1200     // 1st column
1201     // updating C[01]
1202     cptr = c + cs_c;
1203     *cptr = *cptr*be;
1204     *cptr += sum01 * al;
1206     // updating C[11]
1207     cptr += rs_c;
1208     *cptr = *cptr*be;
1209     *cptr += sum11 * al;
1211     // updating C[21]
1212     cptr += rs_c;
1213     *cptr = *cptr*be;
1214     *cptr += sum21 * al;
1216     // updating C[31]
1217     cptr += rs_c;
1218     *cptr = *cptr*be;
1219     *cptr += sum31 * al;
1221     // 2nd Column
1222     // updating C[02]
1223     cptr = c + 2*cs_c;
1224     *cptr = *cptr*be;
1225     *cptr += sum02 * al;
1227     // updating C[12]
1228     cptr += rs_c;
1229     *cptr = *cptr*be;
1230     *cptr += sum12 * al;
1232     // updating C[22]
1233     cptr += rs_c;
1234     *cptr = *cptr*be;
1235     *cptr += sum22 * al;
1237     // updating C[32]
1238     cptr += rs_c;
1239     *cptr = *cptr*be;
1240     *cptr += sum32 * al;
1242     // 3rd Column
1243     // updating C[03]
1244     cptr = c + 3*cs_c;
1245     *cptr = *cptr*be;
1246     *cptr += sum03 * al;
1248     // updating C[13]
1249     cptr += rs_c;
1250     *cptr = *cptr*be;
1251     *cptr += sum13 * al;
1253     // updating C[23]
1254     cptr += rs_c;
1255     *cptr = *cptr*be;
1256     *cptr += sum23 * al;
1258     // updating C[33]
1259     cptr += rs_c;
1260     *cptr = *cptr*be;
1261     *cptr += sum33 * al;
1262 #ifdef BLIS_ENABLE_CYCLE_COUNT
1263           counter_end=TSCL;
1264           if (lib_get_coreID () == 0)
1265                   printf("%d\n", counter_end-counter_start);
1266 #endif
1267           return;
1270 void bli_cgemm_ukernel_2x4(
1271                                                         dim_t              k,
1272                                                         scomplex* restrict alpha,
1273                                                         scomplex* restrict a,
1274                                                         scomplex* restrict b,
1275                                                         scomplex* restrict beta,
1276                                                         scomplex* restrict c, inc_t rs_c, inc_t cs_c,
1277                                                         auxinfo_t*         data
1278                                                   )
1280         __float2_t sum00a, sum10a, sum00b, sum10b;
1281         __float2_t sum01a, sum11a, sum01b, sum11b;
1282         __float2_t sum02a, sum12a, sum02b, sum12b;
1283         __float2_t sum03a, sum13a, sum03b, sum13b;
1284         __float2_t * restrict ptrB     = (__float2_t *) b;
1285         __float2_t * restrict ptrA     = (__float2_t *) a;
1286         __float2_t * restrict ptrC;
1287         __float2_t regA, regB, regC;
1288         int_least16_t index;
1290 #ifdef BLIS_ENABLE_CYCLE_COUNT
1291           volatile int counter_start;
1292           volatile int counter_end;
1293 #endif
1295         // zero out accumulators
1296         sum00a = 0.0;
1297         sum10a = 0.0;
1298         sum01a = 0.0;
1299         sum11a = 0.0;
1300         sum02a = 0.0;
1301         sum12a = 0.0;
1302         sum03a = 0.0;
1303         sum13a = 0.0;
1304         sum00b = 0.0;
1305         sum10b = 0.0;
1306         sum01b = 0.0;
1307         sum11b = 0.0;
1308         sum02b = 0.0;
1309         sum12b = 0.0;
1310         sum03b = 0.0;
1311         sum13b = 0.0;
1313 #ifdef BLIS_ENABLE_CYCLE_COUNT
1314           TSCL=0;
1315           counter_start = TSCL;
1316 #endif
1318         for (index = 0; index < k; index++)
1319         { // loop over k;
1320         // each iteration performs rank one update of 2x1 by 1x4
1321                 // matrices of A and B respectively; result is
1322                 // accumulated over 2x4 matrix
1323                 __float2_t b0, b1, b2, b3, a0, a1;
1324                 __x128_t   reg128;
1326                 a0 = *ptrA++;
1327                 a1 = *ptrA++;
1329                 b0 = *ptrB++;
1330                 b1 = *ptrB++;
1331                 b2 = *ptrB++;
1332                 b3 = *ptrB++;
1334                 // the four partial sums are accumulated independently
1335                 // a[0]*b[0]
1336                 reg128 = _cmpysp(a0, b0);
1337                 sum00a = _daddsp(sum00a, _lof2_128(reg128));
1338                 sum00b = _daddsp(sum00b, _hif2_128(reg128));
1340                 // a[1]*b[0]
1341                 reg128 = _cmpysp(a1, b0);
1342                 sum10a = _daddsp(sum10a, _lof2_128(reg128));
1343                 sum10b = _daddsp(sum10b, _hif2_128(reg128));
1345                 // a[0]*b[1]
1346                 reg128 = _cmpysp(a0, b1);
1347                 sum01a = _daddsp(sum01a, _lof2_128(reg128));
1348                 sum01b = _daddsp(sum01b, _hif2_128(reg128));
1350                 // a[1]*b[1]
1351                 reg128 = _cmpysp(a1, b1);
1352                 sum11a = _daddsp(sum11a, _lof2_128(reg128));
1353                 sum11b = _daddsp(sum11b, _hif2_128(reg128));
1355                 // a[0]*b[2]
1356                 reg128 = _cmpysp(a0, b2);
1357                 sum02a = _daddsp(sum02a, _lof2_128(reg128));
1358                 sum02b = _daddsp(sum02b, _hif2_128(reg128));
1360                 // a[1]*b[2]
1361                 reg128 = _cmpysp(a1, b2);
1362                 sum12a = _daddsp(sum12a, _lof2_128(reg128));
1363                 sum12b = _daddsp(sum12b, _hif2_128(reg128));
1365                 // a[0]*b[3]
1366                 reg128 = _cmpysp(a0, b3);
1367                 sum03a = _daddsp(sum03a, _lof2_128(reg128));
1368                 sum03b = _daddsp(sum03b, _hif2_128(reg128));
1370                 // a[1]*b[3]
1371                 reg128 = _cmpysp(a1, b3);
1372                 sum13a = _daddsp(sum13a, _lof2_128(reg128));
1373                 sum13b = _daddsp(sum13b, _hif2_128(reg128));
1374         }
1376 #ifdef BLIS_ENABLE_CYCLE_COUNT
1377           counter_end=TSCL;
1378           if (lib_get_coreID () == 0)
1379                   printf("%d\t%d\t",k, counter_end-counter_start);
1380 #endif
1382 #ifdef BLIS_ENABLE_CYCLE_COUNT
1383           TSCL=0;
1384           counter_start = TSCL;
1385 #endif
1386         {
1387                 __x128_t   reg128;
1388                 ptrA  = (__float2_t *) alpha;
1389                 ptrB =  (__float2_t *) beta;
1390                 regA  = *ptrA;
1391                 regB = *ptrB;
1393                 // the value of a and the final values need to be
1394                 // rearranged due to the specific way cmpysp assumes
1395                 // data arrangement
1396                 regA  =_ftof2(-_lof(regA), _hif(regA));
1397                 //regB = _ftof2(_lof(regB),_hif(regB));
1398                 ptrC  = (__float2_t *) c;
1400                 // update and save c[0,0]
1401                 sum00a = _daddsp(sum00a, sum00b);
1402                 reg128 = _cmpysp(regA, sum00a);
1403                 sum00a = _daddsp(_hif2_128(reg128),_lof2_128(reg128));
1404                 regC = *ptrC;
1405                 //regC = _ftof2(_lof(regC), _hif(regC));
1406                 reg128 = _cmpysp(regC,regB);
1407                 regC = _daddsp(_lof2_128(reg128),_hif2_128(reg128));
1408                 *ptrC = _daddsp(_ftof2(-_lof(sum00a),_hif(sum00a)),_ftof2(_lof(regC),-_hif(regC)));
1410                 ptrC = (__float2_t *) c + rs_c;
1412                 // update and save c[1,0]
1413                 sum10a = _daddsp(sum10a, sum10b);
1414                 reg128 = _cmpysp(regA, sum10a);
1415                 sum10a = _daddsp(_hif2_128(reg128),_lof2_128(reg128));
1416                 regC = *ptrC;
1417                 //regC = _ftof2(_lof(regC), _hif(regC));
1418                 reg128 = _cmpysp(regC,regB);
1419                 regC = _daddsp(_lof2_128(reg128),_hif2_128(reg128));
1420                 *ptrC = _daddsp(_ftof2(-_lof(sum10a),_hif(sum10a)),_ftof2(_lof(regC),-_hif(regC)));
1423                 ptrC = (__float2_t *) c + cs_c;
1425                 // update and save c[0,1]
1426                 sum01a = _daddsp(sum01a, sum01b);
1427                 reg128 = _cmpysp(regA, sum01a);
1428                 sum01a = _daddsp(_hif2_128(reg128),_lof2_128(reg128));
1429                 regC = *ptrC;
1430                 //regC = _ftof2(_lof(regC), _hif(regC));
1431                 reg128 = _cmpysp(regC,regB);
1432                 regC = _daddsp(_lof2_128(reg128),_hif2_128(reg128));
1433                 *ptrC = _daddsp(_ftof2(-_lof(sum01a),_hif(sum01a)),_ftof2(_lof(regC),-_hif(regC)));
1435                 ptrC = (__float2_t *) c + rs_c + cs_c;
1437                 // update and save c[1,1]
1438                 sum11a = _daddsp(sum11a, sum11b);
1439                 reg128 = _cmpysp(regA, sum11a);
1440                 sum11a = _daddsp(_hif2_128(reg128),_lof2_128(reg128));
1441                 regC = *ptrC;
1442                 //regC = _ftof2(_lof(regC), _hif(regC));
1443                 reg128 = _cmpysp(regC,regB);
1444                 regC = _daddsp(_lof2_128(reg128),_hif2_128(reg128));
1445                 *ptrC = _daddsp(_ftof2(-_lof(sum11a),_hif(sum11a)),_ftof2(_lof(regC),-_hif(regC)));
1447                 ptrC = (__float2_t *) c + 2 * cs_c;
1449                 // update and save c[0,2]
1450                 sum02a = _daddsp(sum02a, sum02b);
1451                 reg128 = _cmpysp(regA, sum02a);
1452                 sum02a = _daddsp(_hif2_128(reg128),_lof2_128(reg128));
1453                 regC = *ptrC;
1454                 //regC = _ftof2(_lof(regC), _hif(regC));
1455                 reg128 = _cmpysp(regC,regB);
1456                 regC = _daddsp(_lof2_128(reg128),_hif2_128(reg128));
1457                 *ptrC = _daddsp(_ftof2(-_lof(sum02a),_hif(sum02a)),_ftof2(_lof(regC),-_hif(regC)));
1459                 ptrC = (__float2_t *) c + rs_c + 2* cs_c;
1461                 // update and save c[1,2]
1462                 sum12a = _daddsp(sum12a, sum12b);
1463                 reg128 = _cmpysp(regA, sum12a);
1464                 sum12a = _daddsp(_hif2_128(reg128),_lof2_128(reg128));
1465                 regC = *ptrC;
1466                 //regC = _ftof2(_lof(regC), _hif(regC));
1467                 reg128 = _cmpysp(regC,regB);
1468                 regC = _daddsp(_lof2_128(reg128),_hif2_128(reg128));
1469                 *ptrC = _daddsp(_ftof2(-_lof(sum12a),_hif(sum12a)),_ftof2(_lof(regC),-_hif(regC)));
1471                 ptrC = (__float2_t *) c + 3 * cs_c;
1473                 // update and save c[0,3]
1474                 sum03a = _daddsp(sum03a, sum03b);
1475                 reg128 = _cmpysp(regA, sum03a);
1476                 sum03a = _daddsp(_hif2_128(reg128),_lof2_128(reg128));
1477                 regC = *ptrC;
1478                 //regC = _ftof2(_lof(regC), _hif(regC));
1479                 reg128 = _cmpysp(regC,regB);
1480                 regC = _daddsp(_lof2_128(reg128),_hif2_128(reg128));
1481                 *ptrC = _daddsp(_ftof2(-_lof(sum03a),_hif(sum03a)),_ftof2(_lof(regC),-_hif(regC)));
1483                 ptrC = (__float2_t *) c + rs_c + 3 * cs_c;
1485                 // update and save c[1,3]
1486                 sum13a = _daddsp(sum13a, sum13b);
1487                 reg128 = _cmpysp(regA, sum13a);
1488                 sum13a = _daddsp(_hif2_128(reg128),_lof2_128(reg128));
1489                 regC = *ptrC;
1490                 //regC = _ftof2(_lof(regC), _hif(regC));
1491                 reg128 = _cmpysp(regC,regB);
1492                 regC = _daddsp(_lof2_128(reg128),_hif2_128(reg128));
1493                 *ptrC = _daddsp(_ftof2(-_lof(sum13a),_hif(sum13a)),_ftof2(_lof(regC),-_hif(regC)));
1495         }
1497 #ifdef BLIS_ENABLE_CYCLE_COUNT
1498         counter_end=TSCL;
1499         if (lib_get_coreID () == 0)
1500                 printf("%d\n", counter_end-counter_start);
1501 #endif
1502         return;
1505 void bli_zgemm_ukernel_2x2(
1506                                                         dim_t              k,
1507                                                         dcomplex* restrict alpha,
1508                                                         dcomplex* restrict a,
1509                                                         dcomplex* restrict b,
1510                                                         dcomplex* restrict beta,
1511                                                         dcomplex* restrict c, inc_t rs_c, inc_t cs_c,
1512                                                         auxinfo_t*         data
1513                                                   )
1516         double * restrict ptrA = (double *) a;
1517         double * restrict ptrB = (double *) b;
1518         //double * restrict ptrC = (double *) c;
1519         double sum00r, sum00i;
1520         int index;
1521         int kEven = k&0xFFFE;
1522 #ifdef BLIS_ENABLE_CYCLE_COUNT
1523           volatile int counter_start;
1524           volatile int counter_end;
1525 #endif
1527         sum00r = 0.0;
1528         sum00i = 0.0;
1529 #ifdef BLIS_ENABLE_CYCLE_COUNT
1530   TSCL = 0;
1531   counter_start = TSCL;
1532 #endif
1534         if(k>4) // The loop is safe for k > 4
1535         {
1536 #pragma UNROLL(2)
1537                 for(index = 0; index<kEven; index++)
1538                 { // loop over k;
1539                         // each iteration performs rank one update of 1x1 by 1x1
1540                         // matrices of A and B respectively; result is
1541                         // accumulated over 1x1 matrix
1542                         double a0r, a0i;
1543                         double b0r, b0i;
1545                         a0r = *ptrA++;
1546                         a0i = *ptrA++;
1548                         b0r = *ptrB++;
1549                         b0i = *ptrB++;
1551                         sum00r += a0r*b0r;
1552                         sum00r -= a0i*b0i;
1553                         sum00i += a0r*b0i;
1554                         sum00i += a0i*b0r;
1556                 }
1557                 if(k&1) // odd k; one left to do
1558                 {
1559                         double a0r, a0i;
1560                         double b0r, b0i;
1562                         a0r = *ptrA++;
1563                         a0i = *ptrA++;
1565                         b0r = *ptrB++;
1566                         b0i = *ptrB++;
1568                         sum00r += a0r*b0r;
1569                         sum00r -= a0i*b0i;
1570                         sum00i += a0r*b0i;
1571                         sum00i += a0i*b0r;
1572                 }
1573         }
1574         else
1575         {
1576                 if(k>0)
1577                 {
1578                         double a0r, a0i;
1579                         double b0r, b0i;
1581                         a0r = *ptrA++;
1582                         a0i = *ptrA++;
1584                         b0r = *ptrB++;
1585                         b0i = *ptrB++;
1587                         sum00r += a0r*b0r;
1588                         sum00r -= a0i*b0i;
1589                         sum00i += a0r*b0i;
1590                         sum00i += a0i*b0r;
1591                 }
1592                 if(k>1)
1593                 {
1594                         double a0r, a0i;
1595                         double b0r, b0i;
1597                         a0r = *ptrA++;
1598                         a0i = *ptrA++;
1600                         b0r = *ptrB++;
1601                         b0i = *ptrB++;
1603                         sum00r += a0r*b0r;
1604                         sum00r -= a0i*b0i;
1605                         sum00i += a0r*b0i;
1606                         sum00i += a0i*b0r;
1607                 }
1608                 if(k>2)
1609                 {
1610                         double a0r, a0i;
1611                         double b0r, b0i;
1613                         a0r = *ptrA++;
1614                         a0i = *ptrA++;
1616                         b0r = *ptrB++;
1617                         b0i = *ptrB++;
1619                         sum00r += a0r*b0r;
1620                         sum00r -= a0i*b0i;
1621                         sum00i += a0r*b0i;
1622                         sum00i += a0i*b0r;
1623                 }
1624                 if(k>3)
1625                 {
1626                         double a0r, a0i;
1627                         double b0r, b0i;
1629                         a0r = *ptrA++;
1630                         a0i = *ptrA++;
1632                         b0r = *ptrB++;
1633                         b0i = *ptrB++;
1635                         sum00r += a0r*b0r;
1636                         sum00r -= a0i*b0i;
1637                         sum00i += a0r*b0i;
1638                         sum00i += a0i*b0r;
1639                 }
1641         }
1643 #ifdef BLIS_ENABLE_CYCLE_COUNT
1644           counter_end=TSCL;
1645           if (lib_get_coreID () == 0)
1646                   printf("%d\t%d\t",k, counter_end-counter_start);
1647 #endif
1649 #ifdef BLIS_ENABLE_CYCLE_COUNT
1650           TSCL=0;
1651           counter_start = TSCL;
1652 #endif
1654         {  // final saving
1655                 double alphar, alphai, betar, betai, cr, ci;
1656                 alphar = alpha->real;
1657                 alphai = alpha->imag;
1658                 betar  = beta->real;
1659                 betai  = beta->imag;
1661                 cr = c->real;
1662                 ci = c->imag;
1664                 c->imag = (betar * ci + betai * cr);
1665                 c->real = (betar * cr - betai * ci);
1666                 c->real += (alphar * sum00r - alphai * sum00i);
1667                 c->imag += (alphar * sum00i + alphai * sum00r);
1668         }
1669 #ifdef BLIS_ENABLE_CYCLE_COUNT
1670           counter_end=TSCL;
1671           if (lib_get_coreID () == 0)
1672                   printf("%d\n", counter_end-counter_start);
1673 #endif
1676                 return;