/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name of The University of Texas at Austin nor the names
      of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "blis.h"

#define FUNCPTR_T herk_fp

typedef void (*FUNCPTR_T)(
                           doff_t  diagoffc,
                           pack_t  schema_a,
                           pack_t  schema_b,
                           dim_t   m,
                           dim_t   n,
                           dim_t   k,
                           void*   alpha,
                           void*   a, inc_t cs_a, inc_t pd_a, inc_t ps_a,
                           void*   b, inc_t rs_b, inc_t pd_b, inc_t ps_b,
                           void*   beta,
                           void*   c, inc_t rs_c, inc_t cs_c,
                           void*   gemm_ukr,
                           herk_thrinfo_t* thread
                         );
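
// ftypes is a table of the type-specific kernels generated further below,
// one entry per datatype; bli_herk_u_ker_var2() indexes it with the
// execution datatype of C.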
static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2);

void bli_herk_u_ker_var2( obj_t*  a,
                          obj_t*  b,
                          obj_t*  c,
                          gemm_t* cntl,
                          herk_thrinfo_t* thread )
{
        num_t     dt_exec   = bli_obj_execution_datatype( *c );

        doff_t    diagoffc  = bli_obj_diag_offset( *c );

        pack_t    schema_a  = bli_obj_pack_schema( *a );
        pack_t    schema_b  = bli_obj_pack_schema( *b );

        dim_t     m         = bli_obj_length( *c );
        dim_t     n         = bli_obj_width( *c );
        dim_t     k         = bli_obj_width( *a );

        void*     buf_a     = bli_obj_buffer_at_off( *a );
        inc_t     cs_a      = bli_obj_col_stride( *a );
        inc_t     pd_a      = bli_obj_panel_dim( *a );
        inc_t     ps_a      = bli_obj_panel_stride( *a );

        void*     buf_b     = bli_obj_buffer_at_off( *b );
        inc_t     rs_b      = bli_obj_row_stride( *b );
        inc_t     pd_b      = bli_obj_panel_dim( *b );
        inc_t     ps_b      = bli_obj_panel_stride( *b );

        void*     buf_c     = bli_obj_buffer_at_off( *c );
        inc_t     rs_c      = bli_obj_row_stride( *c );
        inc_t     cs_c      = bli_obj_col_stride( *c );

        obj_t     scalar_a;
        obj_t     scalar_b;

        void*     buf_alpha;
        void*     buf_beta;

        FUNCPTR_T f;

        func_t*   gemm_ukrs;
        void*     gemm_ukr;

        // Detach and multiply the scalars attached to A and B.
        bli_obj_scalar_detach( a, &scalar_a );
        bli_obj_scalar_detach( b, &scalar_b );
        bli_mulsc( &scalar_a, &scalar_b );

        // Grab the addresses of the internal scalar buffers for the scalar
        // merged above and the scalar attached to C.
        buf_alpha = bli_obj_internal_scalar_buffer( scalar_b );
        buf_beta  = bli_obj_internal_scalar_buffer( *c );

        // Index into the type combination array to extract the correct
        // function pointer.
        f = ftypes[dt_exec];

        // Extract from the control tree node the func_t object containing
        // the gemm micro-kernel function addresses, and then query the
        // function address corresponding to the current datatype.
        gemm_ukrs = cntl_gemm_ukrs( cntl );
        gemm_ukr  = bli_func_obj_query( dt_exec, gemm_ukrs );

        // Invoke the function.
        f( diagoffc,
           schema_a,
           schema_b,
           m,
           n,
           k,
           buf_alpha,
           buf_a, cs_a, pd_a, ps_a,
           buf_b, rs_b, pd_b, ps_b,
           buf_beta,
           buf_c, rs_c, cs_c,
           gemm_ukr,
           thread );
}

#ifdef BLIS_ENABLE_C66X_MEM_POOLS

#if defined (BLIS_ENABLE_C66X_EDMA) && defined (BLIS_ENABLE_C66X_IDMA)

#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname, ukrtype ) \
\
void PASTEMAC(ch,varname)( \
                           doff_t  diagoffc, \
                           pack_t  schema_a, \
                           pack_t  schema_b, \
                           dim_t   m, \
                           dim_t   n, \
                           dim_t   k, \
                           void*   alpha, \
                           void*   a, inc_t cs_a, inc_t pd_a, inc_t ps_a, \
                           void*   b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \
                           void*   beta, \
                           void*   c, inc_t rs_c, inc_t cs_c, \
                           void*   gemm_ukr, \
                           herk_thrinfo_t* thread \
                         ) \
{ \
        /* Cast the micro-kernel address to its function pointer type. */ \
        PASTECH(ch,ukrtype) gemm_ukr_cast = (PASTECH(ch,ukrtype)) gemm_ukr; \
\
        /* Temporary C buffer for edge cases. */ \
        ctype           ct[ PASTEMAC(ch,maxmr) * \
                            PASTEMAC(ch,maxnr) ] \
                            __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
        const inc_t     rs_ct      = 1; \
        const inc_t     cs_ct      = PASTEMAC(ch,maxmr); \
\
        /* Alias some constants to simpler names. */ \
        const dim_t     MR         = pd_a; \
        const dim_t     NR         = pd_b; \
        const dim_t     PACKMR     = cs_a; \
        const dim_t     PACKNR     = rs_b; \
\
        ctype* restrict zero       = PASTEMAC(ch,0); \
        ctype* restrict a_cast     = a; \
        ctype* restrict b_cast     = b; \
        ctype* restrict c_cast     = c; \
        ctype* restrict alpha_cast = alpha; \
        ctype* restrict beta_cast  = beta; \
        ctype* restrict b1; \
        ctype* restrict c1; \
\
        doff_t          diagoffc_ij; \
        dim_t           m_iter, m_left; \
        dim_t           n_iter, n_left; \
        dim_t           m_cur; \
        dim_t           n_cur; \
        dim_t           n_next; \
        dim_t           i, j, jp; \
        inc_t           rstep_a; \
        inc_t           cstep_b; \
        /*inc_t           rstep_c; */\
        inc_t           cstep_c; \
        inc_t           rstep_c11, rs_c11, cs_c11; \
        inc_t           istep_a; \
        inc_t           istep_b; \
        auxinfo_t       aux; \
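\
        /* Thread partitioning: the jr group (thread) splits the outer loop
           over NR-wide column panels of C, and its sub-group (caucus) splits
           the inner loop over MR-tall row panels within each column panel. */ \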
        herk_thrinfo_t* caucus = herk_thread_sub_herk( thread ); \
        dim_t jr_num_threads = thread_n_way( thread ); \
        dim_t jr_thread_id   = thread_work_id( thread ); \
        dim_t ir_num_threads = thread_n_way( caucus ); \
        dim_t ir_thread_id   = thread_work_id( caucus ); \
\
        mem_t b1_L1_mem; \
        /* Note: memcpy misbehaves if b1_L1 is declared restrict; the gemm
           residual becomes nonzero if this is changed to ctype* restrict. */ \
        ctype* b1_L1; \
\
        mem_t a1_L1_mem, a2_L1_mem; \
        ctype *a1_L1, *a2_L1, *temp; \
\
        mem_t c0_L2_mem, c1_L2_mem, c2_L2_mem; \
        ctype *cNew0, *cNew1, *cNew2, *cNewTemp; \
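\
        /* Buffering scheme: micro-panels of A ping-pong between a1_L1 and
           a2_L1 in L1, the current panel of B is staged in b1_L1, and three
           L2 buffers (cNew0/1/2) rotate so that the next panel of C can be
           fetched and the previous one written back while the current one
           is being updated. */ \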
\
        /* EDMA declarations */ \
        lib_emt_Handle emt_handle_b  = NULL; \
        lib_emt_Handle emt_handle_c0 = NULL; \
        lib_emt_Handle emt_handle_c1 = NULL; \
\
        /* For DSP timing */ \
        uint64_t counter_start_ker, counter_start_nr, counter_start_mr; \
        uint64_t counter_end_ker, counter_end_nr, counter_end_mr; \
        extern profile_data_t *bli_herk_profile_data; \
\
        /*
           Assumptions/assertions:
             rs_a == 1
             cs_a == PACKMR
             pd_a == MR
             ps_a == stride to next micro-panel of A
             rs_b == PACKNR
             cs_b == 1
             pd_b == NR
             ps_b == stride to next micro-panel of B
             rs_c == (no assumptions)
             cs_c == (no assumptions)
        */ \
\
        /* If any dimension is zero, return immediately. */ \
        if ( bli_zero_dim3( m, n, k ) ) return; \
\
        /* Safeguard: If the current panel of C is entirely below the diagonal,
           it is not stored. So we do nothing. */ \
        if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \
\
        /* If there is a zero region to the left of where the diagonal of C
           intersects the top edge of the panel, adjust the pointer to C and B
           and treat this case as if the diagonal offset were zero. */ \
        if ( diagoffc > 0 ) \
        { \
                jp       = diagoffc / NR; \
                j        = jp * NR; \
                n        = n - j; \
                diagoffc = diagoffc % NR; \
                c_cast   = c_cast + (j  )*cs_c; \
                b_cast   = b_cast + (jp )*ps_b; \
        } \
\
        /* If there is a zero region below where the diagonal of C intersects
           the right edge of the panel, shrink it to prevent "no-op" iterations
           from executing. */ \
        if ( -diagoffc + n < m ) \
        { \
                m = -diagoffc + n; \
        } \
\
        /* Clear the temporary C buffer in case it has any infs or NaNs. */ \
        PASTEMAC(ch,set0s_mxn)( MR, NR, \
                                ct, rs_ct, cs_ct ); \
\
        /* Compute number of primary and leftover components of the m and n
           dimensions. */ \
        n_iter = n / NR; \
        n_left = n % NR; \
\
        m_iter = m / MR; \
        m_left = m % MR; \
\
        if ( n_left ) ++n_iter; \
        if ( m_left ) ++m_iter; \
\
        /* Determine some increments used to step through A, B, and C. */ \
        rstep_a = ps_a; \
\
        cstep_b = ps_b; \
\
        /*rstep_c = rs_c * MR; */\
        cstep_c = cs_c * NR; \
\
        /* When the MC x NR panel of C is staged in L2, these strides are used
           to walk that copy: */ \
        rstep_c11 = MR;  /* stride to the next MR x NR block within the MC x NR panel */ \
        rs_c11    = 1; \
        cs_c11    = (m%2 == 0) ? m : m+1; /*(m_iter-ir_thread_id)*MR;*/ /* stride to the next column within the panel */ \
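\
        /* Note: the leading dimension of the L2 copy of C is rounded up to an
           even element count, presumably to keep successive columns favorably
           aligned for the C66x memory system. */ \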
\
        istep_a = PACKMR * k; \
        istep_b = PACKNR * k; \
\
        /* Save the pack schemas of A and B to the auxinfo_t object. */ \
        bli_auxinfo_set_schema_a( schema_a, aux ); \
        bli_auxinfo_set_schema_b( schema_b, aux ); \
\
        /* Save the imaginary stride of A and B to the auxinfo_t object. */ \
        bli_auxinfo_set_is_a( istep_a, aux ); \
        bli_auxinfo_set_is_b( istep_b, aux ); \
\
        b1 = b_cast; \
        c1 = c_cast; \
\
        /* Acquire a buffer for B in L1. */ \
        bli_mem_acquire_m( k*NR*sizeof(ctype), BLIS_BUFFER_FOR_B_PANEL_L1, &b1_L1_mem); \
        b1_L1 = bli_mem_buffer( &b1_L1_mem ); \
        b1_L1 = (ctype *) ((char *) b1_L1_mem.buf + PASTEMAC(ch,bank)); \
\
        /* Acquire two buffers for A in L1 (ping-pong). */ \
        bli_mem_acquire_m( k*MR*sizeof(ctype), BLIS_BUFFER_FOR_A_BLOCK_L1, &a1_L1_mem); \
        a1_L1 = bli_mem_buffer( &a1_L1_mem ); \
\
        bli_mem_acquire_m( k*MR*sizeof(ctype), BLIS_BUFFER_FOR_A_BLOCK_L1, &a2_L1_mem); \
        a2_L1 = bli_mem_buffer( &a2_L1_mem ); \
\
        /* Acquire buffers for C (MC x NR) in L2. */ \
        bli_mem_acquire_m( cs_c11*NR*sizeof(ctype), BLIS_BUFFER_FOR_C_PANEL_L2, &c0_L2_mem); \
        cNew0 = bli_mem_buffer( &c0_L2_mem ); \
\
        bli_mem_acquire_m( cs_c11*NR*sizeof(ctype), BLIS_BUFFER_FOR_C_PANEL_L2, &c1_L2_mem); \
        cNew1 = bli_mem_buffer( &c1_L2_mem ); \
\
        bli_mem_acquire_m( cs_c11*NR*sizeof(ctype), BLIS_BUFFER_FOR_C_PANEL_L2, &c2_L2_mem); \
        cNew2 = bli_mem_buffer( &c2_L2_mem ); \
\
        /* Acquire EDMA handles from the pool. */ \
        bli_dma_channel_acquire(&(emt_handle_b), lib_get_coreID()); \
        if(emt_handle_b == NULL) \
        { \
                printf("ker_var2 Failed to alloc edma handle CoreID %d \n", lib_get_coreID()); \
        } \
        bli_dma_channel_acquire(&(emt_handle_c0), lib_get_coreID()); \
        if(emt_handle_c0 == NULL) \
        { \
                printf("ker_var2 Failed to alloc edma handle for C0 CoreID %d \n", lib_get_coreID()); \
        } \
        bli_dma_channel_acquire(&(emt_handle_c1), lib_get_coreID()); \
        if(emt_handle_c1 == NULL) \
        { \
                printf("ker_var2 Failed to alloc edma handle for C1 CoreID %d \n", lib_get_coreID()); \
        } \
\
        /* Initiate the first C transfer. The full m x n_cur region must be
           transferred: for smaller matrix sizes (m_iter-ir_thread_id)*MR may
           not equal m, which would lead to incorrect values of C being
           written back. */ \
        n_cur = ( bli_is_not_edge_f( jr_thread_id, n_iter, n_left ) ? NR : n_left ); \
\
        if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
        { \
                counter_start_nr = lib_clock_read(); \
        } \
\
        if (cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
        { \
                lib_emt_copy2D2D(emt_handle_c0, c_cast+jr_thread_id*cstep_c, \
                                 cNew1, m*sizeof(ctype), \
                                 n_cur, cs_c*sizeof(ctype), cs_c11*sizeof(ctype)); \
        } \
        else \
        { \
                dim_t ii; \
                ctype *ptr_source; \
                ctype *ptr_dest; \
                ptr_source = c_cast+jr_thread_id*cstep_c; \
                ptr_dest   = cNew1; \
                for(ii = 0; ii < n_cur; ii++) \
                { \
                        memcpy(ptr_dest, ptr_source, m*sizeof(ctype)); \
                        ptr_source += cs_c; \
                        ptr_dest   += cs_c11; \
                } \
        } \
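\
        /* At this point the first C panel assigned to this thread is being
           staged into L2 (via EDMA when the column stride fits the DMA
           engine, otherwise via memcpy). */ \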
\
        /* Loop over the n dimension (NR columns at a time). */ \
        for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \
        { \
                ctype* restrict a1; \
                ctype* restrict c11; \
                ctype* restrict b2; \
\
                b1 = b_cast + j * cstep_b; \
                c1 = c_cast + j * cstep_c; \
\
                n_cur  = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
                n_next = ( bli_is_not_edge_f( j+jr_num_threads, n_iter, n_left ) ? NR : n_left ); \
\
                m_cur = ( bli_is_not_edge_f( ir_thread_id, m_iter, m_left ) ? MR : m_left ); \
\
                /* Initialize our next panel of B to be the current panel of B. */ \
                b2 = b1; \
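\
                /* Stage the current panel of B into its L1 buffer and this
                   thread's first micro-panel of A into L1 (the lib_emt_* and
                   lib_imt_* helpers issue EDMA and IDMA transfers,
                   respectively). */ \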
                lib_emt_copy1D1D(emt_handle_b, b1, b1_L1, k*NR*sizeof(ctype)); \
                lib_imt_copy(a_cast + ir_thread_id * rstep_a, a2_L1, k*MR*sizeof(ctype)); \
\
                /* Wait for the previous C transfer to complete and initiate the next one. */ \
                lib_emt_wait(emt_handle_c0); \
                if(j < (n_iter-jr_num_threads)) /* no transfer for last iteration */ \
                { \
                        if (cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
                        { \
                                lib_emt_copy2D2D(emt_handle_c0, c1+jr_num_threads*cstep_c, \
                                                 cNew0, m*sizeof(ctype), /*(m_iter-ir_thread_id)*sizeof(ctype)*MR,*/ \
                                                 n_next, cs_c*sizeof(ctype), \
                                                 cs_c11*sizeof(ctype)); \
                        } \
                        else \
                        { \
                                dim_t ii; \
                                ctype *ptr_source; \
                                ctype *ptr_dest; \
                                ptr_source = c1+jr_num_threads*cstep_c; \
                                ptr_dest   = cNew0; \
                                for(ii = 0; ii < n_next; ii++) \
                                { \
                                        memcpy(ptr_dest, ptr_source, m*sizeof(ctype)); \
                                        ptr_source += cs_c; \
                                        ptr_dest   += cs_c11; \
                                } \
                        } \
                } \
\
                /* Interior loop over the m dimension (MR rows at a time). */ \
                if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
                { \
                        counter_start_mr = lib_clock_read(); \
                } \
\
                for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \
                { \
                        ctype* restrict a2; \
\
                        a1  = a_cast + i * rstep_a; \
                        c11 = cNew1  + i * rstep_c11; \
                        /*c11 = c1  + i * rstep_c;*/ \
\
                        /* Compute the diagonal offset for the submatrix at (i,j). */ \
                        diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
\
                        m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
                        /* Compute the addresses of the next panels of A and B. */ \
                        a2 = herk_get_next_a_micropanel( caucus, a1, rstep_a ); \
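\
                        /* Swap the L1 ping-pong buffers so that the micro-panel
                           just brought in by IDMA becomes the current panel
                           (a1_L1). */ \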
                        temp  = a1_L1; \
                        a1_L1 = a2_L1; \
                        a2_L1 = temp; \
                        /*a1 = a2; Make the next panel the current panel for the next iteration*/ \
                        lib_imt_wait(); \
                        if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \
                        { \
                                a2 = a_cast; \
                                b2 = herk_get_next_b_micropanel( thread, b1, cstep_b ); \
                                if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
                                        b2 = b_cast; \
                        } \
                        else \
                        { \
                                /* Start copying the next micro-panel of A into L1. */ \
                                lib_imt_copy(a2, a2_L1, k*MR*sizeof(ctype)); \
                        } \
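\
                        /* On the first micro-panel handled by this thread in
                           this jr iteration, make sure the EDMA transfer of B
                           into L1 has completed before its first use. */ \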
                        if(i == ir_thread_id) \
                        { \
                                lib_emt_wait(emt_handle_b); \
                        } \
\
                        /* Save addresses of next panels of A and B to the auxinfo_t
                           object. */ \
                        bli_auxinfo_set_next_a( a2, aux ); \
                        bli_auxinfo_set_next_b( b2, aux ); \
\
                        /* If the diagonal intersects the current MR x NR submatrix, we
                           compute it in the temporary buffer and then add in only the
                           elements on or above the diagonal.
                           Otherwise, if the submatrix is strictly above the diagonal,
                           we compute and store as we normally would.
                           And if we're strictly below the diagonal, we do nothing and
                           continue. */ \
                        if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
                        { \
                                if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
                                { \
                                        counter_start_ker = lib_clock_read(); \
                                } \
\
                                /* Invoke the gemm micro-kernel. */ \
                                gemm_ukr_cast( k, \
                                               alpha_cast, \
                                               a1_L1, \
                                               b1_L1, \
                                               zero, \
                                               ct, rs_ct, cs_ct, \
                                               &aux ); \
\
                                /* Scale C and add the result to only the stored part. */ \
                                PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \
                                                          m_cur, n_cur, \
                                                          ct,  rs_ct, cs_ct, \
                                                          beta_cast, \
                                                          c11, rs_c11, cs_c11 ); \
\
                                if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
                                { \
                                        counter_end_ker = lib_clock_read(); \
                                        bli_profile_data_update(bli_herk_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_KER_LOOP_IND], \
                                                                (counter_end_ker-counter_start_ker), 2*k*m_cur*n_cur); \
                                } \
                        } \
                        else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
                        { \
                                if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
                                { \
                                        counter_start_ker = lib_clock_read(); \
                                } \
\
                                /* Handle interior and edge cases separately. */ \
                                if ( m_cur == MR && n_cur == NR ) \
                                { \
                                        /* Invoke the gemm micro-kernel. */ \
                                        gemm_ukr_cast( k, \
                                                       alpha_cast, \
                                                       a1_L1, \
                                                       b1_L1, \
                                                       beta_cast, \
                                                       c11, rs_c11, cs_c11, \
                                                       &aux ); \
                                } \
                                else \
                                { \
                                        /* Invoke the gemm micro-kernel. */ \
                                        gemm_ukr_cast( k, \
                                                       alpha_cast, \
                                                       a1_L1, \
                                                       b1_L1, \
                                                       zero, \
                                                       ct, rs_ct, cs_ct, \
                                                       &aux ); \
\
                                        /* Scale the edge of C and add the result. */ \
                                        PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
                                                                ct,  rs_ct, cs_ct, \
                                                                beta_cast, \
                                                                c11, rs_c11, cs_c11 ); \
                                } \
                                if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
                                { \
                                        counter_end_ker = lib_clock_read(); \
                                        bli_profile_data_update(bli_herk_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_KER_LOOP_IND], \
                                                                (counter_end_ker-counter_start_ker), 2*k*m_cur*n_cur); \
                                } \
                        } \
                } \
\
                if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
                { \
                        counter_end_mr = lib_clock_read(); \
                        bli_profile_data_update(bli_herk_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_IR_LOOP_IND], \
                                                (counter_end_mr-counter_start_mr), 2*m*k*n_cur); \
                } \
\
                /* Circularly shift the L2 buffers for C. */ \
                cNewTemp = cNew0; \
                cNew0 = cNew2; \
                cNew2 = cNew1; \
                cNew1 = cNewTemp; \
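\
                /* After the rotation: cNew2 holds the panel just computed
                   (about to be written back), cNew1 holds the panel already
                   prefetched for this thread's next jr iteration, and cNew0
                   is free to receive the following prefetch. */ \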
\
                if(j != jr_thread_id) /* wait for the previous save of C to complete; skip on the first iteration */ \
                { \
                        lib_emt_wait(emt_handle_c1); \
                } \
\
                /* Save the updated panel of C back to memory. */ \
                if (cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
                { \
                        lib_emt_copy2D2D(emt_handle_c1, cNew2, c1, m*sizeof(ctype), \
                                         n_cur, cs_c11*sizeof(ctype), cs_c*sizeof(ctype)); \
                } \
                else \
                { \
                        dim_t ii; \
                        ctype *ptr_source; \
                        ctype *ptr_dest; \
                        ptr_source = cNew2; \
                        ptr_dest   = c1; \
                        for(ii = 0; ii < n_cur; ii++) \
                        { \
                                memcpy(ptr_dest, ptr_source, m*sizeof(ctype)); \
                                ptr_source += cs_c11; \
                                ptr_dest   += cs_c; \
                        } \
                } \
        } \
\
        if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
        { \
                counter_end_nr = lib_clock_read(); \
                bli_profile_data_update(bli_herk_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_JR_LOOP_IND], \
                                        (counter_end_nr-counter_start_nr), 2*m*k*n); \
        } \
\
        /* Release the L1/L2 buffers and the EDMA handles. */ \
        bli_mem_release( &c2_L2_mem ); \
        bli_mem_release( &c1_L2_mem ); \
        bli_mem_release( &c0_L2_mem ); \
        bli_mem_release( &a2_L1_mem ); \
        bli_mem_release( &a1_L1_mem ); \
        bli_mem_release( &b1_L1_mem ); \
        if ( emt_handle_b != NULL ) \
        { \
                bli_dma_channel_release(emt_handle_b, lib_get_coreID()); \
                emt_handle_b = NULL; \
        } \
        if ( emt_handle_c0 != NULL ) \
        { \
                bli_dma_channel_release(emt_handle_c0, lib_get_coreID()); \
                emt_handle_c0 = NULL; \
        } \
        if ( emt_handle_c1 != NULL ) \
        { \
                lib_emt_wait(emt_handle_c1); /* wait for the final save of C to complete */ \
                bli_dma_channel_release(emt_handle_c1, lib_get_coreID()); \
                emt_handle_c1 = NULL; \
        } \
}

INSERT_GENTFUNC_BASIC( herk_u_ker_var2, gemm_ukr_t )

#else

#endif

#else
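
/* Fallback: the standard variant below performs no C66x DMA staging and is
   compiled when BLIS_ENABLE_C66X_MEM_POOLS is not defined. */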

#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname, ukrtype ) \
\
void PASTEMAC(ch,varname)( \
                           doff_t  diagoffc, \
                           pack_t  schema_a, \
                           pack_t  schema_b, \
                           dim_t   m, \
                           dim_t   n, \
                           dim_t   k, \
                           void*   alpha, \
                           void*   a, inc_t cs_a, inc_t pd_a, inc_t ps_a, \
                           void*   b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \
                           void*   beta, \
                           void*   c, inc_t rs_c, inc_t cs_c, \
                           void*   gemm_ukr, \
                           herk_thrinfo_t* thread \
                         ) \
{ \
        /* Cast the micro-kernel address to its function pointer type. */ \
        PASTECH(ch,ukrtype) gemm_ukr_cast = (PASTECH(ch,ukrtype)) gemm_ukr; \
\
        /* Temporary C buffer for edge cases. */ \
        ctype           ct[ PASTEMAC(ch,maxmr) * \
                            PASTEMAC(ch,maxnr) ] \
                            __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
        const inc_t     rs_ct      = 1; \
        const inc_t     cs_ct      = PASTEMAC(ch,maxmr); \
\
        /* Alias some constants to simpler names. */ \
        const dim_t     MR         = pd_a; \
        const dim_t     NR         = pd_b; \
        const dim_t     PACKMR     = cs_a; \
        const dim_t     PACKNR     = rs_b; \
\
        ctype* restrict zero       = PASTEMAC(ch,0); \
        ctype* restrict a_cast     = a; \
        ctype* restrict b_cast     = b; \
        ctype* restrict c_cast     = c; \
        ctype* restrict alpha_cast = alpha; \
        ctype* restrict beta_cast  = beta; \
        ctype* restrict b1; \
        ctype* restrict c1; \
\
        doff_t          diagoffc_ij; \
        dim_t           m_iter, m_left; \
        dim_t           n_iter, n_left; \
        dim_t           m_cur; \
        dim_t           n_cur; \
        dim_t           i, j, jp; \
        inc_t           rstep_a; \
        inc_t           cstep_b; \
        inc_t           rstep_c, cstep_c; \
        inc_t           istep_a; \
        inc_t           istep_b; \
        auxinfo_t       aux; \
\
        herk_thrinfo_t* caucus = herk_thread_sub_herk( thread ); \
        dim_t jr_num_threads = thread_n_way( thread ); \
        dim_t jr_thread_id   = thread_work_id( thread ); \
        dim_t ir_num_threads = thread_n_way( caucus ); \
        dim_t ir_thread_id   = thread_work_id( caucus ); \
\
        /*
           Assumptions/assertions:
             rs_a == 1
             cs_a == PACKMR
             pd_a == MR
             ps_a == stride to next micro-panel of A
             rs_b == PACKNR
             cs_b == 1
             pd_b == NR
             ps_b == stride to next micro-panel of B
             rs_c == (no assumptions)
             cs_c == (no assumptions)
        */ \
\
        /* If any dimension is zero, return immediately. */ \
        if ( bli_zero_dim3( m, n, k ) ) return; \
\
        /* Safeguard: If the current panel of C is entirely below the diagonal,
           it is not stored. So we do nothing. */ \
        if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \
\
        /* If there is a zero region to the left of where the diagonal of C
           intersects the top edge of the panel, adjust the pointer to C and B
           and treat this case as if the diagonal offset were zero. */ \
        if ( diagoffc > 0 ) \
        { \
                jp       = diagoffc / NR; \
                j        = jp * NR; \
                n        = n - j; \
                diagoffc = diagoffc % NR; \
                c_cast   = c_cast + (j  )*cs_c; \
                b_cast   = b_cast + (jp )*ps_b; \
        } \
\
        /* If there is a zero region below where the diagonal of C intersects
           the right edge of the panel, shrink it to prevent "no-op" iterations
           from executing. */ \
        if ( -diagoffc + n < m ) \
        { \
                m = -diagoffc + n; \
        } \
\
        /* Clear the temporary C buffer in case it has any infs or NaNs. */ \
        PASTEMAC(ch,set0s_mxn)( MR, NR, \
                                ct, rs_ct, cs_ct ); \
\
        /* Compute number of primary and leftover components of the m and n
           dimensions. */ \
        n_iter = n / NR; \
        n_left = n % NR; \
\
        m_iter = m / MR; \
        m_left = m % MR; \
\
        if ( n_left ) ++n_iter; \
        if ( m_left ) ++m_iter; \
\
        /* Determine some increments used to step through A, B, and C. */ \
        rstep_a = ps_a; \
\
        cstep_b = ps_b; \
\
        rstep_c = rs_c * MR; \
        cstep_c = cs_c * NR; \
\
        istep_a = PACKMR * k; \
        istep_b = PACKNR * k; \
\
        /* Save the pack schemas of A and B to the auxinfo_t object. */ \
        bli_auxinfo_set_schema_a( schema_a, aux ); \
        bli_auxinfo_set_schema_b( schema_b, aux ); \
\
        /* Save the imaginary stride of A and B to the auxinfo_t object. */ \
        bli_auxinfo_set_is_a( istep_a, aux ); \
        bli_auxinfo_set_is_b( istep_b, aux ); \
\
        b1 = b_cast; \
        c1 = c_cast; \
\
        /* Loop over the n dimension (NR columns at a time). */ \
        for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \
        { \
                ctype* restrict a1; \
                ctype* restrict c11; \
                ctype* restrict b2; \
\
                b1 = b_cast + j * cstep_b; \
                c1 = c_cast + j * cstep_c; \
\
                n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
                /* Initialize our next panel of B to be the current panel of B. */ \
                b2 = b1; \
\
                /* Interior loop over the m dimension (MR rows at a time). */ \
                for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \
                { \
                        ctype* restrict a2; \
\
                        a1  = a_cast + i * rstep_a; \
                        c11 = c1     + i * rstep_c; \
\
                        /* Compute the diagonal offset for the submatrix at (i,j). */ \
                        diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
\
                        m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
                        /* Compute the addresses of the next panels of A and B. */ \
                        a2 = herk_get_next_a_micropanel( caucus, a1, rstep_a ); \
                        if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \
                        { \
                                a2 = a_cast; \
                                b2 = herk_get_next_b_micropanel( thread, b1, cstep_b ); \
                                if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
                                        b2 = b_cast; \
                        } \
\
                        /* Save addresses of next panels of A and B to the auxinfo_t
                           object. */ \
                        bli_auxinfo_set_next_a( a2, aux ); \
                        bli_auxinfo_set_next_b( b2, aux ); \
\
                        /* If the diagonal intersects the current MR x NR submatrix, we
                           compute it in the temporary buffer and then add in only the
                           elements on or above the diagonal.
                           Otherwise, if the submatrix is strictly above the diagonal,
                           we compute and store as we normally would.
                           And if we're strictly below the diagonal, we do nothing and
                           continue. */ \
                        if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
                        { \
                                /* Invoke the gemm micro-kernel. */ \
                                gemm_ukr_cast( k, \
                                               alpha_cast, \
                                               a1, \
                                               b1, \
                                               zero, \
                                               ct, rs_ct, cs_ct, \
                                               &aux ); \
\
                                /* Scale C and add the result to only the stored part. */ \
                                PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \
                                                          m_cur, n_cur, \
                                                          ct,  rs_ct, cs_ct, \
                                                          beta_cast, \
                                                          c11, rs_c,  cs_c ); \
                        } \
                        else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
                        { \
                                /* Handle interior and edge cases separately. */ \
                                if ( m_cur == MR && n_cur == NR ) \
                                { \
                                        /* Invoke the gemm micro-kernel. */ \
                                        gemm_ukr_cast( k, \
                                                       alpha_cast, \
                                                       a1, \
                                                       b1, \
                                                       beta_cast, \
                                                       c11, rs_c, cs_c, \
                                                       &aux ); \
                                } \
                                else \
                                { \
                                        /* Invoke the gemm micro-kernel. */ \
                                        gemm_ukr_cast( k, \
                                                       alpha_cast, \
                                                       a1, \
                                                       b1, \
                                                       zero, \
                                                       ct, rs_ct, cs_ct, \
                                                       &aux ); \
\
                                        /* Scale the edge of C and add the result. */ \
                                        PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
                                                                ct,  rs_ct, cs_ct, \
                                                                beta_cast, \
                                                                c11, rs_c,  cs_c ); \
                                } \
                        } \
                } \
        } \
}

INSERT_GENTFUNC_BASIC( herk_u_ker_var2, gemm_ukr_t )

#endif