src/ti/linalg/blis/frame/3/trmm/bli_trmm_rl_ker_var2.c

   1 /*
   2
   3    BLIS
   4    An object-based framework for developing high-performance BLAS-like
   5    libraries.
   6
   7    Copyright (C) 2014, The University of Texas at Austin
   8
   9    Redistribution and use in source and binary forms, with or without
  10    modification, are permitted provided that the following conditions are
  11    met:
  12     - Redistributions of source code must retain the above copyright
  13       notice, this list of conditions and the following disclaimer.
  14     - Redistributions in binary form must reproduce the above copyright
  15       notice, this list of conditions and the following disclaimer in the
  16       documentation and/or other materials provided with the distribution.
  17     - Neither the name of The University of Texas at Austin nor the names
  18       of its contributors may be used to endorse or promote products
  19       derived from this software without specific prior written permission.
  20
  21    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  22    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  23    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  24    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  25    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  26    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  27    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  28    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  29    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  30    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  31    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  32
  33 */
  34
  35 #include "blis.h"
  36
  37 #define FUNCPTR_T gemm_fp
  38
  39 #ifdef BLIS_ENABLE_PROFILE
  40 #define BLIS_ENABLE_PROFILE_KERVAR2 1
  41 #else
  42 #define BLIS_ENABLE_PROFILE_KERVAR2 0
  43 #endif
  44
  45 typedef void (*FUNCPTR_T)(
  46                            doff_t  diagoffb,
  47                            pack_t  schema_a,
  48                            pack_t  schema_b,
  49                            dim_t   m,
  50                            dim_t   n,
  51                            dim_t   k,
  52                            void*   alpha,
  53                            void*   a, inc_t cs_a, inc_t pd_a, inc_t ps_a,
  54                            void*   b, inc_t rs_b, inc_t pd_b, inc_t ps_b,
  55                            void*   beta,
  56                            void*   c, inc_t rs_c, inc_t cs_c,
  57                            void*   gemm_ukr,
  58                            trmm_thrinfo_t* thread
  59                          );
  60
  61 static FUNCPTR_T GENARRAY(ftypes,trmm_rl_ker_var2);
  62
  63
  64 void bli_trmm_rl_ker_var2( obj_t*  a,
  65                            obj_t*  b,
  66                            obj_t*  c,
  67                            gemm_t* cntl,
  68                            trmm_thrinfo_t* thread )
  69 {
  70         num_t     dt_exec   = bli_obj_execution_datatype( *c );
  71
  72         doff_t    diagoffb  = bli_obj_diag_offset( *b );
  73
  74         pack_t    schema_a  = bli_obj_pack_schema( *a );
  75         pack_t    schema_b  = bli_obj_pack_schema( *b );
  76
  77         dim_t     m         = bli_obj_length( *c );
  78         dim_t     n         = bli_obj_width( *c );
  79         dim_t     k         = bli_obj_width( *a );
  80
  81         void*     buf_a     = bli_obj_buffer_at_off( *a );
  82         inc_t     cs_a      = bli_obj_col_stride( *a );
  83         inc_t     pd_a      = bli_obj_panel_dim( *a );
  84         inc_t     ps_a      = bli_obj_panel_stride( *a );
  85
  86         void*     buf_b     = bli_obj_buffer_at_off( *b );
  87         inc_t     rs_b      = bli_obj_row_stride( *b );
  88         inc_t     pd_b      = bli_obj_panel_dim( *b );
  89         inc_t     ps_b      = bli_obj_panel_stride( *b );
  90
  91         void*     buf_c     = bli_obj_buffer_at_off( *c );
  92         inc_t     rs_c      = bli_obj_row_stride( *c );
  93         inc_t     cs_c      = bli_obj_col_stride( *c );
  94
  95         obj_t     scalar_a;
  96         obj_t     scalar_b;
  97
  98         void*     buf_alpha;
  99         void*     buf_beta;
 100
 101         FUNCPTR_T f;
 102
 103         func_t*   gemm_ukrs;
 104         void*     gemm_ukr;
 105
 106         // Detach and multiply the scalars attached to A and B.
 107         bli_obj_scalar_detach( a, &scalar_a );
 108         bli_obj_scalar_detach( b, &scalar_b );
 109         bli_mulsc( &scalar_a, &scalar_b );
 110
 111         // Grab the addresses of the internal scalar buffers for the scalar
 112         // merged above and the scalar attached to C.
 113         buf_alpha = bli_obj_internal_scalar_buffer( scalar_b );
 114         buf_beta  = bli_obj_internal_scalar_buffer( *c );
 115
 116         // Index into the type combination array to extract the correct
 117         // function pointer.
 118         f = ftypes[dt_exec];
 119
 120         // Extract from the control tree node the func_t object containing
 121         // the gemm micro-kernel function addresses, and then query the
 122         // function address corresponding to the current datatype.
 123         gemm_ukrs = cntl_gemm_ukrs( cntl );
 124         gemm_ukr  = bli_func_obj_query( dt_exec, gemm_ukrs );
 125
 126         // Invoke the function.
 127         f( diagoffb,
 128            schema_a,
 129            schema_b,
 130            m,
 131            n,
 132            k,
 133            buf_alpha,
 134            buf_a, cs_a, pd_a, ps_a,
 135            buf_b, rs_b, pd_b, ps_b,
 136            buf_beta,
 137            buf_c, rs_c, cs_c,
 138            gemm_ukr,
 139            thread );
 140 }
 141
 142 #ifdef BLIS_ENABLE_C66X_MEM_POOLS
 143
 144 #if defined (BLIS_ENABLE_C66X_EDMA) && defined (BLIS_ENABLE_C66X_IDMA)
 145 #undef  GENTFUNC
 146 #define GENTFUNC( ctype, ch, varname, ukrtype ) \
 147 \
 148 void PASTEMAC(ch,varname)( \
 149                            doff_t  diagoffb, \
 150                            pack_t  schema_a, \
 151                            pack_t  schema_b, \
 152                            dim_t   m, \
 153                            dim_t   n, \
 154                            dim_t   k, \
 155                            void*   alpha, \
 156                            void*   a, inc_t cs_a, inc_t pd_a, inc_t ps_a, \
 157                            void*   b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \
 158                            void*   beta, \
 159                            void*   c, inc_t rs_c, inc_t cs_c, \
 160                            void*   gemm_ukr, \
 161                            trmm_thrinfo_t* jr_thread \
 162                          ) \
 163 { \
 164         /* Cast the micro-kernel address to its function pointer type. */ \
 165         PASTECH(ch,ukrtype) gemm_ukr_cast = (PASTECH(ch,ukrtype)) gemm_ukr; \
 166 \
 167         /* Temporary C buffer for edge cases. */ \
 168         ctype           ct[ PASTEMAC(ch,maxmr) * \
 169                             PASTEMAC(ch,maxnr) ] \
 170                             __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
 171         const inc_t     rs_ct      = 1; \
 172         const inc_t     cs_ct      = PASTEMAC(ch,maxmr); \
 173 \
 174         /* Alias some constants to simpler names. */ \
 175         const dim_t     MR         = pd_a; \
 176         const dim_t     NR         = pd_b; \
 177         const dim_t     PACKMR     = cs_a; \
 178         const dim_t     PACKNR     = rs_b; \
 179 \
 180         ctype* restrict one        = PASTEMAC(ch,1); \
 181         ctype* restrict zero       = PASTEMAC(ch,0); \
 182         ctype* restrict a_cast     = a; \
 183         ctype* restrict b_cast     = b; \
 184         ctype* restrict c_cast     = c; \
 185         ctype* restrict alpha_cast = alpha; \
 186         ctype* restrict beta_cast  = beta; \
 187         ctype* restrict b1; \
 188         ctype* restrict c1; \
 189 \
 190         doff_t          diagoffb_j; \
 191         dim_t           k_full; \
 192         dim_t           m_iter, m_left; \
 193         dim_t           n_iter, n_left; \
 194         dim_t           m_cur; \
 195         dim_t           n_cur; \
 196         dim_t           k_b1121; \
 197         dim_t           off_b1121; \
 198         dim_t           i, j; \
 199         inc_t           rstep_a; \
 200         inc_t           cstep_b; \
 201         /*inc_t           rstep_c;*/ \
 202         inc_t           cstep_c; \
 203         inc_t           istep_a; \
 204         inc_t           istep_b; \
 205         inc_t           off_scl; \
 206         inc_t           ss_b_num; \
 207         inc_t           ss_b_den; \
 208         inc_t           ps_b_cur; \
 209         auxinfo_t       aux; \
 210 \
 211     trmm_thrinfo_t* ir_thread = trmm_thread_sub_trmm( jr_thread ); \
 212     /*dim_t jr_num_threads      = thread_n_way( jr_thread ); \
 213     dim_t jr_thread_id        = thread_work_id( jr_thread );*/ \
 214 \
 215         dim_t n_next; \
 216         inc_t rstep_c11, rs_c11, cs_c11; \
 217 \
 218         mem_t b1_L1_mem; \
 219         /*memcpy does not like b1_L1 if it is restrict. The resid of gemm is non zero if this is changed to ctype* restrict*/ \
 220         ctype* b1_L1; \
 221 \
 222         mem_t a1_L1_mem, a2_L1_mem; \
 223         ctype *a1_L1, *a2_L1, *temp; \
 224 \
 225     mem_t c0_L2_mem, c1_L2_mem, c2_L2_mem; \
 226         ctype *cNew0, *cNew1, *cNew2, *cNewTemp; \
 227 \
 228     /*EDMA Declarations */ \
 229         lib_emt_Handle emt_handle_b = NULL; \
 230         lib_emt_Handle emt_handle_c0 = NULL; \
 231         lib_emt_Handle emt_handle_c1 = NULL; \
 232 \
 233         /*For DSP timing*/ \
 234         uint64_t counter_start_ker, counter_start_nr, counter_start_mr; \
 235         uint64_t counter_end_ker, counter_end_nr, counter_end_mr; \
 236         extern profile_data_t *bli_trmm_profile_data; \
 237 \
 238         /*
 239            Assumptions/assertions:
 240              rs_a == 1
 241              cs_a == PACKMR
 242              pd_a == MR
 243              ps_a == stride to next micro-panel of A
 244              rs_b == PACKNR
 245              cs_b == 1
 246              pd_b == NR
 247              ps_b == stride to next micro-panel of B
 248              rs_c == (no assumptions)
 249              cs_c == (no assumptions)
 250         */ \
 251 \
 252         /* If any dimension is zero, return immediately. */ \
 253         if ( bli_zero_dim3( m, n, k ) ) return; \
 254 \
 255         /* Safeguard: If the current panel of B is entirely above the diagonal,
 256            it is implicitly zero. So we do nothing. */ \
 257         if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; \
 258 \
 259         /* Compute k_full. For all trmm, k_full is simply k. This is
 260            needed because some parameter combinations of trmm reduce k
 261            to advance past zero regions in the triangular matrix, and
 262            when computing the imaginary stride of A (the non-triangular
 263            matrix), which is used by 3m and 4m implementations, we need
 264            this unreduced value of k. */ \
 265         k_full = k; \
 266 \
 267         /* Compute indexing scaling factor for for 4m or 3m. This is
 268            needed because one of the packing register blocksizes (PACKMR
 269            or PACKNR) is used to index into the micro-panels of the non-
 270            triangular matrix when computing with a diagonal-intersecting
 271            micro-panel of the triangular matrix. In the case of 4m or 3m,
 272            real values are stored in both sub-panels, and so the indexing
 273            needs to occur in units of real values. The value computed
 274            here is divided into the complex pointer offset to cause the
 275            pointer to be advanced by the correct value. */ \
 276         if ( bli_is_4m_packed( schema_b ) || \
 277              bli_is_3m_packed( schema_b ) || \
 278              bli_is_rih_packed( schema_b ) ) off_scl = 2; \
 279         else                                 off_scl = 1; \
 280 \
 281         /* Compute the storage stride. Usually this is just PACKMR (for A
 282            or PACKNR (for B). However, in the case of 3m, we need to scale
 283            the offset by 3/2. Since it's possible we may need to scale
 284            the packing dimension by a non-integer value, we break up the
 285            scaling factor into numerator and denominator. */ \
 286         if ( bli_is_3m_packed( schema_b ) ) { ss_b_num = 3*PACKNR; \
 287                                               ss_b_den = 2; } \
 288         else                                { ss_b_num = 1*PACKNR; \
 289                                               ss_b_den = 1; } \
 290 \
 291         /* If there is a zero region above where the diagonal of B intersects
 292            the left edge of the panel, adjust the pointer to A and treat this
 293            case as if the diagonal offset were zero. Note that we don't need to
 294            adjust the pointer to B since packm would have simply skipped over
 295            the region that was not stored. */ \
 296         if ( diagoffb < 0 ) \
 297         { \
 298                 j        = -diagoffb; \
 299                 k        = k - j; \
 300                 diagoffb = 0; \
 301                 a_cast   = a_cast + ( j * PACKMR ) / off_scl; \
 302         } \
 303 \
 304         /* If there is a zero region to the right of where the diagonal
 305            of B intersects the bottom of the panel, shrink it to prevent
 306            "no-op" iterations from executing. */ \
 307         if ( diagoffb + k < n ) \
 308         { \
 309                 n = diagoffb + k; \
 310         } \
 311 \
 312         /* Clear the temporary C buffer in case it has any infs or NaNs. */ \
 313         PASTEMAC(ch,set0s_mxn)( MR, NR, \
 314                                 ct, rs_ct, cs_ct ); \
 315 \
 316         /* Compute number of primary and leftover components of the m and n
 317            dimensions. */ \
 318         n_iter = n / NR; \
 319         n_left = n % NR; \
 320 \
 321         m_iter = m / MR; \
 322         m_left = m % MR; \
 323 \
 324         if ( n_left ) ++n_iter; \
 325         if ( m_left ) ++m_iter; \
 326 \
 327         /* Determine some increments used to step through A, B, and C. */ \
 328         rstep_a = ps_a; \
 329 \
 330         cstep_b = ps_b; \
 331 \
 332         /*rstep_c = rs_c * MR; */\
 333         cstep_c = cs_c * NR; \
 334 \
 335         /* When C (MC*NR) is moved to L2 the stride to get to the next panel of MRxNR*/ \
 336         rstep_c11 = MR; /*stride to get to next panel of MRxNR in a panel of MCxNR*/\
 337         rs_c11 = 1; \
 338         cs_c11 = (m%2 == 0) ? m : m+1; /*stride to get to next column in a panel of MRxNR*/\
 339 \
 340         istep_a = PACKMR * k_full; \
 341         istep_b = PACKNR * k; \
 342 \
 343         /* Save the pack schemas of A and B to the auxinfo_t object. */ \
 344         bli_auxinfo_set_schema_a( schema_a, aux ); \
 345         bli_auxinfo_set_schema_b( schema_b, aux ); \
 346 \
 347         /* Save the imaginary stride of A to the auxinfo_t object. */ \
 348         bli_auxinfo_set_is_a( istep_a, aux ); \
 349 \
 350         b1 = b_cast; \
 351         c1 = c_cast; \
 352 \
 353         /*Acquiring a buffer for B in L1*/ \
 354         bli_mem_acquire_m( k*NR*sizeof(ctype), BLIS_BUFFER_FOR_B_PANEL_L1, &b1_L1_mem); \
 355         b1_L1 = bli_mem_buffer( &b1_L1_mem ); \
 356         b1_L1 = (ctype *) ((char *)b1_L1_mem.buf + PASTEMAC(ch,bank)); \
 357 \
 358         /*Acquiring a buffer for A in L1*/ \
 359         bli_mem_acquire_m( k*MR*sizeof(ctype), BLIS_BUFFER_FOR_A_BLOCK_L1, &a1_L1_mem); \
 360         a1_L1 = bli_mem_buffer( &a1_L1_mem ); \
 361         a1_L1 = a1_L1; \
 362 \
 363         bli_mem_acquire_m( k*MR*sizeof(ctype), BLIS_BUFFER_FOR_A_BLOCK_L1, &a2_L1_mem); \
 364         a2_L1 = bli_mem_buffer( &a2_L1_mem ); \
 365 \
 366     /*Acquiring buffers for C (MC_x_NR) in L2 */\
 367         bli_mem_acquire_m( cs_c11*NR*sizeof(ctype), BLIS_BUFFER_FOR_C_PANEL_L2, &c0_L2_mem); \
 368         cNew0 = bli_mem_buffer( &c0_L2_mem ); \
 369 \
 370     bli_mem_acquire_m( cs_c11*NR*sizeof(ctype), BLIS_BUFFER_FOR_C_PANEL_L2, &c1_L2_mem); \
 371         cNew1 = bli_mem_buffer( &c1_L2_mem ); \
 372 \
 373     bli_mem_acquire_m( cs_c11*NR*sizeof(ctype), BLIS_BUFFER_FOR_C_PANEL_L2, &c2_L2_mem); \
 374     cNew2 = bli_mem_buffer( &c2_L2_mem ); \
 375 \
 376         /*Acquiring an EDMA  handle from the pool*/ \
 377         bli_dma_channel_acquire(&(emt_handle_b), lib_get_coreID()); \
 378         if(emt_handle_b == NULL) \
 379         { \
 380                 printf("ker_var2 Failed to alloc edma handle CoreID %d \n", lib_get_coreID()); \
 381         } \
 382     /*Acquiring an EDMA  handle from the pool*/ \
 383     bli_dma_channel_acquire(&(emt_handle_c0), lib_get_coreID()); \
 384     if(emt_handle_c0 == NULL) \
 385     { \
 386             printf("ker_var2 Failed to alloc edma handle for C0 CoreID %d \n", lib_get_coreID()); \
 387     } \
 388     /*Acquiring an EDMA  handle from the pool*/ \
 389     bli_dma_channel_acquire(&(emt_handle_c1), lib_get_coreID()); \
 390     if(emt_handle_c1 == NULL) \
 391     { \
 392             printf("ker_var2 Failed to alloc edma handle for C1 CoreID %d \n", lib_get_coreID()); \
 393     } \
 394 \
 395         n_cur = ( bli_is_not_edge_f( 0, n_iter, n_left ) ? NR : n_left ); \
 396         /* Loop over the n dimension (NR columns at a time). */ \
 397         if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
 398         { \
 399                 counter_start_nr = lib_clock_read();  \
 400         } \
 401         /* Transfering MC(=m)xNR*/ \
 402         if (cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
 403         { \
 404                 lib_emt_copy2D2D(emt_handle_c0, c1, \
 405                                                 cNew1, m*sizeof(ctype), \
 406                                                 n_cur, cs_c*sizeof(ctype), cs_c11*sizeof(ctype)); \
 407         } \
 408         else \
 409         { \
 410                 dim_t ii; \
 411                 ctype *ptr_source; \
 412                 ctype *ptr_dest; \
 413                 ptr_source =  c1; \
 414                 ptr_dest = cNew1; \
 415                 for(ii = 0; ii < n_cur; ii++) \
 416                 { \
 417                         memcpy(ptr_dest, ptr_source, m*sizeof(ctype)); \
 418                         ptr_source += cs_c; \
 419                         ptr_dest   += cs_c11; \
 420                 } \
 421         } \
 422 \
 423     /* Loop over the n dimension (NR columns at a time). */ \
 424         for ( j = 0; j < n_iter; ++j ) \
 425         { \
 426                 ctype* restrict a1; \
 427                 ctype* restrict c11; \
 428                 ctype* restrict b2; \
 429 \
 430                 diagoffb_j = diagoffb - ( doff_t )j*NR; \
 431 \
 432                 /* Determine the offset to the beginning of the panel that
 433                    was packed so we can index into the corresponding location
 434                    in A. Then compute the length of that panel. */ \
 435                 off_b1121 = bli_max( -diagoffb_j, 0 ); \
 436                 k_b1121   = k - off_b1121; \
 437 \
 438                 a1  = a_cast; \
 439                 c11 = cNew1; \
 440 \
 441                 n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
 442                 n_next = ( bli_is_not_edge_f( j+1, n_iter, n_left ) ? NR : n_left ); \
 443 \
 444                 /* Initialize our next panel of B to be the current panel of B. */ \
 445                 b2 = b1; \
 446 \
 447                 lib_emt_copy1D1D(emt_handle_b, b1, b1_L1, k_b1121*NR*sizeof(ctype)); \
 448 \
 449                 lib_emt_wait(emt_handle_c0); \
 450                 if(j < n_iter-1) /* no transfer for last iteration */ \
 451                 { \
 452                         if (cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
 453                         { \
 454                                 lib_emt_copy2D2D(emt_handle_c0, c1+cstep_c, \
 455                                                                         cNew0, m*sizeof(ctype), \
 456                                                                         n_next, cs_c*sizeof(ctype), \
 457                                                                         cs_c11*sizeof(ctype)); \
 458                         }\
 459                         else \
 460                         { \
 461                                 dim_t ii; \
 462                                 ctype *ptr_source; \
 463                                 ctype *ptr_dest; \
 464                                 ptr_source =  c1+cstep_c; \
 465                                 ptr_dest = cNew0; \
 466                                 for(ii = 0; ii < n_next; ii++) \
 467                                 { \
 468                                         memcpy(ptr_dest, ptr_source, m*sizeof(ctype)); \
 469                                         ptr_source += cs_c; \
 470                                         ptr_dest   += cs_c11; \
 471                                 } \
 472                         } \
 473                 } \
 474 \
 475                 /* If the current panel of B intersects the diagonal, scale C
 476                    by beta. If it is strictly below the diagonal, scale by one.
 477                    This allows the current macro-kernel to work for both trmm
 478                    and trmm3. */ \
 479                 if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \
 480                 { \
 481                         /* Compute the panel stride for the current diagonal-
 482                            intersecting micro-panel. */ \
 483                         ps_b_cur = ( k_b1121 * ss_b_num ) / ss_b_den; \
 484 \
 485                         if ( trmm_r_jr_my_iter( j, jr_thread ) ) { \
 486 \
 487                         /* Save the 4m/3m imaginary stride of B to the auxinfo_t
 488                            object. */ \
 489                         bli_auxinfo_set_is_b( PACKNR * k_b1121, aux ); \
 490 \
 491                         /* Loop over the m dimension (MR rows at a time). */ \
 492                         if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
 493                         { \
 494                                 counter_start_mr = lib_clock_read();  \
 495                         } \
 496                         for ( i = 0; i < m_iter; ++i ) \
 497                         { \
 498                                 if ( trmm_r_ir_my_iter( i, ir_thread ) ) { \
 499 \
 500                                 ctype* restrict a1_i; \
 501                                 ctype* restrict a2; \
 502 \
 503                                 m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
 504 \
 505                                 if(i == 0) \
 506                                 { \
 507                                         lib_imt_copy(a1 + ( off_b1121 * PACKMR ) / off_scl, a2_L1, k_b1121*MR*sizeof(ctype)); \
 508                                 } \
 509                                 \
 510                                 /* Compute the addresses of the next panels of A and B. */ \
 511                     a2 = a1 + rstep_a; \
 512                                 lib_imt_wait(); \
 513                         temp = a1_L1; \
 514                                 a1_L1 = a2_L1; \
 515                                 a2_L1 = temp; \
 516                                 if(i == 0) \
 517                                 { \
 518                                         lib_emt_wait(emt_handle_b);\
 519                                 } \
 520                                 /*a1_i = a1_L1 + ( off_b1121 * PACKMR ) / off_scl;*/ \
 521                                 a1_i = a1_L1; \
 522 \
 523                                 if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
 524                                 { \
 525                                         a2 = a_cast; \
 526                                         b2 = b1; \
 527                                         if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
 528                                                 b2 = b_cast; \
 529                                 } \
 530                                 else \
 531                                 { \
 532                                         /*Start next panel*/ \
 533                                         lib_imt_copy(a2 + ( off_b1121 * PACKMR ) / off_scl, a2_L1, k_b1121*MR*sizeof(ctype)); \
 534                                 } \
 535 \
 536                                 /* Save addresses of next panels of A and B to the auxinfo_t
 537                                    object. */ \
 538                                 bli_auxinfo_set_next_a( a2, aux ); \
 539                                 bli_auxinfo_set_next_b( b2, aux ); \
 540 \
 541                                 /* Handle interior and edge cases separately. */ \
 542                                 if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
 543                                 { \
 544                                         counter_start_ker = lib_clock_read();  \
 545                                 } \
 546                                 if ( m_cur == MR && n_cur == NR ) \
 547                                 { \
 548                                         /* Invoke the gemm micro-kernel. */ \
 549                                         gemm_ukr_cast( k_b1121, \
 550                                                        alpha_cast, \
 551                                                        a1_i, \
 552                                                        b1_L1, \
 553                                                        beta_cast, \
 554                                                        c11, rs_c11, cs_c11, \
 555                                                        &aux ); \
 556                                 } \
 557                                 else \
 558                                 { \
 559                                         /* Copy edge elements of C to the temporary buffer. */ \
 560                                         PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
 561                                                                 c11, rs_c11,  cs_c11, \
 562                                                                 ct,  rs_ct, cs_ct ); \
 563 \
 564                                         /* Invoke the gemm micro-kernel. */ \
 565                                         gemm_ukr_cast( k_b1121, \
 566                                                        alpha_cast, \
 567                                                        a1_i, \
 568                                                        b1_L1, \
 569                                                        beta_cast, \
 570                                                        ct, rs_ct, cs_ct, \
 571                                                        &aux ); \
 572 \
 573                                         /* Copy the result to the edge of C. */ \
 574                                         PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
 575                                                                 ct,  rs_ct, cs_ct, \
 576                                                                 c11, rs_c11,  cs_c11 ); \
 577                                 } \
 578                                 if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
 579                                 { \
 580                                         counter_end_ker = lib_clock_read();  \
 581                                         bli_profile_data_update(bli_trmm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_KER_LOOP_IND],\
 582                                                                                         (counter_end_ker-counter_start_ker),2*k_b1121*m_cur*n_cur); \
 583                                 } \
 584                                 } \
 585 \
 586                                 a1  += rstep_a; \
 587                                 c11 += rstep_c11; \
 588                         } \
 589                         if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
 590                         { \
 591                                 counter_end_mr = lib_clock_read();  \
 592                                 bli_profile_data_update(bli_trmm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_IR_LOOP_IND], \
 593                                                                                 (counter_end_mr-counter_start_mr), 2*k_b1121*m*n_cur); \
 594                         } \
 595                         } \
 596 \
 597                         b1 += ps_b_cur; \
 598                 } \
 599                 else if ( bli_is_strictly_below_diag_n( diagoffb_j, k, NR ) ) \
 600                 { \
 601                         if ( trmm_r_jr_my_iter( j, jr_thread ) ) { \
 602 \
 603                         /* Save the 4m/3m imaginary stride of B to the auxinfo_t
 604                            object. */ \
 605                         bli_auxinfo_set_is_b( istep_b, aux ); \
 606 \
 607                         /* Loop over the m dimension (MR rows at a time). */ \
 608                         if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
 609                         { \
 610                                 counter_start_mr = lib_clock_read();  \
 611                         } \
 612                         for ( i = 0; i < m_iter; ++i ) \
 613                         { \
 614                                 if ( trmm_r_ir_my_iter( i, ir_thread ) ) { \
 615 \
 616                                 ctype* restrict a2; \
 617 \
 618                                 m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
 619 \
 620                                 if(i == 0) \
 621                                 { \
 622                                         lib_imt_copy(a1, a2_L1, k_b1121*MR*sizeof(ctype)); \
 623                                 } \
 624 \
 625                                 /* Compute the addresses of the next panels of A and B. */ \
 626                     a2 = a1 + rstep_a; \
 627                                 lib_imt_wait(); \
 628                         temp = a1_L1; \
 629                                 a1_L1 = a2_L1; \
 630                                 a2_L1 = temp; \
 631                                 if(i == 0) \
 632                                 { \
 633                                                 lib_emt_wait(emt_handle_b);\
 634                                 } \
 635                                 if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
 636                                 { \
 637                                         a2 = a_cast; \
 638                                         b2 = b1; \
 639                                         if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
 640                                                 b2 = b_cast; \
 641                                 } \
 642                                 else \
 643                                 {  \
 644                                         /*Start next panel*/ \
 645                                         lib_imt_copy(a2, a2_L1, k_b1121*MR*sizeof(ctype)); \
 646                                 } \
 647 \
 648                                 /* Save addresses of next panels of A and B to the auxinfo_t
 649                                    object. */ \
 650                                 bli_auxinfo_set_next_a( a2, aux ); \
 651                                 bli_auxinfo_set_next_b( b2, aux ); \
 652 \
 653                                 /* Handle interior and edge cases separately. */ \
 654                                 if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
 655                                 { \
 656                                         counter_start_ker = lib_clock_read();  \
 657                                 } \
 658                                 if ( m_cur == MR && n_cur == NR ) \
 659                                 { \
 660                                         /* Invoke the gemm micro-kernel. */ \
 661                                         gemm_ukr_cast( k, \
 662                                                        alpha_cast, \
 663                                                        a1_L1, \
 664                                                        b1_L1, \
 665                                                        one, \
 666                                                        c11, rs_c11, cs_c11, \
 667                                                        &aux ); \
 668                                 } \
 669                                 else \
 670                                 { \
 671                                         /* Invoke the gemm micro-kernel. */ \
 672                                         gemm_ukr_cast( k, \
 673                                                        alpha_cast, \
 674                                                        a1_L1, \
 675                                                        b1_L1, \
 676                                                        zero, \
 677                                                        ct, rs_ct, cs_ct, \
 678                                                        &aux ); \
 679 \
 680                                         /* Add the result to the edge of C. */ \
 681                                         PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
 682                                                                ct,  rs_ct, cs_ct, \
 683                                                                c11, rs_c11,  cs_c11 ); \
 684                                 } \
 685                                 if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
 686                                 { \
 687                                         counter_end_ker = lib_clock_read();  \
 688                                         bli_profile_data_update(bli_trmm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_KER_LOOP_IND], \
 689                                                                                         (counter_end_ker-counter_start_ker), 2*k*m_cur*n_cur); \
 690                                         /*printf("gemm %d %d %d %ld\n", MR, NR, k, (counter_end_ker-counter_start_ker));*/ \
 691                                 } \
 692                                 } /*if ( trmm_r_ir_my_iter( i, ir_thread ) )*/\
 693 \
 694                                 a1  += rstep_a; \
 695                                 c11 += rstep_c11; \
 696                         } /*for i*/\
 697                         if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
 698                         { \
 699                                 counter_end_mr = lib_clock_read();  \
 700                                 bli_profile_data_update(bli_trmm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_IR_LOOP_IND], \
 701                                                                                 (counter_end_mr-counter_start_mr), 2*k*m*n_cur); \
 702                         } \
 703                         } /*j thread*/\
 704 \
 705                         b1 += cstep_b; \
 706                 } /*else if above diag*/\
 707 \
 708                 /* circularly shift buffers */ \
 709                 cNewTemp = cNew0; \
 710                 cNew0 = cNew2; \
 711                 cNew2 = cNew1; \
 712                 cNew1 = cNewTemp; \
 713                 if(j != 0) /* wait for save c to complete; skip first iteration */ \
 714                 { \
 715                         lib_emt_wait(emt_handle_c1); \
 716                 } \
 717                 /* save updated c*/ \
 718                 if (cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
 719                 { \
 720                         lib_emt_copy2D2D(emt_handle_c1, cNew2, c1, m*sizeof(ctype),  \
 721                                                         n_cur, cs_c11*sizeof(ctype), cs_c*sizeof(ctype)); \
 722                 } \
 723                 else \
 724                 { \
 725                         dim_t ii; \
 726                         ctype *ptr_source; \
 727                         ctype *ptr_dest; \
 728                         ptr_source = cNew2; \
 729                         ptr_dest = c1; \
 730                         for(ii = 0; ii < n_cur; ii++) \
 731                         { \
 732                                 memcpy(ptr_dest, ptr_source, m*sizeof(ctype)); \
 733                                 ptr_source += cs_c11; \
 734                                 ptr_dest   += cs_c; \
 735                         } \
 736                 } \
 737                 c1 += cstep_c; \
 738         } \
 739         if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
 740         { \
 741                 counter_end_nr = lib_clock_read();  \
 742                 bli_profile_data_update(bli_trmm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_JR_LOOP_IND], \
 743                                                                 (counter_end_nr-counter_start_nr), 2*k*m*n); \
 744         } \
 745 \
 746     bli_mem_release( &c2_L2_mem ); \
 747     bli_mem_release( &c1_L2_mem ); \
 748     bli_mem_release( &c0_L2_mem ); \
 749         bli_mem_release( &a2_L1_mem ); \
 750         bli_mem_release( &a1_L1_mem ); \
 751         bli_mem_release( &b1_L1_mem ); \
 752         if ( emt_handle_b != NULL ) \
 753         { \
 754                 bli_dma_channel_release(emt_handle_b, lib_get_coreID()); \
 755                 emt_handle_b = NULL; \
 756         } \
 757     if ( emt_handle_c0 != NULL ) \
 758     { \
 759                 bli_dma_channel_release(emt_handle_c0, lib_get_coreID()); \
 760                 emt_handle_c0 = NULL; \
 761     } \
 762         if ( emt_handle_c1 != NULL ) \
 763     { \
 764         lib_emt_wait(emt_handle_c1); /* wait for save c to complete */ \
 765                 bli_dma_channel_release(emt_handle_c1, lib_get_coreID()); \
 766                 emt_handle_c1 = NULL; \
 767     } \
 768 /*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: a1", MR, k_b1121, a1, 1, MR, "%4.1f", "" );*/ \
 769 /*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: b1", k_b1121, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
 770 }
 771
 /* Instantiate trmm_rl_ker_var2 from the macro template that closes on the
    preceding line (presumably the active GENTFUNC definition -- its start is
    not visible from here), with gemm_ukr_t as the micro-kernel type.
    NOTE(review): which datatypes are generated is decided by
    INSERT_GENTFUNC_BASIC, which is defined elsewhere -- confirm there. */
 772 INSERT_GENTFUNC_BASIC( trmm_rl_ker_var2, gemm_ukr_t )
 773
 774 #else
 775 #endif
 776
 777 #else
 778 #undef  GENTFUNC
 779 #define GENTFUNC( ctype, ch, varname, ukrtype ) \
 780 \
 781 void PASTEMAC(ch,varname)( \
 782                            doff_t  diagoffb, \
 783                            pack_t  schema_a, \
 784                            pack_t  schema_b, \
 785                            dim_t   m, \
 786                            dim_t   n, \
 787                            dim_t   k, \
 788                            void*   alpha, \
 789                            void*   a, inc_t cs_a, inc_t pd_a, inc_t ps_a, \
 790                            void*   b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \
 791                            void*   beta, \
 792                            void*   c, inc_t rs_c, inc_t cs_c, \
 793                            void*   gemm_ukr, \
 794                            trmm_thrinfo_t* jr_thread \
 795                          ) \
 796 { \
 797         /* Cast the micro-kernel address to its function pointer type. */ \
 798         PASTECH(ch,ukrtype) gemm_ukr_cast = gemm_ukr; \
 799 \
 800         /* Temporary C buffer for edge cases. */ \
 801         ctype           ct[ PASTEMAC(ch,maxmr) * \
 802                             PASTEMAC(ch,maxnr) ] \
 803                             __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
 804         const inc_t     rs_ct      = 1; \
 805         const inc_t     cs_ct      = PASTEMAC(ch,maxmr); \
 806 \
 807         /* Alias some constants to simpler names. */ \
 808         const dim_t     MR         = pd_a; \
 809         const dim_t     NR         = pd_b; \
 810         const dim_t     PACKMR     = cs_a; \
 811         const dim_t     PACKNR     = rs_b; \
 812 \
 813         ctype* restrict one        = PASTEMAC(ch,1); \
 814         ctype* restrict zero       = PASTEMAC(ch,0); \
 815         ctype* restrict a_cast     = a; \
 816         ctype* restrict b_cast     = b; \
 817         ctype* restrict c_cast     = c; \
 818         ctype* restrict alpha_cast = alpha; \
 819         ctype* restrict beta_cast  = beta; \
 820         ctype* restrict b1; \
 821         ctype* restrict c1; \
 822 \
 823         doff_t          diagoffb_j; \
 824         dim_t           k_full; \
 825         dim_t           m_iter, m_left; \
 826         dim_t           n_iter, n_left; \
 827         dim_t           m_cur; \
 828         dim_t           n_cur; \
 829         dim_t           k_b1121; \
 830         dim_t           off_b1121; \
 831         dim_t           i, j; \
 832         inc_t           rstep_a; \
 833         inc_t           cstep_b; \
 834         inc_t           rstep_c, cstep_c; \
 835         inc_t           istep_a; \
 836         inc_t           istep_b; \
 837         inc_t           off_scl; \
 838         inc_t           ss_b_num; \
 839         inc_t           ss_b_den; \
 840         inc_t           ps_b_cur; \
 841         auxinfo_t       aux; \
 842 \
 843     trmm_thrinfo_t* ir_thread = trmm_thread_sub_trmm( jr_thread ); \
 844     /*dim_t jr_num_threads      = thread_n_way( jr_thread ); \
 845     dim_t jr_thread_id        = thread_work_id( jr_thread );*/ \
 846 \
 847         /*
 848            Assumptions/assertions:
 849              rs_a == 1
 850              cs_a == PACKMR
 851              pd_a == MR
 852              ps_a == stride to next micro-panel of A
 853              rs_b == PACKNR
 854              cs_b == 1
 855              pd_b == NR
 856              ps_b == stride to next micro-panel of B
 857              rs_c == (no assumptions)
 858              cs_c == (no assumptions)
 859         */ \
 860 \
 861         /* If any dimension is zero, return immediately. */ \
 862         if ( bli_zero_dim3( m, n, k ) ) return; \
 863 \
 864         /* Safeguard: If the current panel of B is entirely above the diagonal,
 865            it is implicitly zero. So we do nothing. */ \
 866         if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; \
 867 \
 868         /* Compute k_full. For all trmm, k_full is simply k. This is
 869            needed because some parameter combinations of trmm reduce k
 870            to advance past zero regions in the triangular matrix, and
 871            when computing the imaginary stride of A (the non-triangular
 872            matrix), which is used by 3m and 4m implementations, we need
 873            this unreduced value of k. */ \
 874         k_full = k; \
 875 \
 876         /* Compute indexing scaling factor for for 4m or 3m. This is
 877            needed because one of the packing register blocksizes (PACKMR
 878            or PACKNR) is used to index into the micro-panels of the non-
 879            triangular matrix when computing with a diagonal-intersecting
 880            micro-panel of the triangular matrix. In the case of 4m or 3m,
 881            real values are stored in both sub-panels, and so the indexing
 882            needs to occur in units of real values. The value computed
 883            here is divided into the complex pointer offset to cause the
 884            pointer to be advanced by the correct value. */ \
 885         if ( bli_is_4m_packed( schema_b ) || \
 886              bli_is_3m_packed( schema_b ) || \
 887              bli_is_rih_packed( schema_b ) ) off_scl = 2; \
 888         else                                 off_scl = 1; \
 889 \
 890         /* Compute the storage stride. Usually this is just PACKMR (for A
 891            or PACKNR (for B). However, in the case of 3m, we need to scale
 892            the offset by 3/2. Since it's possible we may need to scale
 893            the packing dimension by a non-integer value, we break up the
 894            scaling factor into numerator and denominator. */ \
 895         if ( bli_is_3m_packed( schema_b ) ) { ss_b_num = 3*PACKNR; \
 896                                               ss_b_den = 2; } \
 897         else                                { ss_b_num = 1*PACKNR; \
 898                                               ss_b_den = 1; } \
 899 \
 900         /* If there is a zero region above where the diagonal of B intersects
 901            the left edge of the panel, adjust the pointer to A and treat this
 902            case as if the diagonal offset were zero. Note that we don't need to
 903            adjust the pointer to B since packm would have simply skipped over
 904            the region that was not stored. */ \
 905         if ( diagoffb < 0 ) \
 906         { \
 907                 j        = -diagoffb; \
 908                 k        = k - j; \
 909                 diagoffb = 0; \
 910                 a_cast   = a_cast + ( j * PACKMR ) / off_scl; \
 911         } \
 912 \
 913         /* If there is a zero region to the right of where the diagonal
 914            of B intersects the bottom of the panel, shrink it to prevent
 915            "no-op" iterations from executing. */ \
 916         if ( diagoffb + k < n ) \
 917         { \
 918                 n = diagoffb + k; \
 919         } \
 920 \
 921         /* Clear the temporary C buffer in case it has any infs or NaNs. */ \
 922         PASTEMAC(ch,set0s_mxn)( MR, NR, \
 923                                 ct, rs_ct, cs_ct ); \
 924 \
 925         /* Compute number of primary and leftover components of the m and n
 926            dimensions. */ \
 927         n_iter = n / NR; \
 928         n_left = n % NR; \
 929 \
 930         m_iter = m / MR; \
 931         m_left = m % MR; \
 932 \
 933         if ( n_left ) ++n_iter; \
 934         if ( m_left ) ++m_iter; \
 935 \
 936         /* Determine some increments used to step through A, B, and C. */ \
 937         rstep_a = ps_a; \
 938 \
 939         cstep_b = ps_b; \
 940 \
 941         rstep_c = rs_c * MR; \
 942         cstep_c = cs_c * NR; \
 943 \
 944         istep_a = PACKMR * k_full; \
 945         istep_b = PACKNR * k; \
 946 \
 947         /* Save the pack schemas of A and B to the auxinfo_t object. */ \
 948         bli_auxinfo_set_schema_a( schema_a, aux ); \
 949         bli_auxinfo_set_schema_b( schema_b, aux ); \
 950 \
 951         /* Save the imaginary stride of A to the auxinfo_t object. */ \
 952         bli_auxinfo_set_is_a( istep_a, aux ); \
 953 \
 954         b1 = b_cast; \
 955         c1 = c_cast; \
 956 \
 957     /* Loop over the n dimension (NR columns at a time). */ \
 958         for ( j = 0; j < n_iter; ++j ) \
 959         { \
 960                 ctype* restrict a1; \
 961                 ctype* restrict c11; \
 962                 ctype* restrict b2; \
 963 \
 964                 diagoffb_j = diagoffb - ( doff_t )j*NR; \
 965 \
 966                 /* Determine the offset to the beginning of the panel that
 967                    was packed so we can index into the corresponding location
 968                    in A. Then compute the length of that panel. */ \
 969                 off_b1121 = bli_max( -diagoffb_j, 0 ); \
 970                 k_b1121   = k - off_b1121; \
 971 \
 972                 a1  = a_cast; \
 973                 c11 = c1; \
 974 \
 975                 n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
 976 \
 977                 /* Initialize our next panel of B to be the current panel of B. */ \
 978                 b2 = b1; \
 979 \
 980                 /* If the current panel of B intersects the diagonal, scale C
 981                    by beta. If it is strictly below the diagonal, scale by one.
 982                    This allows the current macro-kernel to work for both trmm
 983                    and trmm3. */ \
 984                 if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \
 985                 { \
 986                         /* Compute the panel stride for the current diagonal-
 987                            intersecting micro-panel. */ \
 988                         ps_b_cur = ( k_b1121 * ss_b_num ) / ss_b_den; \
 989 \
 990                         if ( trmm_r_jr_my_iter( j, jr_thread ) ) { \
 991 \
 992                         /* Save the 4m/3m imaginary stride of B to the auxinfo_t
 993                            object. */ \
 994                         bli_auxinfo_set_is_b( PACKNR * k_b1121, aux ); \
 995 \
 996                         /* Loop over the m dimension (MR rows at a time). */ \
 997                         for ( i = 0; i < m_iter; ++i ) \
 998                         { \
 999                                 if ( trmm_r_ir_my_iter( i, ir_thread ) ) { \
1000 \
1001                                 ctype* restrict a1_i; \
1002                                 ctype* restrict a2; \
1003 \
1004                                 m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
1005 \
1006                                 a1_i = a1 + ( off_b1121 * PACKMR ) / off_scl; \
1007 \
1008                                 /* Compute the addresses of the next panels of A and B. */ \
1009                                 a2 = a1; \
1010                                 if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
1011                                 { \
1012                                         a2 = a_cast; \
1013                                         b2 = b1; \
1014                                         if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
1015                                                 b2 = b_cast; \
1016                                 } \
1017 \
1018                                 /* Save addresses of next panels of A and B to the auxinfo_t
1019                                    object. */ \
1020                                 bli_auxinfo_set_next_a( a2, aux ); \
1021                                 bli_auxinfo_set_next_b( b2, aux ); \
1022 \
1023                                 /* Handle interior and edge cases separately. */ \
1024                                 if ( m_cur == MR && n_cur == NR ) \
1025                                 { \
1026                                         /* Invoke the gemm micro-kernel. */ \
1027                                         gemm_ukr_cast( k_b1121, \
1028                                                        alpha_cast, \
1029                                                        a1_i, \
1030                                                        b1, \
1031                                                        beta_cast, \
1032                                                        c11, rs_c, cs_c, \
1033                                                        &aux ); \
1034                                 } \
1035                                 else \
1036                                 { \
1037                                         /* Copy edge elements of C to the temporary buffer. */ \
1038                                         PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
1039                                                                 c11, rs_c,  cs_c, \
1040                                                                 ct,  rs_ct, cs_ct ); \
1041 \
1042                                         /* Invoke the gemm micro-kernel. */ \
1043                                         gemm_ukr_cast( k_b1121, \
1044                                                        alpha_cast, \
1045                                                        a1_i, \
1046                                                        b1, \
1047                                                        beta_cast, \
1048                                                        ct, rs_ct, cs_ct, \
1049                                                        &aux ); \
1050 \
1051                                         /* Copy the result to the edge of C. */ \
1052                                         PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
1053                                                                 ct,  rs_ct, cs_ct, \
1054                                                                 c11, rs_c,  cs_c ); \
1055                                 } \
1056                                 } \
1057 \
1058                                 a1  += rstep_a; \
1059                                 c11 += rstep_c; \
1060                         } \
1061                         } \
1062 \
1063                         b1 += ps_b_cur; \
1064                 } \
1065                 else if ( bli_is_strictly_below_diag_n( diagoffb_j, k, NR ) ) \
1066                 { \
1067                         if ( trmm_r_jr_my_iter( j, jr_thread ) ) { \
1068 \
1069                         /* Save the 4m/3m imaginary stride of B to the auxinfo_t
1070                            object. */ \
1071                         bli_auxinfo_set_is_b( istep_b, aux ); \
1072 \
1073                         /* Loop over the m dimension (MR rows at a time). */ \
1074                         for ( i = 0; i < m_iter; ++i ) \
1075                         { \
1076                                 if ( trmm_r_ir_my_iter( i, ir_thread ) ) { \
1077 \
1078                                 ctype* restrict a2; \
1079 \
1080                                 m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
1081 \
1082                                 /* Compute the addresses of the next panels of A and B. */ \
1083                                 a2 = a1; \
1084                                 if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
1085                                 { \
1086                                         a2 = a_cast; \
1087                                         b2 = b1; \
1088                                         if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
1089                                                 b2 = b_cast; \
1090                                 } \
1091 \
1092                                 /* Save addresses of next panels of A and B to the auxinfo_t
1093                                    object. */ \
1094                                 bli_auxinfo_set_next_a( a2, aux ); \
1095                                 bli_auxinfo_set_next_b( b2, aux ); \
1096 \
1097                                 /* Handle interior and edge cases separately. */ \
1098                                 if ( m_cur == MR && n_cur == NR ) \
1099                                 { \
1100                                         /* Invoke the gemm micro-kernel. */ \
1101                                         gemm_ukr_cast( k, \
1102                                                        alpha_cast, \
1103                                                        a1, \
1104                                                        b1, \
1105                                                        one, \
1106                                                        c11, rs_c, cs_c, \
1107                                                        &aux ); \
1108                                 } \
1109                                 else \
1110                                 { \
1111                                         /* Invoke the gemm micro-kernel. */ \
1112                                         gemm_ukr_cast( k, \
1113                                                        alpha_cast, \
1114                                                        a1, \
1115                                                        b1, \
1116                                                        zero, \
1117                                                        ct, rs_ct, cs_ct, \
1118                                                        &aux ); \
1119 \
1120                                         /* Add the result to the edge of C. */ \
1121                                         PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
1122                                                                ct,  rs_ct, cs_ct, \
1123                                                                c11, rs_c,  cs_c ); \
1124                                 } \
1125                                 } \
1126 \
1127                                 a1  += rstep_a; \
1128                                 c11 += rstep_c; \
1129                         } \
1130                         } \
1131 \
1132                         b1 += cstep_b; \
1133                 } \
1134 \
1135                 c1 += cstep_c; \
1136         } \
1137 \
1138 /*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: a1", MR, k_b1121, a1, 1, MR, "%4.1f", "" );*/ \
1139 /*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: b1", k_b1121, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
1140 }
1141
/* Instantiate trmm_rl_ker_var2 from the GENTFUNC template defined directly
   above, with gemm_ukr_t as the micro-kernel type.
   NOTE(review): which datatypes are generated is decided by
   INSERT_GENTFUNC_BASIC, which is defined elsewhere -- confirm there. */
1142 INSERT_GENTFUNC_BASIC( trmm_rl_ker_var2, gemm_ukr_t )
1143
1144 #endif