blis/frame/3/trsm/bli_trsm_rl_ker_var2.c

   1 /*
   2
   3    BLIS
   4    An object-based framework for developing high-performance BLAS-like
   5    libraries.
   6
   7    Copyright (C) 2014, The University of Texas at Austin
   8
   9    Redistribution and use in source and binary forms, with or without
  10    modification, are permitted provided that the following conditions are
  11    met:
  12     - Redistributions of source code must retain the above copyright
  13       notice, this list of conditions and the following disclaimer.
  14     - Redistributions in binary form must reproduce the above copyright
  15       notice, this list of conditions and the following disclaimer in the
  16       documentation and/or other materials provided with the distribution.
  17     - Neither the name of The University of Texas at Austin nor the names
  18       of its contributors may be used to endorse or promote products
  19       derived from this software without specific prior written permission.
  20
  21    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  22    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  23    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  24    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  25    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  26    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  27    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  28    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  29    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  30    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  31    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  32
  33 */
  34
  35 #include "blis.h"
  36
  37 #define FUNCPTR_T gemm_fp
  38
  39 #ifdef BLIS_ENABLE_C66X_MEM_POOLS
  40 extern char   *pool_mk_mem_L1;
  41
  42 //#define L1POOLSIZE_C (BLIS_MK_POOL_SIZE_L1 + BLIS_KN_POOL_SIZE_L1 + BLIS_MN_POOL_SIZE_L1)
  43 //
  44 //#pragma DATA_SECTION(pool,".mem_l1")
  45 //#pragma DATA_ALIGN(pool,8)
  46 //char pool[L1POOLSIZE_C];
  47 #endif
  48
  49
  50 typedef void (*FUNCPTR_T)(
  51                            doff_t  diagoffb,
  52                            pack_t  schema_a,
  53                            pack_t  schema_b,
  54                            dim_t   m,
  55                            dim_t   n,
  56                            dim_t   k,
  57                            void*   alpha1,
  58                            void*   a, inc_t cs_a, inc_t pd_a, inc_t ps_a,
  59                            void*   b, inc_t rs_b, inc_t pd_b, inc_t ps_b,
  60                            void*   alpha2,
  61                            void*   c, inc_t rs_c, inc_t cs_c,
  62                            void*   gemmtrsm_ukr,
  63                            void*   gemm_ukr,
  64                            trsm_thrinfo_t* thread
  65                          );
  66
  67 static FUNCPTR_T GENARRAY(ftypes,trsm_rl_ker_var2);
  68
  69
  70 void bli_trsm_rl_ker_var2( obj_t*  a,
  71                            obj_t*  b,
  72                            obj_t*  c,
  73                            trsm_t* cntl,
  74                            trsm_thrinfo_t* thread )
  75 {
  76         num_t     dt_exec   = bli_obj_execution_datatype( *c );
  77
  78         doff_t    diagoffb  = bli_obj_diag_offset( *b );
  79
  80         pack_t    schema_a  = bli_obj_pack_schema( *a );
  81         pack_t    schema_b  = bli_obj_pack_schema( *b );
  82
  83         dim_t     m         = bli_obj_length( *c );
  84         dim_t     n         = bli_obj_width( *c );
  85         dim_t     k         = bli_obj_width( *a );
  86
  87         void*     buf_a     = bli_obj_buffer_at_off( *a );
  88         inc_t     cs_a      = bli_obj_col_stride( *a );
  89         inc_t     pd_a      = bli_obj_panel_dim( *a );
  90         inc_t     ps_a      = bli_obj_panel_stride( *a );
  91
  92         void*     buf_b     = bli_obj_buffer_at_off( *b );
  93         inc_t     rs_b      = bli_obj_row_stride( *b );
  94         inc_t     pd_b      = bli_obj_panel_dim( *b );
  95         inc_t     ps_b      = bli_obj_panel_stride( *b );
  96
  97         void*     buf_c     = bli_obj_buffer_at_off( *c );
  98         inc_t     rs_c      = bli_obj_row_stride( *c );
  99         inc_t     cs_c      = bli_obj_col_stride( *c );
 100
 101         void*     buf_alpha1;
 102         void*     buf_alpha2;
 103
 104         FUNCPTR_T f;
 105
 106         func_t*   gemmtrsm_ukrs;
 107         func_t*   gemm_ukrs;
 108         void*     gemmtrsm_ukr;
 109         void*     gemm_ukr;
 110
 111
 112         // Grab the address of the internal scalar buffer for the scalar
 113         // attached to A. This will be the alpha scalar used in the gemmtrsm
 114         // subproblems (ie: the scalar that would be applied to the packed
 115         // copy of A prior to it being updated by the trsm subproblem). This
 116         // scalar may be unit, if for example it was applied during packing.
 117         buf_alpha1 = bli_obj_internal_scalar_buffer( *a );
 118
 119         // Grab the address of the internal scalar buffer for the scalar
 120         // attached to C. This will be the "beta" scalar used in the gemm-only
 121         // subproblems that correspond to micro-panels that do not intersect
 122         // the diagonal. We need this separate scalar because it's possible
 123         // that the alpha attached to B was reset, if it was applied during
 124         // packing.
 125         buf_alpha2 = bli_obj_internal_scalar_buffer( *c );
 126
 127         // Index into the type combination array to extract the correct
 128         // function pointer.
 129         f = ftypes[dt_exec];
 130
 131         // Extract from the control tree node the func_t objects containing
 132         // the gemmtrsm and gemm micro-kernel function addresses, and then
 133         // query the function addresses corresponding to the current datatype.
 134         gemmtrsm_ukrs = cntl_gemmtrsm_u_ukrs( cntl );
 135         gemm_ukrs     = cntl_gemm_ukrs( cntl );
 136         gemmtrsm_ukr  = bli_func_obj_query( dt_exec, gemmtrsm_ukrs );
 137         gemm_ukr      = bli_func_obj_query( dt_exec, gemm_ukrs );
 138
 139         // Invoke the function.
 140         f( diagoffb,
 141            schema_a,
 142            schema_b,
 143            m,
 144            n,
 145            k,
 146            buf_alpha1,
 147            buf_a, cs_a, pd_a, ps_a,
 148            buf_b, rs_b, pd_b, ps_b,
 149            buf_alpha2,
 150            buf_c, rs_c, cs_c,
 151            gemmtrsm_ukr,
 152            gemm_ukr,
 153            thread );
 154 }
 155 #ifdef BLIS_ENABLE_C66X_MEM_POOLS
 156
 157 #if defined (BLIS_ENABLE_C66X_EDMA) && defined (BLIS_ENABLE_C66X_IDMA)
 158
 159 #undef  GENTFUNC
 160 #define GENTFUNC( ctype, ch, varname, gemmtrsmtype, gemmtype ) \
 161 \
 162 void PASTEMAC(ch,varname)( \
 163                            doff_t  diagoffb, \
 164                            pack_t  schema_a, \
 165                            pack_t  schema_b, \
 166                            dim_t   m, \
 167                            dim_t   n, \
 168                            dim_t   k, \
 169                            void*   alpha1, \
 170                            void*   a, inc_t cs_a, inc_t pd_a, inc_t ps_a, \
 171                            void*   b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \
 172                            void*   alpha2, \
 173                            void*   c, inc_t rs_c, inc_t cs_c, \
 174                            void*   gemmtrsm_ukr, \
 175                            void*   gemm_ukr, \
 176                            trsm_thrinfo_t* thread \
 177                          ) \
 178 { \
 179         /* Cast the micro-kernels' addresses to their function pointer types. */ \
 180         PASTECH(ch,gemmtrsmtype) gemmtrsm_ukr_cast = ( PASTECH(ch,gemmtrsmtype) ) gemmtrsm_ukr; \
 181         PASTECH(ch,gemmtype)     gemm_ukr_cast     = ( PASTECH(ch,gemmtype)) gemm_ukr; \
 182 \
 183         /* Temporary C buffer for edge cases. */ \
 184         ctype           ct[ PASTEMAC(ch,maxnr) * \
 185                             PASTEMAC(ch,maxmr) ] \
 186                             __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
 187         const inc_t     rs_ct      = 1; \
 188         const inc_t     cs_ct      = PASTEMAC(ch,maxnr); \
 189 \
 190         /* Alias some constants to simpler names. */ \
 191         const dim_t     MR         = pd_a; \
 192         const dim_t     NR         = pd_b; \
 193         const dim_t     PACKMR     = cs_a; \
 194         const dim_t     PACKNR     = rs_b; \
 195 \
 196         ctype* restrict zero        = PASTEMAC(ch,0); \
 197         ctype* restrict minus_one   = PASTEMAC(ch,m1); \
 198         ctype* restrict a_cast      = a; \
 199         ctype* restrict b_cast      = b; \
 200         ctype* restrict c_cast      = c; \
 201         ctype* restrict alpha1_cast = alpha1; \
 202         ctype* restrict alpha2_cast = alpha2; \
 203         ctype* restrict b1; \
 204         ctype* restrict c1; \
 205 \
 206         doff_t          diagoffb_j; \
 207         dim_t           k_full; \
 208         dim_t           m_iter, m_left; \
 209         dim_t           n_iter, n_left; \
 210         dim_t           m_cur; \
 211         dim_t           n_cur; \
 212         dim_t           k_b1121; \
 213         dim_t           k_b11; \
 214         dim_t           k_b21; \
 215         dim_t           off_b11; \
 216         /*dim_t           off_b21;*/ \
 217         dim_t           i, j, jb; \
 218         inc_t           rstep_a; \
 219         inc_t           cstep_b; \
 220         inc_t           rstep_c, cstep_c; \
 221         inc_t           istep_a; \
 222         inc_t           istep_b; \
 223         inc_t           off_scl; \
 224         inc_t           ss_b_num; \
 225         inc_t           ss_b_den; \
 226         inc_t           ps_b_cur; \
 227         auxinfo_t       aux; \
 228 \
 229         dim_t n_next; \
 230         inc_t rstep_c11, rs_c11, cs_c11; \
 231 \
 232         mem_t b1_L1_mem; \
 233         /*memcpy does not like b1_L1 if it is restrict. The resid of gemm is non zero if this is changed to ctype* restrict*/ \
 234         ctype* b1_L1; \
 235 \
 236         mem_t a1_L1_mem, a2_L1_mem; \
 237         ctype *a1_L1, *a2_L1, *temp; \
 238 \
 239     mem_t c0_L2_mem, c1_L2_mem, c2_L2_mem; \
 240         ctype *cNew0, *cNew1, *cNew2, *cNewTemp; \
 241         /*EDMA Declarations */ \
 242 \
 243         EdmaMgr_Handle edma_handle_b = NULL; \
 244         EdmaMgr_Handle edma_handle_c0 = NULL; \
 245         EdmaMgr_Handle edma_handle_c1 = NULL; \
 246 \
 247         /*
 248            Assumptions/assertions:
 249              rs_a == 1
 250              cs_a == PACKNR
 251              pd_a == NR
 252              ps_a == stride to next micro-panel of A
 253              rs_b == PACKMR
 254              cs_b == 1
 255              pd_b == MR
 256              ps_b == stride to next micro-panel of B
 257              rs_c == (no assumptions)
 258              cs_c == (no assumptions)
 259
 260           Note that MR/NR and PACKMR/PACKNR have been swapped to reflect the
 261           swapping of values in the control tree (ie: those values used when
 262           packing). This swapping is needed since we cast right-hand trsm in
 263           terms of transposed left-hand trsm. So, if we're going to be
 264           transposing the operation, then A needs to be packed with NR and B
 265           needs to be packed with MR (remember: B is the triangular matrix in
 266           the right-hand side parameter case).
 267         */ \
 268 \
 269         /* If any dimension is zero, return immediately. */ \
 270         if ( bli_zero_dim3( m, n, k ) ) return; \
 271 \
 272         /* Safeguard: If the current panel of B is entirely above its diagonal,
 273            it is implicitly zero. So we do nothing. */ \
 274         if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; \
 275 \
 276         /* Compute k_full as k inflated up to a multiple of NR. This is
 277            needed because some parameter combinations of trsm reduce k
 278            to advance past zero regions in the triangular matrix, and
 279            when computing the imaginary stride of B (the non-triangular
 280            matrix), which is used by 3m and 4m implementations, we need
 281            this unreduced value of k. */ \
 282         k_full = ( k % NR != 0 ? k + NR - ( k % NR ) : k ); \
 283 \
 284         /* Compute indexing scaling factor for for 4m or 3m. This is
 285            needed because one of the packing register blocksizes (PACKMR
 286            or PACKNR) is used to index into the micro-panels of the non-
 287            triangular matrix when computing with a diagonal-intersecting
 288            micro-panel of the triangular matrix. In the case of 4m or 3m,
 289            real values are stored in both sub-panels, and so the indexing
 290            needs to occur in units of real values. The value computed
 291            here is divided into the complex pointer offset to cause the
 292            pointer to be advanced by the correct value. */ \
 293         if ( bli_is_4m_packed( schema_b ) || \
 294              bli_is_3m_packed( schema_b ) || \
 295              bli_is_rih_packed( schema_b ) ) off_scl = 2; \
 296         else                                 off_scl = 1; \
 297 \
 298         /* Compute the storage stride. Usually this is just PACKMR (for A
 299            or PACKNR (for B). However, in the case of 3m, we need to scale
 300            the offset by 3/2. Since it's possible we may need to scale
 301            the packing dimension by a non-integer value, we break up the
 302            scaling factor into numerator and denominator. */ \
 303         if ( bli_is_3m_packed( schema_b ) ) { ss_b_num = 3*PACKNR; \
 304                                               ss_b_den = 2; } \
 305         else                                { ss_b_num = 1*PACKNR; \
 306                                               ss_b_den = 1; } \
 307 \
 308         /* If there is a zero region above where the diagonal of B intersects
 309            the left edge of the panel, adjust the pointer to A and treat this
 310            case as if the diagonal offset were zero. Note that we don't need to
 311            adjust the pointer to B since packm would have simply skipped over
 312            the region that was not stored. */ \
 313         if ( diagoffb < 0 ) \
 314         { \
 315                 j        = -diagoffb; \
 316                 k        = k - j; \
 317                 diagoffb = 0; \
 318                 a_cast   = a_cast + ( j * PACKMR ) / off_scl; \
 319         } \
 320 \
 321         /* If there is a zero region to the right of where the diagonal
 322            of B intersects the bottom of the panel, shrink it so that
 323            we can index to the correct place in C (corresponding to the
 324            part of the panel of B that was packed).
 325            NOTE: This is NOT being done to skip over "no-op" iterations,
 326            as with the trsm_lu macro-kernel. This MUST be done for correct
 327            execution because we use n (via n_iter) to compute diagonal and
 328            index offsets for backwards movement through B. */ \
 329         if ( diagoffb + k < n ) \
 330         { \
 331                 n = diagoffb + k; \
 332         } \
 333 \
 334         /* Check the k dimension, which needs to be a multiple of NR. If k
 335            isn't a multiple of NR, we adjust it higher to satisfy the micro-
 336            kernel, which is expecting to perform an NR x NR triangular solve.
 337            This adjustment of k is consistent with what happened when B was
 338            packed: all of its bottom/right edges were zero-padded, and
 339            furthermore, the panel that stores the bottom-right corner of the
 340            matrix has its diagonal extended into the zero-padded region (as
 341            identity). This allows the trsm of that bottom-right panel to
 342            proceed without producing any infs or NaNs that would infect the
 343            "good" values of the corresponding block of A. */ \
 344         if ( k % NR != 0 ) k += NR - ( k % NR ); \
 345 \
 346         /* NOTE: We don't need to check that n is a multiple of PACKNR since we
 347            know that the underlying buffer was already allocated to have an n
 348            dimension that is a multiple of PACKNR, with the region between the
 349            last column and the next multiple of NR zero-padded accordingly. */ \
 350 \
 351         /* Clear the temporary C buffer in case it has any infs or NaNs. */ \
 352         PASTEMAC(ch,set0s_mxn)( MR, NR, \
 353                                 ct, rs_ct, cs_ct ); \
 354 \
 355         /* Compute number of primary and leftover components of the m and n
 356        dimensions. */ \
 357         n_iter = n / NR; \
 358         n_left = n % NR; \
 359 \
 360         m_iter = m / MR; \
 361         m_left = m % MR; \
 362 \
 363         if ( n_left ) ++n_iter; \
 364         if ( m_left ) ++m_iter; \
 365 \
 366         /* Determine some increments used to step through A, B, and C. */ \
 367         rstep_a = ps_a; \
 368 \
 369         cstep_b = ps_b; \
 370 \
 371         rstep_c = rs_c * MR; \
 372         cstep_c = cs_c * NR; \
 373 \
 374         /* When C (MC*NR) is moved to L2 the stride to get to the next panel of MRxNR*/ \
 375         if(rs_c == 1) \
 376         { \
 377                 rstep_c11 = MR; /*stride to get to next panel of MRxNR in a panel of MCxNR*/\
 378                 rs_c11 = 1; \
 379                 cs_c11 = (m%2 == 0) ? m : m+1; /*stride to get to next column in a panel of MRxNR*/\
 380         } \
 381         else\
 382         { \
 383                 rstep_c11 = NR*MR; /*stride to get to next panel of MRxNR in a panel of MCxNR*/\
 384                 rs_c11 = NR; /* stride to get to next row in MRxNR panel*/\
 385                 cs_c11 = 1; /*stride to get to next column in a panel of MRxNR*/\
 386 \
 387                 rstep_c11 = rstep_c; \
 388                 rs_c11 = rs_c; \
 389                 cs_c11 = cs_c; \
 390         } \
 391 \
 392         istep_a = PACKMR * k_full; \
 393         istep_b = PACKNR * k; \
 394 \
 395         /* Save the pack schemas of A and B to the auxinfo_t object.
 396            NOTE: We swap the values for A and B since the triangular
 397            "A" matrix is actually contained within B. */ \
 398         bli_auxinfo_set_schema_a( schema_b, aux ); \
 399         bli_auxinfo_set_schema_b( schema_a, aux ); \
 400 \
 401         /* Save the imaginary stride of A to the auxinfo_t object.
 402            NOTE: We swap the values for A and B since the triangular
 403            "A" matrix is actually contained within B. */ \
 404         bli_auxinfo_set_is_b( istep_a, aux ); \
 405 \
 406         b1 = b_cast; \
 407         c1 = c_cast; \
 408 \
 409         /*printf("m %d n %d  k %d pd_a %d, ps_a %d pd_b %d, ps_b %d\n",m, n, k, pd_a, ps_a, pd_b, ps_b);*/\
 410         /*printf("Acquire B k %d, NR %d, size of ctype %d, B size requested %d\n", k, NR, sizeof(ctype), k*NR*sizeof(ctype)); \
 411         printf("Acquire A k %d, MR %d, size of ctype %d, A size requested %d\n", k, MR, sizeof(ctype), k*MR*sizeof(ctype));*/ \
 412         /*Acquiring a buffer for B in L1*/ \
 413         if((MKSTR(ch)=="c")==1) \
 414         {\
 415                 a1_L1 = (ctype*) (pool_mk_mem_L1 ); \
 416                 a2_L1 = (ctype*) (pool_mk_mem_L1 + k*MR*sizeof(ctype) ) ;\
 417                 b1_L1 = (ctype*) (pool_mk_mem_L1 + PASTEMAC(ch,bank) +  2 * k*MR*sizeof(ctype)) ;\
 418                 /*printf("%x %x %x \n", a1_L1, a2_L1, b1_L1);*/\
 419         }\
 420         else { \
 421         bli_mem_acquire_m( k*NR*sizeof(ctype), BLIS_BUFFER_FOR_B_PANEL_L1, &b1_L1_mem); \
 422         b1_L1 = bli_mem_buffer( &b1_L1_mem ); \
 423         b1_L1 = (ctype *) ((char *) b1_L1_mem.buf + PASTEMAC(ch,bank)); \
 424 \
 425         /*Acquiring a buffer for A in L1*/ \
 426         bli_mem_acquire_m( k*MR*sizeof(ctype), BLIS_BUFFER_FOR_A_BLOCK_L1, &a1_L1_mem); \
 427         a1_L1 = bli_mem_buffer( &a1_L1_mem ); \
 428 \
 429         bli_mem_acquire_m( k*MR*sizeof(ctype), BLIS_BUFFER_FOR_A_BLOCK_L1, &a2_L1_mem); \
 430         a2_L1 = bli_mem_buffer( &a2_L1_mem ); \
 431 \
 432         }\
 433         /*Acquiring buffers for C (MC_x_NR) in L2 */\
 434         /*printf("Acquire C NR %d, MR %d, size of ctype %d, C size requested %d\n", m, NR, sizeof(ctype), m*NR*sizeof(ctype));*/ \
 435         bli_mem_acquire_m( cs_c11*NR*sizeof(ctype), BLIS_BUFFER_FOR_C_PANEL_L2, &c0_L2_mem); \
 436         cNew0 = bli_mem_buffer( &c0_L2_mem ); \
 437 \
 438         bli_mem_acquire_m( cs_c11*NR*sizeof(ctype), BLIS_BUFFER_FOR_C_PANEL_L2, &c1_L2_mem); \
 439         cNew1 = bli_mem_buffer( &c1_L2_mem ); \
 440 \
 441         bli_mem_acquire_m( cs_c11*NR*sizeof(ctype), BLIS_BUFFER_FOR_C_PANEL_L2, &c2_L2_mem); \
 442         cNew2 = bli_mem_buffer( &c2_L2_mem ); \
 443 \
 444         /*Acquiring an EDMA  handle from the pool*/ \
 445         bli_dma_channel_acquire(&(edma_handle_b), CSL_chipReadDNUM()); \
 446         if(edma_handle_b == NULL) \
 447         { \
 448                 printf("ker_var2 Failed to alloc edma handle CoreID %d \n", CSL_chipReadDNUM()); \
 449         } \
 450         bli_dma_channel_acquire(&(edma_handle_c0), CSL_chipReadDNUM()); \
 451         if(edma_handle_c0 == NULL) \
 452         { \
 453                 printf("ker_var2 Failed to alloc edma handle for C0 CoreID %d \n", CSL_chipReadDNUM()); \
 454         } \
 455         /*Acquiring an EDMA  handle from the pool*/ \
 456         bli_dma_channel_acquire(&(edma_handle_c1), CSL_chipReadDNUM()); \
 457         if(edma_handle_c1 == NULL) \
 458         { \
 459                 printf("ker_var2 Failed to alloc edma handle for C1 CoreID %d \n", CSL_chipReadDNUM()); \
 460         } \
 461 \
 462         /*for(jb = 0; jb < 16; jb ++) \
 463                 printf("%f \t %f\n", a_cast[jb], b_cast[jb]);*/ \
 464             /* initiate first c transfer */ \
 465         /*printf("cstep_c %d rstep_c %d rs_c %d cs_c %d rstep_c11 %d rs_c11 %d cs_c11 %d\n", cstep_c, rstep_c, rs_c, cs_c, rstep_c11, rs_c11, cs_c11);*/\
 466         n_cur = ( bli_is_not_edge_f( 0, n_iter, n_left ) ? NR : n_left ); \
 467         if(rs_c == 1) \
 468         {\
 469                 c1 = c1 + (n_iter-1)*cstep_c; \
 470                 if (cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
 471                 { \
 472                         EdmaMgr_copy2D2DSep(edma_handle_c0, c1, \
 473                                     cNew1, m*sizeof(ctype), \
 474                                     n_cur, cs_c*sizeof(ctype), cs_c11*sizeof(ctype)); \
 475                 } \
 476                 else \
 477                 { \
 478                         dim_t ii; \
 479                         ctype *ptr_source; \
 480                         ctype *ptr_dest; \
 481                         ptr_source =  c1; \
 482                         ptr_dest = cNew1; \
 483                         for(ii = 0; ii < n_cur; ii++) \
 484                         { \
 485                         memcpy(ptr_dest, ptr_source, m*sizeof(ctype)); \
 486                         ptr_source += cs_c; \
 487                         ptr_dest   += cs_c11; \
 488                     } \
 489                 }\
 490         }\
 491         /*else \
 492                 EdmaMgr_copy2D2DSep(edma_handle_c0, c1 = c1 + (n_iter-1)*cstep_c, \
 493                                         cNew1, n_cur*sizeof(ctype), \
 494                                         m, rs_c*sizeof(ctype), rs_c11*sizeof(ctype));*/ \
 495 \
 496         /* Loop over the n dimension (NR columns at a time). */ \
 497         for ( jb = 0; jb < n_iter; ++jb ) \
 498         { \
 499                 ctype* restrict a1; \
 500                 ctype* restrict c11; \
 501                 ctype* restrict b11; \
 502                 ctype* restrict b21; \
 503                 ctype* restrict b2; \
 504 \
 505                 j          = n_iter - 1 - jb; \
 506                 diagoffb_j = diagoffb - ( doff_t )j*NR; \
 507                 a1         = a_cast; \
 508                 if(rs_c == 1) \
 509                         c11    = cNew1; \
 510                 else \
 511                         c11    = c1 + (n_iter-1)*cstep_c; \
 512 \
 513                 n_cur = ( bli_is_not_edge_b( jb, n_iter, n_left ) ? NR : n_left ); \
 514                 n_next = ( bli_is_not_edge_b( jb+1, n_iter, n_left ) ? NR : n_left ); \
 515 \
 516                 EdmaMgr_copy1D1D(edma_handle_b, b1, b1_L1, k*NR*sizeof(ctype)); \
 517                 /* Initialize our next panel of B to be the current panel of B. */ \
 518                 b2 = b1; \
 519                 if(rs_c == 1) \
 520                         EdmaMgr_wait(edma_handle_c0); \
 521                 if(jb < n_iter-1) /* no transfer for last iteration */ \
 522                 { \
 523                         if (rs_c == 1) \
 524                         { \
 525                                 if (cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
 526                                 { \
 527                                         EdmaMgr_copy2D2DSep(edma_handle_c0, c1-cstep_c, \
 528                                                 cNew0, m*sizeof(ctype), \
 529                                                 n_next, cs_c*sizeof(ctype), \
 530                                                 cs_c11*sizeof(ctype)); \
 531                                 } \
 532                                 else \
 533                                 { \
 534                                         dim_t ii; \
 535                                         ctype *ptr_source; \
 536                                         ctype *ptr_dest; \
 537                                         ptr_source =  c1-cstep_c; \
 538                                         ptr_dest = cNew0; \
 539                                         for(ii = 0; ii < n_next; ii++) \
 540                                         { \
 541                                                 memcpy(ptr_dest, ptr_source, m*sizeof(ctype)); \
 542                                                 ptr_source += cs_c; \
 543                                                 ptr_dest   += cs_c11; \
 544                                         } \
 545                                 }\
 546                         }\
 547                     /*else \
 548                     { \
 549                         EdmaMgr_copy2D2DSep(edma_handle_c0, c1 - cstep_c, \
 550                                                 cNew0, n_next*sizeof(ctype), \
 551                                             m, rs_c*sizeof(ctype), rs_c11*sizeof(ctype)); \
 552                     }*/ \
 553                 } \
 554 \
 555                 /* If the current panel of B intersects the diagonal, use a
 556                    special micro-kernel that performs a fused gemm and trsm.
 557                    If the current panel of B resides below the diagonal, use a
 558                    a regular gemm micro-kernel. Otherwise, if it is above the
 559                    diagonal, it was not packed (because it is implicitly zero)
 560                    and so we do nothing. */ \
 561                 if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \
 562                 { \
 563                         /* Determine the offset to and length of the panel that was packed
 564                            so we can index into the corresponding location in A. */ \
 565                         off_b11   = bli_max( -diagoffb_j, 0 ); \
 566                         k_b1121   = k - off_b11; \
 567                         k_b11     = NR; \
 568                         k_b21     = k_b1121 - NR; \
 569                         /*off_b21   = off_b11 + k_b11;*/ \
 570 \
 571                         /* Compute the addresses of the triangular block B11 and the
 572                            panel B21. */ \
 573                         /*b11       = b1; \
 574                         b21       = b1 + ( k_b11 * PACKNR ) / off_scl;*/ \
 575                         b11       = b1_L1; \
 576                         b21       = b1_L1 + ( k_b11 * PACKNR ) / off_scl; \
 577 \
 578                         /* Compute the panel stride for the current micro-panel. */ \
 579                         ps_b_cur = ( k_b1121 * ss_b_num ) / ss_b_den; \
 580 \
 581                         /* Save the imaginary stride of B to the auxinfo_t object.
 582                            NOTE: We swap the values for A and B since the triangular
 583                            "A" matrix is actually contained within B. */ \
 584                         bli_auxinfo_set_is_a( PACKNR * k_b1121, aux ); \
 585 \
 586                         /* Loop over the m dimension (MR rows at a time). */ \
 587                         for ( i = 0; i < m_iter; ++i ) \
 588                         { \
 589                                 if( trsm_my_iter( i, thread ) ){ \
 590 \
 591                                 ctype* restrict a11; \
 592                                 ctype* restrict a12; \
 593                                 ctype* restrict a2; \
 594 \
 595                                 m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
 596 \
 597                                 /*printf("%d %d %d %d \n", k, MR, off_b11, PACKMR);*/ \
 598                                 if (i == 0) \
 599                                 { \
 600                                         idma1_setup(a2_L1, a1 + ( off_b11 * PACKMR ) / off_scl, k_b1121*MR*sizeof(ctype), 0, 0, 7); \
 601                                 } \
 602 \
 603                                 /*ORIG TRSM*/ \
 604                                 /* Compute the addresses of the next panels of A and B. */ \
 605                                 /*a2 = a1;*/ \
 606 \
 607                                 /* Compute the addresses of the next panels of A and B. */ \
 608                     a2 = a1 + rstep_a; \
 609                                 while(!idma1_done()){;} \
 610                         temp = a1_L1; \
 611                                 a1_L1 = a2_L1; \
 612                                 a2_L1 = temp; \
 613                                 if(i == 0) \
 614                                 { \
 615                                         EdmaMgr_wait(edma_handle_b);\
 616                                 } \
 617                                 /*if ( i + thread_num_threads(thread) >= m_iter )*/ \
 618                                 if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
 619                                 { \
 620                                         a2 = a_cast; \
 621                                         b2 = b1 + ps_b_cur; \
 622                                         if ( bli_is_last_iter( jb, n_iter, 0, 1 ) ) \
 623                                                 b2 = b_cast; \
 624                                 } \
 625                                 else \
 626                                 { \
 627                                         /*Start next panel*/ \
 628                                         idma1_setup(a2_L1, a2 + ( off_b11 * PACKMR ) / off_scl , k_b1121*MR*sizeof(ctype), 0, 0, 7); \
 629                                 } \
 630 \
 631                                 /* Save addresses of next panels of A and B to the auxinfo_t
 632                                    object. NOTE: We swap the values for A and B since the
 633                                    triangular "A" matrix is actually contained within B. */ \
 634                                 bli_auxinfo_set_next_a( b2, aux ); \
 635                                 bli_auxinfo_set_next_b( a2, aux ); \
 636 \
 637                                 /* Compute the addresses of the A11 block and A12 panel. */ \
 638                                 /*a11  = a1 + ( off_b11 * PACKMR ) / off_scl; */\
 639                                 /*a12 = a1 + ( off_b21 * PACKMR ) / off_scl;*/ \
 640                                 a11 = a1_L1;\
 641                                 a12 = a1_L1 + ( k_b11 * PACKMR ) / off_scl; \
 642                                 /* Handle interior and edge cases separately. */ \
 643                                 /*printf("Calling GEMMTRSM ukernel\n");*/\
 644                                 if ( m_cur == MR && n_cur == NR ) \
 645                                 { \
 646                                         /* Invoke the fused gemm/trsm micro-kernel. */ \
 647                                         gemmtrsm_ukr_cast( k_b21, \
 648                                                            alpha1_cast, \
 649                                                            b21, \
 650                                                            b11, \
 651                                                            a12, \
 652                                                            a11, \
 653                                                            c11, cs_c11, rs_c11, /*cs_c, rs_c, cstep_c11, 1,*/ \
 654                                                            &aux ); \
 655                                 } \
 656                                 else \
 657                                 { \
 658                                         /* Invoke the fused gemm/trsm micro-kernel. */ \
 659                                         gemmtrsm_ukr_cast( k_b21, \
 660                                                            alpha1_cast, \
 661                                                            b21, \
 662                                                            b11, \
 663                                                            a12, \
 664                                                            a11, \
 665                                                            ct, cs_ct, rs_ct, \
 666                                                            &aux ); \
 667 \
 668                                         /* Copy the result to the bottom edge of C. */ \
 669                                         PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
 670                                                                 ct,  rs_ct, cs_ct, \
 671                                                                 c11, rs_c11, cs_c11 /* rs_c,  cs_c **** 1, cstep_c11*/); \
 672                                 } \
 673                                 while(!idma1_done()){;} \
 674                                 /*printf("%d %d \n", k_b11, PACKMR);*/ \
 675                                 idma1_setup(a1 + ( off_b11 * PACKMR ) / off_scl, a11, k_b11*PACKMR*sizeof(ctype), 0,0,7); \
 676                                 } \
 677 \
 678                                 a1  += rstep_a; \
 679                                 /*c11 += rstep_c;*/ \
 680                                 c11 += rstep_c11; \
 681                         } \
 682 \
 683                         b1 += ps_b_cur; \
 684                 } \
 685                 else if ( bli_is_strictly_below_diag_n( diagoffb_j, k, NR ) ) \
 686                 { \
 687                         /* Save the imaginary stride of B to the auxinfo_t object.
 688                            NOTE: We swap the values for A and B since the triangular
 689                            "A" matrix is actually contained within B. */ \
 690                         bli_auxinfo_set_is_a( istep_b, aux ); \
 691 \
 692                         /* Loop over the m dimension (MR rows at a time). */ \
 693                         for ( i = 0; i < m_iter; ++i ) \
 694                         { \
 695                                 if( trsm_my_iter( i, thread ) ){ \
 696 \
 697                                 ctype* restrict a2; \
 698 \
 699                                 m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
 700 \
 701                                 if(i == 0) \
 702                                 { \
 703                                         idma1_setup(a2_L1, a1, k*MR*sizeof(ctype), 0, 0, 7); \
 704                                 } \
 705 \
 706                                 /*ORIG TRSM*/ \
 707                                 /* Compute the addresses of the next panels of A and B. */ \
 708                                 /*a2 = a1;*/\
 709                                 /* Compute the addresses of the next panels of A and B. */ \
 710                                 a2 = a1 + rstep_a; \
 711                                 while(!idma1_done()){;} \
 712                                 temp = a1_L1; \
 713                                 a1_L1 = a2_L1; \
 714                                 a2_L1 = temp; \
 715                                 if(i == 0) \
 716                                 { \
 717                                         EdmaMgr_wait(edma_handle_b);\
 718                                 }\
 719                                 /*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\
 720                                 if ( i + thread_num_threads(thread) >= m_iter ) \
 721                                 { \
 722                                         a2 = a_cast; \
 723                                         b2 = b1 + cstep_b; \
 724                                         if ( bli_is_last_iter( jb, n_iter, 0, 1 ) ) \
 725                                                 b2 = b_cast; \
 726                                 } \
 727                                 else \
 728                                 { \
 729                                         /*Start next panel*/ \
 730                                         idma1_setup(a2_L1, a2 , k*MR*sizeof(ctype), 0, 0, 7); \
 731                                 } \
 732 \
 733                                 /* Save addresses of next panels of A and B to the auxinfo_t
 734                                    object. NOTE: We swap the values for A and B since the
 735                                    triangular "A" matrix is actually contained within B. */ \
 736                                 bli_auxinfo_set_next_a( b2, aux ); \
 737                                 bli_auxinfo_set_next_b( a2, aux ); \
 738 \
 739                                 /* Handle interior and edge cases separately. */ \
 740                                 /*printf("Calling GEMM ukernel\n");*/\
 741                                 if ( m_cur == MR && n_cur == NR ) \
 742                                 { \
 743                                         /* Invoke the gemm micro-kernel. */ \
 744                                         gemm_ukr_cast( k, \
 745                                                        minus_one, \
 746                                                        b1_L1, \
 747                                                        a1_L1, \
 748                                                        alpha2_cast, \
 749                                                        c11, cs_c11, rs_c11, /*cs_c, rs_c,  * cstep_c11, 1,*/ \
 750                                                        &aux ); \
 751                                 } \
 752                                 else \
 753                                 { \
 754                                         /* Invoke the gemm micro-kernel. */ \
 755                                         gemm_ukr_cast( k, \
 756                                                        minus_one, \
 757                                                        b1_L1, \
 758                                                        a1_L1, \
 759                                                        zero, \
 760                                                        ct, cs_ct, rs_ct, \
 761                                                        &aux ); \
 762 \
 763                                         /* Add the result to the edge of C. */ \
 764                                         PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
 765                                                                 ct,  rs_ct, cs_ct, \
 766                                                                 alpha2_cast, \
 767                                                                 c11, rs_c11, cs_c11 /*rs_c,  cs_c  *1, cstep_c11 */); \
 768                                 } \
 769                                 } \
 770 \
 771                                 a1  += rstep_a; \
 772                                 /*c11 += rstep_c;*/ \
 773                                 c11 += rstep_c11; \
 774                         } \
 775 \
 776                         b1 += cstep_b; \
 777                 } \
 778 \
 779                 /* circularly shift buffers */ \
 780                 if(rs_c==1) \
 781                 { \
 782                 cNewTemp = cNew0; \
 783                 cNew0 = cNew2; \
 784                 cNew2 = cNew1; \
 785                 cNew1 = cNewTemp; \
 786                 if(j != 0) /* wait for save c to complete; skip first iteration */ \
 787                 { \
 788                         EdmaMgr_wait(edma_handle_c1); \
 789                 } \
 790                 } \
 791                 /* save updated c*/ \
 792                 if(rs_c==1) \
 793                 { \
 794                         if (cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
 795                         { \
 796                                 EdmaMgr_copy2D2DSep(edma_handle_c1, cNew2, c1, m*sizeof(ctype),  \
 797                                                                 n_cur, cs_c11*sizeof(ctype), cs_c*sizeof(ctype)); \
 798                         }\
 799                         else \
 800                         { \
 801                                 dim_t ii; \
 802                                 ctype *ptr_source; \
 803                                 ctype *ptr_dest; \
 804                                 ptr_source = cNew2; \
 805                                 ptr_dest = c1; \
 806                                 for(ii = 0; ii < n_cur; ii++) \
 807                                 { \
 808                                         memcpy(ptr_dest, ptr_source, m*sizeof(ctype)); \
 809                                         ptr_source += cs_c11; \
 810                                         ptr_dest   += cs_c; \
 811                                 } \
 812                         } \
 813                 } \
 814                 /*else \
 815                         EdmaMgr_copy2D2DSep(edma_handle_c1, cNew1, c1, n_cur*sizeof(ctype), \
 816                                                                 m, rs_c11*sizeof(ctype), rs_c*sizeof(ctype)); */\
 817 \
 818                 c1 -= cstep_c; \
 819         } \
 820         \
 821     bli_mem_release( &c2_L2_mem ); \
 822         bli_mem_release( &c1_L2_mem ); \
 823         bli_mem_release( &c0_L2_mem ); \
 824         if((MKSTR(ch)=="c")==0) \
 825         {\
 826         bli_mem_release( &a2_L1_mem ); \
 827         bli_mem_release( &a1_L1_mem ); \
 828         bli_mem_release( &b1_L1_mem ); \
 829         }\
 830         if ( edma_handle_b != NULL ) \
 831     { \
 832                 bli_dma_channel_release(edma_handle_b, CSL_chipReadDNUM()); \
 833                 edma_handle_b = NULL; \
 834     } \
 835     if ( edma_handle_c0 != NULL ) \
 836     { \
 837         bli_dma_channel_release(edma_handle_c0, CSL_chipReadDNUM()); \
 838         edma_handle_c0 = NULL; \
 839     } \
 840     if ( edma_handle_c1 != NULL ) \
 841     { \
 842         EdmaMgr_wait(edma_handle_c1); /* wait for save c to complete */ \
 843         bli_dma_channel_release(edma_handle_c1, CSL_chipReadDNUM()); \
 844         edma_handle_c1 = NULL; \
 845     } \
 846 }
 847
 848 INSERT_GENTFUNC_BASIC2( trsm_rl_ker_var2, gemmtrsm_ukr_t, gemm_ukr_t )
 849 #endif
 850
 851 #else
 852
 853 #undef  GENTFUNC
 854 #define GENTFUNC( ctype, ch, varname, gemmtrsmtype, gemmtype ) \
 855 \
 856 void PASTEMAC(ch,varname)( \
 857                            doff_t  diagoffb, \
 858                            pack_t  schema_a, \
 859                            pack_t  schema_b, \
 860                            dim_t   m, \
 861                            dim_t   n, \
 862                            dim_t   k, \
 863                            void*   alpha1, \
 864                            void*   a, inc_t cs_a, inc_t pd_a, inc_t ps_a, \
 865                            void*   b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \
 866                            void*   alpha2, \
 867                            void*   c, inc_t rs_c, inc_t cs_c, \
 868                            void*   gemmtrsm_ukr, \
 869                            void*   gemm_ukr, \
 870                            trsm_thrinfo_t* thread \
 871                          ) \
 872 { \
 873         /* Cast the micro-kernels' addresses to their function pointer types. */ \
 874         PASTECH(ch,gemmtrsmtype) gemmtrsm_ukr_cast = gemmtrsm_ukr; \
 875         PASTECH(ch,gemmtype)     gemm_ukr_cast     = gemm_ukr; \
 876 \
 877         /* Temporary C buffer for edge cases. */ \
 878         ctype           ct[ PASTEMAC(ch,maxnr) * \
 879                             PASTEMAC(ch,maxmr) ] \
 880                             __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
 881         const inc_t     rs_ct      = 1; \
 882         const inc_t     cs_ct      = PASTEMAC(ch,maxnr); \
 883 \
 884         /* Alias some constants to simpler names. */ \
 885         const dim_t     MR         = pd_a; \
 886         const dim_t     NR         = pd_b; \
 887         const dim_t     PACKMR     = cs_a; \
 888         const dim_t     PACKNR     = rs_b; \
 889 \
 890         ctype* restrict zero        = PASTEMAC(ch,0); \
 891         ctype* restrict minus_one   = PASTEMAC(ch,m1); \
 892         ctype* restrict a_cast      = a; \
 893         ctype* restrict b_cast      = b; \
 894         ctype* restrict c_cast      = c; \
 895         ctype* restrict alpha1_cast = alpha1; \
 896         ctype* restrict alpha2_cast = alpha2; \
 897         ctype* restrict b1; \
 898         ctype* restrict c1; \
 899 \
 900         doff_t          diagoffb_j; \
 901         dim_t           k_full; \
 902         dim_t           m_iter, m_left; \
 903         dim_t           n_iter, n_left; \
 904         dim_t           m_cur; \
 905         dim_t           n_cur; \
 906         dim_t           k_b1121; \
 907         dim_t           k_b11; \
 908         dim_t           k_b21; \
 909         dim_t           off_b11; \
 910         dim_t           off_b21; \
 911         dim_t           i, j, jb; \
 912         inc_t           rstep_a; \
 913         inc_t           cstep_b; \
 914         inc_t           rstep_c, cstep_c; \
 915         inc_t           istep_a; \
 916         inc_t           istep_b; \
 917         inc_t           off_scl; \
 918         inc_t           ss_b_num; \
 919         inc_t           ss_b_den; \
 920         inc_t           ps_b_cur; \
 921         auxinfo_t       aux; \
 922 \
 923         /*
 924            Assumptions/assertions:
 925              rs_a == 1
 926              cs_a == PACKNR
 927              pd_a == NR
 928              ps_a == stride to next micro-panel of A
 929              rs_b == PACKMR
 930              cs_b == 1
 931              pd_b == MR
 932              ps_b == stride to next micro-panel of B
 933              rs_c == (no assumptions)
 934              cs_c == (no assumptions)
 935
 936           Note that MR/NR and PACKMR/PACKNR have been swapped to reflect the
 937           swapping of values in the control tree (ie: those values used when
 938           packing). This swapping is needed since we cast right-hand trsm in
 939           terms of transposed left-hand trsm. So, if we're going to be
 940           transposing the operation, then A needs to be packed with NR and B
 941           needs to be packed with MR (remember: B is the triangular matrix in
 942           the right-hand side parameter case).
 943         */ \
 944 \
 945         /* If any dimension is zero, return immediately. */ \
 946         if ( bli_zero_dim3( m, n, k ) ) return; \
 947 \
 948         /* Safeguard: If the current panel of B is entirely above its diagonal,
 949            it is implicitly zero. So we do nothing. */ \
 950         if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; \
 951 \
 952         /* Compute k_full as k inflated up to a multiple of NR. This is
 953            needed because some parameter combinations of trsm reduce k
 954            to advance past zero regions in the triangular matrix, and
 955            when computing the imaginary stride of B (the non-triangular
 956            matrix), which is used by 3m and 4m implementations, we need
 957            this unreduced value of k. */ \
 958         k_full = ( k % NR != 0 ? k + NR - ( k % NR ) : k ); \
 959 \
 960         /* Compute indexing scaling factor for for 4m or 3m. This is
 961            needed because one of the packing register blocksizes (PACKMR
 962            or PACKNR) is used to index into the micro-panels of the non-
 963            triangular matrix when computing with a diagonal-intersecting
 964            micro-panel of the triangular matrix. In the case of 4m or 3m,
 965            real values are stored in both sub-panels, and so the indexing
 966            needs to occur in units of real values. The value computed
 967            here is divided into the complex pointer offset to cause the
 968            pointer to be advanced by the correct value. */ \
 969         if ( bli_is_4m_packed( schema_b ) || \
 970              bli_is_3m_packed( schema_b ) || \
 971              bli_is_rih_packed( schema_b ) ) off_scl = 2; \
 972         else                                 off_scl = 1; \
 973 \
 974         /* Compute the storage stride. Usually this is just PACKMR (for A
 975            or PACKNR (for B). However, in the case of 3m, we need to scale
 976            the offset by 3/2. Since it's possible we may need to scale
 977            the packing dimension by a non-integer value, we break up the
 978            scaling factor into numerator and denominator. */ \
 979         if ( bli_is_3m_packed( schema_b ) ) { ss_b_num = 3*PACKNR; \
 980                                               ss_b_den = 2; } \
 981         else                                { ss_b_num = 1*PACKNR; \
 982                                               ss_b_den = 1; } \
 983 \
 984         /* If there is a zero region above where the diagonal of B intersects
 985            the left edge of the panel, adjust the pointer to A and treat this
 986            case as if the diagonal offset were zero. Note that we don't need to
 987            adjust the pointer to B since packm would have simply skipped over
 988            the region that was not stored. */ \
 989         if ( diagoffb < 0 ) \
 990         { \
 991                 j        = -diagoffb; \
 992                 k        = k - j; \
 993                 diagoffb = 0; \
 994                 a_cast   = a_cast + ( j * PACKMR ) / off_scl; \
 995         } \
 996 \
 997         /* If there is a zero region to the right of where the diagonal
 998            of B intersects the bottom of the panel, shrink it so that
 999            we can index to the correct place in C (corresponding to the
1000            part of the panel of B that was packed).
1001            NOTE: This is NOT being done to skip over "no-op" iterations,
1002            as with the trsm_lu macro-kernel. This MUST be done for correct
1003            execution because we use n (via n_iter) to compute diagonal and
1004            index offsets for backwards movement through B. */ \
1005         if ( diagoffb + k < n ) \
1006         { \
1007                 n = diagoffb + k; \
1008         } \
1009 \
1010         /* Check the k dimension, which needs to be a multiple of NR. If k
1011            isn't a multiple of NR, we adjust it higher to satisfy the micro-
1012            kernel, which is expecting to perform an NR x NR triangular solve.
1013            This adjustment of k is consistent with what happened when B was
1014            packed: all of its bottom/right edges were zero-padded, and
1015            furthermore, the panel that stores the bottom-right corner of the
1016            matrix has its diagonal extended into the zero-padded region (as
1017            identity). This allows the trsm of that bottom-right panel to
1018            proceed without producing any infs or NaNs that would infect the
1019            "good" values of the corresponding block of A. */ \
1020         if ( k % NR != 0 ) k += NR - ( k % NR ); \
1021 \
1022         /* NOTE: We don't need to check that n is a multiple of PACKNR since we
1023            know that the underlying buffer was already allocated to have an n
1024            dimension that is a multiple of PACKNR, with the region between the
1025            last column and the next multiple of NR zero-padded accordingly. */ \
1026 \
1027         /* Clear the temporary C buffer in case it has any infs or NaNs. */ \
1028         PASTEMAC(ch,set0s_mxn)( MR, NR, \
1029                                 ct, rs_ct, cs_ct ); \
1030 \
1031         /* Compute number of primary and leftover components of the m and n
1032        dimensions. */ \
1033         n_iter = n / NR; \
1034         n_left = n % NR; \
1035 \
1036         m_iter = m / MR; \
1037         m_left = m % MR; \
1038 \
1039         if ( n_left ) ++n_iter; \
1040         if ( m_left ) ++m_iter; \
1041 \
1042         /* Determine some increments used to step through A, B, and C. */ \
1043         rstep_a = ps_a; \
1044 \
1045         cstep_b = ps_b; \
1046 \
1047         rstep_c = rs_c * MR; \
1048         cstep_c = cs_c * NR; \
1049 \
1050         istep_a = PACKMR * k_full; \
1051         istep_b = PACKNR * k; \
1052 \
1053         /* Save the pack schemas of A and B to the auxinfo_t object.
1054            NOTE: We swap the values for A and B since the triangular
1055            "A" matrix is actually contained within B. */ \
1056         bli_auxinfo_set_schema_a( schema_b, aux ); \
1057         bli_auxinfo_set_schema_b( schema_a, aux ); \
1058 \
1059         /* Save the imaginary stride of A to the auxinfo_t object.
1060            NOTE: We swap the values for A and B since the triangular
1061            "A" matrix is actually contained within B. */ \
1062         bli_auxinfo_set_is_b( istep_a, aux ); \
1063 \
1064         b1 = b_cast; \
1065         c1 = c_cast; \
1066 \
1067         /* Loop over the n dimension (NR columns at a time). */ \
1068         for ( jb = 0; jb < n_iter; ++jb ) \
1069         { \
1070                 ctype* restrict a1; \
1071                 ctype* restrict c11; \
1072                 ctype* restrict b11; \
1073                 ctype* restrict b21; \
1074                 ctype* restrict b2; \
1075 \
1076                 j          = n_iter - 1 - jb; \
1077                 diagoffb_j = diagoffb - ( doff_t )j*NR; \
1078                 a1         = a_cast; \
1079                 c11        = c1 + (n_iter-1)*cstep_c; \
1080 \
1081                 n_cur = ( bli_is_not_edge_b( jb, n_iter, n_left ) ? NR : n_left ); \
1082 \
1083                 /* Initialize our next panel of B to be the current panel of B. */ \
1084                 b2 = b1; \
1085 \
1086                 /* If the current panel of B intersects the diagonal, use a
1087                    special micro-kernel that performs a fused gemm and trsm.
1088                    If the current panel of B resides below the diagonal, use a
1089                    a regular gemm micro-kernel. Otherwise, if it is above the
1090                    diagonal, it was not packed (because it is implicitly zero)
1091                    and so we do nothing. */ \
1092                 if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \
1093                 { \
1094                         /* Determine the offset to and length of the panel that was packed
1095                            so we can index into the corresponding location in A. */ \
1096                         off_b11   = bli_max( -diagoffb_j, 0 ); \
1097                         k_b1121   = k - off_b11; \
1098                         k_b11     = NR; \
1099                         k_b21     = k_b1121 - NR; \
1100                         off_b21   = off_b11 + k_b11; \
1101 \
1102                         /* Compute the addresses of the triangular block B11 and the
1103                            panel B21. */ \
1104                         b11       = b1; \
1105                         b21       = b1 + ( k_b11 * PACKNR ) / off_scl; \
1106 \
1107                         /* Compute the panel stride for the current micro-panel. */ \
1108                         ps_b_cur = ( k_b1121 * ss_b_num ) / ss_b_den; \
1109 \
1110                         /* Save the imaginary stride of B to the auxinfo_t object.
1111                            NOTE: We swap the values for A and B since the triangular
1112                            "A" matrix is actually contained within B. */ \
1113                         bli_auxinfo_set_is_a( PACKNR * k_b1121, aux ); \
1114 \
1115                         /* Loop over the m dimension (MR rows at a time). */ \
1116                         for ( i = 0; i < m_iter; ++i ) \
1117                         { \
1118                                 if( trsm_my_iter( i, thread ) ){ \
1119 \
1120                                 ctype* restrict a11; \
1121                                 ctype* restrict a12; \
1122                                 ctype* restrict a2; \
1123 \
1124                                 m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
1125 \
1126                                 /* Compute the addresses of the A11 block and A12 panel. */ \
1127                                 a11  = a1 + ( off_b11 * PACKMR ) / off_scl; \
1128                                 a12  = a1 + ( off_b21 * PACKMR ) / off_scl; \
1129 \
1130                                 /* Compute the addresses of the next panels of A and B. */ \
1131                                 a2 = a1; \
1132                                 /*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\
1133                                 if ( i + thread_num_threads(thread) >= m_iter ) \
1134                                 { \
1135                                         a2 = a_cast; \
1136                                         b2 = b1 + ps_b_cur; \
1137                                         if ( bli_is_last_iter( jb, n_iter, 0, 1 ) ) \
1138                                                 b2 = b_cast; \
1139                                 } \
1140 \
1141                                 /* Save addresses of next panels of A and B to the auxinfo_t
1142                                    object. NOTE: We swap the values for A and B since the
1143                                    triangular "A" matrix is actually contained within B. */ \
1144                                 bli_auxinfo_set_next_a( b2, aux ); \
1145                                 bli_auxinfo_set_next_b( a2, aux ); \
1146 \
1147                                 /* Handle interior and edge cases separately. */ \
1148                                 if ( m_cur == MR && n_cur == NR ) \
1149                                 { \
1150                                         /* Invoke the fused gemm/trsm micro-kernel. */ \
1151                                         gemmtrsm_ukr_cast( k_b21, \
1152                                                            alpha1_cast, \
1153                                                            b21, \
1154                                                            b11, \
1155                                                            a12, \
1156                                                            a11, \
1157                                                            c11, cs_c, rs_c, \
1158                                                            &aux ); \
1159                                 } \
1160                                 else \
1161                                 { \
1162                                         /* Invoke the fused gemm/trsm micro-kernel. */ \
1163                                         gemmtrsm_ukr_cast( k_b21, \
1164                                                            alpha1_cast, \
1165                                                            b21, \
1166                                                            b11, \
1167                                                            a12, \
1168                                                            a11, \
1169                                                            ct, cs_ct, rs_ct, \
1170                                                            &aux ); \
1171 \
1172                                         /* Copy the result to the bottom edge of C. */ \
1173                                         PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
1174                                                                 ct,  rs_ct, cs_ct, \
1175                                                                 c11, rs_c,  cs_c ); \
1176                                 } \
1177                                 } \
1178 \
1179                                 a1  += rstep_a; \
1180                                 c11 += rstep_c; \
1181                         } \
1182 \
1183                         b1 += ps_b_cur; \
1184                 } \
1185                 else if ( bli_is_strictly_below_diag_n( diagoffb_j, k, NR ) ) \
1186                 { \
1187                         /* Save the imaginary stride of B to the auxinfo_t object.
1188                            NOTE: We swap the values for A and B since the triangular
1189                            "A" matrix is actually contained within B. */ \
1190                         bli_auxinfo_set_is_a( istep_b, aux ); \
1191 \
1192                         /* Loop over the m dimension (MR rows at a time). */ \
1193                         for ( i = 0; i < m_iter; ++i ) \
1194                         { \
1195                                 if( trsm_my_iter( i, thread ) ){ \
1196 \
1197                                 ctype* restrict a2; \
1198 \
1199                                 m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
1200 \
1201                                 /* Compute the addresses of the next panels of A and B. */ \
1202                                 a2 = a1; \
1203                                 /*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\
1204                                 if ( i + thread_num_threads(thread) >= m_iter ) \
1205                                 { \
1206                                         a2 = a_cast; \
1207                                         b2 = b1 + cstep_b; \
1208                                         if ( bli_is_last_iter( jb, n_iter, 0, 1 ) ) \
1209                                                 b2 = b_cast; \
1210                                 } \
1211 \
1212                                 /* Save addresses of next panels of A and B to the auxinfo_t
1213                                    object. NOTE: We swap the values for A and B since the
1214                                    triangular "A" matrix is actually contained within B. */ \
1215                                 bli_auxinfo_set_next_a( b2, aux ); \
1216                                 bli_auxinfo_set_next_b( a2, aux ); \
1217 \
1218                                 /* Handle interior and edge cases separately. */ \
1219                                 if ( m_cur == MR && n_cur == NR ) \
1220                                 { \
1221                                         /* Invoke the gemm micro-kernel. */ \
1222                                         gemm_ukr_cast( k, \
1223                                                        minus_one, \
1224                                                        b1, \
1225                                                        a1, \
1226                                                        alpha2_cast, \
1227                                                        c11, cs_c, rs_c, \
1228                                                        &aux ); \
1229                                 } \
1230                                 else \
1231                                 { \
1232                                         /* Invoke the gemm micro-kernel. */ \
1233                                         gemm_ukr_cast( k, \
1234                                                        minus_one, \
1235                                                        b1, \
1236                                                        a1, \
1237                                                        zero, \
1238                                                        ct, cs_ct, rs_ct, \
1239                                                        &aux ); \
1240 \
1241                                         /* Add the result to the edge of C. */ \
1242                                         PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
1243                                                                 ct,  rs_ct, cs_ct, \
1244                                                                 alpha2_cast, \
1245                                                                 c11, rs_c,  cs_c ); \
1246                                 } \
1247                                 } \
1248 \
1249                                 a1  += rstep_a; \
1250                                 c11 += rstep_c; \
1251                         } \
1252 \
1253                         b1 += cstep_b; \
1254                 } \
1255 \
1256                 c1 -= cstep_c; \
1257         } \
1258 }
1259
1260 INSERT_GENTFUNC_BASIC2( trsm_rl_ker_var2, gemmtrsm_ukr_t, gemm_ukr_t )
1261
1262 #endif