[dense-linear-algebra-libraries/linalg.git] / src / ti / linalg / blis / frame / 3 / trmm / bli_trmm_rl_ker_var2.c
1 /*
3 BLIS
4 An object-based framework for developing high-performance BLAS-like
5 libraries.
7 Copyright (C) 2014, The University of Texas at Austin
9 Redistribution and use in source and binary forms, with or without
10 modification, are permitted provided that the following conditions are
11 met:
12 - Redistributions of source code must retain the above copyright
13 notice, this list of conditions and the following disclaimer.
14 - Redistributions in binary form must reproduce the above copyright
15 notice, this list of conditions and the following disclaimer in the
16 documentation and/or other materials provided with the distribution.
17 - Neither the name of The University of Texas at Austin nor the names
18 of its contributors may be used to endorse or promote products
19 derived from this software without specific prior written permission.
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 */
35 #include "blis.h"
37 #define FUNCPTR_T gemm_fp
39 #ifdef BLIS_ENABLE_PROFILE
40 #define BLIS_ENABLE_PROFILE_KERVAR2 1
41 #else
42 #define BLIS_ENABLE_PROFILE_KERVAR2 0
43 #endif
45 typedef void (*FUNCPTR_T)(
46 doff_t diagoffb,
47 pack_t schema_a,
48 pack_t schema_b,
49 dim_t m,
50 dim_t n,
51 dim_t k,
52 void* alpha,
53 void* a, inc_t cs_a, inc_t pd_a, inc_t ps_a,
54 void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b,
55 void* beta,
56 void* c, inc_t rs_c, inc_t cs_c,
57 void* gemm_ukr,
58 trmm_thrinfo_t* thread
59 );
61 static FUNCPTR_T GENARRAY(ftypes,trmm_rl_ker_var2);
64 void bli_trmm_rl_ker_var2( obj_t* a,
65 obj_t* b,
66 obj_t* c,
67 gemm_t* cntl,
68 trmm_thrinfo_t* thread )
69 {
70 num_t dt_exec = bli_obj_execution_datatype( *c );
72 doff_t diagoffb = bli_obj_diag_offset( *b );
74 pack_t schema_a = bli_obj_pack_schema( *a );
75 pack_t schema_b = bli_obj_pack_schema( *b );
77 dim_t m = bli_obj_length( *c );
78 dim_t n = bli_obj_width( *c );
79 dim_t k = bli_obj_width( *a );
81 void* buf_a = bli_obj_buffer_at_off( *a );
82 inc_t cs_a = bli_obj_col_stride( *a );
83 inc_t pd_a = bli_obj_panel_dim( *a );
84 inc_t ps_a = bli_obj_panel_stride( *a );
86 void* buf_b = bli_obj_buffer_at_off( *b );
87 inc_t rs_b = bli_obj_row_stride( *b );
88 inc_t pd_b = bli_obj_panel_dim( *b );
89 inc_t ps_b = bli_obj_panel_stride( *b );
91 void* buf_c = bli_obj_buffer_at_off( *c );
92 inc_t rs_c = bli_obj_row_stride( *c );
93 inc_t cs_c = bli_obj_col_stride( *c );
95 obj_t scalar_a;
96 obj_t scalar_b;
98 void* buf_alpha;
99 void* buf_beta;
101 FUNCPTR_T f;
103 func_t* gemm_ukrs;
104 void* gemm_ukr;
106 // Detach and multiply the scalars attached to A and B.
107 bli_obj_scalar_detach( a, &scalar_a );
108 bli_obj_scalar_detach( b, &scalar_b );
109 bli_mulsc( &scalar_a, &scalar_b );
111 // Grab the addresses of the internal scalar buffers for the scalar
112 // merged above and the scalar attached to C.
113 buf_alpha = bli_obj_internal_scalar_buffer( scalar_b );
114 buf_beta = bli_obj_internal_scalar_buffer( *c );
116 // Index into the type combination array to extract the correct
117 // function pointer.
118 f = ftypes[dt_exec];
120 // Extract from the control tree node the func_t object containing
121 // the gemm micro-kernel function addresses, and then query the
122 // function address corresponding to the current datatype.
123 gemm_ukrs = cntl_gemm_ukrs( cntl );
124 gemm_ukr = bli_func_obj_query( dt_exec, gemm_ukrs );
126 // Invoke the function.
127 f( diagoffb,
128 schema_a,
129 schema_b,
130 m,
131 n,
132 k,
133 buf_alpha,
134 buf_a, cs_a, pd_a, ps_a,
135 buf_b, rs_b, pd_b, ps_b,
136 buf_beta,
137 buf_c, rs_c, cs_c,
138 gemm_ukr,
139 thread );
140 }
142 #ifdef BLIS_ENABLE_C66X_MEM_POOLS
#if defined (BLIS_ENABLE_C66X_EDMA) && defined (BLIS_ENABLE_C66X_IDMA)

/*
 * GENTFUNC template for the datatype-specific trmm_rl_ker_var2 macro-kernels,
 * variant that stages its operands through pool buffers using lib_emt_*
 * (EDMA) and lib_imt_* (IDMA) transfers.
 *
 * The kernel walks the packed panel of B in NR-column micro-panels (outer
 * loop, index j) and the packed block of A in MR-row micro-panels (inner
 * loop, index i) and calls the gemm micro-kernel on each MR x NR tile of C.
 * Micro-panels of B that intersect the diagonal use the shortened inner
 * dimension k_b1121 and scale C by beta; micro-panels strictly below the
 * diagonal use the full k and accumulate into C (beta = 1).
 *
 * Parameters
 *   diagoffb        diagonal offset of the panel of B
 *   schema_a/b      pack schemas of A and B (stored in the auxinfo_t and used
 *                   to detect 3m/4m/rih packing)
 *   m, n, k         C is m x n, inner dimension is k
 *   alpha, beta     scalars handed to the micro-kernel
 *   a, cs_a, pd_a, ps_a   packed A: cs_a is PACKMR, pd_a is MR, ps_a is the
 *                   stride to the next micro-panel
 *   b, rs_b, pd_b, ps_b   packed B: rs_b is PACKNR, pd_b is NR, ps_b is the
 *                   stride to the next micro-panel
 *   c, rs_c, cs_c   output matrix; rs_c is not referenced here because C is
 *                   copied column by column (m contiguous elements each)
 *   gemm_ukr        untyped address of the gemm micro-kernel
 *   jr_thread       thread info for the j loop; the i loop uses its sub-node
 *
 * Data movement
 *   - B: the current micro-panel is copied into b1_L1 with lib_emt_copy1D1D
 *     on emt_handle_b.
 *   - A: micro-panels are copied with lib_imt_copy into a2_L1 while the
 *     micro-kernel reads a1_L1; the two pointers are swapped every iteration.
 *   - C: an m x n_cur column panel is computed in the staging buffer cNew1
 *     while the next panel is fetched into cNew0 (emt_handle_c0) and the
 *     previous one is written back from cNew2 (emt_handle_c1). When the byte
 *     column stride of C is not below BLIS_C66X_MAXDMASTRIDE the copies are
 *     done with a memcpy loop instead of lib_emt_copy2D2D.
 */
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname, ukrtype ) \
\
void PASTEMAC(ch,varname)( \
                           doff_t  diagoffb, \
                           pack_t  schema_a, \
                           pack_t  schema_b, \
                           dim_t   m, \
                           dim_t   n, \
                           dim_t   k, \
                           void*   alpha, \
                           void*   a, inc_t cs_a, inc_t pd_a, inc_t ps_a, \
                           void*   b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \
                           void*   beta, \
                           void*   c, inc_t rs_c, inc_t cs_c, \
                           void*   gemm_ukr, \
                           trmm_thrinfo_t* jr_thread \
                         ) \
{ \
    /* Cast the micro-kernel address to its function pointer type. */ \
    PASTECH(ch,ukrtype) gemm_ukr_cast = (PASTECH(ch,ukrtype)) gemm_ukr; \
    \
    /* Temporary C buffer for edge cases. */ \
    ctype ct[ PASTEMAC(ch,maxmr) * \
              PASTEMAC(ch,maxnr) ] \
          __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
    const inc_t rs_ct = 1; \
    const inc_t cs_ct = PASTEMAC(ch,maxmr); \
    \
    /* Alias some constants to simpler names. */ \
    const dim_t MR     = pd_a; \
    const dim_t NR     = pd_b; \
    const dim_t PACKMR = cs_a; \
    const dim_t PACKNR = rs_b; \
    \
    ctype* restrict one        = PASTEMAC(ch,1); \
    ctype* restrict zero       = PASTEMAC(ch,0); \
    ctype* restrict a_cast     = a; \
    ctype* restrict b_cast     = b; \
    ctype* restrict c_cast     = c; \
    ctype* restrict alpha_cast = alpha; \
    ctype* restrict beta_cast  = beta; \
    ctype* restrict b1; \
    ctype* restrict c1; \
    \
    doff_t diagoffb_j; \
    dim_t  k_full; \
    dim_t  m_iter, m_left; \
    dim_t  n_iter, n_left; \
    dim_t  m_cur; \
    dim_t  n_cur; \
    dim_t  k_b1121; \
    dim_t  off_b1121; \
    dim_t  i, j; \
    inc_t  rstep_a; \
    inc_t  cstep_b; \
    inc_t  cstep_c; \
    inc_t  istep_a; \
    inc_t  istep_b; \
    inc_t  off_scl; \
    inc_t  ss_b_num; \
    inc_t  ss_b_den; \
    inc_t  ps_b_cur; \
    auxinfo_t aux; \
    \
    trmm_thrinfo_t* ir_thread = trmm_thread_sub_trmm( jr_thread ); \
    /* NOTE(review): jr_num_threads and jr_thread_id are passed to \
       bli_is_last_iter() below although their declarations are disabled; \
       this only builds if bli_is_last_iter() is a macro that discards \
       those arguments -- confirm. \
    dim_t jr_num_threads = thread_n_way( jr_thread ); \
    dim_t jr_thread_id   = thread_work_id( jr_thread ); */ \
    \
    dim_t n_next; \
    inc_t rstep_c11, rs_c11, cs_c11; \
    \
    /* Staging buffer for the current micro-panel of B. It is deliberately \
       not restrict-qualified: per the original author, memcpy misbehaves \
       and gemm produces a non-zero residual if it is. */ \
    mem_t  b1_L1_mem; \
    ctype* b1_L1; \
    \
    /* Two staging buffers for micro-panels of A (compute / fill). */ \
    mem_t  a1_L1_mem, a2_L1_mem; \
    ctype *a1_L1, *a2_L1, *temp; \
    \
    /* Three staging buffers for column panels of C \
       (fetch / compute / write-back). */ \
    mem_t  c0_L2_mem, c1_L2_mem, c2_L2_mem; \
    ctype *cNew0, *cNew1, *cNew2, *cNewTemp; \
    \
    /* EDMA handles: B fetch, C fetch, C write-back. */ \
    lib_emt_Handle emt_handle_b  = NULL; \
    lib_emt_Handle emt_handle_c0 = NULL; \
    lib_emt_Handle emt_handle_c1 = NULL; \
    \
    /* Cycle counters, only used when BLIS_ENABLE_PROFILE_KERVAR2 == 1. */ \
    uint64_t counter_start_ker, counter_start_nr, counter_start_mr; \
    uint64_t counter_end_ker, counter_end_nr, counter_end_mr; \
    extern profile_data_t *bli_trmm_profile_data; \
    \
    /* \
       Assumptions/assertions: \
         rs_a == 1 \
         cs_a == PACKMR \
         pd_a == MR \
         ps_a == stride to next micro-panel of A \
         rs_b == PACKNR \
         cs_b == 1 \
         pd_b == NR \
         ps_b == stride to next micro-panel of B \
         rs_c == (no assumptions) \
         cs_c == (no assumptions) \
    */ \
    \
    /* If any dimension is zero, return immediately. */ \
    if ( bli_zero_dim3( m, n, k ) ) return; \
    \
    /* Safeguard: If the current panel of B is entirely above the diagonal, \
       it is implicitly zero. So we do nothing. */ \
    if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; \
    \
    /* Compute k_full. For all trmm, k_full is simply k. This is \
       needed because some parameter combinations of trmm reduce k \
       to advance past zero regions in the triangular matrix, and \
       when computing the imaginary stride of A (the non-triangular \
       matrix), which is used by 3m and 4m implementations, we need \
       this unreduced value of k. */ \
    k_full = k; \
    \
    /* Compute indexing scaling factor for 4m or 3m. This is \
       needed because one of the packing register blocksizes (PACKMR \
       or PACKNR) is used to index into the micro-panels of the non- \
       triangular matrix when computing with a diagonal-intersecting \
       micro-panel of the triangular matrix. In the case of 4m or 3m, \
       real values are stored in both sub-panels, and so the indexing \
       needs to occur in units of real values. The value computed \
       here is divided into the complex pointer offset to cause the \
       pointer to be advanced by the correct value. */ \
    if ( bli_is_4m_packed( schema_b ) || \
         bli_is_3m_packed( schema_b ) || \
         bli_is_rih_packed( schema_b ) ) off_scl = 2; \
    else                                 off_scl = 1; \
    \
    /* Compute the storage stride. Usually this is just PACKMR (for A) \
       or PACKNR (for B). However, in the case of 3m, we need to scale \
       the offset by 3/2. Since it's possible we may need to scale \
       the packing dimension by a non-integer value, we break up the \
       scaling factor into numerator and denominator. */ \
    if ( bli_is_3m_packed( schema_b ) ) { ss_b_num = 3*PACKNR; \
                                          ss_b_den = 2; } \
    else                                { ss_b_num = 1*PACKNR; \
                                          ss_b_den = 1; } \
    \
    /* If there is a zero region above where the diagonal of B intersects \
       the left edge of the panel, adjust the pointer to A and treat this \
       case as if the diagonal offset were zero. Note that we don't need to \
       adjust the pointer to B since packm would have simply skipped over \
       the region that was not stored. */ \
    if ( diagoffb < 0 ) \
    { \
        j        = -diagoffb; \
        k        = k - j; \
        diagoffb = 0; \
        a_cast   = a_cast + ( j * PACKMR ) / off_scl; \
    } \
    \
    /* If there is a zero region to the right of where the diagonal \
       of B intersects the bottom of the panel, shrink it to prevent \
       "no-op" iterations from executing. */ \
    if ( diagoffb + k < n ) \
    { \
        n = diagoffb + k; \
    } \
    \
    /* Clear the temporary C buffer in case it has any infs or NaNs. */ \
    PASTEMAC(ch,set0s_mxn)( MR, NR, \
                            ct, rs_ct, cs_ct ); \
    \
    /* Compute number of primary and leftover components of the m and n \
       dimensions. */ \
    n_iter = n / NR; \
    n_left = n % NR; \
    \
    m_iter = m / MR; \
    m_left = m % MR; \
    \
    if ( n_left ) ++n_iter; \
    if ( m_left ) ++m_iter; \
    \
    /* Determine some increments used to step through A, B, and C. */ \
    rstep_a = ps_a; \
    \
    cstep_b = ps_b; \
    \
    cstep_c = cs_c * NR; \
    \
    /* Layout of a staged m x NR panel of C: unit row stride, MR rows per \
       tile, and a column stride of m rounded up to an even number. */ \
    rstep_c11 = MR; \
    rs_c11    = 1; \
    cs_c11    = ( m % 2 == 0 ) ? m : m + 1; \
    \
    istep_a = PACKMR * k_full; \
    istep_b = PACKNR * k; \
    \
    /* Save the pack schemas of A and B to the auxinfo_t object. */ \
    bli_auxinfo_set_schema_a( schema_a, aux ); \
    bli_auxinfo_set_schema_b( schema_b, aux ); \
    \
    /* Save the imaginary stride of A to the auxinfo_t object. */ \
    bli_auxinfo_set_is_a( istep_a, aux ); \
    \
    b1 = b_cast; \
    c1 = c_cast; \
    \
    /* Acquire the staging buffer for B. The working pointer is offset \
       from the start of the pool block by PASTEMAC(ch,bank) bytes \
       (presumably to avoid memory bank conflicts -- TODO confirm). */ \
    bli_mem_acquire_m( k*NR*sizeof(ctype), BLIS_BUFFER_FOR_B_PANEL_L1, &b1_L1_mem ); \
    b1_L1 = ( ctype* )( ( char* )b1_L1_mem.buf + PASTEMAC(ch,bank) ); \
    \
    /* Acquire the two staging buffers for A. */ \
    bli_mem_acquire_m( k*MR*sizeof(ctype), BLIS_BUFFER_FOR_A_BLOCK_L1, &a1_L1_mem ); \
    a1_L1 = bli_mem_buffer( &a1_L1_mem ); \
    \
    bli_mem_acquire_m( k*MR*sizeof(ctype), BLIS_BUFFER_FOR_A_BLOCK_L1, &a2_L1_mem ); \
    a2_L1 = bli_mem_buffer( &a2_L1_mem ); \
    \
    /* Acquire the three staging buffers for C (each cs_c11 x NR). */ \
    bli_mem_acquire_m( cs_c11*NR*sizeof(ctype), BLIS_BUFFER_FOR_C_PANEL_L2, &c0_L2_mem ); \
    cNew0 = bli_mem_buffer( &c0_L2_mem ); \
    \
    bli_mem_acquire_m( cs_c11*NR*sizeof(ctype), BLIS_BUFFER_FOR_C_PANEL_L2, &c1_L2_mem ); \
    cNew1 = bli_mem_buffer( &c1_L2_mem ); \
    \
    bli_mem_acquire_m( cs_c11*NR*sizeof(ctype), BLIS_BUFFER_FOR_C_PANEL_L2, &c2_L2_mem ); \
    cNew2 = bli_mem_buffer( &c2_L2_mem ); \
    \
    /* Acquire the three EDMA handles from the pool. \
       NOTE(review): a failed acquire is only reported; the NULL handle \
       is still passed to lib_emt_* below. */ \
    bli_dma_channel_acquire( &(emt_handle_b), lib_get_coreID() ); \
    if ( emt_handle_b == NULL ) \
    { \
        printf("ker_var2 Failed to alloc edma handle CoreID %d \n", lib_get_coreID()); \
    } \
    bli_dma_channel_acquire( &(emt_handle_c0), lib_get_coreID() ); \
    if ( emt_handle_c0 == NULL ) \
    { \
        printf("ker_var2 Failed to alloc edma handle for C0 CoreID %d \n", lib_get_coreID()); \
    } \
    bli_dma_channel_acquire( &(emt_handle_c1), lib_get_coreID() ); \
    if ( emt_handle_c1 == NULL ) \
    { \
        printf("ker_var2 Failed to alloc edma handle for C1 CoreID %d \n", lib_get_coreID()); \
    } \
    \
    n_cur = ( bli_is_not_edge_f( 0, n_iter, n_left ) ? NR : n_left ); \
    \
    if ( BLIS_ENABLE_PROFILE_KERVAR2 == 1 ) \
    { \
        counter_start_nr = lib_clock_read(); \
    } \
    \
    /* Fetch the first m x n_cur panel of C into cNew1. */ \
    if ( cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE ) \
    { \
        lib_emt_copy2D2D( emt_handle_c0, c1, \
                          cNew1, m*sizeof(ctype), \
                          n_cur, cs_c*sizeof(ctype), cs_c11*sizeof(ctype) ); \
    } \
    else \
    { \
        /* Column stride too large for the 2D transfer: copy one column \
           at a time. */ \
        dim_t  ii; \
        ctype *ptr_source; \
        ctype *ptr_dest; \
        ptr_source = c1; \
        ptr_dest   = cNew1; \
        for ( ii = 0; ii < n_cur; ii++ ) \
        { \
            memcpy( ptr_dest, ptr_source, m*sizeof(ctype) ); \
            ptr_source += cs_c; \
            ptr_dest   += cs_c11; \
        } \
    } \
    \
    /* Loop over the n dimension (NR columns at a time). */ \
    for ( j = 0; j < n_iter; ++j ) \
    { \
        ctype* restrict a1; \
        ctype* restrict c11; \
        ctype* restrict b2; \
        \
        diagoffb_j = diagoffb - ( doff_t )j*NR; \
        \
        /* Determine the offset to the beginning of the panel that \
           was packed so we can index into the corresponding location \
           in A. Then compute the length of that panel. */ \
        off_b1121 = bli_max( -diagoffb_j, 0 ); \
        k_b1121   = k - off_b1121; \
        \
        a1  = a_cast; \
        c11 = cNew1; \
        \
        n_cur  = ( bli_is_not_edge_f( j,   n_iter, n_left ) ? NR : n_left ); \
        n_next = ( bli_is_not_edge_f( j+1, n_iter, n_left ) ? NR : n_left ); \
        \
        /* Initialize our next panel of B to be the current panel of B. */ \
        b2 = b1; \
        \
        /* Start fetching the current micro-panel of B. \
           NOTE(review): the matching lib_emt_wait(emt_handle_b) is only \
           reached on an i == 0 iteration owned by this thread -- confirm \
           that every thread owns i == 0 of every j it is issued for. */ \
        lib_emt_copy1D1D( emt_handle_b, b1, b1_L1, k_b1121*NR*sizeof(ctype) ); \
        \
        /* The current panel of C must have arrived before it is used; \
           then start fetching the next one (none after the last panel). */ \
        lib_emt_wait( emt_handle_c0 ); \
        if ( j < n_iter-1 ) \
        { \
            if ( cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE ) \
            { \
                lib_emt_copy2D2D( emt_handle_c0, c1+cstep_c, \
                                  cNew0, m*sizeof(ctype), \
                                  n_next, cs_c*sizeof(ctype), \
                                  cs_c11*sizeof(ctype) ); \
            } \
            else \
            { \
                dim_t  ii; \
                ctype *ptr_source; \
                ctype *ptr_dest; \
                ptr_source = c1+cstep_c; \
                ptr_dest   = cNew0; \
                for ( ii = 0; ii < n_next; ii++ ) \
                { \
                    memcpy( ptr_dest, ptr_source, m*sizeof(ctype) ); \
                    ptr_source += cs_c; \
                    ptr_dest   += cs_c11; \
                } \
            } \
        } \
        \
        /* If the current panel of B intersects the diagonal, scale C \
           by beta. If it is strictly below the diagonal, scale by one. \
           This allows the current macro-kernel to work for both trmm \
           and trmm3. */ \
        if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \
        { \
            /* Compute the panel stride for the current diagonal- \
               intersecting micro-panel. */ \
            ps_b_cur = ( k_b1121 * ss_b_num ) / ss_b_den; \
            \
            if ( trmm_r_jr_my_iter( j, jr_thread ) ) { \
            \
            /* Save the 4m/3m imaginary stride of B to the auxinfo_t \
               object. */ \
            bli_auxinfo_set_is_b( PACKNR * k_b1121, aux ); \
            \
            if ( BLIS_ENABLE_PROFILE_KERVAR2 == 1 ) \
            { \
                counter_start_mr = lib_clock_read(); \
            } \
            /* Loop over the m dimension (MR rows at a time). */ \
            for ( i = 0; i < m_iter; ++i ) \
            { \
                if ( trmm_r_ir_my_iter( i, ir_thread ) ) { \
                \
                ctype* restrict a1_i; \
                ctype* restrict a2; \
                \
                m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
                \
                /* Prime the pipeline: fetch the first micro-panel of A, \
                   skipping the rows that face the zero region of B. */ \
                if ( i == 0 ) \
                { \
                    lib_imt_copy( a1 + ( off_b1121 * PACKMR ) / off_scl, a2_L1, k_b1121*MR*sizeof(ctype) ); \
                } \
                \
                /* Compute the addresses of the next panels of A and B. */ \
                a2 = a1 + rstep_a; \
                \
                /* Wait for the pending A transfer, then swap so that \
                   a1_L1 is the panel just received. */ \
                lib_imt_wait(); \
                temp  = a1_L1; \
                a1_L1 = a2_L1; \
                a2_L1 = temp; \
                if ( i == 0 ) \
                { \
                    lib_emt_wait( emt_handle_b ); \
                } \
                /* The diagonal offset was already applied to the source \
                   address when the panel was copied, so the staged copy \
                   is used from its start. */ \
                a1_i = a1_L1; \
                \
                if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
                { \
                    a2 = a_cast; \
                    b2 = b1; \
                    if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
                        b2 = b_cast; \
                } \
                else \
                { \
                    /* Start fetching the next micro-panel of A. */ \
                    lib_imt_copy( a2 + ( off_b1121 * PACKMR ) / off_scl, a2_L1, k_b1121*MR*sizeof(ctype) ); \
                } \
                \
                /* Save addresses of next panels of A and B to the auxinfo_t \
                   object. */ \
                bli_auxinfo_set_next_a( a2, aux ); \
                bli_auxinfo_set_next_b( b2, aux ); \
                \
                if ( BLIS_ENABLE_PROFILE_KERVAR2 == 1 ) \
                { \
                    counter_start_ker = lib_clock_read(); \
                } \
                /* Handle interior and edge cases separately. */ \
                if ( m_cur == MR && n_cur == NR ) \
                { \
                    /* Invoke the gemm micro-kernel. */ \
                    gemm_ukr_cast( k_b1121, \
                                   alpha_cast, \
                                   a1_i, \
                                   b1_L1, \
                                   beta_cast, \
                                   c11, rs_c11, cs_c11, \
                                   &aux ); \
                } \
                else \
                { \
                    /* Copy edge elements of C to the temporary buffer. */ \
                    PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
                                            c11, rs_c11, cs_c11, \
                                            ct,  rs_ct,  cs_ct ); \
                    \
                    /* Invoke the gemm micro-kernel. */ \
                    gemm_ukr_cast( k_b1121, \
                                   alpha_cast, \
                                   a1_i, \
                                   b1_L1, \
                                   beta_cast, \
                                   ct, rs_ct, cs_ct, \
                                   &aux ); \
                    \
                    /* Copy the result to the edge of C. */ \
                    PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
                                            ct,  rs_ct,  cs_ct, \
                                            c11, rs_c11, cs_c11 ); \
                } \
                if ( BLIS_ENABLE_PROFILE_KERVAR2 == 1 ) \
                { \
                    counter_end_ker = lib_clock_read(); \
                    bli_profile_data_update(bli_trmm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_KER_LOOP_IND], \
                                            (counter_end_ker-counter_start_ker), 2*k_b1121*m_cur*n_cur); \
                } \
                } \
                \
                a1  += rstep_a; \
                c11 += rstep_c11; \
            } \
            if ( BLIS_ENABLE_PROFILE_KERVAR2 == 1 ) \
            { \
                counter_end_mr = lib_clock_read(); \
                bli_profile_data_update(bli_trmm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_IR_LOOP_IND], \
                                        (counter_end_mr-counter_start_mr), 2*k_b1121*m*n_cur); \
            } \
            } \
            \
            /* Diagonal-intersecting micro-panels of B are shorter, so they \
               are stepped over with their own stride. */ \
            b1 += ps_b_cur; \
        } \
        else if ( bli_is_strictly_below_diag_n( diagoffb_j, k, NR ) ) \
        { \
            if ( trmm_r_jr_my_iter( j, jr_thread ) ) { \
            \
            /* Save the 4m/3m imaginary stride of B to the auxinfo_t \
               object. */ \
            bli_auxinfo_set_is_b( istep_b, aux ); \
            \
            if ( BLIS_ENABLE_PROFILE_KERVAR2 == 1 ) \
            { \
                counter_start_mr = lib_clock_read(); \
            } \
            /* Loop over the m dimension (MR rows at a time). */ \
            for ( i = 0; i < m_iter; ++i ) \
            { \
                if ( trmm_r_ir_my_iter( i, ir_thread ) ) { \
                \
                ctype* restrict a2; \
                \
                m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
                \
                /* Prime the pipeline with the first micro-panel of A. */ \
                if ( i == 0 ) \
                { \
                    lib_imt_copy( a1, a2_L1, k_b1121*MR*sizeof(ctype) ); \
                } \
                \
                /* Compute the addresses of the next panels of A and B. */ \
                a2 = a1 + rstep_a; \
                \
                /* Wait for the pending A transfer, then swap so that \
                   a1_L1 is the panel just received. */ \
                lib_imt_wait(); \
                temp  = a1_L1; \
                a1_L1 = a2_L1; \
                a2_L1 = temp; \
                if ( i == 0 ) \
                { \
                    lib_emt_wait( emt_handle_b ); \
                } \
                if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
                { \
                    a2 = a_cast; \
                    b2 = b1; \
                    if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
                        b2 = b_cast; \
                } \
                else \
                { \
                    /* Start fetching the next micro-panel of A. */ \
                    lib_imt_copy( a2, a2_L1, k_b1121*MR*sizeof(ctype) ); \
                } \
                \
                /* Save addresses of next panels of A and B to the auxinfo_t \
                   object. */ \
                bli_auxinfo_set_next_a( a2, aux ); \
                bli_auxinfo_set_next_b( b2, aux ); \
                \
                if ( BLIS_ENABLE_PROFILE_KERVAR2 == 1 ) \
                { \
                    counter_start_ker = lib_clock_read(); \
                } \
                /* Handle interior and edge cases separately. */ \
                if ( m_cur == MR && n_cur == NR ) \
                { \
                    /* Invoke the gemm micro-kernel. */ \
                    gemm_ukr_cast( k, \
                                   alpha_cast, \
                                   a1_L1, \
                                   b1_L1, \
                                   one, \
                                   c11, rs_c11, cs_c11, \
                                   &aux ); \
                } \
                else \
                { \
                    /* Invoke the gemm micro-kernel. */ \
                    gemm_ukr_cast( k, \
                                   alpha_cast, \
                                   a1_L1, \
                                   b1_L1, \
                                   zero, \
                                   ct, rs_ct, cs_ct, \
                                   &aux ); \
                    \
                    /* Add the result to the edge of C. */ \
                    PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
                                           ct,  rs_ct,  cs_ct, \
                                           c11, rs_c11, cs_c11 ); \
                } \
                if ( BLIS_ENABLE_PROFILE_KERVAR2 == 1 ) \
                { \
                    counter_end_ker = lib_clock_read(); \
                    bli_profile_data_update(bli_trmm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_KER_LOOP_IND], \
                                            (counter_end_ker-counter_start_ker), 2*k*m_cur*n_cur); \
                } \
                } \
                \
                a1  += rstep_a; \
                c11 += rstep_c11; \
            } \
            if ( BLIS_ENABLE_PROFILE_KERVAR2 == 1 ) \
            { \
                counter_end_mr = lib_clock_read(); \
                bli_profile_data_update(bli_trmm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_IR_LOOP_IND], \
                                        (counter_end_mr-counter_start_mr), 2*k*m*n_cur); \
            } \
            } \
            \
            b1 += cstep_b; \
        } \
        \
        /* Rotate the C staging buffers: the panel being fetched becomes \
           the next one to compute, the panel just computed becomes the \
           one to write back, and the old write-back buffer becomes the \
           next fetch target. */ \
        cNewTemp = cNew0; \
        cNew0    = cNew2; \
        cNew2    = cNew1; \
        cNew1    = cNewTemp; \
        \
        /* Wait for the previous write-back to complete (there is none on \
           the first iteration). */ \
        if ( j != 0 ) \
        { \
            lib_emt_wait( emt_handle_c1 ); \
        } \
        \
        /* Write the updated panel back to C. */ \
        if ( cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE ) \
        { \
            lib_emt_copy2D2D( emt_handle_c1, cNew2, c1, m*sizeof(ctype), \
                              n_cur, cs_c11*sizeof(ctype), cs_c*sizeof(ctype) ); \
        } \
        else \
        { \
            dim_t  ii; \
            ctype *ptr_source; \
            ctype *ptr_dest; \
            ptr_source = cNew2; \
            ptr_dest   = c1; \
            for ( ii = 0; ii < n_cur; ii++ ) \
            { \
                memcpy( ptr_dest, ptr_source, m*sizeof(ctype) ); \
                ptr_source += cs_c11; \
                ptr_dest   += cs_c; \
            } \
        } \
        c1 += cstep_c; \
    } \
    if ( BLIS_ENABLE_PROFILE_KERVAR2 == 1 ) \
    { \
        counter_end_nr = lib_clock_read(); \
        bli_profile_data_update(bli_trmm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_JR_LOOP_IND], \
                                (counter_end_nr-counter_start_nr), 2*k*m*n); \
    } \
    \
    /* Wait for the last write-back of C before any staging buffer is \
       handed back to the pool, so the transfer never reads from a buffer \
       that has already been released. */ \
    if ( emt_handle_c1 != NULL ) \
    { \
        lib_emt_wait( emt_handle_c1 ); \
    } \
    \
    bli_mem_release( &c2_L2_mem ); \
    bli_mem_release( &c1_L2_mem ); \
    bli_mem_release( &c0_L2_mem ); \
    bli_mem_release( &a2_L1_mem ); \
    bli_mem_release( &a1_L1_mem ); \
    bli_mem_release( &b1_L1_mem ); \
    \
    /* Return the EDMA handles that were actually acquired. */ \
    if ( emt_handle_b != NULL ) \
    { \
        bli_dma_channel_release( emt_handle_b, lib_get_coreID() ); \
        emt_handle_b = NULL; \
    } \
    if ( emt_handle_c0 != NULL ) \
    { \
        bli_dma_channel_release( emt_handle_c0, lib_get_coreID() ); \
        emt_handle_c0 = NULL; \
    } \
    if ( emt_handle_c1 != NULL ) \
    { \
        bli_dma_channel_release( emt_handle_c1, lib_get_coreID() ); \
        emt_handle_c1 = NULL; \
    } \
}

INSERT_GENTFUNC_BASIC( trmm_rl_ker_var2, gemm_ukr_t )

#else
/* NOTE(review): with BLIS_ENABLE_C66X_MEM_POOLS defined but without both
   BLIS_ENABLE_C66X_EDMA and BLIS_ENABLE_C66X_IDMA, this file defines no
   typed trmm_rl_ker_var2 kernels, although the ftypes table still names
   them -- confirm that this configuration is never built. */
#endif
777 #else
/*
 * GENTFUNC template for the datatype-specific trmm_rl_ker_var2 macro-kernels,
 * plain variant compiled when BLIS_ENABLE_C66X_MEM_POOLS is not defined.
 *
 * The kernel walks the packed panel of B in NR-column micro-panels (outer
 * loop, index j) and the packed block of A in MR-row micro-panels (inner
 * loop, index i) and calls the gemm micro-kernel on each MR x NR tile of C,
 * reading A and B and updating C directly in place.
 * Micro-panels of B that intersect the diagonal use the shortened inner
 * dimension k_b1121 and scale C by beta; micro-panels strictly below the
 * diagonal use the full k and accumulate into C (beta = 1).
 * Partial tiles at the bottom/right edge are computed in the stack buffer
 * ct and then copied or added into C.
 *
 * Parameters
 *   diagoffb        diagonal offset of the panel of B
 *   schema_a/b      pack schemas of A and B
 *   m, n, k         C is m x n, inner dimension is k
 *   alpha, beta     scalars handed to the micro-kernel
 *   a, cs_a, pd_a, ps_a   packed A: cs_a is PACKMR, pd_a is MR, ps_a is the
 *                   stride to the next micro-panel
 *   b, rs_b, pd_b, ps_b   packed B: rs_b is PACKNR, pd_b is NR, ps_b is the
 *                   stride to the next micro-panel
 *   c, rs_c, cs_c   output matrix and its strides
 *   gemm_ukr        untyped address of the gemm micro-kernel
 *   jr_thread       thread info for the j loop; the i loop uses its sub-node
 */
778 #undef GENTFUNC
779 #define GENTFUNC( ctype, ch, varname, ukrtype ) \
780 \
781 void PASTEMAC(ch,varname)( \
782 doff_t diagoffb, \
783 pack_t schema_a, \
784 pack_t schema_b, \
785 dim_t m, \
786 dim_t n, \
787 dim_t k, \
788 void* alpha, \
789 void* a, inc_t cs_a, inc_t pd_a, inc_t ps_a, \
790 void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \
791 void* beta, \
792 void* c, inc_t rs_c, inc_t cs_c, \
793 void* gemm_ukr, \
794 trmm_thrinfo_t* jr_thread \
795 ) \
796 { \
797 /* Cast the micro-kernel address to its function pointer type. */ \
798 PASTECH(ch,ukrtype) gemm_ukr_cast = gemm_ukr; \
799 \
800 /* Temporary C buffer for edge cases. */ \
801 ctype ct[ PASTEMAC(ch,maxmr) * \
802 PASTEMAC(ch,maxnr) ] \
803 __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
804 const inc_t rs_ct = 1; \
805 const inc_t cs_ct = PASTEMAC(ch,maxmr); \
806 \
807 /* Alias some constants to simpler names. */ \
808 const dim_t MR = pd_a; \
809 const dim_t NR = pd_b; \
810 const dim_t PACKMR = cs_a; \
811 const dim_t PACKNR = rs_b; \
812 \
813 ctype* restrict one = PASTEMAC(ch,1); \
814 ctype* restrict zero = PASTEMAC(ch,0); \
815 ctype* restrict a_cast = a; \
816 ctype* restrict b_cast = b; \
817 ctype* restrict c_cast = c; \
818 ctype* restrict alpha_cast = alpha; \
819 ctype* restrict beta_cast = beta; \
820 ctype* restrict b1; \
821 ctype* restrict c1; \
822 \
823 doff_t diagoffb_j; \
824 dim_t k_full; \
825 dim_t m_iter, m_left; \
826 dim_t n_iter, n_left; \
827 dim_t m_cur; \
828 dim_t n_cur; \
829 dim_t k_b1121; \
830 dim_t off_b1121; \
831 dim_t i, j; \
832 inc_t rstep_a; \
833 inc_t cstep_b; \
834 inc_t rstep_c, cstep_c; \
835 inc_t istep_a; \
836 inc_t istep_b; \
837 inc_t off_scl; \
838 inc_t ss_b_num; \
839 inc_t ss_b_den; \
840 inc_t ps_b_cur; \
841 auxinfo_t aux; \
842 \
/* Thread info for the inner (i) loop is the sub-node of the j-loop info. */ \
843 trmm_thrinfo_t* ir_thread = trmm_thread_sub_trmm( jr_thread ); \
/* NOTE(review): the two declarations below are disabled, yet jr_thread_id
   and jr_num_threads are still passed to bli_is_last_iter() further down;
   this only builds if bli_is_last_iter() is a macro that discards those
   arguments -- confirm. */ \
844 /*dim_t jr_num_threads = thread_n_way( jr_thread ); \
845 dim_t jr_thread_id = thread_work_id( jr_thread );*/ \
846 \
847 /*
848 Assumptions/assertions:
849 rs_a == 1
850 cs_a == PACKMR
851 pd_a == MR
852 ps_a == stride to next micro-panel of A
853 rs_b == PACKNR
854 cs_b == 1
855 pd_b == NR
856 ps_b == stride to next micro-panel of B
857 rs_c == (no assumptions)
858 cs_c == (no assumptions)
859 */ \
860 \
861 /* If any dimension is zero, return immediately. */ \
862 if ( bli_zero_dim3( m, n, k ) ) return; \
863 \
864 /* Safeguard: If the current panel of B is entirely above the diagonal,
865 it is implicitly zero. So we do nothing. */ \
866 if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; \
867 \
868 /* Compute k_full. For all trmm, k_full is simply k. This is
869 needed because some parameter combinations of trmm reduce k
870 to advance past zero regions in the triangular matrix, and
871 when computing the imaginary stride of A (the non-triangular
872 matrix), which is used by 3m and 4m implementations, we need
873 this unreduced value of k. */ \
874 k_full = k; \
875 \
876 /* Compute indexing scaling factor for 4m or 3m. This is
877 needed because one of the packing register blocksizes (PACKMR
878 or PACKNR) is used to index into the micro-panels of the non-
879 triangular matrix when computing with a diagonal-intersecting
880 micro-panel of the triangular matrix. In the case of 4m or 3m,
881 real values are stored in both sub-panels, and so the indexing
882 needs to occur in units of real values. The value computed
883 here is divided into the complex pointer offset to cause the
884 pointer to be advanced by the correct value. */ \
885 if ( bli_is_4m_packed( schema_b ) || \
886 bli_is_3m_packed( schema_b ) || \
887 bli_is_rih_packed( schema_b ) ) off_scl = 2; \
888 else off_scl = 1; \
889 \
890 /* Compute the storage stride. Usually this is just PACKMR (for A)
891 or PACKNR (for B). However, in the case of 3m, we need to scale
892 the offset by 3/2. Since it's possible we may need to scale
893 the packing dimension by a non-integer value, we break up the
894 scaling factor into numerator and denominator. */ \
895 if ( bli_is_3m_packed( schema_b ) ) { ss_b_num = 3*PACKNR; \
896 ss_b_den = 2; } \
897 else { ss_b_num = 1*PACKNR; \
898 ss_b_den = 1; } \
899 \
900 /* If there is a zero region above where the diagonal of B intersects
901 the left edge of the panel, adjust the pointer to A and treat this
902 case as if the diagonal offset were zero. Note that we don't need to
903 adjust the pointer to B since packm would have simply skipped over
904 the region that was not stored. */ \
905 if ( diagoffb < 0 ) \
906 { \
907 j = -diagoffb; \
908 k = k - j; \
909 diagoffb = 0; \
910 a_cast = a_cast + ( j * PACKMR ) / off_scl; \
911 } \
912 \
913 /* If there is a zero region to the right of where the diagonal
914 of B intersects the bottom of the panel, shrink it to prevent
915 "no-op" iterations from executing. */ \
916 if ( diagoffb + k < n ) \
917 { \
918 n = diagoffb + k; \
919 } \
920 \
921 /* Clear the temporary C buffer in case it has any infs or NaNs. */ \
922 PASTEMAC(ch,set0s_mxn)( MR, NR, \
923 ct, rs_ct, cs_ct ); \
924 \
925 /* Compute number of primary and leftover components of the m and n
926 dimensions. */ \
927 n_iter = n / NR; \
928 n_left = n % NR; \
929 \
930 m_iter = m / MR; \
931 m_left = m % MR; \
932 \
/* A partial micro-panel at the edge counts as one more iteration. */ \
933 if ( n_left ) ++n_iter; \
934 if ( m_left ) ++m_iter; \
935 \
936 /* Determine some increments used to step through A, B, and C. */ \
937 rstep_a = ps_a; \
938 \
939 cstep_b = ps_b; \
940 \
941 rstep_c = rs_c * MR; \
942 cstep_c = cs_c * NR; \
943 \
944 istep_a = PACKMR * k_full; \
945 istep_b = PACKNR * k; \
946 \
947 /* Save the pack schemas of A and B to the auxinfo_t object. */ \
948 bli_auxinfo_set_schema_a( schema_a, aux ); \
949 bli_auxinfo_set_schema_b( schema_b, aux ); \
950 \
951 /* Save the imaginary stride of A to the auxinfo_t object. */ \
952 bli_auxinfo_set_is_a( istep_a, aux ); \
953 \
954 b1 = b_cast; \
955 c1 = c_cast; \
956 \
957 /* Loop over the n dimension (NR columns at a time). */ \
958 for ( j = 0; j < n_iter; ++j ) \
959 { \
960 ctype* restrict a1; \
961 ctype* restrict c11; \
962 ctype* restrict b2; \
963 \
/* Diagonal offset of the j-th micro-panel of B. */ \
964 diagoffb_j = diagoffb - ( doff_t )j*NR; \
965 \
966 /* Determine the offset to the beginning of the panel that
967 was packed so we can index into the corresponding location
968 in A. Then compute the length of that panel. */ \
969 off_b1121 = bli_max( -diagoffb_j, 0 ); \
970 k_b1121 = k - off_b1121; \
971 \
972 a1 = a_cast; \
973 c11 = c1; \
974 \
975 n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
976 \
977 /* Initialize our next panel of B to be the current panel of B. */ \
978 b2 = b1; \
979 \
980 /* If the current panel of B intersects the diagonal, scale C
981 by beta. If it is strictly below the diagonal, scale by one.
982 This allows the current macro-kernel to work for both trmm
983 and trmm3. */ \
984 if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \
985 { \
986 /* Compute the panel stride for the current diagonal-
987 intersecting micro-panel. */ \
988 ps_b_cur = ( k_b1121 * ss_b_num ) / ss_b_den; \
989 \
990 if ( trmm_r_jr_my_iter( j, jr_thread ) ) { \
991 \
992 /* Save the 4m/3m imaginary stride of B to the auxinfo_t
993 object. */ \
994 bli_auxinfo_set_is_b( PACKNR * k_b1121, aux ); \
995 \
996 /* Loop over the m dimension (MR rows at a time). */ \
997 for ( i = 0; i < m_iter; ++i ) \
998 { \
999 if ( trmm_r_ir_my_iter( i, ir_thread ) ) { \
1000 \
1001 ctype* restrict a1_i; \
1002 ctype* restrict a2; \
1003 \
1004 m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
1005 \
/* Skip the part of the A micro-panel that faces the zero region of B. */ \
1006 a1_i = a1 + ( off_b1121 * PACKMR ) / off_scl; \
1007 \
1008 /* Compute the addresses of the next panels of A and B. */ \
1009 a2 = a1; \
1010 if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
1011 { \
1012 a2 = a_cast; \
1013 b2 = b1; \
1014 if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
1015 b2 = b_cast; \
1016 } \
1017 \
1018 /* Save addresses of next panels of A and B to the auxinfo_t
1019 object. */ \
1020 bli_auxinfo_set_next_a( a2, aux ); \
1021 bli_auxinfo_set_next_b( b2, aux ); \
1022 \
1023 /* Handle interior and edge cases separately. */ \
1024 if ( m_cur == MR && n_cur == NR ) \
1025 { \
1026 /* Invoke the gemm micro-kernel. */ \
1027 gemm_ukr_cast( k_b1121, \
1028 alpha_cast, \
1029 a1_i, \
1030 b1, \
1031 beta_cast, \
1032 c11, rs_c, cs_c, \
1033 &aux ); \
1034 } \
1035 else \
1036 { \
1037 /* Copy edge elements of C to the temporary buffer. */ \
1038 PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
1039 c11, rs_c, cs_c, \
1040 ct, rs_ct, cs_ct ); \
1041 \
1042 /* Invoke the gemm micro-kernel. */ \
1043 gemm_ukr_cast( k_b1121, \
1044 alpha_cast, \
1045 a1_i, \
1046 b1, \
1047 beta_cast, \
1048 ct, rs_ct, cs_ct, \
1049 &aux ); \
1050 \
1051 /* Copy the result to the edge of C. */ \
1052 PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
1053 ct, rs_ct, cs_ct, \
1054 c11, rs_c, cs_c ); \
1055 } \
1056 } \
1057 \
1058 a1 += rstep_a; \
1059 c11 += rstep_c; \
1060 } \
1061 } \
1062 \
/* Diagonal-intersecting micro-panels of B are shorter, so they are stepped
   over with their own stride. */ \
1063 b1 += ps_b_cur; \
1064 } \
1065 else if ( bli_is_strictly_below_diag_n( diagoffb_j, k, NR ) ) \
1066 { \
1067 if ( trmm_r_jr_my_iter( j, jr_thread ) ) { \
1068 \
1069 /* Save the 4m/3m imaginary stride of B to the auxinfo_t
1070 object. */ \
1071 bli_auxinfo_set_is_b( istep_b, aux ); \
1072 \
1073 /* Loop over the m dimension (MR rows at a time). */ \
1074 for ( i = 0; i < m_iter; ++i ) \
1075 { \
1076 if ( trmm_r_ir_my_iter( i, ir_thread ) ) { \
1077 \
1078 ctype* restrict a2; \
1079 \
1080 m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
1081 \
1082 /* Compute the addresses of the next panels of A and B. */ \
1083 a2 = a1; \
1084 if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
1085 { \
1086 a2 = a_cast; \
1087 b2 = b1; \
1088 if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
1089 b2 = b_cast; \
1090 } \
1091 \
1092 /* Save addresses of next panels of A and B to the auxinfo_t
1093 object. */ \
1094 bli_auxinfo_set_next_a( a2, aux ); \
1095 bli_auxinfo_set_next_b( b2, aux ); \
1096 \
1097 /* Handle interior and edge cases separately. */ \
1098 if ( m_cur == MR && n_cur == NR ) \
1099 { \
1100 /* Invoke the gemm micro-kernel. */ \
1101 gemm_ukr_cast( k, \
1102 alpha_cast, \
1103 a1, \
1104 b1, \
1105 one, \
1106 c11, rs_c, cs_c, \
1107 &aux ); \
1108 } \
1109 else \
1110 { \
1111 /* Invoke the gemm micro-kernel. */ \
1112 gemm_ukr_cast( k, \
1113 alpha_cast, \
1114 a1, \
1115 b1, \
1116 zero, \
1117 ct, rs_ct, cs_ct, \
1118 &aux ); \
1119 \
1120 /* Add the result to the edge of C. */ \
1121 PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
1122 ct, rs_ct, cs_ct, \
1123 c11, rs_c, cs_c ); \
1124 } \
1125 } \
1126 \
1127 a1 += rstep_a; \
1128 c11 += rstep_c; \
1129 } \
1130 } \
1131 \
/* Full-length micro-panels of B use the regular panel stride. */ \
1132 b1 += cstep_b; \
1133 } \
1134 \
/* Advance to the next NR-column panel of C. */ \
1135 c1 += cstep_c; \
1136 } \
1137 \
1138 /*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: a1", MR, k_b1121, a1, 1, MR, "%4.1f", "" );*/ \
1139 /*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: b1", k_b1121, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
1140 }
/* Instantiate the kernel above for each basic datatype, with gemm_ukr_t as
   the micro-kernel type suffix. */
1142 INSERT_GENTFUNC_BASIC( trmm_rl_ker_var2, gemm_ukr_t )
1144 #endif