/*

BLIS
An object-based framework for developing high-performance BLAS-like
libraries.

Copyright (C) 2014, The University of Texas at Austin

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "blis.h"

#define FUNCPTR_T gemm_fp

typedef void (*FUNCPTR_T)(
doff_t diagoffa,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, inc_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
void* gemm_ukr,
trmm_thrinfo_t* thread
);

static FUNCPTR_T GENARRAY(ftypes,trmm_lu_ker_var2);
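
/* GENARRAY expands to an array of type-specific function pointers (ftypes),
one entry per datatype, pointing at the ?trmm_lu_ker_var2 implementations
instantiated further below; the object-based wrapper selects the entry that
matches the execution datatype of C. */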

void bli_trmm_lu_ker_var2( obj_t* a,
obj_t* b,
obj_t* c,
gemm_t* cntl,
trmm_thrinfo_t* thread )
{
num_t dt_exec = bli_obj_execution_datatype( *c );

doff_t diagoffa = bli_obj_diag_offset( *a );

pack_t schema_a = bli_obj_pack_schema( *a );
pack_t schema_b = bli_obj_pack_schema( *b );

dim_t m = bli_obj_length( *c );
dim_t n = bli_obj_width( *c );
dim_t k = bli_obj_width( *a );

void* buf_a = bli_obj_buffer_at_off( *a );
inc_t cs_a = bli_obj_col_stride( *a );
inc_t pd_a = bli_obj_panel_dim( *a );
inc_t ps_a = bli_obj_panel_stride( *a );

void* buf_b = bli_obj_buffer_at_off( *b );
inc_t rs_b = bli_obj_row_stride( *b );
inc_t pd_b = bli_obj_panel_dim( *b );
inc_t ps_b = bli_obj_panel_stride( *b );

void* buf_c = bli_obj_buffer_at_off( *c );
inc_t rs_c = bli_obj_row_stride( *c );
inc_t cs_c = bli_obj_col_stride( *c );

obj_t scalar_a;
obj_t scalar_b;

void* buf_alpha;
void* buf_beta;

FUNCPTR_T f;

func_t* gemm_ukrs;
void* gemm_ukr;

// Detach and multiply the scalars attached to A and B.
bli_obj_scalar_detach( a, &scalar_a );
bli_obj_scalar_detach( b, &scalar_b );
bli_mulsc( &scalar_a, &scalar_b );

// Grab the addresses of the internal scalar buffers for the scalar
// merged above and the scalar attached to C.
buf_alpha = bli_obj_internal_scalar_buffer( scalar_b );
buf_beta = bli_obj_internal_scalar_buffer( *c );

// Index into the type combination array to extract the correct
// function pointer.
f = ftypes[dt_exec];

// Extract from the control tree node the func_t object containing
// the gemm micro-kernel function addresses, and then query the
// function address corresponding to the current datatype.
gemm_ukrs = cntl_gemm_ukrs( cntl );
gemm_ukr = bli_func_obj_query( dt_exec, gemm_ukrs );

// Invoke the function.
f( diagoffa,
schema_a,
schema_b,
m,
n,
k,
buf_alpha,
buf_a, cs_a, pd_a, ps_a,
buf_b, rs_b, pd_b, ps_b,
buf_beta,
buf_c, rs_c, cs_c,
gemm_ukr,
thread );
}
#ifdef BLIS_ENABLE_C66X_MEM_POOLS

#if defined (BLIS_ENABLE_C66X_EDMA) && defined (BLIS_ENABLE_C66X_IDMA)

#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname, ukrtype ) \
\
void PASTEMAC(ch,varname)( \
doff_t diagoffa, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, inc_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
void* gemm_ukr, \
trmm_thrinfo_t* jr_thread \
) \
{ \
/* Cast the micro-kernel address to its function pointer type. */ \
PASTECH(ch,ukrtype) gemm_ukr_cast = (PASTECH(ch,ukrtype))gemm_ukr; \
\
/* Temporary C buffer for edge cases. */ \
ctype ct[ PASTEMAC(ch,maxmr) * \
PASTEMAC(ch,maxnr) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const inc_t rs_ct = 1; \
const inc_t cs_ct = PASTEMAC(ch,maxmr); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
const dim_t PACKMR = cs_a; \
const dim_t PACKNR = rs_b; \
\
ctype* restrict one = PASTEMAC(ch,1); \
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffa_i; \
dim_t k_full; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_a1112; \
dim_t off_a1112; \
dim_t i, j; \
inc_t rstep_a; \
inc_t cstep_b; \
/*inc_t rstep_c;*/ \
inc_t cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t off_scl; \
inc_t ss_a_num; \
inc_t ss_a_den; \
inc_t ps_a_cur; \
auxinfo_t aux; \
\
trmm_thrinfo_t* ir_thread = trmm_thread_sub_trmm( jr_thread ); \
dim_t jr_num_threads = thread_n_way( jr_thread ); \
dim_t jr_thread_id = thread_work_id( jr_thread ); \
\
dim_t n_next, k_next, diagoffa_next; \
inc_t rstep_c11, rs_c11, cs_c11; \
\
mem_t b1_L1_mem; \
/* Note: b1_L1 is deliberately not declared restrict; if it is changed to
ctype* restrict, the memcpy-based copies misbehave and the gemm residual
becomes nonzero. */ \
ctype* b1_L1; \
\
mem_t a1_L1_mem, a2_L1_mem; \
ctype *a1_L1, *a2_L1, *temp; \
\
mem_t c0_L2_mem, c1_L2_mem, c2_L2_mem; \
ctype *cNew0, *cNew1, *cNew2, *cNewTemp; \
\
/* EDMA declarations */ \
lib_emt_Handle emt_handle_b = NULL; \
lib_emt_Handle emt_handle_c0 = NULL; \
lib_emt_Handle emt_handle_c1 = NULL; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If the current block of A is entirely below the diagonal,
it is implicitly zero. So we do nothing. */ \
if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \
\
/* Compute k_full. For all trmm, k_full is simply k. This is
needed because some parameter combinations of trmm reduce k
to advance past zero regions in the triangular matrix, and
when computing the imaginary stride of B (the non-triangular
matrix), which is used by 3m and 4m implementations, we need
this unreduced value of k. */ \
k_full = k; \
\
/* Compute indexing scaling factor for 4m or 3m. This is
needed because one of the packing register blocksizes (PACKMR
or PACKNR) is used to index into the micro-panels of the non-
triangular matrix when computing with a diagonal-intersecting
micro-panel of the triangular matrix. In the case of 4m or 3m,
real values are stored in both sub-panels, and so the indexing
needs to occur in units of real values. The value computed
here is divided into the complex pointer offset to cause the
pointer to be advanced by the correct value. */ \
if ( bli_is_4m_packed( schema_a ) || \
bli_is_3m_packed( schema_a ) || \
bli_is_rih_packed( schema_a ) ) off_scl = 2; \
else off_scl = 1; \
\
/* Compute the storage stride. Usually this is just PACKMR (for A)
or PACKNR (for B). However, in the case of 3m, we need to scale
the offset by 3/2. Since it's possible we may need to scale
the packing dimension by a non-integer value, we break up the
scaling factor into numerator and denominator. */ \
if ( bli_is_3m_packed( schema_a ) ) { ss_a_num = 3*PACKMR; \
ss_a_den = 2; } \
else { ss_a_num = 1*PACKMR; \
ss_a_den = 1; } \
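/* Example with illustrative (not configuration-specific) values: for a 3m
pack schema with PACKMR = 8 and a diagonal-intersecting micro-panel of
length k_a1112 = 6, the stride computed later is
ps_a_cur = ( 6 * 3*8 ) / 2 = 72 elements, i.e., 3/2 of the 48 elements
a non-3m micro-panel would occupy. */ \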
\
/* If there is a zero region to the left of where the diagonal of A
intersects the top edge of the block, adjust the pointer to B and
treat this case as if the diagonal offset were zero. Note that we
don't need to adjust the pointer to A since packm would have simply
skipped over the region that was not stored. */ \
if ( diagoffa > 0 ) \
{ \
i = diagoffa; \
k = k - i; \
diagoffa = 0; \
b_cast = b_cast + ( i * PACKNR ) / off_scl; \
} \
\
/* If there is a zero region below where the diagonal of A intersects the
right side of the block, shrink it to prevent "no-op" iterations from
executing. */ \
if ( -diagoffa + k < m ) \
{ \
m = -diagoffa + k; \
} \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
/*rstep_c = rs_c * MR;*/ \
cstep_c = cs_c * NR; \
\
/* Strides for C once an MC(=m) x NR panel of it has been staged in L2. */ \
rstep_c11 = MR; /* stride to the next MRxNR panel within the staged MCxNR panel */ \
rs_c11 = 1; \
cs_c11 = (m%2 == 0) ? m : m+1; /* column stride within the staged MCxNR panel */ \
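/* Padding the column stride up to an even element count when m is odd is
presumably done to keep the staged panel's columns consistently aligned
for the DMA transfers below; the padded element is never copied back to C. */ \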
\
istep_a = PACKMR * k; \
istep_b = PACKNR * k_full; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, aux ); \
bli_auxinfo_set_schema_b( schema_b, aux ); \
\
/* Save the imaginary stride of B to the auxinfo_t object. */ \
bli_auxinfo_set_is_b( istep_b, aux ); \
\
b1 = b_cast; \
c1 = c_cast; \
\
/* Acquire a buffer for B in L1. */ \
bli_mem_acquire_m( k*NR*sizeof(ctype), BLIS_BUFFER_FOR_B_PANEL_L1, &b1_L1_mem); \
b1_L1 = bli_mem_buffer( &b1_L1_mem ); \
b1_L1 = (ctype *) ((char *) b1_L1_mem.buf + PASTEMAC(ch,bank)); \
\
/* Acquire buffers for A in L1. */ \
/*printf("Acquire A k %d, MR %d, size of ctype %d, A size requested %d\n", k, MR, sizeof(ctype), k*MR*sizeof(ctype));*/ \
bli_mem_acquire_m( k*MR*sizeof(ctype), BLIS_BUFFER_FOR_A_BLOCK_L1, &a1_L1_mem); \
a1_L1 = bli_mem_buffer( &a1_L1_mem ); \
a1_L1 = a1_L1; \
\
bli_mem_acquire_m( k*MR*sizeof(ctype), BLIS_BUFFER_FOR_A_BLOCK_L1, &a2_L1_mem); \
a2_L1 = bli_mem_buffer( &a2_L1_mem ); \
\
/* Acquire buffers for C (MCxNR) in L2. */ \
bli_mem_acquire_m( cs_c11*NR*sizeof(ctype), BLIS_BUFFER_FOR_C_PANEL_L2, &c0_L2_mem); \
cNew0 = bli_mem_buffer( &c0_L2_mem ); \
\
bli_mem_acquire_m( cs_c11*NR*sizeof(ctype), BLIS_BUFFER_FOR_C_PANEL_L2, &c1_L2_mem); \
cNew1 = bli_mem_buffer( &c1_L2_mem ); \
\
bli_mem_acquire_m( cs_c11*NR*sizeof(ctype), BLIS_BUFFER_FOR_C_PANEL_L2, &c2_L2_mem); \
cNew2 = bli_mem_buffer( &c2_L2_mem ); \
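/* The three L2 buffers implement a rotating triple-buffering scheme for
panels of C: at any point, cNew1 holds the panel currently being updated
by the micro-kernel, cNew0 receives the next panel via EDMA prefetch, and
cNew2 holds the previously updated panel while it is written back to
memory. The pointers are rotated at the end of each iteration of the jr
loop below. */ \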
\
/* Acquire an EDMA handle from the pool. */ \
bli_dma_channel_acquire(&(emt_handle_b), lib_get_coreID()); \
if(emt_handle_b == NULL) \
{ \
printf("ker_var2 Failed to alloc edma handle CoreID %d \n", lib_get_coreID()); \
} \
/* Acquire an EDMA handle from the pool. */ \
bli_dma_channel_acquire(&(emt_handle_c0), lib_get_coreID()); \
if(emt_handle_c0 == NULL) \
{ \
printf("ker_var2 Failed to alloc edma handle for C0 CoreID %d \n", lib_get_coreID()); \
} \
/* Acquire an EDMA handle from the pool. */ \
bli_dma_channel_acquire(&(emt_handle_c1), lib_get_coreID()); \
if(emt_handle_c1 == NULL) \
{ \
printf("ker_var2 Failed to alloc edma handle for C1 CoreID %d \n", lib_get_coreID()); \
} \
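/* Handle roles: emt_handle_b streams the current micro-panel of B into its
L1 buffer, emt_handle_c0 prefetches the next MCxNR panel of C into L2,
and emt_handle_c1 writes the updated panel of C back out to memory. */ \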
\
n_cur = ( bli_is_not_edge_f( 0, n_iter, n_left ) ? NR : n_left ); \
/* Loop over the n dimension (NR columns at a time). */ \
/* Transferring the first MC(=m) x NR panel of C into L2. */ \
if (cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
{ \
lib_emt_copy2D2D(emt_handle_c0, c1, \
cNew1, m*sizeof(ctype), \
n_cur, cs_c*sizeof(ctype), cs_c11*sizeof(ctype)); \
} \
else \
{ \
dim_t ii; \
ctype *ptr_source; \
ctype *ptr_dest; \
ptr_source = c1; \
ptr_dest = cNew1; \
for(ii = 0; ii < n_cur; ii++) \
{ \
memcpy(ptr_dest, ptr_source, m*sizeof(ctype)); \
ptr_source += cs_c; \
ptr_dest += cs_c11; \
} \
} \
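/* When the column stride of C (in bytes) is too large for a 2D EDMA
transfer (>= BLIS_C66X_MAXDMASTRIDE), the panel is instead staged one
column at a time with memcpy; the same fallback is used for the prefetch
and write-back transfers inside the jr loop. */ \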
for ( j = 0; j < n_iter; ++j ) \
{ \
if ( trmm_l_jr_my_iter( j, jr_thread ) ) { \
\
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
a1 = a_cast; \
/*c11 = c1;*/ \
c11 = cNew1; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
n_next = ( bli_is_not_edge_f( j+1, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
lib_emt_copy1D1D(emt_handle_b, b1, b1_L1, k*NR*sizeof(ctype)); \
\
lib_emt_wait(emt_handle_c0); \
if(j < n_iter-1) /* no transfer for last iteration */ \
{ \
if (cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
{ \
lib_emt_copy2D2D(emt_handle_c0, c1+cstep_c, \
cNew0, m*sizeof(ctype), \
n_next, cs_c*sizeof(ctype), \
cs_c11*sizeof(ctype)); \
} \
else \
{ \
dim_t ii; \
ctype *ptr_source; \
ctype *ptr_dest; \
ptr_source = c1+cstep_c; \
ptr_dest = cNew0; \
for(ii = 0; ii < n_next; ii++) \
{ \
memcpy(ptr_dest, ptr_source, m*sizeof(ctype)); \
ptr_source += cs_c; \
ptr_dest += cs_c11; \
} \
} \
} \
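/* Panels of A are double-buffered in L1: the panel staged into a2_L1 by the
IDMA copies below is swapped with a1_L1 just before the micro-kernel
consumes it, while the copy of the following panel into a2_L1 is started. */ \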
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
diagoffa_i = diagoffa + ( doff_t )i*MR; \
diagoffa_next = diagoffa + ( doff_t )(i+1)*MR; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
if(i == 0) \
{ \
if(bli_intersects_diag_n( diagoffa_i, MR, k )) \
lib_imt_copy(a1, a2_L1, (k - diagoffa_i)*MR*sizeof(ctype)); \
else \
lib_imt_copy(a1, a2_L1, k*MR*sizeof(ctype)); \
} \
\
/* If the current panel of A intersects the diagonal, scale C
by beta. If it is strictly above the diagonal, scale by one.
This allows the current macro-kernel to work for both trmm
and trmm3. */ \
if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
{ \
ctype* restrict b1_i; \
ctype* restrict a2; \
\
/* Determine the offset to and length of the panel that was
packed so we can index into the corresponding location in
b1. */ \
off_a1112 = diagoffa_i; \
k_a1112 = k - off_a1112; \
\
/* Compute the panel stride for the current diagonal-
intersecting micro-panel. */ \
ps_a_cur = ( k_a1112 * ss_a_num ) / ss_a_den; \
\
if ( trmm_l_ir_my_iter( i, ir_thread ) ) { \
\
b1_i = b1_L1 + ( off_a1112 * PACKNR ) / off_scl; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + ps_a_cur; \
lib_imt_wait(); \
if(i == 0) \
{ \
lib_emt_wait(emt_handle_b); \
} \
temp = a1_L1; \
a1_L1 = a2_L1; \
a2_L1 = temp; \
/* Note: a1 is not advanced to the next panel here; it is advanced
(a1 += ps_a_cur) only after the micro-kernel has been invoked below. */ \
if( bli_intersects_diag_n( diagoffa_next, MR, k ) ) \
{ \
k_next = k-diagoffa_next; \
} \
else \
k_next = k; \
if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
b2 = b_cast; \
} \
else \
{ \
lib_imt_copy(a2, a2_L1, k_next*MR*sizeof(ctype)); \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, aux ); \
bli_auxinfo_set_next_b( b2, aux ); \
\
/* Save the 4m/3m imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( PACKMR * k_a1112, aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr_cast( k_a1112, \
alpha_cast, \
a1_L1, \
b1_i, \
beta_cast, \
c11, rs_c11, cs_c11, /*rs_c, cs_c,*/ \
&aux ); \
} \
else \
{ \
/* Copy edge elements of C to the temporary buffer. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
c11, rs_c11, cs_c11, /*rs_c, cs_c,*/ \
ct, rs_ct, cs_ct ); \
\
/* Invoke the gemm micro-kernel. */ \
gemm_ukr_cast( k_a1112, \
alpha_cast, \
a1_L1, \
b1_i, \
beta_cast, \
ct, rs_ct, cs_ct, \
&aux ); \
\
/* Copy the result to the edge of C. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c11, cs_c11 /*rs_c, cs_c*/ ); \
} \
} /*if ( trmm_l_ir_my_iter( i, ir_thread ) )*/ \
\
a1 += ps_a_cur; \
} \
else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
{ \
if ( trmm_l_ir_my_iter( i, ir_thread ) ) { \
\
ctype* restrict a2; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + rstep_a; \
lib_imt_wait(); \
if(i == 0) \
{ \
lib_emt_wait(emt_handle_b); \
} \
temp = a1_L1; \
a1_L1 = a2_L1; \
a2_L1 = temp; \
/* Note: a1 is not advanced to the next panel here; it is advanced
(a1 += rstep_a) only after the micro-kernel has been invoked below. */ \
/* Start the copy of the next panel of A. */ \
if( bli_intersects_diag_n( diagoffa_next, MR, k ) ) \
{ \
k_next = k-diagoffa_next; \
} \
else \
k_next = k; \
if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
b2 = b_cast; \
} \
else \
lib_imt_copy(a2, a2_L1, k_next*MR*sizeof(ctype)); \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, aux ); \
bli_auxinfo_set_next_b( b2, aux ); \
\
/* Save the 4m/3m imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( istep_a, aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr_cast( k, \
alpha_cast, \
a1_L1, \
b1_L1, \
one, \
c11, rs_c11, cs_c11, /*rs_c, cs_c,*/ \
&aux ); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr_cast( k, \
alpha_cast, \
a1_L1, \
b1_L1, \
zero, \
ct, rs_ct, cs_ct, \
&aux ); \
\
/* Add the result to the edge of C. */ \
PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c11, cs_c11 /*rs_c, cs_c*/ ); \
} \
} /*if ( trmm_l_ir_my_iter( i, ir_thread ) )*/ \
\
a1 += rstep_a; \
} /*else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) )*/ \
\
c11 += rstep_c11; \
/*c11 += rstep_c;*/ \
} /*for ( i = 0; i < m_iter; ++i )*/ \
/* Circularly shift the L2 buffers for C. */ \
cNewTemp = cNew0; \
cNew0 = cNew2; \
cNew2 = cNew1; \
cNew1 = cNewTemp; \
if(j != 0) /* wait for the previous save of C to complete; skip first iteration */ \
{ \
lib_emt_wait(emt_handle_c1); \
} \
/* Save the updated panel of C. */ \
if (cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
{ \
lib_emt_copy2D2D(emt_handle_c1, cNew2, c1, m*sizeof(ctype), \
n_cur, cs_c11*sizeof(ctype), cs_c*sizeof(ctype)); \
} \
else \
{ \
dim_t ii; \
ctype *ptr_source; \
ctype *ptr_dest; \
ptr_source = cNew2; \
ptr_dest = c1; \
for(ii = 0; ii < n_cur; ii++) \
{ \
memcpy(ptr_dest, ptr_source, m*sizeof(ctype)); \
ptr_source += cs_c11; \
ptr_dest += cs_c; \
} \
} \
} /*if ( trmm_l_jr_my_iter( j, jr_thread ) )*/ \
\
b1 += cstep_b; \
c1 += cstep_c; \
} \
\
bli_mem_release( &c2_L2_mem ); \
bli_mem_release( &c1_L2_mem ); \
bli_mem_release( &c0_L2_mem ); \
bli_mem_release( &a2_L1_mem ); \
bli_mem_release( &a1_L1_mem ); \
bli_mem_release( &b1_L1_mem ); \
if ( emt_handle_b != NULL ) \
{ \
bli_dma_channel_release(emt_handle_b, lib_get_coreID()); \
emt_handle_b = NULL; \
} \
if ( emt_handle_c0 != NULL ) \
{ \
bli_dma_channel_release(emt_handle_c0, lib_get_coreID()); \
emt_handle_c0 = NULL; \
} \
if ( emt_handle_c1 != NULL ) \
{ \
lib_emt_wait(emt_handle_c1); /* wait for the save of C to complete */ \
bli_dma_channel_release(emt_handle_c1, lib_get_coreID()); \
emt_handle_c1 = NULL; \
} \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: b1", k_a1112, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
}

INSERT_GENTFUNC_BASIC( trmm_lu_ker_var2, gemm_ukr_t )

#else
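
/* Note: with BLIS_ENABLE_C66X_MEM_POOLS defined but EDMA and IDMA not both
enabled, this branch instantiates no type-specific kernels. */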

#endif

#else

#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname, ukrtype ) \
\
void PASTEMAC(ch,varname)( \
doff_t diagoffa, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, inc_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
void* gemm_ukr, \
trmm_thrinfo_t* jr_thread \
) \
{ \
/* Cast the micro-kernel address to its function pointer type. */ \
PASTECH(ch,ukrtype) gemm_ukr_cast = gemm_ukr; \
\
/* Temporary C buffer for edge cases. */ \
ctype ct[ PASTEMAC(ch,maxmr) * \
PASTEMAC(ch,maxnr) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const inc_t rs_ct = 1; \
const inc_t cs_ct = PASTEMAC(ch,maxmr); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
const dim_t PACKMR = cs_a; \
const dim_t PACKNR = rs_b; \
\
ctype* restrict one = PASTEMAC(ch,1); \
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffa_i; \
dim_t k_full; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_a1112; \
dim_t off_a1112; \
dim_t i, j; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t off_scl; \
inc_t ss_a_num; \
inc_t ss_a_den; \
inc_t ps_a_cur; \
auxinfo_t aux; \
\
trmm_thrinfo_t* ir_thread = trmm_thread_sub_trmm( jr_thread ); \
dim_t jr_num_threads = thread_n_way( jr_thread ); \
dim_t jr_thread_id = thread_work_id( jr_thread ); \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If the current block of A is entirely below the diagonal,
it is implicitly zero. So we do nothing. */ \
if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \
\
/* Compute k_full. For all trmm, k_full is simply k. This is
needed because some parameter combinations of trmm reduce k
to advance past zero regions in the triangular matrix, and
when computing the imaginary stride of B (the non-triangular
matrix), which is used by 3m and 4m implementations, we need
this unreduced value of k. */ \
k_full = k; \
\
/* Compute indexing scaling factor for 4m or 3m. This is
needed because one of the packing register blocksizes (PACKMR
or PACKNR) is used to index into the micro-panels of the non-
triangular matrix when computing with a diagonal-intersecting
micro-panel of the triangular matrix. In the case of 4m or 3m,
real values are stored in both sub-panels, and so the indexing
needs to occur in units of real values. The value computed
here is divided into the complex pointer offset to cause the
pointer to be advanced by the correct value. */ \
if ( bli_is_4m_packed( schema_a ) || \
bli_is_3m_packed( schema_a ) || \
bli_is_rih_packed( schema_a ) ) off_scl = 2; \
else off_scl = 1; \
\
/* Compute the storage stride. Usually this is just PACKMR (for A)
or PACKNR (for B). However, in the case of 3m, we need to scale
the offset by 3/2. Since it's possible we may need to scale
the packing dimension by a non-integer value, we break up the
scaling factor into numerator and denominator. */ \
if ( bli_is_3m_packed( schema_a ) ) { ss_a_num = 3*PACKMR; \
ss_a_den = 2; } \
else { ss_a_num = 1*PACKMR; \
ss_a_den = 1; } \
\
/* If there is a zero region to the left of where the diagonal of A
intersects the top edge of the block, adjust the pointer to B and
treat this case as if the diagonal offset were zero. Note that we
don't need to adjust the pointer to A since packm would have simply
skipped over the region that was not stored. */ \
if ( diagoffa > 0 ) \
{ \
i = diagoffa; \
k = k - i; \
diagoffa = 0; \
b_cast = b_cast + ( i * PACKNR ) / off_scl; \
} \
\
/* If there is a zero region below where the diagonal of A intersects the
right side of the block, shrink it to prevent "no-op" iterations from
executing. */ \
if ( -diagoffa + k < m ) \
{ \
m = -diagoffa + k; \
} \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
istep_a = PACKMR * k; \
istep_b = PACKNR * k_full; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, aux ); \
bli_auxinfo_set_schema_b( schema_b, aux ); \
\
/* Save the imaginary stride of B to the auxinfo_t object. */ \
bli_auxinfo_set_is_b( istep_b, aux ); \
\
b1 = b_cast; \
c1 = c_cast; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
{ \
if ( trmm_l_jr_my_iter( j, jr_thread ) ) { \
\
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
a1 = a_cast; \
c11 = c1; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* If the current panel of A intersects the diagonal, scale C
by beta. If it is strictly above the diagonal, scale by one.
This allows the current macro-kernel to work for both trmm
and trmm3. */ \
if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
{ \
ctype* restrict b1_i; \
ctype* restrict a2; \
\
/* Determine the offset to and length of the panel that was
packed so we can index into the corresponding location in
b1. */ \
off_a1112 = diagoffa_i; \
k_a1112 = k - off_a1112; \
\
/* Compute the panel stride for the current diagonal-
intersecting micro-panel. */ \
ps_a_cur = ( k_a1112 * ss_a_num ) / ss_a_den; \
\
if ( trmm_l_ir_my_iter( i, ir_thread ) ) { \
\
b1_i = b1 + ( off_a1112 * PACKNR ) / off_scl; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, aux ); \
bli_auxinfo_set_next_b( b2, aux ); \
\
/* Save the 4m/3m imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( PACKMR * k_a1112, aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr_cast( k_a1112, \
alpha_cast, \
a1, \
b1_i, \
beta_cast, \
c11, rs_c, cs_c, \
&aux ); \
} \
else \
{ \
/* Copy edge elements of C to the temporary buffer. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
c11, rs_c, cs_c, \
ct, rs_ct, cs_ct ); \
\
/* Invoke the gemm micro-kernel. */ \
gemm_ukr_cast( k_a1112, \
alpha_cast, \
a1, \
b1_i, \
beta_cast, \
ct, rs_ct, cs_ct, \
&aux ); \
\
/* Copy the result to the edge of C. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
\
/*
printf( "bli_trmm_lu_ker_var2: applying ps_a_cur = %lu\n", ps_a_cur ); \
*/ \
a1 += ps_a_cur; \
} \
else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
{ \
if ( trmm_l_ir_my_iter( i, ir_thread ) ) { \
\
ctype* restrict a2; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, aux ); \
bli_auxinfo_set_next_b( b2, aux ); \
\
/* Save the 4m/3m imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( istep_a, aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr_cast( k, \
alpha_cast, \
a1, \
b1, \
one, \
c11, rs_c, cs_c, \
&aux ); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr_cast( k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux ); \
\
/* Add the result to the edge of C. */ \
PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
\
/*
printf( "bli_trmm_lu_ker_var2: applying rstep_a = %lu\n", rstep_a ); \
*/ \
a1 += rstep_a; \
} \
\
c11 += rstep_c; \
} \
} \
\
b1 += cstep_b; \
c1 += cstep_c; \
} \
\
/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: b1", k_a1112, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
}

INSERT_GENTFUNC_BASIC( trmm_lu_ker_var2, gemm_ukr_t )

#endif