1 /*
3 BLIS
4 An object-based framework for developing high-performance BLAS-like
5 libraries.
7 Copyright (C) 2014, The University of Texas at Austin
9 Redistribution and use in source and binary forms, with or without
10 modification, are permitted provided that the following conditions are
11 met:
12 - Redistributions of source code must retain the above copyright
13 notice, this list of conditions and the following disclaimer.
14 - Redistributions in binary form must reproduce the above copyright
15 notice, this list of conditions and the following disclaimer in the
16 documentation and/or other materials provided with the distribution.
17 - Neither the name of The University of Texas at Austin nor the names
18 of its contributors may be used to endorse or promote products
19 derived from this software without specific prior written permission.
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 */
35 #include "blis.h"
/*
   packm_struc_cxk_3m: structure-aware front-end for packing one micro-panel
   of c into p, instantiated per datatype through GENTFUNCCO (kername is the
   pack kernel front-end that is called for general panels).

   The routine
     1. derives the panel dimension/length and the matching strides of c and
        p from the pack schema;
     2. dispatches on strucc: general panels go straight to the kernel,
        Hermitian/symmetric panels to packm_herm_cxk_3m, and everything else
        (treated as triangular) to packm_tri_cxk_3m;
     3. zero-fills rows [m_panel, m_panel_max) and columns
        [n_panel, n_panel_max) of the packed panel;
     4. for triangular c with padding in both dimensions, writes ones (real
        sub-panel) and zeros (imaginary sub-panel) on the diagonal of the
        padded bottom-right corner.

   The packed panel is addressed through ctype_r pointers as three
   sub-panels at offsets 0, is_p and 2*is_p from p, named r, i and rpi in
   the code below.

   Parameters:
     strucc         structure of c; selects the packing path.
     diagoffc       diagonal offset, forwarded to the herm/tri helpers.
     diagc          diagonal flag, forwarded to the tri helper.
     uploc          uplo flag, forwarded to the herm/tri helpers.
     conjc          conjugation flag, forwarded to the kernel and helpers.
     schema         pack schema; decides whether m or n is the panel length.
     invdiag        forwarded to the tri helper.
     m_panel,
     n_panel        dimensions of the panel being packed.
     m_panel_max,
     n_panel_max    padded dimensions of the packed panel.
     kappa          scalar forwarded to the kernel and helpers.
     c, rs_c, cs_c  source panel and its row/column strides.
     p, rs_p, cs_p  destination panel and its row/column strides; the edge
                    code applies these strides to ctype_r pointers.
*/
37 #undef GENTFUNCCO
38 #define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \
39 \
40 void PASTEMAC(ch,varname)( \
41 struc_t strucc, \
42 doff_t diagoffc, \
43 diag_t diagc, \
44 uplo_t uploc, \
45 conj_t conjc, \
46 pack_t schema, \
47 bool_t invdiag, \
48 dim_t m_panel, \
49 dim_t n_panel, \
50 dim_t m_panel_max, \
51 dim_t n_panel_max, \
52 ctype* restrict kappa, \
53 ctype* restrict c, inc_t rs_c, inc_t cs_c, \
54 ctype* restrict p, inc_t rs_p, inc_t cs_p \
55 ) \
56 { \
57 dim_t panel_dim; \
58 dim_t panel_len; \
59 dim_t panel_len_max; \
60 inc_t incc, ldc; \
61 inc_t is_p, ldp; \
62 \
63 \
64 /* Determine the dimensions and relative strides of the micro-panel
65 based on its pack schema. */ \
66 if ( bli_is_col_packed( schema ) ) \
67 { \
68 /* Prepare to pack to row-stored column panel. */ \
69 panel_dim = n_panel; \
70 panel_len = m_panel; \
71 panel_len_max = m_panel_max; \
72 incc = cs_c; \
73 ldc = rs_c; \
74 ldp = rs_p; \
75 } \
76 else /* if ( bli_is_row_packed( schema ) ) */ \
77 { \
78 /* Prepare to pack to column-stored row panel. */ \
79 panel_dim = m_panel; \
80 panel_len = n_panel; \
81 panel_len_max = n_panel_max; \
82 incc = rs_c; \
83 ldc = cs_c; \
84 ldp = cs_p; \
85 } \
86 \
87 /* Compute the imaginary stride (ie: the element offset to the imaginary
88 panel). The edge handling below addresses the rpi panel at twice this
   offset. */ \
89 is_p = ldp * panel_len_max; \
90 \
91 \
92 /* Handle micro-panel packing based on the structure of the matrix
93 being packed. */ \
94 if ( bli_is_general( strucc ) ) \
95 { \
96 /* For micro-panels of general matrices, we can call the pack
97 kernel front-end directly. */ \
98 PASTEMAC(ch,kername)( conjc, \
99 panel_dim, \
100 panel_len, \
101 kappa, \
102 c, incc, ldc, \
103 p, is_p, ldp ); \
104 } \
105 else if ( bli_is_herm_or_symm( strucc ) ) \
106 { \
107 /* Call a helper function for micro-panels of Hermitian/symmetric
108 matrices. */ \
109 PASTEMAC(ch,packm_herm_cxk_3m)( strucc, \
110 diagoffc, \
111 uploc, \
112 conjc, \
113 schema, \
114 m_panel, \
115 n_panel, \
116 m_panel_max, \
117 n_panel_max, \
118 panel_dim, \
119 panel_len, \
120 kappa, \
121 c, rs_c, cs_c, \
122 incc, ldc, \
123 p, rs_p, cs_p, \
124 is_p, ldp ); \
125 } \
126 else /* ( bli_is_triangular( strucc ) ) */ \
127 { \
128 /* Call a helper function for micro-panels of triangular
129 matrices. */ \
130 PASTEMAC(ch,packm_tri_cxk_3m)( strucc, \
131 diagoffc, \
132 diagc, \
133 uploc, \
134 conjc, \
135 schema, \
136 invdiag, \
137 m_panel, \
138 n_panel, \
139 m_panel_max, \
140 n_panel_max, \
141 panel_dim, \
142 panel_len, \
143 kappa, \
144 c, rs_c, cs_c, \
145 incc, ldc, \
146 p, rs_p, cs_p, \
147 is_p, ldp ); \
148 } \
149 \
150 \
151 /* The packed memory region was acquired/allocated with "aligned"
152 dimensions (ie: dimensions that were possibly inflated up to a
153 multiple). When these dimensions are inflated, it creates empty
154 regions along the bottom and/or right edges of the matrix. If
155 either region exists, we set them to zero. This allows the
156 micro-kernel to remain simple since it does not need to support
157 different register blockings for the edge cases. */ \
158 if ( m_panel != m_panel_max ) \
159 { \
160 ctype_r* restrict zero_r = PASTEMAC(chr,0); \
161 dim_t i = m_panel; \
162 dim_t m_edge = m_panel_max - i; \
163 dim_t n_edge = n_panel_max; \
164 ctype_r* p_edge_r = ( ctype_r* )p + (i )*rs_p; \
165 ctype_r* p_edge_i = ( ctype_r* )p + is_p + (i )*rs_p; \
166 ctype_r* p_edge_rpi = ( ctype_r* )p + 2*is_p + (i )*rs_p; \
167 /* Zero the padded bottom rows in each of the r, i and rpi sub-panels. */ \
168 PASTEMAC(chr,setm)( 0, \
169 BLIS_NONUNIT_DIAG, \
170 BLIS_DENSE, \
171 m_edge, \
172 n_edge, \
173 zero_r, \
174 p_edge_r, rs_p, cs_p ); \
175 PASTEMAC(chr,setm)( 0, \
176 BLIS_NONUNIT_DIAG, \
177 BLIS_DENSE, \
178 m_edge, \
179 n_edge, \
180 zero_r, \
181 p_edge_i, rs_p, cs_p ); \
182 PASTEMAC(chr,setm)( 0, \
183 BLIS_NONUNIT_DIAG, \
184 BLIS_DENSE, \
185 m_edge, \
186 n_edge, \
187 zero_r, \
188 p_edge_rpi, rs_p, cs_p ); \
189 } \
190 \
191 if ( n_panel != n_panel_max ) \
192 { \
193 ctype_r* restrict zero_r = PASTEMAC(chr,0); \
194 dim_t j = n_panel; \
195 dim_t m_edge = m_panel_max; \
196 dim_t n_edge = n_panel_max - j; \
197 ctype_r* p_edge_r = ( ctype_r* )p + (j )*cs_p; \
198 ctype_r* p_edge_i = ( ctype_r* )p + is_p + (j )*cs_p; \
199 ctype_r* p_edge_rpi = ( ctype_r* )p + 2*is_p + (j )*cs_p; \
200 /* Zero the padded right columns in each of the r, i and rpi sub-panels. */ \
201 PASTEMAC(chr,setm)( 0, \
202 BLIS_NONUNIT_DIAG, \
203 BLIS_DENSE, \
204 m_edge, \
205 n_edge, \
206 zero_r, \
207 p_edge_r, rs_p, cs_p ); \
208 PASTEMAC(chr,setm)( 0, \
209 BLIS_NONUNIT_DIAG, \
210 BLIS_DENSE, \
211 m_edge, \
212 n_edge, \
213 zero_r, \
214 p_edge_i, rs_p, cs_p ); \
215 PASTEMAC(chr,setm)( 0, \
216 BLIS_NONUNIT_DIAG, \
217 BLIS_DENSE, \
218 m_edge, \
219 n_edge, \
220 zero_r, \
221 p_edge_rpi, rs_p, cs_p ); \
222 } \
223 \
224 \
225 if ( bli_is_triangular( strucc ) ) \
226 { \
227 /* If this panel is an edge case in both panel dimension and length,
228 then it must be a bottom-right corner case. Set the part of the
229 diagonal that extends into the zero-padded region to identity.
230 NOTE: This is actually only necessary when packing for trsm, as
231 it helps prevent NaNs and Infs from creeping into the computation.
232 However, we set the region to identity for trmm as well. Those
233 1.0's end up getting multiplied by the 0.0's in the zero-padded
234 region of the other matrix, so there is no harm in this. */ \
235 if ( m_panel != m_panel_max && \
236 n_panel != n_panel_max ) \
237 { \
238 ctype_r* restrict one_r = PASTEMAC(chr,1); \
239 ctype_r* restrict zero_r = PASTEMAC(chr,0); \
240 dim_t i = m_panel; \
241 dim_t j = n_panel; \
242 dim_t m_br = m_panel_max - i; \
243 dim_t n_br = n_panel_max - j; \
244 ctype_r* p_br_r = ( ctype_r* )p + (i )*rs_p + (j )*cs_p; \
245 ctype_r* p_br_i = ( ctype_r* )p + is_p + (i )*rs_p + (j )*cs_p; \
246 /* Only the r and i sub-panels are written here; the rpi sub-panel of
   this corner keeps the zeros stored by the edge handling above. */ \
247 PASTEMAC(chr,setd)( 0, \
248 m_br, \
249 n_br, \
250 one_r, \
251 p_br_r, rs_p, cs_p ); \
252 PASTEMAC(chr,setd)( 0, \
253 m_br, \
254 n_br, \
255 zero_r, \
256 p_br_i, rs_p, cs_p ); \
257 } \
258 } \
259 }
261 INSERT_GENTFUNCCO_BASIC( packm_struc_cxk_3m, packm_cxk_3m )
/*
   packm_herm_cxk_3m: packs one micro-panel of a Hermitian or symmetric
   matrix c into p, instantiated per datatype through GENTFUNCCO. kername is
   the pack kernel front-end used for the dense pieces of the panel.

   Panel does not intersect the diagonal:
     The panel is packed with a single kernel call. If it lies in the
     unstored region, c is first shifted by diagoffc columns and -diagoffc
     rows so that it refers to where the data is actually stored, incc and
     ldc are swapped, and conjc is toggled when c is Hermitian.

   Panel intersects the diagonal:
     The panel is split along its length into p10 and p12, each packed with
     its own kernel call. One of the two pieces (p12 in the first branch,
     p10 in the second) is read with incc and ldc exchanged and, for
     Hermitian c, with its conjugation flag toggled. The stored triangle of
     the panel_dim x panel_dim diagonal block (c11/p11, located
     diagoffc_abs steps along the panel) is then copied into p11: real and
     imaginary parts are handled separately, the imaginary diagonal is
     zeroed for Hermitian c, kappa is applied, and the rpi sub-panel of the
     block is recomputed as the sum of the r and i sub-panels.

   Parameters:
     strucc         structure of c (Hermitian vs. symmetric matters here).
     diagoffc       diagonal offset of the panel.
     uploc          which triangle of c is stored.
     conjc          conjugation to apply while packing.
     schema         pack schema; used to derive row_stored/col_stored.
     m_panel,
     n_panel        dimensions of the panel being packed.
     m_panel_max,
     n_panel_max    not referenced in this routine.
     panel_dim,
     panel_len      panel dimension and length passed to the kernel.
     kappa          scalar applied to the packed data.
     c, rs_c, cs_c  source panel and its row/column strides.
     incc, ldc      strides of c matching panel_dim and panel_len.
     p, rs_p, cs_p  destination panel and its row/column strides.
     is_p, ldp      offset between the sub-panels of p, and the stride of p
                    along the panel length.
*/
266 #undef GENTFUNCCO
267 #define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \
268 \
269 void PASTEMAC(ch,varname)( \
270 struc_t strucc, \
271 doff_t diagoffc, \
272 uplo_t uploc, \
273 conj_t conjc, \
274 pack_t schema, \
275 dim_t m_panel, \
276 dim_t n_panel, \
277 dim_t m_panel_max, \
278 dim_t n_panel_max, \
279 dim_t panel_dim, \
280 dim_t panel_len, \
281 ctype* restrict kappa, \
282 ctype* restrict c, inc_t rs_c, inc_t cs_c, \
283 inc_t incc, inc_t ldc, \
284 ctype* restrict p, inc_t rs_p, inc_t cs_p, \
285 inc_t is_p, inc_t ldp \
286 ) \
287 { \
288 doff_t diagoffc_abs; \
289 dim_t i, j; \
290 bool_t row_stored; \
291 bool_t col_stored; \
292 \
293 \
294 /* Create flags to indicate row or column storage. Note that the
295 schema bit that encodes row or column is describing the form of
296 micro-panel, not the storage in the micro-panel. Hence the
297 mismatch in "row" and "column" semantics. */ \
298 row_stored = bli_is_col_packed( schema ); \
299 col_stored = bli_is_row_packed( schema ); \
300 \
301 \
302 /* Handle the case where the micro-panel does NOT intersect the
303 diagonal separately from the case where it does intersect. */ \
304 if ( !bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) \
305 { \
306 /* If the current panel is unstored, we need to make a few
307 adjustments so we refer to the data where it is actually
308 stored, also taking conjugation into account. (Note this
309 implicitly assumes we are operating on a dense panel
310 within a larger symmetric or Hermitian matrix, since a
311 general matrix would not contain any unstored region.) */ \
312 if ( bli_is_unstored_subpart_n( diagoffc, uploc, m_panel, n_panel ) ) \
313 { \
314 c = c + diagoffc * ( doff_t )cs_c + \
315 -diagoffc * ( doff_t )rs_c; \
316 bli_swap_incs( incc, ldc ); \
317 \
318 if ( bli_is_hermitian( strucc ) ) \
319 bli_toggle_conj( conjc ); \
320 } \
321 \
322 /* Pack the full panel. */ \
323 PASTEMAC(ch,kername)( conjc, \
324 panel_dim, \
325 panel_len, \
326 kappa, \
327 c, incc, ldc, \
328 p, is_p, ldp ); \
329 } \
330 else /* if ( bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) */ \
331 { \
332 ctype_r* restrict p_r = ( ctype_r* )p; \
333 \
334 ctype_r* restrict one_r = PASTEMAC(chr,1); \
335 ctype_r* restrict minus_one_r = PASTEMAC(chr,m1); \
336 \
337 ctype* restrict c10; \
338 ctype_r* restrict p10; \
339 dim_t p10_dim, p10_len; \
340 inc_t incc10, ldc10; \
341 doff_t diagoffc10; \
342 conj_t conjc10; \
343 \
344 ctype* restrict c12; \
345 ctype_r* restrict p12; \
346 dim_t p12_dim, p12_len; \
347 inc_t incc12, ldc12; \
348 doff_t diagoffc12; \
349 conj_t conjc12; \
350 \
351 /* Sanity check. Diagonals should not intersect the short end of
352 a micro-panel. If they do, then the constraint that cache
353 blocksizes be whole multiples of the register blocksizes was
354 somehow violated. */ \
355 if ( ( col_stored && diagoffc < 0 ) || \
356 ( row_stored && diagoffc > 0 ) ) \
357 bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
358 \
359 diagoffc_abs = bli_abs( diagoffc ); \
360 \
361 if ( ( row_stored && bli_is_upper( uploc ) ) || \
362 ( col_stored && bli_is_lower( uploc ) ) ) \
363 { \
364 p10_dim = panel_dim; \
365 p10_len = diagoffc_abs; \
366 p10 = p_r; \
367 c10 = c; \
368 incc10 = incc; \
369 ldc10 = ldc; \
370 conjc10 = conjc; \
371 \
372 p12_dim = panel_dim; \
373 p12_len = panel_len - p10_len; \
374 j = p10_len; \
375 diagoffc12 = diagoffc_abs - j; /* always zero, since j == diagoffc_abs */ \
376 p12 = p_r + (j )*ldp; \
377 c12 = c + (j )*ldc; \
378 c12 = c12 + diagoffc12 * ( doff_t )cs_c + \
379 -diagoffc12 * ( doff_t )rs_c; \
380 incc12 = ldc; \
381 ldc12 = incc; \
382 conjc12 = conjc; \
383 \
384 if ( bli_is_hermitian( strucc ) ) \
385 bli_toggle_conj( conjc12 ); \
386 } \
387 else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \
388 ( col_stored && bli_is_upper( uploc ) ) ) */ \
389 { \
390 p10_dim = panel_dim; \
391 p10_len = diagoffc_abs + panel_dim; \
392 diagoffc10 = diagoffc; \
393 p10 = p_r; \
394 c10 = c; \
395 c10 = c10 + diagoffc10 * ( doff_t )cs_c + \
396 -diagoffc10 * ( doff_t )rs_c; \
397 incc10 = ldc; \
398 ldc10 = incc; \
399 conjc10 = conjc; \
400 \
401 p12_dim = panel_dim; \
402 p12_len = panel_len - p10_len; \
403 j = p10_len; \
404 p12 = p_r + (j )*ldp; \
405 c12 = c + (j )*ldc; \
406 incc12 = incc; \
407 ldc12 = ldc; \
408 conjc12 = conjc; \
409 \
410 if ( bli_is_hermitian( strucc ) ) \
411 bli_toggle_conj( conjc10 ); \
412 } \
413 \
414 /* Pack to p10. For upper storage, this includes the unstored
415 triangle of c11. */ \
416 PASTEMAC(ch,kername)( conjc10, \
417 p10_dim, \
418 p10_len, \
419 kappa, \
420 c10, incc10, ldc10, \
421 p10, is_p, ldp ); \
422 \
423 /* Pack to p12. For lower storage, this includes the unstored
424 triangle of c11. */ \
425 PASTEMAC(ch,kername)( conjc12, \
426 p12_dim, \
427 p12_len, \
428 kappa, \
429 c12, incc12, ldc12, \
430 p12, is_p, ldp ); \
431 \
432 /* Pack the stored triangle of c11 to p11. c11 is accessed through
   ctype_r pointers, which is why its strides are doubled and its
   imaginary parts are read starting at offset +1. NOTE: the j declared
   in this scope shadows the function-scope j. */ \
433 { \
434 dim_t p11_m = panel_dim; \
435 dim_t p11_n = panel_dim; \
436 inc_t rs_c11 = 2*rs_c; \
437 inc_t cs_c11 = 2*cs_c; \
438 dim_t j = diagoffc_abs; \
439 ctype* c11 = ( ctype* )c + (j )*ldc; \
440 ctype_r* p11 = ( ctype_r* )p_r + (j )*ldp; \
441 ctype_r* c11_r = ( ctype_r* )c11; \
442 ctype_r* c11_i = ( ctype_r* )c11 + 1; \
443 ctype_r* p11_r = ( ctype_r* )p11; \
444 ctype_r* p11_i = ( ctype_r* )p11 + is_p; \
445 ctype_r* alpha_r = one_r; \
446 ctype_r* alpha_i = ( bli_is_conj( conjc ) ? minus_one_r : one_r ); \
447 ctype_r kappa_r = PASTEMAC(ch,real)( *kappa ); \
448 ctype_r kappa_i = PASTEMAC(ch,imag)( *kappa ); \
449 \
450 /* Copy the real part of the stored triangle of c11 to p11_r. */ \
451 PASTEMAC(chr,scal2m)( 0, \
452 BLIS_NONUNIT_DIAG, \
453 uploc, \
454 BLIS_NO_TRANSPOSE, \
455 p11_m, \
456 p11_n, \
457 alpha_r, \
458 c11_r, rs_c11, cs_c11, \
459 p11_r, rs_p, cs_p ); \
460 \
461 /* Copy the imaginary part of the stored triangle of c11 to p11_i,
462 scaling by -1 if conjugation on c was requested. */ \
463 PASTEMAC(chr,scal2m)( 0, \
464 BLIS_NONUNIT_DIAG, \
465 uploc, \
466 BLIS_NO_TRANSPOSE, \
467 p11_m, \
468 p11_n, \
469 alpha_i, \
470 c11_i, rs_c11, cs_c11, \
471 p11_i, rs_p, cs_p ); \
472 \
473 /* If source matrix c is Hermitian, we have to zero out the
474 imaginary components of the diagonal of p11 in case the
475 corresponding elements in c11 were not already zero. */ \
476 if ( bli_is_hermitian( strucc ) ) \
477 { \
478 for ( i = 0; i < p11_m; ++i ) \
479 { \
480 ctype_r* pi11_i = p11_i + (i )*rs_p + (i )*cs_p; \
481 \
482 PASTEMAC(chr,set0s)( *pi11_i ); \
483 } \
484 } \
485 \
486 /* Apply kappa to the part of p11 that corresponds to the stored
487 part of c11 that was copied above. */ \
488 if ( bli_is_upper( uploc ) ) \
489 { \
490 PASTEMAC(ch,scalris_mxn_u)( 0, \
491 p11_m, \
492 p11_n, \
493 &kappa_r, \
494 &kappa_i, \
495 p11_r, \
496 p11_i, rs_p, cs_p ); \
497 } \
498 else \
499 { \
500 PASTEMAC(ch,scalris_mxn_l)( 0, \
501 p11_m, \
502 p11_n, \
503 &kappa_r, \
504 &kappa_i, \
505 p11_r, \
506 p11_i, rs_p, cs_p ); \
507 } \
508 \
509 /* Update the p11 section of the rpi panel. It simply needs
510 to contain the sum of p11_r + p11_i. The loops cover the full
   p11_m x p11_n block, not only the stored triangle. */ \
511 { \
512 ctype_r* p11_rpi = p11_i + is_p; \
513 \
514 for ( j = 0; j < p11_n; ++j ) \
515 for ( i = 0; i < p11_m; ++i ) \
516 { \
517 ctype_r* pi11_r = p11_r + (i )*rs_p + (j )*cs_p; \
518 ctype_r* pi11_i = p11_i + (i )*rs_p + (j )*cs_p; \
519 ctype_r* pi11_rpi = p11_rpi + (i )*rs_p + (j )*cs_p; \
520 \
521 PASTEMAC(chr,add3s)( *pi11_r, \
522 *pi11_i, \
523 *pi11_rpi ); \
524 } \
525 } \
526 } \
527 } \
528 }
530 INSERT_GENTFUNCCO_BASIC( packm_herm_cxk_3m, packm_cxk_3m )
/*
   packm_tri_cxk_3m: packs one micro-panel of a triangular matrix c into p,
   instantiated per datatype through GENTFUNCCO.

   The whole panel is first packed with a single call to the kername pack
   kernel front-end. The packed panel is then adjusted in place, in this
   order:
     - if diagc is unit, the diagonal of the r sub-panel is set to the real
       part of kappa and that of the i sub-panel to the imaginary part,
       after which the diagonal of the rpi sub-panel is recomputed from the
       two;
     - if invdiag is TRUE, each diagonal element (r, i pair) is passed to
       invertris; the rpi sub-panel is not updated in this step;
     - the region on the opposite side of the diagonal from uploc is set to
       zero in all three sub-panels.

   The r, i and rpi sub-panels are addressed through ctype_r pointers at
   offsets 0, is_p and 2*is_p from p.

   Parameters:
     diagoffp       diagonal offset of the panel.
     diagc          unit/non-unit diagonal flag.
     uploc          which triangle of c is stored.
     conjc          conjugation flag passed to the kernel.
     invdiag        whether to invert the packed diagonal.
     m_panel,
     n_panel        dimensions of the panel being packed.
     panel_dim,
     panel_len      panel dimension and length passed to the kernel.
     kappa          scalar passed to the kernel; also written to a unit
                    diagonal.
     c, incc, ldc   source panel and the strides passed to the kernel.
     p, rs_p, cs_p  destination panel and its row/column strides.
     is_p, ldp      offset between the sub-panels of p, and the stride of p
                    along the panel length.
     strucc, schema, m_panel_max, n_panel_max, rs_c, cs_c
                    accepted but not referenced in this routine.
*/
536 #undef GENTFUNCCO
537 #define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \
538 \
539 void PASTEMAC(ch,varname)( \
540 struc_t strucc, \
541 doff_t diagoffp, \
542 diag_t diagc, \
543 uplo_t uploc, \
544 conj_t conjc, \
545 pack_t schema, \
546 bool_t invdiag, \
547 dim_t m_panel, \
548 dim_t n_panel, \
549 dim_t m_panel_max, \
550 dim_t n_panel_max, \
551 dim_t panel_dim, \
552 dim_t panel_len, \
553 ctype* restrict kappa, \
554 ctype* restrict c, inc_t rs_c, inc_t cs_c, \
555 inc_t incc, inc_t ldc, \
556 ctype* restrict p, inc_t rs_p, inc_t cs_p, \
557 inc_t is_p, inc_t ldp \
558 ) \
559 { \
560 /* Pack the panel. */ \
561 PASTEMAC(ch,kername)( conjc, \
562 panel_dim, \
563 panel_len, \
564 kappa, \
565 c, incc, ldc, \
566 p, is_p, ldp ); \
567 \
568 \
569 /* Tweak the panel according to its triangular structure */ \
570 { \
571 ctype_r* p_r = ( ctype_r* )p + 0; \
572 ctype_r* p_i = ( ctype_r* )p + is_p; \
573 ctype_r* p_rpi = ( ctype_r* )p + 2*is_p; \
574 \
575 dim_t j = bli_abs( diagoffp ); \
576 ctype_r* p11_r = p_r + (j )*ldp; \
577 ctype_r* p11_i = p_i + (j )*ldp; \
578 ctype_r* p11_rpi = p_rpi + (j )*ldp; \
579 \
580 dim_t p11_m = m_panel; \
581 dim_t p11_n = n_panel; \
582 \
583 dim_t min_p11_m_n; \
584 /* Shrink whichever dimension the diagonal offset cuts into;
   min_p11_m_n then bounds the diagonal loops below. */ \
585 if ( diagoffp < 0 ) p11_m -= j; \
586 else if ( diagoffp > 0 ) p11_n -= j; \
587 \
588 min_p11_m_n = bli_min( p11_m, p11_n ); \
589 \
590 \
591 /* If the diagonal of c is implicitly unit, explicitly set the
592 diagonal of the packed panel to kappa. */ \
593 if ( bli_is_unit_diag( diagc ) ) \
594 { \
595 ctype_r kappa_r = PASTEMAC(ch,real)( *kappa ); \
596 ctype_r kappa_i = PASTEMAC(ch,imag)( *kappa ); \
597 dim_t i; \
598 \
599 PASTEMAC(chr,setd)( diagoffp, \
600 m_panel, \
601 n_panel, \
602 &kappa_r, \
603 p_r, rs_p, cs_p ); \
604 PASTEMAC(chr,setd)( diagoffp, \
605 m_panel, \
606 n_panel, \
607 &kappa_i, \
608 p_i, rs_p, cs_p ); \
609 \
610 /* Update the diagonal of the p11 section of the rpi panel.
611 It simply needs to contain the sum of diagonals of p11_r
612 and p11_i. */ \
613 for ( i = 0; i < min_p11_m_n; ++i ) \
614 { \
615 ctype_r* pi11_r = p11_r + (i )*rs_p + (i )*cs_p; \
616 ctype_r* pi11_i = p11_i + (i )*rs_p + (i )*cs_p; \
617 ctype_r* pi11_rpi = p11_rpi + (i )*rs_p + (i )*cs_p; \
618 \
619 PASTEMAC(chr,add3s)( *pi11_r, *pi11_i, *pi11_rpi ); \
620 } \
621 } \
622 \
623 /* If requested, invert the diagonal of the packed panel. Note
624 that we do not need to update the rpi panel since inverted
625 diagonals are only needed by trsm, which does not use the
626 p11 section of the rpi panel. */ \
627 if ( invdiag == TRUE ) \
628 { \
629 dim_t i; \
630 \
631 for ( i = 0; i < min_p11_m_n; ++i ) \
632 { \
633 ctype_r* pi11_r = p11_r + (i )*rs_p + (i )*cs_p; \
634 ctype_r* pi11_i = p11_i + (i )*rs_p + (i )*cs_p; \
635 \
636 PASTEMAC(ch,invertris)( *pi11_r, *pi11_i ); \
637 } \
638 } \
639 \
640 /* Set the region opposite the diagonal of p to zero. To do this,
641 we need to reference the "unstored" region on the other side of
642 the diagonal. This amounts to toggling uploc and then shifting
643 the diagonal offset to shrink the newly referenced region (by
644 one diagonal). Note that this zero-filling is not needed for
645 trsm, since the unstored region is not referenced by the trsm
646 micro-kernel; however, zero-filling is needed for trmm, which
647 uses the gemm micro-kernel.
   NOTE(review): bli_shift_diag_offset_to_shrink_uplo presumably updates
   diagoffp in place, so every use of the original offset must come
   before this point -- confirm against the macro definition. */ \
648 { \
649 ctype_r* restrict zero_r = PASTEMAC(chr,0); \
650 uplo_t uplop = uploc; \
651 \
652 bli_toggle_uplo( uplop ); \
653 bli_shift_diag_offset_to_shrink_uplo( uplop, diagoffp ); \
654 \
655 PASTEMAC(chr,setm)( diagoffp, \
656 BLIS_NONUNIT_DIAG, \
657 uplop, \
658 m_panel, \
659 n_panel, \
660 zero_r, \
661 p_r, rs_p, cs_p ); \
662 PASTEMAC(chr,setm)( diagoffp, \
663 BLIS_NONUNIT_DIAG, \
664 uplop, \
665 m_panel, \
666 n_panel, \
667 zero_r, \
668 p_i, rs_p, cs_p ); \
669 PASTEMAC(chr,setm)( diagoffp, \
670 BLIS_NONUNIT_DIAG, \
671 uplop, \
672 m_panel, \
673 n_panel, \
674 zero_r, \
675 p_rpi, rs_p, cs_p ); \
676 } \
677 } \
678 }
680 INSERT_GENTFUNCCO_BASIC( packm_tri_cxk_3m, packm_cxk_3m )