1 /*
3 BLIS
4 An object-based framework for developing high-performance BLAS-like
5 libraries.
7 Copyright (C) 2014, The University of Texas at Austin
9 Redistribution and use in source and binary forms, with or without
10 modification, are permitted provided that the following conditions are
11 met:
12 - Redistributions of source code must retain the above copyright
13 notice, this list of conditions and the following disclaimer.
14 - Redistributions in binary form must reproduce the above copyright
15 notice, this list of conditions and the following disclaimer in the
16 documentation and/or other materials provided with the distribution.
17 - Neither the name of The University of Texas at Austin nor the names
18 of its contributors may be used to endorse or promote products
19 derived from this software without specific prior written permission.
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 */
35 #include "blis.h"
37 #undef GENTFUNCCO
38 #define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \
39 \
40 void PASTEMAC(ch,varname)( \
41 struc_t strucc, \
42 doff_t diagoffc, \
43 diag_t diagc, \
44 uplo_t uploc, \
45 conj_t conjc, \
46 pack_t schema, \
47 bool_t invdiag, \
48 dim_t m_panel, \
49 dim_t n_panel, \
50 dim_t m_panel_max, \
51 dim_t n_panel_max, \
52 ctype* restrict kappa, \
53 ctype* restrict c, inc_t rs_c, inc_t cs_c, \
54 ctype* restrict p, inc_t rs_p, inc_t cs_p \
55 ) \
56 { \
57 dim_t panel_dim; \
58 dim_t panel_len; \
59 /*dim_t panel_len_max; */\
60 inc_t incc, ldc; \
61 inc_t ldp; \
62 \
63 \
64 /* Determine the dimensions and relative strides of the micro-panel
65 based on its pack schema. */ \
66 if ( bli_is_col_packed( schema ) ) \
67 { \
68 /* Prepare to pack to row-stored column panel. */ \
69 panel_dim = n_panel; \
70 panel_len = m_panel; \
71 /*panel_len_max = m_panel_max;*/ \
72 incc = cs_c; \
73 ldc = rs_c; \
74 ldp = rs_p; \
75 } \
76 else /* if ( bli_is_row_packed( schema ) ) */ \
77 { \
78 /* Prepare to pack to column-stored row panel. */ \
79 panel_dim = m_panel; \
80 panel_len = n_panel; \
81 /*panel_len_max = n_panel_max;*/ \
82 incc = rs_c; \
83 ldc = cs_c; \
84 ldp = cs_p; \
85 } \
86 \
87 \
88 /* Handle micro-panel packing based on the structure of the matrix
89 being packed. */ \
90 if ( bli_is_general( strucc ) ) \
91 { \
92 /* For micro-panels of general matrices, we can call the pack
93 kernel front-end directly. */ \
94 PASTEMAC(ch,kername)( conjc, \
95 schema, \
96 panel_dim, \
97 panel_len, \
98 kappa, \
99 c, incc, ldc, \
100 p, ldp ); \
101 } \
102 else if ( bli_is_herm_or_symm( strucc ) ) \
103 { \
104 /* Call a helper function for micro-panels of Hermitian/symmetric
105 matrices. */ \
106 PASTEMAC(ch,packm_herm_cxk_rih)( strucc, \
107 diagoffc, \
108 uploc, \
109 conjc, \
110 schema, \
111 m_panel, \
112 n_panel, \
113 m_panel_max, \
114 n_panel_max, \
115 panel_dim, \
116 panel_len, \
117 kappa, \
118 c, rs_c, cs_c, \
119 incc, ldc, \
120 p, rs_p, cs_p, \
121 ldp ); \
122 } \
123 else /* ( bli_is_triangular( strucc ) ) */ \
124 { \
125 /* Call a helper function for micro-panels of triangular
126 matrices. */ \
127 PASTEMAC(ch,packm_tri_cxk_rih)( strucc, \
128 diagoffc, \
129 diagc, \
130 uploc, \
131 conjc, \
132 schema, \
133 invdiag, \
134 m_panel, \
135 n_panel, \
136 m_panel_max, \
137 n_panel_max, \
138 panel_dim, \
139 panel_len, \
140 kappa, \
141 c, rs_c, cs_c, \
142 incc, ldc, \
143 p, rs_p, cs_p, \
144 ldp ); \
145 } \
146 \
147 \
148 /* The packed memory region was acquired/allocated with "aligned"
149 dimensions (ie: dimensions that were possibly inflated up to a
150 multiple). When these dimension are inflated, it creates empty
151 regions along the bottom and/or right edges of the matrix. If
152 either region exists, we set them to zero. This allows the
153 micro-kernel to remain simple since it does not need to support
154 different register blockings for the edge cases. */ \
155 if ( m_panel != m_panel_max ) \
156 { \
157 ctype_r* restrict zero_r = PASTEMAC(chr,0); \
158 dim_t i = m_panel; \
159 dim_t m_edge = m_panel_max - i; \
160 dim_t n_edge = n_panel_max; \
161 ctype_r* p_edge_r = ( ctype_r* )p + (i )*rs_p; \
162 \
163 PASTEMAC(chr,setm)( 0, \
164 BLIS_NONUNIT_DIAG, \
165 BLIS_DENSE, \
166 m_edge, \
167 n_edge, \
168 zero_r, \
169 p_edge_r, rs_p, cs_p ); \
170 } \
171 \
172 if ( n_panel != n_panel_max ) \
173 { \
174 ctype_r* restrict zero_r = PASTEMAC(chr,0); \
175 dim_t j = n_panel; \
176 dim_t m_edge = m_panel_max; \
177 dim_t n_edge = n_panel_max - j; \
178 ctype_r* p_edge_r = ( ctype_r* )p + (j )*cs_p; \
179 \
180 PASTEMAC(chr,setm)( 0, \
181 BLIS_NONUNIT_DIAG, \
182 BLIS_DENSE, \
183 m_edge, \
184 n_edge, \
185 zero_r, \
186 p_edge_r, rs_p, cs_p ); \
187 } \
188 \
189 \
190 if ( bli_is_triangular( strucc ) ) \
191 { \
192 /* If this panel is an edge case in both panel dimension and length,
193 then it must be a bottom-right corner case. Set the part of the
194 diagonal that extends into the zero-padded region to identity.
195 NOTE: This is actually only necessary when packing for trsm, as
196 it helps prevent NaNs and Infs from creeping into the computation.
197 However, we set the region to identity for trmm as well. Those
198 1.0's end up getting muliplied by the 0.0's in the zero-padded
199 region of the other matrix, so there is no harm in this. */ \
200 if ( m_panel != m_panel_max && \
201 n_panel != n_panel_max ) \
202 { \
203 /* We don't need this case if we aren't supporting trsm.
204 Why? Because trmm's packm control tree node should be
205 using k dimension multiples of 1 (kr == 1), which means
206 there will never be zero padding at the far end of a
207 micro-panel. */ \
208 } \
209 } \
210 \
211 \
212 /*
213 { \
214 if ( bli_is_col_packed( schema ) ) \
215 PASTEMAC(chr,fprintm)( stdout, "packm_struc_cxk_rih: bp copied", m_panel_max, n_panel_max, \
216 ( ctype_r* )p, rs_p, cs_p, "%4.1f", "" ); \
217 else if ( bli_is_row_packed( schema ) ) \
218 PASTEMAC(chr,fprintm)( stdout, "packm_struc_cxk_rih: ap copied", m_panel_max, n_panel_max, \
219 ( ctype_r* )p, rs_p, cs_p, "%4.1f", "" ); \
220 } \
221 */ \
222 \
223 \
224 }
226 INSERT_GENTFUNCCO_BASIC( packm_struc_cxk_rih, packm_cxk_rih )
231 #undef GENTFUNCCO
232 #define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \
233 \
234 void PASTEMAC(ch,varname)( \
235 struc_t strucc, \
236 doff_t diagoffc, \
237 uplo_t uploc, \
238 conj_t conjc, \
239 pack_t schema, \
240 dim_t m_panel, \
241 dim_t n_panel, \
242 dim_t m_panel_max, \
243 dim_t n_panel_max, \
244 dim_t panel_dim, \
245 dim_t panel_len, \
246 ctype* restrict kappa, \
247 ctype* restrict c, inc_t rs_c, inc_t cs_c, \
248 inc_t incc, inc_t ldc, \
249 ctype* restrict p, inc_t rs_p, inc_t cs_p, \
250 inc_t ldp \
251 ) \
252 { \
253 bool_t row_stored; \
254 bool_t col_stored; \
255 doff_t diagoffc_abs; \
256 dim_t j; \
257 \
258 \
259 /* Create flags to incidate row or column storage. Note that the
260 schema bit that encodes row or column is describing the form of
261 micro-panel, not the storage in the micro-panel. Hence the
262 mismatch in "row" and "column" semantics. */ \
263 row_stored = bli_is_col_packed( schema ); \
264 col_stored = bli_is_row_packed( schema ); \
265 \
266 \
267 /* Handle the case where the micro-panel does NOT intersect the
268 diagonal separately from the case where it does intersect. */ \
269 if ( !bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) \
270 { \
271 /* If the current panel is unstored, we need to make a few
272 adjustments so we refer to the data where it is actually
273 stored, also taking conjugation into account. (Note this
274 implicitly assumes we are operating on a dense panel
275 within a larger symmetric or Hermitian matrix, since a
276 general matrix would not contain any unstored region.) */ \
277 if ( bli_is_unstored_subpart_n( diagoffc, uploc, m_panel, n_panel ) ) \
278 { \
279 c = c + diagoffc * ( doff_t )cs_c + \
280 -diagoffc * ( doff_t )rs_c; \
281 bli_swap_incs( incc, ldc ); \
282 \
283 if ( bli_is_hermitian( strucc ) ) \
284 bli_toggle_conj( conjc ); \
285 } \
286 \
287 /* Pack the full panel. */ \
288 PASTEMAC(ch,kername)( conjc, \
289 schema, \
290 panel_dim, \
291 panel_len, \
292 kappa, \
293 c, incc, ldc, \
294 p, ldp ); \
295 } \
296 else /* if ( bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) */ \
297 { \
298 ctype_r* restrict p_r = ( ctype_r* )p; \
299 \
300 ctype* restrict c10; \
301 ctype_r* restrict p10; \
302 dim_t p10_dim, p10_len; \
303 inc_t incc10, ldc10; \
304 doff_t diagoffc10; \
305 conj_t conjc10; \
306 \
307 ctype* restrict c12; \
308 ctype_r* restrict p12; \
309 dim_t p12_dim, p12_len; \
310 inc_t incc12, ldc12; \
311 doff_t diagoffc12; \
312 conj_t conjc12; \
313 \
314 /* Sanity check. Diagonals should not intersect the short end of
315 a micro-panel. If they do, then somehow the constraints on
316 cache blocksizes being a whole multiple of the register
317 blocksizes was somehow violated. */ \
318 if ( ( col_stored && diagoffc < 0 ) || \
319 ( row_stored && diagoffc > 0 ) ) \
320 bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
321 \
322 diagoffc_abs = bli_abs( diagoffc ); \
323 \
324 if ( ( row_stored && bli_is_upper( uploc ) ) || \
325 ( col_stored && bli_is_lower( uploc ) ) ) \
326 { \
327 p10_dim = panel_dim; \
328 p10_len = diagoffc_abs; \
329 p10 = p_r; \
330 c10 = c; \
331 incc10 = incc; \
332 ldc10 = ldc; \
333 conjc10 = conjc; \
334 \
335 p12_dim = panel_dim; \
336 p12_len = panel_len - p10_len; \
337 j = p10_len; \
338 diagoffc12 = diagoffc_abs - j; \
339 p12 = p_r + (j )*ldp; \
340 c12 = c + (j )*ldc; \
341 c12 = c12 + diagoffc12 * ( doff_t )cs_c + \
342 -diagoffc12 * ( doff_t )rs_c; \
343 incc12 = ldc; \
344 ldc12 = incc; \
345 conjc12 = conjc; \
346 \
347 if ( bli_is_hermitian( strucc ) ) \
348 bli_toggle_conj( conjc12 ); \
349 } \
350 else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \
351 ( col_stored && bli_is_upper( uploc ) ) ) */ \
352 { \
353 p10_dim = panel_dim; \
354 p10_len = diagoffc_abs + panel_dim; \
355 diagoffc10 = diagoffc; \
356 p10 = p_r; \
357 c10 = c; \
358 c10 = c10 + diagoffc10 * ( doff_t )cs_c + \
359 -diagoffc10 * ( doff_t )rs_c; \
360 incc10 = ldc; \
361 ldc10 = incc; \
362 conjc10 = conjc; \
363 \
364 p12_dim = panel_dim; \
365 p12_len = panel_len - p10_len; \
366 j = p10_len; \
367 p12 = p_r + (j )*ldp; \
368 c12 = c + (j )*ldc; \
369 incc12 = incc; \
370 ldc12 = ldc; \
371 conjc12 = conjc; \
372 \
373 if ( bli_is_hermitian( strucc ) ) \
374 bli_toggle_conj( conjc10 ); \
375 } \
376 \
377 /* Pack to p10. For upper storage, this includes the unstored
378 triangle of c11. */ \
379 PASTEMAC(ch,kername)( conjc10, \
380 schema, \
381 p10_dim, \
382 p10_len, \
383 kappa, \
384 c10, incc10, ldc10, \
385 p10, ldp ); \
386 \
387 /* Pack to p12. For lower storage, this includes the unstored
388 triangle of c11. */ \
389 PASTEMAC(ch,kername)( conjc12, \
390 schema, \
391 p12_dim, \
392 p12_len, \
393 kappa, \
394 c12, incc12, ldc12, \
395 p12, ldp ); \
396 \
397 /* Pack the stored triangle of c11 to p11. */ \
398 { \
399 dim_t j = diagoffc_abs; \
400 ctype_r* restrict p_r = ( ctype_r* )p; \
401 ctype* restrict c11 = c + (j )*ldc; \
402 ctype_r* restrict p11_r = p_r + (j )*ldp; \
403 \
404 PASTEMAC(ch,scal2rihs_mxn_uplo)( schema, \
405 uploc, \
406 conjc, \
407 panel_dim, \
408 kappa, \
409 c11, rs_c, cs_c, \
410 p11_r, rs_p, cs_p ); \
411 \
412 /* If we are packing a micro-panel with Hermitian structure,
413 we must take special care of the diagonal. Now, if kappa
414 were guaranteed to be unit, all we would need to do is
415 explicitly zero out the imaginary part of the diagonal of
416 p11, in case the diagonal of the source matrix contained
417 garbage (non-zero) imaginary values. HOWEVER, since kappa
418 can be non-unit, things become a little more complicated.
419 In general, we must re-apply the kappa scalar to ONLY the
420 real part of the diagonal of the source matrix and save
421 the result to the diagonal of p11. */ \
422 if ( bli_is_hermitian( strucc ) ) \
423 { \
424 PASTEMAC3(ch,chr,ch,scal2rihs_mxn_diag)( schema, \
425 panel_dim, \
426 panel_dim, \
427 kappa, \
428 c11, rs_c, cs_c, \
429 p11_r, rs_p, cs_p ); \
430 } \
431 \
432 /*
433 PASTEMAC(chr,fprintm)( stdout, "packm_herm_cxk: ap_r copied", m_panel_max, n_panel_max, \
434 p_r + 0*is_p, rs_p, cs_p, "%4.1f", "" ); \
435 PASTEMAC(chr,fprintm)( stdout, "packm_herm_cxk: ap_i copied", m_panel_max, n_panel_max, \
436 p_r + 1*is_p, rs_p, cs_p, "%4.1f", "" ); \
437 */ \
438 } \
439 } \
440 }
442 INSERT_GENTFUNCCO_BASIC( packm_herm_cxk_rih, packm_cxk_rih )
448 #undef GENTFUNCCO
449 #define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \
450 \
451 void PASTEMAC(ch,varname)( \
452 struc_t strucc, \
453 doff_t diagoffp, \
454 diag_t diagc, \
455 uplo_t uploc, \
456 conj_t conjc, \
457 pack_t schema, \
458 bool_t invdiag, \
459 dim_t m_panel, \
460 dim_t n_panel, \
461 dim_t m_panel_max, \
462 dim_t n_panel_max, \
463 dim_t panel_dim, \
464 dim_t panel_len, \
465 ctype* restrict kappa, \
466 ctype* restrict c, inc_t rs_c, inc_t cs_c, \
467 inc_t incc, inc_t ldc, \
468 ctype* restrict p, inc_t rs_p, inc_t cs_p, \
469 inc_t ldp \
470 ) \
471 { \
472 /* Pack the panel. */ \
473 PASTEMAC(ch,kername)( conjc, \
474 schema, \
475 panel_dim, \
476 panel_len, \
477 kappa, \
478 c, incc, ldc, \
479 p, ldp ); \
480 \
481 \
482 /* Tweak the panel according to its triangular structure */ \
483 { \
484 ctype_r* p_r = ( ctype_r* )p; \
485 \
486 dim_t j = bli_abs( diagoffp ); \
487 ctype_r* p11_r = p_r + (j )*ldp; \
488 \
489 /* If the diagonal of c is implicitly unit, explicitly set the
490 the diagonal of the packed panel to kappa. */ \
491 if ( bli_is_unit_diag( diagc ) ) \
492 { \
493 PASTEMAC(ch,setrihs_mxn_diag)( schema, \
494 panel_dim, \
495 panel_dim, \
496 kappa, \
497 p11_r, rs_p, cs_p ); \
498 } \
499 \
500 \
501 /* If requested, invert the diagonal of the packed panel. */ \
502 if ( invdiag == TRUE ) \
503 { \
504 /* We don't need this case if we aren't supporting trsm. */ \
505 } \
506 \
507 \
508 /* Set the region opposite the diagonal of p to zero. To do this,
509 we need to reference the "unstored" region on the other side of
510 the diagonal. This amounts to toggling uploc and then shifting
511 the diagonal offset to shrink the newly referenced region (by
512 one diagonal). */ \
513 { \
514 ctype_r* restrict zero_r = PASTEMAC(chr,0); \
515 uplo_t uplop = uploc; \
516 \
517 bli_toggle_uplo( uplop ); \
518 bli_shift_diag_offset_to_shrink_uplo( uplop, diagoffp ); \
519 \
520 PASTEMAC(chr,setm)( diagoffp, \
521 BLIS_NONUNIT_DIAG, \
522 uplop, \
523 m_panel, \
524 n_panel, \
525 zero_r, \
526 p_r, rs_p, cs_p ); \
527 } \
528 } \
529 }
531 INSERT_GENTFUNCCO_BASIC( packm_tri_cxk_rih, packm_cxk_rih )