1 /*
3 BLIS
4 An object-based framework for developing high-performance BLAS-like
5 libraries.
7 Copyright (C) 2014, The University of Texas at Austin
9 Redistribution and use in source and binary forms, with or without
10 modification, are permitted provided that the following conditions are
11 met:
12 - Redistributions of source code must retain the above copyright
13 notice, this list of conditions and the following disclaimer.
14 - Redistributions in binary form must reproduce the above copyright
15 notice, this list of conditions and the following disclaimer in the
16 documentation and/or other materials provided with the distribution.
17 - Neither the name of The University of Texas at Austin nor the names
18 of its contributors may be used to endorse or promote products
19 derived from this software without specific prior written permission.
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 */
35 #include "blis.h"
37 #define FUNCPTR_T packm_cxk_fp
39 typedef void (*FUNCPTR_T)(
40 conj_t conja,
41 dim_t panel_len,
42 void* kappa,
43 void* a, inc_t inca, inc_t lda,
44 void* p, inc_t is_p, inc_t ldp
45 );
47 #undef FUNCPTR_ARRAY_LENGTH
48 #define FUNCPTR_ARRAY_LENGTH 32
50 static FUNCPTR_T ftypes[FUNCPTR_ARRAY_LENGTH][BLIS_NUM_FP_TYPES] =
51 {
52 /* micro-panel width = 0 */
53 {
54 NULL, NULL, NULL, NULL,
55 },
56 /* micro-panel width = 1 */
57 {
58 NULL, NULL, NULL, NULL,
59 },
60 /* micro-panel width = 2 */
61 {
62 NULL, BLIS_CPACKM_2XK_3M_KERNEL,
63 NULL, BLIS_ZPACKM_2XK_3M_KERNEL,
64 },
65 /* micro-panel width = 3 */
66 {
67 NULL, NULL, NULL, NULL,
68 },
69 /* micro-panel width = 4 */
70 {
71 NULL, BLIS_CPACKM_4XK_3M_KERNEL,
72 NULL, BLIS_ZPACKM_4XK_3M_KERNEL,
73 },
74 /* micro-panel width = 5 */
75 {
76 NULL, NULL, NULL, NULL,
77 },
78 /* micro-panel width = 6 */
79 {
80 NULL, BLIS_CPACKM_6XK_3M_KERNEL,
81 NULL, BLIS_ZPACKM_6XK_3M_KERNEL,
82 },
83 /* micro-panel width = 7 */
84 {
85 NULL, NULL, NULL, NULL,
86 },
87 /* micro-panel width = 8 */
88 {
89 NULL, BLIS_CPACKM_8XK_3M_KERNEL,
90 NULL, BLIS_ZPACKM_8XK_3M_KERNEL,
91 },
92 #ifdef BLIS_ENABLE_C66X_BUILD // these packs are not used in c66x; allows fatser compilation
93 /* micro-panel width = 9 */
94 {
95 NULL, NULL, NULL, NULL,
96 },
97 /* micro-panel width = 10 */
98 {
99 NULL, NULL, NULL, NULL,
100 },
101 /* micro-panel width = 11 */
102 {
103 NULL, NULL, NULL, NULL,
104 },
105 /* micro-panel width = 12 */
106 {
107 NULL, NULL, NULL, NULL,
108 },
109 /* micro-panel width = 13 */
110 {
111 NULL, NULL, NULL, NULL,
112 },
113 /* micro-panel width = 14 */
114 {
115 NULL, NULL, NULL, NULL,
116 },
117 /* micro-panel width = 15 */
118 {
119 NULL, NULL, NULL, NULL,
120 },
121 /* micro-panel width = 16 */
122 {
123 NULL, NULL, NULL, NULL,
124 },
125 /* micro-panel width = 17 */
126 {
127 NULL, NULL, NULL, NULL,
128 },
129 /* micro-panel width = 18 */
130 {
131 NULL, NULL, NULL, NULL,
132 },
133 /* micro-panel width = 19 */
134 {
135 NULL, NULL, NULL, NULL,
136 },
137 /* micro-panel width = 20 */
138 {
139 NULL, NULL, NULL, NULL,
140 },
141 /* micro-panel width = 21 */
142 {
143 NULL, NULL, NULL, NULL,
144 },
145 /* micro-panel width = 22 */
146 {
147 NULL, NULL, NULL, NULL,
148 },
149 /* micro-panel width = 23 */
150 {
151 NULL, NULL, NULL, NULL,
152 },
153 /* micro-panel width = 24 */
154 {
155 NULL, NULL, NULL, NULL,
156 },
157 /* micro-panel width = 25 */
158 {
159 NULL, NULL, NULL, NULL,
160 },
161 /* micro-panel width = 26 */
162 {
163 NULL, NULL, NULL, NULL,
164 },
165 /* micro-panel width = 27 */
166 {
167 NULL, NULL, NULL, NULL,
168 },
169 /* micro-panel width = 28 */
170 {
171 NULL, NULL, NULL, NULL,
172 },
173 /* micro-panel width = 29 */
174 {
175 NULL, NULL, NULL, NULL,
176 },
177 /* micro-panel width = 30 */
178 {
179 NULL, NULL, NULL, NULL,
180 },
181 /* micro-panel width = 31 */
182 {
183 NULL, NULL, NULL, NULL,
184 },
185 #else
186 /* micro-panel width = 9 */
187 {
188 NULL, NULL, NULL, NULL,
189 },
190 /* micro-panel width = 10 */
191 {
192 NULL, BLIS_CPACKM_10XK_3M_KERNEL,
193 NULL, BLIS_ZPACKM_10XK_3M_KERNEL,
194 },
195 /* micro-panel width = 11 */
196 {
197 NULL, NULL, NULL, NULL,
198 },
199 /* micro-panel width = 12 */
200 {
201 NULL, BLIS_CPACKM_12XK_3M_KERNEL,
202 NULL, BLIS_ZPACKM_12XK_3M_KERNEL,
203 },
204 /* micro-panel width = 13 */
205 {
206 NULL, NULL, NULL, NULL,
207 },
208 /* micro-panel width = 14 */
209 {
210 NULL, BLIS_CPACKM_14XK_3M_KERNEL,
211 NULL, BLIS_ZPACKM_14XK_3M_KERNEL,
212 },
213 /* micro-panel width = 15 */
214 {
215 NULL, NULL, NULL, NULL,
216 },
217 /* micro-panel width = 16 */
218 {
219 NULL, BLIS_CPACKM_16XK_3M_KERNEL,
220 NULL, BLIS_ZPACKM_16XK_3M_KERNEL,
221 },
222 /* micro-panel width = 17 */
223 {
224 NULL, NULL, NULL, NULL,
225 },
226 /* micro-panel width = 18 */
227 {
228 NULL, NULL, NULL, NULL,
229 },
230 /* micro-panel width = 19 */
231 {
232 NULL, NULL, NULL, NULL,
233 },
234 /* micro-panel width = 20 */
235 {
236 NULL, NULL, NULL, NULL,
237 },
238 /* micro-panel width = 21 */
239 {
240 NULL, NULL, NULL, NULL,
241 },
242 /* micro-panel width = 22 */
243 {
244 NULL, NULL, NULL, NULL,
245 },
246 /* micro-panel width = 23 */
247 {
248 NULL, NULL, NULL, NULL,
249 },
250 /* micro-panel width = 24 */
251 {
252 NULL, NULL, NULL, NULL,
253 },
254 /* micro-panel width = 25 */
255 {
256 NULL, NULL, NULL, NULL,
257 },
258 /* micro-panel width = 26 */
259 {
260 NULL, NULL, NULL, NULL,
261 },
262 /* micro-panel width = 27 */
263 {
264 NULL, NULL, NULL, NULL,
265 },
266 /* micro-panel width = 28 */
267 {
268 NULL, NULL, NULL, NULL,
269 },
270 /* micro-panel width = 29 */
271 {
272 NULL, NULL, NULL, NULL,
273 },
274 /* micro-panel width = 30 */
275 {
276 NULL,
277 BLIS_CPACKM_30XK_3M_KERNEL,
278 NULL,
279 BLIS_ZPACKM_30XK_3M_KERNEL,
280 },
281 /* micro-panel width = 31 */
282 {
283 NULL, NULL, NULL, NULL,
284 },
285 #endif
286 };
290 #undef GENTFUNCCO
291 #define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \
292 \
293 void PASTEMAC(ch,varname)( \
294 conj_t conja, \
295 dim_t panel_dim, \
296 dim_t panel_len, \
297 void* kappa, \
298 void* a, inc_t inca, inc_t lda, \
299 void* p, inc_t is_p, inc_t ldp \
300 ) \
301 { \
302 num_t dt; \
303 FUNCPTR_T f; \
304 \
305 /* Acquire the datatype for the current function. */ \
306 dt = PASTEMAC(ch,type); \
307 \
308 /* Index into the array to extract the correct function pointer.
309 If the micro-panel dimension is too big to be within the array of
310 explicitly handled kernels, then we treat that kernel the same
311 as if it were in range but unimplemented. */ \
312 if ( panel_dim < FUNCPTR_ARRAY_LENGTH ) f = ftypes[panel_dim][dt]; \
313 else f = NULL; \
314 \
315 /* If there exists a kernel implementation for the micro-panel dimension
316 provided, we invoke the implementation. Otherwise, we use scal2m. */ \
317 if ( f != NULL ) \
318 { \
319 f( conja, \
320 panel_len, \
321 kappa, \
322 a, inca, lda, \
323 p, is_p, ldp ); \
324 } \
325 else \
326 { \
327 ctype_r* restrict kappa_r = ( ctype_r* )kappa; \
328 ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \
329 ctype_r* restrict a_r = ( ctype_r* )a; \
330 ctype_r* restrict a_i = ( ctype_r* )a + 1; \
331 ctype_r* restrict p_r = ( ctype_r* )p; \
332 ctype_r* restrict p_i = ( ctype_r* )p + is_p; \
333 ctype_r* restrict p_rpi = ( ctype_r* )p + 2*is_p; \
334 const dim_t inca2 = 2*inca; \
335 const dim_t lda2 = 2*lda; \
336 dim_t i, j; \
337 \
338 /* Treat the micro-panel as panel_dim x panel_len and column-stored
339 (unit row stride). */ \
340 \
341 /* NOTE: The loops below are inlined versions of scal2m, but
342 for separated real/imaginary storage. */ \
343 \
344 if ( bli_is_conj( conja ) ) \
345 { \
346 for ( j = 0; j < panel_len; ++j ) \
347 { \
348 for ( i = 0; i < panel_dim; ++i ) \
349 { \
350 ctype_r* restrict alpha11_r = a_r + (i )*inca2 + (j )*lda2; \
351 ctype_r* restrict alpha11_i = a_i + (i )*inca2 + (j )*lda2; \
352 ctype_r* restrict pi11_r = p_r + (i )*1 + (j )*ldp; \
353 ctype_r* restrict pi11_i = p_i + (i )*1 + (j )*ldp; \
354 ctype_r* restrict pi11_rpi = p_rpi + (i )*1 + (j )*ldp; \
355 \
356 PASTEMAC(ch,scal2jri3s)( *kappa_r, \
357 *kappa_i, \
358 *alpha11_r, \
359 *alpha11_i, \
360 *pi11_r, \
361 *pi11_i, \
362 *pi11_rpi ); \
363 } \
364 } \
365 } \
366 else /* if ( bli_is_noconj( conja ) ) */ \
367 { \
368 for ( j = 0; j < panel_len; ++j ) \
369 { \
370 for ( i = 0; i < panel_dim; ++i ) \
371 { \
372 ctype_r* restrict alpha11_r = a_r + (i )*inca2 + (j )*lda2; \
373 ctype_r* restrict alpha11_i = a_i + (i )*inca2 + (j )*lda2; \
374 ctype_r* restrict pi11_r = p_r + (i )*1 + (j )*ldp; \
375 ctype_r* restrict pi11_i = p_i + (i )*1 + (j )*ldp; \
376 ctype_r* restrict pi11_rpi = p_rpi + (i )*1 + (j )*ldp; \
377 \
378 PASTEMAC(ch,scal2ri3s)( *kappa_r, \
379 *kappa_i, \
380 *alpha11_r, \
381 *alpha11_i, \
382 *pi11_r, \
383 *pi11_i, \
384 *pi11_rpi ); \
385 } \
386 } \
387 } \
388 } \
389 }
391 INSERT_GENTFUNCCO_BASIC0( packm_cxk_3m )