1 /*
3 BLIS
4 An object-based framework for developing high-performance BLAS-like
5 libraries.
7 Copyright (C) 2014, The University of Texas at Austin
9 Redistribution and use in source and binary forms, with or without
10 modification, are permitted provided that the following conditions are
11 met:
12 - Redistributions of source code must retain the above copyright
13 notice, this list of conditions and the following disclaimer.
14 - Redistributions in binary form must reproduce the above copyright
15 notice, this list of conditions and the following disclaimer in the
16 documentation and/or other materials provided with the distribution.
17 - Neither the name of The University of Texas at Austin nor the names
18 of its contributors may be used to endorse or promote products
19 derived from this software without specific prior written permission.
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 */
35 #ifndef BLIS_KERNEL_POST_MACRO_DEFS_H
36 #define BLIS_KERNEL_POST_MACRO_DEFS_H
38 /*
39 // -- Define PASTEMAC-friendly kernel function name macros ---------------------
41 //
42 // Level-3
43 //
45 // gemm micro-kernels
47 #define bli_sGEMM_UKERNEL BLIS_SGEMM_UKERNEL
48 #define bli_dGEMM_UKERNEL BLIS_DGEMM_UKERNEL
49 #define bli_cGEMM_UKERNEL BLIS_CGEMM_UKERNEL
50 #define bli_zGEMM_UKERNEL BLIS_ZGEMM_UKERNEL
52 // gemmtrsm_l micro-kernels
54 #define bli_sGEMMTRSM_L_UKERNEL BLIS_SGEMMTRSM_L_UKERNEL
55 #define bli_dGEMMTRSM_L_UKERNEL BLIS_DGEMMTRSM_L_UKERNEL
56 #define bli_cGEMMTRSM_L_UKERNEL BLIS_CGEMMTRSM_L_UKERNEL
57 #define bli_zGEMMTRSM_L_UKERNEL BLIS_ZGEMMTRSM_L_UKERNEL
59 // gemmtrsm_u micro-kernels
61 #define bli_sGEMMTRSM_U_UKERNEL BLIS_SGEMMTRSM_U_UKERNEL
62 #define bli_dGEMMTRSM_U_UKERNEL BLIS_DGEMMTRSM_U_UKERNEL
63 #define bli_cGEMMTRSM_U_UKERNEL BLIS_CGEMMTRSM_U_UKERNEL
64 #define bli_zGEMMTRSM_U_UKERNEL BLIS_ZGEMMTRSM_U_UKERNEL
66 // trsm_l micro-kernels
68 #define bli_sTRSM_L_UKERNEL BLIS_STRSM_L_UKERNEL
69 #define bli_dTRSM_L_UKERNEL BLIS_DTRSM_L_UKERNEL
70 #define bli_cTRSM_L_UKERNEL BLIS_CTRSM_L_UKERNEL
71 #define bli_zTRSM_L_UKERNEL BLIS_ZTRSM_L_UKERNEL
73 // trsm_u micro-kernels
75 #define bli_sTRSM_U_UKERNEL BLIS_STRSM_U_UKERNEL
76 #define bli_dTRSM_U_UKERNEL BLIS_DTRSM_U_UKERNEL
77 #define bli_cTRSM_U_UKERNEL BLIS_CTRSM_U_UKERNEL
78 #define bli_zTRSM_U_UKERNEL BLIS_ZTRSM_U_UKERNEL
80 //
81 // Level-3 4m
82 //
84 // gemm4m micro-kernels
86 #define bli_cGEMM4M_UKERNEL BLIS_CGEMM4M_UKERNEL
87 #define bli_zGEMM4M_UKERNEL BLIS_ZGEMM4M_UKERNEL
89 // gemmtrsm4m_l micro-kernels
91 #define bli_cGEMMTRSM4M_L_UKERNEL BLIS_CGEMMTRSM4M_L_UKERNEL
92 #define bli_zGEMMTRSM4M_L_UKERNEL BLIS_ZGEMMTRSM4M_L_UKERNEL
94 // gemmtrsm4m_u micro-kernels
96 #define bli_cGEMMTRSM4M_U_UKERNEL BLIS_CGEMMTRSM4M_U_UKERNEL
97 #define bli_zGEMMTRSM4M_U_UKERNEL BLIS_ZGEMMTRSM4M_U_UKERNEL
99 // trsm4m_l micro-kernels
101 #define bli_cTRSM4M_L_UKERNEL BLIS_CTRSM4M_L_UKERNEL
102 #define bli_zTRSM4M_L_UKERNEL BLIS_ZTRSM4M_L_UKERNEL
104 // trsm4m_u micro-kernels
106 #define bli_cTRSM4M_U_UKERNEL BLIS_CTRSM4M_U_UKERNEL
107 #define bli_zTRSM4M_U_UKERNEL BLIS_ZTRSM4M_U_UKERNEL
109 //
110 // Level-3 3m
111 //
113 // gemm3m micro-kernels
115 #define bli_cGEMM3M_UKERNEL BLIS_CGEMM3M_UKERNEL
116 #define bli_zGEMM3M_UKERNEL BLIS_ZGEMM3M_UKERNEL
118 // gemmtrsm3m_l micro-kernels
120 #define bli_cGEMMTRSM3M_L_UKERNEL BLIS_CGEMMTRSM3M_L_UKERNEL
121 #define bli_zGEMMTRSM3M_L_UKERNEL BLIS_ZGEMMTRSM3M_L_UKERNEL
123 // gemmtrsm3m_u micro-kernels
125 #define bli_cGEMMTRSM3M_U_UKERNEL BLIS_CGEMMTRSM3M_U_UKERNEL
126 #define bli_zGEMMTRSM3M_U_UKERNEL BLIS_ZGEMMTRSM3M_U_UKERNEL
128 // trsm3m_l micro-kernels
130 #define bli_cTRSM3M_L_UKERNEL BLIS_CTRSM3M_L_UKERNEL
131 #define bli_zTRSM3M_L_UKERNEL BLIS_ZTRSM3M_L_UKERNEL
133 // trsm3m_u micro-kernels
135 #define bli_cTRSM3M_U_UKERNEL BLIS_CTRSM3M_U_UKERNEL
136 #define bli_zTRSM3M_U_UKERNEL BLIS_ZTRSM3M_U_UKERNEL
138 //
139 // Level-1m
140 //
142 // NOTE: We don't need any PASTEMAC-friendly aliases to packm kernel
143 // macros because they are used directly in the initialization of the
144 // function pointer array, rather than via a templatizing wrapper macro.
147 //
148 // Level-1f
149 //
151 // axpy2v kernels
153 #define bli_sssAXPY2V_KERNEL BLIS_SAXPY2V_KERNEL
154 #define bli_dddAXPY2V_KERNEL BLIS_DAXPY2V_KERNEL
155 #define bli_cccAXPY2V_KERNEL BLIS_CAXPY2V_KERNEL
156 #define bli_zzzAXPY2V_KERNEL BLIS_ZAXPY2V_KERNEL
158 // dotaxpyv kernels
160 #define bli_sssDOTAXPYV_KERNEL BLIS_SDOTAXPYV_KERNEL
161 #define bli_dddDOTAXPYV_KERNEL BLIS_DDOTAXPYV_KERNEL
162 #define bli_cccDOTAXPYV_KERNEL BLIS_CDOTAXPYV_KERNEL
163 #define bli_zzzDOTAXPYV_KERNEL BLIS_ZDOTAXPYV_KERNEL
165 // axpyf kernels
167 #define bli_sssAXPYF_KERNEL BLIS_SAXPYF_KERNEL
168 #define bli_dddAXPYF_KERNEL BLIS_DAXPYF_KERNEL
169 #define bli_cccAXPYF_KERNEL BLIS_CAXPYF_KERNEL
170 #define bli_zzzAXPYF_KERNEL BLIS_ZAXPYF_KERNEL
172 // dotxf kernels
174 #define bli_sssDOTXF_KERNEL BLIS_SDOTXF_KERNEL
175 #define bli_dddDOTXF_KERNEL BLIS_DDOTXF_KERNEL
176 #define bli_cccDOTXF_KERNEL BLIS_CDOTXF_KERNEL
177 #define bli_zzzDOTXF_KERNEL BLIS_ZDOTXF_KERNEL
179 // dotxaxpyf kernels
181 #define bli_sssDOTXAXPYF_KERNEL BLIS_SDOTXAXPYF_KERNEL
182 #define bli_dddDOTXAXPYF_KERNEL BLIS_DDOTXAXPYF_KERNEL
183 #define bli_cccDOTXAXPYF_KERNEL BLIS_CDOTXAXPYF_KERNEL
184 #define bli_zzzDOTXAXPYF_KERNEL BLIS_ZDOTXAXPYF_KERNEL
187 //
188 // Level-1v
189 //
191 // addv kernels
193 #define bli_ssADDV_KERNEL BLIS_SADDV_KERNEL
194 #define bli_ddADDV_KERNEL BLIS_DADDV_KERNEL
195 #define bli_ccADDV_KERNEL BLIS_CADDV_KERNEL
196 #define bli_zzADDV_KERNEL BLIS_ZADDV_KERNEL
198 // axpyv kernels
200 #define bli_sssAXPYV_KERNEL BLIS_SAXPYV_KERNEL
201 #define bli_dddAXPYV_KERNEL BLIS_DAXPYV_KERNEL
202 #define bli_cccAXPYV_KERNEL BLIS_CAXPYV_KERNEL
203 #define bli_zzzAXPYV_KERNEL BLIS_ZAXPYV_KERNEL
205 // copyv kernels
207 #define bli_ssCOPYV_KERNEL BLIS_SCOPYV_KERNEL
208 #define bli_ddCOPYV_KERNEL BLIS_DCOPYV_KERNEL
209 #define bli_ccCOPYV_KERNEL BLIS_CCOPYV_KERNEL
210 #define bli_zzCOPYV_KERNEL BLIS_ZCOPYV_KERNEL
212 // dotv kernels
214 #define bli_sssDOTV_KERNEL BLIS_SDOTV_KERNEL
215 #define bli_dddDOTV_KERNEL BLIS_DDOTV_KERNEL
216 #define bli_cccDOTV_KERNEL BLIS_CDOTV_KERNEL
217 #define bli_zzzDOTV_KERNEL BLIS_ZDOTV_KERNEL
219 // dotxv kernels
221 #define bli_sssDOTXV_KERNEL BLIS_SDOTXV_KERNEL
222 #define bli_dddDOTXV_KERNEL BLIS_DDOTXV_KERNEL
223 #define bli_cccDOTXV_KERNEL BLIS_CDOTXV_KERNEL
224 #define bli_zzzDOTXV_KERNEL BLIS_ZDOTXV_KERNEL
226 // invertv kernels
228 #define bli_sINVERTV_KERNEL BLIS_SINVERTV_KERNEL
229 #define bli_dINVERTV_KERNEL BLIS_DINVERTV_KERNEL
230 #define bli_cINVERTV_KERNEL BLIS_CINVERTV_KERNEL
231 #define bli_zINVERTV_KERNEL BLIS_ZINVERTV_KERNEL
233 // scal2v kernels
235 #define bli_sssSCAL2V_KERNEL BLIS_SSCAL2V_KERNEL
236 #define bli_dddSCAL2V_KERNEL BLIS_DSCAL2V_KERNEL
237 #define bli_cccSCAL2V_KERNEL BLIS_CSCAL2V_KERNEL
238 #define bli_zzzSCAL2V_KERNEL BLIS_ZSCAL2V_KERNEL
240 // scalv kernels
242 #define bli_ssSCALV_KERNEL BLIS_SSCALV_KERNEL
243 #define bli_ddSCALV_KERNEL BLIS_DSCALV_KERNEL
244 #define bli_ccSCALV_KERNEL BLIS_CSCALV_KERNEL
245 #define bli_zzSCALV_KERNEL BLIS_ZSCALV_KERNEL
247 // setv kernels
249 #define bli_ssSETV_KERNEL BLIS_SSETV_KERNEL
250 #define bli_ddSETV_KERNEL BLIS_DSETV_KERNEL
251 #define bli_ccSETV_KERNEL BLIS_CSETV_KERNEL
252 #define bli_zzSETV_KERNEL BLIS_ZSETV_KERNEL
254 // subv kernels
256 #define bli_ssSUBV_KERNEL BLIS_SSUBV_KERNEL
257 #define bli_ddSUBV_KERNEL BLIS_DSUBV_KERNEL
258 #define bli_ccSUBV_KERNEL BLIS_CSUBV_KERNEL
259 #define bli_zzSUBV_KERNEL BLIS_ZSUBV_KERNEL
261 // swapv kernels
263 #define bli_ssSWAPV_KERNEL BLIS_SSWAPV_KERNEL
264 #define bli_ddSWAPV_KERNEL BLIS_DSWAPV_KERNEL
265 #define bli_ccSWAPV_KERNEL BLIS_CSWAPV_KERNEL
266 #define bli_zzSWAPV_KERNEL BLIS_ZSWAPV_KERNEL
267 */
270 // -- Maximum register blocksize search ----------------------------------------
272 // The macro-kernels oftentimes need to statically allocate a temporary
273 // MR x NR micro-tile of C. This micro-tile must be sized such that it will
274 // work for both native and 4m/3m implementations, since the user can switch
275 // between them at runtime. In order to facilitate the sizing of those
276 // micro-tiles, we must determine how large the register blocksizes would
277 // need to be to accommodate both native and 4m/3m-based complex
278 // micro-kernels. For real datatypes, the maximum is never larger than the
279 // actual s and d register blocksizes. However, for complex datatypes, the
280 // "native" register blocksizes may differ from the "virtual" register
281 // blocksizes used by the 4m/3m implementations. Usually, it is the register
282 // blocksizes used for 4m/3m-based complex micro-kernels that would be
283 // larger, and thus determine the maximum for c and z datatypes. But, we
284 // prefer not to assume this, therefore, we always take the larger of the
285 // two values.
// Register blocksizes used for the 4m variants of the complex datatypes.
// Each one is an alias for the register blocksize of the corresponding
// real datatype: the c values map to the s values, and the z values map to
// the d values.
// NOTE(review): BLIS_DEFAULT_MR_S, BLIS_DEFAULT_NR_S, BLIS_DEFAULT_MR_D and
// BLIS_DEFAULT_NR_D are not defined in this file; presumably an earlier
// configuration header provides them -- confirm.
287 #define BLIS_DEFAULT_4M_MR_C BLIS_DEFAULT_MR_S
288 #define BLIS_DEFAULT_4M_NR_C BLIS_DEFAULT_NR_S
289 #define BLIS_DEFAULT_4M_MR_Z BLIS_DEFAULT_MR_D
290 #define BLIS_DEFAULT_4M_NR_Z BLIS_DEFAULT_NR_D
292 //
293 // Find the largest register blocksize MR.
294 //
// Purpose: define BLIS_MAX_DEFAULT_MR_S, _D, _C and _Z. For s and d the
// maximum is defined directly as the native MR; no comparison is made.
296 #define BLIS_MAX_DEFAULT_MR_S BLIS_DEFAULT_MR_S
297 #define BLIS_MAX_DEFAULT_MR_D BLIS_DEFAULT_MR_D
299 // NOTE: 4m and 3m register blocksizes are assumed to be equal. Thus,
300 // we only inspect the 4m values.
302 // c: Choose between the regular and 4m/3m blocksize.
// The maximum starts out as the native MR. When the #if is evaluated,
// BLIS_MAX_DEFAULT_MR_C therefore still expands to BLIS_DEFAULT_MR_C, so the
// test compares the 4m value against the native one. The maximum is
// redefined only when the 4m value is strictly greater; on a tie the native
// name is kept.
// NOTE: as with any #if, a name that is not defined as a macro at this
// point evaluates as 0, so a missing blocksize definition does not produce
// an error here.
303 #define BLIS_MAX_DEFAULT_MR_C BLIS_DEFAULT_MR_C
304 #if BLIS_DEFAULT_4M_MR_C > BLIS_MAX_DEFAULT_MR_C
305 #undef BLIS_MAX_DEFAULT_MR_C
306 #define BLIS_MAX_DEFAULT_MR_C BLIS_DEFAULT_4M_MR_C
307 #endif
309 // z: Choose between the regular and 4m/3m blocksize.
// Same selection as for c, applied to the z blocksizes.
310 #define BLIS_MAX_DEFAULT_MR_Z BLIS_DEFAULT_MR_Z
311 #if BLIS_DEFAULT_4M_MR_Z > BLIS_MAX_DEFAULT_MR_Z
312 #undef BLIS_MAX_DEFAULT_MR_Z
313 #define BLIS_MAX_DEFAULT_MR_Z BLIS_DEFAULT_4M_MR_Z
314 #endif
316 //
317 // Find the largest register blocksize NR.
318 //
// Purpose: define BLIS_MAX_DEFAULT_NR_S, _D, _C and _Z. For s and d the
// maximum is defined directly as the native NR; no comparison is made.
320 #define BLIS_MAX_DEFAULT_NR_S BLIS_DEFAULT_NR_S
321 #define BLIS_MAX_DEFAULT_NR_D BLIS_DEFAULT_NR_D
323 // NOTE: 4m and 3m register blocksizes are assumed to be equal. Thus,
324 // we only inspect the 4m values.
326 // c: Choose between the regular and 4m/3m blocksize.
// The maximum starts out as the native NR. When the #if is evaluated,
// BLIS_MAX_DEFAULT_NR_C therefore still expands to BLIS_DEFAULT_NR_C, so the
// test compares the 4m value against the native one. The maximum is
// redefined only when the 4m value is strictly greater; on a tie the native
// name is kept.
// NOTE: as with any #if, a name that is not defined as a macro at this
// point evaluates as 0, so a missing blocksize definition does not produce
// an error here.
327 #define BLIS_MAX_DEFAULT_NR_C BLIS_DEFAULT_NR_C
328 #if BLIS_DEFAULT_4M_NR_C > BLIS_MAX_DEFAULT_NR_C
329 #undef BLIS_MAX_DEFAULT_NR_C
330 #define BLIS_MAX_DEFAULT_NR_C BLIS_DEFAULT_4M_NR_C
331 #endif
333 // z: Choose between the regular and 4m/3m blocksize.
// Same selection as for c, applied to the z blocksizes.
334 #define BLIS_MAX_DEFAULT_NR_Z BLIS_DEFAULT_NR_Z
335 #if BLIS_DEFAULT_4M_NR_Z > BLIS_MAX_DEFAULT_NR_Z
336 #undef BLIS_MAX_DEFAULT_NR_Z
337 #define BLIS_MAX_DEFAULT_NR_Z BLIS_DEFAULT_4M_NR_Z
338 #endif
341 // -- Abbreviated macros -------------------------------------------------------
343 // Here, we shorten the maximum blocksizes found above so that they can be
344 // derived via the PASTEMAC macro.
// Each alias is named bli_<ch>maxmr or bli_<ch>maxnr, where <ch> is one of
// the datatype characters s, d, c, z, and expands to the matching
// BLIS_MAX_DEFAULT_MR_<CH> or BLIS_MAX_DEFAULT_NR_<CH> macro.
// NOTE(review): PASTEMAC is not defined in this file; presumably it builds
// an identifier from "bli_", a datatype character and an operation name --
// confirm against its definition.
346 // Maximum MR blocksizes
348 #define bli_smaxmr BLIS_MAX_DEFAULT_MR_S
349 #define bli_dmaxmr BLIS_MAX_DEFAULT_MR_D
350 #define bli_cmaxmr BLIS_MAX_DEFAULT_MR_C
351 #define bli_zmaxmr BLIS_MAX_DEFAULT_MR_Z
353 // Maximum NR blocksizes
355 #define bli_smaxnr BLIS_MAX_DEFAULT_NR_S
356 #define bli_dmaxnr BLIS_MAX_DEFAULT_NR_D
357 #define bli_cmaxnr BLIS_MAX_DEFAULT_NR_C
358 #define bli_zmaxnr BLIS_MAX_DEFAULT_NR_Z
361 #endif