/******************************************************************************
 * Copyright (c) 2013-2015, Texas Instruments Incorporated - http://www.ti.com/
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Texas Instruments Incorporated nor the
 *    names of its contributors may be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 * THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
#include <stdio.h>
#include "../../cblas/include/cblas.h"
#include "blis.h"
#define DEVICE_K2H

#include <dsp_c.h>

#define getNextMultiple(x, y) ( ( ((x)+(y)-1)/(y) )* (y) )
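
/*
 * Worked example (illustrative only): assuming a 128-byte BLIS_CACHE_LINE_SIZE,
 * getNextMultiple(0x00F00005, 128) evaluates to
 * ((0x00F00005 + 127) / 128) * 128 = 0x00F00080,
 * i.e. the address is rounded up to the next cache-line boundary.
 * A value that is already a multiple of y is returned unchanged.
 */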
// The L1 scratch buffer location is hardwired here
#define L1_BUF_LOC 0x00F00000

// Note: these pool pointers must be set before calling any of the functions that use them
char *pool_mk_mem_L1;
char *pool_kn_mem_L1;
char *pool_mn_mem_L1;

char *pool_mk_mem_L2;
char *pool_kn_mem_L2;
char *pool_mn_mem_L2;

char *pool_mk_mem_L3;
char *pool_kn_mem_L3;
char *pool_mn_mem_L3;
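
/*
 * The nine pointers above carve each memory level (L1 SRAM, L2 SRAM, and the
 * caller-supplied L3 buffer) into three BLIS packing pools. The mk/kn/mn
 * prefixes presumably correspond to the packed A (m x k), packed B (k x n)
 * and C (m x n) work buffers used by the blocked level-3 kernels; this is an
 * interpretation of the naming, not something stated elsewhere in this file.
 */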
void ti_bli_init_dsp(char *l3_buf, char *l2_buf)
{
    pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
    pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
    pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);

    pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf, BLIS_CACHE_LINE_SIZE);
    pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
    pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);

    pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
    pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
    pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);

    bli_init();
}
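
/*
 * Usage sketch (illustrative, not part of this file): a DSP-side application
 * is expected to hand ti_bli_init_dsp() a large shared buffer for the L3
 * pools and an on-chip L2 scratch buffer before issuing any of the facade
 * calls below. The buffer names here are hypothetical and only illustrate
 * the calling order:
 *
 *   extern char msmc_scratch[];   // hypothetical L3 scratch area
 *   extern char l2_scratch[];     // hypothetical L2 SRAM scratch area
 *
 *   ti_bli_init_dsp(msmc_scratch, l2_scratch);
 *   // ... any number of cblas_*_facade() calls ...
 *   ti_bli_finalize_dsp();
 */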

void ti_bli_finalize_dsp(void)
{
    bli_finalize();
}

void cblas_caxpy_facade(const int N, const void *alpha, const void *X, const int incX, void *Y, const int incY)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    cblas_caxpy(N, alpha, X, incX, Y, incY);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}
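
/*
 * Pattern used by the level-1/level-2 facades below: each core in the OpenMP
 * team flushes its L2 cache and enlarges it to 512 KB for the duration of the
 * CBLAS call, then flushes again and drops back to the 128 KB default so the
 * remaining L2 SRAM is available as scratch memory again. The __cache_l2_*
 * controls are provided via <dsp_c.h>.
 */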

void cblas_ccopy_facade(const int N, const void *X, const int incX, void *Y, const int incY)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    cblas_ccopy(N, X, incX, Y, incY);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_cdotc_sub_facade(const int N, const void *X, const int incX, const void *Y, const int incY, void *dotc)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    cblas_cdotc_sub(N, X, incX, Y, incY, dotc);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_cdotu_sub_facade(const int N, const void *X, const int incX, const void *Y, const int incY, void *dotu)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    cblas_cdotu_sub(N, X, incX, Y, incY, dotu);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_cgbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, const int KL, const int KU, const void *alpha, const void *A, const int lda, const void *X, const int incX, const void *beta, void *Y, const int incY)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    cblas_cgbmv(order, TransA, M, N, KL, KU, alpha, A, lda, X, incX, beta, Y, incY);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_cgemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc, float *l3_buf, float *l2_buf_loc)
{
    pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
    pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
    pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);

    pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
    pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
    pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);

    pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
    pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
    pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);

    #pragma omp parallel
    {
        __cache_l1d_flush();
        __cache_l1d_4k();
    }

    cblas_cgemm(Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);

    #pragma omp parallel
    {
        __cache_l1d_flush();
        __cache_l1d_all();
    }
}
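
/*
 * Pattern used by the level-3 facades: the BLIS packing pools are re-anchored
 * to the caller-supplied L3 and L2 buffers (and to the fixed L1 address above)
 * on every call, each core shrinks its L1D cache to 4 KB, presumably so the
 * rest of L1D can serve as addressable SRAM at L1_BUF_LOC for the L1 pools,
 * and full L1D cache is restored once the CBLAS call returns.
 */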

void cblas_cgemv_facade(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, const void *alpha, const void *A, const int lda, const void *X, const int incX, const void *beta, void *Y, const int incY)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    cblas_cgemv(order, TransA, M, N, alpha, A, lda, X, incX, beta, Y, incY);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_cgerc_facade(const enum CBLAS_ORDER order, const int M, const int N, const void *alpha, const void *X, const int incX, const void *Y, const int incY, void *A, const int lda)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    cblas_cgerc(order, M, N, alpha, X, incX, Y, incY, A, lda);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_cgeru_facade(const enum CBLAS_ORDER order, const int M, const int N, const void *alpha, const void *X, const int incX, const void *Y, const int incY, void *A, const int lda)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    cblas_cgeru(order, M, N, alpha, X, incX, Y, incY, A, lda);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_chbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const int K, const void *alpha, const void *A, const int lda, const void *X, const int incX, const void *beta, void *Y, const int incY)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    cblas_chbmv(order, Uplo, N, K, alpha, A, lda, X, incX, beta, Y, incY);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_chemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc, float *l3_buf, float *l2_buf_loc)
{
    pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
    pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
    pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);

    pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
    pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
    pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);

    pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
    pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
    pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);

    #pragma omp parallel
    {
        __cache_l1d_flush();
        __cache_l1d_4k();
    }
    cblas_chemm(Order, Side, Uplo, M, N, alpha, A, lda, B, ldb, beta, C, ldc);

    #pragma omp parallel
    {
        __cache_l1d_flush();
        __cache_l1d_all();
    }
}

void cblas_chemv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const void *alpha, const void *A, const int lda, const void *X, const int incX, const void *beta, void *Y, const int incY)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    cblas_chemv(order, Uplo, N, alpha, A, lda, X, incX, beta, Y, incY);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_cher_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const float alpha, const void *X, const int incX, void *A, const int lda)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    cblas_cher(order, Uplo, N, alpha, X, incX, A, lda);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_cher2_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const void *alpha, const void *X, const int incX, const void *Y, const int incY, void *A, const int lda)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    cblas_cher2(order, Uplo, N, alpha, X, incX, Y, incY, A, lda);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_cher2k_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const float beta, void *C, const int ldc, float *l3_buf, float *l2_buf_loc)
{
    pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
    pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
    pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);

    pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
    pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
    pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);

    pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
    pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
    pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);

    #pragma omp parallel
    {
        __cache_l1d_flush();
        __cache_l1d_4k();
    }
    cblas_cher2k(Order, Uplo, Trans, N, K, alpha, A, lda, B, ldb, beta, C, ldc);

    #pragma omp parallel
    {
        __cache_l1d_flush();
        __cache_l1d_all();
    }
}

void cblas_cherk_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const float alpha, const void *A, const int lda, const float beta, void *C, const int ldc, float *l3_buf, float *l2_buf_loc)
{
    pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
    pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
    pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);

    pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
    pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
    pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);

    pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
    pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
    pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);

    #pragma omp parallel
    {
        __cache_l1d_flush();
        __cache_l1d_4k();
    }
    cblas_cherk(Order, Uplo, Trans, N, K, alpha, A, lda, beta, C, ldc);

    #pragma omp parallel
    {
        __cache_l1d_flush();
        __cache_l1d_all();
    }
}

void cblas_chpmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const void *alpha, const void *Ap, const void *X, const int incX, const void *beta, void *Y, const int incY)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    cblas_chpmv(order, Uplo, N, alpha, Ap, X, incX, beta, Y, incY);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_chpr_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const float alpha, const void *X, const int incX, void *A)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    cblas_chpr(order, Uplo, N, alpha, X, incX, A);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_chpr2_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const void *alpha, const void *X, const int incX, const void *Y, const int incY, void *Ap)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    cblas_chpr2(order, Uplo, N, alpha, X, incX, Y, incY, Ap);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_crotg_facade(void *a, void *b, float *c, void *s)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    cblas_crotg(a, b, c, s);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_cscal_facade(const int N, const void *alpha, void *X, const int incX)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    cblas_cscal(N, alpha, X, incX);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_csscal_facade(const int N, const float alpha, void *X, const int incX)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    cblas_csscal(N, alpha, X, incX);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_cswap_facade(const int N, void *X, const int incX, void *Y, const int incY)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    cblas_cswap(N, X, incX, Y, incY);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_csymm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc, float *l3_buf, float *l2_buf_loc)
{
    pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
    pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
    pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);

    pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
    pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
    pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);

    pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
    pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
    pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);

    #pragma omp parallel
    {
        __cache_l1d_flush();
        __cache_l1d_4k();
    }
    cblas_csymm(Order, Side, Uplo, M, N, alpha, A, lda, B, ldb, beta, C, ldc);

    #pragma omp parallel
    {
        __cache_l1d_flush();
        __cache_l1d_all();
    }
}

void cblas_csyr2k_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc, float *l3_buf, float *l2_buf_loc)
{
    pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
    pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
    pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);

    pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
    pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
    pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);

    pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
    pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
    pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);

    #pragma omp parallel
    {
        __cache_l1d_flush();
        __cache_l1d_4k();
    }
    cblas_csyr2k(Order, Uplo, Trans, N, K, alpha, A, lda, B, ldb, beta, C, ldc);

    #pragma omp parallel
    {
        __cache_l1d_flush();
        __cache_l1d_all();
    }
}

void cblas_csyrk_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const void *alpha, const void *A, const int lda, const void *beta, void *C, const int ldc, float *l3_buf, float *l2_buf_loc)
{
    pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
    pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
    pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);

    pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
    pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
    pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);

    pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
    pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
    pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);

    #pragma omp parallel
    {
        __cache_l1d_flush();
        __cache_l1d_4k();
    }
    cblas_csyrk(Order, Uplo, Trans, N, K, alpha, A, lda, beta, C, ldc);

    #pragma omp parallel
    {
        __cache_l1d_flush();
        __cache_l1d_all();
    }
}

void cblas_ctbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const int K, const void *A, const int lda, void *X, const int incX)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    cblas_ctbmv(order, Uplo, TransA, Diag, N, K, A, lda, X, incX);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_ctbsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const int K, const void *A, const int lda, void *X, const int incX)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    cblas_ctbsv(order, Uplo, TransA, Diag, N, K, A, lda, X, incX);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_ctpmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const void *Ap, void *X, const int incX)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    cblas_ctpmv(order, Uplo, TransA, Diag, N, Ap, X, incX);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_ctpsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const void *Ap, void *X, const int incX)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    cblas_ctpsv(order, Uplo, TransA, Diag, N, Ap, X, incX);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_ctrmm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const void *alpha, const void *A, const int lda, void *B, const int ldb, float *l3_buf, float *l2_buf_loc)
{
    pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
    pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
    pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);

    pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
    pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
    pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);

    pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
    pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
    pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);

    #pragma omp parallel
    {
        __cache_l1d_flush();
        __cache_l1d_4k();
    }
    cblas_ctrmm(Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb);

    #pragma omp parallel
    {
        __cache_l1d_flush();
        __cache_l1d_all();
    }
}

void cblas_ctrmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const void *A, const int lda, void *X, const int incX)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    cblas_ctrmv(order, Uplo, TransA, Diag, N, A, lda, X, incX);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_ctrsm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const void *alpha, const void *A, const int lda, void *B, const int ldb, float *l3_buf, float *l2_buf_loc)
{
    pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
    pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
    pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);

    pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
    pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
    pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);

    pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
    pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
    pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);

    #pragma omp parallel
    {
        __cache_l1d_flush();
        __cache_l1d_4k();
    }
    cblas_ctrsm(Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb);

    #pragma omp parallel
    {
        __cache_l1d_flush();
        __cache_l1d_all();
    }
}

void cblas_ctrsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const void *A, const int lda, void *X, const int incX)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    cblas_ctrsv(order, Uplo, TransA, Diag, N, A, lda, X, incX);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_dasum_facade(const int N, const double *X, const int incX, double *retval)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    *retval = cblas_dasum(N, X, incX);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_daxpy_facade(const int N, const double alpha, const double *X, const int incX, double *Y, const int incY)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    cblas_daxpy(N, alpha, X, incX, Y, incY);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_dcopy_facade(const int N, const double *X, const int incX, double *Y, const int incY)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    cblas_dcopy(N, X, incX, Y, incY);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_ddot_facade(const int N, const double *X, const int incX, const double *Y, const int incY, double *retval)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    *retval = cblas_ddot(N, X, incX, Y, incY);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_dgbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, const int KL, const int KU, const double alpha, const double *A, const int lda, const double *X, const int incX, const double beta, double *Y, const int incY)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    cblas_dgbmv(order, TransA, M, N, KL, KU, alpha, A, lda, X, incX, beta, Y, incY);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_dgemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const double alpha, const double *A, const int lda, const double *B, const int ldb, const double beta, double *C, const int ldc, float *l3_buf, float *l2_buf_loc)
{
    // printf("dgemm facade A: %x, B: %x\n", A, B);
    pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
    pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
    pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);

    pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
    pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
    pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);

    pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
    pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
    pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);

    #pragma omp parallel
    {
        __cache_l1d_flush();
        __cache_l1d_4k();
    }
    cblas_dgemm(Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);

    #pragma omp parallel
    {
        __cache_l1d_flush();
        __cache_l1d_all();
    }
}

void cblas_dgemv_facade(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, const double alpha, const double *A, const int lda, const double *X, const int incX, const double beta, double *Y, const int incY)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    cblas_dgemv(order, TransA, M, N, alpha, A, lda, X, incX, beta, Y, incY);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_dger_facade(const enum CBLAS_ORDER order, const int M, const int N, const double alpha, const double *X, const int incX, const double *Y, const int incY, double *A, const int lda)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    cblas_dger(order, M, N, alpha, X, incX, Y, incY, A, lda);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_dnrm2_facade(const int N, const double *X, const int incX, double *retval)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    *retval = cblas_dnrm2(N, X, incX);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_drot_facade(const int N, double *X, const int incX, double *Y, const int incY, const double c, const double s)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    cblas_drot(N, X, incX, Y, incY, c, s);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_drotg_facade(double *a, double *b, double *c, double *s)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    cblas_drotg(a, b, c, s);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_drotm_facade(const int N, double *X, const int incX, double *Y, const int incY, const double *P)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    cblas_drotm(N, X, incX, Y, incY, P);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_drotmg_facade(double *d1, double *d2, double *b1, const double b2, double *P)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    cblas_drotmg(d1, d2, b1, b2, P);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_dsbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const int K, const double alpha, const double *A, const int lda, const double *X, const int incX, const double beta, double *Y, const int incY)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    cblas_dsbmv(order, Uplo, N, K, alpha, A, lda, X, incX, beta, Y, incY);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_dscal_facade(const int N, const double alpha, double *X, const int incX)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    cblas_dscal(N, alpha, X, incX);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_dsdot_facade(const int N, const float *X, const int incX, const float *Y, const int incY, double *retval)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    *retval = cblas_dsdot(N, X, incX, Y, incY);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_dspmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const double alpha, const double *Ap, const double *X, const int incX, const double beta, double *Y, const int incY)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    cblas_dspmv(order, Uplo, N, alpha, Ap, X, incX, beta, Y, incY);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_dspr_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const double alpha, const double *X, const int incX, double *Ap)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    cblas_dspr(order, Uplo, N, alpha, X, incX, Ap);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_dspr2_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const double alpha, const double *X, const int incX, const double *Y, const int incY, double *A)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    cblas_dspr2(order, Uplo, N, alpha, X, incX, Y, incY, A);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_dswap_facade(const int N, double *X, const int incX, double *Y, const int incY)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    cblas_dswap(N, X, incX, Y, incY);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_dsymm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, const double alpha, const double *A, const int lda, const double *B, const int ldb, const double beta, double *C, const int ldc, float *l3_buf, float *l2_buf_loc)
{
    pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
    pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
    pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);

    pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
    pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
    pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);

    pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
    pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
    pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);

    #pragma omp parallel
    {
        __cache_l1d_flush();
        __cache_l1d_4k();
    }
    cblas_dsymm(Order, Side, Uplo, M, N, alpha, A, lda, B, ldb, beta, C, ldc);

    #pragma omp parallel
    {
        __cache_l1d_flush();
        __cache_l1d_all();
    }
}

void cblas_dsymv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const double alpha, const double *A, const int lda, const double *X, const int incX, const double beta, double *Y, const int incY)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    cblas_dsymv(order, Uplo, N, alpha, A, lda, X, incX, beta, Y, incY);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_dsyr_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const double alpha, const double *X, const int incX, double *A, const int lda)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    cblas_dsyr(order, Uplo, N, alpha, X, incX, A, lda);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_dsyr2_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const double alpha, const double *X, const int incX, const double *Y, const int incY, double *A, const int lda)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    cblas_dsyr2(order, Uplo, N, alpha, X, incX, Y, incY, A, lda);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_dsyr2k_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const double alpha, const double *A, const int lda, const double *B, const int ldb, const double beta, double *C, const int ldc, float *l3_buf, float *l2_buf_loc)
{
    pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
    pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
    pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);

    pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
    pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
    pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);

    pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
    pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
    pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);

    #pragma omp parallel
    {
        __cache_l1d_flush();
        __cache_l1d_4k();
    }
    cblas_dsyr2k(Order, Uplo, Trans, N, K, alpha, A, lda, B, ldb, beta, C, ldc);

    #pragma omp parallel
    {
        __cache_l1d_flush();
        __cache_l1d_all();
    }
}

void cblas_dsyrk_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const double alpha, const double *A, const int lda, const double beta, double *C, const int ldc, float *l3_buf, float *l2_buf_loc)
{
    pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
    pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
    pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);

    pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
    pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
    pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);

    pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
    pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
    pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);

    #pragma omp parallel
    {
        __cache_l1d_flush();
        __cache_l1d_4k();
    }
    cblas_dsyrk(Order, Uplo, Trans, N, K, alpha, A, lda, beta, C, ldc);

    #pragma omp parallel
    {
        __cache_l1d_flush();
        __cache_l1d_all();
    }
}

void cblas_dtbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const int K, const double *A, const int lda, double *X, const int incX)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    cblas_dtbmv(order, Uplo, TransA, Diag, N, K, A, lda, X, incX);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_dtbsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const int K, const double *A, const int lda, double *X, const int incX)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    cblas_dtbsv(order, Uplo, TransA, Diag, N, K, A, lda, X, incX);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_dtpmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const double *Ap, double *X, const int incX)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    cblas_dtpmv(order, Uplo, TransA, Diag, N, Ap, X, incX);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_dtpsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const double *Ap, double *X, const int incX)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    cblas_dtpsv(order, Uplo, TransA, Diag, N, Ap, X, incX);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_dtrmm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const double alpha, const double *A, const int lda, double *B, const int ldb, float *l3_buf, float *l2_buf_loc)
{
    pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
    pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
    pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);

    pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
    pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
    pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);

    pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
    pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
    pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);

    #pragma omp parallel
    {
        __cache_l1d_flush();
        __cache_l1d_4k();
    }
    cblas_dtrmm(Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb);

    #pragma omp parallel
    {
        __cache_l1d_flush();
        __cache_l1d_all();
    }
}

void cblas_dtrmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const double *A, const int lda, double *X, const int incX)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    cblas_dtrmv(order, Uplo, TransA, Diag, N, A, lda, X, incX);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_dtrsm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const double alpha, const double *A, const int lda, double *B, const int ldb, float *l3_buf, float *l2_buf_loc)
{
    pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
    pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
    pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);

    pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
    pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
    pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);

    pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
    pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
    pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);

    #pragma omp parallel
    {
        __cache_l1d_flush();
        __cache_l1d_4k();
    }
    cblas_dtrsm(Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb);

    #pragma omp parallel
    {
        __cache_l1d_flush();
        __cache_l1d_all();
    }
}

void cblas_dtrsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const double *A, const int lda, double *X, const int incX)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    cblas_dtrsv(order, Uplo, TransA, Diag, N, A, lda, X, incX);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_dzasum_facade(const int N, const void *X, const int incX, double *retval)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    *retval = cblas_dzasum(N, X, incX);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_dznrm2_facade(const int N, const void *X, const int incX, double *retval)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    *retval = cblas_dznrm2(N, X, incX);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_icamax_facade(const int N, const void *X, const int incX, CBLAS_INDEX *retval)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    *retval = cblas_icamax(N, X, incX);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_idamax_facade(const int N, const double *X, const int incX, CBLAS_INDEX *retval)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    *retval = cblas_idamax(N, X, incX);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_isamax_facade(const int N, const float *X, const int incX, CBLAS_INDEX *retval)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    *retval = cblas_isamax(N, X, incX);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_izamax_facade(const int N, const void *X, const int incX, CBLAS_INDEX *retval)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    *retval = cblas_izamax(N, X, incX);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_sasum_facade(const int N, const float *X, const int incX, float *retval)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    *retval = cblas_sasum(N, X, incX);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_saxpy_facade(const int N, const float alpha, const float *X, const int incX, float *Y, const int incY)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    cblas_saxpy(N, alpha, X, incX, Y, incY);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_scasum_facade(const int N, const void *X, const int incX, float *retval)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    *retval = cblas_scasum(N, X, incX);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_scnrm2_facade(const int N, const void *X, const int incX, float *retval)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    *retval = cblas_scnrm2(N, X, incX);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_scopy_facade(const int N, const float *X, const int incX, float *Y, const int incY)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    cblas_scopy(N, X, incX, Y, incY);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_sdot_facade(const int N, const float *X, const int incX, const float *Y, const int incY, float *retval)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    *retval = cblas_sdot(N, X, incX, Y, incY);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_sdsdot_facade(const int N, const float alpha, const float *X, const int incX, const float *Y, const int incY, float *retval)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    *retval = cblas_sdsdot(N, alpha, X, incX, Y, incY);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_sgbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, const int KL, const int KU, const float alpha, const float *A, const int lda, const float *X, const int incX, const float beta, float *Y, const int incY)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    cblas_sgbmv(order, TransA, M, N, KL, KU, alpha, A, lda, X, incX, beta, Y, incY);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}

void cblas_sgemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const float alpha, const float *A, const int lda, const float *B, const int ldb, const float beta, float *C, const int ldc, float *l3_buf, float *l2_buf_loc)
{
    pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
    pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
    pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);

    pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
    pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
    pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);

    pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
    pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
    pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);

    #pragma omp parallel
    {
        __cache_l1d_flush();
        __cache_l1d_4k();
    }
    cblas_sgemm(Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);

    #pragma omp parallel
    {
        __cache_l1d_flush();
        __cache_l1d_all();
    }
}

void cblas_sgemv_facade(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, const float alpha, const float *A, const int lda, const float *X, const int incX, const float beta, float *Y, const int incY)
{
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_512k();
    }
    cblas_sgemv(order, TransA, M, N, alpha, A, lda, X, incX, beta, Y, incY);

    // return default L2 cache (128 K)
    #pragma omp parallel
    {
        __cache_l2_flush();
        __cache_l2_128k();
    }
}
1680 void cblas_sger_facade(const enum CBLAS_ORDER order, const int M, const int N, const float alpha, const float *X, const int incX, const float *Y, const int incY, float *A, const int lda)
1681 {
1682 #pragma omp parallel
1683 {
1684 __cache_l2_flush();
1685 __cache_l2_512k();
1686 }
1687 cblas_sger(order, M, N, alpha, X, incX, Y, incY, A, lda);
1689 // return default L2 cache (128 K)
1690 #pragma omp parallel
1691 {
1692 __cache_l2_flush();
1693 __cache_l2_128k();
1694 }
1695 }
1697 void cblas_snrm2_facade(const int N, const float *X, const int incX, float *retval)
1698 {
1699 #pragma omp parallel
1700 {
1701 __cache_l2_flush();
1702 __cache_l2_512k();
1703 }
1704 *retval = cblas_snrm2(N, X, incX);
1706 // return default L2 cache (128 K)
1707 #pragma omp parallel
1708 {
1709 __cache_l2_flush();
1710 __cache_l2_128k();
1711 }
1712 }
1714 void cblas_srot_facade(const int N, float *X, const int incX, float *Y, const int incY, const float c, const float s)
1715 {
1716 #pragma omp parallel
1717 {
1718 __cache_l2_flush();
1719 __cache_l2_512k();
1720 }
1721 cblas_srot(N, X, incX, Y, incY, c, s);
1723 // return default L2 cache (128 K)
1724 #pragma omp parallel
1725 {
1726 __cache_l2_flush();
1727 __cache_l2_128k();
1728 }
1729 }
1731 void cblas_srotg_facade(float *a, float *b, float *c, float *s)
1732 {
1733 #pragma omp parallel
1734 {
1735 __cache_l2_flush();
1736 __cache_l2_512k();
1737 }
1738 cblas_srotg(a, b, c, s);
1740 // return default L2 cache (128 K)
1741 #pragma omp parallel
1742 {
1743 __cache_l2_flush();
1744 __cache_l2_128k();
1745 }
1746 }
1748 void cblas_srotm_facade(const int N, float *X, const int incX, float *Y, const int incY, const float *P)
1749 {
1750 #pragma omp parallel
1751 {
1752 __cache_l2_flush();
1753 __cache_l2_512k();
1754 }
1755 cblas_srotm(N, X, incX, Y, incY, P);
1757 // return default L2 cache (128 K)
1758 #pragma omp parallel
1759 {
1760 __cache_l2_flush();
1761 __cache_l2_128k();
1762 }
1763 }
1765 void cblas_srotmg_facade(float *d1, float *d2, float *b1, const float b2, float *P)
1766 {
1767 #pragma omp parallel
1768 {
1769 __cache_l2_flush();
1770 __cache_l2_512k();
1771 }
1772 cblas_srotmg(d1, d2, b1, b2, P);
1774 // return default L2 cache (128 K)
1775 #pragma omp parallel
1776 {
1777 __cache_l2_flush();
1778 __cache_l2_128k();
1779 }
1780 }
1782 void cblas_ssbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const int K, const float alpha, const float *A, const int lda, const float *X, const int incX, const float beta, float *Y, const int incY)
1783 {
1784 #pragma omp parallel
1785 {
1786 __cache_l2_flush();
1787 __cache_l2_512k();
1788 }
1789 cblas_ssbmv(order, Uplo, N, K, alpha, A, lda, X, incX, beta, Y, incY);
1791 // return default L2 cache (128 K)
1792 #pragma omp parallel
1793 {
1794 __cache_l2_flush();
1795 __cache_l2_128k();
1796 }
1797 }
1799 void cblas_sscal_facade(const int N, const float alpha, float *X, const int incX)
1800 {
1801 #pragma omp parallel
1802 {
1803 __cache_l2_flush();
1804 __cache_l2_512k();
1805 }
1806 cblas_sscal(N, alpha, X, incX);
1808 // return default L2 cache (128 K)
1809 #pragma omp parallel
1810 {
1811 __cache_l2_flush();
1812 __cache_l2_128k();
1813 }
1814 }
1816 void cblas_sspmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const float alpha, const float *Ap, const float *X, const int incX, const float beta, float *Y, const int incY)
1817 {
1818 #pragma omp parallel
1819 {
1820 __cache_l2_flush();
1821 __cache_l2_512k();
1822 }
1823 cblas_sspmv(order, Uplo, N, alpha, Ap, X, incX, beta, Y, incY);
1825 // return default L2 cache (128 K)
1826 #pragma omp parallel
1827 {
1828 __cache_l2_flush();
1829 __cache_l2_128k();
1830 }
1831 }
1833 void cblas_sspr_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const float alpha, const float *X, const int incX, float *Ap)
1834 {
1835 #pragma omp parallel
1836 {
1837 __cache_l2_flush();
1838 __cache_l2_512k();
1839 }
1840 cblas_sspr(order, Uplo, N, alpha, X, incX, Ap);
1842 // return default L2 cache (128 K)
1843 #pragma omp parallel
1844 {
1845 __cache_l2_flush();
1846 __cache_l2_128k();
1847 }
1848 }
1850 void cblas_sspr2_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const float alpha, const float *X, const int incX, const float *Y, const int incY, float *A)
1851 {
1852 #pragma omp parallel
1853 {
1854 __cache_l2_flush();
1855 __cache_l2_512k();
1856 }
1857 cblas_sspr2(order, Uplo, N, alpha, X, incX, Y, incY, A);
1859 // return default L2 cache (128 K)
1860 #pragma omp parallel
1861 {
1862 __cache_l2_flush();
1863 __cache_l2_128k();
1864 }
1865 }
1867 void cblas_sswap_facade(const int N, float *X, const int incX, float *Y, const int incY)
1868 {
1869 #pragma omp parallel
1870 {
1871 __cache_l2_flush();
1872 __cache_l2_512k();
1873 }
1874 cblas_sswap(N, X, incX, Y, incY);
1876 // return default L2 cache (128 K)
1877 #pragma omp parallel
1878 {
1879 __cache_l2_flush();
1880 __cache_l2_128k();
1881 }
1882 }
1884 void cblas_ssymm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, const float alpha, const float *A, const int lda, const float *B, const int ldb, const float beta, float *C, const int ldc, float *l3_buf, float *l2_buf_loc)
1885 {
1886 pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
1887 pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
1888 pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
1890 pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
1891 pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
1892 pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
1894 pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
1895 pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
1896 pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
1898 #pragma omp parallel
1899 {
1900 __cache_l1d_flush();
1901 __cache_l1d_4k();
1902 }
1903 cblas_ssymm(Order, Side, Uplo, M, N, alpha, A, lda, B, ldb, beta, C, ldc);
1905 #pragma omp parallel
1906 {
1907 __cache_l1d_flush();
1908 __cache_l1d_all();
1909 }
1910 }
1912 void cblas_ssymv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const float alpha, const float *A, const int lda, const float *X, const int incX, const float beta, float *Y, const int incY)
1913 {
1914 #pragma omp parallel
1915 {
1916 __cache_l2_flush();
1917 __cache_l2_512k();
1918 }
1919 cblas_ssymv(order, Uplo, N, alpha, A, lda, X, incX, beta, Y, incY);
1921 // return default L2 cache (128 K)
1922 #pragma omp parallel
1923 {
1924 __cache_l2_flush();
1925 __cache_l2_128k();
1926 }
1927 }
1929 void cblas_ssyr_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const float alpha, const float *X, const int incX, float *A, const int lda)
1930 {
1931 #pragma omp parallel
1932 {
1933 __cache_l2_flush();
1934 __cache_l2_512k();
1935 }
1936 cblas_ssyr(order, Uplo, N, alpha, X, incX, A, lda);
1938 // return default L2 cache (128 K)
1939 #pragma omp parallel
1940 {
1941 __cache_l2_flush();
1942 __cache_l2_128k();
1943 }
1944 }
1946 void cblas_ssyr2_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const float alpha, const float *X, const int incX, const float *Y, const int incY, float *A, const int lda)
1947 {
1948 #pragma omp parallel
1949 {
1950 __cache_l2_flush();
1951 __cache_l2_512k();
1952 }
1953 cblas_ssyr2(order, Uplo, N, alpha, X, incX, Y, incY, A, lda);
1955 // return default L2 cache (128 K)
1956 #pragma omp parallel
1957 {
1958 __cache_l2_flush();
1959 __cache_l2_128k();
1960 }
1961 }
1963 void cblas_ssyr2k_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const float alpha, const float *A, const int lda, const float *B, const int ldb, const float beta, float *C, const int ldc, float *l3_buf, float *l2_buf_loc)
1964 {
1965 pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
1966 pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
1967 pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
1969 pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
1970 pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
1971 pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
1973 pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
1974 pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
1975 pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
1977 #pragma omp parallel
1978 {
1979 __cache_l1d_flush();
1980 __cache_l1d_4k();
1981 }
1982 cblas_ssyr2k(Order, Uplo, Trans, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
1984 #pragma omp parallel
1985 {
1986 __cache_l1d_flush();
1987 __cache_l1d_all();
1988 }
1989 }
1991 void cblas_ssyrk_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const float alpha, const float *A, const int lda, const float beta, float *C, const int ldc, float *l3_buf, float *l2_buf_loc)
1992 {
1993 pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
1994 pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
1995 pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
1997 pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
1998 pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
1999 pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
2001 pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
2002 pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
2003 pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
2005 #pragma omp parallel
2006 {
2007 __cache_l1d_flush();
2008 __cache_l1d_4k();
2009 }
2010 cblas_ssyrk(Order, Uplo, Trans, N, K, alpha, A, lda, beta, C, ldc);
2012 #pragma omp parallel
2013 {
2014 __cache_l1d_flush();
2015 __cache_l1d_all();
2016 }
2017 }
2019 void cblas_stbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const int K, const float *A, const int lda, float *X, const int incX)
2020 {
2021 #pragma omp parallel
2022 {
2023 __cache_l2_flush();
2024 __cache_l2_512k();
2025 }
2026 cblas_stbmv(order, Uplo, TransA, Diag, N, K, A, lda, X, incX);
2028 // return default L2 cache (128 K)
2029 #pragma omp parallel
2030 {
2031 __cache_l2_flush();
2032 __cache_l2_128k();
2033 }
2034 }
2036 void cblas_stbsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const int K, const float *A, const int lda, float *X, const int incX)
2037 {
2038 #pragma omp parallel
2039 {
2040 __cache_l2_flush();
2041 __cache_l2_512k();
2042 }
2043 cblas_stbsv(order, Uplo, TransA, Diag, N, K, A, lda, X, incX);
2045 // return default L2 cache (128 K)
2046 #pragma omp parallel
2047 {
2048 __cache_l2_flush();
2049 __cache_l2_128k();
2050 }
2051 }
2053 void cblas_stpmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const float *Ap, float *X, const int incX)
2054 {
2055 #pragma omp parallel
2056 {
2057 __cache_l2_flush();
2058 __cache_l2_512k();
2059 }
2060 cblas_stpmv(order, Uplo, TransA, Diag, N, Ap, X, incX);
2062 // return default L2 cache (128 K)
2063 #pragma omp parallel
2064 {
2065 __cache_l2_flush();
2066 __cache_l2_128k();
2067 }
2068 }
2070 void cblas_stpsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const float *Ap, float *X, const int incX)
2071 {
2072 #pragma omp parallel
2073 {
2074 __cache_l2_flush();
2075 __cache_l2_512k();
2076 }
2077 cblas_stpsv(order, Uplo, TransA, Diag, N, Ap, X, incX);
2079 // return default L2 cache (128 K)
2080 #pragma omp parallel
2081 {
2082 __cache_l2_flush();
2083 __cache_l2_128k();
2084 }
2085 }
2087 void cblas_strmm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const float alpha, const float *A, const int lda, float *B, const int ldb, float *l3_buf, float *l2_buf_loc)
2088 {
2089 pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
2090 pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
2091 pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
2093 pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
2094 pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
2095 pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
2097 pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
2098 pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
2099 pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
2101 #pragma omp parallel
2102 {
2103 __cache_l1d_flush();
2104 __cache_l1d_4k();
2105 }
2106 cblas_strmm(Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb);
2108 #pragma omp parallel
2109 {
2110 __cache_l1d_flush();
2111 __cache_l1d_all();
2112 }
2113 }
2115 void cblas_strmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const float *A, const int lda, float *X, const int incX)
2116 {
2117 #pragma omp parallel
2118 {
2119 __cache_l2_flush();
2120 __cache_l2_512k();
2121 }
2122 cblas_strmv(order, Uplo, TransA, Diag, N, A, lda, X, incX);
2124 // return default L2 cache (128 K)
2125 #pragma omp parallel
2126 {
2127 __cache_l2_flush();
2128 __cache_l2_128k();
2129 }
2130 }
2132 void cblas_strsm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const float alpha, const float *A, const int lda, float *B, const int ldb, float *l3_buf, float *l2_buf_loc)
2133 {
2134 pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
2135 pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
2136 pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
2138 pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
2139 pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
2140 pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
2142 pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
2143 pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
2144 pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
2146 #pragma omp parallel
2147 {
2148 __cache_l1d_flush();
2149 __cache_l1d_4k();
2150 }
2151 cblas_strsm(Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb);
2153 #pragma omp parallel
2154 {
2155 __cache_l1d_flush();
2156 __cache_l1d_all();
2157 }
2158 }
2160 void cblas_strsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const float *A, const int lda, float *X, const int incX)
2161 {
2162 #pragma omp parallel
2163 {
2164 __cache_l2_flush();
2165 __cache_l2_512k();
2166 }
2167 cblas_strsv(order, Uplo, TransA, Diag, N, A, lda, X, incX);
2169 // return default L2 cache (128 K)
2170 #pragma omp parallel
2171 {
2172 __cache_l2_flush();
2173 __cache_l2_128k();
2174 }
2175 }
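/* The error-handler facade applies the same L2 cache toggle as the compute
 * wrappers, even though cblas_xerbla() only reports an error; this keeps all
 * facades uniform. */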
2177 void cblas_xerbla_facade(int p, const char *rout, const char *form)
2178 {
2179 #pragma omp parallel
2180 {
2181 __cache_l2_flush();
2182 __cache_l2_512k();
2183 }
2184 cblas_xerbla(p, rout, form);
2186 // return default L2 cache (128 K)
2187 #pragma omp parallel
2188 {
2189 __cache_l2_flush();
2190 __cache_l2_128k();
2191 }
2192 }
2194 void cblas_zaxpy_facade(const int N, const void *alpha, const void *X, const int incX, void *Y, const int incY)
2195 {
2196 #pragma omp parallel
2197 {
2198 __cache_l2_flush();
2199 __cache_l2_512k();
2200 }
2201 cblas_zaxpy(N, alpha, X, incX, Y, incY);
2203 // return default L2 cache (128 K)
2204 #pragma omp parallel
2205 {
2206 __cache_l2_flush();
2207 __cache_l2_128k();
2208 }
2209 }
2211 void cblas_zcopy_facade(const int N, const void *X, const int incX, void *Y, const int incY)
2212 {
2213 #pragma omp parallel
2214 {
2215 __cache_l2_flush();
2216 __cache_l2_512k();
2217 }
2218 cblas_zcopy(N, X, incX, Y, incY);
2220 // return default L2 cache (128 K)
2221 #pragma omp parallel
2222 {
2223 __cache_l2_flush();
2224 __cache_l2_128k();
2225 }
2226 }
2228 void cblas_zdotc_sub_facade(const int N, const void *X, const int incX, const void *Y, const int incY, void *dotc)
2229 {
2230 #pragma omp parallel
2231 {
2232 __cache_l2_flush();
2233 __cache_l2_512k();
2234 }
2235 cblas_zdotc_sub(N, X, incX, Y, incY, dotc);
2237 // return default L2 cache (128 K)
2238 #pragma omp parallel
2239 {
2240 __cache_l2_flush();
2241 __cache_l2_128k();
2242 }
2243 }
2245 void cblas_zdotu_sub_facade(const int N, const void *X, const int incX, const void *Y, const int incY, void *dotu)
2246 {
2247 #pragma omp parallel
2248 {
2249 __cache_l2_flush();
2250 __cache_l2_512k();
2251 }
2252 cblas_zdotu_sub(N, X, incX, Y, incY, dotu);
2254 // return default L2 cache (128 K)
2255 #pragma omp parallel
2256 {
2257 __cache_l2_flush();
2258 __cache_l2_128k();
2259 }
2260 }
2262 void cblas_zdscal_facade(const int N, const double alpha, void *X, const int incX)
2263 {
2264 #pragma omp parallel
2265 {
2266 __cache_l2_flush();
2267 __cache_l2_512k();
2268 }
2269 cblas_zdscal(N, alpha, X, incX);
2271 // return default L2 cache (128 K)
2272 #pragma omp parallel
2273 {
2274 __cache_l2_flush();
2275 __cache_l2_128k();
2276 }
2277 }
2279 void cblas_zgbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, const int KL, const int KU, const void *alpha, const void *A, const int lda, const void *X, const int incX, const void *beta, void *Y, const int incY)
2280 {
2281 #pragma omp parallel
2282 {
2283 __cache_l2_flush();
2284 __cache_l2_512k();
2285 }
2286 cblas_zgbmv(order, TransA, M, N, KL, KU, alpha, A, lda, X, incX, beta, Y, incY);
2288 // return default L2 cache (128 K)
2289 #pragma omp parallel
2290 {
2291 __cache_l2_flush();
2292 __cache_l2_128k();
2293 }
2294 }
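/*
 * Double-precision complex Level-3 facades.  These follow the same pool
 * setup and L1D reconfiguration as the single-precision GEMM-class wrappers
 * above.  Note that the scratch-buffer parameters are still declared as
 * float * (l3_buf, l2_buf_loc); the pointers are only used as raw,
 * cache-line-aligned base addresses for the BLIS packing pools, so the
 * declared element type does not matter.
 */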
2296 void cblas_zgemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc, float *l3_buf, float *l2_buf_loc)
2297 {
2298 pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
2299 pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
2300 pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
2302 pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
2303 pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
2304 pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
2306 pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
2307 pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
2308 pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
2310 #pragma omp parallel
2311 {
2312 __cache_l1d_flush();
2313 __cache_l1d_4k();
2314 }
2315 cblas_zgemm(Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
2317 #pragma omp parallel
2318 {
2319 __cache_l1d_flush();
2320 __cache_l1d_all();
2321 }
2322 }
2324 void cblas_zgemv_facade(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, const void *alpha, const void *A, const int lda, const void *X, const int incX, const void *beta, void *Y, const int incY)
2325 {
2326 #pragma omp parallel
2327 {
2328 __cache_l2_flush();
2329 __cache_l2_512k();
2330 }
2331 cblas_zgemv(order, TransA, M, N, alpha, A, lda, X, incX, beta, Y, incY);
2333 // return default L2 cache (128 K)
2334 #pragma omp parallel
2335 {
2336 __cache_l2_flush();
2337 __cache_l2_128k();
2338 }
2339 }
2341 void cblas_zgerc_facade(const enum CBLAS_ORDER order, const int M, const int N, const void *alpha, const void *X, const int incX, const void *Y, const int incY, void *A, const int lda)
2342 {
2343 #pragma omp parallel
2344 {
2345 __cache_l2_flush();
2346 __cache_l2_512k();
2347 }
2348 cblas_zgerc(order, M, N, alpha, X, incX, Y, incY, A, lda);
2350 // return default L2 cache (128 K)
2351 #pragma omp parallel
2352 {
2353 __cache_l2_flush();
2354 __cache_l2_128k();
2355 }
2356 }
2358 void cblas_zgeru_facade(const enum CBLAS_ORDER order, const int M, const int N, const void *alpha, const void *X, const int incX, const void *Y, const int incY, void *A, const int lda)
2359 {
2360 #pragma omp parallel
2361 {
2362 __cache_l2_flush();
2363 __cache_l2_512k();
2364 }
2365 cblas_zgeru(order, M, N, alpha, X, incX, Y, incY, A, lda);
2367 // return default L2 cache (128 K)
2368 #pragma omp parallel
2369 {
2370 __cache_l2_flush();
2371 __cache_l2_128k();
2372 }
2373 }
2375 void cblas_zhbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const int K, const void *alpha, const void *A, const int lda, const void *X, const int incX, const void *beta, void *Y, const int incY)
2376 {
2377 #pragma omp parallel
2378 {
2379 __cache_l2_flush();
2380 __cache_l2_512k();
2381 }
2382 cblas_zhbmv(order, Uplo, N, K, alpha, A, lda, X, incX, beta, Y, incY);
2384 // return default L2 cache (128 K)
2385 #pragma omp parallel
2386 {
2387 __cache_l2_flush();
2388 __cache_l2_128k();
2389 }
2390 }
2392 void cblas_zhemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc, float *l3_buf, float *l2_buf_loc)
2393 {
2394 pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
2395 pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
2396 pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
2398 pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
2399 pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
2400 pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
2402 pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
2403 pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
2404 pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
2406 #pragma omp parallel
2407 {
2408 __cache_l1d_flush();
2409 __cache_l1d_4k();
2410 }
2411 cblas_zhemm(Order, Side, Uplo, M, N, alpha, A, lda, B, ldb, beta, C, ldc);
2413 #pragma omp parallel
2414 {
2415 __cache_l1d_flush();
2416 __cache_l1d_all();
2417 }
2418 }
2420 void cblas_zhemv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const void *alpha, const void *A, const int lda, const void *X, const int incX, const void *beta, void *Y, const int incY)
2421 {
2422 #pragma omp parallel
2423 {
2424 __cache_l2_flush();
2425 __cache_l2_512k();
2426 }
2427 cblas_zhemv(order, Uplo, N, alpha, A, lda, X, incX, beta, Y, incY);
2429 // return default L2 cache (128 K)
2430 #pragma omp parallel
2431 {
2432 __cache_l2_flush();
2433 __cache_l2_128k();
2434 }
2435 }
2437 void cblas_zher_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const double alpha, const void *X, const int incX, void *A, const int lda)
2438 {
2439 #pragma omp parallel
2440 {
2441 __cache_l2_flush();
2442 __cache_l2_512k();
2443 }
2444 cblas_zher(order, Uplo, N, alpha, X, incX, A, lda);
2446 // return default L2 cache (128 K)
2447 #pragma omp parallel
2448 {
2449 __cache_l2_flush();
2450 __cache_l2_128k();
2451 }
2452 }
2454 void cblas_zher2_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const void *alpha, const void *X, const int incX, const void *Y, const int incY, void *A, const int lda)
2455 {
2456 #pragma omp parallel
2457 {
2458 __cache_l2_flush();
2459 __cache_l2_512k();
2460 }
2461 cblas_zher2(order, Uplo, N, alpha, X, incX, Y, incY, A, lda);
2463 // return default L2 cache (128 K)
2464 #pragma omp parallel
2465 {
2466 __cache_l2_flush();
2467 __cache_l2_128k();
2468 }
2469 }
2471 void cblas_zher2k_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const double beta, void *C, const int ldc, float *l3_buf, float *l2_buf_loc)
2472 {
2473 pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
2474 pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
2475 pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
2477 pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
2478 pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
2479 pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
2481 pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
2482 pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
2483 pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
2485 #pragma omp parallel
2486 {
2487 __cache_l1d_flush();
2488 __cache_l1d_4k();
2489 }
2490 cblas_zher2k(Order, Uplo, Trans, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
2492 #pragma omp parallel
2493 {
2494 __cache_l1d_flush();
2495 __cache_l1d_all();
2496 }
2497 }
2499 void cblas_zherk_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const double alpha, const void *A, const int lda, const double beta, void *C, const int ldc, float *l3_buf, float *l2_buf_loc)
2500 {
2501 pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
2502 pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
2503 pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
2505 pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
2506 pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
2507 pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
2509 pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
2510 pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
2511 pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
2513 #pragma omp parallel
2514 {
2515 __cache_l1d_flush();
2516 __cache_l1d_4k();
2517 }
2518 cblas_zherk(Order, Uplo, Trans, N, K, alpha, A, lda, beta, C, ldc);
2520 #pragma omp parallel
2521 {
2522 __cache_l1d_flush();
2523 __cache_l1d_all();
2524 }
2525 }
2527 void cblas_zhpmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const void *alpha, const void *Ap, const void *X, const int incX, const void *beta, void *Y, const int incY)
2528 {
2529 #pragma omp parallel
2530 {
2531 __cache_l2_flush();
2532 __cache_l2_512k();
2533 }
2534 cblas_zhpmv(order, Uplo, N, alpha, Ap, X, incX, beta, Y, incY);
2536 // return default L2 cache (128 K)
2537 #pragma omp parallel
2538 {
2539 __cache_l2_flush();
2540 __cache_l2_128k();
2541 }
2542 }
2544 void cblas_zhpr_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const double alpha, const void *X, const int incX, void *A)
2545 {
2546 #pragma omp parallel
2547 {
2548 __cache_l2_flush();
2549 __cache_l2_512k();
2550 }
2551 cblas_zhpr(order, Uplo, N, alpha, X, incX, A);
2553 // return default L2 cache (128 K)
2554 #pragma omp parallel
2555 {
2556 __cache_l2_flush();
2557 __cache_l2_128k();
2558 }
2559 }
2561 void cblas_zhpr2_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const void *alpha, const void *X, const int incX, const void *Y, const int incY, void *Ap)
2562 {
2563 #pragma omp parallel
2564 {
2565 __cache_l2_flush();
2566 __cache_l2_512k();
2567 }
2568 cblas_zhpr2(order, Uplo, N, alpha, X, incX, Y, incY, Ap);
2570 // return default L2 cache (128 K)
2571 #pragma omp parallel
2572 {
2573 __cache_l2_flush();
2574 __cache_l2_128k();
2575 }
2576 }
2578 void cblas_zrotg_facade(void *a, void *b, double *c, void *s)
2579 {
2580 #pragma omp parallel
2581 {
2582 __cache_l2_flush();
2583 __cache_l2_512k();
2584 }
2585 cblas_zrotg(a, b, c, s);
2587 // return default L2 cache (128 K)
2588 #pragma omp parallel
2589 {
2590 __cache_l2_flush();
2591 __cache_l2_128k();
2592 }
2593 }
2595 void cblas_zscal_facade(const int N, const void *alpha, void *X, const int incX)
2596 {
2597 #pragma omp parallel
2598 {
2599 __cache_l2_flush();
2600 __cache_l2_512k();
2601 }
2602 cblas_zscal(N, alpha, X, incX);
2604 // return default L2 cache (128 K)
2605 #pragma omp parallel
2606 {
2607 __cache_l2_flush();
2608 __cache_l2_128k();
2609 }
2610 }
2612 void cblas_zswap_facade(const int N, void *X, const int incX, void *Y, const int incY)
2613 {
2614 #pragma omp parallel
2615 {
2616 __cache_l2_flush();
2617 __cache_l2_512k();
2618 }
2619 cblas_zswap(N, X, incX, Y, incY);
2621 // return default L2 cache (128 K)
2622 #pragma omp parallel
2623 {
2624 __cache_l2_flush();
2625 __cache_l2_128k();
2626 }
2627 }
2629 void cblas_zsymm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc, float *l3_buf, float *l2_buf_loc)
2630 {
2631 pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
2632 pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
2633 pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
2635 pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
2636 pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
2637 pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
2639 pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
2640 pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
2641 pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
2643 #pragma omp parallel
2644 {
2645 __cache_l1d_flush();
2646 __cache_l1d_4k();
2647 }
2648 cblas_zsymm(Order, Side, Uplo, M, N, alpha, A, lda, B, ldb, beta, C, ldc);
2650 #pragma omp parallel
2651 {
2652 __cache_l1d_flush();
2653 __cache_l1d_all();
2654 }
2655 }
2657 void cblas_zsyr2k_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc, float *l3_buf, float *l2_buf_loc)
2658 {
2659 pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
2660 pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
2661 pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
2663 pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
2664 pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
2665 pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
2667 pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
2668 pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
2669 pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
2671 #pragma omp parallel
2672 {
2673 __cache_l1d_flush();
2674 __cache_l1d_4k();
2675 }
2676 cblas_zsyr2k(Order, Uplo, Trans, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
2678 #pragma omp parallel
2679 {
2680 __cache_l1d_flush();
2681 __cache_l1d_all();
2682 }
2683 }
2685 void cblas_zsyrk_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const void *alpha, const void *A, const int lda, const void *beta, void *C, const int ldc, float *l3_buf, float *l2_buf_loc)
2686 {
2687 pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
2688 pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
2689 pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
2691 pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
2692 pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
2693 pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
2695 pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
2696 pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
2697 pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
2699 #pragma omp parallel
2700 {
2701 __cache_l1d_flush();
2702 __cache_l1d_4k();
2703 }
2704 cblas_zsyrk(Order, Uplo, Trans, N, K, alpha, A, lda, beta, C, ldc);
2706 #pragma omp parallel
2707 {
2708 __cache_l1d_flush();
2709 __cache_l1d_all();
2710 }
2711 }
2713 void cblas_ztbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const int K, const void *A, const int lda, void *X, const int incX)
2714 {
2715 #pragma omp parallel
2716 {
2717 __cache_l2_flush();
2718 __cache_l2_512k();
2719 }
2720 cblas_ztbmv(order, Uplo, TransA, Diag, N, K, A, lda, X, incX);
2722 // return default L2 cache (128 K)
2723 #pragma omp parallel
2724 {
2725 __cache_l2_flush();
2726 __cache_l2_128k();
2727 }
2728 }
2730 void cblas_ztbsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const int K, const void *A, const int lda, void *X, const int incX)
2731 {
2732 #pragma omp parallel
2733 {
2734 __cache_l2_flush();
2735 __cache_l2_512k();
2736 }
2737 cblas_ztbsv(order, Uplo, TransA, Diag, N, K, A, lda, X, incX);
2739 // return default L2 cache (128 K)
2740 #pragma omp parallel
2741 {
2742 __cache_l2_flush();
2743 __cache_l2_128k();
2744 }
2745 }
2747 void cblas_ztpmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const void *Ap, void *X, const int incX)
2748 {
2749 #pragma omp parallel
2750 {
2751 __cache_l2_flush();
2752 __cache_l2_512k();
2753 }
2754 cblas_ztpmv(order, Uplo, TransA, Diag, N, Ap, X, incX);
2756 // return default L2 cache (128 K)
2757 #pragma omp parallel
2758 {
2759 __cache_l2_flush();
2760 __cache_l2_128k();
2761 }
2762 }
2764 void cblas_ztpsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const void *Ap, void *X, const int incX)
2765 {
2766 #pragma omp parallel
2767 {
2768 __cache_l2_flush();
2769 __cache_l2_512k();
2770 }
2771 cblas_ztpsv(order, Uplo, TransA, Diag, N, Ap, X, incX);
2773 // return default L2 cache (128 K)
2774 #pragma omp parallel
2775 {
2776 __cache_l2_flush();
2777 __cache_l2_128k();
2778 }
2779 }
2781 void cblas_ztrmm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const void *alpha, const void *A, const int lda, void *B, const int ldb, float *l3_buf, float *l2_buf_loc)
2782 {
2783 pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
2784 pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
2785 pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
2787 pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
2788 pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
2789 pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
2791 pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
2792 pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
2793 pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
2795 #pragma omp parallel
2796 {
2797 __cache_l1d_flush();
2798 __cache_l1d_4k();
2799 }
2800 cblas_ztrmm(Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb);
2802 #pragma omp parallel
2803 {
2804 __cache_l1d_flush();
2805 __cache_l1d_all();
2806 }
2807 }
2809 void cblas_ztrmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const void *A, const int lda, void *X, const int incX)
2810 {
2811 #pragma omp parallel
2812 {
2813 __cache_l2_flush();
2814 __cache_l2_512k();
2815 }
2816 cblas_ztrmv(order, Uplo, TransA, Diag, N, A, lda, X, incX);
2818 // return default L2 cache (128 K)
2819 #pragma omp parallel
2820 {
2821 __cache_l2_flush();
2822 __cache_l2_128k();
2823 }
2824 }
2826 void cblas_ztrsm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const void *alpha, const void *A, const int lda, void *B, const int ldb, float *l3_buf, float *l2_buf_loc)
2827 {
2828 pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
2829 pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
2830 pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
2832 pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
2833 pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
2834 pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
2836 pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
2837 pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
2838 pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
2840 #pragma omp parallel
2841 {
2842 __cache_l1d_flush();
2843 __cache_l1d_4k();
2844 }
2845 cblas_ztrsm(Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb);
2847 #pragma omp parallel
2848 {
2849 __cache_l1d_flush();
2850 __cache_l1d_all();
2851 }
2852 }
2854 void cblas_ztrsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const void *A, const int lda, void *X, const int incX)
2855 {
2856 #pragma omp parallel
2857 {
2858 __cache_l2_flush();
2859 __cache_l2_512k();
2860 }
2861 cblas_ztrsv(order, Uplo, TransA, Diag, N, A, lda, X, incX);
2863 // return default L2 cache (128 K)
2864 #pragma omp parallel
2865 {
2866 __cache_l2_flush();
2867 __cache_l2_128k();
2868 }
2869 }