summary | shortlog | log | commit | commitdiff | tree
raw | patch | inline | side by side (parent: 0f89d18)
raw | patch | inline | side by side (parent: 0f89d18)
author | Jianzhong Xu <a0869574local@uda0869574> | |
Thu, 8 Oct 2015 13:32:51 +0000 (09:32 -0400) | ||
committer | Jianzhong Xu <a0869574local@uda0869574> | |
Thu, 8 Oct 2015 13:32:51 +0000 (09:32 -0400) |
148 files changed:
index b4f117b66c2811b621db1166e5c297aaafc97161..cf3fd3847b0c69e64d9104196a07c75f3d204c6b 100644 (file)
--- a/blasblisacc/src/facade.c
+++ b/blasblisacc/src/facade.c
char *pool_kn_mem_L3;
char *pool_mn_mem_L3;
-void ti_bli_init_dsp(char *l3_buf, char *l2_buf)
-{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
+extern void bli_mem_init();
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- bli_init();
+void ti_bli_init_dsp(char *l3_buf, char *l2_buf)
+{
+ bli_init();
}
void ti_bli_finalize_dsp(void)
{
- bli_finalize();
+ bli_finalize();
}
+
void cblas_caxpy_facade(const int N, const void *alpha, const void *X, const int incX, void *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_caxpy(N, alpha, X, incX, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_ccopy_facade(const int N, const void *X, const int incX, void *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_ccopy(N, X, incX, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_cdotc_sub_facade(const int N, const void *X, const int incX, const void *Y, const int incY, void *dotc)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_cdotc_sub(N, X, incX, Y, incY, dotc);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_cdotu_sub_facade(const int N, const void *X, const int incX, const void *Y, const int incY, void *dotu)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_cdotu_sub(N, X, incX, Y, incY, dotu);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_cgbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, const int KL, const int KU, const void *alpha, const void *A, const int lda, const void *X, const int incX, const void *beta, void *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_cgbmv(order, TransA, M, N, KL, KU, alpha, A, lda, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_cgemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc, float *l3_buf, float *l2_buf_loc)
@@ -170,12 +102,13 @@ void cblas_cgemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE
pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
+ bli_mem_init();
+
#pragma omp parallel
{
__cache_l1d_flush();
__cache_l1d_4k();
}
-
cblas_cgemm(Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
#pragma omp parallel
@@ -187,70 +120,22 @@ void cblas_cgemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE
void cblas_cgemv_facade(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, const void *alpha, const void *A, const int lda, const void *X, const int incX, const void *beta, void *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_cgemv(order, TransA, M, N, alpha, A, lda, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_cgerc_facade(const enum CBLAS_ORDER order, const int M, const int N, const void *alpha, const void *X, const int incX, const void *Y, const int incY, void *A, const int lda)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_cgerc(order, M, N, alpha, X, incX, Y, incY, A, lda);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_cgeru_facade(const enum CBLAS_ORDER order, const int M, const int N, const void *alpha, const void *X, const int incX, const void *Y, const int incY, void *A, const int lda)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_cgeru(order, M, N, alpha, X, incX, Y, incY, A, lda);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_chbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const int K, const void *alpha, const void *A, const int lda, const void *X, const int incX, const void *beta, void *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_chbmv(order, Uplo, N, K, alpha, A, lda, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_chemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc, float *l3_buf, float *l2_buf_loc)
@@ -267,6 +152,8 @@ void cblas_chemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side
pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
+ bli_mem_init();
+
#pragma omp parallel
{
__cache_l1d_flush();
@@ -283,53 +170,17 @@ void cblas_chemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side
void cblas_chemv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const void *alpha, const void *A, const int lda, const void *X, const int incX, const void *beta, void *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_chemv(order, Uplo, N, alpha, A, lda, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_cher_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const float alpha, const void *X, const int incX, void *A, const int lda)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_cher(order, Uplo, N, alpha, X, incX, A, lda);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_cher2_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const void *alpha, const void *X, const int incX, const void *Y, const int incY, void *A, const int lda)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_cher2(order, Uplo, N, alpha, X, incX, Y, incY, A, lda);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_cher2k_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const float beta, void *C, const int ldc, float *l3_buf, float *l2_buf_loc)
@@ -346,6 +197,8 @@ void cblas_cher2k_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Upl
pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
+ bli_mem_init();
+
#pragma omp parallel
{
__cache_l1d_flush();
@@ -374,6 +227,8 @@ void cblas_cherk_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo
pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
+ bli_mem_init();
+
#pragma omp parallel
{
__cache_l1d_flush();
@@ -390,121 +245,37 @@ void cblas_cherk_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo
void cblas_chpmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const void *alpha, const void *Ap, const void *X, const int incX, const void *beta, void *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_chpmv(order, Uplo, N, alpha, Ap, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_chpr_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const float alpha, const void *X, const int incX, void *A)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_chpr(order, Uplo, N, alpha, X, incX, A);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_chpr2_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const void *alpha, const void *X, const int incX, const void *Y, const int incY, void *Ap)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_chpr2(order, Uplo, N, alpha, X, incX, Y, incY, Ap);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_crotg_facade(void *a, void *b, float *c, void *s)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_crotg(a, b, c, s);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_cscal_facade(const int N, const void *alpha, void *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_cscal(N, alpha, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_csscal_facade(const int N, const float alpha, void *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_csscal(N, alpha, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_cswap_facade(const int N, void *X, const int incX, void *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_cswap(N, X, incX, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_csymm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc, float *l3_buf, float *l2_buf_loc)
@@ -521,6 +292,8 @@ void cblas_csymm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side
pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
+ bli_mem_init();
+
#pragma omp parallel
{
__cache_l1d_flush();
@@ -549,6 +322,8 @@ void cblas_csyr2k_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Upl
pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
+ bli_mem_init();
+
#pragma omp parallel
{
__cache_l1d_flush();
@@ -577,6 +352,8 @@ void cblas_csyrk_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo
pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
+ bli_mem_init();
+
#pragma omp parallel
{
__cache_l1d_flush();
@@ -593,70 +370,22 @@ void cblas_csyrk_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo
void cblas_ctbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const int K, const void *A, const int lda, void *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_ctbmv(order, Uplo, TransA, Diag, N, K, A, lda, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_ctbsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const int K, const void *A, const int lda, void *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_ctbsv(order, Uplo, TransA, Diag, N, K, A, lda, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_ctpmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const void *Ap, void *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_ctpmv(order, Uplo, TransA, Diag, N, Ap, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_ctpsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const void *Ap, void *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_ctpsv(order, Uplo, TransA, Diag, N, Ap, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_ctrmm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const void *alpha, const void *A, const int lda, void *B, const int ldb, float *l3_buf, float *l2_buf_loc)
@@ -673,6 +402,8 @@ void cblas_ctrmm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side
pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
+ bli_mem_init();
+
#pragma omp parallel
{
__cache_l1d_flush();
@@ -689,19 +420,7 @@ void cblas_ctrmm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side
void cblas_ctrmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const void *A, const int lda, void *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_ctrmv(order, Uplo, TransA, Diag, N, A, lda, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_ctrsm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const void *alpha, const void *A, const int lda, void *B, const int ldb, float *l3_buf, float *l2_buf_loc)
@@ -718,6 +437,8 @@ void cblas_ctrsm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side
pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
+ bli_mem_init();
+
#pragma omp parallel
{
__cache_l1d_flush();
@@ -734,109 +455,36 @@ void cblas_ctrsm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side
void cblas_ctrsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const void *A, const int lda, void *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_ctrsv(order, Uplo, TransA, Diag, N, A, lda, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dasum_facade(const int N, const double *X, const int incX, double *retval)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
*retval = cblas_dasum(N, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_daxpy_facade(const int N, const double alpha, const double *X, const int incX, double *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_daxpy(N, alpha, X, incX, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dcopy_facade(const int N, const double *X, const int incX, double *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dcopy(N, X, incX, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_ddot_facade(const int N, const double *X, const int incX, const double *Y, const int incY, double *retval)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
*retval = cblas_ddot(N, X, incX, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dgbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, const int KL, const int KU, const double alpha, const double *A, const int lda, const double *X, const int incX, const double beta, double *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dgbmv(order, TransA, M, N, KL, KU, alpha, A, lda, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dgemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const double alpha, const double *A, const int lda, const double *B, const int ldb, const double beta, double *C, const int ldc, float *l3_buf, float *l2_buf_loc)
{
-// printf("dgemm facade A: %x, B: %x\n", A, B);
pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
@@ -849,6 +497,8 @@ void cblas_dgemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE
pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
+ bli_mem_init();
+
#pragma omp parallel
{
__cache_l1d_flush();
@@ -865,240 +515,72 @@ void cblas_dgemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE
void cblas_dgemv_facade(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, const double alpha, const double *A, const int lda, const double *X, const int incX, const double beta, double *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dgemv(order, TransA, M, N, alpha, A, lda, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dger_facade(const enum CBLAS_ORDER order, const int M, const int N, const double alpha, const double *X, const int incX, const double *Y, const int incY, double *A, const int lda)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dger(order, M, N, alpha, X, incX, Y, incY, A, lda);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dnrm2_facade(const int N, const double *X, const int incX, double *retval)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
*retval = cblas_dnrm2(N, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_drot_facade(const int N, double *X, const int incX, double *Y, const int incY, const double c, const double s)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_drot(N, X, incX, Y, incY, c, s);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_drotg_facade(double *a, double *b, double *c, double *s)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_drotg(a, b, c, s);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_drotm_facade(const int N, double *X, const int incX, double *Y, const int incY, const double *P)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_drotm(N, X, incX, Y, incY, P);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_drotmg_facade(double *d1, double *d2, double *b1, const double b2, double *P)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_drotmg(d1, d2, b1, b2, P);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dsbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const int K, const double alpha, const double *A, const int lda, const double *X, const int incX, const double beta, double *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dsbmv(order, Uplo, N, K, alpha, A, lda, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dscal_facade(const int N, const double alpha, double *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dscal(N, alpha, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dsdot_facade(const int N, const float *X, const int incX, const float *Y, const int incY, double *retval)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
*retval = cblas_dsdot(N, X, incX, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dspmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const double alpha, const double *Ap, const double *X, const int incX, const double beta, double *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dspmv(order, Uplo, N, alpha, Ap, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dspr_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const double alpha, const double *X, const int incX, double *Ap)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dspr(order, Uplo, N, alpha, X, incX, Ap);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dspr2_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const double alpha, const double *X, const int incX, const double *Y, const int incY, double *A)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dspr2(order, Uplo, N, alpha, X, incX, Y, incY, A);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dswap_facade(const int N, double *X, const int incX, double *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dswap(N, X, incX, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dsymm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, const double alpha, const double *A, const int lda, const double *B, const int ldb, const double beta, double *C, const int ldc, float *l3_buf, float *l2_buf_loc)
@@ -1115,6 +597,8 @@ void cblas_dsymm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side
pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
+ bli_mem_init();
+
#pragma omp parallel
{
__cache_l1d_flush();
@@ -1131,53 +615,17 @@ void cblas_dsymm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side
void cblas_dsymv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const double alpha, const double *A, const int lda, const double *X, const int incX, const double beta, double *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dsymv(order, Uplo, N, alpha, A, lda, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dsyr_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const double alpha, const double *X, const int incX, double *A, const int lda)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dsyr(order, Uplo, N, alpha, X, incX, A, lda);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dsyr2_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const double alpha, const double *X, const int incX, const double *Y, const int incY, double *A, const int lda)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dsyr2(order, Uplo, N, alpha, X, incX, Y, incY, A, lda);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dsyr2k_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const double alpha, const double *A, const int lda, const double *B, const int ldb, const double beta, double *C, const int ldc, float *l3_buf, float *l2_buf_loc)
@@ -1194,6 +642,8 @@ void cblas_dsyr2k_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Upl
pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
+ bli_mem_init();
+
#pragma omp parallel
{
__cache_l1d_flush();
@@ -1222,6 +672,8 @@ void cblas_dsyrk_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo
pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
+ bli_mem_init();
+
#pragma omp parallel
{
__cache_l1d_flush();
@@ -1238,70 +690,22 @@ void cblas_dsyrk_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo
void cblas_dtbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const int K, const double *A, const int lda, double *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dtbmv(order, Uplo, TransA, Diag, N, K, A, lda, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dtbsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const int K, const double *A, const int lda, double *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dtbsv(order, Uplo, TransA, Diag, N, K, A, lda, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dtpmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const double *Ap, double *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dtpmv(order, Uplo, TransA, Diag, N, Ap, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dtpsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const double *Ap, double *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dtpsv(order, Uplo, TransA, Diag, N, Ap, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dtrmm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const double alpha, const double *A, const int lda, double *B, const int ldb, float *l3_buf, float *l2_buf_loc)
@@ -1318,6 +722,8 @@ void cblas_dtrmm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side
pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
+ bli_mem_init();
+
#pragma omp parallel
{
__cache_l1d_flush();
@@ -1334,19 +740,7 @@ void cblas_dtrmm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side
void cblas_dtrmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const double *A, const int lda, double *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dtrmv(order, Uplo, TransA, Diag, N, A, lda, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dtrsm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const double alpha, const double *A, const int lda, double *B, const int ldb, float *l3_buf, float *l2_buf_loc)
@@ -1363,6 +757,8 @@ void cblas_dtrsm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side
pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
+ bli_mem_init();
+
#pragma omp parallel
{
__cache_l1d_flush();
@@ -1379,257 +775,77 @@ void cblas_dtrsm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side
void cblas_dtrsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const double *A, const int lda, double *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dtrsv(order, Uplo, TransA, Diag, N, A, lda, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dzasum_facade(const int N, const void *X, const int incX, double *retval)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
*retval = cblas_dzasum(N, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dznrm2_facade(const int N, const void *X, const int incX, double *retval)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
*retval = cblas_dznrm2(N, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_icamax_facade(const int N, const void *X, const int incX, CBLAS_INDEX *retval)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
*retval = cblas_icamax(N, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_idamax_facade(const int N, const double *X, const int incX, CBLAS_INDEX *retval)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
*retval = cblas_idamax(N, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_isamax_facade(const int N, const float *X, const int incX, CBLAS_INDEX *retval)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
*retval = cblas_isamax(N, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_izamax_facade(const int N, const void *X, const int incX, CBLAS_INDEX *retval)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
*retval = cblas_izamax(N, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_sasum_facade(const int N, const float *X, const int incX, float *retval)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
*retval = cblas_sasum(N, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_saxpy_facade(const int N, const float alpha, const float *X, const int incX, float *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_saxpy(N, alpha, X, incX, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_scasum_facade(const int N, const void *X, const int incX, float *retval)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
*retval = cblas_scasum(N, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_scnrm2_facade(const int N, const void *X, const int incX, float *retval)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
*retval = cblas_scnrm2(N, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_scopy_facade(const int N, const float *X, const int incX, float *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_scopy(N, X, incX, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_sdot_facade(const int N, const float *X, const int incX, const float *Y, const int incY, float *retval)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
*retval = cblas_sdot(N, X, incX, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_sdsdot_facade(const int N, const float alpha, const float *X, const int incX, const float *Y, const int incY, float *retval)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
*retval = cblas_sdsdot(N, alpha, X, incX, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_sgbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, const int KL, const int KU, const float alpha, const float *A, const int lda, const float *X, const int incX, const float beta, float *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_sgbmv(order, TransA, M, N, KL, KU, alpha, A, lda, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_sgemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const float alpha, const float *A, const int lda, const float *B, const int ldb, const float beta, float *C, const int ldc, float *l3_buf, float *l2_buf_loc)
@@ -1646,6 +862,8 @@ void cblas_sgemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE
pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
+ bli_mem_init();
+
#pragma omp parallel
{
__cache_l1d_flush();
@@ -1662,223 +880,67 @@ void cblas_sgemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE
void cblas_sgemv_facade(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, const float alpha, const float *A, const int lda, const float *X, const int incX, const float beta, float *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_sgemv(order, TransA, M, N, alpha, A, lda, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_sger_facade(const enum CBLAS_ORDER order, const int M, const int N, const float alpha, const float *X, const int incX, const float *Y, const int incY, float *A, const int lda)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_sger(order, M, N, alpha, X, incX, Y, incY, A, lda);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_snrm2_facade(const int N, const float *X, const int incX, float *retval)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
*retval = cblas_snrm2(N, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_srot_facade(const int N, float *X, const int incX, float *Y, const int incY, const float c, const float s)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_srot(N, X, incX, Y, incY, c, s);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_srotg_facade(float *a, float *b, float *c, float *s)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_srotg(a, b, c, s);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_srotm_facade(const int N, float *X, const int incX, float *Y, const int incY, const float *P)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_srotm(N, X, incX, Y, incY, P);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_srotmg_facade(float *d1, float *d2, float *b1, const float b2, float *P)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_srotmg(d1, d2, b1, b2, P);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_ssbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const int K, const float alpha, const float *A, const int lda, const float *X, const int incX, const float beta, float *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_ssbmv(order, Uplo, N, K, alpha, A, lda, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_sscal_facade(const int N, const float alpha, float *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_sscal(N, alpha, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_sspmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const float alpha, const float *Ap, const float *X, const int incX, const float beta, float *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_sspmv(order, Uplo, N, alpha, Ap, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_sspr_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const float alpha, const float *X, const int incX, float *Ap)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_sspr(order, Uplo, N, alpha, X, incX, Ap);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_sspr2_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const float alpha, const float *X, const int incX, const float *Y, const int incY, float *A)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_sspr2(order, Uplo, N, alpha, X, incX, Y, incY, A);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_sswap_facade(const int N, float *X, const int incX, float *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_sswap(N, X, incX, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_ssymm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, const float alpha, const float *A, const int lda, const float *B, const int ldb, const float beta, float *C, const int ldc, float *l3_buf, float *l2_buf_loc)
@@ -1895,6 +957,8 @@ void cblas_ssymm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side
pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
+ bli_mem_init();
+
#pragma omp parallel
{
__cache_l1d_flush();
@@ -1909,55 +973,19 @@ void cblas_ssymm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side
}
}
-void cblas_ssymv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const float alpha, const float *A, const int lda, const float *X, const int incX, const float beta, float *Y, const int incY)
-{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
- cblas_ssymv(order, Uplo, N, alpha, A, lda, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
-}
-
+void cblas_ssymv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const float alpha, const float *A, const int lda, const float *X, const int incX, const float beta, float *Y, const int incY)
+{
+ cblas_ssymv(order, Uplo, N, alpha, A, lda, X, incX, beta, Y, incY);
+}
+
void cblas_ssyr_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const float alpha, const float *X, const int incX, float *A, const int lda)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_ssyr(order, Uplo, N, alpha, X, incX, A, lda);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_ssyr2_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const float alpha, const float *X, const int incX, const float *Y, const int incY, float *A, const int lda)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_ssyr2(order, Uplo, N, alpha, X, incX, Y, incY, A, lda);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_ssyr2k_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const float alpha, const float *A, const int lda, const float *B, const int ldb, const float beta, float *C, const int ldc, float *l3_buf, float *l2_buf_loc)
@@ -1974,6 +1002,8 @@ void cblas_ssyr2k_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Upl
pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
+ bli_mem_init();
+
#pragma omp parallel
{
__cache_l1d_flush();
@@ -2002,6 +1032,8 @@ void cblas_ssyrk_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo
pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
+ bli_mem_init();
+
#pragma omp parallel
{
__cache_l1d_flush();
@@ -2018,70 +1050,22 @@ void cblas_ssyrk_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo
void cblas_stbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const int K, const float *A, const int lda, float *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_stbmv(order, Uplo, TransA, Diag, N, K, A, lda, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_stbsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const int K, const float *A, const int lda, float *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_stbsv(order, Uplo, TransA, Diag, N, K, A, lda, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_stpmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const float *Ap, float *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_stpmv(order, Uplo, TransA, Diag, N, Ap, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_stpsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const float *Ap, float *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_stpsv(order, Uplo, TransA, Diag, N, Ap, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_strmm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const float alpha, const float *A, const int lda, float *B, const int ldb, float *l3_buf, float *l2_buf_loc)
@@ -2098,6 +1082,8 @@ void cblas_strmm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side
pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
+ bli_mem_init();
+
#pragma omp parallel
{
__cache_l1d_flush();
@@ -2114,19 +1100,7 @@ void cblas_strmm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side
void cblas_strmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const float *A, const int lda, float *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_strmv(order, Uplo, TransA, Diag, N, A, lda, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_strsm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const float alpha, const float *A, const int lda, float *B, const int ldb, float *l3_buf, float *l2_buf_loc)
@@ -2143,6 +1117,8 @@ void cblas_strsm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side
pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
+ bli_mem_init();
+
#pragma omp parallel
{
__cache_l1d_flush();
@@ -2159,138 +1135,42 @@ void cblas_strsm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side
void cblas_strsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const float *A, const int lda, float *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_strsv(order, Uplo, TransA, Diag, N, A, lda, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_xerbla_facade(int p, const char *rout, const char *form)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_xerbla(p, rout, form);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_zaxpy_facade(const int N, const void *alpha, const void *X, const int incX, void *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zaxpy(N, alpha, X, incX, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_zcopy_facade(const int N, const void *X, const int incX, void *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zcopy(N, X, incX, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_zdotc_sub_facade(const int N, const void *X, const int incX, const void *Y, const int incY, void *dotc)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zdotc_sub(N, X, incX, Y, incY, dotc);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_zdotu_sub_facade(const int N, const void *X, const int incX, const void *Y, const int incY, void *dotu)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zdotu_sub(N, X, incX, Y, incY, dotu);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_zdscal_facade(const int N, const double alpha, void *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zdscal(N, alpha, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_zgbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, const int KL, const int KU, const void *alpha, const void *A, const int lda, const void *X, const int incX, const void *beta, void *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zgbmv(order, TransA, M, N, KL, KU, alpha, A, lda, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_zgemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc, float *l3_buf, float *l2_buf_loc)
@@ -2307,6 +1187,8 @@ void cblas_zgemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE
pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
+ bli_mem_init();
+
#pragma omp parallel
{
__cache_l1d_flush();
@@ -2323,70 +1205,22 @@ void cblas_zgemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE
void cblas_zgemv_facade(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, const void *alpha, const void *A, const int lda, const void *X, const int incX, const void *beta, void *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zgemv(order, TransA, M, N, alpha, A, lda, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_zgerc_facade(const enum CBLAS_ORDER order, const int M, const int N, const void *alpha, const void *X, const int incX, const void *Y, const int incY, void *A, const int lda)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zgerc(order, M, N, alpha, X, incX, Y, incY, A, lda);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_zgeru_facade(const enum CBLAS_ORDER order, const int M, const int N, const void *alpha, const void *X, const int incX, const void *Y, const int incY, void *A, const int lda)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zgeru(order, M, N, alpha, X, incX, Y, incY, A, lda);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_zhbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const int K, const void *alpha, const void *A, const int lda, const void *X, const int incX, const void *beta, void *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zhbmv(order, Uplo, N, K, alpha, A, lda, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_zhemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc, float *l3_buf, float *l2_buf_loc)
@@ -2403,6 +1237,8 @@ void cblas_zhemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side
pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
+ bli_mem_init();
+
#pragma omp parallel
{
__cache_l1d_flush();
@@ -2419,53 +1255,17 @@ void cblas_zhemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side
void cblas_zhemv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const void *alpha, const void *A, const int lda, const void *X, const int incX, const void *beta, void *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zhemv(order, Uplo, N, alpha, A, lda, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_zher_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const double alpha, const void *X, const int incX, void *A, const int lda)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zher(order, Uplo, N, alpha, X, incX, A, lda);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_zher2_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const void *alpha, const void *X, const int incX, const void *Y, const int incY, void *A, const int lda)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zher2(order, Uplo, N, alpha, X, incX, Y, incY, A, lda);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_zher2k_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const double beta, void *C, const int ldc, float *l3_buf, float *l2_buf_loc)
@@ -2482,6 +1282,8 @@ void cblas_zher2k_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Upl
pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
+ bli_mem_init();
+
#pragma omp parallel
{
__cache_l1d_flush();
@@ -2510,6 +1312,8 @@ void cblas_zherk_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo
pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
+ bli_mem_init();
+
#pragma omp parallel
{
__cache_l1d_flush();
@@ -2526,104 +1330,32 @@ void cblas_zherk_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo
void cblas_zhpmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const void *alpha, const void *Ap, const void *X, const int incX, const void *beta, void *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zhpmv(order, Uplo, N, alpha, Ap, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_zhpr_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const double alpha, const void *X, const int incX, void *A)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zhpr(order, Uplo, N, alpha, X, incX, A);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_zhpr2_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const void *alpha, const void *X, const int incX, const void *Y, const int incY, void *Ap)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zhpr2(order, Uplo, N, alpha, X, incX, Y, incY, Ap);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_zrotg_facade(void *a, void *b, double *c, void *s)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zrotg(a, b, c, s);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_zscal_facade(const int N, const void *alpha, void *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zscal(N, alpha, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_zswap_facade(const int N, void *X, const int incX, void *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zswap(N, X, incX, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_zsymm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc, float *l3_buf, float *l2_buf_loc)
@@ -2640,6 +1372,8 @@ void cblas_zsymm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side
pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
+ bli_mem_init();
+
#pragma omp parallel
{
__cache_l1d_flush();
@@ -2668,6 +1402,8 @@ void cblas_zsyr2k_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Upl
pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
+ bli_mem_init();
+
#pragma omp parallel
{
__cache_l1d_flush();
@@ -2696,6 +1432,8 @@ void cblas_zsyrk_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo
pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
+ bli_mem_init();
+
#pragma omp parallel
{
__cache_l1d_flush();
@@ -2712,70 +1450,22 @@ void cblas_zsyrk_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo
void cblas_ztbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const int K, const void *A, const int lda, void *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_ztbmv(order, Uplo, TransA, Diag, N, K, A, lda, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_ztbsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const int K, const void *A, const int lda, void *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_ztbsv(order, Uplo, TransA, Diag, N, K, A, lda, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_ztpmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const void *Ap, void *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_ztpmv(order, Uplo, TransA, Diag, N, Ap, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_ztpsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const void *Ap, void *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_ztpsv(order, Uplo, TransA, Diag, N, Ap, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_ztrmm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const void *alpha, const void *A, const int lda, void *B, const int ldb, float *l3_buf, float *l2_buf_loc)
@@ -2792,6 +1482,8 @@ void cblas_ztrmm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side
pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
+ bli_mem_init();
+
#pragma omp parallel
{
__cache_l1d_flush();
@@ -2808,19 +1500,7 @@ void cblas_ztrmm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side
void cblas_ztrmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const void *A, const int lda, void *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_ztrmv(order, Uplo, TransA, Diag, N, A, lda, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_ztrsm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const void *alpha, const void *A, const int lda, void *B, const int ldb, float *l3_buf, float *l2_buf_loc)
@@ -2837,6 +1517,8 @@ void cblas_ztrsm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side
pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
+ bli_mem_init();
+
#pragma omp parallel
{
__cache_l1d_flush();
@@ -2853,18 +1535,6 @@ void cblas_ztrsm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side
void cblas_ztrsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const void *A, const int lda, void *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_ztrsv(order, Uplo, TransA, Diag, N, A, lda, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
index 2af9c66bb289f3c7b7fd3fb5ebecebede04e22af..bdf2fe87113ea8274de6315575d11b7e02ac84db 100644 (file)
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_caxpy");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_caxpy");
- __real_cblas_caxpy(N,alpha,X,incX,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_caxpy", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_caxpy");
+ __real_cblas_caxpy(N,alpha,X,incX,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_caxpy", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_caxpy");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_caxpy");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_caxpy");
index 4458a8428c256bcc5ec52ac49f23b716fb4c8907..fff48bf88663331db0ff00d7fded32cd3d793b07 100644 (file)
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_ccopy");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ccopy");
- __real_cblas_ccopy(N,X,incX,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ccopy", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ccopy");
+ __real_cblas_ccopy(N,X,incX,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ccopy", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ccopy");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ccopy");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_ccopy");
diff --git a/blasblisacc/src/ti_cblas_cblas_cdotc_sub.c b/blasblisacc/src/ti_cblas_cblas_cdotc_sub.c
index 730493ab1d33db83f82e64e998bdecd9f979192c..c2f00cfdadd2499b3e5e85c3b6f0d9787362d91e 100644 (file)
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_cdotc_sub");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cdotc_sub");
- __real_cblas_cdotc_sub(N,X,incX,Y,incY,dotc);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cdotc_sub", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cdotc_sub");
+ __real_cblas_cdotc_sub(N,X,incX,Y,incY,dotc);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cdotc_sub", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cdotc_sub");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cdotc_sub");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_cdotc_sub");
diff --git a/blasblisacc/src/ti_cblas_cblas_cdotu_sub.c b/blasblisacc/src/ti_cblas_cblas_cdotu_sub.c
index 8f795c7306faa23fef1358d10c2d73c51855ee14..025ac50d4cd6260361950790e34760497c9a0cc6 100644 (file)
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_cdotu_sub");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cdotu_sub");
- __real_cblas_cdotu_sub(N,X,incX,Y,incY,dotu);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cdotu_sub", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cdotu_sub");
+ __real_cblas_cdotu_sub(N,X,incX,Y,incY,dotu);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cdotu_sub", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cdotu_sub");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cdotu_sub");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_cdotu_sub");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_cdotu_sub", (float) clock_diff);
return ;
index 954148f3cd5a81443707ac8cf6dc37fe80a79a3f..e45a04f5be10f3016f186e7089d165956a20a642 100644 (file)
@@ -44,20 +44,19 @@ void cblas_cgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_cgbmv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cgbmv");
- __real_cblas_cgbmv(order,TransA,M,N,KL,KU,alpha,A,lda,X,incX,beta,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cgbmv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cgbmv");
+ __real_cblas_cgbmv(order,TransA,M,N,KL,KU,alpha,A,lda,X,incX,beta,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cgbmv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cgbmv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cgbmv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -208,6 +206,7 @@ void cblas_cgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_cgbmv");
index a8edae8b8dbce36fd6af685df6deff9a5b800d3b..4dc98d9cb7da98b84ffe177a172e053966af4bec 100644 (file)
@@ -44,20 +44,19 @@ void cblas_cgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_cgemm");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!cgemm_offload_dsp(Order,M,N,K)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cgemm");
- __real_cblas_cgemm(Order,TransA,TransB,M,N,K,alpha,A,lda,B,ldb,beta,C,ldc);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cgemm", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!cgemm_offload_dsp(Order,M,N,K)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cgemm");
+ __real_cblas_cgemm(Order,TransA,TransB,M,N,K,alpha,A,lda,B,ldb,beta,C,ldc);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cgemm", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cgemm");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cgemm");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -229,6 +227,7 @@ void cblas_cgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_mem_free(msmc_ptr);
ti_cblas_delete_kernel(__K);
index f6d7f9b0f4bc663bc6945bcda65a5d291131e9c4..6c58f7a51567c961fb16bc93de4298ef2107e2bc 100644 (file)
@@ -44,20 +44,19 @@ void cblas_cgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_cgemv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cgemv");
- __real_cblas_cgemv(order,TransA,M,N,alpha,A,lda,X,incX,beta,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cgemv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cgemv");
+ __real_cblas_cgemv(order,TransA,M,N,alpha,A,lda,X,incX,beta,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cgemv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cgemv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cgemv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -196,6 +194,7 @@ void cblas_cgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_cgemv");
index f8776e95f76acbee95d0bfdd7ba28277edece224..b66278317b69046e68bc40c2da64dcf040e1db31 100644 (file)
@@ -44,20 +44,19 @@ void cblas_cgerc(const enum CBLAS_ORDER order, const int M, const int N, const v
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_cgerc");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cgerc");
- __real_cblas_cgerc(order,M,N,alpha,X,incX,Y,incY,A,lda);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cgerc", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cgerc");
+ __real_cblas_cgerc(order,M,N,alpha,X,incX,Y,incY,A,lda);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cgerc", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cgerc");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cgerc");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -179,6 +177,7 @@ void cblas_cgerc(const enum CBLAS_ORDER order, const int M, const int N, const v
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_cgerc");
index 087b519c40b6d7f7ac6dd4f1af24e23d5dc20668..011a1ffce4e7a799b4333cc2ba4ebd208cefb706 100644 (file)
@@ -44,20 +44,19 @@ void cblas_cgeru(const enum CBLAS_ORDER order, const int M, const int N, const v
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_cgeru");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cgeru");
- __real_cblas_cgeru(order,M,N,alpha,X,incX,Y,incY,A,lda);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cgeru", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cgeru");
+ __real_cblas_cgeru(order,M,N,alpha,X,incX,Y,incY,A,lda);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cgeru", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cgeru");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cgeru");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -179,7 +177,9 @@ void cblas_cgeru(const enum CBLAS_ORDER order, const int M, const int N, const v
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_cgeru");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_cgeru", (float) clock_diff);
return ;
index 8c264a868db44bee88fd041e28550001b25dc381..34f98a716302c4677fb363925d3bce530d434523 100644 (file)
@@ -44,20 +44,19 @@ void cblas_chbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_chbmv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_chbmv");
- __real_cblas_chbmv(order,Uplo,N,K,alpha,A,lda,X,incX,beta,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_chbmv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_chbmv");
+ __real_cblas_chbmv(order,Uplo,N,K,alpha,A,lda,X,incX,beta,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_chbmv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_chbmv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_chbmv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -196,7 +194,9 @@ void cblas_chbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_chbmv");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_chbmv", (float) clock_diff);
return ;
index d48091a5f5f2170178aba7278739b7f960354a54..52648550fa727b76ffcc4e78f632a385af5a8078 100644 (file)
@@ -44,20 +44,19 @@ void cblas_chemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_chemm");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!chemm_offload_dsp(Order,Side,M,N)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_chemm");
- __real_cblas_chemm(Order,Side,Uplo,M,N,alpha,A,lda,B,ldb,beta,C,ldc);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_chemm", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!chemm_offload_dsp(Order,Side,M,N)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_chemm");
+ __real_cblas_chemm(Order,Side,Uplo,M,N,alpha,A,lda,B,ldb,beta,C,ldc);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_chemm", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_chemm");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_chemm");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -190,12 +188,10 @@ void cblas_chemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
#endif
void *msmc_ptr;
-
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
-
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(13, buf_MSMC);
#else
@@ -225,6 +221,7 @@ void cblas_chemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_mem_free(msmc_ptr);
ti_cblas_delete_kernel(__K);
index f5a16d6bc462786122a657b43809c944dac7416b..09bf9bb9d2906ab6a98a5de1542773d781291013 100644 (file)
@@ -44,20 +44,19 @@ void cblas_chemv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_chemv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_chemv");
- __real_cblas_chemv(order,Uplo,N,alpha,A,lda,X,incX,beta,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_chemv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_chemv");
+ __real_cblas_chemv(order,Uplo,N,alpha,A,lda,X,incX,beta,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_chemv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_chemv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_chemv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -190,6 +188,7 @@ void cblas_chemv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_chemv");
index f42e4bcbb6a156f1df643d0b0697c24cd6edaa5a..8f7c90152d9d87b018b36b980e4993f65b5a8538 100644 (file)
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_cher");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cher");
- __real_cblas_cher(order,Uplo,N,alpha,X,incX,A,lda);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cher", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cher");
+ __real_cblas_cher(order,Uplo,N,alpha,X,incX,A,lda);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cher", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cher");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cher");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_cher");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_cher", (float) clock_diff);
return ;
index 00eb4aa2a31e311d01caa36a4b262bd7434d77b7..f07e96bc354773a7e5281fecc390e6098c831098 100644 (file)
@@ -44,20 +44,19 @@ void cblas_cher2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_cher2");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cher2");
- __real_cblas_cher2(order,Uplo,N,alpha,X,incX,Y,incY,A,lda);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cher2", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cher2");
+ __real_cblas_cher2(order,Uplo,N,alpha,X,incX,Y,incY,A,lda);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cher2", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cher2");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cher2");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -179,7 +177,9 @@ void cblas_cher2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_cher2");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_cher2", (float) clock_diff);
return ;
index b36127567e6967788e3b0d0592ca3837236873a8..4fcf2db13c09ab47a9f860bea50afdbc51e7c45b 100644 (file)
@@ -44,20 +44,19 @@ void cblas_cher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, cons
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_cher2k");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!cher2k_offload_dsp(Order,N,K)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cher2k");
- __real_cblas_cher2k(Order,Uplo,Trans,N,K,alpha,A,lda,B,ldb,beta,C,ldc);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cher2k", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!cher2k_offload_dsp(Order,N,K)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cher2k");
+ __real_cblas_cher2k(Order,Uplo,Trans,N,K,alpha,A,lda,B,ldb,beta,C,ldc);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cher2k", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cher2k");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cher2k");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -183,11 +181,12 @@ void cblas_cher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, cons
#else
err |= clSetKernelArg(__K, 12, sizeof(ldc), &ldc);
#endif
+
void *msmc_ptr;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(13, buf_MSMC);
#else
@@ -219,6 +218,7 @@ void cblas_cher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, cons
#endif
ti_cblas_mem_free(msmc_ptr);
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_cher2k");
index ed3f9e82c70bba64fd195c10e3bffd2faea7e484..b3f6c6a2e6becae9885f9682755e56ba82fec19c 100644 (file)
@@ -44,20 +44,19 @@ void cblas_cherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_cherk");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!cherk_offload_dsp(Order,N,K)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cherk");
- __real_cblas_cherk(Order,Uplo,Trans,N,K,alpha,A,lda,beta,C,ldc);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cherk", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!cherk_offload_dsp(Order,N,K)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cherk");
+ __real_cblas_cherk(Order,Uplo,Trans,N,K,alpha,A,lda,beta,C,ldc);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cherk", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cherk");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cherk");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -161,8 +159,8 @@ void cblas_cherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const
void *msmc_ptr;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(11, buf_MSMC);
#else
@@ -192,6 +190,7 @@ void cblas_cherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_mem_free(msmc_ptr);
ti_cblas_delete_kernel(__K);
index 1440c48809a0f924c01cae29c1684b153ab4cc85..dc12e2f426c21669d32ae597238b6774223084f7 100644 (file)
@@ -44,20 +44,19 @@ void cblas_chpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_chpmv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_chpmv");
- __real_cblas_chpmv(order,Uplo,N,alpha,Ap,X,incX,beta,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_chpmv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_chpmv");
+ __real_cblas_chpmv(order,Uplo,N,alpha,Ap,X,incX,beta,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_chpmv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_chpmv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_chpmv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -184,7 +182,9 @@ void cblas_chpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_chpmv");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_chpmv", (float) clock_diff);
return ;
index 3209139569cc6768189af8591af8abaf5925f177..0b6e9c5a9175ac8da3cc91d9e49ef48888c4d8c7 100644 (file)
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_chpr");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_chpr");
- __real_cblas_chpr(order,Uplo,N,alpha,X,incX,A);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_chpr", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_chpr");
+ __real_cblas_chpr(order,Uplo,N,alpha,X,incX,A);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_chpr", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_chpr");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_chpr");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_chpr");
index 21111f2a03aaef1469068ad3c6da249f2db5a136..3a92c94eb70b0eca996282cdd7988ee79eab03e6 100644 (file)
@@ -44,20 +44,19 @@ void cblas_chpr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_chpr2");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_chpr2");
- __real_cblas_chpr2(order,Uplo,N,alpha,X,incX,Y,incY,Ap);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_chpr2", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_chpr2");
+ __real_cblas_chpr2(order,Uplo,N,alpha,X,incX,Y,incY,Ap);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_chpr2", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_chpr2");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_chpr2");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -173,6 +171,7 @@ void cblas_chpr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_chpr2");
index 1b09cfb517b53af7edbdcbf393e24e1ac9b9686b..3cc378cf804aaab705fc550412f3afd4cd9e2ff0 100644 (file)
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_crotg");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_crotg");
- __real_cblas_crotg(a,b,c,s);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_crotg", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_crotg");
+ __real_cblas_crotg(a,b,c,s);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_crotg", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_crotg");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_crotg");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
Buffer buf_a(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(float), (void *)a);
__K->setArg(0, buf_a);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_crotg");
index 1c5e49ad4f03bf0d121637bb786e6bf7165c1d3e..a80951b13c73f2e3f84a2654f1e42f43d871d5d5 100644 (file)
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_cscal");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cscal");
- __real_cblas_cscal(N,alpha,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cscal", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cscal");
+ __real_cblas_cscal(N,alpha,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cscal", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cscal");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cscal");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_cscal");
index 1e28a64d0419d6bdeffa67add4ec526c94cae935..22819ea243e394789c52af3fdf5b1b749e947526 100644 (file)
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_csscal");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_csscal");
- __real_cblas_csscal(N,alpha,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_csscal", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_csscal");
+ __real_cblas_csscal(N,alpha,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_csscal", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_csscal");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_csscal");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_csscal");
index 9c2f0dd4275fe1d14e7c3bee794855ef9eaaed7b..75db0f145055d3bb53b22182ae362bdda63162db 100644 (file)
/* Do an init on first use */
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_cswap");
+
/* OpenCL cannot deal with overlapping memory regions. This is an issue when you
* are trying to swap two rows of a matrix, where the matrix is column major. Hence,
* the offload of this routine to the DSP is disabled.
*/
#ifndef TI_CBLAS_SWAP_ENABLE_OFFLOAD
TI_CBLAS_PROFILE_START();
-
TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cswap");
- __real_cblas_cswap(N,X,incX,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cswap", (float) clock_diff);
+ __real_cblas_cswap(N,X,incX,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cswap", (float) clock_diff);
return ;
#else
- TI_CBLAS_PROFILE_START();
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cswap");
- __real_cblas_cswap(N,X,incX,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cswap", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
-
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cswap");
+ __real_cblas_cswap(N,X,incX,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cswap", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cswap");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cswap");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_cswap");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_cswap", (float) clock_diff);
return ;
index f48c129c92044f8c79b3a22df0ab74818ca96f22..80de3a7a60d5a08393b28e8e1827a52db04e93f2 100644 (file)
@@ -44,20 +44,19 @@ void cblas_csymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_csymm");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!csymm_offload_dsp(Order,Side,M,N)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_csymm");
- __real_cblas_csymm(Order,Side,Uplo,M,N,alpha,A,lda,B,ldb,beta,C,ldc);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_csymm", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!csymm_offload_dsp(Order,Side,M,N)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_csymm");
+ __real_cblas_csymm(Order,Side,Uplo,M,N,alpha,A,lda,B,ldb,beta,C,ldc);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_csymm", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_csymm");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_csymm");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -188,11 +186,12 @@ void cblas_csymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
#else
err |= clSetKernelArg(__K, 12, sizeof(ldc), &ldc);
#endif
+
void *msmc_ptr;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(13, buf_MSMC);
#else
@@ -222,6 +221,7 @@ void cblas_csymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_mem_free(msmc_ptr);
ti_cblas_delete_kernel(__K);
index 9bec336cff81b3d30ec460d49b44cdc064c36781..a195349e15737b938bd346257a0b6ebe4c058930 100644 (file)
@@ -44,20 +44,19 @@ void cblas_csyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, cons
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_csyr2k");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!csyr2k_offload_dsp(Order,N,K)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_csyr2k");
- __real_cblas_csyr2k(Order,Uplo,Trans,N,K,alpha,A,lda,B,ldb,beta,C,ldc);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_csyr2k", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!csyr2k_offload_dsp(Order,N,K)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_csyr2k");
+ __real_cblas_csyr2k(Order,Uplo,Trans,N,K,alpha,A,lda,B,ldb,beta,C,ldc);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_csyr2k", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_csyr2k");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_csyr2k");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -188,11 +186,12 @@ void cblas_csyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, cons
#else
err |= clSetKernelArg(__K, 12, sizeof(ldc), &ldc);
#endif
+
void *msmc_ptr;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(13, buf_MSMC);
#else
@@ -222,6 +221,7 @@ void cblas_csyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, cons
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_mem_free(msmc_ptr);
ti_cblas_delete_kernel(__K);
index dafffb44c2c873cbd958fa8c1fc1c55652f17ce1..ecfcabc57bdbe5c95c77a03d8da0600f85677fa7 100644 (file)
@@ -44,20 +44,19 @@ void cblas_csyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_csyrk");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!csyrk_offload_dsp(Order,N,K)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_csyrk");
- __real_cblas_csyrk(Order,Uplo,Trans,N,K,alpha,A,lda,beta,C,ldc);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_csyrk", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!csyrk_offload_dsp(Order,N,K)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_csyrk");
+ __real_cblas_csyrk(Order,Uplo,Trans,N,K,alpha,A,lda,beta,C,ldc);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_csyrk", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_csyrk");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_csyrk");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -167,11 +165,12 @@ void cblas_csyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const
#else
err |= clSetKernelArg(__K, 10, sizeof(ldc), &ldc);
#endif
+
void *msmc_ptr;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(11, buf_MSMC);
#else
@@ -201,6 +200,7 @@ void cblas_csyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_mem_free(msmc_ptr);
ti_cblas_delete_kernel(__K);
index 6ae8f35c60c70c22f811ec6cc1c9b1e0627eefbd..00d7b76eee045ca9a72dd7043edba96c32f25a99 100644 (file)
@@ -44,20 +44,19 @@ void cblas_ctbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_ctbmv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ctbmv");
- __real_cblas_ctbmv(order,Uplo,TransA,Diag,N,K,A,lda,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ctbmv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ctbmv");
+ __real_cblas_ctbmv(order,Uplo,TransA,Diag,N,K,A,lda,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ctbmv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ctbmv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ctbmv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -165,7 +163,9 @@ void cblas_ctbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_ctbmv");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_ctbmv", (float) clock_diff);
return ;
index 831f84e648b6353ff3d2f2d60b913482366cb221..93b84be7d32f86a47c5641eafc7a776894b94ec2 100644 (file)
@@ -44,20 +44,19 @@ void cblas_ctbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_ctbsv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ctbsv");
- __real_cblas_ctbsv(order,Uplo,TransA,Diag,N,K,A,lda,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ctbsv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ctbsv");
+ __real_cblas_ctbsv(order,Uplo,TransA,Diag,N,K,A,lda,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ctbsv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ctbsv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ctbsv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -165,7 +163,9 @@ void cblas_ctbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_ctbsv");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_ctbsv", (float) clock_diff);
return ;
index 0dc6106a5282b0108921a40d93498af4497449e4..da5fc3f233c3ab76c5620c747903ad22e7570058 100644 (file)
@@ -44,20 +44,19 @@ void cblas_ctpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_ctpmv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ctpmv");
- __real_cblas_ctpmv(order,Uplo,TransA,Diag,N,Ap,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ctpmv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ctpmv");
+ __real_cblas_ctpmv(order,Uplo,TransA,Diag,N,Ap,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ctpmv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ctpmv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ctpmv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -111,7 +109,7 @@ void cblas_ctpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
size_bufAp = MAX(size_bufAp,1);
#ifdef __cplusplus
- Buffer buf_Ap(*ti_cblas_ocl_context, CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR, size_bufAp , (void *)Ap);
+ Buffer buf_Ap(*ti_cblas_ocl_context, CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR, size_bufAp, (void *)Ap);
__K->setArg(5, buf_Ap);
#else
cl_mem buf_Ap = clCreateBuffer(ti_cblas_ocl_context, CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR, size_bufAp, (void *)Ap, &err);
@@ -153,7 +151,9 @@ void cblas_ctpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_ctpmv");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_ctpmv", (float) clock_diff);
return ;
index cef43540830e87ae26253da1edb806887c840623..a1448002901dbd75848d81ad5eaa8b1365fb625d 100644 (file)
@@ -44,20 +44,19 @@ void cblas_ctpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_ctpsv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ctpsv");
- __real_cblas_ctpsv(order,Uplo,TransA,Diag,N,Ap,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ctpsv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ctpsv");
+ __real_cblas_ctpsv(order,Uplo,TransA,Diag,N,Ap,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ctpsv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ctpsv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ctpsv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -153,7 +151,9 @@ void cblas_ctpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_ctpsv");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_ctpsv", (float) clock_diff);
return ;
index dcb56943859e8b2c347f773bf3d29222212c0b95..1aeaa52c2c63e6dabbc3ba14313ae352333a3aa8 100644 (file)
@@ -44,20 +44,19 @@ void cblas_ctrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_ctrmm");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!ctrmm_offload_dsp(Order,Side,M,N)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ctrmm");
- __real_cblas_ctrmm(Order,Side,Uplo,TransA,Diag,M,N,alpha,A,lda,B,ldb);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ctrmm", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!ctrmm_offload_dsp(Order,Side,M,N)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ctrmm");
+ __real_cblas_ctrmm(Order,Side,Uplo,TransA,Diag,M,N,alpha,A,lda,B,ldb);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ctrmm", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ctrmm");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ctrmm");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -168,11 +166,12 @@ void cblas_ctrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
#else
err |= clSetKernelArg(__K, 11, sizeof(ldb), &ldb);
#endif
+
void *msmc_ptr;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(12, buf_MSMC);
#else
@@ -202,6 +201,7 @@ void cblas_ctrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_mem_free(msmc_ptr);
ti_cblas_delete_kernel(__K);
index ee423018f4fcbe362257aab0f4a883c8e91b2e9e..0b33200b8e48b83dfbc2fe92ddcf2c25a48286df 100644 (file)
@@ -44,20 +44,19 @@ void cblas_ctrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_ctrmv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ctrmv");
- __real_cblas_ctrmv(order,Uplo,TransA,Diag,N,A,lda,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ctrmv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ctrmv");
+ __real_cblas_ctrmv(order,Uplo,TransA,Diag,N,A,lda,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ctrmv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ctrmv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ctrmv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -159,7 +157,9 @@ void cblas_ctrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_ctrmv");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_ctrmv", (float) clock_diff);
return ;
index c9f4716dcac1b5d1d75312660d6b7edff8da92b7..6ea2cc2d76d62ee8a9e43507a483dd6cd973ec8e 100644 (file)
@@ -44,20 +44,19 @@ void cblas_ctrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_ctrsm");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!ctrsm_offload_dsp(Order,Side,M,N)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ctrsm");
- __real_cblas_ctrsm(Order,Side,Uplo,TransA,Diag,M,N,alpha,A,lda,B,ldb);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ctrsm", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!ctrsm_offload_dsp(Order,Side,M,N)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ctrsm");
+ __real_cblas_ctrsm(Order,Side,Uplo,TransA,Diag,M,N,alpha,A,lda,B,ldb);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ctrsm", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ctrsm");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ctrsm");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -168,11 +166,12 @@ void cblas_ctrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
#else
err |= clSetKernelArg(__K, 11, sizeof(ldb), &ldb);
#endif
+
void *msmc_ptr;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(12, buf_MSMC);
#else
@@ -202,7 +201,9 @@ void cblas_ctrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_mem_free(msmc_ptr);
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_ctrsm");
index d50ab5aae83d988bba853e5df9c93b8c2a7d56b1..8b697a5d3176c867498620cd02405a555652aa62 100644 (file)
@@ -44,20 +44,19 @@ void cblas_ctrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_ctrsv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ctrsv");
- __real_cblas_ctrsv(order,Uplo,TransA,Diag,N,A,lda,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ctrsv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ctrsv");
+ __real_cblas_ctrsv(order,Uplo,TransA,Diag,N,A,lda,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ctrsv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ctrsv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ctrsv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -159,7 +157,9 @@ void cblas_ctrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_ctrsv");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_ctrsv", (float) clock_diff);
return ;
index ae850f7feb719c3f95fdb48e8cf65b1eb6fcc2ae..ab0fb5f80d6458e53557af38d317c2704bb5ec80 100644 (file)
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dasum");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dasum");
- double rval = __real_cblas_dasum(N,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dasum", (float) clock_diff);
- return rval;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dasum");
+ double rval = __real_cblas_dasum(N,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dasum", (float) clock_diff);
+ return rval;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dasum");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dasum");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dasum");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_dasum", (float) clock_diff);
return retval;
index 7ce6a9956e7d98f77879f56fc7670b6a58a3eef1..7e99bb165b29bb928c62954d1e78d3041f34f800 100644 (file)
@@ -44,20 +44,19 @@ void cblas_daxpy(const int N, const double alpha, const double *X, const int inc
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_daxpy");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_daxpy");
- __real_cblas_daxpy(N,alpha,X,incX,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_daxpy", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_daxpy");
+ __real_cblas_daxpy(N,alpha,X,incX,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_daxpy", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_daxpy");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_daxpy");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
@@ -141,7 +139,9 @@ void cblas_daxpy(const int N, const double alpha, const double *X, const int inc
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_daxpy");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_daxpy", (float) clock_diff);
return ;
index 29ccdcc34f1a2e58ae1208768289c6e2775c2872..fb9790e8248dd5e65eab602d60d7a3d68d89ce1a 100644 (file)
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dcopy");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dcopy");
- __real_cblas_dcopy(N,X,incX,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dcopy", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dcopy");
+ __real_cblas_dcopy(N,X,incX,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dcopy", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dcopy");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dcopy");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dcopy");
index 0c1e9ac51a14d2d0febe8bc4ac890a66304e211d..719e82e3d2d5782b7d24da61c0b36cd32eee1bd5 100644 (file)
@@ -44,20 +44,19 @@ double cblas_ddot(const int N, const double *X, const int incX, const double *Y,
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_ddot");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ddot");
- double rval = __real_cblas_ddot(N,X,incX,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ddot", (float) clock_diff);
- return rval;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ddot");
+ double rval = __real_cblas_ddot(N,X,incX,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ddot", (float) clock_diff);
+ return rval;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ddot");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ddot");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
@@ -148,6 +146,7 @@ double cblas_ddot(const int N, const double *X, const int incX, const double *Y,
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_ddot");
index 7eda3f0a00b1d0f9e50ecf9ebb70e597191e8f57..468b5b4048a26492394ed51723d8db60b1ed2d6a 100644 (file)
@@ -44,20 +44,19 @@ void cblas_dgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dgbmv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dgbmv");
- __real_cblas_dgbmv(order,TransA,M,N,KL,KU,alpha,A,lda,X,incX,beta,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dgbmv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dgbmv");
+ __real_cblas_dgbmv(order,TransA,M,N,KL,KU,alpha,A,lda,X,incX,beta,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dgbmv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dgbmv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dgbmv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -198,6 +196,7 @@ void cblas_dgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dgbmv");
index 7ce8311b9f724f8547abdf54073cb583beb5beb5..976ec5a79f49f2771d33d795134fd5a2bd1ad457 100644 (file)
@@ -44,24 +44,21 @@ void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dgemm");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!dgemm_offload_dsp(Order,M,N,K)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dgemm");
- __real_cblas_dgemm(Order,TransA,TransB,M,N,K,alpha,A,lda,B,ldb,beta,C,ldc);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dgemm", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!dgemm_offload_dsp(Order,M,N,K)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dgemm");
+ __real_cblas_dgemm(Order,TransA,TransB,M,N,K,alpha,A,lda,B,ldb,beta,C,ldc);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dgemm", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dgemm");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dgemm");
/* Lookup kernel pointer from global table */
- void *msmc_ptr;
-
#ifdef __cplusplus
Event e;
Kernel* __K;
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -186,6 +182,8 @@ void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
#else
err |= clSetKernelArg(__K, 13, sizeof(ldc), &ldc);
#endif
+
+ void *msmc_ptr;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
@@ -193,8 +191,8 @@ void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
__K->setArg(14, buf_MSMC);
#else
- cl_mem buf_MSMC = clCreateBuffer(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr, &err);
//cl_mem buf_MSMC = clCreateBuffer(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE, NULL, &err);
+ cl_mem buf_MSMC = clCreateBuffer(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr, &err);
TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
err |= clSetKernelArg(__K, 14, sizeof(buf_MSMC), &buf_MSMC);
TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
@@ -219,7 +217,9 @@ void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_mem_free(msmc_ptr);
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dgemm");
index 617badbb63ab79e8bb4907dbbb12cf2d32123f32..97a1b056cb49cfcd8a3b76f8f41909726ce9faf3 100644 (file)
@@ -44,20 +44,19 @@ void cblas_dgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dgemv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dgemv");
- __real_cblas_dgemv(order,TransA,M,N,alpha,A,lda,X,incX,beta,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dgemv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dgemv");
+ __real_cblas_dgemv(order,TransA,M,N,alpha,A,lda,X,incX,beta,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dgemv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dgemv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dgemv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -186,7 +184,9 @@ void cblas_dgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dgemv");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_dgemv", (float) clock_diff);
return ;
index b49bc04452479e9f4e6ce137d722e3eb55aba704..4da84337102ea97ff4cc2cd6198e46eebd683eeb 100644 (file)
@@ -44,20 +44,19 @@ void cblas_dger(const enum CBLAS_ORDER order, const int M, const int N, const do
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dger");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dger");
- __real_cblas_dger(order,M,N,alpha,X,incX,Y,incY,A,lda);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dger", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dger");
+ __real_cblas_dger(order,M,N,alpha,X,incX,Y,incY,A,lda);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dger", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dger");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dger");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -174,6 +172,7 @@ void cblas_dger(const enum CBLAS_ORDER order, const int M, const int N, const do
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dger");
index cd816df0f8f1c29b46020347d55dba5ff24a4392..18eaddd236330460eed88f7515c065d47ddbd33f 100644 (file)
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dnrm2");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dnrm2");
- double rval = __real_cblas_dnrm2(N,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dnrm2", (float) clock_diff);
- return rval;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dnrm2");
+ double rval = __real_cblas_dnrm2(N,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dnrm2", (float) clock_diff);
+ return rval;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dnrm2");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dnrm2");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dnrm2");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_dnrm2", (float) clock_diff);
return retval;
index 3c1afc0173c6e6e6deba420caccfcdac91e3775c..0694e9d3496b6ad8437401477ebd064bd0a0b6c8 100644 (file)
@@ -44,20 +44,19 @@ void cblas_drot(const int N, double *X, const int incX, double *Y, const int inc
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_drot");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_drot");
- __real_cblas_drot(N,X,incX,Y,incY,c,s);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_drot", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_drot");
+ __real_cblas_drot(N,X,incX,Y,incY,c,s);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_drot", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_drot");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_drot");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
@@ -146,6 +144,7 @@ void cblas_drot(const int N, double *X, const int incX, double *Y, const int inc
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_drot");
index a2b5b5ea9771fa02a9a18808865c2e8230d5ba30..d152518c5678e24ebd4066ab76383af20903543d 100644 (file)
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_drotg");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_drotg");
- __real_cblas_drotg(a,b,c,s);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_drotg", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_drotg");
+ __real_cblas_drotg(a,b,c,s);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_drotg", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_drotg");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_drotg");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
Buffer buf_a(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(double), (void *)a);
__K->setArg(0, buf_a);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_drotg");
index 4fc4d24769439ccaf410e4e8431c5ce8cefa15ee..f8b95aabfaf385e2275e9b4e4c8a88213973405b 100644 (file)
@@ -44,20 +44,19 @@ void cblas_drotm(const int N, double *X, const int incX, double *Y, const int in
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_drotm");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_drotm");
- __real_cblas_drotm(N,X,incX,Y,incY,P);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_drotm", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_drotm");
+ __real_cblas_drotm(N,X,incX,Y,incY,P);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_drotm", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_drotm");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_drotm");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
@@ -145,6 +143,7 @@ void cblas_drotm(const int N, double *X, const int incX, double *Y, const int in
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_drotm");
index 0be15c2cb6cae87c02dd97ff55708d9cc4a40358..fb19c073a8f87fc945cde643b815c8a296046786 100644 (file)
@@ -44,20 +44,19 @@ void cblas_drotmg(double *d1, double *d2, double *b1, const double b2, double *P
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_drotmg");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_drotmg");
- __real_cblas_drotmg(d1,d2,b1,b2,P);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_drotmg", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_drotmg");
+ __real_cblas_drotmg(d1,d2,b1,b2,P);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_drotmg", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_drotmg");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_drotmg");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
Buffer buf_d1(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(double), (void *)d1);
__K->setArg(0, buf_d1);
@@ -134,7 +132,9 @@ void cblas_drotmg(double *d1, double *d2, double *b1, const double b2, double *P
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_drotmg");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_drotmg", (float) clock_diff);
return ;
index e1e77c5f080861a372932d7ad159fd7783c2fc04..bc02f21fe4a4a35e7030d541c1eef67fa00af920 100644 (file)
@@ -44,20 +44,19 @@ void cblas_dsbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dsbmv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dsbmv");
- __real_cblas_dsbmv(order,Uplo,N,K,alpha,A,lda,X,incX,beta,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dsbmv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dsbmv");
+ __real_cblas_dsbmv(order,Uplo,N,K,alpha,A,lda,X,incX,beta,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dsbmv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dsbmv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dsbmv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -186,7 +184,9 @@ void cblas_dsbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dsbmv");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_dsbmv", (float) clock_diff);
return ;
index d085915522bccc92b120795b3bd517485faeed8b..badc23089e235fc88ee7fd1defb8096babae1cd0 100644 (file)
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dscal");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dscal");
- __real_cblas_dscal(N,alpha,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dscal", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dscal");
+ __real_cblas_dscal(N,alpha,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dscal", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dscal");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dscal");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dscal");
index 9dd52b969ccc17dd90951f9e313e839b720f5995..1a03fd6d0f57f17da0c2ea6a0bd920585f87ab89 100644 (file)
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dsdot");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dsdot");
- double rval = __real_cblas_dsdot(N,X,incX,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dsdot", (float) clock_diff);
- return rval;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dsdot");
+ double rval = __real_cblas_dsdot(N,X,incX,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dsdot", (float) clock_diff);
+ return rval;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dsdot");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dsdot");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dsdot");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_dsdot", (float) clock_diff);
return retval;
index 36f91b96683f8386b8a0fa9163c0837b6de90c03..ba7eb80c39b54a0354928f3f35845d17b6333b53 100644 (file)
@@ -44,20 +44,19 @@ void cblas_dspmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dspmv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dspmv");
- __real_cblas_dspmv(order,Uplo,N,alpha,Ap,X,incX,beta,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dspmv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dspmv");
+ __real_cblas_dspmv(order,Uplo,N,alpha,Ap,X,incX,beta,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dspmv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dspmv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dspmv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -174,6 +172,7 @@ void cblas_dspmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dspmv");
index 06ef54e2ef99d709bde5493fde78b7e548a520ce..3462f3ff1f6a8825e97983527e39c8fa1d4fc435 100644 (file)
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dspr");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dspr");
- __real_cblas_dspr(order,Uplo,N,alpha,X,incX,Ap);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dspr", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dspr");
+ __real_cblas_dspr(order,Uplo,N,alpha,X,incX,Ap);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dspr", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dspr");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dspr");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dspr");
index cea52f7df8bfcefd5127826d4b569f7b082b070d..677f81330cf7f9085d9e015b0e790d222bba4d77 100644 (file)
@@ -44,20 +44,19 @@ void cblas_dspr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dspr2");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dspr2");
- __real_cblas_dspr2(order,Uplo,N,alpha,X,incX,Y,incY,A);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dspr2", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dspr2");
+ __real_cblas_dspr2(order,Uplo,N,alpha,X,incX,Y,incY,A);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dspr2", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dspr2");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dspr2");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -168,6 +166,7 @@ void cblas_dspr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dspr2");
index cd4cc27ae13c1d5278c2e9ec7598111a2f642ba4..9937fe09bb0b3e16c307d06d06aec8f4039da3cc 100644 (file)
/* Do an init on first use */
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dswap");
+
/* OpenCL cannot deal with overlapping memory regions. This is an issue when you
* are trying to swap two rows of a matrix, where the matrix is column major. Hence,
* the offload of this routine to the DSP is disabled.
@@ -50,25 +51,24 @@ void cblas_dswap(const int N, double *X, const int incX, double *Y, const int in
#ifndef TI_CBLAS_SWAP_ENABLE_OFFLOAD
TI_CBLAS_PROFILE_START();
TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dswap");
- __real_cblas_dswap(N,X,incX,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dswap", (float) clock_diff);
+ __real_cblas_dswap(N,X,incX,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dswap", (float) clock_diff);
return ;
#else
- TI_CBLAS_PROFILE_START();
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dswap");
- __real_cblas_dswap(N,X,incX,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dswap", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
-
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dswap");
+ __real_cblas_dswap(N,X,incX,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dswap", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dswap");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dswap");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
@@ -145,7 +144,9 @@ void cblas_dswap(const int N, double *X, const int incX, double *Y, const int in
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dswap");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_dswap", (float) clock_diff);
return ;
index 97e89a963629f5edfa1433eed1d99c9a9c1b140e..53f36330d70b59042bfbcc0fc66bf74ea497b052 100644 (file)
@@ -44,20 +44,19 @@ void cblas_dsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dsymm");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!dsymm_offload_dsp(Order,Side,M,N)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dsymm");
- __real_cblas_dsymm(Order,Side,Uplo,M,N,alpha,A,lda,B,ldb,beta,C,ldc);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dsymm", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!dsymm_offload_dsp(Order,Side,M,N)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dsymm");
+ __real_cblas_dsymm(Order,Side,Uplo,M,N,alpha,A,lda,B,ldb,beta,C,ldc);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dsymm", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dsymm");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dsymm");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -182,8 +180,8 @@ void cblas_dsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
void *msmc_ptr;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(13, buf_MSMC);
#else
@@ -213,7 +211,9 @@ void cblas_dsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_mem_free(msmc_ptr);
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dsymm");
index 966e2a3b7efaaf7b309a3b9d981845eaf6f1fbfe..498ab222229d40b9b87ca18c5eb2dbe07d3d6093 100644 (file)
@@ -44,20 +44,19 @@ void cblas_dsymv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dsymv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dsymv");
- __real_cblas_dsymv(order,Uplo,N,alpha,A,lda,X,incX,beta,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dsymv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dsymv");
+ __real_cblas_dsymv(order,Uplo,N,alpha,A,lda,X,incX,beta,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dsymv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dsymv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dsymv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -180,6 +178,7 @@ void cblas_dsymv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dsymv");
index ca5044fa5bcab19db1910ff3e86aac146d83f528..408976a370fadc1cfc9b965531867be77a697067 100644 (file)
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dsyr");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dsyr");
- __real_cblas_dsyr(order,Uplo,N,alpha,X,incX,A,lda);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dsyr", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dsyr");
+ __real_cblas_dsyr(order,Uplo,N,alpha,X,incX,A,lda);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dsyr", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dsyr");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dsyr");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dsyr");
index a0d366f914e55d2b2fd340da2824f2766b6b16bc..5a13d29a535f1cdb8fc12962e5f071acd20a0fa2 100644 (file)
@@ -44,20 +44,19 @@ void cblas_dsyr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dsyr2");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dsyr2");
- __real_cblas_dsyr2(order,Uplo,N,alpha,X,incX,Y,incY,A,lda);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dsyr2", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dsyr2");
+ __real_cblas_dsyr2(order,Uplo,N,alpha,X,incX,Y,incY,A,lda);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dsyr2", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dsyr2");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dsyr2");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -174,6 +172,9 @@ void cblas_dsyr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
+ ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dsyr2");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_dsyr2", (float) clock_diff);
return ;
index cea6055bd8ed8801d5fcf740724ec43f06bb12c9..4cf91631127f0d4c8046c7b124b4751aa46f9ef8 100644 (file)
@@ -44,20 +44,19 @@ void cblas_dsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, cons
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dsyr2k");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!dsyr2k_offload_dsp(Order,N,K)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dsyr2k");
- __real_cblas_dsyr2k(Order,Uplo,Trans,N,K,alpha,A,lda,B,ldb,beta,C,ldc);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dsyr2k", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!dsyr2k_offload_dsp(Order,N,K)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dsyr2k");
+ __real_cblas_dsyr2k(Order,Uplo,Trans,N,K,alpha,A,lda,B,ldb,beta,C,ldc);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dsyr2k", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dsyr2k");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dsyr2k");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -182,8 +180,8 @@ void cblas_dsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, cons
void *msmc_ptr;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(13, buf_MSMC);
#else
@@ -213,7 +211,9 @@ void cblas_dsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, cons
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_mem_free(msmc_ptr);
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dsyr2k");
index d3b7f1a78cabcd93cd2e33efcba8d017eea2ae9f..747d1420689d5e19130a29cdbb3c466c6d6d0ee4 100644 (file)
@@ -44,20 +44,19 @@ void cblas_dsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dsyrk");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!dsyrk_offload_dsp(Order,N,K)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dsyrk");
- __real_cblas_dsyrk(Order,Uplo,Trans,N,K,alpha,A,lda,beta,C,ldc);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dsyrk", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!dsyrk_offload_dsp(Order,N,K)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dsyrk");
+ __real_cblas_dsyrk(Order,Uplo,Trans,N,K,alpha,A,lda,beta,C,ldc);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dsyrk", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dsyrk");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dsyrk");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -161,8 +159,8 @@ void cblas_dsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const
void *msmc_ptr;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(11, buf_MSMC);
#else
@@ -192,7 +190,9 @@ void cblas_dsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_mem_free(msmc_ptr);
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dsyrk");
index 2b1570ee54d66f081d349fb242c5685e1973796b..98ca5d37cf2ce1f240804192f8e8dd05c093847f 100644 (file)
@@ -44,20 +44,19 @@ void cblas_dtbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dtbmv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dtbmv");
- __real_cblas_dtbmv(order,Uplo,TransA,Diag,N,K,A,lda,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dtbmv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dtbmv");
+ __real_cblas_dtbmv(order,Uplo,TransA,Diag,N,K,A,lda,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dtbmv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dtbmv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dtbmv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -165,7 +163,9 @@ void cblas_dtbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dtbmv");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_dtbmv", (float) clock_diff);
return ;
index a024be828c48f3d9da279759c599c73991afde7e..9e48c9d36b52ad5f95121d052cd81690f3dd33b9 100644 (file)
@@ -44,20 +44,19 @@ void cblas_dtbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dtbsv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dtbsv");
- __real_cblas_dtbsv(order,Uplo,TransA,Diag,N,K,A,lda,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dtbsv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dtbsv");
+ __real_cblas_dtbsv(order,Uplo,TransA,Diag,N,K,A,lda,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dtbsv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dtbsv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dtbsv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -165,6 +163,7 @@ void cblas_dtbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dtbsv");
index 323098c6ee29900f9792f436b4968fa979d29df8..3e715b9ed822dbe8a6214e165c36a5a846fd9d29 100644 (file)
@@ -44,20 +44,19 @@ void cblas_dtpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dtpmv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dtpmv");
- __real_cblas_dtpmv(order,Uplo,TransA,Diag,N,Ap,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dtpmv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dtpmv");
+ __real_cblas_dtpmv(order,Uplo,TransA,Diag,N,Ap,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dtpmv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dtpmv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dtpmv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -153,6 +151,7 @@ void cblas_dtpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dtpmv");
index 53a0e2275f4e3e90fa62c48eecebbc33b286c6c4..6084309a6ecdbfbc09e918a516b731c5c980d170 100644 (file)
@@ -44,20 +44,19 @@ void cblas_dtpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dtpsv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dtpsv");
- __real_cblas_dtpsv(order,Uplo,TransA,Diag,N,Ap,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dtpsv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dtpsv");
+ __real_cblas_dtpsv(order,Uplo,TransA,Diag,N,Ap,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dtpsv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dtpsv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dtpsv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -153,6 +151,7 @@ void cblas_dtpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dtpsv");
index 2a32919aac6a49c5a8e71a3d1a77293f1dd8e2f9..13c0f6e1016094dbd6ecd2aa1c3d1403c8a4422b 100644 (file)
@@ -44,20 +44,19 @@ void cblas_dtrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dtrmm");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!dtrmm_offload_dsp(Order,Side,M,N)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dtrmm");
- __real_cblas_dtrmm(Order,Side,Uplo,TransA,Diag,M,N,alpha,A,lda,B,ldb);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dtrmm", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!dtrmm_offload_dsp(Order,Side,M,N)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dtrmm");
+ __real_cblas_dtrmm(Order,Side,Uplo,TransA,Diag,M,N,alpha,A,lda,B,ldb);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dtrmm", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dtrmm");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dtrmm");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -163,11 +161,12 @@ void cblas_dtrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
#else
err |= clSetKernelArg(__K, 11, sizeof(ldb), &ldb);
#endif
+
void *msmc_ptr;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(12, buf_MSMC);
#else
@@ -199,6 +198,7 @@ void cblas_dtrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
#endif
ti_cblas_mem_free(msmc_ptr);
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dtrmm");
index a2f840ca21a6d9588253c17248063d122dad56d2..339ed16d54acbe80a77934841d073e4494a57ee9 100644 (file)
@@ -44,20 +44,19 @@ void cblas_dtrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dtrmv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dtrmv");
- __real_cblas_dtrmv(order,Uplo,TransA,Diag,N,A,lda,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dtrmv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dtrmv");
+ __real_cblas_dtrmv(order,Uplo,TransA,Diag,N,A,lda,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dtrmv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dtrmv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dtrmv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -159,6 +157,7 @@ void cblas_dtrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dtrmv");
index 662960d8cbd4170d7d82bdc323ae1adf66e1989d..7724f014d016d4f1a29bb747bf342465106a91c7 100644 (file)
@@ -44,20 +44,19 @@ void cblas_dtrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dtrsm");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!dtrsm_offload_dsp(Order,Side,M,N)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dtrsm");
- __real_cblas_dtrsm(Order,Side,Uplo,TransA,Diag,M,N,alpha,A,lda,B,ldb);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dtrsm", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!dtrsm_offload_dsp(Order,Side,M,N)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dtrsm");
+ __real_cblas_dtrsm(Order,Side,Uplo,TransA,Diag,M,N,alpha,A,lda,B,ldb);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dtrsm", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dtrsm");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dtrsm");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -167,8 +165,8 @@ void cblas_dtrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
void *msmc_ptr;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(12, buf_MSMC);
#else
@@ -198,6 +196,7 @@ void cblas_dtrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_mem_free(msmc_ptr);
ti_cblas_delete_kernel(__K);
index 7c8d7d336f945385a2c07e02272e7a06fc89996b..eb5cfd4f65ea8da0b4b4952d63905682a3c1426a 100644 (file)
@@ -44,20 +44,19 @@ void cblas_dtrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dtrsv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dtrsv");
- __real_cblas_dtrsv(order,Uplo,TransA,Diag,N,A,lda,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dtrsv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dtrsv");
+ __real_cblas_dtrsv(order,Uplo,TransA,Diag,N,A,lda,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dtrsv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dtrsv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dtrsv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -159,7 +157,9 @@ void cblas_dtrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dtrsv");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_dtrsv", (float) clock_diff);
return ;
index eb15f740fc0daf15acd9a9718995eaf189b608d2..603aa7873cc0972f68527d0e7ef938abc3d8080b 100644 (file)
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dzasum");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dzasum");
- double rval = __real_cblas_dzasum(N,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dzasum", (float) clock_diff);
- return rval;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dzasum");
+ double rval = __real_cblas_dzasum(N,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dzasum", (float) clock_diff);
+ return rval;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dzasum");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dzasum");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dzasum");
index 860bc3202aa8097f8421a41a8d060cfd6acb43d7..0295da8e6c6f5907904f3e8d746c1b8417c4dcfb 100644 (file)
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dznrm2");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dznrm2");
- double rval = __real_cblas_dznrm2(N,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dznrm2", (float) clock_diff);
- return rval;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dznrm2");
+ double rval = __real_cblas_dznrm2(N,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dznrm2", (float) clock_diff);
+ return rval;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dznrm2");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dznrm2");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dznrm2");
index 13ec89596afaab9ac7888af0da35753f20b40373..0dc769fea4ee6e437ad9b168a0eeed65a4ce1128 100644 (file)
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_icamax");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_icamax");
- CBLAS_INDEX rval = __real_cblas_icamax(N,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_icamax", (float) clock_diff);
- return rval;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_icamax");
+ CBLAS_INDEX rval = __real_cblas_icamax(N,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_icamax", (float) clock_diff);
+ return rval;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_icamax");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_icamax");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_icamax");
index 939303de8de7306d2055656b7809845c647b1e15..12d6e0e4de50a066dcc17aae4b4e5c6ce91a38b6 100644 (file)
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_idamax");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_idamax");
- CBLAS_INDEX rval = __real_cblas_idamax(N,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_idamax", (float) clock_diff);
- return rval;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_idamax");
+ CBLAS_INDEX rval = __real_cblas_idamax(N,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_idamax", (float) clock_diff);
+ return rval;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_idamax");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_idamax");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_idamax");
index c97db11a5fc1bd37e7b5c1f151dc34ab7c0df996..f1cfed86d4d7ca06bb89d40a3aa4a7c10ad9e186 100644 (file)
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_isamax");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_isamax");
- CBLAS_INDEX rval = __real_cblas_isamax(N,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_isamax", (float) clock_diff);
- return rval;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_isamax");
+ CBLAS_INDEX rval = __real_cblas_isamax(N,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_isamax", (float) clock_diff);
+ return rval;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_isamax");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_isamax");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_isamax");
index f4eaa90a5a96a0543b4227c46f63eeb9e659d9b0..623059fd6f31dea8d1d27ae0ab33eb569c7c82e1 100644 (file)
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_izamax");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_izamax");
- CBLAS_INDEX rval = __real_cblas_izamax(N,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_izamax", (float) clock_diff);
- return rval;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_izamax");
+ CBLAS_INDEX rval = __real_cblas_izamax(N,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_izamax", (float) clock_diff);
+ return rval;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_izamax");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_izamax");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_izamax");
index af48cc2eca8d443a5d73bea92d12b63b114a0921..60ba25c2145844a389dec6b0436447e6d67296b5 100644 (file)
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_sasum");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_sasum");
- float rval = __real_cblas_sasum(N,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_sasum", (float) clock_diff);
- return rval;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_sasum");
+ float rval = __real_cblas_sasum(N,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_sasum", (float) clock_diff);
+ return rval;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_sasum");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_sasum");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_sasum");
index c9ec621e9da671ec8cd82b50261e785aef3bbcd8..d00a338291c2791de77d6a5d3bc7552cdff554f9 100644 (file)
@@ -44,20 +44,19 @@ void cblas_saxpy(const int N, const float alpha, const float *X, const int incX,
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_saxpy");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_saxpy");
- __real_cblas_saxpy(N,alpha,X,incX,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_saxpy", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_saxpy");
+ __real_cblas_saxpy(N,alpha,X,incX,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_saxpy", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_saxpy");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_saxpy");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
@@ -141,6 +139,7 @@ void cblas_saxpy(const int N, const float alpha, const float *X, const int incX,
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_saxpy");
index c712fb51673ad1e515dc74b35ebee65f2ef655f1..4c7cc216148c698f4b02030849efddad32d2ea1d 100644 (file)
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_scasum");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_scasum");
- float rval = __real_cblas_scasum(N,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_scasum", (float) clock_diff);
- return rval;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_scasum");
+ float rval = __real_cblas_scasum(N,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_scasum", (float) clock_diff);
+ return rval;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_scasum");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_scasum");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_scasum");
index 234112694851bcb34a2bceda7ac6ce7d3416f012..8131294913e7a8e8597b7efc8dedd4cba1075df5 100644 (file)
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_scnrm2");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_scnrm2");
- float rval = __real_cblas_scnrm2(N,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_scnrm2", (float) clock_diff);
- return rval;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_scnrm2");
+ float rval = __real_cblas_scnrm2(N,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_scnrm2", (float) clock_diff);
+ return rval;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_scnrm2");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_scnrm2");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_scnrm2");
index f1bdaedb41c464f58c3ed9fed988f6918d8c5c42..099f5ecf17f60fc1ac60cfff4a94a6d5f3881e36 100644 (file)
@@ -44,20 +44,19 @@ void cblas_scopy(const int N, const float *X, const int incX, float *Y, const in
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_scopy");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_scopy");
- __real_cblas_scopy(N,X,incX,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_scopy", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_scopy");
+ __real_cblas_scopy(N,X,incX,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_scopy", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_scopy");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_scopy");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
@@ -135,6 +133,7 @@ void cblas_scopy(const int N, const float *X, const int incX, float *Y, const in
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_scopy");
index d6b70e811d8b092f311a62ed4ac29819f6e824ec..86877e91a7e4097b669a871a9fc039cdc81003bf 100644 (file)
@@ -44,20 +44,19 @@ float cblas_sdot(const int N, const float *X, const int incX, const float *Y, co
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_sdot");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_sdot");
- float rval = __real_cblas_sdot(N,X,incX,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_sdot", (float) clock_diff);
- return rval;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_sdot");
+ float rval = __real_cblas_sdot(N,X,incX,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_sdot", (float) clock_diff);
+ return rval;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_sdot");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_sdot");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
@@ -148,6 +146,7 @@ float cblas_sdot(const int N, const float *X, const int incX, const float *Y, co
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_sdot");
index 829ba41bc234c938da58fc12bfdca5749a58dcab..09f6f84ee794e367e8936ae1fa73100aa0131e29 100644 (file)
@@ -44,20 +44,19 @@ float cblas_sdsdot(const int N, const float alpha, const float *X, const int inc
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_sdsdot");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_sdsdot");
- float rval = __real_cblas_sdsdot(N,alpha,X,incX,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_sdsdot", (float) clock_diff);
- return rval;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_sdsdot");
+ float rval = __real_cblas_sdsdot(N,alpha,X,incX,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_sdsdot", (float) clock_diff);
+ return rval;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_sdsdot");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_sdsdot");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
@@ -154,6 +152,7 @@ float cblas_sdsdot(const int N, const float alpha, const float *X, const int inc
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_sdsdot");
index c27de76076a42315f85a69eebb50732630b6a3c1..cd4951ed71ecdce889bbbc3f5de219f2bca0cf8f 100644 (file)
@@ -44,20 +44,19 @@ void cblas_sgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_sgbmv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_sgbmv");
- __real_cblas_sgbmv(order,TransA,M,N,KL,KU,alpha,A,lda,X,incX,beta,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_sgbmv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_sgbmv");
+ __real_cblas_sgbmv(order,TransA,M,N,KL,KU,alpha,A,lda,X,incX,beta,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_sgbmv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_sgbmv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_sgbmv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -144,7 +142,7 @@ void cblas_sgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA
size_bufX = MAX(size_bufX,1);
#ifdef __cplusplus
- Buffer buf_X(*ti_cblas_ocl_context, CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR, size_bufX , (void *)X);
+ Buffer buf_X(*ti_cblas_ocl_context, CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR, size_bufX, (void *)X);
__K->setArg(9, buf_X);
#else
cl_mem buf_X = clCreateBuffer(ti_cblas_ocl_context, CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR, size_bufX, (void *)X, &err);
@@ -198,6 +196,7 @@ void cblas_sgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_sgbmv");
index 89a010fdc2ee179ad254a0371d81283e3be7b688..d97a454f17dcfea3adf84df1681cc8cd8b7d246b 100644 (file)
@@ -44,25 +44,21 @@ void cblas_sgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_sgemm");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!sgemm_offload_dsp(Order,M,N,K)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_sgemm");
- __real_cblas_sgemm(Order,TransA,TransB,M,N,K,alpha,A,lda,B,ldb,beta,C,ldc);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_sgemm", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!sgemm_offload_dsp(Order,M,N,K)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_sgemm");
+ __real_cblas_sgemm(Order,TransA,TransB,M,N,K,alpha,A,lda,B,ldb,beta,C,ldc);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_sgemm", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_sgemm");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_sgemm");
/* Lookup kernel pointer from global table */
-
- void *msmc_ptr;
-
#ifdef __cplusplus
Event e;
Kernel* __K;
cl_kernel __K;
#endif
__K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SGEMM_IDX, "ocl_cblas_sgemm");
-
#ifdef __cplusplus
try
#else
@@ -187,13 +182,16 @@ void cblas_sgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
#else
err |= clSetKernelArg(__K, 13, sizeof(ldc), &ldc);
#endif
+
+ void *msmc_ptr;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
- //Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(14, buf_MSMC);
#else
+ //cl_mem buf_MSMC = clCreateBuffer(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE, NULL, &err);
cl_mem buf_MSMC = clCreateBuffer(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr, &err);
TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
err |= clSetKernelArg(__K, 14, sizeof(buf_MSMC), &buf_MSMC);
@@ -219,6 +217,7 @@ void cblas_sgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_mem_free(msmc_ptr);
ti_cblas_delete_kernel(__K);
index 699187898cda0ac635ac914fb3193a6ae1c519de..3b7b9eafe90f23647b6bdba5015ac57e51c95592 100644 (file)
@@ -44,20 +44,19 @@ void cblas_sgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_sgemv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_sgemv");
- __real_cblas_sgemv(order,TransA,M,N,alpha,A,lda,X,incX,beta,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_sgemv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_sgemv");
+ __real_cblas_sgemv(order,TransA,M,N,alpha,A,lda,X,incX,beta,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_sgemv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_sgemv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_sgemv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -186,11 +184,11 @@ void cblas_sgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_sgemv");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_sgemv", (float) clock_diff);
-
return ;
}
index 223e4302994449aa6ba50588e8548c774256c304..dfe9aceac19a5a9ab5ed885e81dabe6d8714e6ed 100644 (file)
@@ -44,20 +44,19 @@ void cblas_sger(const enum CBLAS_ORDER order, const int M, const int N, const fl
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_sger");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_sger");
- __real_cblas_sger(order,M,N,alpha,X,incX,Y,incY,A,lda);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_sger", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_sger");
+ __real_cblas_sger(order,M,N,alpha,X,incX,Y,incY,A,lda);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_sger", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_sger");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_sger");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -174,6 +172,7 @@ void cblas_sger(const enum CBLAS_ORDER order, const int M, const int N, const fl
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_sger");
index f5b75652bfbc1aab58ba5b6c06b2a8c40c08b866..798b7b0a9e87594288007fcdc487ab25199b352d 100644 (file)
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_snrm2");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_snrm2");
- float rval = __real_cblas_snrm2(N,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_snrm2", (float) clock_diff);
- return rval;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_snrm2");
+ float rval = __real_cblas_snrm2(N,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_snrm2", (float) clock_diff);
+ return rval;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_snrm2");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_snrm2");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_snrm2");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_snrm2", (float) clock_diff);
return retval;
index 160d339654a3410f68da607b429c7abaf498597e..d185fed4dac440ae8a567af0540763b12650eab9 100644 (file)
@@ -44,20 +44,19 @@ void cblas_srot(const int N, float *X, const int incX, float *Y, const int incY,
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_srot");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_srot");
- __real_cblas_srot(N,X,incX,Y,incY,c,s);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_srot", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_srot");
+ __real_cblas_srot(N,X,incX,Y,incY,c,s);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_srot", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_srot");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_srot");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
@@ -146,6 +144,7 @@ void cblas_srot(const int N, float *X, const int incX, float *Y, const int incY,
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_srot");
index 0ab8423f641a959e19a3706cc9826aed157b6384..5a0594e81cef542bc94b442766526cfb2bf6520f 100644 (file)
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_srotg");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_srotg");
- __real_cblas_srotg(a,b,c,s);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_srotg", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_srotg");
+ __real_cblas_srotg(a,b,c,s);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_srotg", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_srotg");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_srotg");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
Buffer buf_a(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(float), (void *)a);
__K->setArg(0, buf_a);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_srotg");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_srotg", (float) clock_diff);
return ;
index ec94428d1512907e242e297fddd6c76c8786ae06..410b8fd61579c66c562adf11c40cdd15088bb3ae 100644 (file)
@@ -44,20 +44,19 @@ void cblas_srotm(const int N, float *X, const int incX, float *Y, const int incY
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_srotm");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_srotm");
- __real_cblas_srotm(N,X,incX,Y,incY,P);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_srotm", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_srotm");
+ __real_cblas_srotm(N,X,incX,Y,incY,P);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_srotm", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_srotm");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_srotm");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
@@ -145,6 +143,7 @@ void cblas_srotm(const int N, float *X, const int incX, float *Y, const int incY
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_srotm");
index 4cfa6b23da62ba85ff188ad696a8cdc035644ad9..6418b25d95a3f153c10cb544addfe4ec968104c8 100644 (file)
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_srotmg");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_srotmg");
- __real_cblas_srotmg(d1,d2,b1,b2,P);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_srotmg", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_srotmg");
+ __real_cblas_srotmg(d1,d2,b1,b2,P);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_srotmg", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_srotmg");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_srotmg");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
Buffer buf_d1(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(float), (void *)d1);
__K->setArg(0, buf_d1);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_srotmg");
index 2791b007e99c2962c06932fd64e67808c8d1d562..47c73765342b837ada80629d6040b821e62db56b 100644 (file)
@@ -44,20 +44,19 @@ void cblas_ssbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_ssbmv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ssbmv");
- __real_cblas_ssbmv(order,Uplo,N,K,alpha,A,lda,X,incX,beta,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ssbmv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ssbmv");
+ __real_cblas_ssbmv(order,Uplo,N,K,alpha,A,lda,X,incX,beta,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ssbmv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ssbmv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ssbmv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -186,6 +184,7 @@ void cblas_ssbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_ssbmv");
index a38f6844c965ac85b59e88e0d24c7ba37d16a8de..743be109f2ea3132607669fd80e740c1ef5ca932 100644 (file)
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_sscal");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_sscal");
- __real_cblas_sscal(N,alpha,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_sscal", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_sscal");
+ __real_cblas_sscal(N,alpha,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_sscal", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_sscal");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_sscal");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_sscal");
index 97003a764e5890dbe13af65563ab03a595dfbbd5..14d13d50d12a682a60e60d8993521c80614e79dc 100644 (file)
@@ -44,20 +44,19 @@ void cblas_sspmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_sspmv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_sspmv");
- __real_cblas_sspmv(order,Uplo,N,alpha,Ap,X,incX,beta,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_sspmv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_sspmv");
+ __real_cblas_sspmv(order,Uplo,N,alpha,Ap,X,incX,beta,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_sspmv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_sspmv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_sspmv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -174,6 +172,7 @@ void cblas_sspmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_sspmv");
index 3b7e58b0c9cc5cd152ad4af8f0aefc571374662d..31e59b82aef7d121a75c2cc1a9b7f83e8b020d17 100644 (file)
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_sspr");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_sspr");
- __real_cblas_sspr(order,Uplo,N,alpha,X,incX,Ap);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_sspr", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_sspr");
+ __real_cblas_sspr(order,Uplo,N,alpha,X,incX,Ap);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_sspr", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_sspr");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_sspr");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_sspr");
index 6bc1302ce09d87668dc6b2a5a2d57609763df902..4c78328ab8fc7a59971ec7d37cf6c75bcd61cb97 100644 (file)
@@ -44,20 +44,19 @@ void cblas_sspr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_sspr2");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_sspr2");
- __real_cblas_sspr2(order,Uplo,N,alpha,X,incX,Y,incY,A);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_sspr2", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_sspr2");
+ __real_cblas_sspr2(order,Uplo,N,alpha,X,incX,Y,incY,A);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_sspr2", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_sspr2");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_sspr2");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -168,6 +166,7 @@ void cblas_sspr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_sspr2");
index f67e00a27037f6359161277bb5492e2b00a122b3..d62a9e4d4f628c7f0464e0edc7bce0086d5ed44c 100644 (file)
@@ -49,27 +49,26 @@ void cblas_sswap(const int N, float *X, const int incX, float *Y, const int incY
* the offload of this routine to the DSP is disabled.
*/
#ifndef TI_CBLAS_SWAP_ENABLE_OFFLOAD
- TI_CBLAS_PROFILE_START();
+ TI_CBLAS_PROFILE_START();
TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_sswap");
- __real_cblas_sswap(N,X,incX,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_sswap", (float) clock_diff);
+ __real_cblas_sswap(N,X,incX,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_sswap", (float) clock_diff);
return ;
#else
- TI_CBLAS_PROFILE_START();
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_sswap");
- __real_cblas_sswap(N,X,incX,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_sswap", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
-
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_sswap");
+ __real_cblas_sswap(N,X,incX,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_sswap", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_sswap");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_sswap");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
@@ -146,7 +144,9 @@ void cblas_sswap(const int N, float *X, const int incX, float *Y, const int incY
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_sswap");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_sswap", (float) clock_diff);
return ;
index 426489589cf6940136ae8363c341f9dbe59f020a..43a506564381154b6645b16e506814ff6f1df747 100644 (file)
@@ -44,20 +44,19 @@ void cblas_ssymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_ssymm");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!ssymm_offload_dsp(Order,Side,M,N)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ssymm");
- __real_cblas_ssymm(Order,Side,Uplo,M,N,alpha,A,lda,B,ldb,beta,C,ldc);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ssymm", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!ssymm_offload_dsp(Order,Side,M,N)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ssymm");
+ __real_cblas_ssymm(Order,Side,Uplo,M,N,alpha,A,lda,B,ldb,beta,C,ldc);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ssymm", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ssymm");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ssymm");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -182,8 +180,8 @@ void cblas_ssymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
void *msmc_ptr;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(13, buf_MSMC);
#else
@@ -213,6 +211,7 @@ void cblas_ssymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_mem_free(msmc_ptr);
ti_cblas_delete_kernel(__K);
index d2c495d00849f450cd1fa3cba681fcc2c771fecd..8a84e6363287e69ede19a520c0340f5119a707cf 100644 (file)
@@ -44,20 +44,19 @@ void cblas_ssymv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_ssymv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ssymv");
- __real_cblas_ssymv(order,Uplo,N,alpha,A,lda,X,incX,beta,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ssymv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ssymv");
+ __real_cblas_ssymv(order,Uplo,N,alpha,A,lda,X,incX,beta,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ssymv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ssymv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ssymv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -180,6 +178,7 @@ void cblas_ssymv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_ssymv");
index 7bf8d2d8f59243e7c5bdaacccbc47d85f324540f..bf5259ad7490864ae75b09e785e20bc145d56330 100644 (file)
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_ssyr");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ssyr");
- __real_cblas_ssyr(order,Uplo,N,alpha,X,incX,A,lda);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ssyr", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ssyr");
+ __real_cblas_ssyr(order,Uplo,N,alpha,X,incX,A,lda);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ssyr", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ssyr");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ssyr");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_ssyr");
index 47fde30cda5047666d3c3a49c34bd9ba1d5fa3d2..7aa68442afbc9a852eaa9e57577e37dabdfc13db 100644 (file)
@@ -44,20 +44,19 @@ void cblas_ssyr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_ssyr2");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ssyr2");
- __real_cblas_ssyr2(order,Uplo,N,alpha,X,incX,Y,incY,A,lda);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ssyr2", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ssyr2");
+ __real_cblas_ssyr2(order,Uplo,N,alpha,X,incX,Y,incY,A,lda);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ssyr2", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ssyr2");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ssyr2");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -174,6 +172,7 @@ void cblas_ssyr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_ssyr2");
index cf39e18a627eff51253a3a4662ab99ea1d02934d..c7c30f005af24db20c7a93a18b4fd19258bfd093 100644 (file)
@@ -44,20 +44,19 @@ void cblas_ssyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, cons
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_ssyr2k");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!ssyr2k_offload_dsp(Order,N,K)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ssyr2k");
- __real_cblas_ssyr2k(Order,Uplo,Trans,N,K,alpha,A,lda,B,ldb,beta,C,ldc);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ssyr2k", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!ssyr2k_offload_dsp(Order,N,K)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ssyr2k");
+ __real_cblas_ssyr2k(Order,Uplo,Trans,N,K,alpha,A,lda,B,ldb,beta,C,ldc);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ssyr2k", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ssyr2k");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ssyr2k");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -182,8 +180,8 @@ void cblas_ssyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, cons
void *msmc_ptr;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(13, buf_MSMC);
#else
@@ -213,7 +211,9 @@ void cblas_ssyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, cons
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_mem_free(msmc_ptr);
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_ssyr2k");
index 12c59ed731d692ccd50488a384189518b86b68d3..efcb97bbfcf585ca8d8beba3c8bd41db66c49924 100644 (file)
@@ -44,20 +44,19 @@ void cblas_ssyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_ssyrk");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!ssyrk_offload_dsp(Order,N,K)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ssyrk");
- __real_cblas_ssyrk(Order,Uplo,Trans,N,K,alpha,A,lda,beta,C,ldc);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ssyrk", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!ssyrk_offload_dsp(Order,N,K)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ssyrk");
+ __real_cblas_ssyrk(Order,Uplo,Trans,N,K,alpha,A,lda,beta,C,ldc);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ssyrk", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ssyrk");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ssyrk");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -161,11 +159,12 @@ void cblas_ssyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const
void *msmc_ptr;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- //Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(11, buf_MSMC);
#else
+ //cl_mem buf_MSMC = clCreateBuffer(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE, NULL, &err);
cl_mem buf_MSMC = clCreateBuffer(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr, &err);
TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
err |= clSetKernelArg(__K, 11, sizeof(buf_MSMC), &buf_MSMC);
@@ -191,7 +190,9 @@ void cblas_ssyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_mem_free(msmc_ptr);
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_ssyrk");
index e1732cc1fbab304f00c8cf1b6f0270fad5b14188..662600a1c1d90a7e33387a6b055afe2859fd38f7 100644 (file)
@@ -44,20 +44,19 @@ void cblas_stbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_stbmv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_stbmv");
- __real_cblas_stbmv(order,Uplo,TransA,Diag,N,K,A,lda,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_stbmv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_stbmv");
+ __real_cblas_stbmv(order,Uplo,TransA,Diag,N,K,A,lda,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_stbmv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_stbmv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_stbmv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -165,6 +163,7 @@ void cblas_stbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_stbmv");
index 052657cd1f12229ccbeb4214a55ce2b30ecbbf02..48b640b89c1ff7342b8551a4c94c7a3e0829fa38 100644 (file)
@@ -44,20 +44,19 @@ void cblas_stbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_stbsv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_stbsv");
- __real_cblas_stbsv(order,Uplo,TransA,Diag,N,K,A,lda,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_stbsv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_stbsv");
+ __real_cblas_stbsv(order,Uplo,TransA,Diag,N,K,A,lda,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_stbsv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_stbsv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_stbsv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -165,6 +163,7 @@ void cblas_stbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_stbsv");
index 3d2844cadf7da87394e771871f424f8b4e7d57fc..a1d91f8af8cc658575d7a30e843136bb9bf49912 100644 (file)
@@ -44,20 +44,19 @@ void cblas_stpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_stpmv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_stpmv");
- __real_cblas_stpmv(order,Uplo,TransA,Diag,N,Ap,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_stpmv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_stpmv");
+ __real_cblas_stpmv(order,Uplo,TransA,Diag,N,Ap,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_stpmv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_stpmv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_stpmv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -153,6 +151,7 @@ void cblas_stpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_stpmv");
index 236273c04c295ceb2d702bf14b030ba0db64d024..e487ad6fe5722cdfd51397f9cfe1243b5fa9d10e 100644 (file)
@@ -44,20 +44,19 @@ void cblas_stpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_stpsv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_stpsv");
- __real_cblas_stpsv(order,Uplo,TransA,Diag,N,Ap,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_stpsv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_stpsv");
+ __real_cblas_stpsv(order,Uplo,TransA,Diag,N,Ap,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_stpsv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_stpsv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_stpsv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -153,6 +151,7 @@ void cblas_stpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_stpsv");
index fb28e11717b2c9c1e9d6d5827434e9bd849b73b9..3f1244127fa2497ca782188213ad147d022dbaca 100644 (file)
@@ -44,20 +44,19 @@ void cblas_strmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_strmm");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!strmm_offload_dsp(Order,Side,M,N)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_strmm");
- __real_cblas_strmm(Order,Side,Uplo,TransA,Diag,M,N,alpha,A,lda,B,ldb);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_strmm", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!strmm_offload_dsp(Order,Side,M,N)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_strmm");
+ __real_cblas_strmm(Order,Side,Uplo,TransA,Diag,M,N,alpha,A,lda,B,ldb);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_strmm", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_strmm");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_strmm");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -167,8 +165,8 @@ void cblas_strmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
void *msmc_ptr;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(12, buf_MSMC);
#else
@@ -198,9 +196,10 @@ void cblas_strmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_mem_free(msmc_ptr);
- ti_cblas_delete_kernel(__K);
+ ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_strmm");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_strmm", (float) clock_diff);
index 63e3fa1d43439db7b10b1f52a815d9df0f6685f5..a103a7f23712cd63390893a23a10c0f3e5e90e3f 100644 (file)
@@ -44,20 +44,19 @@ void cblas_strmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_strmv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_strmv");
- __real_cblas_strmv(order,Uplo,TransA,Diag,N,A,lda,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_strmv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_strmv");
+ __real_cblas_strmv(order,Uplo,TransA,Diag,N,A,lda,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_strmv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_strmv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_strmv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -159,6 +157,7 @@ void cblas_strmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_strmv");
index 831df896ee87758e888ea773ae81057d1fa1f326..e1d8513d608f018906bab5965684eb2ec4421e2c 100644 (file)
@@ -44,20 +44,19 @@ void cblas_strsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_strsm");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!strsm_offload_dsp(Order,Side,M,N)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_strsm");
- __real_cblas_strsm(Order,Side,Uplo,TransA,Diag,M,N,alpha,A,lda,B,ldb);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_strsm", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!strsm_offload_dsp(Order,Side,M,N)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_strsm");
+ __real_cblas_strsm(Order,Side,Uplo,TransA,Diag,M,N,alpha,A,lda,B,ldb);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_strsm", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_strsm");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_strsm");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -167,8 +165,8 @@ void cblas_strsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
void *msmc_ptr;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(12, buf_MSMC);
#else
@@ -198,7 +196,9 @@ void cblas_strsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_mem_free(msmc_ptr);
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_strsm");
index 19ef805413c1609cdaaa73f4d1bc58df8aab408e..33505905123641c3881bed7a745e350a94b575fd 100644 (file)
@@ -44,20 +44,19 @@ void cblas_strsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_strsv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_strsv");
- __real_cblas_strsv(order,Uplo,TransA,Diag,N,A,lda,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_strsv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_strsv");
+ __real_cblas_strsv(order,Uplo,TransA,Diag,N,A,lda,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_strsv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_strsv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_strsv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -159,6 +157,7 @@ void cblas_strsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_strsv");
index 6d9a5a211718a030fce311f10a9cc89939236e16..5643cedbfb74b3d0598590fbe96423317e319939 100644 (file)
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_xerbla");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_xerbla");
- __real_cblas_xerbla(p,rout,form);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_xerbla", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_xerbla");
+ __real_cblas_xerbla(p,rout,form);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_xerbla", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_xerbla");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_xerbla");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, p);
#else
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
- TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_errprn");
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_errprn", (float) clock_diff);
+ TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_xerbla");
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_xerbla", (float) clock_diff);
return ;
}
index 623a96fee04f2195283bc54d6b06389c247a2783..db210af27db374e31f4d73378adafc0a8f1f56ff 100644 (file)
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_zaxpy");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zaxpy");
- __real_cblas_zaxpy(N,alpha,X,incX,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zaxpy", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zaxpy");
+ __real_cblas_zaxpy(N,alpha,X,incX,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zaxpy", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zaxpy");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zaxpy");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_zaxpy");
index b4c9f9252381f69d78ecdbe093d9b1a807b4768d..7236ae986e18efc9584f7bf8405f6fc5daf33b22 100644 (file)
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_zcopy");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zcopy");
- __real_cblas_zcopy(N,X,incX,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zcopy", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zcopy");
+ __real_cblas_zcopy(N,X,incX,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zcopy", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zcopy");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zcopy");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_zcopy");
diff --git a/blasblisacc/src/ti_cblas_cblas_zdotc_sub.c b/blasblisacc/src/ti_cblas_cblas_zdotc_sub.c
index 3b2937dcb7c2ebf599bb9d9ed294563388ab76e0..5f241935e11b8b528b8c78cb6132fca0c63003e2 100644 (file)
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_zdotc_sub");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zdotc_sub");
- __real_cblas_zdotc_sub(N,X,incX,Y,incY,dotc);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zdotc_sub", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zdotc_sub");
+ __real_cblas_zdotc_sub(N,X,incX,Y,incY,dotc);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zdotc_sub", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zdotc_sub");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zdotc_sub");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_zdotc_sub");
diff --git a/blasblisacc/src/ti_cblas_cblas_zdotu_sub.c b/blasblisacc/src/ti_cblas_cblas_zdotu_sub.c
index 63bd00b5714db92ef0450ec7b9e141a1c23053e2..26d70cea8b94699302eaba17e11b63617e15d1c9 100644 (file)
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_zdotu_sub");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zdotu_sub");
- __real_cblas_zdotu_sub(N,X,incX,Y,incY,dotu);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zdotu_sub", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zdotu_sub");
+ __real_cblas_zdotu_sub(N,X,incX,Y,incY,dotu);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zdotu_sub", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zdotu_sub");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zdotu_sub");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_zdotu_sub");
index a81e4b0c730584436ec2c4d056abae2a54ac2856..c8a2600ac0fec3bccd3ee4408fe7f5933dbc4936 100644 (file)
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_zdscal");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zdscal");
- __real_cblas_zdscal(N,alpha,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zdscal", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zdscal");
+ __real_cblas_zdscal(N,alpha,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zdscal", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zdscal");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zdscal");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_zdscal");
index b35307bacc1fd4a0c28f8ff5213f6bb370886b9e..d333b18c28ca3232f68603c307dbcbff46b14764 100644 (file)
@@ -44,20 +44,19 @@ void cblas_zgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_zgbmv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zgbmv");
- __real_cblas_zgbmv(order,TransA,M,N,KL,KU,alpha,A,lda,X,incX,beta,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zgbmv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zgbmv");
+ __real_cblas_zgbmv(order,TransA,M,N,KL,KU,alpha,A,lda,X,incX,beta,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zgbmv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zgbmv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zgbmv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -208,6 +206,7 @@ void cblas_zgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_zgbmv");
index 6c4df83b91f76d60fd0fe26431678f5f1b7b6e57..86cdecc527206e7a4cfacf4563256cf766ef78af 100644 (file)
@@ -44,21 +44,19 @@ void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_zgemm");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
-
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!zgemm_offload_dsp(Order,M,N,K)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zgemm");
- __real_cblas_zgemm(Order,TransA,TransB,M,N,K,alpha,A,lda,B,ldb,beta,C,ldc);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zgemm", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!zgemm_offload_dsp(Order,M,N,K)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zgemm");
+ __real_cblas_zgemm(Order,TransA,TransB,M,N,K,alpha,A,lda,B,ldb,beta,C,ldc);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zgemm", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zgemm");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zgemm");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -199,8 +196,8 @@ void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
void *msmc_ptr;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(14, buf_MSMC);
#else
@@ -230,7 +227,9 @@ void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_mem_free(msmc_ptr);
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_zgemm");
index 529d7fafe29e8fc332fc07526fe25375fccf35f3..f3739e863df495c13536045ec142d3934baebb43 100644 (file)
@@ -44,20 +44,19 @@ void cblas_zgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_zgemv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zgemv");
- __real_cblas_zgemv(order,TransA,M,N,alpha,A,lda,X,incX,beta,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zgemv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zgemv");
+ __real_cblas_zgemv(order,TransA,M,N,alpha,A,lda,X,incX,beta,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zgemv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zgemv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zgemv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -196,6 +194,7 @@ void cblas_zgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_zgemv");
index c4f9f419bf61727e969074f58df33821bd33e4a4..867e0a566c8a1f43ae6fd439a2c1796abf44b511 100644 (file)
@@ -44,20 +44,19 @@ void cblas_zgerc(const enum CBLAS_ORDER order, const int M, const int N, const v
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_zgerc");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zgerc");
- __real_cblas_zgerc(order,M,N,alpha,X,incX,Y,incY,A,lda);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zgerc", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zgerc");
+ __real_cblas_zgerc(order,M,N,alpha,X,incX,Y,incY,A,lda);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zgerc", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zgerc");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zgerc");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -179,6 +177,7 @@ void cblas_zgerc(const enum CBLAS_ORDER order, const int M, const int N, const v
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_zgerc");
index 1ab3344394d4e2a83fe56af07239c3081cdbaac2..22b64e56c1f682e50032ffe02a699ecc4a2dbb9f 100644 (file)
@@ -44,20 +44,19 @@ void cblas_zgeru(const enum CBLAS_ORDER order, const int M, const int N, const v
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_zgeru");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zgeru");
- __real_cblas_zgeru(order,M,N,alpha,X,incX,Y,incY,A,lda);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zgeru", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zgeru");
+ __real_cblas_zgeru(order,M,N,alpha,X,incX,Y,incY,A,lda);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zgeru", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zgeru");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zgeru");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -179,6 +177,7 @@ void cblas_zgeru(const enum CBLAS_ORDER order, const int M, const int N, const v
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_zgeru");
index f572f9a34b55291276e5eb37fa061cf2c9efb0f7..8b0d9b9e8c7b44b466152d91aea9f0a1e0cefa9f 100644 (file)
@@ -44,20 +44,19 @@ void cblas_zhbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_zhbmv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zhbmv");
- __real_cblas_zhbmv(order,Uplo,N,K,alpha,A,lda,X,incX,beta,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zhbmv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zhbmv");
+ __real_cblas_zhbmv(order,Uplo,N,K,alpha,A,lda,X,incX,beta,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zhbmv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zhbmv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zhbmv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -196,6 +194,7 @@ void cblas_zhbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_zhbmv");
index 2ca9ef5e6aebfeee03c8caa2be09238be90084b4..fa5d9cb9b9b4b420e871aafeb7f664a82887accf 100644 (file)
@@ -44,22 +44,19 @@ void cblas_zhemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_zhemm");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!zhemm_offload_dsp(Order,Side,M,N)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zhemm");
-
- __real_cblas_zhemm(Order,Side,Uplo,M,N,alpha,A,lda,B,ldb,beta,C,ldc);
-
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zhemm", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!zhemm_offload_dsp(Order,Side,M,N)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zhemm");
+ __real_cblas_zhemm(Order,Side,Uplo,M,N,alpha,A,lda,B,ldb,beta,C,ldc);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zhemm", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zhemm");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zhemm");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -194,8 +190,8 @@ void cblas_zhemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
void *msmc_ptr;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(13, buf_MSMC);
#else
@@ -225,6 +221,7 @@ void cblas_zhemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_mem_free(msmc_ptr);
ti_cblas_delete_kernel(__K);
index 1327c6357bd7bdc4e81c108cf7f9f94de10a9b39..511eb725e375e26c6a887ba32e86b691b2275b93 100644 (file)
@@ -44,20 +44,19 @@ void cblas_zhemv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_zhemv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zhemv");
- __real_cblas_zhemv(order,Uplo,N,alpha,A,lda,X,incX,beta,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zhemv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zhemv");
+ __real_cblas_zhemv(order,Uplo,N,alpha,A,lda,X,incX,beta,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zhemv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zhemv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zhemv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -190,6 +188,7 @@ void cblas_zhemv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_zhemv");
index 5097968b4e32cf562db195b0ae7be67f9e35d901..55c99e4840871d9ec5a77dbb687775342872f08f 100644 (file)
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_zher");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zher");
- __real_cblas_zher(order,Uplo,N,alpha,X,incX,A,lda);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zher", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zher");
+ __real_cblas_zher(order,Uplo,N,alpha,X,incX,A,lda);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zher", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zher");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zher");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_zher");
index c4e640050154811b2a65e60cfcd49d3abd421a53..905730515f33b539553b38e419852662d74056b6 100644 (file)
@@ -44,20 +44,19 @@ void cblas_zher2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_zher2");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zher2");
- __real_cblas_zher2(order,Uplo,N,alpha,X,incX,Y,incY,A,lda);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zher2", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zher2");
+ __real_cblas_zher2(order,Uplo,N,alpha,X,incX,Y,incY,A,lda);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zher2", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zher2");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zher2");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -179,6 +177,7 @@ void cblas_zher2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_zher2");
index 7884ddfe0b6504013137cfaa1e866b9001e6025a..3fe3bdb781e34a18460d0ce964628688f512cee9 100644 (file)
@@ -44,20 +44,19 @@ void cblas_zher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, cons
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_zher2k");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!zher2k_offload_dsp(Order,N,K)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zher2k");
- __real_cblas_zher2k(Order,Uplo,Trans,N,K,alpha,A,lda,B,ldb,beta,C,ldc);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zher2k", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!zher2k_offload_dsp(Order,N,K)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zher2k");
+ __real_cblas_zher2k(Order,Uplo,Trans,N,K,alpha,A,lda,B,ldb,beta,C,ldc);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zher2k", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zher2k");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zher2k");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -187,13 +185,13 @@ void cblas_zher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, cons
void *msmc_ptr;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(13, buf_MSMC);
#else
//cl_mem buf_MSMC = clCreateBuffer(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE, NULL, &err);
- cl_mem buf_MSMC = clCreateBuffer(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr, &err);
+ cl_mem buf_MSMC = clCreateBuffer(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr, &err);
TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
err |= clSetKernelArg(__K, 13, sizeof(buf_MSMC), &buf_MSMC);
TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
@@ -218,6 +216,7 @@ void cblas_zher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, cons
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_mem_free(msmc_ptr);
ti_cblas_delete_kernel(__K);
index 2285f9b02c4bdb19edc81aa1abc498c07ec0d2c8..232d839a24289af5ab982c996e306d0da317f0db 100644 (file)
@@ -44,20 +44,19 @@ void cblas_zherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_zherk");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!zherk_offload_dsp(Order,N,K)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zherk");
- __real_cblas_zherk(Order,Uplo,Trans,N,K,alpha,A,lda,beta,C,ldc);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zherk", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!zherk_offload_dsp(Order,N,K)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zherk");
+ __real_cblas_zherk(Order,Uplo,Trans,N,K,alpha,A,lda,beta,C,ldc);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zherk", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zherk");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zherk");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -161,8 +159,8 @@ void cblas_zherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const
void *msmc_ptr;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(11, buf_MSMC);
#else
@@ -192,7 +190,9 @@ void cblas_zherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_mem_free(msmc_ptr);
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_zherk");
index 3c6b71974bb5d0220f36621f332a3d52889765f3..5cd8688d82fa798ce707579844e3d881e3cedc42 100644 (file)
@@ -44,20 +44,19 @@ void cblas_zhpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_zhpmv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zhpmv");
- __real_cblas_zhpmv(order,Uplo,N,alpha,Ap,X,incX,beta,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zhpmv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zhpmv");
+ __real_cblas_zhpmv(order,Uplo,N,alpha,Ap,X,incX,beta,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zhpmv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zhpmv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zhpmv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -184,6 +182,7 @@ void cblas_zhpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_zhpmv");
index 0288c7b0abc0520ffc6d2f36147afd9eda8eed45..80a6348c55e70378a6362016ffabd216d55b061c 100644 (file)
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_zhpr");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zhpr");
- __real_cblas_zhpr(order,Uplo,N,alpha,X,incX,A);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zhpr", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zhpr");
+ __real_cblas_zhpr(order,Uplo,N,alpha,X,incX,A);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zhpr", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zhpr");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zhpr");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_zhpr");
index d8f30b7382a4e5a9a060a87a34c59d40cbe26578..854cd61c74cba1f9850526beee7d697008c324bb 100644 (file)
@@ -44,20 +44,19 @@ void cblas_zhpr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_zhpr2");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zhpr2");
- __real_cblas_zhpr2(order,Uplo,N,alpha,X,incX,Y,incY,Ap);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zhpr2", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zhpr2");
+ __real_cblas_zhpr2(order,Uplo,N,alpha,X,incX,Y,incY,Ap);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zhpr2", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zhpr2");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zhpr2");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -173,6 +171,7 @@ void cblas_zhpr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_zhpr2");
index dd4a8d62fce1bcabf4b9e25ebf03753ae5c612dd..b01f3ae49467081048c4c0948da435edd0faf9dc 100644 (file)
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_zrotg");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zrotg");
- __real_cblas_zrotg(a,b,c,s);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zrotg", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zrotg");
+ __real_cblas_zrotg(a,b,c,s);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zrotg", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zrotg");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zrotg");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
Buffer buf_a(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(float), (void *)a);
__K->setArg(0, buf_a);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_zrotg");
index 47ab78a9bdc32974a73529b1e97cf995591b284a..c731c2bf0bff8b9b87050f836f96d94042cff0ad 100644 (file)
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_zscal");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zscal");
- __real_cblas_zscal(N,alpha,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zscal", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zscal");
+ __real_cblas_zscal(N,alpha,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zscal", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zscal");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zscal");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_zscal");
index e8996851650d88b0476d58aa1478529cb24766bd..8deae8d58b182a248a82b19cf305a6e0e43de146 100644 (file)
* the offload of this routine to the DSP is disabled.
*/
#ifndef TI_CBLAS_SWAP_ENABLE_OFFLOAD
- TI_CBLAS_PROFILE_START();
+ TI_CBLAS_PROFILE_START();
TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zswap");
- __real_cblas_zswap(N,X,incX,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zswap", (float) clock_diff);
+ __real_cblas_zswap(N,X,incX,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zswap", (float) clock_diff);
return ;
#else
- TI_CBLAS_PROFILE_START();
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zswap");
- __real_cblas_zswap(N,X,incX,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zswap", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
-
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zswap");
+ __real_cblas_zswap(N,X,incX,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zswap", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zswap");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zswap");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
#endif
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_zswap");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_zswap", (float) clock_diff);
return ;
index 84b891f7c6bcec1cd6a3a1b342af7dfe2da69197..40fdba50fa7486ac351f3c0efb434c43601f5b83 100644 (file)
@@ -44,20 +44,19 @@ void cblas_zsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_zsymm");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!zsymm_offload_dsp(Order,Side,M,N)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zsymm");
- __real_cblas_zsymm(Order,Side,Uplo,M,N,alpha,A,lda,B,ldb,beta,C,ldc);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zsymm", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!zsymm_offload_dsp(Order,Side,M,N)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zsymm");
+ __real_cblas_zsymm(Order,Side,Uplo,M,N,alpha,A,lda,B,ldb,beta,C,ldc);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zsymm", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zsymm");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zsymm");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -188,11 +186,12 @@ void cblas_zsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
#else
err |= clSetKernelArg(__K, 12, sizeof(ldc), &ldc);
#endif
+
void *msmc_ptr;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(13, buf_MSMC);
#else
@@ -222,6 +221,7 @@ void cblas_zsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_mem_free(msmc_ptr);
ti_cblas_delete_kernel(__K);
index a1974118b4a308d3b030701de7882ca3861182e2..80844059f5489c2471543b788c3a6b0109967b28 100644 (file)
@@ -44,20 +44,19 @@ void cblas_zsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, cons
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_zsyr2k");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!zsyr2k_offload_dsp(Order,N,K)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zsyr2k");
- __real_cblas_zsyr2k(Order,Uplo,Trans,N,K,alpha,A,lda,B,ldb,beta,C,ldc);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zsyr2k", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!zsyr2k_offload_dsp(Order,N,K)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zsyr2k");
+ __real_cblas_zsyr2k(Order,Uplo,Trans,N,K,alpha,A,lda,B,ldb,beta,C,ldc);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zsyr2k", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zsyr2k");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zsyr2k");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -188,11 +186,12 @@ void cblas_zsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, cons
#else
err |= clSetKernelArg(__K, 12, sizeof(ldc), &ldc);
#endif
+
void *msmc_ptr;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(13, buf_MSMC);
#else
@@ -222,6 +221,7 @@ void cblas_zsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, cons
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_mem_free(msmc_ptr);
ti_cblas_delete_kernel(__K);
index 562202f8c988e0bb3debf15a4620b7f846439d37..99a661c2eb4d7c9cc4b8b258556295da61d44239 100644 (file)
@@ -44,20 +44,19 @@ void cblas_zsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_zsyrk");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!zsyrk_offload_dsp(Order,N,K)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zsyrk");
- __real_cblas_zsyrk(Order,Uplo,Trans,N,K,alpha,A,lda,beta,C,ldc);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zsyrk", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!zsyrk_offload_dsp(Order,N,K)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zsyrk");
+ __real_cblas_zsyrk(Order,Uplo,Trans,N,K,alpha,A,lda,beta,C,ldc);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zsyrk", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zsyrk");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zsyrk");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -167,11 +165,12 @@ void cblas_zsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const
#else
err |= clSetKernelArg(__K, 10, sizeof(ldc), &ldc);
#endif
+
void *msmc_ptr;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(11, buf_MSMC);
#else
@@ -201,7 +200,9 @@ void cblas_zsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_mem_free(msmc_ptr);
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_zsyrk");
index f4551f9c3de526c57f79a9f0f27ecac0de42e4c9..e3d2155a9169dbb634a769da1e3b92945b624a43 100644 (file)
@@ -44,20 +44,19 @@ void cblas_ztbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_ztbmv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ztbmv");
- __real_cblas_ztbmv(order,Uplo,TransA,Diag,N,K,A,lda,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ztbmv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ztbmv");
+ __real_cblas_ztbmv(order,Uplo,TransA,Diag,N,K,A,lda,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ztbmv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ztbmv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ztbmv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -165,6 +163,7 @@ void cblas_ztbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_ztbmv");
index 80cdb22a3d7d00fb6068fbb2d5a981e743cf9c5f..696a26c974435353c9a358d7008582421ca85b10 100644 (file)
@@ -44,20 +44,19 @@ void cblas_ztbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_ztbsv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ztbsv");
- __real_cblas_ztbsv(order,Uplo,TransA,Diag,N,K,A,lda,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ztbsv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ztbsv");
+ __real_cblas_ztbsv(order,Uplo,TransA,Diag,N,K,A,lda,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ztbsv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ztbsv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ztbsv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -165,6 +163,7 @@ void cblas_ztbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_ztbsv");
index 8198ad0898e672d632f0bb97c82f9ceed622789f..a40b4e497bead3e85ff9f3bd8d487bdaf99a4d7a 100644 (file)
@@ -44,20 +44,19 @@ void cblas_ztpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_ztpmv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ztpmv");
- __real_cblas_ztpmv(order,Uplo,TransA,Diag,N,Ap,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ztpmv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ztpmv");
+ __real_cblas_ztpmv(order,Uplo,TransA,Diag,N,Ap,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ztpmv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ztpmv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ztpmv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -153,6 +151,7 @@ void cblas_ztpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_ztpmv");
index 53df46db9d7d4b428d8ebf3ff6fb5c6a8af54357..15c9515d1da22e5b27c71d5e8a007f8de08f8400 100644 (file)
@@ -44,20 +44,19 @@ void cblas_ztpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_ztpsv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ztpsv");
- __real_cblas_ztpsv(order,Uplo,TransA,Diag,N,Ap,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ztpsv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ztpsv");
+ __real_cblas_ztpsv(order,Uplo,TransA,Diag,N,Ap,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ztpsv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ztpsv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ztpsv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -153,6 +151,7 @@ void cblas_ztpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_ztpsv");
index 20095bbd5c99300667b2b22a3919cc8d4f2e7360..4474751f82531e984151a79a992829f2e0280ef2 100644 (file)
@@ -44,20 +44,19 @@ void cblas_ztrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_ztrmm");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!ztrmm_offload_dsp(Order,Side,M,N)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ztrmm");
- __real_cblas_ztrmm(Order,Side,Uplo,TransA,Diag,M,N,alpha,A,lda,B,ldb);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ztrmm", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!ztrmm_offload_dsp(Order,Side,M,N)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ztrmm");
+ __real_cblas_ztrmm(Order,Side,Uplo,TransA,Diag,M,N,alpha,A,lda,B,ldb);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ztrmm", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ztrmm");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ztrmm");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -172,8 +170,8 @@ void cblas_ztrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
void *msmc_ptr;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(12, buf_MSMC);
#else
@@ -203,8 +201,11 @@ void cblas_ztrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_mem_free(msmc_ptr);
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_ztrmm");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_ztrmm", (float) clock_diff);
return ;
index 14c5990682cf43f1dfafe520c4e6fc8bcd3e5f14..1bad5fda21cf49f1fbe455941cabc482e5e6ab13 100644 (file)
@@ -44,20 +44,19 @@ void cblas_ztrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_ztrmv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ztrmv");
- __real_cblas_ztrmv(order,Uplo,TransA,Diag,N,A,lda,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ztrmv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ztrmv");
+ __real_cblas_ztrmv(order,Uplo,TransA,Diag,N,A,lda,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ztrmv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ztrmv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ztrmv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -159,6 +157,7 @@ void cblas_ztrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_ztrmv");
index 6c4c5c68165f7617ffec1ecbab625f0b6f4a909a..bc9bd319789c620421d3a59dfa2e5f3c0ad238eb 100644 (file)
@@ -44,20 +44,19 @@ void cblas_ztrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_ztrsm");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!ztrsm_offload_dsp(Order,Side,M,N)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ztrsm");
- __real_cblas_ztrsm(Order,Side,Uplo,TransA,Diag,M,N,alpha,A,lda,B,ldb);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ztrsm", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!ztrsm_offload_dsp(Order,Side,M,N)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ztrsm");
+ __real_cblas_ztrsm(Order,Side,Uplo,TransA,Diag,M,N,alpha,A,lda,B,ldb);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ztrsm", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ztrsm");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ztrsm");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -172,8 +170,8 @@ void cblas_ztrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
void *msmc_ptr;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(12, buf_MSMC);
#else
@@ -203,7 +201,9 @@ void cblas_ztrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_mem_free(msmc_ptr);
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_ztrsm");
index 498c208dd49e74436c5f268921f96abb0800ad48..f0cd18f2e167121e62409502bac6193a04400894 100644 (file)
@@ -44,20 +44,19 @@ void cblas_ztrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_ztrsv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ztrsv");
- __real_cblas_ztrsv(order,Uplo,TransA,Diag,N,A,lda,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ztrsv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ztrsv");
+ __real_cblas_ztrsv(order,Uplo,TransA,Diag,N,A,lda,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ztrsv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ztrsv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ztrsv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -159,6 +157,7 @@ void cblas_ztrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_ztrsv");
similarity index 52%
rename from blasblisacc/src/blas_wrap_gen.sh
rename to blasblisacc/src/wrap_gen/blas_wrap_gen.sh
index 5df08d327e8520e2f99c10678d376d0e549e41ec..cacce9fb743c64320ae12a4d02eb69173eb8bbd5 100755 (executable)
rename from blasblisacc/src/blas_wrap_gen.sh
rename to blasblisacc/src/wrap_gen/blas_wrap_gen.sh
index 5df08d327e8520e2f99c10678d376d0e549e41ec..cacce9fb743c64320ae12a4d02eb69173eb8bbd5 100755 (executable)
-./oclgen.pl -f -offload=002 -dspl1=4 -dspl2=128 -offmin=10000 -offmax=10000000 blas ./wrap_gen/cblas.h
+./oclgen.pl -f -offload=002 -dspl1=4 -dspl2=128 -offmin=10000 -offmax=10000000 blas ./cblas.h
+
+#sudo apt-get install ctags
\ No newline at end of file
similarity index 96%
rename from blasblisacc/src/oclgen.pl
rename to blasblisacc/src/wrap_gen/oclgen.pl
index 32bd923d9c8fe53d6d7e7635701fe0447891f174..c5302c55f9a8297260e860e95b6e693b9bc3f302 100755 (executable)
rename from blasblisacc/src/oclgen.pl
rename to blasblisacc/src/wrap_gen/oclgen.pl
index 32bd923d9c8fe53d6d7e7635701fe0447891f174..c5302c55f9a8297260e860e95b6e693b9bc3f302 100755 (executable)
my @offloaded; # array of function names to be offloaded, filled in by generate_arm...
my $blas_prefix = 'cblas_';
-my $blas_L1 = '.asum|.axpy|.copy|.dot|.sdot|.dotc|.dotu|.nrm2|.rot|.rotg|.rotmg|.scal|.swap|i.amax|i.amin|.cabs1|';
+my $blas_L1 = '.asum|.axpy|.copy|.dot|.sdot|.dotc|.cdotc_sub|.cdotu_sub|.dotu|.nrm2|.rot|.rotg|.rotmg|.scal|.swap|i.amax|i.amin|.cabs1|.csscal|.drotm|.dzasum|.dznrm2|.scasum|.scnrm2|.sdsdot|.srotm|.xerbla|.zdotu_sub|.zdscal|';
my $blas_L2 = '.gbmv|.gemv|.ger|.gerc|.geru|.hbmv|.hemv|.her|.her2|.hpmv|.hpr|.hpr2|.sbmv|.spmv|.spr|.spr2|.symv|.syr|.syr2|.tbmv|.tbsv|.tpmv|.tpsv|.trmv|.trsv|';
my $blas_L3 = '.gemm|.hemm|.herk|.her2k|.symm|.syrk|.syr2k|.trmm|.trsm|';
my $blas_L123 = "${blas_L1}|${blas_L2}|${blas_L3}";
return $code;
}
+sub get_init_code
+{
+ my $code = "
+void ti_bli_init_dsp(global char *l3_buf, local char *l2_buf);
+kernel void ocl_bli_init(global char *l3_buf, local char *l2_buf)
+{ ti_bli_init_dsp(l3_buf, l2_buf); }
+void ti_bli_finalize_dsp(void);
+kernel void ocl_bli_finalize(void)
+{ ti_bli_finalize_dsp(); }
+";
+ return $code;
+}
+
# generates the initial portion of the Makefile. This needs to be done
# only once, and not on a per-function basis
sub generate_makefile_prologue
my $func = shift;
my $NAMESPACE = uc($namespace);
$func_name = substr($func, 7);
-
+#JXU
+# print "function name is $func_name\n";
+#JXU
if (index($blas_L1, "${func_name}|") != -1) {
return "${NAMESPACE}_L1_OFFLOAD";
}
# return "${NAMESPACE}_L3_OFFLOAD" if ($func_name =~ /$blas_L3/);
# return "${NAMESPACE}_L2_OFFLOAD" if ($func_name =~ /$blas_L2/);
+#JXU
+ print "function name not matched!\n";
+#JXU
+
# if no match then use the default offload variable
return "${namespace}_offload";
}
#my $arm_func_cond = get_func_based_arm_cond($trampname, \@kernelargs);
my $arm_func_cond = get_offload_decision($trampname, \@kernelargs);
my $arm_condition_code = "";
+#JXU
+# print "trampname is $trampname\n";
+#JXU
my $offload_var = get_func_specific_offload_var($trampname);
+#JXU
+ print "offload_var is $offload_var\n";
+#JXU
my $indent = "";
my $arm_end_condition_code = "";
if (!$commentarm) {
/* Do an init on first use */
if (!${namespace}_init_done) ${namespace}_init();
${NAMESPACE}_DEBUG_PRINT("Intercepted call to %s\\n", "$trampname");
+ARM_FROM_PROTO
- ${NAMESPACE}_PROFILE_START();
+ if (index($trampname, "swap") != -1) {
+ $armcode .= "
+ /* OpenCL cannot deal with overlapping memory regions. This is an issue when you
+ * are trying to swap two rows of a matrix, where the matrix is column major. Hence,
+ * the offload of this routine to the DSP is disabled.
+ */
+#ifndef TI_CBLAS_SWAP_ENABLE_OFFLOAD
+ TI_CBLAS_PROFILE_START();
+ TI_CBLAS_DEBUG_PRINT(\"Executing ARM %s\\n\", \"$trampname\");
+ $no_offload_arm_call
+ TI_CBLAS_PROFILE_REPORT(\" Entire %s call (ARM) took %8.2f us\\n\",\"$trampname\", (float) clock_diff);
+ return ;
+#else
+";
+}
+ $armcode .= "
+ ${NAMESPACE}_PROFILE_START();
$arm_comment_header
- $arm_condition_code
- ${indent}${NAMESPACE}_DEBUG_PRINT("Executing ARM %s\\n", "$trampname");
- ${indent}$no_offload_arm_call
- ${indent}${NAMESPACE}_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\\n","$trampname", (float) clock_diff);
- ${indent}$no_offload_arm_return
- $arm_end_condition_code
- $arm_comment_trailer
+ $arm_condition_code
+ ${NAMESPACE}_DEBUG_PRINT(\"Executing ARM %s\\n\", \"$trampname\");
+ $no_offload_arm_call
+ ${NAMESPACE}_PROFILE_REPORT(\" Entire %s call (ARM) took %8.2f us\\n\",\"$trampname\", (float) clock_diff);
+ $no_offload_arm_return
+ $arm_end_condition_code
+ $arm_comment_trailer
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- ${indent}${NAMESPACE}_DEBUG_PRINT("Offloading to DSP %s\\n", "$trampname");
+ ${NAMESPACE}_DEBUG_PRINT(\"Offloading to DSP %s\\n\", \"$trampname\");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ${namespace}_get_kernel($trampdef, "ocl_$trampname");
+ __K = ${namespace}_get_kernel($trampdef, \"ocl_$trampname\");
#ifdef __cplusplus
try
#else
cl_int err = CL_SUCCESS;
#endif
{
-
-ARM_FROM_PROTO
+";
my $i = 0;
foreach $arg (@kernelargs) {
size_buf$arg = MAX(size_buf$arg,1);
#ifdef __cplusplus
- Buffer buf_$arg(${namespace}_ocl_context, $perms|CL_MEM_USE_HOST_PTR, size_buf$arg, (void *)$arg);
+ Buffer buf_$arg(*${namespace}_ocl_context, $perms|CL_MEM_USE_HOST_PTR, size_buf$arg, (void *)$arg);
__K->setArg($i, buf_$arg);
#else
cl_mem buf_$arg = clCreateBuffer(${namespace}_ocl_context, $perms|CL_MEM_USE_HOST_PTR, size_buf$arg, (void *)$arg, &err);
else {
$armcode .= "
#ifdef __cplusplus
- Buffer buf_$arg(${namespace}_ocl_context, $perms|CL_MEM_USE_HOST_PTR, ${modify_bufsize}sizeof($sizeoftype), (void *)$arg);
+ Buffer buf_$arg(*${namespace}_ocl_context, $perms|CL_MEM_USE_HOST_PTR, ${modify_bufsize}sizeof($sizeoftype), (void *)$arg);
__K->setArg($i, buf_$arg);
#else
cl_mem buf_$arg = clCreateBuffer(${namespace}_ocl_context, $perms|CL_MEM_USE_HOST_PTR, ${modify_bufsize}sizeof($sizeoftype), (void *)$arg, &err);
# print "kernel_name is " . $kernel_name . "\n";
# print "blas_L3 string is ". $blas_L3 . "\n";
if (index($blas_L3, '.'.$kernel_name.'|') != -1) {
- print "This is a level 3 function - " . $trampname . "\n";
+# print "This is a level 3 function - " . $trampname . "\n";
- $i_plus_1 = $i+1;
+# $i_plus_1 = $i+1;
$armcode .= "
+ void *msmc_ptr;
+ msmc_ptr = ${namespace}_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- Buffer buf_MSMC(${namespace}_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
+ Buffer buf_MSMC(*${namespace}_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(${namespace}_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg($i, buf_MSMC);
#else
- cl_mem buf_MSMC = clCreateBuffer(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE, NULL, &err);
+ //cl_mem buf_MSMC = clCreateBuffer(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE, NULL, &err);
+ cl_mem buf_MSMC = clCreateBuffer(${namespace}_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr, &err);
TI_CBLAS_OCL_CHKERROR(\"clCreateBuffer\",err);
err |= clSetKernelArg(__K, $i, sizeof(buf_MSMC), &buf_MSMC);
TI_CBLAS_OCL_CHKERROR(\"clSetKernelArg\",err);
#endif
-
-#ifdef __cplusplus
- __K->setArg($i_plus_1, __local(L2_BUF_SIZE));
-#else
- err |= clSetKernelArg(__K, $i_plus_1, L2_BUF_SIZE, NULL);
-#endif
";
+##ifdef __cplusplus
+# __K->setArg($i_plus_1, __local(L2_BUF_SIZE));
+##else
+# err |= clSetKernelArg(__K, $i_plus_1, L2_BUF_SIZE, NULL);
+##endif
+#";
}
if ($tramptype !~ /^void$/i) {
$armcode .= "
/* create a buffer argument to get the return value from the DSP */
$tramptype retval;
#ifdef __cplusplus
- Buffer buf_retval(${namespace}_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof($tramptype), &retval);
+ Buffer buf_retval(*${namespace}_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof($tramptype), &retval);
__K->setArg($i, buf_retval);
#else
cl_mem buf_retval = clCreateBuffer(${namespace}_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof($tramptype), &retval, &err);
}
$armcode .= "
#ifdef __cplusplus
- ${namespace}_ocl_Q.enqueueTask(*__K, 0, &e);
+ ${namespace}_ocl_Q->enqueueTask(*__K, 0, &e);
e.wait();
#else
cl_event e;
${NAMESPACE}_OCL_CHKERROR(\"clReleaseEvent\",err);
#endif
+";
+ if (index($blas_L3, '.'.$kernel_name.'|') != -1) {
+ $armcode .= "
+ ${namespace}_mem_free(msmc_ptr);
+";
+ }
+ $armcode .= "
+ ${namespace}_delete_kernel(__K);
+
${NAMESPACE}_DEBUG_PRINT(\"Finished executing %s\\n\", \"$trampname\");
${NAMESPACE}_PROFILE_REPORT(\" Entire %s call (DSP) took %8.2f us\\n\",\"$trampname\", (float) clock_diff);
return ";
$armcode .= "0" unless ($tramptype =~ /^void$/i);
$armcode .= ";\n";
$armcode .= "\t}\n#endif\n";
+ if (index($trampname, "swap") != -1) {
+ $armcode .= "#endif //TI_CBLAS_SWAP_ENABLE_OFFLOAD\n";
+ }
$armcode .= "}\n";
return $armcode;
sub generate_kernel_from_proto($)
{
my $string = shift;
+ print "In generate_kernel_from_proto, string is " . $string. "\n";
my $oclcode = "";
my @tmp = split /[\(\)]/,$string;
else {
if (index($blas_L3, '.'.$kernel_name.'|') != -1) {
print "In generate_kernel_from_proto, this is a level 3 function - " . $trampname . "\n";
- $oclcode .= ", global double *l3_buf, local double *l2_buf_loc";
- $trampproto .= ", global double *l3_buf, local double *l2_buf_loc";
+# $oclcode .= ", global double *l3_buf, local double *l2_buf_loc";
+# $trampproto .= ", global double *l3_buf, local double *l2_buf_loc";
+ $oclcode .= ", global double *l3_buf";
+ $trampproto .= ", global double *l3_buf";
}
}
}
$oclcode .= "${comma}retval" unless ($tramptype =~ /^void$/i);
if (index($blas_L3, '.'.$kernel_name.'|') != -1) {
- $oclcode .= ", l3_buf, l2_buf_loc";
+# $oclcode .= ", l3_buf, l2_buf_loc";
+ $oclcode .= ", l3_buf";
}
$oclcode .= "); }";
$oclcode = $trampproto . "\n" . $oclcode;
char *pool_kn_mem_L3;
char *pool_mn_mem_L3;
+extern void bli_mem_init();
+
+void ti_bli_init_dsp(char *l3_buf, char *l2_buf)
+{
+ bli_init();
+}
+
+void ti_bli_finalize_dsp(void)
+{
+ bli_finalize();
+}
+
FACADE_PROLOGUE
return $facade_prologue;
}
$kernel_name = substr($trampname, 7);
if (index($blas_L3, '.'.$kernel_name.'|') != -1) {
- $dspcode .= ", float *l3_buf, float *l2_buf_loc";
- $trampproto .= ", float *l3_buf, float *l2_buf_loc" ;
+# $dspcode .= ", float *l3_buf, float *l2_buf_loc";
+# $trampproto .= ", float *l3_buf, float *l2_buf_loc" ;
+ $dspcode .= ", float *l3_buf";
+ $trampproto .= ", float *l3_buf" ;
}
$trampproto .= ");";
pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
+ bli_mem_init();
+
#pragma omp parallel
{
__cache_l1d_flush();
}
else {
# print "facade code to setup cache for level 1 or 2 function ". $trampname ."\n";
+# $dspcode .= "
+# #pragma omp parallel
+# {
+# __cache_l2_flush();
+# __cache_l2_512k();
+# }
+#";
$dspcode .= "
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
";
}
}
else {
# print "facade code to return default cache for level 1 or 2 function ". $trampname ."\n";
- $dspcode .= "
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
-";
+# $dspcode .= "
+# // return default L2 cache (128 K)
+# #pragma omp parallel
+# {
+# __cache_l2_flush();
+# __cache_l2_128k();
+# }
+#";
}
$dspcode .= "}\n";
# $dspcode = $trampproto . "\n" . $dspcode;
write_output(generate_kernel_prologue(),"${namespace}_kernel.cl");
print "DSP ${namespace}_kernel.cl generated.\n";
write_output(get_enums_and_defines(),"${namespace}_kernel.cl");
+ write_output(get_init_code(),"${namespace}_kernel.cl");
print "ARM ${namespace}_initfini.c code:\n" unless ($f);
write_output(generate_arm_init(), "${namespace}_initfini.c");
#print "Makefile:\n" unless ($f);