summary | shortlog | log | commit | commitdiff | tree
raw | patch | inline | side by side (parent: b2c06ae)
raw | patch | inline | side by side (parent: b2c06ae)
author | Jianzhong Xu <a0869574@ti.com> | |
Wed, 22 Apr 2015 13:06:36 +0000 (09:06 -0400) | ||
committer | Jianzhong Xu <a0869574@ti.com> | |
Wed, 22 Apr 2015 13:06:36 +0000 (09:06 -0400) |
12 files changed:
build/tar_files_list.txt | patch | blob | history | |
examples/dgemm_test/Makefile | patch | blob | history | |
examples/dgemm_test/dgemm_test.c | patch | blob | history | |
examples/dsyrk_test/Makefile | [new file with mode: 0644] | patch | blob |
examples/dsyrk_test/dsyrk_test.c | [new file with mode: 0644] | patch | blob |
examples/ludinv/main.c | patch | blob | history | |
examples/ztrmm_test/Makefile | [new file with mode: 0644] | patch | blob |
examples/ztrmm_test/ztrmm_test.c | [new file with mode: 0644] | patch | blob |
examples/ztrsm_test/Makefile | [new file with mode: 0644] | patch | blob |
examples/ztrsm_test/ztrsm_test.c | [new file with mode: 0644] | patch | blob |
readme.txt | patch | blob | history | |
tuning/Makefile | patch | blob | history |
index d7796962320d44ced26949fb7dfe93b9a9133a2b..a3cb737b780a21ce79bd34d3120e444b5323817e 100644 (file)
--- a/build/tar_files_list.txt
+++ b/build/tar_files_list.txt
debian
examples/make.inc
examples/Makefile
-examples/eig
-examples/ludinv
examples/matmpy
-examples/sgemm_tune
-examples/dgemm_tune
-examples/cgemm_tune
-examples/zgemm_tune
examples/dgemm_test
+examples/dsyrk_test
+examples/ztrmm_test
+examples/ztrsm_test
+examples/eig
+examples/ludinv
+tuning
blis/version
blis/build
blis/CHANGELOG
index b7c549218de63bce068ce99701202e152af83d5e..503045316cdc27517c55ab2d91c2f52b2452ccb0 100644 (file)
$(EXE): dgemm_test.o
$(CC) $(CFLAGS) dgemm_test.o $(BLASLIB) -o $@
+run: $(EXE)
+ export TI_CBLAS_OFFLOAD=000;./$(EXE);cp dgemm_time.dat dgemm_time_ARM.dat; cp dgemm_gflops.dat dgemm_gflops_ARM.dat;\
+ export TI_CBLAS_OFFLOAD=001;./$(EXE);cp dgemm_time.dat dgemm_time_DSP.dat; cp dgemm_gflops.dat dgemm_gflops_DSP.dat;\
+ export TI_CBLAS_OFFLOAD=002;./$(EXE);cp dgemm_time.dat dgemm_time_OPT.dat; cp dgemm_gflops.dat dgemm_gflops_OPT.dat;
\ No newline at end of file
index 35e56bc5f8d8ca38e7a6bd3e996fe1c17cce0907..88b6de21de918fe1a5e93e56dde390cb08d390c1 100644 (file)
/******************************************************************************
- * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * Copyright (c) 2013-2015, Texas Instruments Incorporated - http://www.ti.com/
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
}
#endif
-#define TUNING_START_SIZE_RECTAN_MATRIX 64
+#define TUNING_START_SIZE_RECTAN_MATRIX 128
#define NUM_MATRIX_SIZE_TO_BENCHMARK 4
#define HAS_MEMORY 1
#define NO_MEMORY 0
int M, N, K, m, n, k;
int M_pre, N_pre, K_pre, M_start_size, N_start_size;
float time_secs_arm, gflops_arm, time_secs_dsp, gflops_dsp, time_secs_opt, gflops_opt;
- FILE *fp_time, *fp_flops;
+ FILE *fp_time, *fp_gflops;
fp_time = fopen("dgemm_time.dat","w");
- fp_flops = fopen("dgemm_flops.dat","w");
+ fp_gflops = fopen("dgemm_gflops.dat","w");
srand(12345);
/* setting up TI CBLAS during first call */
- run_dgemm(100, 100, 100, &time_secs_arm, &gflops_arm);
+ run_dgemm(1000, 1000, 1000, &time_secs_arm, &gflops_arm);
/* sweep M, K, and N */
for (M=TUNING_START_SIZE_RECTAN_MATRIX,m=0; m<NUM_MATRIX_SIZE_TO_BENCHMARK; m++,M*=2)
{
printf("Running DGEMM for (M,N,K) = (%d,%d,%d).\n", M,N,K);
- TI_CBLAS_L3_OFFLOAD = 0;
- dgemm_err = run_dgemm(M, N, K, &time_secs_arm, &gflops_arm);
+ dgemm_err = run_dgemm(M, N, K, &time_secs, &gflops);
if(dgemm_err == -1) { /* out of memory for DSP offloading */
printf("Out of memory for (M,N,K) = (%d,%d,%d).\n", M,N,K);
}
else {
- TI_CBLAS_L3_OFFLOAD = 1;
- dgemm_err = run_dgemm(M, N, K, &time_secs_dsp, &gflops_dsp);
-
- TI_CBLAS_L3_OFFLOAD = 2;
- dgemm_err = run_dgemm(M, N, K, &time_secs_opt, &gflops_opt);
-
- fprintf(fp_time, "%6d\t%6d\t%6d\t%10.8e\t%10.8e\t%10.8e\n",
- M, N, K, time_secs_arm, time_secs_dsp, time_secs_opt);
- fprintf(fp_flops, "%6d\t%6d\t%6d\t%10.8e\t%10.8e\t%10.8e\n",
- M, N, K, gflops_arm, gflops_dsp, gflops_opt);
+ fprintf(fp_time, "%6d\t%6d\t%6d\t%10.8e\n", M, N, K, time_secs);
+ fprintf(fp_gflops, "%6d\t%6d\t%6d\t%10.8e\n", M, N, K, gflops);
}
}
}
}
fclose(fp_time);
- fclose(fp_flops);
+ fclose(fp_gflops);
return 0;
}
diff --git a/examples/dsyrk_test/Makefile b/examples/dsyrk_test/Makefile
--- /dev/null
@@ -0,0 +1,12 @@
+
+EXE = dsyrk_test
+
+include ../make.inc
+
+$(EXE): dsyrk_test.o
+ $(CC) $(CFLAGS) dsyrk_test.o $(BLASLIB) -o $@
+
+run: $(EXE)
+ export TI_CBLAS_OFFLOAD=000;./$(EXE);cp dsyrk_time.dat dsyrk_time_ARM.dat; cp dsyrk_gflops.dat dsyrk_gflops_ARM.dat;\
+ export TI_CBLAS_OFFLOAD=001;./$(EXE);cp dsyrk_time.dat dsyrk_time_DSP.dat; cp dsyrk_gflops.dat dsyrk_gflops_DSP.dat;\
+ export TI_CBLAS_OFFLOAD=002;./$(EXE);cp dsyrk_time.dat dsyrk_time_OPT.dat; cp dsyrk_gflops.dat dsyrk_gflops_OPT.dat;
\ No newline at end of file
diff --git a/examples/dsyrk_test/dsyrk_test.c b/examples/dsyrk_test/dsyrk_test.c
--- /dev/null
@@ -0,0 +1,184 @@
+/******************************************************************************
+ * Copyright (c) 2013-2015, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <time.h>
+
+#include "cblas.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+#include "cblas.h"
+#ifdef __cplusplus
+}
+#endif
+
+#define START_SIZE_RECTAN_MATRIX 128
+#define NUM_MATRIX_SIZE_TO_TEST 4
+#define NUM_TEST_RUN 5
+
+/*-----------------------------------------------------------------------------
+* Timing Setup
+*----------------------------------------------------------------------------*/
+struct timespec t0,t1;
+#define tick() clock_gettime(CLOCK_MONOTONIC, &t0);
+#define tock() (clock_gettime(CLOCK_MONOTONIC, &t1), \
+ t1.tv_sec - t0.tv_sec + (t1.tv_nsec - t0.tv_nsec) / 1e9)
+
+/*-----------------------------------------------------------------------------
+* Global Variables
+*----------------------------------------------------------------------------*/
+double alpha = 0.7;
+double beta = 0.3;
+enum CBLAS_ORDER order = CblasColMajor;
+enum CBLAS_TRANSPOSE transA = CblasNoTrans;
+enum CBLAS_UPLO uplo = CblasUpper;
+
+extern int TI_CBLAS_L3_OFFLOAD;
+/*-----------------------------------------------------------------------------
+* Prototypes
+*----------------------------------------------------------------------------*/
+int check_results(const double *C1, const double *C2, int N, int K);
+int run_dsyrk(int N, int K, float *time, float *gflops);
+
+/*-----------------------------------------------------------------------------
+* MAIN
+*----------------------------------------------------------------------------*/
+int main()
+{
+ int dsyrk_err;
+ int N, K, n, k;
+ float time_secs, gflops;
+ FILE *fp_time;
+ FILE *fp_gflops;
+
+ fp_time = fopen("dsyrk_time.dat","w");
+ fp_gflops = fopen("dsyrk_gflops.dat","w");
+
+ srand(12345);
+
+ /* setting up TI CBLAS during first call */
+ run_dsyrk(1000, 1000, &time_secs, &gflops);
+
+ /* sweep K, and N */
+ for (N=START_SIZE_RECTAN_MATRIX,n=0; n<NUM_MATRIX_SIZE_TO_TEST; n++,N*=2)
+ {
+ for (K=START_SIZE_RECTAN_MATRIX,k=0; k<NUM_MATRIX_SIZE_TO_TEST; k++,K*=2)
+ {
+ printf("Running DSYRK for (N,K)=(%d,%d)\n", N,K);
+
+ dsyrk_err = run_dsyrk(N, K, &time_secs, &gflops);
+
+ if(dsyrk_err == -1) { /* out of memory for DSP offloading */
+ printf("Out of memory for (N,K) = (%d,%d).\n", N,K);
+ fprintf(fp_time, "%6d\t%6d\t%10.8e\n", N, K, -1.0);
+ fprintf(fp_gflops, "%6d\t%6d\t%10.8e\n", N, K, -1.0);
+ }
+ else {
+ fprintf(fp_time, "%6d\t%6d\t%10.8e\n", N, K, time_secs);
+ fprintf(fp_gflops, "%6d\t%6d\t%10.8e\n", N, K, gflops);
+ }
+ }
+ }
+ fclose(fp_time);
+ fclose(fp_gflops);
+
+ return 0;
+}
+
+
+int run_dsyrk(int N, int K, float *time, float *gflops)
+{
+ long long i, size_A, size_C;
+ int iter;
+ float time_secs, total_time, total_gflops, gflops_iter;
+ float operation_count = 1.0*(double)N*(double)N*(double)K;
+ int err_code = 0;
+
+ total_time= 0.0;
+ total_gflops = 0.0;
+ size_A = (long long)N*(long long)K;
+ size_C = (long long)N*(long long)N;
+
+ if( (size_A*sizeof(double)>(long long)0x0ffffffff)
+ ||(size_C*sizeof(double)>(long long)0x0ffffffff) ) {
+ return (-1);
+ }
+
+ for (iter = 0; iter < NUM_TEST_RUN; iter++)
+ {
+ /*-------------------------------------------------------------------------
+ * Allocate space for the matrices. The matrices that will be passed to
+ * the DSP are allocated using device memory.
+ *------------------------------------------------------------------------*/
+ double *A = (double *) __malloc_ddr(size_A*sizeof(double));
+ double *C = (double *) __malloc_ddr(size_C*sizeof(double));
+
+ if (!A || !C)
+ {
+ printf("Could not allocate enough space for the arrays!");
+ if(A) __free_ddr(A);
+ if(C) __free_ddr(C);
+
+ return (-1);
+ }
+
+ /*-------------------------------------------------------------------------
+ * Initialize matrices
+ *------------------------------------------------------------------------*/
+ for (i = 0; i < (long long)N*K; ++i) A[i] = (double)rand()/RAND_MAX;
+ for (i = 0; i < (long long)N*N; ++i) C[i] = (double)rand()/RAND_MAX;
+
+ int lda = ((order == CblasColMajor && transA == CblasNoTrans) ||
+ (order == CblasRowMajor && transA == CblasTrans)) ? N : K;
+
+ int ldc = N;
+
+ fflush(stdout);
+
+ /*------------------------------------------------------------------------
+ * Time dsyrk
+ *-----------------------------------------------------------------------*/
+ tick();
+ cblas_dsyrk(order,uplo,transA,N,K,alpha,A,lda,beta,C,ldc);
+ time_secs = tock();
+ total_time += time_secs;
+ gflops_iter = operation_count/time_secs*1e-9;
+ total_gflops += gflops_iter;
+
+ __free_ddr(A);
+ __free_ddr(C);
+ }
+
+ *gflops = total_gflops / (double)NUM_TEST_RUN;
+ *time = total_time / (double)NUM_TEST_RUN;
+
+ return err_code;
+}
+
diff --git a/examples/ludinv/main.c b/examples/ludinv/main.c
index e99932856ff483f61f274a1abf3d2118e0acc340..7660bdcfc794da0e6a9d2a3c77dd9eb92758ad69 100644 (file)
--- a/examples/ludinv/main.c
+++ b/examples/ludinv/main.c
if(argc == 1) { /* no command line arguments, use default */
num_test = 3;
n_min = 1000;
- n_inc = 1000;
+ n_inc = 500;
num_run = 1;
print_data = 0;
}
diff --git a/examples/ztrmm_test/Makefile b/examples/ztrmm_test/Makefile
--- /dev/null
@@ -0,0 +1,12 @@
+
+EXE = ztrmm_test
+
+include ../make.inc
+
+$(EXE): ztrmm_test.o
+ $(CC) $(CFLAGS) ztrmm_test.o $(BLASLIB) -o $@
+
+run: $(EXE)
+ export TI_CBLAS_OFFLOAD=000;./$(EXE);cp ztrmm_time.dat ztrmm_time_ARM.dat; cp ztrmm_gflops.dat ztrmm_gflops_ARM.dat;\
+ export TI_CBLAS_OFFLOAD=001;./$(EXE);cp ztrmm_time.dat ztrmm_time_DSP.dat; cp ztrmm_gflops.dat ztrmm_gflops_DSP.dat;\
+ export TI_CBLAS_OFFLOAD=002;./$(EXE);cp ztrmm_time.dat ztrmm_time_OPT.dat; cp ztrmm_gflops.dat ztrmm_gflops_OPT.dat;
\ No newline at end of file
diff --git a/examples/ztrmm_test/ztrmm_test.c b/examples/ztrmm_test/ztrmm_test.c
--- /dev/null
@@ -0,0 +1,194 @@
+/******************************************************************************
+ * Copyright (c) 2013-2015, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <time.h>
+#include <complex.h>
+
+#include "cblas.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+#include "cblas.h"
+#ifdef __cplusplus
+}
+#endif
+
+#define START_SIZE_RECTAN_MATRIX 128
+#define NUM_MATRIX_SIZE_TO_TEST 4
+#define NUM_TEST_RUN 5
+
+/*-----------------------------------------------------------------------------
+* Timing Setup
+*----------------------------------------------------------------------------*/
+struct timespec t0,t1;
+#define tick() clock_gettime(CLOCK_MONOTONIC, &t0);
+#define tock() (clock_gettime(CLOCK_MONOTONIC, &t1), \
+ t1.tv_sec - t0.tv_sec + (t1.tv_nsec - t0.tv_nsec) / 1e9)
+
+/*-----------------------------------------------------------------------------
+* Global Variables
+*----------------------------------------------------------------------------*/
+double complex alpha = 0.7 - 0.3*I;
+enum CBLAS_ORDER order = CblasColMajor;
+enum CBLAS_TRANSPOSE transA = CblasNoTrans;
+enum CBLAS_SIDE side = CblasLeft;
+enum CBLAS_UPLO uplo = CblasUpper;
+enum CBLAS_DIAG diag = CblasUnit;
+
+extern int TI_CBLAS_L3_OFFLOAD;
+
+/*-----------------------------------------------------------------------------
+* Prototypes
+*----------------------------------------------------------------------------*/
+int check_results(const double complex *C1, const double complex *C2, int M, int N);
+int run_ztrmm(int M, int N, float *time, float *gflops);
+
+/*-----------------------------------------------------------------------------
+* MAIN
+*----------------------------------------------------------------------------*/
+int main()
+{
+ int ztrmm_err;
+ int M, N, m, n;
+ float time_secs, gflops;
+ FILE *fp_time, *fp_gflops;
+
+ fp_time = fopen("ztrmm_time.dat","w");
+ fp_gflops = fopen("ztrmm_gflops.dat","w");
+
+ srand(12345);
+
+ /* setting up TI CBLAS during first call */
+ run_ztrmm(1000, 1000, &time_secs, &gflops);
+
+ /* sweep M, and N */
+ for (M=START_SIZE_RECTAN_MATRIX,m=0; m<NUM_MATRIX_SIZE_TO_TEST; m++,M*=2)
+ {
+ for (N=START_SIZE_RECTAN_MATRIX,n=0; n<NUM_MATRIX_SIZE_TO_TEST; n++,N*=2)
+ {
+ printf("Running ZTRMM for (M,N)=(%d,%d)\n", M,N);
+ ztrmm_err = run_ztrmm(M, N, &time_secs, &gflops);
+
+ if(ztrmm_err == -1) { /* out of memory for DSP offloading */
+ printf("Out of memory for (M,N) = (%d,%d).\n", M,N);
+ fprintf(fp_time, "%6d\t%6d\t%10.8e\n", M, N, -1.0);
+ fprintf(fp_gflops, "%6d\t%6d\t%10.8e\n", M, N, -1.0);
+ }
+ else {
+ fprintf(fp_time, "%6d\t%6d\t%10.8e\n", M, N, time_secs);
+ fprintf(fp_gflops, "%6d\t%6d\t%10.8e\n", M, N, gflops);
+ }
+ }
+ }
+
+ fclose(fp_time);
+ fclose(fp_gflops);
+
+ return 0;
+}
+
+int run_ztrmm(int M, int N, float *time, float *gflops)
+{
+ long long i, size_A, size_B;
+ int iter;
+ float time_secs, total_time, total_gflops, gflops_iter;
+ float operation_count;
+ int err_code = 0;
+
+ total_time= 0.0;
+ total_gflops = 0.0;
+ if(side == CblasLeft) {
+ size_A = (long long)M*(long long)M;
+ operation_count = 4.0*(double)M*(double)M*(double)N;
+ }
+ else {
+ size_A = (long long)N*(long long)N;
+ operation_count = 4.0*(double)M*(double)N*(double)N;
+ }
+ size_B = (long long)M*(long long)N;
+ if( (size_A*sizeof(double complex)>(long long)0x0ffffffff)
+ ||(size_B*sizeof(double complex)>(long long)0x0ffffffff) ) {
+ return (-1);
+ }
+
+ for (iter = 0; iter < NUM_TEST_RUN; iter++)
+ {
+ /*-------------------------------------------------------------------------
+ * Allocate space for the matrices. The matrices that will be passed to
+ * the DSP are allocated using device memory.
+ *------------------------------------------------------------------------*/
+ double complex *A = (double complex *) __malloc_ddr(size_A*sizeof(double complex));
+ double complex *B = (double complex *) __malloc_ddr(size_B*sizeof(double complex));
+
+ if (!A || !B)
+ {
+ printf("Could not allocate enough space for the arrays!");
+ if(A) __free_ddr(A);
+ if(B) __free_ddr(B);
+
+ return (-1);
+ }
+
+ /*-------------------------------------------------------------------------
+ * Initialize matrices
+ *------------------------------------------------------------------------*/
+ int lda = (side == CblasLeft) ? M : N;
+ int ldb = M;
+ for (i = 0; i < size_A; ++i)
+ {
+ A[i] = (double)rand()/RAND_MAX + (double)rand()/RAND_MAX * I;
+ }
+
+ for (i = 0; i < size_B; ++i)
+ {
+ B[i] = (double)rand()/RAND_MAX + (double)rand()/RAND_MAX * I;
+ }
+
+ /*------------------------------------------------------------------------
+ * Time ztrmm
+ *-----------------------------------------------------------------------*/
+ tick();
+ cblas_ztrmm(order,side,uplo,transA,diag,M,N,&alpha,A,lda,B,ldb);
+ time_secs = tock();
+ total_time += time_secs;
+ gflops_iter = operation_count/time_secs*1e-9;
+ total_gflops += gflops_iter;
+
+ __free_ddr(A);
+ __free_ddr(B);
+ }
+
+ *gflops = total_gflops / (double)NUM_TEST_RUN;
+ *time = total_time / (double)NUM_TEST_RUN;
+
+ return err_code;
+}
+
+
diff --git a/examples/ztrsm_test/Makefile b/examples/ztrsm_test/Makefile
--- /dev/null
@@ -0,0 +1,12 @@
+
+EXE = ztrsm_test
+
+include ../make.inc
+
+$(EXE): ztrsm_test.o
+ $(CC) $(CFLAGS) ztrsm_test.o $(BLASLIB) -o $@
+
+run: $(EXE)
+ export TI_CBLAS_OFFLOAD=000;./$(EXE);cp ztrsm_time.dat ztrsm_time_ARM.dat; cp ztrsm_gflops.dat ztrsm_gflops_ARM.dat;\
+ export TI_CBLAS_OFFLOAD=001;./$(EXE);cp ztrsm_time.dat ztrsm_time_DSP.dat; cp ztrsm_gflops.dat ztrsm_gflops_DSP.dat;\
+ export TI_CBLAS_OFFLOAD=002;./$(EXE);cp ztrsm_time.dat ztrsm_time_OPT.dat; cp ztrsm_gflops.dat ztrsm_gflops_OPT.dat;
\ No newline at end of file
diff --git a/examples/ztrsm_test/ztrsm_test.c b/examples/ztrsm_test/ztrsm_test.c
--- /dev/null
@@ -0,0 +1,202 @@
+/******************************************************************************
+ * Copyright (c) 2013-2015, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <time.h>
+#include <complex.h>
+
+#include "cblas.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+#include "cblas.h"
+#ifdef __cplusplus
+}
+#endif
+
+#define START_SIZE_RECTAN_MATRIX 128
+#define NUM_MATRIX_SIZE_TO_TEST 4
+#define NUM_TEST_RUN 5
+
+/*-----------------------------------------------------------------------------
+* Timing Setup
+*----------------------------------------------------------------------------*/
+struct timespec t0,t1;
+#define tick() clock_gettime(CLOCK_MONOTONIC, &t0);
+#define tock() (clock_gettime(CLOCK_MONOTONIC, &t1), \
+ t1.tv_sec - t0.tv_sec + (t1.tv_nsec - t0.tv_nsec) / 1e9)
+
+/*-----------------------------------------------------------------------------
+* Global Variables
+*----------------------------------------------------------------------------*/
+double complex alpha = 0.7 - 0.3*I;
+enum CBLAS_ORDER order = CblasColMajor;
+enum CBLAS_TRANSPOSE transA = CblasNoTrans;
+enum CBLAS_SIDE side = CblasLeft;
+enum CBLAS_UPLO uplo = CblasUpper;
+enum CBLAS_DIAG diag = CblasUnit;
+
+extern int TI_CBLAS_L3_OFFLOAD;
+
+/*-----------------------------------------------------------------------------
+* Prototypes
+*----------------------------------------------------------------------------*/
+int check_results(const double complex *C1, const double complex *C2, int M, int N);
+int run_ztrsm(int M, int N, float *time, float *gflops);
+
+/*-----------------------------------------------------------------------------
+* MAIN
+*----------------------------------------------------------------------------*/
+int main()
+{
+ int ztrsm_err;
+ int M, N, m, n;
+ float time_secs, gflops;
+ FILE *fp_time, *fp_gflops;
+
+ fp_time = fopen("ztrsm_time.dat","w");
+ fp_gflops = fopen("ztrsm_gflops.dat","w");
+
+ srand(12345);
+
+ /* setting up TI CBLAS during first call */
+ run_ztrsm(1000, 1000, &time_secs, &gflops);
+
+ /* sweep M, and N */
+ for (M=START_SIZE_RECTAN_MATRIX,m=0; m<NUM_MATRIX_SIZE_TO_TEST; m++,M*=2)
+ {
+ for (N=START_SIZE_RECTAN_MATRIX,n=0; n<NUM_MATRIX_SIZE_TO_TEST; n++,N*=2)
+ {
+ printf("Running ZTRSM for (M,N)=(%d,%d)\n", M,N);
+ ztrsm_err = run_ztrsm(M, N, &time_secs, &gflops);
+
+ if(ztrsm_err == -1) { /* out of memory for DSP offloading */
+ printf("Out of memory for (M,N) = (%d,%d).\n", M,N);
+ fprintf(fp_time, "%6d\t%6d\t%10.8e\n", M, N, -1.0);
+ fprintf(fp_gflops, "%6d\t%6d\t%10.8e\n", M, N, -1.0);
+ }
+ else {
+ fprintf(fp_time, "%6d\t%6d\t%10.8e\n", M, N, time_secs);
+ fprintf(fp_gflops, "%6d\t%6d\t%10.8e\n", M, N, gflops);
+ }
+ }
+ }
+
+ fclose(fp_time);
+ fclose(fp_gflops);
+
+ return 0;
+}
+
+int run_ztrsm(int M, int N, float *time, float *gflops)
+{
+ long long i, size_A, size_B;
+ int iter, j, k;
+ float time_secs, total_time, total_gflops, gflops_iter;
+ float operation_count;
+ int err_code = 0;
+
+ total_time= 0.0;
+ total_gflops = 0.0;
+ if(side == CblasLeft) {
+ size_A = (long long)M*(long long)M;
+ operation_count = 4.0*(double)M*(double)M*(double)N;
+ }
+ else {
+ size_A = (long long)N*(long long)N;
+ operation_count = 4.0*(double)M*(double)N*(double)N;
+ }
+ size_B = (long long)M*(long long)N;
+ if( (size_A*sizeof(double complex)>(long long)0x0ffffffff)
+ ||(size_B*sizeof(double complex)>(long long)0x0ffffffff) ) {
+ return (-1);
+ }
+
+ for (iter = 0; iter < NUM_TEST_RUN; iter++)
+ {
+ /*-------------------------------------------------------------------------
+ * Allocate space for the matrices. The matrices that will be passed to
+ * the DSP are allocated using device memory.
+ *------------------------------------------------------------------------*/
+ double complex *A = (double complex *) __malloc_ddr(size_A*sizeof(double complex));
+ double complex *B = (double complex *) __malloc_ddr(size_B*sizeof(double complex));
+
+ if (!A || !B)
+ {
+ printf("Could not allocate enough space for the arrays!");
+ if(A) __free_ddr(A);
+ if(B) __free_ddr(B);
+
+ return (-1);
+ }
+
+ /*-------------------------------------------------------------------------
+ * Initialize matrices
+ *------------------------------------------------------------------------*/
+ int lda = (side == CblasLeft) ? M : N;
+ int ldb = M;
+ for(j=0;j<lda;j++)
+ {
+ for(k=0;k<lda;k++)
+ {
+ if (j==k)
+ A[j*lda+k] = 1.0+j + 0.0*I;
+ else if (j<k)
+ A[j*lda+k] = 0.0 + 0.0*I;
+ else
+ A[j*lda+k] = (double)rand()/RAND_MAX + (double)rand()/RAND_MAX * I;
+ }
+ }
+
+ for (i = 0; i < size_B; ++i)
+ {
+ B[i] = (double)rand()/RAND_MAX + (double)rand()/RAND_MAX * I;
+ }
+
+ /*------------------------------------------------------------------------
+ * Time ztrsm
+ *-----------------------------------------------------------------------*/
+ tick();
+ cblas_ztrsm(order,side,uplo,transA,diag,M,N,&alpha,A,lda,B,ldb);
+ time_secs = tock();
+ total_time += time_secs;
+ gflops_iter = operation_count/time_secs*1e-9;
+ total_gflops += gflops_iter;
+
+ __free_ddr(A);
+ __free_ddr(B);
+ }
+
+ *gflops = total_gflops / (double)NUM_TEST_RUN;
+ *time = total_time / (double)NUM_TEST_RUN;
+
+ return err_code;
+}
+
+
diff --git a/readme.txt b/readme.txt
index 8a480382755ec6a7648a53e34718dbd9c5ecae05..5364fc33ae71fbce82871658664e66b0c7343cb8 100644 (file)
--- a/readme.txt
+++ b/readme.txt
-
-========== Run applications with LINALG libraries: ==========
-1. include following headers files located at /usr/include after MCSDK-HPC installation:
- - BLAS: cblas.h
- - LAPACK: f2c.h, blaswrap.h, clapack.h
- - Note: fc2.h has a complex type which is different from the complex type in
- C99 complex.h. If f2c.h is included, C99 complex.h should not be used.
-2. link following LINALG libraries located at /usr/lib after MCSDK-HPC installation:
- - BLAS: libblis.a, libcblas_armplusdsp.a
- - LAPACK: libcblaswr.a, liblapack.a, libf2c.a
- - Note: cblas calls are not thread safe
-3. setup environment variables:
- - BLIS_IC_NT for number of ARM threads: 1 through 4
- - TI_CBLAS_OFFLOAD for BLAS offloading (level 1, 2, or 3) configuration:
- - set to xyz, where x,y,z correspond to level 1, level 2, and level 3 and
- can take any of 3 values:
- - 0: no offloading to DSP, ie. always running on ARM
- - 1: forced offloading to DSP, ie. always running on DSP
- - 2: conditional offloading to DSP based on matrix sizes
- - example: TI_CBLAS_OFFLOAD=001 means level 1&2 functions will always run
- on ARM, and level 3 functions will always run on DSP.
- - default offloading configuration if TI_CBLAS_OFFLOAD is not set:
- - 002 (level 1&2 no offloading, level 3 offloading based on sizes)
- - Note: in this release, conditional offloading (value 2) is not available
- for level 1 and level 2. If this option is configured for level 1
- and level 2, functions will be offloaded to DSP.
-
-========== Rebuild LINALG libraries: ==========
-1. rebuild ARM without rebuilding DSP code: make ARMonly
-2. rebuild both DSP and ARM code: make ARMplusDSP
-
-========== Install LINALG libraries: ==========
-1. install libraries to /usr/lib: make install
-2. install libraries to another directory: make install DESTDIR=<directory name>
-
+For information about how to use LINALG library, please go to: http://processors.wiki.ti.com/index.php/MCSDK_HPC_3.x_Linear_Algebra_Library.
\ No newline at end of file
diff --git a/tuning/Makefile b/tuning/Makefile
index 5aad157bce26297c391bd5d96462237fc78a2e96..4de0d0bc31aa63937c042fe2bc76065de685e1ac 100644 (file)
--- a/tuning/Makefile
+++ b/tuning/Makefile
echo "=============== " $$dir " =================" ; \
$(MAKE) -C $$dir tune; \
done; \
- mkdir ../ofld_tbls; find . -iname "ofld_tbl*.c" -exec cp {} ../ofld_tbls \;
+ mkdir ofld_tbls; find *_tune -name *ofld*.c -exec cp {} ofld_tbls \;
clean:
for dir in $(DIRS); do \