summary | shortlog | log | commit | commitdiff | tree
raw | patch | inline | side by side (parent: 53e6407)
author | Jianzhong Xu <xuj@ti.com> | Mon, 11 Jan 2016 19:58:34 +0000 (19:58 +0000)
committer | Jianzhong Xu <xuj@ti.com> | Mon, 11 Jan 2016 19:58:34 +0000 (19:58 +0000)
diff --git a/Makefile b/Makefile
index b7e5e3d9907f0a061ef8b5a1949d40b1fbe174c9..30457b62dbcf1340b2f1d934cf1bb08445a1b64d 100644 (file)
--- a/Makefile
+++ b/Makefile
cd ../$(LINALG_CLAPACK_DIR); make f2clib; make cblaswrap; cd SRC; make -j8; cd ..
ARMplusDSP: DSPlibs ARMlibs
- cd $(LINALG_BLASACC_DIR)/src; make debug MEM_MODEL=$(MEM_MODEL) TARGET=$(TARGET); cd ../..; \
+ cd $(LINALG_BLASACC_DIR)/src; make MEM_MODEL=$(MEM_MODEL) TARGET=$(TARGET); cd ../..; \
cp $(LINALG_BLASACC_DIR)/lib/libcblas_armplusdsp.a ./lib; \
cp $(LINALG_BLIS_DIR)/install/arm/lib/libblis.a ./lib; \
cp $(LINALG_CLAPACK_DIR)/lapack_ARM.a ./lib/liblapack.a; \
cleanDSPlibs:
cd $(LINALG_CBLAS_DIR); make arch=C66 clean; \
cd ../$(LINALG_BLIS_DIR); ./configure -p install/$(BLIS_CFG) c66x; make -j8 clean; \
- cd ../$(LINALG_TICBLAS_DIR)/src; make clean; cd ../lib/objs; rm *; cd ../; rm libcblas.ae66; rm -r objs; cd ../..;
+ cd ../$(LINALG_TICBLAS_DIR)/src; make clean;
cleanARMlibs:
cd $(LINALG_CBLAS_DIR); make arch=ARM clean; \
#DSPonly:
BLIStest:
- cd ../$(LINALG_BLIS_DIR); ./configure -p install/arm cortex-a15; \
- cd $(LINALG_BLIS_DIR)/testsuite; make lib=OpenCLCBLAS -j8
+ cd $(LINALG_BLIS_DIR); ./configure -p install/arm cortex-a15; \
+ cd testsuite; make lib=OpenCLCBLAS -j8
BLAStest:
cd $(LINALG_CLAPACK_DIR)/BLAS/TESTING; make -f Makeblat1; make -f Makeblat2; make -f Makeblat3
cleanARMplusDSP: cleanDSPlibs cleanARMlibs
-
+
clean:
cd $(LINALG_CBLAS_DIR)/src; make arch=ARM clean; \
index 0d606b338600ea77eea9ce0f73790e10eafcfd89..1946377c397f40d394014ad3b3564c96e7f30836 100644 (file)
--- a/blasblisacc/src/Makefile
+++ b/blasblisacc/src/Makefile
OBJS = ti_cblas_initfini.o
# CBLAS and BLIS directories
-CBLAS_DSP_LIB = ../../cblas/lib/C66/libcblas_C66.ae66
+CBLAS_DSP_LIB = ../../cblas/lib/C66/libcblas.ae66
TICBLAS_DSP_LIB = ../../ticblas/lib/libticblas.ae66
CBLAS_ARM_LIB = ../../cblas/lib/ARM/libcblas_ARM.a
LIBARCH_LIB = $(LIBARCH_DIR)/lib/libArch.ae66
debug: CPP_FLAGS += -DTI_CBLAS_DEBUG $(CPP_DEBUG) #-DTI_CBLAS_PROFILE
debug: cross
+debug: CL6X_FLAGS += -DTI_CBLAS_DEBUG
profile: CPP_FLAGS += -DTI_CBLAS_PROFILE
profile: armplusdsp
index ed60c2ba25d02155fb51840cace7fc6bfbec5a71..3beb479988a780340ea2f372c4c350d5210285b0 100644 (file)
--- a/blasblisacc/src/facade.c
+++ b/blasblisacc/src/facade.c
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-//#include "stdio.h"
+
#include "../../cblas/include/cblas.h"
#include "../../ticblas/ticblas.h"
+#ifdef TI_CBLAS_DEBUG
+#include "stdio.h"
+
+extern char *pool_mk_mem_L1;
+extern char *pool_kn_mem_L1;
+extern char *pool_mn_mem_L1;
+extern char *pool_mk_mem_L2;
+extern char *pool_kn_mem_L2;
+extern char *pool_mn_mem_L2;
+extern char *pool_mk_mem_L3;
+extern char *pool_kn_mem_L3;
+extern char *pool_mn_mem_L3;
+#endif
+
extern int bli_l3_mem_config(double *msmc_buf, size_t msmc_buf_size, size_t *l1D_SRAM_size_orig, size_t *l2_SRAM_size_orig);
extern int bli_l3_mem_reconfig(size_t l1D_SRAM_size_orig, size_t l2_SRAM_size_orig);
@@ -307,19 +321,7 @@ void cblas_dgbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE
{
cblas_dgbmv(order, TransA, M, N, KL, KU, alpha, A, lda, X, incX, beta, Y, incY);
}
-/*
-extern char *pool_mk_mem_L1;
-extern char *pool_kn_mem_L1;
-extern char *pool_mn_mem_L1;
-
-extern char *pool_mk_mem_L2;
-extern char *pool_kn_mem_L2;
-extern char *pool_mn_mem_L2;
-extern char *pool_mk_mem_L3;
-extern char *pool_kn_mem_L3;
-extern char *pool_mn_mem_L3;
-*/
void cblas_dgemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const double alpha, const double *A, const int lda, const double *B, const int ldb, const double beta, double *C, const int ldc, double *l3_buf, size_t l3_buf_size, int *err_code)
{
size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
@@ -328,10 +330,12 @@ void cblas_dgemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE
if(*err_code != TICBLAS_SUCCESS) {
return;
}
-
- //printf("pool_mk_mem_L1 is 0x%x, pool_kn_mem_L1 is 0x%x, pool_mn_mem_L1 is 0x%x.\n", (unsigned int)pool_mk_mem_L1, (unsigned int)pool_kn_mem_L1, (unsigned int)pool_mn_mem_L1);
- //printf("pool_mk_mem_L2 is 0x%x, pool_kn_mem_L2 is 0x%x, pool_mn_mem_L2 is 0x%x.\n", (unsigned int)pool_mk_mem_L2, (unsigned int)pool_kn_mem_L2, (unsigned int)pool_mn_mem_L2);
- //printf("pool_mk_mem_L3 is 0x%x, pool_kn_mem_L3 is 0x%x, pool_mn_mem_L3 is 0x%x.\n", (unsigned int)pool_mk_mem_L3, (unsigned int)pool_kn_mem_L3, (unsigned int)pool_mn_mem_L3);
+
+#ifdef TI_CBLAS_DEBUG
+ printf("pool_mk_mem_L1 is 0x%x, pool_kn_mem_L1 is 0x%x, pool_mn_mem_L1 is 0x%x.\n", (unsigned int)pool_mk_mem_L1, (unsigned int)pool_kn_mem_L1, (unsigned int)pool_mn_mem_L1);
+ printf("pool_mk_mem_L2 is 0x%x, pool_kn_mem_L2 is 0x%x, pool_mn_mem_L2 is 0x%x.\n", (unsigned int)pool_mk_mem_L2, (unsigned int)pool_kn_mem_L2, (unsigned int)pool_mn_mem_L2);
+ printf("pool_mk_mem_L3 is 0x%x, pool_kn_mem_L3 is 0x%x, pool_mn_mem_L3 is 0x%x.\n", (unsigned int)pool_mk_mem_L3, (unsigned int)pool_kn_mem_L3, (unsigned int)pool_mn_mem_L3);
+#endif
cblas_dgemm(Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
index c08b76ec14b133622e77be3c160e4d1b9d053ad8..e1879fb27e3123fda8a8c9acd2390e199b4be3a8 100644 (file)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-
-/*#include <stdio.h>*/
+#ifdef TI_CBLAS_DEBUG
+#include <stdio.h>
+#include <ti/csl/csl_chipAux.h>
+#include <ti/csl/csl_idmaAux.h>
+#endif
#include "../../ticblas/ticblas.h"
#include <libarch.h>
@@ -55,10 +58,12 @@ int bli_l3_mem_config(double *msmc_buf, size_t msmc_buf_size, size_t *l1D_SRAM_s
/* Configure L1D if necessary */
*l1D_SRAM_size_orig = lib_get_L1D_SRAM_size(); /* get current L1D SRAM size */
l1d_cfg_err = LIB_CACHE_SUCCESS;
-/*
- printf("Original L1D SRAM size is: %d\n", *l1D_SRAM_size_orig);
- printf("Required L1D SRAM size is: %d\n", smem_size_vfast);
-*/
+
+#ifdef TI_CBLAS_DEBUG
+ printf("Original L1D SRAM size is: %d\n", *l1D_SRAM_size_orig);
+ printf("Required L1D SRAM size is: %d\n", smem_size_vfast);
+#endif
+
if(*l1D_SRAM_size_orig < smem_size_vfast) { /* configure L1D if needs more SRAM */
/*printf("Configuring L1D SRAM on all cores.\n");*/
#pragma omp parallel
@@ -66,20 +71,35 @@ int bli_l3_mem_config(double *msmc_buf, size_t msmc_buf_size, size_t *l1D_SRAM_s
l1d_cfg_err = lib_L1D_config_SRAM(smem_size_vfast);
}
}
-/*
+
+#ifdef TI_CBLAS_DEBUG
#pragma omp parallel
{
int core_id = lib_get_coreID();
- printf("New L1D SRAM size from core %d is: %d\n", core_id, lib_get_L1D_SRAM_size());
+ printf("New L1D SRAM size from core %d is: %d\n", core_id, lib_get_L1D_SRAM_size());
}
-*/
+
+ CSL_IDMA_chan1Wait();
+ printf("IDMA1 source register: 0x%x, destination register: 0x%x, count register: 0x%x.\n", (unsigned int)hIdma->IDMA1_SOURCE, (unsigned int)hIdma->IDMA1_DEST, (unsigned int)hIdma->IDMA1_COUNT);
+ printf("Configure IDMA1 to transfer 128 bytes from 0x820000 to 0xf00000.\n");
+ hIdma->IDMA1_SOURCE = (uint32_t)0x820000;
+ hIdma->IDMA1_DEST = (uint32_t)0xf00000;
+ hIdma->IDMA1_COUNT = CSL_FMK(CGEM_IDMA1_COUNT_PRI, (uint32_t)7) |
+ CSL_FMK(CGEM_IDMA1_COUNT_INT, (uint32_t)0) |
+ CSL_FMK(CGEM_IDMA1_COUNT_FILL, (uint32_t)0) |
+ CSL_FMK(CGEM_IDMA1_COUNT_COUNT, 128);
+ printf("IDMA1 source register: 0x%x, destination register: 0x%x, count register: 0x%x.\n", (unsigned int)hIdma->IDMA1_SOURCE, (unsigned int)hIdma->IDMA1_DEST, (unsigned int)hIdma->IDMA1_COUNT);
+#endif
+
/* Configure L2 if necessary */
*l2_SRAM_size_orig = lib_get_L2_SRAM_size(); /* get current L2 SRAM size */
l2_cfg_err = LIB_CACHE_SUCCESS;
-/*
+
+#ifdef TI_CBLAS_DEBUG
printf("Original L2 SRAM size is: %d\n", *l2_SRAM_size_orig);
printf("Required L2 SRAM size is: %d\n", smem_size_fast);
-*/
+#endif
+
if(*l2_SRAM_size_orig < smem_size_fast) { /* configure L2 if needs more SRAM */
#pragma omp parallel
{
@@ -90,16 +110,21 @@ int bli_l3_mem_config(double *msmc_buf, size_t msmc_buf_size, size_t *l1D_SRAM_s
if(l1d_cfg_err || l2_cfg_err) {
return(TICBLAS_INIT_ERROR);
}
-/*
- printf("New L2 SRAM size is: %d\n", lib_get_L2_SRAM_size());
-*/
+
+#ifdef TI_CBLAS_DEBUG
+ printf("New L2 SRAM size is: %d\n", lib_get_L2_SRAM_size());
+#endif
+
/* get L1D and L2 SRAM base address */
l1d_SRAM_ptr = lib_get_L1D_SRAM_base();
l2_SRAM_ptr = lib_get_L2_SRAM_base();
-/*
- printf("L1D SRAM base address is 0x%x.\n", (unsigned int)l1d_SRAM_ptr);
- printf("L2 SRAM base address is 0x%x.\n", (unsigned int) l2_SRAM_ptr);
-*/
+
+#ifdef TI_CBLAS_DEBUG
+ printf("L1D SRAM base address is 0x%x.\n", (unsigned int)l1d_SRAM_ptr);
+ printf("L2 SRAM base address is 0x%x.\n", (unsigned int) l2_SRAM_ptr);
+ printf("MSMC SRAM address is 0x%x.\n", (unsigned int) msmc_buf);
+#endif
+
/* pass allocated memories for heap initialization */
return(tiCblasInit(l1d_SRAM_ptr, smem_size_vfast,
l2_SRAM_ptr, smem_size_fast,
/* This function will be removed. Function tiCblasNew() will be used instead. */
void ti_bli_init_dsp(char *l3_buf, char *l2_buf)
-{
+{
+#ifdef TI_CBLAS_DEBUG
printf("In function ti_bli_init_dsp, l3_buff is 0x%x, l2_buf is 0x%x.\n", (unsigned int)l3_buf, (unsigned int)l2_buf);
+#endif
+
bli_init();
}
index 72712bbf69ffb5b47f09507e4bea0149d328995e..43d2d05be5e6da4ddf99764bd45f77e4ac15b046 100644 (file)
{ \
/*Always use MR and NR while transfering a packed panel*/ \
lib_emt_copy1D1D(emt_handle_b, b1, b1_L1, k*NR*sizeof(ctype)); \
- lib_imt_copy(a1 = a_cast + ir_thread_id * rstep_a, a2_L1, k*MR*sizeof(ctype), 0, 0, 7);\
+ lib_imt_copy(a1 = a_cast + ir_thread_id * rstep_a, a2_L1, k*MR*sizeof(ctype));\
} \
\
/* Loop over the m dimension (MR rows at a time). */ \
lib_imt_wait(); \
if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \
{ \
- lib_imt_copy(a1 = a_cast + ir_thread_id * rstep_a, a2_L1, k*MR*sizeof(ctype), 0, 0, 7); \
+ lib_imt_copy(a1 = a_cast + ir_thread_id * rstep_a, a2_L1, k*MR*sizeof(ctype)); \
a2 = a_cast; \
b2 = gemm_get_next_b_micropanel( thread, b1, cstep_b ); \
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
} \
else \
{ \
- lib_imt_copy(a2, a2_L1, k*MR*sizeof(ctype), 0, 0, 7); \
+ lib_imt_copy(a2, a2_L1, k*MR*sizeof(ctype)); \
} \
\
if(i == ir_thread_id) \
index 9ae98b67ac2bbac863ed60de090c751c60816fe5..ac0b20eee6e1ddd354b7a940a0bcda476d2ee7dd 100644 (file)
{ \
/*Storing the value back*/\
/*lib_imt_wait(); \
- lib_imt_copy(a11, a1 + ( off_b11 * PACKMR ) / off_scl, k_b11*PACKMR*sizeof(ctype), 0,0,7);*/ \
+ lib_imt_copy(a11, a1 + ( off_b11 * PACKMR ) / off_scl, k_b11*PACKMR*sizeof(ctype));*/ \
{ \
ctype *ptr_source; \
ctype *ptr_dest; \
diff --git a/cblas/Makefile b/cblas/Makefile
index 967e24e888c2c418e0ea0e0695da4b7926cb8e0d..d0e292b2ce3360492feb7030b392f68269e3ec1a 100644 (file)
--- a/cblas/Makefile
+++ b/cblas/Makefile
allprecision:
( cd src && make all)
libinstall:
-ifeq ($(arch), C66)
- (cp $(TEMPCBLIB) $(patsubst %.a, %.ae66, $(TEMPCBLIB) ))
-endif
stest1: link
( cd testing && make stest1 )
diff --git a/cblas/Makefile.C66 b/cblas/Makefile.C66
index 3f6d4999e9cc2981507427be5c1758e715dec67e..cbb8cdb00171252ce2074adf82c0b39d4b7acc1f 100644 (file)
--- a/cblas/Makefile.C66
+++ b/cblas/Makefile.C66
# Libraries and includes
#-----------------------------------------------------------------------------
-CBLIB = ../lib/$(PLAT)/libcblas.a
+CBLIB = ../lib/$(PLAT)/libcblas.ae66
LN_S = ln -sf
diff --git a/cblas/Makefile.in b/cblas/Makefile.in
index 61dcf9de53db21d1b4a46c9cabda74690b1a608d..43f68c27d7d52cc0e5618c08434490378ff8236b 120000 (symlink)
--- a/cblas/Makefile.in
+++ b/cblas/Makefile.in
-Makefile.ARM
\ No newline at end of file
+Makefile.C66
\ No newline at end of file
index e061337cf13d12635e80476e359f713b3cbea133..88358b1b6486c3085cfdf51bbae69d40c1d43433 100644 (file)
#define NUM_TEST_RUN 5
+#define GFLOPS_MARGIN (1.1f)
/*-----------------------------------------------------------------------------
* Timing Setup
int num_size, gemm_err;
int M, N, K, m, n, k, test_idx;
float time_secs, gflops, gflops_ref, cpu_freq_GHz;
- cl_platform_id platform;
- cl_uint num_platforms;
- cl_device_id devices;
- cl_uint num_devices;
+ cl_platform_id platform;
+ cl_uint num_platforms;
+ cl_device_id devices;
+ cl_uint num_devices;
cl_uint cpu_freq;
- size_t cpu_freq_size;
+ size_t cpu_freq_size;
FILE *fp_time, *fp_gflops;
- if(clGetPlatformIDs(1, &platform, &num_platforms) != CL_SUCCESS) {
- printf("Error in clGetPlatformIDs\n.");
- exit(0);
- }
-
- if(clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &devices, &num_devices) != CL_SUCCESS) {
- printf("Error in clGetDeviceIDs\n.");
- exit(0);
- }
- if(clGetDeviceInfo(devices, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(cl_uint), (void *)&cpu_freq, &cpu_freq_size) != CL_SUCCESS) {
- printf("Error in clGetDeviceInfo\n.");
- exit(0);
- }
- cpu_freq_GHz = (float)cpu_freq/1e3; /* convert from MHz to GHz */
- printf("CPU frequency is %f GHz.\n", cpu_freq_GHz);
-
+ if(clGetPlatformIDs(1, &platform, &num_platforms) != CL_SUCCESS) {
+ printf("Error in clGetPlatformIDs\n.");
+ exit(0);
+ }
+
+ if(clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &devices, &num_devices) != CL_SUCCESS) {
+ printf("Error in clGetDeviceIDs\n.");
+ exit(0);
+ }
+ if(clGetDeviceInfo(devices, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(cl_uint), (void *)&cpu_freq, &cpu_freq_size) != CL_SUCCESS) {
+ printf("Error in clGetDeviceInfo\n.");
+ exit(0);
+ }
+ cpu_freq_GHz = (float)cpu_freq/1e3; /* convert from MHz to GHz */
+ printf("CPU frequency is %f GHz.\n", cpu_freq_GHz);
+
srand(12345);
- /* setting up TI CBLAS during first call */
- run_dgemm(1000, 1000, 1000, &time_secs, &gflops);
-
+ /* setting up TI CBLAS during first call */
+ run_dgemm(1000, 1000, 1000, &time_secs, &gflops);
+
/*------- benchmarking DGEMM ------- */
fp_time = fopen("dgemm_time.dat","w");
fp_gflops = fopen("dgemm_gflops.dat","w");
-
- test_idx = 0;
+
+ test_idx = 0;
for (M=GEMM_MATRIX_SIZE_START,m=0; m<NUM_MATRIX_SIZE_TO_BENCHMARK; m++,M*=2)
{
for (N=GEMM_MATRIX_SIZE_START,n=0; n<NUM_MATRIX_SIZE_TO_BENCHMARK; n++,N*=2)
for (K=GEMM_MATRIX_SIZE_START,k=0; k<NUM_MATRIX_SIZE_TO_BENCHMARK; k++,K*=2)
{
printf("Running DGEMM for (M,N,K) = (%d,%d,%d). ", M,N,K);
-
+
gemm_err = run_dgemm(M, N, K, &time_secs, &gflops);
-
- gflops_ref = dgemm_gflops_ref[test_idx++]; /* read reference GFLOPS */
- gflops_ref = gflops_ref * cpu_freq_GHz; /* scale ref GFLOPS by CPU freq */
+
+ if(gemm_err == -1) { /* out of memory for DSP offloading */
+ printf("DGEMM out of memory for (M,N,K) = (%d,%d,%d).\n", M,N,K);
+ exit(0);
+ }
+ else {
+ fprintf(fp_time, "%6d\t%6d\t%6d\t%10.8e\n", M, N, K, time_secs);
+ fprintf(fp_gflops, "%6d\t%6d\t%6d\t%10.8e\n", M, N, K, gflops);
+ }
+
+ gflops_ref = dgemm_gflops_ref[test_idx++]; /* read reference GFLOPS */
+ gflops_ref = gflops_ref * cpu_freq_GHz; /* scale ref GFLOPS by CPU freq */
printf("Measured %f GFLOPS, reference %f GFLOPS.\n", gflops, gflops_ref);
- if((gflops > gflops_ref*1.1) || (gflops < gflops_ref/1.1)) {
- printf("DGEMM test FAILED! GFLOPS deviates from reference unacceptably.");
- exit(0);
- }
+ if((gflops > gflops_ref*GFLOPS_MARGIN) || (gflops < gflops_ref/GFLOPS_MARGIN)) {
+ printf("DGEMM test FAILED! GFLOPS deviates from reference unacceptably.\n");
+ exit(0);
+ }
if(gemm_err == -1) { /* out of memory for DSP offloading */
printf("Out of memory for (M,N,K) = (%d,%d,%d).\n", M,N,K);
- exit(0);
+ exit(0);
}
else {
fprintf(fp_time, "%6d\t%6d\t%6d\t%10.8e\n", M, N, K, time_secs);
/*------- benchmarking SGEMM -------*/
fp_time = fopen("sgemm_time.dat","w");
fp_gflops = fopen("sgemm_gflops.dat","w");
-
- test_idx = 0;
+
+ test_idx = 0;
for (M=GEMM_MATRIX_SIZE_START,m=0; m<NUM_MATRIX_SIZE_TO_BENCHMARK; m++,M*=2)
{
for (N=GEMM_MATRIX_SIZE_START,n=0; n<NUM_MATRIX_SIZE_TO_BENCHMARK; n++,N*=2)
for (K=GEMM_MATRIX_SIZE_START,k=0; k<NUM_MATRIX_SIZE_TO_BENCHMARK; k++,K*=2)
{
printf("Running SGEMM for (M,N,K) = (%d,%d,%d). ", M,N,K);
-
+
gemm_err = run_sgemm(M, N, K, &time_secs, &gflops);
-
- gflops_ref = sgemm_gflops_ref[test_idx++]; /* read reference GFLOPS */
- gflops_ref = gflops_ref * cpu_freq_GHz; /* scale ref GFLOPS by CPU freq */
+
+ if(gemm_err == -1) { /* out of memory for DSP offloading */
+ printf("SGEMM out of memory for (M,N,K) = (%d,%d,%d).\n", M,N,K);
+ exit(0);
+ }
+ else {
+ fprintf(fp_time, "%6d\t%6d\t%6d\t%10.8e\n", M, N, K, time_secs);
+ fprintf(fp_gflops, "%6d\t%6d\t%6d\t%10.8e\n", M, N, K, gflops);
+ }
+
+ gflops_ref = sgemm_gflops_ref[test_idx++]; /* read reference GFLOPS */
+ gflops_ref = gflops_ref * cpu_freq_GHz; /* scale ref GFLOPS by CPU freq */
printf("Measured %f GFLOPS, reference %f GFLOPS.\n", gflops, gflops_ref);
- if((gflops > gflops_ref*1.1) || (gflops < gflops_ref/1.1)) {
- printf("SGEMM test FAILED! GFLOPS deviates from reference unacceptably.");
- exit(0);
- }
+ if((gflops > gflops_ref*GFLOPS_MARGIN) || (gflops < gflops_ref/GFLOPS_MARGIN)) {
+ printf("SGEMM test FAILED! GFLOPS deviates from reference unacceptably.\n");
+ exit(0);
+ }
if(gemm_err == -1) { /* out of memory for DSP offloading */
printf("Out of memory for (M,N,K) = (%d,%d,%d).\n", M,N,K);
- exit(0);
+ exit(0);
}
else {
fprintf(fp_time, "%6d\t%6d\t%6d\t%10.8e\n", M, N, K, time_secs);
/*------- benchmarking CGEMM -------*/
fp_time = fopen("cgemm_time.dat","w");
fp_gflops = fopen("cgemm_gflops.dat","w");
-
- test_idx = 0;
+
+ test_idx = 0;
for (M=GEMM_MATRIX_SIZE_START,m=0; m<NUM_MATRIX_SIZE_TO_BENCHMARK; m++,M*=2)
{
for (N=GEMM_MATRIX_SIZE_START,n=0; n<NUM_MATRIX_SIZE_TO_BENCHMARK; n++,N*=2)
for (K=GEMM_MATRIX_SIZE_START,k=0; k<NUM_MATRIX_SIZE_TO_BENCHMARK; k++,K*=2)
{
printf("Running CGEMM for (M,N,K) = (%d,%d,%d). ", M,N,K);
-
+
gemm_err = run_cgemm(M, N, K, &time_secs, &gflops);
-
- gflops_ref = cgemm_gflops_ref[test_idx++]; /* read reference GFLOPS */
- gflops_ref = gflops_ref * cpu_freq_GHz; /* scale ref GFLOPS by CPU freq */
- printf("Measured %f GFLOPS, reference %f GFLOPS.\n", gflops, gflops_ref);
- if((gflops > gflops_ref*1.1) || (gflops < gflops_ref/1.1)) {
- printf("CGEMM test FAILED! GFLOPS deviates from reference unacceptably.");
- exit(0);
- }
if(gemm_err == -1) { /* out of memory for DSP offloading */
- printf("Out of memory for (M,N,K) = (%d,%d,%d).\n", M,N,K);
- exit(0);
+ printf("CGEMM out of memory for (M,N,K) = (%d,%d,%d).\n", M,N,K);
+ exit(0);
}
else {
fprintf(fp_time, "%6d\t%6d\t%6d\t%10.8e\n", M, N, K, time_secs);
fprintf(fp_gflops, "%6d\t%6d\t%6d\t%10.8e\n", M, N, K, gflops);
}
+
+ gflops_ref = cgemm_gflops_ref[test_idx++]; /* read reference GFLOPS */
+ gflops_ref = gflops_ref * cpu_freq_GHz; /* scale ref GFLOPS by CPU freq */
+ printf("Measured %f GFLOPS, reference %f GFLOPS.\n", gflops, gflops_ref);
+ if((gflops > gflops_ref*GFLOPS_MARGIN) || (gflops < gflops_ref/GFLOPS_MARGIN)) {
+ printf("CGEMM test FAILED! GFLOPS deviates from reference unacceptably.\n");
+ exit(0);
+ }
}
}
}
/*------- benchmarking ZGEMM -------*/
fp_time = fopen("zgemm_time.dat","w");
fp_gflops = fopen("zgemm_gflops.dat","w");
-
- test_idx = 0;
+
+ test_idx = 0;
for (M=GEMM_MATRIX_SIZE_START,m=0; m<NUM_MATRIX_SIZE_TO_BENCHMARK; m++,M*=2)
{
for (N=GEMM_MATRIX_SIZE_START,n=0; n<NUM_MATRIX_SIZE_TO_BENCHMARK; n++,N*=2)
for (K=GEMM_MATRIX_SIZE_START,k=0; k<NUM_MATRIX_SIZE_TO_BENCHMARK; k++,K*=2)
{
printf("Running ZGEMM for (M,N,K) = (%d,%d,%d). ", M,N,K);
-
+
gemm_err = run_zgemm(M, N, K, &time_secs, &gflops);
-
- gflops_ref = zgemm_gflops_ref[test_idx++]; /* read reference GFLOPS */
- gflops_ref = gflops_ref * cpu_freq_GHz; /* scale ref GFLOPS by CPU freq */
- printf("Measured %f GFLOPS, reference %f GFLOPS.\n", gflops, gflops_ref);
- if((gflops > gflops_ref*1.1) || (gflops < gflops_ref/1.1)) {
- printf("ZGEMM test FAILED! GFLOPS deviates from reference unacceptably.");
- exit(0);
- }
-
if(gemm_err == -1) { /* out of memory for DSP offloading */
- printf("Out of memory for (M,N,K) = (%d,%d,%d).\n", M,N,K);
- exit(0);
+ printf("ZGEMM out of memory for (M,N,K) = (%d,%d,%d).\n", M,N,K);
+ exit(0);
}
else {
fprintf(fp_time, "%6d\t%6d\t%6d\t%10.8e\n", M, N, K, time_secs);
fprintf(fp_gflops, "%6d\t%6d\t%6d\t%10.8e\n", M, N, K, gflops);
}
+
+ gflops_ref = zgemm_gflops_ref[test_idx++]; /* read reference GFLOPS */
+ gflops_ref = gflops_ref * cpu_freq_GHz; /* scale ref GFLOPS by CPU freq */
+ printf("Measured %f GFLOPS, reference %f GFLOPS.\n", gflops, gflops_ref);
+ if((gflops > gflops_ref*GFLOPS_MARGIN) || (gflops < gflops_ref/GFLOPS_MARGIN)) {
+ printf("ZGEMM test FAILED! GFLOPS deviates from reference unacceptably.\n");
+ exit(0);
+ }
}
}
}
fclose(fp_time);
fclose(fp_gflops);
- printf("Passed.\n");
+ printf("Passed.\n");
return 0;
}
return (-1);
}
-
+
total_time = 0.0;
for (iter = 0; iter < NUM_TEST_RUN; iter++)
{
return (-1);
}
-
+
total_time = 0.0;
for (iter = 0; iter < NUM_TEST_RUN; iter++)
{
return (-1);
}
- total_time = 0.0;
+ total_time = 0.0;
for (iter = 0; iter < NUM_TEST_RUN; iter++)
{
/*----------------------------------------------------------------------
total_time += time_secs;
total_GFLOPS += operation_count/time_secs*1e-9;
}
-
+
__free_ddr(A);
__free_ddr(B);
__free_ddr(C);
return (-1);
}
- total_time = 0.0;
+ total_time = 0.0;
for (iter = 0; iter < NUM_TEST_RUN; iter++)
{
/*----------------------------------------------------------------------
total_time += time_secs;
total_GFLOPS += operation_count/time_secs*1e-9;
}
-
+
__free_ddr(A);
__free_ddr(B);
__free_ddr(C);
diff --git a/ticblas/src/Makefile b/ticblas/src/Makefile
index 11770beb015760da186c64b9dd8f7533d0a32aaa..7383951720f4cd1fdb7fd90aedb0521a39e804c9 100644 (file)
--- a/ticblas/src/Makefile
+++ b/ticblas/src/Makefile
$(AR) -cr $(DSP_LIB) $(OBJS)
clean::
- rm $(DSP_LIB)
\ No newline at end of file
+ rm -r $(DSP_LIB_DIR)
\ No newline at end of file