summary | shortlog | log | commit | commitdiff | tree
raw | patch | inline | side by side (parent: 7ac1b6e)
raw | patch | inline | side by side (parent: 7ac1b6e)
author | Jianzhong Xu <xuj@ti.com> | |
Tue, 16 Feb 2016 22:11:08 +0000 (17:11 -0500) | ||
committer | Jianzhong Xu <xuj@ti.com> | |
Tue, 16 Feb 2016 22:11:08 +0000 (17:11 -0500) |
Signed-off-by: Jianzhong Xu <xuj@ti.com>
Makefile | patch | blob | history | |
blis/config/c66x/bli_config.h | patch | blob | history | |
blis/config/c66x/bli_kernel.h | patch | blob | history | |
blis/config/c66x/make_defs.mk | patch | blob | history | |
blis/frame/base/bli_dma.c | patch | blob | history | |
blis/frame/base/bli_mem.c | patch | blob | history | |
examples/dsponly/dgemm_test/Makefile.common | patch | blob | history | |
examples/dsponly/dgemm_test/dgemm_test.c | patch | blob | history | |
examples/dsponly/dgemm_test/linker_fc.cmd | [new file with mode: 0644] | patch | blob |
examples/run_tests_evm.sh | [new file with mode: 0755] | patch | blob |
ticblas/src/ticblas.c | patch | blob | history |
diff --git a/Makefile b/Makefile
index c8d56f41cb84a9757c6d2319d1b1c31380338e99..0dd21855ba2068b305c737d2547a6d04310b167b 100644 (file)
--- a/Makefile
+++ b/Makefile
cleanDSPlibs:
cd $(LINALG_CBLAS_DIR); make arch=C66 clean; \
- cd ../$(LINALG_BLIS_DIR); ./configure -p install/$(BLIS_CFG) c66x; make -j8 clean; \
- cd ../$(LINALG_TICBLAS_DIR)/src; make clean;
+ cd ../$(LINALG_BLIS_DIR); ./configure -p install/$(BLIS_CFG) c66x; make -j8 clean; rm install/$(BLIS_CFG)/lib/*; \
+ cd ../$(LINALG_TICBLAS_DIR)/src; make clean; \
+ cd ../..; rm lib/*
cleanARMlibs:
cd $(LINALG_CBLAS_DIR); make arch=ARM clean; \
cleanARMplusDSP: cleanDSPlibs cleanARMlibs
+docs: ./docs/doxygen/doxycfg.txt ./docs/doxygen/mainpage.dox
+ doxygen ./docs/doxygen/doxycfg.txt
installDSPlib:
install -m 755 -d ${DESTDIR}/include
index c5a6182d580c8e14ff383ca6f21d9b6ac0231624..33de0dfea1ce13b46692277f9b7f95189ad05c64 100755 (executable)
#define BLIS_ENABLE_C66X_MEM_POOLS
-#define BLIS_ENABLE_C66X_OPENCL
#ifdef BLIS_ENABLE_C66X_OPENCL
// clocl creates a cio section in L2 when fprintf is used. Redefining fprintf to map to printf.
index 3c18494e6e3d51824fb93e24586ece67c26932bf..65dcd61634258fdfbf4aec094877152737629ebe 100755 (executable)
#elif defined(MEM_MODEL_SMALL)
-#define BLIS_DEFAULT_MC_S 144
-#define BLIS_DEFAULT_KC_S 428
-#define BLIS_DEFAULT_NC_S 1224
+#define BLIS_DEFAULT_MC_S 104
+#define BLIS_DEFAULT_KC_S 196
+#define BLIS_DEFAULT_NC_S 824
-#define BLIS_DEFAULT_MC_D 140
-#define BLIS_DEFAULT_KC_D 220
-#define BLIS_DEFAULT_NC_D 1184
+#define BLIS_DEFAULT_MC_D 64
+#define BLIS_DEFAULT_KC_D 180
+#define BLIS_DEFAULT_NC_D 540
-#define BLIS_DEFAULT_MC_C 116
-#define BLIS_DEFAULT_KC_C 260
-#define BLIS_DEFAULT_NC_C 1008
+#define BLIS_DEFAULT_MC_C 64
+#define BLIS_DEFAULT_KC_C 180
+#define BLIS_DEFAULT_NC_C 540
-#define BLIS_DEFAULT_MC_Z 86
-#define BLIS_DEFAULT_KC_Z 178
-#define BLIS_DEFAULT_NC_Z 736
+#define BLIS_DEFAULT_MC_Z 32
+#define BLIS_DEFAULT_KC_Z 145
+#define BLIS_DEFAULT_NC_Z 306
-#define BLIS_DEFAULT_4M_MC_C 140
-#define BLIS_DEFAULT_4M_KC_C 220
-#define BLIS_DEFAULT_4M_NC_C 1184
+#define BLIS_DEFAULT_4M_MC_C 64
+#define BLIS_DEFAULT_4M_KC_C 180
+#define BLIS_DEFAULT_4M_NC_C 540
-#define BLIS_DEFAULT_4M_MC_Z 86
-#define BLIS_DEFAULT_4M_KC_Z 178
-#define BLIS_DEFAULT_4M_NC_Z 736
+#define BLIS_DEFAULT_4M_MC_Z 32
+#define BLIS_DEFAULT_4M_KC_Z 145
+#define BLIS_DEFAULT_4M_NC_Z 306
-#define BLIS_DEFAULT_3M_MC_C 88
-#define BLIS_DEFAULT_3M_KC_C 220
-#define BLIS_DEFAULT_3M_NC_C 792
+#define BLIS_DEFAULT_3M_MC_C 64
+#define BLIS_DEFAULT_3M_KC_C 96
+#define BLIS_DEFAULT_3M_NC_C 488
-#define BLIS_DEFAULT_3M_MC_Z 56
-#define BLIS_DEFAULT_3M_KC_Z 178
-#define BLIS_DEFAULT_3M_NC_Z 488
+#define BLIS_DEFAULT_3M_MC_Z 36
+#define BLIS_DEFAULT_3M_KC_Z 108
+#define BLIS_DEFAULT_3M_NC_Z 196
#endif
index d012a5eaa0d5fe0163546aadc645344f25984bee..f1cd21f864bdc024f7684fce7d1a98945ca3ee7a 100755 (executable)
ifeq ($(LIBOS),LIB_OPENCL)
CMISCFLAGS += -I$(TI_OCL_INSTALL_DIR)
+CMISCFLAGS += -DBLIS_ENABLE_C66X_OPENCL
endif
ifeq ($(MEM_MODEL),Large)
index c83b6d063659eb0c28dd1fd5b77a918d43c35281..bf521d0267571b90bc4aef50c00e1023bb30ca27 100755 (executable)
_Pragma( "omp parallel num_threads(BLIS_MAX_NUM_THREADS)" )
{
gint_t status; //int32_t
-/*
-#ifdef BLIS_ENABLE_C66X_OPENCL
- status = LIB_EMT_SUCCESS;
-#else
- status = EdmaMgr_init(lib_get_coreID(), NULL);
-#endif
-*/
status = lib_emt_init();
if(status != LIB_EMT_SUCCESS)
index 40e50571ce39143cb1c8aeedb3c997a7feccad5a..430321744d4b6f45fec03dd0f15654c10440298e 100644 (file)
// the memory.
#ifdef BLIS_ENABLE_C66X_MEM_POOLS
-#ifdef BLIS_ENABLE_C66X_OPENCL
static pool_t pools[12];
//Main Memory Pools
static void* pool_mn_blk_ptrs_L3[ BLIS_NUM_MC_X_NC_BLOCKS_L3 ];
extern char *pool_mn_mem_L3;
-
-#else //CCS
-static pool_t pools[12];
-
-//Main Memory Pools
-static void* pool_mk_blk_ptrs[ BLIS_NUM_MC_X_KC_BLOCKS ];
-#pragma DATA_SECTION( pool_mk_mem, ".myDDR3");
-static char pool_mk_mem[ BLIS_MK_POOL_SIZE ];
-
-static void* pool_kn_blk_ptrs[ BLIS_NUM_KC_X_NC_BLOCKS ];
-#pragma DATA_SECTION( pool_mk_mem, ".myDDR3");
-static char pool_kn_mem[ BLIS_KN_POOL_SIZE ];
-
-static void* pool_mn_blk_ptrs[ BLIS_NUM_MC_X_NC_BLOCKS ];
-#pragma DATA_SECTION( pool_mk_mem, ".myDDR3");
-static char pool_mn_mem[ BLIS_MN_POOL_SIZE ];
-
-//L1
-static void* pool_mk_blk_ptrs_L1[ BLIS_NUM_MR_X_KC_BLOCKS_L1 ];
-#pragma DATA_SECTION( pool_mk_mem_L1, ".myL1");
-#pragma DATA_ALIGN(pool_mk_mem_L1, BLIS_CACHE_LINE_SIZE);
-static char pool_mk_mem_L1[ BLIS_MK_POOL_SIZE_L1 ];
-
-static void* pool_kn_blk_ptrs_L1[ BLIS_NUM_KC_X_NR_BLOCKS_L1 ];
-#pragma DATA_SECTION( pool_kn_mem_L1, ".myL1");
-#pragma DATA_ALIGN(pool_kn_mem_L1, BLIS_CACHE_LINE_SIZE);
-static char pool_kn_mem_L1[ BLIS_KN_POOL_SIZE_L1 ];
-
-static void* pool_mn_blk_ptrs_L1[ BLIS_NUM_MR_X_NR_BLOCKS_L1 ];
-#pragma DATA_SECTION( pool_mn_mem_L1, ".myL1");
-static char pool_mn_mem_L1[ BLIS_MN_POOL_SIZE_L1 ];
-
-//
-//L2 Pools
-//
-static void* pool_mk_blk_ptrs_L2[ BLIS_NUM_MC_X_KC_BLOCKS_L2 ];
-#pragma DATA_SECTION( pool_mk_mem_L2, ".myL2");
-#pragma DATA_ALIGN(pool_mk_mem_L2, BLIS_CACHE_LINE_SIZE);
-static char pool_mk_mem_L2[ BLIS_MK_POOL_SIZE_L2 ];
-
-static void* pool_kn_blk_ptrs_L2[ BLIS_NUM_KC_X_NC_BLOCKS_L2 ];
-#pragma DATA_SECTION( pool_kn_mem_L2, ".myL2");
-#pragma DATA_ALIGN(pool_kn_mem_L2, BLIS_CACHE_LINE_SIZE);
-static char pool_kn_mem_L2[ BLIS_KN_POOL_SIZE_L2 ];
-
-static void* pool_mn_blk_ptrs_L2[ BLIS_NUM_MC_X_NR_BLOCKS_L2 ];
-#pragma DATA_SECTION( pool_mn_mem_L2, ".myL2");
-#pragma DATA_ALIGN(pool_mn_mem_L2, BLIS_CACHE_LINE_SIZE);
-static char pool_mn_mem_L2[ BLIS_MNR_POOL_SIZE_L2 ];
-
-//
-//L3 Pools
-//
-static void* pool_mk_blk_ptrs_L3[ BLIS_NUM_MC_X_KC_BLOCKS_L3 ];
-#pragma DATA_SECTION( pool_mk_mem_L3, ".myL3");
-#pragma DATA_ALIGN(pool_mk_mem_L3, BLIS_CACHE_LINE_SIZE);
-static char pool_mk_mem_L3[ BLIS_MK_POOL_SIZE_L3 ];
-
-static void* pool_kn_blk_ptrs_L3[ BLIS_NUM_KC_X_NC_BLOCKS_L3 ];
-#pragma DATA_SECTION( pool_kn_mem_L3, ".myL3");
-#pragma DATA_ALIGN(pool_kn_mem_L3, BLIS_CACHE_LINE_SIZE);
-static char pool_kn_mem_L3[ BLIS_KN_POOL_SIZE_L3 ];
-
-static void* pool_mn_blk_ptrs_L3[ BLIS_NUM_MC_X_NC_BLOCKS_L3 ];
-#pragma DATA_SECTION( pool_mn_mem_L3, ".myL3");
-#pragma DATA_ALIGN(pool_mn_mem_L3, BLIS_CACHE_LINE_SIZE);
-static char pool_mn_mem_L3[ BLIS_MN_POOL_SIZE_L3 ];
-#endif
#else
static pool_t pools[3];
diff --git a/examples/dsponly/dgemm_test/Makefile.common b/examples/dsponly/dgemm_test/Makefile.common
index 08deee81cecf19a68aac54e3274dbf623cedc573..d11a8774f13e845c22b68f4d257370989f479575 100644 (file)
%.out: $(testfiles) libomp_config
echo compiling $<
- $(CL) $(CL_OPTS) $< $(testfiles) -z $(LNK_OPTS) -o $@ -m $*.map $(LNK_CMD) $(RTS_LIB) $(LIBARCH_LIB) $(LINALG_LIB)
+ $(CL) $(CL_OPTS) $< $(testfiles) -z $(LNK_OPTS) -o $@ -m $*.map $(LNK_CMD) ./linker_fc.cmd $(RTS_LIB) $(LIBARCH_LIB) $(LINALG_LIB)
clean: libomp_clean
@rm -rf *.map *.out *.obj *.mak
index 1ed9e4c32bd2c067f82a648be1cae00092da8125..6d7b0397883ae1bfe3530a55d633024c8dfbbbb5 100644 (file)
* FILE: dgemm_test.c
******************************************************************************/
#include <omp.h>
+#include <string.h>
#include <stdio.h>
#include <libarch.h>
#include <ticblas.h>
#include <cblas.h>
/* use small memory model of BLAS */
-#define BLAS_L2_BUF_SIZE (256*1024UL) /* 256KB */
-#define BLAS_MSMC_BUF_SIZE (2*1024*1024UL) /* 2MB */
-#define BLAS_L3_DDR_SIZE_ZERO (0)
+#define BLAS_L2_BUF_SIZE (183*1024UL) /* 183KB */
+#define BLAS_MSMC_BUF_SIZE (2*1024*1024UL) /* 2MB */
+#define BLAS_L3_DDR_SIZE (5120) /* 5KB */
+
+//#define BLAS_L2_BUF_SIZE (384*1024UL) /* 384KB */
+//#define BLAS_MSMC_BUF_SIZE (4718592UL) /* 4.5MB */
+//#define BLAS_L3_DDR_SIZE (5120) /* 5KB */
size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
#pragma DATA_ALIGN(blas_l2_buf,32)
char blas_l2_buf[BLAS_L2_BUF_SIZE];
-int config_mem_for_ticblas(double *msmc_buf, double *l2_buf, size_t msmc_buf_size, size_t l2_buf_size);
+char blas_ddr_buf[BLAS_L3_DDR_SIZE];
+
+int config_mem_for_ticblas(double *l2_buf, size_t l2_buf_size,
+ double *msmc_buf, size_t msmc_buf_size,
+ double *ddr_buf, size_t ddr_buf_size);
int reconfig_mem_after_ticblas();
void matrix_gen(double *A, double *B, double *C, int m, int k, int n);
-void test_edma();
+void mat_mpy(const double * A, const double * B, double * C, int mat_N,
+ int mat_K, int mat_M, double alpha, double beta);
+double dotprod(const double * A, const double * B, int n);
+void print_matrix(double *mat, int m, int n);
+double diff_matrix(double *mat1, double * mat2, int m, int n);
int main (int argc, char *argv[])
{
- double *A, *B, *C;
- int m, n, k, err, l2_cache_size;
- double alpha, beta;
+ double *A, *B, *C, *C_copy;
+ int m, n, k, err;
+ double alpha, beta, precision_diff;
int nthreads, tid;
-#if 1
-/* Fork a team of threads giving them their own copies of variables */
-#pragma omp parallel private(nthreads, tid)
+ /* Verify OpenMP working properly */
+ #pragma omp parallel private(nthreads, tid)
{
+ tid = omp_get_thread_num(); /* Obtain thread number */
+ printf("Hello World from thread = %d\n", tid);
- /* Obtain thread number */
- tid = omp_get_thread_num();
- printf("Hello World from thread = %d\n", tid);
-
- /* Only master thread does this */
- if (tid == 0)
- {
- nthreads = omp_get_num_threads();
- printf("Number of threads = %d\n", nthreads);
+ /* Only master thread does this */
+ if (tid == 0) {
+ nthreads = omp_get_num_threads();
+ printf("Number of threads = %d\n", nthreads);
}
} /* All threads join master thread and disband */
-#endif
-
- //printf("Testing EDMA manager.\n");
-
- //test_edma();
-
+
+ /* hard code dgemm parameters */
m = k = n = 1000;
alpha = 0.7;
beta = 1.3;
-
- /* Allocate memory for matrices */
- A = (double *)malloc( m*k*sizeof( double ) );
- B = (double *)malloc( k*n*sizeof( double ) );
- C = (double *)malloc( m*n*sizeof( double ) );
- if (A == NULL || B == NULL || C == NULL) {
- printf( "\nERROR: Can't allocate memory for matrices. Aborting... \n\n");
- free(A);
- free(B);
- free(C);
- return 1;
- }
- else {
- printf("Matrix A address: 0x%x, Matrix B address: 0x%x, Matrix C address: 0x%x.\n", (unsigned int)A, (unsigned int)B, (unsigned int)C);
- }
-
- /* Initialize random number generator */
+
+ /* Allocate memory for matrices */
+ A = (double *)malloc( m*k*sizeof( double ) );
+ B = (double *)malloc( k*n*sizeof( double ) );
+ C = (double *)malloc( m*n*sizeof( double ) );
+ C_copy = (double *)malloc( m*n*sizeof( double ) );
+ if (A == NULL || B == NULL || C == NULL || C_copy == NULL) {
+ printf( "\nERROR: Can't allocate memory for matrices. Aborting... \n\n");
+ free(A);
+ free(B);
+ free(C);
+ return 1;
+ }
+
+ /* Initialize random number generator */
srand(123456789);
-
+
+ /* Generate matrices */
matrix_gen(A, B, C, m, k, n);
+ memcpy(C_copy, C, m*n*sizeof(double));
- switch (CACHE_getL2Size())
- {
- case CACHE_0KCACHE: l2_cache_size = 0; break;
- case CACHE_32KCACHE: l2_cache_size = (32 << 10); break;
- case CACHE_64KCACHE: l2_cache_size = (64 << 10); break;
- case CACHE_128KCACHE: l2_cache_size = (128 << 10); break;
- case CACHE_256KCACHE: l2_cache_size = (256 << 10); break;
- case CACHE_512KCACHE: l2_cache_size = (512 << 10); break;
- case CACHE_1024KCACHE: l2_cache_size = (1024 << 10); break;
- default: l2_cache_size = (1024 << 10); break;
- }
-
- printf("CACHE_getL2Size() returns %d, L2 Cache size is %d.\n", CACHE_getL2Size(), l2_cache_size);
- printf("lib_get_L2_SRAM_size() returns %d, lib_get_L2_total_size() returns %d.\n", lib_get_L2_SRAM_size(), lib_get_L2_total_size());
-
- //err = config_mem_for_ticblas((double *)blas_msmc_buf, (size_t)BLAS_MSMC_BUF_SIZE);
- err = config_mem_for_ticblas((double *)blas_msmc_buf, (double *)blas_l2_buf, (size_t)BLAS_MSMC_BUF_SIZE, (size_t)BLAS_L2_BUF_SIZE);
+ printf("L2 SRAM size is %d, total L2 size is %d.\n", lib_get_L2_SRAM_size(), lib_get_L2_total_size());
+
+ /* Call TI CBLAS API to creat new CBLAS instance */
+ tiCblasNew();
+
+ /* Configure memory for TI CBLAS if necessary */
+ err = config_mem_for_ticblas((double *)blas_l2_buf, (size_t)BLAS_L2_BUF_SIZE,
+ (double *)blas_msmc_buf, (size_t)BLAS_MSMC_BUF_SIZE,
+ (double *)blas_ddr_buf, (size_t)BLAS_L3_DDR_SIZE);
+
if(err) {
- printf("Memory configuration for BLAS failed with error code %d.\n", err);
+ printf("Memory configuration for CBLAS failed with error code %d.\n", err);
+ exit (0);
}
-
- cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, alpha, A, k, B, n, beta, C, n);
-
- reconfig_mem_after_ticblas();
-
- return 0;
-}
-void test_edma()
-{
- //lib_emt_Handle test_emt_handle;
- EdmaMgr_Handle test_edma_handle;
- int coreID, edma_err;
-
- coreID = lib_get_coreID();
-
- printf("Core ID is %d\n", coreID);
-
- edma_err = EdmaMgr_init(coreID, NULL);
-
- printf("EdmaMgr_init finished.\n");
-
- if(edma_err != EdmaMgr_SUCCESS) {
- printf("EdmaMgr_init fails. Error code is %d.\n", edma_err);
- }
- else {
- printf("EdmaMgr_init succeeds.\n");
- }
-
- /* Use external memory transfer API */
- //lib_emt_init();
- //if((test_emt_handle=lib_emt_alloc(1))==NULL) {
- if((test_edma_handle=EdmaMgr_alloc(1))==NULL) {
- printf("External memory transfer handle allocation error!\n");
- return;
- }
- else {
- printf("External memory transfer handle allocation succeeded!\n");
- }
-}
+ /* Call standard CBLAS API for dgemm */
+ cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, alpha, A, k, B, n, beta, C, n);
+ /* Reconfigure memory if necessary */
+ if(reconfig_mem_after_ticblas() == TICBLAS_SUCCESS) {
+ printf("Memory reconfiguration after BLAS call finished.\n");
+ }
+
+ /* Straightforward matrix multiplication as reference */
+ mat_mpy(A, B, C_copy, m, n, k, alpha, beta);
+
+ /* Find the difference between dgemm and reference */
+ precision_diff = diff_matrix(C, C_copy, m, k);
+ printf("Precision error is %e.\n", precision_diff);
+
+ return 0;
+}
+/*==============================================================================
+ * This function generates matrices of random data
+ *============================================================================*/
void matrix_gen(double *A, double *B, double *C, int m, int k, int n)
{
int i;
for (i = 0; i < (m*k); i++) {
- A[i] = (double)rand()/RAND_MAX;
+ A[i] = (double)rand()/RAND_MAX - 0.5;
}
for (i = 0; i < (k*n); i++) {
- B[i] = (double)rand()/RAND_MAX;
+ B[i] = (double)rand()/RAND_MAX - 0.5;
}
for (i = 0; i < (m*n); i++) {
- C[i] = (double)rand()/RAND_MAX;
+ C[i] = (double)rand()/RAND_MAX - 0.5;
}
-
+
}
-int config_mem_for_ticblas(double *msmc_buf, double *l2_buf, size_t msmc_buf_size, size_t l2_buf_size)
+/*==============================================================================
+ * This function configures and initializes memory for BLAS calls
+ *============================================================================*/
+int config_mem_for_ticblas(double *l2_buf, size_t l2_buf_size,
+ double *msmc_buf, size_t msmc_buf_size,
+ double *ddr_buf, size_t ddr_buf_size)
{
size_t smem_size_vfast, smem_size_fast, smem_size_med, smem_size_slow;
- void *l1d_SRAM_ptr, *l2_SRAM_ptr;
- int l1d_cfg_err, l2_cfg_err, tid;
-
+ void *l1d_SRAM_ptr;
+ int l1d_cfg_err;
+
/* First, verify the provided/available memory meet requirements */
tiCblasGetSizes(&smem_size_vfast, &smem_size_fast, &smem_size_med, &smem_size_slow);
- printf("BLAS memory requirements - vfast size: %d, fast size: %d, medium size: %d, slow size: %d.\n", smem_size_vfast, smem_size_fast, smem_size_med, smem_size_slow);
-
+ printf("BLAS memory requirements - vfast size: %d, fast size: %d, medium size: %d, slow size: %d.\n", smem_size_vfast, smem_size_fast, smem_size_med, smem_size_slow);
+
if( (smem_size_vfast> lib_get_L1D_total_size()) /* total available L1D */
- //||(smem_size_fast > lib_get_L2_total_size()) /* total available L2 */
- ||(smem_size_fast > l2_buf_size) /* total available L2 */
- ||(smem_size_med > msmc_buf_size) /* provided MSMC memory */
- ||(smem_size_slow > BLAS_L3_DDR_SIZE_ZERO) /* DDR not used */
+ ||(smem_size_fast > l2_buf_size) /* provided L2 size */
+ ||(smem_size_med > msmc_buf_size) /* provided MSMC memory */
+ ||(smem_size_slow > ddr_buf_size)
) {
- return(-2);
+ printf("Provided memory is not enough for BLAS!\n");
+ exit(0);
}
-
+
/* Configure L1D if necessary */
l1D_SRAM_size_orig = lib_get_L1D_SRAM_size(); /* get current L1D SRAM size */
l1d_cfg_err = LIB_CACHE_SUCCESS;
- printf("Original L1D SRAM size is: %d\n", l1D_SRAM_size_orig);
- printf("Required L1D SRAM size is: %d\n", smem_size_vfast);
+ printf("Original L1D SRAM size is: %d\n", l1D_SRAM_size_orig);
+ printf("Required L1D SRAM size is: %d\n", smem_size_vfast);
if(l1D_SRAM_size_orig < smem_size_vfast) { /* configure L1D if needs more SRAM */
- #pragma omp parallel
+ #pragma omp parallel
{
l1d_cfg_err = lib_L1D_config_SRAM(smem_size_vfast);
+ if(l1d_cfg_err) {
+ printf("L1D configuration fails on core %d!\n", lib_get_coreID());
+ exit(1);
+ }
}
}
- #pragma omp parallel
- {
- tid = omp_get_thread_num();
- printf("New L1D SRAM size from thread %d is: %d\n", tid, lib_get_L1D_SRAM_size());
- }
-
- /* Configure L2 if necessary */
- l2_SRAM_size_orig = lib_get_L2_SRAM_size(); /* get current L2 SRAM size */
- l2_cfg_err = LIB_CACHE_SUCCESS;
- printf("Original L2 SRAM size is: %d\n", l2_SRAM_size_orig);
- printf("Required L2 SRAM size is: %d\n", smem_size_fast);
- if(l2_SRAM_size_orig < smem_size_fast) { /* configure L2 if needs more SRAM */
- printf("Configuring L2 for each core:\n");
- #pragma omp parallel
- {
- l2_cfg_err = lib_L2_config_SRAM(smem_size_fast);
- }
- }
-
- if(l1d_cfg_err || l2_cfg_err) {
- return(-3);
- }
-
- #pragma omp parallel
+ #pragma omp parallel
{
- tid = omp_get_thread_num();
- printf("New L2 SRAM size from thread %d is: %d\n", tid, lib_get_L2_SRAM_size());
+ printf("New L1D SRAM size on core %d is: %d\n", lib_get_coreID(), lib_get_L1D_SRAM_size());
}
- /* get L1D and L2 SRAM base address */
+ /* get L1D SRAM base address */
l1d_SRAM_ptr = lib_get_L1D_SRAM_base();
- //l2_SRAM_ptr = lib_get_L2_SRAM_base();
+ printf("L1D SRAM base address is 0x%x.\n", (unsigned int)l1d_SRAM_ptr);
/* pass allocated memories for heap initialization */
- return(tiCblasInit(l1d_SRAM_ptr, smem_size_vfast,
- //l2_SRAM_ptr, smem_size_fast,
- l2_buf, smem_size_fast,
- msmc_buf, smem_size_med,
- NULL, BLAS_L3_DDR_SIZE_ZERO));
+ return(tiCblasInit(l1d_SRAM_ptr, lib_get_L1D_SRAM_size(),
+ l2_buf, l2_buf_size,
+ msmc_buf, msmc_buf_size,
+ ddr_buf, ddr_buf_size));
} /* config_mem_for_ticblas */
/*==============================================================================
- * This function reconfigures L1D and L2 after processing is finished
+ * This function reconfigures L1D after processing is finished
*============================================================================*/
int reconfig_mem_after_ticblas()
{
- int l1d_cfg_err, l2_cfg_err;
+ int l1d_cfg_err;
/* configure L1D back */
l1d_cfg_err = LIB_CACHE_SUCCESS;
#pragma omp parallel
{
l1d_cfg_err = lib_L1D_config_SRAM(l1D_SRAM_size_orig);
- }
- }
-
- l2_cfg_err = LIB_CACHE_SUCCESS;
- if(l2_SRAM_size_orig <= lib_get_L2_SRAM_size()) {
- #pragma omp parallel
- {
- l2_cfg_err = lib_L2_config_SRAM(l2_SRAM_size_orig);
+ if(l1d_cfg_err) {
+ printf("L1D reconfiguration fails on core %d!\n", lib_get_coreID());
+ exit(2);
+ }
}
}
+
+ printf("L1D SRAM size reconfigured to: %d\n", lib_get_L1D_SRAM_size());
- /* configure L1D and L2 back */
- if(l1d_cfg_err || l2_cfg_err) {
- return(-4);
- }
-
- printf("L1D SRAM size set to: %d\n", lib_get_L1D_SRAM_size());
- printf("L2 SRAM size set to: %d\n", lib_get_L2_SRAM_size());
-
return(TICBLAS_SUCCESS);
} /* reconfig_mem_after_ticblas */
+
+
+/******************************************************************************
+* Straightforward implementation of matrix multiplication with row-major
+******************************************************************************/
+void mat_mpy(const double * A, const double * B, double * C, int mat_N,
+ int mat_K, int mat_M, double alpha, double beta)
+{
+ int col, row;
+ double b_col[mat_K];
+
+ for (col = 0; col < mat_M; ++col)
+ {
+ for (row = 0; row < mat_K; ++row)
+ b_col[row] = B[row*mat_M+col];
+
+ for (row = 0; row < mat_N; ++row)
+ C[row*mat_M+col] = alpha*dotprod(A + (row * mat_K), b_col, mat_K)
+ + beta*C[row*mat_M+col];
+ }
+}
+
+/******************************************************************************
+* dot product for matrix multiplication
+******************************************************************************/
+double dotprod(const double * A, const double * B, int n)
+{
+ int i;
+
+ float result = 0;
+ for (i = 0; i < n; ++i) result += A[i] * B[i];
+
+ return result;
+}
+
+/******************************************************************************
+* Print a row-major matrix
+******************************************************************************/
+void print_matrix(double *mat, int m, int n)
+{
+ int i, j;
+
+ for(i=0; i<m; i++) {
+ for(j=0; j<n; j++) {
+ printf( " %10.5f ", mat[i*n+j]);
+ }
+ printf( "\n" );
+ }
+}
+
+/******************************************************************************
+* Find the maximum absolute difference of two matrices
+******************************************************************************/
+double diff_matrix(double *mat1, double * mat2, int m, int n)
+{
+ int i, j;
+ double abs_max_err, err;
+
+ abs_max_err = 0.0f;
+ for(i=0; i<m; i++)
+ {
+ for(j=0; j<n; j++)
+ {
+ err = fabs(mat1[i*n+j] - mat2[i*n+j]);
+ if(abs_max_err < err) {
+ abs_max_err = err;
+ }
+ }
+ }
+
+ return (abs_max_err);
+}
+
+/* Nothing past this point */
diff --git a/examples/dsponly/dgemm_test/linker_fc.cmd b/examples/dsponly/dgemm_test/linker_fc.cmd
--- /dev/null
@@ -0,0 +1,25 @@
+
+SECTIONS
+{
+ .fclocalfar :
+ {
+ "edmamgr.ae66" (.fardata)
+ "ecpy.ae66" (.fardata)
+ "edma3Chan.ae66" (.fardata)
+ "edma3.ae66" (.fardata)
+ "rman.ae66" (.fardata)
+ "nullres.ae66" (.fardata)
+ "fcsettings.ae66" (.fardata)
+ "edma3_lld_rm.ae66" (.fardata)
+
+ "edmamgr.ae66" (.far)
+ "ecpy.ae66" (.far)
+ "edma3Chan.ae66" (.far)
+ "edma3.ae66" (.far)
+ "rman.ae66" (.far)
+ "nullres.ae66" (.far)
+ "fcsettings.ae66" (.far)
+ "edma3_lld_rm.ae66" (.far)
+ } > L2SRAM
+
+}
diff --git a/examples/run_tests_evm.sh b/examples/run_tests_evm.sh
--- /dev/null
@@ -0,0 +1,8 @@
+./matmpy/matmpy
+./dsyrk_test/dsyrk_test
+./ztrsm_test/ztrsm_test
+./dgemm_test/dgemm_test
+./eig/eig
+./ludinv/ludinv
+./ztrmm_test/ztrmm_test
+
diff --git a/ticblas/src/ticblas.c b/ticblas/src/ticblas.c
index 55dfe2e427fed97144a155e6d4f7c83dff6f1c7c..4f62d794ddc0254950319344fb146d0909f30cfe 100644 (file)
--- a/ticblas/src/ticblas.c
+++ b/ticblas/src/ticblas.c
#include "../ticblas.h"\r
#include "blis.h"\r
\r
-#define BLAS_LEVEL3_L1DSRAM_SIZE (28*1024UL)\r
+#define getNextMultiple(x, y) ( ( ((x)+(y)-1)/(y) )* (y) )\r
\r
+#if 0\r
#ifdef MEM_MODEL_LARGE\r
+#define BLAS_LEVEL3_L1DSRAM_SIZE (28*1024UL)\r
#define BLAS_LEVEL3_L2SRAM_SIZE (767*1024UL) /* 767KB */\r
#define BLAS_LEVEL3_MSMC_SIZE (0x47FDC0) /* 4.5MB */\r
#else\r
# ifdef MEM_MODEL_MEDIUM\r
+# define BLAS_LEVEL3_L1DSRAM_SIZE (28*1024UL)\r
# define BLAS_LEVEL3_L2SRAM_SIZE (384*1024UL) /* 384KB */\r
# define BLAS_LEVEL3_MSMC_SIZE (0x380000) /* 3.5MB */\r
# else\r
# ifdef MEM_MODEL_SMALL\r
-# define BLAS_LEVEL3_L2SRAM_SIZE (256*1024UL) /* 256KB */\r
-# define BLAS_LEVEL3_MSMC_SIZE (2048*1024UL)/* 2MB */\r
+# define BLAS_LEVEL3_L1DSRAM_SIZE (18*1024UL)\r
+# define BLAS_LEVEL3_L2SRAM_SIZE (183*1024UL) /* 187KB */\r
+# define BLAS_LEVEL3_MSMC_SIZE (1520*1024UL)/* 1.5MB */\r
# else\r
# error "Unsupported memory model"\r
# endif\r
# endif\r
#endif\r
-\r
+#endif\r
+/*\r
#define BLAS_MEM_SIZE_VFAST BLAS_LEVEL3_L1DSRAM_SIZE \r
#define BLAS_MEM_SIZE_FAST BLAS_LEVEL3_L2SRAM_SIZE\r
#define BLAS_MEM_SIZE_MEDIUM BLAS_LEVEL3_MSMC_SIZE\r
+*/\r
+#define BLAS_MEM_SIZE_VFAST ( getNextMultiple(BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE) \\r
+ + getNextMultiple(BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE) \\r
+ + getNextMultiple(BLIS_MN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE) )\r
+#define BLAS_MEM_SIZE_FAST ( getNextMultiple(BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE) \\r
+ + getNextMultiple(BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE) \\r
+ + getNextMultiple(BLIS_MN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE) )\r
+#define BLAS_MEM_SIZE_MEDIUM ( getNextMultiple(BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE) \\r
+ + getNextMultiple(BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE) \\r
+ + getNextMultiple(BLIS_MN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE) )\r
#define BLAS_MEM_SIZE_SLOW (4804)\r
\r
-\r
/* Define memory descriptors for memory management */\r
lib_memdscr_t blas_mem_vfast;\r
lib_memdscr_t blas_mem_fast;\r
*smem_size_fast = BLAS_MEM_SIZE_FAST; // fast scratch memory\r
*smem_size_medium = BLAS_MEM_SIZE_MEDIUM; // medium speed scratch memory\r
*smem_size_slow = BLAS_MEM_SIZE_SLOW; // slow scratch memory\r
+/*\r
+ printf("BLIS_MK_POOL_SIZE_L1 is %d.\n", BLIS_MK_POOL_SIZE_L1);\r
+ printf("BLIS_KN_POOL_SIZE_L1 is %d.\n", BLIS_KN_POOL_SIZE_L1);\r
+ printf("BLIS_MN_POOL_SIZE_L1 is %d.\n", BLIS_MN_POOL_SIZE_L1);\r
+ printf("BLIS_MK_POOL_SIZE_L2 is %d.\n", BLIS_MK_POOL_SIZE_L2);\r
+ printf("BLIS_KN_POOL_SIZE_L2 is %d.\n", BLIS_KN_POOL_SIZE_L2);\r
+ printf("BLIS_MN_POOL_SIZE_L2 is %d.\n", BLIS_MN_POOL_SIZE_L2);\r
+ printf("BLIS_MK_POOL_SIZE_L3 is %d.\n", BLIS_MK_POOL_SIZE_L3);\r
+ printf("BLIS_KN_POOL_SIZE_L3 is %d.\n", BLIS_KN_POOL_SIZE_L3);\r
+ printf("BLIS_MN_POOL_SIZE_L3 is %d.\n", BLIS_MN_POOL_SIZE_L3);\r
+*/\r
} /* tiCblasGetSizes */\r
\r
/*==============================================================================\r
pool_mk_mem_L3 = lib_smem_malloc(blas_mem_handle, BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);\r
pool_kn_mem_L3 = lib_smem_malloc(blas_mem_handle, BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);\r
pool_mn_mem_L3 = lib_smem_malloc(blas_mem_handle, BLIS_MN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);\r
-\r
+/*\r
+ printf("BLIS_MK_POOL_SIZE_L1 is %d, pool_mk_mem_L1 is 0x%x.\n", BLIS_MK_POOL_SIZE_L1, (unsigned int)pool_mk_mem_L1);\r
+ printf("BLIS_KN_POOL_SIZE_L1 is %d, pool_kn_mem_L1 is 0x%x.\n", BLIS_KN_POOL_SIZE_L1, (unsigned int)pool_kn_mem_L1);\r
+ printf("BLIS_MN_POOL_SIZE_L1 is %d, pool_mn_mem_L1 is 0x%x.\n", BLIS_MN_POOL_SIZE_L1, (unsigned int)pool_mn_mem_L1);\r
+ printf("BLIS_MK_POOL_SIZE_L2 is %d, pool_mk_mem_L2 is 0x%x.\n", BLIS_MK_POOL_SIZE_L2, (unsigned int)pool_mk_mem_L2);\r
+ printf("BLIS_KN_POOL_SIZE_L2 is %d, pool_kn_mem_L2 is 0x%x.\n", BLIS_KN_POOL_SIZE_L2, (unsigned int)pool_kn_mem_L2);\r
+ printf("BLIS_MN_POOL_SIZE_L2 is %d, pool_mn_mem_L2 is 0x%x.\n", BLIS_MN_POOL_SIZE_L2, (unsigned int)pool_mn_mem_L2);\r
+ printf("BLIS_MK_POOL_SIZE_L3 is %d, pool_mk_mem_L3 is 0x%x.\n", BLIS_MK_POOL_SIZE_L3, (unsigned int)pool_mk_mem_L3);\r
+ printf("BLIS_KN_POOL_SIZE_L3 is %d, pool_kn_mem_L3 is 0x%x.\n", BLIS_KN_POOL_SIZE_L3, (unsigned int)pool_kn_mem_L3);\r
+ printf("BLIS_MN_POOL_SIZE_L3 is %d, pool_mn_mem_L3 is 0x%x.\n", BLIS_MN_POOL_SIZE_L3, (unsigned int)pool_mn_mem_L3);\r
+*/ \r
if( (pool_mk_mem_L1 == NULL)\r
||(pool_kn_mem_L1 == NULL) \r
||(pool_mn_mem_L1 == NULL) \r