/****************************************************************************** * FILE: dgemm_test.c ******************************************************************************/ #include #include #include #include #include /* use small memory model of BLAS */ #define BLAS_L2_BUF_SIZE (256*1024UL) /* 256KB */ #define BLAS_MSMC_BUF_SIZE (2*1024*1024UL) /* 2MB */ #define BLAS_L3_DDR_SIZE_ZERO (0) size_t l1D_SRAM_size_orig, l2_SRAM_size_orig; #pragma DATA_SECTION(blas_msmc_buf, ".blas_msmc") #pragma DATA_ALIGN(blas_msmc_buf,32) char blas_msmc_buf[BLAS_MSMC_BUF_SIZE]; #pragma DATA_SECTION(blas_l2_buf, ".blas_l2") #pragma DATA_ALIGN(blas_l2_buf,32) char blas_l2_buf[BLAS_L2_BUF_SIZE]; int config_mem_for_ticblas(double *msmc_buf, double *l2_buf, size_t msmc_buf_size, size_t l2_buf_size); int reconfig_mem_after_ticblas(); void matrix_gen(double *A, double *B, double *C, int m, int k, int n); void test_edma(); int main (int argc, char *argv[]) { double *A, *B, *C; int m, n, k, err, l2_cache_size; double alpha, beta; int nthreads, tid; #if 1 /* Fork a team of threads giving them their own copies of variables */ #pragma omp parallel private(nthreads, tid) { /* Obtain thread number */ tid = omp_get_thread_num(); printf("Hello World from thread = %d\n", tid); /* Only master thread does this */ if (tid == 0) { nthreads = omp_get_num_threads(); printf("Number of threads = %d\n", nthreads); } } /* All threads join master thread and disband */ #endif //printf("Testing EDMA manager.\n"); //test_edma(); m = k = n = 1000; alpha = 0.7; beta = 1.3; /* Allocate memory for matrices */ A = (double *)malloc( m*k*sizeof( double ) ); B = (double *)malloc( k*n*sizeof( double ) ); C = (double *)malloc( m*n*sizeof( double ) ); if (A == NULL || B == NULL || C == NULL) { printf( "\nERROR: Can't allocate memory for matrices. Aborting... \n\n"); free(A); free(B); free(C); return 1; } else { printf("Matrix A address: 0x%x, Matrix B address: 0x%x, Matrix C address: 0x%x.\n", (unsigned int)A, (unsigned int)B, (unsigned int)C); } /* Initialize random number generator */ srand(123456789); matrix_gen(A, B, C, m, k, n); switch (CACHE_getL2Size()) { case CACHE_0KCACHE: l2_cache_size = 0; break; case CACHE_32KCACHE: l2_cache_size = (32 << 10); break; case CACHE_64KCACHE: l2_cache_size = (64 << 10); break; case CACHE_128KCACHE: l2_cache_size = (128 << 10); break; case CACHE_256KCACHE: l2_cache_size = (256 << 10); break; case CACHE_512KCACHE: l2_cache_size = (512 << 10); break; case CACHE_1024KCACHE: l2_cache_size = (1024 << 10); break; default: l2_cache_size = (1024 << 10); break; } printf("CACHE_getL2Size() returns %d, L2 Cache size is %d.\n", CACHE_getL2Size(), l2_cache_size); printf("lib_get_L2_SRAM_size() returns %d, lib_get_L2_total_size() returns %d.\n", lib_get_L2_SRAM_size(), lib_get_L2_total_size()); //err = config_mem_for_ticblas((double *)blas_msmc_buf, (size_t)BLAS_MSMC_BUF_SIZE); err = config_mem_for_ticblas((double *)blas_msmc_buf, (double *)blas_l2_buf, (size_t)BLAS_MSMC_BUF_SIZE, (size_t)BLAS_L2_BUF_SIZE); if(err) { printf("Memory configuration for BLAS failed with error code %d.\n", err); } cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, alpha, A, k, B, n, beta, C, n); reconfig_mem_after_ticblas(); return 0; } void test_edma() { //lib_emt_Handle test_emt_handle; EdmaMgr_Handle test_edma_handle; int coreID, edma_err; coreID = lib_get_coreID(); printf("Core ID is %d\n", coreID); edma_err = EdmaMgr_init(coreID, NULL); printf("EdmaMgr_init finished.\n"); if(edma_err != EdmaMgr_SUCCESS) { printf("EdmaMgr_init fails. Error code is %d.\n", edma_err); } else { printf("EdmaMgr_init succeeds.\n"); } /* Use external memory transfer API */ //lib_emt_init(); //if((test_emt_handle=lib_emt_alloc(1))==NULL) { if((test_edma_handle=EdmaMgr_alloc(1))==NULL) { printf("External memory transfer handle allocation error!\n"); return; } else { printf("External memory transfer handle allocation succeeded!\n"); } } void matrix_gen(double *A, double *B, double *C, int m, int k, int n) { int i; for (i = 0; i < (m*k); i++) { A[i] = (double)rand()/RAND_MAX; } for (i = 0; i < (k*n); i++) { B[i] = (double)rand()/RAND_MAX; } for (i = 0; i < (m*n); i++) { C[i] = (double)rand()/RAND_MAX; } } int config_mem_for_ticblas(double *msmc_buf, double *l2_buf, size_t msmc_buf_size, size_t l2_buf_size) { size_t smem_size_vfast, smem_size_fast, smem_size_med, smem_size_slow; void *l1d_SRAM_ptr, *l2_SRAM_ptr; int l1d_cfg_err, l2_cfg_err, tid; /* First, verify the provided/available memory meet requirements */ tiCblasGetSizes(&smem_size_vfast, &smem_size_fast, &smem_size_med, &smem_size_slow); printf("BLAS memory requirements - vfast size: %d, fast size: %d, medium size: %d, slow size: %d.\n", smem_size_vfast, smem_size_fast, smem_size_med, smem_size_slow); if( (smem_size_vfast> lib_get_L1D_total_size()) /* total available L1D */ //||(smem_size_fast > lib_get_L2_total_size()) /* total available L2 */ ||(smem_size_fast > l2_buf_size) /* total available L2 */ ||(smem_size_med > msmc_buf_size) /* provided MSMC memory */ ||(smem_size_slow > BLAS_L3_DDR_SIZE_ZERO) /* DDR not used */ ) { return(-2); } /* Configure L1D if necessary */ l1D_SRAM_size_orig = lib_get_L1D_SRAM_size(); /* get current L1D SRAM size */ l1d_cfg_err = LIB_CACHE_SUCCESS; printf("Original L1D SRAM size is: %d\n", l1D_SRAM_size_orig); printf("Required L1D SRAM size is: %d\n", smem_size_vfast); if(l1D_SRAM_size_orig < smem_size_vfast) { /* configure L1D if needs more SRAM */ #pragma omp parallel { l1d_cfg_err = lib_L1D_config_SRAM(smem_size_vfast); } } #pragma omp parallel { tid = omp_get_thread_num(); printf("New L1D SRAM size from thread %d is: %d\n", tid, lib_get_L1D_SRAM_size()); } /* Configure L2 if necessary */ l2_SRAM_size_orig = lib_get_L2_SRAM_size(); /* get current L2 SRAM size */ l2_cfg_err = LIB_CACHE_SUCCESS; printf("Original L2 SRAM size is: %d\n", l2_SRAM_size_orig); printf("Required L2 SRAM size is: %d\n", smem_size_fast); if(l2_SRAM_size_orig < smem_size_fast) { /* configure L2 if needs more SRAM */ printf("Configuring L2 for each core:\n"); #pragma omp parallel { l2_cfg_err = lib_L2_config_SRAM(smem_size_fast); } } if(l1d_cfg_err || l2_cfg_err) { return(-3); } #pragma omp parallel { tid = omp_get_thread_num(); printf("New L2 SRAM size from thread %d is: %d\n", tid, lib_get_L2_SRAM_size()); } /* get L1D and L2 SRAM base address */ l1d_SRAM_ptr = lib_get_L1D_SRAM_base(); //l2_SRAM_ptr = lib_get_L2_SRAM_base(); /* pass allocated memories for heap initialization */ return(tiCblasInit(l1d_SRAM_ptr, smem_size_vfast, //l2_SRAM_ptr, smem_size_fast, l2_buf, smem_size_fast, msmc_buf, smem_size_med, NULL, BLAS_L3_DDR_SIZE_ZERO)); } /* config_mem_for_ticblas */ /*============================================================================== * This function reconfigures L1D and L2 after processing is finished *============================================================================*/ int reconfig_mem_after_ticblas() { int l1d_cfg_err, l2_cfg_err; /* configure L1D back */ l1d_cfg_err = LIB_CACHE_SUCCESS; if(l1D_SRAM_size_orig!=lib_get_L1D_SRAM_size()) { #pragma omp parallel { l1d_cfg_err = lib_L1D_config_SRAM(l1D_SRAM_size_orig); } } l2_cfg_err = LIB_CACHE_SUCCESS; if(l2_SRAM_size_orig <= lib_get_L2_SRAM_size()) { #pragma omp parallel { l2_cfg_err = lib_L2_config_SRAM(l2_SRAM_size_orig); } } /* configure L1D and L2 back */ if(l1d_cfg_err || l2_cfg_err) { return(-4); } printf("L1D SRAM size set to: %d\n", lib_get_L1D_SRAM_size()); printf("L2 SRAM size set to: %d\n", lib_get_L2_SRAM_size()); return(TICBLAS_SUCCESS); } /* reconfig_mem_after_ticblas */