/******************************************************************************
* FILE: dgemm_test.c
******************************************************************************/
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <libarch.h>
#include <ticblas.h>
#include <cblas.h>

/* use small memory model of BLAS */
/* Byte sizes of the two statically allocated scratch buffers below; main()
 * passes them to config_mem_for_ticblas() as l2_buf_size / msmc_buf_size. */
#define BLAS_L2_BUF_SIZE (256*1024UL)        /* 256KB */
#define BLAS_MSMC_BUF_SIZE (2*1024*1024UL)   /* 2MB */
/* Size handed to tiCblasInit() for its last (NULL) buffer, and the upper
 * bound the "slow" memory requirement is compared against. */
#define BLAS_L3_DDR_SIZE_ZERO  (0)

/* L1D / L2 SRAM sizes written by config_mem_for_ticblas() and read back by
 * reconfig_mem_after_ticblas(). */
size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;

/* Scratch buffer placed in the ".blas_msmc" section with 32-byte alignment.
 * NOTE(review): which physical memory that section maps to is decided by the
 * linker command file, which is not visible here. */
#pragma DATA_SECTION(blas_msmc_buf, ".blas_msmc")
#pragma DATA_ALIGN(blas_msmc_buf,32)
char blas_msmc_buf[BLAS_MSMC_BUF_SIZE];

/* Scratch buffer placed in the ".blas_l2" section with 32-byte alignment.
 * NOTE(review): section-to-memory mapping comes from the linker command
 * file - confirm it really lands in L2 SRAM. */
#pragma DATA_SECTION(blas_l2_buf, ".blas_l2")
#pragma DATA_ALIGN(blas_l2_buf,32)
char blas_l2_buf[BLAS_L2_BUF_SIZE];

/* Forward declarations of the helpers defined later in this file. */
int config_mem_for_ticblas(double *msmc_buf, double *l2_buf, size_t msmc_buf_size, size_t l2_buf_size);
int reconfig_mem_after_ticblas();
void matrix_gen(double *A, double *B, double *C, int m, int k, int n);
void test_edma();

/*
 * DGEMM smoke test.
 *
 * Prints the OpenMP thread team, fills A (m x k), B (k x n) and C (m x n)
 * with random values, configures memory for ticblas, computes
 * C = alpha*A*B + beta*C with cblas_dgemm and finally restores the memory
 * configuration.
 *
 * Returns 0 on success, 1 when matrix allocation, BLAS memory configuration
 * or the final restoration fails.
 */
int main (int argc, char *argv[])
{
  double *A = NULL, *B = NULL, *C = NULL;
  int m, n, k, err, l2_cache_size, l2_size_code;
  int status = 0;
  double alpha, beta;

  (void)argc;
  (void)argv;

#if 1
  /* Fork a team of threads; every thread announces itself */
#pragma omp parallel
  {
    /* Declared inside the region so each thread owns its copy */
    int tid = omp_get_thread_num();

    printf("Hello World from thread = %d\n", tid);

    /* Only master thread does this */
    if (tid == 0) {
      printf("Number of threads = %d\n", omp_get_num_threads());
    }
  }  /* All threads join master thread and disband */
#endif

  /* test_edma() may be called here to exercise the EDMA manager. */

  m = k = n = 1000;
  alpha = 0.7;
  beta  = 1.3;

  /* Allocate memory for matrices; sizes are computed in size_t so the
   * element count cannot overflow int before the multiplication by 8. */
  A = malloc((size_t)m * (size_t)k * sizeof *A);
  B = malloc((size_t)k * (size_t)n * sizeof *B);
  C = malloc((size_t)m * (size_t)n * sizeof *C);
  if (A == NULL || B == NULL || C == NULL) {
    printf( "\nERROR: Can't allocate memory for matrices. Aborting... \n\n");
    status = 1;
    goto cleanup;
  }
  printf("Matrix A address: %p, Matrix B address: %p, Matrix C address: %p.\n",
         (void *)A, (void *)B, (void *)C);

  /* Fixed seed keeps the generated matrices reproducible between runs */
  srand(123456789);

  matrix_gen(A, B, C, m, k, n);

  /* Translate the L2 cache mode code into a size in bytes */
  l2_size_code = (int)CACHE_getL2Size();
  switch (l2_size_code)
  {
    case CACHE_0KCACHE:    l2_cache_size = 0; break;
    case CACHE_32KCACHE:   l2_cache_size = (32   << 10); break;
    case CACHE_64KCACHE:   l2_cache_size = (64   << 10); break;
    case CACHE_128KCACHE:  l2_cache_size = (128  << 10); break;
    case CACHE_256KCACHE:  l2_cache_size = (256  << 10); break;
    case CACHE_512KCACHE:  l2_cache_size = (512  << 10); break;
    case CACHE_1024KCACHE: l2_cache_size = (1024 << 10); break;
    default:               l2_cache_size = (1024 << 10); break;
  }

  printf("CACHE_getL2Size() returns %d, L2 Cache size is %d.\n", l2_size_code, l2_cache_size);
  printf("lib_get_L2_SRAM_size() returns %lu, lib_get_L2_total_size() returns %lu.\n",
         (unsigned long)lib_get_L2_SRAM_size(), (unsigned long)lib_get_L2_total_size());

  err = config_mem_for_ticblas((double *)blas_msmc_buf, (double *)blas_l2_buf,
                               (size_t)BLAS_MSMC_BUF_SIZE, (size_t)BLAS_L2_BUF_SIZE);
  if (err) {
    /* Do not run DGEMM on a library whose heaps were not set up */
    printf("Memory configuration for BLAS failed with error code %d.\n", err);
    status = 1;
  }
  else {
    /* Column-major, no transpose: the leading dimension is the row count of
     * each stored matrix, i.e. lda = m, ldb = k, ldc = m. */
    cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, alpha, A, m, B, k, beta, C, m);
  }

  /* Restoration is attempted even after a failed configuration, because the
   * cache setup may already have been changed before the failure. */
  err = reconfig_mem_after_ticblas();
  if (err) {
    printf("Restoring memory configuration failed with error code %d.\n", err);
    status = 1;
  }

cleanup:
  free(A);   /* free(NULL) is a no-op, so partial allocations are fine */
  free(B);
  free(C);
  return status;
}

/*
 * Exercises the EDMA manager: initializes it for the calling core and tries
 * to allocate one transfer handle, printing the outcome of each step.
 *
 * Returns early when initialization fails, since allocating from a manager
 * that did not initialize cannot produce a meaningful result.
 * NOTE(review): the allocated handle is not released here - confirm whether
 * the caller is expected to keep the channel.
 */
void test_edma(void)
{
    EdmaMgr_Handle test_edma_handle;
    int coreID, edma_err;

    coreID = lib_get_coreID();

    printf("Core ID is %d\n", coreID);

    edma_err = EdmaMgr_init(coreID, NULL);

    printf("EdmaMgr_init finished.\n");

    if (edma_err != EdmaMgr_SUCCESS) {
        printf("EdmaMgr_init fails. Error code is %d.\n", edma_err);
        return;
    }
    printf("EdmaMgr_init succeeds.\n");

    /* Request a single handle from the EDMA manager */
    test_edma_handle = EdmaMgr_alloc(1);
    if (test_edma_handle == NULL) {
        printf("External memory transfer handle allocation error!\n");
        return;
    }
    printf("External memory transfer handle allocation succeeded!\n");
}


/* Stores `count` values of rand()/RAND_MAX into dst[0..count-1], in order. */
static void fill_unit_random(double *dst, int count)
{
    int idx = 0;

    while (idx < count) {
        dst[idx] = (double)rand()/RAND_MAX;
        ++idx;
    }
}

/*
 * Fills A (m*k elements), then B (k*n elements), then C (m*n elements) with
 * pseudo-random values in [0, 1] taken from successive rand() calls.
 * The caller seeds the generator and owns the three buffers.
 */
void matrix_gen(double *A, double *B, double *C, int m, int k, int n)
{
    const int a_count = m*k;
    const int b_count = k*n;
    const int c_count = m*n;

    /* The order A, B, C fixes which rand() values land in which matrix */
    fill_unit_random(A, a_count);
    fill_unit_random(B, b_count);
    fill_unit_random(C, c_count);
}

/*==============================================================================
 * Prepares memory for ticblas and initializes its heaps.
 *
 * msmc_buf / msmc_buf_size : buffer (and its size in bytes) given to
 *                            tiCblasInit() as the "medium" memory
 * l2_buf   / l2_buf_size   : buffer (and its size in bytes) given to
 *                            tiCblasInit() as the "fast" memory
 *
 * The current L1D / L2 SRAM sizes are stored in l1D_SRAM_size_orig and
 * l2_SRAM_size_orig before anything else so that
 * reconfig_mem_after_ticblas() always sees real values, including after an
 * early failure of this function.
 *
 * Returns -2 when the sizes reported by tiCblasGetSizes() exceed what is
 * available, -3 when configuring L1D or L2 fails on any thread, otherwise
 * the return value of tiCblasInit().
 *============================================================================*/
int config_mem_for_ticblas(double *msmc_buf, double *l2_buf, size_t msmc_buf_size, size_t l2_buf_size)
{
    size_t smem_size_vfast, smem_size_fast, smem_size_med, smem_size_slow;
    void *l1d_SRAM_ptr;
    int l1d_failed = 0, l2_failed = 0;

    /* Snapshot the current configuration before any early return */
    l1D_SRAM_size_orig = lib_get_L1D_SRAM_size();
    l2_SRAM_size_orig  = lib_get_L2_SRAM_size();

    /* First, verify the provided/available memory meet requirements */
    tiCblasGetSizes(&smem_size_vfast, &smem_size_fast, &smem_size_med, &smem_size_slow);

    printf("BLAS memory requirements - vfast size: %lu, fast size: %lu, medium size: %lu, slow size: %lu.\n",
           (unsigned long)smem_size_vfast, (unsigned long)smem_size_fast,
           (unsigned long)smem_size_med, (unsigned long)smem_size_slow);

    if(  (smem_size_vfast> lib_get_L1D_total_size()) /* total available L1D  */
       ||(smem_size_fast > l2_buf_size)              /* provided L2 buffer   */
       ||(smem_size_med  > msmc_buf_size)            /* provided MSMC memory */
       ||(smem_size_slow > BLAS_L3_DDR_SIZE_ZERO)    /* DDR not used         */
      ) {
        return(-2);
    }

    /* Configure L1D if necessary */
    printf("Original L1D SRAM size is: %lu\n", (unsigned long)l1D_SRAM_size_orig);
    printf("Required L1D SRAM size is: %lu\n", (unsigned long)smem_size_vfast);
    if(l1D_SRAM_size_orig < smem_size_vfast) {    /* configure L1D if needs more SRAM */
      /* The reduction ORs every thread's failure flag together, so a failure
       * on any thread is reported instead of being overwritten by another. */
      #pragma omp parallel reduction(|:l1d_failed)
      {
        l1d_failed |= (lib_L1D_config_SRAM(smem_size_vfast) != LIB_CACHE_SUCCESS);
      }
    }

    #pragma omp parallel
    {
      /* Thread id is local to the region; a shared variable would race */
      int tid = omp_get_thread_num();
      printf("New L1D SRAM size from thread %d is: %lu\n", tid, (unsigned long)lib_get_L1D_SRAM_size());
    }

    /* Configure L2 if necessary */
    printf("Original L2 SRAM size is: %lu\n", (unsigned long)l2_SRAM_size_orig);
    printf("Required L2 SRAM size is: %lu\n", (unsigned long)smem_size_fast);
    if(l2_SRAM_size_orig < smem_size_fast) {       /* configure L2 if needs more SRAM */
      printf("Configuring L2 for each core:\n");
      #pragma omp parallel reduction(|:l2_failed)
      {
        l2_failed |= (lib_L2_config_SRAM(smem_size_fast) != LIB_CACHE_SUCCESS);
      }
    }

    if(l1d_failed || l2_failed) {
      return(-3);
    }

    #pragma omp parallel
    {
      int tid = omp_get_thread_num();
      printf("New L2 SRAM size from thread %d is: %lu\n", tid, (unsigned long)lib_get_L2_SRAM_size());
    }

    /* get L1D SRAM base address; the L2 region is the caller's l2_buf */
    l1d_SRAM_ptr = lib_get_L1D_SRAM_base();

    /* pass allocated memories for heap initialization */
    return(tiCblasInit(l1d_SRAM_ptr,  smem_size_vfast,
                       l2_buf,        smem_size_fast,
                       msmc_buf,      smem_size_med,
                       NULL,          BLAS_L3_DDR_SIZE_ZERO));
} /* config_mem_for_ticblas */

/*==============================================================================
 * This function reconfigures L1D and L2 after processing is finished.
 *
 * L1D and L2 SRAM are set back to the sizes held in l1D_SRAM_size_orig and
 * l2_SRAM_size_orig whenever the current size differs from them.
 *
 * Returns -4 when reconfiguring L1D or L2 fails on any thread, otherwise
 * TICBLAS_SUCCESS.
 *============================================================================*/
int reconfig_mem_after_ticblas(void)
{
    int l1d_failed = 0, l2_failed = 0;

    /* configure L1D back */
    if(l1D_SRAM_size_orig != lib_get_L1D_SRAM_size()) {
      /* The reduction ORs every thread's failure flag together, so a failure
       * on any thread is reported instead of being overwritten by another. */
      #pragma omp parallel reduction(|:l1d_failed)
      {
        l1d_failed |= (lib_L1D_config_SRAM(l1D_SRAM_size_orig) != LIB_CACHE_SUCCESS);
      }
    }

    /* configure L2 back; same "only if it differs" rule as for L1D */
    if(l2_SRAM_size_orig != lib_get_L2_SRAM_size()) {
      #pragma omp parallel reduction(|:l2_failed)
      {
        l2_failed |= (lib_L2_config_SRAM(l2_SRAM_size_orig) != LIB_CACHE_SUCCESS);
      }
    }

    if(l1d_failed || l2_failed) {
      return(-4);
    }

    printf("L1D SRAM size set to: %lu\n", (unsigned long)lib_get_L1D_SRAM_size());
    printf("L2 SRAM size set to: %lu\n", (unsigned long)lib_get_L2_SRAM_size());

    return(TICBLAS_SUCCESS);
} /* reconfig_mem_after_ticblas */