Added tuning examples.
authorJianzhong Xu <a0869574@ti.com>
Thu, 26 Mar 2015 20:18:42 +0000 (16:18 -0400)
committerJianzhong Xu <a0869574@ti.com>
Thu, 26 Mar 2015 20:18:42 +0000 (16:18 -0400)
13 files changed:
Makefile
build/tar_files_list.txt
examples/cgemm_tune/Makefile [new file with mode: 0644]
examples/cgemm_tune/cgemm_tune.c [new file with mode: 0644]
examples/dgemm_test/Makefile [new file with mode: 0644]
examples/dgemm_test/dgemm_test.c [new file with mode: 0644]
examples/dgemm_tune/Makefile [new file with mode: 0644]
examples/dgemm_tune/dgemm_tune.c [new file with mode: 0644]
examples/eig/main.c
examples/sgemm_tune/Makefile [new file with mode: 0644]
examples/sgemm_tune/sgemm_tune.c [new file with mode: 0644]
examples/zgemm_tune/Makefile [new file with mode: 0644]
examples/zgemm_tune/zgemm_tune.c [new file with mode: 0644]

index cc2276c65f0914f9b71a05d16d0774b9c46a17e5..b73b7db4a274aedcc77bbb89cf69b0d22329164d 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -20,10 +20,13 @@ ARMplusDSP:
        cd $(LINALG_CBLAS_DIR); make arch=ARM alllib; make arch=C66 alllib; \
        cd ../$(LINALG_BLIS_DIR); ./configure -p install/c66x c66x; make -j8; make install; \
        ./configure -p install/arm cortex-a15; make -j8; make install; \
-       cd ../$(LINALG_BLISACC_DIR); make cross; 
-       cd ../$(LINALG_BLIS_DIR)/testsuite; make lib=OpenCLCBLAS -j8
+       cd ../$(LINALG_BLISACC_DIR); make cross; \
+       cd ../../clapack; make f2clib; make cblaswrap; cd SRC; make 
+
+BLIStest:
+       cd $(LINALG_BLIS_DIR)/testsuite; make lib=OpenCLCBLAS -j8 
 
-cleanall:
+cleanARMplusDSP:
        cd $(LINALG_CBLAS_DIR); make arch=ARM clean; make arch=C66 clean; \
        cd ../$(LINALG_BLIS_DIR); ./configure -p install/c66x c66x; make clean; \
        ./configure -p install/arm cortex-a15; make clean; \
@@ -31,16 +34,16 @@ cleanall:
        cd ../$(LINALG_BLIS_DIR)/testsuite; make clean; \
        cd ../../clapack; make clean
 
+clean:
+       cd $(LINALG_CBLAS_DIR)/src; make arch=ARM clean; \
+       cd ../../$(LINALG_BLIS_DIR); ./configure -p install/arm cortex-a15; make clean; \
+       cd ../$(LINALG_BLISACC_DIR)/src; make -f Makefile.ARM cleanARM
+
 DSPonly:
        cd $(LINALG_CBLAS_DIR); make arch=C66 alllib; \
        cd ../$(LINALG_BLIS_DIR); ./configure -p install/c66x c66x; make -j8; make install; \
        cd ../$(LINALG_BLISACC_DIR)/src; make ti_cblas_kernel.dsp_h
        
-clean:
-       cd $(LINALG_CBLAS_DIR)/src; make arch=ARM clean; \
-       cd ../../$(LINALG_BLIS_DIR); ./configure -p install/arm cortex-a15; make clean; \
-       cd ../$(LINALG_BLISACC_DIR); make clean
-
 install:
        install -m 755 -d ${DESTDIR}/usr/include
        install -m 755 -d ${DESTDIR}/usr/lib
index 9c9d021208af10b353d2c5fa2e221c89de32311e..f6be7667b65547a3f16b76078b856067cbea69e6 100644 (file)
@@ -6,6 +6,11 @@ examples/Makefile
 examples/eig
 examples/ludinv
 examples/matmpy
+examples/sgemm_tune
+examples/dgemm_tune
+examples/cgemm_tune
+examples/zgemm_tune
+examples/dgemm_test
 blis/version
 blis/build
 blis/CHANGELOG
diff --git a/examples/cgemm_tune/Makefile b/examples/cgemm_tune/Makefile
new file mode 100644 (file)
index 0000000..8ab318b
--- /dev/null
@@ -0,0 +1,10 @@
+
+EXE = cgemm_tune
+
+include ../make.inc
+
+$(EXE): cgemm_tune.o
+       $(CC) $(CFLAGS) cgemm_tune.o $(BLASLIB) -o $@
+
+tune: $(EXE)
+       ./$(EXE);
\ No newline at end of file
diff --git a/examples/cgemm_tune/cgemm_tune.c b/examples/cgemm_tune/cgemm_tune.c
new file mode 100644 (file)
index 0000000..71110d3
--- /dev/null
@@ -0,0 +1,326 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions are met:
+ *       * Redistributions of source code must retain the above copyright
+ *         notice, this list of conditions and the following disclaimer.
+ *       * Redistributions in binary form must reproduce the above copyright
+ *         notice, this list of conditions and the following disclaimer in the
+ *         documentation and/or other materials provided with the distribution.
+ *       * Neither the name of Texas Instruments Incorporated nor the
+ *         names of its contributors may be used to endorse or promote products
+ *         derived from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ *   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ *   THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <time.h>
+#include <complex.h>
+
+#include "cblas.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+#include "cblas.h"
+#ifdef __cplusplus
+}
+#endif
+
+#define TUNING_START_SIZE_SQUARE_MATRIX 16
+#define TUNING_START_SIZE_RECTAN_MATRIX 8
+#define NUM_MATRIX_SIZE_TO_BENCHMARK    8 //16
+#define HAS_MEMORY   1
+#define NO_MEMORY    0
+#define OFFLOAD      1
+#define NO_OFFLOAD   0
+
+#define NUM_TEST_RUN 5
+
+/*-----------------------------------------------------------------------------
+* Timing Setup
+*----------------------------------------------------------------------------*/
+struct timespec t0,t1;
+#define tick()  clock_gettime(CLOCK_MONOTONIC, &t0);
+#define tock() (clock_gettime(CLOCK_MONOTONIC, &t1), \
+                        t1.tv_sec - t0.tv_sec + (t1.tv_nsec - t0.tv_nsec) / 1e9)
+
+/*-----------------------------------------------------------------------------
+* Global Variables
+*----------------------------------------------------------------------------*/
+float complex alpha = 0.7 - 0.3*I; 
+float complex beta  = 0.4 + 0.6*I;
+enum CBLAS_ORDER     order  = CblasColMajor; 
+enum CBLAS_TRANSPOSE transA = CblasNoTrans;
+enum CBLAS_TRANSPOSE transB = CblasNoTrans;
+
+extern int TI_CBLAS_L3_OFFLOAD;
+
+/*-----------------------------------------------------------------------------
+* Prototypes
+*----------------------------------------------------------------------------*/
+int check_results(const float complex *C1, const float complex *C2, int M, int N);
+int run_cgemm_dsp_and_arm(int M, int K, int N, float *time_dsp, float *time_arm, 
+                          float *gflops_dsp, float *gflops_arm);
+
+/*-----------------------------------------------------------------------------
+* MAIN
+*----------------------------------------------------------------------------*/
+int main()
+{
+    int num_size, cgemm_err;
+    int M, N, K, m, n, k;
+    int M_pre, N_pre, K_pre, M_start_size, N_start_size;
+    int offload_threshold_1, offload_threshold_2;
+    float total_GFLOPS_DSP, total_GFLOPS_ARM;
+    float time_DSP, time_ARM, t_dsp, t_arm;
+    char ofld_flag[NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK];
+    char mem_flag[NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK];
+    int skip_next_point;
+    float diff_tmp, diff_pre;
+    FILE *fp_flag, *fp_time, *fp_tbl;  
+  
+    fp_flag = fopen("ofld_flag_cgemm.dat","w");
+    fp_tbl  = fopen("ofld_tbl_cgemm.c","w");
+    fp_time = fopen("cgemm_time_ARMvsDSP.dat","w");
+
+    fprintf(fp_tbl, "char ofld_tbl_cgemm[TI_L3_OFFLOAD_TBL_SIZE] = {\n");
+    
+    srand(12345);
+
+    /* sweep M, K, and N */    
+    for (M=TUNING_START_SIZE_RECTAN_MATRIX,m=0; m<NUM_MATRIX_SIZE_TO_BENCHMARK; m++,M*=2) 
+    {
+        for (N=TUNING_START_SIZE_RECTAN_MATRIX,n=0; n<NUM_MATRIX_SIZE_TO_BENCHMARK; n++,N*=2) 
+        {
+            for (K=TUNING_START_SIZE_RECTAN_MATRIX,k=0; k<NUM_MATRIX_SIZE_TO_BENCHMARK; k++,K*=2) 
+            {
+                if(  (m>0 && ofld_flag[m-1][n][k]==OFFLOAD)
+                   ||(n>0 && ofld_flag[m][n-1][k]==OFFLOAD)
+                   ||(k>0 && ofld_flag[m][n][k-1]==OFFLOAD) ) {
+                    ofld_flag[m][n][k] = OFFLOAD;
+                    mem_flag[m][n][k]  = HAS_MEMORY;  // to avoid error
+                    time_DSP = -1.0;
+                    time_ARM = -1.0;
+                    printf("Offloading. Skipping (M,N,K)=(%d,%d,%d), (m,n,k)=(%d,%d,%d).\n", M,N,K,m,n,k);
+                }
+                else if(  (m>0 && (mem_flag[m-1][n][k]==NO_MEMORY))
+                        ||(n>0 && (mem_flag[m][n-1][k]==NO_MEMORY))
+                        ||(k>0 && (mem_flag[m][n][k-1]==NO_MEMORY))) {
+                    ofld_flag[m][n][k] = NO_OFFLOAD;
+                    mem_flag[m][n][k]  = NO_MEMORY;
+                    time_DSP = -2.0;
+                    time_ARM = -2.0;
+                    printf("Out of memory. Skipping (M,N,K)=(%d,%d,%d), (m,n,k)=(%d,%d,%d).\n", M,N,K,m,n,k);
+                }
+                else {                    
+                    printf("Measuring DSP and ARM GFLOPS for (M,N,K)=(%d,%d,%d), (m,n,k)=(%d,%d,%d).\n", M,N,K,m,n,k);
+                    cgemm_err = run_cgemm_dsp_and_arm(M, N, K, &t_dsp, &t_arm, &total_GFLOPS_DSP, &total_GFLOPS_ARM);
+                    //dsym_err  = run_dsymm_dsp_and_arm();
+              
+                    if(cgemm_err == -1) {  /* out of memory for DSP offloading */
+                        ofld_flag[m][n][k] = NO_OFFLOAD;
+                        mem_flag[m][n][k] = NO_MEMORY;
+                        time_DSP = -2.0;
+                        time_ARM = -2.0;
+                        printf("Out of memory, skipping next point.\n");
+                    }
+                    else {
+                        mem_flag[m][n][k] = HAS_MEMORY;                    
+                        time_DSP = t_dsp;
+                        time_ARM = t_arm;
+                        if (cgemm_err == 0){
+                            //if(total_GFLOPS_DSP - total_GFLOPS_ARM > 1.0) {
+                            if(t_dsp < t_arm) {
+                                ofld_flag[m][n][k] = OFFLOAD;
+                                printf("Offloading to DSP for this point. Skipping next point.\n");
+                            }
+                            else {
+                                ofld_flag[m][n][k] = NO_OFFLOAD;
+                            }
+                        }
+                        else {
+                            printf("Error in CGEMM tuning for (M,N,K)=(%d,%d,%d)!\n", M,N,K);
+                            exit(0);
+                        }
+                    }
+                }
+                
+                fprintf(fp_flag, "%d\t%d\n", (int)mem_flag[m][n][k], (int)ofld_flag[m][n][k]);
+                fprintf(fp_time, "%6d,%6d,%6d\t%10.8e\t%10.8e\n", M, N, K, time_ARM, time_DSP);
+
+                if(   (m==(NUM_MATRIX_SIZE_TO_BENCHMARK-1))
+                   && (n==(NUM_MATRIX_SIZE_TO_BENCHMARK-1))
+                   && (k==(NUM_MATRIX_SIZE_TO_BENCHMARK-1))) {
+                  fprintf(fp_tbl, "%d};", (int)ofld_flag[m][n][k]);
+                } else {
+                  fprintf(fp_tbl, "%d,", (int)ofld_flag[m][n][k]);
+                }
+            }
+            fprintf(fp_tbl, "\n");
+        }
+    }
+        
+    fclose(fp_flag);
+    fclose(fp_time);
+    fclose(fp_tbl);
+    
+    return 0;
+}
+
+
+int run_cgemm_dsp_and_arm(int M, int K, int N, float *time_dsp, float *time_arm, 
+                          float *gflops_dsp, float *gflops_arm)
+{
+    int    iter;
+    long long i;
+    float time_secs, total_time_dsp, total_time_arm;
+    float gflops_ARM, gflops_DSP;
+    float operation_count = 2.0*(float)M*(float)N*(float)K;
+    float total_GFLOPS_DSP = 0.0f;
+    float total_GFLOPS_ARM = 0.0f;
+    int    err_code = 0;
+    
+    total_time_dsp = 0.0;
+    total_time_arm = 0.0;
+    for (iter = 0; iter < NUM_TEST_RUN; iter++)
+    {      
+      /*-------------------------------------------------------------------------
+      * Allocate space for the matrices.  The matrices that will be passed to 
+      * the DSP are allocated using device memory.  The Carm array is not passed
+      * to the dsp and so can use system memory.
+      *------------------------------------------------------------------------*/
+      float complex *A    = (float complex*) __malloc_ddr(M*K*sizeof(float complex));
+      float complex *B    = (float complex*) __malloc_ddr(K*N*sizeof(float complex));
+      float complex *Cdsp = (float complex*) __malloc_ddr(M*N*sizeof(float complex));
+      float complex *Carm = (float complex*) malloc      (M*N*sizeof(float complex));
+  
+      if (!A || !B || !Cdsp || !Carm)
+      {
+          printf("Could not allocate enough space for the arrays!");
+          if(A) __free_ddr(A);
+          if(B) __free_ddr(B);
+          if(Cdsp) __free_ddr(Cdsp);
+          if(Carm) free(Carm);
+          
+          return (-1);
+      }
+  
+      /*-------------------------------------------------------------------------
+      * Initialize matrices and print if small enough.
+      *------------------------------------------------------------------------*/
+      for (i = 0; i < M*K; ++i) 
+      {
+          A[i] = (float)rand()/RAND_MAX + (float)rand()/RAND_MAX * I;
+      }
+      for (i = 0; i < K*N; ++i)
+      {
+          B[i] = (float)rand()/RAND_MAX + (float)rand()/RAND_MAX * I;
+      }
+      for (i = 0; i < M*N; ++i)
+      {
+          Carm[i] = Cdsp[i] = (float)rand()/RAND_MAX + (float)rand()/RAND_MAX * I;
+      }  
+  
+      int lda = ((order == CblasColMajor && transA == CblasNoTrans) ||
+              (order == CblasRowMajor && transA == CblasTrans)) ? M : K;
+  
+      int ldb = ((order == CblasColMajor && transB == CblasNoTrans) ||
+              (order == CblasRowMajor && transB == CblasTrans)) ? K : N;
+  
+      int ldc = (order == CblasColMajor) ? M : N;
+    
+      /*============ BLAS tuning: running on DSP and then on ARM =============*/
+      /*------------------------------------------------------------------------
+      * Time DSP cgemm
+      *-----------------------------------------------------------------------*/
+      //ti_cblas_offload_config("001");  /* force offloading level 3 to DSP */
+      TI_CBLAS_L3_OFFLOAD = 1;
+      
+      tick();
+      cblas_cgemm(order,transA,transB,M,N,K,&alpha,A,lda,B,ldb,&beta,Cdsp,ldc);
+      time_secs = tock();
+      total_time_dsp += time_secs;
+      gflops_DSP = operation_count/time_secs*1e-9;
+      total_GFLOPS_DSP += gflops_DSP;
+      
+      /*-------------------------------------------------------------------------
+      * Time ARM cgemm
+      *------------------------------------------------------------------------*/
+      //ti_cblas_offload_config("000");  /* force no offloading */
+      TI_CBLAS_L3_OFFLOAD = 0;
+      
+      tick();
+      cblas_cgemm(order,transA,transB,M,N,K,&alpha,A,lda,B,ldb,&beta,Carm,ldc);
+      time_secs = tock();
+      total_time_arm += time_secs;
+      gflops_ARM = operation_count/time_secs*1e-9;
+      total_GFLOPS_ARM += gflops_ARM;
+      //printf(" %6.3f  %6.3f  %9.6fs %9.6fs\n", gflops_DSP, gflops_ARM, time_dsp, time_arm);
+  
+      /*-------------------------------------------------------------------------
+      * Verify Results
+      *------------------------------------------------------------------------*/
+      //return check_results(Cdsp, Carm, M, N);
+      err_code += check_results(Cdsp, Carm, M, N);
+  
+      __free_ddr(A);
+      __free_ddr(B);
+      __free_ddr(Cdsp);
+      free(Carm);
+    }
+    
+    *gflops_dsp = total_GFLOPS_DSP;
+    *gflops_arm = total_GFLOPS_ARM;
+    *time_dsp   = total_time_dsp / (float)NUM_TEST_RUN;
+    *time_arm   = total_time_arm / (float)NUM_TEST_RUN;
+
+    return err_code;
+}
+
+/*-----------------------------------------------------------------------------
+* check_results
+*----------------------------------------------------------------------------*/
+int check_results(const float complex *C1, const float complex *C2, int M, int N)
+{
+    int i;
+    const float EPISILON = 1e-5;
+    //const float EPISILON = 1e-200;
+    const int NERRORS  = 5;
+    int       num_errors = 0;
+
+    for (i=0; i<M*N; i++)
+    {
+        float delta = cabs(C1[i]) - cabs(C2[i]);
+
+        if (delta > EPISILON*cabs(C1[i]))
+            if ((num_errors += 1) < NERRORS)
+                printf("Error [elem:%d]: %f <==> %f\n", i, cabs(C1[i]), cabs(C2[i]));
+    }
+
+    if (num_errors > 0)
+    {
+         printf("FAIL with %d errors!\n", num_errors);
+         return -1;
+    }
+    else 
+    {
+        //printf("PASS!\n");
+        return 0;
+    }
+}
+
+
diff --git a/examples/dgemm_test/Makefile b/examples/dgemm_test/Makefile
new file mode 100644 (file)
index 0000000..b7c5492
--- /dev/null
@@ -0,0 +1,8 @@
+
+EXE = dgemm_test
+
+include ../make.inc
+
+$(EXE): dgemm_test.o
+       $(CC) $(CFLAGS) dgemm_test.o $(BLASLIB) -o $@
+
diff --git a/examples/dgemm_test/dgemm_test.c b/examples/dgemm_test/dgemm_test.c
new file mode 100644 (file)
index 0000000..35e56bc
--- /dev/null
@@ -0,0 +1,208 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions are met:
+ *       * Redistributions of source code must retain the above copyright
+ *         notice, this list of conditions and the following disclaimer.
+ *       * Redistributions in binary form must reproduce the above copyright
+ *         notice, this list of conditions and the following disclaimer in the
+ *         documentation and/or other materials provided with the distribution.
+ *       * Neither the name of Texas Instruments Incorporated nor the
+ *         names of its contributors may be used to endorse or promote products
+ *         derived from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ *   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ *   THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <time.h>
+
+#include "cblas.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+#include "cblas.h"
+#ifdef __cplusplus
+}
+#endif
+
+#define TUNING_START_SIZE_RECTAN_MATRIX 64
+#define NUM_MATRIX_SIZE_TO_BENCHMARK 4
+#define HAS_MEMORY   1
+#define NO_MEMORY    0
+#define OFFLOAD      1
+#define NO_OFFLOAD   0
+
+#define NUM_TEST_RUN 1
+
+
+/*-----------------------------------------------------------------------------
+* Timing Setup
+*----------------------------------------------------------------------------*/
+struct timespec t0,t1;
+#define tick()  clock_gettime(CLOCK_MONOTONIC, &t0);
+#define tock() (clock_gettime(CLOCK_MONOTONIC, &t1), \
+                        t1.tv_sec - t0.tv_sec + (t1.tv_nsec - t0.tv_nsec) / 1e9)
+
+/*-----------------------------------------------------------------------------
+* Global Variables
+*----------------------------------------------------------------------------*/
+double alpha           = 0.7; 
+double beta            = 0.3;
+enum CBLAS_ORDER     order  = CblasColMajor; 
+//enum CBLAS_ORDER     order  = CblasRowMajor;
+enum CBLAS_TRANSPOSE transA = CblasNoTrans;
+enum CBLAS_TRANSPOSE transB = CblasNoTrans;
+
+extern int TI_CBLAS_L3_OFFLOAD;
+/*-----------------------------------------------------------------------------
+* Prototypes
+*----------------------------------------------------------------------------*/
+int run_dgemm(int M, int N, int K, float *time, float *gflops);
+
+/*-----------------------------------------------------------------------------
+* MAIN
+*----------------------------------------------------------------------------*/
+int main()
+{
+    int num_size, dgemm_err;
+    int M, N, K, m, n, k;
+    int M_pre, N_pre, K_pre, M_start_size, N_start_size;
+    float time_secs_arm, gflops_arm, time_secs_dsp, gflops_dsp, time_secs_opt, gflops_opt;
+    FILE *fp_time, *fp_flops;  
+  
+    fp_time = fopen("dgemm_time.dat","w");
+    fp_flops = fopen("dgemm_flops.dat","w");
+
+    srand(12345);
+    
+       /* setting up TI CBLAS during first call */
+       run_dgemm(100, 100, 100, &time_secs_arm, &gflops_arm);
+       
+    /* sweep M, K, and N */    
+    for (M=TUNING_START_SIZE_RECTAN_MATRIX,m=0; m<NUM_MATRIX_SIZE_TO_BENCHMARK; m++,M*=2) 
+    {
+        for (N=TUNING_START_SIZE_RECTAN_MATRIX,n=0; n<NUM_MATRIX_SIZE_TO_BENCHMARK; n++,N*=2) 
+        {
+            for (K=TUNING_START_SIZE_RECTAN_MATRIX,k=0; k<NUM_MATRIX_SIZE_TO_BENCHMARK; k++,K*=2) 
+            {
+                printf("Running DGEMM for (M,N,K) = (%d,%d,%d).\n", M,N,K);
+                               
+                               TI_CBLAS_L3_OFFLOAD = 0;
+                dgemm_err = run_dgemm(M, N, K, &time_secs_arm, &gflops_arm);
+          
+                if(dgemm_err == -1) {  /* out of memory for DSP offloading */
+                    printf("Out of memory for (M,N,K) = (%d,%d,%d).\n", M,N,K);
+                }
+                else {
+                               TI_CBLAS_L3_OFFLOAD = 1;
+                    dgemm_err = run_dgemm(M, N, K, &time_secs_dsp, &gflops_dsp);
+    
+                               TI_CBLAS_L3_OFFLOAD = 2;
+                    dgemm_err = run_dgemm(M, N, K, &time_secs_opt, &gflops_opt);
+
+                    fprintf(fp_time, "%6d\t%6d\t%6d\t%10.8e\t%10.8e\t%10.8e\n", 
+                                               M, N, K, time_secs_arm, time_secs_dsp, time_secs_opt);
+                    fprintf(fp_flops, "%6d\t%6d\t%6d\t%10.8e\t%10.8e\t%10.8e\n", 
+                            M, N, K, gflops_arm, gflops_dsp, gflops_opt);
+                }
+            }
+        }
+    }
+        
+    fclose(fp_time);
+    fclose(fp_flops);
+    
+    return 0;
+}
+
+
+int run_dgemm(int M, int N, int K, float *time, float *gflops)
+{
+    int    iter;
+    long long i;
+    double time_secs, total_time;
+    double operation_count = 2.0*(double)M*(double)N*(double)K;
+    double total_GFLOPS = 0.0f;
+    int    err_code = 0;
+    
+    total_time = 0.0;
+    for (iter = 0; iter < NUM_TEST_RUN; iter++)
+    {      
+      /*-------------------------------------------------------------------------
+      * Allocate space for the matrices.  
+      *------------------------------------------------------------------------*/
+      double *A = (double *) __malloc_ddr((long long)M*(long long)K*(long long)sizeof(double));
+      double *B = (double *) __malloc_ddr((long long)K*(long long)N*(long long)sizeof(double));
+      double *C = (double *) __malloc_ddr((long long)M*(long long)N*(long long)sizeof(double));
+  
+      if (!A || !B || !C)
+      {
+          printf("Could not allocate enough space for the arrays!");
+          if(A) __free_ddr(A);
+          if(B) __free_ddr(B);
+          if(C) __free_ddr(C);
+          
+          return (-1);
+      }
+  
+      /*-------------------------------------------------------------------------
+      * Initialize matrices 
+      *------------------------------------------------------------------------*/
+      for (i = 0; i < (long long)M*K; ++i) A[i] = (double)rand()/RAND_MAX;// (double)(rand() % 5 + 1);
+      for (i = 0; i < (long long)K*N; ++i) B[i] = (double)rand()/RAND_MAX;// (double)(rand() % 5 + 1);
+      for (i = 0; i < (long long)M*N; ++i) C[i] = 0;
+  
+      int lda = ((order == CblasColMajor && transA == CblasNoTrans) ||
+              (order == CblasRowMajor && transA == CblasTrans)) ? M : K;
+  
+      int ldb = ((order == CblasColMajor && transB == CblasNoTrans) ||
+              (order == CblasRowMajor && transB == CblasTrans)) ? K : N;
+  
+      int ldc = (order == CblasColMajor) ? M : N;
+  
+      fflush(stdout);
+  
+      /*------------------------------------------------------------------------
+      * Run and time dgemm
+      *-----------------------------------------------------------------------*/
+      tick();
+      cblas_dgemm(order,transA,transB,M,N,K,alpha,A,lda,B,ldb,beta,C,ldc);
+      time_secs = tock();
+      total_time += time_secs;
+      total_GFLOPS += operation_count/time_secs*1e-9;
+/*      
+      if(M==4096 && K==256 && N==16) {
+          FILE *file_a = fopen("mat_a.dat","w");
+          FILE *file_b = fopen("mat_b.dat","w");
+          FILE *file_c = fopen("mat_c.dat","w");
+          
+          for(i=0; i < M*K; ++i) fprintf(file_a, "%1.10e\n",A[i]);
+          for(i=0; i < K*N; ++i) fprintf(file_b, "%1.10e\n",B[i]);
+          for(i=0; i < M*N; ++i) fprintf(file_c, "%1.10e\n",C[i]);
+      }
+*/      
+  
+      __free_ddr(A);
+      __free_ddr(B);
+      __free_ddr(C);
+    }
+    
+    *gflops = total_GFLOPS / (double)NUM_TEST_RUN;
+    *time   = total_time / (double)NUM_TEST_RUN;
+    
+    return err_code;
+}
diff --git a/examples/dgemm_tune/Makefile b/examples/dgemm_tune/Makefile
new file mode 100644 (file)
index 0000000..4887def
--- /dev/null
@@ -0,0 +1,10 @@
+
+EXE = dgemm_tune
+
+include ../make.inc
+
+$(EXE): dgemm_tune.o
+       $(CC) $(CFLAGS) dgemm_tune.o $(BLASLIB) -o $@
+
+tune: $(EXE)
+       ./$(EXE);
\ No newline at end of file
diff --git a/examples/dgemm_tune/dgemm_tune.c b/examples/dgemm_tune/dgemm_tune.c
new file mode 100644 (file)
index 0000000..c3ebcc7
--- /dev/null
@@ -0,0 +1,332 @@
+/******************************************************************************
+ * Copyright (c) 2013-2015, Texas Instruments Incorporated - http://www.ti.com/
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions are met:
+ *       * Redistributions of source code must retain the above copyright
+ *         notice, this list of conditions and the following disclaimer.
+ *       * Redistributions in binary form must reproduce the above copyright
+ *         notice, this list of conditions and the following disclaimer in the
+ *         documentation and/or other materials provided with the distribution.
+ *       * Neither the name of Texas Instruments Incorporated nor the
+ *         names of its contributors may be used to endorse or promote products
+ *         derived from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ *   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ *   THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <time.h>
+
+#include "cblas.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+#include "cblas.h"
+#ifdef __cplusplus
+}
+#endif
+
+#define TUNING_START_SIZE_SQUARE_MATRIX 16
+#define TUNING_START_SIZE_RECTAN_MATRIX 8
+#define NUM_MATRIX_SIZE_TO_BENCHMARK    8 //16
+#define HAS_MEMORY   1
+#define NO_MEMORY    0
+#define OFFLOAD      1
+#define NO_OFFLOAD   0
+
+#define NUM_TEST_RUN 5
+
+
+/*-----------------------------------------------------------------------------
+* Timing Setup
+*----------------------------------------------------------------------------*/
+struct timespec t0,t1;
+#define tick()  clock_gettime(CLOCK_MONOTONIC, &t0);
+#define tock() (clock_gettime(CLOCK_MONOTONIC, &t1), \
+                        t1.tv_sec - t0.tv_sec + (t1.tv_nsec - t0.tv_nsec) / 1e9)
+
+/*-----------------------------------------------------------------------------
+* Global Variables
+*----------------------------------------------------------------------------*/
+double alpha           = 0.7; 
+double beta            = 0.3;
+enum CBLAS_ORDER     order  = CblasColMajor; 
+//enum CBLAS_ORDER     order  = CblasRowMajor;
+enum CBLAS_TRANSPOSE transA = CblasNoTrans;
+enum CBLAS_TRANSPOSE transB = CblasNoTrans;
+
+extern int TI_CBLAS_L3_OFFLOAD;
+/*-----------------------------------------------------------------------------
+* Prototypes
+*----------------------------------------------------------------------------*/
+int check_results(const double *C1, const double *C2, int M, int N);
+int run_dgemm_dsp_and_arm(int M, int N, int K, double *time_dsp, double *time_arm, 
+                          double *gflops_dsp, double *gflops_arm);
+
+/*-----------------------------------------------------------------------------
+* MAIN
+*----------------------------------------------------------------------------*/
+int main()
+{
+    int num_size, dgemm_err;
+    int M, N, K, m, n, k;
+    int M_pre, N_pre, K_pre, M_start_size, N_start_size;
+    int offload_threshold_1, offload_threshold_2;
+    double total_GFLOPS_DSP, total_GFLOPS_ARM;
+    double time_DSP, time_ARM, t_dsp, t_arm;
+    char ofld_flag[NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK];
+    char mem_flag[NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK];
+    int skip_next_point;
+    float diff_tmp, diff_pre;
+    FILE *fp_flag, *fp_time, *fp_tbl;  
+  
+    fp_flag = fopen("ofld_flag_dgemm.dat","w");
+    fp_tbl  = fopen("ofld_tbl_dgemm.c","w");
+    fp_time = fopen("dgemm_time_ARMvsDSP.dat","w");
+
+    fprintf(fp_tbl, "char ofld_tbl_dgemm[TI_L3_OFFLOAD_TBL_SIZE] = {\n");
+    
+    srand(12345);
+    
+    /* sweep M, K, and N */    
+    for (M=TUNING_START_SIZE_RECTAN_MATRIX,m=0; m<NUM_MATRIX_SIZE_TO_BENCHMARK; m++,M*=2) 
+    {
+        for (N=TUNING_START_SIZE_RECTAN_MATRIX,n=0; n<NUM_MATRIX_SIZE_TO_BENCHMARK; n++,N*=2) 
+        {
+            for (K=TUNING_START_SIZE_RECTAN_MATRIX,k=0; k<NUM_MATRIX_SIZE_TO_BENCHMARK; k++,K*=2) 
+            {
+                if(  (m>0 && ofld_flag[m-1][n][k]==OFFLOAD)
+                   ||(n>0 && ofld_flag[m][n-1][k]==OFFLOAD)
+                   ||(k>0 && ofld_flag[m][n][k-1]==OFFLOAD) ) {
+                    ofld_flag[m][n][k] = OFFLOAD;
+                    mem_flag[m][n][k]  = HAS_MEMORY;  // to avoid error
+                    time_DSP = -1.0;
+                    time_ARM = -1.0;
+                    printf("Offloading. Skipping (M,N,K)=(%d,%d,%d), (m,n,k)=(%d,%d,%d).\n", M,N,K,m,n,k);
+                }
+                else if(  (m>0 && (mem_flag[m-1][n][k]==NO_MEMORY))
+                        ||(n>0 && (mem_flag[m][n-1][k]==NO_MEMORY))
+                        ||(k>0 && (mem_flag[m][n][k-1]==NO_MEMORY))) {
+                    ofld_flag[m][n][k] = NO_OFFLOAD;
+                    mem_flag[m][n][k]  = NO_MEMORY;
+                    time_DSP = -2.0;
+                    time_ARM = -2.0;
+                    printf("Out of memory. Skipping (M,N,K)=(%d,%d,%d), (m,n,k)=(%d,%d,%d).\n", M,N,K,m,n,k);
+                }
+                else {                    
+                    printf("Measuring DSP and ARM GFLOPS for (M,N,K)=(%d,%d,%d), (m,n,k)=(%d,%d,%d).\n", M,N,K,m,n,k);
+                    dgemm_err = run_dgemm_dsp_and_arm(M, N, K, &t_dsp, &t_arm, &total_GFLOPS_DSP, &total_GFLOPS_ARM);
+                    //dsym_err  = run_dsymm_dsp_and_arm();
+              
+                    if(dgemm_err == -1) {  /* out of memory for DSP offloading */
+                        ofld_flag[m][n][k] = NO_OFFLOAD;
+                        mem_flag[m][n][k] = NO_MEMORY;
+                        time_DSP = -2.0;
+                        time_ARM = -2.0;
+                        printf("Out of memory, skipping next point.\n");
+                    }
+                    else {
+                        mem_flag[m][n][k] = HAS_MEMORY;                    
+                        time_DSP = t_dsp;
+                        time_ARM = t_arm;
+                        if (dgemm_err == 0){
+                            //if(total_GFLOPS_DSP - total_GFLOPS_ARM > 1.0) {
+                            if(t_dsp < t_arm) {
+                                ofld_flag[m][n][k] = OFFLOAD;
+                                printf("Offloading to DSP for this point. Skipping next point.\n");
+                            }
+                            else {
+                                ofld_flag[m][n][k] = NO_OFFLOAD;
+                            }
+                        }
+                        else {
+                            printf("Error in DGEMM tuning for (M,N,K)=(%d,%d,%d)!\n", M,N,K);
+                            exit(0);
+                        }
+                    }
+                }
+                
+                fprintf(fp_flag, "%d\t%d\n", (int)mem_flag[m][n][k], (int)ofld_flag[m][n][k]);
+                fprintf(fp_time, "%6d,%6d,%6d\t%10.8e\t%10.8e\n", M, N, K, time_ARM, time_DSP);
+
+                if(   (m==(NUM_MATRIX_SIZE_TO_BENCHMARK-1))
+                   && (n==(NUM_MATRIX_SIZE_TO_BENCHMARK-1))
+                   && (k==(NUM_MATRIX_SIZE_TO_BENCHMARK-1))) {
+                  fprintf(fp_tbl, "%d};", (int)ofld_flag[m][n][k]);
+                } else {
+                  fprintf(fp_tbl, "%d,", (int)ofld_flag[m][n][k]);
+                }
+            }
+            fprintf(fp_tbl, "\n");
+        }
+    }
+        
+    fclose(fp_flag);
+    fclose(fp_time);
+    fclose(fp_tbl);
+    
+    return 0;
+}
+
+
+int run_dgemm_dsp_and_arm(int M, int N, int K, double *time_dsp, double *time_arm, 
+                          double *gflops_dsp, double *gflops_arm)
+{
+    int    iter;
+    long long i;
+    double time_secs, total_time_dsp, total_time_arm;
+    double gflops_ARM, gflops_DSP;
+    double operation_count = 2.0*(double)M*(double)N*(double)K;
+    double total_GFLOPS_DSP = 0.0f;
+    double total_GFLOPS_ARM = 0.0f;
+    int    err_code = 0;
+    
+    total_time_dsp = 0.0;
+    total_time_arm = 0.0;
+    for (iter = 0; iter < NUM_TEST_RUN; iter++)
+    {      
+      /*-------------------------------------------------------------------------
+      * Allocate space for the matrices.  The matrices that will be passed to 
+      * the DSP are allocated using device memory.  The Carm array is not passed
+      * to the dsp and so can use system memory.
+      *------------------------------------------------------------------------*/
+      double *A    = (double *) __malloc_ddr((long long)M*(long long)K*(long long)sizeof(double));
+      double *B    = (double *) __malloc_ddr((long long)K*(long long)N*(long long)sizeof(double));
+      double *Cdsp = (double *) __malloc_ddr((long long)M*(long long)N*(long long)sizeof(double));
+      double *Carm = (double *) malloc      ((long long)M*(long long)N*(long long)sizeof(double));
+  
+      if (!A || !B || !Cdsp || !Carm)
+      {
+          printf("Could not allocate enough space for the arrays!");
+          if(A) __free_ddr(A);
+          if(B) __free_ddr(B);
+          if(Cdsp) __free_ddr(Cdsp);
+          if(Carm) free(Carm);
+          
+          return (-1);
+      }
+  
+      /*-------------------------------------------------------------------------
+      * Initialize matrices 
+      *------------------------------------------------------------------------*/
+      for (i = 0; i < (long long)M*K; ++i) A[i] = (double)rand()/RAND_MAX;// (double)(rand() % 5 + 1);
+      for (i = 0; i < (long long)K*N; ++i) B[i] = (double)rand()/RAND_MAX;// (double)(rand() % 5 + 1);
+      for (i = 0; i < (long long)M*N; ++i) Carm[i] = Cdsp[i] = 0;
+  
+      int lda = ((order == CblasColMajor && transA == CblasNoTrans) ||
+              (order == CblasRowMajor && transA == CblasTrans)) ? M : K;
+  
+      int ldb = ((order == CblasColMajor && transB == CblasNoTrans) ||
+              (order == CblasRowMajor && transB == CblasTrans)) ? K : N;
+  
+      int ldc = (order == CblasColMajor) ? M : N;
+  
+      fflush(stdout);
+  
+      /*============ BLAS tuning: running on DSP and then on ARM =============*/
+      /*------------------------------------------------------------------------
+      * Time DSP dgemm
+      *-----------------------------------------------------------------------*/
+      //ti_cblas_offload_config("001");  /* force offloading level 3 to DSP */
+      //printf("Running on DSP.\n");
+      TI_CBLAS_L3_OFFLOAD = 1;
+      
+      tick();
+      cblas_dgemm(order,transA,transB,M,N,K,alpha,A,lda,B,ldb,beta,Cdsp,ldc);
+      time_secs = tock();
+      total_time_dsp += time_secs;
+      gflops_DSP = operation_count/time_secs*1e-9;
+      total_GFLOPS_DSP += gflops_DSP;
+/*      
+      if(M==4096 && K==256 && N==16) {
+          FILE *file_a = fopen("mat_a.dat","w");
+          FILE *file_b = fopen("mat_b.dat","w");
+          FILE *file_c = fopen("mat_c.dat","w");
+          
+          for(i=0; i < M*K; ++i) fprintf(file_a, "%1.10e\n",A[i]);
+          for(i=0; i < K*N; ++i) fprintf(file_b, "%1.10e\n",B[i]);
+          for(i=0; i < M*N; ++i) fprintf(file_c, "%1.10e\n",Cdsp[i]);
+      }
+*/      
+      /*-------------------------------------------------------------------------
+      * Time ARM dgemm
+      *------------------------------------------------------------------------*/
+      //ti_cblas_offload_config("000");  /* force no offloading */
+      //printf("Running on ARM.\n");
+      TI_CBLAS_L3_OFFLOAD = 0;
+      
+      tick();
+      cblas_dgemm(order,transA,transB,M,N,K,alpha,A,lda,B,ldb,beta,Carm,ldc);
+      time_secs = tock();
+      total_time_arm += time_secs;
+      gflops_ARM = operation_count/time_secs*1e-9;
+      total_GFLOPS_ARM += gflops_ARM;
+      //printf(" %6.3f  %6.3f  %9.6fs %9.6fs\n", gflops_DSP, gflops_ARM, time_dsp, time_arm);
+      fflush(stdout);
+  
+      /*-------------------------------------------------------------------------
+      * Verify Results
+      *------------------------------------------------------------------------*/
+      //return check_results(Cdsp, Carm, M, N);
+      err_code += check_results(Cdsp, Carm, M, N);
+  
+      __free_ddr(A);
+      __free_ddr(B);
+      __free_ddr(Cdsp);
+      free(Carm);
+    }
+    
+    *gflops_dsp = total_GFLOPS_DSP;
+    *gflops_arm = total_GFLOPS_ARM;
+    *time_dsp   = total_time_dsp / (double)NUM_TEST_RUN;
+    *time_arm   = total_time_arm / (double)NUM_TEST_RUN;
+    
+    return err_code;
+}
+
+
+/*-----------------------------------------------------------------------------
+* check_results
+*----------------------------------------------------------------------------*/
+int check_results(const double *C1, const double *C2, int M, int N)
+{
+    int i;
+    const double EPISILON = 1e-10;
+    //const double EPISILON = 1e-200;
+    const int NERRORS  = 5;
+    int       num_errors = 0;
+
+    for (i=0; i<(long)M*N; i++)
+    {
+        double delta = fabs(C1[i] - C2[i]);
+        if (delta > EPISILON*fabs(C1[i]))
+            if ((num_errors += 1) < NERRORS)
+                printf("Error [elem:%d]: %e <==> %e\n", i, C1[i], C2[i]);
+    }
+
+    if (num_errors > 0)
+    {
+         printf("FAIL with %d errors!\n", num_errors);
+         return num_errors;
+    }
+    else 
+    {
+        //printf("PASS!\n");
+        return 0;
+    }
+}
+
+
index bac05431883c6dfdc9745a5dabdfe9dd09cb35d8..10aff474e349099fa1c89e50af0acf4420737860 100644 (file)
 #include <string.h>
 #include <time.h>
 #include <math.h>
-#include <complex.h>
+
 #ifdef __cplusplus
 extern "C" {
 #endif
 #include "cblas.h"
+#include "f2c.h"
+#include "blaswrap.h"
+#include "clapack.h"
 #ifdef __cplusplus
 }
 #endif
@@ -51,32 +54,13 @@ struct timespec t0,t1;
 /* Auxiliary routines prototypes */
 void print_eigenvalues(char * desc, int n, double * vector_r, double * vector_i);
 void print_eigenvectors(char * desc, int n, double * vector_i, double * v, int ldv);
-void form_diag_matrix(double complex *matrix_diag, double *vector_r, double *vector_i, int n);
-void form_eigen_matrix(double complex *mat, int n, double *vector_i, double *v, int ldv); 
-void print_real_matrix(char * desc, double * mat, int n, int m, int ldv); 
-void print_complex_matrix(char * desc, double complex *mat, int n, int m, int ldv);
-double compute_evd_error(double complex *matrix_reconstruct, double *matrix_original, int n, int ld);
+void form_diag_matrix(doublecomplex *matrix_diag, doublereal *vector_r, doublereal *vector_i, int n);
+void form_eigen_matrix(doublecomplex *mat, int n, double* vector_i, double* v, int ldv); 
+void print_real_matrix(char* desc, doublereal *mat, int n, int m, int ldv); 
+void print_complex_matrix(char* desc, doublecomplex *mat, int n, int m, int ldv);
+doublereal compute_evd_error(doublecomplex *matrix_reconstruct, doublereal *matrix_original, int n, int ld);
 void print_performance(char * func_name, int *matrix_sizes, float *time, int num_test, int num_run);
 
-/* LAPACK routines */
-extern int zlacpy_(char *, int *, int *, double complex *, int *, 
-                   double complex *, int *);
-extern int dlatmr_(int *, int *, char *, int *, char *, 
-                   double *, int *, double *, double *, char *, char *,
-                   double *, int *, double *, double *, int *, 
-                   double *, char *, int *, int *, int *, 
-                   double *, double *, char *, double *, int *, 
-                   int *, int *);
-extern int zgetrf_(int *, int *, double complex *, int *, int *, int *);
-extern int dgeev_ (char *, char *, int *, double *,
-                   int *, double *, double *, double *, int *, 
-                   double *, int *, double *, int *, int *);
-extern int zgetri_(int *, double complex *, int *, int *, 
-                   double complex *, int *, int *);
-extern int zgemm_(char *, char *, int *, int *, int *, double complex *, double complex *, 
-                  int *, double complex *, int *, double complex *, double complex *, int *);
-               
-
 /* Parameters */
 #define MAX_ALLOWED_ERR (1e-10)
 
@@ -88,22 +72,22 @@ int main(int argc, char *argv[])
 {
     /* Locals */
     float time, time_ev, time_cinv, *time_ev_all_runs, *time_cinv_all_runs;
-    int num_test, num_run, i, j, n, n_min, n_inc;
-    int lda, ldl, ldr, info, lwork;
-    double wkopt;
-    double* work;
-    int *vector_pivot, *vector_work;
-    double max_error_evd;
-    double *matrix_A, *matrix_A_cpy, *matrix_R, *matrix_L;
-    double *vector_diag, *vector_diagl, *vector_diagr, *vector_r, *vector_i;
-    double complex *matrix_D, *matrix_P, *matrix_invP, *matrix_A_cmplx, *work_dcomplex, lwork_opt;
+    integer num_test, num_run, i, j, n, n_min, n_inc;
+    integer lda, ldl, ldr, info, lwork;
+    doublereal wkopt;
+    doublereal* work;
+    integer *vector_pivot, *vector_work;
+       doublereal max_error_evd;
+    doublereal *matrix_A, *matrix_A_cpy, *matrix_R, *matrix_L;
+       doublereal *vector_diag, *vector_diagl, *vector_diagr, *vector_r, *vector_i;
+    doublecomplex *matrix_D, *matrix_P, *matrix_invP, *matrix_A_cmplx, *work_dcomplex, lwork_opt;
     int *matrix_sizes;
-    int const_int6 = 6;
-    double const_dbr1 = 1.;
-    double const_dbr0 = 0.;       
-    double complex const_dbc1 = 1. + 0.*I;
-    double complex const_dbc0 = 0. + 0.*I;
-    int rand_seed[4] = {10,123,278,3579};  /* numbers must lie between 0 and 4095 */
+    integer const_int6 = 6;
+    doublereal const_dbr1 = 1.;
+    doublereal const_dbr0 = 0.;       
+    doublecomplex const_dbc1 = {1.,0.};
+    doublecomplex const_dbc0 = {0.,0.};
+       integer rand_seed[4] = {10,123,278,3579};  /* numbers must lie between 0 and 4095 */
     
     fprintf(stdout, "\n\n======= LAPACK example: eigen decomposition =======\n");
     if(argc == 1) { /* no command line arguments, use default */
@@ -142,20 +126,20 @@ int main(int argc, char *argv[])
         {
         
             /* Allocate memory for matrix and vectors */
-            matrix_A     = (double *)__malloc_ddr(sizeof(double)*n*n);
-            matrix_A_cpy = (double *)malloc(sizeof(double)*n*n);
-            vector_pivot = (int    *)__malloc_ddr(sizeof(int)*n);     
-            vector_work  = (int    *)malloc(sizeof(int)*n);     /* not used in the example */
-            vector_diag  = (double *)malloc(sizeof(double)*n);  /* not used in the example */
-            vector_diagl = (double *)malloc(sizeof(double)*n);  /* not used in the example */
-            vector_diagr = (double *)malloc(sizeof(double)*n);  /* not used in the example */
-            matrix_D     = (double complex *)__malloc_ddr(n*n*sizeof(double complex));
-            matrix_P     = (double complex *)__malloc_ddr(n*n*sizeof(double complex));
-            matrix_invP  = (double complex *)__malloc_ddr(n*n*sizeof(double complex));
-            vector_r     = (double *)__malloc_ddr(sizeof(double)*n);
-            vector_i     = (double *)__malloc_ddr(sizeof(double)*n);
-            matrix_L     = (double *)__malloc_ddr(sizeof(double)*n*n); /* left eigenvectors */
-            matrix_R     = (double *)__malloc_ddr(sizeof(double)*n*n); /* right eigenvectors */
+            matrix_A     = (doublereal *)__malloc_ddr(sizeof(doublereal)*n*n);
+            matrix_A_cpy = (doublereal *)malloc(sizeof(doublereal)*n*n);
+            vector_pivot = (integer    *)__malloc_ddr(sizeof(integer)*n);     
+            vector_work  = (integer    *)malloc(sizeof(integer)*n);     /* not used in the example */
+            vector_diag  = (doublereal *)malloc(sizeof(doublereal)*n);  /* not used in the example */
+            vector_diagl = (doublereal *)malloc(sizeof(doublereal)*n);  /* not used in the example */
+            vector_diagr = (doublereal *)malloc(sizeof(doublereal)*n);  /* not used in the example */
+            matrix_D     = (doublecomplex *)__malloc_ddr(n*n*sizeof(doublecomplex));
+            matrix_P     = (doublecomplex *)__malloc_ddr(n*n*sizeof(doublecomplex));
+            matrix_invP  = (doublecomplex *)__malloc_ddr(n*n*sizeof(doublecomplex));
+            vector_r     = (doublereal *)__malloc_ddr(sizeof(doublereal)*n);
+            vector_i     = (doublereal *)__malloc_ddr(sizeof(doublereal)*n);
+            matrix_L     = (doublereal *)__malloc_ddr(sizeof(doublereal)*n*n); /* left eigenvectors */
+            matrix_R     = (doublereal *)__malloc_ddr(sizeof(doublereal)*n*n); /* right eigenvectors */
         
             /* Generate a N by N random matrix using LAPACK auxiliary testing routine. */
             printf("\nStart test run %d out of %d. \n", j+1, num_run);
@@ -171,7 +155,7 @@ int main(int argc, char *argv[])
             }
             
             print_real_matrix("Generated random matrix (double real): ", matrix_A, n, n, lda);
-            memcpy((void *)matrix_A_cpy, (void *)matrix_A, sizeof(double)*n*n);
+            memcpy((void *)matrix_A_cpy, (void *)matrix_A, sizeof(doublereal)*n*n);
 
             //time = read_cycle();
             tick();
@@ -182,7 +166,7 @@ int main(int argc, char *argv[])
                    &wkopt, &lwork, &info);       /* to find right eigenvectors only */
         
             lwork = (int)wkopt;  /* get optimal workspace size returned by dgeev_() */
-            work = (double*)__malloc_ddr( lwork*sizeof(double) );
+            work = (doublereal*)__malloc_ddr( lwork*sizeof(doublereal) );
         
             /* Find eigenvectors and eigenvalues: AP = PD, where colums of P are eigenvectors,
                and diagonal elements of D are eigenvalues. */
@@ -239,8 +223,8 @@ int main(int argc, char *argv[])
             lwork = -1;
             zgetri_(&n, matrix_invP, &n, vector_pivot, &lwork_opt, &lwork, &info);    
             
-            lwork = (int)creal(lwork_opt); /* get optimal workspace size returned by zgetri_() */
-            work_dcomplex = (double complex *)malloc( lwork*sizeof(double complex) );
+            lwork = (int)lwork_opt.r; /* get optimal workspace size returned by zgetri_() */
+            work_dcomplex = (doublecomplex *)malloc( lwork*sizeof(doublecomplex) );
             
             /* Compute inverse of P based on LUD results. */
             zgetri_(&n, matrix_invP, &n, vector_pivot, work_dcomplex, &lwork, &info);
@@ -259,7 +243,7 @@ int main(int argc, char *argv[])
             print_complex_matrix("Inverse of eigenvector matrix P:", matrix_invP, n, n, n);
 
             /* Compute P*D*inv(P). */ 
-            work_dcomplex = (double complex *)malloc(sizeof(double complex)*n*n);
+            work_dcomplex = (doublecomplex *)malloc(sizeof(doublecomplex)*n*n);
             zgemm_("N", "N", &n, &n, &n, &const_dbc1, matrix_P, &n, matrix_D, &n, &const_dbc0, work_dcomplex, &n);
             //EDMA_Free();
             
@@ -358,7 +342,7 @@ void print_eigenvectors( char* desc, int n, double* vector_i, double* v, int ldv
    }
 }
 
-void form_eigen_matrix(double complex *mat, int n, double* vector_i, double* v, int ldv) 
+void form_eigen_matrix(doublecomplex *mat, int n, double* vector_i, double* v, int ldv) 
 {
     int i, j;
     
@@ -366,29 +350,34 @@ void form_eigen_matrix(double complex *mat, int n, double* vector_i, double* v,
       j = 0;
       while( j < n ) {
          if( vector_i[j] == (double)0.0 ) {
-            mat[i+j*ldv] = v[i+j*ldv] + 0*I;
+            mat[i+j*ldv].r = v[i+j*ldv];
+            mat[i+j*ldv].i = 0;
             j++;
          } else {
-            mat[i+j*ldv] = v[i+j*ldv] + v[i+(j+1)*ldv]*I;
-            mat[i+(j+1)*ldv] = v[i+j*ldv] - v[i+(j+1)*ldv]*I;
+            mat[i+j*ldv].r = v[i+j*ldv];
+            mat[i+j*ldv].i = v[i+(j+1)*ldv];
+            mat[i+(j+1)*ldv].r = v[i+j*ldv];
+            mat[i+(j+1)*ldv].i = -v[i+(j+1)*ldv];
             j += 2;
          }
       }
    }
 }
 
-void form_diag_matrix(double complex *matrix_diag, double *vector_r, double *vector_i, int n)
+void form_diag_matrix(doublecomplex *matrix_diag, doublereal *vector_r, doublereal *vector_i, int n)
 {
    int i, j;
   
    for(i=0; i<n; i++) {
       for(j=0; j<n; j++) {
-        matrix_diag[i+j*n] = 0. + 0.*I;
+           matrix_diag[i+j*n].r = 0.;
+           matrix_diag[i+j*n].i = 0.;
       }
    }
    
    for(i=0; i<n; i++) {
-      matrix_diag[i+i*n] = vector_r[i] + vector_i[i]*I;
+      matrix_diag[i+i*n].r = vector_r[i];
+      matrix_diag[i+i*n].i = vector_i[i];
    }   
 }
 
@@ -408,7 +397,7 @@ void print_real_matrix(char* desc, double *mat, int m, int n, int ldv )
   }
 }
 
-void print_complex_matrix(char* desc, double complex *mat, int m, int n, int ldv ) 
+void print_complex_matrix(char* desc, doublecomplex *mat, int m, int n, int ldv ) 
 {
    int i, j;
    double real, imag;
@@ -419,8 +408,8 @@ void print_complex_matrix(char* desc, double complex *mat, int m, int n, int ldv
    printf( "\n %s\n", desc );
    for(i=0; i<m; i++) {
       for(j=0; j<n; j++) {
-         real = creal(mat[i+j*ldv]);
-         imag = cimag(mat[i+j*ldv]);
+         real = mat[i+j*ldv].r;
+         imag = mat[i+j*ldv].i;
          if(imag >= 0) {
             printf( " %10.5f + %10.5fi", real, imag);
          }
@@ -432,27 +421,27 @@ void print_complex_matrix(char* desc, double complex *mat, int m, int n, int ldv
   }
 }
 
-double compute_evd_error(double complex *matrix_reconstruct, double *matrix_original, int n, int ld)
+doublereal compute_evd_error(doublecomplex *matrix_reconstruct, doublereal *matrix_original, int n, int ld)
 {
    int i, j;
-   double err_real, err_imag;
-   double abs_err;
-   double max_err = 0.;
+   doublereal err_real, err_imag;
+   doublereal abs_err;
+   doublereal max_abs_err = 0.;
 
    for(i=0; i<n; i++) {
       for(j=0; j<n; j++) {
-        err_real = creal(matrix_reconstruct[i+j*ld]) - matrix_original[i+j*ld];
-        err_imag = cimag(matrix_reconstruct[i+j*ld]);
+           err_real = matrix_reconstruct[i+j*ld].r - matrix_original[i+j*ld];
+           err_imag = matrix_reconstruct[i+j*ld].i;
         
         abs_err = sqrt(err_real*err_real + err_imag*err_imag);
         
-        if(abs_err > max_err) {
-          max_err = abs_err;
+        if(abs_err > max_abs_err) {
+          max_abs_err = abs_err;
         }
       }
    }
    
-   return max_err;
+   return max_abs_err;
 }
 
 void print_performance(char * func_name, int *matrix_sizes, float *time, int num_test, int num_run)
diff --git a/examples/sgemm_tune/Makefile b/examples/sgemm_tune/Makefile
new file mode 100644 (file)
index 0000000..569cae6
--- /dev/null
@@ -0,0 +1,10 @@
+
+EXE = sgemm_tune
+
+include ../make.inc
+
+$(EXE): sgemm_tune.o
+       $(CC) $(CFLAGS) sgemm_tune.o $(BLASLIB) -o $@
+
+tune: $(EXE)
+       ./$(EXE);
\ No newline at end of file
diff --git a/examples/sgemm_tune/sgemm_tune.c b/examples/sgemm_tune/sgemm_tune.c
new file mode 100644 (file)
index 0000000..05ffc7c
--- /dev/null
@@ -0,0 +1,331 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions are met:
+ *       * Redistributions of source code must retain the above copyright
+ *         notice, this list of conditions and the following disclaimer.
+ *       * Redistributions in binary form must reproduce the above copyright
+ *         notice, this list of conditions and the following disclaimer in the
+ *         documentation and/or other materials provided with the distribution.
+ *       * Neither the name of Texas Instruments Incorporated nor the
+ *         names of its contributors may be used to endorse or promote products
+ *         derived from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ *   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ *   THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <time.h>
+
+#include "cblas.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+#include "cblas.h"
+#ifdef __cplusplus
+}
+#endif
+
+#define TUNING_START_SIZE_SQUARE_MATRIX 16
+#define TUNING_START_SIZE_RECTAN_MATRIX 8
+#define NUM_MATRIX_SIZE_TO_BENCHMARK    8 //16
+#define HAS_MEMORY   1
+#define NO_MEMORY    0
+#define OFFLOAD      1
+#define NO_OFFLOAD   0
+
+#define NUM_TEST_RUN 5
+
+
+/*-----------------------------------------------------------------------------
+* Timing Setup
+*----------------------------------------------------------------------------*/
+struct timespec t0,t1;
+#define tick()  clock_gettime(CLOCK_MONOTONIC, &t0);
+#define tock() (clock_gettime(CLOCK_MONOTONIC, &t1), \
+                        t1.tv_sec - t0.tv_sec + (t1.tv_nsec - t0.tv_nsec) / 1e9)
+
+/*-----------------------------------------------------------------------------
+* Global Variables
+*----------------------------------------------------------------------------*/
+float alpha           = 0.7; 
+float beta            = 0.3;
+enum CBLAS_ORDER     order  = CblasColMajor; 
+//enum CBLAS_ORDER     order  = CblasRowMajor;
+enum CBLAS_TRANSPOSE transA = CblasNoTrans;
+enum CBLAS_TRANSPOSE transB = CblasNoTrans;
+
+extern int TI_CBLAS_L3_OFFLOAD;
+
+/*-----------------------------------------------------------------------------
+* Prototypes
+*----------------------------------------------------------------------------*/
+int check_results(const float *C1, const float *C2, int M, int N);
+int run_sgemm_dsp_and_arm(int M, int N, int K, float *time_dsp, float *time_arm, 
+                          float *gflops_dsp, float *gflops_arm);
+
+/*-----------------------------------------------------------------------------
+* MAIN
+*----------------------------------------------------------------------------*/
+int main()
+{
+    int num_size, sgemm_err;
+    int M, N, K, m, n, k;
+    int M_pre, N_pre, K_pre, M_start_size, N_start_size;
+    int offload_threshold_1, offload_threshold_2;
+    float total_GFLOPS_DSP, total_GFLOPS_ARM;
+    float time_DSP, time_ARM, t_dsp, t_arm;
+    char ofld_flag[NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK];
+    char mem_flag[NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK];
+    int skip_next_point;
+    float diff_tmp, diff_pre;
+    FILE *fp_flag, *fp_time, *fp_tbl;  
+  
+    fp_flag = fopen("ofld_flag_sgemm.dat","w");
+    fp_tbl  = fopen("ofld_tbl_sgemm.c","w");
+    fp_time = fopen("sgemm_time_ARMvsDSP.dat","w");
+
+    fprintf(fp_tbl, "char ofld_tbl_sgemm[TI_L3_OFFLOAD_TBL_SIZE] = {\n");
+    
+    srand(12345);
+    
+    /* sweep M, K, and N */    
+    for (M=TUNING_START_SIZE_RECTAN_MATRIX,m=0; m<NUM_MATRIX_SIZE_TO_BENCHMARK; m++,M*=2) 
+    {
+        for (N=TUNING_START_SIZE_RECTAN_MATRIX,n=0; n<NUM_MATRIX_SIZE_TO_BENCHMARK; n++,N*=2) 
+        {    
+            for (K=TUNING_START_SIZE_RECTAN_MATRIX,k=0; k<NUM_MATRIX_SIZE_TO_BENCHMARK; k++,K*=2) 
+            {
+                if(  (m>0 && ofld_flag[m-1][n][k]==OFFLOAD)
+                   ||(n>0 && ofld_flag[m][n-1][k]==OFFLOAD)
+                   ||(k>0 && ofld_flag[m][n][k-1]==OFFLOAD) ) {
+                    ofld_flag[m][n][k] = OFFLOAD;
+                    mem_flag[m][n][k]  = HAS_MEMORY;  // to avoid error
+                    time_DSP = -1.0;
+                    time_ARM = -1.0;
+                    printf("Offloading. Skipping (M,N,K)=(%d,%d,%d), (m,n,k)=(%d,%d,%d).\n", M,N,K,m,n,k);
+                }
+                else if(  (m>0 && (mem_flag[m-1][n][k]==NO_MEMORY))
+                        ||(n>0 && (mem_flag[m][n-1][k]==NO_MEMORY))
+                        ||(k>0 && (mem_flag[m][n][k-1]==NO_MEMORY))) {
+                    ofld_flag[m][n][k] = NO_OFFLOAD;
+                    mem_flag[m][n][k]  = NO_MEMORY;
+                    time_DSP = -2.0;
+                    time_ARM = -2.0;
+                    printf("Out of memory. Skipping (M,N,K)=(%d,%d,%d), (m,n,k)=(%d,%d,%d).\n", M,N,K,m,n,k);
+                }
+                else {                    
+                    printf("Measuring DSP and ARM GFLOPS for (M,N,K)=(%d,%d,%d), (m,n,k)=(%d,%d,%d).\n", M,N,K,m,n,k);
+                    sgemm_err = run_sgemm_dsp_and_arm(M, N, K, &t_dsp, &t_arm, &total_GFLOPS_DSP, &total_GFLOPS_ARM);
+              
+                    if(sgemm_err == -1) {  /* out of memory for DSP offloading */
+                        ofld_flag[m][n][k] = NO_OFFLOAD;
+                        mem_flag[m][n][k] = NO_MEMORY;
+                        time_DSP = -2.0;
+                        time_ARM = -2.0;
+                        printf("Out of memory, skipping next point.\n");
+                    }
+                    else {
+                        mem_flag[m][n][k] = HAS_MEMORY;                    
+                        time_DSP = t_dsp;
+                        time_ARM = t_arm;
+                        if (sgemm_err == 0){
+                            //if(total_GFLOPS_DSP - total_GFLOPS_ARM > 1.0) {
+                            if(t_dsp < t_arm) {
+                                ofld_flag[m][n][k] = OFFLOAD;
+                                printf("Offloading to DSP for this point. Skipping next point.\n");
+                            }
+                            else {
+                                ofld_flag[m][n][k] = NO_OFFLOAD;
+                            }
+                        }
+                        else {
+                            printf("Error in SGEMM tuning for (M,N,K)=(%d,%d,%d)!\n", M,N,K);
+                            exit(0);
+                        }
+                    }
+                }
+                
+                fprintf(fp_flag, "%d\t%d\n", (int)mem_flag[m][n][k], (int)ofld_flag[m][n][k]);
+                fprintf(fp_time, "%6d,%6d,%6d\t%10.8e\t%10.8e\n", M, N, K, time_ARM, time_DSP);
+
+                if(   (m==(NUM_MATRIX_SIZE_TO_BENCHMARK-1))
+                   && (n==(NUM_MATRIX_SIZE_TO_BENCHMARK-1))
+                   && (k==(NUM_MATRIX_SIZE_TO_BENCHMARK-1))) {
+                  fprintf(fp_tbl, "%d};", (int)ofld_flag[m][n][k]);
+                } else {
+                  fprintf(fp_tbl, "%d,", (int)ofld_flag[m][n][k]);
+                }
+            }
+            fprintf(fp_tbl, "\n");
+        }
+    }
+       
+    fclose(fp_flag);
+    fclose(fp_time);
+    fclose(fp_tbl);
+    
+    return 0;
+}
+
+
+int run_sgemm_dsp_and_arm(int M, int N, int K, float *time_dsp, float *time_arm, 
+                          float *gflops_dsp, float *gflops_arm)
+{
+    int    iter;
+    long long i;
+    float time_secs, total_time_dsp, total_time_arm;
+    float gflops_ARM, gflops_DSP;
+    float operation_count = 2.0*(float)M*(float)N*(float)K;
+    float total_GFLOPS_DSP = 0.0f;
+    float total_GFLOPS_ARM = 0.0f;
+    int    err_code = 0;
+    
+    total_time_dsp = 0.0;
+    total_time_arm = 0.0;
+    for (iter = 0; iter < NUM_TEST_RUN; iter++)
+    {      
+      /*-------------------------------------------------------------------------
+      * Allocate space for the matrices.  The matrices that will be passed to 
+      * the DSP are allocated using device memory.  The Carm array is not passed
+      * to the dsp and so can use system memory.
+      *------------------------------------------------------------------------*/
+      float *A    = (float *) __malloc_ddr((long long)M*(long long)K*(long long)sizeof(float));
+      float *B    = (float *) __malloc_ddr((long long)K*(long long)N*(long long)sizeof(float));
+      float *Cdsp = (float *) __malloc_ddr((long long)M*(long long)N*(long long)sizeof(float));
+      float *Carm = (float *) malloc      ((long long)M*(long long)N*(long long)sizeof(float));
+  
+      if (!A || !B || !Cdsp || !Carm)
+      {
+          printf("Could not allocate enough space for the arrays!");
+          if(A) __free_ddr(A);
+          if(B) __free_ddr(B);
+          if(Cdsp) __free_ddr(Cdsp);
+          if(Carm) free(Carm);
+          
+          return (-1);
+      }
+  
+      /*-------------------------------------------------------------------------
+      * Initialize matrices 
+      *------------------------------------------------------------------------*/
+      for (i = 0; i < (long long)M*K; ++i) A[i] = (float)rand()/RAND_MAX;
+      for (i = 0; i < (long long)K*N; ++i) B[i] = (float)rand()/RAND_MAX;
+      for (i = 0; i < (long long)M*N; ++i) Carm[i] = Cdsp[i] = 0;
+  
+      int lda = ((order == CblasColMajor && transA == CblasNoTrans) ||
+              (order == CblasRowMajor && transA == CblasTrans)) ? M : K;
+  
+      int ldb = ((order == CblasColMajor && transB == CblasNoTrans) ||
+              (order == CblasRowMajor && transB == CblasTrans)) ? K : N;
+  
+      int ldc = (order == CblasColMajor) ? M : N;
+    
+      /*============ BLAS tuning: running on DSP and then on ARM =============*/
+      /*------------------------------------------------------------------------
+      * Time DSP sgemm
+      *-----------------------------------------------------------------------*/
+      //ti_cblas_offload_config("001");  /* force offloading level 3 to DSP */
+      //printf("Running on DSP.\n");
+      TI_CBLAS_L3_OFFLOAD = 1;
+      
+      tick();
+      cblas_sgemm(order,transA,transB,M,N,K,alpha,A,lda,B,ldb,beta,Cdsp,ldc);
+      time_secs = tock();
+      total_time_dsp += time_secs;
+      gflops_DSP = operation_count/time_secs*1e-9;
+      total_GFLOPS_DSP += gflops_DSP;
+/*      
+      if(M==4096 && K==256 && N==16) {
+          FILE *file_a = fopen("mat_a.dat","w");
+          FILE *file_b = fopen("mat_b.dat","w");
+          FILE *file_c = fopen("mat_c.dat","w");
+          
+          for(i=0; i < M*K; ++i) fprintf(file_a, "%1.10e\n",A[i]);
+          for(i=0; i < K*N; ++i) fprintf(file_b, "%1.10e\n",B[i]);
+          for(i=0; i < M*N; ++i) fprintf(file_c, "%1.10e\n",Cdsp[i]);
+      }
+*/      
+      /*-------------------------------------------------------------------------
+      * Time ARM sgemm
+      *------------------------------------------------------------------------*/
+      //ti_cblas_offload_config("000");  /* force no offloading */
+      //printf("Running on ARM.\n");
+      TI_CBLAS_L3_OFFLOAD = 0;
+      
+      tick();
+      cblas_sgemm(order,transA,transB,M,N,K,alpha,A,lda,B,ldb,beta,Carm,ldc);
+      time_secs = tock();
+      total_time_arm += time_secs;
+      gflops_ARM = operation_count/time_secs*1e-9;
+      total_GFLOPS_ARM += gflops_ARM;
+      //printf(" %6.3f  %6.3f  %9.6fs %9.6fs\n", gflops_DSP, gflops_ARM, time_dsp, time_arm);
+      fflush(stdout);
+  
+      /*-------------------------------------------------------------------------
+      * Verify Results
+      *------------------------------------------------------------------------*/
+      //return check_results(Cdsp, Carm, M, N);
+      err_code += check_results(Cdsp, Carm, M, N);
+  
+      __free_ddr(A);
+      __free_ddr(B);
+      __free_ddr(Cdsp);
+      free(Carm);
+    }
+    
+    *gflops_dsp = total_GFLOPS_DSP;
+    *gflops_arm = total_GFLOPS_ARM;
+    *time_dsp   = total_time_dsp / (float)NUM_TEST_RUN;
+    *time_arm   = total_time_arm / (float)NUM_TEST_RUN;
+    
+    return err_code;
+}
+
+
+/*-----------------------------------------------------------------------------
+* check_results
+*----------------------------------------------------------------------------*/
+int check_results(const float *C1, const float *C2, int M, int N)
+{
+    int i;
+    const float EPISILON = 1e-5;
+    //const float EPISILON = 1e-200;
+    const int NERRORS  = 5;
+    int       num_errors = 0;
+
+    for (i=0; i<(long)M*N; i++)
+    {
+        float delta = fabs(C1[i] - C2[i]);
+
+        if (delta > EPISILON*fabs(C1[i]))
+            if ((num_errors += 1) < NERRORS)
+                printf("Error [elem:%d]: %e <==> %e\n", i, C1[i], C2[i]);
+    }
+
+    if (num_errors > 0)
+    {
+         printf("FAIL with %d errors!\n", num_errors);
+         return num_errors;
+    }
+    else 
+    {
+        //printf("PASS!\n");
+        return 0;
+    }
+}
+
+
diff --git a/examples/zgemm_tune/Makefile b/examples/zgemm_tune/Makefile
new file mode 100644 (file)
index 0000000..ffb2cc4
--- /dev/null
@@ -0,0 +1,10 @@
+
+EXE = zgemm_tune
+
+include ../make.inc
+
+$(EXE): zgemm_tune.o
+       $(CC) $(CFLAGS) zgemm_tune.o $(BLASLIB) -o $@
+
+tune: $(EXE)
+       ./$(EXE);
\ No newline at end of file
diff --git a/examples/zgemm_tune/zgemm_tune.c b/examples/zgemm_tune/zgemm_tune.c
new file mode 100644 (file)
index 0000000..a8e83d9
--- /dev/null
@@ -0,0 +1,325 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions are met:
+ *       * Redistributions of source code must retain the above copyright
+ *         notice, this list of conditions and the following disclaimer.
+ *       * Redistributions in binary form must reproduce the above copyright
+ *         notice, this list of conditions and the following disclaimer in the
+ *         documentation and/or other materials provided with the distribution.
+ *       * Neither the name of Texas Instruments Incorporated nor the
+ *         names of its contributors may be used to endorse or promote products
+ *         derived from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ *   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ *   THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <time.h>
+#include <complex.h>
+
+#include "cblas.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+#include "cblas.h"
+#ifdef __cplusplus
+}
+#endif
+
+#define TUNING_START_SIZE_SQUARE_MATRIX 16
+#define TUNING_START_SIZE_RECTAN_MATRIX 8
+#define NUM_MATRIX_SIZE_TO_BENCHMARK    8 //16
+#define HAS_MEMORY   1
+#define NO_MEMORY    0
+#define OFFLOAD      1
+#define NO_OFFLOAD   0
+
+#define NUM_TEST_RUN 5
+
+/*-----------------------------------------------------------------------------
+* Timing Setup
+*----------------------------------------------------------------------------*/
+struct timespec t0,t1;
+#define tick()  clock_gettime(CLOCK_MONOTONIC, &t0);
+#define tock() (clock_gettime(CLOCK_MONOTONIC, &t1), \
+                        t1.tv_sec - t0.tv_sec + (t1.tv_nsec - t0.tv_nsec) / 1e9)
+
+/*-----------------------------------------------------------------------------
+* Global Variables
+*----------------------------------------------------------------------------*/
+double complex alpha = 0.7 - 0.3*I; 
+double complex beta  = 0.4 + 0.6*I;
+enum CBLAS_ORDER     order  = CblasColMajor; 
+enum CBLAS_TRANSPOSE transA = CblasNoTrans;
+enum CBLAS_TRANSPOSE transB = CblasNoTrans;
+
+extern int TI_CBLAS_L3_OFFLOAD;
+
+/*-----------------------------------------------------------------------------
+* Prototypes
+*----------------------------------------------------------------------------*/
+int check_results(const double complex *C1, const double complex *C2, int M, int N);
+int run_zgemm_dsp_and_arm(int M, int N, int K, float *time_dsp, float *time_arm, 
+                          float *gflops_dsp, float *gflops_arm);
+
+/*-----------------------------------------------------------------------------
+* MAIN
+*----------------------------------------------------------------------------*/
+int main()
+{
+    int num_size, zgemm_err;
+    int M, N, K, m, n, k;
+    int M_pre, N_pre, K_pre, M_start_size, N_start_size;
+    int offload_threshold_1, offload_threshold_2;
+    float total_GFLOPS_DSP, total_GFLOPS_ARM;
+    float time_DSP, time_ARM, t_dsp, t_arm;
+    char ofld_flag[NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK];
+    char mem_flag[NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK];
+    int skip_next_point;
+    FILE *fp_flag, *fp_time, *fp_tbl;  
+  
+    fp_flag = fopen("ofld_flag_zgemm.dat","w");
+    fp_tbl  = fopen("ofld_tbl_zgemm.c","w");
+    fp_time = fopen("zgemm_time_ARMvsDSP.dat","w");
+
+    fprintf(fp_tbl, "char ofld_tbl_zgemm[TI_L3_OFFLOAD_TBL_SIZE] = {\n");
+    
+    srand(12345);
+    
+    /* sweep M, K, and N */    
+    for (M=TUNING_START_SIZE_RECTAN_MATRIX,m=0; m<NUM_MATRIX_SIZE_TO_BENCHMARK; m++,M*=2) 
+    {
+        for (N=TUNING_START_SIZE_RECTAN_MATRIX,n=0; n<NUM_MATRIX_SIZE_TO_BENCHMARK; n++,N*=2) 
+        {
+            for (K=TUNING_START_SIZE_RECTAN_MATRIX,k=0; k<NUM_MATRIX_SIZE_TO_BENCHMARK; k++,K*=2) 
+            {
+                if(  (m>0 && ofld_flag[m-1][n][k]==OFFLOAD)
+                   ||(n>0 && ofld_flag[m][n-1][k]==OFFLOAD)
+                   ||(k>0 && ofld_flag[m][n][k-1]==OFFLOAD) ) {
+                    ofld_flag[m][n][k] = OFFLOAD;
+                    mem_flag[m][n][k]  = HAS_MEMORY;  // to avoid error
+                    time_DSP = -1.0;
+                    time_ARM = -1.0;
+                    printf("Offloading. Skipping (M,N,K)=(%d,%d,%d), (m,n,k)=(%d,%d,%d).\n", M,N,K,m,n,k);
+                }
+                else if(  (m>0 && (mem_flag[m-1][n][k]==NO_MEMORY))
+                        ||(n>0 && (mem_flag[m][n-1][k]==NO_MEMORY))
+                        ||(k>0 && (mem_flag[m][n][k-1]==NO_MEMORY))) {
+                    ofld_flag[m][n][k] = NO_OFFLOAD;
+                    mem_flag[m][n][k]  = NO_MEMORY;
+                    time_DSP = -2.0;
+                    time_ARM = -2.0;
+                    printf("Out of memory. Skipping (M,N,K)=(%d,%d,%d), (m,n,k)=(%d,%d,%d).\n", M,N,K,m,n,k);
+                }
+                else {                    
+                    printf("Measuring DSP and ARM GFLOPS for (M,N,K)=(%d,%d,%d), (m,n,k)=(%d,%d,%d).\n", M,N,K,m,n,k);
+                    zgemm_err = run_zgemm_dsp_and_arm(M, N, K, &t_dsp, &t_arm, &total_GFLOPS_DSP, &total_GFLOPS_ARM);
+                    //dsym_err  = run_dsymm_dsp_and_arm();
+              
+                    if(zgemm_err == -1) {  /* out of memory for DSP offloading */
+                        ofld_flag[m][n][k] = NO_OFFLOAD;
+                        mem_flag[m][n][k] = NO_MEMORY;
+                        time_DSP = -2.0;
+                        time_ARM = -2.0;
+                        printf("Out of memory, skipping next point.\n");
+                    }
+                    else {
+                        mem_flag[m][n][k] = HAS_MEMORY;                    
+                        time_DSP = t_dsp;
+                        time_ARM = t_arm;
+                        if (zgemm_err == 0){
+                            //if(total_GFLOPS_DSP - total_GFLOPS_ARM > 1.0) {
+                            if(t_dsp < t_arm) {
+                                ofld_flag[m][n][k] = OFFLOAD;
+                                printf("Offloading to DSP for this point. Skipping next point.\n");
+                            }
+                            else {
+                                ofld_flag[m][n][k] = NO_OFFLOAD;
+                            }
+                        }
+                        else {
+                            printf("Error in DGEMM tuning for (M,N,K)=(%d,%d,%d)!\n", M,N,K);
+                            exit(0);
+                        }
+                    }
+                }
+                
+                fprintf(fp_flag, "%d\t%d\n", (int)mem_flag[m][n][k], (int)ofld_flag[m][n][k]);
+                fprintf(fp_time, "%6d,%6d,%6d\t%10.8e\t%10.8e\n", M, N, K, time_ARM, time_DSP);
+
+                if(   (m==(NUM_MATRIX_SIZE_TO_BENCHMARK-1))
+                   && (n==(NUM_MATRIX_SIZE_TO_BENCHMARK-1))
+                   && (k==(NUM_MATRIX_SIZE_TO_BENCHMARK-1))) {
+                  fprintf(fp_tbl, "%d};", (int)ofld_flag[m][n][k]);
+                } else {
+                  fprintf(fp_tbl, "%d,", (int)ofld_flag[m][n][k]);
+                }
+            }
+            fprintf(fp_tbl, "\n");
+        }
+    }
+        
+    fclose(fp_flag);
+    fclose(fp_time);
+    fclose(fp_tbl);
+    
+    return 0;
+}
+
+
+int run_zgemm_dsp_and_arm(int M, int N, int K, float *time_dsp, float *time_arm, 
+                          float *gflops_dsp, float *gflops_arm)
+{
+    int    iter;
+    long long i;
+    float time_secs, total_time_dsp, total_time_arm;
+    float gflops_ARM, gflops_DSP;
+    float operation_count = 2.0*(float)M*(float)N*(float)K;
+    float total_GFLOPS_DSP = 0.0f;
+    float total_GFLOPS_ARM = 0.0f;
+    int    err_code = 0;
+    
+    total_time_dsp = 0.0;
+    total_time_arm = 0.0;
+    for (iter = 0; iter < NUM_TEST_RUN; iter++)
+    {      
+      /*-------------------------------------------------------------------------
+      * Allocate space for the matrices.  The matrices that will be passed to 
+      * the DSP are allocated using device memory.  The Carm array is not passed
+      * to the dsp and so can use system memory.
+      *------------------------------------------------------------------------*/
+      double complex *A    = (double complex*) __malloc_ddr(M*K*sizeof(double complex));
+      double complex *B    = (double complex*) __malloc_ddr(K*N*sizeof(double complex));
+      double complex *Cdsp = (double complex*) __malloc_ddr(M*N*sizeof(double complex));
+      double complex *Carm = (double complex*) malloc      (M*N*sizeof(double complex));
+  
+      if (!A || !B || !Cdsp || !Carm)
+      {
+          printf("Could not allocate enough space for the arrays!");
+          if(A) __free_ddr(A);
+          if(B) __free_ddr(B);
+          if(Cdsp) __free_ddr(Cdsp);
+          if(Carm) free(Carm);
+          
+          return (-1);
+      }
+  
+      /*-------------------------------------------------------------------------
+      * Initialize matrices and print if small enough.
+      *------------------------------------------------------------------------*/
+      for (i = 0; i < M*K; ++i) 
+      {
+          A[i] = (double)rand()/RAND_MAX + (double)rand()/RAND_MAX * I;
+      }
+      for (i = 0; i < K*N; ++i)
+      {
+          B[i] = (double)rand()/RAND_MAX + (double)rand()/RAND_MAX * I;
+      }
+      for (i = 0; i < M*N; ++i)
+      {
+          Carm[i] = Cdsp[i] = (double)rand()/RAND_MAX + (double)rand()/RAND_MAX * I;
+      }  
+  
+      int lda = ((order == CblasColMajor && transA == CblasNoTrans) ||
+              (order == CblasRowMajor && transA == CblasTrans)) ? M : K;
+  
+      int ldb = ((order == CblasColMajor && transB == CblasNoTrans) ||
+              (order == CblasRowMajor && transB == CblasTrans)) ? K : N;
+  
+      int ldc = (order == CblasColMajor) ? M : N;
+  
+      /*============ BLAS tuning: running on DSP and then on ARM =============*/
+      /*------------------------------------------------------------------------
+      * Time DSP zgemm
+      *-----------------------------------------------------------------------*/
+      //ti_cblas_offload_config("001");  /* force offloading level 3 to DSP */
+      TI_CBLAS_L3_OFFLOAD = 1;
+      
+      tick();
+      cblas_zgemm(order,transA,transB,M,N,K,&alpha,A,lda,B,ldb,&beta,Cdsp,ldc);
+      time_secs = tock();
+      total_time_dsp += time_secs;
+      gflops_DSP = operation_count/time_secs*1e-9;
+      total_GFLOPS_DSP += gflops_DSP;
+      
+      /*-------------------------------------------------------------------------
+      * Time ARM zgemm
+      *------------------------------------------------------------------------*/
+      //ti_cblas_offload_config("000");  /* force no offloading */
+      TI_CBLAS_L3_OFFLOAD = 0;
+      
+      tick();
+      cblas_zgemm(order,transA,transB,M,N,K,&alpha,A,lda,B,ldb,&beta,Carm,ldc);
+      time_secs = tock();
+      total_time_arm += time_secs;
+      gflops_ARM = operation_count/time_secs*1e-9;
+      total_GFLOPS_ARM += gflops_ARM;
+      //printf(" %6.3f  %6.3f  %9.6fs %9.6fs\n", gflops_DSP, gflops_ARM, time_dsp, time_arm);
+  
+      /*-------------------------------------------------------------------------
+      * Verify Results
+      *------------------------------------------------------------------------*/
+      //return check_results(Cdsp, Carm, M, N);
+      err_code += check_results(Cdsp, Carm, M, N);
+  
+      __free_ddr(A);
+      __free_ddr(B);
+      __free_ddr(Cdsp);
+      free(Carm);
+    }
+    
+    *gflops_dsp = total_GFLOPS_DSP;
+    *gflops_arm = total_GFLOPS_ARM;
+    *time_dsp   = total_time_dsp / (float)NUM_TEST_RUN;
+    *time_arm   = total_time_arm / (float)NUM_TEST_RUN;
+
+    return err_code;
+}
+
+/*-----------------------------------------------------------------------------
+* check_results
+*----------------------------------------------------------------------------*/
+int check_results(const double complex *C1, const double complex *C2, int M, int N)
+{
+    int i;
+    const double EPISILON = 1e-10;
+    //const double EPISILON = 1e-200;
+    const int NERRORS  = 5;
+    int       num_errors = 0;
+
+    for (i=0; i<M*N; i++)
+    {
+        double delta = fabs(cabs(C1[i]) - cabs(C2[i]));
+
+        if (delta > EPISILON*cabs(C1[i]))
+            if ((num_errors += 1) < NERRORS)
+                printf("Error [elem:%d]: %f <==> %f\n", i, cabs(C1[i]), cabs(C2[i]));
+    }
+
+    if (num_errors > 0)
+    {
+         printf("FAIL with %d errors!\n", num_errors);
+         return -1;
+    }
+    else 
+    {
+        //printf("PASS!\n");
+        return 0;
+    }
+}
+
+