Build tuning code.
[dense-linear-algebra-libraries/linalg.git] / src / ti / linalg / tuning / dgemm_tune / dgemm_tune.c
1 /******************************************************************************
2  * Copyright (c) 2013-2015, Texas Instruments Incorporated - http://www.ti.com/
3  *   All rights reserved.
4  *
5  *   Redistribution and use in source and binary forms, with or without
6  *   modification, are permitted provided that the following conditions are met:
7  *       * Redistributions of source code must retain the above copyright
8  *         notice, this list of conditions and the following disclaimer.
9  *       * Redistributions in binary form must reproduce the above copyright
10  *         notice, this list of conditions and the following disclaimer in the
11  *         documentation and/or other materials provided with the distribution.
12  *       * Neither the name of Texas Instruments Incorporated nor the
13  *         names of its contributors may be used to endorse or promote products
14  *         derived from this software without specific prior written permission.
15  *
16  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17  *   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  *   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  *   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20  *   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21  *   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22  *   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23  *   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24  *   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25  *   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
26  *   THE POSSIBILITY OF SUCH DAMAGE.
27  *****************************************************************************/
28 #include <stdlib.h>
29 #include <stdio.h>
30 #include <math.h>
31 #include <time.h>
32 #include "../common/tune_com.h"
34 #include "cblas.h"
35 #ifdef __cplusplus
36 extern "C" {
37 #endif
38 #include "cblas.h"
39 #ifdef __cplusplus
40 }
41 #endif
43 /*-----------------------------------------------------------------------------
44 * Global Variables
45 *----------------------------------------------------------------------------*/
46 double alpha           = 0.7; 
47 double beta            = 0.3;
48 enum CBLAS_ORDER     order  = CblasColMajor; 
49 //enum CBLAS_ORDER     order  = CblasRowMajor;
50 enum CBLAS_TRANSPOSE transA = CblasNoTrans;
51 enum CBLAS_TRANSPOSE transB = CblasNoTrans;
53 extern int TI_CBLAS_L3_OFFLOAD;
54 /*-----------------------------------------------------------------------------
55 * Prototypes
56 *----------------------------------------------------------------------------*/
57 int check_results(const double *C1, const double *C2, int M, int N);
58 int run_dgemm_dsp_and_arm(int M, int N, int K, float *time_dsp, float *time_arm, 
59                           float *gflops_dsp, float *gflops_arm);
61 /*-----------------------------------------------------------------------------
62 * MAIN
63 *----------------------------------------------------------------------------*/
64 int main()
65 {
66     int num_size, dgemm_err;
67     int M, N, K, m, n, k;
68     int M_pre, N_pre, K_pre, M_start_size, N_start_size;
69     int offload_threshold_1, offload_threshold_2;
70     float total_GFLOPS_DSP, total_GFLOPS_ARM;
71     float time_DSP, time_ARM, t_dsp, t_arm;
72     char ofld_flag[NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK];
73     char mem_flag[NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK];
74     int skip_next_point;
75     float diff_tmp, diff_pre;
76     FILE *fp_flag, *fp_time, *fp_tbl;  
77   
78     fp_flag = fopen("ofld_flag_dgemm.dat","w");
79     fp_tbl  = fopen("ofld_tbl_dgemm.c","w");
80     fp_time = fopen("dgemm_time_ARMvsDSP.dat","w");
82     print_file_header(fp_tbl);
83     fprintf(fp_tbl, "char ofld_tbl_dgemm[GEMM_OFFLOAD_TBL_SIZE] = {\n");
84     
85     srand(12345);
86     
87     /* sweep M, K, and N */    
88     for (M=TUNING_START_SIZE_RECTAN_MATRIX,m=0; m<NUM_MATRIX_SIZE_TO_BENCHMARK; m++,M*=2) 
89     {
90         for (N=TUNING_START_SIZE_RECTAN_MATRIX,n=0; n<NUM_MATRIX_SIZE_TO_BENCHMARK; n++,N*=2) 
91         {
92             for (K=TUNING_START_SIZE_RECTAN_MATRIX,k=0; k<NUM_MATRIX_SIZE_TO_BENCHMARK; k++,K*=2) 
93             {
94                 if(  (m>0 && ofld_flag[m-1][n][k]==OFFLOAD)
95                    ||(n>0 && ofld_flag[m][n-1][k]==OFFLOAD)
96                    ||(k>0 && ofld_flag[m][n][k-1]==OFFLOAD) ) {
97                     ofld_flag[m][n][k] = OFFLOAD;
98                     mem_flag[m][n][k]  = HAS_MEMORY;  // to avoid error
99                     time_DSP = -1.0;
100                     time_ARM = -1.0;
101                     printf("Offloading. Skipping (M,N,K)=(%d,%d,%d), (m,n,k)=(%d,%d,%d).\n", M,N,K,m,n,k);
102                 }
103                 else if(  (m>0 && (mem_flag[m-1][n][k]==NO_MEMORY))
104                         ||(n>0 && (mem_flag[m][n-1][k]==NO_MEMORY))
105                         ||(k>0 && (mem_flag[m][n][k-1]==NO_MEMORY))) {
106                     ofld_flag[m][n][k] = NO_OFFLOAD;
107                     mem_flag[m][n][k]  = NO_MEMORY;
108                     time_DSP = -2.0;
109                     time_ARM = -2.0;
110                     printf("Out of memory. Skipping (M,N,K)=(%d,%d,%d), (m,n,k)=(%d,%d,%d).\n", M,N,K,m,n,k);
111                 }
112                 else {                    
113                     printf("Measuring DSP and ARM GFLOPS for (M,N,K)=(%d,%d,%d), (m,n,k)=(%d,%d,%d).\n", M,N,K,m,n,k);
114                     dgemm_err = run_dgemm_dsp_and_arm(M, N, K, &t_dsp, &t_arm, &total_GFLOPS_DSP, &total_GFLOPS_ARM);
115                     //dsym_err  = run_dsymm_dsp_and_arm();
116               
117                     if(dgemm_err == -1) {  /* out of memory for DSP offloading */
118                         ofld_flag[m][n][k] = NO_OFFLOAD;
119                         mem_flag[m][n][k] = NO_MEMORY;
120                         time_DSP = -2.0;
121                         time_ARM = -2.0;
122                         printf("Out of memory, skipping next point.\n");
123                     }
124                     else {
125                         mem_flag[m][n][k] = HAS_MEMORY;                    
126                         time_DSP = t_dsp;
127                         time_ARM = t_arm;
128                         if (dgemm_err == 0){
129                             //if(total_GFLOPS_DSP - total_GFLOPS_ARM > 1.0) {
130                             if(DSP_FASTER_THAN_ARM(t_dsp,t_arm)) {
131                                 ofld_flag[m][n][k] = OFFLOAD;
132                                 printf("Offloading to DSP for this point. Skipping next point.\n");
133                             }
134                             else {
135                                 ofld_flag[m][n][k] = NO_OFFLOAD;
136                             }
137                         }
138                         else {
139                             printf("Error in DGEMM tuning for (M,N,K)=(%d,%d,%d)!\n", M,N,K);
140                             exit(0);
141                         }
142                     }
143                 }
144                 
145                 fprintf(fp_flag, "%d\t%d\n", (int)mem_flag[m][n][k], (int)ofld_flag[m][n][k]);
146                 fprintf(fp_time, "%6d,%6d,%6d\t%10.8e\t%10.8e\n", M, N, K, time_ARM, time_DSP);
148                 if(   (m==(NUM_MATRIX_SIZE_TO_BENCHMARK-1))
149                    && (n==(NUM_MATRIX_SIZE_TO_BENCHMARK-1))
150                    && (k==(NUM_MATRIX_SIZE_TO_BENCHMARK-1))) {
151                   fprintf(fp_tbl, "%d};", (int)ofld_flag[m][n][k]);
152                 } else {
153                   fprintf(fp_tbl, "%d,", (int)ofld_flag[m][n][k]);
154                 }
155             }
156             fprintf(fp_tbl, "\n");
157         }
158     }
159         
160     fclose(fp_flag);
161     fclose(fp_time);
162     fclose(fp_tbl);
163     
164     return 0;
168 int run_dgemm_dsp_and_arm(int M, int N, int K, float *time_dsp, float *time_arm, 
169                           float *gflops_dsp, float *gflops_arm)
171     int    iter;
172     long long i;
173     float time_secs, total_time_dsp, total_time_arm;
174     float gflops_ARM, gflops_DSP;
175     float operation_count = 2.0*(double)M*(double)N*(double)K;
176     float total_GFLOPS_DSP = 0.0f;
177     float total_GFLOPS_ARM = 0.0f;
178     int    err_code = 0;
179     
180     total_time_dsp = 0.0;
181     total_time_arm = 0.0;
182     for (iter = 0; iter <= NUM_TEST_RUN; iter++)
183     {      
184       /*-------------------------------------------------------------------------
185       * Allocate space for the matrices.  The matrices that will be passed to 
186       * the DSP are allocated using device memory.  The Carm array is not passed
187       * to the dsp and so can use system memory.
188       *------------------------------------------------------------------------*/
189       double *A    = (double *) __malloc_ddr((long long)M*(long long)K*(long long)sizeof(double));
190       double *B    = (double *) __malloc_ddr((long long)K*(long long)N*(long long)sizeof(double));
191       double *Cdsp = (double *) __malloc_ddr((long long)M*(long long)N*(long long)sizeof(double));
192       double *Carm = (double *) malloc      ((long long)M*(long long)N*(long long)sizeof(double));
193   
194       if (!A || !B || !Cdsp || !Carm)
195       {
196           printf("Could not allocate enough space for the arrays!");
197           if(A) __free_ddr(A);
198           if(B) __free_ddr(B);
199           if(Cdsp) __free_ddr(Cdsp);
200           if(Carm) free(Carm);
201           
202           return (-1);
203       }
204   
205       /*-------------------------------------------------------------------------
206       * Initialize matrices 
207       *------------------------------------------------------------------------*/
208       for (i = 0; i < (long long)M*K; ++i) A[i] = (double)rand()/RAND_MAX;// (double)(rand() % 5 + 1);
209       for (i = 0; i < (long long)K*N; ++i) B[i] = (double)rand()/RAND_MAX;// (double)(rand() % 5 + 1);
210       for (i = 0; i < (long long)M*N; ++i) Carm[i] = Cdsp[i] = 0;
211   
212       int lda = ((order == CblasColMajor && transA == CblasNoTrans) ||
213               (order == CblasRowMajor && transA == CblasTrans)) ? M : K;
214   
215       int ldb = ((order == CblasColMajor && transB == CblasNoTrans) ||
216               (order == CblasRowMajor && transB == CblasTrans)) ? K : N;
217   
218       int ldc = (order == CblasColMajor) ? M : N;
219   
220       fflush(stdout);
221   
222       /*============ BLAS tuning: running on DSP and then on ARM =============*/
223       /*------------------------------------------------------------------------
224       * Time DSP dgemm
225       *-----------------------------------------------------------------------*/
226       printf("Running on DSP.\n");
227       TI_CBLAS_L3_OFFLOAD = 1;
228       
229       tick();
230       cblas_dgemm(order,transA,transB,M,N,K,alpha,A,lda,B,ldb,beta,Cdsp,ldc);
231       time_secs = tock();
232       if(iter > 0) { /* skip first iteration */
233         total_time_dsp += time_secs;
234         gflops_DSP = operation_count/time_secs*1e-9;
235         total_GFLOPS_DSP += gflops_DSP;
236       }
237 /*      
238       if(M==4096 && K==256 && N==16) {
239           FILE *file_a = fopen("mat_a.dat","w");
240           FILE *file_b = fopen("mat_b.dat","w");
241           FILE *file_c = fopen("mat_c.dat","w");
242           
243           for(i=0; i < M*K; ++i) fprintf(file_a, "%1.10e\n",A[i]);
244           for(i=0; i < K*N; ++i) fprintf(file_b, "%1.10e\n",B[i]);
245           for(i=0; i < M*N; ++i) fprintf(file_c, "%1.10e\n",Cdsp[i]);
246       }
247 */      
248       /*-------------------------------------------------------------------------
249       * Time ARM dgemm
250       *------------------------------------------------------------------------*/
251       printf("Running on ARM.\n");
252       TI_CBLAS_L3_OFFLOAD = 0;
253       
254       tick();
255       cblas_dgemm(order,transA,transB,M,N,K,alpha,A,lda,B,ldb,beta,Carm,ldc);
256       time_secs = tock();
257       if(iter > 0) { /* skip first iteration */
258         total_time_arm += time_secs;
259         gflops_ARM = operation_count/time_secs*1e-9;
260         total_GFLOPS_ARM += gflops_ARM;
261       }
262       //printf(" %6.3f  %6.3f  %9.6fs %9.6fs\n", gflops_DSP, gflops_ARM, time_dsp, time_arm);
263       fflush(stdout);
264   
265       /*-------------------------------------------------------------------------
266       * Verify Results
267       *------------------------------------------------------------------------*/
268       //return check_results(Cdsp, Carm, M, N);
269       err_code += check_results(Cdsp, Carm, M, N);
270   
271       __free_ddr(A);
272       __free_ddr(B);
273       __free_ddr(Cdsp);
274       free(Carm);
275     }
276     
277     *gflops_dsp = total_GFLOPS_DSP;
278     *gflops_arm = total_GFLOPS_ARM;
279     *time_dsp   = total_time_dsp / (double)NUM_TEST_RUN;
280     *time_arm   = total_time_arm / (double)NUM_TEST_RUN;
281     
282     return err_code;
286 /*-----------------------------------------------------------------------------
287 * check_results
288 *----------------------------------------------------------------------------*/
289 int check_results(const double *C1, const double *C2, int M, int N)
291     int i;
292     const double EPISILON = 1e-10;
293     //const double EPISILON = 1e-200;
294     const int NERRORS  = 5;
295     int       num_errors = 0;
297     for (i=0; i<(long)M*N; i++)
298     {
299         double delta = fabs(C1[i] - C2[i]);
300         if (delta > EPISILON*fabs(C1[i]))
301             if ((num_errors += 1) < NERRORS)
302                 printf("Error [elem:%d]: %e <==> %e\n", i, C1[i], C2[i]);
303     }
305     if (num_errors > 0)
306     {
307          printf("FAIL with %d errors!\n", num_errors);
308          return num_errors;
309     }
310     else 
311     {
312         //printf("PASS!\n");
313         return 0;
314     }