]> Gitweb @ Texas Instruments - Open Source Git Repositories - git.TI.com/gitweb - dense-linear-algebra-libraries/linalg.git/blob - tuning/ctrsm_tune/ctrsm_tune.c
Added tuning for xSYRK, xTRMM, and xTRSM.
[dense-linear-algebra-libraries/linalg.git] / tuning / ctrsm_tune / ctrsm_tune.c
1 /******************************************************************************
2  * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
3  *   All rights reserved.
4  *
5  *   Redistribution and use in source and binary forms, with or without
6  *   modification, are permitted provided that the following conditions are met:
7  *       * Redistributions of source code must retain the above copyright
8  *         notice, this list of conditions and the following disclaimer.
9  *       * Redistributions in binary form must reproduce the above copyright
10  *         notice, this list of conditions and the following disclaimer in the
11  *         documentation and/or other materials provided with the distribution.
12  *       * Neither the name of Texas Instruments Incorporated nor the
13  *         names of its contributors may be used to endorse or promote products
14  *         derived from this software without specific prior written permission.
15  *
16  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17  *   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  *   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  *   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20  *   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21  *   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22  *   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23  *   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24  *   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25  *   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
26  *   THE POSSIBILITY OF SUCH DAMAGE.
27  *****************************************************************************/
28 #include <stdlib.h>
29 #include <stdio.h>
30 #include <math.h>
31 #include <time.h>
32 #include <complex.h>
33 #include "../common/tune_com.h"
35 #include "cblas.h"
36 #ifdef __cplusplus
37 extern "C" {
38 #endif
39 #include "cblas.h"
40 #ifdef __cplusplus
41 }
42 #endif
44 /*-----------------------------------------------------------------------------
45 * Global Variables
46 *----------------------------------------------------------------------------*/
47 float complex alpha = 0.7 - 0.3*I; 
48 enum CBLAS_ORDER     order  = CblasColMajor; 
49 enum CBLAS_TRANSPOSE transA = CblasNoTrans;
50 enum CBLAS_SIDE side  = CblasLeft;
51 enum CBLAS_UPLO uplo  = CblasUpper;
52 enum CBLAS_DIAG diag  = CblasNonUnit;
54 extern int TI_CBLAS_L3_OFFLOAD;
56 /*-----------------------------------------------------------------------------
57 * Prototypes
58 *----------------------------------------------------------------------------*/
59 int check_results(const float complex *C1, const float complex *C2, int M, int N);
60 int run_ctrsm_dsp_and_arm(int M, int N, float *time_dsp, float *time_arm, 
61                           float *gflops_dsp, float *gflops_arm);
63 /*-----------------------------------------------------------------------------
64 * MAIN
65 *----------------------------------------------------------------------------*/
66 int main()
67 {
68     int num_size, ctrsm_err;
69     int M, N, m, n;
70     int M_pre, N_pre, K_pre, M_start_size, N_start_size;
71     int offload_threshold_1, offload_threshold_2;
72     float total_GFLOPS_DSP, total_GFLOPS_ARM;
73     float time_DSP, time_ARM, t_dsp, t_arm;
74     char ofld_flag[NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK];
75     char mem_flag[NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK];
76     int skip_next_point;
77     float diff_tmp, diff_pre;
78     FILE *fp_flag, *fp_time, *fp_tbl;  
79   
80     fp_flag = fopen("ofld_flag_ctrsm.dat","w");
81     fp_tbl  = fopen("ofld_tbl_ctrsm.c","w");
82     fp_time = fopen("ctrsm_time_ARMvsDSP.dat","w");
84     print_file_header(fp_tbl);
85     fprintf(fp_tbl, "char ofld_tbl_ctrsm[TRMM_OFFLOAD_TBL_SIZE] = {\n");
86     
87     srand(12345);
88     
89     /* sweep M, and N */    
90     for (M=TUNING_START_SIZE_RECTAN_MATRIX,m=0; m<NUM_MATRIX_SIZE_TO_BENCHMARK; m++,M*=2) 
91     {
92         for (N=TUNING_START_SIZE_RECTAN_MATRIX,n=0; n<NUM_MATRIX_SIZE_TO_BENCHMARK; n++,N*=2) 
93         {    
94             if(  (m>0 && ofld_flag[m-1][n]==OFFLOAD)
95                ||(n>0 && ofld_flag[m][n-1]==OFFLOAD) ) {
96                 ofld_flag[m][n] = OFFLOAD;
97                 mem_flag[m][n]  = HAS_MEMORY;  // to avoid error
98                 time_DSP = -1.0;
99                 time_ARM = -1.0;
100                 printf("Offloading. Skipping (M,N)=(%d,%d), (m,n)=(%d,%d).\n", M,N,m,n);
101             }
102             else if(  (m>0 && (mem_flag[m-1][n]==NO_MEMORY))
103                     ||(n>0 && (mem_flag[m][n-1]==NO_MEMORY)) ) {
104                 ofld_flag[m][n] = NO_OFFLOAD;
105                 mem_flag[m][n]  = NO_MEMORY;
106                 time_DSP = -2.0;
107                 time_ARM = -2.0;
108                 printf("Out of memory. Skipping (M,N)=(%d,%d), (m,n)=(%d,%d).\n", M,N,m,n);
109             }
110             else {                    
111                 printf("Measuring DSP and ARM GFLOPS for (M,N)=(%d,%d), (m,n)=(%d,%d).\n", M,N,m,n);
112                 ctrsm_err = run_ctrsm_dsp_and_arm(M, N, &t_dsp, &t_arm, &total_GFLOPS_DSP, &total_GFLOPS_ARM);
113           
114                 if(ctrsm_err == -1) {  /* out of memory for DSP offloading */
115                     ofld_flag[m][n] = NO_OFFLOAD;
116                     mem_flag[m][n] = NO_MEMORY;
117                     time_DSP = -2.0;
118                     time_ARM = -2.0;
119                     printf("Out of memory, skipping next point.\n");
120                 }
121                 else {
122                     mem_flag[m][n] = HAS_MEMORY;                    
123                     time_DSP = t_dsp;
124                     time_ARM = t_arm;
125                     if (ctrsm_err == 0){
126                         //if(total_GFLOPS_DSP - total_GFLOPS_ARM > 1.0) {
127                         if(t_dsp < t_arm) {
128                             ofld_flag[m][n] = OFFLOAD;
129                             printf("Offloading to DSP for this point. Skipping next point.\n");
130                         }
131                         else {
132                             ofld_flag[m][n] = NO_OFFLOAD;
133                         }
134                     }
135                     else {
136                         printf("Error in CTRSM tuning for (M,N)=(%d,%d)!\n", M,N);
137                         exit(0);
138                     }
139                 }
140             }
141             
142             fprintf(fp_flag, "%d\t%d\n", (int)mem_flag[m][n], (int)ofld_flag[m][n]);
143             fprintf(fp_time, "%6d,%6d\t%10.8e\t%10.8e\n", M, N, time_ARM, time_DSP);
145             if(   (m==(NUM_MATRIX_SIZE_TO_BENCHMARK-1))
146                && (n==(NUM_MATRIX_SIZE_TO_BENCHMARK-1)) ) {
147               fprintf(fp_tbl, "%d};", (int)ofld_flag[m][n]);
148             } else {
149               fprintf(fp_tbl, "%d,", (int)ofld_flag[m][n]);
150             }
151         }
152         fprintf(fp_tbl, "\n");
153     }
154        
155     fclose(fp_flag);
156     fclose(fp_time);
157     fclose(fp_tbl);
158     
159     return 0;
162 int run_ctrsm_dsp_and_arm(int M, int N, float *time_dsp, float *time_arm, 
163                           float *gflops_dsp, float *gflops_arm)
165     int   iter,j,k;
166     long long i, size_A, size_B;
167     float time_secs, total_time_dsp, total_time_arm;
168     float gflops_ARM, gflops_DSP;
169     float operation_count = 2.0*(float)M*(float)N;
170     float total_GFLOPS_DSP = 0.0f;
171     float total_GFLOPS_ARM = 0.0f;
172     int    err_code = 0;
173     
174     total_time_dsp = 0.0;
175     total_time_arm = 0.0;
176         if(side == CblasLeft) {
177           size_A = (long long)M*(long long)M;
178         }
179         else {
180           size_A = (long long)N*(long long)N;
181         }
182         size_B = (long long)M*(long long)N;
183     for (iter = 0; iter < NUM_TEST_RUN; iter++)
184     {      
185       /*-------------------------------------------------------------------------
186       * Allocate space for the matrices.  The matrices that will be passed to 
187       * the DSP are allocated using device memory.  The Carm array is not passed
188       * to the dsp and so can use system memory.
189       *------------------------------------------------------------------------*/
190       float complex *A    = (float complex *) __malloc_ddr(size_A*sizeof(float complex));
191       float complex *Bdsp = (float complex *) __malloc_ddr(size_B*sizeof(float complex));
192       float complex *Barm = (float complex *) malloc(size_B*sizeof(float complex));
193   
194       if (!A || !Bdsp || !Barm)
195       {
196           printf("Could not allocate enough space for the arrays!");
197           if(A) __free_ddr(A);
198           if(Bdsp) __free_ddr(Bdsp);
199           if(Barm) free(Barm);
200           
201           return (-1);
202       }
203   
204       /*-------------------------------------------------------------------------
205       * Initialize matrices 
206       *------------------------------------------------------------------------*/
207       int lda = (side == CblasLeft) ? M : N;  
208       int ldb = M;  
209           for(j=0;j<lda;j++)
210           {
211               for(k=0;k<lda;k++)
212                   {
213                       if (j==k)
214                               A[j*lda+k] = 1.0+j + 0.0*I;
215               else if (j<k)
216                               A[j*lda+k] = 0.0 + 0.0*I;
217                           else
218                                 A[j*lda+k] = (float)rand()/RAND_MAX + (float)rand()/RAND_MAX * I;
219                   }
220           }
221 //      for (i = 0; i < size_A; ++i) 
222 //        {
223 //          A[i] = (float)rand()/RAND_MAX + (float)rand()/RAND_MAX * I;
224 //        }
226       for (i = 0; i < size_B; ++i)
227           {
228           Bdsp[i] = Barm[i] = (float)rand()/RAND_MAX + (float)rand()/RAND_MAX * I;
229       }
231       if(M==8 && N==8) {
232           FILE *file_a = fopen("mat_a.dat","w");
233           FILE *file_b = fopen("mat_b.dat","w");
234           FILE *file_c = fopen("mat_c.dat","w");
235           
236           for(i=0; i < size_A; ++i) fprintf(file_a, "%1.10e\t%1.10e\n",crealf(A[i]),   cimagf(A[i]));
237           for(i=0; i < size_B; ++i) fprintf(file_b, "%1.10e\t%1.10e\n",crealf(Barm[i]), cimagf(Barm[i]));
238           for(i=0; i < size_B; ++i) fprintf(file_c, "%1.10e\t%1.10e\n",crealf(Bdsp[i]), cimagf(Bdsp[i]));
239                   
240                   fclose(file_a);
241                   fclose(file_b);
242                   fclose(file_c);
243       }  
244     
245       /*============ BLAS tuning: running on DSP and then on ARM =============*/
246       /*------------------------------------------------------------------------
247       * Time DSP ctrsm
248       *-----------------------------------------------------------------------*/
249       TI_CBLAS_L3_OFFLOAD = 1;
250       
251       tick();
252       cblas_ctrsm(order,side,uplo,transA,diag,M,N,&alpha,A,lda,Bdsp,ldb);
253       time_secs = tock();
254       total_time_dsp += time_secs;
255       gflops_DSP = operation_count/time_secs*1e-9;
256       total_GFLOPS_DSP += gflops_DSP;
257           
258       /*-------------------------------------------------------------------------
259       * Time ARM ctrsm
260       *------------------------------------------------------------------------*/
261       TI_CBLAS_L3_OFFLOAD = 0;
262       
263       tick();
264       cblas_ctrsm(order,side,uplo,transA,diag,M,N,&alpha,A,lda,Barm,ldb);
265       time_secs = tock();
266       total_time_arm += time_secs;
267       gflops_ARM = operation_count/time_secs*1e-9;
268       total_GFLOPS_ARM += gflops_ARM;
269       //printf(" %6.3f  %6.3f  %9.6fs %9.6fs\n", gflops_DSP, gflops_ARM, time_dsp, time_arm);
270       
271       if(M==8 && N==8) {
272           FILE *file_a = fopen("mat_a2.dat","w");
273           FILE *file_b = fopen("mat_b2.dat","w");
274           FILE *file_c = fopen("mat_c2.dat","w");
275           
276           for(i=0; i < size_A; ++i) fprintf(file_a, "%1.10e\t%1.10e\n",crealf(A[i]),   cimagf(A[i]));
277           for(i=0; i < size_B; ++i) fprintf(file_b, "%1.10e\t%1.10e\n",crealf(Barm[i]), cimagf(Barm[i]));
278           for(i=0; i < size_B; ++i) fprintf(file_c, "%1.10e\t%1.10e\n",crealf(Bdsp[i]), cimagf(Bdsp[i]));
279                   
280                   fclose(file_a);
281                   fclose(file_b);
282                   fclose(file_c);
283       }
284           
285       /*-------------------------------------------------------------------------
286       * Verify Results
287       *------------------------------------------------------------------------*/
288       err_code += check_results(Bdsp, Barm, M, N);
289   
290       __free_ddr(A);
291       __free_ddr(Bdsp);
292       free(Barm);
293     }
294     
295     *gflops_dsp = total_GFLOPS_DSP;
296     *gflops_arm = total_GFLOPS_ARM;
297     *time_dsp   = total_time_dsp / (float)NUM_TEST_RUN;
298     *time_arm   = total_time_arm / (float)NUM_TEST_RUN;
299     
300     return err_code;
304 /*-----------------------------------------------------------------------------
305 * check_results
306 *----------------------------------------------------------------------------*/
307 int check_results(const float complex *C1, const float complex *C2, int M, int N)
309     int i;
310         float norm, delta;
311     const float EPISILON = 1e-5;
312     const float DELTA = 1e-5;
313     //const float EPISILON = 1e-200;
314     const int NERRORS  = 5;
315     int       num_errors = 0;
317     for (i=0; i<M*N; i++)
318     {
319         delta = cabs(C1[i]) - cabs(C2[i]);
320                 norm = cabs(C1[i]);
321                 if(norm < cabs(C2[i]))
322                     norm = cabs(C2[i]);
324         if (delta > EPISILON*norm && delta>DELTA)
325             if ((num_errors += 1) < NERRORS)
326                 printf("Error [elem:%d]: %f <==> %f\n", i, cabs(C1[i]), cabs(C2[i]));
327     }
329     if (num_errors > 0)
330     {
331          printf("FAIL with %d errors!\n", num_errors);
332          return num_errors;
333     }
334     else 
335     {
336         //printf("PASS!\n");
337         return 0;
338     }