[dense-linear-algebra-libraries/linalg.git] / src / ti / linalg / tuning / zgemm_tune / zgemm_tune.c
1 /******************************************************************************
2 * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Texas Instruments Incorporated nor the
13 * names of its contributors may be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
26 * THE POSSIBILITY OF SUCH DAMAGE.
27 *****************************************************************************/
28 #include <stdlib.h>
29 #include <stdio.h>
30 #include <math.h>
31 #include <time.h>
32 #include <complex.h>
33 #include "../common/tune_com.h"
35 #include "cblas.h"
36 #ifdef __cplusplus
37 extern "C" {
38 #endif
39 #include "cblas.h"
40 #ifdef __cplusplus
41 }
42 #endif
44 /*-----------------------------------------------------------------------------
45 * Global Variables
46 *----------------------------------------------------------------------------*/
47 double complex alpha = 0.7 - 0.3*I;
48 double complex beta = 0.4 + 0.6*I;
49 enum CBLAS_ORDER order = CblasColMajor;
50 enum CBLAS_TRANSPOSE transA = CblasNoTrans;
51 enum CBLAS_TRANSPOSE transB = CblasNoTrans;
53 extern int TI_CBLAS_L3_OFFLOAD;
55 /*-----------------------------------------------------------------------------
56 * Prototypes
57 *----------------------------------------------------------------------------*/
58 int check_results(const double complex *C1, const double complex *C2, int M, int N);
59 int run_zgemm_dsp_and_arm(int M, int N, int K, float *time_dsp, float *time_arm,
60 float *gflops_dsp, float *gflops_arm);
62 /*-----------------------------------------------------------------------------
63 * MAIN
64 *----------------------------------------------------------------------------*/
65 int main()
66 {
67 int num_size, zgemm_err;
68 int M, N, K, m, n, k;
69 int M_pre, N_pre, K_pre, M_start_size, N_start_size;
70 int offload_threshold_1, offload_threshold_2;
71 float total_GFLOPS_DSP, total_GFLOPS_ARM;
72 float time_DSP, time_ARM, t_dsp, t_arm;
73 char ofld_flag[NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK];
74 char mem_flag[NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK];
75 int skip_next_point;
76 FILE *fp_flag, *fp_time, *fp_tbl;
78 fp_flag = fopen("ofld_flag_zgemm.dat","w");
79 fp_tbl = fopen("ofld_tbl_zgemm.c","w");
80 fp_time = fopen("zgemm_time_ARMvsDSP.dat","w");
82 print_file_header(fp_tbl);
83 fprintf(fp_tbl, "char ofld_tbl_zgemm[GEMM_OFFLOAD_TBL_SIZE] = {\n");
85 srand(12345);
87 /* sweep M, K, and N */
88 for (M=TUNING_START_SIZE_RECTAN_MATRIX,m=0; m<NUM_MATRIX_SIZE_TO_BENCHMARK; m++,M*=2)
89 {
90 for (N=TUNING_START_SIZE_RECTAN_MATRIX,n=0; n<NUM_MATRIX_SIZE_TO_BENCHMARK; n++,N*=2)
91 {
92 for (K=TUNING_START_SIZE_RECTAN_MATRIX,k=0; k<NUM_MATRIX_SIZE_TO_BENCHMARK; k++,K*=2)
93 {
94 if( (m>0 && ofld_flag[m-1][n][k]==OFFLOAD)
95 ||(n>0 && ofld_flag[m][n-1][k]==OFFLOAD)
96 ||(k>0 && ofld_flag[m][n][k-1]==OFFLOAD) ) {
97 ofld_flag[m][n][k] = OFFLOAD;
98 mem_flag[m][n][k] = HAS_MEMORY; // to avoid error
99 time_DSP = -1.0;
100 time_ARM = -1.0;
101 printf("Offloading. Skipping (M,N,K)=(%d,%d,%d), (m,n,k)=(%d,%d,%d).\n", M,N,K,m,n,k);
102 }
103 else if( (m>0 && (mem_flag[m-1][n][k]==NO_MEMORY))
104 ||(n>0 && (mem_flag[m][n-1][k]==NO_MEMORY))
105 ||(k>0 && (mem_flag[m][n][k-1]==NO_MEMORY))) {
106 ofld_flag[m][n][k] = NO_OFFLOAD;
107 mem_flag[m][n][k] = NO_MEMORY;
108 time_DSP = -2.0;
109 time_ARM = -2.0;
110 printf("Out of memory. Skipping (M,N,K)=(%d,%d,%d), (m,n,k)=(%d,%d,%d).\n", M,N,K,m,n,k);
111 }
112 else {
113 printf("Measuring DSP and ARM GFLOPS for (M,N,K)=(%d,%d,%d), (m,n,k)=(%d,%d,%d).\n", M,N,K,m,n,k);
114 zgemm_err = run_zgemm_dsp_and_arm(M, N, K, &t_dsp, &t_arm, &total_GFLOPS_DSP, &total_GFLOPS_ARM);
115 //dsym_err = run_dsymm_dsp_and_arm();
117 if(zgemm_err == -1) { /* out of memory for DSP offloading */
118 ofld_flag[m][n][k] = NO_OFFLOAD;
119 mem_flag[m][n][k] = NO_MEMORY;
120 time_DSP = -2.0;
121 time_ARM = -2.0;
122 printf("Out of memory, skipping next point.\n");
123 }
124 else {
125 mem_flag[m][n][k] = HAS_MEMORY;
126 time_DSP = t_dsp;
127 time_ARM = t_arm;
128 if (zgemm_err == 0){
129 //if(total_GFLOPS_DSP - total_GFLOPS_ARM > 1.0) {
130 if(DSP_FASTER_THAN_ARM(t_dsp,t_arm)) {
131 ofld_flag[m][n][k] = OFFLOAD;
132 printf("Offloading to DSP for this point. Skipping next point.\n");
133 }
134 else {
135 ofld_flag[m][n][k] = NO_OFFLOAD;
136 }
137 }
138 else {
139 printf("Error in ZGEMM tuning for (M,N,K)=(%d,%d,%d)!\n", M,N,K);
140 exit(0);
141 }
142 }
143 }
145 fprintf(fp_flag, "%d\t%d\n", (int)mem_flag[m][n][k], (int)ofld_flag[m][n][k]);
146 fprintf(fp_time, "%6d,%6d,%6d\t%10.8e\t%10.8e\n", M, N, K, time_ARM, time_DSP);
148 if( (m==(NUM_MATRIX_SIZE_TO_BENCHMARK-1))
149 && (n==(NUM_MATRIX_SIZE_TO_BENCHMARK-1))
150 && (k==(NUM_MATRIX_SIZE_TO_BENCHMARK-1))) {
151 fprintf(fp_tbl, "%d};", (int)ofld_flag[m][n][k]);
152 } else {
153 fprintf(fp_tbl, "%d,", (int)ofld_flag[m][n][k]);
154 }
155 }
156 fprintf(fp_tbl, "\n");
157 }
158 }
160 fclose(fp_flag);
161 fclose(fp_time);
162 fclose(fp_tbl);
164 return 0;
165 }
168 int run_zgemm_dsp_and_arm(int M, int N, int K, float *time_dsp, float *time_arm,
169 float *gflops_dsp, float *gflops_arm)
170 {
171 int iter;
172 long long i;
173 float time_secs, total_time_dsp, total_time_arm;
174 float gflops_ARM, gflops_DSP;
175 float operation_count = 2.0*(float)M*(float)N*(float)K;
176 float total_GFLOPS_DSP = 0.0f;
177 float total_GFLOPS_ARM = 0.0f;
178 int num_errors = 0;
180 total_time_dsp = 0.0;
181 total_time_arm = 0.0;
182 for (iter = 0; iter <= NUM_TEST_RUN; iter++)
183 {
184 /*-------------------------------------------------------------------------
185 * Allocate space for the matrices. The matrices that will be passed to
186 * the DSP are allocated using device memory. The Carm array is not passed
187 * to the dsp and so can use system memory.
188 *------------------------------------------------------------------------*/
189 double complex *A = (double complex*) __malloc_ddr(M*K*sizeof(double complex));
190 double complex *B = (double complex*) __malloc_ddr(K*N*sizeof(double complex));
191 double complex *Cdsp = (double complex*) __malloc_ddr(M*N*sizeof(double complex));
192 double complex *Carm = (double complex*) malloc (M*N*sizeof(double complex));
194 if (!A || !B || !Cdsp || !Carm)
195 {
196 printf("Could not allocate enough space for the arrays!");
197 if(A) __free_ddr(A);
198 if(B) __free_ddr(B);
199 if(Cdsp) __free_ddr(Cdsp);
200 if(Carm) free(Carm);
202 return (-1);
203 }
205 /*-------------------------------------------------------------------------
206 * Initialize matrices and print if small enough.
207 *------------------------------------------------------------------------*/
208 for (i = 0; i < M*K; ++i)
209 {
210 A[i] = (double)rand()/RAND_MAX + (double)rand()/RAND_MAX * I;
211 }
212 for (i = 0; i < K*N; ++i)
213 {
214 B[i] = (double)rand()/RAND_MAX + (double)rand()/RAND_MAX * I;
215 }
216 for (i = 0; i < M*N; ++i)
217 {
218 Carm[i] = Cdsp[i] = (double)rand()/RAND_MAX + (double)rand()/RAND_MAX * I;
219 }
221 int lda = ((order == CblasColMajor && transA == CblasNoTrans) ||
222 (order == CblasRowMajor && transA == CblasTrans)) ? M : K;
224 int ldb = ((order == CblasColMajor && transB == CblasNoTrans) ||
225 (order == CblasRowMajor && transB == CblasTrans)) ? K : N;
227 int ldc = (order == CblasColMajor) ? M : N;
229 /*============ BLAS tuning: running on DSP and then on ARM =============*/
230 /*------------------------------------------------------------------------
231 * Time DSP zgemm
232 *-----------------------------------------------------------------------*/
233 //ti_cblas_offload_config("001"); /* force offloading level 3 to DSP */
234 TI_CBLAS_L3_OFFLOAD = 1;
236 tick();
237 cblas_zgemm(order,transA,transB,M,N,K,&alpha,A,lda,B,ldb,&beta,Cdsp,ldc);
238 time_secs = tock();
239 if(iter > 0) { /* skip first iteration */
240 total_time_dsp += time_secs;
241 gflops_DSP = operation_count/time_secs*1e-9;
242 total_GFLOPS_DSP += gflops_DSP;
243 }
245 /*-------------------------------------------------------------------------
246 * Time ARM zgemm
247 *------------------------------------------------------------------------*/
248 //ti_cblas_offload_config("000"); /* force no offloading */
249 TI_CBLAS_L3_OFFLOAD = 0;
251 tick();
252 cblas_zgemm(order,transA,transB,M,N,K,&alpha,A,lda,B,ldb,&beta,Carm,ldc);
253 time_secs = tock();
254 if(iter > 0) { /* skip first iteration */
255 total_time_arm += time_secs;
256 gflops_ARM = operation_count/time_secs*1e-9;
257 total_GFLOPS_ARM += gflops_ARM;
258 }
259 //printf(" %6.3f %6.3f %9.6fs %9.6fs\n", gflops_DSP, gflops_ARM, time_dsp, time_arm);
261 /*-------------------------------------------------------------------------
262 * Verify Results
263 *------------------------------------------------------------------------*/
264 //return check_results(Cdsp, Carm, M, N);
265 num_errors += check_results(Cdsp, Carm, M, N);
267 __free_ddr(A);
268 __free_ddr(B);
269 __free_ddr(Cdsp);
270 free(Carm);
271 }
273 *gflops_dsp = total_GFLOPS_DSP;
274 *gflops_arm = total_GFLOPS_ARM;
275 *time_dsp = total_time_dsp / (float)NUM_TEST_RUN;
276 *time_arm = total_time_arm / (float)NUM_TEST_RUN;
278 return num_errors;
279 }
281 /*-----------------------------------------------------------------------------
282 * check_results
283 *----------------------------------------------------------------------------*/
284 int check_results(const double complex *C1, const double complex *C2, int M, int N)
285 {
286 int i;
287 const double EPISILON = 1e-10;
288 //const double EPISILON = 1e-200;
289 const int NERRORS = 5;
290 int num_errors = 0;
292 for (i=0; i<M*N; i++)
293 {
294 double delta = fabs(cabs(C1[i]) - cabs(C2[i]));
296 if (delta > EPISILON*cabs(C1[i]))
297 if ((num_errors += 1) < NERRORS)
298 printf("Error [elem:%d]: %f <==> %f\n", i, cabs(C1[i]), cabs(C2[i]));
299 }
301 if (num_errors > 0)
302 {
303 printf("FAIL with %d errors!\n", num_errors);
304 return num_errors;
305 }
306 else
307 {
308 //printf("PASS!\n");
309 return 0;
310 }
311 }