1 /******************************************************************************
2 * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Texas Instruments Incorporated nor the
13 * names of its contributors may be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
26 * THE POSSIBILITY OF SUCH DAMAGE.
27 *****************************************************************************/
28 #include <stdlib.h>
29 #include <stdio.h>
30 #include <math.h>
31 #include <time.h>
33 #include "cblas.h"
34 #ifdef __cplusplus
35 extern "C" {
36 #endif
37 #include "cblas.h"
38 #ifdef __cplusplus
39 }
40 #endif
/* Sweep configuration: first matrix dimension tried and number of sizes per
 * dimension (each subsequent size doubles the previous one in main()).
 * TUNING_START_SIZE_SQUARE_MATRIX is not referenced elsewhere in this file. */
enum {
    TUNING_START_SIZE_SQUARE_MATRIX = 16,
    TUNING_START_SIZE_RECTAN_MATRIX = 8,
    NUM_MATRIX_SIZE_TO_BENCHMARK    = 8   /* alternative setting: 16 */
};

/* Values stored in mem_flag[][][]: whether the buffers for a point could be
 * allocated. */
enum {
    NO_MEMORY  = 0,
    HAS_MEMORY = 1
};

/* Values stored in ofld_flag[][][]: whether a point should be offloaded. */
enum {
    NO_OFFLOAD = 0,
    OFFLOAD    = 1
};

/* Number of timed repetitions averaged for every (M,N,K) point. */
enum {
    NUM_TEST_RUN = 5
};
53 /*-----------------------------------------------------------------------------
54 * Timing Setup
55 *----------------------------------------------------------------------------*/
/* Start (t0) and stop (t1) timestamps shared by tick() and tock(). */
struct timespec t0,t1;

/* tick(): store the current CLOCK_MONOTONIC time in t0.
 * The macro deliberately does not end in a semicolon, so that `tick();`
 * expands to exactly one statement and stays safe inside an unbraced
 * if/else. */
#define tick() clock_gettime(CLOCK_MONOTONIC, &t0)

/* tock(): store the current CLOCK_MONOTONIC time in t1 and evaluate to the
 * seconds elapsed since the last tick(), as a double. */
#define tock() (clock_gettime(CLOCK_MONOTONIC, &t1), \
                t1.tv_sec - t0.tv_sec + (t1.tv_nsec - t0.tv_nsec) / 1e9)
61 /*-----------------------------------------------------------------------------
62 * Global Variables
63 *----------------------------------------------------------------------------*/
/* Scalars passed as the alpha and beta arguments of every cblas_sgemm call
 * in run_sgemm_dsp_and_arm(). */
float alpha = 0.7;
float beta = 0.3;
/* Storage order passed to cblas_sgemm; it also selects the leading
 * dimensions (lda/ldb/ldc) computed in run_sgemm_dsp_and_arm(). */
enum CBLAS_ORDER order = CblasColMajor;
/* Alternative setting, currently disabled: */
//enum CBLAS_ORDER order = CblasRowMajor;
/* Transpose settings passed to cblas_sgemm for A and B respectively. */
enum CBLAS_TRANSPOSE transA = CblasNoTrans;
enum CBLAS_TRANSPOSE transB = CblasNoTrans;

/* Defined outside this file. This program sets it to 1 before the run it
 * times as "DSP" and to 0 before the run it times as "ARM". */
extern int TI_CBLAS_L3_OFFLOAD;
73 /*-----------------------------------------------------------------------------
74 * Prototypes
75 *----------------------------------------------------------------------------*/
/* Compares the first M*N elements of C1 and C2 and returns the number of
 * elements that differ beyond the relative tolerance (0 when none do). */
int check_results(const float *C1, const float *C2, int M, int N);
/* Times cblas_sgemm for one (M,N,K) point with offload enabled and then
 * disabled, NUM_TEST_RUN times each. Writes the average times to *time_dsp
 * and *time_arm and the summed per-run GFLOPS to *gflops_dsp and *gflops_arm.
 * Returns -1 if a buffer allocation fails, otherwise the accumulated
 * check_results() mismatch count (0 means the two results agreed). */
int run_sgemm_dsp_and_arm(int M, int N, int K, float *time_dsp, float *time_arm,
                          float *gflops_dsp, float *gflops_arm);
80 /*-----------------------------------------------------------------------------
81 * MAIN
82 *----------------------------------------------------------------------------*/
83 int main()
84 {
85 int num_size, sgemm_err;
86 int M, N, K, m, n, k;
87 int M_pre, N_pre, K_pre, M_start_size, N_start_size;
88 int offload_threshold_1, offload_threshold_2;
89 float total_GFLOPS_DSP, total_GFLOPS_ARM;
90 float time_DSP, time_ARM, t_dsp, t_arm;
91 char ofld_flag[NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK];
92 char mem_flag[NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK];
93 int skip_next_point;
94 float diff_tmp, diff_pre;
95 FILE *fp_flag, *fp_time, *fp_tbl;
97 fp_flag = fopen("ofld_flag_sgemm.dat","w");
98 fp_tbl = fopen("ofld_tbl_sgemm.c","w");
99 fp_time = fopen("sgemm_time_ARMvsDSP.dat","w");
101 fprintf(fp_tbl, "char ofld_tbl_sgemm[TI_L3_OFFLOAD_TBL_SIZE] = {\n");
103 srand(12345);
105 /* sweep M, K, and N */
106 for (M=TUNING_START_SIZE_RECTAN_MATRIX,m=0; m<NUM_MATRIX_SIZE_TO_BENCHMARK; m++,M*=2)
107 {
108 for (N=TUNING_START_SIZE_RECTAN_MATRIX,n=0; n<NUM_MATRIX_SIZE_TO_BENCHMARK; n++,N*=2)
109 {
110 for (K=TUNING_START_SIZE_RECTAN_MATRIX,k=0; k<NUM_MATRIX_SIZE_TO_BENCHMARK; k++,K*=2)
111 {
112 if( (m>0 && ofld_flag[m-1][n][k]==OFFLOAD)
113 ||(n>0 && ofld_flag[m][n-1][k]==OFFLOAD)
114 ||(k>0 && ofld_flag[m][n][k-1]==OFFLOAD) ) {
115 ofld_flag[m][n][k] = OFFLOAD;
116 mem_flag[m][n][k] = HAS_MEMORY; // to avoid error
117 time_DSP = -1.0;
118 time_ARM = -1.0;
119 printf("Offloading. Skipping (M,N,K)=(%d,%d,%d), (m,n,k)=(%d,%d,%d).\n", M,N,K,m,n,k);
120 }
121 else if( (m>0 && (mem_flag[m-1][n][k]==NO_MEMORY))
122 ||(n>0 && (mem_flag[m][n-1][k]==NO_MEMORY))
123 ||(k>0 && (mem_flag[m][n][k-1]==NO_MEMORY))) {
124 ofld_flag[m][n][k] = NO_OFFLOAD;
125 mem_flag[m][n][k] = NO_MEMORY;
126 time_DSP = -2.0;
127 time_ARM = -2.0;
128 printf("Out of memory. Skipping (M,N,K)=(%d,%d,%d), (m,n,k)=(%d,%d,%d).\n", M,N,K,m,n,k);
129 }
130 else {
131 printf("Measuring DSP and ARM GFLOPS for (M,N,K)=(%d,%d,%d), (m,n,k)=(%d,%d,%d).\n", M,N,K,m,n,k);
132 sgemm_err = run_sgemm_dsp_and_arm(M, N, K, &t_dsp, &t_arm, &total_GFLOPS_DSP, &total_GFLOPS_ARM);
134 if(sgemm_err == -1) { /* out of memory for DSP offloading */
135 ofld_flag[m][n][k] = NO_OFFLOAD;
136 mem_flag[m][n][k] = NO_MEMORY;
137 time_DSP = -2.0;
138 time_ARM = -2.0;
139 printf("Out of memory, skipping next point.\n");
140 }
141 else {
142 mem_flag[m][n][k] = HAS_MEMORY;
143 time_DSP = t_dsp;
144 time_ARM = t_arm;
145 if (sgemm_err == 0){
146 //if(total_GFLOPS_DSP - total_GFLOPS_ARM > 1.0) {
147 if(t_dsp < t_arm) {
148 ofld_flag[m][n][k] = OFFLOAD;
149 printf("Offloading to DSP for this point. Skipping next point.\n");
150 }
151 else {
152 ofld_flag[m][n][k] = NO_OFFLOAD;
153 }
154 }
155 else {
156 printf("Error in SGEMM tuning for (M,N,K)=(%d,%d,%d)!\n", M,N,K);
157 exit(0);
158 }
159 }
160 }
162 fprintf(fp_flag, "%d\t%d\n", (int)mem_flag[m][n][k], (int)ofld_flag[m][n][k]);
163 fprintf(fp_time, "%6d,%6d,%6d\t%10.8e\t%10.8e\n", M, N, K, time_ARM, time_DSP);
165 if( (m==(NUM_MATRIX_SIZE_TO_BENCHMARK-1))
166 && (n==(NUM_MATRIX_SIZE_TO_BENCHMARK-1))
167 && (k==(NUM_MATRIX_SIZE_TO_BENCHMARK-1))) {
168 fprintf(fp_tbl, "%d};", (int)ofld_flag[m][n][k]);
169 } else {
170 fprintf(fp_tbl, "%d,", (int)ofld_flag[m][n][k]);
171 }
172 }
173 fprintf(fp_tbl, "\n");
174 }
175 }
177 fclose(fp_flag);
178 fclose(fp_time);
179 fclose(fp_tbl);
181 return 0;
182 }
185 int run_sgemm_dsp_and_arm(int M, int N, int K, float *time_dsp, float *time_arm,
186 float *gflops_dsp, float *gflops_arm)
187 {
188 int iter;
189 long long i;
190 float time_secs, total_time_dsp, total_time_arm;
191 float gflops_ARM, gflops_DSP;
192 float operation_count = 2.0*(float)M*(float)N*(float)K;
193 float total_GFLOPS_DSP = 0.0f;
194 float total_GFLOPS_ARM = 0.0f;
195 int err_code = 0;
197 total_time_dsp = 0.0;
198 total_time_arm = 0.0;
199 for (iter = 0; iter < NUM_TEST_RUN; iter++)
200 {
201 /*-------------------------------------------------------------------------
202 * Allocate space for the matrices. The matrices that will be passed to
203 * the DSP are allocated using device memory. The Carm array is not passed
204 * to the dsp and so can use system memory.
205 *------------------------------------------------------------------------*/
206 float *A = (float *) __malloc_ddr((long long)M*(long long)K*(long long)sizeof(float));
207 float *B = (float *) __malloc_ddr((long long)K*(long long)N*(long long)sizeof(float));
208 float *Cdsp = (float *) __malloc_ddr((long long)M*(long long)N*(long long)sizeof(float));
209 float *Carm = (float *) malloc ((long long)M*(long long)N*(long long)sizeof(float));
211 if (!A || !B || !Cdsp || !Carm)
212 {
213 printf("Could not allocate enough space for the arrays!");
214 if(A) __free_ddr(A);
215 if(B) __free_ddr(B);
216 if(Cdsp) __free_ddr(Cdsp);
217 if(Carm) free(Carm);
219 return (-1);
220 }
222 /*-------------------------------------------------------------------------
223 * Initialize matrices
224 *------------------------------------------------------------------------*/
225 for (i = 0; i < (long long)M*K; ++i) A[i] = (float)rand()/RAND_MAX;
226 for (i = 0; i < (long long)K*N; ++i) B[i] = (float)rand()/RAND_MAX;
227 for (i = 0; i < (long long)M*N; ++i) Carm[i] = Cdsp[i] = 0;
229 int lda = ((order == CblasColMajor && transA == CblasNoTrans) ||
230 (order == CblasRowMajor && transA == CblasTrans)) ? M : K;
232 int ldb = ((order == CblasColMajor && transB == CblasNoTrans) ||
233 (order == CblasRowMajor && transB == CblasTrans)) ? K : N;
235 int ldc = (order == CblasColMajor) ? M : N;
237 /*============ BLAS tuning: running on DSP and then on ARM =============*/
238 /*------------------------------------------------------------------------
239 * Time DSP sgemm
240 *-----------------------------------------------------------------------*/
241 //ti_cblas_offload_config("001"); /* force offloading level 3 to DSP */
242 //printf("Running on DSP.\n");
243 TI_CBLAS_L3_OFFLOAD = 1;
245 tick();
246 cblas_sgemm(order,transA,transB,M,N,K,alpha,A,lda,B,ldb,beta,Cdsp,ldc);
247 time_secs = tock();
248 total_time_dsp += time_secs;
249 gflops_DSP = operation_count/time_secs*1e-9;
250 total_GFLOPS_DSP += gflops_DSP;
251 /*
252 if(M==4096 && K==256 && N==16) {
253 FILE *file_a = fopen("mat_a.dat","w");
254 FILE *file_b = fopen("mat_b.dat","w");
255 FILE *file_c = fopen("mat_c.dat","w");
257 for(i=0; i < M*K; ++i) fprintf(file_a, "%1.10e\n",A[i]);
258 for(i=0; i < K*N; ++i) fprintf(file_b, "%1.10e\n",B[i]);
259 for(i=0; i < M*N; ++i) fprintf(file_c, "%1.10e\n",Cdsp[i]);
260 }
261 */
262 /*-------------------------------------------------------------------------
263 * Time ARM sgemm
264 *------------------------------------------------------------------------*/
265 //ti_cblas_offload_config("000"); /* force no offloading */
266 //printf("Running on ARM.\n");
267 TI_CBLAS_L3_OFFLOAD = 0;
269 tick();
270 cblas_sgemm(order,transA,transB,M,N,K,alpha,A,lda,B,ldb,beta,Carm,ldc);
271 time_secs = tock();
272 total_time_arm += time_secs;
273 gflops_ARM = operation_count/time_secs*1e-9;
274 total_GFLOPS_ARM += gflops_ARM;
275 //printf(" %6.3f %6.3f %9.6fs %9.6fs\n", gflops_DSP, gflops_ARM, time_dsp, time_arm);
276 fflush(stdout);
278 /*-------------------------------------------------------------------------
279 * Verify Results
280 *------------------------------------------------------------------------*/
281 //return check_results(Cdsp, Carm, M, N);
282 err_code += check_results(Cdsp, Carm, M, N);
284 __free_ddr(A);
285 __free_ddr(B);
286 __free_ddr(Cdsp);
287 free(Carm);
288 }
290 *gflops_dsp = total_GFLOPS_DSP;
291 *gflops_arm = total_GFLOPS_ARM;
292 *time_dsp = total_time_dsp / (float)NUM_TEST_RUN;
293 *time_arm = total_time_arm / (float)NUM_TEST_RUN;
295 return err_code;
296 }
299 /*-----------------------------------------------------------------------------
300 * check_results
301 *----------------------------------------------------------------------------*/
/*
 * Compare two result matrices of M*N floats element by element.
 *
 * C1 is treated as the reference: element i is a mismatch unless
 * |C1[i] - C2[i]| <= EPSILON * |C1[i]|. The test is written in that negated
 * form so that a NaN difference or NaN tolerance counts as a mismatch
 * instead of passing silently.
 *
 * The first NERRORS mismatches are printed to stdout, followed by a summary
 * line when there is at least one.
 *
 * Returns the number of mismatching elements, 0 when the matrices agree
 * (including when M*N is not positive).
 */
int check_results(const float *C1, const float *C2, int M, int N)
{
    const float     EPSILON    = 1e-5;
    const int       NERRORS    = 5;
    const long long count      = (long long)M * N;  /* widened before multiplying */
    int             num_errors = 0;
    long long       i;

    for (i = 0; i < count; i++)
    {
        float delta = fabs(C1[i] - C2[i]);

        /* Any comparison involving NaN is false, hence "!(a <= b)" rather
         * than "a > b". */
        if (!(delta <= EPSILON*fabs(C1[i])))
        {
            num_errors++;
            if (num_errors <= NERRORS)
                printf("Error [elem:%lld]: %e <==> %e\n", i, C1[i], C2[i]);
        }
    }

    if (num_errors > 0)
    {
        printf("FAIL with %d errors!\n", num_errors);
        return num_errors;
    }

    return 0;
}