9a35652ed3cf62d082c47875476535c593b35421
1 /******************************************************************************
2 * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Texas Instruments Incorporated nor the
13 * names of its contributors may be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
26 * THE POSSIBILITY OF SUCH DAMAGE.
27 *****************************************************************************/
28 #include <stdlib.h>
29 #include <stdio.h>
30 #include <math.h>
31 #include <time.h>
33 #include "cblas.h"
34 #ifdef __cplusplus
35 extern "C" {
36 #endif
37 #include "cblas.h"
38 #ifdef __cplusplus
39 }
40 #endif
/*-----------------------------------------------------------------------------
* Sweep configuration
*
* main() starts every dimension at TUNING_START_SIZE_RECTAN_MATRIX and doubles
* it NUM_MATRIX_SIZE_TO_BENCHMARK - 1 times; run_sgemm() averages each point
* over NUM_TEST_RUN timed calls.
*----------------------------------------------------------------------------*/
#define TUNING_START_SIZE_RECTAN_MATRIX   8
#define NUM_MATRIX_SIZE_TO_BENCHMARK      16
#define NUM_TEST_RUN                      5

/* Not referenced by the code visible in this file; kept for compatibility. */
#define TUNING_START_SIZE_SQUARE_MATRIX   16

/* Boolean-style flags; not referenced by the code visible in this file. */
#define HAS_MEMORY                        1
#define NO_MEMORY                         0
#define OFFLOAD                           1
#define NO_OFFLOAD                        0
53 /*-----------------------------------------------------------------------------
54 * Timing Setup
55 *----------------------------------------------------------------------------*/
/* Start (t0) and end (t1) timestamps shared by tick() and tock(). */
struct timespec t0, t1;

/*
 * tick(): store the current CLOCK_MONOTONIC time in t0.
 * The expansion deliberately has no trailing semicolon, so `tick();` is a
 * single statement and stays valid inside an unbraced if/else.
 */
#define tick() clock_gettime(CLOCK_MONOTONIC, &t0)

/*
 * tock(): store the current CLOCK_MONOTONIC time in t1 and evaluate to the
 * number of seconds elapsed since the last tick(), as a double.
 */
#define tock() (clock_gettime(CLOCK_MONOTONIC, &t1), \
                (double)(t1.tv_sec - t0.tv_sec) + \
                (double)(t1.tv_nsec - t0.tv_nsec) / 1e9)
61 /*-----------------------------------------------------------------------------
62 * Global Variables
63 *----------------------------------------------------------------------------*/
64 float alpha = 0.7;
65 float beta = 0.3;
66 enum CBLAS_ORDER order = CblasColMajor;
67 //enum CBLAS_ORDER order = CblasRowMajor;
68 enum CBLAS_TRANSPOSE transA = CblasNoTrans;
69 enum CBLAS_TRANSPOSE transB = CblasNoTrans;
71 extern int TI_CBLAS_L3_OFFLOAD;
72 /*-----------------------------------------------------------------------------
73 * Prototypes
74 *----------------------------------------------------------------------------*/
75 int run_sgemm(int M, int N, int K, float *time, float *gflops);
77 /*-----------------------------------------------------------------------------
78 * MAIN
79 *----------------------------------------------------------------------------*/
80 int main()
81 {
82 int num_size, dgemm_err;
83 int M, N, K, m, n, k;
84 int M_pre, N_pre, K_pre, M_start_size, N_start_size;
85 float time_secs, gflops;
86 FILE *fp_time;
88 fp_time = fopen("dgemm_time.dat","w");
90 srand(12345);
92 /* sweep M, K, and N */
93 for (M=TUNING_START_SIZE_RECTAN_MATRIX,m=0; m<NUM_MATRIX_SIZE_TO_BENCHMARK; m++,M*=2)
94 {
95 for (N=TUNING_START_SIZE_RECTAN_MATRIX,n=0; n<NUM_MATRIX_SIZE_TO_BENCHMARK; n++,N*=2)
96 {
97 for (K=TUNING_START_SIZE_RECTAN_MATRIX,k=0; k<NUM_MATRIX_SIZE_TO_BENCHMARK; k++,K*=2)
98 {
99 dgemm_err = run_sgemm(M, N, K, &time_secs, &gflops);
101 if(dgemm_err == -1) { /* out of memory for DSP offloading */
102 printf("Out of memory for (M,N,K) = (%d,%d,%d).\n", M,N,K);
103 }
104 else {
105 if (dgemm_err == 0){
106 fprintf(fp_time, "%6d,%6d,%6d\t%10.8e\n", M, N, K, time_secs);
107 }
108 else {
109 printf("Error in DGEMM tuning for (M,N,K)=(%d,%d,%d)!\n", M,N,K);
110 exit(0);
111 }
112 }
113 }
114 }
115 }
117 fclose(fp_time);
119 return 0;
120 }
123 int run_sgemm(int M, int N, int K, float *time, float *gflops)
124 {
125 int iter;
126 long long i;
127 float time_secs, total_time;
128 float operation_count = 2.0*(float)M*(float)N*(float)K;
129 float total_GFLOPS = 0.0f;
130 int err_code = 0;
132 total_time = 0.0;
133 for (iter = 0; iter < NUM_TEST_RUN; iter++)
134 {
135 /*-------------------------------------------------------------------------
136 * Allocate space for the matrices. The matrices that will be passed to
137 * the DSP are allocated using device memory. The Carm array is not passed
138 * to the dsp and so can use system memory.
139 *------------------------------------------------------------------------*/
140 float *A = (float *) __malloc_ddr((long long)M*(long long)K*(long long)sizeof(float));
141 float *B = (float *) __malloc_ddr((long long)K*(long long)N*(long long)sizeof(float));
142 float *Cdsp = (float *) __malloc_ddr((long long)M*(long long)N*(long long)sizeof(float));
143 float *Carm = (float *) malloc ((long long)M*(long long)N*(long long)sizeof(float));
145 if (!A || !B || !Cdsp || !Carm)
146 {
147 printf("Could not allocate enough space for the arrays!");
148 if(A) __free_ddr(A);
149 if(B) __free_ddr(B);
150 if(Cdsp) __free_ddr(Cdsp);
151 if(Carm) free(Carm);
153 return (-1);
154 }
156 /*-------------------------------------------------------------------------
157 * Initialize matrices
158 *------------------------------------------------------------------------*/
159 for (i = 0; i < (long long)M*K; ++i) A[i] = (float)rand()/RAND_MAX;// (float)(rand() % 5 + 1);
160 for (i = 0; i < (long long)K*N; ++i) B[i] = (float)rand()/RAND_MAX;// (float)(rand() % 5 + 1);
161 for (i = 0; i < (long long)M*N; ++i) Carm[i] = Cdsp[i] = 0;
163 int lda = ((order == CblasColMajor && transA == CblasNoTrans) ||
164 (order == CblasRowMajor && transA == CblasTrans)) ? M : K;
166 int ldb = ((order == CblasColMajor && transB == CblasNoTrans) ||
167 (order == CblasRowMajor && transB == CblasTrans)) ? K : N;
169 int ldc = (order == CblasColMajor) ? M : N;
171 fflush(stdout);
173 /*------------------------------------------------------------------------
174 * Run and time dgemm
175 *-----------------------------------------------------------------------*/
176 tick();
177 cblas_dgemm(order,transA,transB,M,N,K,alpha,A,lda,B,ldb,beta,Cdsp,ldc);
178 time_secs = tock();
179 total_time += time_secs;
180 total_GFLOPS += operation_count/time_secs*1e-9;
181 /*
182 if(M==4096 && K==256 && N==16) {
183 FILE *file_a = fopen("mat_a.dat","w");
184 FILE *file_b = fopen("mat_b.dat","w");
185 FILE *file_c = fopen("mat_c.dat","w");
187 for(i=0; i < M*K; ++i) fprintf(file_a, "%1.10e\n",A[i]);
188 for(i=0; i < K*N; ++i) fprintf(file_b, "%1.10e\n",B[i]);
189 for(i=0; i < M*N; ++i) fprintf(file_c, "%1.10e\n",Cdsp[i]);
190 }
191 */
193 __free_ddr(A);
194 __free_ddr(B);
195 __free_ddr(Cdsp);
196 free(Carm);
197 }
199 *gflops = total_GFLOPS / (float)NUM_TEST_RUN;
200 *time = total_time / (float)NUM_TEST_RUN;
202 return err_code;
203 }