summary | shortlog | log | commit | commitdiff | tree
raw | patch | inline | side by side (parent: a155533)
raw | patch | inline | side by side (parent: a155533)
author | Jianzhong Xu <a0869574@ti.com> | |
Thu, 30 Apr 2015 21:38:53 +0000 (17:38 -0400) | ||
committer | Jianzhong Xu <a0869574@ti.com> | |
Thu, 30 Apr 2015 21:38:53 +0000 (17:38 -0400) |
examples/dgemm_bench/Makefile | [new file with mode: 0644] | patch | blob |
examples/dgemm_bench/main.c | [new file with mode: 0644] | patch | blob |
examples/dgemm_test/dgemm_test.c | patch | blob | history |
diff --git a/examples/dgemm_bench/Makefile b/examples/dgemm_bench/Makefile
--- /dev/null
@@ -0,0 +1,12 @@
+
+EXE = dgemm_bench
+
+# CC, CFLAGS and BLASLIB are not set here; they are expected to come from
+# ../make.inc.
+include ../make.inc
+
+# `run` names an action, not a file: without this declaration a file called
+# `run` in this directory would make the target look up to date.
+.PHONY: run
+
+# Link the benchmark executable from main.o and the BLAS library.
+$(EXE): main.o
+	$(CC) $(CFLAGS) main.o $(BLASLIB) -o $@
+
+# Build if needed, then launch the benchmark with TI_CBLAS_OFFLOAD=002
+# exported into its environment.
+run: $(EXE)
+	@echo "Benchmarking DGEMM"; \
+	export TI_CBLAS_OFFLOAD=002; \
+	./$(EXE);
diff --git a/examples/dgemm_bench/main.c b/examples/dgemm_bench/main.c
--- /dev/null
@@ -0,0 +1,219 @@
+/******************************************************************************
+ * Copyright (c) 2013-2015, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <time.h>
+#include <CL/cl.h>
+#include <CL/cl_ext.h>
+
+#include "cblas.h"
+
+#define GEMM_MATRIX_SIZE_START 1024
+#define NUM_MATRIX_SIZE_TO_BENCHMARK 2
+#define NUM_TESTS (NUM_MATRIX_SIZE_TO_BENCHMARK*NUM_MATRIX_SIZE_TO_BENCHMARK*NUM_MATRIX_SIZE_TO_BENCHMARK)
+#define HAS_MEMORY 1
+#define NO_MEMORY 0
+#define OFFLOAD 1
+#define NO_OFFLOAD 0
+
+#define NUM_TEST_RUN 5
+
+
+/*-----------------------------------------------------------------------------
+* Timing Setup
+*----------------------------------------------------------------------------*/
+struct timespec t0,t1;
+#define tick() clock_gettime(CLOCK_MONOTONIC, &t0);
+#define tock() (clock_gettime(CLOCK_MONOTONIC, &t1), \
+ t1.tv_sec - t0.tv_sec + (t1.tv_nsec - t0.tv_nsec) / 1e9)
+
+/*-----------------------------------------------------------------------------
+* Global Variables
+*----------------------------------------------------------------------------*/
+double alpha = 0.7;
+double beta = 0.3;
+enum CBLAS_ORDER order = CblasColMajor;
+//enum CBLAS_ORDER order = CblasRowMajor;
+enum CBLAS_TRANSPOSE transA = CblasNoTrans;
+enum CBLAS_TRANSPOSE transB = CblasNoTrans;
+
+extern int TI_CBLAS_L3_OFFLOAD;
+/*-----------------------------------------------------------------------------
+* Prototypes
+*----------------------------------------------------------------------------*/
+int run_dgemm(int M, int N, int K, float *time, float *gflops);
+
+/* reference GFLOPS based on 1GHz K2H device */
+float dgemm_gflops_ref[NUM_TESTS] =
+{21.6774,21.9383,22.3325,22.7754,22.6200,23.0515,23.3946,23.6324};
+
+/*-----------------------------------------------------------------------------
+* MAIN
+*----------------------------------------------------------------------------*/
+/**
+ * Benchmark driver: times DGEMM over a sweep of matrix sizes and checks the
+ * measured GFLOPS against the reference table dgemm_gflops_ref.
+ *
+ * M, N and K each take NUM_MATRIX_SIZE_TO_BENCHMARK values, starting at
+ * GEMM_MATRIX_SIZE_START and doubling at every step.  Each reference entry is
+ * scaled by the device clock (converted to GHz) reported by OpenCL; a
+ * measurement more than a factor of 1.1 above or below the scaled reference
+ * fails the run.  Per-size averages are written to dgemm_time.dat and
+ * dgemm_gflops.dat.
+ *
+ * @return 0 when every measured size is within tolerance; EXIT_FAILURE when
+ *         an output file cannot be opened, an OpenCL query fails, or a
+ *         measurement deviates from its reference.
+ */
+int main(void)
+{
+    int dgemm_err;
+    int M, N, K, m, n, k, test_idx;
+    int status = EXIT_FAILURE;
+    float time_secs = 0.0f, gflops = 0.0f, gflops_ref, cpu_freq_GHz;
+    cl_platform_id platform;
+    cl_uint num_platforms;
+    cl_device_id devices;
+    cl_uint num_devices;
+    cl_uint cpu_freq;
+    size_t cpu_freq_size;
+    FILE *fp_time, *fp_gflops;
+
+    fp_time = fopen("dgemm_time.dat","w");
+    fp_gflops = fopen("dgemm_gflops.dat","w");
+    if(fp_time == NULL || fp_gflops == NULL) {
+        printf("Error opening output files.\n");
+        goto cleanup;
+    }
+
+    if(clGetPlatformIDs(1, &platform, &num_platforms) != CL_SUCCESS) {
+        printf("Error in clGetPlatformIDs.\n");
+        goto cleanup;
+    }
+
+    if(clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &devices, &num_devices) != CL_SUCCESS) {
+        printf("Error in clGetDeviceIDs.\n");
+        goto cleanup;
+    }
+    if(clGetDeviceInfo(devices, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(cl_uint), (void *)&cpu_freq, &cpu_freq_size) != CL_SUCCESS) {
+        printf("Error in clGetDeviceInfo.\n");
+        goto cleanup;
+    }
+    cpu_freq_GHz = (float)cpu_freq/1e3; /* convert from MHz to GHz */
+    printf("CPU frequency is %f GHz.\n", cpu_freq_GHz);
+
+    srand(12345);
+
+    /* setting up TI CBLAS during first call; its timing is not reported */
+    (void)run_dgemm(1000, 1000, 1000, &time_secs, &gflops);
+
+    /* sweep M, K, and N */
+    test_idx = 0;
+    for (M=GEMM_MATRIX_SIZE_START,m=0; m<NUM_MATRIX_SIZE_TO_BENCHMARK; m++,M*=2)
+    {
+        for (N=GEMM_MATRIX_SIZE_START,n=0; n<NUM_MATRIX_SIZE_TO_BENCHMARK; n++,N*=2)
+        {
+            for (K=GEMM_MATRIX_SIZE_START,k=0; k<NUM_MATRIX_SIZE_TO_BENCHMARK; k++,K*=2)
+            {
+                printf("Running DGEMM for (M,N,K) = (%d,%d,%d).\t", M,N,K);
+
+                dgemm_err = run_dgemm(M, N, K, &time_secs, &gflops);
+
+                /* consume one reference entry per size, measured or not,
+                 * so the table stays aligned with the sweep */
+                gflops_ref = dgemm_gflops_ref[test_idx++]; /* read reference GFLOPS */
+
+                /* Check for failure before looking at gflops: nothing was
+                 * measured for this size, so there is nothing to compare. */
+                if(dgemm_err == -1) { /* out of memory for DSP offloading */
+                    printf("Out of memory for (M,N,K) = (%d,%d,%d).\n", M,N,K);
+                    continue;
+                }
+
+                gflops_ref = gflops_ref * cpu_freq_GHz; /* scale ref GFLOPS by CPU freq */
+                printf("Measured %f GFLOPS, reference %f GFLOPS.\n", gflops, gflops_ref);
+                if((gflops > gflops_ref*1.1) || (gflops < gflops_ref/1.1)) {
+                    printf("DGEMM test FAILED! GFLOPS deviates from reference unacceptably.\n");
+                    goto cleanup;
+                }
+
+                fprintf(fp_time, "%6d\t%6d\t%6d\t%10.8e\n", M, N, K, time_secs);
+                fprintf(fp_gflops, "%6d\t%6d\t%6d\t%10.8e\n", M, N, K, gflops);
+            }
+        }
+    }
+
+    status = EXIT_SUCCESS;
+    printf("Passed.\n");
+
+cleanup:
+    /* single exit path: close whichever output files were opened */
+    if(fp_time != NULL) fclose(fp_time);
+    if(fp_gflops != NULL) fclose(fp_gflops);
+
+    return status;
+}
+
+
+/**
+ * Time cblas_dgemm for one (M,N,K) problem size.
+ *
+ * The multiplication is repeated NUM_TEST_RUN times.  Before every
+ * repetition A (M*K elements) and B (K*N elements) are refilled with
+ * rand()/RAND_MAX values and C (M*N elements) is cleared.  Only the
+ * cblas_dgemm call itself lies between tick() and tock().
+ *
+ * @param M,N,K   matrix dimensions passed to cblas_dgemm
+ * @param time    out: mean seconds per call; set to 0 on failure
+ * @param gflops  out: mean of the per-run GFLOPS figures, counting
+ *                2*M*N*K operations per call; set to 0 on failure
+ * @return 0 on success, -1 when any of the three buffers cannot be allocated
+ */
+int run_dgemm(int M, int N, int K, float *time, float *gflops)
+{
+    int iter;
+    long long i;
+    double time_secs;
+    double total_time = 0.0;
+    double total_GFLOPS = 0.0;
+    const double operation_count = 2.0*(double)M*(double)N*(double)K;
+    const long long a_elems = (long long)M*(long long)K;
+    const long long b_elems = (long long)K*(long long)N;
+    const long long c_elems = (long long)M*(long long)N;
+
+    /* Leading dimensions depend only on the global layout settings and the
+     * problem size, so they are computed once rather than per repetition. */
+    const int lda = ((order == CblasColMajor && transA == CblasNoTrans) ||
+                     (order == CblasRowMajor && transA == CblasTrans)) ? M : K;
+    const int ldb = ((order == CblasColMajor && transB == CblasNoTrans) ||
+                     (order == CblasRowMajor && transB == CblasTrans)) ? K : N;
+    const int ldc = (order == CblasColMajor) ? M : N;
+
+    /*-------------------------------------------------------------------------
+    * Allocate space for the matrices.
+    *------------------------------------------------------------------------*/
+    double *A = (double *) __malloc_ddr(a_elems*(long long)sizeof(double));
+    double *B = (double *) __malloc_ddr(b_elems*(long long)sizeof(double));
+    double *C = (double *) __malloc_ddr(c_elems*(long long)sizeof(double));
+
+    if (!A || !B || !C)
+    {
+        printf("Could not allocate enough space for the arrays!\n");
+        if(A) __free_ddr(A);
+        if(B) __free_ddr(B);
+        if(C) __free_ddr(C);
+
+        /* never hand stale or uninitialized results back to the caller */
+        *time = 0.0f;
+        *gflops = 0.0f;
+
+        return (-1);
+    }
+
+    for (iter = 0; iter < NUM_TEST_RUN; iter++)
+    {
+        /*----------------------------------------------------------------------
+        * Initialize matrices
+        *---------------------------------------------------------------------*/
+        for (i = 0; i < a_elems; ++i) A[i] = (double)rand()/RAND_MAX;
+        for (i = 0; i < b_elems; ++i) B[i] = (double)rand()/RAND_MAX;
+        for (i = 0; i < c_elems; ++i) C[i] = 0;
+
+        /* push out any pending console output before the timed section */
+        fflush(stdout);
+
+        /*------------------------------------------------------------------------
+        * Run and time dgemm
+        *-----------------------------------------------------------------------*/
+        tick();
+        cblas_dgemm(order,transA,transB,M,N,K,alpha,A,lda,B,ldb,beta,C,ldc);
+        time_secs = tock();
+        total_time += time_secs;
+        total_GFLOPS += operation_count/time_secs*1e-9;
+    }
+
+    __free_ddr(A);
+    __free_ddr(B);
+    __free_ddr(C);
+
+    *gflops = total_GFLOPS / (double)NUM_TEST_RUN;
+    *time = total_time / (double)NUM_TEST_RUN;
+
+    return 0;
+}
index bf52b835e6c5abf1535cef1e028ae9a425572f68..5ea020a8b258a2d097e3090c587a86fe9ccc9f5d 100644 (file)
#include <stdio.h>
#include <math.h>
#include <time.h>
-#include <CL/cl.h>
-#include <CL/cl_ext.h>
#include "cblas.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+#include "cblas.h"
+#ifdef __cplusplus
+}
+#endif
-#define GEMM_MATRIX_SIZE_START 1024
-#define NUM_MATRIX_SIZE_TO_BENCHMARK 2
-#define NUM_TESTS (NUM_MATRIX_SIZE_TO_BENCHMARK*NUM_MATRIX_SIZE_TO_BENCHMARK*NUM_MATRIX_SIZE_TO_BENCHMARK)
+#define TUNING_START_SIZE_RECTAN_MATRIX 128
+#define NUM_MATRIX_SIZE_TO_BENCHMARK 4
#define HAS_MEMORY 1
#define NO_MEMORY 0
#define OFFLOAD 1
#define NO_OFFLOAD 0
-#define NUM_TEST_RUN 5
+#define NUM_TEST_RUN 1
/*-----------------------------------------------------------------------------
*----------------------------------------------------------------------------*/
int run_dgemm(int M, int N, int K, float *time, float *gflops);
-/* reference GFLOPS based on 1GHz K2H device */
-float dgemm_gflops_ref[NUM_TESTS] =
-{21.6774,21.9383,22.3325,22.7754,22.6200,23.0515,23.3946,23.6324};
-
/*-----------------------------------------------------------------------------
* MAIN
*----------------------------------------------------------------------------*/
int main()
{
int num_size, dgemm_err;
- int M, N, K, m, n, k, test_idx;
- float time_secs, gflops, gflops_ref, cpu_freq_GHz;
- cl_platform_id platform;
- cl_uint num_platforms;
- cl_device_id devices;
- cl_uint num_devices;
- cl_uint cpu_freq;
- size_t cpu_freq_size;
+ int M, N, K, m, n, k;
+ float time_secs, gflops;
FILE *fp_time, *fp_gflops;
fp_time = fopen("dgemm_time.dat","w");
fp_gflops = fopen("dgemm_gflops.dat","w");
- if(clGetPlatformIDs(1, &platform, &num_platforms) != CL_SUCCESS) {
- printf("Error in clGetPlatformIDs\n.");
- exit(0);
- }
-
- if(clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &devices, &num_devices) != CL_SUCCESS) {
- printf("Error in clGetDeviceIDs\n.");
- exit(0);
- }
- if(clGetDeviceInfo(devices, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(cl_uint), (void *)&cpu_freq, &cpu_freq_size) != CL_SUCCESS) {
- printf("Error in clGetDeviceInfo\n.");
- exit(0);
- }
- cpu_freq_GHz = (float)cpu_freq/1e3; /* convert from MHz to GHz */
- printf("Found %d devices.\n", num_devices);
- printf("CPU frequency is %f GHz.\n", cpu_freq_GHz);
-
srand(12345);
/* setting up TI CBLAS during first call */
run_dgemm(1000, 1000, 1000, &time_secs, &gflops);
/* sweep M, K, and N */
- test_idx = 0;
- for (M=GEMM_MATRIX_SIZE_START,m=0; m<NUM_MATRIX_SIZE_TO_BENCHMARK; m++,M*=2)
+ for (M=TUNING_START_SIZE_RECTAN_MATRIX,m=0; m<NUM_MATRIX_SIZE_TO_BENCHMARK; m++,M*=2)
{
- for (N=GEMM_MATRIX_SIZE_START,n=0; n<NUM_MATRIX_SIZE_TO_BENCHMARK; n++,N*=2)
+ for (N=TUNING_START_SIZE_RECTAN_MATRIX,n=0; n<NUM_MATRIX_SIZE_TO_BENCHMARK; n++,N*=2)
{
- for (K=GEMM_MATRIX_SIZE_START,k=0; k<NUM_MATRIX_SIZE_TO_BENCHMARK; k++,K*=2)
+ for (K=TUNING_START_SIZE_RECTAN_MATRIX,k=0; k<NUM_MATRIX_SIZE_TO_BENCHMARK; k++,K*=2)
{
- printf("Running DGEMM for (M,N,K) = (%d,%d,%d).\t", M,N,K);
+ printf("Running DGEMM for (M,N,K) = (%d,%d,%d).\n", M,N,K);
dgemm_err = run_dgemm(M, N, K, &time_secs, &gflops);
-
- gflops_ref = dgemm_gflops_ref[test_idx++]; /* read reference GFLOPS */
- gflops_ref = gflops_ref * cpu_freq_GHz; /* scale ref GFLOPS by CPU freq */
- printf("Measured %f GFLOPS, reference %f GFLOPS.\n", gflops, gflops_ref);
- if((gflops > gflops_ref*1.1) || (gflops < gflops_ref/1.1)) {
- printf("DGEMM test FAILED! GFLOPS deviates from reference unacceptably.");
- exit(0);
- }
if(dgemm_err == -1) { /* out of memory for DSP offloading */
printf("Out of memory for (M,N,K) = (%d,%d,%d).\n", M,N,K);
fclose(fp_time);
fclose(fp_gflops);
- printf("PASSED.\n");
return 0;
}
double total_GFLOPS = 0.0f;
int err_code = 0;
- /*-------------------------------------------------------------------------
- * Allocate space for the matrices.
- *------------------------------------------------------------------------*/
- double *A = (double *) __malloc_ddr((long long)M*(long long)K*(long long)sizeof(double));
- double *B = (double *) __malloc_ddr((long long)K*(long long)N*(long long)sizeof(double));
- double *C = (double *) __malloc_ddr((long long)M*(long long)N*(long long)sizeof(double));
-
- if (!A || !B || !C)
- {
- printf("Could not allocate enough space for the arrays!");
- if(A) __free_ddr(A);
- if(B) __free_ddr(B);
- if(C) __free_ddr(C);
-
- return (-1);
- }
-
total_time = 0.0;
for (iter = 0; iter < NUM_TEST_RUN; iter++)
- {
- /*----------------------------------------------------------------------
- * Initialize matrices
- *---------------------------------------------------------------------*/
- for (i = 0; i < (long long)M*K; ++i) A[i] = (double)rand()/RAND_MAX;
- for (i = 0; i < (long long)K*N; ++i) B[i] = (double)rand()/RAND_MAX;
- for (i = 0; i < (long long)M*N; ++i) C[i] = 0;
-
- int lda = ((order == CblasColMajor && transA == CblasNoTrans) ||
- (order == CblasRowMajor && transA == CblasTrans)) ? M : K;
-
- int ldb = ((order == CblasColMajor && transB == CblasNoTrans) ||
- (order == CblasRowMajor && transB == CblasTrans)) ? K : N;
-
- int ldc = (order == CblasColMajor) ? M : N;
-
- fflush(stdout);
-
- /*------------------------------------------------------------------------
- * Run and time dgemm
- *-----------------------------------------------------------------------*/
- tick();
- cblas_dgemm(order,transA,transB,M,N,K,alpha,A,lda,B,ldb,beta,C,ldc);
- time_secs = tock();
- total_time += time_secs;
- total_GFLOPS += operation_count/time_secs*1e-9;
+ {
+ /*-------------------------------------------------------------------------
+ * Allocate space for the matrices.
+ *------------------------------------------------------------------------*/
+ double *A = (double *) __malloc_ddr((long long)M*(long long)K*(long long)sizeof(double));
+ double *B = (double *) __malloc_ddr((long long)K*(long long)N*(long long)sizeof(double));
+ double *C = (double *) __malloc_ddr((long long)M*(long long)N*(long long)sizeof(double));
+
+ if (!A || !B || !C)
+ {
+ printf("Could not allocate enough space for the arrays!");
+ if(A) __free_ddr(A);
+ if(B) __free_ddr(B);
+ if(C) __free_ddr(C);
+
+ return (-1);
+ }
+
+ /*-------------------------------------------------------------------------
+ * Initialize matrices
+ *------------------------------------------------------------------------*/
+ for (i = 0; i < (long long)M*K; ++i) A[i] = (double)rand()/RAND_MAX;// (double)(rand() % 5 + 1);
+ for (i = 0; i < (long long)K*N; ++i) B[i] = (double)rand()/RAND_MAX;// (double)(rand() % 5 + 1);
+ for (i = 0; i < (long long)M*N; ++i) C[i] = 0;
+
+ int lda = ((order == CblasColMajor && transA == CblasNoTrans) ||
+ (order == CblasRowMajor && transA == CblasTrans)) ? M : K;
+
+ int ldb = ((order == CblasColMajor && transB == CblasNoTrans) ||
+ (order == CblasRowMajor && transB == CblasTrans)) ? K : N;
+
+ int ldc = (order == CblasColMajor) ? M : N;
+
+ fflush(stdout);
+
+ /*------------------------------------------------------------------------
+ * Run and time dgemm
+ *-----------------------------------------------------------------------*/
+ tick();
+ cblas_dgemm(order,transA,transB,M,N,K,alpha,A,lda,B,ldb,beta,C,ldc);
+ time_secs = tock();
+ total_time += time_secs;
+ total_GFLOPS += operation_count/time_secs*1e-9;
+/*
+ if(M==4096 && K==256 && N==16) {
+ FILE *file_a = fopen("mat_a.dat","w");
+ FILE *file_b = fopen("mat_b.dat","w");
+ FILE *file_c = fopen("mat_c.dat","w");
+
+ for(i=0; i < M*K; ++i) fprintf(file_a, "%1.10e\n",A[i]);
+ for(i=0; i < K*N; ++i) fprintf(file_b, "%1.10e\n",B[i]);
+ for(i=0; i < M*N; ++i) fprintf(file_c, "%1.10e\n",C[i]);
+ }
+*/
+
+ __free_ddr(A);
+ __free_ddr(B);
+ __free_ddr(C);
}
-
- __free_ddr(A);
- __free_ddr(B);
- __free_ddr(C);
*gflops = total_GFLOPS / (double)NUM_TEST_RUN;
*time = total_time / (double)NUM_TEST_RUN;