summary | shortlog | log | commit | commitdiff | tree
raw | patch | inline | side by side (parent: 9a17910)
raw | patch | inline | side by side (parent: 9a17910)
author | Jianzhong Xu <a0869574@ti.com> | |
Fri, 17 Apr 2015 15:38:55 +0000 (11:38 -0400) | ||
committer | Jianzhong Xu <a0869574@ti.com> | |
Fri, 17 Apr 2015 15:38:55 +0000 (11:38 -0400) |
61 files changed:
diff --git a/examples/cgemm_tune/Makefile b/examples/cgemm_tune/Makefile
+++ /dev/null
@@ -1,10 +0,0 @@
-
-EXE = cgemm_tune
-
-include ../make.inc
-
-$(EXE): cgemm_tune.o
- $(CC) $(CFLAGS) cgemm_tune.o $(BLASLIB) -o $@
-
-tune: $(EXE)
- ./$(EXE);
\ No newline at end of file
diff --git a/examples/dgemm_tune/Makefile b/examples/dgemm_tune/Makefile
+++ /dev/null
@@ -1,10 +0,0 @@
-
-EXE = dgemm_tune
-
-include ../make.inc
-
-$(EXE): dgemm_tune.o
- $(CC) $(CFLAGS) dgemm_tune.o $(BLASLIB) -o $@
-
-tune: $(EXE)
- ./$(EXE);
\ No newline at end of file
diff --git a/examples/sgemm_tune/Makefile b/examples/sgemm_tune/Makefile
+++ /dev/null
@@ -1,10 +0,0 @@
-
-EXE = sgemm_tune
-
-include ../make.inc
-
-$(EXE): sgemm_tune.o
- $(CC) $(CFLAGS) sgemm_tune.o $(BLASLIB) -o $@
-
-tune: $(EXE)
- ./$(EXE);
\ No newline at end of file
diff --git a/examples/tuning/cgemm_test/Makefile b/examples/tuning/cgemm_test/Makefile
+++ /dev/null
@@ -1,10 +0,0 @@
-
-EXE = cgemm_test
-
-include ../make.inc
-
-$(EXE): cgemm_test.o
- $(CC) $(CFLAGS) cgemm_test.o $(BLASLIB) -o $@
-
-tune: $(EXE)
- ./$(EXE);
\ No newline at end of file
diff --git a/examples/tuning/cgemm_test/cgemm_test.c b/examples/tuning/cgemm_test/cgemm_test.c
+++ /dev/null
@@ -1,235 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * * Neither the name of Texas Instruments Incorporated nor the
- * names of its contributors may be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
- * THE POSSIBILITY OF SUCH DAMAGE.
- *****************************************************************************/
-#include <stdlib.h>
-#include <stdio.h>
-#include <math.h>
-#include <time.h>
-#include <complex.h>
-
-#include "cblas.h"
-#ifdef __cplusplus
-extern "C" {
-#endif
-#include "cblas.h"
-#ifdef __cplusplus
-}
-#endif
-
-#define TUNING_START_SIZE_SQUARE_MATRIX 16
-#define TUNING_START_SIZE_RECTAN_MATRIX 8
-#define NUM_MATRIX_SIZE_TO_BENCHMARK 16
-#define HAS_MEMORY 1
-#define NO_MEMORY 0
-#define OFFLOAD 1
-#define NO_OFFLOAD 0
-
-#define NUM_TEST_RUN 5
-
-/*-----------------------------------------------------------------------------
-* Timing Setup
-*----------------------------------------------------------------------------*/
-struct timespec t0,t1;
-#define tick() clock_gettime(CLOCK_MONOTONIC, &t0);
-#define tock() (clock_gettime(CLOCK_MONOTONIC, &t1), \
- t1.tv_sec - t0.tv_sec + (t1.tv_nsec - t0.tv_nsec) / 1e9)
-
-/*-----------------------------------------------------------------------------
-* Global Variables
-*----------------------------------------------------------------------------*/
-float complex alpha = 0.7 - 0.3*I;
-float complex beta = 0.4 + 0.6*I;
-enum CBLAS_ORDER order = CblasColMajor;
-enum CBLAS_TRANSPOSE transA = CblasNoTrans;
-enum CBLAS_TRANSPOSE transB = CblasNoTrans;
-
-extern int TI_CBLAS_L3_OFFLOAD;
-
-/*-----------------------------------------------------------------------------
-* Prototypes
-*----------------------------------------------------------------------------*/
-int check_results(const float complex *C1, const float complex *C2, int M, int N);
-int run_cgemm(int M, int K, int N, float *time, float *gflops);
-
-/*-----------------------------------------------------------------------------
-* MAIN
-*----------------------------------------------------------------------------*/
-int main()
-{
- int num_size, cgemm_err;
- int M, N, K, m, n, k;
- float time_secs, gflops;
- FILE *fp_time;
-
- fp_time = fopen("cgemm_time_ARMvsDSP.dat","w");
-
- srand(12345);
-
- /* sweep M, K, and N */
- for (M=TUNING_START_SIZE_RECTAN_MATRIX,m=0; m<NUM_MATRIX_SIZE_TO_BENCHMARK; m++,M*=2)
- {
- for (N=TUNING_START_SIZE_RECTAN_MATRIX,n=0; n<NUM_MATRIX_SIZE_TO_BENCHMARK; n++,N*=2)
- {
- for (K=TUNING_START_SIZE_RECTAN_MATRIX,k=0; k<NUM_MATRIX_SIZE_TO_BENCHMARK; k++,K*=2)
- {
- printf("Running CGEMM for (M,N,K)=(%d,%d,%d), (m,n,k)=(%d,%d,%d).\n", M,N,K,m,n,k);
- cgemm_err = run_cgemm(M, N, K, &time_secs, &gflops);
-
- if(cgemm_err == -1) { /* out of memory for DSP offloading */
- printf("Out of memory, skipping next point.\n");
- }
- else {
- if (cgemm_err == 0){
- fprintf(fp_time, "%6d,%6d,%6d\t%10.8e\n", M, N, K, time_secs);
- }
- else {
- printf("Error in CGEMM tuning for (M,N,K)=(%d,%d,%d)!\n", M,N,K);
- exit(0);
- }
- }
- }
- }
- }
-
- fclose(fp_time);
-
- return 0;
-}
-
-
-int run_cgemm(int M, int N, int K, float *time, float *gflops)
-{
- long long i;
- int iter;
- float time_secs, total_time;
- float gflops_ARM, gflops_DSP;
- float operation_count = 2.0*(float)M*(float)N*(float)K;
- float total_GFLOPS = 0.0f;
- int err_code = 0;
-
- total_time = 0.0;
- for (iter = 0; iter < NUM_TEST_RUN; iter++)
- {
- /*-------------------------------------------------------------------------
- * Allocate space for the matrices. The matrices that will be passed to
- * the DSP are allocated using device memory. The Carm array is not passed
- * to the dsp and so can use system memory.
- *------------------------------------------------------------------------*/
- float complex *A = (float complex*) __malloc_ddr(M*K*sizeof(float complex));
- float complex *B = (float complex*) __malloc_ddr(K*N*sizeof(float complex));
- float complex *Cdsp = (float complex*) __malloc_ddr(M*N*sizeof(float complex));
- float complex *Carm = (float complex*) malloc (M*N*sizeof(float complex));
-
- if (!A || !B || !Cdsp || !Carm)
- {
- printf("Could not allocate enough space for the arrays!");
- if(A) __free_ddr(A);
- if(B) __free_ddr(B);
- if(Cdsp) __free_ddr(Cdsp);
- if(Carm) free(Carm);
-
- return (-1);
- }
-
- /*-------------------------------------------------------------------------
- * Initialize matrices and print if small enough.
- *------------------------------------------------------------------------*/
- for (i = 0; i < M*K; ++i)
- {
- A[i] = (float)rand()/RAND_MAX + (float)rand()/RAND_MAX * I;
- }
- for (i = 0; i < K*N; ++i)
- {
- B[i] = (float)rand()/RAND_MAX + (float)rand()/RAND_MAX * I;
- }
- for (i = 0; i < M*N; ++i)
- {
- Carm[i] = Cdsp[i] = (float)rand()/RAND_MAX + (float)rand()/RAND_MAX * I;
- }
-
- int lda = ((order == CblasColMajor && transA == CblasNoTrans) ||
- (order == CblasRowMajor && transA == CblasTrans)) ? M : K;
-
- int ldb = ((order == CblasColMajor && transB == CblasNoTrans) ||
- (order == CblasRowMajor && transB == CblasTrans)) ? K : N;
-
- int ldc = (order == CblasColMajor) ? M : N;
-
- /*------------------------------------------------------------------------
- * Run and time cgemm
- *-----------------------------------------------------------------------*/
- tick();
- cblas_cgemm(order,transA,transB,M,N,K,&alpha,A,lda,B,ldb,&beta,Cdsp,ldc);
- time_secs = tock();
- total_time += time_secs;
- total_GFLOPS += operation_count/time_secs*1e-9;
-
- __free_ddr(A);
- __free_ddr(B);
- __free_ddr(Cdsp);
- free(Carm);
- }
-
- *gflops = total_GFLOPS / (float)NUM_TEST_RUN;
- *time = total_time / (float)NUM_TEST_RUN;
-
- return err_code;
-}
-
-/*-----------------------------------------------------------------------------
-* check_results
-*----------------------------------------------------------------------------*/
-int check_results(const float complex *C1, const float complex *C2, int M, int N)
-{
- int i;
- const float EPISILON = 1e-5;
- //const float EPISILON = 1e-200;
- const int NERRORS = 5;
- int num_errors = 0;
-
- for (i=0; i<M*N; i++)
- {
- float delta = cabs(C1[i]) - cabs(C2[i]);
-
- if (delta > EPISILON*cabs(C1[i]))
- if ((num_errors += 1) < NERRORS)
- printf("Error [elem:%d]: %f <==> %f\n", i, cabs(C1[i]), cabs(C2[i]));
- }
-
- if (num_errors > 0)
- {
- printf("FAIL with %d errors!\n", num_errors);
- return -1;
- }
- else
- {
- //printf("PASS!\n");
- return 0;
- }
-}
-
-
diff --git a/examples/tuning/cgemm_tune/Makefile b/examples/tuning/cgemm_tune/Makefile
+++ /dev/null
@@ -1,10 +0,0 @@
-
-EXE = cgemm_tune
-
-include ../make.inc
-
-$(EXE): cgemm_tune.o
- $(CC) $(CFLAGS) cgemm_tune.o $(BLASLIB) -o $@
-
-tune: $(EXE)
- ./$(EXE);
\ No newline at end of file
diff --git a/examples/tuning/dgemm_test/Makefile b/examples/tuning/dgemm_test/Makefile
+++ /dev/null
@@ -1,8 +0,0 @@
-
-EXE = dgemm_test
-
-include ../make.inc
-
-$(EXE): dgemm_test.o
- $(CC) $(CFLAGS) dgemm_test.o $(BLASLIB) -o $@
-
diff --git a/examples/tuning/dgemm_test/dgemm_tbl.m b/examples/tuning/dgemm_test/dgemm_tbl.m
+++ /dev/null
@@ -1,256 +0,0 @@
-tbl_dgemm=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
-0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
-0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
-0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
-0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
-0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
-0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
-0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1
-0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1
-0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
-0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
-0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
-0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
-0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
-0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
-0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
-0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1
-0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1
-0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
-0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
-0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
-0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
-0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
-0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
-0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1
-0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1
-0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1
-0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1
-0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1
-0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1
-0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1
-0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1
-0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1
-0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
-0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
-0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
-0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
-0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
-0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1
-0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1
-0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1
-0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
-0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1
-0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1
-0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1
-0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1
-0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1
-0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1
-0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1
-0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
-0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
-0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
-0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1
-0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1
-0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1
-0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
-0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1
-0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1
-0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1
-0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1
-0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1
-0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1
-0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1
-0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
-0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
-0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1
-0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1
-0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1
-0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1
-0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1
-0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1
-0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1
-0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1
-0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1
-0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1
-0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1];
\ No newline at end of file
diff --git a/examples/tuning/dgemm_test/dgemm_test.c b/examples/tuning/dgemm_test/dgemm_test.c
+++ /dev/null
@@ -1,208 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * * Neither the name of Texas Instruments Incorporated nor the
- * names of its contributors may be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
- * THE POSSIBILITY OF SUCH DAMAGE.
- *****************************************************************************/
-#include <stdlib.h>
-#include <stdio.h>
-#include <math.h>
-#include <time.h>
-
-#include "cblas.h"
-#ifdef __cplusplus
-extern "C" {
-#endif
-#include "cblas.h"
-#ifdef __cplusplus
-}
-#endif
-
-#define TUNING_START_SIZE_RECTAN_MATRIX 50
-#define NUM_MATRIX_SIZE_TO_BENCHMARK 6
-#define HAS_MEMORY 1
-#define NO_MEMORY 0
-#define OFFLOAD 1
-#define NO_OFFLOAD 0
-
-#define NUM_TEST_RUN 1
-
-
-/*-----------------------------------------------------------------------------
-* Timing Setup
-*----------------------------------------------------------------------------*/
-struct timespec t0,t1;
-#define tick() clock_gettime(CLOCK_MONOTONIC, &t0);
-#define tock() (clock_gettime(CLOCK_MONOTONIC, &t1), \
- t1.tv_sec - t0.tv_sec + (t1.tv_nsec - t0.tv_nsec) / 1e9)
-
-/*-----------------------------------------------------------------------------
-* Global Variables
-*----------------------------------------------------------------------------*/
-double alpha = 0.7;
-double beta = 0.3;
-enum CBLAS_ORDER order = CblasColMajor;
-//enum CBLAS_ORDER order = CblasRowMajor;
-enum CBLAS_TRANSPOSE transA = CblasNoTrans;
-enum CBLAS_TRANSPOSE transB = CblasNoTrans;
-
-extern int TI_CBLAS_L3_OFFLOAD;
-/*-----------------------------------------------------------------------------
-* Prototypes
-*----------------------------------------------------------------------------*/
-int run_dgemm(int M, int N, int K, float *time, float *gflops);
-
-/*-----------------------------------------------------------------------------
-* MAIN
-*----------------------------------------------------------------------------*/
-int main()
-{
- int num_size, dgemm_err;
- int M, N, K, m, n, k;
- int M_pre, N_pre, K_pre, M_start_size, N_start_size;
- float time_secs_arm, gflops_arm, time_secs_dsp, gflops_dsp, time_secs_opt, gflops_opt;
- FILE *fp_time, *fp_flops;
-
- fp_time = fopen("dgemm_time.dat","w");
- fp_flops = fopen("dgemm_flops.dat","w");
-
- srand(12345);
-
- /* setting up TI CBLAS during first call */
- run_dgemm(100, 100, 100, &time_secs_arm, &gflops_arm);
-
- /* sweep M, K, and N */
- for (M=TUNING_START_SIZE_RECTAN_MATRIX,m=0; m<NUM_MATRIX_SIZE_TO_BENCHMARK; m++,M*=2)
- {
- for (N=TUNING_START_SIZE_RECTAN_MATRIX,n=0; n<NUM_MATRIX_SIZE_TO_BENCHMARK; n++,N*=2)
- {
- for (K=TUNING_START_SIZE_RECTAN_MATRIX,k=0; k<NUM_MATRIX_SIZE_TO_BENCHMARK; k++,K*=2)
- {
- printf("Running DGEMM for (M,N,K) = (%d,%d,%d).\n", M,N,K);
-
- TI_CBLAS_L3_OFFLOAD = 0;
- dgemm_err = run_dgemm(M, N, K, &time_secs_arm, &gflops_arm);
-
- if(dgemm_err == -1) { /* out of memory for DSP offloading */
- printf("Out of memory for (M,N,K) = (%d,%d,%d).\n", M,N,K);
- }
- else {
- TI_CBLAS_L3_OFFLOAD = 1;
- dgemm_err = run_dgemm(M, N, K, &time_secs_dsp, &gflops_dsp);
-
- TI_CBLAS_L3_OFFLOAD = 2;
- dgemm_err = run_dgemm(M, N, K, &time_secs_opt, &gflops_opt);
-
- fprintf(fp_time, "%6d\t%6d\t%6d\t%10.8e\t%10.8e\t%10.8e\n",
- M, N, K, time_secs_arm, time_secs_dsp, time_secs_opt);
- fprintf(fp_flops, "%6d\t%6d\t%6d\t%10.8e\t%10.8e\t%10.8e\n",
- M, N, K, gflops_arm, gflops_dsp, gflops_opt);
- }
- }
- }
- }
-
- fclose(fp_time);
- fclose(fp_flops);
-
- return 0;
-}
-
-
-int run_dgemm(int M, int N, int K, float *time, float *gflops)
-{
- int iter;
- long long i;
- double time_secs, total_time;
- double operation_count = 2.0*(double)M*(double)N*(double)K;
- double total_GFLOPS = 0.0f;
- int err_code = 0;
-
- total_time = 0.0;
- for (iter = 0; iter < NUM_TEST_RUN; iter++)
- {
- /*-------------------------------------------------------------------------
- * Allocate space for the matrices.
- *------------------------------------------------------------------------*/
- double *A = (double *) __malloc_ddr((long long)M*(long long)K*(long long)sizeof(double));
- double *B = (double *) __malloc_ddr((long long)K*(long long)N*(long long)sizeof(double));
- double *C = (double *) __malloc_ddr((long long)M*(long long)N*(long long)sizeof(double));
-
- if (!A || !B || !C)
- {
- printf("Could not allocate enough space for the arrays!");
- if(A) __free_ddr(A);
- if(B) __free_ddr(B);
- if(C) __free_ddr(C);
-
- return (-1);
- }
-
- /*-------------------------------------------------------------------------
- * Initialize matrices
- *------------------------------------------------------------------------*/
- for (i = 0; i < (long long)M*K; ++i) A[i] = (double)rand()/RAND_MAX;// (double)(rand() % 5 + 1);
- for (i = 0; i < (long long)K*N; ++i) B[i] = (double)rand()/RAND_MAX;// (double)(rand() % 5 + 1);
- for (i = 0; i < (long long)M*N; ++i) C[i] = 0;
-
- int lda = ((order == CblasColMajor && transA == CblasNoTrans) ||
- (order == CblasRowMajor && transA == CblasTrans)) ? M : K;
-
- int ldb = ((order == CblasColMajor && transB == CblasNoTrans) ||
- (order == CblasRowMajor && transB == CblasTrans)) ? K : N;
-
- int ldc = (order == CblasColMajor) ? M : N;
-
- fflush(stdout);
-
- /*------------------------------------------------------------------------
- * Run and time dgemm
- *-----------------------------------------------------------------------*/
- tick();
- cblas_dgemm(order,transA,transB,M,N,K,alpha,A,lda,B,ldb,beta,C,ldc);
- time_secs = tock();
- total_time += time_secs;
- total_GFLOPS += operation_count/time_secs*1e-9;
-/*
- if(M==4096 && K==256 && N==16) {
- FILE *file_a = fopen("mat_a.dat","w");
- FILE *file_b = fopen("mat_b.dat","w");
- FILE *file_c = fopen("mat_c.dat","w");
-
- for(i=0; i < M*K; ++i) fprintf(file_a, "%1.10e\n",A[i]);
- for(i=0; i < K*N; ++i) fprintf(file_b, "%1.10e\n",B[i]);
- for(i=0; i < M*N; ++i) fprintf(file_c, "%1.10e\n",C[i]);
- }
-*/
-
- __free_ddr(A);
- __free_ddr(B);
- __free_ddr(C);
- }
-
- *gflops = total_GFLOPS / (double)NUM_TEST_RUN;
- *time = total_time / (double)NUM_TEST_RUN;
-
- return err_code;
-}
diff --git a/examples/tuning/dgemm_test/log2_int.m b/examples/tuning/dgemm_test/log2_int.m
+++ /dev/null
@@ -1,11 +0,0 @@
-function log2_x = log2_int(x)
- log2_x = 0;
- x0 = x;
- while x > 1
- x = bitsrl(int32(x),1);
- log2_x = log2_x+1;
- end
- if bitand(x0,bitsll(1,log2_x-1)) > 0
- log2_x = log2_x+1;
- end
-end
diff --git a/examples/tuning/dgemm_test/tbl_lookup.m b/examples/tuning/dgemm_test/tbl_lookup.m
+++ /dev/null
@@ -1,35 +0,0 @@
-function offload_decision = tbl_lookup(tbl,M,K,N)
-
-size_min=8;
-size_max=262144;
-size_min_log2=3;
-num_pnt=16;
-
-if M>size_max
- M = size_max;
-end
-if M<size_min
- M = size_min;
-end
-
-if K>size_max
- K = size_max;
-end
-if K<size_min
- K = size_min;
-end
-
-if N>size_max
- N = size_max;
-end
-if N<size_min
- N = size_min;
-end
-
-M_log2 = log2_int(int32(M));
-N_log2 = log2_int(int32(N));
-K_log2 = log2_int(int32(K));
-
-offload_decision = tbl((M_log2-size_min_log2)*num_pnt^2+(K_log2-size_min_log2)*num_pnt+N_log2-size_min_log2+1);
-end
-
diff --git a/examples/tuning/dgemm_test/time_ana.m b/examples/tuning/dgemm_test/time_ana.m
+++ /dev/null
@@ -1,41 +0,0 @@
-dgemm_tbl;
-x=reshape(tbl_dgemm',256*16,1);
-
-test1=dgemmt;
-test2=dgemm_time;
-
-test1_t0= dgemmt(:,4);
-test1_t1= dgemmt(:,5);
-test1_t2= dgemmt(:,6);
-test1_M = dgemmt(:,1);
-test1_N = dgemmt(:,2);
-test1_K = dgemmt(:,3);
-
-test1_dec = zeros(length(test1_M),1);
-
-for i=1:length(test1_M)
- test1_dec(i) = tbl_lookup(x,test1_M(i),test1_N(i),test1_K(i));
-end
-
-dec_err = (test1_t1>test1_t0) == test1_dec;
-
-test1_err_abs = (min(test1_t0,test1_t1)-test1_t2) .* dec_err;
-test1_err_rel = (min(test1_t0,test1_t1)-test1_t2) .* dec_err ./ min(test1_t0,test1_t1);
-
-test2_t0= dgemm_time(:,4);
-test2_t1= dgemm_time(:,5);
-test2_t2= dgemm_time(:,6);
-test2_M = dgemm_time(:,1);
-test2_N = dgemm_time(:,2);
-test2_K = dgemm_time(:,3);
-
-test2_dec = zeros(length(test2_M),1);
-
-for i=1:length(test2_M)
- test2_dec(i) = tbl_lookup(x,test2_M(i),test2_N(i),test2_K(i));
-end
-
-dec_err = (test2_t1>test2_t0) == test2_dec;
-
-test2_err_abs = (min(test2_t0,test2_t1)-test2_t2) .* dec_err;
-test2_err_rel = (min(test2_t0,test2_t1)-test2_t2) .* dec_err ./ min(test2_t0,test2_t1);
diff --git a/examples/tuning/dgemm_tune/Makefile b/examples/tuning/dgemm_tune/Makefile
+++ /dev/null
@@ -1,10 +0,0 @@
-
-EXE = dgemm_tune
-
-include ../make.inc
-
-$(EXE): dgemm_tune.o
- $(CC) $(CFLAGS) dgemm_tune.o $(BLASLIB) -o $@
-
-tune: $(EXE)
- ./$(EXE);
\ No newline at end of file
diff --git a/examples/tuning/sgemm_test/Makefile b/examples/tuning/sgemm_test/Makefile
+++ /dev/null
@@ -1,8 +0,0 @@
-
-EXE = sgemm_test
-
-include ../make.inc
-
-$(EXE): sgemm_test.o
- $(CC) $(CFLAGS) sgemm_test.o $(BLASLIB) -o $@
-
diff --git a/examples/tuning/sgemm_test/sgemm_test.c b/examples/tuning/sgemm_test/sgemm_test.c
+++ /dev/null
@@ -1,203 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * * Neither the name of Texas Instruments Incorporated nor the
- * names of its contributors may be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
- * THE POSSIBILITY OF SUCH DAMAGE.
- *****************************************************************************/
-#include <stdlib.h>
-#include <stdio.h>
-#include <math.h>
-#include <time.h>
-
-#include "cblas.h"
-#ifdef __cplusplus
-extern "C" {
-#endif
-#include "cblas.h"
-#ifdef __cplusplus
-}
-#endif
-
-#define TUNING_START_SIZE_SQUARE_MATRIX 16
-#define TUNING_START_SIZE_RECTAN_MATRIX 8
-#define NUM_MATRIX_SIZE_TO_BENCHMARK 16
-#define HAS_MEMORY 1
-#define NO_MEMORY 0
-#define OFFLOAD 1
-#define NO_OFFLOAD 0
-
-#define NUM_TEST_RUN 5
-
-
-/*-----------------------------------------------------------------------------
-* Timing Setup
-*----------------------------------------------------------------------------*/
-struct timespec t0,t1;
-#define tick() clock_gettime(CLOCK_MONOTONIC, &t0);
-#define tock() (clock_gettime(CLOCK_MONOTONIC, &t1), \
- t1.tv_sec - t0.tv_sec + (t1.tv_nsec - t0.tv_nsec) / 1e9)
-
-/*-----------------------------------------------------------------------------
-* Global Variables
-*----------------------------------------------------------------------------*/
-float alpha = 0.7;
-float beta = 0.3;
-enum CBLAS_ORDER order = CblasColMajor;
-//enum CBLAS_ORDER order = CblasRowMajor;
-enum CBLAS_TRANSPOSE transA = CblasNoTrans;
-enum CBLAS_TRANSPOSE transB = CblasNoTrans;
-
-extern int TI_CBLAS_L3_OFFLOAD;
-/*-----------------------------------------------------------------------------
-* Prototypes
-*----------------------------------------------------------------------------*/
-int run_sgemm(int M, int N, int K, float *time, float *gflops);
-
-/*-----------------------------------------------------------------------------
-* MAIN
-*----------------------------------------------------------------------------*/
-int main()
-{
- int num_size, dgemm_err;
- int M, N, K, m, n, k;
- int M_pre, N_pre, K_pre, M_start_size, N_start_size;
- float time_secs, gflops;
- FILE *fp_time;
-
- fp_time = fopen("dgemm_time.dat","w");
-
- srand(12345);
-
- /* sweep M, K, and N */
- for (M=TUNING_START_SIZE_RECTAN_MATRIX,m=0; m<NUM_MATRIX_SIZE_TO_BENCHMARK; m++,M*=2)
- {
- for (N=TUNING_START_SIZE_RECTAN_MATRIX,n=0; n<NUM_MATRIX_SIZE_TO_BENCHMARK; n++,N*=2)
- {
- for (K=TUNING_START_SIZE_RECTAN_MATRIX,k=0; k<NUM_MATRIX_SIZE_TO_BENCHMARK; k++,K*=2)
- {
- dgemm_err = run_sgemm(M, N, K, &time_secs, &gflops);
-
- if(dgemm_err == -1) { /* out of memory for DSP offloading */
- printf("Out of memory for (M,N,K) = (%d,%d,%d).\n", M,N,K);
- }
- else {
- if (dgemm_err == 0){
- fprintf(fp_time, "%6d,%6d,%6d\t%10.8e\n", M, N, K, time_secs);
- }
- else {
- printf("Error in DGEMM tuning for (M,N,K)=(%d,%d,%d)!\n", M,N,K);
- exit(0);
- }
- }
- }
- }
- }
-
- fclose(fp_time);
-
- return 0;
-}
-
-
-int run_sgemm(int M, int N, int K, float *time, float *gflops)
-{
- int iter;
- long long i;
- float time_secs, total_time;
- float operation_count = 2.0*(float)M*(float)N*(float)K;
- float total_GFLOPS = 0.0f;
- int err_code = 0;
-
- total_time = 0.0;
- for (iter = 0; iter < NUM_TEST_RUN; iter++)
- {
- /*-------------------------------------------------------------------------
- * Allocate space for the matrices. The matrices that will be passed to
- * the DSP are allocated using device memory. The Carm array is not passed
- * to the dsp and so can use system memory.
- *------------------------------------------------------------------------*/
- float *A = (float *) __malloc_ddr((long long)M*(long long)K*(long long)sizeof(float));
- float *B = (float *) __malloc_ddr((long long)K*(long long)N*(long long)sizeof(float));
- float *Cdsp = (float *) __malloc_ddr((long long)M*(long long)N*(long long)sizeof(float));
- float *Carm = (float *) malloc ((long long)M*(long long)N*(long long)sizeof(float));
-
- if (!A || !B || !Cdsp || !Carm)
- {
- printf("Could not allocate enough space for the arrays!");
- if(A) __free_ddr(A);
- if(B) __free_ddr(B);
- if(Cdsp) __free_ddr(Cdsp);
- if(Carm) free(Carm);
-
- return (-1);
- }
-
- /*-------------------------------------------------------------------------
- * Initialize matrices
- *------------------------------------------------------------------------*/
- for (i = 0; i < (long long)M*K; ++i) A[i] = (float)rand()/RAND_MAX;// (float)(rand() % 5 + 1);
- for (i = 0; i < (long long)K*N; ++i) B[i] = (float)rand()/RAND_MAX;// (float)(rand() % 5 + 1);
- for (i = 0; i < (long long)M*N; ++i) Carm[i] = Cdsp[i] = 0;
-
- int lda = ((order == CblasColMajor && transA == CblasNoTrans) ||
- (order == CblasRowMajor && transA == CblasTrans)) ? M : K;
-
- int ldb = ((order == CblasColMajor && transB == CblasNoTrans) ||
- (order == CblasRowMajor && transB == CblasTrans)) ? K : N;
-
- int ldc = (order == CblasColMajor) ? M : N;
-
- fflush(stdout);
-
- /*------------------------------------------------------------------------
- * Run and time dgemm
- *-----------------------------------------------------------------------*/
- tick();
- cblas_dgemm(order,transA,transB,M,N,K,alpha,A,lda,B,ldb,beta,Cdsp,ldc);
- time_secs = tock();
- total_time += time_secs;
- total_GFLOPS += operation_count/time_secs*1e-9;
-/*
- if(M==4096 && K==256 && N==16) {
- FILE *file_a = fopen("mat_a.dat","w");
- FILE *file_b = fopen("mat_b.dat","w");
- FILE *file_c = fopen("mat_c.dat","w");
-
- for(i=0; i < M*K; ++i) fprintf(file_a, "%1.10e\n",A[i]);
- for(i=0; i < K*N; ++i) fprintf(file_b, "%1.10e\n",B[i]);
- for(i=0; i < M*N; ++i) fprintf(file_c, "%1.10e\n",Cdsp[i]);
- }
-*/
-
- __free_ddr(A);
- __free_ddr(B);
- __free_ddr(Cdsp);
- free(Carm);
- }
-
- *gflops = total_GFLOPS / (float)NUM_TEST_RUN;
- *time = total_time / (float)NUM_TEST_RUN;
-
- return err_code;
-}
diff --git a/examples/tuning/sgemm_tune/Makefile b/examples/tuning/sgemm_tune/Makefile
+++ /dev/null
@@ -1,10 +0,0 @@
-
-EXE = sgemm_tune
-
-include ../make.inc
-
-$(EXE): sgemm_tune.o
- $(CC) $(CFLAGS) sgemm_tune.o $(BLASLIB) -o $@
-
-tune: $(EXE)
- ./$(EXE);
\ No newline at end of file
diff --git a/examples/tuning/zgemm_tune/Makefile b/examples/tuning/zgemm_tune/Makefile
+++ /dev/null
@@ -1,10 +0,0 @@
-
-EXE = zgemm_tune
-
-include ../make.inc
-
-$(EXE): zgemm_tune.o
- $(CC) $(CFLAGS) zgemm_tune.o $(BLASLIB) -o $@
-
-tune: $(EXE)
- ./$(EXE);
\ No newline at end of file
diff --git a/examples/zgemm_tune/Makefile b/examples/zgemm_tune/Makefile
+++ /dev/null
@@ -1,10 +0,0 @@
-
-EXE = zgemm_tune
-
-include ../make.inc
-
-$(EXE): zgemm_tune.o
- $(CC) $(CFLAGS) zgemm_tune.o $(BLASLIB) -o $@
-
-tune: $(EXE)
- ./$(EXE);
\ No newline at end of file
diff --git a/examples/tuning/Makefile b/tuning/Makefile
similarity index 64%
rename from examples/tuning/Makefile
rename to tuning/Makefile
index 18d54c3bc6cb097a686da35f02a687ccaec09e28..5aad157bce26297c391bd5d96462237fc78a2e96 100644 (file)
rename from examples/tuning/Makefile
rename to tuning/Makefile
index 18d54c3bc6cb097a686da35f02a687ccaec09e28..5aad157bce26297c391bd5d96462237fc78a2e96 100644 (file)
--- a/examples/tuning/Makefile
+++ b/tuning/Makefile
$(MAKE) -C $$dir; \
done
$(MAKE) -C $$dir; \
done
-run:
+tune:
for dir in $(DIRS); do \
echo "=============== " $$dir " =================" ; \
for dir in $(DIRS); do \
echo "=============== " $$dir " =================" ; \
- $(MAKE tune) -C $$dir \
- done
-
-cross:
- for dir in $(DIRS); do \
- echo "=============== " $$dir " =================" ; \
- $(MAKE) -C $$dir cross; \
- done
+ $(MAKE) -C $$dir tune; \
+ done; \
+ mkdir ../ofld_tbls; find . -iname "ofld_tbl*.c" -exec cp {} ../ofld_tbls \;
clean:
for dir in $(DIRS); do \
clean:
for dir in $(DIRS); do \
diff --git a/tuning/cgemm_tune/Makefile b/tuning/cgemm_tune/Makefile
--- /dev/null
@@ -0,0 +1,10 @@
+
+EXE = cgemm_tune
+
+include ../make.inc
+
+$(EXE): cgemm_tune.o $(TUNE_UTILS)
+ $(CC) $(CFLAGS) cgemm_tune.o $(TUNE_UTILS_OBJ) $(BLASLIB) -o $@
+
+tune: $(EXE)
+ ./$(EXE);
\ No newline at end of file
similarity index 94%
rename from examples/tuning/cgemm_tune/cgemm_tune.c
rename to tuning/cgemm_tune/cgemm_tune.c
index 489979a760cd33bf595d6ef5a3f649551ca77a1a..c796c2e3b00fccdf5df68ac9fdf959672ddd2575 100644 (file)
rename from examples/tuning/cgemm_tune/cgemm_tune.c
rename to tuning/cgemm_tune/cgemm_tune.c
index 489979a760cd33bf595d6ef5a3f649551ca77a1a..c796c2e3b00fccdf5df68ac9fdf959672ddd2575 100644 (file)
#include <math.h>
#include <time.h>
#include <complex.h>
#include <math.h>
#include <time.h>
#include <complex.h>
+#include "../common/tune_com.h"
#include "cblas.h"
#ifdef __cplusplus
#include "cblas.h"
#ifdef __cplusplus
}
#endif
}
#endif
-#define TUNING_START_SIZE_SQUARE_MATRIX 16
-#define TUNING_START_SIZE_RECTAN_MATRIX 8
-#define NUM_MATRIX_SIZE_TO_BENCHMARK 16
-#define HAS_MEMORY 1
-#define NO_MEMORY 0
-#define OFFLOAD 1
-#define NO_OFFLOAD 0
-
-#define NUM_TEST_RUN 5
-
-/*-----------------------------------------------------------------------------
-* Timing Setup
-*----------------------------------------------------------------------------*/
-struct timespec t0,t1;
-#define tick() clock_gettime(CLOCK_MONOTONIC, &t0);
-#define tock() (clock_gettime(CLOCK_MONOTONIC, &t1), \
- t1.tv_sec - t0.tv_sec + (t1.tv_nsec - t0.tv_nsec) / 1e9)
/*-----------------------------------------------------------------------------
* Global Variables
/*-----------------------------------------------------------------------------
* Global Variables
fp_tbl = fopen("ofld_tbl_cgemm.c","w");
fp_time = fopen("cgemm_time_ARMvsDSP.dat","w");
fp_tbl = fopen("ofld_tbl_cgemm.c","w");
fp_time = fopen("cgemm_time_ARMvsDSP.dat","w");
- fprintf(fp_tbl, "char ofld_tbl_cgemm[TI_L3_OFFLOAD_TBL_SIZE] = {\n");
+ print_file_header(fp_tbl);
+ fprintf(fp_tbl, "char ofld_tbl_cgemm[GEMM_OFFLOAD_TBL_SIZE] = {\n");
srand(12345);
srand(12345);
@@ -314,7 +299,7 @@ int check_results(const float complex *C1, const float complex *C2, int M, int N
if (num_errors > 0)
{
printf("FAIL with %d errors!\n", num_errors);
if (num_errors > 0)
{
printf("FAIL with %d errors!\n", num_errors);
- return -1;
+ return num_errors;
}
else
{
}
else
{
diff --git a/tuning/common/print_header.c b/tuning/common/print_header.c
--- /dev/null
@@ -0,0 +1,64 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include <stdlib.h>
+#include <stdio.h>
+
+/*
+ * print_file_header - write the TI BSD license banner, followed by
+ * #include "ti_cblas.h" and one blank line, to fp_tbl.
+ *
+ * fp_tbl: stream already opened for writing by the caller; it is not checked
+ *         for NULL and is not closed here.
+ */
+void print_file_header(FILE *fp_tbl)
+{
+    /* The text contains no conversion specifiers, so it is emitted verbatim. */
+    static const char banner[] =
+"/****************************************************************************** \n"
+" * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/ \n"
+" * All rights reserved. \n"
+" * \n"
+" * Redistribution and use in source and binary forms, with or without \n"
+" * modification, are permitted provided that the following conditions are met:\n"
+" * * Redistributions of source code must retain the above copyright \n"
+" * notice, this list of conditions and the following disclaimer. \n"
+" * * Redistributions in binary form must reproduce the above copyright \n"
+" * notice, this list of conditions and the following disclaimer in the \n"
+" * documentation and/or other materials provided with the distribution. \n"
+" * * Neither the name of Texas Instruments Incorporated nor the \n"
+" * names of its contributors may be used to endorse or promote products \n"
+" * derived from this software without specific prior written permission.\n"
+" * \n"
+" * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"\n"
+" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE \n"
+" * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE \n"
+" * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE \n"
+" * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR \n"
+" * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF \n"
+" * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS \n"
+" * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN \n"
+" * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) \n"
+" * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF \n"
+" * THE POSSIBILITY OF SUCH DAMAGE. \n"
+" *****************************************************************************/ \n"
+"\n"
+"#include \"ti_cblas.h\"\n\n";
+
+    fputs(banner, fp_tbl);
+} /* print_file_header */
diff --git a/tuning/common/tune_com.h b/tuning/common/tune_com.h
--- /dev/null
+++ b/tuning/common/tune_com.h
@@ -0,0 +1,50 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#ifndef TUNE_COM_H
+#define TUNE_COM_H
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <time.h>   /* struct timespec, clock_gettime(), CLOCK_MONOTONIC */
+
+/* Common definitions shared by the BLAS offload tuning programs. */
+
+/*
+ * First matrix dimension of a sweep.
+ * NOTE(review): only the RECTAN value is used by the tuner loops seen next to
+ * this header - confirm the SQUARE value is still needed.
+ */
+#define TUNING_START_SIZE_SQUARE_MATRIX 16
+#define TUNING_START_SIZE_RECTAN_MATRIX 8
+/* Number of sizes benchmarked along each swept dimension. */
+#define NUM_MATRIX_SIZE_TO_BENCHMARK 16
+/* Values stored in a tuner's per-point memory flag table. */
+#define HAS_MEMORY 1
+#define NO_MEMORY 0
+/* Values stored in a tuner's per-point offload decision table. */
+#define OFFLOAD 1
+#define NO_OFFLOAD 0
+
+/* Number of timed repetitions per benchmark point. */
+#define NUM_TEST_RUN 5
+
+/*-----------------------------------------------------------------------------
+* Timing Setup
+*
+* tick() stores the start time in t0; tock() stores the stop time in t1 and
+* evaluates to the elapsed time in seconds as a double.
+*
+* t0/t1 are static so that each translation unit including this header gets
+* its own pair instead of another external definition of the same objects.
+*
+* tick() expands with a trailing ';' (kept for existing callers), so it must
+* be used as a complete statement.
+*----------------------------------------------------------------------------*/
+static struct timespec t0,t1;
+#define tick() clock_gettime(CLOCK_MONOTONIC, &t0);
+#define tock() (clock_gettime(CLOCK_MONOTONIC, &t1), \
+                t1.tv_sec - t0.tv_sec + (t1.tv_nsec - t0.tv_nsec) / 1e9)
+
+/* Writes the license banner and the ti_cblas.h include line to fp_tbl. */
+extern void print_file_header(FILE *fp_tbl);
+
+#endif /* TUNE_COM_H */
+
diff --git a/tuning/csyrk_tune/Makefile b/tuning/csyrk_tune/Makefile
--- /dev/null
@@ -0,0 +1,10 @@
+
+EXE = csyrk_tune
+
+include ../make.inc
+
+$(EXE): csyrk_tune.o $(TUNE_UTILS)
+ $(CC) $(CFLAGS) csyrk_tune.o $(TUNE_UTILS_OBJ) $(BLASLIB) -o $@
+
+tune: $(EXE)
+ ./$(EXE);
\ No newline at end of file
diff --git a/tuning/csyrk_tune/csyrk_tune.c b/tuning/csyrk_tune/csyrk_tune.c
--- /dev/null
@@ -0,0 +1,296 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <time.h>
+#include <complex.h>
+#include "../common/tune_com.h"
+
+#include "cblas.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+#include "cblas.h"
+#ifdef __cplusplus
+}
+#endif
+
+/*-----------------------------------------------------------------------------
+* Global Variables
+*----------------------------------------------------------------------------*/
+float complex alpha = 0.7 - 0.3*I;
+float complex beta = 0.4 + 0.6*I;
+enum CBLAS_ORDER order = CblasColMajor;
+enum CBLAS_TRANSPOSE transA = CblasNoTrans;
+enum CBLAS_UPLO uplo = CblasUpper;
+
+extern int TI_CBLAS_L3_OFFLOAD;
+
+/*-----------------------------------------------------------------------------
+* Prototypes
+*----------------------------------------------------------------------------*/
+int check_results(const float complex *C1, const float complex *C2, int N, int K);
+int run_csyrk_dsp_and_arm(int N, int K, float *time_dsp, float *time_arm,
+ float *gflops_dsp, float *gflops_arm);
+
+/*-----------------------------------------------------------------------------
+* MAIN
+*----------------------------------------------------------------------------*/
+/*
+ * Sweeps N and K over NUM_MATRIX_SIZE_TO_BENCHMARK sizes each, starting at
+ * TUNING_START_SIZE_RECTAN_MATRIX and doubling at every step. A point is
+ * measured only when neither its (n-1,k) nor its (n,k-1) neighbour already
+ * offloads or ran out of memory; otherwise the neighbour's verdict is reused.
+ *
+ * Output files:
+ *   ofld_flag_csyrk.dat      memory flag and offload flag of every point
+ *   csyrk_time_ARMvsDSP.dat  N, K, ARM time, DSP time of every point
+ *                            (-1.0 = skipped as offloaded, -2.0 = no memory)
+ *   ofld_tbl_csyrk.c         the offload flags as a C array initializer
+ *
+ * Returns 0 on success, EXIT_FAILURE when an output file cannot be opened or
+ * when the DSP and ARM results of a measured point disagree.
+ */
+int main()
+{
+    int csyrk_err;
+    int N, K, n, k;
+    float total_GFLOPS_DSP, total_GFLOPS_ARM;
+    float time_DSP, time_ARM, t_dsp, t_arm;
+    char ofld_flag[NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK];
+    char mem_flag[NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK];
+    FILE *fp_flag, *fp_time, *fp_tbl;
+
+    fp_flag = fopen("ofld_flag_csyrk.dat","w");
+    fp_tbl = fopen("ofld_tbl_csyrk.c","w");
+    fp_time = fopen("csyrk_time_ARMvsDSP.dat","w");
+
+    /* Every fprintf below would otherwise be handed a NULL stream. */
+    if (!fp_flag || !fp_tbl || !fp_time) {
+        perror("csyrk_tune: cannot open output file");
+        if (fp_flag) fclose(fp_flag);
+        if (fp_tbl) fclose(fp_tbl);
+        if (fp_time) fclose(fp_time);
+        return EXIT_FAILURE;
+    }
+
+    print_file_header(fp_tbl);
+    fprintf(fp_tbl, "char ofld_tbl_csyrk[SYRK_OFFLOAD_TBL_SIZE] = {\n");
+
+    srand(12345);
+
+    /* sweep K, and N */
+    for (N=TUNING_START_SIZE_RECTAN_MATRIX,n=0; n<NUM_MATRIX_SIZE_TO_BENCHMARK; n++,N*=2)
+    {
+        for (K=TUNING_START_SIZE_RECTAN_MATRIX,k=0; k<NUM_MATRIX_SIZE_TO_BENCHMARK; k++,K*=2)
+        {
+            /* Only entries at [n-1][k] and [n][k-1] are read, and both were
+               written by an earlier iteration, so the tables need no
+               up-front initialization. */
+            if( (n>0 && ofld_flag[n-1][k]==OFFLOAD)
+              ||(k>0 && ofld_flag[n][k-1]==OFFLOAD) ) {
+                ofld_flag[n][k] = OFFLOAD;
+                mem_flag[n][k] = HAS_MEMORY; // to avoid error
+                time_DSP = -1.0;
+                time_ARM = -1.0;
+                printf("Offloading. Skipping (N,K)=(%d,%d), (n,k)=(%d,%d).\n", N,K,n,k);
+            }
+            else if( (n>0 && (mem_flag[n-1][k]==NO_MEMORY))
+                   ||(k>0 && (mem_flag[n][k-1]==NO_MEMORY))) {
+                ofld_flag[n][k] = NO_OFFLOAD;
+                mem_flag[n][k] = NO_MEMORY;
+                time_DSP = -2.0;
+                time_ARM = -2.0;
+                printf("Out of memory. Skipping (N,K)=(%d,%d), (n,k)=(%d,%d).\n", N,K,n,k);
+            }
+            else {
+                printf("Measuring DSP and ARM GFLOPS for (N,K)=(%d,%d), (n,k)=(%d,%d).\n", N,K,n,k);
+                csyrk_err = run_csyrk_dsp_and_arm(N, K, &t_dsp, &t_arm, &total_GFLOPS_DSP, &total_GFLOPS_ARM);
+
+                if(csyrk_err == -1) { /* out of memory for DSP offloading */
+                    ofld_flag[n][k] = NO_OFFLOAD;
+                    mem_flag[n][k] = NO_MEMORY;
+                    time_DSP = -2.0;
+                    time_ARM = -2.0;
+                    printf("Out of memory, skipping next point.\n");
+                }
+                else if (csyrk_err == 0) {
+                    mem_flag[n][k] = HAS_MEMORY;
+                    time_DSP = t_dsp;
+                    time_ARM = t_arm;
+                    if(t_dsp < t_arm) {
+                        ofld_flag[n][k] = OFFLOAD;
+                        printf("Offloading to DSP for this point. Skipping next point.\n");
+                    }
+                    else {
+                        ofld_flag[n][k] = NO_OFFLOAD;
+                    }
+                }
+                else {
+                    /* DSP and ARM results differ: report the failure to the
+                       caller with a non-zero status instead of exit(0). */
+                    printf("Error in CSYRK tuning for (N,K)=(%d,%d)!\n", N,K);
+                    fclose(fp_flag);
+                    fclose(fp_time);
+                    fclose(fp_tbl);
+                    return EXIT_FAILURE;
+                }
+            }
+
+            fprintf(fp_flag, "%d\t%d\n", (int)mem_flag[n][k], (int)ofld_flag[n][k]);
+            fprintf(fp_time, "%6d,%6d\t%10.8e\t%10.8e\n", N, K, time_ARM, time_DSP);
+
+            /* The last table entry closes the initializer instead of adding a comma. */
+            if( (n==(NUM_MATRIX_SIZE_TO_BENCHMARK-1))
+              && (k==(NUM_MATRIX_SIZE_TO_BENCHMARK-1))) {
+                fprintf(fp_tbl, "%d};", (int)ofld_flag[n][k]);
+            } else {
+                fprintf(fp_tbl, "%d,", (int)ofld_flag[n][k]);
+            }
+        }
+        fprintf(fp_tbl, "\n");
+    }
+
+    fclose(fp_flag);
+    fclose(fp_time);
+    fclose(fp_tbl);
+
+    return 0;
+}
+
+/*
+ * Runs cblas_csyrk NUM_TEST_RUN times with TI_CBLAS_L3_OFFLOAD set to 1 and
+ * then to 0, timing each call and comparing the two results with
+ * check_results().
+ *
+ * N, K:        dimensions; A holds N*K elements and each C buffer N*N.
+ * time_dsp/time_arm:     receive the average seconds per call.
+ * gflops_dsp/gflops_arm: receive the sum (not the average) of the per-run
+ *                        GFLOPS figures.
+ *
+ * Returns -1 (outputs untouched) when a buffer would exceed 0xffffffff bytes
+ * or an allocation fails; otherwise the accumulated check_results() return
+ * values over all runs.
+ */
+int run_csyrk_dsp_and_arm(int N, int K, float *time_dsp, float *time_arm,
+ float *gflops_dsp, float *gflops_arm)
+{
+ int iter;
+ long long i, size_A, size_C;
+ float time_secs, total_time_dsp, total_time_arm;
+ float gflops_ARM, gflops_DSP;
+ /* NOTE(review): 2*N*K has no N*N term although C is N x N - confirm the
+    intended CSYRK operation count before trusting the GFLOPS outputs. */
+ float operation_count = 2.0*(float)N*(float)K;
+ float total_GFLOPS_DSP = 0.0f;
+ float total_GFLOPS_ARM = 0.0f;
+ int err_code = 0;
+
+ total_time_dsp = 0.0;
+ total_time_arm = 0.0;
+ size_A = (long long)N*(long long)K;
+ size_C = (long long)N*(long long)N;
+ /* Reject byte counts above 0xffffffff; reported with the same -1 code as
+    an allocation failure. */
+ if( (size_A*sizeof(float complex)>(long long)0x0ffffffff)
+ ||(size_C*sizeof(float complex)>(long long)0x0ffffffff) ) {
+ return (-1);
+ }
+
+ for (iter = 0; iter < NUM_TEST_RUN; iter++)
+ {
+ /*-------------------------------------------------------------------------
+ * Allocate space for the matrices. The matrices that will be passed to
+ * the DSP are allocated using device memory. The Carm array is not passed
+ * to the dsp and so can use system memory.
+ *------------------------------------------------------------------------*/
+ float complex *A = (float complex *) __malloc_ddr(size_A*sizeof(float complex));
+ float complex *Cdsp = (float complex *) __malloc_ddr(size_C*sizeof(float complex));
+ float complex *Carm = (float complex *) malloc(size_C*sizeof(float complex));
+
+ /* Release whatever was obtained and bail out if any allocation failed. */
+ if (!A || !Cdsp || !Carm)
+ {
+ printf("Could not allocate enough space for the arrays!");
+ if(A) __free_ddr(A);
+ if(Cdsp) __free_ddr(Cdsp);
+ if(Carm) free(Carm);
+
+ return (-1);
+ }
+
+ /*-------------------------------------------------------------------------
+ * Initialize matrices
+ *------------------------------------------------------------------------*/
+ /* NOTE(review): each expression below calls rand() twice; C does not
+    specify which call runs first, so real/imaginary parts may swap between
+    compilers even with a fixed seed. */
+ for (i = 0; i < size_A; ++i)
+ {
+ A[i] = (float)rand()/RAND_MAX + (float)rand()/RAND_MAX * I;
+ }
+
+ /* Both C buffers start from identical values so the results are comparable. */
+ for (i = 0; i < size_C; ++i)
+ {
+ Cdsp[i] = Carm[i] = (float)rand()/RAND_MAX + (float)rand()/RAND_MAX * I;
+ }
+
+ int lda = ((order == CblasColMajor && transA == CblasNoTrans) ||
+ (order == CblasRowMajor && transA == CblasTrans)) ? N : K;
+
+ int ldc = N;
+
+ /*============ BLAS tuning: running on DSP and then on ARM =============*/
+ /*------------------------------------------------------------------------
+ * Time DSP csyrk
+ *-----------------------------------------------------------------------*/
+ TI_CBLAS_L3_OFFLOAD = 1;
+
+ tick();
+ cblas_csyrk(order,uplo,transA,N,K,&alpha,A,lda,&beta,Cdsp,ldc);
+ time_secs = tock();
+ total_time_dsp += time_secs;
+ gflops_DSP = operation_count/time_secs*1e-9;
+ total_GFLOPS_DSP += gflops_DSP;
+
+ /*-------------------------------------------------------------------------
+ * Time ARM csyrk
+ *------------------------------------------------------------------------*/
+ TI_CBLAS_L3_OFFLOAD = 0;
+
+ tick();
+ cblas_csyrk(order,uplo,transA,N,K,&alpha,A,lda,&beta,Carm,ldc);
+ time_secs = tock();
+ total_time_arm += time_secs;
+ gflops_ARM = operation_count/time_secs*1e-9;
+ total_GFLOPS_ARM += gflops_ARM;
+
+ /*-------------------------------------------------------------------------
+ * Verify Results
+ *------------------------------------------------------------------------*/
+ /* Accumulated across runs; any non-zero total is returned to the caller. */
+ err_code += check_results(Cdsp, Carm, N, N);
+
+ __free_ddr(A);
+ __free_ddr(Cdsp);
+ free(Carm);
+ }
+
+ /* GFLOPS outputs are totals over the runs; only the times are averaged. */
+ *gflops_dsp = total_GFLOPS_DSP;
+ *gflops_arm = total_GFLOPS_ARM;
+ *time_dsp = total_time_dsp / (float)NUM_TEST_RUN;
+ *time_arm = total_time_arm / (float)NUM_TEST_RUN;
+
+ return err_code;
+}
+
+
+/*-----------------------------------------------------------------------------
+* check_results
+*
+* Compares the magnitudes of two M x N complex arrays element by element.
+* An element mismatches when | |C1[i]| - |C2[i]| | exceeds 1e-5 * |C1[i]|;
+* taking the absolute difference also catches elements where |C2| is the
+* larger of the two. The first few mismatches are printed.
+*
+* Returns the number of mismatching elements, 0 when the arrays agree.
+*----------------------------------------------------------------------------*/
+int check_results(const float complex *C1, const float complex *C2, int M, int N)
+{
+    const float EPISILON = 1e-5;
+    //const float EPISILON = 1e-200;
+    const int NERRORS = 5;
+    /* Widened so that M*N cannot overflow int for large matrices. */
+    const long long count = (long long)M * (long long)N;
+    long long i;
+    int num_errors = 0;
+
+    for (i=0; i<count; i++)
+    {
+        double mag1 = cabs(C1[i]);
+        double mag2 = cabs(C2[i]);
+        float delta = fabs(mag1 - mag2);
+
+        if (delta > EPISILON*mag1)
+            if ((num_errors += 1) < NERRORS)
+                printf("Error [elem:%lld]: %f <==> %f\n", i, mag1, mag2);
+    }
+
+    if (num_errors > 0)
+    {
+        printf("FAIL with %d errors!\n", num_errors);
+        return num_errors;
+    }
+    else
+    {
+        //printf("PASS!\n");
+        return 0;
+    }
+}
+
+
diff --git a/tuning/ctrmm_tune/Makefile b/tuning/ctrmm_tune/Makefile
--- /dev/null
@@ -0,0 +1,10 @@
+
+EXE = ctrmm_tune
+
+include ../make.inc
+
+$(EXE): ctrmm_tune.o $(TUNE_UTILS)
+ $(CC) $(CFLAGS) ctrmm_tune.o $(TUNE_UTILS_OBJ) $(BLASLIB) -o $@
+
+tune: $(EXE)
+ ./$(EXE);
\ No newline at end of file
diff --git a/tuning/ctrmm_tune/ctrmm_proc.m b/tuning/ctrmm_tune/ctrmm_proc.m
--- /dev/null
@@ -0,0 +1,30 @@
+% Scratch script: loads six matrix dumps and builds a reference product for
+% comparison. Each .dat file is read as two columns (real part, imaginary
+% part) with 64 rows, since every one is reshaped into an 8x8 complex matrix.
+% NOTE(review): presumably the dumps come from the CTRMM tuning program -
+% confirm which run produces the *2 files.
+load mat_a.dat;
+load mat_b.dat;
+load mat_c.dat;
+load mat_a2.dat;
+load mat_b2.dat;
+load mat_c2.dat;
+
+% Column 1 + j*column 2 -> complex, filled column by column into 8x8.
+A=reshape(mat_a(:,1) + j*mat_a(:,2),8,8);
+B=reshape(mat_b(:,1) + j*mat_b(:,2),8,8);
+C=reshape(mat_c(:,1) + j*mat_c(:,2),8,8);
+A2=reshape(mat_a2(:,1)+j*mat_a2(:,2),8,8);
+B2=reshape(mat_b2(:,1)+j*mat_b2(:,2),8,8);
+C2=reshape(mat_c2(:,1)+j*mat_c2(:,2),8,8);
+
+% Element-wise differences of each B/C pair.
+% Note: the variable name diff shadows MATLAB's built-in diff function.
+diff=B-C;
+diff2=B2-C2;
+
+% A3 = A with its diagonal forced to 1 ...
+A3=A;
+for i=1:8
+ A3(i,i)=1;
+end
+% ... and every element below the diagonal (column k < row i) cleared,
+% i.e. a unit upper-triangular copy of A.
+for i=1:8
+ for k=1:8
+ if k<i
+ A3(i,k)=0;
+ end
+ end
+end
+
+% Reference product alpha*A3*B with alpha = 0.7 - 0.3j.
+B3=(0.7-j*0.3)*A3*B;
similarity index 53%
rename from examples/cgemm_tune/cgemm_tune.c
rename to tuning/ctrmm_tune/ctrmm_tune.c
index 71110d37579d627fa9f242ea0ba6c4376bb4ba27..9823cab52c38b1ace7cd292d52e7a7f3f4ce8c01 100644 (file)
rename from examples/cgemm_tune/cgemm_tune.c
rename to tuning/ctrmm_tune/ctrmm_tune.c
index 71110d37579d627fa9f242ea0ba6c4376bb4ba27..9823cab52c38b1ace7cd292d52e7a7f3f4ce8c01 100644 (file)
#include <math.h>
#include <time.h>
#include <complex.h>
#include <math.h>
#include <time.h>
#include <complex.h>
+#include "../common/tune_com.h"
#include "cblas.h"
#ifdef __cplusplus
#include "cblas.h"
#ifdef __cplusplus
}
#endif
}
#endif
-#define TUNING_START_SIZE_SQUARE_MATRIX 16
-#define TUNING_START_SIZE_RECTAN_MATRIX 8
-#define NUM_MATRIX_SIZE_TO_BENCHMARK 8 //16
-#define HAS_MEMORY 1
-#define NO_MEMORY 0
-#define OFFLOAD 1
-#define NO_OFFLOAD 0
-
-#define NUM_TEST_RUN 5
-
-/*-----------------------------------------------------------------------------
-* Timing Setup
-*----------------------------------------------------------------------------*/
-struct timespec t0,t1;
-#define tick() clock_gettime(CLOCK_MONOTONIC, &t0);
-#define tock() (clock_gettime(CLOCK_MONOTONIC, &t1), \
- t1.tv_sec - t0.tv_sec + (t1.tv_nsec - t0.tv_nsec) / 1e9)
-
/*-----------------------------------------------------------------------------
* Global Variables
*----------------------------------------------------------------------------*/
float complex alpha = 0.7 - 0.3*I;
/*-----------------------------------------------------------------------------
* Global Variables
*----------------------------------------------------------------------------*/
float complex alpha = 0.7 - 0.3*I;
-float complex beta = 0.4 + 0.6*I;
enum CBLAS_ORDER order = CblasColMajor;
enum CBLAS_TRANSPOSE transA = CblasNoTrans;
enum CBLAS_ORDER order = CblasColMajor;
enum CBLAS_TRANSPOSE transA = CblasNoTrans;
-enum CBLAS_TRANSPOSE transB = CblasNoTrans;
+enum CBLAS_SIDE side = CblasLeft;
+enum CBLAS_UPLO uplo = CblasUpper;
+enum CBLAS_DIAG diag = CblasUnit;
extern int TI_CBLAS_L3_OFFLOAD;
extern int TI_CBLAS_L3_OFFLOAD;
* Prototypes
*----------------------------------------------------------------------------*/
int check_results(const float complex *C1, const float complex *C2, int M, int N);
* Prototypes
*----------------------------------------------------------------------------*/
int check_results(const float complex *C1, const float complex *C2, int M, int N);
-int run_cgemm_dsp_and_arm(int M, int K, int N, float *time_dsp, float *time_arm,
+int run_ctrmm_dsp_and_arm(int M, int N, float *time_dsp, float *time_arm,
float *gflops_dsp, float *gflops_arm);
/*-----------------------------------------------------------------------------
float *gflops_dsp, float *gflops_arm);
/*-----------------------------------------------------------------------------
@@ -81,99 +65,93 @@ int run_cgemm_dsp_and_arm(int M, int K, int N, float *time_dsp, float *time_arm,
*----------------------------------------------------------------------------*/
int main()
{
*----------------------------------------------------------------------------*/
int main()
{
- int num_size, cgemm_err;
- int M, N, K, m, n, k;
+ int num_size, ctrmm_err;
+ int M, N, m, n;
int M_pre, N_pre, K_pre, M_start_size, N_start_size;
int offload_threshold_1, offload_threshold_2;
float total_GFLOPS_DSP, total_GFLOPS_ARM;
float time_DSP, time_ARM, t_dsp, t_arm;
int M_pre, N_pre, K_pre, M_start_size, N_start_size;
int offload_threshold_1, offload_threshold_2;
float total_GFLOPS_DSP, total_GFLOPS_ARM;
float time_DSP, time_ARM, t_dsp, t_arm;
- char ofld_flag[NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK];
- char mem_flag[NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK];
+ char ofld_flag[NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK];
+ char mem_flag[NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK];
int skip_next_point;
float diff_tmp, diff_pre;
FILE *fp_flag, *fp_time, *fp_tbl;
int skip_next_point;
float diff_tmp, diff_pre;
FILE *fp_flag, *fp_time, *fp_tbl;
- fp_flag = fopen("ofld_flag_cgemm.dat","w");
- fp_tbl = fopen("ofld_tbl_cgemm.c","w");
- fp_time = fopen("cgemm_time_ARMvsDSP.dat","w");
+ fp_flag = fopen("ofld_flag_ctrmm.dat","w");
+ fp_tbl = fopen("ofld_tbl_ctrmm.c","w");
+ fp_time = fopen("ctrmm_time_ARMvsDSP.dat","w");
- fprintf(fp_tbl, "char ofld_tbl_cgemm[TI_L3_OFFLOAD_TBL_SIZE] = {\n");
+ print_file_header(fp_tbl);
+ fprintf(fp_tbl, "char ofld_tbl_ctrmm[TRMM_OFFLOAD_TBL_SIZE] = {\n");
srand(12345);
srand(12345);
-
- /* sweep M, K, and N */
+
+ /* sweep M, and N */
for (M=TUNING_START_SIZE_RECTAN_MATRIX,m=0; m<NUM_MATRIX_SIZE_TO_BENCHMARK; m++,M*=2)
{
for (N=TUNING_START_SIZE_RECTAN_MATRIX,n=0; n<NUM_MATRIX_SIZE_TO_BENCHMARK; n++,N*=2)
for (M=TUNING_START_SIZE_RECTAN_MATRIX,m=0; m<NUM_MATRIX_SIZE_TO_BENCHMARK; m++,M*=2)
{
for (N=TUNING_START_SIZE_RECTAN_MATRIX,n=0; n<NUM_MATRIX_SIZE_TO_BENCHMARK; n++,N*=2)
- {
- for (K=TUNING_START_SIZE_RECTAN_MATRIX,k=0; k<NUM_MATRIX_SIZE_TO_BENCHMARK; k++,K*=2)
- {
- if( (m>0 && ofld_flag[m-1][n][k]==OFFLOAD)
- ||(n>0 && ofld_flag[m][n-1][k]==OFFLOAD)
- ||(k>0 && ofld_flag[m][n][k-1]==OFFLOAD) ) {
- ofld_flag[m][n][k] = OFFLOAD;
- mem_flag[m][n][k] = HAS_MEMORY; // to avoid error
- time_DSP = -1.0;
- time_ARM = -1.0;
- printf("Offloading. Skipping (M,N,K)=(%d,%d,%d), (m,n,k)=(%d,%d,%d).\n", M,N,K,m,n,k);
- }
- else if( (m>0 && (mem_flag[m-1][n][k]==NO_MEMORY))
- ||(n>0 && (mem_flag[m][n-1][k]==NO_MEMORY))
- ||(k>0 && (mem_flag[m][n][k-1]==NO_MEMORY))) {
- ofld_flag[m][n][k] = NO_OFFLOAD;
- mem_flag[m][n][k] = NO_MEMORY;
+ {
+ if( (m>0 && ofld_flag[m-1][n]==OFFLOAD)
+ ||(n>0 && ofld_flag[m][n-1]==OFFLOAD) ) {
+ ofld_flag[m][n] = OFFLOAD;
+ mem_flag[m][n] = HAS_MEMORY; // to avoid error
+ time_DSP = -1.0;
+ time_ARM = -1.0;
+ printf("Offloading. Skipping (M,N)=(%d,%d), (m,n)=(%d,%d).\n", M,N,m,n);
+ }
+ else if( (m>0 && (mem_flag[m-1][n]==NO_MEMORY))
+ ||(n>0 && (mem_flag[m][n-1]==NO_MEMORY)) ) {
+ ofld_flag[m][n] = NO_OFFLOAD;
+ mem_flag[m][n] = NO_MEMORY;
+ time_DSP = -2.0;
+ time_ARM = -2.0;
+ printf("Out of memory. Skipping (M,N)=(%d,%d), (m,n)=(%d,%d).\n", M,N,m,n);
+ }
+ else {
+ printf("Measuring DSP and ARM GFLOPS for (M,N)=(%d,%d), (m,n)=(%d,%d).\n", M,N,m,n);
+ ctrmm_err = run_ctrmm_dsp_and_arm(M, N, &t_dsp, &t_arm, &total_GFLOPS_DSP, &total_GFLOPS_ARM);
+
+ if(ctrmm_err == -1) { /* out of memory for DSP offloading */
+ ofld_flag[m][n] = NO_OFFLOAD;
+ mem_flag[m][n] = NO_MEMORY;
time_DSP = -2.0;
time_ARM = -2.0;
time_DSP = -2.0;
time_ARM = -2.0;
- printf("Out of memory. Skipping (M,N,K)=(%d,%d,%d), (m,n,k)=(%d,%d,%d).\n", M,N,K,m,n,k);
+ printf("Out of memory, skipping next point.\n");
}
}
- else {
- printf("Measuring DSP and ARM GFLOPS for (M,N,K)=(%d,%d,%d), (m,n,k)=(%d,%d,%d).\n", M,N,K,m,n,k);
- cgemm_err = run_cgemm_dsp_and_arm(M, N, K, &t_dsp, &t_arm, &total_GFLOPS_DSP, &total_GFLOPS_ARM);
- //dsym_err = run_dsymm_dsp_and_arm();
-
- if(cgemm_err == -1) { /* out of memory for DSP offloading */
- ofld_flag[m][n][k] = NO_OFFLOAD;
- mem_flag[m][n][k] = NO_MEMORY;
- time_DSP = -2.0;
- time_ARM = -2.0;
- printf("Out of memory, skipping next point.\n");
- }
- else {
- mem_flag[m][n][k] = HAS_MEMORY;
- time_DSP = t_dsp;
- time_ARM = t_arm;
- if (cgemm_err == 0){
- //if(total_GFLOPS_DSP - total_GFLOPS_ARM > 1.0) {
- if(t_dsp < t_arm) {
- ofld_flag[m][n][k] = OFFLOAD;
- printf("Offloading to DSP for this point. Skipping next point.\n");
- }
- else {
- ofld_flag[m][n][k] = NO_OFFLOAD;
- }
+ else {
+ mem_flag[m][n] = HAS_MEMORY;
+ time_DSP = t_dsp;
+ time_ARM = t_arm;
+ if (ctrmm_err == 0){
+ //if(total_GFLOPS_DSP - total_GFLOPS_ARM > 1.0) {
+ if(t_dsp < t_arm) {
+ ofld_flag[m][n] = OFFLOAD;
+ printf("Offloading to DSP for this point. Skipping next point.\n");
}
else {
}
else {
- printf("Error in CGEMM tuning for (M,N,K)=(%d,%d,%d)!\n", M,N,K);
- exit(0);
+ ofld_flag[m][n] = NO_OFFLOAD;
}
}
}
}
+ else {
+ printf("Error in CTRMM tuning for (M,N)=(%d,%d)!\n", M,N);
+ exit(0);
+ }
}
}
-
- fprintf(fp_flag, "%d\t%d\n", (int)mem_flag[m][n][k], (int)ofld_flag[m][n][k]);
- fprintf(fp_time, "%6d,%6d,%6d\t%10.8e\t%10.8e\n", M, N, K, time_ARM, time_DSP);
+ }
+
+ fprintf(fp_flag, "%d\t%d\n", (int)mem_flag[m][n], (int)ofld_flag[m][n]);
+ fprintf(fp_time, "%6d,%6d\t%10.8e\t%10.8e\n", M, N, time_ARM, time_DSP);
- if( (m==(NUM_MATRIX_SIZE_TO_BENCHMARK-1))
- && (n==(NUM_MATRIX_SIZE_TO_BENCHMARK-1))
- && (k==(NUM_MATRIX_SIZE_TO_BENCHMARK-1))) {
- fprintf(fp_tbl, "%d};", (int)ofld_flag[m][n][k]);
- } else {
- fprintf(fp_tbl, "%d,", (int)ofld_flag[m][n][k]);
- }
+ if( (m==(NUM_MATRIX_SIZE_TO_BENCHMARK-1))
+ && (n==(NUM_MATRIX_SIZE_TO_BENCHMARK-1)) ) {
+ fprintf(fp_tbl, "%d};", (int)ofld_flag[m][n]);
+ } else {
+ fprintf(fp_tbl, "%d,", (int)ofld_flag[m][n]);
}
}
- fprintf(fp_tbl, "\n");
}
}
+ fprintf(fp_tbl, "\n");
}
}
-
+
fclose(fp_flag);
fclose(fp_time);
fclose(fp_tbl);
fclose(fp_flag);
fclose(fp_time);
fclose(fp_tbl);
return 0;
}
return 0;
}
-
-int run_cgemm_dsp_and_arm(int M, int K, int N, float *time_dsp, float *time_arm,
+int run_ctrmm_dsp_and_arm(int M, int N, float *time_dsp, float *time_arm,
float *gflops_dsp, float *gflops_arm)
{
float *gflops_dsp, float *gflops_arm)
{
- int iter;
- long long i;
+ int iter;
+ long long i, size_A, size_B;
float time_secs, total_time_dsp, total_time_arm;
float gflops_ARM, gflops_DSP;
float time_secs, total_time_dsp, total_time_arm;
float gflops_ARM, gflops_DSP;
- float operation_count = 2.0*(float)M*(float)N*(float)K;
+ float operation_count = 2.0*(float)M*(float)N;
float total_GFLOPS_DSP = 0.0f;
float total_GFLOPS_ARM = 0.0f;
int err_code = 0;
total_time_dsp = 0.0;
total_time_arm = 0.0;
float total_GFLOPS_DSP = 0.0f;
float total_GFLOPS_ARM = 0.0f;
int err_code = 0;
total_time_dsp = 0.0;
total_time_arm = 0.0;
+ if(side == CblasLeft) {
+ size_A = (long long)M*(long long)M;
+ }
+ else {
+ size_A = (long long)N*(long long)N;
+ }
+ size_B = (long long)M*(long long)N;
for (iter = 0; iter < NUM_TEST_RUN; iter++)
{
/*-------------------------------------------------------------------------
for (iter = 0; iter < NUM_TEST_RUN; iter++)
{
/*-------------------------------------------------------------------------
@@ -203,94 +187,109 @@ int run_cgemm_dsp_and_arm(int M, int K, int N, float *time_dsp, float *time_arm,
* the DSP are allocated using device memory. The Carm array is not passed
* to the dsp and so can use system memory.
*------------------------------------------------------------------------*/
* the DSP are allocated using device memory. The Carm array is not passed
* to the dsp and so can use system memory.
*------------------------------------------------------------------------*/
- float complex *A = (float complex*) __malloc_ddr(M*K*sizeof(float complex));
- float complex *B = (float complex*) __malloc_ddr(K*N*sizeof(float complex));
- float complex *Cdsp = (float complex*) __malloc_ddr(M*N*sizeof(float complex));
- float complex *Carm = (float complex*) malloc (M*N*sizeof(float complex));
+ float complex *A = (float complex *) __malloc_ddr(size_A*sizeof(float complex));
+ float complex *Bdsp = (float complex *) __malloc_ddr(size_B*sizeof(float complex));
+ float complex *Barm = (float complex *) malloc(size_B*sizeof(float complex));
- if (!A || !B || !Cdsp || !Carm)
+ if (!A || !Bdsp || !Barm)
{
printf("Could not allocate enough space for the arrays!");
if(A) __free_ddr(A);
{
printf("Could not allocate enough space for the arrays!");
if(A) __free_ddr(A);
- if(B) __free_ddr(B);
- if(Cdsp) __free_ddr(Cdsp);
- if(Carm) free(Carm);
+ if(Bdsp) __free_ddr(Bdsp);
+ if(Barm) free(Barm);
return (-1);
}
/*-------------------------------------------------------------------------
return (-1);
}
/*-------------------------------------------------------------------------
- * Initialize matrices and print if small enough.
+ * Initialize matrices
*------------------------------------------------------------------------*/
*------------------------------------------------------------------------*/
- for (i = 0; i < M*K; ++i)
- {
+ for (i = 0; i < size_A; ++i)
+ {
A[i] = (float)rand()/RAND_MAX + (float)rand()/RAND_MAX * I;
A[i] = (float)rand()/RAND_MAX + (float)rand()/RAND_MAX * I;
+ }
+
+ for (i = 0; i < size_B; ++i)
+ {
+ Bdsp[i] = Barm[i] = (float)rand()/RAND_MAX + (float)rand()/RAND_MAX * I;
}
}
- for (i = 0; i < K*N; ++i)
- {
- B[i] = (float)rand()/RAND_MAX + (float)rand()/RAND_MAX * I;
+
+ if(M==8 && N==8) {
+ FILE *file_a = fopen("mat_a.dat","w");
+ FILE *file_b = fopen("mat_b.dat","w");
+ FILE *file_c = fopen("mat_c.dat","w");
+
+ for(i=0; i < size_A; ++i) fprintf(file_a, "%1.10e\t%1.10e\n",crealf(A[i]), cimagf(A[i]));
+ for(i=0; i < size_B; ++i) fprintf(file_b, "%1.10e\t%1.10e\n",crealf(Barm[i]), cimagf(Barm[i]));
+ for(i=0; i < size_B; ++i) fprintf(file_c, "%1.10e\t%1.10e\n",crealf(Bdsp[i]), cimagf(Bdsp[i]));
+
+ fclose(file_a);
+ fclose(file_b);
+ fclose(file_c);
}
}
- for (i = 0; i < M*N; ++i)
- {
- Carm[i] = Cdsp[i] = (float)rand()/RAND_MAX + (float)rand()/RAND_MAX * I;
- }
-
- int lda = ((order == CblasColMajor && transA == CblasNoTrans) ||
- (order == CblasRowMajor && transA == CblasTrans)) ? M : K;
-
- int ldb = ((order == CblasColMajor && transB == CblasNoTrans) ||
- (order == CblasRowMajor && transB == CblasTrans)) ? K : N;
-
- int ldc = (order == CblasColMajor) ? M : N;
+
+ int lda = (side == CblasLeft) ? M : N;
+ int ldb = M;
/*============ BLAS tuning: running on DSP and then on ARM =============*/
/*------------------------------------------------------------------------
/*============ BLAS tuning: running on DSP and then on ARM =============*/
/*------------------------------------------------------------------------
- * Time DSP cgemm
+ * Time DSP ctrmm
*-----------------------------------------------------------------------*/
*-----------------------------------------------------------------------*/
- //ti_cblas_offload_config("001"); /* force offloading level 3 to DSP */
TI_CBLAS_L3_OFFLOAD = 1;
tick();
TI_CBLAS_L3_OFFLOAD = 1;
tick();
- cblas_cgemm(order,transA,transB,M,N,K,&alpha,A,lda,B,ldb,&beta,Cdsp,ldc);
+ cblas_ctrmm(order,side,uplo,transA,diag,M,N,&alpha,A,lda,Bdsp,ldb);
time_secs = tock();
total_time_dsp += time_secs;
gflops_DSP = operation_count/time_secs*1e-9;
total_GFLOPS_DSP += gflops_DSP;
time_secs = tock();
total_time_dsp += time_secs;
gflops_DSP = operation_count/time_secs*1e-9;
total_GFLOPS_DSP += gflops_DSP;
-
+
/*-------------------------------------------------------------------------
/*-------------------------------------------------------------------------
- * Time ARM cgemm
+ * Time ARM ctrmm
*------------------------------------------------------------------------*/
*------------------------------------------------------------------------*/
- //ti_cblas_offload_config("000"); /* force no offloading */
TI_CBLAS_L3_OFFLOAD = 0;
tick();
TI_CBLAS_L3_OFFLOAD = 0;
tick();
- cblas_cgemm(order,transA,transB,M,N,K,&alpha,A,lda,B,ldb,&beta,Carm,ldc);
+ cblas_ctrmm(order,side,uplo,transA,diag,M,N,&alpha,A,lda,Barm,ldb);
time_secs = tock();
total_time_arm += time_secs;
gflops_ARM = operation_count/time_secs*1e-9;
total_GFLOPS_ARM += gflops_ARM;
//printf(" %6.3f %6.3f %9.6fs %9.6fs\n", gflops_DSP, gflops_ARM, time_dsp, time_arm);
time_secs = tock();
total_time_arm += time_secs;
gflops_ARM = operation_count/time_secs*1e-9;
total_GFLOPS_ARM += gflops_ARM;
//printf(" %6.3f %6.3f %9.6fs %9.6fs\n", gflops_DSP, gflops_ARM, time_dsp, time_arm);
-
+
+ if(M==8 && N==8) {
+ FILE *file_a = fopen("mat_a2.dat","w");
+ FILE *file_b = fopen("mat_b2.dat","w");
+ FILE *file_c = fopen("mat_c2.dat","w");
+
+ for(i=0; i < size_A; ++i) fprintf(file_a, "%1.10e\t%1.10e\n",crealf(A[i]), cimagf(A[i]));
+ for(i=0; i < size_B; ++i) fprintf(file_b, "%1.10e\t%1.10e\n",crealf(Barm[i]), cimagf(Barm[i]));
+ for(i=0; i < size_B; ++i) fprintf(file_c, "%1.10e\t%1.10e\n",crealf(Bdsp[i]), cimagf(Bdsp[i]));
+
+ fclose(file_a);
+ fclose(file_b);
+ fclose(file_c);
+ }
+
/*-------------------------------------------------------------------------
* Verify Results
*------------------------------------------------------------------------*/
/*-------------------------------------------------------------------------
* Verify Results
*------------------------------------------------------------------------*/
- //return check_results(Cdsp, Carm, M, N);
- err_code += check_results(Cdsp, Carm, M, N);
+ err_code += check_results(Bdsp, Barm, M, N);
__free_ddr(A);
__free_ddr(A);
- __free_ddr(B);
- __free_ddr(Cdsp);
- free(Carm);
+ __free_ddr(Bdsp);
+ free(Barm);
}
*gflops_dsp = total_GFLOPS_DSP;
*gflops_arm = total_GFLOPS_ARM;
*time_dsp = total_time_dsp / (float)NUM_TEST_RUN;
*time_arm = total_time_arm / (float)NUM_TEST_RUN;
}
*gflops_dsp = total_GFLOPS_DSP;
*gflops_arm = total_GFLOPS_ARM;
*time_dsp = total_time_dsp / (float)NUM_TEST_RUN;
*time_arm = total_time_arm / (float)NUM_TEST_RUN;
-
+
return err_code;
}
return err_code;
}
+
/*-----------------------------------------------------------------------------
* check_results
*----------------------------------------------------------------------------*/
/*-----------------------------------------------------------------------------
* check_results
*----------------------------------------------------------------------------*/
@@ -314,7 +313,7 @@ int check_results(const float complex *C1, const float complex *C2, int M, int N
if (num_errors > 0)
{
printf("FAIL with %d errors!\n", num_errors);
if (num_errors > 0)
{
printf("FAIL with %d errors!\n", num_errors);
- return -1;
+ return num_errors;
}
else
{
}
else
{
diff --git a/tuning/ctrsm_tune/Makefile b/tuning/ctrsm_tune/Makefile
--- /dev/null
@@ -0,0 +1,10 @@
+
+EXE = ctrsm_tune
+
+include ../make.inc
+
+$(EXE): ctrsm_tune.o $(TUNE_UTILS)
+ $(CC) $(CFLAGS) ctrsm_tune.o $(TUNE_UTILS_OBJ) $(BLASLIB) -o $@
+
+tune: $(EXE)
+ ./$(EXE);
\ No newline at end of file
diff --git a/tuning/ctrsm_tune/ctrmm_proc.m b/tuning/ctrsm_tune/ctrmm_proc.m
--- /dev/null
@@ -0,0 +1,30 @@
+% Post-processing for the 8x8 debug dumps written by ctrsm_tune.c.
+% mat_*.dat hold the matrices before the BLAS calls, mat_*2.dat after them.
+% Each file has one "real<TAB>imag" pair per line.
+load mat_a.dat;
+load mat_b.dat;
+load mat_c.dat;
+load mat_a2.dat;
+load mat_b2.dat;
+load mat_c2.dat;
+
+A=reshape(mat_a(:,1) + j*mat_a(:,2),8,8);
+B=reshape(mat_b(:,1) + j*mat_b(:,2),8,8);
+C=reshape(mat_c(:,1) + j*mat_c(:,2),8,8);
+A2=reshape(mat_a2(:,1)+j*mat_a2(:,2),8,8);
+B2=reshape(mat_b2(:,1)+j*mat_b2(:,2),8,8);
+C2=reshape(mat_c2(:,1)+j*mat_c2(:,2),8,8);
+
+% B holds the ARM-side matrix and C the DSP-side matrix; both differences
+% should be zero.
+diff=B-C;
+diff2=B2-C2;
+
+% Reference result. ctrsm_tune.c calls cblas_ctrsm with side=Left,
+% uplo=Upper, transA=NoTrans, diag=NonUnit and alpha=0.7-0.3i, i.e. it
+% solves triu(A)*X = alpha*B. The diagonal of A must therefore be kept
+% (non-unit) and the triangular system solved rather than multiplied.
+A3=triu(A);
+
+B3=(0.7-j*0.3)*(A3\B);
diff --git a/tuning/ctrsm_tune/ctrsm_tune.c b/tuning/ctrsm_tune/ctrsm_tune.c
--- /dev/null
@@ -0,0 +1,341 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <time.h>
+#include <complex.h>
+#include "../common/tune_com.h"
+
+#include "cblas.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+#include "cblas.h"
+#ifdef __cplusplus
+}
+#endif
+
+/*-----------------------------------------------------------------------------
+* Global Variables
+*----------------------------------------------------------------------------*/
+float complex alpha = 0.7 - 0.3*I;
+enum CBLAS_ORDER order = CblasColMajor;
+enum CBLAS_TRANSPOSE transA = CblasNoTrans;
+enum CBLAS_SIDE side = CblasLeft;
+enum CBLAS_UPLO uplo = CblasUpper;
+enum CBLAS_DIAG diag = CblasNonUnit;
+
+extern int TI_CBLAS_L3_OFFLOAD;
+
+/*-----------------------------------------------------------------------------
+* Prototypes
+*----------------------------------------------------------------------------*/
+int check_results(const float complex *C1, const float complex *C2, int M, int N);
+int run_ctrsm_dsp_and_arm(int M, int N, float *time_dsp, float *time_arm,
+ float *gflops_dsp, float *gflops_arm);
+
+/*-----------------------------------------------------------------------------
+* MAIN
+*----------------------------------------------------------------------------*/
+/*
+ * Sweep M and N over NUM_MATRIX_SIZE_TO_BENCHMARK sizes each, starting at
+ * TUNING_START_SIZE_RECTAN_MATRIX and doubling every step. For every point
+ * the DSP and ARM CTRSM times are measured and the outcome is written to:
+ *   ofld_flag_ctrsm.dat      - memory flag and offload flag per point
+ *   ofld_tbl_ctrsm.c         - C source of the offload decision table
+ *   ctrsm_time_ARMvsDSP.dat  - ARM and DSP time per point
+ * A point is not measured when the neighbour at m-1 or n-1 already offloads
+ * (times logged as -1) or already ran out of memory (times logged as -2).
+ *
+ * Returns 0 on success. Terminates with EXIT_FAILURE when an output file
+ * cannot be opened or when the DSP and ARM results disagree.
+ */
+int main()
+{
+    int ctrsm_err;
+    int M, N, m, n;
+    float total_GFLOPS_DSP, total_GFLOPS_ARM;
+    float time_DSP, time_ARM, t_dsp, t_arm;
+    char ofld_flag[NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK];
+    char mem_flag[NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK];
+    FILE *fp_flag, *fp_time, *fp_tbl;
+
+    fp_flag = fopen("ofld_flag_ctrsm.dat","w");
+    fp_tbl  = fopen("ofld_tbl_ctrsm.c","w");
+    fp_time = fopen("ctrsm_time_ARMvsDSP.dat","w");
+
+    /* Every fprintf below dereferences these streams, so stop right away
+     * if any of them could not be created. */
+    if (!fp_flag || !fp_tbl || !fp_time) {
+        fprintf(stderr, "Could not open the CTRSM tuning output files!\n");
+        if (fp_flag) fclose(fp_flag);
+        if (fp_tbl)  fclose(fp_tbl);
+        if (fp_time) fclose(fp_time);
+        return EXIT_FAILURE;
+    }
+
+    print_file_header(fp_tbl);
+    fprintf(fp_tbl, "char ofld_tbl_ctrsm[TRMM_OFFLOAD_TBL_SIZE] = {\n");
+
+    srand(12345);
+
+    /* sweep M, and N */
+    for (M=TUNING_START_SIZE_RECTAN_MATRIX,m=0; m<NUM_MATRIX_SIZE_TO_BENCHMARK; m++,M*=2)
+    {
+        for (N=TUNING_START_SIZE_RECTAN_MATRIX,n=0; n<NUM_MATRIX_SIZE_TO_BENCHMARK; n++,N*=2)
+        {
+            if( (m>0 && ofld_flag[m-1][n]==OFFLOAD)
+              ||(n>0 && ofld_flag[m][n-1]==OFFLOAD) ) {
+                /* a smaller neighbour already offloads: inherit the decision */
+                ofld_flag[m][n] = OFFLOAD;
+                mem_flag[m][n] = HAS_MEMORY; // to avoid error
+                time_DSP = -1.0;
+                time_ARM = -1.0;
+                printf("Offloading. Skipping (M,N)=(%d,%d), (m,n)=(%d,%d).\n", M,N,m,n);
+            }
+            else if( (m>0 && (mem_flag[m-1][n]==NO_MEMORY))
+                   ||(n>0 && (mem_flag[m][n-1]==NO_MEMORY)) ) {
+                /* a smaller neighbour already ran out of memory */
+                ofld_flag[m][n] = NO_OFFLOAD;
+                mem_flag[m][n] = NO_MEMORY;
+                time_DSP = -2.0;
+                time_ARM = -2.0;
+                printf("Out of memory. Skipping (M,N)=(%d,%d), (m,n)=(%d,%d).\n", M,N,m,n);
+            }
+            else {
+                printf("Measuring DSP and ARM GFLOPS for (M,N)=(%d,%d), (m,n)=(%d,%d).\n", M,N,m,n);
+                ctrsm_err = run_ctrsm_dsp_and_arm(M, N, &t_dsp, &t_arm, &total_GFLOPS_DSP, &total_GFLOPS_ARM);
+
+                if(ctrsm_err == -1) { /* out of memory for DSP offloading */
+                    ofld_flag[m][n] = NO_OFFLOAD;
+                    mem_flag[m][n] = NO_MEMORY;
+                    time_DSP = -2.0;
+                    time_ARM = -2.0;
+                    printf("Out of memory, skipping next point.\n");
+                }
+                else {
+                    mem_flag[m][n] = HAS_MEMORY;
+                    time_DSP = t_dsp;
+                    time_ARM = t_arm;
+                    if (ctrsm_err == 0){
+                        /* offload only when the DSP was faster */
+                        if(t_dsp < t_arm) {
+                            ofld_flag[m][n] = OFFLOAD;
+                            printf("Offloading to DSP for this point. Skipping next point.\n");
+                        }
+                        else {
+                            ofld_flag[m][n] = NO_OFFLOAD;
+                        }
+                    }
+                    else {
+                        /* DSP and ARM results disagree: report failure to the
+                         * caller instead of exiting with a success status */
+                        printf("Error in CTRSM tuning for (M,N)=(%d,%d)!\n", M,N);
+                        exit(EXIT_FAILURE);
+                    }
+                }
+            }
+
+            fprintf(fp_flag, "%d\t%d\n", (int)mem_flag[m][n], (int)ofld_flag[m][n]);
+            fprintf(fp_time, "%6d,%6d\t%10.8e\t%10.8e\n", M, N, time_ARM, time_DSP);
+
+            /* the last table entry closes the initializer */
+            if( (m==(NUM_MATRIX_SIZE_TO_BENCHMARK-1))
+             && (n==(NUM_MATRIX_SIZE_TO_BENCHMARK-1)) ) {
+                fprintf(fp_tbl, "%d};", (int)ofld_flag[m][n]);
+            } else {
+                fprintf(fp_tbl, "%d,", (int)ofld_flag[m][n]);
+            }
+        }
+        fprintf(fp_tbl, "\n");
+    }
+
+    fclose(fp_flag);
+    fclose(fp_time);
+    fclose(fp_tbl);
+
+    return 0;
+}
+
+/*
+ * Write `count` complex values to `path`, one "real<TAB>imag" pair per line.
+ * Debug aid only: when the file cannot be opened the dump is skipped.
+ */
+static void dump_complex_matrix(const char *path, const float complex *X, long long count)
+{
+    long long i;
+    FILE *fp = fopen(path, "w");
+
+    if (!fp) {
+        printf("Could not open %s for writing, skipping dump.\n", path);
+        return;
+    }
+    for (i = 0; i < count; ++i)
+        fprintf(fp, "%1.10e\t%1.10e\n", crealf(X[i]), cimagf(X[i]));
+    fclose(fp);
+}
+
+/*
+ * Run cblas_ctrsm NUM_TEST_RUN times with offloading forced on (DSP) and
+ * forced off (ARM) on identical inputs, timing each call and comparing the
+ * two results.
+ *
+ *   M, N        dimensions of the right-hand-side matrix B
+ *   time_dsp    out: DSP time averaged over NUM_TEST_RUN runs
+ *   time_arm    out: ARM time averaged over NUM_TEST_RUN runs
+ *   gflops_dsp  out: sum of the per-run DSP GFLOPS figures
+ *   gflops_arm  out: sum of the per-run ARM GFLOPS figures
+ *
+ * Returns -1 when a matrix could not be allocated (the out parameters are
+ * then left untouched), otherwise the sum of the mismatch counts reported by
+ * check_results over all runs (0 means DSP and ARM always agreed).
+ */
+int run_ctrsm_dsp_and_arm(int M, int N, float *time_dsp, float *time_arm,
+                          float *gflops_dsp, float *gflops_arm)
+{
+    int iter,j,k;
+    long long i, size_A, size_B;
+    float time_secs, total_time_dsp, total_time_arm;
+    float gflops_ARM, gflops_DSP;
+    /* NOTE(review): 2*M*N does not look like a TRSM operation count; it only
+     * affects the reported GFLOPS figures, not the measured times - confirm. */
+    float operation_count = 2.0*(float)M*(float)N;
+    float total_GFLOPS_DSP = 0.0f;
+    float total_GFLOPS_ARM = 0.0f;
+    int err_code = 0;
+
+    total_time_dsp = 0.0;
+    total_time_arm = 0.0;
+    /* A is square; its order depends on which side it is applied from */
+    if(side == CblasLeft) {
+        size_A = (long long)M*(long long)M;
+    }
+    else {
+        size_A = (long long)N*(long long)N;
+    }
+    size_B = (long long)M*(long long)N;
+    for (iter = 0; iter < NUM_TEST_RUN; iter++)
+    {
+        /*-------------------------------------------------------------------------
+        * Allocate space for the matrices. The matrices that will be passed to
+        * the DSP are allocated using device memory. The Barm array is not passed
+        * to the dsp and so can use system memory.
+        *------------------------------------------------------------------------*/
+        float complex *A    = (float complex *) __malloc_ddr(size_A*sizeof(float complex));
+        float complex *Bdsp = (float complex *) __malloc_ddr(size_B*sizeof(float complex));
+        float complex *Barm = (float complex *) malloc(size_B*sizeof(float complex));
+
+        if (!A || !Bdsp || !Barm)
+        {
+            printf("Could not allocate enough space for the arrays!\n");
+            if(A) __free_ddr(A);
+            if(Bdsp) __free_ddr(Bdsp);
+            if(Barm) free(Barm);
+
+            return (-1);
+        }
+
+        /*-------------------------------------------------------------------------
+        * Initialize matrices
+        *------------------------------------------------------------------------*/
+        int lda = (side == CblasLeft) ? M : N;
+        int ldb = M;
+        /* Fill A block by block of lda elements: element k == j gets the real
+         * value 1+j, elements with k > j are zero, the rest are random. */
+        for(j=0;j<lda;j++)
+        {
+            for(k=0;k<lda;k++)
+            {
+                if (j==k)
+                    A[j*lda+k] = 1.0+j + 0.0*I;
+                else if (j<k)
+                    A[j*lda+k] = 0.0 + 0.0*I;
+                else
+                    A[j*lda+k] = (float)rand()/RAND_MAX + (float)rand()/RAND_MAX * I;
+            }
+        }
+
+        /* both targets start from the same right-hand side */
+        for (i = 0; i < size_B; ++i)
+        {
+            Bdsp[i] = Barm[i] = (float)rand()/RAND_MAX + (float)rand()/RAND_MAX * I;
+        }
+
+        /* dump the smallest case before the calls for offline inspection */
+        if(M==8 && N==8) {
+            dump_complex_matrix("mat_a.dat", A, size_A);
+            dump_complex_matrix("mat_b.dat", Barm, size_B);
+            dump_complex_matrix("mat_c.dat", Bdsp, size_B);
+        }
+
+        /*============ BLAS tuning: running on DSP and then on ARM =============*/
+        /*------------------------------------------------------------------------
+        * Time DSP ctrsm
+        *-----------------------------------------------------------------------*/
+        TI_CBLAS_L3_OFFLOAD = 1;
+
+        tick();
+        cblas_ctrsm(order,side,uplo,transA,diag,M,N,&alpha,A,lda,Bdsp,ldb);
+        time_secs = tock();
+        total_time_dsp += time_secs;
+        gflops_DSP = operation_count/time_secs*1e-9;
+        total_GFLOPS_DSP += gflops_DSP;
+
+        /*-------------------------------------------------------------------------
+        * Time ARM ctrsm
+        *------------------------------------------------------------------------*/
+        TI_CBLAS_L3_OFFLOAD = 0;
+
+        tick();
+        cblas_ctrsm(order,side,uplo,transA,diag,M,N,&alpha,A,lda,Barm,ldb);
+        time_secs = tock();
+        total_time_arm += time_secs;
+        gflops_ARM = operation_count/time_secs*1e-9;
+        total_GFLOPS_ARM += gflops_ARM;
+
+        /* dump the smallest case again, now holding the two results */
+        if(M==8 && N==8) {
+            dump_complex_matrix("mat_a2.dat", A, size_A);
+            dump_complex_matrix("mat_b2.dat", Barm, size_B);
+            dump_complex_matrix("mat_c2.dat", Bdsp, size_B);
+        }
+
+        /*-------------------------------------------------------------------------
+        * Verify Results
+        *------------------------------------------------------------------------*/
+        err_code += check_results(Bdsp, Barm, M, N);
+
+        __free_ddr(A);
+        __free_ddr(Bdsp);
+        free(Barm);
+    }
+
+    *gflops_dsp = total_GFLOPS_DSP;
+    *gflops_arm = total_GFLOPS_ARM;
+    *time_dsp = total_time_dsp / (float)NUM_TEST_RUN;
+    *time_arm = total_time_arm / (float)NUM_TEST_RUN;
+
+    return err_code;
+}
+
+
+/*-----------------------------------------------------------------------------
+* check_results
+*----------------------------------------------------------------------------*/
+/*
+ * Compare two M*N complex arrays element by element.
+ *
+ * An element counts as an error when the magnitude of the complex difference
+ * exceeds both the absolute tolerance DELTA and the relative tolerance
+ * EPSILON times the larger of the two magnitudes. A NaN difference also
+ * counts as an error. The first NERRORS-1 errors are printed.
+ *
+ * Returns the number of mismatching elements (0 when the arrays agree).
+ */
+int check_results(const float complex *C1, const float complex *C2, int M, int N)
+{
+    const float EPSILON = 1e-5f;
+    const float DELTA = 1e-5f;
+    const int NERRORS = 5;
+    /* computed in 64 bits: M*N overflows int for the larger sweep sizes */
+    const long long count = (long long)M * (long long)N;
+    long long i;
+    int num_errors = 0;
+
+    for (i = 0; i < count; i++)
+    {
+        float mag1 = cabsf(C1[i]);
+        float mag2 = cabsf(C2[i]);
+        float norm = (mag1 < mag2) ? mag2 : mag1;
+        /* distance between the two values; a difference of magnitudes would
+         * miss phase errors and every case where C1 is the smaller one */
+        float delta = cabsf(C1[i] - C2[i]);
+
+        /* written as a negation so that a NaN delta is reported too */
+        if (!(delta <= EPSILON*norm || delta <= DELTA))
+        {
+            if ((num_errors += 1) < NERRORS)
+                printf("Error [elem:%lld]: %f <==> %f\n", i, mag1, mag2);
+        }
+    }
+
+    if (num_errors > 0)
+    {
+        printf("FAIL with %d errors!\n", num_errors);
+        return num_errors;
+    }
+    else
+    {
+        return 0;
+    }
+}
+
+
diff --git a/tuning/dgemm_tune/Makefile b/tuning/dgemm_tune/Makefile
--- /dev/null
@@ -0,0 +1,10 @@
+
+EXE = dgemm_tune
+
+include ../make.inc
+
+$(EXE): dgemm_tune.o $(TUNE_UTILS)
+ $(CC) $(CFLAGS) dgemm_tune.o $(TUNE_UTILS_OBJ) $(BLASLIB) -o $@
+
+tune: $(EXE)
+ ./$(EXE);
\ No newline at end of file
similarity index 94%
rename from examples/tuning/dgemm_tune/dgemm_tune.c
rename to tuning/dgemm_tune/dgemm_tune.c
index 2c2741153eebbed537cefafabc80f28934a9f2a5..e81e68cf01a23bdbed09c3762e52ab8e4d5bb694 100644 (file)
rename from examples/tuning/dgemm_tune/dgemm_tune.c
rename to tuning/dgemm_tune/dgemm_tune.c
index 2c2741153eebbed537cefafabc80f28934a9f2a5..e81e68cf01a23bdbed09c3762e52ab8e4d5bb694 100644 (file)
#include <stdio.h>
#include <math.h>
#include <time.h>
#include <stdio.h>
#include <math.h>
#include <time.h>
+#include "../common/tune_com.h"
#include "cblas.h"
#ifdef __cplusplus
#include "cblas.h"
#ifdef __cplusplus
}
#endif
}
#endif
-#define TUNING_START_SIZE_SQUARE_MATRIX 16
-#define TUNING_START_SIZE_RECTAN_MATRIX 8
-#define NUM_MATRIX_SIZE_TO_BENCHMARK 16
-#define HAS_MEMORY 1
-#define NO_MEMORY 0
-#define OFFLOAD 1
-#define NO_OFFLOAD 0
-
-#define NUM_TEST_RUN 5
-
-
-/*-----------------------------------------------------------------------------
-* Timing Setup
-*----------------------------------------------------------------------------*/
-struct timespec t0,t1;
-#define tick() clock_gettime(CLOCK_MONOTONIC, &t0);
-#define tock() (clock_gettime(CLOCK_MONOTONIC, &t1), \
- t1.tv_sec - t0.tv_sec + (t1.tv_nsec - t0.tv_nsec) / 1e9)
-
/*-----------------------------------------------------------------------------
* Global Variables
*----------------------------------------------------------------------------*/
/*-----------------------------------------------------------------------------
* Global Variables
*----------------------------------------------------------------------------*/
fp_tbl = fopen("ofld_tbl_dgemm.c","w");
fp_time = fopen("dgemm_time_ARMvsDSP.dat","w");
fp_tbl = fopen("ofld_tbl_dgemm.c","w");
fp_time = fopen("dgemm_time_ARMvsDSP.dat","w");
- fprintf(fp_tbl, "char ofld_tbl_dgemm[TI_L3_OFFLOAD_TBL_SIZE] = {\n");
+ print_file_header(fp_tbl);
+ fprintf(fp_tbl, "char ofld_tbl_dgemm[GEMM_OFFLOAD_TBL_SIZE] = {\n");
srand(12345);
srand(12345);
diff --git a/tuning/dsyrk_tune/Makefile b/tuning/dsyrk_tune/Makefile
--- /dev/null
@@ -0,0 +1,10 @@
+
+EXE = dsyrk_tune
+
+include ../make.inc
+
+$(EXE): dsyrk_tune.o $(TUNE_UTILS)
+ $(CC) $(CFLAGS) dsyrk_tune.o $(TUNE_UTILS_OBJ) $(BLASLIB) -o $@
+
+tune: $(EXE)
+ ./$(EXE);
\ No newline at end of file
diff --git a/tuning/dsyrk_tune/dsyrk_proc.m b/tuning/dsyrk_tune/dsyrk_proc.m
--- /dev/null
@@ -0,0 +1,36 @@
+% Post-processing for the N-by-K debug dumps written by the dsyrk tuning run.
+N=8;
+K=8;
+load mat_a_1.dat;
+load mat_cdsp_1.dat;
+load mat_carm_1.dat;
+load mat_a_2.dat;
+load mat_cdsp_2.dat;
+load mat_carm_2.dat;
+
+% Reshape the flat dumps into matrices.
+A1=reshape(mat_a_1,N,K);
+A2=reshape(mat_a_2,N,K);
+Cdsp1=reshape(mat_cdsp_1,N,N);
+Cdsp2=reshape(mat_cdsp_2,N,N);
+Carm1=reshape(mat_carm_1,N,N);
+Carm2=reshape(mat_carm_2,N,N);
+
+% DSP result minus ARM result for each pair of dumps.
+diff1=Cdsp1-Carm1;
+diff2=Cdsp2-Carm2;
+
+% Build C from Cdsp1 with ones on the diagonal and zeros below it.
+C=Cdsp1;
+for i=1:N
+    for k=1:N
+        if k==i
+            C(i,k)=1;
+        elseif k<i
+            C(i,k)=0;
+        end
+    end
+end
+
+% Reference update and its distance from the second pair of dumps.
+Cnew=0.7*A1*transpose(A1)+0.3*C;
+
+diff3=Cnew-Cdsp2;
+diff4=Cnew-Carm2;
+
+
similarity index 53%
rename from examples/dgemm_tune/dgemm_tune.c
rename to tuning/dsyrk_tune/dsyrk_tune.c
index c3ebcc757330f302f61eb96587fe67d6537c7bc9..2d6f99341772a114ba1f53c14a62da0e28817b35 100644 (file)
rename from examples/dgemm_tune/dgemm_tune.c
rename to tuning/dsyrk_tune/dsyrk_tune.c
index c3ebcc757330f302f61eb96587fe67d6537c7bc9..2d6f99341772a114ba1f53c14a62da0e28817b35 100644 (file)
#include <stdio.h>
#include <math.h>
#include <time.h>
#include <stdio.h>
#include <math.h>
#include <time.h>
+#include "../common/tune_com.h"
#include "cblas.h"
#ifdef __cplusplus
#include "cblas.h"
#ifdef __cplusplus
}
#endif
}
#endif
-#define TUNING_START_SIZE_SQUARE_MATRIX 16
-#define TUNING_START_SIZE_RECTAN_MATRIX 8
-#define NUM_MATRIX_SIZE_TO_BENCHMARK 8 //16
-#define HAS_MEMORY 1
-#define NO_MEMORY 0
-#define OFFLOAD 1
-#define NO_OFFLOAD 0
-
-#define NUM_TEST_RUN 5
-
-
-/*-----------------------------------------------------------------------------
-* Timing Setup
-*----------------------------------------------------------------------------*/
-struct timespec t0,t1;
-#define tick() clock_gettime(CLOCK_MONOTONIC, &t0);
-#define tock() (clock_gettime(CLOCK_MONOTONIC, &t1), \
- t1.tv_sec - t0.tv_sec + (t1.tv_nsec - t0.tv_nsec) / 1e9)
-
/*-----------------------------------------------------------------------------
* Global Variables
*----------------------------------------------------------------------------*/
double alpha = 0.7;
double beta = 0.3;
enum CBLAS_ORDER order = CblasColMajor;
/*-----------------------------------------------------------------------------
* Global Variables
*----------------------------------------------------------------------------*/
double alpha = 0.7;
double beta = 0.3;
enum CBLAS_ORDER order = CblasColMajor;
-//enum CBLAS_ORDER order = CblasRowMajor;
enum CBLAS_TRANSPOSE transA = CblasNoTrans;
enum CBLAS_TRANSPOSE transA = CblasNoTrans;
-enum CBLAS_TRANSPOSE transB = CblasNoTrans;
+enum CBLAS_UPLO uplo = CblasUpper;
extern int TI_CBLAS_L3_OFFLOAD;
/*-----------------------------------------------------------------------------
* Prototypes
*----------------------------------------------------------------------------*/
extern int TI_CBLAS_L3_OFFLOAD;
/*-----------------------------------------------------------------------------
* Prototypes
*----------------------------------------------------------------------------*/
-int check_results(const double *C1, const double *C2, int M, int N);
-int run_dgemm_dsp_and_arm(int M, int N, int K, double *time_dsp, double *time_arm,
+int check_results(const double *C1, const double *C2, int N, int K);
+int run_dsyrk_dsp_and_arm(int N, int K, double *time_dsp, double *time_arm,
double *gflops_dsp, double *gflops_arm);
/*-----------------------------------------------------------------------------
double *gflops_dsp, double *gflops_arm);
/*-----------------------------------------------------------------------------
@@ -81,97 +62,92 @@ int run_dgemm_dsp_and_arm(int M, int N, int K, double *time_dsp, double *time_ar
*----------------------------------------------------------------------------*/
int main()
{
*----------------------------------------------------------------------------*/
int main()
{
- int num_size, dgemm_err;
- int M, N, K, m, n, k;
+ int num_size, dsyrk_err;
+ int N, K, n, k;
int M_pre, N_pre, K_pre, M_start_size, N_start_size;
int offload_threshold_1, offload_threshold_2;
double total_GFLOPS_DSP, total_GFLOPS_ARM;
double time_DSP, time_ARM, t_dsp, t_arm;
int M_pre, N_pre, K_pre, M_start_size, N_start_size;
int offload_threshold_1, offload_threshold_2;
double total_GFLOPS_DSP, total_GFLOPS_ARM;
double time_DSP, time_ARM, t_dsp, t_arm;
- char ofld_flag[NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK];
- char mem_flag[NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK];
+ char ofld_flag[NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK];
+ char mem_flag[NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK];
int skip_next_point;
float diff_tmp, diff_pre;
FILE *fp_flag, *fp_time, *fp_tbl;
int skip_next_point;
float diff_tmp, diff_pre;
FILE *fp_flag, *fp_time, *fp_tbl;
- fp_flag = fopen("ofld_flag_dgemm.dat","w");
- fp_tbl = fopen("ofld_tbl_dgemm.c","w");
- fp_time = fopen("dgemm_time_ARMvsDSP.dat","w");
+ fp_flag = fopen("ofld_flag_dsyrk.dat","w");
+ fp_tbl = fopen("ofld_tbl_dsyrk.c","w");
+ fp_time = fopen("dsyrk_time_ARMvsDSP.dat","w");
- fprintf(fp_tbl, "char ofld_tbl_dgemm[TI_L3_OFFLOAD_TBL_SIZE] = {\n");
+ print_file_header(fp_tbl);
+ fprintf(fp_tbl, "char ofld_tbl_dsyrk[SYRK_OFFLOAD_TBL_SIZE] = {\n");
srand(12345);
srand(12345);
- /* sweep M, K, and N */
- for (M=TUNING_START_SIZE_RECTAN_MATRIX,m=0; m<NUM_MATRIX_SIZE_TO_BENCHMARK; m++,M*=2)
+ /* sweep K, and N */
+ for (N=TUNING_START_SIZE_RECTAN_MATRIX,n=0; n<NUM_MATRIX_SIZE_TO_BENCHMARK; n++,N*=2)
{
{
- for (N=TUNING_START_SIZE_RECTAN_MATRIX,n=0; n<NUM_MATRIX_SIZE_TO_BENCHMARK; n++,N*=2)
+ for (K=TUNING_START_SIZE_RECTAN_MATRIX,k=0; k<NUM_MATRIX_SIZE_TO_BENCHMARK; k++,K*=2)
{
{
- for (K=TUNING_START_SIZE_RECTAN_MATRIX,k=0; k<NUM_MATRIX_SIZE_TO_BENCHMARK; k++,K*=2)
- {
- if( (m>0 && ofld_flag[m-1][n][k]==OFFLOAD)
- ||(n>0 && ofld_flag[m][n-1][k]==OFFLOAD)
- ||(k>0 && ofld_flag[m][n][k-1]==OFFLOAD) ) {
- ofld_flag[m][n][k] = OFFLOAD;
- mem_flag[m][n][k] = HAS_MEMORY; // to avoid error
- time_DSP = -1.0;
- time_ARM = -1.0;
- printf("Offloading. Skipping (M,N,K)=(%d,%d,%d), (m,n,k)=(%d,%d,%d).\n", M,N,K,m,n,k);
- }
- else if( (m>0 && (mem_flag[m-1][n][k]==NO_MEMORY))
- ||(n>0 && (mem_flag[m][n-1][k]==NO_MEMORY))
- ||(k>0 && (mem_flag[m][n][k-1]==NO_MEMORY))) {
- ofld_flag[m][n][k] = NO_OFFLOAD;
- mem_flag[m][n][k] = NO_MEMORY;
+ if( (n>0 && ofld_flag[n-1][k]==OFFLOAD)
+ ||(k>0 && ofld_flag[n][k-1]==OFFLOAD) ) {
+ ofld_flag[n][k] = OFFLOAD;
+ mem_flag[n][k] = HAS_MEMORY; // to avoid error
+ time_DSP = -1.0;
+ time_ARM = -1.0;
+ printf("Offloading. Skipping (N,K)=(%d,%d), (n,k)=(%d,%d).\n", N,K,n,k);
+ }
+ else if( (n>0 && (mem_flag[n-1][k]==NO_MEMORY))
+ ||(k>0 && (mem_flag[n][k-1]==NO_MEMORY))) {
+ ofld_flag[n][k] = NO_OFFLOAD;
+ mem_flag[n][k] = NO_MEMORY;
+ time_DSP = -2.0;
+ time_ARM = -2.0;
+ printf("Out of memory. Skipping (N,K)=(%d,%d), (n,k)=(%d,%d).\n", N,K,n,k);
+ }
+ else {
+ printf("Measuring DSP and ARM GFLOPS for (N,K)=(%d,%d), (n,k)=(%d,%d).\n", N,K,n,k);
+ dsyrk_err = run_dsyrk_dsp_and_arm(N, K, &t_dsp, &t_arm, &total_GFLOPS_DSP, &total_GFLOPS_ARM);
+ //dsym_err = run_dsymm_dsp_and_arm();
+
+ if(dsyrk_err == -1) { /* out of memory for DSP offloading */
+ ofld_flag[n][k] = NO_OFFLOAD;
+ mem_flag[n][k] = NO_MEMORY;
time_DSP = -2.0;
time_ARM = -2.0;
time_DSP = -2.0;
time_ARM = -2.0;
- printf("Out of memory. Skipping (M,N,K)=(%d,%d,%d), (m,n,k)=(%d,%d,%d).\n", M,N,K,m,n,k);
+ printf("Out of memory, skipping next point.\n");
}
}
- else {
- printf("Measuring DSP and ARM GFLOPS for (M,N,K)=(%d,%d,%d), (m,n,k)=(%d,%d,%d).\n", M,N,K,m,n,k);
- dgemm_err = run_dgemm_dsp_and_arm(M, N, K, &t_dsp, &t_arm, &total_GFLOPS_DSP, &total_GFLOPS_ARM);
- //dsym_err = run_dsymm_dsp_and_arm();
-
- if(dgemm_err == -1) { /* out of memory for DSP offloading */
- ofld_flag[m][n][k] = NO_OFFLOAD;
- mem_flag[m][n][k] = NO_MEMORY;
- time_DSP = -2.0;
- time_ARM = -2.0;
- printf("Out of memory, skipping next point.\n");
- }
- else {
- mem_flag[m][n][k] = HAS_MEMORY;
- time_DSP = t_dsp;
- time_ARM = t_arm;
- if (dgemm_err == 0){
- //if(total_GFLOPS_DSP - total_GFLOPS_ARM > 1.0) {
- if(t_dsp < t_arm) {
- ofld_flag[m][n][k] = OFFLOAD;
- printf("Offloading to DSP for this point. Skipping next point.\n");
- }
- else {
- ofld_flag[m][n][k] = NO_OFFLOAD;
- }
+ else {
+ mem_flag[n][k] = HAS_MEMORY;
+ time_DSP = t_dsp;
+ time_ARM = t_arm;
+ if (dsyrk_err == 0){
+ //if(total_GFLOPS_DSP - total_GFLOPS_ARM > 1.0) {
+ if(t_dsp < t_arm) {
+ ofld_flag[n][k] = OFFLOAD;
+ printf("Offloading to DSP for this point. Skipping next point.\n");
}
else {
}
else {
- printf("Error in DGEMM tuning for (M,N,K)=(%d,%d,%d)!\n", M,N,K);
- exit(0);
+ ofld_flag[n][k] = NO_OFFLOAD;
}
}
}
}
+ else {
+ printf("Error in DSYRK tuning for (N,K)=(%d,%d)!\n", N,K);
+ exit(0);
+ }
}
}
-
- fprintf(fp_flag, "%d\t%d\n", (int)mem_flag[m][n][k], (int)ofld_flag[m][n][k]);
- fprintf(fp_time, "%6d,%6d,%6d\t%10.8e\t%10.8e\n", M, N, K, time_ARM, time_DSP);
+ }
+
+ fprintf(fp_flag, "%d\t%d\n", (int)mem_flag[n][k], (int)ofld_flag[n][k]);
+ fprintf(fp_time, "%6d,%6d\t%10.8e\t%10.8e\n", N, K, time_ARM, time_DSP);
- if( (m==(NUM_MATRIX_SIZE_TO_BENCHMARK-1))
- && (n==(NUM_MATRIX_SIZE_TO_BENCHMARK-1))
- && (k==(NUM_MATRIX_SIZE_TO_BENCHMARK-1))) {
- fprintf(fp_tbl, "%d};", (int)ofld_flag[m][n][k]);
- } else {
- fprintf(fp_tbl, "%d,", (int)ofld_flag[m][n][k]);
- }
+ if( (n==(NUM_MATRIX_SIZE_TO_BENCHMARK-1))
+ && (k==(NUM_MATRIX_SIZE_TO_BENCHMARK-1))) {
+ fprintf(fp_tbl, "%d};", (int)ofld_flag[n][k]);
+ } else {
+ fprintf(fp_tbl, "%d,", (int)ofld_flag[n][k]);
}
}
- fprintf(fp_tbl, "\n");
}
}
+ fprintf(fp_tbl, "\n");
}
fclose(fp_flag);
}
fclose(fp_flag);
}
}
-int run_dgemm_dsp_and_arm(int M, int N, int K, double *time_dsp, double *time_arm,
+int run_dsyrk_dsp_and_arm(int N, int K, double *time_dsp, double *time_arm,
double *gflops_dsp, double *gflops_arm)
{
double *gflops_dsp, double *gflops_arm)
{
+ long long i, size_A, size_C;
int iter;
int iter;
- long long i;
double time_secs, total_time_dsp, total_time_arm;
double gflops_ARM, gflops_DSP;
double time_secs, total_time_dsp, total_time_arm;
double gflops_ARM, gflops_DSP;
- double operation_count = 2.0*(double)M*(double)N*(double)K;
+ double operation_count = 2.0*(double)N*(double)K;
double total_GFLOPS_DSP = 0.0f;
double total_GFLOPS_ARM = 0.0f;
int err_code = 0;
total_time_dsp = 0.0;
total_time_arm = 0.0;
double total_GFLOPS_DSP = 0.0f;
double total_GFLOPS_ARM = 0.0f;
int err_code = 0;
total_time_dsp = 0.0;
total_time_arm = 0.0;
+ size_A = (long long)N*(long long)K;
+ size_C = (long long)N*(long long)N;
+
+ if( (size_A*sizeof(double)>(long long)0x0ffffffff)
+ ||(size_C*sizeof(double)>(long long)0x0ffffffff) ) {
+ return (-1);
+ }
+
for (iter = 0; iter < NUM_TEST_RUN; iter++)
{
/*-------------------------------------------------------------------------
for (iter = 0; iter < NUM_TEST_RUN; iter++)
{
/*-------------------------------------------------------------------------
@@ -203,16 +187,14 @@ int run_dgemm_dsp_and_arm(int M, int N, int K, double *time_dsp, double *time_ar
* the DSP are allocated using device memory. The Carm array is not passed
* to the dsp and so can use system memory.
*------------------------------------------------------------------------*/
* the DSP are allocated using device memory. The Carm array is not passed
* to the dsp and so can use system memory.
*------------------------------------------------------------------------*/
- double *A = (double *) __malloc_ddr((long long)M*(long long)K*(long long)sizeof(double));
- double *B = (double *) __malloc_ddr((long long)K*(long long)N*(long long)sizeof(double));
- double *Cdsp = (double *) __malloc_ddr((long long)M*(long long)N*(long long)sizeof(double));
- double *Carm = (double *) malloc ((long long)M*(long long)N*(long long)sizeof(double));
+ double *A = (double *) __malloc_ddr(size_A*sizeof(double));
+ double *Cdsp = (double *) __malloc_ddr(size_C*sizeof(double));
+ double *Carm = (double *) malloc (size_C*sizeof(double));
- if (!A || !B || !Cdsp || !Carm)
+ if (!A || !Cdsp || !Carm)
{
printf("Could not allocate enough space for the arrays!");
if(A) __free_ddr(A);
{
printf("Could not allocate enough space for the arrays!");
if(A) __free_ddr(A);
- if(B) __free_ddr(B);
if(Cdsp) __free_ddr(Cdsp);
if(Carm) free(Carm);
if(Cdsp) __free_ddr(Cdsp);
if(Carm) free(Carm);
@@ -222,54 +204,52 @@ int run_dgemm_dsp_and_arm(int M, int N, int K, double *time_dsp, double *time_ar
/*-------------------------------------------------------------------------
* Initialize matrices
*------------------------------------------------------------------------*/
/*-------------------------------------------------------------------------
* Initialize matrices
*------------------------------------------------------------------------*/
- for (i = 0; i < (long long)M*K; ++i) A[i] = (double)rand()/RAND_MAX;// (double)(rand() % 5 + 1);
- for (i = 0; i < (long long)K*N; ++i) B[i] = (double)rand()/RAND_MAX;// (double)(rand() % 5 + 1);
- for (i = 0; i < (long long)M*N; ++i) Carm[i] = Cdsp[i] = 0;
+ for (i = 0; i < (long long)N*K; ++i) A[i] = (double)rand()/RAND_MAX;
+ for (i = 0; i < (long long)N*N; ++i) Carm[i] = Cdsp[i] = (double)rand()/RAND_MAX;
int lda = ((order == CblasColMajor && transA == CblasNoTrans) ||
int lda = ((order == CblasColMajor && transA == CblasNoTrans) ||
- (order == CblasRowMajor && transA == CblasTrans)) ? M : K;
+ (order == CblasRowMajor && transA == CblasTrans)) ? N : K;
- int ldb = ((order == CblasColMajor && transB == CblasNoTrans) ||
- (order == CblasRowMajor && transB == CblasTrans)) ? K : N;
-
- int ldc = (order == CblasColMajor) ? M : N;
+ int ldc = N;
fflush(stdout);
fflush(stdout);
-
+
+ if(N==8 && K==8) {
+ FILE *file_a = fopen("mat_a_1.dat","w");
+ FILE *file_cdsp = fopen("mat_cdsp_1.dat","w");
+ FILE *file_carm = fopen("mat_carm_1.dat","w");
+
+ for(i=0; i < size_A; ++i) fprintf(file_a, "%1.10e\n",A[i]);
+ for(i=0; i < size_C; ++i) fprintf(file_cdsp, "%1.10e\n",Cdsp[i]);
+ for(i=0; i < size_C; ++i) fprintf(file_carm, "%1.10e\n",Carm[i]);
+
+ fclose(file_a);
+ fclose(file_cdsp);
+ fclose(file_carm);
+ }
+
/*============ BLAS tuning: running on DSP and then on ARM =============*/
/*------------------------------------------------------------------------
/*============ BLAS tuning: running on DSP and then on ARM =============*/
/*------------------------------------------------------------------------
- * Time DSP dgemm
+ * Time DSP dsyrk
*-----------------------------------------------------------------------*/
*-----------------------------------------------------------------------*/
- //ti_cblas_offload_config("001"); /* force offloading level 3 to DSP */
- //printf("Running on DSP.\n");
TI_CBLAS_L3_OFFLOAD = 1;
tick();
TI_CBLAS_L3_OFFLOAD = 1;
tick();
- cblas_dgemm(order,transA,transB,M,N,K,alpha,A,lda,B,ldb,beta,Cdsp,ldc);
+ cblas_dsyrk(order,uplo,transA,N,K,alpha,A,lda,beta,Cdsp,ldc);
time_secs = tock();
total_time_dsp += time_secs;
gflops_DSP = operation_count/time_secs*1e-9;
total_GFLOPS_DSP += gflops_DSP;
time_secs = tock();
total_time_dsp += time_secs;
gflops_DSP = operation_count/time_secs*1e-9;
total_GFLOPS_DSP += gflops_DSP;
-/*
- if(M==4096 && K==256 && N==16) {
- FILE *file_a = fopen("mat_a.dat","w");
- FILE *file_b = fopen("mat_b.dat","w");
- FILE *file_c = fopen("mat_c.dat","w");
-
- for(i=0; i < M*K; ++i) fprintf(file_a, "%1.10e\n",A[i]);
- for(i=0; i < K*N; ++i) fprintf(file_b, "%1.10e\n",B[i]);
- for(i=0; i < M*N; ++i) fprintf(file_c, "%1.10e\n",Cdsp[i]);
- }
-*/
+
/*-------------------------------------------------------------------------
/*-------------------------------------------------------------------------
- * Time ARM dgemm
+ * Time ARM dsyrk
*------------------------------------------------------------------------*/
//ti_cblas_offload_config("000"); /* force no offloading */
//printf("Running on ARM.\n");
TI_CBLAS_L3_OFFLOAD = 0;
tick();
*------------------------------------------------------------------------*/
//ti_cblas_offload_config("000"); /* force no offloading */
//printf("Running on ARM.\n");
TI_CBLAS_L3_OFFLOAD = 0;
tick();
- cblas_dgemm(order,transA,transB,M,N,K,alpha,A,lda,B,ldb,beta,Carm,ldc);
+ cblas_dsyrk(order,uplo,transA,N,K,alpha,A,lda,beta,Carm,ldc);
time_secs = tock();
total_time_arm += time_secs;
gflops_ARM = operation_count/time_secs*1e-9;
time_secs = tock();
total_time_arm += time_secs;
gflops_ARM = operation_count/time_secs*1e-9;
@@ -277,14 +257,26 @@ int run_dgemm_dsp_and_arm(int M, int N, int K, double *time_dsp, double *time_ar
//printf(" %6.3f %6.3f %9.6fs %9.6fs\n", gflops_DSP, gflops_ARM, time_dsp, time_arm);
fflush(stdout);
//printf(" %6.3f %6.3f %9.6fs %9.6fs\n", gflops_DSP, gflops_ARM, time_dsp, time_arm);
fflush(stdout);
+ if(N==8 && K==8) {
+ FILE *file_a = fopen("mat_a_2.dat","w");
+ FILE *file_cdsp = fopen("mat_cdsp_2.dat","w");
+ FILE *file_carm = fopen("mat_carm_2.dat","w");
+
+ for(i=0; i < size_A; ++i) fprintf(file_a, "%1.10e\n",A[i]);
+ for(i=0; i < size_C; ++i) fprintf(file_cdsp, "%1.10e\n",Cdsp[i]);
+ for(i=0; i < size_C; ++i) fprintf(file_carm, "%1.10e\n",Carm[i]);
+
+ fclose(file_a);
+ fclose(file_cdsp);
+ fclose(file_carm);
+ }
+
/*-------------------------------------------------------------------------
* Verify Results
*------------------------------------------------------------------------*/
/*-------------------------------------------------------------------------
* Verify Results
*------------------------------------------------------------------------*/
- //return check_results(Cdsp, Carm, M, N);
- err_code += check_results(Cdsp, Carm, M, N);
+ err_code += check_results(Cdsp, Carm, N, N);
__free_ddr(A);
__free_ddr(A);
- __free_ddr(B);
__free_ddr(Cdsp);
free(Carm);
}
__free_ddr(Cdsp);
free(Carm);
}
diff --git a/tuning/dtrmm_tune/Makefile b/tuning/dtrmm_tune/Makefile
--- /dev/null
@@ -0,0 +1,10 @@
+
+# Name of the executable built and run by this Makefile.
+EXE = dtrmm_tune
+
+# CC, CFLAGS, BLASLIB, TUNE_UTILS and TUNE_UTILS_OBJ are not set in this
+# file; presumably they are provided by ../make.inc -- confirm there.
+include ../make.inc
+
+# Link dtrmm_tune.o together with $(TUNE_UTILS_OBJ) and $(BLASLIB).
+# $(TUNE_UTILS) is listed as a prerequisite so it is built first.
+$(EXE): dtrmm_tune.o $(TUNE_UTILS)
+ $(CC) $(CFLAGS) dtrmm_tune.o $(TUNE_UTILS_OBJ) $(BLASLIB) -o $@
+
+# `make tune` builds the executable if needed and then runs it.
+tune: $(EXE)
+ ./$(EXE);
\ No newline at end of file
diff --git a/tuning/dtrmm_tune/dtrmm_tune.c b/tuning/dtrmm_tune/dtrmm_tune.c
--- /dev/null
@@ -0,0 +1,287 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <time.h>
+#include "../common/tune_com.h"
+
+#include "cblas.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+#include "cblas.h"
+#ifdef __cplusplus
+}
+#endif
+
+/*-----------------------------------------------------------------------------
+* Global Variables
+*----------------------------------------------------------------------------*/
+/* Scalar and mode arguments passed unchanged to every cblas_dtrmm call in
+ * run_dtrmm_dsp_and_arm(). */
+double alpha = 0.7;
+enum CBLAS_ORDER order = CblasColMajor;
+enum CBLAS_TRANSPOSE transA = CblasNoTrans;
+/* side also selects the shape of A in run_dtrmm_dsp_and_arm():
+ * M*M elements with lda = M for CblasLeft, N*N with lda = N otherwise. */
+enum CBLAS_SIDE side = CblasLeft;
+enum CBLAS_UPLO uplo = CblasUpper;
+enum CBLAS_DIAG diag = CblasUnit;
+
+/* Defined outside this file. The tuner sets it to 1 before the DSP-timed
+ * cblas_dtrmm call and to 0 before the ARM-timed call. */
+extern int TI_CBLAS_L3_OFFLOAD;
+
+/*-----------------------------------------------------------------------------
+* Prototypes
+*----------------------------------------------------------------------------*/
+int check_results(const double *C1, const double *C2, int M, int N);
+int run_dtrmm_dsp_and_arm(int M, int N, float *time_dsp, float *time_arm,
+ float *gflops_dsp, float *gflops_arm);
+
+/*-----------------------------------------------------------------------------
+* MAIN
+*----------------------------------------------------------------------------*/
+/*
+ * Sweep (M,N) over NUM_MATRIX_SIZE_TO_BENCHMARK sizes per dimension, starting
+ * at TUNING_START_SIZE_RECTAN_MATRIX and doubling each step, and decide for
+ * every point whether DTRMM should be offloaded to the DSP.
+ *
+ * Output files (created in the current directory):
+ *   ofld_flag_dtrmm.dat      one "mem_flag<TAB>ofld_flag" row per point
+ *   dtrmm_time_ARMvsDSP.dat  one "M,N<TAB>ARM time<TAB>DSP time" row per point;
+ *                            -1.0 marks a point skipped because offload was
+ *                            inferred, -2.0 a point skipped for lack of memory
+ *   ofld_tbl_dtrmm.c         the offload flags as a C array initializer
+ *
+ * Returns 0 on success. Exits with EXIT_FAILURE when an output file cannot
+ * be opened or when the DSP and ARM results disagree for a measured point.
+ */
+int main()
+{
+    int dtrmm_err;
+    int M, N, m, n;
+    float total_GFLOPS_DSP, total_GFLOPS_ARM;
+    float time_DSP, time_ARM, t_dsp, t_arm;
+    char ofld_flag[NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK];
+    char mem_flag[NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK];
+    FILE *fp_flag, *fp_time, *fp_tbl;
+
+    fp_flag = fopen("ofld_flag_dtrmm.dat","w");
+    fp_tbl  = fopen("ofld_tbl_dtrmm.c","w");
+    fp_time = fopen("dtrmm_time_ARMvsDSP.dat","w");
+
+    /* Every fprintf below would dereference a NULL stream otherwise. */
+    if (!fp_flag || !fp_tbl || !fp_time) {
+        fprintf(stderr, "Could not open the DTRMM tuning output files!\n");
+        if (fp_flag) fclose(fp_flag);
+        if (fp_tbl)  fclose(fp_tbl);
+        if (fp_time) fclose(fp_time);
+        return EXIT_FAILURE;
+    }
+
+    print_file_header(fp_tbl);
+    fprintf(fp_tbl, "char ofld_tbl_dtrmm[TRMM_OFFLOAD_TBL_SIZE] = {\n");
+
+    /* Fixed seed: the random matrices are the same on every run. */
+    srand(12345);
+
+    /* sweep M, and N */
+    for (M=TUNING_START_SIZE_RECTAN_MATRIX,m=0; m<NUM_MATRIX_SIZE_TO_BENCHMARK; m++,M*=2)
+    {
+        for (N=TUNING_START_SIZE_RECTAN_MATRIX,n=0; n<NUM_MATRIX_SIZE_TO_BENCHMARK; n++,N*=2)
+        {
+            if( (m>0 && ofld_flag[m-1][n]==OFFLOAD)
+              ||(n>0 && ofld_flag[m][n-1]==OFFLOAD) ) {
+                /* A neighbour with one smaller dimension already offloads:
+                 * mark this point as offload too without measuring it. */
+                ofld_flag[m][n] = OFFLOAD;
+                mem_flag[m][n]  = HAS_MEMORY; // to avoid error
+                time_DSP = -1.0;
+                time_ARM = -1.0;
+                printf("Offloading. Skipping (M,N)=(%d,%d), (m,n)=(%d,%d).\n", M,N,m,n);
+            }
+            else if( (m>0 && (mem_flag[m-1][n]==NO_MEMORY))
+                   ||(n>0 && (mem_flag[m][n-1]==NO_MEMORY)) ) {
+                /* A smaller neighbour already ran out of memory: skip. */
+                ofld_flag[m][n] = NO_OFFLOAD;
+                mem_flag[m][n]  = NO_MEMORY;
+                time_DSP = -2.0;
+                time_ARM = -2.0;
+                printf("Out of memory. Skipping (M,N)=(%d,%d), (m,n)=(%d,%d).\n", M,N,m,n);
+            }
+            else {
+                printf("Measuring DSP and ARM GFLOPS for (M,N)=(%d,%d), (m,n)=(%d,%d).\n", M,N,m,n);
+                dtrmm_err = run_dtrmm_dsp_and_arm(M, N, &t_dsp, &t_arm, &total_GFLOPS_DSP, &total_GFLOPS_ARM);
+
+                if(dtrmm_err == -1) { /* out of memory for DSP offloading */
+                    ofld_flag[m][n] = NO_OFFLOAD;
+                    mem_flag[m][n]  = NO_MEMORY;
+                    time_DSP = -2.0;
+                    time_ARM = -2.0;
+                    printf("Out of memory, skipping next point.\n");
+                }
+                else if (dtrmm_err == 0) {
+                    mem_flag[m][n] = HAS_MEMORY;
+                    time_DSP = t_dsp;
+                    time_ARM = t_arm;
+                    /* Offload exactly when the DSP was faster on average. */
+                    if(t_dsp < t_arm) {
+                        ofld_flag[m][n] = OFFLOAD;
+                        printf("Offloading to DSP for this point. Skipping next point.\n");
+                    }
+                    else {
+                        ofld_flag[m][n] = NO_OFFLOAD;
+                    }
+                }
+                else {
+                    /* DSP and ARM results differ: the tables would be
+                     * meaningless, so stop and report failure to the caller
+                     * (exit() flushes and closes the open streams). */
+                    printf("Error in DTRMM tuning for (M,N)=(%d,%d)!\n", M,N);
+                    exit(EXIT_FAILURE);
+                }
+            }
+
+            fprintf(fp_flag, "%d\t%d\n", (int)mem_flag[m][n], (int)ofld_flag[m][n]);
+            fprintf(fp_time, "%6d,%6d\t%10.8e\t%10.8e\n", M, N, time_ARM, time_DSP);
+
+            /* The last table entry closes the initializer. */
+            if( (m==(NUM_MATRIX_SIZE_TO_BENCHMARK-1))
+              && (n==(NUM_MATRIX_SIZE_TO_BENCHMARK-1)) ) {
+                fprintf(fp_tbl, "%d};", (int)ofld_flag[m][n]);
+            } else {
+                fprintf(fp_tbl, "%d,", (int)ofld_flag[m][n]);
+            }
+        }
+        fprintf(fp_tbl, "\n");
+    }
+
+    fclose(fp_flag);
+    fclose(fp_time);
+    fclose(fp_tbl);
+
+    return 0;
+}
+
+
+/*
+ * Time cblas_dtrmm on the DSP and on the ARM for one (M,N) point.
+ *
+ * Runs NUM_TEST_RUN iterations. Each iteration allocates fresh random
+ * matrices, runs the same DTRMM once with TI_CBLAS_L3_OFFLOAD = 1 and once
+ * with TI_CBLAS_L3_OFFLOAD = 0, and compares the two results.
+ *
+ * M, N        dimensions of B; A is M x M when side == CblasLeft, else N x N
+ * time_dsp    out: average seconds per DSP run
+ * time_arm    out: average seconds per ARM run
+ * gflops_dsp  out: GFLOPS summed (not averaged) over all DSP runs
+ * gflops_arm  out: GFLOPS summed (not averaged) over all ARM runs
+ *
+ * Returns -1 when the matrices are too large or cannot be allocated (the
+ * out parameters are then left untouched), otherwise the total number of
+ * mismatching elements reported by check_results (0 means the results agree).
+ */
+int run_dtrmm_dsp_and_arm(int M, int N, float *time_dsp, float *time_arm,
+                          float *gflops_dsp, float *gflops_arm)
+{
+    int iter;
+    long long i, size_A, size_B;
+    float time_secs, total_time_dsp, total_time_arm;
+    float gflops_ARM, gflops_DSP;
+    /* Triangular multiply: M*M*N flops with A on the left, M*N*N on the right. */
+    float operation_count = (side == CblasLeft) ? (float)M*(float)M*(float)N
+                                                : (float)M*(float)N*(float)N;
+    float total_GFLOPS_DSP = 0.0f;
+    float total_GFLOPS_ARM = 0.0f;
+    int err_code = 0;
+    /* Largest byte count accepted for one matrix (same 4 GiB cap as the
+     * dsyrk tuner); beyond it the size would be truncated where size_t is
+     * 32 bits wide. */
+    const long long max_bytes = 0xffffffffLL;
+    int lda = (side == CblasLeft) ? M : N;
+    int ldb = M;
+
+    total_time_dsp = 0.0;
+    total_time_arm = 0.0;
+    if(side == CblasLeft) {
+        size_A = (long long)M*(long long)M;
+    }
+    else {
+        size_A = (long long)N*(long long)N;
+    }
+    size_B = (long long)M*(long long)N;
+
+    /* Compare element counts so the check itself cannot overflow. */
+    if( (size_A > max_bytes/(long long)sizeof(double))
+      ||(size_B > max_bytes/(long long)sizeof(double)) ) {
+        return (-1);
+    }
+
+    for (iter = 0; iter < NUM_TEST_RUN; iter++)
+    {
+        /*-------------------------------------------------------------------------
+        * Allocate space for the matrices. The matrices that will be passed to
+        * the DSP are allocated using device memory. The Barm array is not passed
+        * to the dsp and so can use system memory.
+        *------------------------------------------------------------------------*/
+        double *A    = (double *) __malloc_ddr(size_A*(long long)sizeof(double));
+        double *Bdsp = (double *) __malloc_ddr(size_B*(long long)sizeof(double));
+        double *Barm = (double *) malloc(size_B*(long long)sizeof(double));
+
+        if (!A || !Bdsp || !Barm)
+        {
+            printf("Could not allocate enough space for the arrays!");
+            if(A) __free_ddr(A);
+            if(Bdsp) __free_ddr(Bdsp);
+            if(Barm) free(Barm);
+
+            return (-1);
+        }
+
+        /*-------------------------------------------------------------------------
+        * Initialize matrices: A random, Bdsp and Barm identical random copies
+        * (dtrmm overwrites B in place, so each run needs its own copy).
+        *------------------------------------------------------------------------*/
+        for (i = 0; i < size_A; ++i) A[i] = (double)rand()/RAND_MAX;
+        for (i = 0; i < size_B; ++i) Bdsp[i] = Barm[i] = (double)rand()/RAND_MAX;
+
+        /*============ BLAS tuning: running on DSP and then on ARM =============*/
+        /*------------------------------------------------------------------------
+        * Time DSP dtrmm
+        *-----------------------------------------------------------------------*/
+        TI_CBLAS_L3_OFFLOAD = 1;
+
+        tick();
+        cblas_dtrmm(order,side,uplo,transA,diag,M,N,alpha,A,lda,Bdsp,ldb);
+        time_secs = tock();
+        total_time_dsp += time_secs;
+        gflops_DSP = operation_count/time_secs*1e-9;
+        total_GFLOPS_DSP += gflops_DSP;
+
+        /*-------------------------------------------------------------------------
+        * Time ARM dtrmm
+        *------------------------------------------------------------------------*/
+        TI_CBLAS_L3_OFFLOAD = 0;
+
+        tick();
+        cblas_dtrmm(order,side,uplo,transA,diag,M,N,alpha,A,lda,Barm,ldb);
+        time_secs = tock();
+        total_time_arm += time_secs;
+        gflops_ARM = operation_count/time_secs*1e-9;
+        total_GFLOPS_ARM += gflops_ARM;
+
+        /*-------------------------------------------------------------------------
+        * Verify Results
+        *------------------------------------------------------------------------*/
+        err_code += check_results(Bdsp, Barm, M, N);
+
+        __free_ddr(A);
+        __free_ddr(Bdsp);
+        free(Barm);
+    }
+
+    *gflops_dsp = total_GFLOPS_DSP;
+    *gflops_arm = total_GFLOPS_ARM;
+    *time_dsp   = total_time_dsp / (float)NUM_TEST_RUN;
+    *time_arm   = total_time_arm / (float)NUM_TEST_RUN;
+
+    return err_code;
+}
+
+
+/*-----------------------------------------------------------------------------
+* check_results
+*----------------------------------------------------------------------------*/
+/*
+ * Compare two arrays of M*N doubles element by element.
+ *
+ * An element mismatches when |C1[i] - C2[i]| is not within
+ * EPISILON * |C1[i]| (relative to C1). The test is written as a negated
+ * "<=" so that a NaN in either array counts as a mismatch instead of
+ * slipping through. The first NERRORS-1 mismatches are printed.
+ *
+ * Returns the number of mismatching elements, 0 when the arrays agree.
+ */
+int check_results(const double *C1, const double *C2, int M, int N)
+{
+    const double EPISILON = 1e-5;
+    const int NERRORS = 5;
+    /* 64-bit count and index: M*N can exceed the range of int/long. */
+    const long long count = (long long)M * (long long)N;
+    long long i;
+    int num_errors = 0;
+
+    for (i = 0; i < count; i++)
+    {
+        double delta = fabs(C1[i] - C2[i]);
+
+        if (!(delta <= EPISILON*fabs(C1[i])))
+        {
+            if ((num_errors += 1) < NERRORS)
+                printf("Error [elem:%lld]: %e <==> %e\n", i, C1[i], C2[i]);
+        }
+    }
+
+    if (num_errors > 0)
+    {
+        printf("FAIL with %d errors!\n", num_errors);
+    }
+
+    return num_errors;
+}
+
+
diff --git a/tuning/dtrmm_tune/ofld_tbl_dtrmm.c b/tuning/dtrmm_tune/ofld_tbl_dtrmm.c
--- /dev/null
@@ -0,0 +1,257 @@
+char ofld_tbl_dtrmm[TRMM_OFFLOAD_TBL_SIZE] = {
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+0,
+0,
+0,
+0,
+0,
+0,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+0,
+0,
+0,
+0,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+0,
+0,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1};
diff --git a/tuning/dtrsm_tune/Makefile b/tuning/dtrsm_tune/Makefile
--- /dev/null
@@ -0,0 +1,10 @@
+
+# Name of the executable built and run by this Makefile.
+EXE = dtrsm_tune
+
+# CC, CFLAGS, BLASLIB, TUNE_UTILS and TUNE_UTILS_OBJ are not set in this
+# file; presumably they are provided by ../make.inc -- confirm there.
+include ../make.inc
+
+# Link dtrsm_tune.o together with $(TUNE_UTILS_OBJ) and $(BLASLIB).
+# $(TUNE_UTILS) is listed as a prerequisite so it is built first.
+$(EXE): dtrsm_tune.o $(TUNE_UTILS)
+ $(CC) $(CFLAGS) dtrsm_tune.o $(TUNE_UTILS_OBJ) $(BLASLIB) -o $@
+
+# `make tune` builds the executable if needed and then runs it.
+tune: $(EXE)
+ ./$(EXE);
\ No newline at end of file
diff --git a/tuning/dtrsm_tune/dtrsm_tune.c b/tuning/dtrsm_tune/dtrsm_tune.c
--- /dev/null
@@ -0,0 +1,298 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <time.h>
+#include "../common/tune_com.h"
+
+#include "cblas.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+#include "cblas.h"
+#ifdef __cplusplus
+}
+#endif
+
+/*-----------------------------------------------------------------------------
+* Global Variables
+*----------------------------------------------------------------------------*/
+double alpha = 0.7;                          /* scalar passed as alpha to cblas_dtrsm */
+enum CBLAS_ORDER order = CblasColMajor;      /* storage order passed to cblas_dtrsm */
+enum CBLAS_TRANSPOSE transA = CblasNoTrans;  /* transpose option passed for A */
+enum CBLAS_SIDE side = CblasLeft;            /* also selects whether A is sized M*M or N*N */
+enum CBLAS_UPLO uplo = CblasUpper;           /* triangle selector passed to cblas_dtrsm */
+enum CBLAS_DIAG diag = CblasNonUnit;         /* diagonal option passed to cblas_dtrsm */
+
+extern int TI_CBLAS_L3_OFFLOAD;
+
+/*-----------------------------------------------------------------------------
+* Prototypes
+*----------------------------------------------------------------------------*/
+int check_results(const double *C1, const double *C2, int M, int N);
+int run_dtrsm_dsp_and_arm(int M, int N, float *time_dsp, float *time_arm,
+ float *gflops_dsp, float *gflops_arm);
+
+/*-----------------------------------------------------------------------------
+* MAIN
+*
+* Sweeps M and N over NUM_MATRIX_SIZE_TO_BENCHMARK sizes each, doubling from
+* TUNING_START_SIZE_RECTAN_MATRIX, and records per point whether DTRSM was
+* faster on the DSP than on the ARM. Three files are written to the working
+* directory:
+*   ofld_flag_dtrsm.dat     - memory flag and offload flag of every point
+*   dtrsm_time_ARMvsDSP.dat - M, N and the ARM / DSP times of every point
+*   ofld_tbl_dtrsm.c        - the offload flags as a C char array
+*
+* A point is only measured when neither neighbour with a smaller index is
+* already marked OFFLOAD or NO_MEMORY; otherwise it inherits that state and
+* is logged with a time of -1.0 (offload) or -2.0 (no memory).
+*
+* Returns 0 on success. Terminates with EXIT_FAILURE when an output file
+* cannot be opened or when the DSP and ARM results disagree.
+*----------------------------------------------------------------------------*/
+int main()
+{
+    int dtrsm_err;
+    int M, N, m, n;
+    float total_GFLOPS_DSP, total_GFLOPS_ARM;
+    float time_DSP, time_ARM, t_dsp, t_arm;
+    char ofld_flag[NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK];
+    char mem_flag[NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK];
+    FILE *fp_flag, *fp_time, *fp_tbl;
+
+    fp_flag = fopen("ofld_flag_dtrsm.dat","w");
+    fp_tbl  = fopen("ofld_tbl_dtrsm.c","w");
+    fp_time = fopen("dtrsm_time_ARMvsDSP.dat","w");
+
+    /* Every later fprintf dereferences these streams, so fail early. */
+    if (!fp_flag || !fp_tbl || !fp_time) {
+        perror("Could not open DTRSM tuning output files");
+        if (fp_flag) fclose(fp_flag);
+        if (fp_tbl)  fclose(fp_tbl);
+        if (fp_time) fclose(fp_time);
+        return EXIT_FAILURE;
+    }
+
+    print_file_header(fp_tbl);
+    /* NOTE(review): the emitted table is sized with TRMM_OFFLOAD_TBL_SIZE;
+     * presumably TRMM and TRSM share one table size - confirm. */
+    fprintf(fp_tbl, "char ofld_tbl_dtrsm[TRMM_OFFLOAD_TBL_SIZE] = {\n");
+
+    srand(12345);
+
+    /* sweep M, and N */
+    for (M=TUNING_START_SIZE_RECTAN_MATRIX,m=0; m<NUM_MATRIX_SIZE_TO_BENCHMARK; m++,M*=2)
+    {
+        for (N=TUNING_START_SIZE_RECTAN_MATRIX,n=0; n<NUM_MATRIX_SIZE_TO_BENCHMARK; n++,N*=2)
+        {
+            if( (m>0 && ofld_flag[m-1][n]==OFFLOAD)
+              ||(n>0 && ofld_flag[m][n-1]==OFFLOAD) ) {
+                /* A smaller neighbour already offloads: inherit, do not measure. */
+                ofld_flag[m][n] = OFFLOAD;
+                mem_flag[m][n]  = HAS_MEMORY;  // to avoid error
+                time_DSP = -1.0;
+                time_ARM = -1.0;
+                printf("Offloading. Skipping (M,N)=(%d,%d), (m,n)=(%d,%d).\n", M,N,m,n);
+            }
+            else if( (m>0 && (mem_flag[m-1][n]==NO_MEMORY))
+                   ||(n>0 && (mem_flag[m][n-1]==NO_MEMORY)) ) {
+                /* A smaller neighbour already ran out of memory. */
+                ofld_flag[m][n] = NO_OFFLOAD;
+                mem_flag[m][n]  = NO_MEMORY;
+                time_DSP = -2.0;
+                time_ARM = -2.0;
+                printf("Out of memory. Skipping (M,N)=(%d,%d), (m,n)=(%d,%d).\n", M,N,m,n);
+            }
+            else {
+                printf("Measuring DSP and ARM GFLOPS for (M,N)=(%d,%d), (m,n)=(%d,%d).\n", M,N,m,n);
+                dtrsm_err = run_dtrsm_dsp_and_arm(M, N, &t_dsp, &t_arm, &total_GFLOPS_DSP, &total_GFLOPS_ARM);
+
+                if(dtrsm_err == -1) {  /* out of memory for DSP offloading */
+                    ofld_flag[m][n] = NO_OFFLOAD;
+                    mem_flag[m][n]  = NO_MEMORY;
+                    time_DSP = -2.0;
+                    time_ARM = -2.0;
+                    printf("Out of memory, skipping next point.\n");
+                }
+                else if (dtrsm_err == 0) {
+                    mem_flag[m][n] = HAS_MEMORY;
+                    time_DSP = t_dsp;
+                    time_ARM = t_arm;
+                    /* The decision is based on the measured times only. */
+                    if(t_dsp < t_arm) {
+                        ofld_flag[m][n] = OFFLOAD;
+                        printf("Offloading to DSP for this point. Skipping next point.\n");
+                    }
+                    else {
+                        ofld_flag[m][n] = NO_OFFLOAD;
+                    }
+                }
+                else {
+                    /* DSP and ARM disagree: report failure to the caller
+                     * (exit() flushes and closes the open streams). */
+                    printf("Error in DTRSM tuning for (M,N)=(%d,%d)!\n", M,N);
+                    exit(EXIT_FAILURE);
+                }
+            }
+
+            fprintf(fp_flag, "%d\t%d\n", (int)mem_flag[m][n], (int)ofld_flag[m][n]);
+            fprintf(fp_time, "%6d,%6d\t%10.8e\t%10.8e\n", M, N, time_ARM, time_DSP);
+
+            /* The very last entry closes the array initializer. */
+            if( (m==(NUM_MATRIX_SIZE_TO_BENCHMARK-1))
+              && (n==(NUM_MATRIX_SIZE_TO_BENCHMARK-1)) ) {
+                fprintf(fp_tbl, "%d};", (int)ofld_flag[m][n]);
+            } else {
+                fprintf(fp_tbl, "%d,", (int)ofld_flag[m][n]);
+            }
+        }
+        fprintf(fp_tbl, "\n");
+    }
+
+    fclose(fp_flag);
+    fclose(fp_time);
+    fclose(fp_tbl);
+
+    return 0;
+}
+
+
+/*-----------------------------------------------------------------------------
+* run_dtrsm_dsp_and_arm
+*
+* Runs cblas_dtrsm NUM_TEST_RUN times with TI_CBLAS_L3_OFFLOAD = 1 (DSP) and
+* with TI_CBLAS_L3_OFFLOAD = 0 (ARM) on identical inputs, times both calls
+* with tick()/tock() and compares the two results.
+*
+*   M, N        - dimensions of the right-hand side matrix B
+*   time_dsp    - out: DSP time averaged over the runs
+*   time_arm    - out: ARM time averaged over the runs
+*   gflops_dsp  - out: sum over the runs of the per-run DSP GFLOPS figure
+*   gflops_arm  - out: sum over the runs of the per-run ARM GFLOPS figure
+*
+* Returns -1 when the matrices are too large to allocate (the outputs are
+* then left untouched), otherwise the number of mismatching elements
+* reported by check_results summed over all runs (0 = DSP and ARM agree).
+*----------------------------------------------------------------------------*/
+int run_dtrsm_dsp_and_arm(int M, int N, float *time_dsp, float *time_arm,
+                          float *gflops_dsp, float *gflops_arm)
+{
+    int   iter, j, k;
+    long long i, size_A, size_B;
+    float time_secs, total_time_dsp, total_time_arm;
+    float gflops_ARM, gflops_DSP;
+    /* NOTE(review): 2*M*N is used as the operation count; confirm this is the
+     * intended flop model for TRSM. It only scales the GFLOPS outputs. */
+    float operation_count = 2.0*(float)M*(float)N;
+    float total_GFLOPS_DSP = 0.0f;
+    float total_GFLOPS_ARM = 0.0f;
+    int   err_code = 0;
+    /* A is square; its order follows the side it is applied from. */
+    int   lda = (side == CblasLeft) ? M : N;
+    int   ldb = M;
+
+    total_time_dsp = 0.0;
+    total_time_arm = 0.0;
+    size_A = (long long)lda*(long long)lda;
+    size_B = (long long)M*(long long)N;
+
+    /* malloc takes a size_t byte count (and __malloc_ddr presumably does too):
+     * refuse sizes that would wrap instead of allocating a short buffer. */
+    if( ((unsigned long long)size_A > (size_t)-1 / sizeof(double))
+      ||((unsigned long long)size_B > (size_t)-1 / sizeof(double)) ) {
+        return (-1);
+    }
+
+    for (iter = 0; iter < NUM_TEST_RUN; iter++)
+    {
+        /*-------------------------------------------------------------------------
+        * Allocate space for the matrices. The matrices that will be passed to
+        * the DSP are allocated using device memory. The Barm array is not passed
+        * to the dsp and so can use system memory.
+        *------------------------------------------------------------------------*/
+        double *A    = (double *) __malloc_ddr(size_A*(long long)sizeof(double));
+        double *Bdsp = (double *) __malloc_ddr(size_B*(long long)sizeof(double));
+        double *Barm = (double *) malloc(size_B*(long long)sizeof(double));
+
+        if (!A || !Bdsp || !Barm)
+        {
+            printf("Could not allocate enough space for the arrays!\n");
+            if(A)    __free_ddr(A);
+            if(Bdsp) __free_ddr(Bdsp);
+            if(Barm) free(Barm);
+
+            return (-1);
+        }
+
+        /*-------------------------------------------------------------------------
+        * Initialize matrices: A gets a diagonal of 1+j, zeros where the second
+        * index exceeds the first and random values elsewhere, so its diagonal
+        * is never zero. Bdsp and Barm receive identical random values.
+        *------------------------------------------------------------------------*/
+        for (j = 0; j < lda; j++)
+        {
+            for (k = 0; k < lda; k++)
+            {
+                /* 64-bit index: j*lda can exceed INT_MAX for large lda. */
+                long long idx = (long long)j*lda + k;
+
+                if (j == k)
+                    A[idx] = 1.0+j;
+                else if (j < k)
+                    A[idx] = 0.0;
+                else
+                    A[idx] = (float)rand()/RAND_MAX;
+            }
+        }
+        for (i = 0; i < size_B; ++i) Bdsp[i] = Barm[i] = (double)rand()/RAND_MAX;
+
+        /*============ BLAS tuning: running on DSP and then on ARM =============*/
+        /*------------------------------------------------------------------------
+        * Time DSP dtrsm
+        *-----------------------------------------------------------------------*/
+        TI_CBLAS_L3_OFFLOAD = 1;
+
+        tick();
+        cblas_dtrsm(order,side,uplo,transA,diag,M,N,alpha,A,lda,Bdsp,ldb);
+        time_secs = tock();
+        total_time_dsp += time_secs;
+        gflops_DSP = operation_count/time_secs*1e-9;
+        total_GFLOPS_DSP += gflops_DSP;
+
+        /*-------------------------------------------------------------------------
+        * Time ARM dtrsm
+        *------------------------------------------------------------------------*/
+        TI_CBLAS_L3_OFFLOAD = 0;
+
+        tick();
+        cblas_dtrsm(order,side,uplo,transA,diag,M,N,alpha,A,lda,Barm,ldb);
+        time_secs = tock();
+        total_time_arm += time_secs;
+        gflops_ARM = operation_count/time_secs*1e-9;
+        total_GFLOPS_ARM += gflops_ARM;
+
+        /*-------------------------------------------------------------------------
+        * Verify Results
+        *------------------------------------------------------------------------*/
+        err_code += check_results(Bdsp, Barm, M, N);
+
+        __free_ddr(A);
+        __free_ddr(Bdsp);
+        free(Barm);
+    }
+
+    *gflops_dsp = total_GFLOPS_DSP;
+    *gflops_arm = total_GFLOPS_ARM;
+    *time_dsp   = total_time_dsp / (float)NUM_TEST_RUN;
+    *time_arm   = total_time_arm / (float)NUM_TEST_RUN;
+
+    return err_code;
+}
+
+
+/*-----------------------------------------------------------------------------
+* check_results
+*
+* Compares the first M*N elements of C1 and C2. An element mismatches when
+* |C1[i] - C2[i]| is not within 1e-5 * |C1[i]|; a NaN difference (including
+* inf - inf) therefore counts as a mismatch as well. The first NERRORS-1
+* mismatches are printed, followed by a summary line if there were any.
+*
+* Returns the number of mismatching elements (0 when the arrays agree).
+*----------------------------------------------------------------------------*/
+int check_results(const double *C1, const double *C2, int M, int N)
+{
+    const double    EPISILON   = 1e-5;
+    const int       NERRORS    = 5;
+    /* 64-bit count and index: M*N can exceed the range of int. */
+    const long long count      = (long long)M * N;
+    long long       i;
+    int             num_errors = 0;
+
+    for (i = 0; i < count; i++)
+    {
+        double delta = fabs(C1[i] - C2[i]);
+
+        /* Negated "<=" so that an unordered (NaN) comparison is an error. */
+        if (!(delta <= EPISILON*fabs(C1[i])))
+            if ((num_errors += 1) < NERRORS)
+                printf("Error [elem:%lld]: %e <==> %e\n", i, C1[i], C2[i]);
+    }
+
+    if (num_errors > 0)
+    {
+        printf("FAIL with %d errors!\n", num_errors);
+        return num_errors;
+    }
+
+    return 0;
+}
+
+
diff --git a/tuning/dtrsm_tune/ofld_tbl_dtrmm.c b/tuning/dtrsm_tune/ofld_tbl_dtrmm.c
--- /dev/null
@@ -0,0 +1,257 @@
+char ofld_tbl_dtrmm[TRMM_OFFLOAD_TBL_SIZE] = {
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+0,
+0,
+0,
+0,
+0,
+0,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+0,
+0,
+0,
+0,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+0,
+0,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1,
+1};
diff --git a/examples/tuning/make.inc b/tuning/make.inc
similarity index 53%
rename from examples/tuning/make.inc
rename to tuning/make.inc
index 0125a07bc946176900fb795bdf6ca3190ac8e911..ceb605e53de506c8fed95a2c7c3ec5c58614beec 100644 (file)
rename from examples/tuning/make.inc
rename to tuning/make.inc
index 0125a07bc946176900fb795bdf6ca3190ac8e911..ceb605e53de506c8fed95a2c7c3ec5c58614beec 100644 (file)
--- a/examples/tuning/make.inc
+++ b/tuning/make.inc
BLAS_LIB_DIR = /usr/lib/
BLASLIB = $(BLAS_LIB_DIR)libcblas_armplusdsp.a $(BLAS_LIB_DIR)libblis.a -lOpenCL -locl_util -lstdc++ -lrt -lm -lgomp
BLAS_LIB_DIR = /usr/lib/
BLASLIB = $(BLAS_LIB_DIR)libcblas_armplusdsp.a $(BLAS_LIB_DIR)libblis.a -lOpenCL -locl_util -lstdc++ -lrt -lm -lgomp
-
+TUNE_UTILS = ../common/print_header.o
+TUNE_UTILS_OBJ = print_header.o
%.o: %.c
@$(CC) -c $(CFLAGS) $<
%.o: %.c
@$(CC) -c $(CFLAGS) $<
$(EXE):
$(EXE):
-cross: $(EXE)
-
-clean::
- @rm -f $(EXE) *.o *.obj *.out *.asm *.if *.opt *.bc *.objc *.map *.bin *.dsp_h
+clean:
+ @rm -f $(EXE) *.o *.dat ofld_tbl*.c
-test: clean $(EXE)
- @echo Running $(EXE)
- @./$(EXE) >> /dev/null
- @if [ $$? -ne 0 ] ; then echo "FAILED !!!" ; fi
diff --git a/tuning/sgemm_tune/Makefile b/tuning/sgemm_tune/Makefile
--- /dev/null
@@ -0,0 +1,10 @@
+
+# Builds the SGEMM offload tuning benchmark; "make tune" builds and runs it.
+EXE = sgemm_tune
+
+include ../make.inc
+
+# $(TUNE_UTILS) is the prerequisite, $(TUNE_UTILS_OBJ) is the object that is
+# actually linked; both names come from ../make.inc.
+$(EXE): sgemm_tune.o $(TUNE_UTILS)
+	$(CC) $(CFLAGS) sgemm_tune.o $(TUNE_UTILS_OBJ) $(BLASLIB) -o $@
+
+# "tune" names an action, not a file, so it must always run.
+.PHONY: tune
+tune: $(EXE)
+	./$(EXE);
\ No newline at end of file
similarity index 94%
rename from examples/tuning/sgemm_tune/sgemm_tune.c
rename to tuning/sgemm_tune/sgemm_tune.c
index 16b28286cccbefb9e1831b411012928f2bd7ec90..5f33ded1880723d32713bc290f788de87b444b23 100644 (file)
rename from examples/tuning/sgemm_tune/sgemm_tune.c
rename to tuning/sgemm_tune/sgemm_tune.c
index 16b28286cccbefb9e1831b411012928f2bd7ec90..5f33ded1880723d32713bc290f788de87b444b23 100644 (file)
#include <stdio.h>
#include <math.h>
#include <time.h>
#include <stdio.h>
#include <math.h>
#include <time.h>
+#include "../common/tune_com.h"
#include "cblas.h"
#ifdef __cplusplus
#include "cblas.h"
#ifdef __cplusplus
}
#endif
}
#endif
-#define TUNING_START_SIZE_SQUARE_MATRIX 16
-#define TUNING_START_SIZE_RECTAN_MATRIX 8
-#define NUM_MATRIX_SIZE_TO_BENCHMARK 16
-#define HAS_MEMORY 1
-#define NO_MEMORY 0
-#define OFFLOAD 1
-#define NO_OFFLOAD 0
-
-#define NUM_TEST_RUN 5
-
-
-/*-----------------------------------------------------------------------------
-* Timing Setup
-*----------------------------------------------------------------------------*/
-struct timespec t0,t1;
-#define tick() clock_gettime(CLOCK_MONOTONIC, &t0);
-#define tock() (clock_gettime(CLOCK_MONOTONIC, &t1), \
- t1.tv_sec - t0.tv_sec + (t1.tv_nsec - t0.tv_nsec) / 1e9)
-
/*-----------------------------------------------------------------------------
* Global Variables
*----------------------------------------------------------------------------*/
/*-----------------------------------------------------------------------------
* Global Variables
*----------------------------------------------------------------------------*/
fp_tbl = fopen("ofld_tbl_sgemm.c","w");
fp_time = fopen("sgemm_time_ARMvsDSP.dat","w");
fp_tbl = fopen("ofld_tbl_sgemm.c","w");
fp_time = fopen("sgemm_time_ARMvsDSP.dat","w");
- fprintf(fp_tbl, "char ofld_tbl_sgemm[TI_L3_OFFLOAD_TBL_SIZE] = {\n");
+ print_file_header(fp_tbl);
+ fprintf(fp_tbl, "char ofld_tbl_sgemm[GEMM_OFFLOAD_TBL_SIZE] = {\n");
srand(12345);
srand(12345);
diff --git a/tuning/ssyrk_tune/Makefile b/tuning/ssyrk_tune/Makefile
--- /dev/null
@@ -0,0 +1,10 @@
+
+# Builds the SSYRK offload tuning benchmark; "make tune" builds and runs it.
+EXE = ssyrk_tune
+
+include ../make.inc
+
+# $(TUNE_UTILS) is the prerequisite, $(TUNE_UTILS_OBJ) is the object that is
+# actually linked; both names come from ../make.inc.
+$(EXE): ssyrk_tune.o $(TUNE_UTILS)
+	$(CC) $(CFLAGS) ssyrk_tune.o $(TUNE_UTILS_OBJ) $(BLASLIB) -o $@
+
+# "tune" names an action, not a file, so it must always run.
+.PHONY: tune
+tune: $(EXE)
+	./$(EXE);
\ No newline at end of file
diff --git a/tuning/ssyrk_tune/ssyrk_tune.c b/tuning/ssyrk_tune/ssyrk_tune.c
--- /dev/null
@@ -0,0 +1,294 @@
+/******************************************************************************
+ * Copyright (c) 2013-2015, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <time.h>
+#include "../common/tune_com.h"
+
+#include "cblas.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+#include "cblas.h"
+#ifdef __cplusplus
+}
+#endif
+
+/*-----------------------------------------------------------------------------
+* Global Variables
+*----------------------------------------------------------------------------*/
+float alpha = 0.7;                           /* scalar passed as alpha to cblas_ssyrk */
+float beta = 0.3;                            /* scalar passed as beta to cblas_ssyrk */
+enum CBLAS_ORDER order = CblasColMajor;      /* storage order; also used to pick lda */
+enum CBLAS_TRANSPOSE transA = CblasNoTrans;  /* transpose option for A; also used to pick lda */
+enum CBLAS_UPLO uplo = CblasUpper;           /* triangle selector passed to cblas_ssyrk */
+
+extern int TI_CBLAS_L3_OFFLOAD;
+/*-----------------------------------------------------------------------------
+* Prototypes
+*----------------------------------------------------------------------------*/
+int check_results(const float *C1, const float *C2, int N, int K);
+int run_ssyrk_dsp_and_arm(int N, int K, float *time_dsp, float *time_arm,
+ float *gflops_dsp, float *gflops_arm);
+
+/*-----------------------------------------------------------------------------
+* MAIN
+*
+* Sweeps N and K over NUM_MATRIX_SIZE_TO_BENCHMARK sizes each, doubling from
+* TUNING_START_SIZE_RECTAN_MATRIX, and records per point whether SSYRK was
+* faster on the DSP than on the ARM. Three files are written to the working
+* directory:
+*   ofld_flag_ssyrk.dat     - memory flag and offload flag of every point
+*   ssyrk_time_ARMvsDSP.dat - N, K and the ARM / DSP times of every point
+*   ofld_tbl_ssyrk.c        - the offload flags as a C char array
+*
+* A point is only measured when neither neighbour with a smaller index is
+* already marked OFFLOAD or NO_MEMORY; otherwise it inherits that state and
+* is logged with a time of -1.0 (offload) or -2.0 (no memory).
+*
+* Returns 0 on success. Terminates with EXIT_FAILURE when an output file
+* cannot be opened or when the DSP and ARM results disagree.
+*----------------------------------------------------------------------------*/
+int main()
+{
+    int ssyrk_err;
+    int N, K, n, k;
+    float total_GFLOPS_DSP, total_GFLOPS_ARM;
+    float time_DSP, time_ARM, t_dsp, t_arm;
+    char ofld_flag[NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK];
+    char mem_flag[NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK];
+    FILE *fp_flag, *fp_time, *fp_tbl;
+
+    fp_flag = fopen("ofld_flag_ssyrk.dat","w");
+    fp_tbl  = fopen("ofld_tbl_ssyrk.c","w");
+    fp_time = fopen("ssyrk_time_ARMvsDSP.dat","w");
+
+    /* Every later fprintf dereferences these streams, so fail early. */
+    if (!fp_flag || !fp_tbl || !fp_time) {
+        perror("Could not open SSYRK tuning output files");
+        if (fp_flag) fclose(fp_flag);
+        if (fp_tbl)  fclose(fp_tbl);
+        if (fp_time) fclose(fp_time);
+        return EXIT_FAILURE;
+    }
+
+    print_file_header(fp_tbl);
+    fprintf(fp_tbl, "char ofld_tbl_ssyrk[SYRK_OFFLOAD_TBL_SIZE] = {\n");
+
+    srand(12345);
+
+    /* sweep K, and N */
+    for (N=TUNING_START_SIZE_RECTAN_MATRIX,n=0; n<NUM_MATRIX_SIZE_TO_BENCHMARK; n++,N*=2)
+    {
+        for (K=TUNING_START_SIZE_RECTAN_MATRIX,k=0; k<NUM_MATRIX_SIZE_TO_BENCHMARK; k++,K*=2)
+        {
+            if( (n>0 && ofld_flag[n-1][k]==OFFLOAD)
+              ||(k>0 && ofld_flag[n][k-1]==OFFLOAD) ) {
+                /* A smaller neighbour already offloads: inherit, do not measure. */
+                ofld_flag[n][k] = OFFLOAD;
+                mem_flag[n][k]  = HAS_MEMORY;  // to avoid error
+                time_DSP = -1.0;
+                time_ARM = -1.0;
+                printf("Offloading. Skipping (N,K)=(%d,%d), (n,k)=(%d,%d).\n", N,K,n,k);
+            }
+            else if( (n>0 && (mem_flag[n-1][k]==NO_MEMORY))
+                   ||(k>0 && (mem_flag[n][k-1]==NO_MEMORY)) ) {
+                /* A smaller neighbour already ran out of memory. */
+                ofld_flag[n][k] = NO_OFFLOAD;
+                mem_flag[n][k]  = NO_MEMORY;
+                time_DSP = -2.0;
+                time_ARM = -2.0;
+                printf("Out of memory. Skipping (N,K)=(%d,%d), (n,k)=(%d,%d).\n", N,K,n,k);
+            }
+            else {
+                printf("Measuring DSP and ARM GFLOPS for (N,K)=(%d,%d), (n,k)=(%d,%d).\n", N,K,n,k);
+                ssyrk_err = run_ssyrk_dsp_and_arm(N, K, &t_dsp, &t_arm, &total_GFLOPS_DSP, &total_GFLOPS_ARM);
+
+                if(ssyrk_err == -1) {  /* out of memory for DSP offloading */
+                    ofld_flag[n][k] = NO_OFFLOAD;
+                    mem_flag[n][k]  = NO_MEMORY;
+                    time_DSP = -2.0;
+                    time_ARM = -2.0;
+                    printf("Out of memory, skipping next point.\n");
+                }
+                else if (ssyrk_err == 0) {
+                    mem_flag[n][k] = HAS_MEMORY;
+                    time_DSP = t_dsp;
+                    time_ARM = t_arm;
+                    /* The decision is based on the measured times only. */
+                    if(t_dsp < t_arm) {
+                        ofld_flag[n][k] = OFFLOAD;
+                        printf("Offloading to DSP for this point. Skipping next point.\n");
+                    }
+                    else {
+                        ofld_flag[n][k] = NO_OFFLOAD;
+                    }
+                }
+                else {
+                    /* DSP and ARM disagree: report failure to the caller
+                     * (exit() flushes and closes the open streams). */
+                    printf("Error in SSYRK tuning for (N,K)=(%d,%d)!\n", N,K);
+                    exit(EXIT_FAILURE);
+                }
+            }
+
+            fprintf(fp_flag, "%d\t%d\n", (int)mem_flag[n][k], (int)ofld_flag[n][k]);
+            fprintf(fp_time, "%6d,%6d\t%10.8e\t%10.8e\n", N, K, time_ARM, time_DSP);
+
+            /* The very last entry closes the array initializer. */
+            if( (n==(NUM_MATRIX_SIZE_TO_BENCHMARK-1))
+              && (k==(NUM_MATRIX_SIZE_TO_BENCHMARK-1)) ) {
+                fprintf(fp_tbl, "%d};", (int)ofld_flag[n][k]);
+            } else {
+                fprintf(fp_tbl, "%d,", (int)ofld_flag[n][k]);
+            }
+        }
+        fprintf(fp_tbl, "\n");
+    }
+
+    fclose(fp_flag);
+    fclose(fp_time);
+    fclose(fp_tbl);
+
+    return 0;
+}
+
+
+/*-----------------------------------------------------------------------------
+* run_ssyrk_dsp_and_arm
+*
+* Runs cblas_ssyrk NUM_TEST_RUN times with TI_CBLAS_L3_OFFLOAD = 1 (DSP) and
+* with TI_CBLAS_L3_OFFLOAD = 0 (ARM) on identical inputs, times both calls
+* with tick()/tock() and compares the two results.
+*
+*   N, K        - dimensions passed to cblas_ssyrk (C is N x N)
+*   time_dsp    - out: DSP time averaged over the runs
+*   time_arm    - out: ARM time averaged over the runs
+*   gflops_dsp  - out: sum over the runs of the per-run DSP GFLOPS figure
+*   gflops_arm  - out: sum over the runs of the per-run ARM GFLOPS figure
+*
+* Returns -1 when a matrix exceeds 0xffffffff bytes or cannot be allocated
+* (the outputs are then left untouched), otherwise the number of mismatching
+* elements reported by check_results summed over all runs.
+*----------------------------------------------------------------------------*/
+int run_ssyrk_dsp_and_arm(int N, int K, float *time_dsp, float *time_arm,
+                          float *gflops_dsp, float *gflops_arm)
+{
+    long long i, size_A, size_C;
+    int   iter;
+    float time_secs, total_time_dsp, total_time_arm;
+    float gflops_ARM, gflops_DSP;
+    /* NOTE(review): 2*N*K is used as the operation count; confirm this is the
+     * intended flop model for SYRK. It only scales the GFLOPS outputs. */
+    float operation_count = 2.0*(float)N*(float)K;
+    float total_GFLOPS_DSP = 0.0f;
+    float total_GFLOPS_ARM = 0.0f;
+    int   err_code = 0;
+    /* Leading dimensions depend only on the file-level configuration. */
+    int   lda = ((order == CblasColMajor && transA == CblasNoTrans) ||
+                 (order == CblasRowMajor && transA == CblasTrans)) ? N : K;
+    int   ldc = N;
+
+    total_time_dsp = 0.0;
+    total_time_arm = 0.0;
+    size_A = (long long)N*(long long)K;
+    size_C = (long long)N*(long long)N;
+
+    /* Reject matrices whose byte size does not fit in 32 bits. */
+    if( (size_A*sizeof(float)>(long long)0x0ffffffff)
+      ||(size_C*sizeof(float)>(long long)0x0ffffffff) ) {
+        return (-1);
+    }
+
+    for (iter = 0; iter < NUM_TEST_RUN; iter++)
+    {
+        /*-------------------------------------------------------------------------
+        * Allocate space for the matrices. The matrices that will be passed to
+        * the DSP are allocated using device memory. The Carm array is not passed
+        * to the dsp and so can use system memory.
+        *------------------------------------------------------------------------*/
+        float *A    = (float *) __malloc_ddr(size_A*sizeof(float));
+        float *Cdsp = (float *) __malloc_ddr(size_C*sizeof(float));
+        float *Carm = (float *) malloc      (size_C*sizeof(float));
+
+        if (!A || !Cdsp || !Carm)
+        {
+            printf("Could not allocate enough space for the arrays!\n");
+            if(A)    __free_ddr(A);
+            if(Cdsp) __free_ddr(Cdsp);
+            if(Carm) free(Carm);
+
+            return (-1);
+        }
+
+        /*-------------------------------------------------------------------------
+        * Initialize matrices: Carm and Cdsp start from identical random values
+        * so the two results can be compared element by element.
+        *------------------------------------------------------------------------*/
+        for (i = 0; i < size_A; ++i) A[i] = (float)rand()/RAND_MAX;
+        for (i = 0; i < size_C; ++i) Carm[i] = Cdsp[i] = (float)rand()/RAND_MAX;
+
+        /*============ BLAS tuning: running on DSP and then on ARM =============*/
+        /*------------------------------------------------------------------------
+        * Time DSP ssyrk
+        *-----------------------------------------------------------------------*/
+        TI_CBLAS_L3_OFFLOAD = 1;
+
+        tick();
+        cblas_ssyrk(order,uplo,transA,N,K,alpha,A,lda,beta,Cdsp,ldc);
+        time_secs = tock();
+        total_time_dsp += time_secs;
+        gflops_DSP = operation_count/time_secs*1e-9;
+        total_GFLOPS_DSP += gflops_DSP;
+
+        /*-------------------------------------------------------------------------
+        * Time ARM ssyrk
+        *------------------------------------------------------------------------*/
+        TI_CBLAS_L3_OFFLOAD = 0;
+
+        tick();
+        cblas_ssyrk(order,uplo,transA,N,K,alpha,A,lda,beta,Carm,ldc);
+        time_secs = tock();
+        total_time_arm += time_secs;
+        gflops_ARM = operation_count/time_secs*1e-9;
+        total_GFLOPS_ARM += gflops_ARM;
+        fflush(stdout);
+
+        /*-------------------------------------------------------------------------
+        * Verify Results
+        *------------------------------------------------------------------------*/
+        err_code += check_results(Cdsp, Carm, N, N);
+
+        __free_ddr(A);
+        __free_ddr(Cdsp);
+        free(Carm);
+    }
+
+    *gflops_dsp = total_GFLOPS_DSP;
+    *gflops_arm = total_GFLOPS_ARM;
+    *time_dsp   = total_time_dsp / (float)NUM_TEST_RUN;
+    *time_arm   = total_time_arm / (float)NUM_TEST_RUN;
+
+    return err_code;
+}
+
+
+/*-----------------------------------------------------------------------------
+* check_results
+*
+* Compares the first M*N elements of C1 and C2. An element mismatches when
+* |C1[i] - C2[i]| is not within 1e-5 * |C1[i]|; a NaN difference (including
+* inf - inf) therefore counts as a mismatch as well. The first NERRORS-1
+* mismatches are printed, followed by a summary line if there were any.
+*
+* Returns the number of mismatching elements (0 when the arrays agree).
+*----------------------------------------------------------------------------*/
+int check_results(const float *C1, const float *C2, int M, int N)
+{
+    const float     EPISILON   = 1e-5;
+    const int       NERRORS    = 5;
+    /* 64-bit count and index: M*N can exceed the range of int. */
+    const long long count      = (long long)M * N;
+    long long       i;
+    int             num_errors = 0;
+
+    for (i = 0; i < count; i++)
+    {
+        float delta = fabs(C1[i] - C2[i]);
+
+        /* Negated "<=" so that an unordered (NaN) comparison is an error. */
+        if (!(delta <= EPISILON*fabs(C1[i])))
+            if ((num_errors += 1) < NERRORS)
+                printf("Error [elem:%lld]: %e <==> %e\n", i, C1[i], C2[i]);
+    }
+
+    if (num_errors > 0)
+    {
+        printf("FAIL with %d errors!\n", num_errors);
+        return num_errors;
+    }
+
+    return 0;
+}
+
+
diff --git a/tuning/strmm_tune/Makefile b/tuning/strmm_tune/Makefile
--- /dev/null
@@ -0,0 +1,10 @@
+
+# Builds the STRMM offload tuning benchmark; "make tune" builds and runs it.
+EXE = strmm_tune
+
+include ../make.inc
+
+# $(TUNE_UTILS) is the prerequisite, $(TUNE_UTILS_OBJ) is the object that is
+# actually linked; both names come from ../make.inc.
+$(EXE): strmm_tune.o $(TUNE_UTILS)
+	$(CC) $(CFLAGS) strmm_tune.o $(TUNE_UTILS_OBJ) $(BLASLIB) -o $@
+
+# "tune" names an action, not a file, so it must always run.
+.PHONY: tune
+tune: $(EXE)
+	./$(EXE);
\ No newline at end of file
diff --git a/tuning/strmm_tune/strmm_proc.m b/tuning/strmm_tune/strmm_proc.m
--- /dev/null
@@ -0,0 +1,30 @@
+% Loads six dumped matrices, views each as 8x8 and derives comparison data.
+% NOTE(review): presumably the .dat files are dumps from a STRMM debug run
+% and B3 is the expected result for alpha = 0.7 - confirm.
+load('mat_a.dat');
+load('mat_b.dat');
+load('mat_c.dat');
+load('mat_a2.dat');
+load('mat_b2.dat');
+load('mat_c2.dat');
+
+A  = reshape(mat_a,  8, 8);
+B  = reshape(mat_b,  8, 8);
+C  = reshape(mat_c,  8, 8);
+A2 = reshape(mat_a2, 8, 8);
+B2 = reshape(mat_b2, 8, 8);
+C2 = reshape(mat_c2, 8, 8);
+
+% Element-wise differences between the B and C matrices of each set.
+diff  = B  - C;
+diff2 = B2 - C2;
+
+% A3: A with everything below the diagonal cleared and the diagonal set to 1.
+A3 = triu(A);
+A3(logical(eye(8))) = 1;
+
+B3 = 0.7*A3*B;
similarity index 53%
rename from examples/sgemm_tune/sgemm_tune.c
rename to tuning/strmm_tune/strmm_tune.c
index 05ffc7c6b133b614ba088eaa29413f2022fc13fd..b7d2d8b199a7d9501091cb42d0a10f9b2abbcb5d 100644 (file)
rename from examples/sgemm_tune/sgemm_tune.c
rename to tuning/strmm_tune/strmm_tune.c
index 05ffc7c6b133b614ba088eaa29413f2022fc13fd..b7d2d8b199a7d9501091cb42d0a10f9b2abbcb5d 100644 (file)
#include <stdio.h>
#include <math.h>
#include <time.h>
#include <stdio.h>
#include <math.h>
#include <time.h>
+#include "../common/tune_com.h"
#include "cblas.h"
#ifdef __cplusplus
#include "cblas.h"
#ifdef __cplusplus
}
#endif
}
#endif
-#define TUNING_START_SIZE_SQUARE_MATRIX 16
-#define TUNING_START_SIZE_RECTAN_MATRIX 8
-#define NUM_MATRIX_SIZE_TO_BENCHMARK 8 //16
-#define HAS_MEMORY 1
-#define NO_MEMORY 0
-#define OFFLOAD 1
-#define NO_OFFLOAD 0
-
-#define NUM_TEST_RUN 5
-
-
-/*-----------------------------------------------------------------------------
-* Timing Setup
-*----------------------------------------------------------------------------*/
-struct timespec t0,t1;
-#define tick() clock_gettime(CLOCK_MONOTONIC, &t0);
-#define tock() (clock_gettime(CLOCK_MONOTONIC, &t1), \
- t1.tv_sec - t0.tv_sec + (t1.tv_nsec - t0.tv_nsec) / 1e9)
-
/*-----------------------------------------------------------------------------
* Global Variables
*----------------------------------------------------------------------------*/
float alpha = 0.7;
/*-----------------------------------------------------------------------------
* Global Variables
*----------------------------------------------------------------------------*/
float alpha = 0.7;
-float beta = 0.3;
enum CBLAS_ORDER order = CblasColMajor;
enum CBLAS_ORDER order = CblasColMajor;
-//enum CBLAS_ORDER order = CblasRowMajor;
enum CBLAS_TRANSPOSE transA = CblasNoTrans;
enum CBLAS_TRANSPOSE transA = CblasNoTrans;
-enum CBLAS_TRANSPOSE transB = CblasNoTrans;
+enum CBLAS_SIDE side = CblasLeft;
+enum CBLAS_UPLO uplo = CblasUpper;
+enum CBLAS_DIAG diag = CblasUnit;
extern int TI_CBLAS_L3_OFFLOAD;
extern int TI_CBLAS_L3_OFFLOAD;
* Prototypes
*----------------------------------------------------------------------------*/
int check_results(const float *C1, const float *C2, int M, int N);
* Prototypes
*----------------------------------------------------------------------------*/
int check_results(const float *C1, const float *C2, int M, int N);
-int run_sgemm_dsp_and_arm(int M, int N, int K, float *time_dsp, float *time_arm,
+int run_strmm_dsp_and_arm(int M, int N, float *time_dsp, float *time_arm,
float *gflops_dsp, float *gflops_arm);
/*-----------------------------------------------------------------------------
float *gflops_dsp, float *gflops_arm);
/*-----------------------------------------------------------------------------
@@ -82,96 +64,91 @@ int run_sgemm_dsp_and_arm(int M, int N, int K, float *time_dsp, float *time_arm,
*----------------------------------------------------------------------------*/
int main()
{
*----------------------------------------------------------------------------*/
int main()
{
- int num_size, sgemm_err;
- int M, N, K, m, n, k;
+ int num_size, strmm_err;
+ int M, N, m, n;
int M_pre, N_pre, K_pre, M_start_size, N_start_size;
int offload_threshold_1, offload_threshold_2;
float total_GFLOPS_DSP, total_GFLOPS_ARM;
float time_DSP, time_ARM, t_dsp, t_arm;
int M_pre, N_pre, K_pre, M_start_size, N_start_size;
int offload_threshold_1, offload_threshold_2;
float total_GFLOPS_DSP, total_GFLOPS_ARM;
float time_DSP, time_ARM, t_dsp, t_arm;
- char ofld_flag[NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK];
- char mem_flag[NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK];
+ char ofld_flag[NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK];
+ char mem_flag[NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK];
int skip_next_point;
float diff_tmp, diff_pre;
FILE *fp_flag, *fp_time, *fp_tbl;
int skip_next_point;
float diff_tmp, diff_pre;
FILE *fp_flag, *fp_time, *fp_tbl;
- fp_flag = fopen("ofld_flag_sgemm.dat","w");
- fp_tbl = fopen("ofld_tbl_sgemm.c","w");
- fp_time = fopen("sgemm_time_ARMvsDSP.dat","w");
+ fp_flag = fopen("ofld_flag_strmm.dat","w");
+ fp_tbl = fopen("ofld_tbl_strmm.c","w");
+ fp_time = fopen("strmm_time_ARMvsDSP.dat","w");
- fprintf(fp_tbl, "char ofld_tbl_sgemm[TI_L3_OFFLOAD_TBL_SIZE] = {\n");
+ print_file_header(fp_tbl);
+ fprintf(fp_tbl, "char ofld_tbl_strmm[TRMM_OFFLOAD_TBL_SIZE] = {\n");
srand(12345);
srand(12345);
- /* sweep M, K, and N */
+ /* sweep M, and N */
for (M=TUNING_START_SIZE_RECTAN_MATRIX,m=0; m<NUM_MATRIX_SIZE_TO_BENCHMARK; m++,M*=2)
{
for (N=TUNING_START_SIZE_RECTAN_MATRIX,n=0; n<NUM_MATRIX_SIZE_TO_BENCHMARK; n++,N*=2)
{
for (M=TUNING_START_SIZE_RECTAN_MATRIX,m=0; m<NUM_MATRIX_SIZE_TO_BENCHMARK; m++,M*=2)
{
for (N=TUNING_START_SIZE_RECTAN_MATRIX,n=0; n<NUM_MATRIX_SIZE_TO_BENCHMARK; n++,N*=2)
{
- for (K=TUNING_START_SIZE_RECTAN_MATRIX,k=0; k<NUM_MATRIX_SIZE_TO_BENCHMARK; k++,K*=2)
- {
- if( (m>0 && ofld_flag[m-1][n][k]==OFFLOAD)
- ||(n>0 && ofld_flag[m][n-1][k]==OFFLOAD)
- ||(k>0 && ofld_flag[m][n][k-1]==OFFLOAD) ) {
- ofld_flag[m][n][k] = OFFLOAD;
- mem_flag[m][n][k] = HAS_MEMORY; // to avoid error
- time_DSP = -1.0;
- time_ARM = -1.0;
- printf("Offloading. Skipping (M,N,K)=(%d,%d,%d), (m,n,k)=(%d,%d,%d).\n", M,N,K,m,n,k);
- }
- else if( (m>0 && (mem_flag[m-1][n][k]==NO_MEMORY))
- ||(n>0 && (mem_flag[m][n-1][k]==NO_MEMORY))
- ||(k>0 && (mem_flag[m][n][k-1]==NO_MEMORY))) {
- ofld_flag[m][n][k] = NO_OFFLOAD;
- mem_flag[m][n][k] = NO_MEMORY;
+ if( (m>0 && ofld_flag[m-1][n]==OFFLOAD)
+ ||(n>0 && ofld_flag[m][n-1]==OFFLOAD) ) {
+ ofld_flag[m][n] = OFFLOAD;
+ mem_flag[m][n] = HAS_MEMORY; // to avoid error
+ time_DSP = -1.0;
+ time_ARM = -1.0;
+ printf("Offloading. Skipping (M,N)=(%d,%d), (m,n)=(%d,%d).\n", M,N,m,n);
+ }
+ else if( (m>0 && (mem_flag[m-1][n]==NO_MEMORY))
+ ||(n>0 && (mem_flag[m][n-1]==NO_MEMORY)) ) {
+ ofld_flag[m][n] = NO_OFFLOAD;
+ mem_flag[m][n] = NO_MEMORY;
+ time_DSP = -2.0;
+ time_ARM = -2.0;
+ printf("Out of memory. Skipping (M,N)=(%d,%d), (m,n)=(%d,%d).\n", M,N,m,n);
+ }
+ else {
+ printf("Measuring DSP and ARM GFLOPS for (M,N)=(%d,%d), (m,n)=(%d,%d).\n", M,N,m,n);
+ strmm_err = run_strmm_dsp_and_arm(M, N, &t_dsp, &t_arm, &total_GFLOPS_DSP, &total_GFLOPS_ARM);
+
+ if(strmm_err == -1) { /* out of memory for DSP offloading */
+ ofld_flag[m][n] = NO_OFFLOAD;
+ mem_flag[m][n] = NO_MEMORY;
time_DSP = -2.0;
time_ARM = -2.0;
time_DSP = -2.0;
time_ARM = -2.0;
- printf("Out of memory. Skipping (M,N,K)=(%d,%d,%d), (m,n,k)=(%d,%d,%d).\n", M,N,K,m,n,k);
+ printf("Out of memory, skipping next point.\n");
}
}
- else {
- printf("Measuring DSP and ARM GFLOPS for (M,N,K)=(%d,%d,%d), (m,n,k)=(%d,%d,%d).\n", M,N,K,m,n,k);
- sgemm_err = run_sgemm_dsp_and_arm(M, N, K, &t_dsp, &t_arm, &total_GFLOPS_DSP, &total_GFLOPS_ARM);
-
- if(sgemm_err == -1) { /* out of memory for DSP offloading */
- ofld_flag[m][n][k] = NO_OFFLOAD;
- mem_flag[m][n][k] = NO_MEMORY;
- time_DSP = -2.0;
- time_ARM = -2.0;
- printf("Out of memory, skipping next point.\n");
- }
- else {
- mem_flag[m][n][k] = HAS_MEMORY;
- time_DSP = t_dsp;
- time_ARM = t_arm;
- if (sgemm_err == 0){
- //if(total_GFLOPS_DSP - total_GFLOPS_ARM > 1.0) {
- if(t_dsp < t_arm) {
- ofld_flag[m][n][k] = OFFLOAD;
- printf("Offloading to DSP for this point. Skipping next point.\n");
- }
- else {
- ofld_flag[m][n][k] = NO_OFFLOAD;
- }
+ else {
+ mem_flag[m][n] = HAS_MEMORY;
+ time_DSP = t_dsp;
+ time_ARM = t_arm;
+ if (strmm_err == 0){
+ //if(total_GFLOPS_DSP - total_GFLOPS_ARM > 1.0) {
+ if(t_dsp < t_arm) {
+ ofld_flag[m][n] = OFFLOAD;
+ printf("Offloading to DSP for this point. Skipping next point.\n");
}
else {
}
else {
- printf("Error in SGEMM tuning for (M,N,K)=(%d,%d,%d)!\n", M,N,K);
- exit(0);
+ ofld_flag[m][n] = NO_OFFLOAD;
}
}
}
}
+ else {
+ printf("Error in STRMM tuning for (M,N)=(%d,%d)!\n", M,N);
+ exit(0);
+ }
}
}
-
- fprintf(fp_flag, "%d\t%d\n", (int)mem_flag[m][n][k], (int)ofld_flag[m][n][k]);
- fprintf(fp_time, "%6d,%6d,%6d\t%10.8e\t%10.8e\n", M, N, K, time_ARM, time_DSP);
+ }
+
+ fprintf(fp_flag, "%d\t%d\n", (int)mem_flag[m][n], (int)ofld_flag[m][n]);
+ fprintf(fp_time, "%6d,%6d\t%10.8e\t%10.8e\n", M, N, time_ARM, time_DSP);
- if( (m==(NUM_MATRIX_SIZE_TO_BENCHMARK-1))
- && (n==(NUM_MATRIX_SIZE_TO_BENCHMARK-1))
- && (k==(NUM_MATRIX_SIZE_TO_BENCHMARK-1))) {
- fprintf(fp_tbl, "%d};", (int)ofld_flag[m][n][k]);
- } else {
- fprintf(fp_tbl, "%d,", (int)ofld_flag[m][n][k]);
- }
+ if( (m==(NUM_MATRIX_SIZE_TO_BENCHMARK-1))
+ && (n==(NUM_MATRIX_SIZE_TO_BENCHMARK-1)) ) {
+ fprintf(fp_tbl, "%d};", (int)ofld_flag[m][n]);
+ } else {
+ fprintf(fp_tbl, "%d,", (int)ofld_flag[m][n]);
}
}
- fprintf(fp_tbl, "\n");
}
}
+ fprintf(fp_tbl, "\n");
}
fclose(fp_flag);
}
fclose(fp_flag);
}
}
-int run_sgemm_dsp_and_arm(int M, int N, int K, float *time_dsp, float *time_arm,
+int run_strmm_dsp_and_arm(int M, int N, float *time_dsp, float *time_arm,
float *gflops_dsp, float *gflops_arm)
{
float *gflops_dsp, float *gflops_arm)
{
- int iter;
- long long i;
+ int iter;
+ long long i, size_A, size_B;
float time_secs, total_time_dsp, total_time_arm;
float gflops_ARM, gflops_DSP;
float time_secs, total_time_dsp, total_time_arm;
float gflops_ARM, gflops_DSP;
- float operation_count = 2.0*(float)M*(float)N*(float)K;
+ float operation_count = 2.0*(float)M*(float)N;
float total_GFLOPS_DSP = 0.0f;
float total_GFLOPS_ARM = 0.0f;
int err_code = 0;
total_time_dsp = 0.0;
total_time_arm = 0.0;
float total_GFLOPS_DSP = 0.0f;
float total_GFLOPS_ARM = 0.0f;
int err_code = 0;
total_time_dsp = 0.0;
total_time_arm = 0.0;
+ if(side == CblasLeft) {
+ size_A = (long long)M*(long long)M;
+ }
+ else {
+ size_A = (long long)N*(long long)N;
+ }
+ size_B = (long long)M*(long long)N;
for (iter = 0; iter < NUM_TEST_RUN; iter++)
{
/*-------------------------------------------------------------------------
for (iter = 0; iter < NUM_TEST_RUN; iter++)
{
/*-------------------------------------------------------------------------
@@ -203,18 +187,16 @@ int run_sgemm_dsp_and_arm(int M, int N, int K, float *time_dsp, float *time_arm,
* the DSP are allocated using device memory. The Carm array is not passed
* to the dsp and so can use system memory.
*------------------------------------------------------------------------*/
* the DSP are allocated using device memory. The Carm array is not passed
* to the dsp and so can use system memory.
*------------------------------------------------------------------------*/
- float *A = (float *) __malloc_ddr((long long)M*(long long)K*(long long)sizeof(float));
- float *B = (float *) __malloc_ddr((long long)K*(long long)N*(long long)sizeof(float));
- float *Cdsp = (float *) __malloc_ddr((long long)M*(long long)N*(long long)sizeof(float));
- float *Carm = (float *) malloc ((long long)M*(long long)N*(long long)sizeof(float));
+ float *A = (float *) __malloc_ddr(size_A*(long long)sizeof(float));
+ float *Bdsp = (float *) __malloc_ddr(size_B*(long long)sizeof(float));
+ float *Barm = (float *) malloc(size_B*(long long)sizeof(float));
- if (!A || !B || !Cdsp || !Carm)
+ if (!A || !Bdsp || !Barm)
{
printf("Could not allocate enough space for the arrays!");
if(A) __free_ddr(A);
{
printf("Could not allocate enough space for the arrays!");
if(A) __free_ddr(A);
- if(B) __free_ddr(B);
- if(Cdsp) __free_ddr(Cdsp);
- if(Carm) free(Carm);
+ if(Bdsp) __free_ddr(Bdsp);
+ if(Barm) free(Barm);
return (-1);
}
return (-1);
}
@@ -222,69 +204,75 @@ int run_sgemm_dsp_and_arm(int M, int N, int K, float *time_dsp, float *time_arm,
/*-------------------------------------------------------------------------
* Initialize matrices
*------------------------------------------------------------------------*/
/*-------------------------------------------------------------------------
* Initialize matrices
*------------------------------------------------------------------------*/
- for (i = 0; i < (long long)M*K; ++i) A[i] = (float)rand()/RAND_MAX;
- for (i = 0; i < (long long)K*N; ++i) B[i] = (float)rand()/RAND_MAX;
- for (i = 0; i < (long long)M*N; ++i) Carm[i] = Cdsp[i] = 0;
-
- int lda = ((order == CblasColMajor && transA == CblasNoTrans) ||
- (order == CblasRowMajor && transA == CblasTrans)) ? M : K;
-
- int ldb = ((order == CblasColMajor && transB == CblasNoTrans) ||
- (order == CblasRowMajor && transB == CblasTrans)) ? K : N;
-
- int ldc = (order == CblasColMajor) ? M : N;
+ for (i = 0; i < size_A; ++i) A[i] = (float)rand()/RAND_MAX;
+ for (i = 0; i < (long long)M*N; ++i) Bdsp[i] = Barm[i] = (float)rand()/RAND_MAX;
+ //for (i = 0; i < (long long)M*N; ++i) Barm[i] = Bdsp[i];
+
+ if(M==8 && N==8) {
+ FILE *file_a = fopen("mat_a.dat","w");
+ FILE *file_b = fopen("mat_b.dat","w");
+ FILE *file_c = fopen("mat_c.dat","w");
+
+ for(i=0; i < size_A; ++i) fprintf(file_a, "%1.10e\n",A[i]);
+ for(i=0; i < M*N; ++i) fprintf(file_b, "%1.10e\n",Barm[i]);
+ for(i=0; i < M*N; ++i) fprintf(file_c, "%1.10e\n",Bdsp[i]);
+
+ fclose(file_a);
+ fclose(file_b);
+ fclose(file_c);
+ }
+
+ int lda = (side == CblasLeft) ? M : N;
+ int ldb = M;
/*============ BLAS tuning: running on DSP and then on ARM =============*/
/*------------------------------------------------------------------------
/*============ BLAS tuning: running on DSP and then on ARM =============*/
/*------------------------------------------------------------------------
- * Time DSP sgemm
+ * Time DSP strmm
*-----------------------------------------------------------------------*/
*-----------------------------------------------------------------------*/
- //ti_cblas_offload_config("001"); /* force offloading level 3 to DSP */
- //printf("Running on DSP.\n");
TI_CBLAS_L3_OFFLOAD = 1;
tick();
TI_CBLAS_L3_OFFLOAD = 1;
tick();
- cblas_sgemm(order,transA,transB,M,N,K,alpha,A,lda,B,ldb,beta,Cdsp,ldc);
+ cblas_strmm(order,side,uplo,transA,diag,M,N,alpha,A,lda,Bdsp,ldb);
time_secs = tock();
total_time_dsp += time_secs;
gflops_DSP = operation_count/time_secs*1e-9;
total_GFLOPS_DSP += gflops_DSP;
time_secs = tock();
total_time_dsp += time_secs;
gflops_DSP = operation_count/time_secs*1e-9;
total_GFLOPS_DSP += gflops_DSP;
-/*
- if(M==4096 && K==256 && N==16) {
- FILE *file_a = fopen("mat_a.dat","w");
- FILE *file_b = fopen("mat_b.dat","w");
- FILE *file_c = fopen("mat_c.dat","w");
-
- for(i=0; i < M*K; ++i) fprintf(file_a, "%1.10e\n",A[i]);
- for(i=0; i < K*N; ++i) fprintf(file_b, "%1.10e\n",B[i]);
- for(i=0; i < M*N; ++i) fprintf(file_c, "%1.10e\n",Cdsp[i]);
- }
-*/
+
/*-------------------------------------------------------------------------
/*-------------------------------------------------------------------------
- * Time ARM sgemm
+ * Time ARM strmm
*------------------------------------------------------------------------*/
*------------------------------------------------------------------------*/
- //ti_cblas_offload_config("000"); /* force no offloading */
- //printf("Running on ARM.\n");
TI_CBLAS_L3_OFFLOAD = 0;
tick();
TI_CBLAS_L3_OFFLOAD = 0;
tick();
- cblas_sgemm(order,transA,transB,M,N,K,alpha,A,lda,B,ldb,beta,Carm,ldc);
+ cblas_strmm(order,side,uplo,transA,diag,M,N,alpha,A,lda,Barm,ldb);
time_secs = tock();
total_time_arm += time_secs;
gflops_ARM = operation_count/time_secs*1e-9;
total_GFLOPS_ARM += gflops_ARM;
//printf(" %6.3f %6.3f %9.6fs %9.6fs\n", gflops_DSP, gflops_ARM, time_dsp, time_arm);
time_secs = tock();
total_time_arm += time_secs;
gflops_ARM = operation_count/time_secs*1e-9;
total_GFLOPS_ARM += gflops_ARM;
//printf(" %6.3f %6.3f %9.6fs %9.6fs\n", gflops_DSP, gflops_ARM, time_dsp, time_arm);
- fflush(stdout);
-
+
+ if(M==8 && N==8) {
+ FILE *file_a = fopen("mat_a2.dat","w");
+ FILE *file_b = fopen("mat_b2.dat","w");
+ FILE *file_c = fopen("mat_c2.dat","w");
+
+ for(i=0; i < size_A; ++i) fprintf(file_a, "%1.10e\n",A[i]);
+ for(i=0; i < M*N; ++i) fprintf(file_b, "%1.10e\n",Barm[i]);
+ for(i=0; i < M*N; ++i) fprintf(file_c, "%1.10e\n",Bdsp[i]);
+
+ fclose(file_a);
+ fclose(file_b);
+ fclose(file_c);
+ }
+
/*-------------------------------------------------------------------------
* Verify Results
*------------------------------------------------------------------------*/
/*-------------------------------------------------------------------------
* Verify Results
*------------------------------------------------------------------------*/
- //return check_results(Cdsp, Carm, M, N);
- err_code += check_results(Cdsp, Carm, M, N);
+ err_code += check_results(Bdsp, Barm, M, N);
__free_ddr(A);
__free_ddr(A);
- __free_ddr(B);
- __free_ddr(Cdsp);
- free(Carm);
+ __free_ddr(Bdsp);
+ free(Barm);
}
*gflops_dsp = total_GFLOPS_DSP;
}
*gflops_dsp = total_GFLOPS_DSP;
diff --git a/tuning/strsm_tune/Makefile b/tuning/strsm_tune/Makefile
--- /dev/null
@@ -0,0 +1,10 @@
+
+EXE = strsm_tune
+
+include ../make.inc
+
+$(EXE): strsm_tune.o $(TUNE_UTILS)
+ $(CC) $(CFLAGS) strsm_tune.o $(TUNE_UTILS_OBJ) $(BLASLIB) -o $@
+
+tune: $(EXE)
+ ./$(EXE);
\ No newline at end of file
diff --git a/tuning/strsm_tune/strsm_proc.m b/tuning/strsm_tune/strsm_proc.m
--- /dev/null
@@ -0,0 +1,39 @@
+M=1024;
+N=8;
+load mat_a.dat;
+load mat_b.dat;
+load mat_c.dat;
+load mat_a2.dat;
+load mat_b2.dat;
+load mat_c2.dat;
+
+A=reshape(mat_a,M,M);
+B=reshape(mat_b,M,N);
+C=reshape(mat_c,M,N);
+A2=reshape(mat_a2,M,M);
+B2=reshape(mat_b2,M,N);
+C2=reshape(mat_c2,M,N);
+
+diff=B-C;
+diff2=B2-C2;
+
+A3=A;
+
+if 0
+for i=1:M
+ A3(i,i)=1;
+end
+for i=1:M
+ for k=1:M
+ if k<i
+ A3(i,k)=0;
+ end
+ end
+end
+end
+
+A3_inv=inv(A3);
+B3=0.7*A3_inv*B;
+
+diff3=B3-B2;
+diff4=B3-C2;
diff --git a/tuning/strsm_tune/strsm_tune.c b/tuning/strsm_tune/strsm_tune.c
--- /dev/null
@@ -0,0 +1,335 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <time.h>
+#include "../common/tune_com.h"
+
+#include "cblas.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+#include "cblas.h"
+#ifdef __cplusplus
+}
+#endif
+
+/*-----------------------------------------------------------------------------
+* Global Variables
+*----------------------------------------------------------------------------*/
+float alpha = 0.7;
+enum CBLAS_ORDER order = CblasColMajor;
+enum CBLAS_TRANSPOSE transA = CblasNoTrans;
+enum CBLAS_SIDE side = CblasLeft;
+enum CBLAS_UPLO uplo = CblasUpper;
+enum CBLAS_DIAG diag = CblasNonUnit;
+
+extern int TI_CBLAS_L3_OFFLOAD;
+
+/*-----------------------------------------------------------------------------
+* Prototypes
+*----------------------------------------------------------------------------*/
+int check_results(const float *C1, const float *C2, int M, int N);
+int run_strsm_dsp_and_arm(int M, int N, float *time_dsp, float *time_arm,
+ float *gflops_dsp, float *gflops_arm);
+
+/*-----------------------------------------------------------------------------
+* MAIN
+*----------------------------------------------------------------------------*/
+int main()
+{
+ int num_size, strsm_err;
+ int M, N, m, n;
+ int M_pre, N_pre, K_pre, M_start_size, N_start_size;
+ int offload_threshold_1, offload_threshold_2;
+ float total_GFLOPS_DSP, total_GFLOPS_ARM;
+ float time_DSP, time_ARM, t_dsp, t_arm;
+ char ofld_flag[NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK];
+ char mem_flag[NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK];
+ int skip_next_point;
+ float diff_tmp, diff_pre;
+ FILE *fp_flag, *fp_time, *fp_tbl;
+
+ fp_flag = fopen("ofld_flag_strsm.dat","w");
+ fp_tbl = fopen("ofld_tbl_strsm.c","w");
+ fp_time = fopen("strsm_time_ARMvsDSP.dat","w");
+
+ print_file_header(fp_tbl);
+ fprintf(fp_tbl, "char ofld_tbl_strsm[TRMM_OFFLOAD_TBL_SIZE] = {\n");
+
+ srand(12345);
+
+ /* sweep M, and N */
+ for (M=TUNING_START_SIZE_RECTAN_MATRIX,m=0; m<NUM_MATRIX_SIZE_TO_BENCHMARK; m++,M*=2)
+ {
+ for (N=TUNING_START_SIZE_RECTAN_MATRIX,n=0; n<NUM_MATRIX_SIZE_TO_BENCHMARK; n++,N*=2)
+ {
+ if( (m>0 && ofld_flag[m-1][n]==OFFLOAD)
+ ||(n>0 && ofld_flag[m][n-1]==OFFLOAD) ) {
+ ofld_flag[m][n] = OFFLOAD;
+ mem_flag[m][n] = HAS_MEMORY; // to avoid error
+ time_DSP = -1.0;
+ time_ARM = -1.0;
+ printf("Offloading. Skipping (M,N)=(%d,%d), (m,n)=(%d,%d).\n", M,N,m,n);
+ }
+ else if( (m>0 && (mem_flag[m-1][n]==NO_MEMORY))
+ ||(n>0 && (mem_flag[m][n-1]==NO_MEMORY)) ) {
+ ofld_flag[m][n] = NO_OFFLOAD;
+ mem_flag[m][n] = NO_MEMORY;
+ time_DSP = -2.0;
+ time_ARM = -2.0;
+ printf("Out of memory. Skipping (M,N)=(%d,%d), (m,n)=(%d,%d).\n", M,N,m,n);
+ }
+ else {
+ printf("Measuring DSP and ARM GFLOPS for (M,N)=(%d,%d), (m,n)=(%d,%d).\n", M,N,m,n);
+ strsm_err = run_strsm_dsp_and_arm(M, N, &t_dsp, &t_arm, &total_GFLOPS_DSP, &total_GFLOPS_ARM);
+
+ if(strsm_err == -1) { /* out of memory for DSP offloading */
+ ofld_flag[m][n] = NO_OFFLOAD;
+ mem_flag[m][n] = NO_MEMORY;
+ time_DSP = -2.0;
+ time_ARM = -2.0;
+ printf("Out of memory, skipping next point.\n");
+ }
+ else {
+ mem_flag[m][n] = HAS_MEMORY;
+ time_DSP = t_dsp;
+ time_ARM = t_arm;
+ if (strsm_err == 0){
+ //if(total_GFLOPS_DSP - total_GFLOPS_ARM > 1.0) {
+ if(t_dsp < t_arm) {
+ ofld_flag[m][n] = OFFLOAD;
+ printf("Offloading to DSP for this point. Skipping next point.\n");
+ }
+ else {
+ ofld_flag[m][n] = NO_OFFLOAD;
+ }
+ }
+ else {
+ printf("Error in STRSM tuning for (M,N)=(%d,%d)!\n", M,N);
+ exit(0);
+ }
+ }
+ }
+
+ fprintf(fp_flag, "%d\t%d\n", (int)mem_flag[m][n], (int)ofld_flag[m][n]);
+ fprintf(fp_time, "%6d,%6d\t%10.8e\t%10.8e\n", M, N, time_ARM, time_DSP);
+
+ if( (m==(NUM_MATRIX_SIZE_TO_BENCHMARK-1))
+ && (n==(NUM_MATRIX_SIZE_TO_BENCHMARK-1)) ) {
+ fprintf(fp_tbl, "%d};", (int)ofld_flag[m][n]);
+ } else {
+ fprintf(fp_tbl, "%d,", (int)ofld_flag[m][n]);
+ }
+ }
+ fprintf(fp_tbl, "\n");
+ }
+
+ fclose(fp_flag);
+ fclose(fp_time);
+ fclose(fp_tbl);
+
+ return 0;
+}
+
+
+int run_strsm_dsp_and_arm(int M, int N, float *time_dsp, float *time_arm,
+ float *gflops_dsp, float *gflops_arm)
+{
+ int iter, j,k;
+ long long i, size_A, size_B;
+ float time_secs, total_time_dsp, total_time_arm;
+ float gflops_ARM, gflops_DSP;
+ float operation_count = 2.0*(float)M*(float)N;
+ float total_GFLOPS_DSP = 0.0f;
+ float total_GFLOPS_ARM = 0.0f;
+ int err_code = 0;
+
+ total_time_dsp = 0.0;
+ total_time_arm = 0.0;
+ if(side == CblasLeft) {
+ size_A = (long long)M*(long long)M;
+ }
+ else {
+ size_A = (long long)N*(long long)N;
+ }
+ size_B = (long long)M*(long long)N;
+
+ for (iter = 0; iter < NUM_TEST_RUN; iter++)
+ {
+ /*-------------------------------------------------------------------------
+ * Allocate space for the matrices. The matrices that will be passed to
+ * the DSP are allocated using device memory. The Carm array is not passed
+ * to the dsp and so can use system memory.
+ *------------------------------------------------------------------------*/
+ float *A = (float *) __malloc_ddr(size_A*(long long)sizeof(float));
+ float *Bdsp = (float *) __malloc_ddr(size_B*(long long)sizeof(float));
+ float *Barm = (float *) malloc(size_B*(long long)sizeof(float));
+
+ if (!A || !Bdsp || !Barm)
+ {
+ printf("Could not allocate enough space for the arrays!");
+ if(A) __free_ddr(A);
+ if(Bdsp) __free_ddr(Bdsp);
+ if(Barm) free(Barm);
+
+ return (-1);
+ }
+
+ /*-------------------------------------------------------------------------
+ * Initialize matrices
+ *------------------------------------------------------------------------*/
+ int lda = (side == CblasLeft) ? M : N;
+ int ldb = M;
+ //for (i = 0; i < size_A; ++i) A[i] = (float)rand()/RAND_MAX;
+ for(j=0;j<lda;j++)
+ {
+ for(k=0;k<lda;k++)
+ {
+ if (j==k)
+ A[j*lda+k] = 1.0+j;
+ else if (j<k)
+ A[j*lda+k] = 0.0;
+ else
+ A[j*lda+k] = (float)rand()/RAND_MAX;
+ }
+ }
+ for (i = 0; i < (long long)M*N; ++i) Bdsp[i] = Barm[i] = (float)rand()/RAND_MAX;
+
+ if(M==256 && N==128) {
+ FILE *file_a = fopen("mat_a.dat","w");
+ FILE *file_b = fopen("mat_b.dat","w");
+ FILE *file_c = fopen("mat_c.dat","w");
+
+ for(i=0; i < size_A; ++i) fprintf(file_a, "%1.10e\n",A[i]);
+ for(i=0; i < M*N; ++i) fprintf(file_b, "%1.10e\n",Barm[i]);
+ for(i=0; i < M*N; ++i) fprintf(file_c, "%1.10e\n",Bdsp[i]);
+
+ fclose(file_a);
+ fclose(file_b);
+ fclose(file_c);
+ }
+
+ /*============ BLAS tuning: running on DSP and then on ARM =============*/
+ /*------------------------------------------------------------------------
+ * Time DSP strsm
+ *-----------------------------------------------------------------------*/
+ TI_CBLAS_L3_OFFLOAD = 1;
+
+ tick();
+ cblas_strsm(order,side,uplo,transA,diag,M,N,alpha,A,lda,Bdsp,ldb);
+ time_secs = tock();
+ total_time_dsp += time_secs;
+ gflops_DSP = operation_count/time_secs*1e-9;
+ total_GFLOPS_DSP += gflops_DSP;
+
+ /*-------------------------------------------------------------------------
+ * Time ARM strsm
+ *------------------------------------------------------------------------*/
+ TI_CBLAS_L3_OFFLOAD = 0;
+
+ tick();
+ cblas_strsm(order,side,uplo,transA,diag,M,N,alpha,A,lda,Barm,ldb);
+ time_secs = tock();
+ total_time_arm += time_secs;
+ gflops_ARM = operation_count/time_secs*1e-9;
+ total_GFLOPS_ARM += gflops_ARM;
+ //printf(" %6.3f %6.3f %9.6fs %9.6fs\n", gflops_DSP, gflops_ARM, time_dsp, time_arm);
+
+ if(M==256 && N==128) {
+ FILE *file_a2 = fopen("mat_a2.dat","w");
+ FILE *file_b2 = fopen("mat_b2.dat","w");
+ FILE *file_c2 = fopen("mat_c2.dat","w");
+
+ for(i=0; i < size_A; ++i) fprintf(file_a2, "%1.10e\n",A[i]);
+ for(i=0; i < M*N; ++i) fprintf(file_b2, "%1.10e\n",Barm[i]);
+ for(i=0; i < M*N; ++i) fprintf(file_c2, "%1.10e\n",Bdsp[i]);
+
+ fclose(file_a2);
+ fclose(file_b2);
+ fclose(file_c2);
+ }
+
+ /*-------------------------------------------------------------------------
+ * Verify Results
+ *------------------------------------------------------------------------*/
+ err_code += check_results(Bdsp, Barm, M, N);
+
+ __free_ddr(A);
+ __free_ddr(Bdsp);
+ free(Barm);
+ }
+
+ *gflops_dsp = total_GFLOPS_DSP;
+ *gflops_arm = total_GFLOPS_ARM;
+ *time_dsp = total_time_dsp / (float)NUM_TEST_RUN;
+ *time_arm = total_time_arm / (float)NUM_TEST_RUN;
+
+ return err_code;
+}
+
+
+/*-----------------------------------------------------------------------------
+* check_results
+*----------------------------------------------------------------------------*/
+int check_results(const float *C1, const float *C2, int M, int N)
+{
+ int i;
+ float norm, delta;
+ const float EPISILON = 1e-2;
+ const float DELTA = 1e-5;
+ //const float EPISILON = 1e-200;
+ const int NERRORS = 5;
+ int num_errors = 0;
+
+ for (i=0; i<(long)M*N; i++)
+ {
+ delta = fabs(C1[i] - C2[i]);
+ norm = fabs(C1[i]);
+ if(norm < fabs(C2[i]))
+ norm = fabs(C2[i]);
+
+ if (delta > EPISILON*norm && delta>DELTA)
+ if ((num_errors += 1) < NERRORS)
+ printf("Error [elem:%d]: %e <==> %e\n", i, C1[i], C2[i]);
+ }
+
+ if (num_errors > 0)
+ {
+ printf("FAIL with %d errors!\n", num_errors);
+ return num_errors;
+ }
+ else
+ {
+ //printf("PASS!\n");
+ return 0;
+ }
+}
+
+
diff --git a/tuning/zgemm_tune/Makefile b/tuning/zgemm_tune/Makefile
--- /dev/null
@@ -0,0 +1,10 @@
+
+EXE = zgemm_tune
+
+include ../make.inc
+
+$(EXE): zgemm_tune.o $(TUNE_UTILS)
+ $(CC) $(CFLAGS) zgemm_tune.o $(TUNE_UTILS_OBJ) $(BLASLIB) -o $@
+
+tune: $(EXE)
+ ./$(EXE);
\ No newline at end of file
similarity index 94%
rename from examples/tuning/zgemm_tune/zgemm_tune.c
rename to tuning/zgemm_tune/zgemm_tune.c
index 31bf0972114c40b15e35bfe5fa4739cc058b319d..e7c639381e52a62bdb3aa0a4552f36e184138007 100644 (file)
rename from examples/tuning/zgemm_tune/zgemm_tune.c
rename to tuning/zgemm_tune/zgemm_tune.c
index 31bf0972114c40b15e35bfe5fa4739cc058b319d..e7c639381e52a62bdb3aa0a4552f36e184138007 100644 (file)
#include <math.h>
#include <time.h>
#include <complex.h>
#include <math.h>
#include <time.h>
#include <complex.h>
+#include "../common/tune_com.h"
#include "cblas.h"
#ifdef __cplusplus
#include "cblas.h"
#ifdef __cplusplus
}
#endif
}
#endif
-#define TUNING_START_SIZE_SQUARE_MATRIX 16
-#define TUNING_START_SIZE_RECTAN_MATRIX 8
-#define NUM_MATRIX_SIZE_TO_BENCHMARK 16
-#define HAS_MEMORY 1
-#define NO_MEMORY 0
-#define OFFLOAD 1
-#define NO_OFFLOAD 0
-
-#define NUM_TEST_RUN 5
-
-/*-----------------------------------------------------------------------------
-* Timing Setup
-*----------------------------------------------------------------------------*/
-struct timespec t0,t1;
-#define tick() clock_gettime(CLOCK_MONOTONIC, &t0);
-#define tock() (clock_gettime(CLOCK_MONOTONIC, &t1), \
- t1.tv_sec - t0.tv_sec + (t1.tv_nsec - t0.tv_nsec) / 1e9)
-
/*-----------------------------------------------------------------------------
* Global Variables
*----------------------------------------------------------------------------*/
/*-----------------------------------------------------------------------------
* Global Variables
*----------------------------------------------------------------------------*/
fp_tbl = fopen("ofld_tbl_zgemm.c","w");
fp_time = fopen("zgemm_time_ARMvsDSP.dat","w");
fp_tbl = fopen("ofld_tbl_zgemm.c","w");
fp_time = fopen("zgemm_time_ARMvsDSP.dat","w");
- fprintf(fp_tbl, "char ofld_tbl_zgemm[TI_L3_OFFLOAD_TBL_SIZE] = {\n");
+ print_file_header(fp_tbl);
+ fprintf(fp_tbl, "char ofld_tbl_zgemm[GEMM_OFFLOAD_TBL_SIZE] = {\n");
srand(12345);
srand(12345);
}
}
else {
}
}
else {
- printf("Error in DGEMM tuning for (M,N,K)=(%d,%d,%d)!\n", M,N,K);
+ printf("Error in ZGEMM tuning for (M,N,K)=(%d,%d,%d)!\n", M,N,K);
exit(0);
}
}
exit(0);
}
}
@@ -313,7 +297,7 @@ int check_results(const double complex *C1, const double complex *C2, int M, int
if (num_errors > 0)
{
printf("FAIL with %d errors!\n", num_errors);
if (num_errors > 0)
{
printf("FAIL with %d errors!\n", num_errors);
- return -1;
+ return num_errors;
}
else
{
}
else
{
diff --git a/tuning/zsyrk_tune/Makefile b/tuning/zsyrk_tune/Makefile
--- /dev/null
@@ -0,0 +1,10 @@
+
+EXE = zsyrk_tune
+
+include ../make.inc
+
+$(EXE): zsyrk_tune.o $(TUNE_UTILS)
+ $(CC) $(CFLAGS) zsyrk_tune.o $(TUNE_UTILS_OBJ) $(BLASLIB) -o $@
+
+tune: $(EXE)
+ ./$(EXE);
\ No newline at end of file
diff --git a/tuning/zsyrk_tune/zsyrk_tune.c b/tuning/zsyrk_tune/zsyrk_tune.c
--- /dev/null
@@ -0,0 +1,296 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <time.h>
+#include <complex.h>
+#include "../common/tune_com.h"
+
+#include "cblas.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+#include "cblas.h"
+#ifdef __cplusplus
+}
+#endif
+
+/*-----------------------------------------------------------------------------
+* Global Variables
+*----------------------------------------------------------------------------*/
+/* Scalars handed (by address) to cblas_zsyrk as its alpha and beta arguments. */
+double complex alpha = 0.7 - 0.3*I;
+double complex beta = 0.4 + 0.6*I;
+/* Layout, transpose and triangle selectors used for every cblas_zsyrk call. */
+enum CBLAS_ORDER order = CblasColMajor;
+enum CBLAS_TRANSPOSE transA = CblasNoTrans;
+enum CBLAS_UPLO uplo = CblasUpper;
+
+/* Defined outside this file. The benchmark sets it to 1 before the call that
+ * is timed as "DSP" and to 0 before the call that is timed as "ARM". */
+extern int TI_CBLAS_L3_OFFLOAD;
+
+/*-----------------------------------------------------------------------------
+* Prototypes
+*----------------------------------------------------------------------------*/
+/* Compares two result arrays element by element; returns the mismatch count. */
+int check_results(const double complex *C1, const double complex *C2, int N, int K);
+/* Times cblas_zsyrk with offload on and off for one (N,K) problem size. */
+int run_zsyrk_dsp_and_arm(int N, int K, float *time_dsp, float *time_arm,
+ float *gflops_dsp, float *gflops_arm);
+
+/*-----------------------------------------------------------------------------
+* MAIN
+*
+* Sweeps the ZSYRK problem size over an N x K grid (both dimensions start at
+* TUNING_START_SIZE_RECTAN_MATRIX and double NUM_MATRIX_SIZE_TO_BENCHMARK
+* times) and decides, for each point, whether offloading is faster.
+*
+* Output files:
+*   ofld_flag_zsyrk.dat      - one "mem_flag<TAB>ofld_flag" line per point
+*   zsyrk_time_ARMvsDSP.dat  - N, K, ARM time, DSP time per point
+*                              (-1.0 = skipped because a smaller neighbour
+*                               already offloads, -2.0 = out of memory)
+*   ofld_tbl_zsyrk.c         - C source of the resulting offload table
+*
+* Returns 0 on success. Returns EXIT_FAILURE if an output file cannot be
+* opened or if the DSP and ARM results disagree for a measured point.
+*
+* NOTE(review): OFFLOAD/NO_OFFLOAD, HAS_MEMORY/NO_MEMORY, the size macros and
+* print_file_header() are presumably provided by ../common/tune_com.h - confirm.
+*----------------------------------------------------------------------------*/
+int main()
+{
+    int zsyrk_err;
+    int N, K, n, k;
+    float total_GFLOPS_DSP, total_GFLOPS_ARM;
+    float time_DSP, time_ARM, t_dsp, t_arm;
+    char ofld_flag[NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK];
+    char mem_flag[NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK];
+    FILE *fp_flag, *fp_time, *fp_tbl;
+
+    fp_flag = fopen("ofld_flag_zsyrk.dat","w");
+    fp_tbl = fopen("ofld_tbl_zsyrk.c","w");
+    fp_time = fopen("zsyrk_time_ARMvsDSP.dat","w");
+
+    /* Writing through a NULL FILE* is undefined; bail out before any output. */
+    if (!fp_flag || !fp_tbl || !fp_time) {
+        printf("Could not open output files for ZSYRK tuning!\n");
+        if (fp_flag) fclose(fp_flag);
+        if (fp_tbl) fclose(fp_tbl);
+        if (fp_time) fclose(fp_time);
+        return EXIT_FAILURE;
+    }
+
+    print_file_header(fp_tbl);
+    fprintf(fp_tbl, "char ofld_tbl_zsyrk[SYRK_OFFLOAD_TBL_SIZE] = {\n");
+
+    /* Fixed seed so every tuning run uses the same matrix contents. */
+    srand(12345);
+
+    /* sweep K, and N */
+    for (N=TUNING_START_SIZE_RECTAN_MATRIX,n=0; n<NUM_MATRIX_SIZE_TO_BENCHMARK; n++,N*=2)
+    {
+        for (K=TUNING_START_SIZE_RECTAN_MATRIX,k=0; k<NUM_MATRIX_SIZE_TO_BENCHMARK; k++,K*=2)
+        {
+            /* A neighbour with smaller N or K already offloads: mark this
+             * point as offload too without measuring it. */
+            if( (n>0 && ofld_flag[n-1][k]==OFFLOAD)
+              ||(k>0 && ofld_flag[n][k-1]==OFFLOAD) ) {
+                ofld_flag[n][k] = OFFLOAD;
+                mem_flag[n][k] = HAS_MEMORY; // to avoid error
+                time_DSP = -1.0;
+                time_ARM = -1.0;
+                printf("Offloading. Skipping (N,K)=(%d,%d), (n,k)=(%d,%d).\n", N,K,n,k);
+            }
+            /* A smaller neighbour already ran out of memory: skip as well. */
+            else if( (n>0 && (mem_flag[n-1][k]==NO_MEMORY))
+                   ||(k>0 && (mem_flag[n][k-1]==NO_MEMORY))) {
+                ofld_flag[n][k] = NO_OFFLOAD;
+                mem_flag[n][k] = NO_MEMORY;
+                time_DSP = -2.0;
+                time_ARM = -2.0;
+                printf("Out of memory. Skipping (N,K)=(%d,%d), (n,k)=(%d,%d).\n", N,K,n,k);
+            }
+            else {
+                printf("Measuring DSP and ARM GFLOPS for (N,K)=(%d,%d), (n,k)=(%d,%d).\n", N,K,n,k);
+                zsyrk_err = run_zsyrk_dsp_and_arm(N, K, &t_dsp, &t_arm, &total_GFLOPS_DSP, &total_GFLOPS_ARM);
+
+                if(zsyrk_err == -1) { /* out of memory for DSP offloading */
+                    ofld_flag[n][k] = NO_OFFLOAD;
+                    mem_flag[n][k] = NO_MEMORY;
+                    time_DSP = -2.0;
+                    time_ARM = -2.0;
+                    printf("Out of memory, skipping next point.\n");
+                }
+                else if (zsyrk_err == 0) {
+                    mem_flag[n][k] = HAS_MEMORY;
+                    time_DSP = t_dsp;
+                    time_ARM = t_arm;
+                    if(t_dsp < t_arm) {
+                        ofld_flag[n][k] = OFFLOAD;
+                        printf("Offloading to DSP for this point. Skipping next point.\n");
+                    }
+                    else {
+                        ofld_flag[n][k] = NO_OFFLOAD;
+                    }
+                }
+                else {
+                    /* DSP and ARM results disagree: the table would be
+                     * meaningless, so stop and report failure to the caller. */
+                    printf("Error in ZSYRK tuning for (N,K)=(%d,%d)!\n", N,K);
+                    fclose(fp_flag);
+                    fclose(fp_time);
+                    fclose(fp_tbl);
+                    return EXIT_FAILURE;
+                }
+            }
+
+            fprintf(fp_flag, "%d\t%d\n", (int)mem_flag[n][k], (int)ofld_flag[n][k]);
+            fprintf(fp_time, "%6d,%6d\t%10.8e\t%10.8e\n", N, K, time_ARM, time_DSP);
+
+            /* The very last table entry closes the array initialiser. */
+            if( (n==(NUM_MATRIX_SIZE_TO_BENCHMARK-1))
+             && (k==(NUM_MATRIX_SIZE_TO_BENCHMARK-1))) {
+                fprintf(fp_tbl, "%d};", (int)ofld_flag[n][k]);
+            } else {
+                fprintf(fp_tbl, "%d,", (int)ofld_flag[n][k]);
+            }
+        }
+        fprintf(fp_tbl, "\n");
+    }
+
+    fclose(fp_flag);
+    fclose(fp_time);
+    fclose(fp_tbl);
+
+    return 0;
+}
+
+/*-----------------------------------------------------------------------------
+* run_zsyrk_dsp_and_arm
+*
+* Runs cblas_zsyrk NUM_TEST_RUN times on an N x K input, once with
+* TI_CBLAS_L3_OFFLOAD = 1 (timed as "DSP") and once with it set to 0 (timed
+* as "ARM"), on identical random data, and compares the two results.
+*
+* Outputs (written only when the function does not return -1):
+*   time_dsp, time_arm     - average seconds per call over NUM_TEST_RUN runs
+*   gflops_dsp, gflops_arm - GFLOPS summed (not averaged) over the runs
+*
+* Returns -1 if a matrix would exceed 0xffffffff bytes or an allocation
+* fails; otherwise the total number of mismatching elements reported by
+* check_results over all runs (0 means DSP and ARM agree).
+*----------------------------------------------------------------------------*/
+int run_zsyrk_dsp_and_arm(int N, int K, float *time_dsp, float *time_arm,
+                          float *gflops_dsp, float *gflops_arm)
+{
+    int iter;
+    long long i, size_A, size_C;
+    float time_secs, total_time_dsp, total_time_arm;
+    float gflops_ARM, gflops_DSP;
+    /* Complex SYRK updates one triangle of C: N*(N+1)/2 entries, each needing
+     * K complex multiply-adds of 8 real flops => 4*N*(N+1)*K flops in total. */
+    float operation_count = 4.0*(float)N*((float)N + 1.0)*(float)K;
+    float total_GFLOPS_DSP = 0.0f;
+    float total_GFLOPS_ARM = 0.0f;
+    int err_code = 0;
+
+    total_time_dsp = 0.0;
+    total_time_arm = 0.0;
+    size_A = (long long)N*(long long)K;
+    size_C = (long long)N*(long long)N;
+    /* Refuse problem sizes whose byte count does not fit in 32 bits. */
+    if( (size_A*sizeof(double complex)>(long long)0x0ffffffff)
+      ||(size_C*sizeof(double complex)>(long long)0x0ffffffff) ) {
+        return (-1);
+    }
+
+    for (iter = 0; iter < NUM_TEST_RUN; iter++)
+    {
+        /*-------------------------------------------------------------------------
+        * Allocate space for the matrices. The matrices that will be passed to
+        * the DSP are allocated using device memory. The Carm array is not passed
+        * to the dsp and so can use system memory.
+        *------------------------------------------------------------------------*/
+        double complex *A = (double complex *) __malloc_ddr(size_A*sizeof(double complex));
+        double complex *Cdsp = (double complex *) __malloc_ddr(size_C*sizeof(double complex));
+        double complex *Carm = (double complex *) malloc(size_C*sizeof(double complex));
+
+        if (!A || !Cdsp || !Carm)
+        {
+            /* Newline-terminated so the caller's next message starts on its own line. */
+            printf("Could not allocate enough space for the arrays!\n");
+            if(A) __free_ddr(A);
+            if(Cdsp) __free_ddr(Cdsp);
+            if(Carm) free(Carm);
+
+            return (-1);
+        }
+
+        /*-------------------------------------------------------------------------
+        * Initialize matrices
+        *------------------------------------------------------------------------*/
+        for (i = 0; i < size_A; ++i)
+        {
+            A[i] = (double)rand()/RAND_MAX + (double)rand()/RAND_MAX * I;
+        }
+
+        /* Both result buffers start from the same C so they can be compared. */
+        for (i = 0; i < size_C; ++i)
+        {
+            Cdsp[i] = Carm[i] = (double)rand()/RAND_MAX + (double)rand()/RAND_MAX * I;
+        }
+
+        int lda = ((order == CblasColMajor && transA == CblasNoTrans) ||
+                   (order == CblasRowMajor && transA == CblasTrans)) ? N : K;
+
+        int ldc = N;
+
+        /*============ BLAS tuning: running on DSP and then on ARM =============*/
+        /*------------------------------------------------------------------------
+        * Time DSP zsyrk
+        *-----------------------------------------------------------------------*/
+        TI_CBLAS_L3_OFFLOAD = 1;
+
+        tick();
+        cblas_zsyrk(order,uplo,transA,N,K,&alpha,A,lda,&beta,Cdsp,ldc);
+        time_secs = tock();
+        total_time_dsp += time_secs;
+        gflops_DSP = operation_count/time_secs*1e-9;
+        total_GFLOPS_DSP += gflops_DSP;
+
+        /*-------------------------------------------------------------------------
+        * Time ARM zsyrk
+        *------------------------------------------------------------------------*/
+        TI_CBLAS_L3_OFFLOAD = 0;
+
+        tick();
+        cblas_zsyrk(order,uplo,transA,N,K,&alpha,A,lda,&beta,Carm,ldc);
+        time_secs = tock();
+        total_time_arm += time_secs;
+        gflops_ARM = operation_count/time_secs*1e-9;
+        total_GFLOPS_ARM += gflops_ARM;
+
+        /*-------------------------------------------------------------------------
+        * Verify Results
+        *------------------------------------------------------------------------*/
+        err_code += check_results(Cdsp, Carm, N, N);
+
+        __free_ddr(A);
+        __free_ddr(Cdsp);
+        free(Carm);
+    }
+
+    *gflops_dsp = total_GFLOPS_DSP;
+    *gflops_arm = total_GFLOPS_ARM;
+    *time_dsp = total_time_dsp / (float)NUM_TEST_RUN;
+    *time_arm = total_time_arm / (float)NUM_TEST_RUN;
+
+    return err_code;
+}
+
+
+/*-----------------------------------------------------------------------------
+* check_results
+*
+* Compares the first M*N elements of C1 and C2. An element counts as an
+* error when |C1[i] - C2[i]| exceeds a relative tolerance of 1e-5 times
+* |C1[i]|. The complex difference is used (not the difference of the
+* magnitudes) so that deviations in either direction, and deviations in
+* phase or sign only, are detected.
+*
+* The first NERRORS-1 mismatches are printed. Returns the number of
+* mismatching elements, or 0 if the arrays agree.
+*----------------------------------------------------------------------------*/
+int check_results(const double complex *C1, const double complex *C2, int M, int N)
+{
+    const double EPSILON = 1e-5;
+    const int NERRORS = 5;
+    /* Widen before multiplying so large M*N cannot overflow int. */
+    const long long count = (long long)M * (long long)N;
+    long long i;
+    int num_errors = 0;
+
+    for (i = 0; i < count; i++)
+    {
+        double delta = cabs(C1[i] - C2[i]);
+
+        if (delta > EPSILON*cabs(C1[i]))
+        {
+            num_errors += 1;
+            if (num_errors < NERRORS)
+                printf("Error [elem:%lld]: %f <==> %f\n", i, cabs(C1[i]), cabs(C2[i]));
+        }
+    }
+
+    if (num_errors > 0)
+    {
+        printf("FAIL with %d errors!\n", num_errors);
+        return num_errors;
+    }
+
+    return 0;
+}
+
+
diff --git a/tuning/ztrmm_tune/Makefile b/tuning/ztrmm_tune/Makefile
--- /dev/null
@@ -0,0 +1,10 @@
+
+# Makefile for the ZTRMM tuning program.
+EXE = ztrmm_tune
+
+# NOTE(review): CC, CFLAGS, BLASLIB, TUNE_UTILS and TUNE_UTILS_OBJ are not
+# defined in this file; presumably they are provided by ../make.inc - confirm.
+include ../make.inc
+
+# Link ztrmm_tune.o together with $(TUNE_UTILS_OBJ) and $(BLASLIB).
+$(EXE): ztrmm_tune.o $(TUNE_UTILS)
+ $(CC) $(CFLAGS) ztrmm_tune.o $(TUNE_UTILS_OBJ) $(BLASLIB) -o $@
+
+# 'make tune' builds the executable if necessary and then runs it.
+tune: $(EXE)
+ ./$(EXE);
\ No newline at end of file
similarity index 53%
rename from examples/zgemm_tune/zgemm_tune.c
rename to tuning/ztrmm_tune/ztrmm_tune.c
index a8e83d9be346a31c946bdde934e6da5945aa32c9..65b9c96b0966e1b74eedc25817bbe719ad8e2aa0 100644 (file)
rename from examples/zgemm_tune/zgemm_tune.c
rename to tuning/ztrmm_tune/ztrmm_tune.c
index a8e83d9be346a31c946bdde934e6da5945aa32c9..65b9c96b0966e1b74eedc25817bbe719ad8e2aa0 100644 (file)
#include <math.h>
#include <time.h>
#include <complex.h>
#include <math.h>
#include <time.h>
#include <complex.h>
+#include "../common/tune_com.h"
#include "cblas.h"
#ifdef __cplusplus
#include "cblas.h"
#ifdef __cplusplus
}
#endif
}
#endif
-#define TUNING_START_SIZE_SQUARE_MATRIX 16
-#define TUNING_START_SIZE_RECTAN_MATRIX 8
-#define NUM_MATRIX_SIZE_TO_BENCHMARK 8 //16
-#define HAS_MEMORY 1
-#define NO_MEMORY 0
-#define OFFLOAD 1
-#define NO_OFFLOAD 0
-
-#define NUM_TEST_RUN 5
-
-/*-----------------------------------------------------------------------------
-* Timing Setup
-*----------------------------------------------------------------------------*/
-struct timespec t0,t1;
-#define tick() clock_gettime(CLOCK_MONOTONIC, &t0);
-#define tock() (clock_gettime(CLOCK_MONOTONIC, &t1), \
- t1.tv_sec - t0.tv_sec + (t1.tv_nsec - t0.tv_nsec) / 1e9)
-
/*-----------------------------------------------------------------------------
* Global Variables
*----------------------------------------------------------------------------*/
double complex alpha = 0.7 - 0.3*I;
/*-----------------------------------------------------------------------------
* Global Variables
*----------------------------------------------------------------------------*/
double complex alpha = 0.7 - 0.3*I;
-double complex beta = 0.4 + 0.6*I;
enum CBLAS_ORDER order = CblasColMajor;
enum CBLAS_TRANSPOSE transA = CblasNoTrans;
enum CBLAS_ORDER order = CblasColMajor;
enum CBLAS_TRANSPOSE transA = CblasNoTrans;
-enum CBLAS_TRANSPOSE transB = CblasNoTrans;
+enum CBLAS_SIDE side = CblasLeft;
+enum CBLAS_UPLO uplo = CblasUpper;
+enum CBLAS_DIAG diag = CblasUnit;
extern int TI_CBLAS_L3_OFFLOAD;
extern int TI_CBLAS_L3_OFFLOAD;
* Prototypes
*----------------------------------------------------------------------------*/
int check_results(const double complex *C1, const double complex *C2, int M, int N);
* Prototypes
*----------------------------------------------------------------------------*/
int check_results(const double complex *C1, const double complex *C2, int M, int N);
-int run_zgemm_dsp_and_arm(int M, int N, int K, float *time_dsp, float *time_arm,
+int run_ztrmm_dsp_and_arm(int M, int N, float *time_dsp, float *time_arm,
float *gflops_dsp, float *gflops_arm);
/*-----------------------------------------------------------------------------
float *gflops_dsp, float *gflops_arm);
/*-----------------------------------------------------------------------------
@@ -81,98 +65,93 @@ int run_zgemm_dsp_and_arm(int M, int N, int K, float *time_dsp, float *time_arm,
*----------------------------------------------------------------------------*/
int main()
{
*----------------------------------------------------------------------------*/
int main()
{
- int num_size, zgemm_err;
- int M, N, K, m, n, k;
+ int num_size, ztrmm_err;
+ int M, N, m, n;
int M_pre, N_pre, K_pre, M_start_size, N_start_size;
int offload_threshold_1, offload_threshold_2;
float total_GFLOPS_DSP, total_GFLOPS_ARM;
float time_DSP, time_ARM, t_dsp, t_arm;
int M_pre, N_pre, K_pre, M_start_size, N_start_size;
int offload_threshold_1, offload_threshold_2;
float total_GFLOPS_DSP, total_GFLOPS_ARM;
float time_DSP, time_ARM, t_dsp, t_arm;
- char ofld_flag[NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK];
- char mem_flag[NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK];
+ char ofld_flag[NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK];
+ char mem_flag[NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK];
int skip_next_point;
int skip_next_point;
+ float diff_tmp, diff_pre;
FILE *fp_flag, *fp_time, *fp_tbl;
FILE *fp_flag, *fp_time, *fp_tbl;
- fp_flag = fopen("ofld_flag_zgemm.dat","w");
- fp_tbl = fopen("ofld_tbl_zgemm.c","w");
- fp_time = fopen("zgemm_time_ARMvsDSP.dat","w");
+ fp_flag = fopen("ofld_flag_ztrmm.dat","w");
+ fp_tbl = fopen("ofld_tbl_ztrmm.c","w");
+ fp_time = fopen("ztrmm_time_ARMvsDSP.dat","w");
- fprintf(fp_tbl, "char ofld_tbl_zgemm[TI_L3_OFFLOAD_TBL_SIZE] = {\n");
+ print_file_header(fp_tbl);
+ fprintf(fp_tbl, "char ofld_tbl_ztrmm[TRMM_OFFLOAD_TBL_SIZE] = {\n");
srand(12345);
srand(12345);
- /* sweep M, K, and N */
+ /* sweep M, and N */
for (M=TUNING_START_SIZE_RECTAN_MATRIX,m=0; m<NUM_MATRIX_SIZE_TO_BENCHMARK; m++,M*=2)
{
for (N=TUNING_START_SIZE_RECTAN_MATRIX,n=0; n<NUM_MATRIX_SIZE_TO_BENCHMARK; n++,N*=2)
for (M=TUNING_START_SIZE_RECTAN_MATRIX,m=0; m<NUM_MATRIX_SIZE_TO_BENCHMARK; m++,M*=2)
{
for (N=TUNING_START_SIZE_RECTAN_MATRIX,n=0; n<NUM_MATRIX_SIZE_TO_BENCHMARK; n++,N*=2)
- {
- for (K=TUNING_START_SIZE_RECTAN_MATRIX,k=0; k<NUM_MATRIX_SIZE_TO_BENCHMARK; k++,K*=2)
- {
- if( (m>0 && ofld_flag[m-1][n][k]==OFFLOAD)
- ||(n>0 && ofld_flag[m][n-1][k]==OFFLOAD)
- ||(k>0 && ofld_flag[m][n][k-1]==OFFLOAD) ) {
- ofld_flag[m][n][k] = OFFLOAD;
- mem_flag[m][n][k] = HAS_MEMORY; // to avoid error
- time_DSP = -1.0;
- time_ARM = -1.0;
- printf("Offloading. Skipping (M,N,K)=(%d,%d,%d), (m,n,k)=(%d,%d,%d).\n", M,N,K,m,n,k);
- }
- else if( (m>0 && (mem_flag[m-1][n][k]==NO_MEMORY))
- ||(n>0 && (mem_flag[m][n-1][k]==NO_MEMORY))
- ||(k>0 && (mem_flag[m][n][k-1]==NO_MEMORY))) {
- ofld_flag[m][n][k] = NO_OFFLOAD;
- mem_flag[m][n][k] = NO_MEMORY;
+ {
+ if( (m>0 && ofld_flag[m-1][n]==OFFLOAD)
+ ||(n>0 && ofld_flag[m][n-1]==OFFLOAD) ) {
+ ofld_flag[m][n] = OFFLOAD;
+ mem_flag[m][n] = HAS_MEMORY; // to avoid error
+ time_DSP = -1.0;
+ time_ARM = -1.0;
+ printf("Offloading. Skipping (M,N)=(%d,%d), (m,n)=(%d,%d).\n", M,N,m,n);
+ }
+ else if( (m>0 && (mem_flag[m-1][n]==NO_MEMORY))
+ ||(n>0 && (mem_flag[m][n-1]==NO_MEMORY)) ) {
+ ofld_flag[m][n] = NO_OFFLOAD;
+ mem_flag[m][n] = NO_MEMORY;
+ time_DSP = -2.0;
+ time_ARM = -2.0;
+ printf("Out of memory. Skipping (M,N)=(%d,%d), (m,n)=(%d,%d).\n", M,N,m,n);
+ }
+ else {
+ printf("Measuring DSP and ARM GFLOPS for (M,N)=(%d,%d), (m,n)=(%d,%d).\n", M,N,m,n);
+ ztrmm_err = run_ztrmm_dsp_and_arm(M, N, &t_dsp, &t_arm, &total_GFLOPS_DSP, &total_GFLOPS_ARM);
+
+ if(ztrmm_err == -1) { /* out of memory for DSP offloading */
+ ofld_flag[m][n] = NO_OFFLOAD;
+ mem_flag[m][n] = NO_MEMORY;
time_DSP = -2.0;
time_ARM = -2.0;
time_DSP = -2.0;
time_ARM = -2.0;
- printf("Out of memory. Skipping (M,N,K)=(%d,%d,%d), (m,n,k)=(%d,%d,%d).\n", M,N,K,m,n,k);
+ printf("Out of memory, skipping next point.\n");
}
}
- else {
- printf("Measuring DSP and ARM GFLOPS for (M,N,K)=(%d,%d,%d), (m,n,k)=(%d,%d,%d).\n", M,N,K,m,n,k);
- zgemm_err = run_zgemm_dsp_and_arm(M, N, K, &t_dsp, &t_arm, &total_GFLOPS_DSP, &total_GFLOPS_ARM);
- //dsym_err = run_dsymm_dsp_and_arm();
-
- if(zgemm_err == -1) { /* out of memory for DSP offloading */
- ofld_flag[m][n][k] = NO_OFFLOAD;
- mem_flag[m][n][k] = NO_MEMORY;
- time_DSP = -2.0;
- time_ARM = -2.0;
- printf("Out of memory, skipping next point.\n");
- }
- else {
- mem_flag[m][n][k] = HAS_MEMORY;
- time_DSP = t_dsp;
- time_ARM = t_arm;
- if (zgemm_err == 0){
- //if(total_GFLOPS_DSP - total_GFLOPS_ARM > 1.0) {
- if(t_dsp < t_arm) {
- ofld_flag[m][n][k] = OFFLOAD;
- printf("Offloading to DSP for this point. Skipping next point.\n");
- }
- else {
- ofld_flag[m][n][k] = NO_OFFLOAD;
- }
+ else {
+ mem_flag[m][n] = HAS_MEMORY;
+ time_DSP = t_dsp;
+ time_ARM = t_arm;
+ if (ztrmm_err == 0){
+ //if(total_GFLOPS_DSP - total_GFLOPS_ARM > 1.0) {
+ if(t_dsp < t_arm) {
+ ofld_flag[m][n] = OFFLOAD;
+ printf("Offloading to DSP for this point. Skipping next point.\n");
}
else {
}
else {
- printf("Error in DGEMM tuning for (M,N,K)=(%d,%d,%d)!\n", M,N,K);
- exit(0);
+ ofld_flag[m][n] = NO_OFFLOAD;
}
}
}
}
+ else {
+ printf("Error in ZTRMM tuning for (M,N)=(%d,%d)!\n", M,N);
+ exit(0);
+ }
}
}
-
- fprintf(fp_flag, "%d\t%d\n", (int)mem_flag[m][n][k], (int)ofld_flag[m][n][k]);
- fprintf(fp_time, "%6d,%6d,%6d\t%10.8e\t%10.8e\n", M, N, K, time_ARM, time_DSP);
+ }
+
+ fprintf(fp_flag, "%d\t%d\n", (int)mem_flag[m][n], (int)ofld_flag[m][n]);
+ fprintf(fp_time, "%6d,%6d\t%10.8e\t%10.8e\n", M, N, time_ARM, time_DSP);
- if( (m==(NUM_MATRIX_SIZE_TO_BENCHMARK-1))
- && (n==(NUM_MATRIX_SIZE_TO_BENCHMARK-1))
- && (k==(NUM_MATRIX_SIZE_TO_BENCHMARK-1))) {
- fprintf(fp_tbl, "%d};", (int)ofld_flag[m][n][k]);
- } else {
- fprintf(fp_tbl, "%d,", (int)ofld_flag[m][n][k]);
- }
+ if( (m==(NUM_MATRIX_SIZE_TO_BENCHMARK-1))
+ && (n==(NUM_MATRIX_SIZE_TO_BENCHMARK-1)) ) {
+ fprintf(fp_tbl, "%d};", (int)ofld_flag[m][n]);
+ } else {
+ fprintf(fp_tbl, "%d,", (int)ofld_flag[m][n]);
}
}
- fprintf(fp_tbl, "\n");
}
}
+ fprintf(fp_tbl, "\n");
}
}
-
+
fclose(fp_flag);
fclose(fp_time);
fclose(fp_tbl);
fclose(fp_flag);
fclose(fp_time);
fclose(fp_tbl);
return 0;
}
return 0;
}
-
-int run_zgemm_dsp_and_arm(int M, int N, int K, float *time_dsp, float *time_arm,
+int run_ztrmm_dsp_and_arm(int M, int N, float *time_dsp, float *time_arm,
float *gflops_dsp, float *gflops_arm)
{
float *gflops_dsp, float *gflops_arm)
{
- int iter;
- long long i;
+ int iter;
+ long long i, size_A, size_B;
float time_secs, total_time_dsp, total_time_arm;
float gflops_ARM, gflops_DSP;
float time_secs, total_time_dsp, total_time_arm;
float gflops_ARM, gflops_DSP;
- float operation_count = 2.0*(float)M*(float)N*(float)K;
+ float operation_count = 2.0*(float)M*(float)N;
float total_GFLOPS_DSP = 0.0f;
float total_GFLOPS_ARM = 0.0f;
int err_code = 0;
total_time_dsp = 0.0;
total_time_arm = 0.0;
float total_GFLOPS_DSP = 0.0f;
float total_GFLOPS_ARM = 0.0f;
int err_code = 0;
total_time_dsp = 0.0;
total_time_arm = 0.0;
+ if(side == CblasLeft) {
+ size_A = (long long)M*(long long)M;
+ }
+ else {
+ size_A = (long long)N*(long long)N;
+ }
+ size_B = (long long)M*(long long)N;
+ if( (size_A*sizeof(double complex)>(long long)0x0ffffffff)
+ ||(size_B*sizeof(double complex)>(long long)0x0ffffffff) ) {
+ return (-1);
+ }
+
for (iter = 0; iter < NUM_TEST_RUN; iter++)
{
/*-------------------------------------------------------------------------
for (iter = 0; iter < NUM_TEST_RUN; iter++)
{
/*-------------------------------------------------------------------------
@@ -202,108 +192,123 @@ int run_zgemm_dsp_and_arm(int M, int N, int K, float *time_dsp, float *time_arm,
* the DSP are allocated using device memory. The Carm array is not passed
* to the dsp and so can use system memory.
*------------------------------------------------------------------------*/
* the DSP are allocated using device memory. The Carm array is not passed
* to the dsp and so can use system memory.
*------------------------------------------------------------------------*/
- double complex *A = (double complex*) __malloc_ddr(M*K*sizeof(double complex));
- double complex *B = (double complex*) __malloc_ddr(K*N*sizeof(double complex));
- double complex *Cdsp = (double complex*) __malloc_ddr(M*N*sizeof(double complex));
- double complex *Carm = (double complex*) malloc (M*N*sizeof(double complex));
+ double complex *A = (double complex *) __malloc_ddr(size_A*sizeof(double complex));
+ double complex *Bdsp = (double complex *) __malloc_ddr(size_B*sizeof(double complex));
+ double complex *Barm = (double complex *) malloc(size_B*sizeof(double complex));
- if (!A || !B || !Cdsp || !Carm)
+ if (!A || !Bdsp || !Barm)
{
printf("Could not allocate enough space for the arrays!");
if(A) __free_ddr(A);
{
printf("Could not allocate enough space for the arrays!");
if(A) __free_ddr(A);
- if(B) __free_ddr(B);
- if(Cdsp) __free_ddr(Cdsp);
- if(Carm) free(Carm);
+ if(Bdsp) __free_ddr(Bdsp);
+ if(Barm) free(Barm);
return (-1);
}
/*-------------------------------------------------------------------------
return (-1);
}
/*-------------------------------------------------------------------------
- * Initialize matrices and print if small enough.
+ * Initialize matrices
*------------------------------------------------------------------------*/
*------------------------------------------------------------------------*/
- for (i = 0; i < M*K; ++i)
- {
+ for (i = 0; i < size_A; ++i)
+ {
A[i] = (double)rand()/RAND_MAX + (double)rand()/RAND_MAX * I;
A[i] = (double)rand()/RAND_MAX + (double)rand()/RAND_MAX * I;
+ }
+
+ for (i = 0; i < size_B; ++i)
+ {
+ Bdsp[i] = Barm[i] = (double)rand()/RAND_MAX + (double)rand()/RAND_MAX * I;
}
}
- for (i = 0; i < K*N; ++i)
- {
- B[i] = (double)rand()/RAND_MAX + (double)rand()/RAND_MAX * I;
- }
- for (i = 0; i < M*N; ++i)
- {
- Carm[i] = Cdsp[i] = (double)rand()/RAND_MAX + (double)rand()/RAND_MAX * I;
- }
-
- int lda = ((order == CblasColMajor && transA == CblasNoTrans) ||
- (order == CblasRowMajor && transA == CblasTrans)) ? M : K;
-
- int ldb = ((order == CblasColMajor && transB == CblasNoTrans) ||
- (order == CblasRowMajor && transB == CblasTrans)) ? K : N;
-
- int ldc = (order == CblasColMajor) ? M : N;
-
+/*
+ if(M==8 && N==8) {
+ FILE *file_a = fopen("mat_a.dat","w");
+ FILE *file_b = fopen("mat_b.dat","w");
+ FILE *file_c = fopen("mat_c.dat","w");
+
+ for(i=0; i < size_A; ++i) fprintf(file_a, "%1.10e + i*%1.10e\n",freal(A[i]), fimag(A[i]));
+ for(i=0; i < size_B; ++i) fprintf(file_b, "%1.10e + i*%1.10e\n",freal(Barm[i], fimag(Barm[i]));
+ for(i=0; i < size_B; ++i) fprintf(file_c, "%1.10e + i*%1.10e\n",freal(Bdsp[i], fimag(Bdsp[i]));
+
+ fclose(file_a);
+ fclose(file_b);
+ fclose(file_c);
+ }*/
+
+ int lda = (side == CblasLeft) ? M : N;
+ int ldb = M;
+
/*============ BLAS tuning: running on DSP and then on ARM =============*/
/*------------------------------------------------------------------------
/*============ BLAS tuning: running on DSP and then on ARM =============*/
/*------------------------------------------------------------------------
- * Time DSP zgemm
+ * Time DSP ztrmm
*-----------------------------------------------------------------------*/
*-----------------------------------------------------------------------*/
- //ti_cblas_offload_config("001"); /* force offloading level 3 to DSP */
TI_CBLAS_L3_OFFLOAD = 1;
tick();
TI_CBLAS_L3_OFFLOAD = 1;
tick();
- cblas_zgemm(order,transA,transB,M,N,K,&alpha,A,lda,B,ldb,&beta,Cdsp,ldc);
+ cblas_ztrmm(order,side,uplo,transA,diag,M,N,&alpha,A,lda,Bdsp,ldb);
time_secs = tock();
total_time_dsp += time_secs;
gflops_DSP = operation_count/time_secs*1e-9;
total_GFLOPS_DSP += gflops_DSP;
time_secs = tock();
total_time_dsp += time_secs;
gflops_DSP = operation_count/time_secs*1e-9;
total_GFLOPS_DSP += gflops_DSP;
-
+
/*-------------------------------------------------------------------------
/*-------------------------------------------------------------------------
- * Time ARM zgemm
+ * Time ARM ztrmm
*------------------------------------------------------------------------*/
*------------------------------------------------------------------------*/
- //ti_cblas_offload_config("000"); /* force no offloading */
TI_CBLAS_L3_OFFLOAD = 0;
tick();
TI_CBLAS_L3_OFFLOAD = 0;
tick();
- cblas_zgemm(order,transA,transB,M,N,K,&alpha,A,lda,B,ldb,&beta,Carm,ldc);
+ cblas_ztrmm(order,side,uplo,transA,diag,M,N,&alpha,A,lda,Barm,ldb);
time_secs = tock();
total_time_arm += time_secs;
gflops_ARM = operation_count/time_secs*1e-9;
total_GFLOPS_ARM += gflops_ARM;
//printf(" %6.3f %6.3f %9.6fs %9.6fs\n", gflops_DSP, gflops_ARM, time_dsp, time_arm);
time_secs = tock();
total_time_arm += time_secs;
gflops_ARM = operation_count/time_secs*1e-9;
total_GFLOPS_ARM += gflops_ARM;
//printf(" %6.3f %6.3f %9.6fs %9.6fs\n", gflops_DSP, gflops_ARM, time_dsp, time_arm);
-
+
+/* if(M==8 && N==8) {
+ FILE *file_a = fopen("mat_a2.dat","w");
+ FILE *file_b = fopen("mat_b2.dat","w");
+ FILE *file_c = fopen("mat_c2.dat","w");
+
+ for(i=0; i < size_A; ++i) fprintf(file_a, "%1.10e\n",A[i]);
+ for(i=0; i < M*N; ++i) fprintf(file_b, "%1.10e\n",Barm[i]);
+ for(i=0; i < M*N; ++i) fprintf(file_c, "%1.10e\n",Bdsp[i]);
+
+ fclose(file_a);
+ fclose(file_b);
+ fclose(file_c);
+ } */
+
/*-------------------------------------------------------------------------
* Verify Results
*------------------------------------------------------------------------*/
/*-------------------------------------------------------------------------
* Verify Results
*------------------------------------------------------------------------*/
- //return check_results(Cdsp, Carm, M, N);
- err_code += check_results(Cdsp, Carm, M, N);
+ err_code += check_results(Bdsp, Barm, M, N);
__free_ddr(A);
__free_ddr(A);
- __free_ddr(B);
- __free_ddr(Cdsp);
- free(Carm);
+ __free_ddr(Bdsp);
+ free(Barm);
}
*gflops_dsp = total_GFLOPS_DSP;
*gflops_arm = total_GFLOPS_ARM;
*time_dsp = total_time_dsp / (float)NUM_TEST_RUN;
*time_arm = total_time_arm / (float)NUM_TEST_RUN;
}
*gflops_dsp = total_GFLOPS_DSP;
*gflops_arm = total_GFLOPS_ARM;
*time_dsp = total_time_dsp / (float)NUM_TEST_RUN;
*time_arm = total_time_arm / (float)NUM_TEST_RUN;
-
+
return err_code;
}
return err_code;
}
+
/*-----------------------------------------------------------------------------
* check_results
*----------------------------------------------------------------------------*/
int check_results(const double complex *C1, const double complex *C2, int M, int N)
{
int i;
/*-----------------------------------------------------------------------------
* check_results
*----------------------------------------------------------------------------*/
int check_results(const double complex *C1, const double complex *C2, int M, int N)
{
int i;
- const double EPISILON = 1e-10;
+ const double EPISILON = 1e-5;
//const double EPISILON = 1e-200;
const int NERRORS = 5;
int num_errors = 0;
for (i=0; i<M*N; i++)
{
//const double EPISILON = 1e-200;
const int NERRORS = 5;
int num_errors = 0;
for (i=0; i<M*N; i++)
{
- double delta = fabs(cabs(C1[i]) - cabs(C2[i]));
+ double delta = cabs(C1[i]) - cabs(C2[i]);
if (delta > EPISILON*cabs(C1[i]))
if ((num_errors += 1) < NERRORS)
if (delta > EPISILON*cabs(C1[i]))
if ((num_errors += 1) < NERRORS)
@@ -313,7 +318,7 @@ int check_results(const double complex *C1, const double complex *C2, int M, int
if (num_errors > 0)
{
printf("FAIL with %d errors!\n", num_errors);
if (num_errors > 0)
{
printf("FAIL with %d errors!\n", num_errors);
- return -1;
+ return num_errors;
}
else
{
}
else
{
diff --git a/tuning/ztrsm_tune/Makefile b/tuning/ztrsm_tune/Makefile
--- /dev/null
@@ -0,0 +1,10 @@
+
+# Makefile for the ZTRSM tuning program.
+EXE = ztrsm_tune
+
+# NOTE(review): CC, CFLAGS, BLASLIB, TUNE_UTILS and TUNE_UTILS_OBJ are not
+# defined in this file; presumably they are provided by ../make.inc - confirm.
+include ../make.inc
+
+# Link ztrsm_tune.o together with $(TUNE_UTILS_OBJ) and $(BLASLIB).
+$(EXE): ztrsm_tune.o $(TUNE_UTILS)
+ $(CC) $(CFLAGS) ztrsm_tune.o $(TUNE_UTILS_OBJ) $(BLASLIB) -o $@
+
+# 'make tune' builds the executable if necessary and then runs it.
+tune: $(EXE)
+ ./$(EXE);
\ No newline at end of file
diff --git a/tuning/ztrsm_tune/ztrsm_tune.c b/tuning/ztrsm_tune/ztrsm_tune.c
--- /dev/null
@@ -0,0 +1,342 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <time.h>
+#include <complex.h>
+#include "../common/tune_com.h"
+
+#include "cblas.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+#include "cblas.h"
+#ifdef __cplusplus
+}
+#endif
+
+/*-----------------------------------------------------------------------------
+* Global Variables
+*
+* Fixed ZTRSM configuration shared by every measurement in this file; all of
+* these are passed unchanged to each cblas_ztrsm call.
+*----------------------------------------------------------------------------*/
+/* Scalar multiplier, passed by address to cblas_ztrsm. */
+double complex alpha = 0.7 - 0.3*I;
+/* Storage order of A and B. */
+enum CBLAS_ORDER order = CblasColMajor;
+/* A is used as stored (no transpose). */
+enum CBLAS_TRANSPOSE transA = CblasNoTrans;
+/* Side on which A is applied; also selects whether A is M x M (left) or N x N. */
+enum CBLAS_SIDE side = CblasLeft;
+/* Which triangle of A is passed as the triangular factor. */
+enum CBLAS_UPLO uplo = CblasUpper;
+/* Diagonal mode passed to cblas_ztrsm. */
+enum CBLAS_DIAG diag = CblasUnit;
+
+/* Defined outside this file. Set to 1 before the call timed as "DSP" and to 0
+ * before the call timed as "ARM". */
+extern int TI_CBLAS_L3_OFFLOAD;
+
+/*-----------------------------------------------------------------------------
+* Prototypes
+*----------------------------------------------------------------------------*/
+/* Compares two M*N element arrays; returns the number of mismatches found. */
+int check_results(const double complex *C1, const double complex *C2, int M, int N);
+/* Times cblas_ztrsm with offload enabled and disabled for one (M,N) point. */
+int run_ztrsm_dsp_and_arm(int M, int N, float *time_dsp, float *time_arm,
+ float *gflops_dsp, float *gflops_arm);
+
+/*-----------------------------------------------------------------------------
+* MAIN
+*
+* Sweeps M and N over NUM_MATRIX_SIZE_TO_BENCHMARK sizes each, starting at
+* TUNING_START_SIZE_RECTAN_MATRIX and doubling on every step. For each (M,N)
+* point it decides whether ZTRSM should be offloaded and writes:
+*   ofld_flag_ztrsm.dat     - memory flag and offload flag of the point
+*   ztrsm_time_ARMvsDSP.dat - M, N, ARM time and DSP time of the point
+*   ofld_tbl_ztrsm.c        - C source of the resulting offload table
+*
+* A point is only measured when neither its (m-1,n) nor its (m,n-1) neighbour
+* already offloads or ran out of memory. Otherwise it inherits that state and
+* its times are recorded as -1.0 (offload inherited) or -2.0 (out of memory).
+*
+* Returns 0 on success and EXIT_FAILURE when an output file cannot be opened
+* or when the offloaded and non-offloaded results disagree.
+*----------------------------------------------------------------------------*/
+int main()
+{
+    int ztrsm_err;
+    int M, N, m, n;
+    float total_GFLOPS_DSP, total_GFLOPS_ARM;
+    float time_DSP, time_ARM, t_dsp, t_arm;
+    char ofld_flag[NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK];
+    char mem_flag[NUM_MATRIX_SIZE_TO_BENCHMARK][NUM_MATRIX_SIZE_TO_BENCHMARK];
+    FILE *fp_flag, *fp_time, *fp_tbl;
+
+    fp_flag = fopen("ofld_flag_ztrsm.dat","w");
+    fp_tbl  = fopen("ofld_tbl_ztrsm.c","w");
+    fp_time = fopen("ztrsm_time_ARMvsDSP.dat","w");
+
+    /* Every later fprintf dereferences these streams, so stop early if any
+     * of them could not be created. */
+    if (!fp_flag || !fp_tbl || !fp_time) {
+        printf("Could not open output files for ZTRSM tuning!\n");
+        if (fp_flag) fclose(fp_flag);
+        if (fp_tbl)  fclose(fp_tbl);
+        if (fp_time) fclose(fp_time);
+        return EXIT_FAILURE;
+    }
+
+    print_file_header(fp_tbl);
+    fprintf(fp_tbl, "char ofld_tbl_ztrsm[TRMM_OFFLOAD_TBL_SIZE] = {\n");
+
+    /* Fixed seed so the random matrices are the same on every run. */
+    srand(12345);
+
+    /* sweep M, and N */
+    for (M=TUNING_START_SIZE_RECTAN_MATRIX,m=0; m<NUM_MATRIX_SIZE_TO_BENCHMARK; m++,M*=2)
+    {
+        for (N=TUNING_START_SIZE_RECTAN_MATRIX,n=0; n<NUM_MATRIX_SIZE_TO_BENCHMARK; n++,N*=2)
+        {
+            if( (m>0 && ofld_flag[m-1][n]==OFFLOAD)
+              ||(n>0 && ofld_flag[m][n-1]==OFFLOAD) ) {
+                /* A smaller neighbour already offloads: inherit without measuring. */
+                ofld_flag[m][n] = OFFLOAD;
+                mem_flag[m][n]  = HAS_MEMORY;  // to avoid error
+                time_DSP = -1.0;
+                time_ARM = -1.0;
+                printf("Offloading. Skipping (M,N)=(%d,%d), (m,n)=(%d,%d).\n", M,N,m,n);
+            }
+            else if( (m>0 && (mem_flag[m-1][n]==NO_MEMORY))
+                   ||(n>0 && (mem_flag[m][n-1]==NO_MEMORY)) ) {
+                /* A smaller neighbour already ran out of memory. */
+                ofld_flag[m][n] = NO_OFFLOAD;
+                mem_flag[m][n]  = NO_MEMORY;
+                time_DSP = -2.0;
+                time_ARM = -2.0;
+                printf("Out of memory. Skipping (M,N)=(%d,%d), (m,n)=(%d,%d).\n", M,N,m,n);
+            }
+            else {
+                printf("Measuring DSP and ARM GFLOPS for (M,N)=(%d,%d), (m,n)=(%d,%d).\n", M,N,m,n);
+                ztrsm_err = run_ztrsm_dsp_and_arm(M, N, &t_dsp, &t_arm, &total_GFLOPS_DSP, &total_GFLOPS_ARM);
+
+                if(ztrsm_err == -1) {  /* out of memory for DSP offloading */
+                    ofld_flag[m][n] = NO_OFFLOAD;
+                    mem_flag[m][n]  = NO_MEMORY;
+                    time_DSP = -2.0;
+                    time_ARM = -2.0;
+                    printf("Out of memory, skipping next point.\n");
+                }
+                else if(ztrsm_err != 0) {
+                    /* Result mismatch: the table would be meaningless, so stop
+                     * and report failure to the caller (previously exit(0)). */
+                    printf("Error in ZTRSM tuning for (M,N)=(%d,%d)!\n", M,N);
+                    fclose(fp_flag);
+                    fclose(fp_time);
+                    fclose(fp_tbl);
+                    return EXIT_FAILURE;
+                }
+                else {
+                    mem_flag[m][n] = HAS_MEMORY;
+                    time_DSP = t_dsp;
+                    time_ARM = t_arm;
+                    /* The decision uses the averaged times, not the GFLOPS figures. */
+                    if(t_dsp < t_arm) {
+                        ofld_flag[m][n] = OFFLOAD;
+                        printf("Offloading to DSP for this point. Skipping next point.\n");
+                    }
+                    else {
+                        ofld_flag[m][n] = NO_OFFLOAD;
+                    }
+                }
+            }
+
+            fprintf(fp_flag, "%d\t%d\n", (int)mem_flag[m][n], (int)ofld_flag[m][n]);
+            fprintf(fp_time, "%6d,%6d\t%10.8e\t%10.8e\n", M, N, time_ARM, time_DSP);
+
+            /* The very last table entry closes the array initializer. */
+            if( (m==(NUM_MATRIX_SIZE_TO_BENCHMARK-1))
+              && (n==(NUM_MATRIX_SIZE_TO_BENCHMARK-1)) ) {
+                fprintf(fp_tbl, "%d};", (int)ofld_flag[m][n]);
+            } else {
+                fprintf(fp_tbl, "%d,", (int)ofld_flag[m][n]);
+            }
+        }
+        fprintf(fp_tbl, "\n");
+    }
+
+    fclose(fp_flag);
+    fclose(fp_time);
+    fclose(fp_tbl);
+
+    return 0;
+}
+
+/*-----------------------------------------------------------------------------
+* run_ztrsm_dsp_and_arm
+*
+* Runs cblas_ztrsm NUM_TEST_RUN times for one (M,N) point. In each run the
+* same A is applied to two identical copies of B, once with
+* TI_CBLAS_L3_OFFLOAD = 1 (timed as "DSP") and once with it set to 0 (timed
+* as "ARM"), and the two results are compared with check_results.
+*
+* Outputs:
+*   time_dsp, time_arm     - time per call averaged over NUM_TEST_RUN runs
+*   gflops_dsp, gflops_arm - per-run GFLOPS summed over all runs (not averaged)
+*
+* Returns -1 when a matrix would exceed 0x0ffffffff bytes or an allocation
+* fails (outputs are left unwritten in that case); otherwise the sum of the
+* check_results return values of all runs, i.e. 0 when every run matched.
+*----------------------------------------------------------------------------*/
+int run_ztrsm_dsp_and_arm(int M, int N, float *time_dsp, float *time_arm,
+ float *gflops_dsp, float *gflops_arm)
+{
+ int iter,j,k;
+ long long i, size_A, size_B;
+ float time_secs, total_time_dsp, total_time_arm;
+ float gflops_ARM, gflops_DSP;
+ /* NOTE(review): this count does not depend on the triangular dimension of A;
+  * confirm it is the intended flop model. It only feeds the GFLOPS outputs. */
+ float operation_count = 2.0*(float)M*(float)N;
+ float total_GFLOPS_DSP = 0.0f;
+ float total_GFLOPS_ARM = 0.0f;
+ int err_code = 0;
+
+ total_time_dsp = 0.0;
+ total_time_arm = 0.0;
+ /* A is square; its dimension follows the side it is applied on. */
+ if(side == CblasLeft) {
+ size_A = (long long)M*(long long)M;
+ }
+ else {
+ size_A = (long long)N*(long long)N;
+ }
+ size_B = (long long)M*(long long)N;
+
+ /* Refuse sizes whose byte count exceeds 0x0ffffffff; reported as -1, the
+  * same code used for an allocation failure below. */
+ if( (size_A*sizeof(double complex)>(long long)0x0ffffffff)
+ ||(size_B*sizeof(double complex)>(long long)0x0ffffffff) ) {
+ return (-1);
+ }
+
+ for (iter = 0; iter < NUM_TEST_RUN; iter++)
+ {
+ /*-------------------------------------------------------------------------
+ * Allocate space for the matrices. The matrices that will be passed to
+ * the DSP are allocated using device memory. The Barm array is not passed
+ * to the dsp and so can use system memory.
+ *------------------------------------------------------------------------*/
+ double complex *A = (double complex *) __malloc_ddr(size_A*sizeof(double complex));
+ double complex *Bdsp = (double complex *) __malloc_ddr(size_B*sizeof(double complex));
+ double complex *Barm = (double complex *) malloc(size_B*sizeof(double complex));
+
+ if (!A || !Bdsp || !Barm)
+ {
+ printf("Could not allocate enough space for the arrays!");
+ if(A) __free_ddr(A);
+ if(Bdsp) __free_ddr(Bdsp);
+ if(Barm) free(Barm);
+
+ return (-1);
+ }
+
+ /*-------------------------------------------------------------------------
+ * Initialize matrices
+ *------------------------------------------------------------------------*/
+ int lda = (side == CblasLeft) ? M : N;
+ int ldb = M;
+ /* Element [j*lda+k]: diagonal is 1+j, entries with j<k are zero and entries
+  * with j>k are random. Presumably j is the column and k the row under the
+  * column-major `order`, making A upper triangular -- confirm against uplo. */
+ for(j=0;j<lda;j++)
+ {
+ for(k=0;k<lda;k++)
+ {
+ if (j==k)
+ A[j*lda+k] = 1.0+j + 0.0*I;
+ else if (j<k)
+ A[j*lda+k] = 0.0 + 0.0*I;
+ else
+ A[j*lda+k] = (double)rand()/RAND_MAX + (double)rand()/RAND_MAX * I;
+ }
+ }
+ // for (i = 0; i < size_A; ++i)
+ // {
+ // A[i] = (double)rand()/RAND_MAX + (double)rand()/RAND_MAX * I;
+ // }
+
+ /* Both copies of B start out identical so the two results are comparable. */
+ for (i = 0; i < size_B; ++i)
+ {
+ Bdsp[i] = Barm[i] = (double)rand()/RAND_MAX + (double)rand()/RAND_MAX * I;
+ }
+/*
+ if(M==8 && N==8) {
+ FILE *file_a = fopen("mat_a.dat","w");
+ FILE *file_b = fopen("mat_b.dat","w");
+ FILE *file_c = fopen("mat_c.dat","w");
+
+ for(i=0; i < size_A; ++i) fprintf(file_a, "%1.10e + i*%1.10e\n",freal(A[i]), fimag(A[i]));
+ for(i=0; i < size_B; ++i) fprintf(file_b, "%1.10e + i*%1.10e\n",freal(Barm[i], fimag(Barm[i]));
+ for(i=0; i < size_B; ++i) fprintf(file_c, "%1.10e + i*%1.10e\n",freal(Bdsp[i], fimag(Bdsp[i]));
+
+ fclose(file_a);
+ fclose(file_b);
+ fclose(file_c);
+ }*/
+
+ /*============ BLAS tuning: running on DSP and then on ARM =============*/
+ /*------------------------------------------------------------------------
+ * Time DSP ztrsm
+ *-----------------------------------------------------------------------*/
+ TI_CBLAS_L3_OFFLOAD = 1;
+
+ tick();
+ cblas_ztrsm(order,side,uplo,transA,diag,M,N,&alpha,A,lda,Bdsp,ldb);
+ time_secs = tock();
+ total_time_dsp += time_secs;
+ gflops_DSP = operation_count/time_secs*1e-9;
+ total_GFLOPS_DSP += gflops_DSP;
+
+ /*-------------------------------------------------------------------------
+ * Time ARM ztrsm
+ *------------------------------------------------------------------------*/
+ TI_CBLAS_L3_OFFLOAD = 0;
+
+ tick();
+ cblas_ztrsm(order,side,uplo,transA,diag,M,N,&alpha,A,lda,Barm,ldb);
+ time_secs = tock();
+ total_time_arm += time_secs;
+ gflops_ARM = operation_count/time_secs*1e-9;
+ total_GFLOPS_ARM += gflops_ARM;
+ //printf(" %6.3f %6.3f %9.6fs %9.6fs\n", gflops_DSP, gflops_ARM, time_dsp, time_arm);
+
+/* if(M==8 && N==8) {
+ FILE *file_a = fopen("mat_a2.dat","w");
+ FILE *file_b = fopen("mat_b2.dat","w");
+ FILE *file_c = fopen("mat_c2.dat","w");
+
+ for(i=0; i < size_A; ++i) fprintf(file_a, "%1.10e\n",A[i]);
+ for(i=0; i < M*N; ++i) fprintf(file_b, "%1.10e\n",Barm[i]);
+ for(i=0; i < M*N; ++i) fprintf(file_c, "%1.10e\n",Bdsp[i]);
+
+ fclose(file_a);
+ fclose(file_b);
+ fclose(file_c);
+ } */
+
+ /*-------------------------------------------------------------------------
+ * Verify Results
+ *------------------------------------------------------------------------*/
+ /* Mismatch counts accumulate across runs; any non-zero total is an error. */
+ err_code += check_results(Bdsp, Barm, M, N);
+
+ __free_ddr(A);
+ __free_ddr(Bdsp);
+ free(Barm);
+ }
+
+ /* GFLOPS are reported as sums over the runs; only the times are averaged. */
+ *gflops_dsp = total_GFLOPS_DSP;
+ *gflops_arm = total_GFLOPS_ARM;
+ *time_dsp = total_time_dsp / (float)NUM_TEST_RUN;
+ *time_arm = total_time_arm / (float)NUM_TEST_RUN;
+
+ return err_code;
+}
+
+
+/*-----------------------------------------------------------------------------
+* check_results
+*
+* Compares the first M*N elements of C1 and C2 by magnitude. An element is a
+* mismatch when | |C1[i]| - |C2[i]| | exceeds EPISILON * |C1[i]|. The first
+* NERRORS-1 mismatches are printed individually.
+*
+* Returns the number of mismatching elements, 0 when the arrays agree.
+*----------------------------------------------------------------------------*/
+int check_results(const double complex *C1, const double complex *C2, int M, int N)
+{
+    int i;
+    const double EPISILON = 1e-5;   /* relative tolerance */
+    const int    NERRORS  = 5;      /* cap on individually printed mismatches */
+    int num_errors = 0;
+
+    for (i=0; i<M*N; i++)
+    {
+        /* Use the absolute difference: a signed difference is negative when
+         * C2 is the larger value, so such mismatches would never be counted. */
+        double delta = fabs(cabs(C1[i]) - cabs(C2[i]));
+
+        if (delta > EPISILON*cabs(C1[i]))
+        {
+            num_errors += 1;
+            if (num_errors < NERRORS)
+                printf("Error [elem:%d]: %f <==> %f\n", i, cabs(C1[i]), cabs(C2[i]));
+        }
+    }
+
+    if (num_errors > 0)
+        printf("FAIL with %d errors!\n", num_errors);
+
+    return num_errors;
+}
+
+