summary | shortlog | log | commit | commitdiff | tree
raw | patch | inline | side by side (parent: 4823f27)
raw | patch | inline | side by side (parent: 4823f27)
author | Jianzhong Xu <xuj@ti.com> | |
Fri, 4 Mar 2016 20:26:22 +0000 (15:26 -0500) | ||
committer | Jianzhong Xu <xuj@ti.com> | |
Fri, 4 Mar 2016 20:26:22 +0000 (15:26 -0500) |
394 files changed:
diff --git a/Makefile b/Makefile
index d7de1721ad201466f0f87f353b355ddc5c2efa6e..ca06b9e084f594a56ba67a83787f30b0579a427f 100644 (file)
--- a/Makefile
+++ b/Makefile
LINALG_BLIS_DIR = blis
LINALG_CBLAS_DIR = cblas
+LINALG_TICBLAS_DIR = ticblas
LINALG_BLASACC_DIR = blasblisacc
LINALG_CLAPACK_DIR = clapack
BLIS_VERSION = $(shell cat $(LINALG_BLIS_DIR)/version)
-LINALG_HEADERS =$(LINALG_CLAPACK_DIR)/INCLUDE/blaswrap.h
-LINALG_HEADERS+=$(LINALG_CLAPACK_DIR)/INCLUDE/clapack.h
-LINALG_HEADERS+=$(LINALG_CLAPACK_DIR)/INCLUDE/f2c.h
-LINALG_HEADERS+=$(LINALG_CBLAS_DIR)/include/cblas.h
+CBLAS_HEADERS =$(LINALG_CBLAS_DIR)/include/cblas.h
+CBLAS_HEADERS +=$(LINALG_TICBLAS_DIR)/ticblas.h
+CLAPACK_HEADERS =$(LINALG_CLAPACK_DIR)/INCLUDE/blaswrap.h
+CLAPACK_HEADERS+=$(LINALG_CLAPACK_DIR)/INCLUDE/clapack.h
+CLAPACK_HEADERS+=$(LINALG_CLAPACK_DIR)/INCLUDE/f2c.h
-build: ARMonly
-prebuild: DSPonly
-linalg: ARMplusDSP
+ifeq ($(MEM_MODEL),Large)
+BLIS_CFG = c66xLarge
+else ifeq ($(MEM_MODEL),Medium)
+BLIS_CFG = c66xMedium
+else ifeq ($(MEM_MODEL),Small)
+BLIS_CFG = c66xSmall
+endif
-ARMonly:
+DSPlibs:
+ cd $(LINALG_CBLAS_DIR); make arch=C66 alllib; \
+ cd ../$(LINALG_BLIS_DIR); ./configure -p install/$(BLIS_CFG) c66x; make -j8 MEM_MODEL=$(MEM_MODEL) TARGET=$(TARGET) LIBOS=$(LIBOS); make install; \
+ cd ../$(LINALG_TICBLAS_DIR)/src; make MEM_MODEL=$(MEM_MODEL) TARGET=$(TARGET) LIBOS=$(LIBOS); cd ../lib; \
+ echo "combining BLIS, CBLAS, and TICBLAS libraries into one: libcblas.ae66"; \
+ mkdir -p objs; cd objs; rm -f *; ar x ../../../blis/install/$(BLIS_CFG)/lib/libblis.ae66; mmv 'cblas*.o' 'blis_cblas#1.o'; \
+ ar -x ../../../cblas/lib/C66/libcblas.ae66; ar -x ../libticblas.ae66; chmod +rw *;cd ../../..; \
+ mkdir -p lib; cd lib; rm -f *; ar -cr libcblas.ae66 ../ticblas/lib/objs/*; cd ..
+
+ARMlibs:
cd $(LINALG_CBLAS_DIR); make arch=ARM alllib; \
cd ../$(LINALG_BLIS_DIR); ./configure -p install/arm cortex-a15; make -j8; make install; \
- cd ../$(LINALG_BLASACC_DIR)/src; make -f Makefile.ARM; \
- cd ../../$(LINALG_CLAPACK_DIR); make f2clib; make cblaswrap; cd SRC; make
+ cd ../$(LINALG_CLAPACK_DIR); make f2clib; make cblaswrap; cd SRC; make -j8; cd ..
-ARMplusDSP:
- cd $(LINALG_CBLAS_DIR); make arch=ARM alllib; make arch=C66 alllib; \
- cd ../$(LINALG_BLIS_DIR); ./configure -p install/c66x c66x; make -j8; make install; \
- ./configure -p install/arm cortex-a15; make -j8; make install; \
- cd ../$(LINALG_BLASACC_DIR); make cross; \
- cd ../$(LINALG_CLAPACK_DIR); make f2clib; make cblaswrap; cd SRC; make
+ARMplusDSP: DSPlibs ARMlibs
+ cd $(LINALG_BLASACC_DIR)/src; make MEM_MODEL=$(MEM_MODEL) TARGET=$(TARGET); cd ../..; \
+ cd lib; rm -f *; \
+ cp ../$(LINALG_BLASACC_DIR)/lib/libcblas_armplusdsp.a .; \
+ cp ../$(LINALG_BLIS_DIR)/install/arm/lib/libblis.a .; \
+ ar -x libblis.a; mmv "cblas_*.o" "blis_cblas_#1.o"; ar -x libcblas_armplusdsp.a; rm *.a; \
+ ar -cr libcblas_armplusdsp.a *.o; rm *.o; cd ..; \
+ cp $(LINALG_CLAPACK_DIR)/lapack_ARM.a ./lib/liblapack.a; \
+ cp $(LINALG_CLAPACK_DIR)/libcblaswr_ARM.a ./lib/libcblaswr.a; \
+ cp $(LINALG_CLAPACK_DIR)/F2CLIBS/libf2c_ARM.a ./lib/libf2c.a
-BLIStest:
- cd $(LINALG_BLIS_DIR)/testsuite; make lib=OpenCLCBLAS -j8
+cleanDSPlibs:
+ cd $(LINALG_CBLAS_DIR); make arch=C66 clean; \
+ cd ../$(LINALG_BLIS_DIR); ./configure -p install/$(BLIS_CFG) c66x; make -j8 clean; rm install/$(BLIS_CFG)/lib/*; rm install/$(BLIS_CFG)/include/blis/*; \
+ cd ../$(LINALG_TICBLAS_DIR)/src; make clean; \
+ cd ../..; rm lib/*
-cleanARMplusDSP:
- cd $(LINALG_CBLAS_DIR); make arch=ARM clean; make arch=C66 clean; \
- cd ../$(LINALG_BLIS_DIR); ./configure -p install/c66x c66x; make clean; \
- ./configure -p install/arm cortex-a15; make clean; \
+cleanARMlibs:
+ cd $(LINALG_CBLAS_DIR); make arch=ARM clean; \
+ cd ../$(LINALG_BLIS_DIR); ./configure -p install/arm cortex-a15; make clean; \
cd ../$(LINALG_BLASACC_DIR); make clean; \
cd ../$(LINALG_BLIS_DIR)/testsuite; make clean; \
cd ../../$(LINALG_CLAPACK_DIR); make clean
-clean:
- cd $(LINALG_CBLAS_DIR)/src; make arch=ARM clean; \
- cd ../../$(LINALG_BLIS_DIR); ./configure -p install/arm cortex-a15; make clean; \
- cd ../$(LINALG_BLASACC_DIR)/src; make -f Makefile.ARM cleanARM; \
- cd ../../$(LINALG_CLAPACK_DIR); make clean
+BLIStest:
+ cd $(LINALG_BLIS_DIR); ./configure -p install/arm cortex-a15; \
+ cd testsuite; make lib=OpenCLCBLAS -j8
+
+BLIStestDSP:
+ cd $(LINALG_BLIS_DIR); ./configure -p install/$(BLIS_CFG) c66x; \
+ cd testsuite/dsponly; make MEM_MODEL=Small TARGET=SOC_C6678 LIBOS=LIB_RTOS
+
+BLAStest:
+ cd $(LINALG_CLAPACK_DIR)/BLAS/TESTING; make -f Makeblat1; make -f Makeblat2; make -f Makeblat3
+
+CLAPACKtest:
+ cd $(LINALG_CLAPACK_DIR)/TESTING/MATGEN; make
+ cd $(LINALG_CLAPACK_DIR)/TESTING/LIN; make
+ cd $(LINALG_CLAPACK_DIR)/TESTING/EIG; make
+
+
+cleanARMplusDSP: cleanDSPlibs cleanARMlibs
+
+docs: ./docs/doxygen/doxycfg.txt ./docs/doxygen/mainpage.dox
+ doxygen ./docs/doxygen/doxycfg.txt
+
+installDSPlib:
+ install -m 755 -d ${DESTDIR}/include
+ install -m 755 -d ${DESTDIR}/lib
+ cp $(CBLAS_HEADERS) ${DESTDIR}/include
+ cp ./lib/libcblas.ae66 ${DESTDIR}/lib
+ cp -r docs ${DESTDIR}
+
+installARMplusDSPlib:
+ install -m 755 -d ${DESTDIR}/include
+ install -m 755 -d ${DESTDIR}/lib
+ cp $(CBLAS_HEADERS) ${DESTDIR}/include
+ cp $(CLAPACK_HEADERS) ${DESTDIR}/include
+ cp ./lib/libcblas_armplusdsp.a ${DESTDIR}/lib
+ cp ./lib/liblapack.a ${DESTDIR}/lib
+ cp ./lib/libcblaswr.a ${DESTDIR}/lib
+ cp ./lib/libf2c.a ${DESTDIR}/lib
+ cp -r docs ${DESTDIR}
-DSPonly:
- cd $(LINALG_CBLAS_DIR); make arch=C66 alllib; \
- cd ../$(LINALG_BLIS_DIR); ./configure -p install/c66x c66x; make -j8; make install; \
- cd ../$(LINALG_BLASACC_DIR)/src; make ti_cblas_kernel.dsp_h
-
-install:
- install -m 755 -d ${DESTDIR}/usr/include
- install -m 755 -d ${DESTDIR}/usr/lib
- install -m 755 -d ${DESTDIR}/usr/share/doc/ti-linalg
- install -m 755 -d ${DESTDIR}/usr/share/ti/examples/linalg
- cp $(LINALG_HEADERS) ${DESTDIR}/usr/include
- cp $(LINALG_BLASACC_DIR)/lib/libcblas_armplusdsp.a ${DESTDIR}/usr/lib
- cp $(LINALG_BLIS_DIR)/install/arm/lib/libblis-$(BLIS_VERSION)-cortex-a15.a ${DESTDIR}/usr/lib/libblis.a
- cp -r ./examples/* ${DESTDIR}/usr/share/ti/examples/linalg
- cp $(LINALG_CLAPACK_DIR)/lapack_ARM.a ${DESTDIR}/usr/lib/liblapack.a
- cp $(LINALG_CLAPACK_DIR)/libcblaswr_ARM.a ${DESTDIR}/usr/lib/libcblaswr.a
- cp $(LINALG_CLAPACK_DIR)/F2CLIBS/libf2c_ARM.a ${DESTDIR}/usr/lib/libf2c.a
- cp docs/* ${DESTDIR}/usr/share/doc/ti-linalg
diff --git a/blasblisacc/Makefile b/blasblisacc/Makefile
index 8d02c6746ca589e283737cfdc86e2d002fcc83a4..1c6e202d87b96753837a5ea321a4f314be6047a8 100644 (file)
--- a/blasblisacc/Makefile
+++ b/blasblisacc/Makefile
include ../make.inc
# use all for cross compilation
-cross: all
# build library
all:
cd src; $(MAKE)
+crossC66x:
+ cd src; $(MAKE) crossC66x
+
+crossAM57x:
+ cd src; $(MAKE) crossAM57x
+
+crossShannon:
+ cd src; $(MAKE) crossShannon
+
debug:
cd src; $(MAKE) debug
index 0431d63aa62f52e0d6115bc8670a21d1a53a4d62..1946377c397f40d394014ad3b3564c96e7f30836 100644 (file)
--- a/blasblisacc/src/Makefile
+++ b/blasblisacc/src/Makefile
include ../../make.inc
-TI_INSTALL_DIR?=/usr/src/dsp
-
-PATH:=$(TI_OCL_CGT_INSTALL)/bin:$(PATH)
-
-define FIND_DSP_PKG
- export $(1)?=$$(patsubst %/$(3),%,$$(lastword $$(sort $$(wildcard $$(TI_INSTALL_DIR)/$(2)/$(3)))))
- ifeq ($$($(1)),)
- $$(error ERROR - $(1) is not defined and could not be found in $(TI_INSTALL_DIR)/ )
- else
- ifeq ($$(wildcard $$($(1))/$(3)),)
- $$(error ERROR - "$(1) = $$($(1))" Is not valid!)
- endif
- endif
- $$(info Using $(1) = $$($(1)))
-endef
-
-UNAME_M :=$(shell uname -m)
-
-MAKEFLAGS += -j1
+PATH:=$(CGTROOT)/bin:$(PATH)
# Defines
TI_CBLAS_FAT_BINARY = 1
ARM_PLUS_DSP_LIB_DIR = ../lib
-# INCLUDE Directory
-$(eval $(call FIND_DSP_PKG,OMP_DIR,openmp_dsp*,packages))
-ifneq (,$(findstring 86, $(UNAME_M)))
-$(eval $(call FIND_DSP_PKG,C6636_PDK_DIR,pdk_keystone2*,packages))
-endif
-INCDIR := $(TI_OCL_CGT_INSTALL)/include;$(TARGET_ROOTDIR)/usr/include;../../blis/install/c66x/include/blis/;$(OMP_DIR)/packages/ti/runtime/openmp;$(PDK_DIR)/packages
+INCDIR := $(CGTROOT)/include
+INCDIR += -I$(OMP_DIR)/packages/ti/runtime/openmp
+INCDIR += -I$(FC_DIR)/packages
+INCDIR += -I$(XDC_DIR)/packages
+INCDIR += -I$(BIOS_DIR)/packages
+INCDIR += -I$(XDAIS_DIR)/packages
+INCDIR += -I$(LIBARCH_DIR)/include
+INCDIR += -I$(PDK_DIR)/packages
+INCDIR += -I$(TI_OCL_INSTALL_DIR)
INCS = -I. -I$(strip $(subst ;, -I,$(subst $(space),$(space),$(INCDIR))))
OBJS = ti_cblas_initfini.o
+# CBLAS and BLIS directories
+CBLAS_DSP_LIB = ../../cblas/lib/C66/libcblas.ae66
+TICBLAS_DSP_LIB = ../../ticblas/lib/libticblas.ae66
+CBLAS_ARM_LIB = ../../cblas/lib/ARM/libcblas_ARM.a
+LIBARCH_LIB = $(LIBARCH_DIR)/lib/libArch.ae66
+
+ifeq ($(MEM_MODEL),Large)
+BLIS_DSP_LIB = ../../blis/install/c66xLarge/lib/libblis.ae66
+else ifeq ($(MEM_MODEL),Medium)
+BLIS_DSP_LIB = ../../blis/install/c66xMedium/lib/libblis.ae66
+else ifeq ($(MEM_MODEL),Small)
+BLIS_DSP_LIB = ../../blis/install/c66xSmall/lib/libblis.ae66
+#else ifeq ($(MEM_MODEL),Tiny)
+endif
CPP_DEBUG = -g
-CPP_FLAGS = -D_LITTLE_ENDIAN -D__ARMv7 -DDEVICE_K2H -I../../cblas/include -I../../blis/install/arm/include/blis/ -I$(TI_OCL_INSTALL_DIR)/include -fopenmp
-CL6X_FLAGS = $(INCS) --openmp --use_g2
+CPP_FLAGS = -D_LITTLE_ENDIAN -D__ARMv7 -D$(TARGET) -I../../cblas/include -I../../blis/install/arm/include/blis/ -I$(TI_OCL_INSTALL_DIR)/include -fopenmp
+CL6X_FLAGS = $(INCS) --openmp --use_g2 -D$(TARGET) -DLIB_OPENCL
CLOCL_FLAGS =
OBJCOPY_ARGS=
ARM_PLUS_DSP_LIB= $(ARM_PLUS_DSP_LIB_DIR)/libcblas_armplusdsp.a
# OpenCL libraries included in make.inc
LIBS += -lpthread
-# CBLAS and BLIS directories
-CBLAS_DSP_LIB = ../../cblas/lib/C66/libcblas_C66.ae66
-BLIS_DSP_LIB = ../../blis/install/c66x/lib/libblis.ae66
-CBLAS_ARM_LIB = ../../cblas/lib/ARM/libcblas_ARM.a
-
OCL_BIN = ti_cblas_kernel.out
ifeq ($(TI_CBLAS_FAT_BINARY), 1)
OBJS += ofld_tbl_strsm.o
OBJS += ofld_tbl_ztrsm.o
-
all: armplusdsp
cross: armplusdsp
+crossC66x: BLIS_DSP_LIB = ../../blis/install/c66x/lib/libblis.ae66
+#crossC66x: CL6X_FLAGS+= -I../../blis/install/c66x/include/blis/
+crossC66x: cross $(OCL_BIN) $(OBJS)
+
+crossAM57x: BLIS_DSP_LIB = ../../blis/install/am57x/lib/libblis.ae66
+#crossAM57x: CL6X_FLAGS+= -I../../blis/install/am57x/include/blis/
+crossAM57x: cross $(OCL_BIN) $(OBJS)
+
+crossShannon: BLIS_DSP_LIB = ../../blis/install/shannon/lib/libblis.ae66
+#crossShannon: CL6X_FLAGS+= -I../../blis/install/shannon/include/blis/
+crossShannon: cross $(OCL_BIN) $(OBJS)
+
debug: CPP_FLAGS += -DTI_CBLAS_DEBUG $(CPP_DEBUG) #-DTI_CBLAS_PROFILE
debug: cross
+debug: CL6X_FLAGS += -DTI_CBLAS_DEBUG
profile: CPP_FLAGS += -DTI_CBLAS_PROFILE
profile: armplusdsp
ti_cblas_initfini.o: $(OCL_BIN)
# target for fat binary
-ti_cblas_kernel.dsp_h: ti_cblas_kernel.cl facade.obj $(CBLAS_DSP_LIB) $(BLIS_DSP_LIB)
+ti_cblas_kernel.dsp_h: ti_cblas_kernel.cl facade.obj ti_cblas_mem_config.obj $(CBLAS_DSP_LIB) $(BLIS_DSP_LIB) $(TICBLAS_DSP_LIB) $(LIBARCH_LIB)
@echo; echo Building $@
@rm -f ti_cblas_kernel.out
@echo Building fat binary header
index a2a7f1202d163beee946bac735023dbf197ff0bf..a4e8255c1be7b6fd9da86d6bd9b7ead1405007a9 100644 (file)
ARM_PLUS_DSP_LIB_DIR = ../lib
# INCLUDE Directory
-INCDIR := $(TI_OCL_CGT_INSTALL)/include;$(TI_OCL_INSTALL_DIR)/opencl/include;$(LINUX_DEVKIT_ROOT)/usr/include;../../blis/install/c66x/include/blis/;$(OMP_DIR)/packages/ti/runtime/openmp
+INCDIR := $(CGTROOT)/include;$(TI_OCL_INSTALL_DIR)/opencl/include;$(LINUX_DEVKIT_ROOT)/usr/include;../../blis/install/c66x/include/blis/;$(OMP_DIR)/packages/ti/runtime/openmp
INCS = -I. -I$(strip $(subst ;, -I,$(subst $(space),$(space),$(INCDIR))))
OBJS = ti_cblas_initfini.o
CPP_DEBUG = -g
-CPP_FLAGS = -D_LITTLE_ENDIAN -D__ARMv7 -DDEVICE_K2H -I../../cblas/include -I../../blis/install/arm/include/blis/ -I$(TI_OCL_INSTALL_DIR)/include -fopenmp
+CPP_FLAGS = -D_LITTLE_ENDIAN -D__ARMv7 -DSOC_K2H -I../../cblas/include -I../../blis/install/arm/include/blis/ -I$(TI_OCL_INSTALL_DIR)/include -fopenmp
CL6X_FLAGS = $(INCS) --openmp --use_g2
CLOCL_FLAGS =
OBJCOPY_ARGS=
ARM_PLUS_DSP_LIB= $(ARM_PLUS_DSP_LIB_DIR)/libcblas_armplusdsp.a
-# CBLAS and BLIS directories
-CBLAS_DSP_LIB = ../../cblas/lib/C66/libcblas_C66.ae66
-BLIS_DSP_LIB = ../../blis/install/c66x/lib/libblis.ae66
-CBLAS_ARM_LIB = ../../cblas/lib/ARM/libcblas_ARM.a
-
OCL_BIN = ti_cblas_kernel.out
ifeq ($(TI_CBLAS_FAT_BINARY), 1)
index b4f117b66c2811b621db1166e5c297aaafc97161..547d98328ae13fad4ada8e194bedd8736449e132 100644 (file)
--- a/blasblisacc/src/facade.c
+++ b/blasblisacc/src/facade.c
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include <stdio.h>
#include "../../cblas/include/cblas.h"
-#include "blis.h"
-#define DEVICE_K2H
+#include "../../ticblas/ticblas.h"
-#include <dsp_c.h>
+#ifdef TI_CBLAS_DEBUG
+#include "stdio.h"
-#define getNextMultiple(x, y) ( ( ((x)+(y)-1)/(y) )* (y) )
-// L1 buffer is hardwared here
-#define L1_BUF_LOC 0x00F00000
+extern char *pool_mk_mem_L1;
+extern char *pool_kn_mem_L1;
+extern char *pool_mn_mem_L1;
+extern char *pool_mk_mem_L2;
+extern char *pool_kn_mem_L2;
+extern char *pool_mn_mem_L2;
+extern char *pool_mk_mem_L3;
+extern char *pool_kn_mem_L3;
+extern char *pool_mn_mem_L3;
+#endif
-// note these pointers must be filled if used functions
-char *pool_mk_mem_L1;
-char *pool_kn_mem_L1;
-char *pool_mn_mem_L1;
+extern int bli_l3_mem_config(void *msmc_buf, size_t msmc_buf_size, void *ddr_buf, size_t ddr_buf_size, size_t *l1D_SRAM_size_orig, size_t *l2_SRAM_size_orig);
+extern int bli_l3_mem_reconfig(size_t l1D_SRAM_size_orig, size_t l2_SRAM_size_orig);
-char *pool_mk_mem_L2;
-char *pool_kn_mem_L2;
-char *pool_mn_mem_L2;
-
-char *pool_mk_mem_L3;
-char *pool_kn_mem_L3;
-char *pool_mn_mem_L3;
-
-void ti_bli_init_dsp(char *l3_buf, char *l2_buf)
-{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- bli_init();
-}
-
-void ti_bli_finalize_dsp(void)
-{
- bli_finalize();
-}
void cblas_caxpy_facade(const int N, const void *alpha, const void *X, const int incX, void *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_caxpy(N, alpha, X, incX, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_ccopy_facade(const int N, const void *X, const int incX, void *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_ccopy(N, X, incX, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_cdotc_sub_facade(const int N, const void *X, const int incX, const void *Y, const int incY, void *dotc)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_cdotc_sub(N, X, incX, Y, incY, dotc);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_cdotu_sub_facade(const int N, const void *X, const int incX, const void *Y, const int incY, void *dotu)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_cdotu_sub(N, X, incX, Y, incY, dotu);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_cgbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, const int KL, const int KU, const void *alpha, const void *A, const int lda, const void *X, const int incX, const void *beta, void *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_cgbmv(order, TransA, M, N, KL, KU, alpha, A, lda, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
-void cblas_cgemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc, float *l3_buf, float *l2_buf_loc)
+void cblas_cgemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
-
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_cgemm(Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
void cblas_cgemv_facade(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, const void *alpha, const void *A, const int lda, const void *X, const int incX, const void *beta, void *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_cgemv(order, TransA, M, N, alpha, A, lda, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_cgerc_facade(const enum CBLAS_ORDER order, const int M, const int N, const void *alpha, const void *X, const int incX, const void *Y, const int incY, void *A, const int lda)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_cgerc(order, M, N, alpha, X, incX, Y, incY, A, lda);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_cgeru_facade(const enum CBLAS_ORDER order, const int M, const int N, const void *alpha, const void *X, const int incX, const void *Y, const int incY, void *A, const int lda)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_cgeru(order, M, N, alpha, X, incX, Y, incY, A, lda);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_chbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const int K, const void *alpha, const void *A, const int lda, const void *X, const int incX, const void *beta, void *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_chbmv(order, Uplo, N, K, alpha, A, lda, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
-void cblas_chemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc, float *l3_buf, float *l2_buf_loc)
+void cblas_chemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_chemm(Order, Side, Uplo, M, N, alpha, A, lda, B, ldb, beta, C, ldc);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
void cblas_chemv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const void *alpha, const void *A, const int lda, const void *X, const int incX, const void *beta, void *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_chemv(order, Uplo, N, alpha, A, lda, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_cher_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const float alpha, const void *X, const int incX, void *A, const int lda)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_cher(order, Uplo, N, alpha, X, incX, A, lda);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_cher2_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const void *alpha, const void *X, const int incX, const void *Y, const int incY, void *A, const int lda)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_cher2(order, Uplo, N, alpha, X, incX, Y, incY, A, lda);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
-void cblas_cher2k_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const float beta, void *C, const int ldc, float *l3_buf, float *l2_buf_loc)
+void cblas_cher2k_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const float beta, void *C, const int ldc, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_cher2k(Order, Uplo, Trans, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
-void cblas_cherk_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const float alpha, const void *A, const int lda, const float beta, void *C, const int ldc, float *l3_buf, float *l2_buf_loc)
+void cblas_cherk_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const float alpha, const void *A, const int lda, const float beta, void *C, const int ldc, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_cherk(Order, Uplo, Trans, N, K, alpha, A, lda, beta, C, ldc);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
void cblas_chpmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const void *alpha, const void *Ap, const void *X, const int incX, const void *beta, void *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_chpmv(order, Uplo, N, alpha, Ap, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_chpr_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const float alpha, const void *X, const int incX, void *A)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_chpr(order, Uplo, N, alpha, X, incX, A);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_chpr2_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const void *alpha, const void *X, const int incX, const void *Y, const int incY, void *Ap)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_chpr2(order, Uplo, N, alpha, X, incX, Y, incY, Ap);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_crotg_facade(void *a, void *b, float *c, void *s)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_crotg(a, b, c, s);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_cscal_facade(const int N, const void *alpha, void *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_cscal(N, alpha, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_csscal_facade(const int N, const float alpha, void *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_csscal(N, alpha, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_cswap_facade(const int N, void *X, const int incX, void *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_cswap(N, X, incX, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
-void cblas_csymm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc, float *l3_buf, float *l2_buf_loc)
+void cblas_csymm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_csymm(Order, Side, Uplo, M, N, alpha, A, lda, B, ldb, beta, C, ldc);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
-void cblas_csyr2k_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc, float *l3_buf, float *l2_buf_loc)
+void cblas_csyr2k_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_csyr2k(Order, Uplo, Trans, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
-void cblas_csyrk_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const void *alpha, const void *A, const int lda, const void *beta, void *C, const int ldc, float *l3_buf, float *l2_buf_loc)
+void cblas_csyrk_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const void *alpha, const void *A, const int lda, const void *beta, void *C, const int ldc, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_csyrk(Order, Uplo, Trans, N, K, alpha, A, lda, beta, C, ldc);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
void cblas_ctbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const int K, const void *A, const int lda, void *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_ctbmv(order, Uplo, TransA, Diag, N, K, A, lda, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_ctbsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const int K, const void *A, const int lda, void *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_ctbsv(order, Uplo, TransA, Diag, N, K, A, lda, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_ctpmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const void *Ap, void *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_ctpmv(order, Uplo, TransA, Diag, N, Ap, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_ctpsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const void *Ap, void *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_ctpsv(order, Uplo, TransA, Diag, N, Ap, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
-void cblas_ctrmm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const void *alpha, const void *A, const int lda, void *B, const int ldb, float *l3_buf, float *l2_buf_loc)
+void cblas_ctrmm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const void *alpha, const void *A, const int lda, void *B, const int ldb, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_ctrmm(Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
void cblas_ctrmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const void *A, const int lda, void *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_ctrmv(order, Uplo, TransA, Diag, N, A, lda, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
-void cblas_ctrsm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const void *alpha, const void *A, const int lda, void *B, const int ldb, float *l3_buf, float *l2_buf_loc)
+void cblas_ctrsm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const void *alpha, const void *A, const int lda, void *B, const int ldb, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_ctrsm(Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
void cblas_ctrsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const void *A, const int lda, void *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_ctrsv(order, Uplo, TransA, Diag, N, A, lda, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dasum_facade(const int N, const double *X, const int incX, double *retval)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
*retval = cblas_dasum(N, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_daxpy_facade(const int N, const double alpha, const double *X, const int incX, double *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_daxpy(N, alpha, X, incX, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dcopy_facade(const int N, const double *X, const int incX, double *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dcopy(N, X, incX, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_ddot_facade(const int N, const double *X, const int incX, const double *Y, const int incY, double *retval)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
*retval = cblas_ddot(N, X, incX, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dgbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, const int KL, const int KU, const double alpha, const double *A, const int lda, const double *X, const int incX, const double beta, double *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dgbmv(order, TransA, M, N, KL, KU, alpha, A, lda, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
-void cblas_dgemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const double alpha, const double *A, const int lda, const double *B, const int ldb, const double beta, double *C, const int ldc, float *l3_buf, float *l2_buf_loc)
+void cblas_dgemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const double alpha, const double *A, const int lda, const double *B, const int ldb, const double beta, double *C, const int ldc, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
-// printf("dgemm facade A: %x, B: %x\n", A, B);
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_dgemm(Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
void cblas_dgemv_facade(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, const double alpha, const double *A, const int lda, const double *X, const int incX, const double beta, double *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dgemv(order, TransA, M, N, alpha, A, lda, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dger_facade(const enum CBLAS_ORDER order, const int M, const int N, const double alpha, const double *X, const int incX, const double *Y, const int incY, double *A, const int lda)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dger(order, M, N, alpha, X, incX, Y, incY, A, lda);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dnrm2_facade(const int N, const double *X, const int incX, double *retval)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
*retval = cblas_dnrm2(N, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_drot_facade(const int N, double *X, const int incX, double *Y, const int incY, const double c, const double s)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_drot(N, X, incX, Y, incY, c, s);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_drotg_facade(double *a, double *b, double *c, double *s)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_drotg(a, b, c, s);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_drotm_facade(const int N, double *X, const int incX, double *Y, const int incY, const double *P)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_drotm(N, X, incX, Y, incY, P);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_drotmg_facade(double *d1, double *d2, double *b1, const double b2, double *P)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_drotmg(d1, d2, b1, b2, P);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dsbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const int K, const double alpha, const double *A, const int lda, const double *X, const int incX, const double beta, double *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dsbmv(order, Uplo, N, K, alpha, A, lda, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dscal_facade(const int N, const double alpha, double *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dscal(N, alpha, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dsdot_facade(const int N, const float *X, const int incX, const float *Y, const int incY, double *retval)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
*retval = cblas_dsdot(N, X, incX, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dspmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const double alpha, const double *Ap, const double *X, const int incX, const double beta, double *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dspmv(order, Uplo, N, alpha, Ap, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dspr_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const double alpha, const double *X, const int incX, double *Ap)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dspr(order, Uplo, N, alpha, X, incX, Ap);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dspr2_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const double alpha, const double *X, const int incX, const double *Y, const int incY, double *A)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dspr2(order, Uplo, N, alpha, X, incX, Y, incY, A);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dswap_facade(const int N, double *X, const int incX, double *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dswap(N, X, incX, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
-void cblas_dsymm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, const double alpha, const double *A, const int lda, const double *B, const int ldb, const double beta, double *C, const int ldc, float *l3_buf, float *l2_buf_loc)
+void cblas_dsymm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, const double alpha, const double *A, const int lda, const double *B, const int ldb, const double beta, double *C, const int ldc, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_dsymm(Order, Side, Uplo, M, N, alpha, A, lda, B, ldb, beta, C, ldc);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
void cblas_dsymv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const double alpha, const double *A, const int lda, const double *X, const int incX, const double beta, double *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dsymv(order, Uplo, N, alpha, A, lda, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dsyr_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const double alpha, const double *X, const int incX, double *A, const int lda)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dsyr(order, Uplo, N, alpha, X, incX, A, lda);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dsyr2_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const double alpha, const double *X, const int incX, const double *Y, const int incY, double *A, const int lda)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dsyr2(order, Uplo, N, alpha, X, incX, Y, incY, A, lda);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
-void cblas_dsyr2k_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const double alpha, const double *A, const int lda, const double *B, const int ldb, const double beta, double *C, const int ldc, float *l3_buf, float *l2_buf_loc)
+void cblas_dsyr2k_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const double alpha, const double *A, const int lda, const double *B, const int ldb, const double beta, double *C, const int ldc, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_dsyr2k(Order, Uplo, Trans, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
-void cblas_dsyrk_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const double alpha, const double *A, const int lda, const double beta, double *C, const int ldc, float *l3_buf, float *l2_buf_loc)
+void cblas_dsyrk_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const double alpha, const double *A, const int lda, const double beta, double *C, const int ldc, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_dsyrk(Order, Uplo, Trans, N, K, alpha, A, lda, beta, C, ldc);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
void cblas_dtbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const int K, const double *A, const int lda, double *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dtbmv(order, Uplo, TransA, Diag, N, K, A, lda, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dtbsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const int K, const double *A, const int lda, double *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dtbsv(order, Uplo, TransA, Diag, N, K, A, lda, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dtpmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const double *Ap, double *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dtpmv(order, Uplo, TransA, Diag, N, Ap, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dtpsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const double *Ap, double *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dtpsv(order, Uplo, TransA, Diag, N, Ap, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
-void cblas_dtrmm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const double alpha, const double *A, const int lda, double *B, const int ldb, float *l3_buf, float *l2_buf_loc)
+void cblas_dtrmm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const double alpha, const double *A, const int lda, double *B, const int ldb, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_dtrmm(Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
void cblas_dtrmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const double *A, const int lda, double *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dtrmv(order, Uplo, TransA, Diag, N, A, lda, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
-void cblas_dtrsm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const double alpha, const double *A, const int lda, double *B, const int ldb, float *l3_buf, float *l2_buf_loc)
+void cblas_dtrsm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const double alpha, const double *A, const int lda, double *B, const int ldb, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_dtrsm(Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
void cblas_dtrsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const double *A, const int lda, double *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dtrsv(order, Uplo, TransA, Diag, N, A, lda, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dzasum_facade(const int N, const void *X, const int incX, double *retval)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
*retval = cblas_dzasum(N, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dznrm2_facade(const int N, const void *X, const int incX, double *retval)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
*retval = cblas_dznrm2(N, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_icamax_facade(const int N, const void *X, const int incX, CBLAS_INDEX *retval)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
*retval = cblas_icamax(N, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_idamax_facade(const int N, const double *X, const int incX, CBLAS_INDEX *retval)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
*retval = cblas_idamax(N, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_isamax_facade(const int N, const float *X, const int incX, CBLAS_INDEX *retval)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
*retval = cblas_isamax(N, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_izamax_facade(const int N, const void *X, const int incX, CBLAS_INDEX *retval)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
*retval = cblas_izamax(N, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_sasum_facade(const int N, const float *X, const int incX, float *retval)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
*retval = cblas_sasum(N, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_saxpy_facade(const int N, const float alpha, const float *X, const int incX, float *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_saxpy(N, alpha, X, incX, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_scasum_facade(const int N, const void *X, const int incX, float *retval)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
*retval = cblas_scasum(N, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_scnrm2_facade(const int N, const void *X, const int incX, float *retval)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
*retval = cblas_scnrm2(N, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_scopy_facade(const int N, const float *X, const int incX, float *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_scopy(N, X, incX, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_sdot_facade(const int N, const float *X, const int incX, const float *Y, const int incY, float *retval)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
*retval = cblas_sdot(N, X, incX, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_sdsdot_facade(const int N, const float alpha, const float *X, const int incX, const float *Y, const int incY, float *retval)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
*retval = cblas_sdsdot(N, alpha, X, incX, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_sgbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, const int KL, const int KU, const float alpha, const float *A, const int lda, const float *X, const int incX, const float beta, float *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_sgbmv(order, TransA, M, N, KL, KU, alpha, A, lda, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
-void cblas_sgemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const float alpha, const float *A, const int lda, const float *B, const int ldb, const float beta, float *C, const int ldc, float *l3_buf, float *l2_buf_loc)
+void cblas_sgemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const float alpha, const float *A, const int lda, const float *B, const int ldb, const float beta, float *C, const int ldc, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_sgemm(Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
void cblas_sgemv_facade(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, const float alpha, const float *A, const int lda, const float *X, const int incX, const float beta, float *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_sgemv(order, TransA, M, N, alpha, A, lda, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_sger_facade(const enum CBLAS_ORDER order, const int M, const int N, const float alpha, const float *X, const int incX, const float *Y, const int incY, float *A, const int lda)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_sger(order, M, N, alpha, X, incX, Y, incY, A, lda);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_snrm2_facade(const int N, const float *X, const int incX, float *retval)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
*retval = cblas_snrm2(N, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_srot_facade(const int N, float *X, const int incX, float *Y, const int incY, const float c, const float s)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_srot(N, X, incX, Y, incY, c, s);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_srotg_facade(float *a, float *b, float *c, float *s)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_srotg(a, b, c, s);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_srotm_facade(const int N, float *X, const int incX, float *Y, const int incY, const float *P)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_srotm(N, X, incX, Y, incY, P);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_srotmg_facade(float *d1, float *d2, float *b1, const float b2, float *P)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_srotmg(d1, d2, b1, b2, P);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_ssbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const int K, const float alpha, const float *A, const int lda, const float *X, const int incX, const float beta, float *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_ssbmv(order, Uplo, N, K, alpha, A, lda, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_sscal_facade(const int N, const float alpha, float *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_sscal(N, alpha, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_sspmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const float alpha, const float *Ap, const float *X, const int incX, const float beta, float *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_sspmv(order, Uplo, N, alpha, Ap, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_sspr_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const float alpha, const float *X, const int incX, float *Ap)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_sspr(order, Uplo, N, alpha, X, incX, Ap);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_sspr2_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const float alpha, const float *X, const int incX, const float *Y, const int incY, float *A)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_sspr2(order, Uplo, N, alpha, X, incX, Y, incY, A);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_sswap_facade(const int N, float *X, const int incX, float *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_sswap(N, X, incX, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
-void cblas_ssymm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, const float alpha, const float *A, const int lda, const float *B, const int ldb, const float beta, float *C, const int ldc, float *l3_buf, float *l2_buf_loc)
+void cblas_ssymm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, const float alpha, const float *A, const int lda, const float *B, const int ldb, const float beta, float *C, const int ldc, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_ssymm(Order, Side, Uplo, M, N, alpha, A, lda, B, ldb, beta, C, ldc);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
void cblas_ssymv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const float alpha, const float *A, const int lda, const float *X, const int incX, const float beta, float *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_ssymv(order, Uplo, N, alpha, A, lda, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_ssyr_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const float alpha, const float *X, const int incX, float *A, const int lda)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_ssyr(order, Uplo, N, alpha, X, incX, A, lda);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_ssyr2_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const float alpha, const float *X, const int incX, const float *Y, const int incY, float *A, const int lda)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_ssyr2(order, Uplo, N, alpha, X, incX, Y, incY, A, lda);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
-void cblas_ssyr2k_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const float alpha, const float *A, const int lda, const float *B, const int ldb, const float beta, float *C, const int ldc, float *l3_buf, float *l2_buf_loc)
+void cblas_ssyr2k_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const float alpha, const float *A, const int lda, const float *B, const int ldb, const float beta, float *C, const int ldc, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_ssyr2k(Order, Uplo, Trans, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
-void cblas_ssyrk_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const float alpha, const float *A, const int lda, const float beta, float *C, const int ldc, float *l3_buf, float *l2_buf_loc)
+void cblas_ssyrk_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const float alpha, const float *A, const int lda, const float beta, float *C, const int ldc, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_ssyrk(Order, Uplo, Trans, N, K, alpha, A, lda, beta, C, ldc);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
void cblas_stbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const int K, const float *A, const int lda, float *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_stbmv(order, Uplo, TransA, Diag, N, K, A, lda, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_stbsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const int K, const float *A, const int lda, float *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_stbsv(order, Uplo, TransA, Diag, N, K, A, lda, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_stpmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const float *Ap, float *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_stpmv(order, Uplo, TransA, Diag, N, Ap, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_stpsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const float *Ap, float *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_stpsv(order, Uplo, TransA, Diag, N, Ap, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
-void cblas_strmm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const float alpha, const float *A, const int lda, float *B, const int ldb, float *l3_buf, float *l2_buf_loc)
+void cblas_strmm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const float alpha, const float *A, const int lda, float *B, const int ldb, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_strmm(Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
void cblas_strmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const float *A, const int lda, float *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_strmv(order, Uplo, TransA, Diag, N, A, lda, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
-void cblas_strsm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const float alpha, const float *A, const int lda, float *B, const int ldb, float *l3_buf, float *l2_buf_loc)
+void cblas_strsm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const float alpha, const float *A, const int lda, float *B, const int ldb, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_strsm(Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
void cblas_strsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const float *A, const int lda, float *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_strsv(order, Uplo, TransA, Diag, N, A, lda, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_xerbla_facade(int p, const char *rout, const char *form)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_xerbla(p, rout, form);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_zaxpy_facade(const int N, const void *alpha, const void *X, const int incX, void *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zaxpy(N, alpha, X, incX, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_zcopy_facade(const int N, const void *X, const int incX, void *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zcopy(N, X, incX, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_zdotc_sub_facade(const int N, const void *X, const int incX, const void *Y, const int incY, void *dotc)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zdotc_sub(N, X, incX, Y, incY, dotc);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_zdotu_sub_facade(const int N, const void *X, const int incX, const void *Y, const int incY, void *dotu)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zdotu_sub(N, X, incX, Y, incY, dotu);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_zdscal_facade(const int N, const double alpha, void *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zdscal(N, alpha, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_zgbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, const int KL, const int KU, const void *alpha, const void *A, const int lda, const void *X, const int incX, const void *beta, void *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zgbmv(order, TransA, M, N, KL, KU, alpha, A, lda, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
-void cblas_zgemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc, float *l3_buf, float *l2_buf_loc)
+void cblas_zgemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_zgemm(Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
void cblas_zgemv_facade(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, const void *alpha, const void *A, const int lda, const void *X, const int incX, const void *beta, void *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zgemv(order, TransA, M, N, alpha, A, lda, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_zgerc_facade(const enum CBLAS_ORDER order, const int M, const int N, const void *alpha, const void *X, const int incX, const void *Y, const int incY, void *A, const int lda)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zgerc(order, M, N, alpha, X, incX, Y, incY, A, lda);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_zgeru_facade(const enum CBLAS_ORDER order, const int M, const int N, const void *alpha, const void *X, const int incX, const void *Y, const int incY, void *A, const int lda)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zgeru(order, M, N, alpha, X, incX, Y, incY, A, lda);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_zhbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const int K, const void *alpha, const void *A, const int lda, const void *X, const int incX, const void *beta, void *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zhbmv(order, Uplo, N, K, alpha, A, lda, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
-void cblas_zhemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc, float *l3_buf, float *l2_buf_loc)
+void cblas_zhemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_zhemm(Order, Side, Uplo, M, N, alpha, A, lda, B, ldb, beta, C, ldc);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
void cblas_zhemv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const void *alpha, const void *A, const int lda, const void *X, const int incX, const void *beta, void *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zhemv(order, Uplo, N, alpha, A, lda, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_zher_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const double alpha, const void *X, const int incX, void *A, const int lda)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zher(order, Uplo, N, alpha, X, incX, A, lda);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_zher2_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const void *alpha, const void *X, const int incX, const void *Y, const int incY, void *A, const int lda)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zher2(order, Uplo, N, alpha, X, incX, Y, incY, A, lda);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
-void cblas_zher2k_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const double beta, void *C, const int ldc, float *l3_buf, float *l2_buf_loc)
+void cblas_zher2k_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const double beta, void *C, const int ldc, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_zher2k(Order, Uplo, Trans, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
-void cblas_zherk_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const double alpha, const void *A, const int lda, const double beta, void *C, const int ldc, float *l3_buf, float *l2_buf_loc)
+void cblas_zherk_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const double alpha, const void *A, const int lda, const double beta, void *C, const int ldc, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_zherk(Order, Uplo, Trans, N, K, alpha, A, lda, beta, C, ldc);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
void cblas_zhpmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const void *alpha, const void *Ap, const void *X, const int incX, const void *beta, void *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zhpmv(order, Uplo, N, alpha, Ap, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_zhpr_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const double alpha, const void *X, const int incX, void *A)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zhpr(order, Uplo, N, alpha, X, incX, A);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_zhpr2_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const void *alpha, const void *X, const int incX, const void *Y, const int incY, void *Ap)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zhpr2(order, Uplo, N, alpha, X, incX, Y, incY, Ap);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_zrotg_facade(void *a, void *b, double *c, void *s)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zrotg(a, b, c, s);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_zscal_facade(const int N, const void *alpha, void *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zscal(N, alpha, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_zswap_facade(const int N, void *X, const int incX, void *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zswap(N, X, incX, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
-void cblas_zsymm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc, float *l3_buf, float *l2_buf_loc)
+void cblas_zsymm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_zsymm(Order, Side, Uplo, M, N, alpha, A, lda, B, ldb, beta, C, ldc);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
-void cblas_zsyr2k_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc, float *l3_buf, float *l2_buf_loc)
+void cblas_zsyr2k_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_zsyr2k(Order, Uplo, Trans, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
-void cblas_zsyrk_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const void *alpha, const void *A, const int lda, const void *beta, void *C, const int ldc, float *l3_buf, float *l2_buf_loc)
+void cblas_zsyrk_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const void *alpha, const void *A, const int lda, const void *beta, void *C, const int ldc, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_zsyrk(Order, Uplo, Trans, N, K, alpha, A, lda, beta, C, ldc);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
void cblas_ztbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const int K, const void *A, const int lda, void *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_ztbmv(order, Uplo, TransA, Diag, N, K, A, lda, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_ztbsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const int K, const void *A, const int lda, void *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_ztbsv(order, Uplo, TransA, Diag, N, K, A, lda, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_ztpmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const void *Ap, void *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_ztpmv(order, Uplo, TransA, Diag, N, Ap, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_ztpsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const void *Ap, void *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_ztpsv(order, Uplo, TransA, Diag, N, Ap, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
-void cblas_ztrmm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const void *alpha, const void *A, const int lda, void *B, const int ldb, float *l3_buf, float *l2_buf_loc)
+void cblas_ztrmm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const void *alpha, const void *A, const int lda, void *B, const int ldb, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_ztrmm(Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
void cblas_ztrmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const void *A, const int lda, void *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_ztrmv(order, Uplo, TransA, Diag, N, A, lda, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
-void cblas_ztrsm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const void *alpha, const void *A, const int lda, void *B, const int ldb, float *l3_buf, float *l2_buf_loc)
+void cblas_ztrsm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const void *alpha, const void *A, const int lda, void *B, const int ldb, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_ztrsm(Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
void cblas_ztrsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const void *A, const int lda, void *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_ztrsv(order, Uplo, TransA, Diag, N, A, lda, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
index 512b3de01ec805dcc16adf503ac940db972b1786..e9ca10dcfb969a338d9680fcc12c4c85cb24a35c 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
char ofld_tbl_cgemm[GEMM_OFFLOAD_TBL_SIZE] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
index 33e7a384d61192c03d14b038302493ed2c0d6d33..74c637b8c29f1af78277048e32a85d8e0b9e30b4 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
char ofld_tbl_csyrk[SYRK_OFFLOAD_TBL_SIZE] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
index 959136b0435c881f0215cfe95d0e268f74ce064d..1d0522cfe906a8ce216d6182ca89afcdc4da839a 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
char ofld_tbl_ctrmm[TRMM_OFFLOAD_TBL_SIZE] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
index d817eb843661e4042b93db86ed79762f83ffd4d9..21dcdaf7e2a7a418860ef9ce237f86261e913bf6 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
char ofld_tbl_ctrsm[TRMM_OFFLOAD_TBL_SIZE] = {
0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,
index f24fc228aa363b2fde5a0123d55a7ee8da8ba607..75819ee81081f97ab372f0f4087fb7844240f1be 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
char ofld_tbl_dgemm[GEMM_OFFLOAD_TBL_SIZE] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
index ee3a346a32154d9156b9261ee4b27ca9ace25bc2..776469b243fcba5789a8c3a4eb7169419754351e 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
char ofld_tbl_dsyrk[SYRK_OFFLOAD_TBL_SIZE] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
index f96dec82c700530e4b02a7ff75f6cc9dda04c444..446189f43978c51e8a802b985670cda0ed2a9ad7 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
char ofld_tbl_dtrmm[TRMM_OFFLOAD_TBL_SIZE] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
index 29cfd611247f54943909be96d0f6b874cd273d19..daeb38ba53c4633d809a81df30524e28aff58e81 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
char ofld_tbl_dtrsm[TRMM_OFFLOAD_TBL_SIZE] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
index 4ecf176f0d708a3447bc4afa063dc603347edc52..b058b2fc71543a14d0c789e1a2cd0032bfb7accd 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
char ofld_tbl_sgemm[GEMM_OFFLOAD_TBL_SIZE] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
index 0600ebb476a2ccb6a6310810265da06947b03880..5b89cc96949345af939f91043e4f3667e16cb38c 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
char ofld_tbl_ssyrk[SYRK_OFFLOAD_TBL_SIZE] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
index e108f6561fc2c6ccf09a352b6e50826eab609a36..22f14e7ad1f5d14889ff04d2ffb6187e05ecfae5 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
char ofld_tbl_strmm[TRMM_OFFLOAD_TBL_SIZE] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
index 9cbd4848161076093d95d09f4aa42a3803e757cd..68cc3ae8c6e40b9571c82dc91920a6d7d805fc96 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
char ofld_tbl_strsm[TRMM_OFFLOAD_TBL_SIZE] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
index 594dee1d142bbc22e54a1dc76a450385a6ec7105..f7259284446da61c46a6b666b5c75e5cc6a7746a 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
char ofld_tbl_zgemm[GEMM_OFFLOAD_TBL_SIZE] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
index e713dea37beddd640da49d37be12db9ba2bca51b..8df383eabbb90e63e03754c186af49196c64c714 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
char ofld_tbl_zsyrk[SYRK_OFFLOAD_TBL_SIZE] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
index 6aa24fb6936eaae5fdfd2d310971736966e647b7..0d9caa8943f49acd5e7c1da22b0eb6ee20febfc2 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
char ofld_tbl_ztrmm[TRMM_OFFLOAD_TBL_SIZE] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
index 012cec7311442a93c98fc9c2267eb9e836fa6565..2d630e8e4c0e0dfc0fc5caa0fb2c34dfe49e42ce 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
char ofld_tbl_ztrsm[TRMM_OFFLOAD_TBL_SIZE] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
similarity index 99%
rename from blasblisacc/src/ti_cblas.h
rename to blasblisacc/src/ti_cblas_acc.h
index fdea549af1fc8564b93fe57071c73b12b391c815..70a040c7758529e14f487b8e89a092300dd61e3c 100644 (file)
rename from blasblisacc/src/ti_cblas.h
rename to blasblisacc/src/ti_cblas_acc.h
index fdea549af1fc8564b93fe57071c73b12b391c815..70a040c7758529e14f487b8e89a092300dd61e3c 100644 (file)
#define MSMC_BUF_SIZE 0x47FDC0
//#define MSMC_BUF_SIZE 0x47F100 // MR=NR=4 for S
+#define DDR_BUF_SIZE (16384)
//DSPBLIS
//#define MSMC_BUF_SIZE 0x400000
index 2af9c66bb289f3c7b7fd3fb5ebecebede04e22af..9f20b26c5c959bd22f81091306731494796cd593 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_caxpy");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_caxpy");
- __real_cblas_caxpy(N,alpha,X,incX,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_caxpy", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_caxpy");
+ __real_cblas_caxpy(N,alpha,X,incX,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_caxpy", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_caxpy");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_caxpy");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CAXPY_IDX, "ocl_cblas_caxpy");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CAXPY_IDX, "ocl_cblas_caxpy");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_caxpy");
index 4458a8428c256bcc5ec52ac49f23b716fb4c8907..bf48d8435b8ebb2d106c242cd1e87bb2797835e3 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_ccopy");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ccopy");
- __real_cblas_ccopy(N,X,incX,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ccopy", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ccopy");
+ __real_cblas_ccopy(N,X,incX,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ccopy", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ccopy");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ccopy");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CCOPY_IDX, "ocl_cblas_ccopy");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CCOPY_IDX, "ocl_cblas_ccopy");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_ccopy");
diff --git a/blasblisacc/src/ti_cblas_cblas_cdotc_sub.c b/blasblisacc/src/ti_cblas_cblas_cdotc_sub.c
index 730493ab1d33db83f82e64e998bdecd9f979192c..c54530ef6cebdcdc221d47d613b5b5b405ba2ce3 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_cdotc_sub");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cdotc_sub");
- __real_cblas_cdotc_sub(N,X,incX,Y,incY,dotc);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cdotc_sub", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cdotc_sub");
+ __real_cblas_cdotc_sub(N,X,incX,Y,incY,dotc);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cdotc_sub", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cdotc_sub");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cdotc_sub");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CDOTC_SUB_IDX, "ocl_cblas_cdotc_sub");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CDOTC_SUB_IDX, "ocl_cblas_cdotc_sub");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_cdotc_sub");
diff --git a/blasblisacc/src/ti_cblas_cblas_cdotu_sub.c b/blasblisacc/src/ti_cblas_cblas_cdotu_sub.c
index 8f795c7306faa23fef1358d10c2d73c51855ee14..4070c52189bb4baecbceff2636f44f78a3583df5 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_cdotu_sub");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cdotu_sub");
- __real_cblas_cdotu_sub(N,X,incX,Y,incY,dotu);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cdotu_sub", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cdotu_sub");
+ __real_cblas_cdotu_sub(N,X,incX,Y,incY,dotu);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cdotu_sub", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cdotu_sub");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cdotu_sub");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CDOTU_SUB_IDX, "ocl_cblas_cdotu_sub");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CDOTU_SUB_IDX, "ocl_cblas_cdotu_sub");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
@@ -144,9 +144,10 @@ void cblas_cdotu_sub(const int N, const void *X, const int incX, const void *Y,
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_cdotu_sub");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_cdotu_sub", (float) clock_diff);
return ;
index 954148f3cd5a81443707ac8cf6dc37fe80a79a3f..f425fa1f73742d0754dbdc86339d717072ac2e0f 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_cgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_cgbmv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cgbmv");
- __real_cblas_cgbmv(order,TransA,M,N,KL,KU,alpha,A,lda,X,incX,beta,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cgbmv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cgbmv");
+ __real_cblas_cgbmv(order,TransA,M,N,KL,KU,alpha,A,lda,X,incX,beta,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cgbmv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cgbmv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cgbmv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CGBMV_IDX, "ocl_cblas_cgbmv");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CGBMV_IDX, "ocl_cblas_cgbmv");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -206,8 +206,8 @@ void cblas_cgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_cgbmv");
index a8edae8b8dbce36fd6af685df6deff9a5b800d3b..3eb73efaa015ccac8fb7a71cb33dd5c1b3daf95c 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_cgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_cgemm");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!cgemm_offload_dsp(Order,M,N,K)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cgemm");
- __real_cblas_cgemm(Order,TransA,TransB,M,N,K,alpha,A,lda,B,ldb,beta,C,ldc);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cgemm", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!cgemm_offload_dsp(Order,M,N,K)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cgemm");
+ __real_cblas_cgemm(Order,TransA,TransB,M,N,K,alpha,A,lda,B,ldb,beta,C,ldc);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cgemm", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cgemm");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cgemm");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CGEMM_IDX, "ocl_cblas_cgemm");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CGEMM_IDX, "ocl_cblas_cgemm");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -196,6 +196,7 @@ void cblas_cgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
#endif
void *msmc_ptr;
+ size_t msmc_size = MSMC_BUF_SIZE;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
@@ -211,9 +212,28 @@ void cblas_cgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
#endif
#ifdef __cplusplus
- __K->setArg(15, __local(L2_BUF_SIZE));
+ __K->setArg(15, msmc_size);
#else
- err |= clSetKernelArg(__K, 15, L2_BUF_SIZE, NULL);
+ err |= clSetKernelArg(__K, 15, sizeof(msmc_size), &msmc_size);
+#endif
+
+ void *ddr_ptr;
+ size_t ddr_size = DDR_BUF_SIZE;
+ ddr_ptr = __malloc_ddr(DDR_BUF_SIZE);
+ Buffer buf_DDR(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, DDR_BUF_SIZE, ddr_ptr);
+ __K->setArg(16, buf_DDR);
+ __K->setArg(17, ddr_size);
+
+ /* create a buffer argument to get the return error code from the DSP */
+ int err_code;
+#ifdef __cplusplus
+ Buffer buf_err(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code);
+ __K->setArg(18, buf_err);
+#else
+ cl_mem buf_err = clCreateBuffer(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code, &err);
+ TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
+ err |= clSetKernelArg(__K, 18, sizeof(buf_err), &buf_err);
+ TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
#endif
#ifdef __cplusplus
@@ -227,9 +247,14 @@ void cblas_cgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
+ if(err_code != TICBLAS_SUCCESS) {
+ printf("Error code returned by offloaded cblas_cgemm is %d\n.", err_code);
+ }
+
ti_cblas_mem_free(msmc_ptr);
+ __free_ddr(ddr_ptr);
ti_cblas_delete_kernel(__K);
index f6d7f9b0f4bc663bc6945bcda65a5d291131e9c4..05fe4cfa38d6ea3c18874a5be894dc2632ef0b4c 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_cgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_cgemv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cgemv");
- __real_cblas_cgemv(order,TransA,M,N,alpha,A,lda,X,incX,beta,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cgemv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cgemv");
+ __real_cblas_cgemv(order,TransA,M,N,alpha,A,lda,X,incX,beta,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cgemv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cgemv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cgemv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CGEMV_IDX, "ocl_cblas_cgemv");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CGEMV_IDX, "ocl_cblas_cgemv");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -194,8 +194,8 @@ void cblas_cgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_cgemv");
index f8776e95f76acbee95d0bfdd7ba28277edece224..d16e435a1c22065338c6d2de95c0db1ee822aac5 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_cgerc(const enum CBLAS_ORDER order, const int M, const int N, const v
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_cgerc");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cgerc");
- __real_cblas_cgerc(order,M,N,alpha,X,incX,Y,incY,A,lda);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cgerc", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cgerc");
+ __real_cblas_cgerc(order,M,N,alpha,X,incX,Y,incY,A,lda);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cgerc", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cgerc");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cgerc");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CGERC_IDX, "ocl_cblas_cgerc");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CGERC_IDX, "ocl_cblas_cgerc");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -177,8 +177,8 @@ void cblas_cgerc(const enum CBLAS_ORDER order, const int M, const int N, const v
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_cgerc");
index 087b519c40b6d7f7ac6dd4f1af24e23d5dc20668..ebf8c199e75cd4246eedc8f70ac34187efc691e0 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_cgeru(const enum CBLAS_ORDER order, const int M, const int N, const v
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_cgeru");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cgeru");
- __real_cblas_cgeru(order,M,N,alpha,X,incX,Y,incY,A,lda);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cgeru", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cgeru");
+ __real_cblas_cgeru(order,M,N,alpha,X,incX,Y,incY,A,lda);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cgeru", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cgeru");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cgeru");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CGERU_IDX, "ocl_cblas_cgeru");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CGERU_IDX, "ocl_cblas_cgeru");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -177,9 +177,10 @@ void cblas_cgeru(const enum CBLAS_ORDER order, const int M, const int N, const v
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_cgeru");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_cgeru", (float) clock_diff);
return ;
index 8c264a868db44bee88fd041e28550001b25dc381..98ad84a78b75120d2ef3dbddf55405c452afc3fc 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_chbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_chbmv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_chbmv");
- __real_cblas_chbmv(order,Uplo,N,K,alpha,A,lda,X,incX,beta,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_chbmv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_chbmv");
+ __real_cblas_chbmv(order,Uplo,N,K,alpha,A,lda,X,incX,beta,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_chbmv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_chbmv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_chbmv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CHBMV_IDX, "ocl_cblas_chbmv");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CHBMV_IDX, "ocl_cblas_chbmv");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -194,9 +194,10 @@ void cblas_chbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_chbmv");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_chbmv", (float) clock_diff);
return ;
index d48091a5f5f2170178aba7278739b7f960354a54..f8348692443c0900bd911592622e6b64326ad181 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_chemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_chemm");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!chemm_offload_dsp(Order,Side,M,N)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_chemm");
- __real_cblas_chemm(Order,Side,Uplo,M,N,alpha,A,lda,B,ldb,beta,C,ldc);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_chemm", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!chemm_offload_dsp(Order,Side,M,N)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_chemm");
+ __real_cblas_chemm(Order,Side,Uplo,M,N,alpha,A,lda,B,ldb,beta,C,ldc);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_chemm", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_chemm");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_chemm");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CHEMM_IDX, "ocl_cblas_chemm");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CHEMM_IDX, "ocl_cblas_chemm");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -190,12 +190,11 @@ void cblas_chemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
#endif
void *msmc_ptr;
-
+ size_t msmc_size = MSMC_BUF_SIZE;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
-
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(13, buf_MSMC);
#else
@@ -207,9 +206,28 @@ void cblas_chemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
#endif
#ifdef __cplusplus
- __K->setArg(14, __local(L2_BUF_SIZE));
+ __K->setArg(14, msmc_size);
#else
- err |= clSetKernelArg(__K, 14, L2_BUF_SIZE, NULL);
+ err |= clSetKernelArg(__K, 14, sizeof(msmc_size), &msmc_size);
+#endif
+
+ void *ddr_ptr;
+ size_t ddr_size = DDR_BUF_SIZE;
+ ddr_ptr = __malloc_ddr(DDR_BUF_SIZE);
+ Buffer buf_DDR(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, DDR_BUF_SIZE, ddr_ptr);
+ __K->setArg(15, buf_DDR);
+ __K->setArg(16, ddr_size);
+
+ /* create a buffer argument to get the return error code from the DSP */
+ int err_code;
+#ifdef __cplusplus
+ Buffer buf_err(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code);
+ __K->setArg(17, buf_err);
+#else
+ cl_mem buf_err = clCreateBuffer(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code, &err);
+ TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
+ err |= clSetKernelArg(__K, 17, sizeof(buf_err), &buf_err);
+ TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
#endif
#ifdef __cplusplus
@@ -223,9 +241,14 @@ void cblas_chemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
+ if(err_code != TICBLAS_SUCCESS) {
+ printf("Error code returned by offloaded cblas_chemm is %d\n.", err_code);
+ }
+
ti_cblas_mem_free(msmc_ptr);
+ __free_ddr(ddr_ptr);
ti_cblas_delete_kernel(__K);
index f5a16d6bc462786122a657b43809c944dac7416b..b81a5ca893498ff1851356a8dc0588dee46b23fd 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_chemv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_chemv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_chemv");
- __real_cblas_chemv(order,Uplo,N,alpha,A,lda,X,incX,beta,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_chemv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_chemv");
+ __real_cblas_chemv(order,Uplo,N,alpha,A,lda,X,incX,beta,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_chemv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_chemv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_chemv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CHEMV_IDX, "ocl_cblas_chemv");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CHEMV_IDX, "ocl_cblas_chemv");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -188,8 +188,8 @@ void cblas_chemv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_chemv");
index f42e4bcbb6a156f1df643d0b0697c24cd6edaa5a..f0ea8143e8f473bd6e4efb3d7bbc585a0329057e 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_cher");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cher");
- __real_cblas_cher(order,Uplo,N,alpha,X,incX,A,lda);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cher", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cher");
+ __real_cblas_cher(order,Uplo,N,alpha,X,incX,A,lda);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cher", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cher");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cher");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CHER_IDX, "ocl_cblas_cher");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CHER_IDX, "ocl_cblas_cher");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -151,9 +151,10 @@ void cblas_cher(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_cher");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_cher", (float) clock_diff);
return ;
index 00eb4aa2a31e311d01caa36a4b262bd7434d77b7..12ba685746c159533dc326afb43a520f3641febe 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_cher2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_cher2");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cher2");
- __real_cblas_cher2(order,Uplo,N,alpha,X,incX,Y,incY,A,lda);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cher2", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cher2");
+ __real_cblas_cher2(order,Uplo,N,alpha,X,incX,Y,incY,A,lda);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cher2", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cher2");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cher2");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CHER2_IDX, "ocl_cblas_cher2");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CHER2_IDX, "ocl_cblas_cher2");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -177,9 +177,10 @@ void cblas_cher2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_cher2");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_cher2", (float) clock_diff);
return ;
index b36127567e6967788e3b0d0592ca3837236873a8..ac6e37f4e20db12efe3e74d170d9e7c791a2b4bd 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_cher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, cons
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_cher2k");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!cher2k_offload_dsp(Order,N,K)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cher2k");
- __real_cblas_cher2k(Order,Uplo,Trans,N,K,alpha,A,lda,B,ldb,beta,C,ldc);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cher2k", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!cher2k_offload_dsp(Order,N,K)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cher2k");
+ __real_cblas_cher2k(Order,Uplo,Trans,N,K,alpha,A,lda,B,ldb,beta,C,ldc);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cher2k", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cher2k");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cher2k");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CHER2K_IDX, "ocl_cblas_cher2k");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CHER2K_IDX, "ocl_cblas_cher2k");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -183,11 +183,13 @@ void cblas_cher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, cons
#else
err |= clSetKernelArg(__K, 12, sizeof(ldc), &ldc);
#endif
+
void *msmc_ptr;
+ size_t msmc_size = MSMC_BUF_SIZE;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(13, buf_MSMC);
#else
@@ -199,9 +201,28 @@ void cblas_cher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, cons
#endif
#ifdef __cplusplus
- __K->setArg(14, __local(L2_BUF_SIZE));
+ __K->setArg(14, msmc_size);
#else
- err |= clSetKernelArg(__K, 14, L2_BUF_SIZE, NULL);
+ err |= clSetKernelArg(__K, 14, sizeof(msmc_size), &msmc_size);
+#endif
+
+ void *ddr_ptr;
+ size_t ddr_size = DDR_BUF_SIZE;
+ ddr_ptr = __malloc_ddr(DDR_BUF_SIZE);
+ Buffer buf_DDR(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, DDR_BUF_SIZE, ddr_ptr);
+ __K->setArg(15, buf_DDR);
+ __K->setArg(16, ddr_size);
+
+ /* create a buffer argument to get the return error code from the DSP */
+ int err_code;
+#ifdef __cplusplus
+ Buffer buf_err(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code);
+ __K->setArg(17, buf_err);
+#else
+ cl_mem buf_err = clCreateBuffer(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code, &err);
+ TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
+ err |= clSetKernelArg(__K, 17, sizeof(buf_err), &buf_err);
+ TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
#endif
#ifdef __cplusplus
@@ -215,10 +236,15 @@ void cblas_cher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, cons
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+ if(err_code != TICBLAS_SUCCESS) {
+ printf("Error code returned by offloaded cblas_cher2k is %d\n.", err_code);
+ }
+
ti_cblas_mem_free(msmc_ptr);
+ __free_ddr(ddr_ptr);
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_cher2k");
index ed3f9e82c70bba64fd195c10e3bffd2faea7e484..bce49b1cd305abee7baa64eefc6f4b064cf26b26 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_cherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_cherk");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!cherk_offload_dsp(Order,N,K)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cherk");
- __real_cblas_cherk(Order,Uplo,Trans,N,K,alpha,A,lda,beta,C,ldc);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cherk", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!cherk_offload_dsp(Order,N,K)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cherk");
+ __real_cblas_cherk(Order,Uplo,Trans,N,K,alpha,A,lda,beta,C,ldc);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cherk", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cherk");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cherk");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CHERK_IDX, "ocl_cblas_cherk");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CHERK_IDX, "ocl_cblas_cherk");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -159,10 +159,11 @@ void cblas_cherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const
#endif
void *msmc_ptr;
+ size_t msmc_size = MSMC_BUF_SIZE;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(11, buf_MSMC);
#else
@@ -174,9 +175,28 @@ void cblas_cherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const
#endif
#ifdef __cplusplus
- __K->setArg(12, __local(L2_BUF_SIZE));
+ __K->setArg(12, msmc_size);
#else
- err |= clSetKernelArg(__K, 12, L2_BUF_SIZE, NULL);
+ err |= clSetKernelArg(__K, 12, sizeof(msmc_size), &msmc_size);
+#endif
+
+ void *ddr_ptr;
+ size_t ddr_size = DDR_BUF_SIZE;
+ ddr_ptr = __malloc_ddr(DDR_BUF_SIZE);
+ Buffer buf_DDR(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, DDR_BUF_SIZE, ddr_ptr);
+ __K->setArg(13, buf_DDR);
+ __K->setArg(14, ddr_size);
+
+ /* create a buffer argument to get the return error code from the DSP */
+ int err_code;
+#ifdef __cplusplus
+ Buffer buf_err(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code);
+ __K->setArg(15, buf_err);
+#else
+ cl_mem buf_err = clCreateBuffer(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code, &err);
+ TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
+ err |= clSetKernelArg(__K, 15, sizeof(buf_err), &buf_err);
+ TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
#endif
#ifdef __cplusplus
@@ -190,9 +210,14 @@ void cblas_cherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
+ if(err_code != TICBLAS_SUCCESS) {
+ printf("Error code returned by offloaded cblas_cherk is %d\n.", err_code);
+ }
+
ti_cblas_mem_free(msmc_ptr);
+ __free_ddr(ddr_ptr);
ti_cblas_delete_kernel(__K);
index 1440c48809a0f924c01cae29c1684b153ab4cc85..3de67e443f5714b7acf185940575112751242127 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_chpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_chpmv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_chpmv");
- __real_cblas_chpmv(order,Uplo,N,alpha,Ap,X,incX,beta,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_chpmv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_chpmv");
+ __real_cblas_chpmv(order,Uplo,N,alpha,Ap,X,incX,beta,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_chpmv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_chpmv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_chpmv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CHPMV_IDX, "ocl_cblas_chpmv");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CHPMV_IDX, "ocl_cblas_chpmv");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -182,9 +182,10 @@ void cblas_chpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_chpmv");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_chpmv", (float) clock_diff);
return ;
index 3209139569cc6768189af8591af8abaf5925f177..ec3155a58e59e5df7d40026b3d1ddece8e422b29 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_chpr");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_chpr");
- __real_cblas_chpr(order,Uplo,N,alpha,X,incX,A);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_chpr", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_chpr");
+ __real_cblas_chpr(order,Uplo,N,alpha,X,incX,A);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_chpr", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_chpr");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_chpr");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CHPR_IDX, "ocl_cblas_chpr");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CHPR_IDX, "ocl_cblas_chpr");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_chpr");
index 21111f2a03aaef1469068ad3c6da249f2db5a136..e64057cbf75f07fab9af75620d3e99c189f82c17 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_chpr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_chpr2");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_chpr2");
- __real_cblas_chpr2(order,Uplo,N,alpha,X,incX,Y,incY,Ap);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_chpr2", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_chpr2");
+ __real_cblas_chpr2(order,Uplo,N,alpha,X,incX,Y,incY,Ap);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_chpr2", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_chpr2");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_chpr2");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CHPR2_IDX, "ocl_cblas_chpr2");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CHPR2_IDX, "ocl_cblas_chpr2");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -171,8 +171,8 @@ void cblas_chpr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_chpr2");
index 1b09cfb517b53af7edbdcbf393e24e1ac9b9686b..190ec8f1cca21f5d78802f88b28ec476ebdc5533 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_crotg");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_crotg");
- __real_cblas_crotg(a,b,c,s);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_crotg", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_crotg");
+ __real_cblas_crotg(a,b,c,s);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_crotg", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_crotg");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_crotg");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CROTG_IDX, "ocl_cblas_crotg");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CROTG_IDX, "ocl_cblas_crotg");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
Buffer buf_a(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(float), (void *)a);
__K->setArg(0, buf_a);
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_crotg");
index 1c5e49ad4f03bf0d121637bb786e6bf7165c1d3e..4c930606f994d222ea6a9cbd085763678db343b0 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_cscal");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cscal");
- __real_cblas_cscal(N,alpha,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cscal", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cscal");
+ __real_cblas_cscal(N,alpha,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cscal", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cscal");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cscal");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CSCAL_IDX, "ocl_cblas_cscal");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CSCAL_IDX, "ocl_cblas_cscal");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_cscal");
index 1e28a64d0419d6bdeffa67add4ec526c94cae935..a548c62aca96cd7f9884405a665c336488c6c2b5 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_csscal");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_csscal");
- __real_cblas_csscal(N,alpha,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_csscal", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_csscal");
+ __real_cblas_csscal(N,alpha,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_csscal", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_csscal");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_csscal");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CSSCAL_IDX, "ocl_cblas_csscal");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CSSCAL_IDX, "ocl_cblas_csscal");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_csscal");
index 9c2f0dd4275fe1d14e7c3bee794855ef9eaaed7b..e6d206a42c89e5be55b324d02633db0abc9cae19 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
/* Do an init on first use */
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_cswap");
+
/* OpenCL cannot deal with overlapping memory regions. This is an issue when you
* are trying to swap two rows of a matrix, where the matrix is column major. Hence,
* the offload of this routine to the DSP is disabled.
*/
#ifndef TI_CBLAS_SWAP_ENABLE_OFFLOAD
TI_CBLAS_PROFILE_START();
-
TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cswap");
- __real_cblas_cswap(N,X,incX,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cswap", (float) clock_diff);
+ __real_cblas_cswap(N,X,incX,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cswap", (float) clock_diff);
return ;
#else
- TI_CBLAS_PROFILE_START();
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cswap");
- __real_cblas_cswap(N,X,incX,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cswap", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
-
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cswap");
+ __real_cblas_cswap(N,X,incX,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cswap", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cswap");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cswap");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CSWAP_IDX, "ocl_cblas_cswap");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CSWAP_IDX, "ocl_cblas_cswap");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
@@ -144,9 +144,10 @@ void cblas_cswap(const int N, void *X, const int incX, void *Y, const int incY)
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_cswap");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_cswap", (float) clock_diff);
return ;
index f48c129c92044f8c79b3a22df0ab74818ca96f22..14785a23e6d2ac1ab59e6db6ff1f785ecc5a7eee 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_csymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_csymm");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!csymm_offload_dsp(Order,Side,M,N)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_csymm");
- __real_cblas_csymm(Order,Side,Uplo,M,N,alpha,A,lda,B,ldb,beta,C,ldc);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_csymm", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!csymm_offload_dsp(Order,Side,M,N)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_csymm");
+ __real_cblas_csymm(Order,Side,Uplo,M,N,alpha,A,lda,B,ldb,beta,C,ldc);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_csymm", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_csymm");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_csymm");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CSYMM_IDX, "ocl_cblas_csymm");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CSYMM_IDX, "ocl_cblas_csymm");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -188,11 +188,13 @@ void cblas_csymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
#else
err |= clSetKernelArg(__K, 12, sizeof(ldc), &ldc);
#endif
+
void *msmc_ptr;
+ size_t msmc_size = MSMC_BUF_SIZE;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(13, buf_MSMC);
#else
@@ -204,9 +206,28 @@ void cblas_csymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
#endif
#ifdef __cplusplus
- __K->setArg(14, __local(L2_BUF_SIZE));
+ __K->setArg(14, msmc_size);
#else
- err |= clSetKernelArg(__K, 14, L2_BUF_SIZE, NULL);
+ err |= clSetKernelArg(__K, 14, sizeof(msmc_size), &msmc_size);
+#endif
+
+ void *ddr_ptr;
+ size_t ddr_size = DDR_BUF_SIZE;
+ ddr_ptr = __malloc_ddr(DDR_BUF_SIZE);
+ Buffer buf_DDR(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, DDR_BUF_SIZE, ddr_ptr);
+ __K->setArg(15, buf_DDR);
+ __K->setArg(16, ddr_size);
+
+ /* Kernel argument 17: host-backed int buffer (CL_MEM_USE_HOST_PTR on err_code) through which the DSP returns its error code */
+ int err_code;
+#ifdef __cplusplus
+ Buffer buf_err(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code);
+ __K->setArg(17, buf_err);
+#else /* plain C OpenCL API: pass the context handle itself, as the library's other C-path clCreateBuffer calls do */
+ cl_mem buf_err = clCreateBuffer(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code, &err);
+ TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
+ err |= clSetKernelArg(__K, 17, sizeof(buf_err), &buf_err);
+ TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
#endif
#ifdef __cplusplus
@@ -220,9 +241,14 @@ void cblas_csymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
+ if(err_code != TICBLAS_SUCCESS) { /* report a non-success code left in err_code after the offloaded run */
+ printf("Error code returned by offloaded cblas_csymm is %d.\n", err_code);
+ }
+
ti_cblas_mem_free(msmc_ptr);
+ __free_ddr(ddr_ptr);
ti_cblas_delete_kernel(__K);
index 9bec336cff81b3d30ec460d49b44cdc064c36781..2e26948440d0a54513459e29a6e9949b5b1b7205 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_csyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, cons
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_csyr2k");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!csyr2k_offload_dsp(Order,N,K)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_csyr2k");
- __real_cblas_csyr2k(Order,Uplo,Trans,N,K,alpha,A,lda,B,ldb,beta,C,ldc);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_csyr2k", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!csyr2k_offload_dsp(Order,N,K)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_csyr2k");
+ __real_cblas_csyr2k(Order,Uplo,Trans,N,K,alpha,A,lda,B,ldb,beta,C,ldc);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_csyr2k", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_csyr2k");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_csyr2k");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CSYR2K_IDX, "ocl_cblas_csyr2k");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CSYR2K_IDX, "ocl_cblas_csyr2k");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -188,11 +188,13 @@ void cblas_csyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, cons
#else
err |= clSetKernelArg(__K, 12, sizeof(ldc), &ldc);
#endif
+
void *msmc_ptr;
+ size_t msmc_size = MSMC_BUF_SIZE;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(13, buf_MSMC);
#else
@@ -204,9 +206,28 @@ void cblas_csyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, cons
#endif
#ifdef __cplusplus
- __K->setArg(14, __local(L2_BUF_SIZE));
+ __K->setArg(14, msmc_size);
#else
- err |= clSetKernelArg(__K, 14, L2_BUF_SIZE, NULL);
+ err |= clSetKernelArg(__K, 14, sizeof(msmc_size), &msmc_size);
+#endif
+
+ void *ddr_ptr;
+ size_t ddr_size = DDR_BUF_SIZE;
+ ddr_ptr = __malloc_ddr(DDR_BUF_SIZE);
+ Buffer buf_DDR(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, DDR_BUF_SIZE, ddr_ptr);
+ __K->setArg(15, buf_DDR);
+ __K->setArg(16, ddr_size);
+
+ /* Kernel argument 17: host-backed int buffer (CL_MEM_USE_HOST_PTR on err_code) through which the DSP returns its error code */
+ int err_code;
+#ifdef __cplusplus
+ Buffer buf_err(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code);
+ __K->setArg(17, buf_err);
+#else /* plain C OpenCL API: pass the context handle itself, as the library's other C-path clCreateBuffer calls do */
+ cl_mem buf_err = clCreateBuffer(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code, &err);
+ TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
+ err |= clSetKernelArg(__K, 17, sizeof(buf_err), &buf_err);
+ TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
#endif
#ifdef __cplusplus
@@ -220,9 +241,14 @@ void cblas_csyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, cons
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
+ if(err_code != TICBLAS_SUCCESS) { /* report a non-success code left in err_code after the offloaded run */
+ printf("Error code returned by offloaded cblas_csyr2k is %d.\n", err_code);
+ }
+
ti_cblas_mem_free(msmc_ptr);
+ __free_ddr(ddr_ptr);
ti_cblas_delete_kernel(__K);
index dafffb44c2c873cbd958fa8c1fc1c55652f17ce1..c76555749e462a2415c7b12e9c1b6a08fced6c7b 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_csyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_csyrk");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!csyrk_offload_dsp(Order,N,K)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_csyrk");
- __real_cblas_csyrk(Order,Uplo,Trans,N,K,alpha,A,lda,beta,C,ldc);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_csyrk", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!csyrk_offload_dsp(Order,N,K)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_csyrk");
+ __real_cblas_csyrk(Order,Uplo,Trans,N,K,alpha,A,lda,beta,C,ldc);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_csyrk", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_csyrk");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_csyrk");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CSYRK_IDX, "ocl_cblas_csyrk");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CSYRK_IDX, "ocl_cblas_csyrk");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -167,11 +167,13 @@ void cblas_csyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const
#else
err |= clSetKernelArg(__K, 10, sizeof(ldc), &ldc);
#endif
+
void *msmc_ptr;
+ size_t msmc_size = MSMC_BUF_SIZE;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(11, buf_MSMC);
#else
@@ -183,9 +185,28 @@ void cblas_csyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const
#endif
#ifdef __cplusplus
- __K->setArg(12, __local(L2_BUF_SIZE));
+ __K->setArg(12, msmc_size);
#else
- err |= clSetKernelArg(__K, 12, L2_BUF_SIZE, NULL);
+ err |= clSetKernelArg(__K, 12, sizeof(msmc_size), &msmc_size);
+#endif
+
+ void *ddr_ptr;
+ size_t ddr_size = DDR_BUF_SIZE;
+ ddr_ptr = __malloc_ddr(DDR_BUF_SIZE);
+ Buffer buf_DDR(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, DDR_BUF_SIZE, ddr_ptr);
+ __K->setArg(13, buf_DDR);
+ __K->setArg(14, ddr_size);
+
+ /* Kernel argument 15: host-backed int buffer (CL_MEM_USE_HOST_PTR on err_code) through which the DSP returns its error code */
+ int err_code;
+#ifdef __cplusplus
+ Buffer buf_err(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code);
+ __K->setArg(15, buf_err);
+#else /* plain C OpenCL API: pass the context handle itself, as the library's other C-path clCreateBuffer calls do */
+ cl_mem buf_err = clCreateBuffer(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code, &err);
+ TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
+ err |= clSetKernelArg(__K, 15, sizeof(buf_err), &buf_err);
+ TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
#endif
#ifdef __cplusplus
@@ -199,9 +220,14 @@ void cblas_csyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
+ if(err_code != TICBLAS_SUCCESS) { /* report a non-success code left in err_code after the offloaded run */
+ printf("Error code returned by offloaded cblas_csyrk is %d.\n", err_code);
+ }
+
ti_cblas_mem_free(msmc_ptr);
+ __free_ddr(ddr_ptr);
ti_cblas_delete_kernel(__K);
index 6ae8f35c60c70c22f811ec6cc1c9b1e0627eefbd..467cf853e1394d09354185b127fd28a83c0ad82b 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_ctbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_ctbmv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ctbmv");
- __real_cblas_ctbmv(order,Uplo,TransA,Diag,N,K,A,lda,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ctbmv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ctbmv");
+ __real_cblas_ctbmv(order,Uplo,TransA,Diag,N,K,A,lda,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ctbmv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ctbmv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ctbmv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CTBMV_IDX, "ocl_cblas_ctbmv");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CTBMV_IDX, "ocl_cblas_ctbmv");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -163,9 +163,10 @@ void cblas_ctbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_ctbmv");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_ctbmv", (float) clock_diff);
return ;
index 831f84e648b6353ff3d2f2d60b913482366cb221..9c9c9a92552ba3d01c88a3a187b6ac3c14d1ea6a 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_ctbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_ctbsv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ctbsv");
- __real_cblas_ctbsv(order,Uplo,TransA,Diag,N,K,A,lda,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ctbsv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ctbsv");
+ __real_cblas_ctbsv(order,Uplo,TransA,Diag,N,K,A,lda,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ctbsv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ctbsv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ctbsv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CTBSV_IDX, "ocl_cblas_ctbsv");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CTBSV_IDX, "ocl_cblas_ctbsv");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -163,9 +163,10 @@ void cblas_ctbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_ctbsv");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_ctbsv", (float) clock_diff);
return ;
index 0dc6106a5282b0108921a40d93498af4497449e4..251ce80a821841bc1125932513e2330a2abe5f18 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_ctpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_ctpmv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ctpmv");
- __real_cblas_ctpmv(order,Uplo,TransA,Diag,N,Ap,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ctpmv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ctpmv");
+ __real_cblas_ctpmv(order,Uplo,TransA,Diag,N,Ap,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ctpmv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ctpmv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ctpmv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CTPMV_IDX, "ocl_cblas_ctpmv");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CTPMV_IDX, "ocl_cblas_ctpmv");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -111,7 +111,7 @@ void cblas_ctpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
size_bufAp = MAX(size_bufAp,1);
#ifdef __cplusplus
- Buffer buf_Ap(*ti_cblas_ocl_context, CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR, size_bufAp , (void *)Ap);
+ Buffer buf_Ap(*ti_cblas_ocl_context, CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR, size_bufAp, (void *)Ap);
__K->setArg(5, buf_Ap);
#else
cl_mem buf_Ap = clCreateBuffer(ti_cblas_ocl_context, CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR, size_bufAp, (void *)Ap, &err);
@@ -151,9 +151,10 @@ void cblas_ctpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_ctpmv");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_ctpmv", (float) clock_diff);
return ;
index cef43540830e87ae26253da1edb806887c840623..42b6a0bed910d142f0f870fa2ca2d1572e9f9f93 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_ctpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_ctpsv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ctpsv");
- __real_cblas_ctpsv(order,Uplo,TransA,Diag,N,Ap,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ctpsv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ctpsv");
+ __real_cblas_ctpsv(order,Uplo,TransA,Diag,N,Ap,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ctpsv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ctpsv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ctpsv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CTPSV_IDX, "ocl_cblas_ctpsv");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CTPSV_IDX, "ocl_cblas_ctpsv");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -151,9 +151,10 @@ void cblas_ctpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_ctpsv");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_ctpsv", (float) clock_diff);
return ;
index dcb56943859e8b2c347f773bf3d29222212c0b95..50d3607515a703784a85b67db26f5228ba228480 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_ctrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_ctrmm");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!ctrmm_offload_dsp(Order,Side,M,N)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ctrmm");
- __real_cblas_ctrmm(Order,Side,Uplo,TransA,Diag,M,N,alpha,A,lda,B,ldb);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ctrmm", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!ctrmm_offload_dsp(Order,Side,M,N)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ctrmm");
+ __real_cblas_ctrmm(Order,Side,Uplo,TransA,Diag,M,N,alpha,A,lda,B,ldb);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ctrmm", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ctrmm");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ctrmm");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CTRMM_IDX, "ocl_cblas_ctrmm");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CTRMM_IDX, "ocl_cblas_ctrmm");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -168,11 +168,13 @@ void cblas_ctrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
#else
err |= clSetKernelArg(__K, 11, sizeof(ldb), &ldb);
#endif
+
void *msmc_ptr;
+ size_t msmc_size = MSMC_BUF_SIZE;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(12, buf_MSMC);
#else
@@ -184,9 +186,28 @@ void cblas_ctrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
#endif
#ifdef __cplusplus
- __K->setArg(13, __local(L2_BUF_SIZE));
+ __K->setArg(13, msmc_size);
#else
- err |= clSetKernelArg(__K, 13, L2_BUF_SIZE, NULL);
+ err |= clSetKernelArg(__K, 13, sizeof(msmc_size), &msmc_size);
+#endif
+
+ void *ddr_ptr;
+ size_t ddr_size = DDR_BUF_SIZE;
+ ddr_ptr = __malloc_ddr(DDR_BUF_SIZE);
+ Buffer buf_DDR(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, DDR_BUF_SIZE, ddr_ptr);
+ __K->setArg(14, buf_DDR);
+ __K->setArg(15, ddr_size);
+
+ /* Kernel argument 16: host-backed int buffer (CL_MEM_USE_HOST_PTR on err_code) through which the DSP returns its error code */
+ int err_code;
+#ifdef __cplusplus
+ Buffer buf_err(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code);
+ __K->setArg(16, buf_err);
+#else /* plain C OpenCL API: pass the context handle itself, as the library's other C-path clCreateBuffer calls do */
+ cl_mem buf_err = clCreateBuffer(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code, &err);
+ TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
+ err |= clSetKernelArg(__K, 16, sizeof(buf_err), &buf_err);
+ TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
#endif
#ifdef __cplusplus
@@ -200,9 +221,14 @@ void cblas_ctrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
+ if(err_code != TICBLAS_SUCCESS) { /* report a non-success code left in err_code after the offloaded run */
+ printf("Error code returned by offloaded cblas_ctrmm is %d.\n", err_code);
+ }
+
ti_cblas_mem_free(msmc_ptr);
+ __free_ddr(ddr_ptr);
ti_cblas_delete_kernel(__K);
index ee423018f4fcbe362257aab0f4a883c8e91b2e9e..c501935c1d3d91f0c2199a1c8cfb2abb66576f86 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_ctrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_ctrmv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ctrmv");
- __real_cblas_ctrmv(order,Uplo,TransA,Diag,N,A,lda,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ctrmv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ctrmv");
+ __real_cblas_ctrmv(order,Uplo,TransA,Diag,N,A,lda,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ctrmv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ctrmv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ctrmv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CTRMV_IDX, "ocl_cblas_ctrmv");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CTRMV_IDX, "ocl_cblas_ctrmv");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -157,9 +157,10 @@ void cblas_ctrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_ctrmv");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_ctrmv", (float) clock_diff);
return ;
index c9f4716dcac1b5d1d75312660d6b7edff8da92b7..c03d8a0acb7997d19aaf56c2f5d31703f80bb753 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_ctrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_ctrsm");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!ctrsm_offload_dsp(Order,Side,M,N)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ctrsm");
- __real_cblas_ctrsm(Order,Side,Uplo,TransA,Diag,M,N,alpha,A,lda,B,ldb);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ctrsm", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!ctrsm_offload_dsp(Order,Side,M,N)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ctrsm");
+ __real_cblas_ctrsm(Order,Side,Uplo,TransA,Diag,M,N,alpha,A,lda,B,ldb);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ctrsm", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ctrsm");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ctrsm");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CTRSM_IDX, "ocl_cblas_ctrsm");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CTRSM_IDX, "ocl_cblas_ctrsm");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -168,11 +168,13 @@ void cblas_ctrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
#else
err |= clSetKernelArg(__K, 11, sizeof(ldb), &ldb);
#endif
+
void *msmc_ptr;
+ size_t msmc_size = MSMC_BUF_SIZE;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(12, buf_MSMC);
#else
@@ -184,9 +186,28 @@ void cblas_ctrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
#endif
#ifdef __cplusplus
- __K->setArg(13, __local(L2_BUF_SIZE));
+ __K->setArg(13, msmc_size);
#else
- err |= clSetKernelArg(__K, 13, L2_BUF_SIZE, NULL);
+ err |= clSetKernelArg(__K, 13, sizeof(msmc_size), &msmc_size);
+#endif
+
+ void *ddr_ptr;
+ size_t ddr_size = DDR_BUF_SIZE;
+ ddr_ptr = __malloc_ddr(DDR_BUF_SIZE);
+ Buffer buf_DDR(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, DDR_BUF_SIZE, ddr_ptr);
+ __K->setArg(14, buf_DDR);
+ __K->setArg(15, ddr_size);
+
+ /* Kernel argument 16: host-backed int buffer (CL_MEM_USE_HOST_PTR on err_code) through which the DSP returns its error code */
+ int err_code;
+#ifdef __cplusplus
+ Buffer buf_err(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code);
+ __K->setArg(16, buf_err);
+#else /* plain C OpenCL API: pass the context handle itself, as the library's other C-path clCreateBuffer calls do */
+ cl_mem buf_err = clCreateBuffer(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code, &err);
+ TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
+ err |= clSetKernelArg(__K, 16, sizeof(buf_err), &buf_err);
+ TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
#endif
#ifdef __cplusplus
@@ -200,9 +221,15 @@ void cblas_ctrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
+ if(err_code != TICBLAS_SUCCESS) { /* report a non-success code left in err_code after the offloaded run */
+ printf("Error code returned by offloaded cblas_ctrsm is %d.\n", err_code);
+ }
+
ti_cblas_mem_free(msmc_ptr);
+ __free_ddr(ddr_ptr);
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_ctrsm");
index d50ab5aae83d988bba853e5df9c93b8c2a7d56b1..39c3ee4f3b6c4d28c4d6f467fea3cf0ce70d45b1 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_ctrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_ctrsv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ctrsv");
- __real_cblas_ctrsv(order,Uplo,TransA,Diag,N,A,lda,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ctrsv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ctrsv");
+ __real_cblas_ctrsv(order,Uplo,TransA,Diag,N,A,lda,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ctrsv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ctrsv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ctrsv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CTRSV_IDX, "ocl_cblas_ctrsv");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CTRSV_IDX, "ocl_cblas_ctrsv");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -157,9 +157,10 @@ void cblas_ctrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_ctrsv");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_ctrsv", (float) clock_diff);
return ;
index ae850f7feb719c3f95fdb48e8cf65b1eb6fcc2ae..60527bd78fd82c1773f9bac6fb77abe3c4f2ed72 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dasum");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dasum");
- double rval = __real_cblas_dasum(N,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dasum", (float) clock_diff);
- return rval;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dasum");
+ double rval = __real_cblas_dasum(N,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dasum", (float) clock_diff);
+ return rval;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dasum");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dasum");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DASUM_IDX, "ocl_cblas_dasum");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DASUM_IDX, "ocl_cblas_dasum");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
Buffer buf_retval(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(double), &retval);
__K->setArg(3, buf_retval);
#else
- cl_mem buf_retval = clCreateBuffer(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(double), &retval, &err);
+ cl_mem buf_retval = clCreateBuffer(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(double), &retval, &err);
TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
err |= clSetKernelArg(__K, 3, sizeof(buf_retval), &buf_retval);
TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dasum");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_dasum", (float) clock_diff);
return retval;
index 7ce6a9956e7d98f77879f56fc7670b6a58a3eef1..b673a31464acfc370cf5b15290ee488eb943a252 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_daxpy(const int N, const double alpha, const double *X, const int inc
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_daxpy");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_daxpy");
- __real_cblas_daxpy(N,alpha,X,incX,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_daxpy", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_daxpy");
+ __real_cblas_daxpy(N,alpha,X,incX,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_daxpy", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_daxpy");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_daxpy");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DAXPY_IDX, "ocl_cblas_daxpy");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DAXPY_IDX, "ocl_cblas_daxpy");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
@@ -139,9 +139,10 @@ void cblas_daxpy(const int N, const double alpha, const double *X, const int inc
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_daxpy");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_daxpy", (float) clock_diff);
return ;
index 29ccdcc34f1a2e58ae1208768289c6e2775c2872..fa8877c17f807fc69412fa8eee340c4467c86171 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dcopy");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dcopy");
- __real_cblas_dcopy(N,X,incX,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dcopy", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dcopy");
+ __real_cblas_dcopy(N,X,incX,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dcopy", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dcopy");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dcopy");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DCOPY_IDX, "ocl_cblas_dcopy");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DCOPY_IDX, "ocl_cblas_dcopy");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dcopy");
index 0c1e9ac51a14d2d0febe8bc4ac890a66304e211d..de913e558c46dab97c2865f864490e5eb80db47c 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ double cblas_ddot(const int N, const double *X, const int incX, const double *Y,
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_ddot");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ddot");
- double rval = __real_cblas_ddot(N,X,incX,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ddot", (float) clock_diff);
- return rval;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ddot");
+ double rval = __real_cblas_ddot(N,X,incX,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ddot", (float) clock_diff);
+ return rval;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ddot");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ddot");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DDOT_IDX, "ocl_cblas_ddot");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DDOT_IDX, "ocl_cblas_ddot");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
@@ -129,7 +129,7 @@ double cblas_ddot(const int N, const double *X, const int incX, const double *Y,
Buffer buf_retval(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(double), &retval);
__K->setArg(5, buf_retval);
#else
- cl_mem buf_retval = clCreateBuffer(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(double), &retval, &err);
+ cl_mem buf_retval = clCreateBuffer(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(double), &retval, &err);
TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
err |= clSetKernelArg(__K, 5, sizeof(buf_retval), &buf_retval);
TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
@@ -146,8 +146,8 @@ double cblas_ddot(const int N, const double *X, const int incX, const double *Y,
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_ddot");
index 7eda3f0a00b1d0f9e50ecf9ebb70e597191e8f57..4eb8ca18ff0bb15ef5f4bcb502a08e9987d5a9ba 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_dgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dgbmv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dgbmv");
- __real_cblas_dgbmv(order,TransA,M,N,KL,KU,alpha,A,lda,X,incX,beta,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dgbmv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dgbmv");
+ __real_cblas_dgbmv(order,TransA,M,N,KL,KU,alpha,A,lda,X,incX,beta,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dgbmv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dgbmv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dgbmv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DGBMV_IDX, "ocl_cblas_dgbmv");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DGBMV_IDX, "ocl_cblas_dgbmv");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -196,8 +196,8 @@ void cblas_dgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dgbmv");
index 7ce8311b9f724f8547abdf54073cb583beb5beb5..4cfa7e56da6ae7414a4d2f43e454e62849416228 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,31 +45,29 @@ void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dgemm");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!dgemm_offload_dsp(Order,M,N,K)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dgemm");
- __real_cblas_dgemm(Order,TransA,TransB,M,N,K,alpha,A,lda,B,ldb,beta,C,ldc);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dgemm", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!dgemm_offload_dsp(Order,M,N,K)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dgemm");
+ __real_cblas_dgemm(Order,TransA,TransB,M,N,K,alpha,A,lda,B,ldb,beta,C,ldc);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dgemm", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dgemm");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dgemm");
/* Lookup kernel pointer from global table */
- void *msmc_ptr;
-
#ifdef __cplusplus
Event e;
Kernel* __K;
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DGEMM_IDX, "ocl_cblas_dgemm");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DGEMM_IDX, "ocl_cblas_dgemm");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -186,6 +184,9 @@ void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
#else
err |= clSetKernelArg(__K, 13, sizeof(ldc), &ldc);
#endif
+
+ void *msmc_ptr;
+ size_t msmc_size = MSMC_BUF_SIZE;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
@@ -193,17 +194,36 @@ void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
__K->setArg(14, buf_MSMC);
#else
- cl_mem buf_MSMC = clCreateBuffer(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr, &err);
//cl_mem buf_MSMC = clCreateBuffer(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE, NULL, &err);
+ cl_mem buf_MSMC = clCreateBuffer(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr, &err);
TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
err |= clSetKernelArg(__K, 14, sizeof(buf_MSMC), &buf_MSMC);
TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
#endif
#ifdef __cplusplus
- __K->setArg(15, __local(L2_BUF_SIZE));
+ __K->setArg(15, msmc_size);
#else
- err |= clSetKernelArg(__K, 15, L2_BUF_SIZE, NULL);
+ err |= clSetKernelArg(__K, 15, sizeof(msmc_size), &msmc_size);
+#endif
+
+ void *ddr_ptr;
+ size_t ddr_size = DDR_BUF_SIZE;
+ ddr_ptr = __malloc_ddr(DDR_BUF_SIZE);
+ Buffer buf_DDR(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, DDR_BUF_SIZE, ddr_ptr);
+ __K->setArg(16, buf_DDR);
+ __K->setArg(17, ddr_size);
+
+ /* create a buffer argument to get the return error code from the DSP */
+ int err_code;
+#ifdef __cplusplus
+ Buffer buf_err(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code);
+ __K->setArg(18, buf_err);
+#else
+ cl_mem buf_err = clCreateBuffer(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code, &err);
+ TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
+ err |= clSetKernelArg(__K, 18, sizeof(buf_err), &buf_err);
+ TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
#endif
#ifdef __cplusplus
@@ -217,9 +237,15 @@ void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
+ /* Report a non-success status that the DSP kernel wrote back through buf_err. */
+ if(err_code != TICBLAS_SUCCESS) {
+ printf("Error code returned by offloaded cblas_dgemm is %d.\n", err_code);
+ }
+
ti_cblas_mem_free(msmc_ptr);
+ __free_ddr(ddr_ptr);
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dgemm");
index 617badbb63ab79e8bb4907dbbb12cf2d32123f32..c624f6cca3a40c9c5fa0597d910ce015da291c80 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_dgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dgemv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dgemv");
- __real_cblas_dgemv(order,TransA,M,N,alpha,A,lda,X,incX,beta,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dgemv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dgemv");
+ __real_cblas_dgemv(order,TransA,M,N,alpha,A,lda,X,incX,beta,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dgemv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dgemv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dgemv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DGEMV_IDX, "ocl_cblas_dgemv");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DGEMV_IDX, "ocl_cblas_dgemv");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -184,9 +184,10 @@ void cblas_dgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dgemv");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_dgemv", (float) clock_diff);
return ;
index b49bc04452479e9f4e6ce137d722e3eb55aba704..0492d0aa90393e9e91e4c8f47eed80e36a021a9d 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_dger(const enum CBLAS_ORDER order, const int M, const int N, const do
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dger");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dger");
- __real_cblas_dger(order,M,N,alpha,X,incX,Y,incY,A,lda);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dger", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dger");
+ __real_cblas_dger(order,M,N,alpha,X,incX,Y,incY,A,lda);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dger", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dger");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dger");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DGER_IDX, "ocl_cblas_dger");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DGER_IDX, "ocl_cblas_dger");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -172,8 +172,8 @@ void cblas_dger(const enum CBLAS_ORDER order, const int M, const int N, const do
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dger");
index cd816df0f8f1c29b46020347d55dba5ff24a4392..bc200fa6b73a8483437b8be129743f34f50a0fe6 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dnrm2");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dnrm2");
- double rval = __real_cblas_dnrm2(N,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dnrm2", (float) clock_diff);
- return rval;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dnrm2");
+ double rval = __real_cblas_dnrm2(N,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dnrm2", (float) clock_diff);
+ return rval;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dnrm2");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dnrm2");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DNRM2_IDX, "ocl_cblas_dnrm2");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DNRM2_IDX, "ocl_cblas_dnrm2");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
Buffer buf_retval(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(double), &retval);
__K->setArg(3, buf_retval);
#else
- cl_mem buf_retval = clCreateBuffer(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(double), &retval, &err);
+ cl_mem buf_retval = clCreateBuffer(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(double), &retval, &err);
TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
err |= clSetKernelArg(__K, 3, sizeof(buf_retval), &buf_retval);
TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dnrm2");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_dnrm2", (float) clock_diff);
return retval;
index 3c1afc0173c6e6e6deba420caccfcdac91e3775c..86bacba7ff9f9a8ccbc3e38c331251f153d8d0fe 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_drot(const int N, double *X, const int incX, double *Y, const int inc
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_drot");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_drot");
- __real_cblas_drot(N,X,incX,Y,incY,c,s);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_drot", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_drot");
+ __real_cblas_drot(N,X,incX,Y,incY,c,s);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_drot", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_drot");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_drot");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DROT_IDX, "ocl_cblas_drot");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DROT_IDX, "ocl_cblas_drot");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
@@ -144,8 +144,8 @@ void cblas_drot(const int N, double *X, const int incX, double *Y, const int inc
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_drot");
index a2b5b5ea9771fa02a9a18808865c2e8230d5ba30..c535574abfa0498693584096376268f81f14520e 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_drotg");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_drotg");
- __real_cblas_drotg(a,b,c,s);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_drotg", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_drotg");
+ __real_cblas_drotg(a,b,c,s);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_drotg", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_drotg");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_drotg");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DROTG_IDX, "ocl_cblas_drotg");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DROTG_IDX, "ocl_cblas_drotg");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
Buffer buf_a(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(double), (void *)a);
__K->setArg(0, buf_a);
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_drotg");
index 4fc4d24769439ccaf410e4e8431c5ce8cefa15ee..0cb941fd2abe30879be6c2c59814e806e1a3d741 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_drotm(const int N, double *X, const int incX, double *Y, const int in
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_drotm");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_drotm");
- __real_cblas_drotm(N,X,incX,Y,incY,P);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_drotm", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_drotm");
+ __real_cblas_drotm(N,X,incX,Y,incY,P);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_drotm", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_drotm");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_drotm");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DROTM_IDX, "ocl_cblas_drotm");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DROTM_IDX, "ocl_cblas_drotm");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
@@ -143,8 +143,8 @@ void cblas_drotm(const int N, double *X, const int incX, double *Y, const int in
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_drotm");
index 0be15c2cb6cae87c02dd97ff55708d9cc4a40358..0cf5e8fa10c592ad895766bd4fdd07af8f603af5 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_drotmg(double *d1, double *d2, double *b1, const double b2, double *P
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_drotmg");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_drotmg");
- __real_cblas_drotmg(d1,d2,b1,b2,P);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_drotmg", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_drotmg");
+ __real_cblas_drotmg(d1,d2,b1,b2,P);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_drotmg", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_drotmg");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_drotmg");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DROTMG_IDX, "ocl_cblas_drotmg");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DROTMG_IDX, "ocl_cblas_drotmg");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
Buffer buf_d1(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(double), (void *)d1);
__K->setArg(0, buf_d1);
@@ -132,9 +132,10 @@ void cblas_drotmg(double *d1, double *d2, double *b1, const double b2, double *P
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_drotmg");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_drotmg", (float) clock_diff);
return ;
index e1e77c5f080861a372932d7ad159fd7783c2fc04..920b40361577f965bfc38fb63842a17fa4d7a31c 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_dsbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dsbmv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dsbmv");
- __real_cblas_dsbmv(order,Uplo,N,K,alpha,A,lda,X,incX,beta,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dsbmv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dsbmv");
+ __real_cblas_dsbmv(order,Uplo,N,K,alpha,A,lda,X,incX,beta,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dsbmv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dsbmv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dsbmv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DSBMV_IDX, "ocl_cblas_dsbmv");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DSBMV_IDX, "ocl_cblas_dsbmv");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -184,9 +184,10 @@ void cblas_dsbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dsbmv");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_dsbmv", (float) clock_diff);
return ;
index d085915522bccc92b120795b3bd517485faeed8b..d75049f65afa31e77da37c9623bcde8b2a2fb6fb 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dscal");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dscal");
- __real_cblas_dscal(N,alpha,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dscal", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dscal");
+ __real_cblas_dscal(N,alpha,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dscal", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dscal");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dscal");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DSCAL_IDX, "ocl_cblas_dscal");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DSCAL_IDX, "ocl_cblas_dscal");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dscal");
index 9dd52b969ccc17dd90951f9e313e839b720f5995..c976fefc2d1db46dff46a00058ce01f1755065c1 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dsdot");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dsdot");
- double rval = __real_cblas_dsdot(N,X,incX,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dsdot", (float) clock_diff);
- return rval;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dsdot");
+ double rval = __real_cblas_dsdot(N,X,incX,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dsdot", (float) clock_diff);
+ return rval;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dsdot");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dsdot");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DSDOT_IDX, "ocl_cblas_dsdot");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DSDOT_IDX, "ocl_cblas_dsdot");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
Buffer buf_retval(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(double), &retval);
__K->setArg(5, buf_retval);
#else
- cl_mem buf_retval = clCreateBuffer(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(double), &retval, &err);
+ cl_mem buf_retval = clCreateBuffer(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(double), &retval, &err);
TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
err |= clSetKernelArg(__K, 5, sizeof(buf_retval), &buf_retval);
TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
@@ -146,9 +146,10 @@ double cblas_dsdot(const int N, const float *X, const int incX, const float *Y,
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dsdot");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_dsdot", (float) clock_diff);
return retval;
index 36f91b96683f8386b8a0fa9163c0837b6de90c03..48a0e452f080642d37d9fd16dd0e9f567247bdb6 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_dspmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dspmv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dspmv");
- __real_cblas_dspmv(order,Uplo,N,alpha,Ap,X,incX,beta,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dspmv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dspmv");
+ __real_cblas_dspmv(order,Uplo,N,alpha,Ap,X,incX,beta,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dspmv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dspmv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dspmv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DSPMV_IDX, "ocl_cblas_dspmv");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DSPMV_IDX, "ocl_cblas_dspmv");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -172,8 +172,8 @@ void cblas_dspmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dspmv");
index 06ef54e2ef99d709bde5493fde78b7e548a520ce..93c94843f2675e378546df796d7ff164c7ff11cd 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dspr");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dspr");
- __real_cblas_dspr(order,Uplo,N,alpha,X,incX,Ap);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dspr", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dspr");
+ __real_cblas_dspr(order,Uplo,N,alpha,X,incX,Ap);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dspr", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dspr");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dspr");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DSPR_IDX, "ocl_cblas_dspr");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DSPR_IDX, "ocl_cblas_dspr");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dspr");
index cea52f7df8bfcefd5127826d4b569f7b082b070d..89aff959b3476b815b36e62be51e06c638c1ce63 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_dspr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dspr2");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dspr2");
- __real_cblas_dspr2(order,Uplo,N,alpha,X,incX,Y,incY,A);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dspr2", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dspr2");
+ __real_cblas_dspr2(order,Uplo,N,alpha,X,incX,Y,incY,A);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dspr2", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dspr2");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dspr2");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DSPR2_IDX, "ocl_cblas_dspr2");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DSPR2_IDX, "ocl_cblas_dspr2");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -166,8 +166,8 @@ void cblas_dspr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dspr2");
index cd4cc27ae13c1d5278c2e9ec7598111a2f642ba4..b3e692928613b5f5f34a048a47df2bd8a27d52d2 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
/* Do an init on first use */
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dswap");
+
/* OpenCL cannot deal with overlapping memory regions. This is an issue when you
* are trying to swap two rows of a matrix, where the matrix is column major. Hence,
* the offload of this routine to the DSP is disabled.
@@ -50,25 +52,24 @@ void cblas_dswap(const int N, double *X, const int incX, double *Y, const int in
#ifndef TI_CBLAS_SWAP_ENABLE_OFFLOAD
TI_CBLAS_PROFILE_START();
TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dswap");
- __real_cblas_dswap(N,X,incX,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dswap", (float) clock_diff);
+ __real_cblas_dswap(N,X,incX,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dswap", (float) clock_diff);
return ;
#else
- TI_CBLAS_PROFILE_START();
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dswap");
- __real_cblas_dswap(N,X,incX,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dswap", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
-
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dswap");
+ __real_cblas_dswap(N,X,incX,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dswap", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dswap");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dswap");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DSWAP_IDX, "ocl_cblas_dswap");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DSWAP_IDX, "ocl_cblas_dswap");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
@@ -143,9 +144,10 @@ void cblas_dswap(const int N, double *X, const int incX, double *Y, const int in
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dswap");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_dswap", (float) clock_diff);
return ;
index 97e89a963629f5edfa1433eed1d99c9a9c1b140e..52dd00838ae0bc335c17b15fcdff3215a8055161 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_dsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dsymm");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!dsymm_offload_dsp(Order,Side,M,N)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dsymm");
- __real_cblas_dsymm(Order,Side,Uplo,M,N,alpha,A,lda,B,ldb,beta,C,ldc);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dsymm", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!dsymm_offload_dsp(Order,Side,M,N)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dsymm");
+ __real_cblas_dsymm(Order,Side,Uplo,M,N,alpha,A,lda,B,ldb,beta,C,ldc);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dsymm", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dsymm");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dsymm");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DSYMM_IDX, "ocl_cblas_dsymm");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DSYMM_IDX, "ocl_cblas_dsymm");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -180,10 +180,11 @@ void cblas_dsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
#endif
void *msmc_ptr;
+ size_t msmc_size = MSMC_BUF_SIZE;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(13, buf_MSMC);
#else
@@ -195,9 +196,28 @@ void cblas_dsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
#endif
#ifdef __cplusplus
- __K->setArg(14, __local(L2_BUF_SIZE));
+ __K->setArg(14, msmc_size);
+#else
+ err |= clSetKernelArg(__K, 14, sizeof(msmc_size), &msmc_size);
+#endif
+
+ /* DDR buffer handed to the DSP kernel: argument 15 is the buffer itself,
+ * argument 16 its size in bytes. Both the C++ and the plain-C OpenCL
+ * bindings are provided, matching the other kernel arguments in this file. */
+ void *ddr_ptr;
+ size_t ddr_size = DDR_BUF_SIZE;
+ ddr_ptr = __malloc_ddr(DDR_BUF_SIZE);
+#ifdef __cplusplus
+ Buffer buf_DDR(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, DDR_BUF_SIZE, ddr_ptr);
+ __K->setArg(15, buf_DDR);
+ __K->setArg(16, ddr_size);
+#else
+ cl_mem buf_DDR = clCreateBuffer(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, DDR_BUF_SIZE, ddr_ptr, &err);
+ TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
+ err |= clSetKernelArg(__K, 15, sizeof(buf_DDR), &buf_DDR);
+ TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
+ err |= clSetKernelArg(__K, 16, sizeof(ddr_size), &ddr_size);
+ TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
+#endif
+
+ /* create a buffer argument to get the return error code from the DSP */
+ int err_code;
+#ifdef __cplusplus
+ Buffer buf_err(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code);
+ __K->setArg(17, buf_err);
#else
- err |= clSetKernelArg(__K, 14, L2_BUF_SIZE, NULL);
+ cl_mem buf_err = clCreateBuffer(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code, &err);
+ TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
+ err |= clSetKernelArg(__K, 17, sizeof(buf_err), &buf_err);
+ TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
#endif
#ifdef __cplusplus
@@ -211,9 +231,15 @@ void cblas_dsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
+ /* err_code is filled in through buf_err (kernel argument 17); report any
+ * non-success status. The period belongs before the newline so the message
+ * ends cleanly on one line. */
+ if(err_code != TICBLAS_SUCCESS) {
+ printf("Error code returned by offloaded cblas_dsymm is %d.\n", err_code);
+ }
+
ti_cblas_mem_free(msmc_ptr);
+ __free_ddr(ddr_ptr);
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dsymm");
index 966e2a3b7efaaf7b309a3b9d981845eaf6f1fbfe..7e6a9627ff427826615c8aaf8a4c4a20c359b6ed 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_dsymv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dsymv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dsymv");
- __real_cblas_dsymv(order,Uplo,N,alpha,A,lda,X,incX,beta,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dsymv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dsymv");
+ __real_cblas_dsymv(order,Uplo,N,alpha,A,lda,X,incX,beta,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dsymv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dsymv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dsymv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DSYMV_IDX, "ocl_cblas_dsymv");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DSYMV_IDX, "ocl_cblas_dsymv");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -178,8 +178,8 @@ void cblas_dsymv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dsymv");
index ca5044fa5bcab19db1910ff3e86aac146d83f528..1d42f0bd40df24f47c96d9a097cb85c9721731fd 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dsyr");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dsyr");
- __real_cblas_dsyr(order,Uplo,N,alpha,X,incX,A,lda);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dsyr", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dsyr");
+ __real_cblas_dsyr(order,Uplo,N,alpha,X,incX,A,lda);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dsyr", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dsyr");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dsyr");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DSYR_IDX, "ocl_cblas_dsyr");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DSYR_IDX, "ocl_cblas_dsyr");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dsyr");
index a0d366f914e55d2b2fd340da2824f2766b6b16bc..5fa094ba29d3f72ba026ea29774e3460e2ec878f 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_dsyr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dsyr2");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dsyr2");
- __real_cblas_dsyr2(order,Uplo,N,alpha,X,incX,Y,incY,A,lda);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dsyr2", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dsyr2");
+ __real_cblas_dsyr2(order,Uplo,N,alpha,X,incX,Y,incY,A,lda);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dsyr2", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dsyr2");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dsyr2");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DSYR2_IDX, "ocl_cblas_dsyr2");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DSYR2_IDX, "ocl_cblas_dsyr2");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -172,8 +172,10 @@ void cblas_dsyr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
+ ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dsyr2");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_dsyr2", (float) clock_diff);
return ;
index cea6055bd8ed8801d5fcf740724ec43f06bb12c9..4c5f281b5d62fa41efb43e50c29f274fafaad2c7 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_dsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, cons
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dsyr2k");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!dsyr2k_offload_dsp(Order,N,K)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dsyr2k");
- __real_cblas_dsyr2k(Order,Uplo,Trans,N,K,alpha,A,lda,B,ldb,beta,C,ldc);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dsyr2k", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!dsyr2k_offload_dsp(Order,N,K)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dsyr2k");
+ __real_cblas_dsyr2k(Order,Uplo,Trans,N,K,alpha,A,lda,B,ldb,beta,C,ldc);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dsyr2k", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dsyr2k");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dsyr2k");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DSYR2K_IDX, "ocl_cblas_dsyr2k");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DSYR2K_IDX, "ocl_cblas_dsyr2k");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -180,10 +180,11 @@ void cblas_dsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, cons
#endif
void *msmc_ptr;
+ size_t msmc_size = MSMC_BUF_SIZE;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(13, buf_MSMC);
#else
@@ -195,9 +196,28 @@ void cblas_dsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, cons
#endif
#ifdef __cplusplus
- __K->setArg(14, __local(L2_BUF_SIZE));
+ __K->setArg(14, msmc_size);
+#else
+ err |= clSetKernelArg(__K, 14, sizeof(msmc_size), &msmc_size);
+#endif
+
+ void *ddr_ptr;
+ size_t ddr_size = DDR_BUF_SIZE;
+ ddr_ptr = __malloc_ddr(DDR_BUF_SIZE);
+ Buffer buf_DDR(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, DDR_BUF_SIZE, ddr_ptr);
+ __K->setArg(15, buf_DDR);
+ __K->setArg(16, ddr_size);
+
+ /* create a buffer argument to get the return error code from the DSP */
+ int err_code;
+#ifdef __cplusplus
+ Buffer buf_err(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code);
+ __K->setArg(17, buf_err);
#else
- err |= clSetKernelArg(__K, 14, L2_BUF_SIZE, NULL);
+ cl_mem buf_err = clCreateBuffer(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code, &err);
+ TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
+ err |= clSetKernelArg(__K, 17, sizeof(buf_err), &buf_err);
+ TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
#endif
#ifdef __cplusplus
@@ -211,9 +231,15 @@ void cblas_dsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, cons
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
+ if(err_code != TICBLAS_SUCCESS) {
+ printf("Error code returned by offloaded cblas_dsyr2k is %d\n.", err_code);
+ }
+
ti_cblas_mem_free(msmc_ptr);
+ __free_ddr(ddr_ptr);
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dsyr2k");
index d3b7f1a78cabcd93cd2e33efcba8d017eea2ae9f..e07661799bfb34b4af00823f737fd24490f8ef35 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_dsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dsyrk");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!dsyrk_offload_dsp(Order,N,K)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dsyrk");
- __real_cblas_dsyrk(Order,Uplo,Trans,N,K,alpha,A,lda,beta,C,ldc);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dsyrk", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!dsyrk_offload_dsp(Order,N,K)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dsyrk");
+ __real_cblas_dsyrk(Order,Uplo,Trans,N,K,alpha,A,lda,beta,C,ldc);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dsyrk", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dsyrk");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dsyrk");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DSYRK_IDX, "ocl_cblas_dsyrk");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DSYRK_IDX, "ocl_cblas_dsyrk");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -159,10 +159,11 @@ void cblas_dsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const
#endif
void *msmc_ptr;
+ size_t msmc_size = MSMC_BUF_SIZE;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(11, buf_MSMC);
#else
@@ -174,9 +175,28 @@ void cblas_dsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const
#endif
#ifdef __cplusplus
- __K->setArg(12, __local(L2_BUF_SIZE));
+ __K->setArg(12, msmc_size);
+#else
+ err |= clSetKernelArg(__K, 12, sizeof(msmc_size), &msmc_size);
+#endif
+
+ void *ddr_ptr;
+ size_t ddr_size = DDR_BUF_SIZE;
+ ddr_ptr = __malloc_ddr(DDR_BUF_SIZE);
+ Buffer buf_DDR(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, DDR_BUF_SIZE, ddr_ptr);
+ __K->setArg(13, buf_DDR);
+ __K->setArg(14, ddr_size);
+
+ /* create a buffer argument to get the return error code from the DSP */
+ int err_code;
+#ifdef __cplusplus
+ Buffer buf_err(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code);
+ __K->setArg(15, buf_err);
#else
- err |= clSetKernelArg(__K, 12, L2_BUF_SIZE, NULL);
+ cl_mem buf_err = clCreateBuffer(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code, &err);
+ TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
+ err |= clSetKernelArg(__K, 15, sizeof(buf_err), &buf_err);
+ TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
#endif
#ifdef __cplusplus
@@ -190,9 +210,15 @@ void cblas_dsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
+ if(err_code != TICBLAS_SUCCESS) {
+ printf("Error code returned by offloaded cblas_dsyrk is %d\n.", err_code);
+ }
+
ti_cblas_mem_free(msmc_ptr);
+ __free_ddr(ddr_ptr);
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dsyrk");
index 2b1570ee54d66f081d349fb242c5685e1973796b..4856f3cb8a92bf0d1332cb118a6dee1470fcf8e0 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_dtbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dtbmv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dtbmv");
- __real_cblas_dtbmv(order,Uplo,TransA,Diag,N,K,A,lda,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dtbmv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dtbmv");
+ __real_cblas_dtbmv(order,Uplo,TransA,Diag,N,K,A,lda,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dtbmv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dtbmv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dtbmv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DTBMV_IDX, "ocl_cblas_dtbmv");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DTBMV_IDX, "ocl_cblas_dtbmv");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -163,9 +163,10 @@ void cblas_dtbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dtbmv");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_dtbmv", (float) clock_diff);
return ;
index a024be828c48f3d9da279759c599c73991afde7e..f9aaf2d9d20c7277c82d9053cb0bd6f668c4afc2 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_dtbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dtbsv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dtbsv");
- __real_cblas_dtbsv(order,Uplo,TransA,Diag,N,K,A,lda,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dtbsv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dtbsv");
+ __real_cblas_dtbsv(order,Uplo,TransA,Diag,N,K,A,lda,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dtbsv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dtbsv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dtbsv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DTBSV_IDX, "ocl_cblas_dtbsv");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DTBSV_IDX, "ocl_cblas_dtbsv");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -163,8 +163,8 @@ void cblas_dtbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dtbsv");
index 323098c6ee29900f9792f436b4968fa979d29df8..d3740ee3cfecdc52912e75338f954cb094434303 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_dtpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dtpmv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dtpmv");
- __real_cblas_dtpmv(order,Uplo,TransA,Diag,N,Ap,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dtpmv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dtpmv");
+ __real_cblas_dtpmv(order,Uplo,TransA,Diag,N,Ap,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dtpmv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dtpmv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dtpmv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DTPMV_IDX, "ocl_cblas_dtpmv");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DTPMV_IDX, "ocl_cblas_dtpmv");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -151,8 +151,8 @@ void cblas_dtpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dtpmv");
index 53a0e2275f4e3e90fa62c48eecebbc33b286c6c4..030eea6765d132fdf1ef7a895afe68bbc5c82ec5 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_dtpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dtpsv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dtpsv");
- __real_cblas_dtpsv(order,Uplo,TransA,Diag,N,Ap,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dtpsv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dtpsv");
+ __real_cblas_dtpsv(order,Uplo,TransA,Diag,N,Ap,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dtpsv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dtpsv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dtpsv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DTPSV_IDX, "ocl_cblas_dtpsv");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DTPSV_IDX, "ocl_cblas_dtpsv");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -151,8 +151,8 @@ void cblas_dtpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dtpsv");
index 2a32919aac6a49c5a8e71a3d1a77293f1dd8e2f9..f2107933378711c35ae27855d4096eb5b0d668f2 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_dtrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dtrmm");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!dtrmm_offload_dsp(Order,Side,M,N)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dtrmm");
- __real_cblas_dtrmm(Order,Side,Uplo,TransA,Diag,M,N,alpha,A,lda,B,ldb);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dtrmm", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!dtrmm_offload_dsp(Order,Side,M,N)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dtrmm");
+ __real_cblas_dtrmm(Order,Side,Uplo,TransA,Diag,M,N,alpha,A,lda,B,ldb);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dtrmm", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dtrmm");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dtrmm");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DTRMM_IDX, "ocl_cblas_dtrmm");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DTRMM_IDX, "ocl_cblas_dtrmm");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -163,11 +163,13 @@ void cblas_dtrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
#else
err |= clSetKernelArg(__K, 11, sizeof(ldb), &ldb);
#endif
+
void *msmc_ptr;
+ size_t msmc_size = MSMC_BUF_SIZE;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(12, buf_MSMC);
#else
@@ -179,9 +181,28 @@ void cblas_dtrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
#endif
#ifdef __cplusplus
- __K->setArg(13, __local(L2_BUF_SIZE));
+ __K->setArg(13, msmc_size);
#else
- err |= clSetKernelArg(__K, 13, L2_BUF_SIZE, NULL);
+ err |= clSetKernelArg(__K, 13, sizeof(msmc_size), &msmc_size);
+#endif
+
+ void *ddr_ptr;
+ size_t ddr_size = DDR_BUF_SIZE;
+ ddr_ptr = __malloc_ddr(DDR_BUF_SIZE);
+ Buffer buf_DDR(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, DDR_BUF_SIZE, ddr_ptr);
+ __K->setArg(14, buf_DDR);
+ __K->setArg(15, ddr_size);
+
+ /* create a buffer argument to get the return error code from the DSP */
+ int err_code;
+#ifdef __cplusplus
+ Buffer buf_err(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code);
+ __K->setArg(16, buf_err);
+#else
+ cl_mem buf_err = clCreateBuffer(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code, &err);
+ TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
+ err |= clSetKernelArg(__K, 16, sizeof(buf_err), &buf_err);
+ TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
#endif
#ifdef __cplusplus
@@ -195,10 +216,15 @@ void cblas_dtrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+ if(err_code != TICBLAS_SUCCESS) {
+ printf("Error code returned by offloaded cblas_dtrmm is %d\n.", err_code);
+ }
+
ti_cblas_mem_free(msmc_ptr);
+ __free_ddr(ddr_ptr);
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dtrmm");
index a2f840ca21a6d9588253c17248063d122dad56d2..20d08c8d4b57582572ee6e67c76f9a41ae3df512 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_dtrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dtrmv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dtrmv");
- __real_cblas_dtrmv(order,Uplo,TransA,Diag,N,A,lda,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dtrmv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dtrmv");
+ __real_cblas_dtrmv(order,Uplo,TransA,Diag,N,A,lda,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dtrmv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dtrmv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dtrmv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DTRMV_IDX, "ocl_cblas_dtrmv");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DTRMV_IDX, "ocl_cblas_dtrmv");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -157,8 +157,8 @@ void cblas_dtrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dtrmv");
index 662960d8cbd4170d7d82bdc323ae1adf66e1989d..0dbfa28d843a8de0bdff5137f86cb8ce0372428c 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_dtrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dtrsm");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!dtrsm_offload_dsp(Order,Side,M,N)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dtrsm");
- __real_cblas_dtrsm(Order,Side,Uplo,TransA,Diag,M,N,alpha,A,lda,B,ldb);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dtrsm", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!dtrsm_offload_dsp(Order,Side,M,N)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dtrsm");
+ __real_cblas_dtrsm(Order,Side,Uplo,TransA,Diag,M,N,alpha,A,lda,B,ldb);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dtrsm", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dtrsm");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dtrsm");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DTRSM_IDX, "ocl_cblas_dtrsm");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DTRSM_IDX, "ocl_cblas_dtrsm");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -165,10 +165,11 @@ void cblas_dtrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
#endif
void *msmc_ptr;
+ size_t msmc_size = MSMC_BUF_SIZE;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(12, buf_MSMC);
#else
@@ -180,9 +181,28 @@ void cblas_dtrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
#endif
#ifdef __cplusplus
- __K->setArg(13, __local(L2_BUF_SIZE));
+ __K->setArg(13, msmc_size);
#else
- err |= clSetKernelArg(__K, 13, L2_BUF_SIZE, NULL);
+ err |= clSetKernelArg(__K, 13, sizeof(msmc_size), &msmc_size);
+#endif
+
+ void *ddr_ptr;
+ size_t ddr_size = DDR_BUF_SIZE;
+ ddr_ptr = __malloc_ddr(DDR_BUF_SIZE);
+ Buffer buf_DDR(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, DDR_BUF_SIZE, ddr_ptr);
+ __K->setArg(14, buf_DDR);
+ __K->setArg(15, ddr_size);
+
+ /* create a buffer argument to get the return error code from the DSP */
+ int err_code;
+#ifdef __cplusplus
+ Buffer buf_err(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code);
+ __K->setArg(16, buf_err);
+#else
+ cl_mem buf_err = clCreateBuffer(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code, &err);
+ TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
+ err |= clSetKernelArg(__K, 16, sizeof(buf_err), &buf_err);
+ TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
#endif
#ifdef __cplusplus
@@ -196,9 +216,14 @@ void cblas_dtrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
+ if(err_code != TICBLAS_SUCCESS) {
+ printf("Error code returned by offloaded cblas_dtrsm is %d\n.", err_code);
+ }
+
ti_cblas_mem_free(msmc_ptr);
+ __free_ddr(ddr_ptr);
ti_cblas_delete_kernel(__K);
index 7c8d7d336f945385a2c07e02272e7a06fc89996b..d9229c0ae5af6723db06d1b0e26ecb4e96050690 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_dtrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dtrsv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dtrsv");
- __real_cblas_dtrsv(order,Uplo,TransA,Diag,N,A,lda,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dtrsv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dtrsv");
+ __real_cblas_dtrsv(order,Uplo,TransA,Diag,N,A,lda,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dtrsv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dtrsv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dtrsv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DTRSV_IDX, "ocl_cblas_dtrsv");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DTRSV_IDX, "ocl_cblas_dtrsv");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -157,9 +157,10 @@ void cblas_dtrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dtrsv");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_dtrsv", (float) clock_diff);
return ;
index eb15f740fc0daf15acd9a9718995eaf189b608d2..6d5641bc9ea69bf9dbd245966dbc7069d4abf18b 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dzasum");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dzasum");
- double rval = __real_cblas_dzasum(N,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dzasum", (float) clock_diff);
- return rval;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dzasum");
+ double rval = __real_cblas_dzasum(N,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dzasum", (float) clock_diff);
+ return rval;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dzasum");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dzasum");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DZASUM_IDX, "ocl_cblas_dzasum");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DZASUM_IDX, "ocl_cblas_dzasum");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
Buffer buf_retval(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(double), &retval);
__K->setArg(3, buf_retval);
#else
- cl_mem buf_retval = clCreateBuffer(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(double), &retval, &err);
+ cl_mem buf_retval = clCreateBuffer(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(double), &retval, &err);
TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
err |= clSetKernelArg(__K, 3, sizeof(buf_retval), &buf_retval);
TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dzasum");
index 860bc3202aa8097f8421a41a8d060cfd6acb43d7..4557b2ef40342b27aa1b4cd4747397da85bdf5a7 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_dznrm2");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dznrm2");
- double rval = __real_cblas_dznrm2(N,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dznrm2", (float) clock_diff);
- return rval;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_dznrm2");
+ double rval = __real_cblas_dznrm2(N,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_dznrm2", (float) clock_diff);
+ return rval;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dznrm2");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_dznrm2");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DZNRM2_IDX, "ocl_cblas_dznrm2");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_DZNRM2_IDX, "ocl_cblas_dznrm2");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
Buffer buf_retval(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(double), &retval);
__K->setArg(3, buf_retval);
#else
- cl_mem buf_retval = clCreateBuffer(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(double), &retval, &err);
+ cl_mem buf_retval = clCreateBuffer(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(double), &retval, &err);
TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
err |= clSetKernelArg(__K, 3, sizeof(buf_retval), &buf_retval);
TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_dznrm2");
index 13ec89596afaab9ac7888af0da35753f20b40373..ce9e10c21c5f582e110dd01f75d7f28b0ad98f5f 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_icamax");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_icamax");
- CBLAS_INDEX rval = __real_cblas_icamax(N,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_icamax", (float) clock_diff);
- return rval;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_icamax");
+ CBLAS_INDEX rval = __real_cblas_icamax(N,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_icamax", (float) clock_diff);
+ return rval;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_icamax");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_icamax");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ICAMAX_IDX, "ocl_cblas_icamax");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ICAMAX_IDX, "ocl_cblas_icamax");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
Buffer buf_retval(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(CBLAS_INDEX), &retval);
__K->setArg(3, buf_retval);
#else
- cl_mem buf_retval = clCreateBuffer(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(CBLAS_INDEX), &retval, &err);
+ cl_mem buf_retval = clCreateBuffer(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(CBLAS_INDEX), &retval, &err);
TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
err |= clSetKernelArg(__K, 3, sizeof(buf_retval), &buf_retval);
TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_icamax");
index 939303de8de7306d2055656b7809845c647b1e15..d6bfedfdb6c8e5c7f032ab6819dac7fca6e9d388 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_idamax");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_idamax");
- CBLAS_INDEX rval = __real_cblas_idamax(N,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_idamax", (float) clock_diff);
- return rval;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_idamax");
+ CBLAS_INDEX rval = __real_cblas_idamax(N,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_idamax", (float) clock_diff);
+ return rval;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_idamax");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_idamax");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_IDAMAX_IDX, "ocl_cblas_idamax");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_IDAMAX_IDX, "ocl_cblas_idamax");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
Buffer buf_retval(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(CBLAS_INDEX), &retval);
__K->setArg(3, buf_retval);
#else
- cl_mem buf_retval = clCreateBuffer(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(CBLAS_INDEX), &retval, &err);
+ cl_mem buf_retval = clCreateBuffer(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(CBLAS_INDEX), &retval, &err);
TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
err |= clSetKernelArg(__K, 3, sizeof(buf_retval), &buf_retval);
TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_idamax");
index c97db11a5fc1bd37e7b5c1f151dc34ab7c0df996..5beae2407db6d0578d18afe45592d7786978e9c1 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_isamax");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_isamax");
- CBLAS_INDEX rval = __real_cblas_isamax(N,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_isamax", (float) clock_diff);
- return rval;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_isamax");
+ CBLAS_INDEX rval = __real_cblas_isamax(N,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_isamax", (float) clock_diff);
+ return rval;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_isamax");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_isamax");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ISAMAX_IDX, "ocl_cblas_isamax");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ISAMAX_IDX, "ocl_cblas_isamax");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
Buffer buf_retval(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(CBLAS_INDEX), &retval);
__K->setArg(3, buf_retval);
#else
- cl_mem buf_retval = clCreateBuffer(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(CBLAS_INDEX), &retval, &err);
+ cl_mem buf_retval = clCreateBuffer(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(CBLAS_INDEX), &retval, &err);
TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
err |= clSetKernelArg(__K, 3, sizeof(buf_retval), &buf_retval);
TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_isamax");
index f4eaa90a5a96a0543b4227c46f63eeb9e659d9b0..c80492381b2fbef510666d734f23fb14de910c89 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_izamax");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_izamax");
- CBLAS_INDEX rval = __real_cblas_izamax(N,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_izamax", (float) clock_diff);
- return rval;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_izamax");
+ CBLAS_INDEX rval = __real_cblas_izamax(N,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_izamax", (float) clock_diff);
+ return rval;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_izamax");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_izamax");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_IZAMAX_IDX, "ocl_cblas_izamax");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_IZAMAX_IDX, "ocl_cblas_izamax");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
Buffer buf_retval(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(CBLAS_INDEX), &retval);
__K->setArg(3, buf_retval);
#else
- cl_mem buf_retval = clCreateBuffer(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(CBLAS_INDEX), &retval, &err);
+ cl_mem buf_retval = clCreateBuffer(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(CBLAS_INDEX), &retval, &err);
TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
err |= clSetKernelArg(__K, 3, sizeof(buf_retval), &buf_retval);
TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_izamax");
index af48cc2eca8d443a5d73bea92d12b63b114a0921..6f4172d8596c2196c515c80186af5cb4690d0abb 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_sasum");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_sasum");
- float rval = __real_cblas_sasum(N,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_sasum", (float) clock_diff);
- return rval;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_sasum");
+ float rval = __real_cblas_sasum(N,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_sasum", (float) clock_diff);
+ return rval;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_sasum");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_sasum");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SASUM_IDX, "ocl_cblas_sasum");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SASUM_IDX, "ocl_cblas_sasum");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
Buffer buf_retval(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(float), &retval);
__K->setArg(3, buf_retval);
#else
- cl_mem buf_retval = clCreateBuffer(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(float), &retval, &err);
+ cl_mem buf_retval = clCreateBuffer(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(float), &retval, &err);
TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
err |= clSetKernelArg(__K, 3, sizeof(buf_retval), &buf_retval);
TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_sasum");
index c9ec621e9da671ec8cd82b50261e785aef3bbcd8..9c7af6ef5e3df994d1ebee6e3962b9e3724531a2 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_saxpy(const int N, const float alpha, const float *X, const int incX,
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_saxpy");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_saxpy");
- __real_cblas_saxpy(N,alpha,X,incX,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_saxpy", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_saxpy");
+ __real_cblas_saxpy(N,alpha,X,incX,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_saxpy", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_saxpy");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_saxpy");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SAXPY_IDX, "ocl_cblas_saxpy");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SAXPY_IDX, "ocl_cblas_saxpy");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
@@ -139,8 +139,8 @@ void cblas_saxpy(const int N, const float alpha, const float *X, const int incX,
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_saxpy");
index c712fb51673ad1e515dc74b35ebee65f2ef655f1..3f0a9a34a382de498be87f36cb82e93a37fb9014 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_scasum");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_scasum");
- float rval = __real_cblas_scasum(N,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_scasum", (float) clock_diff);
- return rval;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_scasum");
+ float rval = __real_cblas_scasum(N,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_scasum", (float) clock_diff);
+ return rval;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_scasum");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_scasum");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SCASUM_IDX, "ocl_cblas_scasum");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SCASUM_IDX, "ocl_cblas_scasum");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
Buffer buf_retval(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(float), &retval);
__K->setArg(3, buf_retval);
#else
- cl_mem buf_retval = clCreateBuffer(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(float), &retval, &err);
+ cl_mem buf_retval = clCreateBuffer(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(float), &retval, &err);
TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
err |= clSetKernelArg(__K, 3, sizeof(buf_retval), &buf_retval);
TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_scasum");
index 234112694851bcb34a2bceda7ac6ce7d3416f012..43f0b2f8daee71bb7fbc2da09bd650cb34985650 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_scnrm2");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_scnrm2");
- float rval = __real_cblas_scnrm2(N,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_scnrm2", (float) clock_diff);
- return rval;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_scnrm2");
+ float rval = __real_cblas_scnrm2(N,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_scnrm2", (float) clock_diff);
+ return rval;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_scnrm2");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_scnrm2");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SCNRM2_IDX, "ocl_cblas_scnrm2");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SCNRM2_IDX, "ocl_cblas_scnrm2");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
Buffer buf_retval(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(float), &retval);
__K->setArg(3, buf_retval);
#else
- cl_mem buf_retval = clCreateBuffer(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(float), &retval, &err);
+ cl_mem buf_retval = clCreateBuffer(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(float), &retval, &err);
TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
err |= clSetKernelArg(__K, 3, sizeof(buf_retval), &buf_retval);
TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_scnrm2");
index f1bdaedb41c464f58c3ed9fed988f6918d8c5c42..7e225e88b36f50ae35a1526305af6ca5a1a1dc3d 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_scopy(const int N, const float *X, const int incX, float *Y, const in
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_scopy");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_scopy");
- __real_cblas_scopy(N,X,incX,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_scopy", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_scopy");
+ __real_cblas_scopy(N,X,incX,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_scopy", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_scopy");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_scopy");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SCOPY_IDX, "ocl_cblas_scopy");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SCOPY_IDX, "ocl_cblas_scopy");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
@@ -133,8 +133,8 @@ void cblas_scopy(const int N, const float *X, const int incX, float *Y, const in
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_scopy");
index d6b70e811d8b092f311a62ed4ac29819f6e824ec..9cd85250c62450db3c743038082f3fba35f47f43 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ float cblas_sdot(const int N, const float *X, const int incX, const float *Y, co
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_sdot");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_sdot");
- float rval = __real_cblas_sdot(N,X,incX,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_sdot", (float) clock_diff);
- return rval;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_sdot");
+ float rval = __real_cblas_sdot(N,X,incX,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_sdot", (float) clock_diff);
+ return rval;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_sdot");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_sdot");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SDOT_IDX, "ocl_cblas_sdot");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SDOT_IDX, "ocl_cblas_sdot");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
@@ -129,7 +129,7 @@ float cblas_sdot(const int N, const float *X, const int incX, const float *Y, co
Buffer buf_retval(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(float), &retval);
__K->setArg(5, buf_retval);
#else
- cl_mem buf_retval = clCreateBuffer(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(float), &retval, &err);
+ cl_mem buf_retval = clCreateBuffer(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(float), &retval, &err);
TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
err |= clSetKernelArg(__K, 5, sizeof(buf_retval), &buf_retval);
TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
@@ -146,8 +146,8 @@ float cblas_sdot(const int N, const float *X, const int incX, const float *Y, co
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_sdot");
index 829ba41bc234c938da58fc12bfdca5749a58dcab..268fb76d9fe697dbef58b7a2ede06fe3e8eef091 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ float cblas_sdsdot(const int N, const float alpha, const float *X, const int inc
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_sdsdot");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_sdsdot");
- float rval = __real_cblas_sdsdot(N,alpha,X,incX,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_sdsdot", (float) clock_diff);
- return rval;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_sdsdot");
+ float rval = __real_cblas_sdsdot(N,alpha,X,incX,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_sdsdot", (float) clock_diff);
+ return rval;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_sdsdot");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_sdsdot");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SDSDOT_IDX, "ocl_cblas_sdsdot");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SDSDOT_IDX, "ocl_cblas_sdsdot");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
@@ -135,7 +135,7 @@ float cblas_sdsdot(const int N, const float alpha, const float *X, const int inc
Buffer buf_retval(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(float), &retval);
__K->setArg(6, buf_retval);
#else
- cl_mem buf_retval = clCreateBuffer(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(float), &retval, &err);
+ cl_mem buf_retval = clCreateBuffer(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(float), &retval, &err);
TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
err |= clSetKernelArg(__K, 6, sizeof(buf_retval), &buf_retval);
TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
@@ -152,8 +152,8 @@ float cblas_sdsdot(const int N, const float alpha, const float *X, const int inc
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_sdsdot");
index c27de76076a42315f85a69eebb50732630b6a3c1..956ee4110936c7bddf2c7d52798e3438249e2b22 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_sgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_sgbmv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_sgbmv");
- __real_cblas_sgbmv(order,TransA,M,N,KL,KU,alpha,A,lda,X,incX,beta,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_sgbmv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_sgbmv");
+ __real_cblas_sgbmv(order,TransA,M,N,KL,KU,alpha,A,lda,X,incX,beta,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_sgbmv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_sgbmv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_sgbmv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SGBMV_IDX, "ocl_cblas_sgbmv");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SGBMV_IDX, "ocl_cblas_sgbmv");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -144,7 +144,7 @@ void cblas_sgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA
size_bufX = MAX(size_bufX,1);
#ifdef __cplusplus
- Buffer buf_X(*ti_cblas_ocl_context, CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR, size_bufX , (void *)X);
+ Buffer buf_X(*ti_cblas_ocl_context, CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR, size_bufX, (void *)X);
__K->setArg(9, buf_X);
#else
cl_mem buf_X = clCreateBuffer(ti_cblas_ocl_context, CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR, size_bufX, (void *)X, &err);
@@ -196,8 +196,8 @@ void cblas_sgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_sgbmv");
index 89a010fdc2ee179ad254a0371d81283e3be7b688..3385f80c8c17b0fe1a9321ca22ea50284d7a81d0 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,32 +45,28 @@ void cblas_sgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_sgemm");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!sgemm_offload_dsp(Order,M,N,K)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_sgemm");
- __real_cblas_sgemm(Order,TransA,TransB,M,N,K,alpha,A,lda,B,ldb,beta,C,ldc);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_sgemm", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!sgemm_offload_dsp(Order,M,N,K)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_sgemm");
+ __real_cblas_sgemm(Order,TransA,TransB,M,N,K,alpha,A,lda,B,ldb,beta,C,ldc);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_sgemm", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_sgemm");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_sgemm");
/* Lookup kernel pointer from global table */
-
- void *msmc_ptr;
-
#ifdef __cplusplus
Event e;
Kernel* __K;
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SGEMM_IDX, "ocl_cblas_sgemm");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SGEMM_IDX, "ocl_cblas_sgemm");
#ifdef __cplusplus
try
@@ -187,13 +184,17 @@ void cblas_sgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
#else
err |= clSetKernelArg(__K, 13, sizeof(ldc), &ldc);
#endif
+
+ void *msmc_ptr;
+ size_t msmc_size = MSMC_BUF_SIZE;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
- //Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(14, buf_MSMC);
#else
+ //cl_mem buf_MSMC = clCreateBuffer(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE, NULL, &err);
cl_mem buf_MSMC = clCreateBuffer(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr, &err);
TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
err |= clSetKernelArg(__K, 14, sizeof(buf_MSMC), &buf_MSMC);
@@ -201,9 +202,28 @@ void cblas_sgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
#endif
#ifdef __cplusplus
- __K->setArg(15, __local(L2_BUF_SIZE));
+ __K->setArg(15, msmc_size);
+#else
+ err |= clSetKernelArg(__K, 15, sizeof(msmc_size), &msmc_size);
+#endif
+
+ /* DDR buffer allocated with __malloc_ddr and passed to the kernel as
+ * argument 16 (buffer) and argument 17 (its size in bytes). The C++ and
+ * C OpenCL bindings are kept in separate branches, like every other
+ * kernel argument in this function. */
+ void *ddr_ptr;
+ size_t ddr_size = DDR_BUF_SIZE;
+ ddr_ptr = __malloc_ddr(DDR_BUF_SIZE);
+#ifdef __cplusplus
+ Buffer buf_DDR(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, DDR_BUF_SIZE, ddr_ptr);
+ __K->setArg(16, buf_DDR);
+ __K->setArg(17, ddr_size);
+#else
+ /* NOTE(review): confirm buf_DDR is released together with the other C-path cl_mem objects once the kernel has completed. */
+ cl_mem buf_DDR = clCreateBuffer(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, DDR_BUF_SIZE, ddr_ptr, &err);
+ TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
+ err |= clSetKernelArg(__K, 16, sizeof(buf_DDR), &buf_DDR);
+ TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
+ err |= clSetKernelArg(__K, 17, sizeof(ddr_size), &ddr_size);
+ TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
+#endif
+
+ /* Create a one-int buffer (kernel argument 18) through which the DSP
+ * returns its error code into err_code. */
+ int err_code;
+#ifdef __cplusplus
+ Buffer buf_err(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code);
+ __K->setArg(18, buf_err);
#else
- err |= clSetKernelArg(__K, 15, L2_BUF_SIZE, NULL);
+ /* Pass the context handle itself, as the other C-path clCreateBuffer calls in this function do; only the C++ binding dereferences it. */
+ cl_mem buf_err = clCreateBuffer(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code, &err);
+ TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
+ err |= clSetKernelArg(__K, 18, sizeof(buf_err), &buf_err);
+ TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
#endif
#ifdef __cplusplus
@@ -217,9 +237,14 @@ void cblas_sgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
+ /* Report any non-success status found in err_code after the offloaded call. */
+ if(err_code != TICBLAS_SUCCESS) {
+ printf("Error code returned by offloaded cblas_sgemm is %d.\n", err_code);
+ }
+
ti_cblas_mem_free(msmc_ptr);
+ __free_ddr(ddr_ptr);
ti_cblas_delete_kernel(__K);
index 699187898cda0ac635ac914fb3193a6ae1c519de..9acaf531852879ddc1df0546b8da64c99afbad4f 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_sgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_sgemv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_sgemv");
- __real_cblas_sgemv(order,TransA,M,N,alpha,A,lda,X,incX,beta,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_sgemv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_sgemv");
+ __real_cblas_sgemv(order,TransA,M,N,alpha,A,lda,X,incX,beta,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_sgemv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_sgemv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_sgemv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SGEMV_IDX, "ocl_cblas_sgemv");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SGEMV_IDX, "ocl_cblas_sgemv");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -184,13 +184,12 @@ void cblas_sgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_sgemv");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_sgemv", (float) clock_diff);
-
return ;
}
index 223e4302994449aa6ba50588e8548c774256c304..fcf4ca2447b629688d2508ee97467595506a823f 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_sger(const enum CBLAS_ORDER order, const int M, const int N, const fl
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_sger");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_sger");
- __real_cblas_sger(order,M,N,alpha,X,incX,Y,incY,A,lda);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_sger", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_sger");
+ __real_cblas_sger(order,M,N,alpha,X,incX,Y,incY,A,lda);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_sger", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_sger");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_sger");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SGER_IDX, "ocl_cblas_sger");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SGER_IDX, "ocl_cblas_sger");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -172,8 +172,8 @@ void cblas_sger(const enum CBLAS_ORDER order, const int M, const int N, const fl
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_sger");
index f5b75652bfbc1aab58ba5b6c06b2a8c40c08b866..013acfabcdf148eb1bcc16480e45f5d9ee9b81f2 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_snrm2");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_snrm2");
- float rval = __real_cblas_snrm2(N,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_snrm2", (float) clock_diff);
- return rval;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_snrm2");
+ float rval = __real_cblas_snrm2(N,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_snrm2", (float) clock_diff);
+ return rval;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_snrm2");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_snrm2");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SNRM2_IDX, "ocl_cblas_snrm2");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SNRM2_IDX, "ocl_cblas_snrm2");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
Buffer buf_retval(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(float), &retval);
__K->setArg(3, buf_retval);
#else
- cl_mem buf_retval = clCreateBuffer(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(float), &retval, &err);
+ cl_mem buf_retval = clCreateBuffer(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(float), &retval, &err);
TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
err |= clSetKernelArg(__K, 3, sizeof(buf_retval), &buf_retval);
TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_snrm2");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_snrm2", (float) clock_diff);
return retval;
index 160d339654a3410f68da607b429c7abaf498597e..cafda115a4d0d08b9352f6f0864a0812bb416330 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_srot(const int N, float *X, const int incX, float *Y, const int incY,
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_srot");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_srot");
- __real_cblas_srot(N,X,incX,Y,incY,c,s);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_srot", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_srot");
+ __real_cblas_srot(N,X,incX,Y,incY,c,s);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_srot", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_srot");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_srot");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SROT_IDX, "ocl_cblas_srot");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SROT_IDX, "ocl_cblas_srot");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
@@ -144,8 +144,8 @@ void cblas_srot(const int N, float *X, const int incX, float *Y, const int incY,
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_srot");
index 0ab8423f641a959e19a3706cc9826aed157b6384..dbe5e2d6c86d533809a642281dca108f4878cce8 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_srotg");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_srotg");
- __real_cblas_srotg(a,b,c,s);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_srotg", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_srotg");
+ __real_cblas_srotg(a,b,c,s);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_srotg", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_srotg");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_srotg");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SROTG_IDX, "ocl_cblas_srotg");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SROTG_IDX, "ocl_cblas_srotg");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
Buffer buf_a(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(float), (void *)a);
__K->setArg(0, buf_a);
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_srotg");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_srotg", (float) clock_diff);
return ;
index ec94428d1512907e242e297fddd6c76c8786ae06..42a2bf43dbfc23695ec3e34b15b19a5ec7b7ac7b 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_srotm(const int N, float *X, const int incX, float *Y, const int incY
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_srotm");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_srotm");
- __real_cblas_srotm(N,X,incX,Y,incY,P);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_srotm", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_srotm");
+ __real_cblas_srotm(N,X,incX,Y,incY,P);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_srotm", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_srotm");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_srotm");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SROTM_IDX, "ocl_cblas_srotm");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SROTM_IDX, "ocl_cblas_srotm");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
@@ -143,8 +143,8 @@ void cblas_srotm(const int N, float *X, const int incX, float *Y, const int incY
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_srotm");
index 4cfa6b23da62ba85ff188ad696a8cdc035644ad9..892f0646b2af12ee17521023641ef47b87f687fd 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_srotmg");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_srotmg");
- __real_cblas_srotmg(d1,d2,b1,b2,P);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_srotmg", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_srotmg");
+ __real_cblas_srotmg(d1,d2,b1,b2,P);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_srotmg", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_srotmg");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_srotmg");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SROTMG_IDX, "ocl_cblas_srotmg");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SROTMG_IDX, "ocl_cblas_srotmg");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
Buffer buf_d1(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(float), (void *)d1);
__K->setArg(0, buf_d1);
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_srotmg");
index 2791b007e99c2962c06932fd64e67808c8d1d562..8e7f770cfccd9abf1cd3a93936bdd58f1289241b 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_ssbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_ssbmv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ssbmv");
- __real_cblas_ssbmv(order,Uplo,N,K,alpha,A,lda,X,incX,beta,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ssbmv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ssbmv");
+ __real_cblas_ssbmv(order,Uplo,N,K,alpha,A,lda,X,incX,beta,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ssbmv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ssbmv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ssbmv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SSBMV_IDX, "ocl_cblas_ssbmv");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SSBMV_IDX, "ocl_cblas_ssbmv");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -184,8 +184,8 @@ void cblas_ssbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_ssbmv");
index a38f6844c965ac85b59e88e0d24c7ba37d16a8de..d375b548ca045d954c31030bb01a3ef76b1d8941 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_sscal");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_sscal");
- __real_cblas_sscal(N,alpha,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_sscal", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_sscal");
+ __real_cblas_sscal(N,alpha,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_sscal", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_sscal");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_sscal");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SSCAL_IDX, "ocl_cblas_sscal");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SSCAL_IDX, "ocl_cblas_sscal");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_sscal");
index 97003a764e5890dbe13af65563ab03a595dfbbd5..d292322a2c27abfa3ffca743e9fb8305c7e6b31e 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_sspmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_sspmv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_sspmv");
- __real_cblas_sspmv(order,Uplo,N,alpha,Ap,X,incX,beta,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_sspmv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_sspmv");
+ __real_cblas_sspmv(order,Uplo,N,alpha,Ap,X,incX,beta,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_sspmv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_sspmv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_sspmv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SSPMV_IDX, "ocl_cblas_sspmv");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SSPMV_IDX, "ocl_cblas_sspmv");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -172,8 +172,8 @@ void cblas_sspmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_sspmv");
index 3b7e58b0c9cc5cd152ad4af8f0aefc571374662d..229cc92b922d28d3fa8096ac72c80a631caae19d 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_sspr");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_sspr");
- __real_cblas_sspr(order,Uplo,N,alpha,X,incX,Ap);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_sspr", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_sspr");
+ __real_cblas_sspr(order,Uplo,N,alpha,X,incX,Ap);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_sspr", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_sspr");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_sspr");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SSPR_IDX, "ocl_cblas_sspr");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SSPR_IDX, "ocl_cblas_sspr");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_sspr");
index 6bc1302ce09d87668dc6b2a5a2d57609763df902..30c784ea1b50e838cfb8977fc2a79a642b1118ac 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_sspr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_sspr2");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_sspr2");
- __real_cblas_sspr2(order,Uplo,N,alpha,X,incX,Y,incY,A);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_sspr2", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_sspr2");
+ __real_cblas_sspr2(order,Uplo,N,alpha,X,incX,Y,incY,A);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_sspr2", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_sspr2");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_sspr2");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SSPR2_IDX, "ocl_cblas_sspr2");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SSPR2_IDX, "ocl_cblas_sspr2");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -166,8 +166,8 @@ void cblas_sspr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_sspr2");
index f67e00a27037f6359161277bb5492e2b00a122b3..2f7865eecf9215983fac2111e695c37074930a53 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -49,27 +50,26 @@ void cblas_sswap(const int N, float *X, const int incX, float *Y, const int incY
* the offload of this routine to the DSP is disabled.
*/
#ifndef TI_CBLAS_SWAP_ENABLE_OFFLOAD
- TI_CBLAS_PROFILE_START();
+ TI_CBLAS_PROFILE_START();
TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_sswap");
- __real_cblas_sswap(N,X,incX,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_sswap", (float) clock_diff);
+ __real_cblas_sswap(N,X,incX,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_sswap", (float) clock_diff);
return ;
#else
- TI_CBLAS_PROFILE_START();
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_sswap");
- __real_cblas_sswap(N,X,incX,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_sswap", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
-
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_sswap");
+ __real_cblas_sswap(N,X,incX,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_sswap", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_sswap");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_sswap");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SSWAP_IDX, "ocl_cblas_sswap");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SSWAP_IDX, "ocl_cblas_sswap");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
@@ -144,9 +144,10 @@ void cblas_sswap(const int N, float *X, const int incX, float *Y, const int incY
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_sswap");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_sswap", (float) clock_diff);
return ;
index 426489589cf6940136ae8363c341f9dbe59f020a..f3852044622292eeb09776f982424e8d953e4617 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_ssymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_ssymm");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!ssymm_offload_dsp(Order,Side,M,N)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ssymm");
- __real_cblas_ssymm(Order,Side,Uplo,M,N,alpha,A,lda,B,ldb,beta,C,ldc);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ssymm", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!ssymm_offload_dsp(Order,Side,M,N)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ssymm");
+ __real_cblas_ssymm(Order,Side,Uplo,M,N,alpha,A,lda,B,ldb,beta,C,ldc);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ssymm", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ssymm");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ssymm");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SSYMM_IDX, "ocl_cblas_ssymm");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SSYMM_IDX, "ocl_cblas_ssymm");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -180,10 +180,11 @@ void cblas_ssymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
#endif
void *msmc_ptr;
+ size_t msmc_size = MSMC_BUF_SIZE;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(13, buf_MSMC);
#else
@@ -195,9 +196,28 @@ void cblas_ssymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
#endif
#ifdef __cplusplus
- __K->setArg(14, __local(L2_BUF_SIZE));
+ __K->setArg(14, msmc_size);
#else
- err |= clSetKernelArg(__K, 14, L2_BUF_SIZE, NULL);
+ err |= clSetKernelArg(__K, 14, sizeof(msmc_size), &msmc_size);
+#endif
+
+ void *ddr_ptr;
+ size_t ddr_size = DDR_BUF_SIZE;
+ ddr_ptr = __malloc_ddr(DDR_BUF_SIZE);
+ Buffer buf_DDR(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, DDR_BUF_SIZE, ddr_ptr);
+ __K->setArg(15, buf_DDR);
+ __K->setArg(16, ddr_size);
+
+ /* create a buffer argument to get the return error code from the DSP */
+ int err_code;
+#ifdef __cplusplus
+ Buffer buf_err(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code);
+ __K->setArg(17, buf_err);
+#else
+ cl_mem buf_err = clCreateBuffer(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code, &err);
+ TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
+ err |= clSetKernelArg(__K, 17, sizeof(buf_err), &buf_err);
+ TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
#endif
#ifdef __cplusplus
@@ -211,9 +231,14 @@ void cblas_ssymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
+ if(err_code != TICBLAS_SUCCESS) {
+ printf("Error code returned by offloaded cblas_ssymm is %d\n.", err_code);
+ }
+
ti_cblas_mem_free(msmc_ptr);
+ __free_ddr(ddr_ptr);
ti_cblas_delete_kernel(__K);
index d2c495d00849f450cd1fa3cba681fcc2c771fecd..6aedd21b2099f9c68f579f2b095647bebc6c6da6 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_ssymv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_ssymv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ssymv");
- __real_cblas_ssymv(order,Uplo,N,alpha,A,lda,X,incX,beta,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ssymv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ssymv");
+ __real_cblas_ssymv(order,Uplo,N,alpha,A,lda,X,incX,beta,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ssymv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ssymv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ssymv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SSYMV_IDX, "ocl_cblas_ssymv");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SSYMV_IDX, "ocl_cblas_ssymv");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -178,8 +178,8 @@ void cblas_ssymv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_ssymv");
index 7bf8d2d8f59243e7c5bdaacccbc47d85f324540f..64166fb7805017b77f8915fd5eaefb28dea769c6 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_ssyr");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ssyr");
- __real_cblas_ssyr(order,Uplo,N,alpha,X,incX,A,lda);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ssyr", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ssyr");
+ __real_cblas_ssyr(order,Uplo,N,alpha,X,incX,A,lda);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ssyr", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ssyr");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ssyr");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SSYR_IDX, "ocl_cblas_ssyr");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SSYR_IDX, "ocl_cblas_ssyr");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_ssyr");
index 47fde30cda5047666d3c3a49c34bd9ba1d5fa3d2..d5edeac4e9449fe712ae4f487bdf0742bab36d92 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_ssyr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_ssyr2");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ssyr2");
- __real_cblas_ssyr2(order,Uplo,N,alpha,X,incX,Y,incY,A,lda);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ssyr2", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ssyr2");
+ __real_cblas_ssyr2(order,Uplo,N,alpha,X,incX,Y,incY,A,lda);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ssyr2", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ssyr2");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ssyr2");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SSYR2_IDX, "ocl_cblas_ssyr2");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SSYR2_IDX, "ocl_cblas_ssyr2");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -172,8 +172,8 @@ void cblas_ssyr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_ssyr2");
index cf39e18a627eff51253a3a4662ab99ea1d02934d..8564f71b855ce2d164fa90786928001a17692661 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_ssyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, cons
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_ssyr2k");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!ssyr2k_offload_dsp(Order,N,K)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ssyr2k");
- __real_cblas_ssyr2k(Order,Uplo,Trans,N,K,alpha,A,lda,B,ldb,beta,C,ldc);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ssyr2k", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!ssyr2k_offload_dsp(Order,N,K)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ssyr2k");
+ __real_cblas_ssyr2k(Order,Uplo,Trans,N,K,alpha,A,lda,B,ldb,beta,C,ldc);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ssyr2k", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ssyr2k");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ssyr2k");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SSYR2K_IDX, "ocl_cblas_ssyr2k");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SSYR2K_IDX, "ocl_cblas_ssyr2k");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -180,10 +180,11 @@ void cblas_ssyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, cons
#endif
void *msmc_ptr;
+ size_t msmc_size = MSMC_BUF_SIZE;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(13, buf_MSMC);
#else
@@ -195,9 +196,28 @@ void cblas_ssyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, cons
#endif
#ifdef __cplusplus
- __K->setArg(14, __local(L2_BUF_SIZE));
+ __K->setArg(14, msmc_size);
+#else
+ err |= clSetKernelArg(__K, 14, sizeof(msmc_size), &msmc_size);
+#endif
+
+ void *ddr_ptr;
+ size_t ddr_size = DDR_BUF_SIZE;
+ ddr_ptr = __malloc_ddr(DDR_BUF_SIZE);
+ Buffer buf_DDR(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, DDR_BUF_SIZE, ddr_ptr);
+ __K->setArg(15, buf_DDR);
+ __K->setArg(16, ddr_size);
+
+ /* create a buffer argument to get the return error code from the DSP */
+ int err_code;
+#ifdef __cplusplus
+ Buffer buf_err(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code);
+ __K->setArg(17, buf_err);
#else
- err |= clSetKernelArg(__K, 14, L2_BUF_SIZE, NULL);
+ cl_mem buf_err = clCreateBuffer(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code, &err);
+ TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
+ err |= clSetKernelArg(__K, 17, sizeof(buf_err), &buf_err);
+ TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
#endif
#ifdef __cplusplus
@@ -211,9 +231,15 @@ void cblas_ssyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, cons
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
+ if(err_code != TICBLAS_SUCCESS) {
+ printf("Error code returned by offloaded cblas_ssyr2k is %d\n.", err_code);
+ }
+
ti_cblas_mem_free(msmc_ptr);
+ __free_ddr(ddr_ptr);
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_ssyr2k");
index 12c59ed731d692ccd50488a384189518b86b68d3..a9f8a2db47ddd19ef695eb5039aaf9e83e22b656 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_ssyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_ssyrk");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!ssyrk_offload_dsp(Order,N,K)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ssyrk");
- __real_cblas_ssyrk(Order,Uplo,Trans,N,K,alpha,A,lda,beta,C,ldc);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ssyrk", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!ssyrk_offload_dsp(Order,N,K)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ssyrk");
+ __real_cblas_ssyrk(Order,Uplo,Trans,N,K,alpha,A,lda,beta,C,ldc);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ssyrk", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ssyrk");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ssyrk");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SSYRK_IDX, "ocl_cblas_ssyrk");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_SSYRK_IDX, "ocl_cblas_ssyrk");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -159,13 +159,15 @@ void cblas_ssyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const
#endif
void *msmc_ptr;
+ size_t msmc_size = MSMC_BUF_SIZE;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- //Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(11, buf_MSMC);
#else
+ //cl_mem buf_MSMC = clCreateBuffer(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE, NULL, &err);
cl_mem buf_MSMC = clCreateBuffer(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr, &err);
TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
err |= clSetKernelArg(__K, 11, sizeof(buf_MSMC), &buf_MSMC);
@@ -173,9 +175,28 @@ void cblas_ssyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const
#endif
#ifdef __cplusplus
- __K->setArg(12, __local(L2_BUF_SIZE));
+ __K->setArg(12, msmc_size);
+#else
+ err |= clSetKernelArg(__K, 12, sizeof(msmc_size), &msmc_size);
+#endif
+
+ void *ddr_ptr;
+ size_t ddr_size = DDR_BUF_SIZE;
+ ddr_ptr = __malloc_ddr(DDR_BUF_SIZE);
+ Buffer buf_DDR(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, DDR_BUF_SIZE, ddr_ptr);
+ __K->setArg(13, buf_DDR);
+ __K->setArg(14, ddr_size);
+
+ /* create a buffer argument to get the return error code from the DSP */
+ int err_code;
+#ifdef __cplusplus
+ Buffer buf_err(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code);
+ __K->setArg(15, buf_err);
#else
- err |= clSetKernelArg(__K, 12, L2_BUF_SIZE, NULL);
+ cl_mem buf_err = clCreateBuffer(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code, &err);
+ TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
+ err |= clSetKernelArg(__K, 15, sizeof(buf_err), &buf_err);
+ TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
#endif
#ifdef __cplusplus
@@ -189,9 +210,15 @@ void cblas_ssyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
+ if(err_code != TICBLAS_SUCCESS) {
+ printf("Error code returned by offloaded cblas_ssyrk is %d\n.", err_code);
+ }
+
ti_cblas_mem_free(msmc_ptr);
+ __free_ddr(ddr_ptr);
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_ssyrk");
index e1732cc1fbab304f00c8cf1b6f0270fad5b14188..60fe80300ab035b668a733666a499ca83e86844e 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_stbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_stbmv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_stbmv");
- __real_cblas_stbmv(order,Uplo,TransA,Diag,N,K,A,lda,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_stbmv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_stbmv");
+ __real_cblas_stbmv(order,Uplo,TransA,Diag,N,K,A,lda,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_stbmv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_stbmv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_stbmv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_STBMV_IDX, "ocl_cblas_stbmv");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_STBMV_IDX, "ocl_cblas_stbmv");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -163,8 +163,8 @@ void cblas_stbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_stbmv");
index 052657cd1f12229ccbeb4214a55ce2b30ecbbf02..7ac2d478afac27e1f380af80382e6de53045f293 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_stbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_stbsv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_stbsv");
- __real_cblas_stbsv(order,Uplo,TransA,Diag,N,K,A,lda,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_stbsv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_stbsv");
+ __real_cblas_stbsv(order,Uplo,TransA,Diag,N,K,A,lda,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_stbsv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_stbsv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_stbsv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_STBSV_IDX, "ocl_cblas_stbsv");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_STBSV_IDX, "ocl_cblas_stbsv");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -163,8 +163,8 @@ void cblas_stbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_stbsv");
index 3d2844cadf7da87394e771871f424f8b4e7d57fc..0e0a326afeca05ba803d6f0944cc47f5b386e047 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_stpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_stpmv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_stpmv");
- __real_cblas_stpmv(order,Uplo,TransA,Diag,N,Ap,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_stpmv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_stpmv");
+ __real_cblas_stpmv(order,Uplo,TransA,Diag,N,Ap,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_stpmv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_stpmv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_stpmv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_STPMV_IDX, "ocl_cblas_stpmv");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_STPMV_IDX, "ocl_cblas_stpmv");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -151,8 +151,8 @@ void cblas_stpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_stpmv");
index 236273c04c295ceb2d702bf14b030ba0db64d024..e54e5650deb312b09f00c746d4247f5f0e8d8780 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_stpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_stpsv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_stpsv");
- __real_cblas_stpsv(order,Uplo,TransA,Diag,N,Ap,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_stpsv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_stpsv");
+ __real_cblas_stpsv(order,Uplo,TransA,Diag,N,Ap,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_stpsv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_stpsv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_stpsv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_STPSV_IDX, "ocl_cblas_stpsv");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_STPSV_IDX, "ocl_cblas_stpsv");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -151,8 +151,8 @@ void cblas_stpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_stpsv");
index fb28e11717b2c9c1e9d6d5827434e9bd849b73b9..4e80830d0e4ac23c9e1133e6098afd336ed2ebce 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_strmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_strmm");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!strmm_offload_dsp(Order,Side,M,N)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_strmm");
- __real_cblas_strmm(Order,Side,Uplo,TransA,Diag,M,N,alpha,A,lda,B,ldb);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_strmm", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!strmm_offload_dsp(Order,Side,M,N)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_strmm");
+ __real_cblas_strmm(Order,Side,Uplo,TransA,Diag,M,N,alpha,A,lda,B,ldb);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_strmm", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_strmm");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_strmm");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_STRMM_IDX, "ocl_cblas_strmm");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_STRMM_IDX, "ocl_cblas_strmm");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -165,10 +165,11 @@ void cblas_strmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
#endif
void *msmc_ptr;
+ size_t msmc_size = MSMC_BUF_SIZE;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(12, buf_MSMC);
#else
@@ -180,9 +181,28 @@ void cblas_strmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
#endif
#ifdef __cplusplus
- __K->setArg(13, __local(L2_BUF_SIZE));
+ __K->setArg(13, msmc_size);
#else
- err |= clSetKernelArg(__K, 13, L2_BUF_SIZE, NULL);
+ err |= clSetKernelArg(__K, 13, sizeof(msmc_size), &msmc_size);
+#endif
+
+ void *ddr_ptr;
+ size_t ddr_size = DDR_BUF_SIZE;
+ ddr_ptr = __malloc_ddr(DDR_BUF_SIZE);
+ Buffer buf_DDR(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, DDR_BUF_SIZE, ddr_ptr);
+ __K->setArg(14, buf_DDR);
+ __K->setArg(15, ddr_size);
+
+ /* create a buffer argument to get the return error code from the DSP */
+ int err_code;
+#ifdef __cplusplus
+ Buffer buf_err(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code);
+ __K->setArg(16, buf_err);
+#else
+ cl_mem buf_err = clCreateBuffer(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code, &err);
+ TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
+ err |= clSetKernelArg(__K, 16, sizeof(buf_err), &buf_err);
+ TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
#endif
#ifdef __cplusplus
@@ -196,11 +216,16 @@ void cblas_strmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
+ if(err_code != TICBLAS_SUCCESS) {
+ printf("Error code returned by offloaded cblas_strmm is %d\n.", err_code);
+ }
+
ti_cblas_mem_free(msmc_ptr);
- ti_cblas_delete_kernel(__K);
+ __free_ddr(ddr_ptr);
+ ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_strmm");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_strmm", (float) clock_diff);
index 63e3fa1d43439db7b10b1f52a815d9df0f6685f5..9860ef5389428bac5ac5bc344f55043531ef2e83 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_strmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_strmv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_strmv");
- __real_cblas_strmv(order,Uplo,TransA,Diag,N,A,lda,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_strmv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_strmv");
+ __real_cblas_strmv(order,Uplo,TransA,Diag,N,A,lda,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_strmv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_strmv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_strmv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_STRMV_IDX, "ocl_cblas_strmv");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_STRMV_IDX, "ocl_cblas_strmv");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -157,8 +157,8 @@ void cblas_strmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_strmv");
index 831df896ee87758e888ea773ae81057d1fa1f326..4bf872ca5f052ca5c8db46826e154982c7e698d6 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_strsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_strsm");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!strsm_offload_dsp(Order,Side,M,N)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_strsm");
- __real_cblas_strsm(Order,Side,Uplo,TransA,Diag,M,N,alpha,A,lda,B,ldb);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_strsm", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!strsm_offload_dsp(Order,Side,M,N)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_strsm");
+ __real_cblas_strsm(Order,Side,Uplo,TransA,Diag,M,N,alpha,A,lda,B,ldb);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_strsm", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_strsm");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_strsm");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_STRSM_IDX, "ocl_cblas_strsm");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_STRSM_IDX, "ocl_cblas_strsm");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -165,10 +165,11 @@ void cblas_strsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
#endif
void *msmc_ptr;
+ size_t msmc_size = MSMC_BUF_SIZE;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(12, buf_MSMC);
#else
@@ -180,9 +181,28 @@ void cblas_strsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
#endif
#ifdef __cplusplus
- __K->setArg(13, __local(L2_BUF_SIZE));
+ __K->setArg(13, msmc_size);
+#else
+ err |= clSetKernelArg(__K, 13, sizeof(msmc_size), &msmc_size);
+#endif
+
+ void *ddr_ptr;
+ size_t ddr_size = DDR_BUF_SIZE;
+ ddr_ptr = __malloc_ddr(DDR_BUF_SIZE);
+ Buffer buf_DDR(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, DDR_BUF_SIZE, ddr_ptr);
+ __K->setArg(14, buf_DDR);
+ __K->setArg(15, ddr_size);
+
+ /* create a buffer argument to get the return error code from the DSP */
+ int err_code;
+#ifdef __cplusplus
+ Buffer buf_err(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code);
+ __K->setArg(16, buf_err);
#else
- err |= clSetKernelArg(__K, 13, L2_BUF_SIZE, NULL);
+ cl_mem buf_err = clCreateBuffer(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code, &err);
+ TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
+ err |= clSetKernelArg(__K, 16, sizeof(buf_err), &buf_err);
+ TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
#endif
#ifdef __cplusplus
@@ -196,9 +216,15 @@ void cblas_strsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
+ if(err_code != TICBLAS_SUCCESS) {
+ printf("Error code returned by offloaded cblas_strsm is %d\n.", err_code);
+ }
+
ti_cblas_mem_free(msmc_ptr);
+ __free_ddr(ddr_ptr);
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_strsm");
index 19ef805413c1609cdaaa73f4d1bc58df8aab408e..1a7e2096f48a89a55075e2816760173a67ab0b20 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_strsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_strsv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_strsv");
- __real_cblas_strsv(order,Uplo,TransA,Diag,N,A,lda,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_strsv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_strsv");
+ __real_cblas_strsv(order,Uplo,TransA,Diag,N,A,lda,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_strsv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_strsv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_strsv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_STRSV_IDX, "ocl_cblas_strsv");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_STRSV_IDX, "ocl_cblas_strsv");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -157,8 +157,8 @@ void cblas_strsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_strsv");
index 6d9a5a211718a030fce311f10a9cc89939236e16..f7da8da5f5bfcbee9eaa11924c8a55b556cae538 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_xerbla");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_xerbla");
- __real_cblas_xerbla(p,rout,form);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_xerbla", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_xerbla");
+ __real_cblas_xerbla(p,rout,form);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_xerbla", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_xerbla");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_xerbla");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_XERBLA_IDX, "ocl_cblas_xerbla");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_XERBLA_IDX, "ocl_cblas_xerbla");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, p);
#else
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
- TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_errprn");
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_errprn", (float) clock_diff);
+ TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_xerbla");
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_xerbla", (float) clock_diff);
return ;
}
index 623a96fee04f2195283bc54d6b06389c247a2783..c4899aa2aa4e361b238f6d07d600028dddc9e8d2 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_zaxpy");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zaxpy");
- __real_cblas_zaxpy(N,alpha,X,incX,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zaxpy", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zaxpy");
+ __real_cblas_zaxpy(N,alpha,X,incX,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zaxpy", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zaxpy");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zaxpy");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZAXPY_IDX, "ocl_cblas_zaxpy");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZAXPY_IDX, "ocl_cblas_zaxpy");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_zaxpy");
index b4c9f9252381f69d78ecdbe093d9b1a807b4768d..36b75550ec81044c32c1e0f358d180b5d1d07b35 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_zcopy");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zcopy");
- __real_cblas_zcopy(N,X,incX,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zcopy", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zcopy");
+ __real_cblas_zcopy(N,X,incX,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zcopy", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zcopy");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zcopy");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZCOPY_IDX, "ocl_cblas_zcopy");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZCOPY_IDX, "ocl_cblas_zcopy");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_zcopy");
diff --git a/blasblisacc/src/ti_cblas_cblas_zdotc_sub.c b/blasblisacc/src/ti_cblas_cblas_zdotc_sub.c
index 3b2937dcb7c2ebf599bb9d9ed294563388ab76e0..7bc1336cd31f3d5576c4676db93f5301f9cc6b27 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_zdotc_sub");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zdotc_sub");
- __real_cblas_zdotc_sub(N,X,incX,Y,incY,dotc);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zdotc_sub", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zdotc_sub");
+ __real_cblas_zdotc_sub(N,X,incX,Y,incY,dotc);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zdotc_sub", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zdotc_sub");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zdotc_sub");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZDOTC_SUB_IDX, "ocl_cblas_zdotc_sub");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZDOTC_SUB_IDX, "ocl_cblas_zdotc_sub");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_zdotc_sub");
diff --git a/blasblisacc/src/ti_cblas_cblas_zdotu_sub.c b/blasblisacc/src/ti_cblas_cblas_zdotu_sub.c
index 63bd00b5714db92ef0450ec7b9e141a1c23053e2..0327eebe150194bb5ea43796d293af7103a1eb41 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_zdotu_sub");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zdotu_sub");
- __real_cblas_zdotu_sub(N,X,incX,Y,incY,dotu);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zdotu_sub", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zdotu_sub");
+ __real_cblas_zdotu_sub(N,X,incX,Y,incY,dotu);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zdotu_sub", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zdotu_sub");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zdotu_sub");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZDOTU_SUB_IDX, "ocl_cblas_zdotu_sub");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZDOTU_SUB_IDX, "ocl_cblas_zdotu_sub");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_zdotu_sub");
index a81e4b0c730584436ec2c4d056abae2a54ac2856..838c7dea90c45dbcb4ee8e60e6f01c4fb0a8430d 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_zdscal");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zdscal");
- __real_cblas_zdscal(N,alpha,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zdscal", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zdscal");
+ __real_cblas_zdscal(N,alpha,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zdscal", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zdscal");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zdscal");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZDSCAL_IDX, "ocl_cblas_zdscal");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZDSCAL_IDX, "ocl_cblas_zdscal");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_zdscal");
index b35307bacc1fd4a0c28f8ff5213f6bb370886b9e..049ca5811653975b0bce644c61be20cb0356cf9e 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_zgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_zgbmv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zgbmv");
- __real_cblas_zgbmv(order,TransA,M,N,KL,KU,alpha,A,lda,X,incX,beta,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zgbmv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zgbmv");
+ __real_cblas_zgbmv(order,TransA,M,N,KL,KU,alpha,A,lda,X,incX,beta,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zgbmv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zgbmv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zgbmv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZGBMV_IDX, "ocl_cblas_zgbmv");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZGBMV_IDX, "ocl_cblas_zgbmv");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -206,8 +206,8 @@ void cblas_zgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_zgbmv");
index 6c4df83b91f76d60fd0fe26431678f5f1b7b6e57..fc11cb1c7672bc45fc2fbbea949939375d677332 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,21 +45,19 @@ void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_zgemm");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
-
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!zgemm_offload_dsp(Order,M,N,K)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zgemm");
- __real_cblas_zgemm(Order,TransA,TransB,M,N,K,alpha,A,lda,B,ldb,beta,C,ldc);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zgemm", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!zgemm_offload_dsp(Order,M,N,K)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zgemm");
+ __real_cblas_zgemm(Order,TransA,TransB,M,N,K,alpha,A,lda,B,ldb,beta,C,ldc);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zgemm", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zgemm");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zgemm");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZGEMM_IDX, "ocl_cblas_zgemm");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZGEMM_IDX, "ocl_cblas_zgemm");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -197,10 +196,11 @@ void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
#endif
void *msmc_ptr;
+ size_t msmc_size = MSMC_BUF_SIZE;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(14, buf_MSMC);
#else
@@ -212,9 +212,28 @@ void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
#endif
#ifdef __cplusplus
- __K->setArg(15, __local(L2_BUF_SIZE));
+ __K->setArg(15, msmc_size);
#else
- err |= clSetKernelArg(__K, 15, L2_BUF_SIZE, NULL);
+ err |= clSetKernelArg(__K, 15, sizeof(msmc_size), &msmc_size);
+#endif
+
+ void *ddr_ptr;
+ size_t ddr_size = DDR_BUF_SIZE;
+ ddr_ptr = __malloc_ddr(DDR_BUF_SIZE);
+ Buffer buf_DDR(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, DDR_BUF_SIZE, ddr_ptr);
+ __K->setArg(16, buf_DDR);
+ __K->setArg(17, ddr_size);
+
+ /* create a buffer argument to get the return error code from the DSP */
+ int err_code;
+#ifdef __cplusplus
+ Buffer buf_err(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code);
+ __K->setArg(18, buf_err);
+#else
+ cl_mem buf_err = clCreateBuffer(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code, &err);
+ TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
+ err |= clSetKernelArg(__K, 18, sizeof(buf_err), &buf_err);
+ TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
#endif
#ifdef __cplusplus
@@ -228,9 +247,15 @@ void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
+ if(err_code != TICBLAS_SUCCESS) {
+ printf("Error code returned by offloaded cblas_zgemm is %d\n.", err_code);
+ }
+
ti_cblas_mem_free(msmc_ptr);
+ __free_ddr(ddr_ptr);
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_zgemm");
index 529d7fafe29e8fc332fc07526fe25375fccf35f3..2e1dbc1d6209cd2e3c8c340bfca0d54a9c32c716 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_zgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_zgemv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zgemv");
- __real_cblas_zgemv(order,TransA,M,N,alpha,A,lda,X,incX,beta,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zgemv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zgemv");
+ __real_cblas_zgemv(order,TransA,M,N,alpha,A,lda,X,incX,beta,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zgemv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zgemv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zgemv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZGEMV_IDX, "ocl_cblas_zgemv");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZGEMV_IDX, "ocl_cblas_zgemv");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -194,8 +194,8 @@ void cblas_zgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_zgemv");
index c4f9f419bf61727e969074f58df33821bd33e4a4..6292ddccffd4e7509d3e2515c6362177a1937c88 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_zgerc(const enum CBLAS_ORDER order, const int M, const int N, const v
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_zgerc");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zgerc");
- __real_cblas_zgerc(order,M,N,alpha,X,incX,Y,incY,A,lda);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zgerc", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zgerc");
+ __real_cblas_zgerc(order,M,N,alpha,X,incX,Y,incY,A,lda);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zgerc", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zgerc");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zgerc");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZGERC_IDX, "ocl_cblas_zgerc");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZGERC_IDX, "ocl_cblas_zgerc");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -177,8 +177,8 @@ void cblas_zgerc(const enum CBLAS_ORDER order, const int M, const int N, const v
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_zgerc");
index 1ab3344394d4e2a83fe56af07239c3081cdbaac2..8302431314f7ef2f0452302e7771b983121dd136 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_zgeru(const enum CBLAS_ORDER order, const int M, const int N, const v
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_zgeru");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zgeru");
- __real_cblas_zgeru(order,M,N,alpha,X,incX,Y,incY,A,lda);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zgeru", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zgeru");
+ __real_cblas_zgeru(order,M,N,alpha,X,incX,Y,incY,A,lda);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zgeru", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zgeru");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zgeru");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZGERU_IDX, "ocl_cblas_zgeru");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZGERU_IDX, "ocl_cblas_zgeru");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -177,8 +177,8 @@ void cblas_zgeru(const enum CBLAS_ORDER order, const int M, const int N, const v
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_zgeru");
index f572f9a34b55291276e5eb37fa061cf2c9efb0f7..6afd4cbd2c2c27421dceed1e9d16227fe91fda32 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_zhbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_zhbmv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zhbmv");
- __real_cblas_zhbmv(order,Uplo,N,K,alpha,A,lda,X,incX,beta,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zhbmv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zhbmv");
+ __real_cblas_zhbmv(order,Uplo,N,K,alpha,A,lda,X,incX,beta,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zhbmv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zhbmv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zhbmv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZHBMV_IDX, "ocl_cblas_zhbmv");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZHBMV_IDX, "ocl_cblas_zhbmv");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -194,8 +194,8 @@ void cblas_zhbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_zhbmv");
index 2ca9ef5e6aebfeee03c8caa2be09238be90084b4..193b0f345d6d6d8390e39525947a7e6f7961a5db 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,22 +45,19 @@ void cblas_zhemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_zhemm");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!zhemm_offload_dsp(Order,Side,M,N)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zhemm");
-
- __real_cblas_zhemm(Order,Side,Uplo,M,N,alpha,A,lda,B,ldb,beta,C,ldc);
-
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zhemm", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!zhemm_offload_dsp(Order,Side,M,N)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zhemm");
+ __real_cblas_zhemm(Order,Side,Uplo,M,N,alpha,A,lda,B,ldb,beta,C,ldc);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zhemm", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zhemm");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zhemm");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZHEMM_IDX, "ocl_cblas_zhemm");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZHEMM_IDX, "ocl_cblas_zhemm");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -192,10 +190,11 @@ void cblas_zhemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
#endif
void *msmc_ptr;
+ size_t msmc_size = MSMC_BUF_SIZE;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(13, buf_MSMC);
#else
@@ -207,9 +206,28 @@ void cblas_zhemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
#endif
#ifdef __cplusplus
- __K->setArg(14, __local(L2_BUF_SIZE));
+ __K->setArg(14, msmc_size);
#else
- err |= clSetKernelArg(__K, 14, L2_BUF_SIZE, NULL);
+ err |= clSetKernelArg(__K, 14, sizeof(msmc_size), &msmc_size);
+#endif
+
+ void *ddr_ptr;
+ size_t ddr_size = DDR_BUF_SIZE;
+ ddr_ptr = __malloc_ddr(DDR_BUF_SIZE);
+ Buffer buf_DDR(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, DDR_BUF_SIZE, ddr_ptr);
+ __K->setArg(15, buf_DDR);
+ __K->setArg(16, ddr_size);
+
+ /* create a buffer argument to get the return error code from the DSP */
+ int err_code;
+#ifdef __cplusplus
+ Buffer buf_err(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code);
+ __K->setArg(17, buf_err);
+#else
+ cl_mem buf_err = clCreateBuffer(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code, &err);
+ TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
+ err |= clSetKernelArg(__K, 17, sizeof(buf_err), &buf_err);
+ TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
#endif
#ifdef __cplusplus
@@ -223,9 +241,14 @@ void cblas_zhemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
+ if(err_code != TICBLAS_SUCCESS) {
+ printf("Error code returned by offloaded cblas_zhemm is %d\n.", err_code);
+ }
+
ti_cblas_mem_free(msmc_ptr);
+ __free_ddr(ddr_ptr);
ti_cblas_delete_kernel(__K);
index 1327c6357bd7bdc4e81c108cf7f9f94de10a9b39..283bc8d77feec964a2f6205685d33073c0172551 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_zhemv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_zhemv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zhemv");
- __real_cblas_zhemv(order,Uplo,N,alpha,A,lda,X,incX,beta,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zhemv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zhemv");
+ __real_cblas_zhemv(order,Uplo,N,alpha,A,lda,X,incX,beta,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zhemv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zhemv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zhemv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZHEMV_IDX, "ocl_cblas_zhemv");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZHEMV_IDX, "ocl_cblas_zhemv");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -188,8 +188,8 @@ void cblas_zhemv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_zhemv");
index 5097968b4e32cf562db195b0ae7be67f9e35d901..bf0dec2e521f2e40394ebd26ed609a0794301e70 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_zher");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zher");
- __real_cblas_zher(order,Uplo,N,alpha,X,incX,A,lda);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zher", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zher");
+ __real_cblas_zher(order,Uplo,N,alpha,X,incX,A,lda);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zher", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zher");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zher");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZHER_IDX, "ocl_cblas_zher");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZHER_IDX, "ocl_cblas_zher");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_zher");
index c4e640050154811b2a65e60cfcd49d3abd421a53..9b72f952c717e8b87f0e45464343b7e4a45500c5 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_zher2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_zher2");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zher2");
- __real_cblas_zher2(order,Uplo,N,alpha,X,incX,Y,incY,A,lda);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zher2", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zher2");
+ __real_cblas_zher2(order,Uplo,N,alpha,X,incX,Y,incY,A,lda);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zher2", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zher2");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zher2");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZHER2_IDX, "ocl_cblas_zher2");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZHER2_IDX, "ocl_cblas_zher2");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -177,8 +177,8 @@ void cblas_zher2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_zher2");
index 7884ddfe0b6504013137cfaa1e866b9001e6025a..94442469dbf76005979ff4b53146ddbf3c9f4ef8 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_zher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, cons
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_zher2k");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!zher2k_offload_dsp(Order,N,K)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zher2k");
- __real_cblas_zher2k(Order,Uplo,Trans,N,K,alpha,A,lda,B,ldb,beta,C,ldc);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zher2k", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!zher2k_offload_dsp(Order,N,K)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zher2k");
+ __real_cblas_zher2k(Order,Uplo,Trans,N,K,alpha,A,lda,B,ldb,beta,C,ldc);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zher2k", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zher2k");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zher2k");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZHER2K_IDX, "ocl_cblas_zher2k");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZHER2K_IDX, "ocl_cblas_zher2k");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -185,24 +185,44 @@ void cblas_zher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, cons
#endif
void *msmc_ptr;
+ size_t msmc_size = MSMC_BUF_SIZE;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(13, buf_MSMC);
#else
//cl_mem buf_MSMC = clCreateBuffer(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE, NULL, &err);
- cl_mem buf_MSMC = clCreateBuffer(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr, &err);
+ cl_mem buf_MSMC = clCreateBuffer(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr, &err);
TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
err |= clSetKernelArg(__K, 13, sizeof(buf_MSMC), &buf_MSMC);
TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
#endif
#ifdef __cplusplus
- __K->setArg(14, __local(L2_BUF_SIZE));
+ __K->setArg(14, msmc_size);
#else
- err |= clSetKernelArg(__K, 14, L2_BUF_SIZE, NULL);
+ err |= clSetKernelArg(__K, 14, sizeof(msmc_size), &msmc_size);
+#endif
+
+ void *ddr_ptr;
+ size_t ddr_size = DDR_BUF_SIZE;
+ ddr_ptr = __malloc_ddr(DDR_BUF_SIZE);
+ Buffer buf_DDR(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, DDR_BUF_SIZE, ddr_ptr);
+ __K->setArg(15, buf_DDR);
+ __K->setArg(16, ddr_size);
+
+ /* create a buffer argument to get the return error code from the DSP */
+ int err_code;
+#ifdef __cplusplus
+ Buffer buf_err(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code);
+ __K->setArg(17, buf_err);
+#else
+ cl_mem buf_err = clCreateBuffer(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code, &err);
+ TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
+ err |= clSetKernelArg(__K, 17, sizeof(buf_err), &buf_err);
+ TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
#endif
#ifdef __cplusplus
@@ -216,9 +236,14 @@ void cblas_zher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, cons
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
+ if(err_code != TICBLAS_SUCCESS) {
+ printf("Error code returned by offloaded cblas_zher2k is %d\n.", err_code);
+ }
+
ti_cblas_mem_free(msmc_ptr);
+ __free_ddr(ddr_ptr);
ti_cblas_delete_kernel(__K);
index 2285f9b02c4bdb19edc81aa1abc498c07ec0d2c8..651fab2d45c10ff4ff177a89ac8089699d99aaec 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_zherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_zherk");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!zherk_offload_dsp(Order,N,K)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zherk");
- __real_cblas_zherk(Order,Uplo,Trans,N,K,alpha,A,lda,beta,C,ldc);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zherk", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!zherk_offload_dsp(Order,N,K)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zherk");
+ __real_cblas_zherk(Order,Uplo,Trans,N,K,alpha,A,lda,beta,C,ldc);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zherk", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zherk");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zherk");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZHERK_IDX, "ocl_cblas_zherk");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZHERK_IDX, "ocl_cblas_zherk");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -159,10 +159,11 @@ void cblas_zherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const
#endif
void *msmc_ptr;
+ size_t msmc_size = MSMC_BUF_SIZE;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(11, buf_MSMC);
#else
@@ -174,9 +175,28 @@ void cblas_zherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const
#endif
#ifdef __cplusplus
- __K->setArg(12, __local(L2_BUF_SIZE));
+ __K->setArg(12, msmc_size);
+#else
+ err |= clSetKernelArg(__K, 12, sizeof(msmc_size), &msmc_size);
+#endif
+
+ void *ddr_ptr;
+ size_t ddr_size = DDR_BUF_SIZE;
+ ddr_ptr = __malloc_ddr(DDR_BUF_SIZE);
+ Buffer buf_DDR(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, DDR_BUF_SIZE, ddr_ptr);
+ __K->setArg(13, buf_DDR);
+ __K->setArg(14, ddr_size);
+
+ /* create a buffer argument to get the return error code from the DSP */
+ int err_code;
+#ifdef __cplusplus
+ Buffer buf_err(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code);
+ __K->setArg(15, buf_err);
#else
- err |= clSetKernelArg(__K, 12, L2_BUF_SIZE, NULL);
+ cl_mem buf_err = clCreateBuffer(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code, &err);
+ TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
+ err |= clSetKernelArg(__K, 15, sizeof(buf_err), &buf_err);
+ TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
#endif
#ifdef __cplusplus
@@ -190,9 +210,15 @@ void cblas_zherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
+ if(err_code != TICBLAS_SUCCESS) {
+ printf("Error code returned by offloaded cblas_zherk is %d\n.", err_code);
+ }
+
ti_cblas_mem_free(msmc_ptr);
+ __free_ddr(ddr_ptr);
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_zherk");
index 3c6b71974bb5d0220f36621f332a3d52889765f3..8c14a93696a54038c91b093be502852b680fd6d9 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_zhpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_zhpmv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zhpmv");
- __real_cblas_zhpmv(order,Uplo,N,alpha,Ap,X,incX,beta,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zhpmv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zhpmv");
+ __real_cblas_zhpmv(order,Uplo,N,alpha,Ap,X,incX,beta,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zhpmv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zhpmv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zhpmv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZHPMV_IDX, "ocl_cblas_zhpmv");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZHPMV_IDX, "ocl_cblas_zhpmv");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -182,8 +182,8 @@ void cblas_zhpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_zhpmv");
index 0288c7b0abc0520ffc6d2f36147afd9eda8eed45..ff810ec1ded88adb08f1f8aca3d66df6308abfc1 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_zhpr");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zhpr");
- __real_cblas_zhpr(order,Uplo,N,alpha,X,incX,A);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zhpr", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zhpr");
+ __real_cblas_zhpr(order,Uplo,N,alpha,X,incX,A);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zhpr", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zhpr");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zhpr");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZHPR_IDX, "ocl_cblas_zhpr");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZHPR_IDX, "ocl_cblas_zhpr");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_zhpr");
index d8f30b7382a4e5a9a060a87a34c59d40cbe26578..33c9fb6149e06308ed18df853a188c5b1ddba158 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_zhpr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_zhpr2");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zhpr2");
- __real_cblas_zhpr2(order,Uplo,N,alpha,X,incX,Y,incY,Ap);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zhpr2", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zhpr2");
+ __real_cblas_zhpr2(order,Uplo,N,alpha,X,incX,Y,incY,Ap);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zhpr2", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zhpr2");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zhpr2");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZHPR2_IDX, "ocl_cblas_zhpr2");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZHPR2_IDX, "ocl_cblas_zhpr2");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -171,8 +171,8 @@ void cblas_zhpr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_zhpr2");
index dd4a8d62fce1bcabf4b9e25ebf03753ae5c612dd..f13b5c94c59ca73113957901051ee8933b2fc773 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_zrotg");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zrotg");
- __real_cblas_zrotg(a,b,c,s);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zrotg", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zrotg");
+ __real_cblas_zrotg(a,b,c,s);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zrotg", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zrotg");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zrotg");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZROTG_IDX, "ocl_cblas_zrotg");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZROTG_IDX, "ocl_cblas_zrotg");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
Buffer buf_a(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(float), (void *)a);
__K->setArg(0, buf_a);
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_zrotg");
index 47ab78a9bdc32974a73529b1e97cf995591b284a..c54445b49c073aeab180876b1bf32d1dc18e0f4f 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_zscal");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zscal");
- __real_cblas_zscal(N,alpha,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zscal", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zscal");
+ __real_cblas_zscal(N,alpha,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zscal", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zscal");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zscal");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZSCAL_IDX, "ocl_cblas_zscal");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZSCAL_IDX, "ocl_cblas_zscal");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_zscal");
index e8996851650d88b0476d58aa1478529cb24766bd..d4855ae08da00f6abf1519179d23430905a8827c 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
* the offload of this routine to the DSP is disabled.
*/
#ifndef TI_CBLAS_SWAP_ENABLE_OFFLOAD
- TI_CBLAS_PROFILE_START();
+ TI_CBLAS_PROFILE_START();
TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zswap");
- __real_cblas_zswap(N,X,incX,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zswap", (float) clock_diff);
+ __real_cblas_zswap(N,X,incX,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zswap", (float) clock_diff);
return ;
#else
- TI_CBLAS_PROFILE_START();
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zswap");
- __real_cblas_zswap(N,X,incX,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zswap", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
-
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zswap");
+ __real_cblas_zswap(N,X,incX,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zswap", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zswap");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zswap");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZSWAP_IDX, "ocl_cblas_zswap");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZSWAP_IDX, "ocl_cblas_zswap");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
@@ -144,10 +144,10 @@ void cblas_zswap(const int N, void *X, const int incX, void *Y, const int incY)
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_zswap");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_zswap", (float) clock_diff);
return ;
index 84b891f7c6bcec1cd6a3a1b342af7dfe2da69197..cd9731d6cb66008a89a604a38f9e28c3189229bd 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_zsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_zsymm");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!zsymm_offload_dsp(Order,Side,M,N)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zsymm");
- __real_cblas_zsymm(Order,Side,Uplo,M,N,alpha,A,lda,B,ldb,beta,C,ldc);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zsymm", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!zsymm_offload_dsp(Order,Side,M,N)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zsymm");
+ __real_cblas_zsymm(Order,Side,Uplo,M,N,alpha,A,lda,B,ldb,beta,C,ldc);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zsymm", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zsymm");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zsymm");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZSYMM_IDX, "ocl_cblas_zsymm");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZSYMM_IDX, "ocl_cblas_zsymm");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -188,11 +188,13 @@ void cblas_zsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
#else
err |= clSetKernelArg(__K, 12, sizeof(ldc), &ldc);
#endif
+
void *msmc_ptr;
+ size_t msmc_size = MSMC_BUF_SIZE;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(13, buf_MSMC);
#else
@@ -204,9 +206,28 @@ void cblas_zsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
#endif
#ifdef __cplusplus
- __K->setArg(14, __local(L2_BUF_SIZE));
+ __K->setArg(14, msmc_size);
#else
- err |= clSetKernelArg(__K, 14, L2_BUF_SIZE, NULL);
+ err |= clSetKernelArg(__K, 14, sizeof(msmc_size), &msmc_size);
+#endif
+
+ void *ddr_ptr;
+ size_t ddr_size = DDR_BUF_SIZE;
+ ddr_ptr = __malloc_ddr(DDR_BUF_SIZE);
+ Buffer buf_DDR(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, DDR_BUF_SIZE, ddr_ptr);
+ __K->setArg(15, buf_DDR);
+ __K->setArg(16, ddr_size);
+
+ /* create a buffer argument to get the return error code from the DSP */
+ int err_code;
+#ifdef __cplusplus
+ Buffer buf_err(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code);
+ __K->setArg(17, buf_err);
+#else
+ cl_mem buf_err = clCreateBuffer(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code, &err);
+ TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
+ err |= clSetKernelArg(__K, 17, sizeof(buf_err), &buf_err);
+ TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
#endif
#ifdef __cplusplus
@@ -220,9 +241,14 @@ void cblas_zsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
+ if(err_code != TICBLAS_SUCCESS) {
+ printf("Error code returned by offloaded cblas_zsymm is %d\n.", err_code);
+ }
+
ti_cblas_mem_free(msmc_ptr);
+ __free_ddr(ddr_ptr);
ti_cblas_delete_kernel(__K);
index a1974118b4a308d3b030701de7882ca3861182e2..5c83eedc82d6d7212f1fd816652debc8dce66dc8 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_zsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, cons
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_zsyr2k");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!zsyr2k_offload_dsp(Order,N,K)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zsyr2k");
- __real_cblas_zsyr2k(Order,Uplo,Trans,N,K,alpha,A,lda,B,ldb,beta,C,ldc);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zsyr2k", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!zsyr2k_offload_dsp(Order,N,K)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zsyr2k");
+ __real_cblas_zsyr2k(Order,Uplo,Trans,N,K,alpha,A,lda,B,ldb,beta,C,ldc);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zsyr2k", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zsyr2k");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zsyr2k");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZSYR2K_IDX, "ocl_cblas_zsyr2k");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZSYR2K_IDX, "ocl_cblas_zsyr2k");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -188,11 +188,13 @@ void cblas_zsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, cons
#else
err |= clSetKernelArg(__K, 12, sizeof(ldc), &ldc);
#endif
+
void *msmc_ptr;
+ size_t msmc_size = MSMC_BUF_SIZE;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(13, buf_MSMC);
#else
@@ -204,9 +206,28 @@ void cblas_zsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, cons
#endif
#ifdef __cplusplus
- __K->setArg(14, __local(L2_BUF_SIZE));
+ __K->setArg(14, msmc_size);
#else
- err |= clSetKernelArg(__K, 14, L2_BUF_SIZE, NULL);
+ err |= clSetKernelArg(__K, 14, sizeof(msmc_size), &msmc_size);
+#endif
+
+ void *ddr_ptr;
+ size_t ddr_size = DDR_BUF_SIZE;
+ ddr_ptr = __malloc_ddr(DDR_BUF_SIZE);
+ Buffer buf_DDR(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, DDR_BUF_SIZE, ddr_ptr);
+ __K->setArg(15, buf_DDR);
+ __K->setArg(16, ddr_size);
+
+ /* create a buffer argument to get the return error code from the DSP */
+ int err_code;
+#ifdef __cplusplus
+ Buffer buf_err(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code);
+ __K->setArg(17, buf_err);
+#else
+ cl_mem buf_err = clCreateBuffer(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code, &err);
+ TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
+ err |= clSetKernelArg(__K, 17, sizeof(buf_err), &buf_err);
+ TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
#endif
#ifdef __cplusplus
@@ -220,9 +241,14 @@ void cblas_zsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, cons
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
+ if(err_code != TICBLAS_SUCCESS) {
+ printf("Error code returned by offloaded cblas_zsyr2k is %d\n.", err_code);
+ }
+
ti_cblas_mem_free(msmc_ptr);
+ __free_ddr(ddr_ptr);
ti_cblas_delete_kernel(__K);
index 562202f8c988e0bb3debf15a4620b7f846439d37..49622bba4be258772826ffaff90d3e66818d9739 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_zsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_zsyrk");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!zsyrk_offload_dsp(Order,N,K)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zsyrk");
- __real_cblas_zsyrk(Order,Uplo,Trans,N,K,alpha,A,lda,beta,C,ldc);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zsyrk", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!zsyrk_offload_dsp(Order,N,K)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_zsyrk");
+ __real_cblas_zsyrk(Order,Uplo,Trans,N,K,alpha,A,lda,beta,C,ldc);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_zsyrk", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zsyrk");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_zsyrk");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZSYRK_IDX, "ocl_cblas_zsyrk");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZSYRK_IDX, "ocl_cblas_zsyrk");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -167,11 +167,13 @@ void cblas_zsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const
#else
err |= clSetKernelArg(__K, 10, sizeof(ldc), &ldc);
#endif
+
void *msmc_ptr;
+ size_t msmc_size = MSMC_BUF_SIZE;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(11, buf_MSMC);
#else
@@ -183,9 +185,28 @@ void cblas_zsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const
#endif
#ifdef __cplusplus
- __K->setArg(12, __local(L2_BUF_SIZE));
+ __K->setArg(12, msmc_size);
#else
- err |= clSetKernelArg(__K, 12, L2_BUF_SIZE, NULL);
+ err |= clSetKernelArg(__K, 12, sizeof(msmc_size), &msmc_size);
+#endif
+
+ void *ddr_ptr;
+ size_t ddr_size = DDR_BUF_SIZE;
+ ddr_ptr = __malloc_ddr(DDR_BUF_SIZE);
+ Buffer buf_DDR(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, DDR_BUF_SIZE, ddr_ptr);
+ __K->setArg(13, buf_DDR);
+ __K->setArg(14, ddr_size);
+
+ /* create a buffer argument to get the return error code from the DSP */
+ int err_code;
+#ifdef __cplusplus
+ Buffer buf_err(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code);
+ __K->setArg(15, buf_err);
+#else
+ cl_mem buf_err = clCreateBuffer(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code, &err);
+ TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
+ err |= clSetKernelArg(__K, 15, sizeof(buf_err), &buf_err);
+ TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
#endif
#ifdef __cplusplus
@@ -199,9 +220,15 @@ void cblas_zsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
+ if(err_code != TICBLAS_SUCCESS) {
+ printf("Error code returned by offloaded cblas_zsyrk is %d\n.", err_code);
+ }
+
ti_cblas_mem_free(msmc_ptr);
+ __free_ddr(ddr_ptr);
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_zsyrk");
index f4551f9c3de526c57f79a9f0f27ecac0de42e4c9..f17892612f30f69f0a03cbc6555baf5727aa8989 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_ztbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_ztbmv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ztbmv");
- __real_cblas_ztbmv(order,Uplo,TransA,Diag,N,K,A,lda,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ztbmv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ztbmv");
+ __real_cblas_ztbmv(order,Uplo,TransA,Diag,N,K,A,lda,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ztbmv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ztbmv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ztbmv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZTBMV_IDX, "ocl_cblas_ztbmv");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZTBMV_IDX, "ocl_cblas_ztbmv");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -163,8 +163,8 @@ void cblas_ztbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_ztbmv");
index 80cdb22a3d7d00fb6068fbb2d5a981e743cf9c5f..6202e8d8ed3c0fcbb7026b62638994eb4fb575a6 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_ztbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_ztbsv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ztbsv");
- __real_cblas_ztbsv(order,Uplo,TransA,Diag,N,K,A,lda,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ztbsv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ztbsv");
+ __real_cblas_ztbsv(order,Uplo,TransA,Diag,N,K,A,lda,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ztbsv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ztbsv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ztbsv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZTBSV_IDX, "ocl_cblas_ztbsv");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZTBSV_IDX, "ocl_cblas_ztbsv");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -163,8 +163,8 @@ void cblas_ztbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_ztbsv");
index 8198ad0898e672d632f0bb97c82f9ceed622789f..9d26625f0c751cac87d6cd0e6f3f17a86f3da394 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_ztpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_ztpmv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ztpmv");
- __real_cblas_ztpmv(order,Uplo,TransA,Diag,N,Ap,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ztpmv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ztpmv");
+ __real_cblas_ztpmv(order,Uplo,TransA,Diag,N,Ap,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ztpmv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ztpmv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ztpmv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZTPMV_IDX, "ocl_cblas_ztpmv");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZTPMV_IDX, "ocl_cblas_ztpmv");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -151,8 +151,8 @@ void cblas_ztpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_ztpmv");
index 53df46db9d7d4b428d8ebf3ff6fb5c6a8af54357..1c47aa7e9bdded2604a2a9611b77d46595d8c12a 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_ztpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_ztpsv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ztpsv");
- __real_cblas_ztpsv(order,Uplo,TransA,Diag,N,Ap,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ztpsv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ztpsv");
+ __real_cblas_ztpsv(order,Uplo,TransA,Diag,N,Ap,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ztpsv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ztpsv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ztpsv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZTPSV_IDX, "ocl_cblas_ztpsv");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZTPSV_IDX, "ocl_cblas_ztpsv");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -151,8 +151,8 @@ void cblas_ztpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_ztpsv");
index 20095bbd5c99300667b2b22a3919cc8d4f2e7360..4953031e56496c0551003e67f2f225a8279dbcb7 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_ztrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_ztrmm");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!ztrmm_offload_dsp(Order,Side,M,N)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ztrmm");
- __real_cblas_ztrmm(Order,Side,Uplo,TransA,Diag,M,N,alpha,A,lda,B,ldb);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ztrmm", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!ztrmm_offload_dsp(Order,Side,M,N)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ztrmm");
+ __real_cblas_ztrmm(Order,Side,Uplo,TransA,Diag,M,N,alpha,A,lda,B,ldb);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ztrmm", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ztrmm");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ztrmm");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZTRMM_IDX, "ocl_cblas_ztrmm");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZTRMM_IDX, "ocl_cblas_ztrmm");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -170,10 +170,11 @@ void cblas_ztrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
#endif
void *msmc_ptr;
+ size_t msmc_size = MSMC_BUF_SIZE;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(12, buf_MSMC);
#else
@@ -185,9 +186,28 @@ void cblas_ztrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
#endif
#ifdef __cplusplus
- __K->setArg(13, __local(L2_BUF_SIZE));
+ __K->setArg(13, msmc_size);
#else
- err |= clSetKernelArg(__K, 13, L2_BUF_SIZE, NULL);
+ err |= clSetKernelArg(__K, 13, sizeof(msmc_size), &msmc_size);
+#endif
+
+ void *ddr_ptr;
+ size_t ddr_size = DDR_BUF_SIZE;
+ ddr_ptr = __malloc_ddr(DDR_BUF_SIZE);
+ Buffer buf_DDR(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, DDR_BUF_SIZE, ddr_ptr);
+ __K->setArg(14, buf_DDR);
+ __K->setArg(15, ddr_size);
+
+ /* create a buffer argument to get the return error code from the DSP */
+ int err_code;
+#ifdef __cplusplus
+ Buffer buf_err(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code);
+ __K->setArg(16, buf_err);
+#else
+ cl_mem buf_err = clCreateBuffer(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code, &err);
+ TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
+ err |= clSetKernelArg(__K, 16, sizeof(buf_err), &buf_err);
+ TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
#endif
#ifdef __cplusplus
@@ -201,10 +221,17 @@ void cblas_ztrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
+ /* Report a non-success status returned from the DSP via the err_code buffer. */
+ if(err_code != TICBLAS_SUCCESS) {
+ printf("Error code returned by offloaded cblas_ztrmm is %d.\n", err_code);
+ }
+
ti_cblas_mem_free(msmc_ptr);
+ __free_ddr(ddr_ptr);
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_ztrmm");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_ztrmm", (float) clock_diff);
return ;
index 14c5990682cf43f1dfafe520c4e6fc8bcd3e5f14..e3f466cdbb08267fe2c6998ffd7c440a0593cc71 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_ztrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_ztrmv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ztrmv");
- __real_cblas_ztrmv(order,Uplo,TransA,Diag,N,A,lda,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ztrmv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ztrmv");
+ __real_cblas_ztrmv(order,Uplo,TransA,Diag,N,A,lda,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ztrmv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ztrmv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ztrmv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZTRMV_IDX, "ocl_cblas_ztrmv");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZTRMV_IDX, "ocl_cblas_ztrmv");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -157,8 +157,8 @@ void cblas_ztrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_ztrmv");
index 6c4c5c68165f7617ffec1ecbab625f0b6f4a909a..1be9ded6fa0b7171f9a4875c196cf1db10a1cb6b 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_ztrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_ztrsm");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!ztrsm_offload_dsp(Order,Side,M,N)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ztrsm");
- __real_cblas_ztrsm(Order,Side,Uplo,TransA,Diag,M,N,alpha,A,lda,B,ldb);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ztrsm", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!ztrsm_offload_dsp(Order,Side,M,N)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ztrsm");
+ __real_cblas_ztrsm(Order,Side,Uplo,TransA,Diag,M,N,alpha,A,lda,B,ldb);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ztrsm", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ztrsm");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ztrsm");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZTRSM_IDX, "ocl_cblas_ztrsm");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZTRSM_IDX, "ocl_cblas_ztrsm");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -170,10 +170,11 @@ void cblas_ztrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
#endif
void *msmc_ptr;
+ size_t msmc_size = MSMC_BUF_SIZE;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(12, buf_MSMC);
#else
@@ -185,9 +186,28 @@ void cblas_ztrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
#endif
#ifdef __cplusplus
- __K->setArg(13, __local(L2_BUF_SIZE));
+ __K->setArg(13, msmc_size);
+#else
+ err |= clSetKernelArg(__K, 13, sizeof(msmc_size), &msmc_size);
+#endif
+
+ /* DDR scratch buffer passed to the kernel as argument 14, with its size as argument 15. */
+ void *ddr_ptr;
+ size_t ddr_size = DDR_BUF_SIZE;
+ ddr_ptr = __malloc_ddr(DDR_BUF_SIZE);
+#ifdef __cplusplus
+ Buffer buf_DDR(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, DDR_BUF_SIZE, ddr_ptr);
+ __K->setArg(14, buf_DDR);
+ __K->setArg(15, ddr_size);
+#else
+ /* C build: same arguments set through the OpenCL C API, following the buf_err code that comes next. */
+ /* NOTE(review): no clReleaseMemObject for buf_DDR is visible in this hunk - confirm where C-path buffers are released. */
+ cl_mem buf_DDR = clCreateBuffer(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, DDR_BUF_SIZE, ddr_ptr, &err);
+ TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
+ err |= clSetKernelArg(__K, 14, sizeof(buf_DDR), &buf_DDR);
+ err |= clSetKernelArg(__K, 15, sizeof(ddr_size), &ddr_size);
+ TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
+#endif
+
+ /* create a buffer argument to get the return error code from the DSP */
+ int err_code;
+#ifdef __cplusplus
+ Buffer buf_err(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code);
+ __K->setArg(16, buf_err);
#else
- err |= clSetKernelArg(__K, 13, L2_BUF_SIZE, NULL);
+ cl_mem buf_err = clCreateBuffer(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code, &err);
+ TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
+ err |= clSetKernelArg(__K, 16, sizeof(buf_err), &buf_err);
+ TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
#endif
#ifdef __cplusplus
@@ -201,9 +221,15 @@ void cblas_ztrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
+ /* Report a non-success status returned from the DSP via the err_code buffer. */
+ if(err_code != TICBLAS_SUCCESS) {
+ printf("Error code returned by offloaded cblas_ztrsm is %d.\n", err_code);
+ }
+
ti_cblas_mem_free(msmc_ptr);
+ __free_ddr(ddr_ptr);
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_ztrsm");
index 498c208dd49e74436c5f268921f96abb0800ad48..558a8873098fa7924ce835ed274f00777f8285ae 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_ztrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_ztrsv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic conditional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ztrsv");
- __real_cblas_ztrsv(order,Uplo,TransA,Diag,N,A,lda,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ztrsv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ztrsv");
+ __real_cblas_ztrsv(order,Uplo,TransA,Diag,N,A,lda,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ztrsv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ztrsv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ztrsv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZTRSV_IDX, "ocl_cblas_ztrsv");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_ZTRSV_IDX, "ocl_cblas_ztrsv");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -157,8 +157,8 @@ void cblas_ztrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_ztrsv");
index 9b3a59085359e01e800f82da4b03a9a97cb1b700..7dcdbc968822268fb21b4380dc46d889af62f508 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
#include <pthread.h>
#ifdef TI_CBLAS_FAT_BINARY
if(ti_cblas_init_done == 0)
return 0;
- //r_val = ti_blis_finalize();
+ r_val = ti_blis_finalize();
/*Using same name as ti_cblas_init critical region. See notes in bli_init*/
#pragma omp critical (ti_cblas_init_critical)
{
pthread_mutex_init(&MUTEX, 0);
TI_CBLAS_DEBUG_PRINT("Pthreads initialized\n");
- //TI_CBLAS_DEBUG_PRINT("Initializing BLIS\n");
- //ti_blis_init();
- //TI_CBLAS_DEBUG_PRINT("BLIS initialized\n");
+ TI_CBLAS_DEBUG_PRINT("Initializing BLIS\n");
+ ti_blis_init();
+ TI_CBLAS_DEBUG_PRINT("BLIS initialized\n");
atexit(ti_cblas_auto_finalize);
index 8745a47d4d13fbadaebe967883cb627fc6d1d635..36b6a4da1e4d1a63565c69096a8ccf320ee24249 100644 (file)
enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132};
enum CBLAS_SIDE {CblasLeft=141, CblasRight=142};
+
void ti_bli_init_dsp(global char *l3_buf, local char *l2_buf);
kernel void ocl_bli_init(global char *l3_buf, local char *l2_buf)
{ ti_bli_init_dsp(l3_buf, l2_buf); }
void ti_bli_finalize_dsp(void);
kernel void ocl_bli_finalize(void)
{ ti_bli_finalize_dsp(); }
+
void cblas_caxpy_facade(const int N, global const void *alpha, global const void *X, const int incX, global void *Y, const int incY);
kernel void ocl_cblas_caxpy(const int N, global const void *alpha, global const void *X, const int incX, global void *Y, const int incY)
{ cblas_caxpy_facade(N, alpha, X, incX, Y, incY); }
void cblas_cgbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, const int KL, const int KU, global const void *alpha, global const void *A, const int lda, global const void *X, const int incX, global const void *beta, global void *Y, const int incY);
kernel void ocl_cblas_cgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, const int KL, const int KU, global const void *alpha, global const void *A, const int lda, global const void *X, const int incX, global const void *beta, global void *Y, const int incY)
{ cblas_cgbmv_facade(order, TransA, M, N, KL, KU, alpha, A, lda, X, incX, beta, Y, incY); }
-void cblas_cgemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, global const void *alpha, global const void *A, const int lda, global const void *B, const int ldb, global const void *beta, global void *C, const int ldc, global double *l3_buf, local double *l2_buf_loc);
-kernel void ocl_cblas_cgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, global const void *alpha, global const void *A, const int lda, global const void *B, const int ldb, global const void *beta, global void *C, const int ldc, global double *l3_buf, local double *l2_buf_loc)
-{ cblas_cgemm_facade(Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc, l3_buf, l2_buf_loc); }
+void cblas_cgemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, global const void *alpha, global const void *A, const int lda, global const void *B, const int ldb, global const void *beta, global void *C, const int ldc, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code);
+kernel void ocl_cblas_cgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, global const void *alpha, global const void *A, const int lda, global const void *B, const int ldb, global const void *beta, global void *C, const int ldc, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code)
+{ cblas_cgemm_facade(Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc, l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, err_code); }
void cblas_cgemv_facade(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, global const void *alpha, global const void *A, const int lda, global const void *X, const int incX, global const void *beta, global void *Y, const int incY);
kernel void ocl_cblas_cgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, global const void *alpha, global const void *A, const int lda, global const void *X, const int incX, global const void *beta, global void *Y, const int incY)
{ cblas_cgemv_facade(order, TransA, M, N, alpha, A, lda, X, incX, beta, Y, incY); }
void cblas_chbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const int K, global const void *alpha, global const void *A, const int lda, global const void *X, const int incX, global const void *beta, global void *Y, const int incY);
kernel void ocl_cblas_chbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const int K, global const void *alpha, global const void *A, const int lda, global const void *X, const int incX, global const void *beta, global void *Y, const int incY)
{ cblas_chbmv_facade(order, Uplo, N, K, alpha, A, lda, X, incX, beta, Y, incY); }
-void cblas_chemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, global const void *alpha, global const void *A, const int lda, global const void *B, const int ldb, global const void *beta, global void *C, const int ldc, global double *l3_buf, local double *l2_buf_loc);
-kernel void ocl_cblas_chemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, global const void *alpha, global const void *A, const int lda, global const void *B, const int ldb, global const void *beta, global void *C, const int ldc, global double *l3_buf, local double *l2_buf_loc)
-{ cblas_chemm_facade(Order, Side, Uplo, M, N, alpha, A, lda, B, ldb, beta, C, ldc, l3_buf, l2_buf_loc); }
+void cblas_chemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, global const void *alpha, global const void *A, const int lda, global const void *B, const int ldb, global const void *beta, global void *C, const int ldc, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code);
+kernel void ocl_cblas_chemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, global const void *alpha, global const void *A, const int lda, global const void *B, const int ldb, global const void *beta, global void *C, const int ldc, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code)
+{ cblas_chemm_facade(Order, Side, Uplo, M, N, alpha, A, lda, B, ldb, beta, C, ldc, l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, err_code); }
void cblas_chemv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, global const void *alpha, global const void *A, const int lda, global const void *X, const int incX, global const void *beta, global void *Y, const int incY);
kernel void ocl_cblas_chemv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, global const void *alpha, global const void *A, const int lda, global const void *X, const int incX, global const void *beta, global void *Y, const int incY)
{ cblas_chemv_facade(order, Uplo, N, alpha, A, lda, X, incX, beta, Y, incY); }
@@ -82,12 +84,12 @@ kernel void ocl_cblas_cher(const enum CBLAS_ORDER order, const enum CBLAS_UPLO U
void cblas_cher2_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, global const void *alpha, global const void *X, const int incX, global const void *Y, const int incY, global void *A, const int lda);
kernel void ocl_cblas_cher2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, global const void *alpha, global const void *X, const int incX, global const void *Y, const int incY, global void *A, const int lda)
{ cblas_cher2_facade(order, Uplo, N, alpha, X, incX, Y, incY, A, lda); }
-void cblas_cher2k_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, global const void *alpha, global const void *A, const int lda, global const void *B, const int ldb, const float beta, global void *C, const int ldc, global double *l3_buf, local double *l2_buf_loc);
-kernel void ocl_cblas_cher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, global const void *alpha, global const void *A, const int lda, global const void *B, const int ldb, const float beta, global void *C, const int ldc, global double *l3_buf, local double *l2_buf_loc)
-{ cblas_cher2k_facade(Order, Uplo, Trans, N, K, alpha, A, lda, B, ldb, beta, C, ldc, l3_buf, l2_buf_loc); }
-void cblas_cherk_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const float alpha, global const void *A, const int lda, const float beta, global void *C, const int ldc, global double *l3_buf, local double *l2_buf_loc);
-kernel void ocl_cblas_cherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const float alpha, global const void *A, const int lda, const float beta, global void *C, const int ldc, global double *l3_buf, local double *l2_buf_loc)
-{ cblas_cherk_facade(Order, Uplo, Trans, N, K, alpha, A, lda, beta, C, ldc, l3_buf, l2_buf_loc); }
+void cblas_cher2k_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, global const void *alpha, global const void *A, const int lda, global const void *B, const int ldb, const float beta, global void *C, const int ldc, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code);
+kernel void ocl_cblas_cher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, global const void *alpha, global const void *A, const int lda, global const void *B, const int ldb, const float beta, global void *C, const int ldc, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code)
+{ cblas_cher2k_facade(Order, Uplo, Trans, N, K, alpha, A, lda, B, ldb, beta, C, ldc, l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, err_code); }
+void cblas_cherk_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const float alpha, global const void *A, const int lda, const float beta, global void *C, const int ldc, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code);
+kernel void ocl_cblas_cherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const float alpha, global const void *A, const int lda, const float beta, global void *C, const int ldc, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code)
+{ cblas_cherk_facade(Order, Uplo, Trans, N, K, alpha, A, lda, beta, C, ldc, l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, err_code); }
void cblas_chpmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, global const void *alpha, global const void *Ap, global const void *X, const int incX, global const void *beta, global void *Y, const int incY);
kernel void ocl_cblas_chpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, global const void *alpha, global const void *Ap, global const void *X, const int incX, global const void *beta, global void *Y, const int incY)
{ cblas_chpmv_facade(order, Uplo, N, alpha, Ap, X, incX, beta, Y, incY); }
@@ -109,15 +111,15 @@ kernel void ocl_cblas_csscal(const int N, const float alpha, global void *X, con
void cblas_cswap_facade(const int N, global void *X, const int incX, global void *Y, const int incY);
kernel void ocl_cblas_cswap(const int N, global void *X, const int incX, global void *Y, const int incY)
{ cblas_cswap_facade(N, X, incX, Y, incY); }
-void cblas_csymm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, global const void *alpha, global const void *A, const int lda, global const void *B, const int ldb, global const void *beta, global void *C, const int ldc, global double *l3_buf, local double *l2_buf_loc);
-kernel void ocl_cblas_csymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, global const void *alpha, global const void *A, const int lda, global const void *B, const int ldb, global const void *beta, global void *C, const int ldc, global double *l3_buf, local double *l2_buf_loc)
-{ cblas_csymm_facade(Order, Side, Uplo, M, N, alpha, A, lda, B, ldb, beta, C, ldc, l3_buf, l2_buf_loc); }
-void cblas_csyr2k_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, global const void *alpha, global const void *A, const int lda, global const void *B, const int ldb, global const void *beta, global void *C, const int ldc, global double *l3_buf, local double *l2_buf_loc);
-kernel void ocl_cblas_csyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, global const void *alpha, global const void *A, const int lda, global const void *B, const int ldb, global const void *beta, global void *C, const int ldc, global double *l3_buf, local double *l2_buf_loc)
-{ cblas_csyr2k_facade(Order, Uplo, Trans, N, K, alpha, A, lda, B, ldb, beta, C, ldc, l3_buf, l2_buf_loc); }
-void cblas_csyrk_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, global const void *alpha, global const void *A, const int lda, global const void *beta, global void *C, const int ldc, global double *l3_buf, local double *l2_buf_loc);
-kernel void ocl_cblas_csyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, global const void *alpha, global const void *A, const int lda, global const void *beta, global void *C, const int ldc, global double *l3_buf, local double *l2_buf_loc)
-{ cblas_csyrk_facade(Order, Uplo, Trans, N, K, alpha, A, lda, beta, C, ldc, l3_buf, l2_buf_loc); }
+void cblas_csymm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, global const void *alpha, global const void *A, const int lda, global const void *B, const int ldb, global const void *beta, global void *C, const int ldc, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code);
+kernel void ocl_cblas_csymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, global const void *alpha, global const void *A, const int lda, global const void *B, const int ldb, global const void *beta, global void *C, const int ldc, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code)
+{ cblas_csymm_facade(Order, Side, Uplo, M, N, alpha, A, lda, B, ldb, beta, C, ldc, l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, err_code); }
+void cblas_csyr2k_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, global const void *alpha, global const void *A, const int lda, global const void *B, const int ldb, global const void *beta, global void *C, const int ldc, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code);
+kernel void ocl_cblas_csyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, global const void *alpha, global const void *A, const int lda, global const void *B, const int ldb, global const void *beta, global void *C, const int ldc, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code)
+{ cblas_csyr2k_facade(Order, Uplo, Trans, N, K, alpha, A, lda, B, ldb, beta, C, ldc, l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, err_code); }
+void cblas_csyrk_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, global const void *alpha, global const void *A, const int lda, global const void *beta, global void *C, const int ldc, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code);
+kernel void ocl_cblas_csyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, global const void *alpha, global const void *A, const int lda, global const void *beta, global void *C, const int ldc, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code)
+{ cblas_csyrk_facade(Order, Uplo, Trans, N, K, alpha, A, lda, beta, C, ldc, l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, err_code); }
void cblas_ctbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const int K, global const void *A, const int lda, global void *X, const int incX);
kernel void ocl_cblas_ctbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const int K, global const void *A, const int lda, global void *X, const int incX)
{ cblas_ctbmv_facade(order, Uplo, TransA, Diag, N, K, A, lda, X, incX); }
@@ -130,20 +132,20 @@ kernel void ocl_cblas_ctpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO
void cblas_ctpsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, global const void *Ap, global void *X, const int incX);
kernel void ocl_cblas_ctpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, global const void *Ap, global void *X, const int incX)
{ cblas_ctpsv_facade(order, Uplo, TransA, Diag, N, Ap, X, incX); }
-void cblas_ctrmm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, global const void *alpha, global const void *A, const int lda, global void *B, const int ldb, global double *l3_buf, local double *l2_buf_loc);
-kernel void ocl_cblas_ctrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, global const void *alpha, global const void *A, const int lda, global void *B, const int ldb, global double *l3_buf, local double *l2_buf_loc)
-{ cblas_ctrmm_facade(Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb, l3_buf, l2_buf_loc); }
+void cblas_ctrmm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, global const void *alpha, global const void *A, const int lda, global void *B, const int ldb, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code);
+kernel void ocl_cblas_ctrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, global const void *alpha, global const void *A, const int lda, global void *B, const int ldb, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code)
+{ cblas_ctrmm_facade(Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb, l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, err_code); }
void cblas_ctrmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, global const void *A, const int lda, global void *X, const int incX);
kernel void ocl_cblas_ctrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, global const void *A, const int lda, global void *X, const int incX)
{ cblas_ctrmv_facade(order, Uplo, TransA, Diag, N, A, lda, X, incX); }
-void cblas_ctrsm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, global const void *alpha, global const void *A, const int lda, global void *B, const int ldb, global double *l3_buf, local double *l2_buf_loc);
-kernel void ocl_cblas_ctrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, global const void *alpha, global const void *A, const int lda, global void *B, const int ldb, global double *l3_buf, local double *l2_buf_loc)
-{ cblas_ctrsm_facade(Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb, l3_buf, l2_buf_loc); }
+void cblas_ctrsm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, global const void *alpha, global const void *A, const int lda, global void *B, const int ldb, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code);
+kernel void ocl_cblas_ctrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, global const void *alpha, global const void *A, const int lda, global void *B, const int ldb, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code)
+{ cblas_ctrsm_facade(Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb, l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, err_code); }
void cblas_ctrsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, global const void *A, const int lda, global void *X, const int incX);
kernel void ocl_cblas_ctrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, global const void *A, const int lda, global void *X, const int incX)
{ cblas_ctrsv_facade(order, Uplo, TransA, Diag, N, A, lda, X, incX); }
-void cblas_dasum_facade(const int N, global const double *X, const int incX, double *retval);
-kernel void ocl_cblas_dasum(const int N, global const double *X, const int incX, double *retval)
+void cblas_dasum_facade(const int N, global const double *X, const int incX, global double *retval);
+kernel void ocl_cblas_dasum(const int N, global const double *X, const int incX, global double *retval)
{ cblas_dasum_facade(N, X, incX, retval); }
void cblas_daxpy_facade(const int N, const double alpha, global const double *X, const int incX, global double *Y, const int incY);
kernel void ocl_cblas_daxpy(const int N, const double alpha, global const double *X, const int incX, global double *Y, const int incY)
@@ -151,23 +153,23 @@ kernel void ocl_cblas_daxpy(const int N, const double alpha, global const double
void cblas_dcopy_facade(const int N, global const double *X, const int incX, global double *Y, const int incY);
kernel void ocl_cblas_dcopy(const int N, global const double *X, const int incX, global double *Y, const int incY)
{ cblas_dcopy_facade(N, X, incX, Y, incY); }
-void cblas_ddot_facade(const int N, global const double *X, const int incX, global const double *Y, const int incY, double *retval);
-kernel void ocl_cblas_ddot(const int N, global const double *X, const int incX, global const double *Y, const int incY, double *retval)
+void cblas_ddot_facade(const int N, global const double *X, const int incX, global const double *Y, const int incY, global double *retval);
+kernel void ocl_cblas_ddot(const int N, global const double *X, const int incX, global const double *Y, const int incY, global double *retval)
{ cblas_ddot_facade(N, X, incX, Y, incY, retval); }
void cblas_dgbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, const int KL, const int KU, const double alpha, global const double *A, const int lda, global const double *X, const int incX, const double beta, global double *Y, const int incY);
kernel void ocl_cblas_dgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, const int KL, const int KU, const double alpha, global const double *A, const int lda, global const double *X, const int incX, const double beta, global double *Y, const int incY)
{ cblas_dgbmv_facade(order, TransA, M, N, KL, KU, alpha, A, lda, X, incX, beta, Y, incY); }
-void cblas_dgemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const double alpha, global const double *A, const int lda, global const double *B, const int ldb, const double beta, global double *C, const int ldc, global double *l3_buf, local double *l2_buf_loc);
-kernel void ocl_cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const double alpha, global const double *A, const int lda, global const double *B, const int ldb, const double beta, global double *C, const int ldc, global double *l3_buf, local double *l2_buf_loc)
-{ cblas_dgemm_facade(Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc, l3_buf, l2_buf_loc); }
+void cblas_dgemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const double alpha, global const double *A, const int lda, global const double *B, const int ldb, const double beta, global double *C, const int ldc, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code);
+kernel void ocl_cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const double alpha, global const double *A, const int lda, global const double *B, const int ldb, const double beta, global double *C, const int ldc, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code)
+{ cblas_dgemm_facade(Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc, l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, err_code); }
void cblas_dgemv_facade(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, const double alpha, global const double *A, const int lda, global const double *X, const int incX, const double beta, global double *Y, const int incY);
kernel void ocl_cblas_dgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, const double alpha, global const double *A, const int lda, global const double *X, const int incX, const double beta, global double *Y, const int incY)
{ cblas_dgemv_facade(order, TransA, M, N, alpha, A, lda, X, incX, beta, Y, incY); }
void cblas_dger_facade(const enum CBLAS_ORDER order, const int M, const int N, const double alpha, global const double *X, const int incX, global const double *Y, const int incY, global double *A, const int lda);
kernel void ocl_cblas_dger(const enum CBLAS_ORDER order, const int M, const int N, const double alpha, global const double *X, const int incX, global const double *Y, const int incY, global double *A, const int lda)
{ cblas_dger_facade(order, M, N, alpha, X, incX, Y, incY, A, lda); }
-void cblas_dnrm2_facade(const int N, global const double *X, const int incX, double *retval);
-kernel void ocl_cblas_dnrm2(const int N, global const double *X, const int incX, double *retval)
+void cblas_dnrm2_facade(const int N, global const double *X, const int incX, global double *retval);
+kernel void ocl_cblas_dnrm2(const int N, global const double *X, const int incX, global double *retval)
{ cblas_dnrm2_facade(N, X, incX, retval); }
void cblas_drot_facade(const int N, global double *X, const int incX, global double *Y, const int incY, const double c, const double s);
kernel void ocl_cblas_drot(const int N, global double *X, const int incX, global double *Y, const int incY, const double c, const double s)
void cblas_dscal_facade(const int N, const double alpha, global double *X, const int incX);
kernel void ocl_cblas_dscal(const int N, const double alpha, global double *X, const int incX)
{ cblas_dscal_facade(N, alpha, X, incX); }
-void cblas_dsdot_facade(const int N, global const float *X, const int incX, global const float *Y, const int incY, double *retval);
-kernel void ocl_cblas_dsdot(const int N, global const float *X, const int incX, global const float *Y, const int incY, double *retval)
+void cblas_dsdot_facade(const int N, global const float *X, const int incX, global const float *Y, const int incY, global double *retval);
+kernel void ocl_cblas_dsdot(const int N, global const float *X, const int incX, global const float *Y, const int incY, global double *retval)
{ cblas_dsdot_facade(N, X, incX, Y, incY, retval); }
void cblas_dspmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const double alpha, global const double *Ap, global const double *X, const int incX, const double beta, global double *Y, const int incY);
kernel void ocl_cblas_dspmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const double alpha, global const double *Ap, global const double *X, const int incX, const double beta, global double *Y, const int incY)
void cblas_dswap_facade(const int N, global double *X, const int incX, global double *Y, const int incY);
kernel void ocl_cblas_dswap(const int N, global double *X, const int incX, global double *Y, const int incY)
{ cblas_dswap_facade(N, X, incX, Y, incY); }
-void cblas_dsymm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, const double alpha, global const double *A, const int lda, global const double *B, const int ldb, const double beta, global double *C, const int ldc, global double *l3_buf, local double *l2_buf_loc);
-kernel void ocl_cblas_dsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, const double alpha, global const double *A, const int lda, global const double *B, const int ldb, const double beta, global double *C, const int ldc, global double *l3_buf, local double *l2_buf_loc)
-{ cblas_dsymm_facade(Order, Side, Uplo, M, N, alpha, A, lda, B, ldb, beta, C, ldc, l3_buf, l2_buf_loc); }
+void cblas_dsymm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, const double alpha, global const double *A, const int lda, global const double *B, const int ldb, const double beta, global double *C, const int ldc, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code);
+kernel void ocl_cblas_dsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, const double alpha, global const double *A, const int lda, global const double *B, const int ldb, const double beta, global double *C, const int ldc, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code)
+{ cblas_dsymm_facade(Order, Side, Uplo, M, N, alpha, A, lda, B, ldb, beta, C, ldc, l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, err_code); }
void cblas_dsymv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const double alpha, global const double *A, const int lda, global const double *X, const int incX, const double beta, global double *Y, const int incY);
kernel void ocl_cblas_dsymv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const double alpha, global const double *A, const int lda, global const double *X, const int incX, const double beta, global double *Y, const int incY)
{ cblas_dsymv_facade(order, Uplo, N, alpha, A, lda, X, incX, beta, Y, incY); }
@@ -214,12 +216,12 @@ kernel void ocl_cblas_dsyr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO U
void cblas_dsyr2_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const double alpha, global const double *X, const int incX, global const double *Y, const int incY, global double *A, const int lda);
kernel void ocl_cblas_dsyr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const double alpha, global const double *X, const int incX, global const double *Y, const int incY, global double *A, const int lda)
{ cblas_dsyr2_facade(order, Uplo, N, alpha, X, incX, Y, incY, A, lda); }
-void cblas_dsyr2k_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const double alpha, global const double *A, const int lda, global const double *B, const int ldb, const double beta, global double *C, const int ldc, global double *l3_buf, local double *l2_buf_loc);
-kernel void ocl_cblas_dsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const double alpha, global const double *A, const int lda, global const double *B, const int ldb, const double beta, global double *C, const int ldc, global double *l3_buf, local double *l2_buf_loc)
-{ cblas_dsyr2k_facade(Order, Uplo, Trans, N, K, alpha, A, lda, B, ldb, beta, C, ldc, l3_buf, l2_buf_loc); }
-void cblas_dsyrk_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const double alpha, global const double *A, const int lda, const double beta, global double *C, const int ldc, global double *l3_buf, local double *l2_buf_loc);
-kernel void ocl_cblas_dsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const double alpha, global const double *A, const int lda, const double beta, global double *C, const int ldc, global double *l3_buf, local double *l2_buf_loc)
-{ cblas_dsyrk_facade(Order, Uplo, Trans, N, K, alpha, A, lda, beta, C, ldc, l3_buf, l2_buf_loc); }
+void cblas_dsyr2k_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const double alpha, global const double *A, const int lda, global const double *B, const int ldb, const double beta, global double *C, const int ldc, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code);
+kernel void ocl_cblas_dsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const double alpha, global const double *A, const int lda, global const double *B, const int ldb, const double beta, global double *C, const int ldc, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code)
+{ cblas_dsyr2k_facade(Order, Uplo, Trans, N, K, alpha, A, lda, B, ldb, beta, C, ldc, l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, err_code); }
+void cblas_dsyrk_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const double alpha, global const double *A, const int lda, const double beta, global double *C, const int ldc, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code);
+kernel void ocl_cblas_dsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const double alpha, global const double *A, const int lda, const double beta, global double *C, const int ldc, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code)
+{ cblas_dsyrk_facade(Order, Uplo, Trans, N, K, alpha, A, lda, beta, C, ldc, l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, err_code); }
void cblas_dtbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const int K, global const double *A, const int lda, global double *X, const int incX);
kernel void ocl_cblas_dtbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const int K, global const double *A, const int lda, global double *X, const int incX)
{ cblas_dtbmv_facade(order, Uplo, TransA, Diag, N, K, A, lda, X, incX); }
@@ -232,71 +234,71 @@ kernel void ocl_cblas_dtpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO
void cblas_dtpsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, global const double *Ap, global double *X, const int incX);
kernel void ocl_cblas_dtpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, global const double *Ap, global double *X, const int incX)
{ cblas_dtpsv_facade(order, Uplo, TransA, Diag, N, Ap, X, incX); }
-void cblas_dtrmm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const double alpha, global const double *A, const int lda, global double *B, const int ldb, global double *l3_buf, local double *l2_buf_loc);
-kernel void ocl_cblas_dtrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const double alpha, global const double *A, const int lda, global double *B, const int ldb, global double *l3_buf, local double *l2_buf_loc)
-{ cblas_dtrmm_facade(Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb, l3_buf, l2_buf_loc); }
+void cblas_dtrmm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const double alpha, global const double *A, const int lda, global double *B, const int ldb, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code);
+kernel void ocl_cblas_dtrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const double alpha, global const double *A, const int lda, global double *B, const int ldb, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code)
+{ cblas_dtrmm_facade(Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb, l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, err_code); }
void cblas_dtrmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, global const double *A, const int lda, global double *X, const int incX);
kernel void ocl_cblas_dtrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, global const double *A, const int lda, global double *X, const int incX)
{ cblas_dtrmv_facade(order, Uplo, TransA, Diag, N, A, lda, X, incX); }
-void cblas_dtrsm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const double alpha, global const double *A, const int lda, global double *B, const int ldb, global double *l3_buf, local double *l2_buf_loc);
-kernel void ocl_cblas_dtrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const double alpha, global const double *A, const int lda, global double *B, const int ldb, global double *l3_buf, local double *l2_buf_loc)
-{ cblas_dtrsm_facade(Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb, l3_buf, l2_buf_loc); }
+void cblas_dtrsm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const double alpha, global const double *A, const int lda, global double *B, const int ldb, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code);
+kernel void ocl_cblas_dtrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const double alpha, global const double *A, const int lda, global double *B, const int ldb, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code)
+{ cblas_dtrsm_facade(Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb, l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, err_code); }
void cblas_dtrsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, global const double *A, const int lda, global double *X, const int incX);
kernel void ocl_cblas_dtrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, global const double *A, const int lda, global double *X, const int incX)
{ cblas_dtrsv_facade(order, Uplo, TransA, Diag, N, A, lda, X, incX); }
-void cblas_dzasum_facade(const int N, global const void *X, const int incX, double *retval);
-kernel void ocl_cblas_dzasum(const int N, global const void *X, const int incX, double *retval)
+void cblas_dzasum_facade(const int N, global const void *X, const int incX, global double *retval);
+kernel void ocl_cblas_dzasum(const int N, global const void *X, const int incX, global double *retval)
{ cblas_dzasum_facade(N, X, incX, retval); }
-void cblas_dznrm2_facade(const int N, global const void *X, const int incX, double *retval);
-kernel void ocl_cblas_dznrm2(const int N, global const void *X, const int incX, double *retval)
+void cblas_dznrm2_facade(const int N, global const void *X, const int incX, global double *retval);
+kernel void ocl_cblas_dznrm2(const int N, global const void *X, const int incX, global double *retval)
{ cblas_dznrm2_facade(N, X, incX, retval); }
-void cblas_icamax_facade(const int N, global const void *X, const int incX, CBLAS_INDEX *retval);
-kernel void ocl_cblas_icamax(const int N, global const void *X, const int incX, CBLAS_INDEX *retval)
+void cblas_icamax_facade(const int N, global const void *X, const int incX, global CBLAS_INDEX *retval);
+kernel void ocl_cblas_icamax(const int N, global const void *X, const int incX, global CBLAS_INDEX *retval)
{ cblas_icamax_facade(N, X, incX, retval); }
-void cblas_idamax_facade(const int N, global const double *X, const int incX, CBLAS_INDEX *retval);
-kernel void ocl_cblas_idamax(const int N, global const double *X, const int incX, CBLAS_INDEX *retval)
+void cblas_idamax_facade(const int N, global const double *X, const int incX, global CBLAS_INDEX *retval);
+kernel void ocl_cblas_idamax(const int N, global const double *X, const int incX, global CBLAS_INDEX *retval)
{ cblas_idamax_facade(N, X, incX, retval); }
-void cblas_isamax_facade(const int N, global const float *X, const int incX, CBLAS_INDEX *retval);
-kernel void ocl_cblas_isamax(const int N, global const float *X, const int incX, CBLAS_INDEX *retval)
+void cblas_isamax_facade(const int N, global const float *X, const int incX, global CBLAS_INDEX *retval);
+kernel void ocl_cblas_isamax(const int N, global const float *X, const int incX, global CBLAS_INDEX *retval)
{ cblas_isamax_facade(N, X, incX, retval); }
-void cblas_izamax_facade(const int N, global const void *X, const int incX, CBLAS_INDEX *retval);
-kernel void ocl_cblas_izamax(const int N, global const void *X, const int incX, CBLAS_INDEX *retval)
+void cblas_izamax_facade(const int N, global const void *X, const int incX, global CBLAS_INDEX *retval);
+kernel void ocl_cblas_izamax(const int N, global const void *X, const int incX, global CBLAS_INDEX *retval)
{ cblas_izamax_facade(N, X, incX, retval); }
-void cblas_sasum_facade(const int N, global const float *X, const int incX, float *retval);
-kernel void ocl_cblas_sasum(const int N, global const float *X, const int incX, float *retval)
+void cblas_sasum_facade(const int N, global const float *X, const int incX, global float *retval);
+kernel void ocl_cblas_sasum(const int N, global const float *X, const int incX, global float *retval)
{ cblas_sasum_facade(N, X, incX, retval); }
void cblas_saxpy_facade(const int N, const float alpha, global const float *X, const int incX, global float *Y, const int incY);
kernel void ocl_cblas_saxpy(const int N, const float alpha, global const float *X, const int incX, global float *Y, const int incY)
{ cblas_saxpy_facade(N, alpha, X, incX, Y, incY); }
-void cblas_scasum_facade(const int N, global const void *X, const int incX, float *retval);
-kernel void ocl_cblas_scasum(const int N, global const void *X, const int incX, float *retval)
+void cblas_scasum_facade(const int N, global const void *X, const int incX, global float *retval);
+kernel void ocl_cblas_scasum(const int N, global const void *X, const int incX, global float *retval)
{ cblas_scasum_facade(N, X, incX, retval); }
-void cblas_scnrm2_facade(const int N, global const void *X, const int incX, float *retval);
-kernel void ocl_cblas_scnrm2(const int N, global const void *X, const int incX, float *retval)
+void cblas_scnrm2_facade(const int N, global const void *X, const int incX, global float *retval);
+kernel void ocl_cblas_scnrm2(const int N, global const void *X, const int incX, global float *retval)
{ cblas_scnrm2_facade(N, X, incX, retval); }
void cblas_scopy_facade(const int N, global const float *X, const int incX, global float *Y, const int incY);
kernel void ocl_cblas_scopy(const int N, global const float *X, const int incX, global float *Y, const int incY)
{ cblas_scopy_facade(N, X, incX, Y, incY); }
-void cblas_sdot_facade(const int N, global const float *X, const int incX, global const float *Y, const int incY, float *retval);
-kernel void ocl_cblas_sdot(const int N, global const float *X, const int incX, global const float *Y, const int incY, float *retval)
+void cblas_sdot_facade(const int N, global const float *X, const int incX, global const float *Y, const int incY, global float *retval);
+kernel void ocl_cblas_sdot(const int N, global const float *X, const int incX, global const float *Y, const int incY, global float *retval)
{ cblas_sdot_facade(N, X, incX, Y, incY, retval); }
-void cblas_sdsdot_facade(const int N, const float alpha, global const float *X, const int incX, global const float *Y, const int incY, float *retval);
-kernel void ocl_cblas_sdsdot(const int N, const float alpha, global const float *X, const int incX, global const float *Y, const int incY, float *retval)
+void cblas_sdsdot_facade(const int N, const float alpha, global const float *X, const int incX, global const float *Y, const int incY, global float *retval);
+kernel void ocl_cblas_sdsdot(const int N, const float alpha, global const float *X, const int incX, global const float *Y, const int incY, global float *retval)
{ cblas_sdsdot_facade(N, alpha, X, incX, Y, incY, retval); }
void cblas_sgbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, const int KL, const int KU, const float alpha, global const float *A, const int lda, global const float *X, const int incX, const float beta, global float *Y, const int incY);
kernel void ocl_cblas_sgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, const int KL, const int KU, const float alpha, global const float *A, const int lda, global const float *X, const int incX, const float beta, global float *Y, const int incY)
{ cblas_sgbmv_facade(order, TransA, M, N, KL, KU, alpha, A, lda, X, incX, beta, Y, incY); }
-void cblas_sgemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const float alpha, global const float *A, const int lda, global const float *B, const int ldb, const float beta, global float *C, const int ldc, global double *l3_buf, local double *l2_buf_loc);
-kernel void ocl_cblas_sgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const float alpha, global const float *A, const int lda, global const float *B, const int ldb, const float beta, global float *C, const int ldc, global double *l3_buf, local double *l2_buf_loc)
-{ cblas_sgemm_facade(Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc, l3_buf, l2_buf_loc); }
+void cblas_sgemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const float alpha, global const float *A, const int lda, global const float *B, const int ldb, const float beta, global float *C, const int ldc, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code);
+kernel void ocl_cblas_sgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const float alpha, global const float *A, const int lda, global const float *B, const int ldb, const float beta, global float *C, const int ldc, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code)
+{ cblas_sgemm_facade(Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc, l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, err_code); }
void cblas_sgemv_facade(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, const float alpha, global const float *A, const int lda, global const float *X, const int incX, const float beta, global float *Y, const int incY);
kernel void ocl_cblas_sgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, const float alpha, global const float *A, const int lda, global const float *X, const int incX, const float beta, global float *Y, const int incY)
{ cblas_sgemv_facade(order, TransA, M, N, alpha, A, lda, X, incX, beta, Y, incY); }
void cblas_sger_facade(const enum CBLAS_ORDER order, const int M, const int N, const float alpha, global const float *X, const int incX, global const float *Y, const int incY, global float *A, const int lda);
kernel void ocl_cblas_sger(const enum CBLAS_ORDER order, const int M, const int N, const float alpha, global const float *X, const int incX, global const float *Y, const int incY, global float *A, const int lda)
{ cblas_sger_facade(order, M, N, alpha, X, incX, Y, incY, A, lda); }
-void cblas_snrm2_facade(const int N, global const float *X, const int incX, float *retval);
-kernel void ocl_cblas_snrm2(const int N, global const float *X, const int incX, float *retval)
+void cblas_snrm2_facade(const int N, global const float *X, const int incX, global float *retval);
+kernel void ocl_cblas_snrm2(const int N, global const float *X, const int incX, global float *retval)
{ cblas_snrm2_facade(N, X, incX, retval); }
void cblas_srot_facade(const int N, global float *X, const int incX, global float *Y, const int incY, const float c, const float s);
kernel void ocl_cblas_srot(const int N, global float *X, const int incX, global float *Y, const int incY, const float c, const float s)
void cblas_sswap_facade(const int N, global float *X, const int incX, global float *Y, const int incY);
kernel void ocl_cblas_sswap(const int N, global float *X, const int incX, global float *Y, const int incY)
{ cblas_sswap_facade(N, X, incX, Y, incY); }
-void cblas_ssymm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, const float alpha, global const float *A, const int lda, global const float *B, const int ldb, const float beta, global float *C, const int ldc, global double *l3_buf, local double *l2_buf_loc);
-kernel void ocl_cblas_ssymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, const float alpha, global const float *A, const int lda, global const float *B, const int ldb, const float beta, global float *C, const int ldc, global double *l3_buf, local double *l2_buf_loc)
-{ cblas_ssymm_facade(Order, Side, Uplo, M, N, alpha, A, lda, B, ldb, beta, C, ldc, l3_buf, l2_buf_loc); }
+void cblas_ssymm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, const float alpha, global const float *A, const int lda, global const float *B, const int ldb, const float beta, global float *C, const int ldc, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code);
+kernel void ocl_cblas_ssymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, const float alpha, global const float *A, const int lda, global const float *B, const int ldb, const float beta, global float *C, const int ldc, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code)
+{ cblas_ssymm_facade(Order, Side, Uplo, M, N, alpha, A, lda, B, ldb, beta, C, ldc, l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, err_code); }
void cblas_ssymv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const float alpha, global const float *A, const int lda, global const float *X, const int incX, const float beta, global float *Y, const int incY);
kernel void ocl_cblas_ssymv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const float alpha, global const float *A, const int lda, global const float *X, const int incX, const float beta, global float *Y, const int incY)
{ cblas_ssymv_facade(order, Uplo, N, alpha, A, lda, X, incX, beta, Y, incY); }
@@ -340,12 +342,12 @@ kernel void ocl_cblas_ssyr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO U
void cblas_ssyr2_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const float alpha, global const float *X, const int incX, global const float *Y, const int incY, global float *A, const int lda);
kernel void ocl_cblas_ssyr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const float alpha, global const float *X, const int incX, global const float *Y, const int incY, global float *A, const int lda)
{ cblas_ssyr2_facade(order, Uplo, N, alpha, X, incX, Y, incY, A, lda); }
-void cblas_ssyr2k_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const float alpha, global const float *A, const int lda, global const float *B, const int ldb, const float beta, global float *C, const int ldc, global double *l3_buf, local double *l2_buf_loc);
-kernel void ocl_cblas_ssyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const float alpha, global const float *A, const int lda, global const float *B, const int ldb, const float beta, global float *C, const int ldc, global double *l3_buf, local double *l2_buf_loc)
-{ cblas_ssyr2k_facade(Order, Uplo, Trans, N, K, alpha, A, lda, B, ldb, beta, C, ldc, l3_buf, l2_buf_loc); }
-void cblas_ssyrk_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const float alpha, global const float *A, const int lda, const float beta, global float *C, const int ldc, global double *l3_buf, local double *l2_buf_loc);
-kernel void ocl_cblas_ssyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const float alpha, global const float *A, const int lda, const float beta, global float *C, const int ldc, global double *l3_buf, local double *l2_buf_loc)
-{ cblas_ssyrk_facade(Order, Uplo, Trans, N, K, alpha, A, lda, beta, C, ldc, l3_buf, l2_buf_loc); }
+void cblas_ssyr2k_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const float alpha, global const float *A, const int lda, global const float *B, const int ldb, const float beta, global float *C, const int ldc, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code);
+kernel void ocl_cblas_ssyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const float alpha, global const float *A, const int lda, global const float *B, const int ldb, const float beta, global float *C, const int ldc, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code)
+{ cblas_ssyr2k_facade(Order, Uplo, Trans, N, K, alpha, A, lda, B, ldb, beta, C, ldc, l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, err_code); }
+void cblas_ssyrk_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const float alpha, global const float *A, const int lda, const float beta, global float *C, const int ldc, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code);
+kernel void ocl_cblas_ssyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const float alpha, global const float *A, const int lda, const float beta, global float *C, const int ldc, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code)
+{ cblas_ssyrk_facade(Order, Uplo, Trans, N, K, alpha, A, lda, beta, C, ldc, l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, err_code); }
void cblas_stbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const int K, global const float *A, const int lda, global float *X, const int incX);
kernel void ocl_cblas_stbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const int K, global const float *A, const int lda, global float *X, const int incX)
{ cblas_stbmv_facade(order, Uplo, TransA, Diag, N, K, A, lda, X, incX); }
@@ -358,15 +360,15 @@ kernel void ocl_cblas_stpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO
void cblas_stpsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, global const float *Ap, global float *X, const int incX);
kernel void ocl_cblas_stpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, global const float *Ap, global float *X, const int incX)
{ cblas_stpsv_facade(order, Uplo, TransA, Diag, N, Ap, X, incX); }
-void cblas_strmm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const float alpha, global const float *A, const int lda, global float *B, const int ldb, global double *l3_buf, local double *l2_buf_loc);
-kernel void ocl_cblas_strmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const float alpha, global const float *A, const int lda, global float *B, const int ldb, global double *l3_buf, local double *l2_buf_loc)
-{ cblas_strmm_facade(Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb, l3_buf, l2_buf_loc); }
+void cblas_strmm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const float alpha, global const float *A, const int lda, global float *B, const int ldb, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code);
+kernel void ocl_cblas_strmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const float alpha, global const float *A, const int lda, global float *B, const int ldb, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code)
+{ cblas_strmm_facade(Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb, l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, err_code); }
void cblas_strmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, global const float *A, const int lda, global float *X, const int incX);
kernel void ocl_cblas_strmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, global const float *A, const int lda, global float *X, const int incX)
{ cblas_strmv_facade(order, Uplo, TransA, Diag, N, A, lda, X, incX); }
-void cblas_strsm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const float alpha, global const float *A, const int lda, global float *B, const int ldb, global double *l3_buf, local double *l2_buf_loc);
-kernel void ocl_cblas_strsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const float alpha, global const float *A, const int lda, global float *B, const int ldb, global double *l3_buf, local double *l2_buf_loc)
-{ cblas_strsm_facade(Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb, l3_buf, l2_buf_loc); }
+void cblas_strsm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const float alpha, global const float *A, const int lda, global float *B, const int ldb, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code);
+kernel void ocl_cblas_strsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const float alpha, global const float *A, const int lda, global float *B, const int ldb, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code)
+{ cblas_strsm_facade(Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb, l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, err_code); }
void cblas_strsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, global const float *A, const int lda, global float *X, const int incX);
kernel void ocl_cblas_strsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, global const float *A, const int lda, global float *X, const int incX)
{ cblas_strsv_facade(order, Uplo, TransA, Diag, N, A, lda, X, incX); }
@@ -391,9 +393,9 @@ kernel void ocl_cblas_zdscal(const int N, const double alpha, global void *X, co
void cblas_zgbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, const int KL, const int KU, global const void *alpha, global const void *A, const int lda, global const void *X, const int incX, global const void *beta, global void *Y, const int incY);
kernel void ocl_cblas_zgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, const int KL, const int KU, global const void *alpha, global const void *A, const int lda, global const void *X, const int incX, global const void *beta, global void *Y, const int incY)
{ cblas_zgbmv_facade(order, TransA, M, N, KL, KU, alpha, A, lda, X, incX, beta, Y, incY); }
-void cblas_zgemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, global const void *alpha, global const void *A, const int lda, global const void *B, const int ldb, global const void *beta, global void *C, const int ldc, global double *l3_buf, local double *l2_buf_loc);
-kernel void ocl_cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, global const void *alpha, global const void *A, const int lda, global const void *B, const int ldb, global const void *beta, global void *C, const int ldc, global double *l3_buf, local double *l2_buf_loc)
-{ cblas_zgemm_facade(Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc, l3_buf, l2_buf_loc); }
+void cblas_zgemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, global const void *alpha, global const void *A, const int lda, global const void *B, const int ldb, global const void *beta, global void *C, const int ldc, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code);
+kernel void ocl_cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, global const void *alpha, global const void *A, const int lda, global const void *B, const int ldb, global const void *beta, global void *C, const int ldc, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code)
+{ cblas_zgemm_facade(Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc, l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, err_code); }
void cblas_zgemv_facade(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, global const void *alpha, global const void *A, const int lda, global const void *X, const int incX, global const void *beta, global void *Y, const int incY);
kernel void ocl_cblas_zgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, global const void *alpha, global const void *A, const int lda, global const void *X, const int incX, global const void *beta, global void *Y, const int incY)
{ cblas_zgemv_facade(order, TransA, M, N, alpha, A, lda, X, incX, beta, Y, incY); }
@@ -406,9 +408,9 @@ kernel void ocl_cblas_zgeru(const enum CBLAS_ORDER order, const int M, const int
void cblas_zhbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const int K, global const void *alpha, global const void *A, const int lda, global const void *X, const int incX, global const void *beta, global void *Y, const int incY);
kernel void ocl_cblas_zhbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const int K, global const void *alpha, global const void *A, const int lda, global const void *X, const int incX, global const void *beta, global void *Y, const int incY)
{ cblas_zhbmv_facade(order, Uplo, N, K, alpha, A, lda, X, incX, beta, Y, incY); }
-void cblas_zhemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, global const void *alpha, global const void *A, const int lda, global const void *B, const int ldb, global const void *beta, global void *C, const int ldc, global double *l3_buf, local double *l2_buf_loc);
-kernel void ocl_cblas_zhemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, global const void *alpha, global const void *A, const int lda, global const void *B, const int ldb, global const void *beta, global void *C, const int ldc, global double *l3_buf, local double *l2_buf_loc)
-{ cblas_zhemm_facade(Order, Side, Uplo, M, N, alpha, A, lda, B, ldb, beta, C, ldc, l3_buf, l2_buf_loc); }
+void cblas_zhemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, global const void *alpha, global const void *A, const int lda, global const void *B, const int ldb, global const void *beta, global void *C, const int ldc, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code);
+kernel void ocl_cblas_zhemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, global const void *alpha, global const void *A, const int lda, global const void *B, const int ldb, global const void *beta, global void *C, const int ldc, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code)
+{ cblas_zhemm_facade(Order, Side, Uplo, M, N, alpha, A, lda, B, ldb, beta, C, ldc, l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, err_code); }
void cblas_zhemv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, global const void *alpha, global const void *A, const int lda, global const void *X, const int incX, global const void *beta, global void *Y, const int incY);
kernel void ocl_cblas_zhemv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, global const void *alpha, global const void *A, const int lda, global const void *X, const int incX, global const void *beta, global void *Y, const int incY)
{ cblas_zhemv_facade(order, Uplo, N, alpha, A, lda, X, incX, beta, Y, incY); }
@@ -418,12 +420,12 @@ kernel void ocl_cblas_zher(const enum CBLAS_ORDER order, const enum CBLAS_UPLO U
void cblas_zher2_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, global const void *alpha, global const void *X, const int incX, global const void *Y, const int incY, global void *A, const int lda);
kernel void ocl_cblas_zher2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, global const void *alpha, global const void *X, const int incX, global const void *Y, const int incY, global void *A, const int lda)
{ cblas_zher2_facade(order, Uplo, N, alpha, X, incX, Y, incY, A, lda); }
-void cblas_zher2k_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, global const void *alpha, global const void *A, const int lda, global const void *B, const int ldb, const double beta, global void *C, const int ldc, global double *l3_buf, local double *l2_buf_loc);
-kernel void ocl_cblas_zher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, global const void *alpha, global const void *A, const int lda, global const void *B, const int ldb, const double beta, global void *C, const int ldc, global double *l3_buf, local double *l2_buf_loc)
-{ cblas_zher2k_facade(Order, Uplo, Trans, N, K, alpha, A, lda, B, ldb, beta, C, ldc, l3_buf, l2_buf_loc); }
-void cblas_zherk_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const double alpha, global const void *A, const int lda, const double beta, global void *C, const int ldc, global double *l3_buf, local double *l2_buf_loc);
-kernel void ocl_cblas_zherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const double alpha, global const void *A, const int lda, const double beta, global void *C, const int ldc, global double *l3_buf, local double *l2_buf_loc)
-{ cblas_zherk_facade(Order, Uplo, Trans, N, K, alpha, A, lda, beta, C, ldc, l3_buf, l2_buf_loc); }
+void cblas_zher2k_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, global const void *alpha, global const void *A, const int lda, global const void *B, const int ldb, const double beta, global void *C, const int ldc, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code);
+kernel void ocl_cblas_zher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, global const void *alpha, global const void *A, const int lda, global const void *B, const int ldb, const double beta, global void *C, const int ldc, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code)
+{ cblas_zher2k_facade(Order, Uplo, Trans, N, K, alpha, A, lda, B, ldb, beta, C, ldc, l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, err_code); }
+void cblas_zherk_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const double alpha, global const void *A, const int lda, const double beta, global void *C, const int ldc, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code);
+kernel void ocl_cblas_zherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const double alpha, global const void *A, const int lda, const double beta, global void *C, const int ldc, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code)
+{ cblas_zherk_facade(Order, Uplo, Trans, N, K, alpha, A, lda, beta, C, ldc, l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, err_code); }
void cblas_zhpmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, global const void *alpha, global const void *Ap, global const void *X, const int incX, global const void *beta, global void *Y, const int incY);
kernel void ocl_cblas_zhpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, global const void *alpha, global const void *Ap, global const void *X, const int incX, global const void *beta, global void *Y, const int incY)
{ cblas_zhpmv_facade(order, Uplo, N, alpha, Ap, X, incX, beta, Y, incY); }
@@ -442,15 +444,15 @@ kernel void ocl_cblas_zscal(const int N, global const void *alpha, global void *
void cblas_zswap_facade(const int N, global void *X, const int incX, global void *Y, const int incY);
kernel void ocl_cblas_zswap(const int N, global void *X, const int incX, global void *Y, const int incY)
{ cblas_zswap_facade(N, X, incX, Y, incY); }
-void cblas_zsymm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, global const void *alpha, global const void *A, const int lda, global const void *B, const int ldb, global const void *beta, global void *C, const int ldc, global double *l3_buf, local double *l2_buf_loc);
-kernel void ocl_cblas_zsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, global const void *alpha, global const void *A, const int lda, global const void *B, const int ldb, global const void *beta, global void *C, const int ldc, global double *l3_buf, local double *l2_buf_loc)
-{ cblas_zsymm_facade(Order, Side, Uplo, M, N, alpha, A, lda, B, ldb, beta, C, ldc, l3_buf, l2_buf_loc); }
-void cblas_zsyr2k_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, global const void *alpha, global const void *A, const int lda, global const void *B, const int ldb, global const void *beta, global void *C, const int ldc, global double *l3_buf, local double *l2_buf_loc);
-kernel void ocl_cblas_zsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, global const void *alpha, global const void *A, const int lda, global const void *B, const int ldb, global const void *beta, global void *C, const int ldc, global double *l3_buf, local double *l2_buf_loc)
-{ cblas_zsyr2k_facade(Order, Uplo, Trans, N, K, alpha, A, lda, B, ldb, beta, C, ldc, l3_buf, l2_buf_loc); }
-void cblas_zsyrk_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, global const void *alpha, global const void *A, const int lda, global const void *beta, global void *C, const int ldc, global double *l3_buf, local double *l2_buf_loc);
-kernel void ocl_cblas_zsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, global const void *alpha, global const void *A, const int lda, global const void *beta, global void *C, const int ldc, global double *l3_buf, local double *l2_buf_loc)
-{ cblas_zsyrk_facade(Order, Uplo, Trans, N, K, alpha, A, lda, beta, C, ldc, l3_buf, l2_buf_loc); }
+void cblas_zsymm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, global const void *alpha, global const void *A, const int lda, global const void *B, const int ldb, global const void *beta, global void *C, const int ldc, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code);
+kernel void ocl_cblas_zsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, global const void *alpha, global const void *A, const int lda, global const void *B, const int ldb, global const void *beta, global void *C, const int ldc, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code)
+{ cblas_zsymm_facade(Order, Side, Uplo, M, N, alpha, A, lda, B, ldb, beta, C, ldc, l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, err_code); }
+void cblas_zsyr2k_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, global const void *alpha, global const void *A, const int lda, global const void *B, const int ldb, global const void *beta, global void *C, const int ldc, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code);
+kernel void ocl_cblas_zsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, global const void *alpha, global const void *A, const int lda, global const void *B, const int ldb, global const void *beta, global void *C, const int ldc, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code)
+{ cblas_zsyr2k_facade(Order, Uplo, Trans, N, K, alpha, A, lda, B, ldb, beta, C, ldc, l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, err_code); }
+void cblas_zsyrk_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, global const void *alpha, global const void *A, const int lda, global const void *beta, global void *C, const int ldc, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code);
+kernel void ocl_cblas_zsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, global const void *alpha, global const void *A, const int lda, global const void *beta, global void *C, const int ldc, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code)
+{ cblas_zsyrk_facade(Order, Uplo, Trans, N, K, alpha, A, lda, beta, C, ldc, l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, err_code); }
void cblas_ztbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const int K, global const void *A, const int lda, global void *X, const int incX);
kernel void ocl_cblas_ztbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const int K, global const void *A, const int lda, global void *X, const int incX)
{ cblas_ztbmv_facade(order, Uplo, TransA, Diag, N, K, A, lda, X, incX); }
@@ -463,15 +465,15 @@ kernel void ocl_cblas_ztpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO
void cblas_ztpsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, global const void *Ap, global void *X, const int incX);
kernel void ocl_cblas_ztpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, global const void *Ap, global void *X, const int incX)
{ cblas_ztpsv_facade(order, Uplo, TransA, Diag, N, Ap, X, incX); }
-void cblas_ztrmm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, global const void *alpha, global const void *A, const int lda, global void *B, const int ldb, global double *l3_buf, local double *l2_buf_loc);
-kernel void ocl_cblas_ztrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, global const void *alpha, global const void *A, const int lda, global void *B, const int ldb, global double *l3_buf, local double *l2_buf_loc)
-{ cblas_ztrmm_facade(Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb, l3_buf, l2_buf_loc); }
+void cblas_ztrmm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, global const void *alpha, global const void *A, const int lda, global void *B, const int ldb, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code);
+kernel void ocl_cblas_ztrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, global const void *alpha, global const void *A, const int lda, global void *B, const int ldb, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code)
+{ cblas_ztrmm_facade(Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb, l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, err_code); }
void cblas_ztrmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, global const void *A, const int lda, global void *X, const int incX);
kernel void ocl_cblas_ztrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, global const void *A, const int lda, global void *X, const int incX)
{ cblas_ztrmv_facade(order, Uplo, TransA, Diag, N, A, lda, X, incX); }
-void cblas_ztrsm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, global const void *alpha, global const void *A, const int lda, global void *B, const int ldb, global double *l3_buf, local double *l2_buf_loc);
-kernel void ocl_cblas_ztrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, global const void *alpha, global const void *A, const int lda, global void *B, const int ldb, global double *l3_buf, local double *l2_buf_loc)
-{ cblas_ztrsm_facade(Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb, l3_buf, l2_buf_loc); }
+void cblas_ztrsm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, global const void *alpha, global const void *A, const int lda, global void *B, const int ldb, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code);
+kernel void ocl_cblas_ztrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, global const void *alpha, global const void *A, const int lda, global void *B, const int ldb, global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code)
+{ cblas_ztrsm_facade(Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb, l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, err_code); }
void cblas_ztrsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, global const void *A, const int lda, global void *X, const int incX);
kernel void ocl_cblas_ztrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, global const void *A, const int lda, global void *X, const int incX)
{ cblas_ztrsv_facade(order, Uplo, TransA, Diag, N, A, lda, X, incX); }
diff --git a/blasblisacc/src/ti_cblas_mem_config.c b/blasblisacc/src/ti_cblas_mem_config.c
--- /dev/null
@@ -0,0 +1,209 @@
+/******************************************************************************
+ * Copyright (c) 2013-2015, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#ifdef TI_CBLAS_DEBUG
+#include <stdio.h>
+#include <ti/csl/csl_chipAux.h>
+#include <ti/csl/csl_idmaAux.h>
+#endif
+#include "../../ticblas/ticblas.h"
+#include <libarch.h>
+
+extern void bli_init();
+extern void bli_finalize();
+
+#ifdef TI_CBLAS_DEBUG
+int malloc_size;
+extern lib_memdscr_t * blas_memdscr_tab[4];
+#endif
+
+/*==============================================================================
+ * This function checks that enough L1D, L2, MSMC and DDR memory is available
+ * for BLIS, enlarges the L1D/L2 SRAM when the current size is too small, and
+ * passes the four memory regions to tiCblasInit().
+ *
+ * Inputs:
+ *   msmc_buf, msmc_buf_size - MSMC buffer provided by the caller and its size
+ *   ddr_buf,  ddr_buf_size  - DDR buffer provided by the caller and its size
+ * Outputs:
+ *   l1D_SRAM_size_orig      - L1D SRAM size found on entry
+ *   l2_SRAM_size_orig       - L2 SRAM size found on entry
+ *   (neither output is written when the initial size check fails)
+ *
+ * Returns TICBLAS_ERROR when the size check or a cache configuration fails,
+ * otherwise the return code of tiCblasInit().
+ *
+ * NOTE(review): when an error is returned after L1D/L2 were resized, the
+ * original sizes are not restored here; presumably the caller should still run
+ * bli_l3_mem_reconfig() - confirm against the call sites.
+ *============================================================================*/
+int bli_l3_mem_config(void *msmc_buf, size_t msmc_buf_size, void *ddr_buf, size_t ddr_buf_size, size_t *l1D_SRAM_size_orig, size_t *l2_SRAM_size_orig)
+{
+    size_t smem_size_vfast, smem_size_fast, smem_size_med, smem_size_slow;
+    void *l1d_SRAM_ptr, *l2_SRAM_ptr;
+    int l1d_cfg_err, l2_cfg_err, blas_ret_err_code;
+
+#ifdef TI_CBLAS_DEBUG
+    malloc_size = 0;
+    /* size_t values are cast to unsigned long so that they match %lu */
+    printf("Memory buffers passed to bli_l3_mem_config are: MSMC base 0x%x, size %lu, DDR base 0x%x, size %lu.\n", (unsigned int)msmc_buf, (unsigned long)msmc_buf_size, (unsigned int) ddr_buf, (unsigned long)ddr_buf_size);
+    printf("Before calling BLIS, malloc_size is %d.\n", malloc_size);
+#endif
+
+    /* First, verify that the provided/available memory meets the requirements */
+    tiCblasGetSizes(&smem_size_vfast, &smem_size_fast, &smem_size_med, &smem_size_slow);
+
+    if( (smem_size_vfast> lib_get_L1D_total_size()) /* total available L1D */
+      ||(smem_size_fast > lib_get_L2_total_size())  /* total available L2 */
+      ||(smem_size_med  > msmc_buf_size)            /* provided MSMC memory */
+      ||(smem_size_slow > ddr_buf_size)             /* provided DDR memory */
+      ) {
+        return(TICBLAS_ERROR);
+    }
+
+    /* Configure L1D if necessary */
+    *l1D_SRAM_size_orig = lib_get_L1D_SRAM_size(); /* get current L1D SRAM size */
+    l1d_cfg_err = LIB_CACHE_SUCCESS;
+
+#ifdef TI_CBLAS_DEBUG
+    printf("Original L1D SRAM size is: %lu\n", (unsigned long)*l1D_SRAM_size_orig);
+    printf("Required L1D SRAM size is: %lu\n", (unsigned long)smem_size_vfast);
+#endif
+
+    if(*l1D_SRAM_size_orig < smem_size_vfast) { /* configure L1D if needs more SRAM */
+        /*printf("Configuring L1D SRAM on all cores.\n");*/
+        #pragma omp parallel
+        {
+            /* Each thread keeps its own result and only publishes a failure,
+             * so a later successful thread cannot overwrite an error and the
+             * shared variable is never written concurrently. */
+            int core_err = lib_L1D_config_SRAM(smem_size_vfast);
+            if(core_err != LIB_CACHE_SUCCESS) {
+                #pragma omp critical
+                {
+                    l1d_cfg_err = core_err;
+                }
+            }
+        }
+    }
+
+#ifdef TI_CBLAS_DEBUG
+    #pragma omp parallel
+    {
+        /* NOTE(review): core_id is read but never used */
+        int core_id = lib_get_coreID();
+    }
+#endif
+
+    /* Configure L2 if necessary */
+    *l2_SRAM_size_orig = lib_get_L2_SRAM_size(); /* get current L2 SRAM size */
+    l2_cfg_err = LIB_CACHE_SUCCESS;
+
+#ifdef TI_CBLAS_DEBUG
+    printf("Original L2 SRAM size is: %lu\n", (unsigned long)*l2_SRAM_size_orig);
+    printf("Required L2 SRAM size is: %lu\n", (unsigned long)smem_size_fast);
+#endif
+
+    if(*l2_SRAM_size_orig < smem_size_fast) { /* configure L2 if needs more SRAM */
+        #pragma omp parallel
+        {
+            /* same failure-only publication as for L1D above */
+            int core_err = lib_L2_config_SRAM(smem_size_fast);
+            if(core_err != LIB_CACHE_SUCCESS) {
+                #pragma omp critical
+                {
+                    l2_cfg_err = core_err;
+                }
+            }
+        }
+    }
+
+    if((l1d_cfg_err != LIB_CACHE_SUCCESS) || (l2_cfg_err != LIB_CACHE_SUCCESS)) {
+        return(TICBLAS_ERROR);
+    }
+
+#ifdef TI_CBLAS_DEBUG
+    printf("New L2 SRAM size is: %lu\n", (unsigned long)lib_get_L2_SRAM_size());
+#endif
+
+    /* get L1D and L2 SRAM base address */
+    l1d_SRAM_ptr = lib_get_L1D_SRAM_base();
+    l2_SRAM_ptr = lib_get_L2_SRAM_base();
+
+#ifdef TI_CBLAS_DEBUG
+    printf("L1D SRAM base address is 0x%x.\n", (unsigned int)l1d_SRAM_ptr);
+    printf("L2 SRAM base address is 0x%x.\n", (unsigned int) l2_SRAM_ptr);
+    printf("MSMC SRAM address is 0x%x.\n", (unsigned int) msmc_buf);
+#endif
+
+    /* pass allocated memories for heap initialization */
+    blas_ret_err_code = tiCblasInit(l1d_SRAM_ptr, smem_size_vfast,
+                                    l2_SRAM_ptr, smem_size_fast,
+                                    msmc_buf, msmc_buf_size,
+                                    ddr_buf, ddr_buf_size);
+
+#ifdef TI_CBLAS_DEBUG
+    if(blas_ret_err_code == TICBLAS_SUCCESS) {
+        printf("Before calling BLIS, memory descriptor base is 0x%x, used is %d.\n", blas_memdscr_tab[3]->base, blas_memdscr_tab[3]->used);
+    }
+    else {
+        printf("BLAS init error.\n");
+    }
+#endif
+
+    return(blas_ret_err_code);
+} /* bli_l3_mem_config */
+
+/*==============================================================================
+ * This function reconfigures L1D and L2 after processing is finished.
+ *
+ * l1D_SRAM_size_orig, l2_SRAM_size_orig - SRAM sizes to restore (the values
+ *   reported by bli_l3_mem_config() through its output arguments).
+ *
+ * A cache is only reconfigured when its current SRAM size differs from the
+ * requested one.  Both restores are always attempted, so that a failure on
+ * L1D does not leave L2 in its enlarged-SRAM configuration.
+ *
+ * Returns -3 if the L1D restore failed, -4 if only the L2 restore failed,
+ * TICBLAS_SUCCESS otherwise.
+ *============================================================================*/
+int bli_l3_mem_reconfig(size_t l1D_SRAM_size_orig, size_t l2_SRAM_size_orig)
+{
+    int l1d_cfg_err, l2_cfg_err;
+
+#ifdef TI_CBLAS_DEBUG
+    printf("After calling BLIS, malloc_size is %d.\n", malloc_size);
+    printf("After calling BLIS, used_size in memory descriptor is %d.\n", blas_memdscr_tab[3]->used);
+#endif
+
+    /* configure L1D back if necessary */
+    l1d_cfg_err = LIB_CACHE_SUCCESS;
+    if(l1D_SRAM_size_orig != lib_get_L1D_SRAM_size()) {
+        #pragma omp parallel
+        {
+            /* Each thread keeps its own result and only publishes a failure,
+             * so a later successful thread cannot overwrite an error and the
+             * shared variable is never written concurrently. */
+            int core_err = lib_L1D_config_SRAM(l1D_SRAM_size_orig);
+            if(core_err != LIB_CACHE_SUCCESS) {
+                #pragma omp critical
+                {
+                    l1d_cfg_err = core_err;
+                }
+            }
+        }
+    }
+
+    /* configure L2 back if necessary (attempted even if L1D failed) */
+    l2_cfg_err = LIB_CACHE_SUCCESS;
+    if(l2_SRAM_size_orig != lib_get_L2_SRAM_size()) {
+        #pragma omp parallel
+        {
+            int core_err = lib_L2_config_SRAM(l2_SRAM_size_orig);
+            if(core_err != LIB_CACHE_SUCCESS) {
+                #pragma omp critical
+                {
+                    l2_cfg_err = core_err;
+                }
+            }
+        }
+    }
+
+    /* report the L1D failure first to keep the original return codes */
+    if(l1d_cfg_err != LIB_CACHE_SUCCESS) {
+        return(-3);
+    }
+    if(l2_cfg_err != LIB_CACHE_SUCCESS) {
+        return(-4);
+    }
+
+    return(TICBLAS_SUCCESS);
+} /* bli_l3_mem_reconfig */
+
+/*==============================================================================
+ * BLIS set-up entry point; to be run before the first CBLAS call is made.
+ * The work itself is delegated to tiCblasNew().  l3_buf and l2_buf are only
+ * echoed in the TI_CBLAS_DEBUG trace and are not used otherwise.
+ *============================================================================*/
+void ti_bli_init_dsp(char *l3_buf, char *l2_buf)
+{
+#ifdef TI_CBLAS_DEBUG
+    printf("In function ti_bli_init_dsp, l3_buff is 0x%x, l2_buf is 0x%x.\n",
+           (unsigned int)l3_buf,
+           (unsigned int)l2_buf);
+
+    /* reset the debug-only malloc_size global before tracing it */
+    malloc_size = 0;
+    printf("Before calling bli_init, malloc_size is %d.\n", malloc_size);
+#endif
+
+    tiCblasNew();
+
+#ifdef TI_CBLAS_DEBUG
+    printf("After calling bli_init, malloc_size is %d.\n", malloc_size);
+#endif
+}
+
+/*==============================================================================
+ * Counterpart of ti_bli_init_dsp: releases the memories it allocated by
+ * delegating to tiCblasDelete().
+ *============================================================================*/
+void ti_bli_finalize_dsp(void)
+{
+    tiCblasDelete();
+}
+
+/* Nothing after this line */
index 0c4ada17164a5785c384385d5c43b27314e366cb..d8651cf4d17ae31454804650c4133e80de3bd2a7 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
#define OFFLOAD_TO_DSP 1
#define NO_OFFLOAD_TO_DSP 0
similarity index 52%
rename from blasblisacc/src/blas_wrap_gen.sh
rename to blasblisacc/src/wrap_gen/blas_wrap_gen.sh
index 5df08d327e8520e2f99c10678d376d0e549e41ec..cacce9fb743c64320ae12a4d02eb69173eb8bbd5 100755 (executable)
rename from blasblisacc/src/blas_wrap_gen.sh
rename to blasblisacc/src/wrap_gen/blas_wrap_gen.sh
index 5df08d327e8520e2f99c10678d376d0e549e41ec..cacce9fb743c64320ae12a4d02eb69173eb8bbd5 100755 (executable)
-./oclgen.pl -f -offload=002 -dspl1=4 -dspl2=128 -offmin=10000 -offmax=10000000 blas ./wrap_gen/cblas.h
+./oclgen.pl -f -offload=002 -dspl1=4 -dspl2=128 -offmin=10000 -offmax=10000000 blas ./cblas.h
+
+#sudo apt-get install ctags
\ No newline at end of file
similarity index 95%
rename from blasblisacc/src/oclgen.pl
rename to blasblisacc/src/wrap_gen/oclgen.pl
index 32bd923d9c8fe53d6d7e7635701fe0447891f174..74c949da7e0e657d1a474c6cc23ad98146791128 100755 (executable)
rename from blasblisacc/src/oclgen.pl
rename to blasblisacc/src/wrap_gen/oclgen.pl
index 32bd923d9c8fe53d6d7e7635701fe0447891f174..74c949da7e0e657d1a474c6cc23ad98146791128 100755 (executable)
my @offloaded; # array of function names to be offloaded, filled in by generate_arm...
my $blas_prefix = 'cblas_';
-my $blas_L1 = '.asum|.axpy|.copy|.dot|.sdot|.dotc|.dotu|.nrm2|.rot|.rotg|.rotmg|.scal|.swap|i.amax|i.amin|.cabs1|';
+my $blas_L1 = '.asum|.axpy|.copy|.dot|.sdot|.dotc|.cdotc_sub|.cdotu_sub|.dotu|.nrm2|.rot|.rotg|.rotmg|.scal|.swap|i.amax|i.amin|.cabs1|.csscal|.drotm|.dzasum|.dznrm2|.scasum|.scnrm2|.sdsdot|.srotm|.xerbla|.zdotu_sub|.zdscal|';
my $blas_L2 = '.gbmv|.gemv|.ger|.gerc|.geru|.hbmv|.hemv|.her|.her2|.hpmv|.hpr|.hpr2|.sbmv|.spmv|.spr|.spr2|.symv|.syr|.syr2|.tbmv|.tbsv|.tpmv|.tpsv|.trmv|.trsv|';
my $blas_L3 = '.gemm|.hemm|.herk|.her2k|.symm|.syrk|.syr2k|.trmm|.trsm|';
my $blas_L123 = "${blas_L1}|${blas_L2}|${blas_L3}";
return $code;
}
+sub get_init_code # returns fixed OpenCL C source text: the ocl_bli_init / ocl_bli_finalize kernels
+{ # takes no arguments; the literal below is emitted verbatim, so keep it byte-exact
+ my $code = "
+void ti_bli_init_dsp(global char *l3_buf, local char *l2_buf);
+kernel void ocl_bli_init(global char *l3_buf, local char *l2_buf)
+{ ti_bli_init_dsp(l3_buf, l2_buf); }
+void ti_bli_finalize_dsp(void);
+kernel void ocl_bli_finalize(void)
+{ ti_bli_finalize_dsp(); }
+";
+ return $code; # each kernel just forwards to the ti_bli_*_dsp function prototyped right above it
+}
+
# generates the initial portion of the Makefile. This needs to be done
# only once, and not on a per-function basis
sub generate_makefile_prologue
OBJS = ${namespace}_initfini.o
HOST_OBJS =
-CPP_FLAGS = -D_LITTLE_ENDIAN -D__ARMv7 -DDEVICE_K2H
+CPP_FLAGS = -D_LITTLE_ENDIAN -D__ARMv7 -DSOC_K2H
CL6X_FLAGS = \$(INCS)
CLOCL_FLAGS =
OBJCOPY = objcopy
my $hdr_init = generate_header_specific_init_code();
my $armcode = <<"END_ARM_INIT";
$source_code_header
-#include "${namespace}.h"
+#include "${namespace}_acc.h"
#ifdef ${NAMESPACE}_FAT_BINARY
#include "${namespace}_kernel.dsp_h"
my $func = shift;
my $NAMESPACE = uc($namespace);
$func_name = substr($func, 7);
-
+#JXU
+# print "function name is $func_name\n";
+#JXU
if (index($blas_L1, "${func_name}|") != -1) {
return "${NAMESPACE}_L1_OFFLOAD";
}
# return "${NAMESPACE}_L3_OFFLOAD" if ($func_name =~ /$blas_L3/);
# return "${NAMESPACE}_L2_OFFLOAD" if ($func_name =~ /$blas_L2/);
+#JXU
+ print "function name not matched!\n";
+#JXU
+
# if no match then use the default offload variable
return "${namespace}_offload";
}
#my $arm_func_cond = get_func_based_arm_cond($trampname, \@kernelargs);
my $arm_func_cond = get_offload_decision($trampname, \@kernelargs);
my $arm_condition_code = "";
+#JXU
+# print "trampname is $trampname\n";
+#JXU
my $offload_var = get_func_specific_offload_var($trampname);
+#JXU
+ print "offload_var is $offload_var\n";
+#JXU
my $indent = "";
my $arm_end_condition_code = "";
if (!$commentarm) {
my $armcode = <<"ARM_FROM_PROTO";
$source_code_header
-#include "${namespace}.h"
+#include "${namespace}_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
/* Do an init on first use */
if (!${namespace}_init_done) ${namespace}_init();
${NAMESPACE}_DEBUG_PRINT("Intercepted call to %s\\n", "$trampname");
+ARM_FROM_PROTO
- ${NAMESPACE}_PROFILE_START();
+ if (index($trampname, "swap") != -1) {
+ $armcode .= "
+ /* OpenCL cannot deal with overlapping memory regions. This is an issue when you
+ * are trying to swap two rows of a matrix, where the matrix is column major. Hence,
+ * the offload of this routine to the DSP is disabled.
+ */
+#ifndef TI_CBLAS_SWAP_ENABLE_OFFLOAD
+ TI_CBLAS_PROFILE_START();
+ TI_CBLAS_DEBUG_PRINT(\"Executing ARM %s\\n\", \"$trampname\");
+ $no_offload_arm_call
+ TI_CBLAS_PROFILE_REPORT(\" Entire %s call (ARM) took %8.2f us\\n\",\"$trampname\", (float) clock_diff);
+ return ;
+#else
+";
+}
+ $armcode .= "
+ ${NAMESPACE}_PROFILE_START();
$arm_comment_header
- $arm_condition_code
- ${indent}${NAMESPACE}_DEBUG_PRINT("Executing ARM %s\\n", "$trampname");
- ${indent}$no_offload_arm_call
- ${indent}${NAMESPACE}_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\\n","$trampname", (float) clock_diff);
- ${indent}$no_offload_arm_return
- $arm_end_condition_code
- $arm_comment_trailer
+ $arm_condition_code
+ ${NAMESPACE}_DEBUG_PRINT(\"Executing ARM %s\\n\", \"$trampname\");
+ $no_offload_arm_call
+ ${NAMESPACE}_PROFILE_REPORT(\" Entire %s call (ARM) took %8.2f us\\n\",\"$trampname\", (float) clock_diff);
+ $no_offload_arm_return
+ $arm_end_condition_code
+ $arm_comment_trailer
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- ${indent}${NAMESPACE}_DEBUG_PRINT("Offloading to DSP %s\\n", "$trampname");
+ ${NAMESPACE}_DEBUG_PRINT(\"Offloading to DSP %s\\n\", \"$trampname\");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ${namespace}_get_kernel($trampdef, "ocl_$trampname");
+ __K = ${namespace}_get_kernel($trampdef, \"ocl_$trampname\");
+
#ifdef __cplusplus
try
#else
cl_int err = CL_SUCCESS;
#endif
{
-
-ARM_FROM_PROTO
+";
my $i = 0;
foreach $arg (@kernelargs) {
size_buf$arg = MAX(size_buf$arg,1);
#ifdef __cplusplus
- Buffer buf_$arg(${namespace}_ocl_context, $perms|CL_MEM_USE_HOST_PTR, size_buf$arg, (void *)$arg);
+ Buffer buf_$arg(*${namespace}_ocl_context, $perms|CL_MEM_USE_HOST_PTR, size_buf$arg, (void *)$arg);
__K->setArg($i, buf_$arg);
#else
cl_mem buf_$arg = clCreateBuffer(${namespace}_ocl_context, $perms|CL_MEM_USE_HOST_PTR, size_buf$arg, (void *)$arg, &err);
else {
$armcode .= "
#ifdef __cplusplus
- Buffer buf_$arg(${namespace}_ocl_context, $perms|CL_MEM_USE_HOST_PTR, ${modify_bufsize}sizeof($sizeoftype), (void *)$arg);
+ Buffer buf_$arg(*${namespace}_ocl_context, $perms|CL_MEM_USE_HOST_PTR, ${modify_bufsize}sizeof($sizeoftype), (void *)$arg);
__K->setArg($i, buf_$arg);
#else
cl_mem buf_$arg = clCreateBuffer(${namespace}_ocl_context, $perms|CL_MEM_USE_HOST_PTR, ${modify_bufsize}sizeof($sizeoftype), (void *)$arg, &err);
# print "kernel_name is " . $kernel_name . "\n";
# print "blas_L3 string is ". $blas_L3 . "\n";
if (index($blas_L3, '.'.$kernel_name.'|') != -1) {
- print "This is a level 3 function - " . $trampname . "\n";
+# print "This is a level 3 function - " . $trampname . "\n";
$i_plus_1 = $i+1;
$armcode .= "
+ void *msmc_ptr;
+ size_t msmc_size = MSMC_BUF_SIZE;
+ msmc_ptr = ${namespace}_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- Buffer buf_MSMC(${namespace}_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
+ Buffer buf_MSMC(*${namespace}_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(${namespace}_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg($i, buf_MSMC);
#else
- cl_mem buf_MSMC = clCreateBuffer(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE, NULL, &err);
+ //cl_mem buf_MSMC = clCreateBuffer(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE, NULL, &err);
+ cl_mem buf_MSMC = clCreateBuffer(${namespace}_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr, &err);
TI_CBLAS_OCL_CHKERROR(\"clCreateBuffer\",err);
err |= clSetKernelArg(__K, $i, sizeof(buf_MSMC), &buf_MSMC);
TI_CBLAS_OCL_CHKERROR(\"clSetKernelArg\",err);
#endif
#ifdef __cplusplus
- __K->setArg($i_plus_1, __local(L2_BUF_SIZE));
+ __K->setArg($i_plus_1, msmc_size);
+#else
+ err |= clSetKernelArg(__K, $i_plus_1, sizeof(msmc_size), &msmc_size);
+#endif
+";
+ $i=$i+2;
+ $i_plus_1 = $i+1;
+ $armcode .= "
+ void *ddr_ptr;
+ size_t ddr_size = DDR_BUF_SIZE;
+ ddr_ptr = __malloc_ddr(DDR_BUF_SIZE);
+ Buffer buf_DDR(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, DDR_BUF_SIZE, ddr_ptr);
+ __K->setArg($i, buf_DDR);
+ __K->setArg($i_plus_1, ddr_size);
+";
+ $i=$i+2;
+ $armcode .= "
+ /* create a buffer argument to get the return error code from the DSP */
+ int err_code;
+#ifdef __cplusplus
+ Buffer buf_err(*${namespace}_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code);
+ __K->setArg($i, buf_err);
#else
- err |= clSetKernelArg(__K, $i_plus_1, L2_BUF_SIZE, NULL);
+ cl_mem buf_err = clCreateBuffer(*${namespace}_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code, &err);
+ ${NAMESPACE}_OCL_CHKERROR(\"clCreateBuffer\",err);
+ err |= clSetKernelArg(__K, $i, sizeof(buf_err), &buf_err);
+ ${NAMESPACE}_OCL_CHKERROR(\"clSetKernelArg\",err);
#endif
";
+ $i++;
}
+
if ($tramptype !~ /^void$/i) {
$armcode .= "
/* create a buffer argument to get the return value from the DSP */
$tramptype retval;
#ifdef __cplusplus
- Buffer buf_retval(${namespace}_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof($tramptype), &retval);
+ Buffer buf_retval(*${namespace}_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof($tramptype), &retval);
__K->setArg($i, buf_retval);
#else
- cl_mem buf_retval = clCreateBuffer(${namespace}_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof($tramptype), &retval, &err);
+ cl_mem buf_retval = clCreateBuffer(*${namespace}_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof($tramptype), &retval, &err);
${NAMESPACE}_OCL_CHKERROR(\"clCreateBuffer\",err);
err |= clSetKernelArg(__K, $i, sizeof(buf_retval), &buf_retval);
${NAMESPACE}_OCL_CHKERROR(\"clSetKernelArg\",err);
#endif
";
}
+
$armcode .= "
#ifdef __cplusplus
- ${namespace}_ocl_Q.enqueueTask(*__K, 0, &e);
+ ${namespace}_ocl_Q->enqueueTask(*__K, 0, &e);
e.wait();
#else
cl_event e;
${NAMESPACE}_OCL_CHKERROR(\"clWaitForEvents\",err);
err |= clReleaseEvent(e);
${NAMESPACE}_OCL_CHKERROR(\"clReleaseEvent\",err);
-
#endif
+";
+ if (index($blas_L3, '.'.$kernel_name.'|') != -1) {
+ $armcode .= "
+ if(err_code != TICBLAS_SUCCESS) {
+ printf(\"Error code returned by offloaded $trampname is %d\\n.\", err_code);
+ }
+
+ ${namespace}_mem_free(msmc_ptr);
+ __free_ddr(ddr_ptr);
+";
+ }
+ $armcode .= "
+ ${namespace}_delete_kernel(__K);
+
${NAMESPACE}_DEBUG_PRINT(\"Finished executing %s\\n\", \"$trampname\");
${NAMESPACE}_PROFILE_REPORT(\" Entire %s call (DSP) took %8.2f us\\n\",\"$trampname\", (float) clock_diff);
return ";
$armcode .= "0" unless ($tramptype =~ /^void$/i);
$armcode .= ";\n";
$armcode .= "\t}\n#endif\n";
+ if (index($trampname, "swap") != -1) {
+ $armcode .= "#endif //TI_CBLAS_SWAP_ENABLE_OFFLOAD\n";
+ }
$armcode .= "}\n";
return $armcode;
sub generate_kernel_from_proto($)
{
my $string = shift;
+ print "In generate_kernel_from_proto, string is " . $string. "\n";
my $oclcode = "";
my @tmp = split /[\(\)]/,$string;
else {
if (index($blas_L3, '.'.$kernel_name.'|') != -1) {
print "In generate_kernel_from_proto, this is a level 3 function - " . $trampname . "\n";
- $oclcode .= ", global double *l3_buf, local double *l2_buf_loc";
- $trampproto .= ", global double *l3_buf, local double *l2_buf_loc";
+ $oclcode .= ", global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code";
+ $trampproto .= ", global void *l3_buf, size_t l3_buf_size, global void *ddr_buf, size_t ddr_buf_size, global int *err_code";
}
}
}
}
- $oclcode .= "${comma}$tramptype *retval" unless ($tramptype =~ /^void$/i);
- $trampproto .= "${comma}$tramptype *retval" unless ($tramptype =~ /^void$/i);
+ $oclcode .= "${comma}global $tramptype *retval" unless ($tramptype =~ /^void$/i);
+ $trampproto .= "${comma}global $tramptype *retval" unless ($tramptype =~ /^void$/i);
$trampproto .= ");";
$oclcode .= ")\n{ ";
$oclcode .= "${comma}retval" unless ($tramptype =~ /^void$/i);
if (index($blas_L3, '.'.$kernel_name.'|') != -1) {
- $oclcode .= ", l3_buf, l2_buf_loc";
+ $oclcode .= ", l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, err_code";
}
$oclcode .= "); }";
$oclcode = $trampproto . "\n" . $oclcode;
{
my $facade_prologue = <<"FACADE_PROLOGUE";
$source_code_header
-#include <stdio.h>
#include "../../cblas/include/cblas.h"
-#include "blis.h"
-#define DEVICE_K2H
-
-#include <dsp_c.h>
-
-#define getNextMultiple(x, y) ( ( ((x)+(y)-1)/(y) )* (y) )
-// L1 buffer is hardwared here
-#define L1_BUF_LOC 0x00F00000
-
-// note these pointers must be filled if used functions
-char *pool_mk_mem_L1;
-char *pool_kn_mem_L1;
-char *pool_mn_mem_L1;
-
-char *pool_mk_mem_L2;
-char *pool_kn_mem_L2;
-char *pool_mn_mem_L2;
+#include "../../ticblas/ticblas.h"
+
+#ifdef TI_CBLAS_DEBUG
+#include "stdio.h"
+
+extern char *pool_mk_mem_L1;
+extern char *pool_kn_mem_L1;
+extern char *pool_mn_mem_L1;
+extern char *pool_mk_mem_L2;
+extern char *pool_kn_mem_L2;
+extern char *pool_mn_mem_L2;
+extern char *pool_mk_mem_L3;
+extern char *pool_kn_mem_L3;
+extern char *pool_mn_mem_L3;
+#endif
-char *pool_mk_mem_L3;
-char *pool_kn_mem_L3;
-char *pool_mn_mem_L3;
+extern int bli_l3_mem_config(void *msmc_buf, size_t msmc_buf_size, void *ddr_buf, size_t ddr_buf_size, size_t *l1D_SRAM_size_orig, size_t *l2_SRAM_size_orig);
+extern int bli_l3_mem_reconfig(size_t l1D_SRAM_size_orig, size_t l2_SRAM_size_orig);
FACADE_PROLOGUE
return $facade_prologue;
$kernel_name = substr($trampname, 7);
if (index($blas_L3, '.'.$kernel_name.'|') != -1) {
- $dspcode .= ", float *l3_buf, float *l2_buf_loc";
- $trampproto .= ", float *l3_buf, float *l2_buf_loc" ;
+ $dspcode .= ", void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code";
+ $trampproto .= ", void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code";
}
$trampproto .= ");";
if (index($blas_L3, '.'.$kernel_name.'|') != -1) {
# print "facade code to setup cache for level 3 function ". $trampname ."\n";
$dspcode .= "
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
";
}
else {
# print "facade code to setup cache for level 1 or 2 function ". $trampname ."\n";
+# $dspcode .= "
+# #pragma omp parallel
+# {
+# __cache_l2_flush();
+# __cache_l2_512k();
+# }
+#";
$dspcode .= "
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
";
}
if (index($blas_L3, '.'.$kernel_name.'|') != -1) {
# print "facade code to return default cache for level 3 function ". $trampname ."\n";
$dspcode .= "
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
";
}
else {
# print "facade code to return default cache for level 1 or 2 function ". $trampname ."\n";
- $dspcode .= "
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
-";
+# $dspcode .= "
+# // return default L2 cache (128 K)
+# #pragma omp parallel
+# {
+# __cache_l2_flush();
+# __cache_l2_128k();
+# }
+#";
}
$dspcode .= "}\n";
# $dspcode = $trampproto . "\n" . $dspcode;
write_output(generate_kernel_prologue(),"${namespace}_kernel.cl");
print "DSP ${namespace}_kernel.cl generated.\n";
write_output(get_enums_and_defines(),"${namespace}_kernel.cl");
+ write_output(get_init_code(),"${namespace}_kernel.cl");
print "ARM ${namespace}_initfini.c code:\n" unless ($f);
write_output(generate_arm_init(), "${namespace}_initfini.c");
#print "Makefile:\n" unless ($f);
unlink glob "ti_cblas_initfini.c";
unlink glob "ti_cblas_kernel.cl";
unlink glob "ti_cblas_cblas*.c";
- unlink glob "ti_cblas.h";
+ unlink glob "ti_cblas_acc.h";
#unlink glob "*.inc";
#unlink glob "*.a";
}
diff --git a/blis/Makefile b/blis/Makefile
index b7ee56fbe1b153b485bf4ca8ff13dd274074bb7a..c41726c53e2e59a9dce2f159970a65c198d36c41 100644 (file)
--- a/blis/Makefile
+++ b/blis/Makefile
endif # pnacl
# --- Install rules ---
+#ifeq ($(CONFIG_NAME),c66x) $(filter $(var),X `')
install-libs: check-env $(MK_LIBS_INST_W_VERS_CONF)
-ifeq ($(CONFIG_NAME),c66x)
+ifeq ($(CONFIG_NAME), $(filter c66x am57x shannon,$(CONFIG_NAME)))
@echo "Installing as DSP Binary"
@cp $(MK_BLIS_LIB_INST_W_VERS_CONF) $(patsubst %.a, %.ae66, $(MK_BLIS_LIB_INST_W_VERS_CONF))
@$(SYMLINK) $(notdir $(patsubst %.a, %.ae66, $(MK_BLIS_LIB_INST_W_VERS_CONF))) $(INSTALL_PREFIX)/lib/$(BLIS_LIB_BASE_NAME).ae66
index 8e9d87baf59ac9decdfff946179f9cb03092b80a..290eb9f9d97babbf26fa4fd76ba5fe2fc54cb803 100755 (executable)
#define BLIS_CONFIG_H
#define BLIS_ENABLE_C66X_BUILD
+
#define BLIS_ENABLE_C66X_MEM_POOLS
-#define BLIS_ENABLE_C66X_OPENCL
+#ifdef BLIS_ENABLE_C66X_OPENCL
+// clocl creates a cio section in L2 when fprintf is used. Redefining fprintf to map to printf.
+#define fprintf ti_printf
+#endif
-// -- OPERATING SYSTEM ---------------------------------------------------------
+// -- OPERATING SYSTEM ---------------------------------------------------------
+
// -- INTEGER PROPERTIES -------------------------------------------------------
// The bit size of the integer type used to track values such as dimensions,
// -- c66x headers -------------------------------------------------------------
#include "c6x.h"
-#include <ti/csl/device/k2h/src/cslr_device.h>
+//#include <ti/csl/device/k2h/src/cslr_device.h>
+
+#include <libarch.h>
#include <ti/csl/csl_chipAux.h> // CSL_chipReadDNUM -> to read coreID
#include <ti/csl/csl_cacheAux.h> // CACHE_invL1d
+// for __clock64()
+//#include <dsp_c.h>
+
// -- EDMA ---------------------------------------------------------------------
#define BLIS_ENABLE_C66X_EDMA
#ifdef BLIS_ENABLE_C66X_EDMA
+
+#if defined(MEM_MODEL_LARGE) || defined (MEM_MODEL_MEDIUM)
+
+#define BLIS_GEMM_DMAA_CNTL gemm_dmaa_cntl
+#define BLIS_GEMM_DMAB_CNTL gemm_dmab_cntl
+
+#elif defined (MEM_MODEL_SMALL)
+#define BLIS_GEMM_DMAA_CNTL NULL // disabling EDMA
+#define BLIS_GEMM_DMAB_CNTL NULL
+#endif
+
+/*
+#if USING_FC_EDMAMGR
+#include <xdc/std.h>
+
+#define ECPY_INLINE_ALL 1
+#define EDMAMGR_INLINE_ALL 1
+#include <ti/sdo/fc/edmamgr/edmamgr.h>
+#else
#include "edmamgr.h"
-//#include <ti/sdo/fc/edmamgr/edmamgr.h>
+#endif
+*/
#define BLIS_C66X_MAXDMASTRIDE 0x7FFF
#include "idma.h"
#endif
-
+// -- PROFILE -----------------------------------------------------------------
+//uncomment to Profile performance
+//#define BLIS_ENABLE_PROFILE
// -- MULTITHREADING -----------------------------------------------------------
// The maximum number of BLIS threads that will run concurrently.
+
+/* While testing this code on Hawking, the value of BLIS_MAX_NUM_THREADS
+ * needs to be 8. OpenMP randomly assigns the OpenMP threads to the cores.
+ * This value needs to be 8 to make sure all the cores are initialized
+ * before the openMP region begins.
+ *
+ * When porting to a specific architecture, change BLIS_MAX_NUM_THREADS to the
+ * number of cores available on the device, and change BLIS_C66X_IC_NT to
+ * BLIS_MAX_NUM_THREADS.
+*/
#define BLIS_ENABLE_MULTITHREADING
#define BLIS_ENABLE_OPENMP
#define BLIS_MAX_NUM_THREADS 8
+#if defined(MEM_MODEL_LARGE) || defined (MEM_MODEL_MEDIUM)
+#define BLIS_MAX_NUM_THREADS 8
+#define BLIS_C66X_IC_NT BLIS_MAX_NUM_THREADS
+#elif defined (MEM_MODEL_SMALL)
+#define BLIS_MAX_NUM_THREADS 8
+#define BLIS_C66X_IC_NT 8
+#endif
+
+#define BLIS_C66X_JC_NT 1
+#define BLIS_C66X_JR_NT 1
+#define BLIS_C66X_IR_NT 1
+
+
+
// -- MEMORY ALLOCATION --------------------------------------------------------
// contiguous memory pools.
#define BLIS_NUM_MC_X_KC_BLOCKS_L3 0
+#if defined (MEM_MODEL_SMALL)
+#define BLIS_NUM_MC_X_KC_BLOCKS_L2 1 // no need of ping-pong buffer if EDMA is not used. for matrix A, DDR->L2
+#else
#define BLIS_NUM_MC_X_KC_BLOCKS_L2 2 //Each L2 ram is local to the DSP Just need one buffer per thread that is packed
+#endif
#define BLIS_NUM_MC_X_KC_BLOCKS_L1 0
#define BLIS_NUM_MR_X_KC_BLOCKS_L1 2 // To transfer A to L1 in a ping-poing manner
#define BLIS_NUM_MC_X_KC_BLOCKS 2*BLIS_MAX_NUM_THREADS + 1 //To test w/o DMA and L2, L3 memory, all memory must be in DDR3 now
+#if defined (MEM_MODEL_SMALL)
+#define BLIS_NUM_KC_X_NC_BLOCKS_L3 1 // no need of ping-pong buffer if EDMA is not used. for matrix B, DDR->L3
+#else
#define BLIS_NUM_KC_X_NC_BLOCKS_L3 2 // Each thread shares a B block, so do not need 8 buffers *BLIS_MAX_NUM_THREADS // One for the partitioned B1, and one for the packed B1
+#endif
#define BLIS_NUM_KC_X_NC_BLOCKS_L2 0
#define BLIS_NUM_KC_X_NC_BLOCKS_L1 0
#define BLIS_NUM_KC_X_NR_BLOCKS_L1 1
#define BLIS_NUM_MC_X_NC_BLOCKS_L3 0
#define BLIS_NUM_MC_X_NC_BLOCKS_L2 0
+// still using EDMA in bli_gemm_ker_var2 (DDR->L2 for C output buffer, L2->L1 for matrix A, MSMC->L1 for B)
#define BLIS_NUM_MC_X_NR_BLOCKS_L2 3 //Bringing C into the L2 memory. We need 3 buffers, one to read, one to compute and one to write.
#define BLIS_NUM_MC_X_NC_BLOCKS_L1 0
#define BLIS_NUM_MR_X_NR_BLOCKS_L1 0
+extern void * blasGetMemHandle();
#endif
index f2eb785811367a4d093676f8bec0d99980b17812..1a9c9100ed9eece66534d9245ccc3096e09f4992 100755 (executable)
// (b) NR (for triangular operations such as trmm and trsm).
//
-/*
-#define BLIS_DEFAULT_MC_S 336
-#define BLIS_DEFAULT_KC_S 528
-#define BLIS_DEFAULT_NC_S 4096
-
-#define BLIS_DEFAULT_MC_D 64
-#define BLIS_DEFAULT_KC_D 128
-#define BLIS_DEFAULT_NC_D 4096
-
-#define BLIS_DEFAULT_MC_C 64
-#define BLIS_DEFAULT_KC_C 128
-#define BLIS_DEFAULT_NC_C 4096
-
-#define BLIS_DEFAULT_MC_Z 64
-#define BLIS_DEFAULT_KC_Z 128
-#define BLIS_DEFAULT_NC_Z 4096
-*/
-
-
-// Old values with only 2 buffers of KN in L2
-//#define BLIS_DEFAULT_MC_S 96
-//#define BLIS_DEFAULT_KC_S 428
-//#define BLIS_DEFAULT_NC_S 944
-//
-//#define BLIS_DEFAULT_MC_D 104
-//#define BLIS_DEFAULT_KC_D 220
-//#define BLIS_DEFAULT_NC_D 820
-//
-//#define BLIS_DEFAULT_MC_C 96
-//#define BLIS_DEFAULT_KC_C 260
-//#define BLIS_DEFAULT_NC_C 820
-//
-//#define BLIS_DEFAULT_MC_Z 90
-//#define BLIS_DEFAULT_KC_Z 178
-//#define BLIS_DEFAULT_NC_Z 584
-
-//Values for 2 buffers of KN in L3
+#if defined(MEM_MODEL_LARGE)
#define BLIS_DEFAULT_MC_S 144
#define BLIS_DEFAULT_KC_S 428
#define BLIS_DEFAULT_NC_S 944
-// MR = 4, NR = 4
-//#define BLIS_DEFAULT_MC_S 40
-//#define BLIS_DEFAULT_KC_S 580
-//#define BLIS_DEFAULT_NC_S 436
-
#define BLIS_DEFAULT_MC_D 132
#define BLIS_DEFAULT_KC_D 220
#define BLIS_DEFAULT_NC_D 864
#define BLIS_DEFAULT_3M_KC_Z 100
#define BLIS_DEFAULT_3M_NC_Z 100
+#elif defined (MEM_MODEL_MEDIUM)
+#define BLIS_DEFAULT_MC_S 128
+#define BLIS_DEFAULT_KC_S 240
+#define BLIS_DEFAULT_NC_S 1288
-//#define BLIS_DEFAULT_MC_S 104
-////#define BLIS_DEFAULT_KC_S 440 //So that 2 MRxKC buffers can fit in L1 = 448, reduced to 447 to accommodate for bank conflict, reduced to 440 because KC must be divisible by MR & NR
-//#define BLIS_DEFAULT_KC_S 260
-//#define BLIS_DEFAULT_NC_S 1184 // Increased to fill up 4MB of L3 also NC should be divisible by NR
-//
-//#define BLIS_DEFAULT_MC_D 100
-//#define BLIS_DEFAULT_KC_D 110
-//#define BLIS_DEFAULT_NC_D 884
-//
-//#define BLIS_DEFAULT_MC_C 64
-//#define BLIS_DEFAULT_KC_C 110
-//#define BLIS_DEFAULT_NC_C 512
-//
-//#define BLIS_DEFAULT_MC_Z 32
-//#define BLIS_DEFAULT_KC_Z 50
-//#define BLIS_DEFAULT_NC_Z 256
+#define BLIS_DEFAULT_MC_D 68
+#define BLIS_DEFAULT_KC_D 240
+#define BLIS_DEFAULT_NC_D 844
+#define BLIS_DEFAULT_MC_C 68
+#define BLIS_DEFAULT_KC_C 240
+#define BLIS_DEFAULT_NC_C 844
-// -- Register blocksizes --
+// NOTE(review): NC_Z is 631 here, but BLIS_DEFAULT_4M_NC_Z below uses 628 with the same MC/KC (60/136).
+// 631 is odd, so it cannot be a multiple of an even NR -- confirm this value is intended.
+#define BLIS_DEFAULT_MC_Z 60
+#define BLIS_DEFAULT_KC_Z 136
+#define BLIS_DEFAULT_NC_Z 631
+
+#define BLIS_DEFAULT_4M_MC_C 68
+#define BLIS_DEFAULT_4M_KC_C 240
+#define BLIS_DEFAULT_4M_NC_C 844
+
+#define BLIS_DEFAULT_4M_MC_Z 60
+#define BLIS_DEFAULT_4M_KC_Z 136
+#define BLIS_DEFAULT_4M_NC_Z 628
+
+#define BLIS_DEFAULT_3M_MC_C 68
+#define BLIS_DEFAULT_3M_KC_C 160
+#define BLIS_DEFAULT_3M_NC_C 720
+
+#define BLIS_DEFAULT_3M_MC_Z 52
+#define BLIS_DEFAULT_3M_KC_Z 100
+#define BLIS_DEFAULT_3M_NC_Z 524
+
+
+#elif defined(MEM_MODEL_SMALL)
+// use this when EDMA is disabled for A and B
+#define BLIS_DEFAULT_MC_S 112
+#define BLIS_DEFAULT_KC_S 428
+#define BLIS_DEFAULT_NC_S 1224
+
+#define BLIS_DEFAULT_MC_D 96
+#define BLIS_DEFAULT_KC_D 220
+#define BLIS_DEFAULT_NC_D 1184
+
+#define BLIS_DEFAULT_MC_C 88
+#define BLIS_DEFAULT_KC_C 260
+#define BLIS_DEFAULT_NC_C 1008
+
+#define BLIS_DEFAULT_MC_Z 64
+#define BLIS_DEFAULT_KC_Z 178
+#define BLIS_DEFAULT_NC_Z 736
+
+#define BLIS_DEFAULT_4M_MC_C 108
+#define BLIS_DEFAULT_4M_KC_C 220
+#define BLIS_DEFAULT_4M_NC_C 1184
+#define BLIS_DEFAULT_4M_MC_Z 64
+#define BLIS_DEFAULT_4M_KC_Z 178
+#define BLIS_DEFAULT_4M_NC_Z 736
+
+#define BLIS_DEFAULT_3M_MC_C 64
+#define BLIS_DEFAULT_3M_KC_C 220
+#define BLIS_DEFAULT_3M_NC_C 792
+
+#define BLIS_DEFAULT_3M_MC_Z 48
+#define BLIS_DEFAULT_3M_KC_Z 178
+#define BLIS_DEFAULT_3M_NC_Z 488
+
+/*
+#define BLIS_DEFAULT_MC_S 144
+#define BLIS_DEFAULT_KC_S 428
+#define BLIS_DEFAULT_NC_S 1224
+
+#define BLIS_DEFAULT_MC_D 140
+#define BLIS_DEFAULT_KC_D 220
+#define BLIS_DEFAULT_NC_D 1184
+
+#define BLIS_DEFAULT_MC_C 116
+#define BLIS_DEFAULT_KC_C 260
+#define BLIS_DEFAULT_NC_C 1008
+
+#define BLIS_DEFAULT_MC_Z 86
+#define BLIS_DEFAULT_KC_Z 178
+#define BLIS_DEFAULT_NC_Z 736
+
+#define BLIS_DEFAULT_4M_MC_C 140
+#define BLIS_DEFAULT_4M_KC_C 220
+#define BLIS_DEFAULT_4M_NC_C 1184
+
+#define BLIS_DEFAULT_4M_MC_Z 86
+#define BLIS_DEFAULT_4M_KC_Z 178
+#define BLIS_DEFAULT_4M_NC_Z 736
+
+#define BLIS_DEFAULT_3M_MC_C 88
+#define BLIS_DEFAULT_3M_KC_C 220
+#define BLIS_DEFAULT_3M_NC_C 792
+
+#define BLIS_DEFAULT_3M_MC_Z 56
+#define BLIS_DEFAULT_3M_KC_Z 178
+#define BLIS_DEFAULT_3M_NC_Z 488
+*/
+// use this when EDMA is enabled
+/*
+#define BLIS_DEFAULT_MC_S 104
+#define BLIS_DEFAULT_KC_S 196
+#define BLIS_DEFAULT_NC_S 824
+
+#define BLIS_DEFAULT_MC_D 64
+#define BLIS_DEFAULT_KC_D 180
+#define BLIS_DEFAULT_NC_D 540
+
+#define BLIS_DEFAULT_MC_C 64
+#define BLIS_DEFAULT_KC_C 180
+#define BLIS_DEFAULT_NC_C 540
+
+#define BLIS_DEFAULT_MC_Z 32
+#define BLIS_DEFAULT_KC_Z 145
+#define BLIS_DEFAULT_NC_Z 306
+
+#define BLIS_DEFAULT_4M_MC_C 64
+#define BLIS_DEFAULT_4M_KC_C 180
+#define BLIS_DEFAULT_4M_NC_C 540
+
+#define BLIS_DEFAULT_4M_MC_Z 32
+#define BLIS_DEFAULT_4M_KC_Z 145
+#define BLIS_DEFAULT_4M_NC_Z 306
+
+#define BLIS_DEFAULT_3M_MC_C 64
+#define BLIS_DEFAULT_3M_KC_C 96
+#define BLIS_DEFAULT_3M_NC_C 488
+
+#define BLIS_DEFAULT_3M_MC_Z 36
+#define BLIS_DEFAULT_3M_KC_Z 108
+#define BLIS_DEFAULT_3M_NC_Z 196
+*/
+#endif
+
+// -- Register blocksizes --
+// same for different memory models (C66x architecture), need to redefine for C7x
#define BLIS_DEFAULT_MR_S 4
#define BLIS_DEFAULT_NR_S 8 //4 //
index 920d06c76e57e7c3e72a71f4db48def3cd31bb9a..f1cd21f864bdc024f7684fce7d1a98945ca3ee7a 100755 (executable)
ifndef MAKE_DEFS_MK_INCLUDED
MAKE_DEFS_MK_INCLUDED := yes
-
-TI_INSTALL_DIR?=/usr/src/dsp
-
-PATH:=$(TI_OCL_CGT_INSTALL)/bin:$(PATH)
-
-define FIND_DSP_PKG
- export $(1)?=$$(patsubst %/$(3),%,$$(lastword $$(sort $$(wildcard $$(TI_INSTALL_DIR)/$(2)/$(3)))))
- ifeq ($$($(1)),)
- $$(error ERROR - $(1) is not defined and could not be found in $(TI_INSTALL_DIR)/ )
- else
- ifeq ($$(wildcard $$($(1))/$(3)),)
- $$(error ERROR - "$(1) = $$($(1))" Is not valid!)
- endif
- endif
- $$(info Using $(1) = $$($(1)))
-endef
-
-UNAME_M :=$(shell uname -m)
-
-ifneq (,$(findstring 86, $(UNAME_M)))
-$(eval $(call FIND_DSP_PKG,C6636_PDK_DIR,pdk_keystone2*,packages))
-endif
-
-$(eval $(call FIND_DSP_PKG,FC_DIR,framework_components*,packages))
-$(eval $(call FIND_DSP_PKG,OMP_DIR,openmp_dsp*,packages))
-#$(eval $(call FIND_DSP_PKG,C6636_PDK_DIR,pdk_keystone2*,packages))
-
-
-
#
# --- Build definitions --------------------------------------------------------
#
# NOTE: This is needed to enable posix_memalign().
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
CMISCFLAGS := --c99
-#CMISCFLAGS += -I$(TI_OCL_CGT_INSTALL)/include
CMISCFLAGS += -I$(OMP_DIR)/packages/ti/runtime/openmp
CMISCFLAGS += -I$(FC_DIR)/packages
-#CMISCFLAGS += -I$(C6636_PDK_DIR)/packages
-CMISCFLAGS += -I$(LINUX_DEVKIT_ROOT)/usr/share/ti/cgt-c6x/include
-ifneq (,$(findstring 86, $(UNAME_M)))
-CMISCFLAGS += -I$(C6636_PDK_DIR)/packages
-$(info Using $(UNAME_M))
-else
-CMISCFLAGS += -I$(LINUX_DEVKIT_ROOT)/usr/include
-$(info Using $(UNAME_M))
-endif
-CMISCFLAGS += -mv6600 --use_g2 --omp #-std=c99 # -fopenmp -pg
-
-
-CDBGFLAGS := -s
+CMISCFLAGS += -I$(XDC_DIR)/packages
+CMISCFLAGS += -I$(BIOS_DIR)/packages
+CMISCFLAGS += -I$(XDAIS_DIR)/packages
+CMISCFLAGS += -I$(LIBARCH_DIR)/include
+CMISCFLAGS += -I$(CGTROOT)/include
+CMISCFLAGS += -I$(PDK_DIR)/packages
+
+CMISCFLAGS += -mv6600 --use_g2 --omp
+
+ifeq ($(LIBOS),LIB_OPENCL)
+CMISCFLAGS += -I$(TI_OCL_INSTALL_DIR)
+CMISCFLAGS += -DBLIS_ENABLE_C66X_OPENCL
+endif
+
+# Translate the user-facing MEM_MODEL knob (Large/Medium/Small) into the
+# preprocessor symbol used to select the cache block sizes.
+ifeq ($(MEM_MODEL),Large)
+BLIS_MEM_MODEL = MEM_MODEL_LARGE
+else ifeq ($(MEM_MODEL),Medium)
+BLIS_MEM_MODEL = MEM_MODEL_MEDIUM
+else ifeq ($(MEM_MODEL),Small)
+BLIS_MEM_MODEL = MEM_MODEL_SMALL
+endif
+
+# Emit one -D per value that is actually set. addprefix yields nothing for an
+# empty list, so a goal run without MEM_MODEL/TARGET/LIBOS no longer produces
+# a bare "-D" on the compiler command line.
+CMISCFLAGS += $(addprefix -D,$(BLIS_MEM_MODEL) $(TARGET) $(LIBOS))
+
+CDBGFLAGS := -s -k -mw
CWARNFLAGS :=
COPTFLAGS := -O2
CKOPTFLAGS := $(COPTFLAGS)
index 9771f1d617655459bfb446d0ffafb36ef5649313..19f88259463519e2d74ff7211a4daf0e709510b7 100644 (file)
--- a/blis/config/c66x/touch.h
+++ b/blis/config/c66x/touch.h
-#ifndef _TOUCH_H_
-#define _TOUCH_H_
-
-/* Copyright (C) 2012 Texas Instruments Incorporated - http://www.ti.com/
-*
-* Redistribution and use in source and binary forms, with or without
-* modification, are permitted provided that the following conditions
-* are met:
-*
-* Redistributions of source code must retain the above copyright
-* notice, this list of conditions and the following disclaimer.
-*
-* Redistributions in binary form must reproduce the above copyright
-* notice, this list of conditions and the following disclaimer in the
-* documentation and/or other materials provided with the
-* distribution.
-*
-* Neither the name of Texas Instruments Incorporated nor the names of
-* its contributors may be used to endorse or promote products derived
-* from this software without specific prior written permission.
-*
-* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*
-*/
-
-/**
- * @file touch.h
- * @brief Contains interface to cache optimization utilities
- *
- */
-
-/** @defgroup util util */
-
-/** @ingroup util */
-/* @{ */
-
-/**
- * @brief touches an array to bring it into cache
- *
- * @param[in] array Pointer to array to touch
- * @param[in] length Length array in bytes
- *
- */
-void touch (const void *array, int length);
-
-#endif
-
-/* @} */ /* ingroup */
-
-/* Nothing past this point */
+#ifndef _TOUCH_H_\r
+#define _TOUCH_H_\r
+\r
+/* Copyright (C) 2012 Texas Instruments Incorporated - http://www.ti.com/\r
+*\r
+* Redistribution and use in source and binary forms, with or without\r
+* modification, are permitted provided that the following conditions\r
+* are met:\r
+*\r
+* Redistributions of source code must retain the above copyright\r
+* notice, this list of conditions and the following disclaimer.\r
+*\r
+* Redistributions in binary form must reproduce the above copyright\r
+* notice, this list of conditions and the following disclaimer in the\r
+* documentation and/or other materials provided with the\r
+* distribution.\r
+*\r
+* Neither the name of Texas Instruments Incorporated nor the names of\r
+* its contributors may be used to endorse or promote products derived\r
+* from this software without specific prior written permission.\r
+*\r
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\r
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\r
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\r
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT\r
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\r
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\r
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\r
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\r
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\r
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\r
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
+*\r
+*/\r
+\r
+/**\r
+ * @file touch.h\r
+ * @brief Contains interface to cache optimization utilities\r
+ *\r
+ */\r
+\r
+/** @defgroup util util */\r
+\r
+/** @ingroup util */\r
+/* @{ */\r
+\r
+/** \r
+ * @brief touches an array to bring it into cache\r
+ * \r
+ * @param[in] array Pointer to array to touch\r
+ * @param[in] length Length of the array in bytes\r
+ *\r
+ */\r
+void touch (const void *array, int length); \r
+\r
+#endif\r
+\r
+/* @} */ /* ingroup */\r
+\r
+/* Nothing past this point */\r
index dbf9bb439a76d2925cb54b83847c602c18cdc2b7..76d521f40c3c955d8b3ebb7e1d829b79af5ddc81 100644 (file)
#define BLIS_SGEMM_UKERNEL bli_sgemm_opt_4x4
#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_4x4
-//#define BLIS_CGEMM_UKERNEL bli_cgemm_opt_4x4
-//#define BLIS_ZGEMM_UKERNEL bli_zgemm_opt_4x4
+#define BLIS_CGEMM_UKERNEL bli_cgemm_opt_4x4
+#define BLIS_ZGEMM_UKERNEL bli_zgemm_opt_4x4
// -- trsm-related --
index 9376360a73a8282442a8c3856cb5f512405ea3ce..51935210691c863d6ae239e832948cde33b899c9 100644 (file)
{
if( thread == NULL ) return;
//Assume that the ocomm and the icomm are freed by something else and don't need to be freed.
- bli_free(thread);
+ bli_free_scratch(thread);
}
packm_thrinfo_t* bli_create_packm_thread_info( thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id,
index 3939f9ea2f78e57c19b075f6f893fb7e73bac2e2..7d0d1cdae2d8af65b821e6bcf4afdb9da682bd54 100644 (file)
{
num_t dt = bli_obj_datatype( *c );
+#if defined (BLIS_ENABLE_C66X_BUILD)
+ lib_smem_sreset(blasGetMemHandle()); /* reset BLAS scratch heap */
+#endif
+
if ( bli_3mh_is_enabled_dt( dt ) ) bli_gemm3mh_entry( alpha, a, b, beta, c );
else if ( bli_3m_is_enabled_dt( dt ) ) bli_gemm3m_entry( alpha, a, b, beta, c );
else if ( bli_4mh_is_enabled_dt( dt ) ) bli_gemm4mh_entry( alpha, a, b, beta, c );
index fc6123e4897147a50f1af2ecddad95754ff06cea..f739d56fcdff38e6e247dd089f30539c4ef03c76 100644 (file)
// Query dimension in partitioning direction.
m_trans = bli_obj_length_after_trans( *a );
dim_t start, end;
- bli_get_range( thread, 0, m_trans,
- bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
- &start, &end );
+// bli_get_range( thread, 0, m_trans,
+// bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
+// &start, &end );
+
+ bli_get_range_t2b( thread, 0, m_trans,
+ bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
+ &start, &end );
// Partition along the m dimension.
#ifdef BLIS_ENABLE_C66X_EDMA
bli_obj_release_pack( b_pack , cntl_sub_packm_b( cntl ));
#ifdef BLIS_ENABLE_C66X_EDMA
bli_obj_release_dma( b_dma , cntl_sub_dmam_b( cntl ) );
- bli_obj_release_edma_handle( b_dma );
+ bli_obj_release_emt_handle( b_dma );
#endif
}
if( thread_am_ichief( thread ) ){
#ifdef BLIS_ENABLE_C66X_EDMA
bli_obj_release_dma( a1_dma , cntl_sub_dmam_a( cntl ));
bli_obj_release_dma( c1_dma , cntl_sub_dmam_c( cntl ) );
- bli_obj_release_edma_handle( a1_dma );
- bli_obj_release_edma_handle( c1_dma );
+ bli_obj_release_emt_handle( a1_dma );
+ bli_obj_release_emt_handle( c1_dma );
#endif
}
}
index 1738b125707fd70850bdb5cf89516206885988bc..8da0f33c5f39ea547c61216f673e20127564270f 100644 (file)
// Query dimension in partitioning direction.
n_trans = bli_obj_width_after_trans( *b );
dim_t start, end;
- bli_get_range( thread, 0, n_trans,
- bli_determine_reg_blocksize( b, cntl_blocksize( cntl ) ),
- &start, &end );
+// bli_get_range( thread, 0, n_trans,
+// bli_determine_reg_blocksize( b, cntl_blocksize( cntl ) ),
+// &start, &end );
+
+ bli_get_range_l2r( thread, 0, n_trans,
+ bli_determine_reg_blocksize( b, cntl_blocksize( cntl ) ),
+ &start, &end );
// Partition along the n dimension.
for ( i = start; i < end; i += b_alg )
index fb40fdd09368f7d1e1a26ac63df2a9a889adbbcd..bed1e05240f5e8d8165ac7bec367c55a3b8a977f 100644 (file)
#ifdef BLIS_ENABLE_C66X_EDMA
bli_obj_release_dma( a1_dma , cntl_sub_dmam_a( cntl ));
bli_obj_release_dma( b1_dma , cntl_sub_dmam_b( cntl ) );
- bli_obj_release_edma_handle( a1_dma );
- bli_obj_release_edma_handle( b1_dma );
+ bli_obj_release_emt_handle( a1_dma );
+ bli_obj_release_emt_handle( b1_dma );
#endif
bli_obj_release_pack( a1_pack, cntl_sub_packm_a( cntl ) );
// printf("blk var 3 b1_pack before release %x\n", bli_mem_buffer(&(b1_pack->pack_mem)));
index 05fb033532e2da57908e93401b147cf601681ee5..acd632ee57254c7a87ce9bae3c94d5892a9afe25 100644 (file)
BLIS_UPANEL_B_ALIGN_SIZE_Z, 0 );
- // Attach the register blksz_t objects as sub-blocksizes to the cache
+ // Attach the register blksz_t objects as blocksize multiples to the cache
// blksz_t objects.
bli_blksz_obj_attach_to( gemm_mr, gemm_mc );
bli_blksz_obj_attach_to( gemm_nr, gemm_nc );
bli_blksz_obj_attach_to( gemm_kr, gemm_kc );
+ //bli_blksz_obj_attach_mult_to( gemm_mr, gemm_mc );
+ //bli_blksz_obj_attach_mult_to( gemm_nr, gemm_nc );
+ //bli_blksz_obj_attach_mult_to( gemm_kr, gemm_kc );
+
+
+ // Attach the mr and nr blksz_t objects to each cache blksz_t object.
+ // The primary example of why this is needed relates to nudging kc.
+ // In hemm, symm, trmm, or trmm3, we need to know both mr and nr,
+ // since the multiple we target in nudging depends on whether the
+ // structured matrix is on the left or the right.
+ //bli_blksz_obj_attach_mr_nr_to( gemm_mr, gemm_nr, gemm_mc );
+ //bli_blksz_obj_attach_mr_nr_to( gemm_mr, gemm_nr, gemm_nc );
+ //bli_blksz_obj_attach_mr_nr_to( gemm_mr, gemm_nr, gemm_kc );
// Create function pointer object for each datatype-specific gemm
// micro-kernel.
gemm_packa_cntl,
NULL, //gemm_packb_cntl, //
NULL,
- gemm_dmaa_cntl, //NULL, //
+ BLIS_GEMM_DMAA_CNTL, //gemm_dmaa_cntl, //
NULL, //gemm_dmab_cntl, //
NULL,
gemm_cntl_bp_ke,
gemm_packb_cntl, //NULL, //
NULL,
NULL,
- gemm_dmab_cntl, //NULL, //
+ BLIS_GEMM_DMAB_CNTL, // gemm_dmab_cntl, //NULL, //
NULL,
gemm_cntl_op_bp,
NULL );
index 060c5bddf76d209c11c59c3b6b4543de15be2f0c..712dec8a9d91eee028767bb56379d8e7223702f3 100644 (file)
#include "blis.h"
+#define CLOCK 1.2 // In GHz
+#define OPS_PER_CYCLE_S 16
+#define OPS_PER_CYCLE_D 4
+#define NUM_THREADS 8
+
+#ifdef BLIS_ENABLE_PROFILE
+ profile_data_t *bli_gemm_profile_data;
+#endif
void bli_gemm_front( obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t b_local;
obj_t c_local;
+#ifdef BLIS_ENABLE_PROFILE
+ dim_t m_profile, n_profile, k_profile;
+#endif
+
+
+
// Check parameters.
if ( bli_error_checking_is_enabled() )
bli_gemm_check( alpha, a, b, beta, c );
gemm_thrinfo_t** infos = bli_create_gemm_thrinfo_paths();
dim_t n_threads = thread_num_threads( infos[0] );
+#ifdef BLIS_ENABLE_PROFILE
+ m_profile = bli_obj_length( *c );
+ n_profile = bli_obj_width( *c );
+ k_profile = bli_obj_width_after_trans( *a );
+ bli_gemm_profile_data = bli_profile_data_init(BLIS_MAX_NUM_THREADS*BLIS_PROFILE_NUM_REPORTS); //One struct for each thread for each variant
+#endif
+
// Invoke the internal back-end.
bli_level3_thread_decorator( n_threads,
(level3_int_t) bli_gemm_int,
bli_gemm_thrinfo_free_paths( infos, n_threads );
+#ifdef BLIS_ENABLE_PROFILE
+ bli_profile_data_print (bli_gemm_profile_data,
+ m_profile, n_profile, k_profile,
+ bli_obj_execution_datatype( *c ), 2, n_threads);
+
+ bli_profile_data_free(bli_gemm_profile_data);
+
+#endif
+
}
index 000a7a4e251401116f680acb4eece5ac85dd1a66..af1ba8d8600fe8c28ff743b9ea117221b8762e54 100644 (file)
#include "blis.h"
-#ifdef BLIS_ENABLE_C66X_BUILD
-//#define BLIS_ENABLE_CYCLE_COUNT_BLIS_GEMM_INT
-#endif
-
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)( obj_t* a,
varnum_t n;
impl_t i;
FUNCPTR_T f;
-#ifdef BLIS_ENABLE_CYCLE_COUNT_BLIS_GEMM_INT
- volatile int counter_start;
- volatile int counter_end;
+#if defined(BLIS_ENABLE_PROFILE)
+ volatile uint64_t counter_start;
+ volatile uint64_t counter_end;
+ extern profile_data_t *bli_gemm_profile_data;
+ dim_t m_var, k_var, n_var;
+ dim_t index;
#endif
// Extract the variant number and implementation type.
// Check parameters.
if ( bli_error_checking_is_enabled() )
- bli_gemm_int_check( alpha, a, b, beta, c, cntl );
+ bli_gemm_int_check( alpha, a, b, beta, c, cntl ); // creating the errors. print sizes of a,b,c
// If C has a zero dimension, return early.
if ( bli_obj_has_zero_dim( *c ) ) return;
i = cntl_impl_type( cntl );
// Index into the variant array to extract the correct function pointer.
- f = vars[n][i];
- //printf("gemm_int %d %d\n", n, i);
+ f = vars[n][i]; // print out n and i
// Invoke the variant.
-#ifdef BLIS_ENABLE_CYCLE_COUNT_BLIS_GEMM_INT
- TSCL = 0;
- counter_start = TSCL;
+#if defined(BLIS_ENABLE_PROFILE)
+ m_var = bli_obj_length( c_local );
+ k_var = bli_obj_width_after_trans( a_local );
+ n_var = bli_obj_width( c_local );
+
+#if defined(BLIS_ENABLE_C66X_BUILD)
+ TSCL = 0;
+ counter_start = lib_clock64();
+#else
+ counter_start = (uint64_t) (bli_clock()*1.2e9);
+#endif
#endif
f( &a_local,
&b_local,
&c_local,
cntl,
- thread );
-#ifdef BLIS_ENABLE_CYCLE_COUNT_BLIS_GEMM_INT
- counter_end = TSCL;
- if(CSL_chipReadDNUM()==0) printf("xxxxx bli_gemm_int %d %d %d\n", n, i, counter_end-counter_start);
+ thread );
+
+#if defined(BLIS_ENABLE_PROFILE)
+#if defined(BLIS_ENABLE_C66X_BUILD)
+ counter_end = lib_clock64();
+#else
+ counter_end = (uint64_t) (bli_clock()*1.2e9);
+#endif
+ bli_profile_get_index(n, i, index);
+ /* Widen to 64 bits before multiplying: if dim_t is 32 bits wide, m*k*n overflows for large problems. */
+ bli_profile_data_update(bli_gemm_profile_data[index], (counter_end-counter_start), (uint64_t) m_var*k_var*n_var);
#endif
}
index 55280e6b18b384b61ae43ab76286d36704d9072a..43d2d05be5e6da4ddf99764bd45f77e4ac15b046 100644 (file)
#define FUNCPTR_T gemm_fp
-/* move this memory allocation to memory pool on L2 */
-/* buffer size needed is max(NR*sizeof(ctype)*MC*3) */
-/* c data movement currently works for rs_c=1 only; */
-/* need to carry this for cs_c=1 as well ensuring it works */
-/* for non unity rs_c and cs_c */
-/*#define CNEWBUFSIZE (104*8*4)
-
-#pragma DATA_SECTION(cNew,".mem_l2")
-#pragma DATA_ALIGN(cNew,8)
-char cNew[3*CNEWBUFSIZE];
-*/
+#ifdef BLIS_ENABLE_C66X_MEM_POOLS
+#define BLIS_ENABLE_C66X_C_L2
+#endif
+
typedef void (*FUNCPTR_T)(
pack_t schema_a,
pack_t schema_b,
}
#ifdef BLIS_ENABLE_C66X_MEM_POOLS
-
-#if defined (BLIS_ENABLE_C66X_EDMA) && defined (BLIS_ENABLE_C66X_IDMA)
+#ifdef BLIS_ENABLE_C66X_C_L2
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname, ukrtype ) \
ctype *cNew0, *cNew1, *cNew2, *cNewTemp; \
/*EDMA Declarations */ \
\
- EdmaMgr_Handle edma_handle_b = NULL; \
- EdmaMgr_Handle edma_handle_c0 = NULL; \
- EdmaMgr_Handle edma_handle_c1 = NULL; \
+ lib_emt_Handle emt_handle_b = NULL; \
+ lib_emt_Handle emt_handle_c0 = NULL; \
+ lib_emt_Handle emt_handle_c1 = NULL; \
\
/*For DSP timing*/ \
- /*volatile int counter_start; \
- volatile int counter_end;*/ \
+ volatile uint64_t counter_start_ker, counter_start_nr, counter_start_mr; \
+ volatile uint64_t counter_end_ker, counter_end_nr, counter_end_mr; \
+ extern profile_data_t *bli_gemm_profile_data; \
/*
Assumptions/assertions:
rs_a == 1
bli_mem_acquire_m( k*MR*sizeof(ctype), BLIS_BUFFER_FOR_A_BLOCK_L1, &a2_L1_mem); \
a2_L1 = bli_mem_buffer( &a2_L1_mem ); \
\
- /*Acquiring buffers for C (MC_x_NR) in L2 */\
+ /*Acquiring buffers for C (MC_x_NR) in L2 */\
bli_mem_acquire_m( cs_c11*NR*sizeof(ctype), BLIS_BUFFER_FOR_C_PANEL_L2, &c0_L2_mem); \
cNew0 = bli_mem_buffer( &c0_L2_mem ); \
\
cNew2 = bli_mem_buffer( &c2_L2_mem ); \
\
/*Acquiring an EDMA handle from the pool*/ \
- bli_dma_channel_acquire(&(edma_handle_b), CSL_chipReadDNUM()); \
- if(edma_handle_b == NULL) \
+ bli_dma_channel_acquire(&(emt_handle_b), lib_get_coreID()); \
+ if(emt_handle_b == NULL) \
{ \
- printf("ker_var2 Failed to alloc edma handle B CoreID %d \n", CSL_chipReadDNUM()); \
+ printf("ker_var2 Failed to alloc edma handle B CoreID %d \n", lib_get_coreID()); \
} \
/*Acquiring an EDMA handle from the pool*/ \
- bli_dma_channel_acquire(&(edma_handle_c0), CSL_chipReadDNUM()); \
- if(edma_handle_c0 == NULL) \
+ bli_dma_channel_acquire(&(emt_handle_c0), lib_get_coreID()); \
+ if(emt_handle_c0 == NULL) \
{ \
- printf("ker_var2 Failed to alloc edma handle for C0 CoreID %d \n", CSL_chipReadDNUM()); \
+ printf("ker_var2 Failed to alloc edma handle for C0 CoreID %d \n", lib_get_coreID()); \
} \
/*Acquiring an EDMA handle from the pool*/ \
- bli_dma_channel_acquire(&(edma_handle_c1), CSL_chipReadDNUM()); \
- if(edma_handle_c1 == NULL) \
+ bli_dma_channel_acquire(&(emt_handle_c1), lib_get_coreID()); \
+ if(emt_handle_c1 == NULL) \
{ \
- printf("ker_var2 Failed to alloc edma handle for C1 CoreID %d \n", CSL_chipReadDNUM()); \
+ printf("ker_var2 Failed to alloc edma handle for C1 CoreID %d \n", lib_get_coreID()); \
} \
\
/*For DSP Timing*/ \
- /*TSCL=0; \
- counter_start = TSCL;*/ \
/* initiate first c transfer */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_nr = lib_clock64(); \
+ } \
n_cur = ( bli_is_not_edge_f( jr_thread_id, n_iter, n_left ) ? NR : n_left ); \
if(cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
- EdmaMgr_copy2D2DSep(edma_handle_c0, c_cast+jr_thread_id*cstep_c, \
+ { \
+ lib_emt_copy2D2D(emt_handle_c0, c_cast+jr_thread_id*cstep_c, \
cNew1, m*sizeof(ctype), /*(m_iter-ir_thread_id)*sizeof(ctype)*MR,*/ \
n_cur, cs_c*sizeof(ctype), cs_c11*sizeof(ctype)); \
- else \
+ } \
+ else \
{ \
- dim_t ii; \
- ctype *ptr_source; \
- ctype *ptr_dest; \
- ptr_source = c_cast+jr_thread_id*cstep_c; \
- ptr_dest = cNew1; \
- for(ii = 0; ii < n_cur; ii++) \
- { \
- memcpy(ptr_dest, ptr_source, m*sizeof(ctype)); \
- ptr_source += cs_c; \
- ptr_dest += cs_c11; \
- } \
+ dim_t ii; \
+ ctype *ptr_source; \
+ ctype *ptr_dest; \
+ ptr_source = c_cast+jr_thread_id*cstep_c; \
+ ptr_dest = cNew1; \
+ for(ii = 0; ii < n_cur; ii++) \
+ { \
+ memcpy(ptr_dest, ptr_source, m*sizeof(ctype)); \
+ ptr_source += cs_c; \
+ ptr_dest += cs_c11; \
+ } \
} \
+ lib_emt_wait(emt_handle_c0); \
\
for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \
{ \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
- n_next = ( bli_is_not_edge_f( j+jr_num_threads, n_iter, n_left ) ? NR : n_left ); \
\
if (j == jr_thread_id) \
{ \
 /*Always use MR and NR while transferring a packed panel*/ \
- EdmaMgr_copy1D1D(edma_handle_b, b1, b1_L1, k*NR*sizeof(ctype)); \
- idma1_setup(a2_L1, a1 = a_cast + ir_thread_id * rstep_a, k*MR*sizeof(ctype), 0, 0, 7);\
+ lib_emt_copy1D1D(emt_handle_b, b1, b1_L1, k*NR*sizeof(ctype)); \
+ lib_imt_copy(a1 = a_cast + ir_thread_id * rstep_a, a2_L1, k*MR*sizeof(ctype));\
} \
- /* wait for previous c transfer to complete and initiate next transfer */ \
- EdmaMgr_wait(edma_handle_c0); \
+\
+ n_next = ( bli_is_not_edge_f( j+jr_num_threads, n_iter, n_left ) ? NR : n_left ); \
+ /* wait for previous c transfer to complete and initiate next transfer */ \
+ lib_emt_wait(emt_handle_c0); \
+\
if(j < (n_iter-jr_num_threads)) /* no transfer for last iteration */ \
- {\
- if (cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
- EdmaMgr_copy2D2DSep(edma_handle_c0, c1+jr_num_threads*cstep_c, \
- cNew0, m*sizeof(ctype), /*(m_iter-ir_thread_id)*sizeof(ctype)*MR,*/ \
- n_next, cs_c*sizeof(ctype), \
- cs_c11*sizeof(ctype)); \
- else \
- { \
- dim_t ii; \
- ctype *ptr_source; \
- ctype *ptr_dest; \
- ptr_source = c1+jr_num_threads*cstep_c; \
- ptr_dest = cNew0; \
- for(ii = 0; ii < n_next; ii++) \
- { \
- memcpy(ptr_dest, ptr_source, m*sizeof(ctype)); \
- ptr_source += cs_c; \
- ptr_dest += cs_c11; \
- } \
- } \
- }\
+ { \
+ if (cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
+ {\
+ lib_emt_copy2D2D(emt_handle_c0, c1+jr_num_threads*cstep_c, \
+ cNew0, m*sizeof(ctype), /*(m_iter-ir_thread_id)*sizeof(ctype)*MR,*/ \
+ n_next, cs_c*sizeof(ctype), \
+ cs_c11*sizeof(ctype)); \
+ /*Testing if performance improves with fast API*/ \
+ /*lib_emt_copyFast(emt_handle_c0, c1+jr_num_threads*cstep_c, \
+ cNew0);*/ \
+ }\
+ else \
+ { \
+ dim_t ii; \
+ ctype *ptr_source; \
+ ctype *ptr_dest; \
+ ptr_source = c1+jr_num_threads*cstep_c; \
+ ptr_dest = cNew0; \
+ for(ii = 0; ii < n_next; ii++) \
+ { \
+ memcpy(ptr_dest, ptr_source, m*sizeof(ctype)); \
+ ptr_source += cs_c; \
+ ptr_dest += cs_c11; \
+ } \
+ } \
+ } \
/* Loop over the m dimension (MR rows at a time). */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_mr = lib_clock64(); \
+ } \
for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \
{ \
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
- /*c11 = c1 + i * rstep_c;*/ \
- c11 = cNew1 + i * rstep_c11; \
+ c11 = cNew1 + i * rstep_c11; \
+ /*c11 = c1 + i * rstep_c; */\
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
a2_L1 = temp; \
\
/*Wait for the panel to finish transferring*/ \
- while(!idma1_done()){;} \
+ lib_imt_wait(); \
if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \
{ \
- idma1_setup(a2_L1, a1 = a_cast + ir_thread_id * rstep_a, k*MR*sizeof(ctype), 0, 0, 7); \
+ lib_imt_copy(a1 = a_cast + ir_thread_id * rstep_a, a2_L1, k*MR*sizeof(ctype)); \
a2 = a_cast; \
b2 = gemm_get_next_b_micropanel( thread, b1, cstep_b ); \
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
} \
else \
{ \
- idma1_setup(a2_L1, a2, k*MR*sizeof(ctype), 0, 0, 7); \
+ lib_imt_copy(a2, a2_L1, k*MR*sizeof(ctype)); \
} \
\
if(i == ir_thread_id) \
{ \
- EdmaMgr_wait(edma_handle_b); \
+ lib_emt_wait(emt_handle_b); \
} \
/* Handle interior and edge cases separately. */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_ker = lib_clock64(); \
+ } \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr_cast( k, \
- alpha_cast, \
- a1_L1, /*a1_L1,*/ \
- b1_L1, /*b1_L1,*/ \
- beta_cast, \
- c11, rs_c11, cs_c11, /*rs_c, cs_c,*/ \
- &aux ); \
+ alpha_cast, \
+ a1_L1, /*a1_L1,*/ \
+ b1_L1, /*b1_L1,*/ \
+ beta_cast, \
+ c11, rs_c11, cs_c11, /*rs_c, cs_c,*/ \
+ &aux ); \
} \
else \
{ \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c11, cs_c11 /*rs_c, cs_c */); \
- } \
+ } \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_ker = lib_clock64(); \
+ bli_profile_data_update(bli_gemm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_KER_LOOP_IND],\
+ (counter_end_ker-counter_start_ker), 2*m_cur*k*n_cur); \
+ } \
if(!bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) && bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads )) \
{ \
/* Start the EDMA of the next panel (K*NR) of B*/ \
- EdmaMgr_copy1D1D(edma_handle_b, b2, b1_L1, k*NR*sizeof(ctype)); \
+ lib_emt_copy1D1D(emt_handle_b, b2, b1_L1, k*NR*sizeof(ctype)); \
+ /*Testing if performance improves with fast API*/ \
+ /*lib_emt_copyFast(emt_handle_b, b2, b1_L1);*/ \
} \
} \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_mr = lib_clock64(); \
+ bli_profile_data_update(bli_gemm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_IR_LOOP_IND],\
+ (counter_end_mr-counter_start_mr), (uint64_t) 2*m*k*n_cur); \
+ } \
/* circularly shift buffers */ \
cNewTemp = cNew0; \
cNew0 = cNew2; \
cNew1 = cNewTemp; \
if(j != jr_thread_id) /* wait for save c to complete; skip first iteration */ \
{ \
- EdmaMgr_wait(edma_handle_c1); \
+ lib_emt_wait(emt_handle_c1); \
} \
/* save updated c */ \
if (cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
- EdmaMgr_copy2D2DSep(edma_handle_c1, cNew2, c1, m*sizeof(ctype), \
+ lib_emt_copy2D2D(emt_handle_c1, cNew2, c1, m*sizeof(ctype), \
n_cur, cs_c11*sizeof(ctype), cs_c*sizeof(ctype)); \
else \
{\
} \
\
} \
- /*For DSP timing*/ \
- /*counter_end=TSCL; \
- if (CSL_chipReadDNUM () == 0) */\
- /*printf("%d %d %d\t%d\n",n_iter, m_iter, k, counter_end-counter_start); */\
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_nr = lib_clock64(); \
+ bli_profile_data_update(bli_gemm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_JR_LOOP_IND], \
+ (counter_end_nr-counter_start_nr), (uint64_t) 2*m*k*n); /* 64-bit product, as in the IR-loop update */ \
+ } \
/* Loop over the n dimension (NR columns at a time). */ \
bli_mem_release( &c2_L2_mem ); \
bli_mem_release( &c1_L2_mem ); \
bli_mem_release( &a2_L1_mem ); \
bli_mem_release( &a1_L1_mem ); \
bli_mem_release( &b1_L1_mem ); \
- if ( edma_handle_b != NULL ) \
+ if ( emt_handle_b != NULL ) \
{ \
- bli_dma_channel_release(edma_handle_b, CSL_chipReadDNUM()); \
- edma_handle_b = NULL; \
+ bli_dma_channel_release(emt_handle_b, lib_get_coreID()); \
+ emt_handle_b = NULL; \
} \
- if ( edma_handle_c0 != NULL ) \
+ if ( emt_handle_c0 != NULL ) \
{ \
- bli_dma_channel_release(edma_handle_c0, CSL_chipReadDNUM()); \
- edma_handle_c0 = NULL; \
+ bli_dma_channel_release(emt_handle_c0, lib_get_coreID()); \
+ emt_handle_c0 = NULL; \
} \
- if ( edma_handle_c1 != NULL ) \
+ if ( emt_handle_c1 != NULL ) \
{ \
- EdmaMgr_wait(edma_handle_c1); /* wait for save c to complete */ \
- bli_dma_channel_release(edma_handle_c1, CSL_chipReadDNUM()); \
- edma_handle_c1 = NULL; \
+ lib_emt_wait(emt_handle_c1); /* wait for save c to complete */ \
+ bli_dma_channel_release(emt_handle_c1, lib_get_coreID()); \
+ emt_handle_c1 = NULL; \
} \
/*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" );*/ \
INSERT_GENTFUNC_BASIC( gemm_ker_var2, gemm_ukr_t )
-#else //If EDMA IDMA is not enabled use memcpy
+#else //No Data movement for C
+
+
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname, ukrtype ) \
\
void PASTEMAC(ch,varname)( \
- pack_t schema_a, \
+ pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
) \
{ \
/* Cast the micro-kernel address to its function pointer type. */ \
- PASTECH(ch,ukrtype) gemm_ukr_cast = gemm_ukr; \
+ PASTECH(ch,ukrtype) gemm_ukr_cast = (PASTECH(ch,ukrtype) ) gemm_ukr; \
\
/* Temporary C buffer for edge cases. */ \
ctype ct[ PASTEMAC(ch,maxmr) * \
dim_t i, j; \
dim_t m_cur; \
dim_t n_cur; \
+ dim_t n_next; \
inc_t rstep_a; \
inc_t cstep_b; \
- inc_t rstep_c, cstep_c; \
+ inc_t rstep_c; \
+ inc_t cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
auxinfo_t aux; \
+\
+ inc_t rstep_c11, rs_c11, cs_c11; \
\
gemm_thrinfo_t* caucus = gemm_thread_sub_gemm( thread ); \
dim_t jr_num_threads = thread_n_way( thread ); \
\
mem_t a1_L1_mem, a2_L1_mem; \
ctype *a1_L1, *a2_L1, *temp; \
+\
+ mem_t c0_L2_mem, c1_L2_mem, c2_L2_mem; \
+ ctype *cNew0, *cNew1, *cNew2, *cNewTemp; \
+ /*EDMA Declarations */ \
+\
+ lib_emt_Handle emt_handle_b = NULL; \
+\
+ /*For DSP timing*/ \
+ volatile uint64_t counter_start_ker, counter_start_nr, counter_start_mr; \
+ volatile uint64_t counter_end_ker, counter_end_nr, counter_end_mr; \
+ extern profile_data_t *bli_gemm_profile_data; \
/*
Assumptions/assertions:
rs_a == 1
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
-\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
+\
+ rstep_c11 = MR; /*stride to get to next panel of MRxNR in a panel of MCxNR*/\
+ rs_c11 = 1; \
+ cs_c11 = (m%2 == 0) ? m : m+1 ; \
\
istep_a = PACKMR * k; \
istep_b = PACKNR * k; \
+\
+ /*printf("n %d m %d n_left %d m_left = %d rstep_c = %d, cstep_c = %d rs_c = %d, cs_c = %d, cs_c11 = %d, NR = %d\n", n, m, n_left, m_left, rstep_c, cstep_c, rs_c, cs_c, (m%2 == 0) ? m : m+1, NR ); */\
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, aux ); \
/*Acquiring a buffer for B in L1*/ \
bli_mem_acquire_m( k*NR*sizeof(ctype), BLIS_BUFFER_FOR_B_PANEL_L1, &b1_L1_mem); \
b1_L1 = bli_mem_buffer( &b1_L1_mem ); \
- b1_L1 = (ctype *) (b1_L1_mem.buf + PASTEMAC(ch,bank)); \
+ /* Type casting pointer to char to avoid warning "arithmetic on pointer to void or function type"*/ \
+ b1_L1 = (ctype *) ((char *) b1_L1_mem.buf + PASTEMAC(ch,bank)); \
\
/*Acquiring a buffer for A in L1*/ \
- /*printf("Acquire A k %d, MR %d, size of ctype %d, A size requested %d\n", k, MR, sizeof(ctype), k*MR*sizeof(ctype));*/ \
bli_mem_acquire_m( k*MR*sizeof(ctype), BLIS_BUFFER_FOR_A_BLOCK_L1, &a1_L1_mem); \
a1_L1 = bli_mem_buffer( &a1_L1_mem ); \
- a1_L1 = a1_L1; \
\
bli_mem_acquire_m( k*MR*sizeof(ctype), BLIS_BUFFER_FOR_A_BLOCK_L1, &a2_L1_mem); \
a2_L1 = bli_mem_buffer( &a2_L1_mem ); \
\
- /* Loop over the n dimension (NR columns at a time). */ \
+ /*Acquiring buffers for C (MC_x_NR) in L2 */\
+ bli_mem_acquire_m( cs_c11*NR*sizeof(ctype), BLIS_BUFFER_FOR_C_PANEL_L2, &c0_L2_mem); \
+ cNew0 = bli_mem_buffer( &c0_L2_mem ); \
+\
+ bli_mem_acquire_m( cs_c11*NR*sizeof(ctype), BLIS_BUFFER_FOR_C_PANEL_L2, &c1_L2_mem); \
+ cNew1 = bli_mem_buffer( &c1_L2_mem ); \
+\
+ bli_mem_acquire_m( cs_c11*NR*sizeof(ctype), BLIS_BUFFER_FOR_C_PANEL_L2, &c2_L2_mem); \
+ cNew2 = bli_mem_buffer( &c2_L2_mem ); \
+\
+ /*Acquiring an EDMA handle from the pool*/ \
+ bli_dma_channel_acquire(&(emt_handle_b), lib_get_coreID()); \
+ if(emt_handle_b == NULL) \
+ { \
+ printf("ker_var2 Failed to alloc edma handle B CoreID %d \n", lib_get_coreID()); \
+ } \
+\
+ /*For DSP Timing*/ \
+ /* start the n-loop (JR) timer; this variant performs no C transfer */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_nr = lib_clock64(); \
+ } \
+\
for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
- \
+\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
- n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
+ n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
- /* Initialize our next panel of B to be the current panel of B. */ \
- b2 = b1; \
- memcpy(b1_L1, b1, k*NR*sizeof(ctype)); \
+ if (j == jr_thread_id) \
+ { \
+ /*Always use MR and NR while transferring a packed panel*/ \
+ lib_emt_copy1D1D(emt_handle_b, b1, b1_L1, k*NR*sizeof(ctype)); \
+ lib_imt_copy(a1 = a_cast + ir_thread_id * rstep_a, a2_L1, k*MR*sizeof(ctype));\
+ } \
\
- /* Loop over the m dimension (MR rows at a time). */ \
- for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \
+ /* Loop over the m dimension (MR rows at a time). */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_mr = lib_clock64(); \
+ } \
+ for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \
{ \
ctype* restrict a2; \
\
- c11 = c1 + i * rstep_c; \
+ a1 = a_cast + i * rstep_a; \
+ c11 = c1 + i * rstep_c; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
-\
- if(i == ir_thread_id) \
- { \
- a1 = a_cast + i * rstep_a; \
- memcpy(a2_L1, a1, k*MR*sizeof(ctype)); \
- } \
\
/* Compute the addresses of the next panels of A and B. */ \
- a2 = gemm_get_next_a_micropanel( caucus, a1, rstep_a ); \
+ a2 = gemm_get_next_a_micropanel( caucus, a1, rstep_a ); \
temp = a1_L1; \
a1_L1 = a2_L1; \
a2_L1 = temp; \
- a1 = a2; /*Make the next panel the current panel for the next iteration*/ \
- /*Start next panel*/ \
- memcpy(a2_L1, a2, k*MR*sizeof(ctype)); \
\
- /* Save addresses of next panels of A and B to the auxinfo_t
- object. */ \
- /*if ( bli_is_last_iter( i, m_iter ) ) \
+ /*Wait for the panel to finish transferring*/ \
+ lib_imt_wait(); \
+ if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \
{ \
+ lib_imt_copy(a1 = a_cast + ir_thread_id * rstep_a, a2_L1, k*MR*sizeof(ctype)); \
a2 = a_cast; \
b2 = gemm_get_next_b_micropanel( thread, b1, cstep_b ); \
- if ( bli_is_last_iter( j, n_iter ) ) \
- b2 = b_cast; \
+ if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
+ b2 = b_cast; \
+ } \
+ else \
+ { \
+ lib_imt_copy(a2, a2_L1, k*MR*sizeof(ctype)); \
} \
- bli_auxinfo_set_next_a( a2, aux ); \
- bli_auxinfo_set_next_b( b2, aux );*/ \
\
- /* Handle interior and edge cases separately. */ \
- if ( m_cur == MR && n_cur == NR ) \
+ if(i == ir_thread_id) \
{ \
- /* Invoke the gemm micro-kernel. */ \
- gemm_ukr_cast( k, \
- alpha_cast, \
- a1_L1, \
- b1_L1, \
- beta_cast, \
- c11, rs_c, cs_c, \
- &aux ); \
+ lib_emt_wait(emt_handle_b); \
+ } \
+ /* Handle interior and edge cases separately. */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_ker = lib_clock64(); \
+ } \
+ if ( m_cur == MR && n_cur == NR ) \
+ { \
+ /* Invoke the gemm micro-kernel. */ \
+ gemm_ukr_cast( k, \
+ alpha_cast, \
+ a1_L1, /*a1_L1,*/ \
+ b1_L1, /*b1_L1,*/ \
+ beta_cast, \
+ c11, rs_c, cs_c, \
+ &aux ); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr_cast( k, \
alpha_cast, \
- a1_L1, \
- b1_L1, \
+ a1_L1, /*a1_L1,*/ \
+ b1_L1, /*b1_L1,*/ \
zero, \
- ct, rs_ct, cs_ct, \
+ ct, rs_ct, cs_ct, \
&aux ); \
\
/* Scale the bottom edge of C and add the result from above. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
- ct, rs_ct, cs_ct, \
- beta_cast, \
- c11, rs_c, cs_c ); \
+ ct, rs_ct, cs_ct, \
+ beta_cast, \
+ c11, rs_c, cs_c); \
+ } \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_ker = lib_clock64(); \
+ bli_profile_data_update(bli_gemm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_KER_LOOP_IND], \
+ (counter_end_ker-counter_start_ker), 2*m_cur*k*n_cur); \
+ } \
+ if(!bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) && bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads )) \
+ { \
+ /* Start the EDMA of the next panel (K*NR) of B*/ \
+ lib_emt_copy1D1D(emt_handle_b, b2, b1_L1, k*NR*sizeof(ctype)); \
} \
} \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_mr = lib_clock64(); \
+ bli_profile_data_update(bli_gemm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_IR_LOOP_IND], \
+ (counter_end_mr-counter_start_mr), 2*m*k*n_cur); \
+ } \
} \
- \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_nr = lib_clock64(); \
+ bli_profile_data_update(bli_gemm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_JR_LOOP_IND], \
+ (counter_end_nr-counter_start_nr), 2*m*k*n); \
+ } \
+ /* Release the L2 buffers for C, the L1 buffers for A and B, and the DMA handle. */ \
+ bli_mem_release( &c2_L2_mem ); \
+ bli_mem_release( &c1_L2_mem ); \
+ bli_mem_release( &c0_L2_mem ); \
bli_mem_release( &a2_L1_mem ); \
bli_mem_release( &a1_L1_mem ); \
bli_mem_release( &b1_L1_mem ); \
-/*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
-PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" );*/ \
+ if ( emt_handle_b != NULL ) \
+ { \
+ bli_dma_channel_release(emt_handle_b, lib_get_coreID()); \
+ emt_handle_b = NULL; \
+ } \
+ /*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
+ PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" );*/ \
}
INSERT_GENTFUNC_BASIC( gemm_ker_var2, gemm_ukr_t )
+
+
#endif
#else
index 4bd441d20db906c287f6ee10168be5ac7be83890..e88970edcc21c30264fc98bfd34d338702464ed9 100644 (file)
packm_thrinfo_t* ipackm,
gemm_thrinfo_t* sub_gemm )
{
- gemm_thrinfo_t* thread = ( gemm_thrinfo_t* ) bli_malloc( sizeof( gemm_thrinfo_t ) );
+ gemm_thrinfo_t* thread = ( gemm_thrinfo_t* ) bli_malloc_scratch( sizeof( gemm_thrinfo_t ) );
bli_setup_gemm_thrinfo_node( thread, ocomm, ocomm_id,
icomm, icomm_id,
n_way, work_id,
bli_packm_thrinfo_free( thread->opackm );
bli_packm_thrinfo_free( thread->ipackm );
bli_gemm_thrinfo_free( thread->sub_gemm );
- bli_free( thread );
+ bli_free_scratch( thread );
return;
}
bli_gemm_thrinfo_free( threads[i] );
}
- bli_free( threads );
+ bli_free_scratch( threads );
}
gemm_thrinfo_t** bli_create_gemm_thrinfo_paths( )
dim_t ir_nt = 1;
- gemm_thrinfo_t** paths = (gemm_thrinfo_t**) bli_malloc( global_num_threads * sizeof( gemm_thrinfo_t* ) );
+ gemm_thrinfo_t** paths = (gemm_thrinfo_t**) bli_malloc_scratch( global_num_threads * sizeof( gemm_thrinfo_t* ) );
thread_comm_t* global_comm = bli_create_communicator( global_num_threads );
for( int a = 0; a < jc_way; a++ )
index c1c62a0b8a6358ee6a5218ad4f80382d134df3d8..94afbbdbde083cab4e435824fddc4d8bac4f056c 100644 (file)
{
num_t dt = bli_obj_datatype( *c );
+#if defined (BLIS_ENABLE_C66X_BUILD)
+ lib_smem_sreset(blasGetMemHandle()); /* reset BLAS scratch heap */
+#endif
+
if ( bli_3mh_is_enabled_dt( dt ) ) bli_hemm3mh_entry( side, alpha, a, b, beta, c );
else if ( bli_3m_is_enabled_dt( dt ) ) bli_hemm3m_entry( side, alpha, a, b, beta, c );
else if ( bli_4mh_is_enabled_dt( dt ) ) bli_hemm4mh_entry( side, alpha, a, b, beta, c );
index 91ae16d4743e955300f3329bf95af794ecb00be9..ff8a557c508172496fa04aec1e17f396c063d303 100644 (file)
obj_t b_local;
obj_t c_local;
+#ifdef BLIS_ENABLE_PROFILE
+ extern profile_data_t *bli_gemm_profile_data;
+ dim_t m_profile, n_profile, k_profile;
+#endif
+
// Check parameters.
if ( bli_error_checking_is_enabled() )
bli_hemm_check( side, alpha, a, b, beta, c );
gemm_thrinfo_t** infos = bli_create_gemm_thrinfo_paths();
dim_t n_threads = thread_num_threads( infos[0] );
+#ifdef BLIS_ENABLE_PROFILE
+ m_profile = bli_obj_length( *c );
+ n_profile = bli_obj_width( *c );
+ k_profile = bli_obj_width_after_trans( *a );
+ bli_gemm_profile_data = bli_profile_data_init(BLIS_MAX_NUM_THREADS*BLIS_PROFILE_NUM_REPORTS); //One struct for each thread for each variant
+#endif
+
// Invoke the internal back-end.
bli_level3_thread_decorator( n_threads,
(level3_int_t) bli_gemm_int,
(void*) cntl,
(void**) infos );
+#ifdef BLIS_ENABLE_PROFILE
+ bli_profile_data_print (bli_gemm_profile_data,
+ m_profile, n_profile, k_profile,
+ bli_obj_execution_datatype( *c ), 2, n_threads);
+
+ bli_profile_data_free(bli_gemm_profile_data);
+
+#endif
+
bli_gemm_thrinfo_free_paths( infos, n_threads );
}
index 74e1613df1345260a72d61f4ba718164b5544a0e..64863831e57cadc16a1db0136fbdac24944592cc 100644 (file)
{
num_t dt = bli_obj_datatype( *c );
+#if defined (BLIS_ENABLE_C66X_BUILD)
+ lib_smem_sreset(blasGetMemHandle()); /* reset BLAS scratch heap */
+#endif
+
if ( bli_3mh_is_enabled_dt( dt ) ) bli_her2k3mh_entry( alpha, a, b, beta, c );
else if ( bli_3m_is_enabled_dt( dt ) ) bli_her2k3m_entry( alpha, a, b, beta, c );
else if ( bli_4mh_is_enabled_dt( dt ) ) bli_her2k4mh_entry( alpha, a, b, beta, c );
else if ( bli_4m_is_enabled_dt( dt ) ) bli_her2k4m_entry( alpha, a, b, beta, c );
- else bli_her2k_entry( alpha, a, b, beta, c );
+ else
+ bli_her2k_entry( alpha, a, b, beta, c );
}
index 7753e6b1e5c45dcb659485fba03b30cc25353430..863cfbe5ab1fba6d24b3a065dcb6b1171bdc43e0 100644 (file)
obj_t b_local;
obj_t ah_local;
+#ifdef BLIS_ENABLE_PROFILE
+ dim_t m_profile, k_profile;
+ extern profile_data_t *bli_herk_profile_data;
+#endif
+
+#ifdef BLIS_ENABLE_PROFILE
+ m_profile = bli_obj_length( *c );
+ k_profile = bli_obj_width_after_trans( *a );
+#endif
+
// Check parameters.
if ( bli_error_checking_is_enabled() )
bli_her2k_check( alpha, a, b, beta, c );
herk_thrinfo_t** infos = bli_create_herk_thrinfo_paths();
dim_t n_threads = thread_num_threads( infos[0] );
+#ifdef BLIS_ENABLE_PROFILE
+ bli_herk_profile_data = bli_profile_data_init(BLIS_MAX_NUM_THREADS*BLIS_PROFILE_NUM_REPORTS);
+#endif
+
+
// Invoke the internal back-end.
bli_level3_thread_decorator( n_threads,
(level3_int_t) bli_herk_int,
(void*) cntl,
(void**) infos );
+#ifdef BLIS_ENABLE_PROFILE
+ bli_profile_data_print (bli_herk_profile_data,
+ m_profile, m_profile, k_profile,
+ bli_obj_execution_datatype( *c ), 1.0, n_threads);
+
+ bli_profile_data_free(bli_herk_profile_data);
+#endif
+
bli_herk_thrinfo_free_paths( infos, n_threads );
#endif
index a56ff3971b988ff1e8f9fab297a2eac935f0cfdc..9fe511b6f016fddb2bc644808c8c318ed18ac022 100644 (file)
{
num_t dt = bli_obj_datatype( *c );
+#if defined (BLIS_ENABLE_C66X_BUILD)
+ lib_smem_sreset(blasGetMemHandle()); /* reset BLAS scratch heap */
+#endif
+
if ( bli_3mh_is_enabled_dt( dt ) ) bli_herk3mh_entry( alpha, a, beta, c );
else if ( bli_3m_is_enabled_dt( dt ) ) bli_herk3m_entry( alpha, a, beta, c );
else if ( bli_4mh_is_enabled_dt( dt ) ) bli_herk4mh_entry( alpha, a, beta, c );
index c21db299f352ee192ff091377224dff191565357..7817434a7ac1b1d9e480a526f2e9240247cb2799 100644 (file)
// Query dimension in partitioning direction.
m_trans = bli_obj_length_after_trans( *c );
dim_t start, end;
- bli_get_range_weighted( thread, 0, m_trans,
- bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
- bli_obj_is_upper( *c ), &start, &end );
+
+// bli_get_range_weighted( thread, 0, m_trans,
+// bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
+// bli_obj_is_upper( *c ), &start, &end );
+
+#if 0
+ if(bli_is_lower( bli_obj_root_uplo( *c ) ))
+ {
+ dim_t n_trans;
+ n_trans = bli_obj_width_after_trans(*c);
+
+ {
+ dim_t At, Ar, X;
+ dim_t num_threads_At, num_threads_Ar;
+ At = ceil(n_trans*n_trans/2);
+ Ar = bli_max(0, m_trans - n_trans)*n_trans;
+ X = ceil(Ar/At);
+
+ if (X > 0)
+ {
+ num_threads_At = thread->n_way / (1 + X);
+ num_threads_Ar = thread->n_way - num_threads_At;
+
+ printf("n_trans = %d At = %d, Ar = %d, X = %d, num_threads_At = %d, num_threads_Ar = %d\n", n_trans*n_trans, At, Ar, X, num_threads_At, num_threads_Ar);
+
+ if(thread->work_id < num_threads_At)
+ {
+ dim_t all_start = 0;
+ dim_t all_end = n_trans;
+ dim_t block_factor = bli_determine_reg_blocksize( a, cntl_blocksize( cntl ));
+ uplo_t uplo = bli_obj_root_uplo( *c );
+ bool_t handle_edge_low = FALSE;
+ dim_t n_way = num_threads_At;
+ dim_t work_id = thread->work_id;
+ dim_t size = all_end - all_start; // partitioning only the triangular part
+ dim_t width;
+ dim_t block_fac_leftover = size % block_factor;
+ dim_t i;
+ double num;
+
+ bli_toggle_uplo(uplo);
+
+ //printf("Triangle: work_id = %d \n", thread->work_id);
+
+ start = 0;
+ end = all_end - all_start;
+ num = size * size / ( double )n_way;
+
+ for ( i = 0; TRUE; ++i )
+ {
+ width = ceil( sqrt( start * start + num ) ) - start;
+
+ if ( i == 0 && handle_edge_low )
+ {
+ if ( width % block_factor != block_fac_leftover )
+ width += block_fac_leftover - ( width % block_factor );
+ }
+ else
+ {
+ if ( width % block_factor != 0 )
+ width += block_factor - ( width % block_factor );
+ }
+
+ if ( work_id == 0 )
+ {
+ start = start + all_start;
+ end = bli_min( start + width, all_end );
+ break;
+ }
+ else
+ {
+ start = start + width;
+ work_id--;
+ }
+ }
+
+ }
+ else
+ {
+ dim_t all_start = n_trans;
+ dim_t all_end = m_trans;
+ dim_t block_factor = bli_determine_reg_blocksize( a, cntl_blocksize( cntl ));
+ bool_t handle_edge_low = FALSE;
+
+ dim_t n_way = num_threads_Ar;
+ dim_t work_id = thread->work_id - num_threads_At;
+
+ dim_t size = all_end - all_start;
+
+ dim_t n_bf_whole = size / block_factor;
+ dim_t n_bf_left = size % block_factor;
+
+ dim_t n_bf_lo = n_bf_whole / n_way;
+ dim_t n_bf_hi = n_bf_whole / n_way;
+
+ //printf("Rectangle: work_id = %d \n", thread->work_id);
+
+
+ if ( handle_edge_low == FALSE )
+ {
+ // Notice that if all threads receive the same number of
+ // block_factors, those threads are considered "high" and
+ // the "low" thread group is empty.
+ dim_t n_th_lo = n_bf_whole % n_way;
+ //dim_t n_th_hi = n_way - n_th_lo;
+
+
+ // If some partitions must have more block_factors than others
+ // assign the slightly larger partitions to lower index threads.
+ if ( n_th_lo != 0 ) n_bf_lo += 1;
+
+ // Compute the actual widths (in units of rows/columns) of
+ // individual threads in the low and high groups.
+ dim_t size_lo = n_bf_lo * block_factor;
+ dim_t size_hi = n_bf_hi * block_factor;
+
+ // Precompute the starting indices of the low and high groups.
+ dim_t lo_start = all_start;
+ dim_t hi_start = all_start + n_th_lo * size_lo;
+
+ // Compute the start and end of individual threads' ranges
+ // as a function of their work_ids and also the group to which
+ // they belong (low or high).
+ if ( work_id < n_th_lo )
+ {
+ start = lo_start + (work_id ) * size_lo;
+ end = lo_start + (work_id+1) * size_lo;
+ }
+ else // if ( n_th_lo <= work_id )
+ {
+ start = hi_start + (work_id-n_th_lo ) * size_hi;
+ end = hi_start + (work_id-n_th_lo+1) * size_hi;
+
+ // Since the edge case is being allocated to the high
+ // end of the index range, we have to advance the last
+ // thread's end.
+ if ( work_id == n_way - 1 ) end += n_bf_left;
+ }
+ }
+ else // if ( handle_edge_low == TRUE )
+ {
+ // Notice that if all threads receive the same number of
+ // block_factors, those threads are considered "low" and
+ // the "high" thread group is empty.
+ dim_t n_th_hi = n_bf_whole % n_way;
+ dim_t n_th_lo = n_way - n_th_hi;
+
+ // If some partitions must have more block_factors than others
+ // assign the slightly larger partitions to higher index threads.
+ if ( n_th_hi != 0 ) n_bf_hi += 1;
+
+ // Compute the actual widths (in units of rows/columns) of
+ // individual threads in the low and high groups.
+ dim_t size_lo = n_bf_lo * block_factor;
+ dim_t size_hi = n_bf_hi * block_factor;
+
+ // Precompute the starting indices of the low and high groups.
+ dim_t lo_start = all_start;
+ dim_t hi_start = all_start + n_th_lo * size_lo
+ + n_bf_left;
+
+ // Compute the start and end of individual threads' ranges
+ // as a function of their work_ids and also the group to which
+ // they belong (low or high).
+ if ( work_id < n_th_lo )
+ {
+ start = lo_start + (work_id ) * size_lo;
+ end = lo_start + (work_id+1) * size_lo;
+
+ // Since the edge case is being allocated to the low
+ // end of the index range, we have to advance the
+ // starts/ends accordingly.
+ if ( work_id == 0 ) end += n_bf_left;
+ else { start += n_bf_left;
+ end += n_bf_left; }
+ }
+ else // if ( n_th_lo <= work_id )
+ {
+ start = hi_start + (work_id-n_th_lo ) * size_hi;
+ end = hi_start + (work_id-n_th_lo+1) * size_hi;
+ }
+ }
+ }
+ }
+ else
+ {
+ bli_get_range_weighted_t2b( thread, 0, m_trans,
+ bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
+ bli_obj_root_uplo( *c ), &start, &end );
+ }
+ }
+ }
+ else
+ bli_get_range_weighted_t2b( thread, 0, m_trans,
+ bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
+ bli_obj_root_uplo( *c ), &start, &end );
+#else
+ bli_get_range_weighted_t2b( thread, 0, m_trans,
+ bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
+ bli_obj_root_uplo( *c ), &start, &end );
+#endif
#ifdef BLIS_ENABLE_C66X_EDMA
if(start < end)
bli_dmam_init( &a1, a1_dma, cntl_sub_dmam_a( cntl ) );
bli_dmam_init( &c1, c1_dma, cntl_sub_dmam_c( cntl ) );
}
- //bli_obj_print("subpart", &a1);
- //bli_obj_print("subpart", a1_dma);
bli_dmam_int( &a1, a1_dma, cntl_sub_dmam_a( cntl ), (dmam_thrinfo_t *) gemm_thread_sub_ipackm( thread ) );
bli_dmam_int( &c1, c1_dma, cntl_sub_dmam_c( cntl ), (dmam_thrinfo_t *) gemm_thread_sub_ipackm( thread ) );
bli_obj_release_pack( ah_pack, cntl_sub_packm_b( cntl ) );
#ifdef BLIS_ENABLE_C66X_EDMA
bli_obj_release_dma( ah_dma , cntl_sub_dmam_b( cntl ));
- bli_obj_release_edma_handle( ah_dma );
+ bli_obj_release_emt_handle( ah_dma );
#endif
}
if( thread_am_ichief( thread ) )
#ifdef BLIS_ENABLE_C66X_EDMA
bli_obj_release_dma( a1_dma, cntl_sub_dmam_a( cntl ) );
bli_obj_release_dma( c1_dma, cntl_sub_dmam_c( cntl ) );
- bli_obj_release_edma_handle( a1_dma );
- bli_obj_release_edma_handle( c1_dma );
+ bli_obj_release_emt_handle( a1_dma );
+ bli_obj_release_emt_handle( c1_dma );
#endif
}
}
index caf2d2f7d278da5c61ea50bb95a1ad50e6fb7a3e..35737049cda564aed09ffbc59e2e95e122d95cc5 100644 (file)
dim_t start, end;
// Needs to be replaced with a weighted range because triangle
- bli_get_range_weighted( thread, 0, n_trans,
- bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
- bli_obj_is_lower( *c ), &start, &end );
+// bli_get_range_weighted( thread, 0, n_trans,
+// bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
+// bli_obj_is_lower( *c ), &start, &end );
+
+ bli_get_range_weighted_l2r( thread, 0, n_trans,
+ bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
+ bli_obj_root_uplo( *c ), &start, &end );
+
// Partition along the n dimension.
for ( i = start; i < end; i += b_alg )
bli_acquire_mpart_t2b( stored_part,
i, b_alg, a_pack, &aS_pack );
+ //printf("c1S: blk_var2: m = %d, n = %d, diagoff = %d \nc1: blk_var2: m = %d, n = %d, diagoff = %d\n", bli_obj_length( c1S ), bli_obj_width( c1S ), bli_obj_diag_offset(c1S), bli_obj_length( c1 ), bli_obj_width( c1 ), bli_obj_diag_offset(c1));
+
// Initialize objects for packing A1' and C1.
if( thread_am_ichief( thread ) ) {
bli_packm_init( &ah1, ah1_pack,
index 6156488688ffb71ae3dcf65072141d5fb6d75193..7d28287c82e2f5c5360aabbaef4b994c976c6ea8 100644 (file)
#ifdef BLIS_ENABLE_C66X_EDMA
bli_obj_release_dma( a1_dma, cntl_sub_dmam_a( cntl ) );
bli_obj_release_dma( ah1_dma, cntl_sub_dmam_b( cntl ) );
- bli_obj_release_edma_handle( a1_dma );
- bli_obj_release_edma_handle( ah1_dma );
+ bli_obj_release_emt_handle( a1_dma );
+ bli_obj_release_emt_handle( ah1_dma );
#endif
bli_obj_release_pack( a1_pack, cntl_sub_packm_a( cntl ));
bli_obj_release_pack( ah1_pack, cntl_sub_packm_b( cntl ) );
index c8ce5ed0414942aae1d9e66e2ba99f003586cafd..6cb44ce7247abcc167d677e7b716fd1c75b64c19 100644 (file)
#include "blis.h"
+#ifdef BLIS_ENABLE_PROFILE
+ profile_data_t *bli_herk_profile_data;
+#endif
+
void bli_herk_front( obj_t* alpha,
obj_t* a,
obj_t* beta,
obj_t a_local;
obj_t ah_local;
obj_t c_local;
+#ifdef BLIS_ENABLE_PROFILE
+ dim_t m_profile, k_profile;
+
+ m_profile = bli_obj_length( *c );
+ k_profile = bli_obj_width_after_trans( *a );
+#endif
// Check parameters.
if ( bli_error_checking_is_enabled() )
herk_thrinfo_t** infos = bli_create_herk_thrinfo_paths();
dim_t n_threads = thread_num_threads( infos[0] );
+#ifdef BLIS_ENABLE_PROFILE
+ bli_herk_profile_data = bli_profile_data_init(BLIS_MAX_NUM_THREADS*BLIS_PROFILE_NUM_REPORTS);
+#endif
+
// Invoke the internal back-end.
bli_level3_thread_decorator( n_threads,
(level3_int_t) bli_herk_int,
bli_herk_thrinfo_free_paths( infos, n_threads );
+#ifdef BLIS_ENABLE_PROFILE
+ bli_profile_data_print (bli_herk_profile_data,
+ m_profile, m_profile, k_profile,
+ bli_obj_execution_datatype( *c ), 1.0, n_threads);
+
+ bli_profile_data_free(bli_herk_profile_data);
+#endif
+
// The Hermitian rank-k product was computed as A*A', even for the
// diagonal elements. Mathematically, the imaginary components of
// diagonal elements of a Hermitian rank-k product should always be
index 01fb8c983dae5dbe5364533ddb4603b60108c786..b548fef4d368bde329d71ce8dea48056ee685797 100644 (file)
#include "blis.h"
-#ifdef BLIS_ENABLE_C66X_BUILD
-//#define BLIS_ENABLE_CYCLE_COUNT_BLIS_GEMM_INT
-#endif
-
#define FUNCPTR_T herk_fp
typedef void (*FUNCPTR_T)( obj_t* a,
impl_t i;
bool_t uplo;
FUNCPTR_T f;
-#ifdef BLIS_ENABLE_CYCLE_COUNT_BLIS_GEMM_INT
- volatile int counter_start;
- volatile int counter_end;
+#if defined(BLIS_ENABLE_PROFILE)
+ volatile uint64_t counter_start;
+ volatile uint64_t counter_end;
+ extern profile_data_t *bli_herk_profile_data;
+ dim_t m_var, k_var, n_var;
+ dim_t index;
#endif
// Check parameters.
// Index into the variant array to extract the correct function pointer.
f = vars[uplo][n][i];
-#ifdef BLIS_ENABLE_CYCLE_COUNT_BLIS_GEMM_INT
- TSCL = 0;
- counter_start = TSCL;
+#if defined(BLIS_ENABLE_PROFILE)
+ m_var = bli_obj_length( c_local );
+ k_var = bli_obj_width_after_trans( a_local );
+ n_var = bli_obj_width_after_trans( ah_local );
+
+#if defined(BLIS_ENABLE_C66X_BUILD)
+ TSCL = 0;
+ counter_start = lib_clock64();
+#else
+ counter_start = (uint64_t) (bli_clock()*1.2e9);
+#endif
#endif
// Invoke the variant.
f( &a_local,
&c_local,
cntl,
thread );
-#ifdef BLIS_ENABLE_CYCLE_COUNT_BLIS_GEMM_INT
- counter_end = TSCL;
- if(CSL_chipReadDNUM ()==0) printf("xxxxx bli_gemm_int \t %d %d %d \t %d\n", uplo, n, i, counter_end-counter_start);
+#if defined(BLIS_ENABLE_PROFILE)
+
+#if defined(BLIS_ENABLE_C66X_BUILD)
+ counter_end = lib_clock64();
+#else // if not DSP
+ counter_end = (uint64_t) (bli_clock()*1.2e9);
#endif
+ bli_profile_get_index(n, i, index);
+ bli_profile_data_update(bli_herk_profile_data[index], (counter_end-counter_start), m_var*k_var*n_var);
+
+#endif // if defined profile
}
index fb01f90853af6850416e06c52f9f5ae567d5e0ef..2af25bbcf0c2f36d786b2479703b9fa622c738ac 100644 (file)
#define FUNCPTR_T herk_fp
+#ifdef BLIS_ENABLE_PROFILE
+#define BLIS_ENABLE_PROFILE_KERVAR2 1
+#else
+#define BLIS_ENABLE_PROFILE_KERVAR2 0
+#endif
+
typedef void (*FUNCPTR_T)(
doff_t diagoffc,
pack_t schema_a,
ctype *cNew0, *cNew1, *cNew2, *cNewTemp; \
/*EDMA Declarations */ \
\
- EdmaMgr_Handle edma_handle_b = NULL; \
- EdmaMgr_Handle edma_handle_c0 = NULL; \
- EdmaMgr_Handle edma_handle_c1 = NULL; \
+ lib_emt_Handle emt_handle_b = NULL; \
+ lib_emt_Handle emt_handle_c0 = NULL; \
+ lib_emt_Handle emt_handle_c1 = NULL; \
+\
+ /*For DSP timing*/ \
+ volatile uint64_t counter_start_ker, counter_start_nr, counter_start_mr; \
+ volatile uint64_t counter_end_ker, counter_end_nr, counter_end_mr; \
+ extern profile_data_t *bli_herk_profile_data; \
\
- /*volatile int counter_start; \
- volatile int counter_end;*/ \
/*
Assumptions/assertions:
rs_a == 1
cNew2 = bli_mem_buffer( &c2_L2_mem ); \
\
/*Acquiring an EDMA handle from the pool*/ \
- bli_dma_channel_acquire(&(edma_handle_b), CSL_chipReadDNUM()); \
- if(edma_handle_b == NULL) \
+ bli_dma_channel_acquire(&(emt_handle_b), lib_get_coreID()); \
+ if(emt_handle_b == NULL) \
{ \
- printf("ker_var2 Failed to alloc edma handle CoreID %d \n", CSL_chipReadDNUM()); \
+ printf("ker_var2 Failed to alloc edma handle CoreID %d \n", lib_get_coreID()); \
} \
- bli_dma_channel_acquire(&(edma_handle_c0), CSL_chipReadDNUM()); \
- if(edma_handle_c0 == NULL) \
+ bli_dma_channel_acquire(&(emt_handle_c0), lib_get_coreID()); \
+ if(emt_handle_c0 == NULL) \
{ \
- printf("ker_var2 Failed to alloc edma handle for C0 CoreID %d \n", CSL_chipReadDNUM()); \
+ printf("ker_var2 Failed to alloc edma handle for C0 CoreID %d \n", lib_get_coreID()); \
} \
/*Acquiring an EDMA handle from the pool*/ \
- bli_dma_channel_acquire(&(edma_handle_c1), CSL_chipReadDNUM()); \
- if(edma_handle_c1 == NULL) \
+ bli_dma_channel_acquire(&(emt_handle_c1), lib_get_coreID()); \
+ if(emt_handle_c1 == NULL) \
{ \
- printf("ker_var2 Failed to alloc edma handle for C1 CoreID %d \n", CSL_chipReadDNUM()); \
+ printf("ker_var2 Failed to alloc edma handle for C1 CoreID %d \n", lib_get_coreID()); \
} \
\
{ \
* If it does, update the new m and move the pointers of C and A accordingly
* Logic is not working as of now.*/\
diagoffc_j = diagoffc - (doff_t) jr_thread_id * NR; \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_nr = lib_clock64(); \
+ } \
/*if ( diagoffc_j < 0 ) \
{ \
dim_t ii; \
* values of C written back.*/ \
n_cur = ( bli_is_not_edge_f( jr_thread_id, n_iter, n_left ) ? NR : n_left ); \
if (cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
- EdmaMgr_copy2D2DSep(edma_handle_c0, c1_new, \
+ lib_emt_copy2D2D(emt_handle_c0, c1_new, \
cNew1, mc_new*sizeof(ctype), \
n_cur, cs_c*sizeof(ctype), cs_c11*sizeof(ctype)); \
else \
n_next = ( bli_is_not_edge_f( j+jr_num_threads, n_iter, n_left ) ? NR : n_left ); \
\
/*Start EDMA of KCxNR panel of B*/\
- EdmaMgr_copy1D1D(edma_handle_b, b1, b1_L1, k*NR*sizeof(ctype)); \
+ lib_emt_copy1D1D(emt_handle_b, b1, b1_L1, k*NR*sizeof(ctype)); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
/*IDMA the first block of A only if it below the diagonal*/ \
m_cur = ( bli_is_not_edge_f( i, m_iter_new, m_left_new ) ? MR : m_left_new ); \
if (!bli_is_strictly_above_diag_n( diagoffc_j + (doff_t)ir_thread_id*MR, n_cur, m_cur )) \
- idma1_setup(a2_L1, a1_new + ir_thread_id * rstep_a, k*MR*sizeof(ctype), 0, 0, 7); \
+ lib_imt_copy(a1_new + ir_thread_id * rstep_a, a2_L1, k*MR*sizeof(ctype)); \
\
/* wait for previous c transfer to complete and initiate next transfer */ \
- EdmaMgr_wait(edma_handle_c0); \
+ lib_emt_wait(emt_handle_c0); \
if(j < (n_iter-jr_num_threads)) /* no transfer for last iteration */ \
{\
/* Compute the diagonal offset for the next MCxNR bloc of C at (MC,j+jr_num_threads). */ \
cs_c11_next = (mc_next%2 == 0) ? mc_next : mc_next+1; /*(m_iter_next-ir_thread_id)*MR;*/ \
} \
if (cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
- EdmaMgr_copy2D2DSep(edma_handle_c0, c1_next, \
+ lib_emt_copy2D2D(emt_handle_c0, c1_next, \
cNew0, mc_next*sizeof(ctype), \
n_next, cs_c*sizeof(ctype), \
cs_c11_next*sizeof(ctype)); \
} \
/*If EDMA-ing the whole MCXNR panel of C*/ \
/*c1_next = c_cast + (j+jr_num_threads) * cstep_c; \
- EdmaMgr_copy2D2DSep(edma_handle_c0, c1_next, \
+ lib_emt_copy2D2D(emt_handle_c0, c1_next, \
cNew0, m*sizeof(ctype), \
n_next, cs_c*sizeof(ctype), \
cs_c11*sizeof(ctype));*/ \
}\
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_mr = lib_clock64(); \
+ } \
/* Interior loop over the m dimension (MR rows at a time). */ \
for ( i = ir_thread_id; i < m_iter_new; i += ir_num_threads ) \
{ \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = herk_get_next_a_micropanel( caucus, a1, rstep_a ); \
- while(!idma1_done()){;} \
+ lib_imt_wait(); \
temp = a1_L1; \
a1_L1 = a2_L1; \
a2_L1 = temp; \
{ \
/*Start next panel*/ \
if (!bli_is_strictly_above_diag_n( diagoffc_ij_next, m_next, n_cur )) \
- idma1_setup(a2_L1, a2, k*MR*sizeof(ctype), 0, 0, 7); \
+ lib_imt_copy(a2, a2_L1, k*MR*sizeof(ctype)); \
} \
if(i == ir_thread_id) \
{ \
- EdmaMgr_wait(edma_handle_b);\
+ lib_emt_wait(emt_handle_b);\
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
continue. */ \
if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
{ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_ker = lib_clock64(); \
+ } \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr_cast( k, \
alpha_cast, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c11, cs_c11 /* rs_c, cs_c */); \
+\
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_ker = lib_clock64(); \
+ bli_profile_data_update(bli_herk_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_KER_LOOP_IND],\
+ (counter_end_ker-counter_start_ker), 2*k*m_cur*n_cur); \
+ } \
} \
else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
{ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_ker = lib_clock64(); \
+ } \
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
beta_cast, \
c11, rs_c11, cs_c11 /* rs_c, cs_c */); \
} \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_ker = lib_clock64(); \
+ bli_profile_data_update(bli_herk_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_KER_LOOP_IND], \
+ (counter_end_ker-counter_start_ker), 2*k*m_cur*n_cur); \
+ } \
} \
- /*counter_end=TSCL; \
- if (CSL_chipReadDNUM () == 0) \
- printf("%d %d %d\t%d\n",n_cur, m_cur, k, counter_end-counter_start); */\
+ } \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_mr = lib_clock64(); \
+ bli_profile_data_update(bli_herk_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_IR_LOOP_IND], \
+ (counter_end_mr-counter_start_mr), 2*mc_new*k*n_cur); \
} \
/* circularly shift buffers */ \
cNewTemp = cNew0; \
cNew1 = cNewTemp; \
if(j != jr_thread_id) /* wait for save c to complete; skip first iteration */ \
{ \
- EdmaMgr_wait(edma_handle_c1); \
+ lib_emt_wait(emt_handle_c1); \
} \
/* save updated c */ \
if (cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
- EdmaMgr_copy2D2DSep(edma_handle_c1, cNew2, c1_new, mc_new*sizeof(ctype), \
+ lib_emt_copy2D2D(emt_handle_c1, cNew2, c1_new, mc_new*sizeof(ctype), \
n_cur, cs_c11*sizeof(ctype), cs_c*sizeof(ctype)); \
else \
{ \
diagoffc_j = diagoffc_j_next; \
cs_c11 = cs_c11_next; \
} \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_nr = lib_clock64(); \
+ bli_profile_data_update(bli_herk_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_JR_LOOP_IND], \
+ (counter_end_nr-counter_start_nr), 2*mc_new*k*n); \
+ } \
}\
\
bli_mem_release( &c2_L2_mem ); \
bli_mem_release( &a2_L1_mem ); \
bli_mem_release( &a1_L1_mem ); \
bli_mem_release( &b1_L1_mem ); \
- if ( edma_handle_b != NULL ) \
+ if ( emt_handle_b != NULL ) \
{ \
- bli_dma_channel_release(edma_handle_b, CSL_chipReadDNUM()); \
- edma_handle_b = NULL; \
+ bli_dma_channel_release(emt_handle_b, lib_get_coreID()); \
+ emt_handle_b = NULL; \
} \
- if ( edma_handle_c0 != NULL ) \
+ if ( emt_handle_c0 != NULL ) \
{ \
- bli_dma_channel_release(edma_handle_c0, CSL_chipReadDNUM()); \
- edma_handle_c0 = NULL; \
+ bli_dma_channel_release(emt_handle_c0, lib_get_coreID()); \
+ emt_handle_c0 = NULL; \
} \
- if ( edma_handle_c1 != NULL ) \
+ if ( emt_handle_c1 != NULL ) \
{ \
- EdmaMgr_wait(edma_handle_c1); /* wait for save c to complete */ \
- bli_dma_channel_release(edma_handle_c1, CSL_chipReadDNUM()); \
- edma_handle_c1 = NULL; \
+ lib_emt_wait(emt_handle_c1); /* wait for save c to complete */ \
+ bli_dma_channel_release(emt_handle_c1, lib_get_coreID()); \
+ emt_handle_c1 = NULL; \
} \
}
INSERT_GENTFUNC_BASIC( herk_l_ker_var2, gemm_ukr_t )
-
-#else
-
#endif
#else
inc_t istep_a; \
inc_t istep_b; \
auxinfo_t aux; \
+\
+/*For DSP timing*/ \
+ volatile uint64_t counter_start_ker, counter_start_nr, counter_start_mr; \
+ volatile uint64_t counter_end_ker, counter_end_nr, counter_end_mr; \
+ extern profile_data_t *bli_herk_profile_data; \
\
herk_thrinfo_t* caucus = herk_thread_sub_herk( thread ); \
dim_t jr_num_threads = thread_n_way( thread ); \
c1 = c_cast; \
\
/* Loop over the n dimension (NR columns at a time). */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_nr = (uint64_t) (bli_clock()*1.2e9); \
+ } \
for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \
{ \
ctype* restrict a1; \
b2 = b1; \
\
/* Interior loop over the m dimension (MR rows at a time). */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_mr = (uint64_t) (bli_clock()*1.2e9); \
+ } \
for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \
{ \
ctype* restrict a2; \
continue. */ \
if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
{ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_ker = (uint64_t) (bli_clock()*1.2e9); \
+ } \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr_cast( k, \
alpha_cast, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_ker = (uint64_t) (bli_clock()*1.2e9); \
+ bli_profile_data_update(bli_herk_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_KER_LOOP_IND], \
+ ( (long int) (counter_end_ker-counter_start_ker)), 2*k*m_cur*n_cur); \
+ } \
} \
else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
{ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_ker = (uint64_t) (bli_clock()*1.2e9); \
+ } \
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
beta_cast, \
c11, rs_c, cs_c ); \
} \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_ker = (uint64_t) (bli_clock()*1.2e9); \
+ bli_profile_data_update(bli_herk_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_KER_LOOP_IND], \
+ ( (long int) (counter_end_ker-counter_start_ker)), 2*k*m_cur*n_cur); \
+ } \
} \
} \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_mr = (uint64_t) (bli_clock()*1.2e9); \
+ bli_profile_data_update(bli_herk_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_IR_LOOP_IND],\
+ ( (long int) (counter_end_mr-counter_start_mr)), 2*m*k*n_cur); \
+ } \
+ } \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_nr = (uint64_t) (bli_clock()*1.2e9); \
+ bli_profile_data_update(bli_herk_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_JR_LOOP_IND], \
+ ( (long int)(counter_end_nr-counter_start_nr)), 2*m*k*n); \
} \
}
index 8bec82440d26958c70dbb79de30e3fa77d7f5ab8..0fdfc2f40c25a6e016816ea4fd61630101db4040 100644 (file)
packm_thrinfo_t* ipackm,
herk_thrinfo_t* sub_herk )
{
- herk_thrinfo_t* thread = ( herk_thrinfo_t* ) bli_malloc( sizeof( herk_thrinfo_t ) );
+ herk_thrinfo_t* thread = ( herk_thrinfo_t* ) bli_malloc_scratch( sizeof( herk_thrinfo_t ) );
bli_setup_herk_thrinfo_node( thread, ocomm, ocomm_id,
icomm, icomm_id,
n_way, work_id,
bli_packm_thrinfo_free( thread->opackm );
bli_packm_thrinfo_free( thread->ipackm );
bli_herk_thrinfo_free( thread->sub_herk );
- bli_free( thread );
+ bli_free_scratch( thread );
return;
}
{
for( int i = 0; i < num; i++)
bli_herk_thrinfo_free( threads[i] );
- bli_free( threads );
+ bli_free_scratch( threads );
}
herk_thrinfo_t** bli_create_herk_thrinfo_paths( )
dim_t ir_nt = 1;
- herk_thrinfo_t** paths = (herk_thrinfo_t**) bli_malloc( global_num_threads * sizeof( herk_thrinfo_t* ) );
+ herk_thrinfo_t** paths = (herk_thrinfo_t**) bli_malloc_scratch( global_num_threads * sizeof( herk_thrinfo_t* ) );
thread_comm_t* global_comm = bli_create_communicator( global_num_threads );
for( int a = 0; a < jc_way; a++ )
index 1edb8183f4df98fe8bc2b5b49f5ebbc0893e1a54..12eafec36d962631133597d5823c890d48cfd13d 100644 (file)
ctype *cNew0, *cNew1, *cNew2, *cNewTemp; \
/*EDMA Declarations */ \
\
- EdmaMgr_Handle edma_handle_b = NULL; \
- EdmaMgr_Handle edma_handle_c0 = NULL; \
- EdmaMgr_Handle edma_handle_c1 = NULL; \
+ lib_emt_Handle emt_handle_b = NULL; \
+ lib_emt_Handle emt_handle_c0 = NULL; \
+ lib_emt_Handle emt_handle_c1 = NULL; \
+\
+ /*For DSP timing*/ \
+ volatile uint64_t counter_start_ker, counter_start_nr, counter_start_mr; \
+ volatile uint64_t counter_end_ker, counter_end_nr, counter_end_mr; \
+ extern profile_data_t *bli_herk_profile_data; \
\
/*
Assumptions/assertions:
cNew2 = bli_mem_buffer( &c2_L2_mem ); \
\
/*Acquiring an EDMA handle from the pool*/ \
- bli_dma_channel_acquire(&(edma_handle_b), CSL_chipReadDNUM()); \
- if(edma_handle_b == NULL) \
+ bli_dma_channel_acquire(&(emt_handle_b), lib_get_coreID()); \
+ if(emt_handle_b == NULL) \
{ \
- printf("ker_var2 Failed to alloc edma handle CoreID %d \n", CSL_chipReadDNUM()); \
+ printf("ker_var2 Failed to alloc edma handle CoreID %d \n", lib_get_coreID()); \
} \
- bli_dma_channel_acquire(&(edma_handle_c0), CSL_chipReadDNUM()); \
- if(edma_handle_c0 == NULL) \
+ bli_dma_channel_acquire(&(emt_handle_c0), lib_get_coreID()); \
+ if(emt_handle_c0 == NULL) \
{ \
- printf("ker_var2 Failed to alloc edma handle for C0 CoreID %d \n", CSL_chipReadDNUM()); \
+ printf("ker_var2 Failed to alloc edma handle for C0 CoreID %d \n", lib_get_coreID()); \
} \
/*Acquiring an EDMA handle from the pool*/ \
- bli_dma_channel_acquire(&(edma_handle_c1), CSL_chipReadDNUM()); \
- if(edma_handle_c1 == NULL) \
+ bli_dma_channel_acquire(&(emt_handle_c1), lib_get_coreID()); \
+ if(emt_handle_c1 == NULL) \
{ \
- printf("ker_var2 Failed to alloc edma handle for C1 CoreID %d \n", CSL_chipReadDNUM()); \
+ printf("ker_var2 Failed to alloc edma handle for C1 CoreID %d \n", lib_get_coreID()); \
} \
\
/* initiate first c transfer */ \
* (m_iter-ir_thread_id)*MR is not equal to m which would lead to incorrect
* values of C written back.*/ \
n_cur = ( bli_is_not_edge_f( jr_thread_id, n_iter, n_left ) ? NR : n_left ); \
+\
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_nr = lib_clock64(); \
+ } \
+\
if (cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
{ \
- EdmaMgr_copy2D2DSep(edma_handle_c0, c_cast+jr_thread_id*cstep_c, \
+ lib_emt_copy2D2D(emt_handle_c0, c_cast+jr_thread_id*cstep_c, \
cNew1, m*sizeof(ctype), \
n_cur, cs_c*sizeof(ctype), cs_c11*sizeof(ctype)); \
}\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
- EdmaMgr_copy1D1D(edma_handle_b, b1, b1_L1, k*NR*sizeof(ctype)); \
- idma1_setup(a2_L1, a_cast + ir_thread_id * rstep_a, k*MR*sizeof(ctype), 0, 0, 7); \
+ lib_emt_copy1D1D(emt_handle_b, b1, b1_L1, k*NR*sizeof(ctype)); \
+ lib_imt_copy(a_cast + ir_thread_id * rstep_a, a2_L1, k*MR*sizeof(ctype)); \
/* wait for previous c transfer to complete and initiate next transfer */ \
- EdmaMgr_wait(edma_handle_c0); \
+ lib_emt_wait(emt_handle_c0); \
if(j < (n_iter-jr_num_threads)) /* no transfer for last iteration */ \
{ \
if (cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
{ \
- EdmaMgr_copy2D2DSep(edma_handle_c0, c1+jr_num_threads*cstep_c, \
+ lib_emt_copy2D2D(emt_handle_c0, c1+jr_num_threads*cstep_c, \
cNew0, m*sizeof(ctype), /*(m_iter-ir_thread_id)*sizeof(ctype)*MR,*/ \
n_next, cs_c*sizeof(ctype), \
cs_c11*sizeof(ctype)); \
} \
}\
/* Interior loop over the m dimension (MR rows at a time). */ \
+\
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_mr = lib_clock64(); \
+ } \
+\
for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \
{ \
ctype* restrict a2; \
a1_L1 = a2_L1; \
a2_L1 = temp; \
/*a1 = a2; Make the next panel the current panel for the next iteration*/ \
- while(!idma1_done()){;} \
+ lib_imt_wait(); \
if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \
{ \
a2 = a_cast; \
else \
{\
/*Start next panel*/ \
- idma1_setup(a2_L1, a2, k*MR*sizeof(ctype), 0, 0, 7); \
+ lib_imt_copy(a2, a2_L1, k*MR*sizeof(ctype)); \
}\
if(i == ir_thread_id) \
{ \
- EdmaMgr_wait(edma_handle_b);\
+ lib_emt_wait(emt_handle_b);\
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
continue. */ \
if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
{ \
+\
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_ker = lib_clock64(); \
+ } \
+\
/* Invoke the gemm micro-kernel. */ \
gemm_ukr_cast( k, \
alpha_cast, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c11, cs_c11 /*rs_c, cs_c */); \
+\
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_ker = lib_clock64(); \
+ bli_profile_data_update(bli_herk_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_KER_LOOP_IND], \
+ (counter_end_ker-counter_start_ker), 2*k*m_cur*n_cur); \
+ } \
+\
} \
else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
{ \
+\
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_ker = lib_clock64(); \
+ } \
+\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
beta_cast, \
c11, rs_c11, cs_c11 /*rs_c, cs_c */); \
} \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_ker = lib_clock64(); \
+ bli_profile_data_update(bli_herk_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_KER_LOOP_IND], \
+ (counter_end_ker-counter_start_ker), 2*k*m_cur*n_cur); \
+ } \
} \
} \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_mr = lib_clock64(); \
+ bli_profile_data_update(bli_herk_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_IR_LOOP_IND], \
+ (counter_end_mr-counter_start_mr), 2*m*k*n_cur); \
+ } \
/* circularly shift buffers */ \
cNewTemp = cNew0; \
cNew0 = cNew2; \
cNew1 = cNewTemp; \
if(j != jr_thread_id) /* wait for save c to complete; skip first iteration */ \
{ \
- EdmaMgr_wait(edma_handle_c1); \
+ lib_emt_wait(emt_handle_c1); \
} \
/* save updated c */ \
if (cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
{ \
- EdmaMgr_copy2D2DSep(edma_handle_c1, cNew2, c1, m*sizeof(ctype), \
+ lib_emt_copy2D2D(emt_handle_c1, cNew2, c1, m*sizeof(ctype), \
n_cur, cs_c11*sizeof(ctype), cs_c*sizeof(ctype)); \
} \
else \
} \
} \
} \
+\
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_nr = lib_clock64(); \
+ bli_profile_data_update(bli_herk_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_JR_LOOP_IND],\
+ (counter_end_nr-counter_start_nr), 2*m*k*n); \
+ } \
\
bli_mem_release( &c2_L2_mem ); \
bli_mem_release( &c1_L2_mem ); \
bli_mem_release( &a2_L1_mem ); \
bli_mem_release( &a1_L1_mem ); \
bli_mem_release( &b1_L1_mem ); \
- if ( edma_handle_b != NULL ) \
+ if ( emt_handle_b != NULL ) \
{ \
- bli_dma_channel_release(edma_handle_b, CSL_chipReadDNUM()); \
- edma_handle_b = NULL; \
+ bli_dma_channel_release(emt_handle_b, lib_get_coreID()); \
+ emt_handle_b = NULL; \
} \
- if ( edma_handle_c0 != NULL ) \
+ if ( emt_handle_c0 != NULL ) \
{ \
- bli_dma_channel_release(edma_handle_c0, CSL_chipReadDNUM()); \
- edma_handle_c0 = NULL; \
+ bli_dma_channel_release(emt_handle_c0, lib_get_coreID()); \
+ emt_handle_c0 = NULL; \
} \
- if ( edma_handle_c1 != NULL ) \
+ if ( emt_handle_c1 != NULL ) \
{ \
- EdmaMgr_wait(edma_handle_c1); /* wait for save c to complete */ \
- bli_dma_channel_release(edma_handle_c1, CSL_chipReadDNUM()); \
- edma_handle_c1 = NULL; \
+ lib_emt_wait(emt_handle_c1); /* wait for save c to complete */ \
+ bli_dma_channel_release(emt_handle_c1, lib_get_coreID()); \
+ emt_handle_c1 = NULL; \
} \
}
index d01232f65bd0013ff044285822ae44ece6932411..32d4f633304f6388493c1331221adf07bb3ef7c6 100644 (file)
{
num_t dt = bli_obj_datatype( *c );
+#if defined (BLIS_ENABLE_C66X_BUILD)
+ lib_smem_sreset(blasGetMemHandle()); /* reset BLAS scratch heap */
+#endif
+
if ( bli_3mh_is_enabled_dt( dt ) ) bli_symm3mh_entry( side, alpha, a, b, beta, c );
else if ( bli_3m_is_enabled_dt( dt ) ) bli_symm3m_entry( side, alpha, a, b, beta, c );
else if ( bli_4mh_is_enabled_dt( dt ) ) bli_symm4mh_entry( side, alpha, a, b, beta, c );
index 9d2aeaed2c80a7ef4e507139aba6ffd2eb386af7..3bf8f99f34a03892ed8fb9e66b87576d2b4ec0ca 100644 (file)
obj_t b_local;
obj_t c_local;
+#ifdef BLIS_ENABLE_PROFILE
+ extern profile_data_t *bli_gemm_profile_data;
+ dim_t m_profile, n_profile, k_profile;
+#endif
+
// Check parameters.
if ( bli_error_checking_is_enabled() )
bli_symm_check( side, alpha, a, b, beta, c );
gemm_thrinfo_t** infos = bli_create_gemm_thrinfo_paths();
dim_t n_threads = thread_num_threads( infos[0] );
+#ifdef BLIS_ENABLE_PROFILE
+ m_profile = bli_obj_length( *c );
+ n_profile = bli_obj_width( *c );
+ k_profile = bli_obj_width_after_trans( *a );
+ bli_gemm_profile_data = bli_profile_data_init(BLIS_MAX_NUM_THREADS*BLIS_PROFILE_NUM_REPORTS); //One struct for each thread for each variant
+#endif
+
// Invoke the internal back-end.
bli_level3_thread_decorator( n_threads,
(level3_int_t) bli_gemm_int,
(void*) cntl,
(void**) infos );
+#ifdef BLIS_ENABLE_PROFILE
+ bli_profile_data_print (bli_gemm_profile_data,
+ m_profile, n_profile, k_profile,
+ bli_obj_execution_datatype( *c ), 2, n_threads);
+
+ bli_profile_data_free(bli_gemm_profile_data);
+
+#endif
+
bli_gemm_thrinfo_free_paths( infos, n_threads );
}
index 9fbc9d7a78a338ed5c84a049abaa972317a96057..8dae961b62b1510efb0e7db433f533390d16bf9b 100644 (file)
{
num_t dt = bli_obj_datatype( *c );
+#if defined (BLIS_ENABLE_C66X_BUILD)
+ lib_smem_sreset(blasGetMemHandle()); /* reset BLAS scratch heap */
+#endif
+
if ( bli_3mh_is_enabled_dt( dt ) ) bli_syr2k3mh_entry( alpha, a, b, beta, c );
else if ( bli_3m_is_enabled_dt( dt ) ) bli_syr2k3m_entry( alpha, a, b, beta, c );
else if ( bli_4mh_is_enabled_dt( dt ) ) bli_syr2k4mh_entry( alpha, a, b, beta, c );
index fdb131c9c56dd808df1544e35cc6fac31719502d..f42e1bb9c3966333c932c49c8b91cdbb5b6ca779 100644 (file)
obj_t b_local;
obj_t at_local;
+#ifdef BLIS_ENABLE_PROFILE
+ dim_t m_profile, k_profile;
+ extern profile_data_t *bli_herk_profile_data;
+#endif
+
+#ifdef BLIS_ENABLE_PROFILE
+ m_profile = bli_obj_length( *c );
+ k_profile = bli_obj_width_after_trans( *a );
+#endif
+
// Check parameters.
if ( bli_error_checking_is_enabled() )
bli_syr2k_check( alpha, a, b, beta, c );
herk_thrinfo_t** infos = bli_create_herk_thrinfo_paths();
dim_t n_threads = thread_num_threads( infos[0] );
+#ifdef BLIS_ENABLE_PROFILE
+ bli_herk_profile_data = bli_profile_data_init(BLIS_MAX_NUM_THREADS*BLIS_PROFILE_NUM_REPORTS);
+#endif
+
// Invoke the internal back-end.
bli_level3_thread_decorator( n_threads,
(level3_int_t) bli_herk_int,
(void*) cntl,
(void**) infos );
+#ifdef BLIS_ENABLE_PROFILE
+ bli_profile_data_print (bli_herk_profile_data,
+ m_profile, m_profile, k_profile,
+ bli_obj_execution_datatype( *c ), 1.0, n_threads);
+
+ bli_profile_data_free(bli_herk_profile_data);
+#endif
+
bli_herk_thrinfo_free_paths( infos, n_threads );
#endif
}
index 69dde1fcbba4480f30bf3c5be6b416df08c3a834..f44c08d44261409d8be1ef4d622e53e2b858ce72 100644 (file)
{
num_t dt = bli_obj_datatype( *c );
+#if defined (BLIS_ENABLE_C66X_BUILD)
+ lib_smem_sreset(blasGetMemHandle()); /* reset BLAS scratch heap */
+#endif
+
if ( bli_3mh_is_enabled_dt( dt ) ) bli_syrk3mh_entry( alpha, a, beta, c );
else if ( bli_3m_is_enabled_dt( dt ) ) bli_syrk3m_entry( alpha, a, beta, c );
else if ( bli_4mh_is_enabled_dt( dt ) ) bli_syrk4mh_entry( alpha, a, beta, c );
index ab50f9b2dd3eb3acf7ffb8b715b9cdf60e50b78f..047cf2dbcdd432f0287666a27918a8ac0820fba3 100644 (file)
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
+ */
#include "blis.h"
+
+#define CLOCK 1.2 // In GHz
+#define OPS_PER_CYCLE_S 16
+#define OPS_PER_CYCLE_D 4
+#define NUM_THREADS 8
+
void bli_syrk_front( obj_t* alpha,
- obj_t* a,
- obj_t* beta,
- obj_t* c,
- gemm_t* cntl )
+ obj_t* a,
+ obj_t* beta,
+ obj_t* c,
+ gemm_t* cntl )
{
obj_t a_local;
obj_t at_local;
obj_t c_local;
+#ifdef BLIS_ENABLE_PROFILE
+ dim_t m_profile, k_profile;
+ extern profile_data_t *bli_herk_profile_data;
+#endif
+
+#ifdef BLIS_ENABLE_PROFILE
+ m_profile = bli_obj_length( *c );
+ k_profile = bli_obj_width_after_trans( *a );
+#endif
+
// Check parameters.
if ( bli_error_checking_is_enabled() )
bli_syrk_check( alpha, a, beta, c );
// prefers contiguous rows, transpose the entire operation to allow the
// micro-kernel to access elements of C in its preferred manner.
if (
- ( bli_obj_is_row_stored( c_local ) &&
- bli_func_prefers_contig_cols( bli_obj_datatype( c_local ),
- cntl_gemm_ukrs( cntl ) ) ) ||
- ( bli_obj_is_col_stored( c_local ) &&
- bli_func_prefers_contig_rows( bli_obj_datatype( c_local ),
- cntl_gemm_ukrs( cntl ) ) )
- )
+ ( bli_obj_is_row_stored( c_local ) &&
+ bli_func_prefers_contig_cols( bli_obj_datatype( c_local ),
+ cntl_gemm_ukrs( cntl ) ) ) ||
+ ( bli_obj_is_col_stored( c_local ) &&
+ bli_func_prefers_contig_rows( bli_obj_datatype( c_local ),
+ cntl_gemm_ukrs( cntl ) ) )
+ )
{
bli_obj_induce_trans( c_local );
}
-
- herk_thrinfo_t** infos = bli_create_herk_thrinfo_paths();
- dim_t n_threads = thread_num_threads( infos[0] );
-
- // Invoke the internal back-end.
- bli_level3_thread_decorator( n_threads,
- (level3_int_t) bli_herk_int,
- alpha,
- &a_local,
- &at_local,
- beta,
- &c_local,
- (void*) cntl,
- (void**) infos );
-
- bli_herk_thrinfo_free_paths( infos, n_threads );
+
+ herk_thrinfo_t** infos = bli_create_herk_thrinfo_paths();
+ dim_t n_threads = thread_num_threads( infos[0] );
+
+#ifdef BLIS_ENABLE_PROFILE
+ bli_herk_profile_data = bli_profile_data_init(BLIS_MAX_NUM_THREADS*BLIS_PROFILE_NUM_REPORTS);
+#endif
+
+ // Invoke the internal back-end.
+ bli_level3_thread_decorator( n_threads,
+ (level3_int_t) bli_herk_int,
+ alpha,
+ &a_local,
+ &at_local,
+ beta,
+ &c_local,
+ (void*) cntl,
+ (void**) infos );
+
+#ifdef BLIS_ENABLE_PROFILE
+ bli_profile_data_print (bli_herk_profile_data,
+ m_profile, m_profile, k_profile,
+ bli_obj_execution_datatype( *c ), 1.0, n_threads);
+
+ bli_profile_data_free(bli_herk_profile_data);
+#endif
+
+ bli_herk_thrinfo_free_paths( infos, n_threads );
}
index 7037876bb62b390818e6167574b8db26b160f5cc..2257fbbc079db3f7681a571282de539df08c7576 100644 (file)
{
num_t dt = bli_obj_datatype( *b );
+#if defined (BLIS_ENABLE_C66X_BUILD)
+ lib_smem_sreset(blasGetMemHandle()); /* reset BLAS scratch heap */
+#endif
+
if ( bli_3m_is_enabled_dt( dt ) ) bli_trmm3m_entry( side, alpha, a, b );
else if ( bli_4m_is_enabled_dt( dt ) ) bli_trmm4m_entry( side, alpha, a, b );
else bli_trmm_entry( side, alpha, a, b );
index 02217c237fe1c73002dad610fb4ada515cf7da8b..32ae4af9930bb8d4e0a8215f9644467f5a9b7429 100644 (file)
bli_obj_width_after_trans( *a );
dim_t start, end;
- bli_get_range_weighted( thread, offA, m_trans,
- bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
- bli_obj_is_upper( *c ), &start, &end );
+// bli_get_range_weighted( thread, offA, m_trans,
+// bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
+// bli_obj_is_upper( *c ), &start, &end );
+
+#if 0
+ dim_t n_trans;
+ n_trans = bli_obj_width_after_trans(*a);
+
+ {
+ dim_t At, Ar, X;
+ dim_t num_threads_At, num_threads_Ar;
+ At = ceil(n_trans*n_trans/2);
+ Ar = bli_max(0, m_trans - n_trans)*n_trans;
+ X = ceil(Ar/At);
+
+ printf("offA = %d, m_trans = %d, n_trans = %d At = %d, Ar = %d, X = %d\n", offA, m_trans, n_trans, At, Ar, X);
+
+ if (X > 0)
+ {
+ num_threads_At = thread->n_way / (1 + X);
+ num_threads_Ar = thread->n_way - num_threads_At;
+
+ printf("offA = %d, m_trans = %d, n_trans = %d At = %d, Ar = %d, X = %d, num_threads_At = %d, num_threads_Ar = %d\n", offA, m_trans, n_trans, At, Ar, X, num_threads_At, num_threads_Ar);
+
+ if(thread->work_id < num_threads_At)
+ {
+ dim_t all_start = offA;
+ dim_t all_end = offA + n_trans;
+ dim_t block_factor = bli_determine_reg_blocksize( a, cntl_blocksize( cntl ));
+ uplo_t uplo = bli_obj_root_uplo( *a );
+ bool_t handle_edge_low = FALSE;
+ dim_t n_way = num_threads_At;
+ dim_t work_id = thread->work_id;
+ dim_t size = all_end - all_start; // partitioning only the triangular part
+ dim_t width;
+ dim_t block_fac_leftover = size % block_factor;
+ dim_t i;
+ double num;
+
+ bli_toggle_uplo(uplo);
+
+ //printf("Triangle: work_id = %d \n", thread->work_id);
+
+ start = 0;
+ end = all_end - all_start;
+ num = size * size / ( double )n_way;
+
+ for ( i = 0; TRUE; ++i )
+ {
+ width = ceil( sqrt( start * start + num ) ) - start;
+
+ if ( i == 0 && handle_edge_low )
+ {
+ if ( width % block_factor != block_fac_leftover )
+ width += block_fac_leftover - ( width % block_factor );
+ }
+ else
+ {
+ if ( width % block_factor != 0 )
+ width += block_factor - ( width % block_factor );
+ }
+
+ if ( work_id == 0 )
+ {
+ start = start + all_start;
+ end = bli_min( start + width, all_end );
+ break;
+ }
+ else
+ {
+ start = start + width;
+ work_id--;
+ }
+ }
+
+ }
+ else
+ {
+ dim_t all_start = offA + n_trans;
+ dim_t all_end = m_trans;
+ dim_t block_factor = bli_determine_reg_blocksize( a, cntl_blocksize( cntl ));
+ bool_t handle_edge_low = FALSE;
+
+ dim_t n_way = num_threads_Ar;
+ dim_t work_id = thread->work_id - num_threads_At;
+
+ dim_t size = all_end - all_start;
+
+ dim_t n_bf_whole = size / block_factor;
+ dim_t n_bf_left = size % block_factor;
+
+ dim_t n_bf_lo = n_bf_whole / n_way;
+ dim_t n_bf_hi = n_bf_whole / n_way;
+
+ //printf("Rectangle: work_id = %d \n", thread->work_id);
+
+
+ if ( handle_edge_low == FALSE )
+ {
+ // Notice that if all threads receive the same number of
+ // block_factors, those threads are considered "high" and
+ // the "low" thread group is empty.
+ dim_t n_th_lo = n_bf_whole % n_way;
+ //dim_t n_th_hi = n_way - n_th_lo;
+
+
+ // If some partitions must have more block_factors than others
+ // assign the slightly larger partitions to lower index threads.
+ if ( n_th_lo != 0 ) n_bf_lo += 1;
+
+ // Compute the actual widths (in units of rows/columns) of
+ // individual threads in the low and high groups.
+ dim_t size_lo = n_bf_lo * block_factor;
+ dim_t size_hi = n_bf_hi * block_factor;
+
+ // Precompute the starting indices of the low and high groups.
+ dim_t lo_start = all_start;
+ dim_t hi_start = all_start + n_th_lo * size_lo;
+
+ // Compute the start and end of individual threads' ranges
+ // as a function of their work_ids and also the group to which
+ // they belong (low or high).
+ if ( work_id < n_th_lo )
+ {
+ start = lo_start + (work_id ) * size_lo;
+ end = lo_start + (work_id+1) * size_lo;
+ }
+ else // if ( n_th_lo <= work_id )
+ {
+ start = hi_start + (work_id-n_th_lo ) * size_hi;
+ end = hi_start + (work_id-n_th_lo+1) * size_hi;
+
+ // Since the edge case is being allocated to the high
+ // end of the index range, we have to advance the last
+ // thread's end.
+ if ( work_id == n_way - 1 ) end += n_bf_left;
+ }
+ }
+ else // if ( handle_edge_low == TRUE )
+ {
+ // Notice that if all threads receive the same number of
+ // block_factors, those threads are considered "low" and
+ // the "high" thread group is empty.
+ dim_t n_th_hi = n_bf_whole % n_way;
+ dim_t n_th_lo = n_way - n_th_hi;
+
+ // If some partitions must have more block_factors than others
+ // assign the slightly larger partitions to higher index threads.
+ if ( n_th_hi != 0 ) n_bf_hi += 1;
+
+ // Compute the actual widths (in units of rows/columns) of
+ // individual threads in the low and high groups.
+ dim_t size_lo = n_bf_lo * block_factor;
+ dim_t size_hi = n_bf_hi * block_factor;
+
+ // Precompute the starting indices of the low and high groups.
+ dim_t lo_start = all_start;
+ dim_t hi_start = all_start + n_th_lo * size_lo
+ + n_bf_left;
+
+ // Compute the start and end of individual threads' ranges
+ // as a function of their work_ids and also the group to which
+ // they belong (low or high).
+ if ( work_id < n_th_lo )
+ {
+ start = lo_start + (work_id ) * size_lo;
+ end = lo_start + (work_id+1) * size_lo;
+
+ // Since the edge case is being allocated to the low
+ // end of the index range, we have to advance the
+ // starts/ends accordingly.
+ if ( work_id == 0 ) end += n_bf_left;
+ else { start += n_bf_left;
+ end += n_bf_left; }
+ }
+ else // if ( n_th_lo <= work_id )
+ {
+ start = hi_start + (work_id-n_th_lo ) * size_hi;
+ end = hi_start + (work_id-n_th_lo+1) * size_hi;
+ }
+ }
+ }
+ }
+ else
+ {
+ bli_get_range_weighted_t2b( thread, offA, m_trans,
+ bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
+ bli_obj_root_uplo( *a ), &start, &end );
+ }
+ }
+#else
+ bli_get_range_weighted_t2b( thread, offA, m_trans,
+ bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
+ bli_obj_root_uplo( *a ), &start, &end );
+#endif
+
#ifdef BLIS_ENABLE_C66X_EDMA
if(start<end)
bli_obj_release_pack( b_pack, cntl_sub_packm_b( cntl ));
#ifdef BLIS_ENABLE_C66X_EDMA
bli_obj_release_dma( b_dma, cntl_sub_dmam_b( cntl ) );
- bli_obj_release_edma_handle( b_dma );
+ bli_obj_release_emt_handle( b_dma );
#endif
}
if( thread_am_ichief( thread ) )
#ifdef BLIS_ENABLE_C66X_EDMA
bli_obj_release_dma( a1_dma, cntl_sub_dmam_a( cntl ) );
bli_obj_release_dma( c1_dma, cntl_sub_dmam_c( cntl ) );
- bli_obj_release_edma_handle( a1_dma );
- bli_obj_release_edma_handle( c1_dma );
+ bli_obj_release_emt_handle( a1_dma );
+ bli_obj_release_emt_handle( c1_dma );
#endif
}
}
index 3a4716b036d82d0a713b1a76ad86e825f9e24c81..e4e696b93d6ad1edc6751df54e91bad34478bc18 100644 (file)
// Query dimension in partitioning direction.
n_trans = bli_obj_width_after_trans( *b );
dim_t start, end;
- bli_get_range_weighted( thread, 0, n_trans,
- bli_determine_reg_blocksize( b, cntl_blocksize( cntl ) ),
- bli_obj_is_upper( *c ), &start, &end );
+// bli_get_range_weighted( thread, 0, n_trans,
+// bli_determine_reg_blocksize( b, cntl_blocksize( cntl ) ),
+// bli_obj_is_upper( *c ), &start, &end );
+
+ bli_get_range_weighted_r2l( thread, 0, n_trans,
+ bli_determine_reg_blocksize( b, cntl_blocksize( cntl ) ),
+ bli_obj_root_uplo( *b ), &start, &end );
// Partition along the n dimension.
for ( i = start; i < end; i += b_alg )
index c341cb449da9c9228f9c77577faf2a42c32c6b2d..6ddd1dc543a181ff5f927bbc688e83e385b9d192 100644 (file)
// Query dimension in partitioning direction.
n_trans = bli_obj_width_after_trans( *b );
dim_t start, end;
- bli_get_range_weighted( thread, 0, n_trans,
- bli_determine_reg_blocksize( b, cntl_blocksize( cntl ) ),
- bli_obj_is_lower( *c ), &start, &end );
+// bli_get_range_weighted( thread, 0, n_trans,
+// bli_determine_reg_blocksize( b, cntl_blocksize( cntl ) ),
+// bli_obj_is_lower( *c ), &start, &end );
+
+ bli_get_range_weighted_l2r( thread, 0, n_trans,
+ bli_determine_reg_blocksize( b, cntl_blocksize( cntl ) ),
+ bli_obj_root_uplo( *b ), &start, &end );
// Partition along the n dimension.
for ( i = start; i < end; i += b_alg )
index eee25c115fbeb4169b133459a7bc223cc35a64ed..f50a2b2ebd2d4a10f5a9617ea3b3691a98e823fc 100644 (file)
#ifdef BLIS_ENABLE_C66X_EDMA
bli_obj_release_dma( a1_dma, cntl_sub_dmam_a(cntl) );
bli_obj_release_dma( b1_dma, cntl_sub_dmam_b(cntl) );
- bli_obj_release_edma_handle( a1_dma );
- bli_obj_release_edma_handle( b1_dma );
+ bli_obj_release_emt_handle( a1_dma );
+ bli_obj_release_emt_handle( b1_dma );
#endif
bli_obj_release_pack( a1_pack, cntl_sub_packm_a( cntl ) );
bli_obj_release_pack( b1_pack, cntl_sub_packm_b( cntl ) );
index b14394b0ff821162b499b3640c80554ba693cca7..0d7cee4021a8400a74038e7a749f339f00fe7aed 100644 (file)
#ifdef BLIS_ENABLE_C66X_EDMA
bli_obj_release_dma( a1_dma , cntl_sub_dmam_a(cntl) );
bli_obj_release_dma( b1_dma , cntl_sub_dmam_b(cntl) );
- bli_obj_release_edma_handle( a1_dma );
- bli_obj_release_edma_handle( b1_dma );
+ bli_obj_release_emt_handle( a1_dma );
+ bli_obj_release_emt_handle( b1_dma );
#endif
bli_obj_release_pack( a1_pack, cntl_sub_packm_a(cntl) );
bli_obj_release_pack( b1_pack, cntl_sub_packm_b(cntl) );
index 61bbfa4612dbcdd8e0328ab6d32c1f0be2fe90af..39a9815954edab4255f95464dc76ced443aa55cc 100644 (file)
#include "blis.h"
+
+#ifdef BLIS_ENABLE_PROFILE
+ profile_data_t *bli_trmm_profile_data;
+#endif
+
void bli_trmm_front( side_t side,
obj_t* alpha,
obj_t* a,
obj_t b_local;
obj_t c_local;
+#ifdef BLIS_ENABLE_PROFILE
+ dim_t mn_side, m_profile, n_profile;
+#endif
+
+#ifdef BLIS_ENABLE_PROFILE
+ m_profile = bli_obj_length( *b );
+ n_profile = bli_obj_width( *b );
+ bli_set_dim_with_side( side, m_profile, n_profile, mn_side );
+#endif
+
// Check parameters.
if ( bli_error_checking_is_enabled() )
bli_trmm_check( side, alpha, a, b );
trmm_thrinfo_t** infos = bli_create_trmm_thrinfo_paths( bli_is_right( side ) );
dim_t n_threads = thread_num_threads( infos[0] );
+#ifdef BLIS_ENABLE_PROFILE
+ bli_trmm_profile_data = bli_profile_data_init(BLIS_MAX_NUM_THREADS*7);
+#endif
// Invoke the internal back-end.
bli_level3_thread_decorator( n_threads,
(level3_int_t) bli_trmm_int,
(void**) infos );
bli_trmm_thrinfo_free_paths( infos, n_threads );
+
+#ifdef BLIS_ENABLE_PROFILE
+ {
+ bli_profile_data_print (bli_trmm_profile_data,
+ m_profile, n_profile, mn_side,
+ bli_obj_execution_datatype( *b ), 1, n_threads);
+
+
+ bli_profile_data_free(bli_trmm_profile_data);
+ }
+
+#endif
}
index 4878aefea5661b0ddec9377b697b71622d182bef..71f38f9ed094affc089595d45f112145dba737cb 100644 (file)
impl_t i;
FUNCPTR_T f;
+#if defined(BLIS_ENABLE_PROFILE)
+ volatile uint64_t counter_start;
+ volatile uint64_t counter_end;
+ extern profile_data_t *bli_trmm_profile_data;
+ dim_t m_var, k_var, n_var;
+ dim_t index;
+#endif
+
// Check parameters.
if ( bli_error_checking_is_enabled() )
bli_trmm_int_check( alpha, a, b, beta, c, cntl );
// Index into the variant array to extract the correct function pointer.
f = vars[side][uplo][n][i];
+#if defined(BLIS_ENABLE_PROFILE)
+ m_var = bli_obj_length( *b );
+ n_var = bli_obj_width( *b );
+ bli_set_dim_with_side( side, m_var, n_var, k_var );
+
+#if defined(BLIS_ENABLE_C66X_BUILD)
+ TSCL = 0;
+ counter_start = lib_clock64();
+#else
+ counter_start = (uint64_t) (bli_clock()*1.2e9);
+#endif
+#endif
+
// Invoke the variant.
f( &a_local,
&b_local,
&c_local,
cntl,
thread );
+
+#if defined(BLIS_ENABLE_PROFILE)
+#if defined(BLIS_ENABLE_C66X_BUILD)
+ counter_end = lib_clock64();
+#else
+ counter_end = (uint64_t) (bli_clock()*1.2e9);
+#endif
+ bli_profile_get_index(n, i, index);
+
+ bli_profile_data_update(bli_trmm_profile_data[index], (counter_end-counter_start), m_var*k_var*n_var);
+#endif
}
index 26a0fced5bc241edf403bf94e301abb802619f02..883ae2a71980718fd6a75e4a8fb2bc3f337991e0 100644 (file)
ctype *cNew0, *cNew1, *cNew2, *cNewTemp; \
\
/*EDMA Declarations */ \
- EdmaMgr_Handle edma_handle_b = NULL; \
- EdmaMgr_Handle edma_handle_c0 = NULL; \
- EdmaMgr_Handle edma_handle_c1 = NULL; \
-\
+ lib_emt_Handle emt_handle_b = NULL; \
+ lib_emt_Handle emt_handle_c0 = NULL; \
+ lib_emt_Handle emt_handle_c1 = NULL; \
+\
+ /*For DSP timing*/ \
+ volatile uint64_t counter_start_ker, counter_start_nr, counter_start_mr; \
+ volatile uint64_t counter_end_ker, counter_end_nr, counter_end_mr; \
+ extern profile_data_t *bli_trmm_profile_data; \
/*
Assumptions/assertions:
rs_a == 1
cNew2 = bli_mem_buffer( &c2_L2_mem ); \
\
/*Acquiring an EDMA handle from the pool*/ \
- bli_dma_channel_acquire(&(edma_handle_b), CSL_chipReadDNUM()); \
- if(edma_handle_b == NULL) \
+ bli_dma_channel_acquire(&(emt_handle_b), lib_get_coreID()); \
+ if(emt_handle_b == NULL) \
{ \
- printf("ker_var2 Failed to alloc edma handle CoreID %d \n", CSL_chipReadDNUM()); \
+ printf("ker_var2 Failed to alloc edma handle CoreID %d \n", lib_get_coreID()); \
} \
/*Acquiring an EDMA handle from the pool*/ \
- bli_dma_channel_acquire(&(edma_handle_c0), CSL_chipReadDNUM()); \
- if(edma_handle_c0 == NULL) \
+ bli_dma_channel_acquire(&(emt_handle_c0), lib_get_coreID()); \
+ if(emt_handle_c0 == NULL) \
{ \
- printf("ker_var2 Failed to alloc edma handle for C0 CoreID %d \n", CSL_chipReadDNUM()); \
+ printf("ker_var2 Failed to alloc edma handle for C0 CoreID %d \n", lib_get_coreID()); \
} \
/*Acquiring an EDMA handle from the pool*/ \
- bli_dma_channel_acquire(&(edma_handle_c1), CSL_chipReadDNUM()); \
- if(edma_handle_c1 == NULL) \
+ bli_dma_channel_acquire(&(emt_handle_c1), lib_get_coreID()); \
+ if(emt_handle_c1 == NULL) \
+ { \
+ printf("ker_var2 Failed to alloc edma handle for C1 CoreID %d \n", lib_get_coreID()); \
+ } \
+\
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
{ \
- printf("ker_var2 Failed to alloc edma handle for C1 CoreID %d \n", CSL_chipReadDNUM()); \
+ counter_start_nr = lib_clock64(); \
} \
\
n_cur = ( bli_is_not_edge_f( 0, n_iter, n_left ) ? NR : n_left ); \
/* Transfering MC(=m)xNR*/ \
if (cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
{ \
- EdmaMgr_copy2D2DSep(edma_handle_c0, c1, \
+ lib_emt_copy2D2D(emt_handle_c0, c1, \
cNew1, m*sizeof(ctype), \
n_cur, cs_c*sizeof(ctype), cs_c11*sizeof(ctype)); \
}\
b2 = b1; \
\
/*Does not work when transfer k*n_cur*sizeof(ctype)*/ \
- EdmaMgr_copy1D1D(edma_handle_b, b1, b1_L1, k*NR*sizeof(ctype)); \
+ lib_emt_copy1D1D(emt_handle_b, b1, b1_L1, k*NR*sizeof(ctype)); \
\
- EdmaMgr_wait(edma_handle_c0); \
+ lib_emt_wait(emt_handle_c0); \
if(j < n_iter-1) /* no transfer for last iteration*/ \
{ \
if (cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
{ \
- EdmaMgr_copy2D2DSep(edma_handle_c0, c1+cstep_c, \
+ lib_emt_copy2D2D(emt_handle_c0, c1+cstep_c, \
cNew0, m*sizeof(ctype), \
n_next, cs_c*sizeof(ctype), \
cs_c11*sizeof(ctype)); \
} \
} \
c11 = cNew1;/*c11 = c1;*/ \
+\
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_mr = lib_clock64(); \
+ } \
+\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
*/ \
if (bli_intersects_diag_n( diagoffa_i, MR, k )) \
{ \
- idma1_setup(a2_L1, a1, bli_min( diagoffa_i + MR, k )*MR*sizeof(ctype), 0, 0, 7); \
+ lib_imt_copy(a1, a2_L1, bli_min( diagoffa_i + MR, k )*MR*sizeof(ctype)); \
}\
else \
{ \
- idma1_setup(a2_L1, a1, k*MR*sizeof(ctype), 0, 0, 7); \
+ lib_imt_copy(a1, a2_L1, k*MR*sizeof(ctype)); \
} \
} \
/* If the current panel of A intersects the diagonal, scale C
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + ps_a_cur; \
- while(!idma1_done()){;} \
+ lib_imt_wait(); \
if(i == 0) \
{ \
- EdmaMgr_wait(edma_handle_b);\
+ lib_emt_wait(emt_handle_b);\
} \
temp = a1_L1; \
a1_L1 = a2_L1; \
b2 = b_cast; \
} \
else \
- idma1_setup(a2_L1, a2, k_next*MR*sizeof(ctype), 0, 0, 7); \
+ lib_imt_copy(a2, a2_L1, k_next*MR*sizeof(ctype)); \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( PACKMR * k_a1011, aux ); \
\
/* Handle interior and edge cases separately. */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_ker = lib_clock64(); \
+ } \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
ct, rs_ct, cs_ct, \
c11, rs_c11, cs_c11/* rs_c, cs_c */); \
} \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_ker = lib_clock64(); \
+ bli_profile_data_update(bli_trmm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_KER_LOOP_IND], \
+ (counter_end_ker-counter_start_ker), 2*k_a1011*m_cur*n_cur); \
+ } \
} /*if ( trmm_l_ir_my_iter( i, ir_thread ) ) */ \
\
a1 += ps_a_cur; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + rstep_a; \
- while(!idma1_done()){;} \
+ lib_imt_wait(); \
if(i == 0) \
{ \
- EdmaMgr_wait(edma_handle_b);\
+ lib_emt_wait(emt_handle_b);\
} \
temp = a1_L1; \
a1_L1 = a2_L1; \
b2 = b_cast; \
} \
else \
- idma1_setup(a2_L1, a2, k_next*MR*sizeof(ctype), 0, 0, 7); \
+ lib_imt_copy(a2, a2_L1, k_next*MR*sizeof(ctype)); \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( istep_a, aux ); \
\
/* Handle interior and edge cases separately. */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_ker = lib_clock64(); \
+ } \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
ct, rs_ct, cs_ct, \
c11, rs_c11, cs_c11 /*rs_c, cs_c */); \
} \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_ker = lib_clock64(); \
+ bli_profile_data_update(bli_trmm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_KER_LOOP_IND],\
+ (counter_end_ker-counter_start_ker), 2*k*m_cur*n_cur); \
+ } \
} /*if ( trmm_l_ir_my_iter( i, ir_thread ) )*/ \
\
a1 += rstep_a; \
/*c11 += rstep_c;*/ \
c11 += rstep_c11; \
} /*for ( i = 0; i < m_iter; ++i )*/\
+\
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_mr = lib_clock64(); \
+ bli_profile_data_update(bli_trmm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_IR_LOOP_IND],\
+ (counter_end_mr-counter_start_mr), 2*k*m*n_cur); \
+ } \
+\
/* circularly shift buffers */ \
cNewTemp = cNew0; \
cNew0 = cNew2; \
cNew1 = cNewTemp; \
if(j != 0) /* wait for save c to complete; skip first iteration */ \
{ \
- EdmaMgr_wait(edma_handle_c1); \
+ lib_emt_wait(emt_handle_c1); \
} \
/* save updated c*/ \
if (cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
- EdmaMgr_copy2D2DSep(edma_handle_c1, cNew2, c1, m*sizeof(ctype), \
+ lib_emt_copy2D2D(emt_handle_c1, cNew2, c1, m*sizeof(ctype), \
n_cur, cs_c11*sizeof(ctype), cs_c*sizeof(ctype)); \
else \
{ \
} \
\
} \
+\
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_nr = lib_clock64(); \
+ bli_profile_data_update(bli_trmm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_JR_LOOP_IND],\
+ (counter_end_nr-counter_start_nr), 2*k*m*n); \
+ } \
\
bli_mem_release( &c2_L2_mem ); \
bli_mem_release( &c1_L2_mem ); \
bli_mem_release( &a2_L1_mem ); \
bli_mem_release( &a1_L1_mem ); \
bli_mem_release( &b1_L1_mem ); \
- if ( edma_handle_b != NULL ) \
+ if ( emt_handle_b != NULL ) \
{ \
- bli_dma_channel_release(edma_handle_b, CSL_chipReadDNUM()); \
- edma_handle_b = NULL; \
+ bli_dma_channel_release(emt_handle_b, lib_get_coreID()); \
+ emt_handle_b = NULL; \
} \
- if ( edma_handle_c0 != NULL ) \
+ if ( emt_handle_c0 != NULL ) \
{ \
- bli_dma_channel_release(edma_handle_c0, CSL_chipReadDNUM()); \
- edma_handle_c0 = NULL; \
+ bli_dma_channel_release(emt_handle_c0, lib_get_coreID()); \
+ emt_handle_c0 = NULL; \
} \
- if ( edma_handle_c1 != NULL ) \
+ if ( emt_handle_c1 != NULL ) \
{ \
- EdmaMgr_wait(edma_handle_c1); /* wait for save c to complete */ \
- bli_dma_channel_release(edma_handle_c1, CSL_chipReadDNUM()); \
- edma_handle_c1 = NULL; \
+ lib_emt_wait(emt_handle_c1); /* wait for save c to complete */ \
+ bli_dma_channel_release(emt_handle_c1, lib_get_coreID()); \
+ emt_handle_c1 = NULL; \
} \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: a1", MR, k_a1011, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: b1", k_a1011, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
index f0a555b7a4f28758c899b1af33857311c3be1797..e5c36e417d8d6bf4ce641549c3e5d72c9073f76e 100644 (file)
ctype *cNew0, *cNew1, *cNew2, *cNewTemp; \
\
/*EDMA Declarations */ \
- EdmaMgr_Handle edma_handle_b = NULL; \
- EdmaMgr_Handle edma_handle_c0 = NULL; \
- EdmaMgr_Handle edma_handle_c1 = NULL; \
+ lib_emt_Handle emt_handle_b = NULL; \
+ lib_emt_Handle emt_handle_c0 = NULL; \
+ lib_emt_Handle emt_handle_c1 = NULL; \
+\
+ /*For DSP timing*/ \
+ volatile uint64_t counter_start_ker, counter_start_nr, counter_start_mr; \
+ volatile uint64_t counter_end_ker, counter_end_nr, counter_end_mr; \
+ extern profile_data_t *bli_trmm_profile_data; \
\
/*
Assumptions/assertions:
cNew2 = bli_mem_buffer( &c2_L2_mem ); \
\
/*Acquiring an EDMA handle from the pool*/ \
- bli_dma_channel_acquire(&(edma_handle_b), CSL_chipReadDNUM()); \
- if(edma_handle_b == NULL) \
+ bli_dma_channel_acquire(&(emt_handle_b), lib_get_coreID()); \
+ if(emt_handle_b == NULL) \
{ \
- printf("ker_var2 Failed to alloc edma handle CoreID %d \n", CSL_chipReadDNUM()); \
+ printf("ker_var2 Failed to alloc edma handle CoreID %d \n", lib_get_coreID()); \
} \
/*Acquiring an EDMA handle from the pool*/ \
- bli_dma_channel_acquire(&(edma_handle_c0), CSL_chipReadDNUM()); \
- if(edma_handle_c0 == NULL) \
+ bli_dma_channel_acquire(&(emt_handle_c0), lib_get_coreID()); \
+ if(emt_handle_c0 == NULL) \
{ \
- printf("ker_var2 Failed to alloc edma handle for C0 CoreID %d \n", CSL_chipReadDNUM()); \
+ printf("ker_var2 Failed to alloc edma handle for C0 CoreID %d \n", lib_get_coreID()); \
} \
/*Acquiring an EDMA handle from the pool*/ \
- bli_dma_channel_acquire(&(edma_handle_c1), CSL_chipReadDNUM()); \
- if(edma_handle_c1 == NULL) \
+ bli_dma_channel_acquire(&(emt_handle_c1), lib_get_coreID()); \
+ if(emt_handle_c1 == NULL) \
{ \
- printf("ker_var2 Failed to alloc edma handle for C1 CoreID %d \n", CSL_chipReadDNUM()); \
+ printf("ker_var2 Failed to alloc edma handle for C1 CoreID %d \n", lib_get_coreID()); \
} \
+\
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_nr = lib_clock64(); \
+ } \
\
n_cur = ( bli_is_not_edge_f( 0, n_iter, n_left ) ? NR : n_left ); \
/* Loop over the n dimension (NR columns at a time). */ \
/* Transfering MC(=m)xNR*/ \
if (cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
{ \
- EdmaMgr_copy2D2DSep(edma_handle_c0, c1, \
+ lib_emt_copy2D2D(emt_handle_c0, c1, \
cNew1, m*sizeof(ctype), \
n_cur, cs_c*sizeof(ctype), cs_c11*sizeof(ctype)); \
} \
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
- EdmaMgr_copy1D1D(edma_handle_b, b1, b1_L1, k*NR*sizeof(ctype)); \
+ lib_emt_copy1D1D(emt_handle_b, b1, b1_L1, k*NR*sizeof(ctype)); \
\
- EdmaMgr_wait(edma_handle_c0); \
+ lib_emt_wait(emt_handle_c0); \
if(j < n_iter-1) /* no transfer for last iteration */ \
{ \
if (cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
{ \
- EdmaMgr_copy2D2DSep(edma_handle_c0, c1+cstep_c, \
+ lib_emt_copy2D2D(emt_handle_c0, c1+cstep_c, \
cNew0, m*sizeof(ctype), \
n_next, cs_c*sizeof(ctype), \
cs_c11*sizeof(ctype)); \
} \
} \
} \
+\
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_mr = lib_clock64(); \
+ } \
+\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
if(i == 0) \
{ \
if(bli_intersects_diag_n( diagoffa_i, MR, k )) \
- idma1_setup(a2_L1, a1, (k - diagoffa_i)*MR*sizeof(ctype), 0, 0, 7); \
+ lib_imt_copy(a1, a2_L1, (k - diagoffa_i)*MR*sizeof(ctype)); \
else \
- idma1_setup(a2_L1, a1, k*MR*sizeof(ctype), 0, 0, 7); \
+ lib_imt_copy(a1, a2_L1, k*MR*sizeof(ctype)); \
} \
\
/* If the current panel of A intersects the diagonal, scale C
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + ps_a_cur; \
- while(!idma1_done()){;} \
+ lib_imt_wait(); \
if(i == 0) \
{ \
- EdmaMgr_wait(edma_handle_b);\
+ lib_emt_wait(emt_handle_b);\
} \
temp = a1_L1; \
a1_L1 = a2_L1; \
} \
else \
{ \
- idma1_setup(a2_L1, a2, k_next*MR*sizeof(ctype), 0, 0, 7); \
+ lib_imt_copy(a2, a2_L1, k_next*MR*sizeof(ctype)); \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
/* Save the 4m/3m imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( PACKMR * k_a1112, aux ); \
+\
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_ker = lib_clock64(); \
+ } \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
ct, rs_ct, cs_ct, \
c11, rs_c11, cs_c11 /*rs_c, cs_c*/ ); \
} \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_ker = lib_clock64(); \
+ bli_profile_data_update(bli_trmm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_KER_LOOP_IND],\
+ (counter_end_ker-counter_start_ker), 2*k_a1112*m_cur*n_cur); \
+ } \
} /*if ( trmm_l_ir_my_iter( i, ir_thread ) )*/ \
\
a1 += ps_a_cur; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + rstep_a; \
- while(!idma1_done()){;} \
+ lib_imt_wait(); \
if(i == 0) \
{ \
- EdmaMgr_wait(edma_handle_b);\
+ lib_emt_wait(emt_handle_b);\
} \
temp = a1_L1; \
a1_L1 = a2_L1; \
b2 = b_cast; \
} \
else \
- idma1_setup(a2_L1, a2, k_next*MR*sizeof(ctype), 0, 0, 7); \
+ lib_imt_copy(a2, a2_L1, k_next*MR*sizeof(ctype)); \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
/* Save the 4m/3m imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( istep_a, aux ); \
+\
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_ker = lib_clock64(); \
+ } \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
ct, rs_ct, cs_ct, \
c11, rs_c11, cs_c11 /*rs_c, cs_c*/ ); \
} \
+\
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_ker = lib_clock64(); \
+ bli_profile_data_update(bli_trmm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_KER_LOOP_IND],\
+ (counter_end_ker-counter_start_ker), 2*k*m_cur*n_cur); \
+ } \
+\
} /*if ( trmm_l_ir_my_iter( i, ir_thread ) )*/\
\
a1 += rstep_a; \
c11 += rstep_c11; \
/*c11 += rstep_c;*/ \
} /*for ( i = 0; i < m_iter; ++i )*/\
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_mr = lib_clock64(); \
+ bli_profile_data_update(bli_trmm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_IR_LOOP_IND], \
+ (counter_end_mr-counter_start_mr), 2*k*m*n_cur); \
+ } \
/* circularly shift buffers */ \
cNewTemp = cNew0; \
cNew0 = cNew2; \
cNew1 = cNewTemp; \
if(j != 0) /* wait for save c to complete; skip first iteration */ \
{ \
- EdmaMgr_wait(edma_handle_c1); \
+ lib_emt_wait(emt_handle_c1); \
} \
/* save updated c*/ \
if (cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
{ \
- EdmaMgr_copy2D2DSep(edma_handle_c1, cNew2, c1, m*sizeof(ctype), \
+ lib_emt_copy2D2D(emt_handle_c1, cNew2, c1, m*sizeof(ctype), \
n_cur, cs_c11*sizeof(ctype), cs_c*sizeof(ctype)); \
} \
else \
b1 += cstep_b; \
c1 += cstep_c; \
} \
+\
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_nr = lib_clock64(); \
+ bli_profile_data_update(bli_trmm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_JR_LOOP_IND], \
+ (counter_end_nr-counter_start_nr), 2*k*m*n); \
+ } \
\
bli_mem_release( &c2_L2_mem ); \
bli_mem_release( &c1_L2_mem ); \
bli_mem_release( &a2_L1_mem ); \
bli_mem_release( &a1_L1_mem ); \
bli_mem_release( &b1_L1_mem ); \
- if ( edma_handle_b != NULL ) \
+ if ( emt_handle_b != NULL ) \
{ \
- bli_dma_channel_release(edma_handle_b, CSL_chipReadDNUM()); \
- edma_handle_b = NULL; \
+ bli_dma_channel_release(emt_handle_b, lib_get_coreID()); \
+ emt_handle_b = NULL; \
} \
- if ( edma_handle_c0 != NULL ) \
+ if ( emt_handle_c0 != NULL ) \
{ \
- bli_dma_channel_release(edma_handle_c0, CSL_chipReadDNUM()); \
- edma_handle_c0 = NULL; \
+ bli_dma_channel_release(emt_handle_c0, lib_get_coreID()); \
+ emt_handle_c0 = NULL; \
} \
- if ( edma_handle_c1 != NULL ) \
+ if ( emt_handle_c1 != NULL ) \
{ \
- EdmaMgr_wait(edma_handle_c1); /* wait for save c to complete */ \
- bli_dma_channel_release(edma_handle_c1, CSL_chipReadDNUM()); \
- edma_handle_c1 = NULL; \
+ lib_emt_wait(emt_handle_c1); /* wait for save c to complete */ \
+ bli_dma_channel_release(emt_handle_c1, lib_get_coreID()); \
+ emt_handle_c1 = NULL; \
} \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: b1", k_a1112, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
index b00b37905e649d861ad59263ce7494762d3454f0..1a102e40d6890459898860b55a22ba14ff31a9f0 100644 (file)
#define FUNCPTR_T gemm_fp
+#ifdef BLIS_ENABLE_PROFILE
+#define BLIS_ENABLE_PROFILE_KERVAR2 1
+#else
+#define BLIS_ENABLE_PROFILE_KERVAR2 0
+#endif
+
typedef void (*FUNCPTR_T)(
doff_t diagoffb,
pack_t schema_a,
func_t* gemm_ukrs;
void* gemm_ukr;
-
// Detach and multiply the scalars attached to A and B.
bli_obj_scalar_detach( a, &scalar_a );
bli_obj_scalar_detach( b, &scalar_b );
ctype *cNew0, *cNew1, *cNew2, *cNewTemp; \
\
/*EDMA Declarations */ \
- EdmaMgr_Handle edma_handle_b = NULL; \
- EdmaMgr_Handle edma_handle_c0 = NULL; \
- EdmaMgr_Handle edma_handle_c1 = NULL; \
+ lib_emt_Handle emt_handle_b = NULL; \
+ lib_emt_Handle emt_handle_c0 = NULL; \
+ lib_emt_Handle emt_handle_c1 = NULL; \
+\
+ /*For DSP timing*/ \
+ volatile uint64_t counter_start_ker, counter_start_nr, counter_start_mr; \
+ volatile uint64_t counter_end_ker, counter_end_nr, counter_end_mr; \
+ extern profile_data_t *bli_trmm_profile_data; \
\
/*
Assumptions/assertions:
cNew2 = bli_mem_buffer( &c2_L2_mem ); \
\
/*Acquiring an EDMA handle from the pool*/ \
- bli_dma_channel_acquire(&(edma_handle_b), CSL_chipReadDNUM()); \
- if(edma_handle_b == NULL) \
+ bli_dma_channel_acquire(&(emt_handle_b), lib_get_coreID()); \
+ if(emt_handle_b == NULL) \
{ \
- printf("ker_var2 Failed to alloc edma handle CoreID %d \n", CSL_chipReadDNUM()); \
+ printf("ker_var2 Failed to alloc edma handle CoreID %d \n", lib_get_coreID()); \
} \
/*Acquiring an EDMA handle from the pool*/ \
- bli_dma_channel_acquire(&(edma_handle_c0), CSL_chipReadDNUM()); \
- if(edma_handle_c0 == NULL) \
+ bli_dma_channel_acquire(&(emt_handle_c0), lib_get_coreID()); \
+ if(emt_handle_c0 == NULL) \
{ \
- printf("ker_var2 Failed to alloc edma handle for C0 CoreID %d \n", CSL_chipReadDNUM()); \
+ printf("ker_var2 Failed to alloc edma handle for C0 CoreID %d \n", lib_get_coreID()); \
} \
/*Acquiring an EDMA handle from the pool*/ \
- bli_dma_channel_acquire(&(edma_handle_c1), CSL_chipReadDNUM()); \
- if(edma_handle_c1 == NULL) \
+ bli_dma_channel_acquire(&(emt_handle_c1), lib_get_coreID()); \
+ if(emt_handle_c1 == NULL) \
{ \
- printf("ker_var2 Failed to alloc edma handle for C1 CoreID %d \n", CSL_chipReadDNUM()); \
+ printf("ker_var2 Failed to alloc edma handle for C1 CoreID %d \n", lib_get_coreID()); \
} \
\
n_cur = ( bli_is_not_edge_f( 0, n_iter, n_left ) ? NR : n_left ); \
/* Loop over the n dimension (NR columns at a time). */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_nr = lib_clock64(); \
+ } \
/* Transfering MC(=m)xNR*/ \
if (cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
{ \
- EdmaMgr_copy2D2DSep(edma_handle_c0, c1, \
+ lib_emt_copy2D2D(emt_handle_c0, c1, \
cNew1, m*sizeof(ctype), \
n_cur, cs_c*sizeof(ctype), cs_c11*sizeof(ctype)); \
} \
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
- EdmaMgr_copy1D1D(edma_handle_b, b1, b1_L1, k_b1121*NR*sizeof(ctype)); \
+ lib_emt_copy1D1D(emt_handle_b, b1, b1_L1, k_b1121*NR*sizeof(ctype)); \
\
- EdmaMgr_wait(edma_handle_c0); \
+ lib_emt_wait(emt_handle_c0); \
if(j < n_iter-1) /* no transfer for last iteration */ \
{ \
if (cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
{ \
- EdmaMgr_copy2D2DSep(edma_handle_c0, c1+cstep_c, \
+ lib_emt_copy2D2D(emt_handle_c0, c1+cstep_c, \
cNew0, m*sizeof(ctype), \
n_next, cs_c*sizeof(ctype), \
cs_c11*sizeof(ctype)); \
bli_auxinfo_set_is_b( PACKNR * k_b1121, aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_mr = lib_clock64(); \
+ } \
for ( i = 0; i < m_iter; ++i ) \
{ \
if ( trmm_r_ir_my_iter( i, ir_thread ) ) { \
\
if(i == 0) \
{ \
- idma1_setup(a2_L1, a1 + ( off_b1121 * PACKMR ) / off_scl, k_b1121*MR*sizeof(ctype), 0, 0, 7); \
+ lib_imt_copy(a1 + ( off_b1121 * PACKMR ) / off_scl, a2_L1, k_b1121*MR*sizeof(ctype)); \
} \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + rstep_a; \
- while(!idma1_done()){;} \
+ lib_imt_wait(); \
temp = a1_L1; \
a1_L1 = a2_L1; \
a2_L1 = temp; \
if(i == 0) \
{ \
- EdmaMgr_wait(edma_handle_b);\
+ lib_emt_wait(emt_handle_b);\
} \
/*a1_i = a1_L1 + ( off_b1121 * PACKMR ) / off_scl;*/ \
a1_i = a1_L1; \
else \
{ \
/*Start next panel*/ \
- idma1_setup(a2_L1, a2 + ( off_b1121 * PACKMR ) / off_scl, k_b1121*MR*sizeof(ctype), 0, 0, 7); \
+ lib_imt_copy(a2 + ( off_b1121 * PACKMR ) / off_scl, a2_L1, k_b1121*MR*sizeof(ctype)); \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
bli_auxinfo_set_next_b( b2, aux ); \
\
/* Handle interior and edge cases separately. */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ /* Start of micro-kernel timing: read the same clock source (lib_clock64) as counter_end_ker so the subtraction below is meaningful. */ \
+ counter_start_ker = lib_clock64(); \
+ } \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
ct, rs_ct, cs_ct, \
c11, rs_c11, cs_c11 ); \
} \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_ker = lib_clock64(); \
+ bli_profile_data_update(bli_trmm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_KER_LOOP_IND],\
+ (counter_end_ker-counter_start_ker),2*k_b1121*m_cur*n_cur); \
+ } \
} \
\
a1 += rstep_a; \
c11 += rstep_c11; \
} \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_mr = lib_clock64(); /* End of the IR (m) loop: same clock source as counter_start_mr, instead of the raw TSCL register. */ \
+ bli_profile_data_update(bli_trmm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_IR_LOOP_IND], \
+ (counter_end_mr-counter_start_mr), 2*k_b1121*m*n_cur); \
+ } \
} \
\
b1 += ps_b_cur; \
bli_auxinfo_set_is_b( istep_b, aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_mr = lib_clock64(); \
+ } \
for ( i = 0; i < m_iter; ++i ) \
{ \
if ( trmm_r_ir_my_iter( i, ir_thread ) ) { \
\
if(i == 0) \
{ \
- idma1_setup(a2_L1, a1, k_b1121*MR*sizeof(ctype), 0, 0, 7); \
+ lib_imt_copy(a1, a2_L1, k_b1121*MR*sizeof(ctype)); \
} \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + rstep_a; \
- while(!idma1_done()){;} \
+ lib_imt_wait(); \
temp = a1_L1; \
a1_L1 = a2_L1; \
a2_L1 = temp; \
if(i == 0) \
{ \
- EdmaMgr_wait(edma_handle_b);\
+ lib_emt_wait(emt_handle_b);\
} \
if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
{ \
else \
{ \
/*Start next panel*/ \
- idma1_setup(a2_L1, a2, k_b1121*MR*sizeof(ctype), 0, 0, 7); \
+ lib_imt_copy(a2, a2_L1, k_b1121*MR*sizeof(ctype)); \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
bli_auxinfo_set_next_b( b2, aux ); \
\
/* Handle interior and edge cases separately. */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_ker = lib_clock64(); \
+ } \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
ct, rs_ct, cs_ct, \
c11, rs_c11, cs_c11 ); \
} \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_ker = lib_clock64(); \
+ bli_profile_data_update(bli_trmm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_KER_LOOP_IND], \
+ (counter_end_ker-counter_start_ker), 2*k*m_cur*n_cur); \
+ /*printf("gemm %d %d %d %ld\n", MR, NR, k, (counter_end_ker-counter_start_ker));*/ \
+ } \
} /*if ( trmm_r_ir_my_iter( i, ir_thread ) )*/\
\
a1 += rstep_a; \
c11 += rstep_c11; \
} /*for i*/\
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_mr = lib_clock64(); \
+ bli_profile_data_update(bli_trmm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_IR_LOOP_IND], \
+ (counter_end_mr-counter_start_mr), 2*k*m*n_cur); \
+ } \
} /*j thread*/\
\
b1 += cstep_b; \
cNew1 = cNewTemp; \
if(j != 0) /* wait for save c to complete; skip first iteration */ \
{ \
- EdmaMgr_wait(edma_handle_c1); \
+ lib_emt_wait(emt_handle_c1); \
} \
/* save updated c*/ \
if (cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
{ \
- EdmaMgr_copy2D2DSep(edma_handle_c1, cNew2, c1, m*sizeof(ctype), \
+ lib_emt_copy2D2D(emt_handle_c1, cNew2, c1, m*sizeof(ctype), \
n_cur, cs_c11*sizeof(ctype), cs_c*sizeof(ctype)); \
} \
else \
} \
c1 += cstep_c; \
} \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_nr = lib_clock64(); \
+ bli_profile_data_update(bli_trmm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_JR_LOOP_IND], \
+ (counter_end_nr-counter_start_nr), 2*k*m*n); \
+ } \
\
bli_mem_release( &c2_L2_mem ); \
bli_mem_release( &c1_L2_mem ); \
bli_mem_release( &a2_L1_mem ); \
bli_mem_release( &a1_L1_mem ); \
bli_mem_release( &b1_L1_mem ); \
- if ( edma_handle_b != NULL ) \
+ if ( emt_handle_b != NULL ) \
{ \
- bli_dma_channel_release(edma_handle_b, CSL_chipReadDNUM()); \
- edma_handle_b = NULL; \
+ bli_dma_channel_release(emt_handle_b, lib_get_coreID()); \
+ emt_handle_b = NULL; \
} \
- if ( edma_handle_c0 != NULL ) \
+ if ( emt_handle_c0 != NULL ) \
{ \
- bli_dma_channel_release(edma_handle_c0, CSL_chipReadDNUM()); \
- edma_handle_c0 = NULL; \
+ bli_dma_channel_release(emt_handle_c0, lib_get_coreID()); \
+ emt_handle_c0 = NULL; \
} \
- if ( edma_handle_c1 != NULL ) \
+ if ( emt_handle_c1 != NULL ) \
{ \
- EdmaMgr_wait(edma_handle_c1); /* wait for save c to complete */ \
- bli_dma_channel_release(edma_handle_c1, CSL_chipReadDNUM()); \
- edma_handle_c1 = NULL; \
+ lib_emt_wait(emt_handle_c1); /* wait for save c to complete */ \
+ bli_dma_channel_release(emt_handle_c1, lib_get_coreID()); \
+ emt_handle_c1 = NULL; \
} \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: a1", MR, k_b1121, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: b1", k_b1121, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
index ee0a25cafd7661de90513e4c0cff2e90c092eb10..8d23b56e30ceaefa99c3b2d58cc11f52322c58aa 100644 (file)
ctype *cNew0, *cNew1, *cNew2, *cNewTemp; \
\
/*EDMA Declarations */ \
- EdmaMgr_Handle edma_handle_b = NULL; \
- EdmaMgr_Handle edma_handle_c0 = NULL; \
- EdmaMgr_Handle edma_handle_c1 = NULL; \
+ lib_emt_Handle emt_handle_b = NULL; \
+ lib_emt_Handle emt_handle_c0 = NULL; \
+ lib_emt_Handle emt_handle_c1 = NULL; \
+\
+ /*For DSP timing*/ \
+ volatile uint64_t counter_start_ker, counter_start_nr, counter_start_mr; \
+ volatile uint64_t counter_end_ker, counter_end_nr, counter_end_mr; \
+ extern profile_data_t *bli_trmm_profile_data; \
\
/*
Assumptions/assertions:
cNew2 = bli_mem_buffer( &c2_L2_mem ); \
\
/*Acquiring an EDMA handle from the pool*/ \
- bli_dma_channel_acquire(&(edma_handle_b), CSL_chipReadDNUM()); \
- if(edma_handle_b == NULL) \
+ bli_dma_channel_acquire(&(emt_handle_b), lib_get_coreID()); \
+ if(emt_handle_b == NULL) \
{ \
- printf("ker_var2 Failed to alloc edma handle CoreID %d \n", CSL_chipReadDNUM()); \
+ printf("ker_var2 Failed to alloc edma handle CoreID %d \n", lib_get_coreID()); \
} \
\
/*Acquiring an EDMA handle from the pool*/ \
- bli_dma_channel_acquire(&(edma_handle_c0), CSL_chipReadDNUM()); \
- if(edma_handle_c0 == NULL) \
+ bli_dma_channel_acquire(&(emt_handle_c0), lib_get_coreID()); \
+ if(emt_handle_c0 == NULL) \
{ \
- printf("ker_var2 Failed to alloc edma handle for C0 CoreID %d \n", CSL_chipReadDNUM()); \
+ printf("ker_var2 Failed to alloc edma handle for C0 CoreID %d \n", lib_get_coreID()); \
} \
/*Acquiring an EDMA handle from the pool*/ \
- bli_dma_channel_acquire(&(edma_handle_c1), CSL_chipReadDNUM()); \
- if(edma_handle_c1 == NULL) \
+ bli_dma_channel_acquire(&(emt_handle_c1), lib_get_coreID()); \
+ if(emt_handle_c1 == NULL) \
{ \
- printf("ker_var2 Failed to alloc edma handle for C1 CoreID %d \n", CSL_chipReadDNUM()); \
+ printf("ker_var2 Failed to alloc edma handle for C1 CoreID %d \n", lib_get_coreID()); \
} \
\
n_cur = ( bli_is_not_edge_f( 0, n_iter, n_left ) ? NR : n_left ); \
/* Loop over the n dimension (NR columns at a time). */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_nr = lib_clock64(); \
+ } \
/* Transfering MC(=m)xNR*/ \
if (cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
{ \
- EdmaMgr_copy2D2DSep(edma_handle_c0, c1, \
+ lib_emt_copy2D2D(emt_handle_c0, c1, \
cNew1, m*sizeof(ctype), \
n_cur, cs_c*sizeof(ctype), cs_c11*sizeof(ctype)); \
} \
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
- EdmaMgr_copy1D1D(edma_handle_b, b1, b1_L1, k_b0111*NR*sizeof(ctype)); \
+ lib_emt_copy1D1D(emt_handle_b, b1, b1_L1, k_b0111*NR*sizeof(ctype)); \
\
- EdmaMgr_wait(edma_handle_c0); \
+ lib_emt_wait(emt_handle_c0); \
if(j < n_iter-1) /* no transfer for last iteration */ \
{ \
if (cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
{ \
- EdmaMgr_copy2D2DSep(edma_handle_c0, c1+cstep_c, \
+ lib_emt_copy2D2D(emt_handle_c0, c1+cstep_c, \
cNew0, m*sizeof(ctype), \
n_next, cs_c*sizeof(ctype), \
cs_c11*sizeof(ctype)); \
bli_auxinfo_set_is_b( PACKNR * k_b0111, aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_mr = lib_clock64(); \
+ } \
for ( i = 0; i < m_iter; ++i ) \
{ \
if ( trmm_r_ir_my_iter( i, ir_thread ) ) { \
\
if(i == 0) \
{ \
- idma1_setup(a2_L1, a1 + ( off_b0111 * PACKMR ) / off_scl, k_b0111*MR*sizeof(ctype), 0, 0, 7); \
+ lib_imt_copy(a1 + ( off_b0111 * PACKMR ) / off_scl, a2_L1, k_b0111*MR*sizeof(ctype)); \
} \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + rstep_a; \
- while(!idma1_done()){;} \
+ lib_imt_wait(); \
temp = a1_L1; \
a1_L1 = a2_L1; \
a2_L1 = temp; \
if(i == 0) \
{ \
- EdmaMgr_wait(edma_handle_b);\
+ lib_emt_wait(emt_handle_b);\
} \
\
/*a1_i = a1_L1 + ( off_b0111 * PACKMR ) / off_scl;*/ \
else \
{ \
/*Start next panel*/ \
- idma1_setup(a2_L1, a2 + ( off_b0111 * PACKMR ) / off_scl, k_b0111*MR*sizeof(ctype), 0, 0, 7); \
+ lib_imt_copy(a2 + ( off_b0111 * PACKMR ) / off_scl, a2_L1, k_b0111*MR*sizeof(ctype)); \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
bli_auxinfo_set_next_b( b2, aux ); \
\
/* Handle interior and edge cases separately. */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_ker = lib_clock64(); \
+ } \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
ct, rs_ct, cs_ct, \
c11, rs_c11, cs_c11 ); \
} \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_ker = lib_clock64(); \
+ bli_profile_data_update(bli_trmm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_KER_LOOP_IND],\
+ (counter_end_ker-counter_start_ker), 2*k_b0111*m_cur*n_cur); \
+ } \
} \
\
a1 += rstep_a; \
c11 += rstep_c11; \
} \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_mr = lib_clock64(); \
+ bli_profile_data_update(bli_trmm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_IR_LOOP_IND], \
+ (counter_end_mr-counter_start_mr), 2*k_b0111*m*n_cur); \
+ } \
} \
\
b1 += ps_b_cur; \
bli_auxinfo_set_is_b( istep_b, aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_mr = lib_clock64(); \
+ } \
for ( i = 0; i < m_iter; ++i ) \
{ \
if ( trmm_r_ir_my_iter( i, ir_thread ) ) { \
\
if(i == 0) \
{ \
- idma1_setup(a2_L1, a1, k_b0111*MR*sizeof(ctype), 0, 0, 7); \
+ lib_imt_copy(a1, a2_L1, k_b0111*MR*sizeof(ctype)); \
} \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + rstep_a; \
- while(!idma1_done()){;} \
+ lib_imt_wait(); \
temp = a1_L1; \
a1_L1 = a2_L1; \
a2_L1 = temp; \
if(i == 0) \
{ \
- EdmaMgr_wait(edma_handle_b);\
+ lib_emt_wait(emt_handle_b);\
} \
\
if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
else \
{ \
/*Start next panel*/ \
- idma1_setup(a2_L1, a2, k_b0111*MR*sizeof(ctype), 0, 0, 7); \
+ lib_imt_copy(a2, a2_L1, k_b0111*MR*sizeof(ctype)); \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
bli_auxinfo_set_next_b( b2, aux ); \
\
/* Handle interior and edge cases separately. */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_ker = lib_clock64(); \
+ } \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
ct, rs_ct, cs_ct, \
c11, rs_c11, cs_c11 ); \
} \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_ker = lib_clock64(); \
+ bli_profile_data_update(bli_trmm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_KER_LOOP_IND], \
+ (counter_end_ker-counter_start_ker), 2*k*m_cur*n_cur); \
+ } \
} \
\
a1 += rstep_a; \
c11 += rstep_c11; \
} \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_mr = lib_clock64(); \
+ bli_profile_data_update(bli_trmm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_IR_LOOP_IND], \
+ (counter_end_mr-counter_start_mr), 2*k*m*n_cur); \
+ } \
} \
\
b1 += cstep_b; \
cNew1 = cNewTemp; \
if(j != 0) /* wait for save c to complete; skip first iteration */ \
{ \
- EdmaMgr_wait(edma_handle_c1); \
+ lib_emt_wait(emt_handle_c1); \
} \
/* save updated c*/ \
if (cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
{ \
- EdmaMgr_copy2D2DSep(edma_handle_c1, cNew2, c1, m*sizeof(ctype), \
+ lib_emt_copy2D2D(emt_handle_c1, cNew2, c1, m*sizeof(ctype), \
n_cur, cs_c11*sizeof(ctype), cs_c*sizeof(ctype)); \
}\
else \
\
c1 += cstep_c; \
} \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_nr = lib_clock64(); \
+ bli_profile_data_update(bli_trmm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_JR_LOOP_IND], \
+ (counter_end_nr-counter_start_nr), 2*k*m*n); \
+ } \
\
bli_mem_release( &c2_L2_mem ); \
bli_mem_release( &c1_L2_mem ); \
bli_mem_release( &a2_L1_mem ); \
bli_mem_release( &a1_L1_mem ); \
bli_mem_release( &b1_L1_mem ); \
- if ( edma_handle_b != NULL ) \
+ if ( emt_handle_b != NULL ) \
{ \
- bli_dma_channel_release(edma_handle_b, CSL_chipReadDNUM()); \
- edma_handle_b = NULL; \
+ bli_dma_channel_release(emt_handle_b, lib_get_coreID()); \
+ emt_handle_b = NULL; \
} \
- if ( edma_handle_c0 != NULL ) \
+ if ( emt_handle_c0 != NULL ) \
{ \
- bli_dma_channel_release(edma_handle_c0, CSL_chipReadDNUM()); \
- edma_handle_c0 = NULL; \
+ bli_dma_channel_release(emt_handle_c0, lib_get_coreID()); \
+ emt_handle_c0 = NULL; \
} \
- if ( edma_handle_c1 != NULL ) \
+ if ( emt_handle_c1 != NULL ) \
{ \
- EdmaMgr_wait(edma_handle_c1); /* wait for save c to complete */ \
- bli_dma_channel_release(edma_handle_c1, CSL_chipReadDNUM()); \
- edma_handle_c1 = NULL; \
+ lib_emt_wait(emt_handle_c1); /* wait for save c to complete */ \
+ bli_dma_channel_release(emt_handle_c1, lib_get_coreID()); \
+ emt_handle_c1 = NULL; \
} \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: a1", MR, k_b0111, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: b1", k_b0111, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
index c81b63c308040958c1b692b24d44d9e37da40193..d5b07ad152f378490c5a1862dc5f527677363a5c 100644 (file)
packm_thrinfo_t* ipackm,
trmm_thrinfo_t* sub_trmm )
{
- trmm_thrinfo_t* thread = ( trmm_thrinfo_t* ) bli_malloc( sizeof( trmm_thrinfo_t ) );
+ trmm_thrinfo_t* thread = ( trmm_thrinfo_t* ) bli_malloc_scratch( sizeof( trmm_thrinfo_t ) );
bli_setup_trmm_thrinfo_node( thread, ocomm, ocomm_id,
icomm, icomm_id,
n_way, work_id,
bli_packm_thrinfo_free( thread->opackm );
bli_packm_thrinfo_free( thread->ipackm );
bli_trmm_thrinfo_free( thread->sub_trmm );
- bli_free( thread );
+ bli_free_scratch( thread );
return;
}
{
for( int i = 0; i < num; i++)
bli_trmm_thrinfo_free( threads[i] );
- bli_free( threads );
+ bli_free_scratch( threads );
}
trmm_thrinfo_t** bli_create_trmm_thrinfo_paths( bool_t jc_dependency )
dim_t ic_nt = jr_way * ir_way;
dim_t jr_nt = ir_way;
dim_t ir_nt = 1;
-
- trmm_thrinfo_t** paths = (trmm_thrinfo_t**) bli_malloc( global_num_threads * sizeof( trmm_thrinfo_t* ) );
+ trmm_thrinfo_t** paths = (trmm_thrinfo_t**) bli_malloc_scratch( global_num_threads * sizeof( trmm_thrinfo_t* ) );
thread_comm_t* global_comm = bli_create_communicator( global_num_threads );
for( int a = 0; a < jc_way; a++ )
index 4b74acf6104b666500d8992a14ccee1281dccea0..de849281f3523011d447085a7adb4af844ee7d0a 100644 (file)
{
num_t dt = bli_obj_datatype( *b );
+#if defined (BLIS_ENABLE_C66X_BUILD)
+ lib_smem_sreset(blasGetMemHandle()); /* reset BLAS scratch heap */
+#endif
+
if ( bli_3m_is_enabled_dt( dt ) ) bli_trsm3m_entry( side, alpha, a, b );
else if ( bli_4m_is_enabled_dt( dt ) ) bli_trsm4m_entry( side, alpha, a, b );
else bli_trsm_entry( side, alpha, a, b );
index b0da52b18f8d1fdb4fd28168259daa851dacddea..9dc5bead6acd9ebaa9a5f334fd8c1d229e3e2352 100644 (file)
#ifdef BLIS_ENABLE_C66X_EDMA
dim_t b_alg_next;
#endif
+ volatile int counter_start;
+ volatile int counter_end;
// printf("blk_var1b\n");
// Initialize object for packing B.
bli_obj_width_after_trans( *a );
dim_t start, end;
- num_t datatype = bli_obj_execution_datatype( *a );
- bli_get_range( thread, offA, m_trans,
- //bli_lcm( bli_info_get_default_nr( datatype ), bli_info_get_default_mr( datatype ) ),
- bli_info_get_default_mc( datatype ),
- &start, &end );
+ num_t dt = bli_obj_execution_datatype( *a );
+// bli_get_range( thread, offA, m_trans,
+// //bli_lcm( bli_info_get_default_nr( datatype ), bli_info_get_default_mr( datatype ) ),
+// bli_info_get_default_mc( datatype ),
+// &start, &end );
+
+ bli_get_range_b2t( thread, offA, m_trans,
+ //bli_lcm( bli_info_get_default_nr( BLIS_TRSM, dt ), bli_info_get_default_mr( BLIS_TRSM, dt ) ),
+ bli_info_get_default_mc( dt ),
+ &start, &end );
// Partition along the remaining portion of the m dimension.
for ( i = start; i < end; i += b_alg )
if( thread_am_ichief( thread ) )
{
// DMA control leaf unrolling for A
- //printf("Init DMA A2 %x ", bli_mem_buffer(&(a1_dma->dma_mem)));
bli_dmam_init( &a1, a1_dma, cntl_sub_dmam_a( cntl ) );
- //printf("after DMA init %x %x %x\n", bli_mem_buffer(&(a1_dma->dma_mem)), bli_mem_buffer(&(a1_dma->pack_mem)), a1_dma->edma_handle);
}
bli_dmam_int( &a1, a1_dma, cntl_sub_dmam_a( cntl ), (dmam_thrinfo_t *) trsm_thread_sub_ipackm( thread ) );
}
if( thread_am_ichief( thread ) ) {
bli_packm_init( a1_dma, a1_pack,
cntl_sub_packm_a( cntl ) );
- //printf("after Pack init %x %x %x\n", bli_mem_buffer(&(a1_pack->dma_mem)), bli_mem_buffer(&(a1_pack->pack_mem)), a1_pack->edma_handle);
+ //printf("after Pack init %x %x %x\n", bli_mem_buffer(&(a1_pack->dma_mem)), bli_mem_buffer(&(a1_pack->pack_mem)), a1_pack->emt_handle);
}
thread_ibarrier( thread );
// DMA control leaf unrolling for A
//printf("Init DMA A2 %x ", bli_mem_buffer(&(a1_dma->dma_mem)));
bli_dmam_init( &a1, a1_dma, cntl_sub_dmam_a( cntl ) );
- //printf("after DMA init %x %x %x\n", bli_mem_buffer(&(a1_dma->dma_mem)), bli_mem_buffer(&(a1_dma->pack_mem)), a1_dma->edma_handle);
+ //printf("after DMA init %x %x %x\n", bli_mem_buffer(&(a1_dma->dma_mem)), bli_mem_buffer(&(a1_dma->pack_mem)), a1_dma->emt_handle);
}
bli_dmam_int( &a1, a1_dma, cntl_sub_dmam_a( cntl ), (dmam_thrinfo_t *) trsm_thread_sub_ipackm( thread ) );
#endif
// Perform trsm subproblem.
+ //TSCL = 0;
+ //counter_start = TSCL;
bli_trsm_int( &BLIS_ONE,
a1_pack,
b_pack,
cntl_sub_trsm( cntl ),
trsm_thread_sub_trsm( thread ) );
+ //counter_end = TSCL;
+ //if(lib_get_coreID()==0)
+ {
+ //printf("%d\n", (counter_end-counter_start));
+ }
+
#ifdef BLIS_ENABLE_C66X_EDMA
bli_obj_alias_to(c2, c1);
#endif
bli_obj_release_pack( b_pack , cntl_sub_packm_b(cntl));
if( thread_am_ichief( thread ) )
{
- //printf("before Pack release %x %x %x\n", bli_mem_buffer(&(a1_pack->dma_mem)), bli_mem_buffer(&(a1_pack->pack_mem)), a1_pack->edma_handle);
+ //printf("before Pack release %x %x %x\n", bli_mem_buffer(&(a1_pack->dma_mem)), bli_mem_buffer(&(a1_pack->pack_mem)), a1_pack->emt_handle);
bli_obj_release_pack( a1_pack, cntl_sub_packm_a(cntl) );
- //printf("after Pack release %x %x %x\n", bli_mem_buffer(&(a1_pack->dma_mem)), bli_mem_buffer(&(a1_pack->pack_mem)), a1_pack->edma_handle);
+ //printf("after Pack release %x %x %x\n", bli_mem_buffer(&(a1_pack->dma_mem)), bli_mem_buffer(&(a1_pack->pack_mem)), a1_pack->emt_handle);
#ifdef BLIS_ENABLE_C66X_EDMA
-// printf("before DMA release %x %x %x\n", bli_mem_buffer(&(a1_dma->dma_mem)), bli_mem_buffer(&(a1_dma->pack_mem)), a1_dma->edma_handle);
+// printf("before DMA release %x %x %x\n", bli_mem_buffer(&(a1_dma->dma_mem)), bli_mem_buffer(&(a1_dma->pack_mem)), a1_dma->emt_handle);
bli_obj_release_dma( a1_dma, cntl_sub_dmam_a(cntl) );
- bli_obj_release_edma_handle( a1_dma );
-// printf("after DMA release %x %x %x\n", bli_mem_buffer(&(a1_dma->dma_mem)), bli_mem_buffer(&(a1_dma->pack_mem)), a1_dma->edma_handle);
+ bli_obj_release_emt_handle( a1_dma );
+// printf("after DMA release %x %x %x\n", bli_mem_buffer(&(a1_dma->dma_mem)), bli_mem_buffer(&(a1_dma->pack_mem)), a1_dma->emt_handle);
#endif
}
index e7f46898c80353e08d2da3495bd6f9470d739067..8296455ea8ee682cd642f279391b4a59f586edbf 100644 (file)
offA = bli_abs( bli_obj_diag_offset_after_trans( *a ) );
dim_t start, end;
- num_t datatype = bli_obj_execution_datatype( *a );
- bli_get_range( thread, offA, m_trans,
- //bli_lcm( bli_info_get_default_nr( datatype ), bli_info_get_default_mr( datatype ) ),
- bli_info_get_default_mc( datatype ),
- &start, &end );
+ num_t dt = bli_obj_execution_datatype( *a );
+ bli_get_range_t2b( thread, offA, m_trans,
+ //bli_lcm( bli_info_get_default_nr( BLIS_TRSM, dt ), bli_info_get_default_mr( BLIS_TRSM, dt ) ),
+ bli_info_get_default_mc( dt ),
+ &start, &end );
// Partition along the remaining portion of the m dimension.
for ( i = start; i < end; i += b_alg )
{
#ifdef BLIS_ENABLE_C66X_EDMA
bli_obj_release_dma( a1_dma , cntl_sub_dmam_a(cntl));
- bli_obj_release_edma_handle( a1_dma );
+ bli_obj_release_emt_handle( a1_dma );
#endif
bli_obj_release_pack( a1_pack , cntl_sub_packm_a(cntl));
}
index 201cf792218e3eab7803dc48a90d1a2f28dc14eb..69550811b9689f2aaf3176f04fb18651b4f9db21 100644 (file)
// Query dimension in partitioning direction.
n_trans = bli_obj_width_after_trans( *b );
dim_t start, end;
- num_t datatype = bli_obj_execution_datatype( *a );
- bli_get_range( thread, 0, n_trans,
- bli_lcm( bli_info_get_default_nr( datatype ), bli_info_get_default_mr( datatype ) ),
+ num_t dt = bli_obj_execution_datatype( *a );
+
+ bli_get_range_r2l( thread, 0, n_trans,
+ bli_lcm( bli_info_get_default_nr( dt ), bli_info_get_default_mr( dt ) ),
&start, &end );
// Partition along the n dimension.
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_b( i, end, b,
cntl_blocksize( cntl ) );
- //printf("blk_var2 b_alg %d\n", b_alg);
+
+
// Acquire partitions for B1 and C1.
bli_acquire_mpart_r2l( BLIS_SUBPART1,
i, b_alg, b, &b1 );
index 9e3f82733f63a42c80c1981546666f169159bfc7..78ed4f6f80034f40fa5344e41e04b290f987105f 100644 (file)
// Query dimension in partitioning direction.
n_trans = bli_obj_width_after_trans( *b );
dim_t start, end;
- num_t datatype = bli_obj_execution_datatype( *a );
- bli_get_range( thread, 0, n_trans,
- bli_lcm( bli_info_get_default_nr( datatype ), bli_info_get_default_mr( datatype ) ),
+ num_t dt = bli_obj_execution_datatype( *a );
+// bli_get_range_l2r( thread, 0, n_trans,
+// //bli_lcm( bli_info_get_default_nr( BLIS_TRSM, dt ),
+// // bli_info_get_default_mr( BLIS_TRSM, dt ) ),
+// bli_lcm( bli_blksz_get_nr( dt, cntl_blocksize( cntl ) ),
+// bli_blksz_get_mr( dt, cntl_blocksize( cntl ) ) ),
+// &start, &end );
+
+ bli_get_range_l2r( thread, 0, n_trans,
+ bli_lcm( bli_info_get_default_nr( dt ), bli_info_get_default_mr( dt ) ),
&start, &end );
+ //printf("blk_var2f n = %d end = %d\n", n_trans, end);
+
// Partition along the n dimension.
for ( i = start; i < end; i += b_alg )
{
b_alg = bli_determine_blocksize_f( i, end, b,
cntl_blocksize( cntl ) );
+ //printf("start %d b_alg %d\n", i, b_alg);
+
// Acquire partitions for B1 and C1.
bli_acquire_mpart_l2r( BLIS_SUBPART1,
i, b_alg, b, &b1 );
index f972059a3c48ed46bbca21dc357a0c9572bb1852..7d66387b89acb8dec0369c266a434478ca417ab6 100644 (file)
#ifdef BLIS_ENABLE_C66X_EDMA
bli_obj_release_dma( a1_dma, cntl_sub_dmam_a(cntl) );
bli_obj_release_dma( b1_dma, cntl_sub_dmam_b(cntl) );
- bli_obj_release_edma_handle( a1_dma );
- bli_obj_release_edma_handle( b1_dma );
+ bli_obj_release_emt_handle( a1_dma );
+ bli_obj_release_emt_handle( b1_dma );
#endif
bli_obj_release_pack( a1_pack, cntl_sub_packm_a(cntl) );
bli_obj_release_pack( b1_pack, cntl_sub_packm_b(cntl) );
index 60268aec06afb841e9cc219637d35bbfddd30f3f..8185c46a7903934d36a41a419298692b60f86efe 100644 (file)
#ifdef BLIS_ENABLE_C66X_EDMA
bli_obj_release_dma( a1_dma, cntl_sub_dmam_a(cntl) );
bli_obj_release_dma( b1_dma, cntl_sub_dmam_b(cntl) );
- bli_obj_release_edma_handle( a1_dma );
- bli_obj_release_edma_handle( b1_dma );
+ bli_obj_release_emt_handle( a1_dma );
+ bli_obj_release_emt_handle( b1_dma );
#endif
bli_obj_release_pack( a1_pack, cntl_sub_packm_a(cntl) );
bli_obj_release_pack( b1_pack, cntl_sub_packm_b(cntl) );
index 811ddeb5260cf532db503718dee6b116f2e83526..f09b16965da240ba937d0a42d1002c6d41f1fcd4 100644 (file)
trsm_r_packa_cntl,
NULL, //trsm_r_packb_cntl, //
NULL,
- gemm_dmaa_cntl, //NULL, //
+ BLIS_GEMM_DMAA_CNTL, //NULL, //
NULL,
NULL,
trsm_cntl_bp_ke,
trsm_r_packb_cntl, //NULL, //
NULL,
NULL,
- gemm_dmab_cntl, //NULL, //
+ BLIS_GEMM_DMAB_CNTL, //NULL, //
NULL,
trsm_r_cntl_op_bp,
NULL,
index c6269218569ab51358bc178782a0cfc481ea6724..d971528a63d18385b2a4d62f66649f9a8df7163f 100644 (file)
#include "blis.h"
+
+
+#ifdef BLIS_ENABLE_PROFILE
+ profile_data_t *bli_trsm_profile_data;
+ profile_data_t *bli_trsm_kervar2_profile_data;
+#endif
+
void bli_trsm_front( side_t side,
obj_t* alpha,
obj_t* a,
obj_t b_local;
obj_t c_local;
+#ifdef BLIS_ENABLE_PROFILE
+ dim_t mn_side, m_profile, n_profile;
+#endif
+
+#ifdef BLIS_ENABLE_PROFILE
+ m_profile = bli_obj_length( *b );
+ n_profile = bli_obj_width( *b );
+ bli_set_dim_with_side( side, m_profile, n_profile, mn_side );
+
+#endif
+
// Check parameters.
if ( bli_error_checking_is_enabled() )
bli_trsm_check( side, alpha, a, b );
trsm_thrinfo_t** infos = bli_create_trsm_thrinfo_paths( bli_is_right( side ) );
dim_t n_threads = thread_num_threads( infos[0] );
+#ifdef BLIS_ENABLE_PROFILE
+ bli_trsm_profile_data = bli_profile_data_init(BLIS_MAX_NUM_THREADS*BLIS_PROFILE_NUM_REPORTS);
+#endif
+
// Invoke the internal back-end.
bli_level3_thread_decorator( n_threads,
(level3_int_t) bli_trsm_int,
(void**) infos );
bli_trsm_thrinfo_free_paths( infos, n_threads );
+
+#ifdef BLIS_ENABLE_PROFILE
+ {
+
+ bli_profile_data_print (bli_trsm_profile_data,
+ m_profile, n_profile, mn_side,
+ bli_obj_execution_datatype( *b ), 1, n_threads);
+
+
+ bli_profile_data_free(bli_trsm_profile_data);
+ }
+
+#endif
}
index 99efa46c27bc9b77bd49dd270be04d76bbde238d..050e962ae286542ae9d2601cb1c43fc5c3795c06 100644 (file)
impl_t i;
FUNCPTR_T f;
+#if defined(BLIS_ENABLE_PROFILE)
+ volatile uint64_t counter_start;
+ volatile uint64_t counter_end;
+ extern profile_data_t *bli_trsm_profile_data;
+ dim_t m_var, k_var, n_var;
+ dim_t index;
+#endif
+
// Check parameters.
if ( bli_error_checking_is_enabled() )
bli_trsm_int_check( alpha, a, b, beta, c, cntl );
// Index into the variant array to extract the correct function pointer.
f = vars[side][uplo][n][i];
+#if defined(BLIS_ENABLE_PROFILE)
+ m_var = bli_obj_length( *b );
+ n_var = bli_obj_width( *b );
+ bli_set_dim_with_side( side, m_var, n_var, k_var );
+
+#if defined(BLIS_ENABLE_C66X_BUILD)
+ TSCL = 0;
+ counter_start = lib_clock64();
+#else
+ counter_start = (uint64_t) (bli_clock()*1.2e9);
+#endif
+#endif
+
// Invoke the variant.
f( &a_local,
&b_local,
&c_local,
cntl,
thread );
+
+#if defined(BLIS_ENABLE_PROFILE)
+#if defined(BLIS_ENABLE_C66X_BUILD)
+ counter_end = lib_clock64();
+#else
+ counter_end = (uint64_t) (bli_clock()*1.2e9);
+#endif
+ bli_profile_get_index(n, i, index);
+
+ bli_profile_data_update(bli_trsm_profile_data[index], (counter_end-counter_start), m_var*k_var*n_var);
+#endif
}
index ccb153fd74b1f2b3f7dcc20298774cf0655c7025..ac0b20eee6e1ddd354b7a940a0bcda476d2ee7dd 100644 (file)
#define FUNCPTR_T gemm_fp
+/* Compile-time switch consumed by the macro-kernel below in ordinary C
+   conditions of the form `if (BLIS_ENABLE_C66X_IDMA_KERVAR2 == 1) ... else ...`.
+   It must expand to an integer constant in every configuration; otherwise a
+   build without BLIS_ENABLE_C66X_IDMA fails on an undeclared identifier and
+   the non-IDMA fallback branches can never be compiled. */
+#ifdef BLIS_ENABLE_C66X_IDMA
+#define BLIS_ENABLE_C66X_IDMA_KERVAR2 1
+#else
+#define BLIS_ENABLE_C66X_IDMA_KERVAR2 0
+#endif
+
+
#ifdef BLIS_ENABLE_C66X_MEM_POOLS
extern char *pool_mk_mem_L1;
dim_t k_b11; \
dim_t k_b21; \
dim_t off_b11; \
- /*dim_t off_b21;*/ \
+ dim_t off_b21; \
dim_t i, j, jb; \
inc_t rstep_a; \
inc_t cstep_b; \
- inc_t rstep_c, cstep_c; \
+ /*inc_t rstep_c;*/ \
+ inc_t cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t off_scl; \
ctype *cNew0, *cNew1, *cNew2, *cNewTemp; \
/*EDMA Declarations */ \
\
- EdmaMgr_Handle edma_handle_b = NULL; \
- EdmaMgr_Handle edma_handle_c0 = NULL; \
- EdmaMgr_Handle edma_handle_c1 = NULL; \
+ lib_emt_Handle emt_handle_b = NULL; \
+ lib_emt_Handle emt_handle_c0 = NULL; \
+ lib_emt_Handle emt_handle_c1 = NULL; \
\
+ /*For DSP timing*/ \
+ volatile uint64_t counter_start_ker, counter_start_nr, counter_start_mr; \
+ volatile uint64_t counter_end_ker, counter_end_nr, counter_end_mr; \
+ extern profile_data_t *bli_trsm_profile_data; \
/*
Assumptions/assertions:
rs_a == 1
\
cstep_b = ps_b; \
\
- rstep_c = rs_c * MR; \
+ /*rstep_c = rs_c * MR;*/ \
cstep_c = cs_c * NR; \
\
/* When C (MC*NR) is moved to L2 the stride to get to the next panel of MRxNR*/ \
rs_c11 = NR; /* stride to get to next row in MRxNR panel*/\
cs_c11 = 1; /*stride to get to next column in a panel of MRxNR*/\
\
- rstep_c11 = rstep_c; \
+ /*rstep_c11 = rstep_c; \
rs_c11 = rs_c; \
- cs_c11 = cs_c; \
+ cs_c11 = cs_c;*/ \
} \
\
istep_a = PACKMR * k_full; \
cNew2 = bli_mem_buffer( &c2_L2_mem ); \
\
/*Acquiring an EDMA handle from the pool*/ \
- bli_dma_channel_acquire(&(edma_handle_b), CSL_chipReadDNUM()); \
- if(edma_handle_b == NULL) \
+ bli_dma_channel_acquire(&(emt_handle_b), lib_get_coreID()); \
+ if(emt_handle_b == NULL) \
{ \
- printf("ker_var2 Failed to alloc edma handle CoreID %d \n", CSL_chipReadDNUM()); \
+ printf("ker_var2 Failed to alloc edma handle CoreID %d \n", lib_get_coreID()); \
} \
- bli_dma_channel_acquire(&(edma_handle_c0), CSL_chipReadDNUM()); \
- if(edma_handle_c0 == NULL) \
+ bli_dma_channel_acquire(&(emt_handle_c0), lib_get_coreID()); \
+ if(emt_handle_c0 == NULL) \
{ \
- printf("ker_var2 Failed to alloc edma handle for C0 CoreID %d \n", CSL_chipReadDNUM()); \
+ printf("ker_var2 Failed to alloc edma handle for C0 CoreID %d \n", lib_get_coreID()); \
} \
/*Acquiring an EDMA handle from the pool*/ \
- bli_dma_channel_acquire(&(edma_handle_c1), CSL_chipReadDNUM()); \
- if(edma_handle_c1 == NULL) \
+ bli_dma_channel_acquire(&(emt_handle_c1), lib_get_coreID()); \
+ if(emt_handle_c1 == NULL) \
{ \
- printf("ker_var2 Failed to alloc edma handle for C1 CoreID %d \n", CSL_chipReadDNUM()); \
+ printf("ker_var2 Failed to alloc edma handle for C1 CoreID %d \n", lib_get_coreID()); \
} \
\
- /*for(jb = 0; jb < 16; jb ++) \
- printf("%f \t %f\n", a_cast[jb], b_cast[jb]);*/ \
- /* initiate first c transfer */ \
- /*printf("cstep_c %d rstep_c %d rs_c %d cs_c %d rstep_c11 %d rs_c11 %d cs_c11 %d\n", cstep_c, rstep_c, rs_c, cs_c, rstep_c11, rs_c11, cs_c11);*/\
n_cur = ( bli_is_not_edge_f( 0, n_iter, n_left ) ? NR : n_left ); \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_nr = lib_clock64(); \
+ } \
+ \
if(rs_c == 1) \
{\
c1 = c1 + (n_iter-1)*cstep_c; \
+ /*printf("rs_c = %d, cs_c = %d, rs_c11 = %d, cs_c11 = %d, cstep_c = %d, rstep_c11 = %d, n_cur = %d, m = %d\n", rs_c, cs_c, rs_c11, cs_c11, cstep_c,rstep_c11, n_cur, m);*/ \
if (cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
{ \
- EdmaMgr_copy2D2DSep(edma_handle_c0, c1, \
+ lib_emt_copy2D2D(emt_handle_c0, c1, \
cNew1, m*sizeof(ctype), \
n_cur, cs_c*sizeof(ctype), cs_c11*sizeof(ctype)); \
} \
} \
}\
}\
- /*else \
- EdmaMgr_copy2D2DSep(edma_handle_c0, c1 = c1 + (n_iter-1)*cstep_c, \
- cNew1, n_cur*sizeof(ctype), \
- m, rs_c*sizeof(ctype), rs_c11*sizeof(ctype));*/ \
+ else \
+ { \
+ /*printf("rs_c = %d, cs_c = %d, rs_c11 = %d, cs_c11 = %d, cstep_c = %d, rstep_c11 = %d, n_cur = %d, m = %d\n", rs_c, cs_c, rs_c11, cs_c11, cstep_c,rstep_c11, n_cur, m);*/ \
+ c1 = c1 + (n_iter-1)*cstep_c; \
+ if (rs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
+ { \
+ lib_emt_copy2D2D(emt_handle_c0, c1, \
+ cNew1, n_cur*sizeof(ctype), \
+ m, rs_c*sizeof(ctype), rs_c11*sizeof(ctype)); \
+ } \
+ else \
+ { \
+ dim_t ii; \
+ ctype *ptr_source; \
+ ctype *ptr_dest; \
+ ptr_source = c1; \
+ ptr_dest = cNew1; \
+ for(ii = 0; ii < m; ii++) \
+ { \
+ memcpy(ptr_dest, ptr_source, n_cur*sizeof(ctype)); \
+ ptr_source += rs_c; \
+ ptr_dest += rs_c11; \
+ } \
+ }\
+ }\
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( jb = 0; jb < n_iter; ++jb ) \
ctype* restrict b11; \
ctype* restrict b21; \
ctype* restrict b2; \
-\
+ \
j = n_iter - 1 - jb; \
diagoffb_j = diagoffb - ( doff_t )j*NR; \
a1 = a_cast; \
- if(rs_c == 1) \
- c11 = cNew1; \
- else \
- c11 = c1 + (n_iter-1)*cstep_c; \
-\
+ /*if(rs_c == 1)*/ \
+ c11 = cNew1; \
+ /*else \
+ c11 = c1 + (n_iter-1)*cstep_c; */\
+ \
n_cur = ( bli_is_not_edge_b( jb, n_iter, n_left ) ? NR : n_left ); \
n_next = ( bli_is_not_edge_b( jb+1, n_iter, n_left ) ? NR : n_left ); \
-\
- EdmaMgr_copy1D1D(edma_handle_b, b1, b1_L1, k*NR*sizeof(ctype)); \
+ \
+ lib_emt_copy1D1D(emt_handle_b, b1, b1_L1, k*NR*sizeof(ctype)); \
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
- if(rs_c == 1) \
- EdmaMgr_wait(edma_handle_c0); \
+ /*if(rs_c == 1)*/ \
+ lib_emt_wait(emt_handle_c0); \
if(jb < n_iter-1) /* no transfer for last iteration */ \
{ \
if (rs_c == 1) \
{ \
if (cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
{ \
- EdmaMgr_copy2D2DSep(edma_handle_c0, c1-cstep_c, \
- cNew0, m*sizeof(ctype), \
- n_next, cs_c*sizeof(ctype), \
- cs_c11*sizeof(ctype)); \
+ lib_emt_copy2D2D(emt_handle_c0, c1-cstep_c, \
+ cNew0, m*sizeof(ctype), \
+ n_next, cs_c*sizeof(ctype), \
+ cs_c11*sizeof(ctype)); \
} \
else \
{ \
} \
}\
}\
- /*else \
- { \
- EdmaMgr_copy2D2DSep(edma_handle_c0, c1 - cstep_c, \
- cNew0, n_next*sizeof(ctype), \
- m, rs_c*sizeof(ctype), rs_c11*sizeof(ctype)); \
- }*/ \
+ else \
+ { \
+ if (rs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
+ { \
+ lib_emt_copy2D2D(emt_handle_c0, c1 - cstep_c, \
+ cNew0, n_next*sizeof(ctype), \
+ m, rs_c*sizeof(ctype), rs_c11*sizeof(ctype)); \
+ } \
+ else \
+ { \
+ dim_t ii; \
+ ctype *ptr_source; \
+ ctype *ptr_dest; \
+ ptr_source = c1 - cstep_c; \
+ ptr_dest = cNew0; \
+ for(ii = 0; ii < m; ii++) \
+ { \
+ memcpy(ptr_dest, ptr_source, n_next*sizeof(ctype)); \
+ ptr_source += rs_c; \
+ ptr_dest += rs_c11; \
+ } \
+ }\
+ } \
} \
\
/* If the current panel of B intersects the diagonal, use a
k_b1121 = k - off_b11; \
k_b11 = NR; \
k_b21 = k_b1121 - NR; \
- /*off_b21 = off_b11 + k_b11;*/ \
+ off_b21 = off_b11 + k_b11; \
\
/* Compute the addresses of the triangular block B11 and the
panel B21. */ \
bli_auxinfo_set_is_a( PACKNR * k_b1121, aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
+\
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_mr = lib_clock64(); \
+ } \
for ( i = 0; i < m_iter; ++i ) \
{ \
- if( trsm_my_iter( i, thread ) ){ \
+ if( trsm_my_iter( i, thread ) ) \
+ { \
\
ctype* restrict a11; \
ctype* restrict a12; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
- /*printf("%d %d %d %d \n", k, MR, off_b11, PACKMR);*/ \
- if (i == 0) \
+ if (BLIS_ENABLE_C66X_IDMA_KERVAR2 == 1) \
{ \
- idma1_setup(a2_L1, a1 + ( off_b11 * PACKMR ) / off_scl, k_b1121*MR*sizeof(ctype), 0, 0, 7); \
+ if (i == 0) \
+ { \
+ lib_imt_copy(a1 + ( off_b11 * PACKMR ) / off_scl, a2_L1, k_b1121*MR*sizeof(ctype)); \
+ } \
} \
\
- /*ORIG TRSM*/ \
/* Compute the addresses of the next panels of A and B. */ \
- /*a2 = a1;*/ \
+ if (BLIS_ENABLE_C66X_IDMA_KERVAR2 == 1) \
+ { \
+ a2 = a1 + rstep_a; \
+ lib_imt_wait(); \
+ temp = a1_L1; \
+ a1_L1 = a2_L1; \
+ a2_L1 = temp; \
+ } \
+ else \
+ { \
+ a2 = a1; \
+ } \
\
/* Compute the addresses of the next panels of A and B. */ \
- a2 = a1 + rstep_a; \
- while(!idma1_done()){;} \
- temp = a1_L1; \
- a1_L1 = a2_L1; \
- a2_L1 = temp; \
if(i == 0) \
{ \
- EdmaMgr_wait(edma_handle_b);\
+ lib_emt_wait(emt_handle_b);\
} \
/*if ( i + thread_num_threads(thread) >= m_iter )*/ \
if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
} \
else \
{ \
- /*Start next panel*/ \
- idma1_setup(a2_L1, a2 + ( off_b11 * PACKMR ) / off_scl , k_b1121*MR*sizeof(ctype), 0, 0, 7); \
+ if (BLIS_ENABLE_C66X_IDMA_KERVAR2 == 1) \
+ { \
+ /*Start next panel*/ \
+ lib_imt_copy(a2 + ( off_b11 * PACKMR ) / off_scl, a2_L1, k_b1121*MR*sizeof(ctype)); \
+ } \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
bli_auxinfo_set_next_b( a2, aux ); \
\
/* Compute the addresses of the A11 block and A12 panel. */ \
- /*a11 = a1 + ( off_b11 * PACKMR ) / off_scl; */\
- /*a12 = a1 + ( off_b21 * PACKMR ) / off_scl;*/ \
- a11 = a1_L1;\
- a12 = a1_L1 + ( k_b11 * PACKMR ) / off_scl; \
+ /* If IDMA enabled*/ \
+ if (BLIS_ENABLE_C66X_IDMA_KERVAR2 == 1) \
+ { \
+ a11 = a1_L1;\
+ a12 = a1_L1 + ( k_b11 * PACKMR ) / off_scl; \
+ } \
+ else \
+ { \
+ a11 = a1 + ( off_b11 * PACKMR ) / off_scl; \
+ a12 = a1 + ( off_b21 * PACKMR ) / off_scl; \
+ } \
/* Handle interior and edge cases separately. */ \
- /*printf("Calling GEMMTRSM ukernel\n");*/\
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_ker = lib_clock64(); \
+ } \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the fused gemm/trsm micro-kernel. */ \
ct, rs_ct, cs_ct, \
c11, rs_c11, cs_c11 /* rs_c, cs_c **** 1, cstep_c11*/); \
} \
- while(!idma1_done()){;} \
- /*printf("%d %d \n", k_b11, PACKMR);*/ \
- idma1_setup(a1 + ( off_b11 * PACKMR ) / off_scl, a11, k_b11*PACKMR*sizeof(ctype), 0,0,7); \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_ker = lib_clock64(); \
+ bli_profile_data_update(bli_trsm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_KER_LOOP_IND],\
+ (counter_end_ker-counter_start_ker), 2*k_b21*m_cur*n_cur); \
+ } \
+ if (BLIS_ENABLE_C66X_IDMA_KERVAR2 == 1) \
+ { \
+ /*Storing the value back*/\
+ /*lib_imt_wait(); \
+ lib_imt_copy(a11, a1 + ( off_b11 * PACKMR ) / off_scl, k_b11*PACKMR*sizeof(ctype));*/ \
+ { \
+ ctype *ptr_source; \
+ ctype *ptr_dest; \
+ ptr_source = a11; \
+ ptr_dest = a1 + ( off_b11 * PACKMR ) / off_scl; \
+ memcpy(ptr_dest, ptr_source, k_b11*PACKMR*sizeof(ctype)); \
+ } \
} \
+ } /*trsm_my_iter( i, thread ) */\
\
a1 += rstep_a; \
/*c11 += rstep_c;*/ \
c11 += rstep_c11; \
+ } /*MR loop*/\
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_mr = lib_clock64(); \
+ bli_profile_data_update(bli_trsm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_IR_LOOP_IND], \
+ (counter_end_mr-counter_start_mr), 2*k*m*n_cur); \
} \
\
b1 += ps_b_cur; \
bli_auxinfo_set_is_a( istep_b, aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_mr = lib_clock64(); \
+ } \
+\
for ( i = 0; i < m_iter; ++i ) \
{ \
if( trsm_my_iter( i, thread ) ){ \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
- if(i == 0) \
+ if (BLIS_ENABLE_C66X_IDMA_KERVAR2 == 1) \
+ { \
+ if(i == 0) \
+ { \
+ lib_imt_copy(a1, a2_L1, k*MR*sizeof(ctype)); \
+ } \
+ /* Compute the addresses of the next panels of A and B. */ \
+ a2 = a1 + rstep_a; \
+ lib_imt_wait(); \
+ temp = a1_L1; \
+ a1_L1 = a2_L1; \
+ a2_L1 = temp; \
+ } \
+ else \
{ \
- idma1_setup(a2_L1, a1, k*MR*sizeof(ctype), 0, 0, 7); \
+ /* Compute the addresses of the next panels of A and B. */ \
+ a2 = a1;\
} \
\
- /*ORIG TRSM*/ \
- /* Compute the addresses of the next panels of A and B. */ \
- /*a2 = a1;*/\
- /* Compute the addresses of the next panels of A and B. */ \
- a2 = a1 + rstep_a; \
- while(!idma1_done()){;} \
- temp = a1_L1; \
- a1_L1 = a2_L1; \
- a2_L1 = temp; \
if(i == 0) \
{ \
- EdmaMgr_wait(edma_handle_b);\
+ lib_emt_wait(emt_handle_b);\
}\
/*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\
if ( i + thread_num_threads(thread) >= m_iter ) \
} \
else \
{ \
- /*Start next panel*/ \
- idma1_setup(a2_L1, a2 , k*MR*sizeof(ctype), 0, 0, 7); \
+ if (BLIS_ENABLE_C66X_IDMA_KERVAR2 == 1) \
+ { \
+ /*Start next panel*/ \
+ lib_imt_copy(a2, a2_L1, k*MR*sizeof(ctype)); \
+ } \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
bli_auxinfo_set_next_b( a2, aux ); \
\
/* Handle interior and edge cases separately. */ \
- /*printf("Calling GEMM ukernel\n");*/\
- if ( m_cur == MR && n_cur == NR ) \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
{ \
- /* Invoke the gemm micro-kernel. */ \
- gemm_ukr_cast( k, \
- minus_one, \
- b1_L1, \
- a1_L1, \
- alpha2_cast, \
- c11, cs_c11, rs_c11, /*cs_c, rs_c, * cstep_c11, 1,*/ \
- &aux ); \
+ counter_start_ker = lib_clock64(); \
+ } \
+ if (BLIS_ENABLE_C66X_IDMA_KERVAR2 == 1) \
+ { \
+ if ( m_cur == MR && n_cur == NR ) \
+ { \
+ /* Invoke the gemm micro-kernel. */ \
+ gemm_ukr_cast( k, \
+ minus_one, \
+ b1_L1, \
+ a1_L1, \
+ alpha2_cast, \
+ c11, cs_c11, rs_c11, /*cs_c, rs_c, * cstep_c11, 1,*/ \
+ &aux ); \
+ } \
+ else \
+ { \
+ /* Invoke the gemm micro-kernel. */ \
+ gemm_ukr_cast( k, \
+ minus_one, \
+ b1_L1, \
+ a1_L1, \
+ zero, \
+ ct, cs_ct, rs_ct, \
+ &aux ); \
+\
+ /* Add the result to the edge of C. */ \
+ PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
+ ct, rs_ct, cs_ct, \
+ alpha2_cast, \
+ c11, rs_c11, cs_c11 /*rs_c, cs_c *1, cstep_c11 */); \
+ } \
} \
else \
{ \
- /* Invoke the gemm micro-kernel. */ \
- gemm_ukr_cast( k, \
- minus_one, \
- b1_L1, \
- a1_L1, \
- zero, \
- ct, cs_ct, rs_ct, \
- &aux ); \
-\
- /* Add the result to the edge of C. */ \
- PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
- ct, rs_ct, cs_ct, \
- alpha2_cast, \
- c11, rs_c11, cs_c11 /*rs_c, cs_c *1, cstep_c11 */); \
+ if ( m_cur == MR && n_cur == NR ) \
+ { \
+ /* Invoke the gemm micro-kernel. */ \
+ gemm_ukr_cast( k, \
+ minus_one, \
+ b1_L1, \
+ a1, \
+ alpha2_cast, \
+ c11, cs_c11, rs_c11, /*cs_c, rs_c, * cstep_c11, 1,*/ \
+ &aux ); \
+ } \
+ else \
+ { \
+ /* Invoke the gemm micro-kernel. */ \
+ gemm_ukr_cast( k, \
+ minus_one, \
+ b1_L1, \
+ a1, \
+ zero, \
+ ct, cs_ct, rs_ct, \
+ &aux ); \
+\
+ /* Add the result to the edge of C. */ \
+ PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
+ ct, rs_ct, cs_ct, \
+ alpha2_cast, \
+ c11, rs_c11, cs_c11 /*rs_c, cs_c *1, cstep_c11 */); \
+ } \
} \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_ker = lib_clock64(); \
+ bli_profile_data_update(bli_trsm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_KER_LOOP_IND],\
+ (counter_end_ker-counter_start_ker), 2*k*m_cur*n_cur); \
} \
+ } /*myiter*/\
\
a1 += rstep_a; \
/*c11 += rstep_c;*/ \
c11 += rstep_c11; \
+ } /*MR loop if b does not intersect diagonal*/\
+ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_mr = lib_clock64(); \
+ bli_profile_data_update(bli_trsm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_IR_LOOP_IND], \
+ (counter_end_mr-counter_start_mr), 2*k*m*n_cur); \
} \
\
b1 += cstep_b; \
} \
\
/* circularly shift buffers */ \
- if(rs_c==1) \
- { \
cNewTemp = cNew0; \
cNew0 = cNew2; \
cNew2 = cNew1; \
cNew1 = cNewTemp; \
if(j != 0) /* wait for save c to complete; skip first iteration */ \
{ \
- EdmaMgr_wait(edma_handle_c1); \
- } \
+ lib_emt_wait(emt_handle_c1); \
} \
/* save updated c*/ \
if(rs_c==1) \
{ \
if (cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
{ \
- EdmaMgr_copy2D2DSep(edma_handle_c1, cNew2, c1, m*sizeof(ctype), \
+ lib_emt_copy2D2D(emt_handle_c1, cNew2, c1, m*sizeof(ctype), \
n_cur, cs_c11*sizeof(ctype), cs_c*sizeof(ctype)); \
}\
else \
} \
} \
} \
- /*else \
- EdmaMgr_copy2D2DSep(edma_handle_c1, cNew1, c1, n_cur*sizeof(ctype), \
- m, rs_c11*sizeof(ctype), rs_c*sizeof(ctype)); */\
+ else \
+ { \
+ if (rs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
+ { \
+ lib_emt_copy2D2D(emt_handle_c1, cNew2, c1, n_cur*sizeof(ctype), \
+ m, rs_c11*sizeof(ctype), rs_c*sizeof(ctype)); \
+ } \
+ else \
+ { \
+ dim_t ii; \
+ ctype *ptr_source; \
+ ctype *ptr_dest; \
+ ptr_source = cNew2; \
+ ptr_dest = c1; \
+ for(ii = 0; ii < m; ii++) \
+ { \
+ memcpy(ptr_dest, ptr_source, n_cur*sizeof(ctype)); \
+ ptr_source += rs_c11; \
+ ptr_dest += rs_c; \
+ } \
+ }\
+ }\
\
c1 -= cstep_c; \
+ } /*NR loop*/\
+ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_nr = lib_clock64(); \
+ bli_profile_data_update(bli_trsm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_JR_LOOP_IND], \
+ (counter_end_nr-counter_start_nr), 2*k*m*n); \
} \
\
bli_mem_release( &c2_L2_mem ); \
bli_mem_release( &a1_L1_mem ); \
bli_mem_release( &b1_L1_mem ); \
}\
- if ( edma_handle_b != NULL ) \
+ if ( emt_handle_b != NULL ) \
{ \
- bli_dma_channel_release(edma_handle_b, CSL_chipReadDNUM()); \
- edma_handle_b = NULL; \
+ bli_dma_channel_release(emt_handle_b, lib_get_coreID()); \
+ emt_handle_b = NULL; \
} \
- if ( edma_handle_c0 != NULL ) \
+ if ( emt_handle_c0 != NULL ) \
{ \
- bli_dma_channel_release(edma_handle_c0, CSL_chipReadDNUM()); \
- edma_handle_c0 = NULL; \
+ bli_dma_channel_release(emt_handle_c0, lib_get_coreID()); \
+ emt_handle_c0 = NULL; \
} \
- if ( edma_handle_c1 != NULL ) \
+ if ( emt_handle_c1 != NULL ) \
{ \
- EdmaMgr_wait(edma_handle_c1); /* wait for save c to complete */ \
- bli_dma_channel_release(edma_handle_c1, CSL_chipReadDNUM()); \
- edma_handle_c1 = NULL; \
+ lib_emt_wait(emt_handle_c1); /* wait for save c to complete */ \
+ bli_dma_channel_release(emt_handle_c1, lib_get_coreID()); \
+ emt_handle_c1 = NULL; \
} \
}
index 0d8caf5236b73dda5ad2b8fc54db53fdcb307d22..14848dd60d9d1684d95b68de99c9d88a0d895013 100644 (file)
#define FUNCPTR_T gemm_fp
-
#ifdef BLIS_ENABLE_C66X_MEM_POOLS
extern char *pool_mk_mem_L1;
dim_t i, j; \
inc_t rstep_a; \
inc_t cstep_b; \
- inc_t rstep_c, cstep_c; \
+ /*inc_t rstep_c; */\
+ inc_t cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t off_scl; \
inc_t ps_b_cur; \
auxinfo_t aux; \
\
- /*inc_t rstep_c11, rs_c11, cs_c11;*/ \
+ dim_t n_next; \
+ inc_t rstep_c11, rs_c11, cs_c11; \
\
mem_t b1_L1_mem; \
/*memcpy does not like b1_L1 if it is restrict. The resid of gemm is non zero if this is changed to ctype* restrict*/ \
mem_t a1_L1_mem, a2_L1_mem; \
ctype *a1_L1, *a2_L1, *temp; \
\
-/* mem_t c0_L2_mem, c1_L2_mem, c2_L2_mem; \
- ctype *cNew0, *cNew1, *cNew2, *cNewTemp;*/ \
+ mem_t c0_L2_mem, c1_L2_mem, c2_L2_mem; \
+ ctype *cNew0, *cNew1, *cNew2, *cNewTemp; \
/*EDMA Declarations */ \
\
- EdmaMgr_Handle edma_handle_b = NULL; \
-/* EdmaMgr_Handle edma_handle_c0 = NULL; \
- EdmaMgr_Handle edma_handle_c1 = NULL;*/ \
+ lib_emt_Handle emt_handle_b = NULL; \
+ lib_emt_Handle emt_handle_c0 = NULL; \
+ lib_emt_Handle emt_handle_c1 = NULL; \
\
+ /*For DSP timing*/ \
+ volatile uint64_t counter_start_ker, counter_start_nr, counter_start_mr; \
+ volatile uint64_t counter_end_ker, counter_end_nr, counter_end_mr; \
+ extern profile_data_t *bli_trsm_profile_data; \
/*
Assumptions/assertions:
rs_a == 1
last column and the next multiple of NR zero-padded accordingly. */ \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
- PASTEMAC(ch,set0s_mxn)( MR, NR, \
- ct, rs_ct, cs_ct ); \
+ /*PASTEMAC(ch,set0s_mxn)( MR, NR, \
+ ct, rs_ct, cs_ct );*/ \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
\
cstep_b = ps_b; \
\
- rstep_c = rs_c * MR; \
+ /*rstep_c = rs_c * MR;*/ \
cstep_c = cs_c * NR; \
\
/* When C (MC*NR) is moved to L2 the stride to get to the next panel of MRxNR*/ \
-/* if(rs_c == 1) \
+ if(rs_c == 1) \
{ \
- rstep_c11 = MR; / *stride to get to next panel of MRxNR in a panel of MCxNR* /\
+ rstep_c11 = MR; /*stride to get to next panel of MRxNR in a panel of MCxNR*/ \
rs_c11 = 1; \
- cs_c11 = (m%2 == 0) ? m : m+1; / *stride to get to next column in a panel of MRxNR* /\
+ cs_c11 = (m%2 == 0) ? m : m+1; /*stride to get to next column in a panel of MRxNR*/ \
} \
else\
{ \
- rstep_c11 = NR*MR; / *stride to get to next panel of MRxNR in a panel of MCxNR* /\
- rs_c11 = NR; / * stride to get to next row in MRxNR panel* /\
- cs_c11 = 1; / *stride to get to next column in a panel of MRxNR* /\
- } */\
+ rstep_c11 = NR*MR; /*stride to get to next panel of MRxNR in a panel of MCxNR*/\
+ rs_c11 = NR; /* stride to get to next row in MRxNR panel*/ \
+ cs_c11 = 1; /*stride to get to next column in a panel of MRxNR*/\
+ } \
\
istep_a = PACKMR * k_full; \
istep_b = PACKNR * k; \
a1_L1 = (ctype*) (pool_mk_mem_L1 ); \
a2_L1 = (ctype*) (pool_mk_mem_L1 + k*MR*sizeof(ctype) ) ;\
b1_L1 = (ctype*) (pool_mk_mem_L1 + PASTEMAC(ch,bank) + 2 * k*MR*sizeof(ctype)) ;\
- /*printf("%x %x %x \n", a1_L1, a2_L1, b1_L1);*/\
}\
else { \
- /*Acquiring a buffer for B in L1*/ \
- bli_mem_acquire_m( k*NR*sizeof(ctype), BLIS_BUFFER_FOR_B_PANEL_L1, &b1_L1_mem); \
- b1_L1 = bli_mem_buffer( &b1_L1_mem ); \
- b1_L1 = (ctype *) ((char *) b1_L1_mem.buf + PASTEMAC(ch,bank)); \
+ /*Acquiring a buffer for B in L1*/ \
+ bli_mem_acquire_m( k*NR*sizeof(ctype), BLIS_BUFFER_FOR_B_PANEL_L1, &b1_L1_mem); \
+ b1_L1 = bli_mem_buffer( &b1_L1_mem ); \
+ b1_L1 = (ctype *) ((char *) b1_L1_mem.buf + PASTEMAC(ch,bank)); \
\
- /*Acquiring a buffer for A in L1*/ \
- bli_mem_acquire_m( k*MR*sizeof(ctype), BLIS_BUFFER_FOR_A_BLOCK_L1, &a1_L1_mem); \
- a1_L1 = bli_mem_buffer( &a1_L1_mem ); \
+ /*Acquiring a buffer for A in L1*/ \
+ bli_mem_acquire_m( k*MR*sizeof(ctype), BLIS_BUFFER_FOR_A_BLOCK_L1, &a1_L1_mem); \
+ a1_L1 = bli_mem_buffer( &a1_L1_mem ); \
\
- bli_mem_acquire_m( k*MR*sizeof(ctype), BLIS_BUFFER_FOR_A_BLOCK_L1, &a2_L1_mem); \
- a2_L1 = bli_mem_buffer( &a2_L1_mem ); \
+ bli_mem_acquire_m( k*MR*sizeof(ctype), BLIS_BUFFER_FOR_A_BLOCK_L1, &a2_L1_mem); \
+ a2_L1 = bli_mem_buffer( &a2_L1_mem ); \
}\
\
/*Acquiring buffers for C (MC_x_NR) in L2 */\
-/* bli_mem_acquire_m( m*NR*sizeof(ctype), BLIS_BITVAL_BUFFER_FOR_C_PANEL_L2, &c0_L2_mem); \
+ bli_mem_acquire_m( m*NR*sizeof(ctype), BLIS_BUFFER_FOR_C_PANEL_L2, &c0_L2_mem); \
cNew0 = bli_mem_buffer( &c0_L2_mem ); \
\
- bli_mem_acquire_m( m*NR*sizeof(ctype), BLIS_BITVAL_BUFFER_FOR_C_PANEL_L2, &c1_L2_mem); \
+ bli_mem_acquire_m( m*NR*sizeof(ctype), BLIS_BUFFER_FOR_C_PANEL_L2, &c1_L2_mem); \
cNew1 = bli_mem_buffer( &c1_L2_mem ); \
\
- bli_mem_acquire_m( m*NR*sizeof(ctype), BLIS_BITVAL_BUFFER_FOR_C_PANEL_L2, &c2_L2_mem); \
+ bli_mem_acquire_m( m*NR*sizeof(ctype), BLIS_BUFFER_FOR_C_PANEL_L2, &c2_L2_mem); \
cNew2 = bli_mem_buffer( &c2_L2_mem ); \
-*/ \
\
/*Acquiring an EDMA handle from the pool*/ \
- bli_dma_channel_acquire(&(edma_handle_b), CSL_chipReadDNUM()); \
- if(edma_handle_b == NULL) \
+ bli_dma_channel_acquire(&(emt_handle_b), lib_get_coreID()); \
+ if(emt_handle_b == NULL) \
+ { \
+ printf("ker_var2 Failed to alloc edma handle CoreID %d \n", lib_get_coreID()); \
+ } \
+ bli_dma_channel_acquire(&(emt_handle_c0), lib_get_coreID()); \
+ if(emt_handle_c0 == NULL) \
{ \
- printf("ker_var2 Failed to alloc edma handle CoreID %d \n", CSL_chipReadDNUM()); \
+ printf("ker_var2 Failed to alloc edma handle for C0 CoreID %d \n", lib_get_coreID()); \
} \
-/* bli_dma_channel_acquire(&(edma_handle_c0), CSL_chipReadDNUM()); \
- if(edma_handle_c0 == NULL) \
+ bli_dma_channel_acquire(&(emt_handle_c1), lib_get_coreID()); \
+ if(emt_handle_c1 == NULL) \
{ \
- printf("ker_var2 Failed to alloc edma handle for C0 CoreID %d \n", CSL_chipReadDNUM()); \
+ printf("ker_var2 Failed to alloc edma handle for C1 CoreID %d \n", lib_get_coreID()); \
} \
- bli_dma_channel_acquire(&(edma_handle_c1), CSL_chipReadDNUM()); \
- if(edma_handle_c1 == NULL) \
+ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
{ \
- printf("ker_var2 Failed to alloc edma handle for C1 CoreID %d \n", CSL_chipReadDNUM()); \
+ counter_start_nr = lib_clock64(); \
} \
-*/ \
+ n_cur = ( bli_is_not_edge_f( 0, n_iter, n_left ) ? NR : n_left ); \
+ if(rs_c == 1) \
+ {\
+ if (cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
+ { \
+ lib_emt_copy2D2D(emt_handle_c0, c1, \
+ cNew1, m*sizeof(ctype), \
+ n_cur, cs_c*sizeof(ctype), cs_c11*sizeof(ctype)); \
+ } \
+ else \
+ { \
+ dim_t ii; \
+ ctype *ptr_source; \
+ ctype *ptr_dest; \
+ ptr_source = c1; \
+ ptr_dest = cNew1; \
+ for(ii = 0; ii < n_cur; ii++) \
+ { \
+ memcpy(ptr_dest, ptr_source, m*sizeof(ctype)); \
+ ptr_source += cs_c; \
+ ptr_dest += cs_c11; \
+ } \
+ }\
+ }\
+ else \
+ { \
+ if (rs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
+ { \
+ lib_emt_copy2D2D(emt_handle_c0, c1, \
+ cNew1, n_cur*sizeof(ctype), \
+ m, rs_c*sizeof(ctype), rs_c11*sizeof(ctype)); \
+ } \
+ else \
+ { \
+ dim_t ii; \
+ ctype *ptr_source; \
+ ctype *ptr_dest; \
+ ptr_source = c1; \
+ ptr_dest = cNew1; \
+ for(ii = 0; ii < m; ii++) \
+ { \
+ memcpy(ptr_dest, ptr_source, n_cur*sizeof(ctype)); \
+ ptr_source += rs_c; \
+ ptr_dest += rs_c11; \
+ } \
+ }\
+ }\
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
\
diagoffb_j = diagoffb - ( doff_t )j*NR; \
a1 = a_cast; \
- c11 = c1; \
+ /*c11 = c1; */\
+ c11 = cNew1; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
+ n_next = ( bli_is_not_edge_f( j+1, n_iter, n_left ) ? NR : n_left ); \
\
- EdmaMgr_copy1D1D(edma_handle_b, b1, b1_L1, k*NR*sizeof(ctype)); \
+ lib_emt_copy1D1D(emt_handle_b, b1, b1_L1, k*NR*sizeof(ctype)); \
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
+\
+ lib_emt_wait(emt_handle_c0); \
+ if(j < n_iter-1) /* no transfer for last iteration */ \
+ { \
+ if (rs_c == 1) \
+ { \
+ if (cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
+ { \
+ lib_emt_copy2D2D(emt_handle_c0, c1+cstep_c, \
+ cNew0, m*sizeof(ctype), \
+ n_next, cs_c*sizeof(ctype), \
+ cs_c11*sizeof(ctype)); \
+ } \
+ else \
+ { \
+ dim_t ii; \
+ ctype *ptr_source; \
+ ctype *ptr_dest; \
+ ptr_source = c1+cstep_c; \
+ ptr_dest = cNew0; \
+ for(ii = 0; ii < n_next; ii++) \
+ { \
+ memcpy(ptr_dest, ptr_source, m*sizeof(ctype)); \
+ ptr_source += cs_c; \
+ ptr_dest += cs_c11; \
+ } \
+ }\
+ }\
+ else \
+ { \
+ if (rs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
+ { \
+ lib_emt_copy2D2D(emt_handle_c0, c1 + cstep_c, \
+ cNew0, n_next*sizeof(ctype), \
+ m, rs_c*sizeof(ctype), rs_c11*sizeof(ctype)); \
+ } \
+ else \
+ { \
+ dim_t ii; \
+ ctype *ptr_source; \
+ ctype *ptr_dest; \
+ ptr_source = c1 + cstep_c; \
+ ptr_dest = cNew0; \
+ for(ii = 0; ii < m; ii++) \
+ { \
+ memcpy(ptr_dest, ptr_source, n_next*sizeof(ctype)); \
+ ptr_source += rs_c; \
+ ptr_dest += rs_c11; \
+ } \
+ }\
+ } \
+ } \
\
/* If the current panel of B intersects the diagonal, use a
special micro-kernel that performs a fused gemm and trsm.
bli_auxinfo_set_is_a( PACKNR * k_b0111, aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_mr = lib_clock64(); \
+ } \
for ( i = 0; i < m_iter; ++i ) \
{ \
if( trsm_my_iter( i, thread ) ){ \
\
if (i == 0) \
{ \
- idma1_setup(a2_L1, a1, k_b0111*MR*sizeof(ctype), 0, 0, 7); \
+ lib_imt_copy(a1, a2_L1, k_b0111*MR*sizeof(ctype)); \
} \
\
/* Compute the addresses of the next panels of A and B. */ \
/*a2 = a1;*/ \
a2 = a1 + rstep_a; \
- while(!idma1_done()){;} \
+ lib_imt_wait(); \
temp = a1_L1; \
a1_L1 = a2_L1; \
a2_L1 = temp; \
if(i == 0) \
{ \
- EdmaMgr_wait(edma_handle_b);\
+ lib_emt_wait(emt_handle_b);\
} \
/*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\
if ( i + thread_num_threads(thread) >= m_iter ) \
else \
{ \
/*Start next panel*/ \
- idma1_setup(a2_L1, a2, k_b0111*MR*sizeof(ctype), 0, 0, 7); \
+ lib_imt_copy(a2, a2_L1, k_b0111*MR*sizeof(ctype)); \
} \
\
/* Compute the addresses of the A10 panel and A11 block. */ \
bli_auxinfo_set_next_b( a2, aux ); \
\
/* Handle interior and edge cases separately. */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_ker = lib_clock64(); \
+ } \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the fused gemm/trsm micro-kernel. */ \
b11, \
a10, \
a11, \
- c11, cs_c, rs_c, \
+ c11, cs_c11, rs_c11, /*cs_c, rs_c,*/ \
&aux ); \
} \
else \
/* Copy the result to the bottom edge of C. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
- c11, rs_c, cs_c ); \
+ c11, rs_c11, cs_c11 /*rs_c, cs_c*/ ); \
} \
- while(!idma1_done()){;} \
- idma1_setup(a1 + ( off_b11 * PACKMR ) / off_scl, a11, NR*PACKMR*sizeof(ctype), 0,0,7); \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_ker = lib_clock64(); \
+ bli_profile_data_update(bli_trsm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_KER_LOOP_IND],\
+ (counter_end_ker-counter_start_ker), 2*k_b01*m_cur*n_cur); \
+ } \
+ lib_imt_wait(); \
+ lib_imt_copy(a11, a1 + ( off_b11 * PACKMR ) / off_scl, NR*PACKMR*sizeof(ctype)); \
} \
\
a1 += rstep_a; \
- c11 += rstep_c; \
+ /*c11 += rstep_c;*/ \
+ c11 += rstep_c11; \
+ } \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_mr = lib_clock64(); \
+ bli_profile_data_update(bli_trsm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_IR_LOOP_IND], \
+ (counter_end_mr-counter_start_mr), 2*k*m*n_cur); \
} \
\
b1 += ps_b_cur; \
bli_auxinfo_set_is_a( istep_b, aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_mr = lib_clock64(); \
+ } \
for ( i = 0; i < m_iter; ++i ) \
{ \
if( trsm_my_iter( i, thread ) ){ \
\
if(i == 0) \
{ \
- idma1_setup(a2_L1, a1, k*MR*sizeof(ctype), 0, 0, 7); \
+ lib_imt_copy(a1, a2_L1, k*MR*sizeof(ctype)); \
} \
/*ORIG TRSM*/ \
/* Compute the addresses of the next panels of A and B. */ \
/*a2 = a1;*/\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + rstep_a; \
- while(!idma1_done()){;} \
+ lib_imt_wait(); \
temp = a1_L1; \
a1_L1 = a2_L1; \
a2_L1 = temp; \
if(i == 0) \
{ \
- EdmaMgr_wait(edma_handle_b);\
+ lib_emt_wait(emt_handle_b);\
} \
/*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\
if ( i + thread_num_threads(thread) >= m_iter ) \
else \
{ \
/*Start next panel*/ \
- idma1_setup(a2_L1, a2 , k*MR*sizeof(ctype), 0, 0, 7); \
+ lib_imt_copy(a2, a2_L1, k*MR*sizeof(ctype)); \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
bli_auxinfo_set_next_b( a2, aux ); \
\
/* Handle interior and edge cases separately. */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_ker = lib_clock64(); \
+ } \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
b1_L1, \
a1_L1, \
alpha2_cast, \
- c11, cs_c, rs_c, \
+ c11, cs_c11, rs_c11, /*cs_c, rs_c,*/ \
&aux ); \
} \
else \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
alpha2_cast, \
- c11, rs_c, cs_c ); \
+ c11, rs_c11, cs_c11 /*rs_c, cs_c*/ ); \
+ } \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_ker = lib_clock64(); \
+ bli_profile_data_update(bli_trsm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_KER_LOOP_IND],\
+ (counter_end_ker-counter_start_ker), 2*k*m_cur*n_cur); \
} \
} \
\
a1 += rstep_a; \
- c11 += rstep_c; \
+ /*c11 += rstep_c;*/ \
+ c11 += rstep_c11; \
+ } \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_mr = lib_clock64(); \
+ bli_profile_data_update(bli_trsm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_IR_LOOP_IND], \
+ (counter_end_mr-counter_start_mr), 2*k*m*n_cur); \
} \
\
b1 += cstep_b; \
} \
\
+ /* circularly shift buffers */ \
+ cNewTemp = cNew0; \
+ cNew0 = cNew2; \
+ cNew2 = cNew1; \
+ cNew1 = cNewTemp; \
+ if(j != 0) /* wait for save c to complete; skip first iteration */ \
+ { \
+ lib_emt_wait(emt_handle_c1); \
+ } \
+ /* save updated c*/ \
+ if(rs_c==1) \
+ { \
+ if (cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
+ { \
+ lib_emt_copy2D2D(emt_handle_c1, cNew2, c1, m*sizeof(ctype), \
+ n_cur, cs_c11*sizeof(ctype), cs_c*sizeof(ctype)); \
+ } \
+ else \
+ { \
+ dim_t ii; \
+ ctype *ptr_source; \
+ ctype *ptr_dest; \
+ ptr_source = cNew2; \
+ ptr_dest = c1; \
+ for(ii = 0; ii < n_cur; ii++) \
+ { \
+ memcpy(ptr_dest, ptr_source, m*sizeof(ctype)); \
+ ptr_source += cs_c11; \
+ ptr_dest += cs_c; \
+ } \
+ } \
+ } \
+ else \
+ { \
+ if (rs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
+ { \
+ lib_emt_copy2D2D(emt_handle_c1, cNew2, c1, n_cur*sizeof(ctype), \
+ m, rs_c11*sizeof(ctype), rs_c*sizeof(ctype)); \
+ } \
+ else \
+ { \
+ dim_t ii; \
+ ctype *ptr_source; \
+ ctype *ptr_dest; \
+ ptr_source = cNew2; \
+ ptr_dest = c1; \
+ for(ii = 0; ii < m; ii++) \
+ { \
+ memcpy(ptr_dest, ptr_source, n_cur*sizeof(ctype)); \
+ ptr_source += rs_c11; \
+ ptr_dest += rs_c; \
+ } \
+ }\
+ }\
c1 += cstep_c; \
} \
-/* bli_mem_release( &c2_L2_mem ); \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_nr = lib_clock64(); \
+ bli_profile_data_update(bli_trsm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_JR_LOOP_IND], \
+ (counter_end_nr-counter_start_nr), 2*k*m*n); \
+ } \
+ bli_mem_release( &c2_L2_mem ); \
bli_mem_release( &c1_L2_mem ); \
bli_mem_release( &c0_L2_mem ); \
- */ \
+\
if((MKSTR(ch)=="c")==0) \
{\
- bli_mem_release( &a2_L1_mem ); \
- bli_mem_release( &a1_L1_mem ); \
- bli_mem_release( &b1_L1_mem ); \
+ bli_mem_release( &a2_L1_mem ); \
+ bli_mem_release( &a1_L1_mem ); \
+ bli_mem_release( &b1_L1_mem ); \
}\
- if ( edma_handle_b != NULL ) \
+ if ( emt_handle_b != NULL ) \
{ \
- bli_dma_channel_release(edma_handle_b, CSL_chipReadDNUM()); \
- edma_handle_b = NULL; \
+ bli_dma_channel_release(emt_handle_b, lib_get_coreID()); \
+ emt_handle_b = NULL; \
} \
-/* if ( edma_handle_c0 != NULL ) \
+ if ( emt_handle_c0 != NULL ) \
{ \
- bli_dma_channel_release(edma_handle_c0, CSL_chipReadDNUM()); \
- edma_handle_c0 = NULL; \
+ bli_dma_channel_release(emt_handle_c0, lib_get_coreID()); \
+ emt_handle_c0 = NULL; \
} \
- if ( edma_handle_c1 != NULL ) \
+ if ( emt_handle_c1 != NULL ) \
{ \
- EdmaMgr_wait(edma_handle_c1); \
- bli_dma_channel_release(edma_handle_c1, CSL_chipReadDNUM()); \
- edma_handle_c1 = NULL; \
- } \
-*/ \
+ lib_emt_wait(emt_handle_c1); \
+ bli_dma_channel_release(emt_handle_c1, lib_get_coreID()); \
+ emt_handle_c1 = NULL; \
+ }\
+ \
}
INSERT_GENTFUNC_BASIC2( trsm_ru_ker_var2, gemmtrsm_ukr_t, gemm_ukr_t )
index dd138b3ff9bf8319058d7b028dbf8d5a566ca202..477d98f8d5285e57fa630d9fe6e5bccca419a033 100644 (file)
packm_thrinfo_t* ipackm,
trsm_thrinfo_t* sub_trsm )
{
- trsm_thrinfo_t* thread = ( trsm_thrinfo_t* ) bli_malloc( sizeof( trsm_thrinfo_t ) );
+ trsm_thrinfo_t* thread = ( trsm_thrinfo_t* ) bli_malloc_scratch( sizeof( trsm_thrinfo_t ) );
bli_setup_trsm_thrinfo_node( thread, ocomm, ocomm_id,
icomm, icomm_id,
n_way, work_id,
bli_packm_thrinfo_free( thread->opackm );
bli_packm_thrinfo_free( thread->ipackm );
bli_trsm_thrinfo_free( thread->sub_trsm );
- bli_free( thread );
+ bli_free_scratch( thread );
return;
}
{
for( int i = 0; i < num; i++)
bli_trsm_thrinfo_free( threads[i] );
- bli_free( threads );
+ bli_free_scratch( threads );
}
trsm_thrinfo_t** bli_create_trsm_thrinfo_paths( bool_t right_sided )
dim_t ir_nt = 1;
- trsm_thrinfo_t** paths = (trsm_thrinfo_t**) bli_malloc( global_num_threads * sizeof( trsm_thrinfo_t* ) );
+ trsm_thrinfo_t** paths = (trsm_thrinfo_t**) bli_malloc_scratch( global_num_threads * sizeof( trsm_thrinfo_t* ) );
thread_comm_t* global_comm = bli_create_communicator( global_num_threads );
for( int a = 0; a < jc_way; a++ )
index 80bf770d10d31a04f790af06bd5bff196bbd0797..bf521d0267571b90bc4aef50c00e1023bb30ca27 100755 (executable)
*/
#include "blis.h"
+
+#if defined(BLIS_ENABLE_C66X_OPENCL)
+/* printf-style output shim with an fprintf()-shaped signature.
+   _fp     : not used by this function; output always goes to stdout.
+   _format : printf-style format string, followed by its arguments.
+   Returns 0 unconditionally (not the number of characters written). */
+int ti_printf(FILE* _fp, const char *_format, ...)
+{
+ va_list argptr;
+ (void)_fp; /* stream argument is intentionally ignored */
+ va_start(argptr,_format);
+ /* A va_list has to be forwarded through vprintf(); passing it to printf()
+    as if it were the first variadic argument is undefined behavior and
+    prints garbage for every conversion in _format. */
+ vprintf(_format, argptr);
+ va_end(argptr);
+ return 0;
+}
+#endif
+
#if defined(BLIS_ENABLE_C66X_BUILD) && defined(BLIS_ENABLE_C66X_MEM_POOLS) && defined(BLIS_ENABLE_C66X_EDMA)
//#define BLIS_EDMA_DEBUG
//#define BLIS_ENABLE_CYCLE_COUNT
static dma_t dma_pools[BLIS_MAX_NUM_THREADS]; // One pool for each core.
-static EdmaMgr_Handle pool_coreX_edma_handles[BLIS_MAX_NUM_THREADS][BLIS_C66X_EDMA_MAX_NUM_CHANNELS];
+static lib_emt_Handle pool_coreX_emt_handles[BLIS_MAX_NUM_THREADS][BLIS_C66X_EDMA_MAX_NUM_CHANNELS];
//for initilization during encoding the dma control leaf
dmam_t* bli_dmam_cntl_obj_create( impl_t impl_type,
_Pragma( "omp parallel num_threads(BLIS_MAX_NUM_THREADS)" )
{
gint_t status; //int32_t
-#ifdef BLIS_ENABLE_C66X_OPENCL
- status = EdmaMgr_SUCCESS;
-#else
- status = EdmaMgr_init(CSL_chipReadDNUM(), NULL);
-#endif
- if(status != EdmaMgr_SUCCESS)
+ status = lib_emt_init();
+
+ if(status != LIB_EMT_SUCCESS)
{
- printf("Core %d DMA not initialized\n", CSL_chipReadDNUM());
+ printf("Core %d DMA not initialized\n", lib_get_coreID());
exit(1);
}
- bli_dma_init_pool(status, BLIS_C66X_EDMA_MAX_NUM_CHANNELS, pool_coreX_edma_handles[CSL_chipReadDNUM()], &dma_pools[CSL_chipReadDNUM()]);
+ bli_dma_init_pool(status, BLIS_C66X_EDMA_MAX_NUM_CHANNELS, pool_coreX_emt_handles[lib_get_coreID()], &dma_pools[lib_get_coreID()]);
}
}
-void bli_dma_init_pool(gint_t edma_status,
+void bli_dma_init_pool(gint_t emt_status,
gint_t num_channels,
- EdmaMgr_Handle* pool_edma_handles,
+ lib_emt_Handle* pool_emt_handles,
dma_t* dma_pool)
{
dim_t i;
- EdmaMgr_Handle temp_handle;
- if(edma_status != EdmaMgr_SUCCESS)
+ lib_emt_Handle temp_handle;
+ if(emt_status != LIB_EMT_SUCCESS)
{
- dma_pool->edma_status = FALSE;
- printf("Core %d DMA not initialized\n", CSL_chipReadDNUM());
+ dma_pool->emt_status = FALSE;
+ printf("Core %d DMA not initialized\n", lib_get_coreID());
return;
}
- dma_pool->edma_status = TRUE;
+ dma_pool->emt_status = TRUE;
for(i = 0; i < num_channels; i++)
{
- //pool_edma_handles[i] = EdmaMgr_alloc(1);
- temp_handle = EdmaMgr_alloc(1);
+ //pool_emt_handles[i] = lib_emt_alloc(1);
+ temp_handle = lib_emt_alloc(1);
if(temp_handle == NULL)
{
- printf("Failed to alloc edma handle CoreID %d\n", CSL_chipReadDNUM());
+ printf("Failed to alloc edma handle CoreID %d\n", lib_get_coreID());
return;
}
- pool_edma_handles[i] = temp_handle;
+ pool_emt_handles[i] = temp_handle;
}
dma_pool->num_channels = num_channels;
- dma_pool->edma_handle = pool_edma_handles;
+ dma_pool->emt_handle = pool_emt_handles;
dma_pool->top_index = num_channels-1;
#ifdef BLIS_EDMA_DEBUG
- printf("Core ID %d, Dma pool top index %d, num channels = %d\n",CSL_chipReadDNUM(), dma_pool->top_index, dma_pool->num_channels);
+ printf("Core ID %d, Dma pool top index %d, num channels = %d\n",lib_get_coreID(), dma_pool->top_index, dma_pool->num_channels);
#endif
}
-void bli_dma_channel_acquire(EdmaMgr_Handle* edma_handle, dim_t core_id)
+void bli_dma_channel_acquire(lib_emt_Handle* emt_handle, dim_t core_id)
{
dma_t* dma_pool;
- EdmaMgr_Handle* edma_handle_ptrs;
+ lib_emt_Handle* emt_handle_ptrs;
dim_t i;
dma_pool = &dma_pools[core_id];
}
// Get all the handles of DMA pool
- edma_handle_ptrs = dma_pool->edma_handle;
+ emt_handle_ptrs = dma_pool->emt_handle;
//Get index of the top most available handle
i = dma_pool->top_index;
//Get edma handle
- *edma_handle = edma_handle_ptrs[i];
+ *emt_handle = emt_handle_ptrs[i];
#ifdef BLIS_EDMA_DEBUG
- printf("Acquiring DMA handle, top index %d edma handle %x %x\n", i, *edma_handle, edma_handle_ptrs[i]);
+ printf("Acquiring DMA handle, top index %d edma handle %x %x\n", i, *emt_handle, emt_handle_ptrs[i]);
#endif
// Decrement the index so that it now points to the next available handle.
dma_pool->top_index--;
}
-void bli_dma_channel_release(EdmaMgr_Handle edma_handle, dim_t core_id)
+void bli_dma_channel_release(lib_emt_Handle emt_handle, dim_t core_id)
{
dma_t* dma_pool;
- EdmaMgr_Handle* edma_handle_ptrs;
+ lib_emt_Handle* emt_handle_ptrs;
dim_t i;
- if(edma_handle == NULL)
+ if(emt_handle == NULL)
{
printf("nothing to release\n");
return;
dma_pool = &dma_pools[core_id];
// Get all the handles of DMA pool
- edma_handle_ptrs = dma_pool->edma_handle;
+ emt_handle_ptrs = dma_pool->emt_handle;
// Increment the index so that it now points to the next available handle.
dma_pool->top_index++;
//Place the edma handle back onto the top of the dma pool.
// This is done so that if handles were release not in the same order
// that they were acquired, the next time a handle is acquired it gets the latest released one.
- edma_handle_ptrs[i] = edma_handle;
+ emt_handle_ptrs[i] = emt_handle;
#ifdef BLIS_EDMA_DEBUG
- printf("Released DMA handle, top index %d edma handle %x \n", i, edma_handle_ptrs[i]);
+ printf("Released DMA handle, top index %d edma handle %x \n", i, emt_handle_ptrs[i]);
#endif
}
if ( bli_obj_is_zeros( *a ) )
{
- //printf("zeros\n");
- //bli_obj_release_dma( p, cntl );
bli_obj_alias_for_dma( *a, *p );
return;
}
bli_obj_set_buffer( buf, *p );
//If definition does not have an EDMA channel, then acquire a channel from the pool
- if(p->edma_handle == NULL)
+ if(p->emt_handle == NULL)
{
- bli_dma_channel_acquire(&(p->edma_handle), CSL_chipReadDNUM());
- if(p->edma_handle == NULL)
- printf("DMAM_INIT Failed to alloc edma handle CoreID %d %x\n", CSL_chipReadDNUM(), p->edma_handle);
+ bli_dma_channel_acquire(&(p->emt_handle), lib_get_coreID());
+ if(p->emt_handle == NULL)
+ printf("DMAM_INIT Failed to alloc edma handle CoreID %d %x\n", lib_get_coreID(), p->emt_handle);
}
}
if(thread->work_id == 0)
{
// ld_source is already in terms of bytes
- if(ld_source < BLIS_C66X_MAXDMASTRIDE && p->edma_handle != NULL)
+ if(ld_source < BLIS_C66X_MAXDMASTRIDE && p->emt_handle != NULL)
{
int status = -100;
#ifdef BLIS_ENABLE_CYCLE_COUNT
counter_start = TSCL;
#endif
// The destination object contains the EDMA handle
- status = EdmaMgr_copy2D2DSep ( p->edma_handle,
+ status = lib_emt_copy2D2D ( p->emt_handle,
ptr_source,
ptr_dest,
elem_move,
#endif
- if(status != EdmaMgr_SUCCESS)
+ if(status != LIB_EMT_SUCCESS)
printf("DMA Transfer Error %d \n",status);
}
else // cannot use DMA since stride is only 16 bit signed
// Wait only if current thread work ID is zero
if(thread->work_id == 0)
{
- EdmaMgr_wait(p->edma_handle);
+ lib_emt_wait(p->emt_handle);
}
}
-void bli_obj_release_edma_handle( obj_t* p)
+void bli_obj_release_emt_handle( obj_t* p)
{
- if ( p->edma_handle != NULL )
+ if ( p->emt_handle != NULL )
{
- bli_dma_channel_release(p->edma_handle, CSL_chipReadDNUM());
- p->edma_handle = NULL;
+ bli_dma_channel_release(p->emt_handle, lib_get_coreID());
+ p->emt_handle = NULL;
}
}
{
dim_t i;
dma_t* dma_pool;
- EdmaMgr_Handle* edma_handle;
+ lib_emt_Handle* emt_handle;
//Create omp threads
- dma_pool = &dma_pools[CSL_chipReadDNUM()];
- edma_handle = dma_pool->edma_handle;
+ dma_pool = &dma_pools[lib_get_coreID()];
+ emt_handle = dma_pool->emt_handle;
for(i = 0; i < BLIS_C66X_EDMA_MAX_NUM_CHANNELS; i ++)
{
- if( edma_handle[i] != NULL)
+ if( emt_handle[i] != NULL)
{
- if(EdmaMgr_free( edma_handle[i] ) == EdmaMgr_ERROR_FREE)
+ if(lib_emt_free( emt_handle[i] ) == LIB_EMT_ERROR_FREE)
{
- printf("ERROR: edma_free\n");
+ printf("ERROR: emt_free\n");
}
else
- edma_handle[i] = NULL;
+ emt_handle[i] = NULL;
}
}
}
index 425879754d9f3022b3824da5b15907baa38d6088..0e4928ce5a44fd5e93df6f69c32dfe12e0f4d8dd 100644 (file)
#ifndef BLIS_DMA_H
#define BLIS_DMA_H
+
+int ti_printf(FILE *_fp, const char *_format, ...);
+int ti_sprintf(char *str, const char *_format, ...);
+
+
/*
* EDMA Pool
*/
struct dma_s
{
- bool_t edma_status;
- void** edma_handle;
+ bool_t emt_status;
+ void** emt_handle;
dim_t num_channels;
dim_t top_index;
};
\
( &((obj).dma_mem) )
-#define bli_obj_edma_handle( obj ) \
+#define bli_obj_emt_handle( obj ) \
\
- ( &((obj).edma_handle) )
+ ( &((obj).emt_handle) )
-#define bli_edma_handle_set_NULL(obj) \
+#define bli_emt_handle_set_NULL(obj) \
{ \
- (obj).edma_handle = NULL; \
+ (obj).emt_handle = NULL; \
}
#define bli_obj_alias_with_dma(a, b) \
{ \
bli_obj_alias_to( a, b ); \
- &((b).edma_handle) = &((a).edma_handle); \
+ &((b).emt_handle) = &((a).emt_handle); \
}
#define bli_obj_init_dma( obj_p ) \
\
bli_mem_set_buffer( NULL, pack_mem ); \
bli_mem_set_buffer( NULL, dma_mem ); \
- bli_edma_handle_set_NULL( *obj_p ); \
+ bli_emt_handle_set_NULL( *obj_p ); \
}
//#define bli_obj_init_dma( obj_p ) \
//{ \
// Functions to initialize the EDMA and EDMA channels
void bli_dma_init (void);
-void bli_dma_init_pool(gint_t edma_status,
+void bli_dma_init_pool(gint_t emt_status,
gint_t num_channels,
- EdmaMgr_Handle* pool_edma_handles,
+ lib_emt_Handle* pool_emt_handles,
dma_t * dma_pool);
-void bli_dma_channel_acquire(EdmaMgr_Handle* edma_handle, dim_t core_id);
-void bli_dma_channel_release(EdmaMgr_Handle edma_handle, dim_t core_id);
+void bli_dma_channel_acquire(lib_emt_Handle* emt_handle, dim_t core_id);
+void bli_dma_channel_release(lib_emt_Handle emt_handle, dim_t core_id);
void bli_dmam_wait(obj_t* p, dmam_t* cntl, dmam_thrinfo_t* thread);
-void bli_obj_release_edma_handle(obj_t* p);
+void bli_obj_release_emt_handle(obj_t* p);
void bli_dma_finalize(void);
index 8638a23eae933ea7296e199634157c8ec2c072ae..d61f8e8d1e487a358b8629a5c5889a6d6f3da1a7 100644 (file)
{
fprintf( stderr, "libblis: Aborting.\n" );
//raise( SIGABRT );
-#ifdef BLIS_ENABLE_C66X_OPENCL
- exit(1);
-#else
abort();
-#endif
}
void bli_print_msg( char* str, char* file, guint_t line )
{
-#ifdef BLIS_ENABLE_C66X_OPENCL
+ //fprintf( stderr, "\n" );
+ //fprintf( stderr, "libblis: %s (line %lu):\n", file, ( long unsigned int )line );
+ //fprintf( stderr, "libblis: %s\n", str );
printf( "\n" );
printf( "libblis: %s (line %lu):\n", file, ( long unsigned int )line );
printf( "libblis: %s\n", str );
-
-#else
- fprintf( stderr, "\n" );
- fprintf( stderr, "libblis: %s (line %lu):\n", file, ( long unsigned int )line );
- fprintf( stderr, "libblis: %s\n", str );
+#ifndef BLIS_ENABLE_C66X_OPENCL
fflush( stderr );
#endif
}
index 9e68045ea64b4caefbe1ada53a77f2f27938e603..c9cb628cebb6d48bbf56369fda31da09f5f9f41f 100644 (file)
#include <CL/cl_ext.h>
#endif
+/* This function is used to allocate memory during BLIS initialization.
+ Allocated memory will be freed when bli_finalize() is called */
void* bli_malloc( siz_t size )
{
void* p = NULL;
return p;
}
+/* Allocate memory that is needed only for the duration of one kernel
+   computation; release it with bli_free_scratch().
+   size : number of bytes requested. A request of 0 returns NULL.
+   Returns a non-NULL pointer on success; calls bli_abort() if the
+   selected allocator fails.
+   Allocator selection (first match wins):
+     - BLIS_ENABLE_TI_ARM_OPENCL : __malloc_ddr(), serialized by an
+                                   OpenMP critical section.
+     - BLIS_ENABLE_C66X_BUILD    : LibArch scratch heap allocator
+                                   (lib_smem_salloc), on the heap returned
+                                   by blasGetMemHandle().
+     - BLIS_HEAP_ADDR_ALIGN_SIZE == 1 : plain malloc().
+     - _WIN32                    : _aligned_malloc().
+     - otherwise                 : posix_memalign().
+   The C66x case is tested before the "alignment == 1" case so that the
+   branch order agrees with bli_free_scratch(), which performs no free on
+   C66x; otherwise a C66x build with BLIS_HEAP_ADDR_ALIGN_SIZE == 1 would
+   malloc() blocks that are never released. */
+void* bli_malloc_scratch( siz_t size )
+{
+ void* p = NULL;
+#if !defined(BLIS_ENABLE_TI_ARM_OPENCL) && !defined(_WIN32) && !defined(BLIS_ENABLE_C66X_BUILD) && (BLIS_HEAP_ADDR_ALIGN_SIZE != 1)
+ int r_val; /* only the posix_memalign() branch below needs this */
+#endif
+
+ if ( size == 0 ) return NULL;
+
+#if defined(BLIS_ENABLE_TI_ARM_OPENCL)
+ _Pragma( "omp critical (bli_malloc_critical)" )
+ {
+ p = __malloc_ddr( ( size_t )size );
+ }
+#elif defined (BLIS_ENABLE_C66X_BUILD)
+ /* Use LibArch slow scratch memory allocator */
+ p = lib_smem_salloc(blasGetMemHandle(), size, 1);
+#elif BLIS_HEAP_ADDR_ALIGN_SIZE == 1
+ p = malloc( ( size_t )size );
+#elif defined(_WIN32)
+ p = _aligned_malloc( ( size_t )size,
+ ( size_t )BLIS_HEAP_ADDR_ALIGN_SIZE );
+#else
+ r_val = posix_memalign( &p,
+ ( size_t )BLIS_HEAP_ADDR_ALIGN_SIZE,
+ ( size_t )size );
+
+ if ( r_val != 0 ) bli_abort();
+#endif
+
+ if ( p == NULL ) bli_abort();
+
+ return p;
+}
+
#ifdef BLIS_ENABLE_C66X_BUILD
+/* This function is used to allocate memory for kernel computation
+ with required alignment. */
void* bli_memalign(siz_t alignment, siz_t size )
{
void* p = NULL;
return p;
}
+
+/* This function is used to allocate memory for kernel computation
+ with required alignment using LibArch scratch heap allocator.
+ A memory block will be obtained from a scratch heap that is
+ initialized during BLIS initialization.
+ Returns the pointer produced by lib_smem_salloc(); calls bli_abort()
+ when that pointer is NULL.
+ NOTE(review): the alignment argument is never read in this body; the
+ allocator call is always given BLIS_CACHE_LINE_SIZE as its last
+ argument. Confirm whether any caller expects a different alignment.
+ NOTE(review): unlike bli_malloc_scratch(), a size of 0 is not
+ special-cased here and is forwarded to the allocator as is. */
+void* bli_malloc_scratch_align(siz_t alignment, siz_t size )
+{
+ void* p = NULL;
+
+ p = lib_smem_salloc(blasGetMemHandle(), size, BLIS_CACHE_LINE_SIZE);
+
+ if ( p == NULL ) bli_abort(); /* allocation failure is fatal */
+
+ return p;
+}
+
#endif
+/* This function is used to free the memory allocated by bli_malloc. */
void bli_free( void* p )
{
#if defined(BLIS_ENABLE_TI_ARM_OPENCL)
#endif
}
+/* This function is used to free the memory allocated by bli_malloc_scratch.
+ p : pointer to release.
+ The release routine is chosen at compile time (first match wins):
+ __free_ddr() inside an OpenMP critical section for
+ BLIS_ENABLE_TI_ARM_OPENCL, nothing at all for BLIS_ENABLE_C66X_BUILD,
+ free() when BLIS_HEAP_ADDR_ALIGN_SIZE == 1 or the target is not
+ _WIN32, and _aligned_free() in the remaining case. */
+void bli_free_scratch( void* p )
+{
+#if defined(BLIS_ENABLE_TI_ARM_OPENCL)
+ _Pragma( "omp critical (bli_malloc_critical)" ) /* same critical-section name as the allocation side */
+ {
+ __free_ddr( p );
+ }
+#elif defined (BLIS_ENABLE_C66X_BUILD)
+ /* for DSP implementation, freeing scratch heap is not needed. */
+
+#elif BLIS_HEAP_ADDR_ALIGN_SIZE == 1 || !defined(_WIN32)
+ free( p ); /* pairs with the malloc() / posix_memalign() allocation paths */
+#else
+ _aligned_free( p ); /* pairs with _aligned_malloc() */
+#endif
+}
index afc3e5d0bfe792f4fc8b813c6c14a76cedaedfc8..c9723d7ffbcbbd94080d9a99497c7e4d32c33d2d 100644 (file)
*/
void* bli_malloc( siz_t size );
+void* bli_malloc_scratch( siz_t size );
#ifdef BLIS_ENABLE_C66X_BUILD
void* bli_memalign(siz_t alignment, siz_t size );
+void* bli_malloc_scratch_align(siz_t alignment, siz_t size );
#endif
void bli_free( void* p );
+void bli_free_scratch( void* p );
index 4c3716248964432f606eea848e87ffa16f63df97..430321744d4b6f45fec03dd0f15654c10440298e 100644 (file)
// the memory.
#ifdef BLIS_ENABLE_C66X_MEM_POOLS
-#ifdef BLIS_ENABLE_C66X_OPENCL
static pool_t pools[12];
//Main Memory Pools
static void* pool_mn_blk_ptrs_L3[ BLIS_NUM_MC_X_NC_BLOCKS_L3 ];
extern char *pool_mn_mem_L3;
-
-#else //CCS
-static pool_t pools[12];
-
-//Main Memory Pools
-static void* pool_mk_blk_ptrs[ BLIS_NUM_MC_X_KC_BLOCKS ];
-#pragma DATA_SECTION( pool_mk_mem, ".myDDR3");
-static char pool_mk_mem[ BLIS_MK_POOL_SIZE ];
-
-static void* pool_kn_blk_ptrs[ BLIS_NUM_KC_X_NC_BLOCKS ];
-#pragma DATA_SECTION( pool_mk_mem, ".myDDR3");
-static char pool_kn_mem[ BLIS_KN_POOL_SIZE ];
-
-static void* pool_mn_blk_ptrs[ BLIS_NUM_MC_X_NC_BLOCKS ];
-#pragma DATA_SECTION( pool_mk_mem, ".myDDR3");
-static char pool_mn_mem[ BLIS_MN_POOL_SIZE ];
-
-//L1
-static void* pool_mk_blk_ptrs_L1[ BLIS_NUM_MR_X_KC_BLOCKS_L1 ];
-#pragma DATA_SECTION( pool_mk_mem_L1, ".myL1");
-#pragma DATA_ALIGN(pool_mk_mem_L1, BLIS_CACHE_LINE_SIZE);
-static char pool_mk_mem_L1[ BLIS_MK_POOL_SIZE_L1 ];
-
-static void* pool_kn_blk_ptrs_L1[ BLIS_NUM_KC_X_NR_BLOCKS_L1 ];
-#pragma DATA_SECTION( pool_kn_mem_L1, ".myL1");
-#pragma DATA_ALIGN(pool_kn_mem_L1, BLIS_CACHE_LINE_SIZE);
-static char pool_kn_mem_L1[ BLIS_KN_POOL_SIZE_L1 ];
-
-static void* pool_mn_blk_ptrs_L1[ BLIS_NUM_MR_X_NR_BLOCKS_L1 ];
-#pragma DATA_SECTION( pool_mn_mem_L1, ".myL1");
-static char pool_mn_mem_L1[ BLIS_MN_POOL_SIZE_L1 ];
-
-//
-//L2 Pools
-//
-static void* pool_mk_blk_ptrs_L2[ BLIS_NUM_MC_X_KC_BLOCKS_L2 ];
-#pragma DATA_SECTION( pool_mk_mem_L2, ".myL2");
-#pragma DATA_ALIGN(pool_mk_mem_L2, BLIS_CACHE_LINE_SIZE);
-static char pool_mk_mem_L2[ BLIS_MK_POOL_SIZE_L2 ];
-
-static void* pool_kn_blk_ptrs_L2[ BLIS_NUM_KC_X_NC_BLOCKS_L2 ];
-#pragma DATA_SECTION( pool_kn_mem_L2, ".myL2");
-#pragma DATA_ALIGN(pool_kn_mem_L2, BLIS_CACHE_LINE_SIZE);
-static char pool_kn_mem_L2[ BLIS_KN_POOL_SIZE_L2 ];
-
-static void* pool_mn_blk_ptrs_L2[ BLIS_NUM_MC_X_NC_BLOCKS_L2 ];
-#pragma DATA_SECTION( pool_mn_mem_L2, ".myL2");
-#pragma DATA_ALIGN(pool_mn_mem_L2, BLIS_CACHE_LINE_SIZE);
-static char pool_mn_mem_L2[ BLIS_MN_POOL_SIZE_L2 ];
-
-//
-//L3 Pools
-//
-static void* pool_mk_blk_ptrs_L3[ BLIS_NUM_MC_X_KC_BLOCKS_L3 ];
-#pragma DATA_SECTION( pool_mk_mem_L3, ".myL3");
-#pragma DATA_ALIGN(pool_mk_mem_L3, BLIS_CACHE_LINE_SIZE);
-static char pool_mk_mem_L3[ BLIS_MK_POOL_SIZE_L3 ];
-
-static void* pool_kn_blk_ptrs_L3[ BLIS_NUM_KC_X_NC_BLOCKS_L3 ];
-#pragma DATA_SECTION( pool_kn_mem_L3, ".myL3");
-#pragma DATA_ALIGN(pool_kn_mem_L3, BLIS_CACHE_LINE_SIZE);
-static char pool_kn_mem_L3[ BLIS_KN_POOL_SIZE_L3 ];
-
-static void* pool_mn_blk_ptrs_L3[ BLIS_NUM_MC_X_NC_BLOCKS_L3 ];
-#pragma DATA_SECTION( pool_mn_mem_L3, ".myL3");
-#pragma DATA_ALIGN(pool_mn_mem_L3, BLIS_CACHE_LINE_SIZE);
-static char pool_mn_mem_L3[ BLIS_MN_POOL_SIZE_L3 ];
-#endif
#else
static pool_t pools[3];
if (bli_buf_type_is_shared(buf_type))
core_id = 0;
else
- core_id = CSL_chipReadDNUM ();
+ core_id = omp_get_thread_num ();
#endif
bli_mem_set_buffer( block, mem );
bli_mem_set_buf_type( buf_type, mem );
bli_mem_set_size( req_size, mem );
+
+ //printf("Acquire: block %x \n", block);
}
else
{
pool = &pools[ pool_index ];
#ifdef BLIS_ENABLE_C66X_MEM_DEBUG
- printf("Acquire: buf_type %x pool size %d req size %d ", buf_type, bli_pool_block_size( pool ), req_size);
+ printf("Acquire: core_id %d buf_type %x block size size %d req size %d ", core_id, buf_type, bli_pool_block_size( pool ), req_size);
#endif
// Unconditionally perform error checking on the memory pool.
if (bli_buf_type_is_shared(buf_type))
core_id = 0;
else
- core_id = CSL_chipReadDNUM ();
+ core_id = omp_get_thread_num ();
#endif
if ( buf_type == BLIS_BUFFER_FOR_GEN_USE )
#ifdef BLIS_ENABLE_C66X_MEM_DEBUG
printf("L2 Cache\n");
- printf("MK: Pool Size %d, Num Blocks %d, Block size %d\n",BLIS_MK_POOL_SIZE_L2, BLIS_NUM_MC_X_KC_BLOCKS_L2, BLIS_MK_BLOCK_SIZE);
- printf("KN: Pool Size %d, Num Blocks %d, Block size %d\n",BLIS_KN_POOL_SIZE_L2, BLIS_NUM_KC_X_NC_BLOCKS_L2, BLIS_KN_BLOCK_SIZE);
- printf("MN: Pool Size %d, Num Blocks %d, Block size %d\n",BLIS_MN_POOL_SIZE_L2, BLIS_NUM_MC_X_NC_BLOCKS_L2, BLIS_MN_BLOCK_SIZE);
-
- printf("BLIS_POOL_MC_S: %d BLIS_POOL_KC_S: %d\n", BLIS_POOL_MC_S, BLIS_POOL_KC_S);
-
- printf("max :%d\n", ( (BLIS_POOL_MC_S + BLIS_POOL_KC_S)^2));
-
- printf("MK: Block Size %d, %d, %d, %d, %d, %d, %d, %d\n",BLIS_MK_BLOCK_SIZE_S ,BLIS_MK_BLOCK_SIZE_D ,BLIS_MK_BLOCK_SIZE_C,
+ printf("MK: Pool Size %d, Num Blocks %d, Block size %d\n",BLIS_MK_POOL_SIZE_L2, BLIS_NUM_MC_X_KC_BLOCKS_L2, BLIS_MK_BLOCK_SIZE);
+ printf("KN: Pool Size %d, Num Blocks %d, Block size %d\n",BLIS_KN_POOL_SIZE_L2, BLIS_NUM_KC_X_NC_BLOCKS_L2, BLIS_KN_BLOCK_SIZE);
+ printf("MN: Pool Size %d, Num Blocks %d, Block size %d\n",BLIS_MN_POOL_SIZE_L2, BLIS_NUM_MC_X_NC_BLOCKS_L2, BLIS_MN_BLOCK_SIZE);
+ printf("MK: Block Size %d, %d, %d, %d, %d, %d, %d, %d\n",BLIS_MK_BLOCK_SIZE_S ,BLIS_MK_BLOCK_SIZE_D ,BLIS_MK_BLOCK_SIZE_C,
BLIS_MK_BLOCK_SIZE_Z,BLIS_MK_BLOCK_SIZE_4M_C,BLIS_MK_BLOCK_SIZE_4M_Z,BLIS_MK_BLOCK_SIZE_3M_C,BLIS_MK_BLOCK_SIZE_3M_Z );
#endif
//L2 Cache
printf("MK: Pool Size %d, Num Blocks %d, Block size %d\n",BLIS_MK_POOL_SIZE_L3, BLIS_NUM_MC_X_KC_BLOCKS_L3, BLIS_MK_BLOCK_SIZE);
printf("KN: Pool Size %d, Num Blocks %d, Block size %d\n",BLIS_KN_POOL_SIZE_L3, BLIS_NUM_KC_X_NC_BLOCKS_L3, BLIS_KN_BLOCK_SIZE);
printf("MN: Pool Size %d, Num Blocks %d, Block size %d\n",BLIS_MN_POOL_SIZE_L3, BLIS_NUM_MC_X_NC_BLOCKS_L3, BLIS_MN_BLOCK_SIZE);
+ printf("BLIS_POOL_KC_S: %d BLIS_POOL_NC_S: %d\n", BLIS_POOL_KC_S, BLIS_POOL_NC_S);
printf("KN: Block Size %d, %d, %d, %d, %d, %d, %d, %d\n",BLIS_KN_BLOCK_SIZE_S ,BLIS_KN_BLOCK_SIZE_D ,BLIS_KN_BLOCK_SIZE_C,
BLIS_KN_BLOCK_SIZE_Z,BLIS_KN_BLOCK_SIZE_4M_C,BLIS_KN_BLOCK_SIZE_4M_Z,BLIS_KN_BLOCK_SIZE_3M_C,BLIS_KN_BLOCK_SIZE_3M_Z );
index a3359de7d905cb9f6b49ba7aaaf579e8392713ce..13922c8297083c0a70b7abf4033124457ff8535c 100644 (file)
void bli_obj_print( char* label, obj_t* obj )
{
-#ifndef BLIS_ENABLE_C66X_OPENCL
FILE* file = stdout;
-#endif
mem_t* pack_mem = bli_obj_pack_mem( *obj );
//mem_t* cast_mem = bli_obj_cast_mem( *obj );
if ( bli_error_checking_is_enabled() )
bli_obj_print_check( label, obj );
-#ifdef BLIS_ENABLE_C66X_OPENCL
- printf( "\n" );
- printf( "%s\n", label );
- printf( "\n" );
-
- printf( " m x n %lu x %lu\n", ( unsigned long int )bli_obj_length( *obj ),
- ( unsigned long int )bli_obj_width( *obj ) );
- printf( "\n" );
-
- printf( " offm, offn %lu, %lu\n", ( unsigned long int )bli_obj_row_offset( *obj ),
- ( unsigned long int )bli_obj_col_offset( *obj ) );
- printf( " diagoff %ld\n", ( signed long int )bli_obj_diag_offset( *obj ) );
- printf( "\n" );
- printf( " buf %p\n", ( void* )bli_obj_buffer( *obj ) );
- printf( " elem size %lu\n", ( unsigned long int )bli_obj_elem_size( *obj ) );
- printf( " rs, cs %ld, %ld\n", ( signed long int )bli_obj_row_stride( *obj ),
- ( signed long int )bli_obj_col_stride( *obj ) );
- printf( " pack_mem \n" );
- printf( " - buf %p\n", ( void* )bli_mem_buffer( pack_mem ) );
- printf( " - buf_type %lu\n", ( unsigned long int )bli_mem_buf_type( pack_mem ) );
- printf( " - size %lu\n", ( unsigned long int )bli_mem_size( pack_mem ) );
- printf( " m_padded %lu\n", ( unsigned long int )bli_obj_padded_length( *obj ) );
- printf( " n_padded %lu\n", ( unsigned long int )bli_obj_padded_width( *obj ) );
- printf( " ps %lu\n", ( unsigned long int )bli_obj_panel_stride( *obj ) );
- printf( " pd %lu\n", ( unsigned long int )bli_obj_panel_dim( *obj ) );
- printf( " m_panel %lu\n", ( unsigned long int )bli_obj_panel_length( *obj ) );
- printf( " n_panel %lu\n", ( unsigned long int )bli_obj_panel_width( *obj ) );
- printf( "\n" );
-
- printf( " info %lX\n", ( unsigned long int )(*obj).info );
- printf( " - is complex %lu\n", ( unsigned long int )bli_obj_is_complex( *obj ) );
- printf( " - is d. prec %lu\n", ( unsigned long int )bli_obj_is_double_precision( *obj ) );
- printf( " - datatype %lu\n", ( unsigned long int )bli_obj_datatype( *obj ) );
- printf( " - target dt %lu\n", ( unsigned long int )bli_obj_target_datatype( *obj ) );
- printf( " - exec dt %lu\n", ( unsigned long int )bli_obj_execution_datatype( *obj ) );
- printf( " - has trans %lu\n", ( unsigned long int )bli_obj_has_trans( *obj ) );
- printf( " - has conj %lu\n", ( unsigned long int )bli_obj_has_conj( *obj ) );
- printf( " - unit diag? %lu\n", ( unsigned long int )bli_obj_has_unit_diag( *obj ) );
- printf( " - struc type %lu\n", ( unsigned long int )bli_obj_struc( *obj ) >> BLIS_STRUC_SHIFT );
- printf( " - uplo type %lu\n", ( unsigned long int )bli_obj_uplo( *obj ) >> BLIS_UPLO_SHIFT );
- printf( " - is upper %lu\n", ( unsigned long int )bli_obj_is_upper( *obj ) );
- printf( " - is lower %lu\n", ( unsigned long int )bli_obj_is_lower( *obj ) );
- printf( " - is dense %lu\n", ( unsigned long int )bli_obj_is_dense( *obj ) );
- printf( " - pack schema %lu\n", ( unsigned long int )bli_obj_pack_schema( *obj ) >> BLIS_PACK_SCHEMA_SHIFT );
- printf( " - packinv diag? %lu\n", ( unsigned long int )bli_obj_has_inverted_diag( *obj ) );
- printf( " - pack ordifup %lu\n", ( unsigned long int )bli_obj_is_pack_rev_if_upper( *obj ) );
- printf( " - pack ordiflo %lu\n", ( unsigned long int )bli_obj_is_pack_rev_if_lower( *obj ) );
- printf( " - packbuf type %lu\n", ( unsigned long int )bli_obj_pack_buffer_type( *obj ) >> BLIS_PACK_BUFFER_SHIFT );
- printf( "\n" );
-
-#else
fprintf( file, "\n" );
fprintf( file, "%s\n", label );
fprintf( file, "\n" );
fprintf( file, " - pack ordiflo %lu\n", ( unsigned long int )bli_obj_is_pack_rev_if_lower( *obj ) );
fprintf( file, " - packbuf type %lu\n", ( unsigned long int )bli_obj_pack_buffer_type( *obj ) >> BLIS_PACK_BUFFER_SHIFT );
fprintf( file, "\n" );
-#endif
+
}
diff --git a/blis/frame/base/bli_profile.c b/blis/frame/base/bli_profile.c
--- /dev/null
@@ -0,0 +1,507 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2014, The University of Texas at Austin
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name of The University of Texas at Austin nor the names
+ of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+#include "blis.h"
+#ifdef BLIS_ENABLE_PROFILE
+
+/* Release a profile-record array with bli_free().
+ - bli_profile_data: array to release. A NULL pointer is reported on
+ stdout and otherwise ignored (it is no longer passed to bli_free). */
+void bli_profile_data_free(profile_data_t *bli_profile_data)
+{
+ if(bli_profile_data == NULL)
+ {
+ printf("not allocated\n");
+ return;
+ }
+
+ bli_free(bli_profile_data);
+}
+
+
+/* Allocate an array of num_objects profile records with bli_malloc() and
+ reset every counter (total_cycles, num_iter, num_comp) to zero.
+ - num_objects: number of records to allocate.
+ Returns the array, or NULL (after printing "not allocated") if the
+ allocation returned NULL. */
+profile_data_t * bli_profile_data_init(dim_t num_objects)
+{
+ dim_t i;
+ profile_data_t *bli_profile_data;
+ bli_profile_data = (profile_data_t *)bli_malloc(num_objects*sizeof(profile_data_t));
+ if(bli_profile_data == NULL)
+ {
+ printf("not allocated\n");
+ /* Do not fall through: the loop below would dereference NULL. */
+ return NULL;
+ }
+ for (i = 0; i < num_objects; i++)
+ {
+ (bli_profile_data[i]).total_cycles = 0;
+ (bli_profile_data[i]).num_iter = 0;
+ (bli_profile_data[i]).num_comp = 0;
+
+ }
+
+ return bli_profile_data;
+}
+
+
+/* Print a performance report for one profiled operation to stdout.
+
+ bli_profile_data holds one record per (report slot, thread) pair, read
+ at index thread + BLIS_MAX_NUM_THREADS * slot for slots
+ 0 .. BLIS_PROFILE_NUM_REPORTS-1.
+
+ - m, n, k: problem dimensions; the operation count is taken as
+ comp_scale * m * n * k (times 4 for complex datatypes).
+ - datatype: 0 and 2 are counted as real, other values as complex;
+ 0 and 1 use OPS_PER_CYCLE_S for the peak rate, other values use
+ OPS_PER_CYCLE_D.
+ - comp_scale: multiplier applied to m * n * k.
+ - num_threads: number of threads whose records are read, and the factor
+ by which the peak rate is scaled.
+
+ Slots (and per-thread rows) with a zero iteration count are omitted. */
+void bli_profile_data_print (profile_data_t *bli_profile_data, dim_t m, dim_t n, dim_t k, num_t datatype, dim_t comp_scale, dim_t num_threads)
+{
+ dim_t n_ind, j;
+
+ dim_t index;
+
+ uint64_t num_iter;
+
+ double ideal_gflops, gflops, total_cycles, num_ops, time_taken, num_ops_core;
+
+ if(datatype == 0 || datatype == 2)
+ {
+ num_ops = ( (double) comp_scale * m * n * k );
+ }
+ else
+ {
+ // for complex numbers
+ num_ops = ( (double) 4.0*comp_scale * m * n * k );
+ }
+
+ if(datatype == 0 || datatype == 1)
+ {
+ ideal_gflops = CLOCK*OPS_PER_CYCLE_S*num_threads;
+ }
+ else
+ {
+ ideal_gflops = CLOCK*OPS_PER_CYCLE_D*num_threads;
+ }
+ // print intro
+ // Integer arguments are cast explicitly so that each one matches its
+ // conversion specifier regardless of how dim_t, gint_t, num_t and
+ // uint64_t are defined for the build.
+ printf("\n");
+ printf("Clock Frequency %4.1f GHz\n", CLOCK);
+ printf("Number of Threads %lld\n", (long long) num_threads);
+ printf("Datatype %d Number of Operations: %f\n", (int) datatype, num_ops);
+ printf("Operations per cycle %d\n", (datatype == 0 || datatype == 1) ? OPS_PER_CYCLE_S : OPS_PER_CYCLE_D);
+ printf("Peak GFLOPS %5.1f GFLOPS\n", ideal_gflops);
+ printf("\n");
+
+ // print table header
+ printf("%-10s", "Variant");
+ printf("%2s", "|");
+ printf("%5s", "Cores");
+ printf("%10s", "Num Iter");
+ printf("%15s", "Total Cycles");
+ printf("%10s", "GFLOPS");
+ printf("%12s", "Efficiency");
+
+ // print separator (fixed width)
+ printf("\n");
+ printf("%-10s","----------");
+ printf("%2s", "--");
+ printf("%10s", "----------");
+ printf("%15s", "---------------");
+ printf("%10s", "----------");
+ printf("%12s", "------------");
+ printf("\n");
+
+
+ for (n_ind = 0; n_ind < BLIS_PROFILE_NUM_REPORTS; n_ind++)
+ {
+ if(n_ind < BLIS_PROFILE_KER_VAR2_IND)
+ {
+ /* Slots below ker_var2: only the record of thread 0 is reported. */
+ index = 0 + BLIS_MAX_NUM_THREADS*n_ind;
+ if((bli_profile_data[index]).num_iter != 0)
+ {
+ total_cycles = ((double)(bli_profile_data[index]).total_cycles);
+ time_taken = total_cycles / CLOCK ;
+ gflops = num_ops / time_taken; // in 10^9
+ printf("%-10lld", (long long) n_ind);
+ printf("%2s", "|");
+ printf("%5s", "1");
+ printf("%10lld", (long long) (bli_profile_data[index]).num_iter);
+ printf("%15llu", (unsigned long long) (bli_profile_data[index]).total_cycles);
+ printf("%10.4f", gflops );
+ printf("%11.4f%%", gflops/ideal_gflops*100);
+ printf("\n");
+ }
+ }
+ else if(n_ind == BLIS_PROFILE_KER_VAR2_IND)
+ {
+ /*The total performance of the operation depends on the slowest thread in kervar2.
+ * Hence, reporting the max cycles of the thread for ker_var2*/
+
+ num_iter = 0;
+ total_cycles = 0;
+
+ for (j = 0; j<num_threads; j++)
+ {
+ num_iter = bli_max(num_iter, (uint64_t) bli_profile_data[j+BLIS_MAX_NUM_THREADS*n_ind].num_iter) ;
+ total_cycles = bli_max(total_cycles, (double) bli_profile_data[j+BLIS_MAX_NUM_THREADS*n_ind].total_cycles);
+ }
+ if(num_iter != 0)
+ {
+ time_taken = total_cycles / CLOCK ;
+ gflops = num_ops / time_taken; // in 10^9
+ printf("%-10lld", (long long) n_ind);
+ printf("%2s", "|");
+ printf("%5d", 1);
+ printf("%10llu", (unsigned long long) num_iter);
+ printf("%15llu", (unsigned long long) total_cycles);
+ printf("%10.4f", gflops );
+ printf("%11.4f%%", gflops/ideal_gflops*100);
+ printf("\n");
+ }
+ }
+ else
+ {
+ /* Remaining slots: iterations and cycles are summed over all threads. */
+ num_iter = 0;
+ total_cycles = 0;
+
+ for (j = 0; j<num_threads; j++)
+ {
+ num_iter += (uint64_t) bli_profile_data[j+BLIS_MAX_NUM_THREADS*n_ind].num_iter ;
+ total_cycles += (double) bli_profile_data[j+BLIS_MAX_NUM_THREADS*n_ind].total_cycles;
+ }
+ if(num_iter != 0)
+ {
+ time_taken = total_cycles / CLOCK / num_threads ; // total cycles here are summed over all threads, so divide by num_threads to get the time taken per thread
+ gflops = num_ops / time_taken; // in 10^9
+ printf("%-10lld", (long long) n_ind);
+ printf("%2s", "|");
+ printf("%5lld", (long long) num_threads);
+ printf("%10llu", (unsigned long long) num_iter);
+ printf("%15llu", (unsigned long long) total_cycles);
+ printf("%10.4f", gflops );
+ printf("%11.4f%%", gflops/ideal_gflops*100);
+ printf("\n");
+ }
+ }
+ }
+
+ printf("%-10s", "----------");
+ printf("%2s", "--");
+ printf("%10s", "----------");
+ printf("%15s", "---------------");
+ printf("%10s", "----------");
+ printf("%12s", "------------");
+ printf("\n");
+
+ printf("uKernel details for each core\n");
+
+ printf("%-10s", "Core #");
+ printf("%2s", "|");
+ printf("%15s", "Num. Computes");
+ printf("%15s", "Total Cycles");
+ printf("%10s", "GFLOPS");
+ printf("%12s", "Efficiency");
+ printf("\n");
+
+ /* Per-thread rows come from the kernel-loop slot (was a literal 6). */
+ n_ind = BLIS_PROFILE_KER_LOOP_IND;
+
+ /* Per-thread peak: same rate as above but without the num_threads factor.
+ It does not change between rows, so compute it once. */
+ if(datatype == 0 || datatype == 1)
+ {
+ ideal_gflops = CLOCK*OPS_PER_CYCLE_S;
+ }
+ else
+ {
+ ideal_gflops = CLOCK*OPS_PER_CYCLE_D;
+ }
+
+ for (j = 0; j<num_threads; j++)
+ {
+ num_iter = (uint64_t) bli_profile_data[j+BLIS_MAX_NUM_THREADS*n_ind].num_iter ;
+ total_cycles = (double) bli_profile_data[j+BLIS_MAX_NUM_THREADS*n_ind].total_cycles;
+ if(datatype == 0 || datatype == 2)
+ num_ops_core = (double) bli_profile_data[j+BLIS_MAX_NUM_THREADS*n_ind].num_comp;
+ else
+ num_ops_core = (double) 4.0*bli_profile_data[j+BLIS_MAX_NUM_THREADS*n_ind].num_comp;
+
+ if(num_iter != 0)
+ {
+ time_taken = total_cycles / CLOCK ;
+ gflops = num_ops_core / time_taken; // in 10^9
+ printf("%-10lld", (long long) j);
+ printf("%2s", "|");
+ printf("%15llu", (unsigned long long) num_ops_core);
+ printf("%15llu", (unsigned long long) total_cycles);
+ printf("%10.4f", gflops );
+ printf("%11.4f%%", gflops/ideal_gflops*100);
+ printf("\n");
+ }
+ }
+ printf("%-10s", "----------");
+ printf("%2s", "--");
+ printf("%10s", "----------");
+ printf("%15s", "---------------");
+ printf("%10s", "----------");
+ printf("%12s", "------------");
+ printf("\n");
+}
+
+
+/* Currently a no-op stub: every statement of the body below is commented
+ out, so calling this function prints nothing and none of its arguments
+ are read. The disabled code is an earlier per-loop (kernel / mr / nr)
+ report.
+ NOTE(review): the disabled code references NUM_THREADS and the
+ min_cycles/max_cycles fields, which the profile_data_t declared in
+ bli_profile.h does not have - it would need updating before it could be
+ re-enabled. */
+void bli_kervar2_profile_data_print (profile_data_t *bli_kervar2_profile_data, dim_t m, dim_t n, dim_t k, num_t datatype, dim_t comp_scale)
+{
+// dim_t i,j;
+//
+// gint_t num_iter[3];
+// long int total_cycles[3];
+// gint_t min_cycles[3];
+// gint_t max_cycles[3];
+//
+//
+// double ideal_gflops, gflops, total_cycles_j, num_ops, time_taken;
+//
+// num_ops = ( (float) comp_scale * (float) m * (float) n * (float) k );
+//
+// if(datatype == 0 || datatype == 1)
+// {
+// ideal_gflops = CLOCK*OPS_PER_CYCLE_S*NUM_THREADS;
+// }
+// else
+// {
+// ideal_gflops = CLOCK*OPS_PER_CYCLE_D*NUM_THREADS;
+// }
+//
+// for(i = 0; i<3; i++)
+// {
+// num_iter[i] = 0;
+// total_cycles[i] = 0;
+// min_cycles[i] = 0;
+// max_cycles[i] = 0;
+// for (j = 0; j<NUM_THREADS; j++)
+// {
+// num_iter[i] += bli_kervar2_profile_data[j*3+i].num_iter ;
+//
+// total_cycles[i] += bli_kervar2_profile_data[j*3+i].total_cycles;
+// if(i == 0 )
+// {
+// min_cycles[i] = bli_kervar2_profile_data[j*3+i].min_cycles;
+// max_cycles[i] = bli_kervar2_profile_data[j*3+i].max_cycles;
+// }
+// else
+// {
+// if(bli_kervar2_profile_data[j*3+i].max_cycles > max_cycles[i])
+// max_cycles[i] = bli_kervar2_profile_data[j*3+i].max_cycles;
+// if(bli_kervar2_profile_data[j*3+i].min_cycles < min_cycles[i])
+// min_cycles[i] = bli_kervar2_profile_data[j*3+i].min_cycles;
+// }
+//
+// }
+// }
+//
+// for(i = 0; i<3; i++)
+// {
+// num_iter[i] = 0;
+// total_cycles[i] = 0;
+// min_cycles[i] = 0;
+// max_cycles[i] = 0;
+// for (j = 0; j<8; j++)
+// {
+// num_iter[i] += bli_kervar2_profile_data[j*3+i].num_iter ;
+//
+// total_cycles[i] += bli_kervar2_profile_data[j*3+i].total_cycles;
+// if(i == 0 )
+// {
+// min_cycles[i] = bli_kervar2_profile_data[j*3+i].min_cycles;
+// max_cycles[i] = bli_kervar2_profile_data[j*3+i].max_cycles;
+// }
+// else
+// {
+// if(bli_kervar2_profile_data[j*3+i].max_cycles > max_cycles[i])
+// max_cycles[i] = bli_kervar2_profile_data[j*3+i].max_cycles;
+// if(bli_kervar2_profile_data[j*3+i].min_cycles < min_cycles[i])
+// min_cycles[i] = bli_kervar2_profile_data[j*3+i].min_cycles;
+// }
+//
+// }
+// }
+//
+// for(j = 0; j < 3; j++)
+// {
+// if(num_iter[j] != 0)
+// {
+//
+// total_cycles_j = (double) total_cycles[j];
+// time_taken = total_cycles_j / CLOCK / NUM_THREADS;
+// gflops = num_ops / time_taken; // in 10^9
+// if(j == 0)
+// printf("%-10s","kernel");
+// else if(j ==1)
+// printf("%-10s","mr");
+// else
+// printf("%-10s","nr");
+//
+// printf("%2s", "|");
+// printf("%10d", num_iter[j]);
+// printf("%15ld", total_cycles[j]);
+// printf("%10.4f", gflops);
+// printf("%11.4f%%", gflops/ideal_gflops*100);
+// printf("\n");
+// }
+// }
+//#if 1
+// for(j = 0; j < 3; j++)
+// {
+// if(j == 0)
+// printf("%-10s\n","kernel");
+// else if(j ==1)
+// printf("%-10s\n","mr");
+// else
+// printf("%-10s\n","nr");
+//
+// for(i = 0; i < 8; i++)
+// {
+// printf("core %d\t",i);
+// printf("%15ld\t", bli_kervar2_profile_data[i*3+j].total_cycles);
+// printf("%15ld\t", bli_kervar2_profile_data[i*3+j].num_comp);
+// printf("\n");
+// }
+// }
+//#endif
+}
+
+/* Currently a no-op stub: every statement of the body below is commented
+ out, so calling this function prints nothing and none of its arguments
+ are read. The disabled code is an earlier trsm per-loop
+ (gemmtrsm / MR loop / NR loop) report.
+ NOTE(review): the disabled code references NUM_THREADS and the
+ min_cycles/max_cycles fields, which the profile_data_t declared in
+ bli_profile.h does not have - it would need updating before it could be
+ re-enabled. */
+void bli_trsm_kervar2_profile_data_print (profile_data_t *bli_kervar2_profile_data, dim_t m, dim_t n, dim_t k, num_t datatype)
+{
+// dim_t i,j;
+//
+// long int num_iter[3];
+// long int total_cycles[3];
+// gint_t min_cycles[3];
+// gint_t max_cycles[3];
+//
+//
+// double ideal_gflops, gflops, total_cycles_j, num_ops, time_taken;
+//
+// num_ops = ( 1.0 * m * n * k );
+//
+// if(datatype == 0 || datatype == 1)
+// {
+// ideal_gflops = CLOCK*OPS_PER_CYCLE_S*NUM_THREADS;
+// }
+// else
+// {
+// ideal_gflops = CLOCK*OPS_PER_CYCLE_D*NUM_THREADS;
+// }
+//
+// for(i = 0; i<3; i++)
+// {
+// num_iter[i] = 0;
+// total_cycles[i] = 0;
+// min_cycles[i] = 0;
+// max_cycles[i] = 0;
+// for (j = 0; j<8; j++)
+// {
+// num_iter[i] += bli_kervar2_profile_data[j*3+i].num_iter ;
+//
+// total_cycles[i] += bli_kervar2_profile_data[j*3+i].total_cycles;
+// if(i == 0 )
+// {
+// min_cycles[i] = bli_kervar2_profile_data[j*3+i].min_cycles;
+// max_cycles[i] = bli_kervar2_profile_data[j*3+i].max_cycles;
+// }
+// else
+// {
+// if(bli_kervar2_profile_data[j*3+i].max_cycles > max_cycles[i])
+// max_cycles[i] = bli_kervar2_profile_data[j*3+i].max_cycles;
+// if(bli_kervar2_profile_data[j*3+i].min_cycles < min_cycles[i])
+// min_cycles[i] = bli_kervar2_profile_data[j*3+i].min_cycles;
+// }
+//
+// }
+// }
+//
+// for(j = 0; j < 3; j++)
+// {
+// if(num_iter[j] != 0)
+// {
+// total_cycles_j = (double) total_cycles[j];
+// time_taken = total_cycles_j / CLOCK / NUM_THREADS;
+// gflops = num_ops / time_taken; // in 10^9
+// if(j == 0)
+// printf("%-10s","gemmtrsm");
+// else if(j ==1)
+// printf("%-10s","MR loop");
+// else
+// printf("%-10s","nr");
+//
+// printf("%2s", "|");
+// printf("%10ld", num_iter[j]);
+// printf("%15ld", total_cycles[j]);
+// printf("%10.4f", gflops);
+// printf("%11.4f%%", gflops/ideal_gflops*100);
+// printf("\n");
+// }
+// }
+// printf("%-10s", "----------");
+// printf("%2s", "--");
+// printf("%10s", "----------");
+// printf("%15s", "---------------");
+// printf("%10s", "----------");
+// printf("%12s", "------------");
+//
+// printf("\nNumber of loop iterations across cores %d\n", NUM_THREADS);
+//
+// // print table header
+// printf("%-10s", " ");
+// printf("%2s", "|");
+// printf("%12s", "Core 0");
+// printf("%12s", "Core 1");
+// printf("%12s", "Core 2");
+// printf("%12s", "Core 3");
+// printf("%12s", "Core 4");
+// printf("%12s", "Core 5");
+// printf("%12s", "Core 6");
+// printf("%12s\n", "Core 7");
+//
+// for(j = 0; j < 3; j++)
+// {
+// if(j == 0)
+// printf("%-10s","kernel");
+// else if(j ==1)
+// printf("%-10s","MR loop");
+// else
+// printf("%-10s","NR loop");
+//
+// printf("%2s", "|");
+//
+// for(i = 0; i < 8; i++)
+// printf("%12ld", bli_kervar2_profile_data[i*3+j].num_comp);
+//
+// printf("\n");
+// }
+//
+// for(j = 0; j < 3; j++)
+// {
+// if(j == 0)
+// printf("%-10s","kernel");
+// else if(j ==1)
+// printf("%-10s","MR loop");
+// else
+// printf("%-10s","NR loop");
+//
+// printf("%2s", "|");
+//
+// for(i = 0; i < 8; i++)
+// printf("%15ld", bli_kervar2_profile_data[i*3+j].total_cycles);
+//
+// printf("\n");
+// }
+}
+
+
+
+
+#endif
diff --git a/blis/frame/base/bli_profile.h b/blis/frame/base/bli_profile.h
--- /dev/null
@@ -0,0 +1,134 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2014, The University of Texas at Austin
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name of The University of Texas at Austin nor the names
+ of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_PROFILE_H
+#define BLIS_PROFILE_H
+
+//#ifdef BLIS_ENABLE_C66X_BUILD
+/* Report-slot numbers. bli_profile_get_index (below) multiplies a slot
+ number by BLIS_MAX_NUM_THREADS and adds the thread number to form an
+ index into the profile-record array. BLIS_PROFILE_NUM_REPORTS is the
+ number of slots. */
+#define BLIS_PROFILE_BLK_VAR2_IND 0
+#define BLIS_PROFILE_BLK_VAR3_IND 1
+#define BLIS_PROFILE_BLK_VAR1_IND 2
+#define BLIS_PROFILE_KER_VAR2_IND 3
+#define BLIS_PROFILE_JR_LOOP_IND 4
+#define BLIS_PROFILE_IR_LOOP_IND 5
+#define BLIS_PROFILE_KER_LOOP_IND 6
+#define BLIS_PROFILE_NUM_REPORTS 7
+
+/* NOTE(review): the indexing macro in this header uses
+ BLIS_MAX_NUM_THREADS rather than MAX_THREADS - confirm whether this
+ definition is still used anywhere. */
+#define MAX_THREADS 8
+
+#ifdef BLIS_ENABLE_PROFILE
+
+#define CLOCK 1.2 // In GHz
+
+#ifdef BLIS_ENABLE_C66X_BUILD
+#define OPS_PER_CYCLE_S 16
+#define OPS_PER_CYCLE_D 4
+#else
+#define OPS_PER_CYCLE_S 8
+#define OPS_PER_CYCLE_D 2
+#endif
+
+
+#define BLIS_ENABLE_PROFILE_KERVAR2 1
+#else
+#define BLIS_ENABLE_PROFILE_KERVAR2 0
+#endif
+
+
+/* One accumulated profile record. All three fields are updated together by
+ the bli_profile_data_update macro in this header. */
+struct profile_data_s
+{
+ uint64_t total_cycles; /* running sum of the `cycles` values passed to bli_profile_data_update */
+ gint_t num_iter; /* number of times bli_profile_data_update was applied */
+ uint64_t num_comp; /* running sum of the `comps` values passed to bli_profile_data_update */
+};
+typedef struct profile_data_s profile_data_t;
+
+/* A (m, n, k, cycles) tuple; these are the same four values taken by the
+ bli_profile_details_update prototype in this header.
+ NOTE(review): presumably one record per profiled call, with m/n/k the
+ problem dimensions and cycles the measured cost - confirm against the
+ implementation of bli_profile_details_update. */
+struct profile_details_s
+{
+ dim_t m;
+ dim_t n;
+ dim_t k;
+ long int cycles;
+};
+typedef struct profile_details_s profile_details_t;
+
+
+/* Accumulate one measurement into a profile record.
+ - bli_profile_data: an lvalue expression of record type (not a pointer),
+ e.g. data[idx] or *ptr.
+ - cycles: added to total_cycles.
+ - comps: added to num_comp.
+ num_iter is incremented by one on every use.
+ The arguments are parenthesized so that expressions such as *ptr expand
+ correctly. The first argument is expanded three times, so it should not
+ have side effects. */
+#define bli_profile_data_update( bli_profile_data, cycles, comps) \
+{ \
+ (bli_profile_data).total_cycles += (cycles); \
+ (bli_profile_data).num_comp += (comps); \
+ (bli_profile_data).num_iter++; \
+}
+
+/* Compute the profile-record index for the calling thread and store it in
+ `index`. The value is bli_get_thread_num() + BLIS_MAX_NUM_THREADS * slot,
+ where slot is selected by the (n, i) pair:
+ n == 1, i == 2 -> BLIS_PROFILE_BLK_VAR2_IND
+ n == 2, i == 2 -> BLIS_PROFILE_BLK_VAR3_IND
+ n == 0, i == 2 -> BLIS_PROFILE_BLK_VAR1_IND
+ n == 1, i == 1 -> BLIS_PROFILE_KER_VAR2_IND
+ For any other (n, i) pair `index` is left unmodified, so callers should
+ give it a defined value before using this macro.
+ The arguments are parenthesized so that expressions can be passed
+ without precedence surprises. */
+#define bli_profile_get_index(n, i, index) \
+{ \
+ if((n) == 1 && (i) == 2) \
+ (index) = bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_BLK_VAR2_IND; \
+ else if((n) == 2 && (i) == 2) \
+ (index) = bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_BLK_VAR3_IND; \
+ else if((n) == 0 && (i) == 2) \
+ (index) = bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_BLK_VAR1_IND; \
+ else if((n) == 1 && (i) == 1) \
+ (index) = bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_KER_VAR2_IND; \
+}
+
+profile_data_t* bli_profile_data_init (dim_t num_objects);
+
+void bli_profile_data_free (profile_data_t *bli_profile_data);
+
+//void bli_profile_data_update (profile_data_t *bli_profile_data, long int cycles, long int num_comp);
+
+void bli_gemm_profile_data_print (profile_data_t *bli_gemm_profile_data, dim_t m, dim_t n, dim_t k, num_t datatype);
+
+void bli_trsm_profile_data_print (profile_data_t *bli_trsm_profile_data, dim_t m, dim_t n, dim_t mn_side, num_t datatype);
+
+void bli_gemm_kervar2_profile_data_print (profile_data_t *bli_kervar2_profile_data, dim_t m, dim_t n, dim_t k, num_t datatype);
+void bli_trsm_kervar2_profile_data_print (profile_data_t *bli_kervar2_profile_data, dim_t m, dim_t n, dim_t k, num_t datatype);
+
+void bli_profile_data_print (profile_data_t *bli_gemm_profile_data, dim_t m, dim_t n, dim_t k, num_t datatype, dim_t comp_scale, dim_t num_threads);
+
+void bli_kervar2_profile_data_print (profile_data_t *bli_kervar2_profile_data, dim_t m, dim_t n, dim_t k, num_t datatype, dim_t comp_scale);
+
+
+
+profile_details_t* bli_profile_details_init (long int num_objects);
+
+void bli_profile_details_free (profile_details_t *bli_profile_details);
+
+void bli_profile_details_update (profile_details_t *bli_profile_details, dim_t m, dim_t n, dim_t k, long int cycles);
+
+
+
+
+#endif
index 48e8974601b5dc4d50aa21b4c98735cb5f0f7487..bd753f808c3197dfc2e1e1624928023846632d22 100644 (file)
if( barrier->count == 0 )
{
bli_free_barrier_tree( barrier->dad );
- bli_free( barrier );
+ bli_free_scratch( barrier );
}
return;
}
barrier_t* bli_create_tree_barrier(int num_threads, int arity, barrier_t** leaves, int leaf_index)
{
- barrier_t* me = (barrier_t*) bli_malloc(sizeof(barrier_t));
+ barrier_t* me = (barrier_t*) bli_malloc_scratch(sizeof(barrier_t));
me->dad = NULL;
me->signal = 0;
{
bli_free_barrier_tree( communicator->barriers[i] );
}
- bli_free( communicator->barriers );
+ bli_free_scratch( communicator->barriers );
}
void bli_setup_communicator( thread_comm_t* communicator, dim_t n_threads)
{
if( communicator == NULL ) return;
communicator->sent_object = NULL;
communicator->n_threads = n_threads;
- communicator->barriers = ( barrier_t** ) bli_malloc( sizeof( barrier_t* ) * n_threads );
+ communicator->barriers = ( barrier_t** ) bli_malloc_scratch( sizeof( barrier_t* ) * n_threads );
bli_create_tree_barrier( n_threads, BLIS_TREE_BARRIER_ARITY, communicator->barriers, 0 );
}
{
if( communicator == NULL ) return;
bli_cleanup_communicator( communicator );
- bli_free( communicator );
+ bli_free_scratch( communicator );
}
thread_comm_t* bli_create_communicator( dim_t n_threads )
*/
siz_t communicator_size;
communicator_size = BLIS_CACHE_LINE_SIZE*(sizeof(thread_comm_t)/BLIS_CACHE_LINE_SIZE +1);
- comm = (thread_comm_t*) bli_memalign(BLIS_CACHE_LINE_SIZE, communicator_size );
+ comm = (thread_comm_t*) bli_malloc_scratch_align(BLIS_CACHE_LINE_SIZE, communicator_size );
#else
- thread_comm_t* comm = (thread_comm_t*) bli_malloc( sizeof(thread_comm_t) );
+ thread_comm_t* comm = (thread_comm_t*) bli_malloc_scratch( sizeof(thread_comm_t) );
#endif
bli_setup_communicator( comm, n_threads );
return comm;
@@ -258,7 +258,7 @@ thrinfo_t* bli_create_thread_info( thread_comm_t* ocomm, dim_t ocomm_id, thread_
dim_t n_way, dim_t work_id )
{
- thrinfo_t* thr = (thrinfo_t*) bli_malloc( sizeof(thrinfo_t) );
+ thrinfo_t* thr = (thrinfo_t*) bli_malloc_scratch( sizeof(thrinfo_t) );
bli_setup_thread_info( thr, ocomm, ocomm_id, icomm, icomm_id, n_way, work_id );
return thr;
}
@@ -275,68 +275,386 @@ void bli_setup_thread_info( thrinfo_t* thr, thread_comm_t* ocomm, dim_t ocomm_id
thr->work_id = work_id;
}
-void bli_get_range( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end )
+//void bli_get_range( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end )
+//{
+// thrinfo_t* thread = (thrinfo_t*) thr;
+// dim_t n_way = thread->n_way;
+// dim_t work_id = thread->work_id;
+//
+// dim_t size = all_end - all_start;
+// dim_t n_pt = size / n_way;
+// n_pt = (n_pt * n_way < size) ? n_pt + 1 : n_pt;
+// n_pt = (n_pt % block_factor == 0) ? n_pt : n_pt + block_factor - (n_pt % block_factor);
+// *start = work_id * n_pt + all_start;
+// *end = bli_min( *start + n_pt, size + all_start );
+//}
+//
+//void bli_get_range_weighted( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, bool_t forward, dim_t* start, dim_t* end)
+//{
+// thrinfo_t* thread = (thrinfo_t*) thr;
+// dim_t n_way = thread->n_way;
+// dim_t work_id = thread->work_id;
+// dim_t size = all_end - all_start;
+// double num;
+//
+// *start = 0;
+// *end = all_end - all_start;
+// num = size*size / (double) n_way; // 2xArea per thread?
+//
+// //printf("bli_threading %d %d %f %d\n", *start, *end, num, work_id);
+//
+// if( forward ) {
+// dim_t curr_caucus = n_way - 1;
+// dim_t len = 0;
+// while(1){
+// dim_t width = ceil(sqrt( len*len + num )) - len; // The width of the current caucus
+// width = (width % block_factor == 0) ? width : width + block_factor - (width % block_factor);
+// if( curr_caucus == work_id ) {
+// *start = bli_max( 0 , *end - width ) + all_start;
+// *end = *end + all_start;
+// return;
+// }
+// else{
+// *end -= width;
+// len += width;
+// curr_caucus--;
+// }
+// }
+// }
+// else{
+// while(1){
+// dim_t width = ceil(sqrt(*start * *start + num)) - *start;
+// width = (width % block_factor == 0) ? width : width + block_factor - (width % block_factor);
+// printf("bli_threading %d %d %d\n", *start, width, work_id);
+//
+// if( work_id == 0 ) {
+// *start = *start + all_start;
+// *end = bli_min( *start + width, all_end );
+// return;
+// }
+// else{
+// *start = *start + width;
+// }
+// work_id--;
+// }
+// }
+//}
+
+
+void bli_get_range( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, bool_t handle_edge_low, dim_t* start, dim_t* end )
{
- thrinfo_t* thread = (thrinfo_t*) thr;
- dim_t n_way = thread->n_way;
- dim_t work_id = thread->work_id;
-
- dim_t size = all_end - all_start;
- dim_t n_pt = size / n_way;
- n_pt = (n_pt * n_way < size) ? n_pt + 1 : n_pt;
- n_pt = (n_pt % block_factor == 0) ? n_pt : n_pt + block_factor - (n_pt % block_factor);
- *start = work_id * n_pt + all_start;
- *end = bli_min( *start + n_pt, size + all_start );
+ thrinfo_t* thread = ( thrinfo_t* )thr;
+ dim_t n_way = thread->n_way;
+ dim_t work_id = thread->work_id;
+
+ dim_t size = all_end - all_start;
+
+ dim_t n_bf_whole = size / block_factor;
+ dim_t n_bf_left = size % block_factor;
+
+ dim_t n_bf_lo = n_bf_whole / n_way;
+ dim_t n_bf_hi = n_bf_whole / n_way;
+
+ // In this function, we partition the space between all_start and
+ // all_end into n_way partitions, each a multiple of block_factor
+ // with the exception of the one partition that receives the
+ // "edge" case (if applicable).
+ //
+ // Here are examples of various thread partitionings, in units of
+ // the block_factor, when n_way = 4. (A '+' indicates the thread
+ // that receives the leftover edge case (ie: n_bf_left extra
+ // rows/columns in its sub-range).
+ // (all_start ... all_end)
+ // n_bf_whole _left hel n_th_lo _hi thr0 thr1 thr2 thr3
+ // 12 =0 f 0 4 3 3 3 3
+ // 12 >0 f 0 4 3 3 3 3+
+ // 13 >0 f 1 3 4 3 3 3+
+ // 14 >0 f 2 2 4 4 3 3+
+ // 15 >0 f 3 1 4 4 4 3+
+ // 15 =0 f 3 1 4 4 4 3
+ //
+ // 12 =0 t 4 0 3 3 3 3
+ // 12 >0 t 4 0 3+ 3 3 3
+ // 13 >0 t 3 1 3+ 3 3 4
+ // 14 >0 t 2 2 3+ 3 4 4
+ // 15 >0 t 1 3 3+ 4 4 4
+ // 15 =0 t 1 3 3 4 4 4
+
+ // As indicated by the table above, load is balanced as equally
+ // as possible, even in the presence of an edge case.
+
+ // First, we must differentiate between cases where the leftover
+ // "edge" case (n_bf_left) should be allocated to a thread partition
+ // at the low end of the index range or the high end.
+
+ if ( handle_edge_low == FALSE )
+ {
+ // Notice that if all threads receive the same number of
+ // block_factors, those threads are considered "high" and
+ // the "low" thread group is empty.
+ dim_t n_th_lo = n_bf_whole % n_way;
+ //dim_t n_th_hi = n_way - n_th_lo;
+
+ // If some partitions must have more block_factors than others
+ // assign the slightly larger partitions to lower index threads.
+ if ( n_th_lo != 0 ) n_bf_lo += 1;
+
+ // Compute the actual widths (in units of rows/columns) of
+ // individual threads in the low and high groups.
+ dim_t size_lo = n_bf_lo * block_factor;
+ dim_t size_hi = n_bf_hi * block_factor;
+
+ // Precompute the starting indices of the low and high groups.
+ dim_t lo_start = all_start;
+ dim_t hi_start = all_start + n_th_lo * size_lo;
+
+ // Compute the start and end of individual threads' ranges
+ // as a function of their work_ids and also the group to which
+ // they belong (low or high).
+ if ( work_id < n_th_lo )
+ {
+ *start = lo_start + (work_id ) * size_lo;
+ *end = lo_start + (work_id+1) * size_lo;
+ }
+ else // if ( n_th_lo <= work_id )
+ {
+ *start = hi_start + (work_id-n_th_lo ) * size_hi;
+ *end = hi_start + (work_id-n_th_lo+1) * size_hi;
+
+ // Since the edge case is being allocated to the high
+ // end of the index range, we have to advance the last
+ // thread's end.
+ if ( work_id == n_way - 1 ) *end += n_bf_left;
+ }
+ }
+ else // if ( handle_edge_low == TRUE )
+ {
+ // Notice that if all threads receive the same number of
+ // block_factors, those threads are considered "low" and
+ // the "high" thread group is empty.
+ dim_t n_th_hi = n_bf_whole % n_way;
+ dim_t n_th_lo = n_way - n_th_hi;
+
+ // If some partitions must have more block_factors than others
+ // assign the slightly larger partitions to higher index threads.
+ if ( n_th_hi != 0 ) n_bf_hi += 1;
+
+ // Compute the actual widths (in units of rows/columns) of
+ // individual threads in the low and high groups.
+ dim_t size_lo = n_bf_lo * block_factor;
+ dim_t size_hi = n_bf_hi * block_factor;
+
+ // Precompute the starting indices of the low and high groups.
+ dim_t lo_start = all_start;
+ dim_t hi_start = all_start + n_th_lo * size_lo
+ + n_bf_left;
+
+ // Compute the start and end of individual threads' ranges
+ // as a function of their work_ids and also the group to which
+ // they belong (low or high).
+ if ( work_id < n_th_lo )
+ {
+ *start = lo_start + (work_id ) * size_lo;
+ *end = lo_start + (work_id+1) * size_lo;
+
+ // Since the edge case is being allocated to the low
+ // end of the index range, we have to advance the
+ // starts/ends accordingly.
+ if ( work_id == 0 ) *end += n_bf_left;
+ else { *start += n_bf_left;
+ *end += n_bf_left; }
+ }
+ else // if ( n_th_lo <= work_id )
+ {
+ *start = hi_start + (work_id-n_th_lo ) * size_hi;
+ *end = hi_start + (work_id-n_th_lo+1) * size_hi;
+ }
+ }
}
-void bli_get_range_weighted( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, bool_t forward, dim_t* start, dim_t* end)
+void bli_get_range_l2r( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end )
{
- thrinfo_t* thread = (thrinfo_t*) thr;
- dim_t n_way = thread->n_way;
- dim_t work_id = thread->work_id;
- dim_t size = all_end - all_start;
- double num;
-
- *start = 0;
- *end = all_end - all_start;
- num = size*size / (double) n_way; // 2xArea per thread?
-
- if( forward ) {
- dim_t curr_caucus = n_way - 1;
- dim_t len = 0;
- while(1){
- dim_t width = ceil(sqrt( len*len + num )) - len; // The width of the current caucus
- width = (width % block_factor == 0) ? width : width + block_factor - (width % block_factor);
- if( curr_caucus == work_id ) {
- *start = bli_max( 0 , *end - width ) + all_start;
- *end = *end + all_start;
- return;
- }
- else{
- *end -= width;
- len += width;
- curr_caucus--;
- }
- }
- }
- else{
- while(1){
- dim_t width = ceil(sqrt(*start * *start + num)) - *start;
- width = (width % block_factor == 0) ? width : width + block_factor - (width % block_factor);
-
- if( work_id == 0 ) {
- *start = *start + all_start;
- *end = bli_min( *start + width, all_end );
- return;
- }
- else{
- *start = *start + width;
- }
- work_id--;
- }
- }
+ bli_get_range( thr, all_start, all_end, block_factor,
+ FALSE, start, end );
+}
+
+// Convenience wrapper around bli_get_range() that requests the leftover
+// edge (the part of the range that is not a whole multiple of block_factor)
+// be assigned at the low end of the index range.
+void bli_get_range_r2l( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end )
+{
+    const bool_t edge_at_low_end = TRUE;
+
+    bli_get_range( thr, all_start, all_end, block_factor, edge_at_low_end, start, end );
+}
+
+// Convenience wrapper around bli_get_range() that requests the leftover
+// edge (the part of the range that is not a whole multiple of block_factor)
+// be assigned at the high end of the index range.
+void bli_get_range_t2b( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end )
+{
+    const bool_t edge_at_low_end = FALSE;
+
+    bli_get_range( thr, all_start, all_end, block_factor, edge_at_low_end, start, end );
+}
+
+// Convenience wrapper around bli_get_range() that requests the leftover
+// edge (the part of the range that is not a whole multiple of block_factor)
+// be assigned at the low end of the index range.
+void bli_get_range_b2t( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end )
+{
+    const bool_t edge_at_low_end = TRUE;
+
+    bli_get_range( thr, all_start, all_end, block_factor, edge_at_low_end, start, end );
}
+// Computes the sub-range [*start, *end) of [all_start, all_end) that belongs
+// to the calling thread when sub-ranges are sized by area rather than by
+// width (partitioning across a triangular matrix).
+//
+// Each successive sub-range is given a width w such that
+// (len + w)^2 - len^2 >= size^2 / n_way, where len is the extent already
+// handed out, and w is then rounded up to a multiple of block_factor.
+//
+//   thr             - really a thrinfo_t*; its n_way and work_id are read.
+//   all_start/end   - the full index range being partitioned.
+//   block_factor    - granularity of every sub-range except the edge one.
+//   uplo            - lower: sub-ranges are generated from the high end of
+//                     the range towards the low end; otherwise they are
+//                     generated from the low end towards the high end.
+//   handle_edge_low - selects whether the first generated sub-range is
+//                     adjusted so that its width is congruent to
+//                     (size % block_factor); see the comments in the body.
+//   start, end      - outputs.
+//
+// NOTE(review): both loops terminate only when the running thread index
+// reaches work_id, so this assumes 0 <= work_id < n_way -- confirm callers.
+void bli_get_range_weighted( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, uplo_t uplo, bool_t handle_edge_low, dim_t* start, dim_t* end )
+{
+    thrinfo_t* thread             = ( thrinfo_t* )thr;
+    dim_t      n_way              = thread->n_way;
+    dim_t      work_id            = thread->work_id;
+    dim_t      size               = all_end - all_start;
+    dim_t      width;
+    dim_t      block_fac_leftover = size % block_factor;
+    dim_t      i;
+    double     num;
+
+    *start = 0;
+    *end   = all_end - all_start;
+
+    // Target "area" per thread. The squares are formed in double precision:
+    // multiplying two dim_t values first would overflow (undefined behavior)
+    // for large dimensions when dim_t is a 32-bit type. For inputs that did
+    // not overflow before, the value is unchanged.
+    num = ( ( double )size * ( double )size ) / ( double )n_way;
+
+    if ( bli_is_lower( uplo ) )
+    {
+        dim_t cur_caucus = n_way - 1;
+        dim_t len        = 0;
+
+        // This loop computes subpartitions backwards, from the high end
+        // of the index range to the low end. If the low end is assumed
+        // to be on the left and the high end the right, this assignment
+        // of widths is appropriate for n dimension partitioning of a
+        // lower triangular matrix.
+        for ( i = 0; TRUE; ++i )
+        {
+            width = ceil( sqrt( ( double )len * ( double )len + num ) ) - len;
+
+            // If we need to allocate the edge case (assuming it exists)
+            // to the high thread subpartition, adjust width so that it
+            // contains the exact amount of leftover edge dimension so that
+            // all remaining subpartitions can be multiples of block_factor.
+            // If the edge case is to be allocated to the low subpartition,
+            // or if there is no edge case, it is implicitly allocated to
+            // the low subpartition by virtue of the fact that all other
+            // subpartitions already assigned will be multiples of
+            // block_factor.
+            if ( i == 0 && !handle_edge_low )
+            {
+                if ( width % block_factor != block_fac_leftover )
+                    width += block_fac_leftover - ( width % block_factor );
+            }
+            else
+            {
+                if ( width % block_factor != 0 )
+                    width += block_factor - ( width % block_factor );
+            }
+
+            if ( cur_caucus == work_id )
+            {
+                // Clamp at the low end of the range, then translate from
+                // the zero-based working range back to [all_start, all_end).
+                *start = bli_max( 0, *end - width ) + all_start;
+                *end   = *end + all_start;
+                return;
+            }
+            else
+            {
+                *end -= width;
+                len  += width;
+                cur_caucus--;
+            }
+        }
+    }
+    else // if ( bli_is_upper( uplo ) )
+    {
+        // This loop computes subpartitions forwards, from the low end
+        // of the index range to the high end. If the low end is assumed
+        // to be on the left and the high end the right, this assignment
+        // of widths is appropriate for n dimension partitioning of an
+        // upper triangular matrix.
+        for ( i = 0; TRUE; ++i )
+        {
+            // Extent already handed out, squared in double precision for
+            // the same overflow reason as num above.
+            double assigned = ( double )*start;
+
+            width = ceil( sqrt( assigned * assigned + num ) ) - *start;
+
+            if ( i == 0 && handle_edge_low )
+            {
+                if ( width % block_factor != block_fac_leftover )
+                    width += block_fac_leftover - ( width % block_factor );
+            }
+            else
+            {
+                if ( width % block_factor != 0 )
+                    width += block_factor - ( width % block_factor );
+            }
+
+            if ( work_id == 0 )
+            {
+                // Translate back to the caller's index space and clamp the
+                // end of the sub-range to all_end.
+                *start = *start + all_start;
+                *end   = bli_min( *start + width, all_end );
+                return;
+            }
+            else
+            {
+                *start = *start + width;
+                work_id--;
+            }
+        }
+    }
+}
+
+// Selects the partitioning scheme from uplo: when bli_is_upper_or_lower()
+// holds, the area-weighted bli_get_range_weighted() is used with uplo
+// unchanged and handle_edge_low = FALSE; otherwise (dense or zeros) the
+// call is forwarded to bli_get_range_l2r().
+void bli_get_range_weighted_l2r( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, uplo_t uplo, dim_t* start, dim_t* end )
+{
+    if ( bli_is_upper_or_lower( uplo ) )
+    {
+        bli_get_range_weighted( thr, all_start, all_end, block_factor, uplo, FALSE, start, end );
+        return;
+    }
+
+    // Dense or zeros.
+    bli_get_range_l2r( thr, all_start, all_end, block_factor, start, end );
+}
+
+// Selects the partitioning scheme from uplo: when bli_is_upper_or_lower()
+// holds, uplo is first passed through bli_toggle_uplo() and the
+// area-weighted bli_get_range_weighted() is then used with
+// handle_edge_low = TRUE; otherwise (dense or zeros) the call is forwarded
+// to bli_get_range_r2l().
+void bli_get_range_weighted_r2l( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, uplo_t uplo, dim_t* start, dim_t* end )
+{
+    if ( bli_is_upper_or_lower( uplo ) )
+    {
+        // NOTE(review): bli_toggle_uplo is defined elsewhere; presumably it
+        // swaps upper <-> lower in the local copy of uplo -- confirm.
+        bli_toggle_uplo( uplo );
+        bli_get_range_weighted( thr, all_start, all_end, block_factor, uplo, TRUE, start, end );
+        return;
+    }
+
+    // Dense or zeros.
+    bli_get_range_r2l( thr, all_start, all_end, block_factor, start, end );
+}
+
+// Selects the partitioning scheme from uplo: when bli_is_upper_or_lower()
+// holds, uplo is first passed through bli_toggle_uplo() and the
+// area-weighted bli_get_range_weighted() is then used with
+// handle_edge_low = FALSE; otherwise (dense or zeros) the call is forwarded
+// to bli_get_range_t2b().
+void bli_get_range_weighted_t2b( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, uplo_t uplo, dim_t* start, dim_t* end )
+{
+    if ( bli_is_upper_or_lower( uplo ) )
+    {
+        // NOTE(review): bli_toggle_uplo is defined elsewhere; presumably it
+        // swaps upper <-> lower in the local copy of uplo -- confirm.
+        bli_toggle_uplo( uplo );
+        bli_get_range_weighted( thr, all_start, all_end, block_factor, uplo, FALSE, start, end );
+        return;
+    }
+
+    // Dense or zeros.
+    bli_get_range_t2b( thr, all_start, all_end, block_factor, start, end );
+}
+
+// Selects the partitioning scheme from uplo: when bli_is_upper_or_lower()
+// holds, the area-weighted bli_get_range_weighted() is used with uplo
+// unchanged and handle_edge_low = TRUE; otherwise (dense or zeros) the
+// call is forwarded to bli_get_range_b2t().
+void bli_get_range_weighted_b2t( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, uplo_t uplo, dim_t* start, dim_t* end )
+{
+    if ( bli_is_upper_or_lower( uplo ) )
+    {
+        bli_get_range_weighted( thr, all_start, all_end, block_factor, uplo, TRUE, start, end );
+        return;
+    }
+
+    // Dense or zeros.
+    bli_get_range_b2t( thr, all_start, all_end, block_factor, start, end );
+}
void bli_level3_thread_decorator( dim_t n_threads,
level3_int_t func,
obj_t* alpha,
dim_t number = 1;
#ifdef BLIS_ENABLE_C66X_BUILD
if(strcmp(env,"BLIS_JC_NT")==0)
- number = 1;
+ number = BLIS_C66X_JC_NT;
if(strcmp(env,"BLIS_IC_NT")==0)
- number = 8;
+ number = BLIS_C66X_IC_NT;
if(strcmp(env,"BLIS_JR_NT")==0)
- number = 1;
+ number = BLIS_C66X_JR_NT;
if(strcmp(env,"BLIS_IR_NT")==0)
- number = 1;
+ number = BLIS_C66X_IR_NT;
return number;
#else
char* str = getenv( env );
index 7ca163d8108b3cbe71c0acc705e27c48d250cda2..19c8118fca5bc24fbdb85528caf4962735c9c44f 100644 (file)
};
typedef struct thrinfo_s thrinfo_t;
+#ifdef BLIS_ENABLE_C66X_BUILD
+#define bli_get_thread_num lib_get_coreID
+#else
+#define bli_get_thread_num omp_get_thread_num
+#endif
+
+
// Thread Info Interface Definitions
#define thread_ocomm( thread ) (thread->ocomm)
#define thread_icomm( thread ) (thread->icomm)
#define thread_obarrier( thread ) bli_barrier( thread->ocomm, thread->ocomm_id )
#define thread_ibarrier( thread ) bli_barrier( thread->icomm, thread->icomm_id )
-void bli_get_range( void* thread, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end );
-void bli_get_range_weighted( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, bool_t forward, dim_t* start, dim_t* end);
+//void bli_get_range( void* thread, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end );
+//void bli_get_range_weighted( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, bool_t forward, dim_t* start, dim_t* end);
+
+void bli_get_range( void* thr, dim_t all_start, dim_t all_end,
+ dim_t block_factor,
+ bool_t handle_edge_low,
+ dim_t* start, dim_t* end );
+void bli_get_range_l2r( void* thr, dim_t all_start, dim_t all_end,
+ dim_t block_factor,
+ dim_t* start, dim_t* end );
+void bli_get_range_r2l( void* thr, dim_t all_start, dim_t all_end,
+ dim_t block_factor,
+ dim_t* start, dim_t* end );
+void bli_get_range_t2b( void* thr, dim_t all_start, dim_t all_end,
+ dim_t block_factor,
+ dim_t* start, dim_t* end );
+void bli_get_range_b2t( void* thr, dim_t all_start, dim_t all_end,
+ dim_t block_factor,
+ dim_t* start, dim_t* end );
+
+void bli_get_range_weighted( void* thr, dim_t all_start, dim_t all_end,
+ dim_t block_factor, uplo_t uplo,
+ bool_t handle_edge_low,
+ dim_t* start, dim_t* end );
+void bli_get_range_weighted_l2r( void* thr, dim_t all_start, dim_t all_end,
+ dim_t block_factor, uplo_t uplo,
+ dim_t* start, dim_t* end );
+void bli_get_range_weighted_r2l( void* thr, dim_t all_start, dim_t all_end,
+ dim_t block_factor, uplo_t uplo,
+ dim_t* start, dim_t* end );
+void bli_get_range_weighted_t2b( void* thr, dim_t all_start, dim_t all_end,
+ dim_t block_factor, uplo_t uplo,
+ dim_t* start, dim_t* end );
+void bli_get_range_weighted_b2t( void* thr, dim_t all_start, dim_t all_end,
+ dim_t block_factor, uplo_t uplo,
+ dim_t* start, dim_t* end );
+
thrinfo_t* bli_create_thread_info( thread_comm_t* ocomm, dim_t ocomm_id,
thread_comm_t* icomm, dim_t icomm_id,
dim_t n_way, dim_t work_id );
diff --git a/blis/frame/include/bli_mem_pool_macro_defs.h b/blis/frame/include/bli_mem_pool_macro_defs.h
index d371d63bda455caee126eebe768e31a419ec5cf8..4423b5c26a436c7e6d12da5af31b18632095f2db 100644 (file)
#ifdef BLIS_ENABLE_C66X_MEM_POOLS
-/*#define BLIS_MK_BLOCK_SIZE_S ( BLIS_POOL_MC_L2_S * \
- ( BLIS_POOL_KC_L2_S + \
- ( BLIS_UPANEL_A_ALIGN_SIZE_S / \
- BLIS_SIZEOF_S ) \
- ) * \
- BLIS_SIZEOF_S \
- )
-#define BLIS_KN_BLOCK_SIZE_S ( \
- ( BLIS_POOL_KC_L3_S + \
- ( BLIS_UPANEL_B_ALIGN_SIZE_S / \
- BLIS_SIZEOF_S ) \
- ) * \
- BLIS_POOL_NC_L3_S * \
- BLIS_SIZEOF_S \
- )
-*/
+
+#if defined(MEM_MODEL_LARGE) || defined (MEM_MODEL_MEDIUM)
#define BLIS_MK_BLOCK_SIZE_S ( bli_max( BLIS_POOL_MC_S*(BLIS_POOL_MC_S + BLIS_POOL_KC_S), \
(BLIS_POOL_MC_S + BLIS_POOL_KC_S)*(BLIS_POOL_MC_S + BLIS_POOL_KC_S)/4 \
BLIS_SIZEOF_S \
)
+#elif defined (MEM_MODEL_SMALL)
+
+//DMA is not used, and so we do not need to calculate the extra memory that needs to DMA'ed to rebuild symmetric matrices
+
+#define BLIS_MK_BLOCK_SIZE_S ( BLIS_POOL_MC_S * \
+ ( BLIS_POOL_KC_S \
+ ) * \
+ BLIS_SIZEOF_S \
+ )
+
+#define BLIS_KN_BLOCK_SIZE_S ( \
+ ( BLIS_POOL_KC_S \
+ ) * \
+ BLIS_POOL_NC_S * \
+ BLIS_SIZEOF_S \
+ )
+
+#endif
+
#define BLIS_MN_BLOCK_SIZE_S ( BLIS_POOL_MC_S * \
BLIS_POOL_NC_S * \
BLIS_SIZEOF_S \
#ifdef BLIS_ENABLE_C66X_MEM_POOLS
-/*
-#define BLIS_MK_BLOCK_SIZE_D ( BLIS_POOL_MC_L2_D * \
- ( BLIS_POOL_KC_L2_D + \
- ( BLIS_UPANEL_A_ALIGN_SIZE_D / \
- BLIS_SIZEOF_D ) \
- ) * \
- BLIS_SIZEOF_D \
- )
-#define BLIS_KN_BLOCK_SIZE_D ( \
- ( BLIS_POOL_KC_L3_D + \
- ( BLIS_UPANEL_B_ALIGN_SIZE_D / \
- BLIS_SIZEOF_D ) \
- ) * \
- BLIS_POOL_NC_L3_D * \
- BLIS_SIZEOF_D \
- )
-*/
+#if defined(MEM_MODEL_LARGE) || defined (MEM_MODEL_MEDIUM)
+
#define BLIS_MK_BLOCK_SIZE_D ( bli_max( BLIS_POOL_MC_D*(BLIS_POOL_MC_D + BLIS_POOL_KC_D), \
(BLIS_POOL_MC_D + BLIS_POOL_KC_D)*(BLIS_POOL_MC_D + BLIS_POOL_KC_D)/4 \
) * \
BLIS_SIZEOF_D \
)
+#elif defined (MEM_MODEL_SMALL)
+#define BLIS_MK_BLOCK_SIZE_D ( BLIS_POOL_MC_D * \
+ ( BLIS_POOL_KC_D \
+ ) * \
+ BLIS_SIZEOF_D \
+ )
+
+#define BLIS_KN_BLOCK_SIZE_D ( \
+ ( BLIS_POOL_KC_D \
+ ) * \
+ BLIS_POOL_NC_D * \
+ BLIS_SIZEOF_D \
+ )
+
+#endif
#define BLIS_MN_BLOCK_SIZE_D ( BLIS_POOL_MC_D * \
BLIS_POOL_NC_D * \
#ifdef BLIS_ENABLE_C66X_MEM_POOLS
-/*
-#define BLIS_MK_BLOCK_SIZE_C ( BLIS_POOL_MC_L2_C * \
- ( BLIS_POOL_KC_L2_C + \
- ( BLIS_UPANEL_A_ALIGN_SIZE_C / \
- BLIS_SIZEOF_C ) \
- ) * \
- BLIS_SIZEOF_C \
- )
-#define BLIS_KN_BLOCK_SIZE_C ( \
- ( BLIS_POOL_KC_L3_C + \
- ( BLIS_UPANEL_B_ALIGN_SIZE_C / \
- BLIS_SIZEOF_C ) \
- ) * \
- BLIS_POOL_NC_L3_C * \
- BLIS_SIZEOF_C \
- )
-*/
+
+#if defined(MEM_MODEL_LARGE) || defined (MEM_MODEL_MEDIUM)
+
// Block size in bytes for the single-complex MC x KC pool: the larger of the
// two element counts passed to bli_max, scaled by the single-complex element
// size. BLIS_SIZEOF_C (not BLIS_SIZEOF_D) is used so this macro matches its
// MEM_MODEL_SMALL counterpart and the other per-type macros.
#define BLIS_MK_BLOCK_SIZE_C ( bli_max( BLIS_POOL_MC_C*(BLIS_POOL_MC_C + BLIS_POOL_KC_C), \
(BLIS_POOL_MC_C + BLIS_POOL_KC_C)*(BLIS_POOL_MC_C + BLIS_POOL_KC_C)/4 \
) * \
BLIS_SIZEOF_C \
)
+#elif defined (MEM_MODEL_SMALL)
+
+#define BLIS_MK_BLOCK_SIZE_C ( BLIS_POOL_MC_C * \
+ ( BLIS_POOL_KC_C \
+ ) * \
+ BLIS_SIZEOF_C \
+ )
+
+#define BLIS_KN_BLOCK_SIZE_C ( \
+ ( BLIS_POOL_KC_C \
+ ) * \
+ BLIS_POOL_NC_C * \
+ BLIS_SIZEOF_C \
+ )
+#endif
+
#define BLIS_MN_BLOCK_SIZE_C ( BLIS_POOL_MC_C * \
BLIS_POOL_NC_C * \
BLIS_SIZEOF_C \
#ifdef BLIS_ENABLE_C66X_MEM_POOLS
-/*
-#define BLIS_MK_BLOCK_SIZE_Z ( BLIS_POOL_MC_L2_Z * \
- ( BLIS_POOL_KC_L2_Z + \
- ( BLIS_UPANEL_A_ALIGN_SIZE_Z / \
- BLIS_SIZEOF_Z ) \
- ) * \
- BLIS_SIZEOF_Z \
- )
-#define BLIS_KN_BLOCK_SIZE_Z ( \
- ( BLIS_POOL_KC_L3_Z + \
- ( BLIS_UPANEL_B_ALIGN_SIZE_Z / \
- BLIS_SIZEOF_Z ) \
- ) * \
- BLIS_POOL_NC_L3_Z * \
- BLIS_SIZEOF_Z \
- )
-*/
+#if defined(MEM_MODEL_LARGE) || defined (MEM_MODEL_MEDIUM)
#define BLIS_MK_BLOCK_SIZE_Z ( bli_max( BLIS_POOL_MC_Z * (BLIS_POOL_MC_Z + BLIS_POOL_KC_Z), \
(BLIS_POOL_MC_Z + BLIS_POOL_KC_Z)*(BLIS_POOL_MC_Z + BLIS_POOL_KC_Z)/4 \
BLIS_SIZEOF_Z \
)
+#elif defined (MEM_MODEL_SMALL)
+
+#define BLIS_MK_BLOCK_SIZE_Z ( BLIS_POOL_MC_Z * \
+ ( BLIS_POOL_KC_Z \
+ ) * \
+ BLIS_SIZEOF_Z \
+ )
+
+#define BLIS_KN_BLOCK_SIZE_Z ( \
+ ( BLIS_POOL_KC_Z \
+ ) * \
+ BLIS_POOL_NC_Z * \
+ BLIS_SIZEOF_Z \
+ )
+
+#endif
+
#define BLIS_MN_BLOCK_SIZE_Z ( BLIS_POOL_MC_Z * \
BLIS_POOL_NC_Z * \
BLIS_SIZEOF_Z \
// Compute memory pool block sizes for single complex (4m).
//
#ifdef BLIS_ENABLE_C66X_MEM_POOLS
-/*
-#define BLIS_MK_BLOCK_SIZE_4M_C ( BLIS_POOL_4M_MC_L2_C * \
- ( BLIS_POOL_4M_KC_L2_C + \
- ( BLIS_UPANEL_A_ALIGN_SIZE_C / \
- BLIS_SIZEOF_C ) \
- ) * \
- BLIS_SIZEOF_C \
- )
-#define BLIS_KN_BLOCK_SIZE_4M_C ( \
- ( BLIS_POOL_4M_KC_L3_C + \
- ( BLIS_UPANEL_B_ALIGN_SIZE_C / \
- BLIS_SIZEOF_C ) \
- ) * \
- BLIS_POOL_4M_NC_L3_C * \
- BLIS_SIZEOF_C \
- )
-*/
+#if defined(MEM_MODEL_LARGE) || defined (MEM_MODEL_MEDIUM)
+
#define BLIS_MK_BLOCK_SIZE_4M_C ( bli_max( BLIS_POOL_4M_MC_C*(BLIS_POOL_4M_MC_C + BLIS_POOL_4M_KC_C), \
(BLIS_POOL_4M_MC_C + BLIS_POOL_4M_KC_C)*(BLIS_POOL_4M_MC_C + BLIS_POOL_4M_KC_C)/4 \
) * \
) * \
BLIS_SIZEOF_C \
)
+#elif defined (MEM_MODEL_SMALL)
+#define BLIS_MK_BLOCK_SIZE_4M_C ( BLIS_POOL_4M_MC_C * \
+ ( BLIS_POOL_4M_KC_C \
+ ) * \
+ BLIS_SIZEOF_C \
+ )
+
+#define BLIS_KN_BLOCK_SIZE_4M_C ( \
+ ( BLIS_POOL_4M_KC_C \
+ ) * \
+ BLIS_POOL_4M_NC_C * \
+ BLIS_SIZEOF_C \
+ )
+#endif
+
#define BLIS_MN_BLOCK_SIZE_4M_C ( BLIS_POOL_4M_MC_C * \
// Compute memory pool block sizes for double complex (4m).
//
#ifdef BLIS_ENABLE_C66X_MEM_POOLS
-/*
-#define BLIS_MK_BLOCK_SIZE_4M_Z ( BLIS_POOL_4M_MC_L2_Z * \
- ( BLIS_POOL_4M_KC_L2_Z + \
- ( BLIS_UPANEL_A_ALIGN_SIZE_Z / \
- BLIS_SIZEOF_Z ) \
- ) * \
- BLIS_SIZEOF_Z \
- )
-#define BLIS_KN_BLOCK_SIZE_4M_Z ( \
- ( BLIS_POOL_4M_KC_L3_Z + \
- ( BLIS_UPANEL_B_ALIGN_SIZE_Z / \
- BLIS_SIZEOF_Z ) \
- ) * \
- BLIS_POOL_4M_NC_L3_Z * \
- BLIS_SIZEOF_Z \
- )
-*/
+
+#if defined(MEM_MODEL_LARGE) || defined (MEM_MODEL_MEDIUM)
+
#define BLIS_MK_BLOCK_SIZE_4M_Z ( bli_max( BLIS_POOL_4M_MC_Z*(BLIS_POOL_4M_MC_Z + BLIS_POOL_4M_KC_Z), \
(BLIS_POOL_4M_MC_Z + BLIS_POOL_4M_KC_Z)*(BLIS_POOL_4M_MC_Z + BLIS_POOL_4M_KC_Z)/4 \
) * \
BLIS_SIZEOF_Z \
)
+#elif defined (MEM_MODEL_SMALL)
+
+#define BLIS_MK_BLOCK_SIZE_4M_Z ( BLIS_POOL_4M_MC_Z * \
+ ( BLIS_POOL_4M_KC_Z \
+ ) * \
+ BLIS_SIZEOF_Z \
+ )
+
+#define BLIS_KN_BLOCK_SIZE_4M_Z ( \
+ ( BLIS_POOL_4M_KC_Z \
+ ) * \
+ BLIS_POOL_4M_NC_Z * \
+ BLIS_SIZEOF_Z \
+ )
+#endif
+
#define BLIS_MN_BLOCK_SIZE_4M_Z ( BLIS_POOL_4M_MC_Z * \
BLIS_POOL_4M_NC_Z * \
// NOTE: We scale by 3/2 because 3m requires 50% more space than 4m.
#ifdef BLIS_ENABLE_C66X_MEM_POOLS
-/*
-#define BLIS_MK_BLOCK_SIZE_3M_C ( BLIS_POOL_3M_MC_L2_C * \
- ( BLIS_POOL_3M_KC_L2_C + \
- ( BLIS_UPANEL_A_ALIGN_SIZE_C / \
- BLIS_SIZEOF_C ) \
- ) * \
- ( BLIS_SIZEOF_C * \
- 3 \
- ) / 2 \
- )
-#define BLIS_KN_BLOCK_SIZE_3M_C ( \
- ( BLIS_POOL_3M_KC_L3_C + \
- ( BLIS_UPANEL_B_ALIGN_SIZE_C / \
- BLIS_SIZEOF_C ) \
- ) * \
- BLIS_POOL_3M_NC_L3_C * \
- ( BLIS_SIZEOF_C * \
- 3 \
- ) / 2 \
- )
-*/
+
+#if defined(MEM_MODEL_LARGE) || defined (MEM_MODEL_MEDIUM)
+
#define BLIS_MK_BLOCK_SIZE_3M_C ( bli_max( BLIS_POOL_3M_MC_C*(BLIS_POOL_3M_MC_C + BLIS_POOL_3M_KC_C), \
(BLIS_POOL_3M_MC_C + BLIS_POOL_3M_KC_C)*(BLIS_POOL_3M_MC_C + BLIS_POOL_3M_KC_C)/4 \
) * \
3 / 2 \
)
+#elif defined (MEM_MODEL_SMALL)
+
+#define BLIS_MK_BLOCK_SIZE_3M_C ( BLIS_POOL_3M_MC_C * \
+ ( BLIS_POOL_3M_KC_C \
+ ) * \
+ ( BLIS_SIZEOF_C * \
+ 3 \
+ ) / 2 \
+ )
+
+#define BLIS_KN_BLOCK_SIZE_3M_C ( \
+ ( BLIS_POOL_3M_KC_C \
+ ) * \
+ BLIS_POOL_3M_NC_C * \
+ ( BLIS_SIZEOF_C * \
+ 3 \
+ ) / 2 \
+ )
+#endif
#define BLIS_MN_BLOCK_SIZE_3M_C ( BLIS_POOL_3M_MC_C * \
BLIS_POOL_3M_NC_C * \
// NOTE: We scale by 3/2 because 3m requires 50% more space than 4m.
#ifdef BLIS_ENABLE_C66X_MEM_POOLS
-/*
-#define BLIS_MK_BLOCK_SIZE_3M_Z ( BLIS_POOL_3M_MC_L2_Z * \
- ( BLIS_POOL_3M_KC_L2_Z + \
- ( BLIS_UPANEL_A_ALIGN_SIZE_Z / \
- BLIS_SIZEOF_Z ) \
- ) * \
- ( BLIS_SIZEOF_Z * \
- 3 \
- ) / 2 \
- )
-#define BLIS_KN_BLOCK_SIZE_3M_Z ( \
- ( BLIS_POOL_3M_KC_L3_Z + \
- ( BLIS_UPANEL_B_ALIGN_SIZE_Z / \
- BLIS_SIZEOF_Z ) \
- ) * \
- BLIS_POOL_3M_NC_L3_Z * \
- ( BLIS_SIZEOF_Z * \
- 3 \
- ) / 2 \
- )
-*/
+#if defined(MEM_MODEL_LARGE) || defined (MEM_MODEL_MEDIUM)
+
#define BLIS_MK_BLOCK_SIZE_3M_Z ( bli_max( BLIS_POOL_3M_MC_Z*(BLIS_POOL_3M_MC_Z + BLIS_POOL_3M_KC_Z), \
(BLIS_POOL_3M_MC_Z + BLIS_POOL_3M_KC_Z)*(BLIS_POOL_3M_MC_Z + BLIS_POOL_3M_KC_Z) / 4 \
) * \
3 / 2 \
)
+#elif defined (MEM_MODEL_SMALL)
+#define BLIS_MK_BLOCK_SIZE_3M_Z ( BLIS_POOL_3M_MC_Z * \
+ ( BLIS_POOL_3M_KC_Z \
+ ) * \
+ ( BLIS_SIZEOF_Z * \
+ 3 \
+ ) / 2 \
+ )
+
+
+#define BLIS_KN_BLOCK_SIZE_3M_Z ( \
+ ( BLIS_POOL_3M_KC_Z \
+ ) * \
+ BLIS_POOL_3M_NC_Z * \
+ ( BLIS_SIZEOF_Z * \
+ 3 \
+ ) / 2 \
+ )
+#endif
+
#define BLIS_MN_BLOCK_SIZE_3M_Z ( BLIS_POOL_3M_MC_Z * \
BLIS_POOL_3M_NC_Z * \
)
-/*#define BLIS_MN_POOL_SIZE_L1 ( \
- BLIS_NUM_MC_X_NC_BLOCKS_L1 * \
- ( BLIS_MN_BLOCK_SIZE + \
- BLIS_CONTIG_ADDR_ALIGN_SIZE \
- ) + \
- BLIS_MAX_PRELOAD_BYTE_OFFSET \
+#define BLIS_MNR_POOL_SIZE_L2 ( \
+ BLIS_NUM_MC_X_NR_BLOCKS_L1 * \
+ ( BLIS_MNR_BLOCK_SIZE ) \
)
-*/
+
#define BLIS_MN_POOL_SIZE_L1 ( \
BLIS_NUM_MC_X_NC_BLOCKS_L1 * \
( BLIS_MN_BLOCK_SIZE + \
index b52c8bb975f14834d6f984b685f06247bf536c4a..7f2ec1885c6150af596a033bf2c8be0139aec5e2 100644 (file)
\
bli_obj_is_lower( *bli_obj_root( obj ) ) \
+#define bli_obj_root_uplo( obj ) \
+\
+ bli_obj_uplo( *bli_obj_root( obj ) )
// Root matrix modification
\
bli_mem_set_buffer( NULL, pack_mem ); \
bli_mem_set_buffer( NULL, dma_mem ); \
- bli_edma_handle_set_NULL( *obj_p ); \
+ bli_emt_handle_set_NULL( *obj_p ); \
}
#else
index 6a6f8d4f7fae4fbd590de1acaf1072f4d2d29d77..363e59ec53ed28cd14c5908600892c8ca6ada819 100644 (file)
#ifdef BLIS_ENABLE_C66X_EDMA
mem_t dma_mem;
- EdmaMgr_Handle edma_handle;
+ lib_emt_Handle emt_handle;
#endif
} obj_t;
(b).n_panel = (a).n_panel; \
(b).dma_mem = (a).dma_mem; \
/* Cannot use the macro because the definition of the bli_dma.h comes after bli_type_defs.h in blis.h */ \
- (b).edma_handle = NULL; \
+ (b).emt_handle = NULL; \
}
#else
#define bli_obj_init_full_shallow_copy_of( a, b ) \
index 7c94e2b2223a2fae3503cb6b6bad8be46376fe42..d0926536fa8df0b040b5495a27f7b08ee3a627a9 100644 (file)
//DMA include
#include "bli_dma.h" //Has to be after bli_cntl, because bli_dma.h uses typedefs from bli_cntl.h
#endif
+#include "bli_profile.h"
+
// -- Level-0 operations --
index a4c2779bd8d9e4822991c30451b8451930962f60..dc810df29ff33c2aa62d1d4f75718a3d18aaef4d 100644 (file)
#define BLIS_FPRINTS_H
// prints
-#ifdef BLIS_ENABLE_C66X_OPENCL
-#define bli_sfprints( file, spec, x ) \
-{ \
- printf( spec, (x) ); \
-}
-#define bli_dfprints( file, spec, x ) \
-{ \
- printf(spec, (x) ); \
-}
-#define bli_cfprints( file, spec, x ) \
-{ \
- printf( spec, bli_creal(x) ); \
- printf( " + " ); \
- printf( spec, bli_cimag(x) ); \
- printf( " " ); \
-}
-#define bli_zfprints( file, spec, x ) \
-{ \
- printf( spec, bli_zreal(x) ); \
- printf( " + " ); \
- printf( spec, bli_zimag(x) ); \
- printf( " " ); \
-}
-#define bli_ifprints( file, spec, x ) \
-{ \
- printf( spec, (x) ); \
-}
-#else
+
#define bli_sfprints( file, spec, x ) \
{ \
fprintf( file, spec, (x) ); \
{ \
fprintf( file, spec, (x) ); \
}
-#endif
#endif
index 5097ab0dcfe6418ccfe8edc172b5a6ed84ba0528..d6afa34b575b02725c3a2da4d4b6876364c21f97 100644 (file)
dcomplex* zp = bli_obj_buffer_for_const( BLIS_DCOMPLEX, *x );
gint_t* ip = bli_obj_buffer_for_const( BLIS_INT, *x );
-#ifdef BLIS_ENABLE_C66X_OPENCL
- printf( "%s\n", s1 );
- printf( " float: %9.2e\n", bli_sreal( *sp ) );
- printf( " double: %9.2e\n", bli_dreal( *dp ) );
- printf( " scomplex: %9.2e + %9.2e\n", bli_creal( *cp ), bli_cimag( *cp ) );
- printf( " dcomplex: %9.2e + %9.2e\n", bli_zreal( *zp ), bli_zimag( *zp ) );
- printf( " int: %ld\n", (long int)*ip );
- printf( "\n" );
- return;
-#else
fprintf( file, "%s\n", s1 );
fprintf( file, " float: %9.2e\n", bli_sreal( *sp ) );
fprintf( file, " double: %9.2e\n", bli_dreal( *dp ) );
fprintf( file, " int: %ld\n", (long int)*ip );
fprintf( file, "\n" );
return;
-#endif
}
// Index into the type combination array to extract the correct
s2 );
}
-#ifdef BLIS_ENABLE_C66X_OPENCL
-#undef GENTFUNC
-#define GENTFUNC( ctype, ch, opname, varname ) \
-\
-void PASTEMAC(ch,opname)( \
- FILE* file, \
- char* s1, \
- dim_t m, \
- dim_t n, \
- void* x, inc_t rs_x, inc_t cs_x, \
- char* format, \
- char* s2 \
- ) \
-{ \
- dim_t i, j; \
- ctype* chi1; \
- char default_spec[32] = PASTEMAC(ch,formatspec)(); \
-\
- if ( format == NULL ) format = default_spec; \
-\
- printf( "%s\n", s1 ); \
-\
- for ( i = 0; i < m; ++i ) \
- { \
- for ( j = 0; j < n; ++j ) \
- { \
- chi1 = (( ctype* ) x) + i*rs_x + j*cs_x; \
-\
- PASTEMAC(ch,fprints)( file, format, *chi1 ); \
- printf( " " ); \
- } \
-\
- printf( ";\n" ); \
- } \
-\
- printf( "%s\n", s2 ); \
-}
-
-INSERT_GENTFUNC_BASIC_I( fprintm, fprintm )
-#else
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, varname ) \
\
}
INSERT_GENTFUNC_BASIC_I( fprintm, fprintm )
-#endif
+
index e3629615c07d5ea6232ec07e81488a793722de4b..c4387366b7ef4d9bb1231c7dc2730c4785e3c06d 100644 (file)
format,
s2 );
}
-#ifdef BLIS_ENABLE_C66X_OPENCL
-
-#undef GENTFUNC
-#define GENTFUNC( ctype, ch, opname, varname ) \
-\
-void PASTEMAC(ch,opname)( \
- FILE* file, \
- char* s1, \
- dim_t n, \
- void* x, inc_t incx, \
- char* format, \
- char* s2 \
- ) \
-{ \
- dim_t i; \
- ctype* chi1; \
- char default_spec[32] = PASTEMAC(ch,formatspec)(); \
-\
- if ( format == NULL ) format = default_spec; \
-\
- chi1 = x; \
-\
- printf( "%s\n", s1 ); \
-\
- for ( i = 0; i < n; ++i ) \
- { \
- PASTEMAC(ch,fprints)( file, format, *chi1 ); \
- printf( "\n" ); \
-\
- chi1 += incx; \
- } \
-\
- printf( "\n" ); \
- printf( "%s\n", s2 ); \
-}
-
-INSERT_GENTFUNC_BASIC_I( fprintv, fprintv )
-
-#else
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, varname ) \
}
INSERT_GENTFUNC_BASIC_I( fprintv, fprintv )
-#endif
diff --git a/blis/kernels/armv7a/3/bli_cgemm_kernel_2x2.S b/blis/kernels/armv7a/3/bli_cgemm_kernel_2x2.S
--- /dev/null
@@ -0,0 +1,502 @@
+
+// Single-precision complex GEMM micro-kernel operating on a 2x2 tile of C.
+#define REALNAME bli_cgemm_kernel_2x2
+
+// Bytes of stack reserved below the saved core registers.
+#define STACKSIZE 256
+
+// Arguments received in core registers.
+#define K r0
+#define PTR_ALPHA r1
+#define OLD_A r2
+#define OLD_B r3
+// Arguments received on the stack. The prologue sets fp to the value sp had
+// on entry, so the first stack argument is at [fp, #0].
+#define PTR_BETA [fp, #0 ]
+#define OLD_C [fp, #4 ]
+#define OLD_RSC [fp, #8 ]
+#define OLD_CSC [fp, #12 ]
+#define AUX [fp, #16 ]
+
+/******************************************************
+* [fp, #-128] - [fp, #-32] is reserved
+* for store and restore of floating point
+* registers (s8 - s31, 24 words)
+*******************************************************/
+
+// Loop counter. Shares r2 with OLD_A, which is copied into AO before L is written.
+#define L r2
+
+// Running pointers into the A and B operands.
+#define AO r5
+#define BO r6
+
+// Pointers to the two columns of the C tile (CO2 = CO1 + column stride).
+#define CO1 r7
+#define CO2 r8
+
+
+// Byte offsets used by the pld prefetch instructions.
+#define A_PRE 96
+#define B_PRE 96
+#define C_PRE 0
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+// Multiply-accumulate flavours used in SAVE2x2 for the beta * C product.
+#define FMAC_BR fnmacs
+#define FMAC_BI fmacs
+
+// NN is defined unconditionally, so the first branch of the #if chain below
+// is always the one selected in this file.
+#define NN 1
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+
+ #define FADD_R fsubs
+ #define FADD_I fadds
+
+ #define FMAC_R1 fnmacs
+ #define FMAC_R2 fnmacs
+ #define FMAC_I1 fmacs
+ #define FMAC_I2 fnmacs
+
+#elif defined(CN) || defined(CT)
+
+ #define FADD_R fadds
+ #define FADD_I fsubs
+
+ #define FMAC_R1 fmacs
+ #define FMAC_R2 fmacs
+ #define FMAC_I1 fnmacs
+ #define FMAC_I2 fmacs
+
+#elif defined(NC) || defined(TC)
+
+ #define FADD_R fadds
+ #define FADD_I fsubs
+
+ #define FMAC_R1 fmacs
+ #define FMAC_R2 fnmacs
+ #define FMAC_I1 fmacs
+ #define FMAC_I2 fmacs
+
+#else
+
+ #define FADD_R fsubs
+ #define FADD_I fadds
+
+ #define FMAC_R1 fnmacs
+ #define FMAC_R2 fmacs
+ #define FMAC_I1 fnmacs
+ #define FMAC_I2 fnmacs
+
+#endif
+
+
+
+// Clears the sixteen accumulators s16 - s31 to +0.0.
+// The zero is taken from an integer register instead of being computed as
+// s16 - s16: that subtraction produces NaN whenever s16 holds NaN or an
+// infinity on entry, and the NaN would then be copied into every accumulator.
+// Clobbers r3 (used only as scratch at the point where this macro is invoked).
+.macro INIT2x2
+
+ mov r3, #0
+ vmov s16, r3 // all-zero bit pattern is +0.0f
+ vmov.f32 s17, s16
+ vmov.f32 s18, s16
+ vmov.f32 s19, s16
+ vmov.f32 s20, s16
+ vmov.f32 s21, s16
+ vmov.f32 s22, s16
+ vmov.f32 s23, s16
+ vmov.f32 s24, s16
+ vmov.f32 s25, s16
+ vmov.f32 s26, s16
+ vmov.f32 s27, s16
+ vmov.f32 s28, s16
+ vmov.f32 s29, s16
+ vmov.f32 s30, s16
+ vmov.f32 s31, s16
+
+.endm
+
+// First step of the software-pipelined loop.
+// Loads one set of operands into s0-s3 (from AO) and s8-s11 (from BO) and
+// starts the accumulators s16-s31 with fmuls, so they need no prior zeroing.
+// It then preloads the following set into s4-s7 / s12-s15.
+// AO and BO are each advanced by 32 bytes in total.
+.macro KERNEL2x2_I
+ pld [ AO , #A_PRE ]
+ pld [ BO , #B_PRE ]
+ flds s0 , [ AO ]
+ flds s1 , [ AO, #4 ]
+ flds s8 , [ BO ]
+ flds s9 , [ BO, #4 ]
+
+ fmuls s16 , s0, s8
+ flds s2 , [ AO, #8 ]
+ fmuls s24 , s1, s9
+ flds s3 , [ AO, #12 ]
+ fmuls s17 , s0, s9
+ flds s10, [ BO, #8 ]
+ fmuls s25 , s1, s8
+
+ flds s11, [ BO, #12 ]
+ fmuls s18 , s2, s8
+ add BO , BO, #16
+ fmuls s26 , s3, s9
+ add AO , AO, #16
+ fmuls s19 , s2, s9
+ pld [ BO , #B_PRE ]
+ fmuls s27 , s3, s8
+
+ pld [ AO , #A_PRE ]
+ fmuls s20 , s0, s10
+ flds s4 , [ AO, #0 ]
+ fmuls s28 , s1, s11
+ flds s5 , [ AO, #4 ]
+ fmuls s21 , s0, s11
+ flds s12, [ BO ]
+ fmuls s29 , s1, s10
+
+ flds s13, [ BO, #4 ]
+ fmuls s22 , s2, s10
+ flds s6 , [ AO, #8 ]
+ fmuls s30 , s3, s11
+ flds s7 , [ AO, #12 ]
+ fmuls s23 , s2, s11
+ flds s14, [ BO, #8 ]
+ fmuls s31 , s3, s10
+ flds s15, [ BO, #12 ]
+
+ add BO , BO, #16
+ add AO , AO, #16
+.endm
+
+
+
+// Pipelined step, phase 1.
+// Accumulates the products of s0-s3 (A) and s8-s11 (B) into s16-s31 while
+// loading the next operands into s4-s7 / s12-s15.
+// AO and BO are each advanced by 16 bytes.
+.macro KERNEL2x2_M1
+ pld [ AO , #A_PRE ]
+
+ fmacs s16 , s0, s8
+ pld [ BO , #B_PRE ]
+ fmacs s24 , s1, s9
+ flds s4 , [ AO, #0 ]
+ fmacs s17 , s0, s9
+ flds s5 , [ AO, #4 ]
+ fmacs s25 , s1, s8
+
+ flds s12, [ BO ]
+ fmacs s18 , s2, s8
+ flds s13, [ BO, #4 ]
+ fmacs s26 , s3, s9
+ flds s6 , [ AO, #8 ]
+ fmacs s19 , s2, s9
+ flds s7 , [ AO, #12 ]
+ fmacs s27 , s3, s8
+
+ fmacs s20 , s0, s10
+ flds s14, [ BO, #8 ]
+ fmacs s28 , s1, s11
+ fmacs s21 , s0, s11
+ flds s15, [ BO, #12 ]
+ fmacs s29 , s1, s10
+
+ fmacs s22 , s2, s10
+ add BO , BO, #16
+ fmacs s30 , s3, s11
+ fmacs s23 , s2, s11
+ add AO , AO, #16
+ fmacs s31 , s3, s10
+
+.endm
+
+// Pipelined step, phase 2 (counterpart of KERNEL2x2_M1).
+// Accumulates the products of s4-s7 (A) and s12-s15 (B) into s16-s31 while
+// loading the next operands into s0-s3 / s8-s11.
+// AO and BO are each advanced by 16 bytes.
+.macro KERNEL2x2_M2
+
+ fmacs s16 , s4, s12
+ fmacs s24 , s5, s13
+ flds s0 , [ AO, #0 ]
+ fmacs s17 , s4, s13
+ flds s1 , [ AO, #4 ]
+ fmacs s25 , s5, s12
+
+ fmacs s18 , s6, s12
+ flds s8 , [ BO ]
+ fmacs s26 , s7, s13
+ flds s9 , [ BO, #4 ]
+ fmacs s19 , s6, s13
+ fmacs s27 , s7, s12
+
+ flds s2 , [ AO, #8 ]
+ fmacs s20 , s4, s14
+ flds s3 , [ AO, #12 ]
+ fmacs s28 , s5, s15
+ fmacs s21 , s4, s15
+ flds s10, [ BO, #8 ]
+ fmacs s29 , s5, s14
+
+ flds s11, [ BO, #12 ]
+ fmacs s22 , s6, s14
+ fmacs s30 , s7, s15
+ add BO , BO, #16
+ fmacs s23 , s6, s15
+ add AO , AO, #16
+ fmacs s31 , s7, s14
+
+.endm
+
+
+// Final step of the pipelined sequence.
+// Accumulates the products of the already loaded s4-s7 / s12-s15 into
+// s16-s31. Performs no loads and leaves AO and BO unchanged.
+.macro KERNEL2x2_E
+
+ fmacs s16 , s4, s12
+ fmacs s24 , s5, s13
+ fmacs s17 , s4, s13
+ fmacs s25 , s5, s12
+
+ fmacs s18 , s6, s12
+ fmacs s26 , s7, s13
+ fmacs s19 , s6, s13
+ fmacs s27 , s7, s12
+
+ fmacs s20 , s4, s14
+ fmacs s28 , s5, s15
+ fmacs s21 , s4, s15
+ fmacs s29 , s5, s14
+
+ fmacs s22 , s6, s14
+ fmacs s30 , s7, s15
+ fmacs s23 , s6, s15
+ fmacs s31 , s7, s14
+
+.endm
+
+// Self-contained single iteration, used for the K % 8 remainder.
+// Loads s0-s3 from AO and s8-s11 from BO, accumulates their products into
+// s16-s31 and advances AO and BO by 16 bytes each.
+.macro KERNEL2x2_SUB
+
+ flds s0 , [ AO ]
+ flds s1 , [ AO, #4 ]
+ flds s8 , [ BO ]
+ flds s9 , [ BO, #4 ]
+
+ fmacs s16 , s0, s8
+ flds s2 , [ AO, #8 ]
+ fmacs s24 , s1, s9
+ flds s3 , [ AO, #12 ]
+ fmacs s17 , s0, s9
+ flds s10, [ BO, #8 ]
+ fmacs s25 , s1, s8
+
+ flds s11, [ BO, #12 ]
+ fmacs s18 , s2, s8
+ fmacs s26 , s3, s9
+ fmacs s19 , s2, s9
+ fmacs s27 , s3, s8
+
+ fmacs s20 , s0, s10
+ fmacs s28 , s1, s11
+ fmacs s21 , s0, s11
+ fmacs s29 , s1, s10
+
+ fmacs s22 , s2, s10
+ add BO , BO, #16
+ fmacs s30 , s3, s11
+ fmacs s23 , s2, s11
+ add AO , AO, #16
+ fmacs s31 , s3, s10
+
+.endm
+
+
+
+
+// Writes the 2x2 complex tile back to C.
+// Steps: merge the partial sums held in s16-s31 into real/imag pairs in
+// s16-s23, form beta * C for each element in s24-s31, accumulate
+// alpha * sum on top, and store the results through CO1 and CO2.
+// Overwrites r2, r3 and r4 as scratch, and the VFP registers s0-s5, s8-s9
+// and s16-s31.
+.macro SAVE2x2
+
+ ldr r3, OLD_RSC // Row stride size
+ lsl r3, r3, #3 // multiply with size of complex float
+
+ flds s0, [ PTR_ALPHA ] // load real part of alpha
+ flds s1, [ PTR_ALPHA, #4 ] // load imag part of alpha
+ ldr r4, PTR_BETA
+ flds s2, [ r4 ] // load real part of beta
+ flds s3, [ r4, #4 ] // load imag part of beta
+
+ // Add/Sub the real and the imag parts
+ FADD_R s16, s24 , s16
+ FADD_I s17, s25 , s17
+ FADD_R s18, s26 , s18
+ FADD_I s19, s27 , s19
+ FADD_R s20, s28 , s20
+ FADD_I s21, s29 , s21
+ FADD_R s22, s30 , s22
+ FADD_I s23, s31 , s23
+
+ mov r4, CO1 // save pointer
+ fldmias CO1, { s4 - s5 } // read real and imag part from C
+ add CO1, CO1, r3
+
+ mov r2, CO2 // save pointer
+ fldmias CO2, { s8 - s9 } // read real and imag part from C
+ add CO2, CO2, r3
+
+ fmuls s24, s4, s2 // multiply Beta-real with C-real
+ fmuls s25, s5, s2 // multiply Beta-real with C-imag
+ fmuls s28, s8, s2 // multiply Beta-real with C-real
+ fmuls s29, s9, s2 // multiply Beta-real with C-imag
+
+ FMAC_BR s24, s3, s5 // multiply beta-imag with C-imag and add
+ FMAC_BI s25, s3, s4 // multiply beta-imag with C-real and add
+ FMAC_BR s28, s3, s9 // multiply beta-imag with C-imag and add
+ FMAC_BI s29, s3, s8 // multiply beta-imag with C-real and add
+
+ // accumulate alpha (s0 real, s1 imag) times the sums for the first row
+ FMAC_R1 s24 , s0 , s16
+ FMAC_I1 s25 , s0 , s17
+ FMAC_R2 s24 , s1 , s17
+ FMAC_I2 s25 , s1 , s16
+
+ FMAC_R1 s28 , s0 , s20
+ FMAC_I1 s29 , s0 , s21
+ FMAC_R2 s28 , s1 , s21
+ FMAC_I2 s29 , s1 , s20
+
+ // CO1 and CO2 now point one row stride further on
+ fldmias CO1, { s4 - s5 } // read real and imag part from C
+ fldmias CO2, { s8 - s9 } // read real and imag part from C
+
+ fmuls s26, s4, s2 // multiply Beta-real with C-real
+ fmuls s27, s5, s2 // multiply Beta-real with C-imag
+ fmuls s30, s8, s2 // multiply Beta-real with C-real
+ fmuls s31, s9, s2 // multiply Beta-real with C-imag
+
+ FMAC_BR s26, s3, s5 // multiply beta-imag with C-imag and add
+ FMAC_BI s27, s3, s4 // multiply beta-imag with C-real and add
+ FMAC_BR s30, s3, s9 // multiply beta-imag with C-imag and add
+ FMAC_BI s31, s3, s8 // multiply beta-imag with C-real and add
+
+ // accumulate alpha times the sums for the second row
+ FMAC_R1 s26 , s0 , s18
+ FMAC_I1 s27 , s0 , s19
+ FMAC_R2 s26 , s1 , s19
+ FMAC_I2 s27 , s1 , s18
+
+ FMAC_R1 s30, s0 , s22
+ FMAC_I1 s31, s0 , s23
+ FMAC_R2 s30, s1 , s23
+ FMAC_I2 s31, s1 , s22
+
+ mov CO1, r4 // restore pointer
+ mov CO2, r2 // restore pointer
+ fstmias CO1, { s24 - s25 }
+ fstmias CO2, { s28 - s29 }
+ add CO1, CO1, r3
+ add CO2, CO2, r3
+ fstmias CO1, { s26 - s27 }
+ fstmias CO2, { s30 - s31 }
+
+
+.endm
+
+
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+// Entry point and prologue.
+// Saves the core and VFP registers the kernel overwrites, copies the A and B
+// pointers into AO and BO, and derives the two C column pointers CO1 and CO2.
+ .arm
+ .global REALNAME
+ .type REALNAME, %function // function symbol, so callers built as Thumb can interwork
+ .func REALNAME
+
+REALNAME:
+
+ push {r4 - r9, fp} // save register
+ add fp, sp, #28 // 7 saved registers * 4 bytes: fp = sp on entry
+ sub sp, sp, #STACKSIZE // reserve stack
+
+ mov AO, OLD_A // pointer matrix A
+ mov BO, OLD_B // pointer matrix B
+
+ sub r3, fp, #128 // r3 no longer needed as OLD_B
+ vstm r3, { s8 - s31} // store floating point registers
+
+ ldr r2, OLD_C // pointer matrix C
+ ldr r3, OLD_CSC // Col stride size of C
+ lsl r3, r3, #3 // multiply with size of complex float
+
+ mov CO1, r2 // first column of the C tile
+ add CO2, CO1, r3 // second column of the C tile
+
+ pld [ CO1, #C_PRE ] // prefetch both columns of C
+ pld [ CO2, #C_PRE ]
+
+// Loop control. K is consumed in groups of eight iterations (L = K / 8),
+// followed by a remainder loop for K % 8.
+cgemm_kernel_L2_M2_20:
+
+ asrs L , K, #3 // L = K / 8
+ cmp L , #2
+ blt cgemm_kernel_L2_M2_32
+
+ // First group: _I starts the accumulators; the closing _M2 preloads the
+ // operands for the group that follows.
+ KERNEL2x2_I
+ KERNEL2x2_M2
+ KERNEL2x2_M1
+ KERNEL2x2_M2
+
+ KERNEL2x2_M1
+ KERNEL2x2_M2
+ KERNEL2x2_M1
+ KERNEL2x2_M2
+
+ subs L, L, #2
+ ble cgemm_kernel_L2_M2_22a
+ .align 5
+
+// Steady state: one group of eight per pass.
+cgemm_kernel_L2_M2_22:
+
+ KERNEL2x2_M1
+ KERNEL2x2_M2
+ KERNEL2x2_M1
+ KERNEL2x2_M2
+
+ KERNEL2x2_M1
+ KERNEL2x2_M2
+ KERNEL2x2_M1
+ KERNEL2x2_M2
+
+ subs L, L, #1
+ bgt cgemm_kernel_L2_M2_22
+
+// Last group: closes with _E, which consumes the preloaded operands without
+// loading any more.
+cgemm_kernel_L2_M2_22a:
+
+ KERNEL2x2_M1
+ KERNEL2x2_M2
+ KERNEL2x2_M1
+ KERNEL2x2_M2
+
+ KERNEL2x2_M1
+ KERNEL2x2_M2
+ KERNEL2x2_M1
+ KERNEL2x2_E
+
+ b cgemm_kernel_L2_M2_44
+
+// Fewer than two groups. The branch below is taken when bit 0 of L is clear;
+// otherwise a single self-contained group of eight is executed.
+cgemm_kernel_L2_M2_32:
+
+ tst L, #1
+ ble cgemm_kernel_L2_M2_40
+
+ KERNEL2x2_I
+ KERNEL2x2_M2
+ KERNEL2x2_M1
+ KERNEL2x2_M2
+
+ KERNEL2x2_M1
+ KERNEL2x2_M2
+ KERNEL2x2_M1
+ KERNEL2x2_E
+
+ b cgemm_kernel_L2_M2_44
+
+// No group was executed, so the accumulators have to be cleared explicitly.
+cgemm_kernel_L2_M2_40:
+
+ INIT2x2
+
+// Remainder: K % 8 single iterations.
+cgemm_kernel_L2_M2_44:
+
+ ands L , K, #7 // L = K % 8
+ ble cgemm_kernel_L2_M2_100
+
+cgemm_kernel_L2_M2_46:
+
+ KERNEL2x2_SUB
+
+ subs L, L, #1
+ bne cgemm_kernel_L2_M2_46
+
+// Scale and write the tile back to C.
+cgemm_kernel_L2_M2_100:
+
+ SAVE2x2
+
+// Epilogue: undo the prologue and return.
+cgemm_kernel_L999:
+
+ sub r3, fp, #128
+ vldm r3, { s8 - s31} // restore floating point registers
+
+ sub sp, fp, #28 // back to the address of the saved core registers
+ pop {r4 - r9, fp}
+ bx lr
+
diff --git a/blis/kernels/armv7a/3/bli_dgemm_kernel_4x4.S b/blis/kernels/armv7a/3/bli_dgemm_kernel_4x4.S
--- /dev/null
@@ -0,0 +1,503 @@
+
+// Double-precision real GEMM micro-kernel operating on a 4x4 tile of C.
+#define REALNAME bli_dgemm_kernel_4x4
+
+// Bytes of stack reserved below the saved core registers.
+#define STACKSIZE 256
+
+// Arguments received in core registers.
+#define K r0
+#define PTR_ALPHA r1
+#define OLD_A r2
+#define OLD_B r3
+// Arguments received on the stack. The prologue sets fp to the value sp had
+// on entry, so the first stack argument is at [fp, #0].
+#define PTR_BETA [fp, #0 ]
+#define OLD_C [fp, #4 ]
+#define OLD_RSC [fp, #8 ]
+#define OLD_CSC [fp, #12 ]
+#define AUX [fp, #16 ]
+
+/******************************************************
+* [fp, #-128] - [fp, #-64] is reserved
+* for store and restore of floating point
+* register
+*******************************************************/
+
+// Loop counter. Shares r2 with OLD_A, which is copied into AO before L is written.
+#define L r2
+
+// Running pointers into the A and B operands.
+#define AO r5
+#define BO r6
+
+// Pointers to the four columns of the C tile, one column stride apart.
+#define CO1 r7
+#define CO2 r8
+#define CO3 r9
+#define CO4 r12
+
+
+// Byte offsets used by the pld prefetch instructions.
+#define A_PRE 96
+#define B_PRE 96
+#define C_PRE 0
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+// Clears the sixteen accumulators d16 - d31 to +0.0.
+// The zero is taken from an integer register instead of being computed as
+// d16 - d16: that subtraction produces NaN whenever d16 holds NaN or an
+// infinity on entry, and the NaN would then be copied into every accumulator.
+// Clobbers r3 (used only as scratch at the point where this macro is invoked).
+.macro INIT4x4
+
+ mov r3, #0
+ vmov d16, r3, r3 // both words zero gives +0.0
+ vmov.f64 d17, d16
+ vmov.f64 d18, d16
+ vmov.f64 d19, d16
+ vmov.f64 d20, d16
+ vmov.f64 d21, d16
+ vmov.f64 d22, d16
+ vmov.f64 d23, d16
+ vmov.f64 d24, d16
+ vmov.f64 d25, d16
+ vmov.f64 d26, d16
+ vmov.f64 d27, d16
+ vmov.f64 d28, d16
+ vmov.f64 d29, d16
+ vmov.f64 d30, d16
+ vmov.f64 d31, d16
+
+.endm
+
+// First step of the software-pipelined loop.
+// Loads d0-d3 from AO and d8-d11 from BO and starts the accumulators d16-d31
+// with fmuld, so they need no prior zeroing. AO and BO are advanced by
+// 32 bytes, after which the following operands are preloaded into d4-d7 and
+// d12-d15 from the new AO / BO without advancing them again.
+.macro KERNEL4x4_I
+ pld [ BO , #B_PRE ]
+ fldd d8 , [ BO ]
+ fldd d0 , [ AO ]
+ pld [ AO , #A_PRE ]
+
+ fldd d1 , [ AO, #8 ]
+ fmuld d16 , d0, d8
+ fldd d2 , [ AO, #16 ]
+ fmuld d17 , d1, d8
+ fldd d3 , [ AO, #24 ]
+ fmuld d18 , d2, d8
+ fldd d9 , [ BO, #8 ]
+ fmuld d19 , d3, d8
+
+ fldd d10, [ BO, #16 ]
+ fmuld d20 , d0, d9
+ fldd d11, [ BO, #24 ]
+ fmuld d21 , d1, d9
+ add BO , BO, #32
+ add AO , AO, #32
+ fmuld d22 , d2, d9
+
+ pld [ BO , #B_PRE ]
+ fldd d12, [ BO ]
+ fmuld d23 , d3, d9
+
+ pld [ AO , #A_PRE ]
+ fldd d4 , [ AO, #0 ]
+ fmuld d24 , d0, d10
+ fldd d5 , [ AO, #8 ]
+ fmuld d25 , d1, d10
+ fldd d6 , [ AO, #16 ]
+ fmuld d26 , d2, d10
+ fldd d7 , [ AO, #24 ]
+ fmuld d27 , d3, d10
+
+ fldd d13, [ BO, #8 ]
+ fmuld d28 , d0, d11
+ fldd d14, [ BO, #16 ]
+ fmuld d29 , d1, d11
+ fldd d15, [ BO, #24 ]
+ fmuld d30 , d2, d11
+ fmuld d31 , d3, d11
+
+.endm
+
+// Pipelined step, phase 2.
+// Accumulates the products of d4-d7 (A) and d12-d15 (B) into d16-d31 while
+// loading the next operands into d0-d3 / d8-d11 from byte offsets 32..56.
+// AO and BO are each advanced by 64 bytes, which also covers the 32 bytes
+// that KERNEL4x4_M1 and the tail of KERNEL4x4_I read without advancing.
+.macro KERNEL4x4_M2
+
+ fmacd d16 , d4, d12
+ pld [ AO , #A_PRE+32 ]
+ fmacd d17 , d5, d12
+ fldd d0 , [ AO , #32 ]
+ fmacd d18 , d6, d12
+ pld [ BO , #B_PRE+32 ]
+ fmacd d19 , d7, d12
+
+ fldd d8 , [ BO , #32 ]
+ fmacd d20 , d4, d13
+ fldd d1 , [ AO, #40 ]
+ fmacd d21 , d5, d13
+ fldd d2 , [ AO, #48 ]
+ fmacd d22 , d6, d13
+ fldd d3 , [ AO, #56 ]
+ fmacd d23 , d7, d13
+
+ fmacd d24 , d4, d14
+ fmacd d25 , d5, d14
+ fldd d9 , [ BO, #40 ]
+ fmacd d26 , d6, d14
+ fldd d10, [ BO, #48 ]
+ fmacd d27 , d7, d14
+
+ fldd d11, [ BO, #56 ]
+ fmacd d28 , d4, d15
+ fmacd d29 , d5, d15
+ add AO , AO, #64
+ fmacd d30 , d6, d15
+ add BO , BO, #64
+ fmacd d31 , d7, d15
+
+.endm
+
+// Pipelined step, phase 1.
+// Accumulates the products of d0-d3 (A) and d8-d11 (B) into d16-d31 while
+// loading the next operands into d4-d7 / d12-d15 from byte offsets 0..24.
+// AO and BO are not advanced here.
+.macro KERNEL4x4_M1
+
+ fmacd d16 , d0, d8
+ pld [ AO , #A_PRE ]
+ fmacd d17 , d1, d8
+ fldd d4 , [ AO ]
+ fmacd d18 , d2, d8
+ pld [ BO , #B_PRE ]
+ fmacd d19 , d3, d8
+
+ fldd d12, [ BO ]
+ fmacd d20 , d0, d9
+ fldd d5 , [ AO, #8 ]
+ fmacd d21 , d1, d9
+ fldd d6 , [ AO, #16 ]
+ fmacd d22 , d2, d9
+ fldd d7 , [ AO, #24 ]
+ fmacd d23 , d3, d9
+
+ fmacd d24 , d0, d10
+ fmacd d25 , d1, d10
+ fldd d13, [ BO, #8 ]
+ fmacd d26 , d2, d10
+ fldd d14, [ BO, #16 ]
+ fmacd d27 , d3, d10
+
+ fldd d15, [ BO, #24 ]
+ fmacd d28 , d0, d11
+ fmacd d29 , d1, d11
+ fmacd d30 , d2, d11
+ fmacd d31 , d3, d11
+
+.endm
+
+// Final step of the pipelined sequence.
+// Accumulates the products of the already loaded d4-d7 / d12-d15 into
+// d16-d31 without loading anything, and advances AO and BO by 32 bytes.
+.macro KERNEL4x4_E
+
+ fmacd d16 , d4, d12
+ fmacd d17 , d5, d12
+ add BO , BO, #32
+ fmacd d18 , d6, d12
+ add AO , AO, #32
+ fmacd d19 , d7, d12
+
+ fmacd d20 , d4, d13
+ fmacd d21 , d5, d13
+ fmacd d22 , d6, d13
+ fmacd d23 , d7, d13
+
+ fmacd d24 , d4, d14
+ fmacd d25 , d5, d14
+ fmacd d26 , d6, d14
+ fmacd d27 , d7, d14
+
+ fmacd d28 , d4, d15
+ fmacd d29 , d5, d15
+ fmacd d30 , d6, d15
+ fmacd d31 , d7, d15
+
+.endm
+
+// Self-contained single iteration, used for the K % 8 remainder.
+// Loads d0-d3 from AO and d8-d11 from BO, accumulates their products into
+// d16-d31 and advances AO and BO by 32 bytes each.
+.macro KERNEL4x4_SUB
+
+ fldd d8 , [ BO ]
+ pld [ BO , #B_PRE ]
+
+ fldd d0 , [ AO ]
+ pld [ AO , #A_PRE ]
+ fldd d1 , [ AO, #8 ]
+
+ fmacd d16 , d0, d8
+ fldd d2 , [ AO, #16 ]
+ fmacd d17 , d1, d8
+ fldd d3 , [ AO, #24 ]
+ fmacd d18 , d2, d8
+ fldd d9 , [ BO, #8 ]
+ fmacd d19 , d3, d8
+
+ fldd d10, [ BO, #16 ]
+ fmacd d20 , d0, d9
+ fldd d11, [ BO, #24 ]
+ fmacd d21 , d1, d9
+ fmacd d22 , d2, d9
+ fmacd d23 , d3, d9
+
+ fmacd d24 , d0, d10
+ fmacd d25 , d1, d10
+ fmacd d26 , d2, d10
+ fmacd d27 , d3, d10
+
+ fmacd d28 , d0, d11
+ fmacd d29 , d1, d11
+ add AO , AO, #32
+ fmacd d30 , d2, d11
+ add BO , BO, #32
+ fmacd d31 , d3, d11
+
+.endm
+
+// Writes the 4x4 tile back to C: every element becomes
+// beta * C + alpha * sum, with the sums taken from d16-d31.
+// Each of CO1..CO4 is walked down its column in steps of the row stride (r3)
+// and restored afterwards. Loads for the next column are interleaved with
+// the arithmetic and stores of the current one.
+// Overwrites r2, r3 and r4 as scratch, and d0, d1 and d8-d15.
+.macro SAVE4x4
+
+ ldr r3, OLD_RSC // Row stride size
+ lsl r3, r3, #3 // multiply with size of double
+
+ fldd d0, [ PTR_ALPHA ] // load alpha
+ ldr r4, PTR_BETA
+ fldd d1, [ r4 ] // load beta
+
+//-----------------------------------------------------------
+// Column 1 (CO1): load, scale and store. Column 2 (CO2) is loaded alongside.
+ mov r2, CO1 // save pointer
+ mov r4, CO2 // save pointer
+ fldd d8, [ CO1 ] // load value from C
+ fldd d12, [ CO2 ] // load value from C
+ fmuld d8, d8, d1 // multiply with beta
+ add CO1, CO1, r3 // compute next pointer
+ fmacd d8, d0, d16 // multiply sum with alpha and add to value of C
+ add CO2, CO2, r3 // compute next pointer
+
+ fldd d9, [ CO1 ] // load value from C
+ fldd d13, [ CO2 ] // load value from C
+ fmuld d9, d9, d1 // multiply with beta
+ add CO1, CO1, r3 // compute next pointer
+ fmacd d9, d0, d17 // multiply sum with alpha and add to value of C
+ add CO2, CO2, r3 // compute next pointer
+
+ fldd d10, [ CO1 ] // load value from C
+ fldd d14, [ CO2 ] // load value from C
+ fmuld d10, d10, d1 // multiply with beta
+ add CO1, CO1, r3 // compute next pointer
+ fmacd d10, d0, d18 // multiply sum with alpha and add to value of C
+ add CO2, CO2, r3 // compute next pointer
+
+ fldd d11, [ CO1 ] // load value from C
+ fldd d15, [ CO2 ] // load value from C
+ fmuld d11, d11, d1 // multiply with beta
+ mov CO1, r2 // restore pointer
+ fmacd d11, d0, d19 // multiply sum with alpha and add to value of C
+ mov CO2, r4 // restore pointer
+
+ fstd d8, [ CO1 ] // store value in C
+ add CO1 , CO1, r3 // compute next pointer
+ fstd d9, [ CO1 ] // store value in C
+ add CO1 , CO1, r3 // compute next pointer
+ fstd d10, [ CO1 ] // store value in C
+ add CO1 , CO1, r3 // compute next pointer
+ fstd d11, [ CO1 ] // store value in C
+
+//-----------------------------------------------------------
+// Column 2 (CO2): scale and store. Column 3 (CO3) is loaded alongside.
+ mov r2, CO3 // save pointer
+ fldd d8, [ CO3 ] // load value from C
+ fmuld d12, d12, d1 // multiply with beta
+ add CO3, CO3, r3 // compute next pointer
+ fmacd d12, d0, d20 // multiply sum with alpha and add to value of C
+
+ fldd d9, [ CO3 ] // load value from C
+ fmuld d13, d13, d1 // multiply with beta
+ add CO3, CO3, r3 // compute next pointer
+ fmacd d13, d0, d21 // multiply sum with alpha and add to value of C
+
+ fldd d10, [ CO3 ] // load value from C
+ fmuld d14, d14, d1 // multiply with beta
+ add CO3, CO3, r3 // compute next pointer
+ fmacd d14, d0, d22 // multiply sum with alpha and add to value of C
+
+ fldd d11, [ CO3 ] // load value from C
+ fmuld d15, d15, d1 // multiply with beta
+ mov CO3, r2 // restore pointer
+ fmacd d15, d0, d23 // multiply sum with alpha and add to value of C
+
+ fstd d12, [ CO2 ] // store value in C
+ add CO2 , CO2, r3 // compute next pointer
+ fstd d13, [ CO2 ] // store value in C
+ add CO2 , CO2, r3 // compute next pointer
+ fstd d14, [ CO2 ] // store value in C
+ add CO2 , CO2, r3 // compute next pointer
+ fstd d15, [ CO2 ] // store value in C
+
+//-----------------------------------------------------------
+// Column 3 (CO3): scale. Column 4 (CO4) is loaded alongside.
+ mov r4, CO4 // save pointer
+ fldd d12, [ CO4 ] // load value from C
+ fmuld d8, d8, d1 // multiply with beta
+ add CO4, CO4, r3 // compute next pointer
+ fmacd d8, d0, d24 // multiply sum with alpha and add to value of C
+
+ fldd d13, [ CO4 ] // load value from C
+ fmuld d9, d9, d1 // multiply with beta
+ add CO4, CO4, r3 // compute next pointer
+ fmacd d9, d0, d25 // multiply sum with alpha and add to value of C
+
+ fldd d14, [ CO4 ] // load value from C
+ fmuld d10, d10, d1 // multiply with beta
+ add CO4, CO4, r3 // compute next pointer
+ fmacd d10, d0, d26 // multiply sum with alpha and add to value of C
+
+ fldd d15, [ CO4 ] // load value from C
+ fmuld d11, d11, d1 // multiply with beta
+ mov CO4, r4 // restore pointer
+ fmacd d11, d0, d27 // multiply sum with alpha and add to value of C
+
+
+//-----------------------------------------------------------
+// Store column 3, then scale and store column 4.
+ fstd d8, [ CO3 ] // store value in C
+ fmuld d12, d12, d1 // multiply with beta
+ add CO3 , CO3, r3 // compute next pointer
+ fmacd d12, d0, d28 // multiply sum with alpha and add to value of C
+
+ fstd d9, [ CO3 ] // store value in C
+ fmuld d13, d13, d1 // multiply with beta
+ add CO3 , CO3, r3 // compute next pointer
+ fmacd d13, d0, d29 // multiply sum with alpha and add to value of C
+
+ fstd d10, [ CO3 ] // store value in C
+ fmuld d14, d14, d1 // multiply with beta
+ add CO3 , CO3, r3 // compute next pointer
+ fmacd d14, d0, d30 // multiply sum with alpha and add to value of C
+
+ fstd d11, [ CO3 ] // store value in C
+ fmuld d15, d15, d1 // multiply with beta
+ fstd d12, [ CO4 ] // store value in C
+ fmacd d15, d0, d31 // multiply sum with alpha and add to value of C
+
+ add CO4 , CO4, r3 // compute next pointer
+ fstd d13, [ CO4 ] // store value in C
+ add CO4 , CO4, r3 // compute next pointer
+ fstd d14, [ CO4 ] // store value in C
+ add CO4 , CO4, r3 // compute next pointer
+ fstd d15, [ CO4 ] // store value in C
+
+.endm
+
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+// Entry point and prologue.
+// Saves the core and VFP registers the kernel overwrites, copies the A and B
+// pointers into AO and BO, and derives the four C column pointers CO1..CO4.
+ .arm
+ .global REALNAME
+ .type REALNAME, %function // function symbol, so callers built as Thumb can interwork
+ .func REALNAME
+
+REALNAME:
+
+ push {r4 - r9, fp} // save register
+ add fp, sp, #28 // 7 saved registers * 4 bytes: fp = sp on entry
+ sub sp, sp, #STACKSIZE // reserve stack
+
+ mov AO, OLD_A // pointer matrix A
+ mov BO, OLD_B // pointer matrix B
+
+ sub r3, fp, #128 // r3 no longer needed as OLD_B
+ vstm r3, { d8 - d15} // store floating point registers
+
+ ldr r2, OLD_C // pointer matrix C
+ ldr r3, OLD_CSC // Col stride size of C
+ lsl r3, r3, #3 // multiply with size of double
+
+ mov CO1, r2 // first column of the C tile
+ add CO2, CO1, r3 // second column of the C tile
+ add CO3, CO2, r3 // third column of the C tile
+ add CO4, CO3, r3 // fourth column of the C tile
+
+ pld [ CO1, #C_PRE ] // prefetch all four columns of C
+ pld [ CO2, #C_PRE ]
+ pld [ CO3, #C_PRE ]
+ pld [ CO4, #C_PRE ] // was CO3 a second time, leaving CO4 unprefetched
+
+// Loop control. K is consumed in groups of eight iterations (L = K / 8),
+// followed by a remainder loop for K % 8.
+dgemm_kernel_L4_M4_20:
+
+ asrs L , K, #3 // L = K / 8
+ cmp L , #2
+ blt dgemm_kernel_L4_M4_32
+
+ // First group: _I starts the accumulators; the closing _M2 preloads the
+ // operands for the group that follows.
+ KERNEL4x4_I
+ KERNEL4x4_M2
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+
+ subs L, L, #2
+ ble dgemm_kernel_L4_M4_22a
+ .align 5
+
+// Steady state: one group of eight per pass.
+dgemm_kernel_L4_M4_22:
+
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+
+ subs L, L, #1
+ bgt dgemm_kernel_L4_M4_22
+
+// Last group: closes with _E, which consumes the preloaded operands without
+// loading any more.
+dgemm_kernel_L4_M4_22a:
+
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+ KERNEL4x4_M1
+ KERNEL4x4_E
+
+ b dgemm_kernel_L4_M4_44
+
+// Fewer than two groups. The branch below is taken when bit 0 of L is clear;
+// otherwise a single self-contained group of eight is executed.
+dgemm_kernel_L4_M4_32:
+
+ tst L, #1
+ ble dgemm_kernel_L4_M4_40
+
+ KERNEL4x4_I
+ KERNEL4x4_M2
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+ KERNEL4x4_M1
+ KERNEL4x4_E
+
+ b dgemm_kernel_L4_M4_44
+
+// No group was executed, so the accumulators have to be cleared explicitly.
+dgemm_kernel_L4_M4_40:
+
+ INIT4x4
+
+// Remainder: K % 8 single iterations.
+dgemm_kernel_L4_M4_44:
+
+ ands L , K, #7 // L = K % 8
+ ble dgemm_kernel_L4_M4_100
+
+dgemm_kernel_L4_M4_46:
+
+ KERNEL4x4_SUB
+
+ subs L, L, #1
+ bne dgemm_kernel_L4_M4_46
+
+// Scale and write the tile back to C.
+dgemm_kernel_L4_M4_100:
+
+ SAVE4x4
+
+// Epilogue: undo the prologue and return.
+dgemm_kernel_L999:
+
+ sub r3, fp, #128
+ vldm r3, { d8 - d15} // restore floating point registers
+
+ sub sp, fp, #28 // back to the address of the saved core registers
+ pop {r4 - r9, fp}
+ bx lr
+
diff --git a/blis/kernels/armv7a/3/bli_sgemm_kernel_4x4.S b/blis/kernels/armv7a/3/bli_sgemm_kernel_4x4.S
--- /dev/null
@@ -0,0 +1,483 @@
+
+// Single-precision real GEMM micro-kernel operating on a 4x4 tile of C.
+#define REALNAME bli_sgemm_kernel_4x4
+
+// Bytes of stack reserved below the saved core registers.
+#define STACKSIZE 256
+
+// Arguments received in core registers.
+#define K r0
+#define PTR_ALPHA r1
+#define OLD_A r2
+#define OLD_B r3
+// Arguments received on the stack. The prologue sets fp to the value sp had
+// on entry, so the first stack argument is at [fp, #0].
+#define PTR_BETA [fp, #0 ]
+#define OLD_C [fp, #4 ]
+#define OLD_RSC [fp, #8 ]
+#define OLD_CSC [fp, #12 ]
+#define AUX [fp, #16 ]
+
+/******************************************************
+* [fp, #-128] - [fp, #-32] is reserved
+* for store and restore of floating point
+* registers (s8 - s31, 24 words)
+*******************************************************/
+
+// Loop counter. Shares r2 with OLD_A, which is copied into AO before L is written.
+#define L r2
+
+// Running pointers into the A and B operands.
+#define AO r5
+#define BO r6
+
+// Pointers to the four columns of the C tile, one column stride apart.
+#define CO1 r7
+#define CO2 r8
+#define CO3 r9
+#define CO4 r12
+
+
+// Byte offsets used by the pld prefetch instructions.
+#define A_PRE 96
+#define B_PRE 96
+#define C_PRE 0
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+// Clears the sixteen accumulators s16 - s31 to +0.0.
+// The zero is taken from an integer register instead of being computed as
+// s16 - s16: that subtraction produces NaN whenever s16 holds NaN or an
+// infinity on entry, and the NaN would then be copied into every accumulator.
+// Clobbers r3 (used only as scratch at the point where this macro is invoked).
+.macro INIT4x4
+
+ mov r3, #0
+ vmov s16, r3 // all-zero bit pattern is +0.0f
+ vmov.f32 s17, s16
+ vmov.f32 s18, s16
+ vmov.f32 s19, s16
+ vmov.f32 s20, s16
+ vmov.f32 s21, s16
+ vmov.f32 s22, s16
+ vmov.f32 s23, s16
+ vmov.f32 s24, s16
+ vmov.f32 s25, s16
+ vmov.f32 s26, s16
+ vmov.f32 s27, s16
+ vmov.f32 s28, s16
+ vmov.f32 s29, s16
+ vmov.f32 s30, s16
+ vmov.f32 s31, s16
+
+.endm
+
+// First step of the software-pipelined loop.
+// Loads s0-s3 from AO and s8-s11 from BO and starts the accumulators s16-s31
+// with fmuls, so they need no prior zeroing. The following operands are
+// preloaded into s4-s7 / s12-s15. All loads use write-back, so AO and BO
+// each end up 32 bytes further on.
+.macro KERNEL4x4_I
+
+ pld [ AO , #A_PRE ]
+ fldmias AO!, { s0 - s1 }
+ pld [ BO , #B_PRE ]
+ fldmias BO!, { s8 - s9 }
+
+ fmuls s16 , s0, s8
+ fldmias AO!, { s2 - s3 }
+ fmuls s17 , s1, s8
+ fmuls s18 , s2, s8
+ fldmias BO!, { s10 - s11 }
+ fmuls s19 , s3, s8
+
+ fmuls s20 , s0, s9
+ fldmias AO!, { s4 - s5 }
+ fmuls s21 , s1, s9
+ fmuls s22 , s2, s9
+ fldmias AO!, { s6 - s7 }
+ fmuls s23 , s3, s9
+
+ fmuls s24 , s0, s10
+ fldmias BO!, { s12 - s13 }
+ fmuls s25 , s1, s10
+ fmuls s26 , s2, s10
+ fldmias BO!, { s14 - s15 }
+ fmuls s27 , s3, s10
+
+ fmuls s28 , s0, s11
+ fmuls s29 , s1, s11
+ fmuls s30 , s2, s11
+ fmuls s31 , s3, s11
+
+.endm
+
+
+// Pipelined step, phase 2.
+// Accumulates the products of s4-s7 (A) and s12-s15 (B) into s16-s31 while
+// loading the next operands into s0-s3 / s8-s11 with write-back, which
+// advances AO and BO by 16 bytes each.
+.macro KERNEL4x4_M2
+
+ pld [ AO , #A_PRE ]
+ fmacs s16 , s4, s12
+ fmacs s17 , s5, s12
+ fldmias AO!, { s0 - s3 }
+ fmacs s18 , s6, s12
+ pld [ BO , #B_PRE ]
+ fmacs s19 , s7, s12
+
+ fmacs s20 , s4, s13
+ fldmias BO!, { s8 - s11 }
+ fmacs s21 , s5, s13
+ fmacs s22 , s6, s13
+ //fldmias AO!, { s2 - s3 }
+ fmacs s23 , s7, s13
+
+ fmacs s24 , s4, s14
+ //fldmias BO!, { s10 - s11 }
+ fmacs s25 , s5, s14
+ fmacs s26 , s6, s14
+ fmacs s27 , s7, s14
+
+ fmacs s28 , s4, s15
+ fmacs s29 , s5, s15
+ fmacs s30 , s6, s15
+ fmacs s31 , s7, s15
+
+.endm
+
+
+// Pipelined step, phase 1.
+// Accumulates the products of s0-s3 (A) and s8-s11 (B) into s16-s31 while
+// loading the next operands into s4-s7 / s12-s15 with write-back, which
+// advances AO and BO by 16 bytes each.
+.macro KERNEL4x4_M1
+
+ fmacs s16 , s0, s8
+ fldmias AO!, { s4 - s7 }
+ fmacs s17 , s1, s8
+ fmacs s18 , s2, s8
+ fldmias BO!, { s12 - s15 }
+ //fldmias AO!, { s6 - s7 }
+ fmacs s19 , s3, s8
+
+ fmacs s20 , s0, s9
+ fmacs s21 , s1, s9
+ fmacs s22 , s2, s9
+ //fldmias BO!, { s14 - s15 }
+ fmacs s23 , s3, s9
+
+ fmacs s24 , s0, s10
+ fmacs s25 , s1, s10
+ fmacs s26 , s2, s10
+ fmacs s27 , s3, s10
+
+ fmacs s28 , s0, s11
+ fmacs s29 , s1, s11
+ fmacs s30 , s2, s11
+ fmacs s31 , s3, s11
+
+.endm
+
+
+
+// Final step of the pipelined sequence.
+// Accumulates the products of the already loaded s4-s7 / s12-s15 into
+// s16-s31. Performs no loads and leaves AO and BO unchanged.
+.macro KERNEL4x4_E
+
+ fmacs s16 , s4, s12
+ fmacs s17 , s5, s12
+ fmacs s18 , s6, s12
+ fmacs s19 , s7, s12
+
+ fmacs s20 , s4, s13
+ fmacs s21 , s5, s13
+ fmacs s22 , s6, s13
+ fmacs s23 , s7, s13
+
+ fmacs s24 , s4, s14
+ fmacs s25 , s5, s14
+ fmacs s26 , s6, s14
+ fmacs s27 , s7, s14
+
+ fmacs s28 , s4, s15
+ fmacs s29 , s5, s15
+ fmacs s30 , s6, s15
+ fmacs s31 , s7, s15
+
+.endm
+
+
+
+
+// Self-contained single iteration, used for the K % 8 remainder.
+// Loads s0-s3 from AO and s8-s11 from BO, accumulates their products into
+// s16-s31 and advances AO and BO by 16 bytes each.
+.macro KERNEL4x4_SUB
+
+ flds s8 , [ BO ]
+
+ flds s0 , [ AO ]
+ flds s1 , [ AO, #4 ]
+
+ fmacs s16 , s0, s8
+ flds s2 , [ AO, #8 ]
+ fmacs s17 , s1, s8
+ flds s3 , [ AO, #12 ]
+ fmacs s18 , s2, s8
+ flds s9 , [ BO, #4 ]
+ fmacs s19 , s3, s8
+
+ flds s10, [ BO, #8 ]
+ fmacs s20 , s0, s9
+ flds s11, [ BO, #12 ]
+ fmacs s21 , s1, s9
+ fmacs s22 , s2, s9
+ fmacs s23 , s3, s9
+
+ fmacs s24 , s0, s10
+ fmacs s25 , s1, s10
+ fmacs s26 , s2, s10
+ fmacs s27 , s3, s10
+
+ fmacs s28 , s0, s11
+ fmacs s29 , s1, s11
+ add AO , AO, #16
+ fmacs s30 , s2, s11
+ add BO , BO, #16
+ fmacs s31 , s3, s11
+
+.endm
+
+
+// Writes the 4x4 tile back to C: every element becomes
+// beta * C + alpha * sum, with the sums taken from s16-s31.
+// Each of CO1..CO4 is walked down its column in steps of the row stride (r3)
+// and restored afterwards. Loads for the next column are interleaved with
+// the arithmetic and stores of the current one.
+// Overwrites r2, r3 and r4 as scratch, and s0, s1 and s8-s15.
+.macro SAVE4x4
+
+ ldr r3, OLD_RSC // Row stride size
+ lsl r3, r3, #2 // multiply with size of float
+
+ flds s0, [ PTR_ALPHA ] // load alpha
+ ldr r4, PTR_BETA
+ flds s1, [ r4 ] // load beta
+
+//-----------------------------------------------------------
+// Column 1 (CO1): load, scale and store. Column 2 (CO2) is loaded alongside.
+ mov r2, CO1 // save pointer
+ mov r4, CO2 // save pointer
+ flds s8, [ CO1 ] // load value from C
+ flds s12, [ CO2 ] // load value from C
+ fmuls s8, s8, s1 // multiply with beta
+ add CO1, CO1, r3 // compute next pointer
+ fmacs s8, s0, s16 // multiply sum with alpha and add to value of C
+ add CO2, CO2, r3 // compute next pointer
+
+ flds s9, [ CO1 ] // load value from C
+ flds s13, [ CO2 ] // load value from C
+ fmuls s9, s9, s1 // multiply with beta
+ add CO1, CO1, r3 // compute next pointer
+ fmacs s9, s0, s17 // multiply sum with alpha and add to value of C
+ add CO2, CO2, r3 // compute next pointer
+
+ flds s10, [ CO1 ] // load value from C
+ flds s14, [ CO2 ] // load value from C
+ fmuls s10, s10, s1 // multiply with beta
+ add CO1, CO1, r3 // compute next pointer
+ fmacs s10, s0, s18 // multiply sum with alpha and add to value of C
+ add CO2, CO2, r3 // compute next pointer
+
+ flds s11, [ CO1 ] // load value from C
+ flds s15, [ CO2 ] // load value from C
+ fmuls s11, s11, s1 // multiply with beta
+ mov CO1, r2 // restore pointer
+ fmacs s11, s0, s19 // multiply sum with alpha and add to value of C
+ mov CO2, r4 // restore pointer
+
+ fsts s8, [ CO1 ] // store value in C
+ add CO1 , CO1, r3 // compute next pointer
+ fsts s9, [ CO1 ] // store value in C
+ add CO1 , CO1, r3 // compute next pointer
+ fsts s10, [ CO1 ] // store value in C
+ add CO1 , CO1, r3 // compute next pointer
+ fsts s11, [ CO1 ] // store value in C
+
+//-----------------------------------------------------------
+// Column 2 (CO2): scale and store. Column 3 (CO3) is loaded alongside.
+ mov r2, CO3 // save pointer
+ flds s8, [ CO3 ] // load value from C
+ fmuls s12, s12, s1 // multiply with beta
+ add CO3, CO3, r3 // compute next pointer
+ fmacs s12, s0, s20 // multiply sum with alpha and add to value of C
+
+ flds s9, [ CO3 ] // load value from C
+ fmuls s13, s13, s1 // multiply with beta
+ add CO3, CO3, r3 // compute next pointer
+ fmacs s13, s0, s21 // multiply sum with alpha and add to value of C
+
+ flds s10, [ CO3 ] // load value from C
+ fmuls s14, s14, s1 // multiply with beta
+ add CO3, CO3, r3 // compute next pointer
+ fmacs s14, s0, s22 // multiply sum with alpha and add to value of C
+
+ flds s11, [ CO3 ] // load value from C
+ fmuls s15, s15, s1 // multiply with beta
+ mov CO3, r2 // restore pointer
+ fmacs s15, s0, s23 // multiply sum with alpha and add to value of C
+
+ fsts s12, [ CO2 ] // store value in C
+ add CO2 , CO2, r3 // compute next pointer
+ fsts s13, [ CO2 ] // store value in C
+ add CO2 , CO2, r3 // compute next pointer
+ fsts s14, [ CO2 ] // store value in C
+ add CO2 , CO2, r3 // compute next pointer
+ fsts s15, [ CO2 ] // store value in C
+
+//-----------------------------------------------------------
+// Column 3 (CO3): scale. Column 4 (CO4) is loaded alongside.
+ mov r4, CO4 // save pointer
+ flds s12, [ CO4 ] // load value from C
+ fmuls s8, s8, s1 // multiply with beta
+ add CO4, CO4, r3 // compute next pointer
+ fmacs s8, s0, s24 // multiply sum with alpha and add to value of C
+
+ flds s13, [ CO4 ] // load value from C
+ fmuls s9, s9, s1 // multiply with beta
+ add CO4, CO4, r3 // compute next pointer
+ fmacs s9, s0, s25 // multiply sum with alpha and add to value of C
+
+ flds s14, [ CO4 ] // load value from C
+ fmuls s10, s10, s1 // multiply with beta
+ add CO4, CO4, r3 // compute next pointer
+ fmacs s10, s0, s26 // multiply sum with alpha and add to value of C
+
+ flds s15, [ CO4 ] // load value from C
+ fmuls s11, s11, s1 // multiply with beta
+ mov CO4, r4 // restore pointer
+ fmacs s11, s0, s27 // multiply sum with alpha and add to value of C
+
+
+//-----------------------------------------------------------
+// Store column 3, then scale and store column 4.
+ fsts s8, [ CO3 ] // store value in C
+ fmuls s12, s12, s1 // multiply with beta
+ add CO3 , CO3, r3 // compute next pointer
+ fmacs s12, s0, s28 // multiply sum with alpha and add to value of C
+
+ fsts s9, [ CO3 ] // store value in C
+ fmuls s13, s13, s1 // multiply with beta
+ add CO3 , CO3, r3 // compute next pointer
+ fmacs s13, s0, s29 // multiply sum with alpha and add to value of C
+
+ fsts s10, [ CO3 ] // store value in C
+ fmuls s14, s14, s1 // multiply with beta
+ add CO3 , CO3, r3 // compute next pointer
+ fmacs s14, s0, s30 // multiply sum with alpha and add to value of C
+
+ fsts s11, [ CO3 ] // store value in C
+ fmuls s15, s15, s1 // multiply with beta
+ fsts s12, [ CO4 ] // store value in C
+ fmacs s15, s0, s31 // multiply sum with alpha and add to value of C
+
+ add CO4 , CO4, r3 // compute next pointer
+ fsts s13, [ CO4 ] // store value in C
+ add CO4 , CO4, r3 // compute next pointer
+ fsts s14, [ CO4 ] // store value in C
+ add CO4 , CO4, r3 // compute next pointer
+ fsts s15, [ CO4 ] // store value in C
+
+.endm
+
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+// Entry point and prologue.
+// Saves the core and VFP registers the kernel overwrites, copies the A and B
+// pointers into AO and BO, and derives the four C column pointers CO1..CO4.
+ .arm
+ .global REALNAME
+ .type REALNAME, %function // function symbol, so callers built as Thumb can interwork
+ .func REALNAME
+
+REALNAME:
+
+ push {r4 - r9, fp} // save register
+ add fp, sp, #28 // 7 saved registers * 4 bytes: fp = sp on entry
+ sub sp, sp, #STACKSIZE // reserve stack
+
+ mov AO, OLD_A // pointer matrix A
+ mov BO, OLD_B // pointer matrix B
+
+ sub r3, fp, #128 // r3 no longer needed as OLD_B
+ vstm r3, { s8 - s31 } // store floating point registers
+
+ ldr r2, OLD_C // pointer matrix C
+ ldr r3, OLD_CSC // Col stride size of C
+ lsl r3, r3, #2 // multiply with size of float
+
+ mov CO1, r2 // first column of the C tile
+ add CO2, CO1, r3 // second column of the C tile
+ add CO3, CO2, r3 // third column of the C tile
+ add CO4, CO3, r3 // fourth column of the C tile
+
+ pld [ CO1, #C_PRE ] // prefetch all four columns of C
+ pld [ CO2, #C_PRE ]
+ pld [ CO3, #C_PRE ]
+ pld [ CO4, #C_PRE ] // was CO3 a second time, leaving CO4 unprefetched
+
+// Loop control. K is consumed in groups of eight iterations (L = K / 8),
+// followed by a remainder loop for K % 8.
+sgemm_kernel_L4_M4_20:
+
+ asrs L , K, #3 // L = K / 8
+ cmp L , #2
+ blt sgemm_kernel_L4_M4_32
+
+ // First group: _I starts the accumulators; the closing _M2 preloads the
+ // operands for the group that follows.
+ KERNEL4x4_I
+ KERNEL4x4_M2
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+
+ subs L, L, #2
+ ble sgemm_kernel_L4_M4_22a
+ .align 5
+
+// Steady state: one group of eight per pass.
+sgemm_kernel_L4_M4_22:
+
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+
+ subs L, L, #1
+ bgt sgemm_kernel_L4_M4_22
+
+// Last group: closes with _E, which consumes the preloaded operands without
+// loading any more.
+sgemm_kernel_L4_M4_22a:
+
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+ KERNEL4x4_M1
+ KERNEL4x4_E
+
+ b sgemm_kernel_L4_M4_44
+
+// Fewer than two groups. The branch below is taken when bit 0 of L is clear;
+// otherwise a single self-contained group of eight is executed.
+sgemm_kernel_L4_M4_32:
+
+ tst L, #1
+ ble sgemm_kernel_L4_M4_40
+
+ KERNEL4x4_I
+ KERNEL4x4_M2
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+ KERNEL4x4_M1
+ KERNEL4x4_E
+
+ b sgemm_kernel_L4_M4_44
+
+// No group was executed, so the accumulators have to be cleared explicitly.
+sgemm_kernel_L4_M4_40:
+
+ INIT4x4
+
+// Remainder: K % 8 single iterations.
+sgemm_kernel_L4_M4_44:
+
+ ands L , K, #7 // L = K % 8
+ ble sgemm_kernel_L4_M4_100
+
+sgemm_kernel_L4_M4_46:
+
+ KERNEL4x4_SUB
+
+ subs L, L, #1
+ bne sgemm_kernel_L4_M4_46
+
+// Scale and write the tile back to C.
+sgemm_kernel_L4_M4_100:
+
+ SAVE4x4
+
+// Epilogue: undo the prologue and return.
+sgemm_kernel_L999:
+
+ sub r3, fp, #128
+ vldm r3, { s8 - s31 } // restore floating point registers
+
+ sub sp, fp, #28 // back to the address of the saved core registers
+ pop {r4 - r9, fp}
+ bx lr
+
diff --git a/blis/kernels/armv7a/3/bli_zgemm_kernel_2x2.S b/blis/kernels/armv7a/3/bli_zgemm_kernel_2x2.S
--- /dev/null
@@ -0,0 +1,506 @@
+
+/*
+* Symbol, register and stack-slot names used by the 2x2 double-complex
+* GEMM micro-kernel below.
+*/
+#define REALNAME bli_zgemm_kernel_2x2
+
+/* Bytes subtracted from sp in the prologue for local storage. */
+#define STACKSIZE 256
+
+/* Values held in r0 - r3 on entry. */
+#define K r0
+#define PTR_ALPHA r1
+#define OLD_A r2
+#define OLD_B r3
+/*
+* Values read from the stack. The prologue sets fp to the value sp had
+* on entry (sp after pushing 7 registers, plus 28), so [fp, #0] is the
+* first stack slot above the saved registers.
+* AUX is defined but not referenced anywhere in this file.
+*/
+#define PTR_BETA [fp, #0 ]
+#define OLD_C [fp, #4 ]
+#define OLD_RSC [fp, #8 ]
+#define OLD_CSC [fp, #12 ]
+#define AUX [fp, #16 ]
+
+/******************************************************
+* [fp, #-128] - [fp, #-64] is reserved
+* for store and restore of the floating point
+* registers d8 - d15 (8 registers x 8 bytes)
+*******************************************************/
+
+/*
+* Loop counter. It shares r2 with OLD_A; the prologue copies OLD_A into
+* AO before L is written for the first time.
+*/
+#define L r2
+
+/* Running pointers into A and B. */
+#define AO r5
+#define BO r6
+
+/* Pointers to the two lines of C that are CSC * 16 bytes apart. */
+#define CO1 r7
+#define CO2 r8
+
+
+/* Byte offsets added to AO / BO / CO1 / CO2 in the pld prefetch hints. */
+#define A_PRE 96
+#define B_PRE 96
+#define C_PRE 0
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+/*
+* Instructions used by SAVE2x2 to apply the imaginary part of beta:
+* FMAC_BR (fnmacd) subtracts the product from the destination,
+* FMAC_BI (fmacd) adds the product to the destination.
+*/
+#define FMAC_BR fnmacd
+#define FMAC_BI fmacd
+
+/*
+* NN is defined unconditionally here, so the first branch of the
+* conditional below is always the one that is selected; the remaining
+* branches are not reachable as the file is written.
+*/
+#define NN 1
+
+/*
+* FADD_R / FADD_I combine the partial products into the real / imaginary
+* part of each result in SAVE2x2. FMAC_R1, FMAC_I1, FMAC_R2 and FMAC_I2
+* are the multiply-accumulate (fmacd) or multiply-subtract (fnmacd)
+* instructions SAVE2x2 uses to apply alpha to those results.
+*/
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+
+ #define FADD_R fsubd
+ #define FADD_I faddd
+
+ #define FMAC_R1 fnmacd
+ #define FMAC_R2 fnmacd
+ #define FMAC_I1 fmacd
+ #define FMAC_I2 fnmacd
+
+#elif defined(CN) || defined(CT)
+
+ #define FADD_R faddd
+ #define FADD_I fsubd
+
+ #define FMAC_R1 fmacd
+ #define FMAC_R2 fmacd
+ #define FMAC_I1 fnmacd
+ #define FMAC_I2 fmacd
+
+#elif defined(NC) || defined(TC)
+
+ #define FADD_R faddd
+ #define FADD_I fsubd
+
+ #define FMAC_R1 fmacd
+ #define FMAC_R2 fnmacd
+ #define FMAC_I1 fmacd
+ #define FMAC_I2 fmacd
+
+#else
+
+ #define FADD_R fsubd
+ #define FADD_I faddd
+
+ #define FMAC_R1 fnmacd
+ #define FMAC_R2 fmacd
+ #define FMAC_I1 fnmacd
+ #define FMAC_I2 fnmacd
+
+#endif
+
+
+
+/*
+* INIT2x2: set the sixteen accumulators d16 - d31 to +0.0.
+*
+* The zero is built from a zeroed core register instead of computing
+* d16 - d16: d16 holds whatever the caller left in it, and x - x is NaN
+* when x is NaN or an infinity, which would poison every accumulator.
+*
+* Clobbers r3 (it is not live at the point where this macro is invoked;
+* SAVE2x2 reloads r3 before using it).
+*/
+.macro INIT2x2
+
+ mov r3, #0
+ vmov d16, r3, r3 // both 32-bit halves zero -> d16 = +0.0
+ vmov.f64 d17, d16
+ vmov.f64 d18, d16
+ vmov.f64 d19, d16
+ vmov.f64 d20, d16
+ vmov.f64 d21, d16
+ vmov.f64 d22, d16
+ vmov.f64 d23, d16
+ vmov.f64 d24, d16
+ vmov.f64 d25, d16
+ vmov.f64 d26, d16
+ vmov.f64 d27, d16
+ vmov.f64 d28, d16
+ vmov.f64 d29, d16
+ vmov.f64 d30, d16
+ vmov.f64 d31, d16
+
+.endm
+
+/*
+* KERNEL2x2_I: first step of the software-pipelined k loop.
+*
+* Loads one k-step of A into d0 - d3 and of B into d8 - d11 (32 bytes
+* each) and writes their sixteen pairwise products into d16 - d31 with
+* fmuld, so the accumulators need no prior initialisation. While doing
+* so it preloads the following k-step into d4 - d7 (A) and d12 - d15 (B).
+* AO and BO each end up advanced by 64 bytes.
+*/
+.macro KERNEL2x2_I
+ pld [ AO , #A_PRE ]
+ pld [ BO , #B_PRE ]
+ fldd d0 , [ AO ]
+ fldd d1 , [ AO, #8 ]
+ fldd d8 , [ BO ]
+ fldd d9 , [ BO, #8 ]
+
+ // products of the first A element (d0, d1) with the first B element (d8, d9)
+ fmuld d16 , d0, d8
+ fldd d2 , [ AO, #16 ]
+ fmuld d24 , d1, d9
+ fldd d3 , [ AO, #24 ]
+ fmuld d17 , d0, d9
+ fldd d10, [ BO, #16 ]
+ fmuld d25 , d1, d8
+
+ // second A element (d2, d3) with the first B element; step both pointers
+ fldd d11, [ BO, #24 ]
+ fmuld d18 , d2, d8
+ add BO , BO, #32
+ fmuld d26 , d3, d9
+ add AO , AO, #32
+ fmuld d19 , d2, d9
+ pld [ BO , #B_PRE ]
+ fmuld d27 , d3, d8
+
+ // first A element with the second B element (d10, d11);
+ // loads from here on fetch the next k-step into d4 - d7 / d12 - d15
+ pld [ AO , #A_PRE ]
+ fmuld d20 , d0, d10
+ fldd d4 , [ AO, #0 ]
+ fmuld d28 , d1, d11
+ fldd d5 , [ AO, #8 ]
+ fmuld d21 , d0, d11
+ fldd d12, [ BO ]
+ fmuld d29 , d1, d10
+
+ // second A element with the second B element
+ fldd d13, [ BO, #8 ]
+ fmuld d22 , d2, d10
+ fldd d6 , [ AO, #16 ]
+ fmuld d30 , d3, d11
+ fldd d7 , [ AO, #24 ]
+ fmuld d23 , d2, d11
+ fldd d14, [ BO, #16 ]
+ fmuld d31 , d3, d10
+ fldd d15, [ BO, #24 ]
+
+ // step past the k-step that was just preloaded
+ add BO , BO, #32
+ add AO , AO, #32
+.endm
+
+
+
+/*
+* KERNEL2x2_M1: one pipelined k-step.
+*
+* Accumulates into d16 - d31 the products of the operands already held
+* in d0 - d3 (A) and d8 - d11 (B), and meanwhile loads the next k-step
+* into d4 - d7 (A) and d12 - d15 (B). AO and BO advance by 32 bytes.
+* Its counterpart KERNEL2x2_M2 uses the two register sets the other way
+* round, so the two macros are meant to alternate.
+*/
+.macro KERNEL2x2_M1
+ pld [ AO , #A_PRE ]
+
+ fmacd d16 , d0, d8
+ pld [ BO , #B_PRE ]
+ fmacd d24 , d1, d9
+ fldd d4 , [ AO, #0 ]
+ fmacd d17 , d0, d9
+ fldd d5 , [ AO, #8 ]
+ fmacd d25 , d1, d8
+
+ fldd d12, [ BO ]
+ fmacd d18 , d2, d8
+ fldd d13, [ BO, #8 ]
+ fmacd d26 , d3, d9
+ fldd d6 , [ AO, #16 ]
+ fmacd d19 , d2, d9
+ fldd d7 , [ AO, #24 ]
+ fmacd d27 , d3, d8
+
+ fmacd d20 , d0, d10
+ fldd d14, [ BO, #16 ]
+ fmacd d28 , d1, d11
+ fmacd d21 , d0, d11
+ fldd d15, [ BO, #24 ]
+ fmacd d29 , d1, d10
+
+ fmacd d22 , d2, d10
+ add BO , BO, #32
+ fmacd d30 , d3, d11
+ fmacd d23 , d2, d11
+ add AO , AO, #32
+ fmacd d31 , d3, d10
+
+.endm
+
+/*
+* KERNEL2x2_M2: one pipelined k-step, mirror image of KERNEL2x2_M1.
+*
+* Accumulates into d16 - d31 the products of the operands held in
+* d4 - d7 (A) and d12 - d15 (B), and meanwhile loads the next k-step
+* into d0 - d3 (A) and d8 - d11 (B). AO and BO advance by 32 bytes.
+*/
+.macro KERNEL2x2_M2
+ pld [ AO , #A_PRE ]
+
+ fmacd d16 , d4, d12
+ pld [ BO , #B_PRE ]
+ fmacd d24 , d5, d13
+ fldd d0 , [ AO, #0 ]
+ fmacd d17 , d4, d13
+ fldd d1 , [ AO, #8 ]
+ fmacd d25 , d5, d12
+
+ fmacd d18 , d6, d12
+ fldd d8 , [ BO ]
+ fmacd d26 , d7, d13
+ fldd d9 , [ BO, #8 ]
+ fmacd d19 , d6, d13
+ fmacd d27 , d7, d12
+
+ fldd d2 , [ AO, #16 ]
+ fmacd d20 , d4, d14
+ fldd d3 , [ AO, #24 ]
+ fmacd d28 , d5, d15
+ fmacd d21 , d4, d15
+ fldd d10, [ BO, #16 ]
+ fmacd d29 , d5, d14
+
+ fldd d11, [ BO, #24 ]
+ fmacd d22 , d6, d14
+ fmacd d30 , d7, d15
+ add BO , BO, #32
+ fmacd d23 , d6, d15
+ add AO , AO, #32
+ fmacd d31 , d7, d14
+
+.endm
+
+
+/*
+* KERNEL2x2_E: last step of a pipelined sequence.
+*
+* Accumulates into d16 - d31 the products of the operands already held
+* in d4 - d7 (A) and d12 - d15 (B). It performs no loads and leaves AO
+* and BO unchanged, so nothing is read beyond the k-step that the
+* preceding macro preloaded.
+*/
+.macro KERNEL2x2_E
+
+ fmacd d16 , d4, d12
+ fmacd d24 , d5, d13
+ fmacd d17 , d4, d13
+ fmacd d25 , d5, d12
+
+ fmacd d18 , d6, d12
+ fmacd d26 , d7, d13
+ fmacd d19 , d6, d13
+ fmacd d27 , d7, d12
+
+ fmacd d20 , d4, d14
+ fmacd d28 , d5, d15
+ fmacd d21 , d4, d15
+ fmacd d29 , d5, d14
+
+ fmacd d22 , d6, d14
+ fmacd d30 , d7, d15
+ fmacd d23 , d6, d15
+ fmacd d31 , d7, d14
+
+.endm
+
+/*
+* KERNEL2x2_SUB: one self-contained (non-pipelined) k-step.
+*
+* Loads one k-step of A into d0 - d3 and of B into d8 - d11, accumulates
+* their sixteen pairwise products into d16 - d31, and advances AO and BO
+* by 32 bytes. Used for the K % 8 remainder iterations.
+*/
+.macro KERNEL2x2_SUB
+
+ pld [ AO , #A_PRE ]
+ pld [ BO , #B_PRE ]
+ fldd d0 , [ AO ]
+ fldd d1 , [ AO, #8 ]
+ fldd d8 , [ BO ]
+ fldd d9 , [ BO, #8 ]
+
+ fmacd d16 , d0, d8
+ fldd d2 , [ AO, #16 ]
+ fmacd d24 , d1, d9
+ fldd d3 , [ AO, #24 ]
+ fmacd d17 , d0, d9
+ fldd d10, [ BO, #16 ]
+ fmacd d25 , d1, d8
+
+ fldd d11, [ BO, #24 ]
+ fmacd d18 , d2, d8
+ fmacd d26 , d3, d9
+ fmacd d19 , d2, d9
+ fmacd d27 , d3, d8
+
+ fmacd d20 , d0, d10
+ fmacd d28 , d1, d11
+ fmacd d21 , d0, d11
+ fmacd d29 , d1, d10
+
+ fmacd d22 , d2, d10
+ add BO , BO, #32
+ fmacd d30 , d3, d11
+ fmacd d23 , d2, d11
+ add AO , AO, #32
+ fmacd d31 , d3, d10
+
+.endm
+
+
+
+
+/*
+* SAVE2x2: fold the accumulators into the 2x2 tile of C.
+*
+* Combines the partial products in d16 - d31 into eight values with
+* FADD_R / FADD_I, then for each of the four C elements computes
+* beta * C (complex multiply) and adds the alpha-scaled result using the
+* FMAC_R1 / FMAC_I1 / FMAC_R2 / FMAC_I2 instructions selected above, and
+* stores it back. Elements are addressed as CO1 / CO2 (the two lines set
+* up by the caller) plus 0 or one row stride (OLD_RSC * 16 bytes).
+*
+* Uses r2, r3 and r4 as scratch, overwrites d0 - d5, d8 - d9 and
+* d16 - d31, and leaves CO1 / CO2 advanced by one row stride.
+*/
+.macro SAVE2x2
+
+ ldr r3, OLD_RSC // Row stride size
+ lsl r3, r3, #4 // multiply with size of complex double
+
+ fldd d0, [ PTR_ALPHA ] // load real part of alpha
+ fldd d1, [ PTR_ALPHA, #8 ] // load imag part of alpha
+ ldr r4, PTR_BETA
+ fldd d2, [ r4 ] // load real part of beta
+ fldd d3, [ r4, #8 ] // load imag part of beta
+
+ // Add/Sub the real and the imag parts
+ FADD_R d16, d24 , d16
+ FADD_I d17, d25 , d17
+ FADD_R d18, d26 , d18
+ FADD_I d19, d27 , d19
+ FADD_R d20, d28 , d20
+ FADD_I d21, d29 , d21
+ FADD_R d22, d30 , d22
+ FADD_I d23, d31 , d23
+
+ // First row: read the C elements at CO1 and CO2, then step both pointers to the second row
+ mov r4, CO1 // save pointer
+ fldmiad CO1, { d4 - d5 } // read real and imag part from C
+ add CO1, CO1, r3
+
+ mov r2, CO2 // save pointer
+ fldmiad CO2, { d8 - d9 } // read real and imag part from C
+ add CO2, CO2, r3
+
+ fmuld d24, d4, d2 // multiply Beta-real with C-real
+ fmuld d25, d5, d2 // multiply Beta-real with C-imag
+ fmuld d28, d8, d2 // multiply Beta-real with C-real
+ fmuld d29, d9, d2 // multiply Beta-real with C-imag
+
+ FMAC_BR d24, d3, d5 // multiply beta-imag with C-imag and subtract (FMAC_BR is fnmacd)
+ FMAC_BI d25, d3, d4 // multiply beta-imag with C-real and add
+ FMAC_BR d28, d3, d9 // multiply beta-imag with C-imag and subtract (FMAC_BR is fnmacd)
+ FMAC_BI d29, d3, d8 // multiply beta-imag with C-real and add
+
+ // apply alpha (d0, d1) to the result held in d16 / d17
+ FMAC_R1 d24 , d0 , d16
+ FMAC_I1 d25 , d0 , d17
+ FMAC_R2 d24 , d1 , d17
+ FMAC_I2 d25 , d1 , d16
+
+ // apply alpha to the result held in d20 / d21
+ FMAC_R1 d28 , d0 , d20
+ FMAC_I1 d29 , d0 , d21
+ FMAC_R2 d28 , d1 , d21
+ FMAC_I2 d29 , d1 , d20
+
+ // Second row: CO1 / CO2 already point one row stride further
+ fldmiad CO1, { d4 - d5 } // read real and imag part from C
+ fldmiad CO2, { d8 - d9 } // read real and imag part from C
+
+ fmuld d26, d4, d2 // multiply Beta-real with C-real
+ fmuld d27, d5, d2 // multiply Beta-real with C-imag
+ fmuld d30, d8, d2 // multiply Beta-real with C-real
+ fmuld d31, d9, d2 // multiply Beta-real with C-imag
+
+ FMAC_BR d26, d3, d5 // multiply beta-imag with C-imag and subtract (FMAC_BR is fnmacd)
+ FMAC_BI d27, d3, d4 // multiply beta-imag with C-real and add
+ FMAC_BR d30, d3, d9 // multiply beta-imag with C-imag and subtract (FMAC_BR is fnmacd)
+ FMAC_BI d31, d3, d8 // multiply beta-imag with C-real and add
+
+ // apply alpha to the result held in d18 / d19
+ FMAC_R1 d26 , d0 , d18
+ FMAC_I1 d27 , d0 , d19
+ FMAC_R2 d26 , d1 , d19
+ FMAC_I2 d27 , d1 , d18
+
+ // apply alpha to the result held in d22 / d23
+ FMAC_R1 d30, d0 , d22
+ FMAC_I1 d31, d0 , d23
+ FMAC_R2 d30, d1 , d23
+ FMAC_I2 d31, d1 , d22
+
+ // write back: first row through the saved pointers, then the second row
+ mov CO1, r4 // restore pointer
+ mov CO2, r2 // restore pointer
+ fstmiad CO1, { d24 - d25 }
+ fstmiad CO2, { d28 - d29 }
+ add CO1, CO1, r3
+ add CO2, CO2, r3
+ fstmiad CO1, { d26 - d27 }
+ fstmiad CO2, { d30 - d31 }
+
+
+.endm
+
+
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+/*
+* REALNAME (bli_zgemm_kernel_2x2): 2x2 double-complex GEMM micro-kernel.
+*
+* On entry r0 = K, r1 = pointer to alpha, r2 = pointer to A,
+* r3 = pointer to B; the pointer to beta, the pointer to C and the row
+* and column strides of C are read from the stack (see the #defines at
+* the top of the file). The k loop is processed in blocks of eight
+* pipelined steps followed by K % 8 single steps, after which SAVE2x2
+* updates the 2x2 tile of C. r4 - r9, fp and d8 - d15 are preserved.
+*/
+ .arm
+ .global REALNAME
+ .func REALNAME
+
+REALNAME:
+
+ push {r4 - r9, fp} // save the core registers used below (7 words)
+ add fp, sp, #28 // add number of saved register multiplied by size of int
+ sub sp, sp, #STACKSIZE // reserve stack
+
+ // r2 and r3 are reused as L and as scratch, so copy A and B out first
+ mov AO, OLD_A // pointer matrix A
+ mov BO, OLD_B // pointer matrix B
+
+ sub r3, fp, #128
+ vstm r3, { d8 - d15} // store floating point registers
+
+ ldr r2, OLD_C // pointer matrix C
+ ldr r3, OLD_CSC // Col stride size of C
+ lsl r3, r3, #4 // multiply with size of complex double
+
+ mov CO1, r2 // first line of C
+ add CO2, CO1, r3 // second line of C
+
+ pld [ CO1, #C_PRE ] // prefetch the lines of C
+ pld [ CO2, #C_PRE ] // prefetch the lines of C
+
+zgemm_kernel_L2_M2_20:
+
+ asrs L , K, #3 // L = K / 8
+ cmp L , #2
+ blt zgemm_kernel_L2_M2_32 // fewer than two blocks of eight steps
+
+ // first block of eight steps; KERNEL2x2_I initialises the accumulators
+ KERNEL2x2_I
+ KERNEL2x2_M2
+ KERNEL2x2_M1
+ KERNEL2x2_M2
+
+ KERNEL2x2_M1
+ KERNEL2x2_M2
+ KERNEL2x2_M1
+ KERNEL2x2_M2
+
+ subs L, L, #2 // the first block (above) and the last block (_22a) are outside the loop
+ ble zgemm_kernel_L2_M2_22a
+ .align 5
+
+zgemm_kernel_L2_M2_22:
+
+ // middle blocks: eight pipelined steps per iteration
+ KERNEL2x2_M1
+ KERNEL2x2_M2
+ KERNEL2x2_M1
+ KERNEL2x2_M2
+
+ KERNEL2x2_M1
+ KERNEL2x2_M2
+ KERNEL2x2_M1
+ KERNEL2x2_M2
+
+ subs L, L, #1
+ bgt zgemm_kernel_L2_M2_22
+
+zgemm_kernel_L2_M2_22a:
+
+ // last block: ends with KERNEL2x2_E, which loads nothing further
+ KERNEL2x2_M1
+ KERNEL2x2_M2
+ KERNEL2x2_M1
+ KERNEL2x2_M2
+
+ KERNEL2x2_M1
+ KERNEL2x2_M2
+ KERNEL2x2_M1
+ KERNEL2x2_E
+
+ b zgemm_kernel_L2_M2_44
+
+zgemm_kernel_L2_M2_32:
+
+ // L < 2 here: for L == 0 the Z flag is set and the branch is taken,
+ // for L == 1 execution falls through to a single block of eight steps
+ tst L, #1
+ ble zgemm_kernel_L2_M2_40
+
+ KERNEL2x2_I
+ KERNEL2x2_M2
+ KERNEL2x2_M1
+ KERNEL2x2_M2
+
+ KERNEL2x2_M1
+ KERNEL2x2_M2
+ KERNEL2x2_M1
+ KERNEL2x2_E
+
+ b zgemm_kernel_L2_M2_44
+
+zgemm_kernel_L2_M2_40:
+
+ // no block of eight steps was run, so the accumulators are initialised here
+ INIT2x2
+
+zgemm_kernel_L2_M2_44:
+
+ ands L , K, #7 // L = K % 8
+ ble zgemm_kernel_L2_M2_100
+
+zgemm_kernel_L2_M2_46:
+
+ // remainder: one non-pipelined step per iteration
+ KERNEL2x2_SUB
+
+ subs L, L, #1
+ bne zgemm_kernel_L2_M2_46
+
+zgemm_kernel_L2_M2_100:
+
+ SAVE2x2
+
+zgemm_kernel_L999:
+
+ sub r3, fp, #128
+ vldm r3, { d8 - d15} // restore floating point registers
+
+ sub sp, fp, #28 // back to the sp value right after the push
+ pop {r4 - r9, fp}
+ bx lr
+
diff --git a/blis/kernels/c66x/1m/bli_packm_cxk_ukernels.c b/blis/kernels/c66x/1m/bli_packm_cxk_ukernels.c
index 3ab63325d4151d1cf95d191d2ae669e01aad283a..b91349092fa2fbdda4616932eba59367b42fc9f0 100755 (executable)
-/*
-
- BLIS
- An object-based framework for developing high-performance BLAS-like
- libraries.
-
- Copyright (C) 2014, The University of Texas
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are
- met:
- - Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- - Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- - Neither the name of The University of Texas nor the names of its
- contributors may be used to endorse or promote products derived
- from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-/* Need to implement optimization for various cases */
-
-void bli_spackm_4xk_ukernel(
- conj_t conja,
- dim_t n,
- void* kappa,
- void* a, inc_t inca, inc_t lda,
- void* p, inc_t ldp
- )
-{
-
- float* restrict kappa_cast = kappa;
- dim_t index;
-
- if(*kappa_cast == 1.0f)
- {
- if((inca==1) && ((lda&1)==0) && ((ldp&1)==0))
- {
- __float2_t *restrict ptrA0 = (__float2_t *) a;
- __float2_t *restrict ptrP0 = (__float2_t *) p;
-
- for(index=0;index<n;index++)
- {
- *ptrP0++ = *ptrA0++;
- *ptrP0++ = *ptrA0++;
- ptrA0 += ((lda>>1)-2);
- ptrP0 += ((ldp>>1)-2);
- }
- return;
- }
- else if((lda==1) && ((ldp&1) == 0) && ((inca&1)==0))
- {
- __float2_t *restrict ptrA0 = (__float2_t *) a;
- __float2_t *restrict ptrA1 = (__float2_t *) (((float *) a)+inca);
- __float2_t *restrict ptrA2 = (__float2_t *) (((float *) a)+2*inca);
- __float2_t *restrict ptrA3 = (__float2_t *) (((float *) a)+3*inca);
- __float2_t *restrict ptrP0 = (__float2_t *) p;
- __float2_t *restrict ptrP1 = (__float2_t *) (((float *) p)+ldp);
- __float2_t val0, val1;
- dim_t n_iter = n >> 1;
- dim_t n_left = n & 1;
-
- for(index=0;index<n_iter;index++)
- {
- val0 = *ptrA0++;
- val1 = *ptrA1++;
- *ptrP0++ = _ftof2(_lof(val1),_lof(val0));
- *ptrP1++ = _ftof2(_hif(val1),_hif(val0));
- val0 = *ptrA2++;
- val1 = *ptrA3++;
- *ptrP0++ = _ftof2(_lof(val1),_lof(val0));
- *ptrP1++ = _ftof2(_hif(val1),_hif(val0));
- ptrP0 += ((ldp)-2);
- ptrP1 += ((ldp)-2);
- }
- if(n_left)
- {
- float *restrict ptrA = ((float *) a+2*n_iter);
- float *restrict ptrP = ((float *) p+2*n_iter*ldp);
- ptrP[0] = ptrA[0];
- ptrP[1] = ptrA[inca];
- ptrP[2] = ptrA[2*inca];
- ptrP[3] = ptrA[3*inca];
- }
- return;
- }
- }
-
- /* handle unoptimized case using default packing routine */
- bli_spackm_ref_4xk(conja, n, kappa, a, inca, lda, p, ldp);
-}
-
-void bli_spackm_8xk_ukernel(
- conj_t conja,
- dim_t n,
- void* kappa,
- void* a, inc_t inca, inc_t lda,
- void* p, inc_t ldp
- )
-{
-
- float* restrict kappa_cast = kappa;
- dim_t index;
-
- if(*kappa_cast == 1.0f)
- {
- if((inca==1) && ((lda&1)==0) && ((ldp&1)==0))
- {
- __float2_t *restrict ptrA0 = (__float2_t *) a;
- __float2_t *restrict ptrP0 = (__float2_t *) p;
- for(index=0;index<n;index++)
- {
- *ptrP0++ = *ptrA0++;
- *ptrP0++ = *ptrA0++;
- *ptrP0++ = *ptrA0++;
- *ptrP0++ = *ptrA0++;
- ptrA0 += ((lda>>1)-4);
- ptrP0 += ((ldp>>1)-4);
- }
- return;
- }
- else if((lda==1) && ((ldp&1) == 0) && ((inca&1)==0))
- {
- __float2_t *restrict ptrA0 = (__float2_t *) a;
- __float2_t *restrict ptrA1 = (__float2_t *) (((float *) a)+inca);
- __float2_t *restrict ptrA2 = (__float2_t *) (((float *) a)+2*inca);
- __float2_t *restrict ptrA3 = (__float2_t *) (((float *) a)+3*inca);
- __float2_t *restrict ptrA4 = (__float2_t *) (((float *) a)+4*inca);
- __float2_t *restrict ptrA5 = (__float2_t *) (((float *) a)+5*inca);
- __float2_t *restrict ptrA6 = (__float2_t *) (((float *) a)+6*inca);
- __float2_t *restrict ptrA7 = (__float2_t *) (((float *) a)+7*inca);
- __float2_t *restrict ptrP0 = (__float2_t *) p;
- __float2_t *restrict ptrP1 = (__float2_t *) (((float *) p)+ldp);
- __float2_t val0, val1;
- dim_t n_iter = n >> 1;
- dim_t n_left = n & 1;
- for(index=0;index<n_iter;index++)
- {
- val0 = *ptrA0++;
- val1 = *ptrA1++;
- *ptrP0++ = _ftof2(_lof(val1),_lof(val0));
- *ptrP1++ = _ftof2(_hif(val1),_hif(val0));
- val0 = *ptrA2++;
- val1 = *ptrA3++;
- *ptrP0++ = _ftof2(_lof(val1),_lof(val0));
- *ptrP1++ = _ftof2(_hif(val1),_hif(val0));
- val0 = *ptrA4++;
- val1 = *ptrA5++;
- *ptrP0++ = _ftof2(_lof(val1),_lof(val0));
- *ptrP1++ = _ftof2(_hif(val1),_hif(val0));
- val0 = *ptrA6++;
- val1 = *ptrA7++;
- *ptrP0++ = _ftof2(_lof(val1),_lof(val0));
- *ptrP1++ = _ftof2(_hif(val1),_hif(val0));
- ptrP0 += ((ldp)-4);
- ptrP1 += ((ldp)-4);
- }
- if(n_left)
- {
- float *restrict ptrA = ((float *) a+2*n_iter);
- float *restrict ptrP = ((float *) p+2*n_iter*ldp);
- ptrP[0] = ptrA[0];
- ptrP[1] = ptrA[inca];
- ptrP[2] = ptrA[2*inca];
- ptrP[3] = ptrA[3*inca];
- ptrP[4] = ptrA[4*inca];
- ptrP[5] = ptrA[5*inca];
- ptrP[6] = ptrA[6*inca];
- ptrP[7] = ptrA[7*inca];
- }
- return;
- }
- }
- /* handle unoptimized case using default packing routine */
- bli_spackm_ref_8xk(conja, n, kappa, a, inca, lda, p, ldp);
-}
-
+/*\r
+\r
+ BLIS\r
+ An object-based framework for developing high-performance BLAS-like\r
+ libraries.\r
+\r
+ Copyright (C) 2014, The University of Texas\r
+\r
+ Redistribution and use in source and binary forms, with or without\r
+ modification, are permitted provided that the following conditions are\r
+ met:\r
+ - Redistributions of source code must retain the above copyright\r
+ notice, this list of conditions and the following disclaimer.\r
+ - Redistributions in binary form must reproduce the above copyright\r
+ notice, this list of conditions and the following disclaimer in the\r
+ documentation and/or other materials provided with the distribution.\r
+ - Neither the name of The University of Texas nor the names of its\r
+ contributors may be used to endorse or promote products derived\r
+ from this software without specific prior written permission.\r
+\r
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\r
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\r
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\r
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT\r
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\r
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\r
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\r
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\r
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\r
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\r
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
+\r
+*/\r
+\r
+#include "blis.h"\r
+\r
+/* Need to implement optimization for various cases */\r
+\r
+/*\r
+ * bli_spackm_4xk_ukernel\r
+ *\r
+ * Packs n columns of a 4-row single-precision panel from a into p.\r
+ * Two layouts are copied directly when *kappa is exactly 1.0f:\r
+ *   - inca == 1 with even lda and ldp: each column is 4 contiguous\r
+ *     floats, moved as two __float2_t values;\r
+ *   - lda == 1 with even inca and ldp: two columns are handled per\r
+ *     iteration by reading one __float2_t from each of the 4 rows and\r
+ *     regrouping the halves; an odd final column is copied by element.\r
+ * Every other case is forwarded to bli_spackm_ref_4xk().\r
+ *\r
+ * NOTE(review): the __float2_t accesses presumably need a and p to be\r
+ * 8-byte aligned; only the strides are checked here - confirm with the\r
+ * callers.\r
+ */\r
+void bli_spackm_4xk_ukernel(\r
+       conj_t conja,\r
+       dim_t  n,\r
+       void*  kappa,\r
+       void*  a, inc_t inca, inc_t lda,\r
+       void*  p, inc_t ldp\r
+     )\r
+{\r
+    float* restrict scale = kappa;\r
+    dim_t  col;\r
+\r
+    /* A non-unit scale is always handled by the reference packer. */\r
+    if ( *scale != 1.0f )\r
+    {\r
+        bli_spackm_ref_4xk(conja, n, kappa, a, inca, lda, p, ldp);\r
+        return;\r
+    }\r
+\r
+    /* Columns are contiguous in a: straight copy, two floats at a time. */\r
+    if ( (inca == 1) && ((lda & 1) == 0) && ((ldp & 1) == 0) )\r
+    {\r
+        __float2_t *restrict src = (__float2_t *) a;\r
+        __float2_t *restrict dst = (__float2_t *) p;\r
+\r
+        for ( col = 0; col < n; col++ )\r
+        {\r
+            *dst++ = *src++;\r
+            *dst++ = *src++;\r
+            /* step to the next column; 2 values were already consumed */\r
+            src += ((lda>>1)-2);\r
+            dst += ((ldp>>1)-2);\r
+        }\r
+        return;\r
+    }\r
+\r
+    /* Rows are contiguous in a: regroup two columns per iteration. */\r
+    if ( (lda == 1) && ((ldp & 1) == 0) && ((inca & 1) == 0) )\r
+    {\r
+        __float2_t *restrict row0 = (__float2_t *) a;\r
+        __float2_t *restrict row1 = (__float2_t *) (((float *) a)+inca);\r
+        __float2_t *restrict row2 = (__float2_t *) (((float *) a)+2*inca);\r
+        __float2_t *restrict row3 = (__float2_t *) (((float *) a)+3*inca);\r
+        __float2_t *restrict out_even = (__float2_t *) p;\r
+        __float2_t *restrict out_odd  = (__float2_t *) (((float *) p)+ldp);\r
+        __float2_t first, second;\r
+        dim_t pairs = n >> 1;\r
+\r
+        for ( col = 0; col < pairs; col++ )\r
+        {\r
+            /* rows 0 and 1 */\r
+            first  = *row0++;\r
+            second = *row1++;\r
+            *out_even++ = _ftof2(_lof(second),_lof(first));\r
+            *out_odd++  = _ftof2(_hif(second),_hif(first));\r
+            /* rows 2 and 3 */\r
+            first  = *row2++;\r
+            second = *row3++;\r
+            *out_even++ = _ftof2(_lof(second),_lof(first));\r
+            *out_odd++  = _ftof2(_hif(second),_hif(first));\r
+            /* two packed columns ahead, minus the 2 values just written */\r
+            out_even += ((ldp)-2);\r
+            out_odd  += ((ldp)-2);\r
+        }\r
+\r
+        if ( n & 1 )\r
+        {\r
+            /* odd column count: copy the last column element by element */\r
+            float *restrict tail_src = ((float *) a+2*pairs);\r
+            float *restrict tail_dst = ((float *) p+2*pairs*ldp);\r
+            int row;\r
+\r
+            for ( row = 0; row < 4; row++ )\r
+            {\r
+                tail_dst[row] = tail_src[row*inca];\r
+            }\r
+        }\r
+        return;\r
+    }\r
+\r
+    /* handle unoptimized case using default packing routine */\r
+    bli_spackm_ref_4xk(conja, n, kappa, a, inca, lda, p, ldp);\r
+}\r
+\r
+/*\r
+ * bli_spackm_8xk_ukernel\r
+ *\r
+ * Packs n columns of an 8-row single-precision panel from a into p.\r
+ * Two layouts are copied directly when *kappa is exactly 1.0f:\r
+ *   - inca == 1 with even lda and ldp: each column is 8 contiguous\r
+ *     floats, moved as four __float2_t values;\r
+ *   - lda == 1 with even inca and ldp: two columns are handled per\r
+ *     iteration by reading one __float2_t from each of the 8 rows and\r
+ *     regrouping the halves; an odd final column is copied by element.\r
+ * Every other case is forwarded to bli_spackm_ref_8xk().\r
+ *\r
+ * NOTE(review): the __float2_t accesses presumably need a and p to be\r
+ * 8-byte aligned; only the strides are checked here - confirm with the\r
+ * callers.\r
+ */\r
+void bli_spackm_8xk_ukernel(\r
+       conj_t conja,\r
+       dim_t  n,\r
+       void*  kappa,\r
+       void*  a, inc_t inca, inc_t lda,\r
+       void*  p, inc_t ldp\r
+     )\r
+{\r
+    float* restrict scale = kappa;\r
+    dim_t  col;\r
+\r
+    /* A non-unit scale is always handled by the reference packer. */\r
+    if ( *scale != 1.0f )\r
+    {\r
+        bli_spackm_ref_8xk(conja, n, kappa, a, inca, lda, p, ldp);\r
+        return;\r
+    }\r
+\r
+    /* Columns are contiguous in a: straight copy, two floats at a time. */\r
+    if ( (inca == 1) && ((lda & 1) == 0) && ((ldp & 1) == 0) )\r
+    {\r
+        __float2_t *restrict src = (__float2_t *) a;\r
+        __float2_t *restrict dst = (__float2_t *) p;\r
+\r
+        for ( col = 0; col < n; col++ )\r
+        {\r
+            *dst++ = *src++;\r
+            *dst++ = *src++;\r
+            *dst++ = *src++;\r
+            *dst++ = *src++;\r
+            /* step to the next column; 4 values were already consumed */\r
+            src += ((lda>>1)-4);\r
+            dst += ((ldp>>1)-4);\r
+        }\r
+        return;\r
+    }\r
+\r
+    /* Rows are contiguous in a: regroup two columns per iteration. */\r
+    if ( (lda == 1) && ((ldp & 1) == 0) && ((inca & 1) == 0) )\r
+    {\r
+        __float2_t *restrict row0 = (__float2_t *) a;\r
+        __float2_t *restrict row1 = (__float2_t *) (((float *) a)+inca);\r
+        __float2_t *restrict row2 = (__float2_t *) (((float *) a)+2*inca);\r
+        __float2_t *restrict row3 = (__float2_t *) (((float *) a)+3*inca);\r
+        __float2_t *restrict row4 = (__float2_t *) (((float *) a)+4*inca);\r
+        __float2_t *restrict row5 = (__float2_t *) (((float *) a)+5*inca);\r
+        __float2_t *restrict row6 = (__float2_t *) (((float *) a)+6*inca);\r
+        __float2_t *restrict row7 = (__float2_t *) (((float *) a)+7*inca);\r
+        __float2_t *restrict out_even = (__float2_t *) p;\r
+        __float2_t *restrict out_odd  = (__float2_t *) (((float *) p)+ldp);\r
+        __float2_t first, second;\r
+        dim_t pairs = n >> 1;\r
+\r
+        for ( col = 0; col < pairs; col++ )\r
+        {\r
+            /* rows 0 and 1 */\r
+            first  = *row0++;\r
+            second = *row1++;\r
+            *out_even++ = _ftof2(_lof(second),_lof(first));\r
+            *out_odd++  = _ftof2(_hif(second),_hif(first));\r
+            /* rows 2 and 3 */\r
+            first  = *row2++;\r
+            second = *row3++;\r
+            *out_even++ = _ftof2(_lof(second),_lof(first));\r
+            *out_odd++  = _ftof2(_hif(second),_hif(first));\r
+            /* rows 4 and 5 */\r
+            first  = *row4++;\r
+            second = *row5++;\r
+            *out_even++ = _ftof2(_lof(second),_lof(first));\r
+            *out_odd++  = _ftof2(_hif(second),_hif(first));\r
+            /* rows 6 and 7 */\r
+            first  = *row6++;\r
+            second = *row7++;\r
+            *out_even++ = _ftof2(_lof(second),_lof(first));\r
+            *out_odd++  = _ftof2(_hif(second),_hif(first));\r
+            /* two packed columns ahead, minus the 4 values just written */\r
+            out_even += ((ldp)-4);\r
+            out_odd  += ((ldp)-4);\r
+        }\r
+\r
+        if ( n & 1 )\r
+        {\r
+            /* odd column count: copy the last column element by element */\r
+            float *restrict tail_src = ((float *) a+2*pairs);\r
+            float *restrict tail_dst = ((float *) p+2*pairs*ldp);\r
+            int row;\r
+\r
+            for ( row = 0; row < 8; row++ )\r
+            {\r
+                tail_dst[row] = tail_src[row*inca];\r
+            }\r
+        }\r
+        return;\r
+    }\r
+\r
+    /* handle unoptimized case using default packing routine */\r
+    bli_spackm_ref_8xk(conja, n, kappa, a, inca, lda, p, ldp);\r
+}\r
+\r
diff --git a/blis/kernels/c66x/1m/bli_packm_cxk_ukernels.h b/blis/kernels/c66x/1m/bli_packm_cxk_ukernels.h
index 6849f5d7214e1f943222f462d49bd49d7b089625..7a01798df897f22df8dadf7a838e6b43b68315bf 100755 (executable)
-/*
-
- BLIS
- An object-based framework for developing high-performance BLAS-like
- libraries.
-
- Copyright (C) 2014, The University of Texas
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are
- met:
- - Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- - Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- - Neither the name of The University of Texas nor the names of its
- contributors may be used to endorse or promote products derived
- from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-void bli_spackm_4xk_ukernel(
- conj_t conja,
- dim_t n,
- void* kappa,
- void* a, inc_t inca, inc_t lda,
- void* p, inc_t ldp
- );
-
-void bli_spackm_8xk_ukernel(
- conj_t conja,
- dim_t n,
- void* kappa,
- void* a, inc_t inca, inc_t lda,
- void* p, inc_t ldp
- );
-
-
+/*\r
+\r
+ BLIS\r
+ An object-based framework for developing high-performance BLAS-like\r
+ libraries.\r
+\r
+ Copyright (C) 2014, The University of Texas\r
+\r
+ Redistribution and use in source and binary forms, with or without\r
+ modification, are permitted provided that the following conditions are\r
+ met:\r
+ - Redistributions of source code must retain the above copyright\r
+ notice, this list of conditions and the following disclaimer.\r
+ - Redistributions in binary form must reproduce the above copyright\r
+ notice, this list of conditions and the following disclaimer in the\r
+ documentation and/or other materials provided with the distribution.\r
+ - Neither the name of The University of Texas nor the names of its\r
+ contributors may be used to endorse or promote products derived\r
+ from this software without specific prior written permission.\r
+\r
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\r
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\r
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\r
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT\r
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\r
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\r
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\r
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\r
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\r
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\r
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
+\r
+*/\r
+\r
+/* Single-precision packing micro-kernel for panels with 4 rows:\r
+   conja - conjugation flag, passed through to the reference packer\r
+   n     - number of columns to pack\r
+   kappa - pointer to the float scaling factor\r
+   a     - source, with row stride inca and column stride lda\r
+   p     - packed destination, with column stride ldp */\r
+void bli_spackm_4xk_ukernel(\r
+ conj_t conja,\r
+ dim_t n,\r
+ void* kappa,\r
+ void* a, inc_t inca, inc_t lda, \r
+ void* p, inc_t ldp\r
+ );\r
+\r
+/* Single-precision packing micro-kernel for panels with 8 rows;\r
+   the parameters have the same meaning as for the 4-row variant. */\r
+void bli_spackm_8xk_ukernel(\r
+ conj_t conja,\r
+ dim_t n,\r
+ void* kappa,\r
+ void* a, inc_t inca, inc_t lda, \r
+ void* p, inc_t ldp\r
+ );\r
+\r
+\r
index 6349347b1c8795a9ac0f69dd3a3e8f31ea8464b8..7361cef7a4245c388bbdf1e1779b211a41e48a5f 100755 (executable)
-/*
-
- BLIS
- An object-based framework for developing high-performance BLAS-like
- libraries.
-
- Copyright (C) 2014, The University of Texas
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are
- met:
- - Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- - Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- - Neither the name of The University of Texas nor the names of its
- contributors may be used to endorse or promote products derived
- from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-#include "blis.h"
-
-//#define BLIS_ENABLE_CYCLE_COUNT
-
-void bli_sgemm_ukernel_4x8(
- dim_t k,
- float* restrict alpha,
- float* restrict a,
- float* restrict b,
- float* restrict beta,
- float* restrict c, inc_t rs_c, inc_t cs_c,
- auxinfo_t* data
- )
-{
- __float2_t sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
- __float2_t sum8, sum9, suma, sumb, sumc, sumd, sume, sumf;
- __float2_t * restrict ptrB = (__float2_t *) b;
- __float2_t * restrict ptrA = (__float2_t *) a;
- __float2_t * restrict ptrC;
- __float2_t regA2, regC, regS, regR;
- int_least16_t index;
- float* restrict c0, * restrict c1;
- __float2_t regB2;
-
-#ifdef BLIS_ENABLE_CYCLE_COUNT
- volatile int counter_start;
- volatile int counter_end;
-#endif
-
-#ifdef BLIS_ENABLE_CYCLE_COUNT
- TSCL=0;
- counter_start = TSCL;
-#endif
- //touch routine: both a & b
- //Length of b = NR*K*size of float;
-#ifdef BLIS_ENABLE_PREFETCH
- //touch(a, k*BLIS_DEFAULT_MR_S*4);
-#endif
-
- // zero out accumulators
- sum0 = 0.0;
- sum1 = 0.0;
- sum2 = 0.0;
- sum3 = 0.0;
- sum4 = 0.0;
- sum5 = 0.0;
- sum6 = 0.0;
- sum7 = 0.0;
- sum8 = 0.0;
- sum9 = 0.0;
- suma = 0.0;
- sumb = 0.0;
- sumc = 0.0;
- sumd = 0.0;
- sume = 0.0;
- sumf = 0.0;
-
-
- for (index = 0; index < k; index++)
- { // loop over k;
- // each iteration performs rank one update of 4x1 by 1x8
- // matrices of A and B respectively; result is
- // accumulated over 4x8 matrix
- __float2_t b01, b23, b45, b67, a01, a23;
- __x128_t reg128;
-
- a01 = *ptrA++;
- a23 = *ptrA++;
-
- b01 = *ptrB++;
- b23 = *ptrB++;
- b45 = *ptrB++;
- b67 = *ptrB++;
-
- reg128 = _cmpysp(a01, b01);
- // accumulate a[0]*b[1] and -a[0]*b[0]
- sum0 = _daddsp(sum0, _lof2_128(reg128));
- // accumulate a[1]*b[0] and a[1]*b[1]
- sum1 = _daddsp(sum1, _hif2_128(reg128));
-
- reg128 = _cmpysp(a01, b23);
- // accumulate a[0]*b[3] and -a[0]*b[2]
- sum2 = _daddsp(sum2, _lof2_128(reg128));
- // accumulate a[1]*b[2] and a[1]*b[3]
- sum3 = _daddsp(sum3, _hif2_128(reg128));
-
- reg128 = _cmpysp(a01, b45);
- // accumulate a[0]*b[5] and -a[0]*b[4]
- sum4 = _daddsp(sum4, _lof2_128(reg128));
- // accumulate a[1]*b[4] and a[1]*b[5]
- sum5 = _daddsp(sum5, _hif2_128(reg128));
-
- reg128 = _cmpysp(a01, b67);
- // accumulate a[0]*b[7] and -a[0]*b[6]
- sum6 = _daddsp(sum6, _lof2_128(reg128));
- // accumulate a[1]*b[6] and a[1]*b[7]
- sum7 = _daddsp(sum7, _hif2_128(reg128));
-
- reg128 = _cmpysp(a23, b01);
- // accumulate a[2]*b[1] and -a[2]*b[0]
- sum8 = _daddsp(sum8, _lof2_128(reg128));
- // accumulate a[3]*b[0] and a[3]*b[1]
- sum9 = _daddsp(sum9, _hif2_128(reg128));
-
- reg128 = _cmpysp(a23, b23);
- // accumulate a[2]*b[3] and -a[2]*b[2]
- suma = _daddsp(suma, _lof2_128(reg128));
- // accumulate a[3]*b[2] and a[3]*b[3]
- sumb = _daddsp(sumb, _hif2_128(reg128));
-
- reg128 = _cmpysp(a23, b45);
- // accumulate a[2]*b[5] and -a[2]*b[4]
- sumc = _daddsp(sumc, _lof2_128(reg128));
- // accumulate a[3]*b[4] and a[3]*b[5]
- sumd = _daddsp(sumd, _hif2_128(reg128));
-
- reg128 = _cmpysp(a23, b67);
- // accumulate a[2]*b[7] and -a[2]*b[6]
- sume = _daddsp(sume, _lof2_128(reg128));
- // accumulate a[3]*b[6] and a[3]*b[7]
- sumf = _daddsp(sumf, _hif2_128(reg128));
-
- }
-
-#ifdef BLIS_ENABLE_CYCLE_COUNT
- counter_end=TSCL;
- if (CSL_chipReadDNUM () == 0)
- printf("%d\t%d\t",k, counter_end-counter_start);
-#endif
-
-#ifdef BLIS_ENABLE_CYCLE_COUNT
- TSCL=0;
- counter_start = TSCL;
-#endif
- regA2 = _ftof2(*alpha, *alpha);
- regB2 = _ftof2(*beta, *beta);
- if (rs_c != 1)
- {
- // update c[0,0] and c[1,0]
- c0 = (c + 0*rs_c + 0*cs_c);
- c1 = (c + 1*rs_c + 0*cs_c);
- regC = _ftof2(*c1,*c0);
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_lof2(sum1),-_hif2(sum0));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *c0 = _lof2(regR);
- *c1 = _hif2(regR);
-
- //update c[0,1] and c[1,1]
- c0 = c0 + 1*cs_c;
- c1 = c1 + 1*cs_c;
- regC = _ftof2(*c1,*c0);
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_hif2(sum1),_lof2(sum0));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *c0 = _lof2(regR);
- *c1 = _hif2(regR);
-
- //update c[0,2] and c[1,2]
- c0 = c0 + 1*cs_c;
- c1 = c1 + 1*cs_c;
- regC = _ftof2(*c1,*c0);
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_lof2(sum3),-_hif2(sum2));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *c0 = _lof2(regR);
- *c1 = _hif2(regR);
-
- //update c[0,3] and c[1,3]
- c0 = c0 + 1*cs_c;
- c1 = c1 + 1*cs_c;
- regC = _ftof2(*c1,*c0);
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_hif2(sum3),_lof2(sum2));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *c0 = _lof2(regR);
- *c1 = _hif2(regR);
-
- //update c[0,4] and c[1,4]
- c0 = c0 + 1*cs_c;
- c1 = c1 + 1*cs_c;
- regC = _ftof2(*c1,*c0);
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_lof2(sum5),-_hif2(sum4));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *c0 = _lof2(regR);
- *c1 = _hif2(regR);
-
- //update c[0,5] and c[1,5]
- c0 = c0 + 1*cs_c;
- c1 = c1 + 1*cs_c;
- regC = _ftof2(*c1,*c0);
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_hif2(sum5),_lof2(sum4));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *c0 = _lof2(regR);
- *c1 = _hif2(regR);
-
- //update c[0,6] and c[1,6]
- c0 = c0 + 1*cs_c;
- c1 = c1 + 1*cs_c;
- regC = _ftof2(*c1,*c0);
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_lof2(sum7),-_hif2(sum6));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *c0 = _lof2(regR);
- *c1 = _hif2(regR);
-
- //update c[0,7] and c[1,7]
- c0 = c0 + 1*cs_c;
- c1 = c1 + 1*cs_c;
- regC = _ftof2(*c1,*c0);
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_hif2(sum7),_lof2(sum6));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *c0 = _lof2(regR);
- *c1 = _hif2(regR);
-
- // update c[2,0] and c[3,0]
- c0 = (c + 2*rs_c + 0*cs_c);
- c1 = (c + 3*rs_c + 0*cs_c);
- regC = _ftof2(*c1,*c0);
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_lof2(sum9),-_hif2(sum8));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *c0 = _lof2(regR);
- *c1 = _hif2(regR);
-
- // update c[2,1] and c[3,1]
- c0 = c0 + 1*cs_c;
- c1 = c1 + 1*cs_c;
- regC = _ftof2(*c1,*c0);
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_hif2(sum9),_lof2(sum8));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *c0 = _lof2(regR);
- *c1 = _hif2(regR);
-
- //update c[2,2] and c[3,2]
- c0 = c0 + 1*cs_c;
- c1 = c1 + 1*cs_c;
- regC = _ftof2(*c1,*c0);
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_lof2(sumb),-_hif2(suma));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *c0 = _lof2(regR);
- *c1 = _hif2(regR);
-
- //update c[2,3] and c[3,3]
- c0 = c0 + 1*cs_c;
- c1 = c1 + 1*cs_c;
- regC = _ftof2(*c1,*c0);
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_hif2(sumb),_lof2(suma));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *c0 = _lof2(regR);
- *c1 = _hif2(regR);
-
- //update c[2,4] and c[3,4]
- c0 = c0 + 1*cs_c;
- c1 = c1 + 1*cs_c;
- regC = _ftof2(*c1,*c0);
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_lof2(sumd),-_hif2(sumc));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *c0 = _lof2(regR);
- *c1 = _hif2(regR);
-
- //update c[2,5] and c[3,5]
- c0 = c0 + 1*cs_c;
- c1 = c1 + 1*cs_c;
- regC = _ftof2(*c1,*c0);
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_hif2(sumd),_lof2(sumc));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *c0 = _lof2(regR);
- *c1 = _hif2(regR);
-
- //update c[2,6] and c[2,6]
- c0 = c0 + 1*cs_c;
- c1 = c1 + 1*cs_c;
- regC = _ftof2(*c1,*c0);
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_lof2(sumf),-_hif2(sume));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *c0 = _lof2(regR);
- *c1 = _hif2(regR);
-
- //update c[2,7] and c[2,7]
- c0 = c0 + 1*cs_c;
- c1 = c1 + 1*cs_c;
- regC = _ftof2(*c1,*c0);
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_hif2(sumf),_lof2(sume));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *c0 = _lof2(regR);
- *c1 = _hif2(regR);
-
- }
- else
- {
-#if 0
- // update c[0,0] and c[1,0]
- ptrC = (__float2_t *) c;
- regC = *ptrC;
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_lof2(sum1),-_hif2(sum0));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *ptrC = regR;
-
- //update c[0,1] and c[1,1]
- ptrC += (cs_c>>1);
- regC = *ptrC;
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_hif2(sum1),_lof2(sum0));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *ptrC = regR;
-
- //update c[0,2] and c[1,2]
- ptrC += (cs_c>>1);
- regC = *ptrC;
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_lof2(sum3),-_hif2(sum2));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *ptrC = regR;
-
- //update c[0,3] and c[1,3]
- ptrC += (cs_c>>1);
- regC = *ptrC;
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_hif2(sum3),_lof2(sum2));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *ptrC = regR;
-
- //update c[0,4] and c[1,4]
- ptrC += (cs_c>>1);
- regC = *ptrC;
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_lof2(sum5),-_hif2(sum4));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *ptrC = regR;
-
- //update c[0,5] and c[1,5]
- ptrC += (cs_c>>1);
- regC = *ptrC;
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_hif2(sum5),_lof2(sum4));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *ptrC = regR;
-
- //update c[0,6] and c[1,6]
- ptrC += (cs_c>>1);
- regC = *ptrC;
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_lof2(sum7),-_hif2(sum6));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *ptrC = regR;
-
- //update c[0,7] and c[1,7]
- ptrC += (cs_c>>1);
- regC = *ptrC;
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_hif2(sum7),_lof2(sum6));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *ptrC = regR;
-
- // update c[2,0] and c[3,0]
- ptrC = (__float2_t *) (c+2);
- regC = *ptrC;
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_lof2(sum9),-_hif2(sum8));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *ptrC = regR;
-
- // update c[2,1] and c[3,1]
- ptrC += (cs_c>>1);
- regC = *ptrC;
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_hif2(sum9),_lof2(sum8));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *ptrC = regR;
-
- //update c[2,2] and c[3,2]
- ptrC += (cs_c>>1);
- regC = *ptrC;
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_lof2(sumb),-_hif2(suma));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *ptrC = regR;
-
- //update c[2,3] and c[3,3]
- ptrC += (cs_c>>1);
- regC = *ptrC;
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_hif2(sumb),_lof2(suma));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *ptrC = regR;
-
- //update c[2,4] and c[3,4]
- ptrC += (cs_c>>1);
- regC = *ptrC;
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_lof2(sumd),-_hif2(sumc));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *ptrC = regR;
-
- //update c[2,5] and c[3,5]
- ptrC += (cs_c>>1);
- regC = *ptrC;
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_hif2(sumd),_lof2(sumc));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *ptrC = regR;
-
- //update c[2,6] and c[2,6]
- ptrC += (cs_c>>1);
- regC = *ptrC;
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_lof2(sumf),-_hif2(sume));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *ptrC = regR;
-
- //update c[2,7] and c[2,7]
- ptrC += (cs_c>>1);
- regC = *ptrC;
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_hif2(sumf),_lof2(sume));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *ptrC = regR;
-#else
-/* __float2_t c0, c1, c2, c3, c4, c5, c6, c7;
- __float2_t c8, c9, ca, cb, cc, cd, ce, cf;
-
- ptrC = (__float2_t *) c;
- c0 = *ptrC++;
- sum0 = _dmpysp(regA2, sum0);
- c1 = *ptrC--;
- sum1 = _dmpysp(regA2, sum1);
- ptrC += (cs_c>>1);
- c2 = *ptrC++;
- sum2 = _dmpysp(regA2, sum2);
- c3 = *ptrC--;
- sum3 = _dmpysp(regA2, sum3);
- ptrC += (cs_c>>1);
- c4 = *ptrC++;
- sum8 = _dmpysp(regA2, sum8);
- c5 = *ptrC--;
- sum9 = _dmpysp(regA2, sum9);
- ptrC += (cs_c>>1);
- c6 = *ptrC++;
- suma = _dmpysp(regA2, suma);
- c7 = *ptrC--;
- sumb = _dmpysp(regA2, sumb);
-
- c0 = _dmpysp(c0, regB2);
- c1 = _dmpysp(c1, regB2);
- c2 = _dmpysp(c2, regB2);
- c3 = _dmpysp(c3, regB2);
- c4 = _dmpysp(c4, regB2);
- c5 = _dmpysp(c5, regB2);
- c6 = _dmpysp(c6, regB2);
- c7 = _dmpysp(c7, regB2);
-
- // update c[0,0] and c[1,0]
- c0 = _daddsp(_ftof2(_lof2(sum1),-_hif2(sum0)),c0);
- // update c[2,0] and c[3,0]
- c1 = _daddsp(_ftof2(_lof2(sum9),-_hif2(sum8)),c1);
- //update c[0,1] and c[1,1]
- c2 = _daddsp(_ftof2(_hif2(sum1),_lof2(sum0)),c2);
- // update c[2,1] and c[3,1]
- c3 = _daddsp(_ftof2(_hif2(sum9),_lof2(sum8)),c3);
- //update c[0,2] and c[1,2]
- c4 = _daddsp(_ftof2(_lof2(sum3),-_hif2(sum2)),c4);
- //update c[2,2] and c[3,2]
- c5 = _daddsp(_ftof2(_lof2(sumb),-_hif2(suma)),c5);
- //update c[0,3] and c[1,3]
- c6 = _daddsp(_ftof2(_hif2(sum3),_lof2(sum2)),c6);
- //update c[2,3] and c[3,3]
- c7 = _daddsp(_ftof2(_hif2(sumb),_lof2(suma)),c7);
- //update c[0,4] and c[1,4]
-
- ptrC = (__float2_t *) c;
- *ptrC++ = c0;
- *ptrC-- = c1;
- ptrC += (cs_c>>1);
- *ptrC++ = c2;
- *ptrC-- = c3;
- ptrC += (cs_c>>1);
- *ptrC++ = c4;
- *ptrC-- = c5;
- ptrC += (cs_c>>1);
- *ptrC++ = c6;
- *ptrC-- = c7;
-
- ptrC = (__float2_t *) (c+(cs_c<<2));
- c8 = *ptrC++;
- c9 = *ptrC--;
- ptrC += (cs_c>>1);
- ca = *ptrC++;
- cb = *ptrC--;
- ptrC += (cs_c>>1);
- cc = *ptrC++;
- cd = *ptrC--;
- ptrC += (cs_c>>1);
- ce = *ptrC++;
- cf = *ptrC;
-
- sum4 = _dmpysp(regA2, sum4);
- sum5 = _dmpysp(regA2, sum5);
- sum6 = _dmpysp(regA2, sum6);
- sum7 = _dmpysp(regA2, sum7);
- sumc = _dmpysp(regA2, sumc);
- sumd = _dmpysp(regA2, sumd);
- sume = _dmpysp(regA2, sume);
- sumf = _dmpysp(regA2, sumf);
-
- c8 = _dmpysp(c8, regB2);
- c9 = _dmpysp(c9, regB2);
- ca = _dmpysp(ca, regB2);
- cb = _dmpysp(cb, regB2);
- cc = _dmpysp(cc, regB2);
- cd = _dmpysp(cd, regB2);
- ce = _dmpysp(ce, regB2);
- cf = _dmpysp(cf, regB2);
-
- c8 = _daddsp(_ftof2(_lof2(sum5),-_hif2(sum4)),c8);
- //update c[2,4] and c[3,4]
- c9 = _daddsp(_ftof2(_lof2(sumd),-_hif2(sumc)),c9);
- //update c[0,5] and c[1,5]
- ca = _daddsp(_ftof2(_hif2(sum5),_lof2(sum4)),ca);
- //update c[2,5] and c[3,5]
- cb = _daddsp(_ftof2(_hif2(sumd),_lof2(sumc)),cb);
- //update c[0,6] and c[1,6]
- cc = _daddsp(_ftof2(_lof2(sum7),-_hif2(sum6)),cc);
- //update c[2,6] and c[3,6]
- cd = _daddsp(_ftof2(_lof2(sumf),-_hif2(sume)),cd);
- //update c[0,7] and c[1,7]
- ce = _daddsp(_ftof2(_hif2(sum7),_lof2(sum6)),ce);
- //update c[2,7] and c[3,7]
- cf = _daddsp(_ftof2(_hif2(sumf),_lof2(sume)),cf);
-
- ptrC = (__float2_t *) (c+(cs_c<<2));
- *ptrC++ = c8;
- *ptrC-- = c9;
- ptrC += (cs_c>>1);
- *ptrC++ = ca;
- *ptrC-- = cb;
- ptrC += (cs_c>>1);
- *ptrC++ = cc;
- *ptrC-- = cd;
- ptrC += (cs_c>>1);
- *ptrC++ = ce;
- *ptrC = cf;*/
-
- __float2_t c0, c1, c2, c3, c4, c5, c6, c7;
- __float2_t c8, c9, ca, cb, cc, cd, ce, cf;
-
- ptrC = (__float2_t *) c;
- c0 = *ptrC++;
- c1 = *ptrC--;
- ptrC += (cs_c>>1);
- c2 = *ptrC++;
- c3 = *ptrC--;
- ptrC += (cs_c>>1);
- c4 = *ptrC++;
- c5 = *ptrC--;
- ptrC += (cs_c>>1);
- c6 = *ptrC++;
- c7 = *ptrC--;
- ptrC += (cs_c>>1);
- c8 = *ptrC++;
- c9 = *ptrC--;
- ptrC += (cs_c>>1);
- ca = *ptrC++;
- cb = *ptrC--;
- ptrC += (cs_c>>1);
- cc = *ptrC++;
- cd = *ptrC--;
- ptrC += (cs_c>>1);
- ce = *ptrC++;
- cf = *ptrC;
-
- sum0 = _dmpysp(regA2, sum0);
- sum1 = _dmpysp(regA2, sum1);
- sum2 = _dmpysp(regA2, sum2);
- sum3 = _dmpysp(regA2, sum3);
- sum4 = _dmpysp(regA2, sum4);
- sum5 = _dmpysp(regA2, sum5);
- sum6 = _dmpysp(regA2, sum6);
- sum7 = _dmpysp(regA2, sum7);
- sum8 = _dmpysp(regA2, sum8);
- sum9 = _dmpysp(regA2, sum9);
- suma = _dmpysp(regA2, suma);
- sumb = _dmpysp(regA2, sumb);
- sumc = _dmpysp(regA2, sumc);
- sumd = _dmpysp(regA2, sumd);
- sume = _dmpysp(regA2, sume);
- sumf = _dmpysp(regA2, sumf);
-
- c0 = _dmpysp(c0, regB2);
- c1 = _dmpysp(c1, regB2);
- c2 = _dmpysp(c2, regB2);
- c3 = _dmpysp(c3, regB2);
- c4 = _dmpysp(c4, regB2);
- c5 = _dmpysp(c5, regB2);
- c6 = _dmpysp(c6, regB2);
- c7 = _dmpysp(c7, regB2);
- c8 = _dmpysp(c8, regB2);
- c9 = _dmpysp(c9, regB2);
- ca = _dmpysp(ca, regB2);
- cb = _dmpysp(cb, regB2);
- cc = _dmpysp(cc, regB2);
- cd = _dmpysp(cd, regB2);
- ce = _dmpysp(ce, regB2);
- cf = _dmpysp(cf, regB2);
-
- // update c[0,0] and c[1,0]
- c0 = _daddsp(_ftof2(_lof2(sum1),-_hif2(sum0)),c0);
- // update c[2,0] and c[3,0]
- c1 = _daddsp(_ftof2(_lof2(sum9),-_hif2(sum8)),c1);
- //update c[0,1] and c[1,1]
- c2 = _daddsp(_ftof2(_hif2(sum1),_lof2(sum0)),c2);
- // update c[2,1] and c[3,1]
- c3 = _daddsp(_ftof2(_hif2(sum9),_lof2(sum8)),c3);
- //update c[0,2] and c[1,2]
- c4 = _daddsp(_ftof2(_lof2(sum3),-_hif2(sum2)),c4);
- //update c[2,2] and c[3,2]
- c5 = _daddsp(_ftof2(_lof2(sumb),-_hif2(suma)),c5);
- //update c[0,3] and c[1,3]
- c6 = _daddsp(_ftof2(_hif2(sum3),_lof2(sum2)),c6);
- //update c[2,3] and c[3,3]
- c7 = _daddsp(_ftof2(_hif2(sumb),_lof2(suma)),c7);
- //update c[0,4] and c[1,4]
- c8 = _daddsp(_ftof2(_lof2(sum5),-_hif2(sum4)),c8);
- //update c[2,4] and c[3,4]
- c9 = _daddsp(_ftof2(_lof2(sumd),-_hif2(sumc)),c9);
- //update c[0,5] and c[1,5]
- ca = _daddsp(_ftof2(_hif2(sum5),_lof2(sum4)),ca);
- //update c[2,5] and c[3,5]
- cb = _daddsp(_ftof2(_hif2(sumd),_lof2(sumc)),cb);
- //update c[0,6] and c[1,6]
- cc = _daddsp(_ftof2(_lof2(sum7),-_hif2(sum6)),cc);
- //update c[2,6] and c[3,6]
- cd = _daddsp(_ftof2(_lof2(sumf),-_hif2(sume)),cd);
- //update c[0,7] and c[1,7]
- ce = _daddsp(_ftof2(_hif2(sum7),_lof2(sum6)),ce);
- //update c[2,7] and c[3,7]
- cf = _daddsp(_ftof2(_hif2(sumf),_lof2(sume)),cf);
-
- ptrC = (__float2_t *) c;
- *ptrC++ = c0;
- *ptrC-- = c1;
- ptrC += (cs_c>>1);
- *ptrC++ = c2;
- *ptrC-- = c3;
- ptrC += (cs_c>>1);
- *ptrC++ = c4;
- *ptrC-- = c5;
- ptrC += (cs_c>>1);
- *ptrC++ = c6;
- *ptrC-- = c7;
- ptrC += (cs_c>>1);
- *ptrC++ = c8;
- *ptrC-- = c9;
- ptrC += (cs_c>>1);
- *ptrC++ = ca;
- *ptrC-- = cb;
- ptrC += (cs_c>>1);
- *ptrC++ = cc;
- *ptrC-- = cd;
- ptrC += (cs_c>>1);
- *ptrC++ = ce;
- *ptrC = cf;
-
-
-#endif
- }
-#ifdef BLIS_ENABLE_CYCLE_COUNT
- counter_end=TSCL;
- if (CSL_chipReadDNUM () == 0)
- printf("%d\n", counter_end-counter_start);
-#endif
-
-}
-/*
- * bli_sgemm_ukernel_4x4
- *
- * Single-precision 4x4 GEMM micro-kernel written with DSP intrinsics
- * (_cmpysp, _daddsp, _dmpysp operating on __float2_t / __x128_t values).
- *
- * For each of the k steps it reads 4 floats from a and 4 floats from b and
- * accumulates their pairwise products (see the lane comments in the loop).
- * Afterwards every element of the 4x4 tile at c is replaced by
- *     (*beta) * c[i,j] + (*alpha) * accumulated_sum[i,j].
- *
- * Parameters:
- *   k           number of accumulation steps
- *   alpha       pointer to the scalar applied to the accumulated products
- *   a           A panel, read sequentially through a __float2_t pointer
- *               (two __float2_t, i.e. 4 floats, per step)
- *   b           B panel, read the same way (4 floats per step)
- *   beta        pointer to the scalar applied to the existing contents of c
- *   c           output tile
- *   rs_c, cs_c  row and column strides of c, in float elements
- *   data        not referenced in this function
- *
- * Structure:
- *   - The main loop consumes two k steps per iteration and keeps two
- *     independent accumulator sets (sum0..sum7 and sum8..sumf); an odd
- *     trailing step goes into sum0..sum7, and the two sets are added
- *     together after the loop.
- *   - rs_c != 1: C is updated with scalar loads/stores through float pointers.
- *   - rs_c == 1: C is accessed through a __float2_t pointer, two rows at a
- *     time, and columns are stepped by (cs_c>>1) __float2_t elements.
- *     NOTE(review): this presumably requires cs_c to be even and c to be
- *     suitably aligned for __float2_t access -- confirm against the caller.
- *   - With BLIS_ENABLE_CYCLE_COUNT defined, TSCL is reset and sampled around
- *     the accumulation phase and around the C-update phase, and the counts
- *     are printed when CSL_chipReadDNUM() returns 0.
- */
-void bli_sgemm_ukernel_4x4(
- dim_t k,
- float* restrict alpha,
- float* restrict a,
- float* restrict b,
- float* restrict beta,
- float* restrict c, inc_t rs_c, inc_t cs_c,
- auxinfo_t* data
- )
-{
- __float2_t sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
- __float2_t sum8, sum9, suma, sumb, sumc, sumd, sume, sumf;
- __float2_t * restrict ptrB = (__float2_t *) b;
- __float2_t * restrict ptrA = (__float2_t *) a;
- __float2_t * restrict ptrC;
- __float2_t regA2, regB2, regC, regS, regR;
- float* restrict c0, * restrict c1; // row pointers used by the rs_c != 1 path
- int_least16_t index; // NOTE(review): narrow loop counter; presumably k is always small enough -- confirm
- int kEven, kLeft;
-
-#ifdef BLIS_ENABLE_CYCLE_COUNT
- volatile int counter_start;
- volatile int counter_end;
-#endif
-
-#ifdef BLIS_ENABLE_CYCLE_COUNT
- TSCL=0;
- counter_start = TSCL;
-#endif
-
- // zero out accumulators
- sum0 = 0.0;
- sum1 = 0.0;
- sum2 = 0.0;
- sum3 = 0.0;
- sum4 = 0.0;
- sum5 = 0.0;
- sum6 = 0.0;
- sum7 = 0.0;
- sum8 = 0.0;
- sum9 = 0.0;
- suma = 0.0;
- sumb = 0.0;
- sumc = 0.0;
- sumd = 0.0;
- sume = 0.0;
- sumf = 0.0;
-
- kEven=k>>1; // number of two-step loop iterations
- kLeft=k&1; // non-zero when k is odd: one step remains after the loop
- //TSCL = 0;
- //cycles = TSCL;
-
-
- for (index = 0; index < kEven; index++)
- { // loop over k, two steps per iteration;
- // each step performs a rank one update of 4x1 by 1x4
- // matrices of A and B respectively; result is
- // accumulated over 4x4 matrix
- __float2_t b01, b23, a01, a23;
- __x128_t reg128;
-
- // first step of the pair: accumulate into sum0..sum7
- a01 = *ptrA++;
- a23 = *ptrA++;
-
- b01 = *ptrB++;
- b23 = *ptrB++;
-
- reg128 = _cmpysp(a01, b01);
- // accumulate a[0]*b[1] and -a[0]*b[0]
- sum0 = _daddsp(sum0, _lof2_128(reg128));
- // accumulate a[1]*b[0] and a[1]*b[1]
- sum1 = _daddsp(sum1, _hif2_128(reg128));
-
- reg128 = _cmpysp(a01, b23);
- // accumulate a[0]*b[3] and -a[0]*b[2]
- sum2 = _daddsp(sum2, _lof2_128(reg128));
- // accumulate a[1]*b[2] and a[1]*b[3]
- sum3 = _daddsp(sum3, _hif2_128(reg128));
-
- reg128 = _cmpysp(a23, b01);
- // accumulate a[2]*b[1] and -a[2]*b[0]
- sum4 = _daddsp(sum4, _lof2_128(reg128));
- // accumulate a[3]*b[0] and a[3]*b[1]
- sum5 = _daddsp(sum5, _hif2_128(reg128));
-
- reg128 = _cmpysp(a23, b23);
- // accumulate a[2]*b[3] and -a[2]*b[2]
- sum6 = _daddsp(sum6, _lof2_128(reg128));
- // accumulate a[3]*b[2] and a[3]*b[3]
- sum7 = _daddsp(sum7, _hif2_128(reg128));
-
-
-
- // second step of the pair: accumulate into the separate set sum8..sumf
- a01 = *ptrA++;
- a23 = *ptrA++;
-
- b01 = *ptrB++;
- b23 = *ptrB++;
-
- reg128 = _cmpysp(a01, b01);
- // accumulate a[0]*b[1] and -a[0]*b[0]
- sum8 = _daddsp(sum8, _lof2_128(reg128));
- // accumulate a[1]*b[0] and a[1]*b[1]
- sum9 = _daddsp(sum9, _hif2_128(reg128));
-
- reg128 = _cmpysp(a01, b23);
- // accumulate a[0]*b[3] and -a[0]*b[2]
- suma = _daddsp(suma, _lof2_128(reg128));
- // accumulate a[1]*b[2] and a[1]*b[3]
- sumb = _daddsp(sumb, _hif2_128(reg128));
-
- reg128 = _cmpysp(a23, b01);
- // accumulate a[2]*b[1] and -a[2]*b[0]
- sumc = _daddsp(sumc, _lof2_128(reg128));
- // accumulate a[3]*b[0] and a[3]*b[1]
- sumd = _daddsp(sumd, _hif2_128(reg128));
-
- reg128 = _cmpysp(a23, b23);
- // accumulate a[2]*b[3] and -a[2]*b[2]
- sume = _daddsp(sume, _lof2_128(reg128));
- // accumulate a[3]*b[2] and a[3]*b[3]
- sumf = _daddsp(sumf, _hif2_128(reg128));
-
- }
- if(kLeft)
- { // one step left over when k is odd; it goes into sum0..sum7
- __float2_t b01, b23, a01, a23;
- __x128_t reg128;
-
- a01 = *ptrA++;
- a23 = *ptrA++;
-
- b01 = *ptrB++;
- b23 = *ptrB++;
-
- reg128 = _cmpysp(a01, b01);
- // accumulate a[0]*b[1] and -a[0]*b[0]
- sum0 = _daddsp(sum0, _lof2_128(reg128));
- // accumulate a[1]*b[0] and a[1]*b[1]
- sum1 = _daddsp(sum1, _hif2_128(reg128));
-
- reg128 = _cmpysp(a01, b23);
- // accumulate a[0]*b[3] and -a[0]*b[2]
- sum2 = _daddsp(sum2, _lof2_128(reg128));
- // accumulate a[1]*b[2] and a[1]*b[3]
- sum3 = _daddsp(sum3, _hif2_128(reg128));
-
- reg128 = _cmpysp(a23, b01);
- // accumulate a[2]*b[1] and -a[2]*b[0]
- sum4 = _daddsp(sum4, _lof2_128(reg128));
- // accumulate a[3]*b[0] and a[3]*b[1]
- sum5 = _daddsp(sum5, _hif2_128(reg128));
-
- reg128 = _cmpysp(a23, b23);
- // accumulate a[2]*b[3] and -a[2]*b[2]
- sum6 = _daddsp(sum6, _lof2_128(reg128));
- // accumulate a[3]*b[2] and a[3]*b[3]
- sum7 = _daddsp(sum7, _hif2_128(reg128));
-
- }
-#ifdef BLIS_ENABLE_CYCLE_COUNT
- counter_end=TSCL;
- if (CSL_chipReadDNUM () == 0)
- printf("%d\t%d\t",k, counter_end-counter_start);
-#endif
-
-#ifdef BLIS_ENABLE_CYCLE_COUNT
- TSCL=0;
- counter_start = TSCL;
-#endif
- sum0 = _daddsp(sum0, sum8); // fold the second accumulator set into sum0..sum7
- sum1 = _daddsp(sum1, sum9);
- sum2 = _daddsp(sum2, suma);
- sum3 = _daddsp(sum3, sumb);
- sum4 = _daddsp(sum4, sumc);
- sum5 = _daddsp(sum5, sumd);
- sum6 = _daddsp(sum6, sume);
- sum7 = _daddsp(sum7, sumf);
-
- // broadcast alpha and beta into both halves of a __float2_t
- regA2 = _ftof2(*alpha, *alpha);
- regB2 = _ftof2(*beta, *beta);
- if (rs_c != 1) // general-stride path: scalar access through c0/c1
- {
- // update c[0,0] and c[1,0]
- c0 = (c + 0*rs_c + 0*cs_c);
- c1 = (c + 1*rs_c + 0*cs_c);
- regC = _ftof2(*c1,*c0);
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_lof2(sum1),-_hif2(sum0)); // the high half of sum0 is negated here, matching the sign noted in the loop comments
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *c0 = _lof2(regR);
- *c1 = _hif2(regR);
-
- //update c[0,1] and c[1,1]
- c0 = c0 + 1*cs_c;
- c1 = c1 + 1*cs_c;
- regC = _ftof2(*c1,*c0);
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_hif2(sum1),_lof2(sum0));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *c0 = _lof2(regR);
- *c1 = _hif2(regR);
-
- //update c[0,2] and c[1,2]
- c0 = c0 + 1*cs_c;
- c1 = c1 + 1*cs_c;
- regC = _ftof2(*c1,*c0);
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_lof2(sum3),-_hif2(sum2));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *c0 = _lof2(regR);
- *c1 = _hif2(regR);
-
- //update c[0,3] and c[1,3]
- c0 = c0 + 1*cs_c;
- c1 = c1 + 1*cs_c;
- regC = _ftof2(*c1,*c0);
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_hif2(sum3),_lof2(sum2));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *c0 = _lof2(regR);
- *c1 = _hif2(regR);
-
- // update c[2,0] and c[3,0]
- c0 = (c + 2*rs_c + 0*cs_c);
- c1 = (c + 3*rs_c + 0*cs_c);
- regC = _ftof2(*c1,*c0);
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_lof2(sum5),-_hif2(sum4));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *c0 = _lof2(regR);
- *c1 = _hif2(regR);
-
- // update c[2,1] and c[3,1]
- c0 = c0 + 1*cs_c;
- c1 = c1 + 1*cs_c;
- regC = _ftof2(*c1,*c0);
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_hif2(sum5),_lof2(sum4));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *c0 = _lof2(regR);
- *c1 = _hif2(regR);
-
- //update c[2,2] and c[3,2]
- c0 = c0 + 1*cs_c;
- c1 = c1 + 1*cs_c;
- regC = _ftof2(*c1,*c0);
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_lof2(sum7),-_hif2(sum6));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *c0 = _lof2(regR);
- *c1 = _hif2(regR);
-
- //update c[2,3] and c[3,3]
- c0 = c0 + 1*cs_c;
- c1 = c1 + 1*cs_c;
- regC = _ftof2(*c1,*c0);
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_hif2(sum7),_lof2(sum6));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *c0 = _lof2(regR);
- *c1 = _hif2(regR);
-
- }
- else
- { // rs_c == 1 path: C is read and written as __float2_t pairs (two rows at once)
- __float2_t c0, c1, c2, c3, c4, c5, c6, c7; // these shadow the outer float* c0/c1
-
- ptrC = (__float2_t *) c;
- c0 = *ptrC++;
- c1 = *ptrC--;
- ptrC += (cs_c>>1); // next column in __float2_t units (assumes cs_c is even -- TODO confirm)
- c2 = *ptrC++;
- c3 = *ptrC--;
- ptrC += (cs_c>>1);
- c4 = *ptrC++;
- c5 = *ptrC--;
- ptrC += (cs_c>>1);
- c6 = *ptrC++;
- c7 = *ptrC--;
-
- // scale the accumulated sums by alpha
- sum0 = _dmpysp(regA2, sum0);
- sum1 = _dmpysp(regA2, sum1);
- sum2 = _dmpysp(regA2, sum2);
- sum3 = _dmpysp(regA2, sum3);
- sum4 = _dmpysp(regA2, sum4);
- sum5 = _dmpysp(regA2, sum5);
- sum6 = _dmpysp(regA2, sum6);
- sum7 = _dmpysp(regA2, sum7);
-
- // scale the loaded C values by beta
- c0 = _dmpysp(c0, regB2);
- c1 = _dmpysp(c1, regB2);
- c2 = _dmpysp(c2, regB2);
- c3 = _dmpysp(c3, regB2);
- c4 = _dmpysp(c4, regB2);
- c5 = _dmpysp(c5, regB2);
- c6 = _dmpysp(c6, regB2);
- c7 = _dmpysp(c7, regB2);
-
- // update c[0,0] and c[1,0]
- c0 = _daddsp(_ftof2(_lof2(sum1),-_hif2(sum0)),c0);
- // update c[2,0] and c[3,0]
- c1 = _daddsp(_ftof2(_lof2(sum5),-_hif2(sum4)),c1);
- //update c[0,1] and c[1,1]
- c2 = _daddsp(_ftof2(_hif2(sum1),_lof2(sum0)),c2);
- // update c[2,1] and c[3,1]
- c3 = _daddsp(_ftof2(_hif2(sum5),_lof2(sum4)),c3);
- //update c[0,2] and c[1,2]
- c4 = _daddsp(_ftof2(_lof2(sum3),-_hif2(sum2)),c4);
- //update c[2,2] and c[3,2]
- c5 = _daddsp(_ftof2(_lof2(sum7),-_hif2(sum6)),c5);
- //update c[0,3] and c[1,3]
- c6 = _daddsp(_ftof2(_hif2(sum3),_lof2(sum2)),c6);
- //update c[2,3] and c[3,3]
- c7 = _daddsp(_ftof2(_hif2(sum7),_lof2(sum6)),c7);
-
- // write the results back in the same order they were loaded
- ptrC = (__float2_t *) c;
- *ptrC++ = c0;
- *ptrC-- = c1;
- ptrC += (cs_c>>1);
- *ptrC++ = c2;
- *ptrC-- = c3;
- ptrC += (cs_c>>1);
- *ptrC++ = c4;
- *ptrC-- = c5;
- ptrC += (cs_c>>1);
- *ptrC++ = c6;
- *ptrC-- = c7;
-
- }
-#ifdef BLIS_ENABLE_CYCLE_COUNT
- counter_end=TSCL;
- if (CSL_chipReadDNUM () == 0)
- printf("%d\n", counter_end-counter_start);
-#endif
-}
-
-/*
- * bli_dgemm_ukernel_4x4
- *
- * Double-precision 4x4 GEMM micro-kernel written in plain scalar C.
- *
- * Each of the k steps reads 4 doubles from a and 4 doubles from b and adds
- * the 16 products a[i]*b[j] into sum<i><j>. Afterwards every element of the
- * 4x4 tile at c is updated as
- *     c[i,j] = c[i,j] * (*beta) + sum<i><j> * (*alpha).
- *
- * Parameters:
- *   k           number of accumulation steps
- *   alpha       pointer to the scalar applied to the accumulated products
- *   a           A panel, consumed sequentially (4 doubles per step)
- *   b           B panel, consumed sequentially (4 doubles per step)
- *   beta        pointer to the scalar applied to the existing contents of c
- *   c           output tile
- *   rs_c, cs_c  row and column strides of c, in double elements
- *   data        not referenced in this function
- *
- * Notes:
- *   - c is always read and multiplied by *beta, including when *beta is zero.
- *   - The BLIS_ENABLE_PREFETCH section currently holds only commented-out
- *     touch() calls, so it has no effect.
- *   - With BLIS_ENABLE_CYCLE_COUNT defined, TSCL is reset and sampled around
- *     the accumulation loop and around the C update, and the counts are
- *     printed when CSL_chipReadDNUM() returns 0.
- */
-//void dgemmKernel(const double *pA, const double *pB, double *pC, const double a, const int k, const int stepC)
-void bli_dgemm_ukernel_4x4(
- dim_t k,
- double* restrict alpha,
- double* restrict a,
- double* restrict b,
- double* restrict beta,
- double* restrict c, inc_t rs_c, inc_t cs_c,
- auxinfo_t* data
- )
-{
- double sum00, sum01, sum02, sum03;
- double sum10, sum11, sum12, sum13;
- double sum20, sum21, sum22, sum23;
- double sum30, sum31, sum32, sum33;
- int index;
- double al = *alpha; // local copies of the two scalars
- double be = *beta;
-#ifdef BLIS_ENABLE_CYCLE_COUNT
- volatile int counter_start;
- volatile int counter_end;
-#endif
-
-
-#ifdef BLIS_ENABLE_CYCLE_COUNT
- TSCL = 0;
- counter_start = TSCL;
-#endif
- //touch routine: both a & b
- //Length of b = NR*K*size of double;
- //Length of a = MR*K*size of double;
-#ifdef BLIS_ENABLE_PREFETCH
- //touch(b, k*BLIS_DEFAULT_NR_D*8);
- //touch(a, k*BLIS_DEFAULT_MR_D*8);
-#endif
-
-
-
-
- sum00 = 0.0; // zero all 16 accumulators (sum<row><col>)
- sum01 = 0.0;
- sum02 = 0.0;
- sum03 = 0.0;
- sum10 = 0.0;
- sum11 = 0.0;
- sum12 = 0.0;
- sum13 = 0.0;
- sum20 = 0.0;
- sum21 = 0.0;
- sum22 = 0.0;
- sum23 = 0.0;
- sum30 = 0.0;
- sum31 = 0.0;
- sum32 = 0.0;
- sum33 = 0.0;
-
-
- for(index = 0; index < k; index++)
- { // loop over k;
- // each iteration performs rank one update of 4x1 by 1x4
- // matrices of A and B respectively; result is
- // accumulated over 4x4 matrix
- register double a0, a1, a2, a3;
- register double b0, b1, b2, b3;
-
- a0 = *a++; // a and b themselves are advanced by 4 each step
- a1 = *a++;
- a2 = *a++;
- a3 = *a++;
- b0 = *b++;
- b1 = *b++;
- b2 = *b++;
- b3 = *b++;
-
- // a[0]*b[0]
- sum00 += a0*b0;
- // a[0]*b[1]
- sum01 += a0*b1;
- // a[0]*b[2]
- sum02 += a0*b2;
- // a[0]*b[3]
- sum03 += a0*b3;
- // a[1]*b[0]
- sum10 += a1*b0;
- // a[1]*b[1]
- sum11 += a1*b1;
- // a[1]*b[2]
- sum12 += a1*b2;
- // a[1]*b[3]
- sum13 += a1*b3;
- // a[2]*b[0]
- sum20 += a2*b0;
- // a[2]*b[1]
- sum21 += a2*b1;
- // a[2]*b[2]
- sum22 += a2*b2;
- // a[2]*b[3]
- sum23 += a2*b3;
- // a[3]*b[0]
- sum30 += a3*b0;
- // a[3]*b[1]
- sum31 += a3*b1;
- // a[3]*b[2]
- sum32 += a3*b2;
- // a[3]*b[3]
- sum33 += a3*b3;
- }
-#ifdef BLIS_ENABLE_CYCLE_COUNT
- counter_end=TSCL;
- if (CSL_chipReadDNUM () == 0)
- printf("%d\t%d\t",k, counter_end-counter_start);
-#endif
-#ifdef BLIS_ENABLE_CYCLE_COUNT
- TSCL = 0;
- counter_start = TSCL;
-#endif
-
- double* restrict cptr; // walks down one column of the C tile at a time, stepping by rs_c
- // 0th Column
- // updating C[00]
- cptr = c;
- *cptr = *cptr * be;
- *cptr += sum00 * al;
-
- // updating C[10]
- cptr += rs_c;
- *cptr = *cptr*be;
- *cptr += sum10 * al;
-
- // updating C[20]
- cptr += rs_c;
- *cptr = *cptr*be;
- *cptr += sum20 * al;
-
- // updating C[30]
- cptr += rs_c;
- *cptr = *cptr*be;
- *cptr += sum30 * al;
-
- // 1st column
- // updating C[01]
- cptr = c + cs_c;
- *cptr = *cptr*be;
- *cptr += sum01 * al;
-
- // updating C[11]
- cptr += rs_c;
- *cptr = *cptr*be;
- *cptr += sum11 * al;
-
- // updating C[21]
- cptr += rs_c;
- *cptr = *cptr*be;
- *cptr += sum21 * al;
-
- // updating C[31]
- cptr += rs_c;
- *cptr = *cptr*be;
- *cptr += sum31 * al;
-
- // 2nd Column
- // updating C[02]
- cptr = c + 2*cs_c;
- *cptr = *cptr*be;
- *cptr += sum02 * al;
-
- // updating C[12]
- cptr += rs_c;
- *cptr = *cptr*be;
- *cptr += sum12 * al;
-
- // updating C[22]
- cptr += rs_c;
- *cptr = *cptr*be;
- *cptr += sum22 * al;
-
- // updating C[32]
- cptr += rs_c;
- *cptr = *cptr*be;
- *cptr += sum32 * al;
-
- // 3rd Column
- // updating C[03]
- cptr = c + 3*cs_c;
- *cptr = *cptr*be;
- *cptr += sum03 * al;
-
- // updating C[13]
- cptr += rs_c;
- *cptr = *cptr*be;
- *cptr += sum13 * al;
-
- // updating C[23]
- cptr += rs_c;
- *cptr = *cptr*be;
- *cptr += sum23 * al;
-
- // updating C[33]
- cptr += rs_c;
- *cptr = *cptr*be;
- *cptr += sum33 * al;
-#ifdef BLIS_ENABLE_CYCLE_COUNT
- counter_end=TSCL;
- if (CSL_chipReadDNUM () == 0)
- printf("%d\n", counter_end-counter_start);
-#endif
- return;
-}
-/*
- * bli_cgemm_ukernel_2x4
- *
- * Single-precision complex 2x4 GEMM micro-kernel written with DSP intrinsics
- * (_cmpysp, _daddsp on __float2_t / __x128_t values). The scomplex operands
- * are accessed through __float2_t pointers; presumably one __float2_t holds
- * one (real, imaginary) pair -- verify against the scomplex definition.
- *
- * Each of the k steps reads 2 elements from a and 4 elements from b. Every
- * _cmpysp result is 128 bits wide; its low and high halves are accumulated
- * separately in the "a" and "b" accumulators (sumXYa / sumXYb) and are only
- * added together after the loop.
- *
- * After the loop, each of the 8 tile elements is processed in the order
- * c[0,0], c[1,0], c[0,1], c[1,1], c[0,2], c[1,2], c[0,3], c[1,3]:
- * the combined sum is multiplied with the rearranged alpha via _cmpysp, the
- * existing C element is multiplied with beta via _cmpysp, and the two results
- * are added (with the half swaps and sign flips shown in the code) and stored.
- *
- * Parameters:
- *   k           number of accumulation steps
- *   alpha       pointer to the complex scalar applied to the accumulated sums
- *   a           A panel, read sequentially (2 elements per step)
- *   b           B panel, read sequentially (4 elements per step)
- *   beta        pointer to the complex scalar applied to the existing c
- *   c           output tile
- *   rs_c, cs_c  row and column strides of c, applied in units of one
- *               __float2_t element
- *   data        not referenced in this function
- *
- * Notes:
- *   - There is no special case for particular stride values; every element
- *     address is computed from c, rs_c and cs_c.
- *   - NOTE(review): index is int_least16_t while k is dim_t; presumably k is
- *     always small enough for the counter -- confirm.
- *   - With BLIS_ENABLE_CYCLE_COUNT defined, TSCL is reset and sampled around
- *     the accumulation loop and around the C update, and the counts are
- *     printed when CSL_chipReadDNUM() returns 0.
- */
-void bli_cgemm_ukernel_2x4(
- dim_t k,
- scomplex* restrict alpha,
- scomplex* restrict a,
- scomplex* restrict b,
- scomplex* restrict beta,
- scomplex* restrict c, inc_t rs_c, inc_t cs_c,
- auxinfo_t* data
- )
-{
- __float2_t sum00a, sum10a, sum00b, sum10b;
- __float2_t sum01a, sum11a, sum01b, sum11b;
- __float2_t sum02a, sum12a, sum02b, sum12b;
- __float2_t sum03a, sum13a, sum03b, sum13b;
- __float2_t * restrict ptrB = (__float2_t *) b;
- __float2_t * restrict ptrA = (__float2_t *) a;
- __float2_t * restrict ptrC;
- __float2_t regA, regB, regC;
- int_least16_t index;
-
-#ifdef BLIS_ENABLE_CYCLE_COUNT
- volatile int counter_start;
- volatile int counter_end;
-#endif
-
- // zero out accumulators
- sum00a = 0.0;
- sum10a = 0.0;
- sum01a = 0.0;
- sum11a = 0.0;
- sum02a = 0.0;
- sum12a = 0.0;
- sum03a = 0.0;
- sum13a = 0.0;
- sum00b = 0.0;
- sum10b = 0.0;
- sum01b = 0.0;
- sum11b = 0.0;
- sum02b = 0.0;
- sum12b = 0.0;
- sum03b = 0.0;
- sum13b = 0.0;
-
-#ifdef BLIS_ENABLE_CYCLE_COUNT
- TSCL=0;
- counter_start = TSCL;
-#endif
-
- for (index = 0; index < k; index++)
- { // loop over k;
- // each iteration performs rank one update of 2x1 by 1x4
- // matrices of A and B respectively; result is
- // accumulated over 2x4 matrix
- __float2_t b0, b1, b2, b3, a0, a1;
- __x128_t reg128;
-
- a0 = *ptrA++;
- a1 = *ptrA++;
-
- b0 = *ptrB++;
- b1 = *ptrB++;
- b2 = *ptrB++;
- b3 = *ptrB++;
-
- // the low and high halves of each 128-bit product are
- // accumulated independently (sumXYa and sumXYb)
- // a[0]*b[0]
- reg128 = _cmpysp(a0, b0);
- sum00a = _daddsp(sum00a, _lof2_128(reg128));
- sum00b = _daddsp(sum00b, _hif2_128(reg128));
-
- // a[1]*b[0]
- reg128 = _cmpysp(a1, b0);
- sum10a = _daddsp(sum10a, _lof2_128(reg128));
- sum10b = _daddsp(sum10b, _hif2_128(reg128));
-
- // a[0]*b[1]
- reg128 = _cmpysp(a0, b1);
- sum01a = _daddsp(sum01a, _lof2_128(reg128));
- sum01b = _daddsp(sum01b, _hif2_128(reg128));
-
- // a[1]*b[1]
- reg128 = _cmpysp(a1, b1);
- sum11a = _daddsp(sum11a, _lof2_128(reg128));
- sum11b = _daddsp(sum11b, _hif2_128(reg128));
-
- // a[0]*b[2]
- reg128 = _cmpysp(a0, b2);
- sum02a = _daddsp(sum02a, _lof2_128(reg128));
- sum02b = _daddsp(sum02b, _hif2_128(reg128));
-
- // a[1]*b[2]
- reg128 = _cmpysp(a1, b2);
- sum12a = _daddsp(sum12a, _lof2_128(reg128));
- sum12b = _daddsp(sum12b, _hif2_128(reg128));
-
- // a[0]*b[3]
- reg128 = _cmpysp(a0, b3);
- sum03a = _daddsp(sum03a, _lof2_128(reg128));
- sum03b = _daddsp(sum03b, _hif2_128(reg128));
-
- // a[1]*b[3]
- reg128 = _cmpysp(a1, b3);
- sum13a = _daddsp(sum13a, _lof2_128(reg128));
- sum13b = _daddsp(sum13b, _hif2_128(reg128));
- }
-
-#ifdef BLIS_ENABLE_CYCLE_COUNT
- counter_end=TSCL;
- if (CSL_chipReadDNUM () == 0)
- printf("%d\t%d\t",k, counter_end-counter_start);
-#endif
-
-#ifdef BLIS_ENABLE_CYCLE_COUNT
- TSCL=0;
- counter_start = TSCL;
-#endif
- {
- __x128_t reg128;
- ptrA = (__float2_t *) alpha; // ptrA/ptrB are reused here to load alpha and beta
- ptrB = (__float2_t *) beta;
- regA = *ptrA;
- regB = *ptrB;
-
- // the value of alpha (regA) and the final values need to be
- // rearranged due to the specific way cmpysp assumes
- // data arrangement
- regA =_ftof2(-_lof(regA), _hif(regA));
- //regB = _ftof2(_lof(regB),_hif(regB));
- ptrC = (__float2_t *) c;
-
- // update and save c[0,0]
- sum00a = _daddsp(sum00a, sum00b); // combine the two partial accumulators
- reg128 = _cmpysp(regA, sum00a); // multiply the sum with the rearranged alpha
- sum00a = _daddsp(_hif2_128(reg128),_lof2_128(reg128));
- regC = *ptrC;
- //regC = _ftof2(_lof(regC), _hif(regC));
- reg128 = _cmpysp(regC,regB); // multiply the existing C element with beta
- regC = _daddsp(_lof2_128(reg128),_hif2_128(reg128));
- *ptrC = _daddsp(_ftof2(-_lof(sum00a),_hif(sum00a)),_ftof2(_lof(regC),-_hif(regC))); // swap halves and flip one sign on each operand, then add and store
-
- ptrC = (__float2_t *) c + rs_c;
-
- // update and save c[1,0]
- sum10a = _daddsp(sum10a, sum10b);
- reg128 = _cmpysp(regA, sum10a);
- sum10a = _daddsp(_hif2_128(reg128),_lof2_128(reg128));
- regC = *ptrC;
- //regC = _ftof2(_lof(regC), _hif(regC));
- reg128 = _cmpysp(regC,regB);
- regC = _daddsp(_lof2_128(reg128),_hif2_128(reg128));
- *ptrC = _daddsp(_ftof2(-_lof(sum10a),_hif(sum10a)),_ftof2(_lof(regC),-_hif(regC)));
-
-
- ptrC = (__float2_t *) c + cs_c;
-
- // update and save c[0,1]
- sum01a = _daddsp(sum01a, sum01b);
- reg128 = _cmpysp(regA, sum01a);
- sum01a = _daddsp(_hif2_128(reg128),_lof2_128(reg128));
- regC = *ptrC;
- //regC = _ftof2(_lof(regC), _hif(regC));
- reg128 = _cmpysp(regC,regB);
- regC = _daddsp(_lof2_128(reg128),_hif2_128(reg128));
- *ptrC = _daddsp(_ftof2(-_lof(sum01a),_hif(sum01a)),_ftof2(_lof(regC),-_hif(regC)));
-
- ptrC = (__float2_t *) c + rs_c + cs_c;
-
- // update and save c[1,1]
- sum11a = _daddsp(sum11a, sum11b);
- reg128 = _cmpysp(regA, sum11a);
- sum11a = _daddsp(_hif2_128(reg128),_lof2_128(reg128));
- regC = *ptrC;
- //regC = _ftof2(_lof(regC), _hif(regC));
- reg128 = _cmpysp(regC,regB);
- regC = _daddsp(_lof2_128(reg128),_hif2_128(reg128));
- *ptrC = _daddsp(_ftof2(-_lof(sum11a),_hif(sum11a)),_ftof2(_lof(regC),-_hif(regC)));
-
- ptrC = (__float2_t *) c + 2 * cs_c;
-
- // update and save c[0,2]
- sum02a = _daddsp(sum02a, sum02b);
- reg128 = _cmpysp(regA, sum02a);
- sum02a = _daddsp(_hif2_128(reg128),_lof2_128(reg128));
- regC = *ptrC;
- //regC = _ftof2(_lof(regC), _hif(regC));
- reg128 = _cmpysp(regC,regB);
- regC = _daddsp(_lof2_128(reg128),_hif2_128(reg128));
- *ptrC = _daddsp(_ftof2(-_lof(sum02a),_hif(sum02a)),_ftof2(_lof(regC),-_hif(regC)));
-
- ptrC = (__float2_t *) c + rs_c + 2* cs_c;
-
- // update and save c[1,2]
- sum12a = _daddsp(sum12a, sum12b);
- reg128 = _cmpysp(regA, sum12a);
- sum12a = _daddsp(_hif2_128(reg128),_lof2_128(reg128));
- regC = *ptrC;
- //regC = _ftof2(_lof(regC), _hif(regC));
- reg128 = _cmpysp(regC,regB);
- regC = _daddsp(_lof2_128(reg128),_hif2_128(reg128));
- *ptrC = _daddsp(_ftof2(-_lof(sum12a),_hif(sum12a)),_ftof2(_lof(regC),-_hif(regC)));
-
- ptrC = (__float2_t *) c + 3 * cs_c;
-
- // update and save c[0,3]
- sum03a = _daddsp(sum03a, sum03b);
- reg128 = _cmpysp(regA, sum03a);
- sum03a = _daddsp(_hif2_128(reg128),_lof2_128(reg128));
- regC = *ptrC;
- //regC = _ftof2(_lof(regC), _hif(regC));
- reg128 = _cmpysp(regC,regB);
- regC = _daddsp(_lof2_128(reg128),_hif2_128(reg128));
- *ptrC = _daddsp(_ftof2(-_lof(sum03a),_hif(sum03a)),_ftof2(_lof(regC),-_hif(regC)));
-
- ptrC = (__float2_t *) c + rs_c + 3 * cs_c;
-
- // update and save c[1,3]
- sum13a = _daddsp(sum13a, sum13b);
- reg128 = _cmpysp(regA, sum13a);
- sum13a = _daddsp(_hif2_128(reg128),_lof2_128(reg128));
- regC = *ptrC;
- //regC = _ftof2(_lof(regC), _hif(regC));
- reg128 = _cmpysp(regC,regB);
- regC = _daddsp(_lof2_128(reg128),_hif2_128(reg128));
- *ptrC = _daddsp(_ftof2(-_lof(sum13a),_hif(sum13a)),_ftof2(_lof(regC),-_hif(regC)));
-
- }
-
-#ifdef BLIS_ENABLE_CYCLE_COUNT
- counter_end=TSCL;
- if (CSL_chipReadDNUM () == 0)
- printf("%d\n", counter_end-counter_start);
-#endif
- return;
-}
-
-/*
- * Double-complex GEMM micro-kernel (plain C, no intrinsics).
- *
- * Accumulates sum = a[0]*b[0] + ... + a[k-1]*b[k-1] using complex
- * multiplication, then updates one element of C in place:
- *
- *     c[0] = beta * c[0] + alpha * sum
- *
- *   k      number of complex (a, b) pairs to accumulate
- *   alpha  complex scalar applied to the accumulated sum
- *   a, b   input streams; one dcomplex is read from each per step
- *   beta   complex scalar applied to the incoming c[0]
- *   c      element of C that is read and overwritten
- *   rs_c, cs_c, data   not referenced
- *
- * NOTE(review): despite the "2x2" in the name, only c[0] is touched and
- * the strides are ignored -- confirm this matches the configured MR/NR.
- */
-void bli_zgemm_ukernel_2x2(
-                dim_t k,
-                dcomplex* restrict alpha,
-                dcomplex* restrict a,
-                dcomplex* restrict b,
-                dcomplex* restrict beta,
-                dcomplex* restrict c, inc_t rs_c, inc_t cs_c,
-                auxinfo_t* data
-                )
-{
-    double * restrict ptrA = (double *) a;
-    double * restrict ptrB = (double *) b;
-    double sum00r = 0.0;
-    double sum00i = 0.0;
-    int index;
-    int done = 0; // number of k steps already accumulated
-    // Largest even value not exceeding k. Only bit 0 is cleared; the
-    // previous mask (0xFFFE) also cleared bits 16 and above, which silently
-    // shortened the accumulation whenever k >= 65536.
-    int kEven = (int) (k & ~((dim_t) 1));
-#ifdef BLIS_ENABLE_CYCLE_COUNT
-    volatile int counter_start;
-    volatile int counter_end;
-#endif
-
-#ifdef BLIS_ENABLE_CYCLE_COUNT
-    TSCL = 0;
-    counter_start = TSCL;
-#endif
-
-    if (k > 4) // the unrolled loop is only entered for k > 4
-    {
-        // even trip count, so the loop can be unrolled by two
-#pragma UNROLL(2)
-        for (index = 0; index < kEven; index++)
-        {
-            // one complex multiply-accumulate: sum += a * b
-            double a0r, a0i;
-            double b0r, b0i;
-
-            a0r = *ptrA++;
-            a0i = *ptrA++;
-
-            b0r = *ptrB++;
-            b0i = *ptrB++;
-
-            sum00r += a0r*b0r;
-            sum00r -= a0i*b0i;
-            sum00i += a0r*b0i;
-            sum00i += a0i*b0r;
-        }
-        done = kEven;
-    }
-
-    // Remaining steps: the single odd step when k > 4, or all of the
-    // steps (at most four) when k <= 4.
-    for (index = done; index < k; index++)
-    {
-        double a0r, a0i;
-        double b0r, b0i;
-
-        a0r = *ptrA++;
-        a0i = *ptrA++;
-
-        b0r = *ptrB++;
-        b0i = *ptrB++;
-
-        sum00r += a0r*b0r;
-        sum00r -= a0i*b0i;
-        sum00i += a0r*b0i;
-        sum00i += a0i*b0r;
-    }
-
-#ifdef BLIS_ENABLE_CYCLE_COUNT
-    counter_end=TSCL;
-    if (CSL_chipReadDNUM () == 0)
-        printf("%d\t%d\t",(int) k, counter_end-counter_start);
-#endif
-
-#ifdef BLIS_ENABLE_CYCLE_COUNT
-    TSCL=0;
-    counter_start = TSCL;
-#endif
-
-    { // final saving: c = beta*c + alpha*sum (complex arithmetic)
-        double alphar, alphai, betar, betai, cr, ci;
-        alphar = alpha->real;
-        alphai = alpha->imag;
-        betar = beta->real;
-        betai = beta->imag;
-
-        // snapshot c first: both output parts depend on both input parts
-        cr = c->real;
-        ci = c->imag;
-
-        c->imag = (betar * ci + betai * cr);
-        c->real = (betar * cr - betai * ci);
-        c->real += (alphar * sum00r - alphai * sum00i);
-        c->imag += (alphar * sum00i + alphai * sum00r);
-    }
-#ifdef BLIS_ENABLE_CYCLE_COUNT
-    counter_end=TSCL;
-    if (CSL_chipReadDNUM () == 0)
-        printf("%d\n", counter_end-counter_start);
-#endif
-
-    return;
-}
-
-
+/*\r
+\r
+ BLIS\r
+ An object-based framework for developing high-performance BLAS-like\r
+ libraries.\r
+\r
+ Copyright (C) 2014, The University of Texas\r
+\r
+ Redistribution and use in source and binary forms, with or without\r
+ modification, are permitted provided that the following conditions are\r
+ met:\r
+ - Redistributions of source code must retain the above copyright\r
+ notice, this list of conditions and the following disclaimer.\r
+ - Redistributions in binary form must reproduce the above copyright\r
+ notice, this list of conditions and the following disclaimer in the\r
+ documentation and/or other materials provided with the distribution.\r
+ - Neither the name of The University of Texas nor the names of its\r
+ contributors may be used to endorse or promote products derived\r
+ from this software without specific prior written permission.\r
+\r
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\r
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\r
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\r
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT\r
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\r
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\r
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\r
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\r
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\r
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\r
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
+\r
+ */\r
+#include "blis.h"\r
+\r
+/*\r
+ * Single-precision GEMM micro-kernel for a 4x8 tile of C:\r
+ *\r
+ *     C := beta * C + alpha * (A * B)\r
+ *\r
+ * Each of the k steps reads 4 floats from a and 8 floats from b and adds\r
+ * their outer product to 16 packed (two-float) accumulators.\r
+ *\r
+ *   k      number of rank-one updates to accumulate\r
+ *   alpha  scalar applied to the accumulated product\r
+ *   a      packed A, advanced by 4 floats per step\r
+ *   b      packed B, advanced by 8 floats per step\r
+ *   beta   scalar applied to the incoming values of C\r
+ *   c      tile of C, read and written in place\r
+ *   rs_c   stride between rows of C, in floats\r
+ *   cs_c   stride between columns of C, in floats\r
+ *   data   not referenced\r
+ *\r
+ * rs_c == 1 takes the column-by-column path (columns cs_c floats apart);\r
+ * any other rs_c takes the row-by-row path, which reads eight adjacent\r
+ * floats per row and therefore presumes cs_c == 1.\r
+ *\r
+ * NOTE(review): C is accessed as __float2_t and the stride is halved with\r
+ * >> 1, so this presumably requires an even stride and 8-byte aligned C --\r
+ * confirm against the callers.\r
+ */\r
+void bli_sgemm_ukernel_4x8(\r
+                dim_t k,\r
+                float* restrict alpha,\r
+                float* restrict a,\r
+                float* restrict b,\r
+                float* restrict beta,\r
+                float* restrict c, inc_t rs_c, inc_t cs_c,\r
+                auxinfo_t* data\r
+)\r
+{\r
+    __float2_t sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7;\r
+    __float2_t sum8, sum9, suma, sumb, sumc, sumd, sume, sumf;\r
+    __float2_t c0, c1, c2, c3, c4, c5, c6, c7;\r
+    __float2_t c8, c9, ca, cb, cc, cd, ce, cf;\r
+    __float2_t * restrict ptrB = (__float2_t *) b;\r
+    __float2_t * restrict ptrA = (__float2_t *) a;\r
+    __float2_t * restrict ptrC = (__float2_t *) c;\r
+    __float2_t regA2, regB2;\r
+    inc_t ldc2; // stride of C measured in __float2_t (two-float) units\r
+    int index;  // int, like the other kernels in this file; the former\r
+                // int_least16_t counter could overflow for large k\r
+\r
+    // zero out accumulators\r
+    sum0 = 0.0;\r
+    sum1 = 0.0;\r
+    sum2 = 0.0;\r
+    sum3 = 0.0;\r
+    sum4 = 0.0;\r
+    sum5 = 0.0;\r
+    sum6 = 0.0;\r
+    sum7 = 0.0;\r
+    sum8 = 0.0;\r
+    sum9 = 0.0;\r
+    suma = 0.0;\r
+    sumb = 0.0;\r
+    sumc = 0.0;\r
+    sumd = 0.0;\r
+    sume = 0.0;\r
+    sumf = 0.0;\r
+\r
+    for (index = 0; index < k; index++)\r
+    {   // loop over k;\r
+        // each iteration performs rank one update of 4x1 by 1x8\r
+        // matrices of A and B respectively; result is\r
+        // accumulated over 4x8 matrix\r
+        __float2_t b01, b23, b45, b67, a01, a23;\r
+        __x128_t reg128;\r
+\r
+        a01 = *ptrA++;\r
+        a23 = *ptrA++;\r
+\r
+        b01 = *ptrB++;\r
+        b23 = *ptrB++;\r
+        b45 = *ptrB++;\r
+        b67 = *ptrB++;\r
+\r
+        reg128 = _cmpysp(a01, b01);\r
+        // accumulate a[0]*b[1] and -a[0]*b[0]\r
+        sum0 = _daddsp(sum0, _lof2_128(reg128));\r
+        // accumulate a[1]*b[0] and a[1]*b[1]\r
+        sum1 = _daddsp(sum1, _hif2_128(reg128));\r
+\r
+        reg128 = _cmpysp(a01, b23);\r
+        // accumulate a[0]*b[3] and -a[0]*b[2]\r
+        sum2 = _daddsp(sum2, _lof2_128(reg128));\r
+        // accumulate a[1]*b[2] and a[1]*b[3]\r
+        sum3 = _daddsp(sum3, _hif2_128(reg128));\r
+\r
+        reg128 = _cmpysp(a01, b45);\r
+        // accumulate a[0]*b[5] and -a[0]*b[4]\r
+        sum4 = _daddsp(sum4, _lof2_128(reg128));\r
+        // accumulate a[1]*b[4] and a[1]*b[5]\r
+        sum5 = _daddsp(sum5, _hif2_128(reg128));\r
+\r
+        reg128 = _cmpysp(a01, b67);\r
+        // accumulate a[0]*b[7] and -a[0]*b[6]\r
+        sum6 = _daddsp(sum6, _lof2_128(reg128));\r
+        // accumulate a[1]*b[6] and a[1]*b[7]\r
+        sum7 = _daddsp(sum7, _hif2_128(reg128));\r
+\r
+        reg128 = _cmpysp(a23, b01);\r
+        // accumulate a[2]*b[1] and -a[2]*b[0]\r
+        sum8 = _daddsp(sum8, _lof2_128(reg128));\r
+        // accumulate a[3]*b[0] and a[3]*b[1]\r
+        sum9 = _daddsp(sum9, _hif2_128(reg128));\r
+\r
+        reg128 = _cmpysp(a23, b23);\r
+        // accumulate a[2]*b[3] and -a[2]*b[2]\r
+        suma = _daddsp(suma, _lof2_128(reg128));\r
+        // accumulate a[3]*b[2] and a[3]*b[3]\r
+        sumb = _daddsp(sumb, _hif2_128(reg128));\r
+\r
+        reg128 = _cmpysp(a23, b45);\r
+        // accumulate a[2]*b[5] and -a[2]*b[4]\r
+        sumc = _daddsp(sumc, _lof2_128(reg128));\r
+        // accumulate a[3]*b[4] and a[3]*b[5]\r
+        sumd = _daddsp(sumd, _hif2_128(reg128));\r
+\r
+        reg128 = _cmpysp(a23, b67);\r
+        // accumulate a[2]*b[7] and -a[2]*b[6]\r
+        sume = _daddsp(sume, _lof2_128(reg128));\r
+        // accumulate a[3]*b[6] and a[3]*b[7]\r
+        sumf = _daddsp(sumf, _hif2_128(reg128));\r
+    }\r
+\r
+    regA2 = _ftof2(*alpha, *alpha);\r
+    regB2 = _ftof2(*beta, *beta);\r
+\r
+    // scale the accumulated products by alpha (needed on both paths)\r
+    sum0 = _dmpysp(regA2, sum0);\r
+    sum1 = _dmpysp(regA2, sum1);\r
+    sum2 = _dmpysp(regA2, sum2);\r
+    sum3 = _dmpysp(regA2, sum3);\r
+    sum4 = _dmpysp(regA2, sum4);\r
+    sum5 = _dmpysp(regA2, sum5);\r
+    sum6 = _dmpysp(regA2, sum6);\r
+    sum7 = _dmpysp(regA2, sum7);\r
+    sum8 = _dmpysp(regA2, sum8);\r
+    sum9 = _dmpysp(regA2, sum9);\r
+    suma = _dmpysp(regA2, suma);\r
+    sumb = _dmpysp(regA2, sumb);\r
+    sumc = _dmpysp(regA2, sumc);\r
+    sumd = _dmpysp(regA2, sumd);\r
+    sume = _dmpysp(regA2, sume);\r
+    sumf = _dmpysp(regA2, sumf);\r
+\r
+    if (rs_c != 1)\r
+    {\r
+        // Row-by-row path: each row of the tile is four adjacent\r
+        // __float2_t values; consecutive rows are ldc2 values apart.\r
+        ldc2 = rs_c >> 1; // rs_c counts floats, ptrC counts float pairs\r
+\r
+        c0 = ptrC[0];            c1 = ptrC[1];            // c[0,0..1], c[0,2..3]\r
+        c2 = ptrC[2];            c3 = ptrC[3];            // c[0,4..5], c[0,6..7]\r
+        c4 = ptrC[ldc2];         c5 = ptrC[ldc2 + 1];     // c[1,0..1], c[1,2..3]\r
+        c6 = ptrC[ldc2 + 2];     c7 = ptrC[ldc2 + 3];     // c[1,4..5], c[1,6..7]\r
+        c8 = ptrC[2*ldc2];       c9 = ptrC[2*ldc2 + 1];   // c[2,0..1], c[2,2..3]\r
+        ca = ptrC[2*ldc2 + 2];   cb = ptrC[2*ldc2 + 3];   // c[2,4..5], c[2,6..7]\r
+        cc = ptrC[3*ldc2];       cd = ptrC[3*ldc2 + 1];   // c[3,0..1], c[3,2..3]\r
+        ce = ptrC[3*ldc2 + 2];   cf = ptrC[3*ldc2 + 3];   // c[3,4..5], c[3,6..7]\r
+\r
+        // scale the incoming C by beta\r
+        c0 = _dmpysp(c0, regB2);\r
+        c1 = _dmpysp(c1, regB2);\r
+        c2 = _dmpysp(c2, regB2);\r
+        c3 = _dmpysp(c3, regB2);\r
+        c4 = _dmpysp(c4, regB2);\r
+        c5 = _dmpysp(c5, regB2);\r
+        c6 = _dmpysp(c6, regB2);\r
+        c7 = _dmpysp(c7, regB2);\r
+        c8 = _dmpysp(c8, regB2);\r
+        c9 = _dmpysp(c9, regB2);\r
+        ca = _dmpysp(ca, regB2);\r
+        cb = _dmpysp(cb, regB2);\r
+        cc = _dmpysp(cc, regB2);\r
+        cd = _dmpysp(cd, regB2);\r
+        ce = _dmpysp(ce, regB2);\r
+        cf = _dmpysp(cf, regB2);\r
+\r
+        // Rows 0 and 2 come from the low accumulator halves, whose high\r
+        // float holds the negated product and must be sign-flipped.\r
+        // update c[0,0] and c[0,1]\r
+        c0 = _daddsp(_ftof2(_lof2(sum0),-_hif2(sum0)),c0);\r
+        // update c[0,2] and c[0,3]\r
+        c1 = _daddsp(_ftof2(_lof2(sum2),-_hif2(sum2)),c1);\r
+        // update c[0,4] and c[0,5]\r
+        c2 = _daddsp(_ftof2(_lof2(sum4),-_hif2(sum4)),c2);\r
+        // update c[0,6] and c[0,7]\r
+        c3 = _daddsp(_ftof2(_lof2(sum6),-_hif2(sum6)),c3);\r
+\r
+        // update c[1,0] and c[1,1]\r
+        c4 = _daddsp(_ftof2(_hif2(sum1),_lof2(sum1)),c4);\r
+        // update c[1,2] and c[1,3]\r
+        c5 = _daddsp(_ftof2(_hif2(sum3),_lof2(sum3)),c5);\r
+        // update c[1,4] and c[1,5]\r
+        c6 = _daddsp(_ftof2(_hif2(sum5),_lof2(sum5)),c6);\r
+        // update c[1,6] and c[1,7]\r
+        c7 = _daddsp(_ftof2(_hif2(sum7),_lof2(sum7)),c7);\r
+\r
+        // update c[2,0] and c[2,1]\r
+        c8 = _daddsp(_ftof2(_lof2(sum8),-_hif2(sum8)),c8);\r
+        // update c[2,2] and c[2,3]\r
+        c9 = _daddsp(_ftof2(_lof2(suma),-_hif2(suma)),c9);\r
+        // update c[2,4] and c[2,5]\r
+        ca = _daddsp(_ftof2(_lof2(sumc),-_hif2(sumc)),ca);\r
+        // update c[2,6] and c[2,7]\r
+        cb = _daddsp(_ftof2(_lof2(sume),-_hif2(sume)),cb);\r
+\r
+        // update c[3,0] and c[3,1]\r
+        cc = _daddsp(_ftof2(_hif2(sum9),_lof2(sum9)),cc);\r
+        // update c[3,2] and c[3,3]\r
+        cd = _daddsp(_ftof2(_hif2(sumb),_lof2(sumb)),cd);\r
+        // update c[3,4] and c[3,5]\r
+        ce = _daddsp(_ftof2(_hif2(sumd),_lof2(sumd)),ce);\r
+        // update c[3,6] and c[3,7]\r
+        cf = _daddsp(_ftof2(_hif2(sumf),_lof2(sumf)),cf);\r
+\r
+        // Write the tile back. This path previously computed the updated\r
+        // values but never stored them, so C was left unchanged.\r
+        ptrC[0]            = c0;   ptrC[1]            = c1;\r
+        ptrC[2]            = c2;   ptrC[3]            = c3;\r
+        ptrC[ldc2]         = c4;   ptrC[ldc2 + 1]     = c5;\r
+        ptrC[ldc2 + 2]     = c6;   ptrC[ldc2 + 3]     = c7;\r
+        ptrC[2*ldc2]       = c8;   ptrC[2*ldc2 + 1]   = c9;\r
+        ptrC[2*ldc2 + 2]   = ca;   ptrC[2*ldc2 + 3]   = cb;\r
+        ptrC[3*ldc2]       = cc;   ptrC[3*ldc2 + 1]   = cd;\r
+        ptrC[3*ldc2 + 2]   = ce;   ptrC[3*ldc2 + 3]   = cf;\r
+    }\r
+    else\r
+    {\r
+        // Column-by-column path: each column of the tile is two adjacent\r
+        // __float2_t values; consecutive columns are ldc2 values apart.\r
+        ldc2 = cs_c >> 1; // cs_c counts floats, ptrC counts float pairs\r
+\r
+        c0 = ptrC[0];        c1 = ptrC[1];            // c[0..1,0], c[2..3,0]\r
+        c2 = ptrC[ldc2];     c3 = ptrC[ldc2 + 1];     // c[0..1,1], c[2..3,1]\r
+        c4 = ptrC[2*ldc2];   c5 = ptrC[2*ldc2 + 1];   // c[0..1,2], c[2..3,2]\r
+        c6 = ptrC[3*ldc2];   c7 = ptrC[3*ldc2 + 1];   // c[0..1,3], c[2..3,3]\r
+        c8 = ptrC[4*ldc2];   c9 = ptrC[4*ldc2 + 1];   // c[0..1,4], c[2..3,4]\r
+        ca = ptrC[5*ldc2];   cb = ptrC[5*ldc2 + 1];   // c[0..1,5], c[2..3,5]\r
+        cc = ptrC[6*ldc2];   cd = ptrC[6*ldc2 + 1];   // c[0..1,6], c[2..3,6]\r
+        ce = ptrC[7*ldc2];   cf = ptrC[7*ldc2 + 1];   // c[0..1,7], c[2..3,7]\r
+\r
+        // scale the incoming C by beta\r
+        c0 = _dmpysp(c0, regB2);\r
+        c1 = _dmpysp(c1, regB2);\r
+        c2 = _dmpysp(c2, regB2);\r
+        c3 = _dmpysp(c3, regB2);\r
+        c4 = _dmpysp(c4, regB2);\r
+        c5 = _dmpysp(c5, regB2);\r
+        c6 = _dmpysp(c6, regB2);\r
+        c7 = _dmpysp(c7, regB2);\r
+        c8 = _dmpysp(c8, regB2);\r
+        c9 = _dmpysp(c9, regB2);\r
+        ca = _dmpysp(ca, regB2);\r
+        cb = _dmpysp(cb, regB2);\r
+        cc = _dmpysp(cc, regB2);\r
+        cd = _dmpysp(cd, regB2);\r
+        ce = _dmpysp(ce, regB2);\r
+        cf = _dmpysp(cf, regB2);\r
+\r
+        // update c[0,0] and c[1,0]\r
+        c0 = _daddsp(_ftof2(_lof2(sum1),-_hif2(sum0)),c0);\r
+        // update c[2,0] and c[3,0]\r
+        c1 = _daddsp(_ftof2(_lof2(sum9),-_hif2(sum8)),c1);\r
+        // update c[0,1] and c[1,1]\r
+        c2 = _daddsp(_ftof2(_hif2(sum1),_lof2(sum0)),c2);\r
+        // update c[2,1] and c[3,1]\r
+        c3 = _daddsp(_ftof2(_hif2(sum9),_lof2(sum8)),c3);\r
+        // update c[0,2] and c[1,2]\r
+        c4 = _daddsp(_ftof2(_lof2(sum3),-_hif2(sum2)),c4);\r
+        // update c[2,2] and c[3,2]\r
+        c5 = _daddsp(_ftof2(_lof2(sumb),-_hif2(suma)),c5);\r
+        // update c[0,3] and c[1,3]\r
+        c6 = _daddsp(_ftof2(_hif2(sum3),_lof2(sum2)),c6);\r
+        // update c[2,3] and c[3,3]\r
+        c7 = _daddsp(_ftof2(_hif2(sumb),_lof2(suma)),c7);\r
+        // update c[0,4] and c[1,4]\r
+        c8 = _daddsp(_ftof2(_lof2(sum5),-_hif2(sum4)),c8);\r
+        // update c[2,4] and c[3,4]\r
+        c9 = _daddsp(_ftof2(_lof2(sumd),-_hif2(sumc)),c9);\r
+        // update c[0,5] and c[1,5]\r
+        ca = _daddsp(_ftof2(_hif2(sum5),_lof2(sum4)),ca);\r
+        // update c[2,5] and c[3,5]\r
+        cb = _daddsp(_ftof2(_hif2(sumd),_lof2(sumc)),cb);\r
+        // update c[0,6] and c[1,6]\r
+        cc = _daddsp(_ftof2(_lof2(sum7),-_hif2(sum6)),cc);\r
+        // update c[2,6] and c[3,6]\r
+        cd = _daddsp(_ftof2(_lof2(sumf),-_hif2(sume)),cd);\r
+        // update c[0,7] and c[1,7]\r
+        ce = _daddsp(_ftof2(_hif2(sum7),_lof2(sum6)),ce);\r
+        // update c[2,7] and c[3,7]\r
+        cf = _daddsp(_ftof2(_hif2(sumf),_lof2(sume)),cf);\r
+\r
+        // write the tile back, column by column\r
+        ptrC[0]        = c0;   ptrC[1]            = c1;\r
+        ptrC[ldc2]     = c2;   ptrC[ldc2 + 1]     = c3;\r
+        ptrC[2*ldc2]   = c4;   ptrC[2*ldc2 + 1]   = c5;\r
+        ptrC[3*ldc2]   = c6;   ptrC[3*ldc2 + 1]   = c7;\r
+        ptrC[4*ldc2]   = c8;   ptrC[4*ldc2 + 1]   = c9;\r
+        ptrC[5*ldc2]   = ca;   ptrC[5*ldc2 + 1]   = cb;\r
+        ptrC[6*ldc2]   = cc;   ptrC[6*ldc2 + 1]   = cd;\r
+        ptrC[7*ldc2]   = ce;   ptrC[7*ldc2 + 1]   = cf;\r
+    }\r
+}\r
+\r
+void bli_sgemm_ukernel_4x4(\r
+ dim_t k,\r
+ float* restrict alpha,\r
+ float* restrict a,\r
+ float* restrict b,\r
+ float* restrict beta,\r
+ float* restrict c, inc_t rs_c, inc_t cs_c,\r
+ auxinfo_t* data\r
+)\r
+{\r
+ __float2_t sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7;\r
+ __float2_t sum8, sum9, suma, sumb, sumc, sumd, sume, sumf;\r
+ __float2_t * restrict ptrB = (__float2_t *) b;\r
+ __float2_t * restrict ptrA = (__float2_t *) a;\r
+ __float2_t * restrict ptrC;\r
+ __float2_t regA2, regB2;\r
+ int_least16_t index;\r
+ int kEven, kLeft;\r
+\r
+ // zero out accumulators\r
+ sum0 = 0.0;\r
+ sum1 = 0.0;\r
+ sum2 = 0.0;\r
+ sum3 = 0.0;\r
+ sum4 = 0.0;\r
+ sum5 = 0.0;\r
+ sum6 = 0.0;\r
+ sum7 = 0.0;\r
+ sum8 = 0.0;\r
+ sum9 = 0.0;\r
+ suma = 0.0;\r
+ sumb = 0.0;\r
+ sumc = 0.0;\r
+ sumd = 0.0;\r
+ sume = 0.0;\r
+ sumf = 0.0;\r
+\r
+ kEven=k>>1;\r
+ kLeft=k&1;\r
+\r
+ for (index = 0; index < kEven; index++)\r
+ { // loop over k;\r
+ // each iteration performs rank one update of 4x1 by 1x4\r
+ // matrices of A and B respectively; result is\r
+ // accumulated over 4x4 matrix\r
+ __float2_t b01, b23, a01, a23;\r
+ __x128_t reg128;\r
+\r
+ // for even k\r
+ a01 = *ptrA++;\r
+ a23 = *ptrA++;\r
+\r
+ b01 = *ptrB++;\r
+ b23 = *ptrB++;\r
+\r
+ reg128 = _cmpysp(a01, b01);\r
+ // accumulate a[0]*b[1] and -a[0]*b[0]\r
+ sum0 = _daddsp(sum0, _lof2_128(reg128));\r
+ // accumulate a[1]*b[0] and a[1]*b[1]\r
+ sum1 = _daddsp(sum1, _hif2_128(reg128));\r
+\r
+ reg128 = _cmpysp(a01, b23);\r
+ // accumulate a[0]*b[3] and -a[0]*b[2]\r
+ sum2 = _daddsp(sum2, _lof2_128(reg128));\r
+ // accumulate a[1]*b[2] and a[1]*b[3]\r
+ sum3 = _daddsp(sum3, _hif2_128(reg128));\r
+\r
+ reg128 = _cmpysp(a23, b01);\r
+ // accumulate a[2]*b[1] and -a[2]*b[0]\r
+ sum4 = _daddsp(sum4, _lof2_128(reg128));\r
+ // accumulate a[3]*b[0] and a[3]*b[1]\r
+ sum5 = _daddsp(sum5, _hif2_128(reg128));\r
+\r
+ reg128 = _cmpysp(a23, b23);\r
+ // accumulate a[2]*b[3] and -a[2]*b[2]\r
+ sum6 = _daddsp(sum6, _lof2_128(reg128));\r
+ // accumulate a[3]*b[2] and a[3]*b[3]\r
+ sum7 = _daddsp(sum7, _hif2_128(reg128));\r
+\r
+\r
+\r
+ // for odd k\r
+ a01 = *ptrA++;\r
+ a23 = *ptrA++;\r
+\r
+ b01 = *ptrB++;\r
+ b23 = *ptrB++;\r
+\r
+ reg128 = _cmpysp(a01, b01);\r
+ // accumulate a[0]*b[1] and -a[0]*b[0]\r
+ sum8 = _daddsp(sum8, _lof2_128(reg128));\r
+ // accumulate a[1]*b[0] and a[1]*b[1]\r
+ sum9 = _daddsp(sum9, _hif2_128(reg128));\r
+\r
+ reg128 = _cmpysp(a01, b23);\r
+ // accumulate a[0]*b[3] and -a[0]*b[2]\r
+ suma = _daddsp(suma, _lof2_128(reg128));\r
+ // accumulate a[1]*b[2] and a[1]*b[3]\r
+ sumb = _daddsp(sumb, _hif2_128(reg128));\r
+\r
+ reg128 = _cmpysp(a23, b01);\r
+ // accumulate a[2]*b[1] and -a[2]*b[0]\r
+ sumc = _daddsp(sumc, _lof2_128(reg128));\r
+ // accumulate a[3]*b[0] and a[3]*b[1]\r
+ sumd = _daddsp(sumd, _hif2_128(reg128));\r
+\r
+ reg128 = _cmpysp(a23, b23);\r
+ // accumulate a[2]*b[3] and -a[2]*b[2]\r
+ sume = _daddsp(sume, _lof2_128(reg128));\r
+ // accumulate a[3]*b[2] and a[3]*b[3]\r
+ sumf = _daddsp(sumf, _hif2_128(reg128));\r
+\r
+ }\r
+ if(kLeft)\r
+ { // last k if left;\r
+ __float2_t b01, b23, a01, a23;\r
+ __x128_t reg128;\r
+\r
+ a01 = *ptrA++;\r
+ a23 = *ptrA++;\r
+\r
+ b01 = *ptrB++;\r
+ b23 = *ptrB++;\r
+\r
+ reg128 = _cmpysp(a01, b01);\r
+ // accumulate a[0]*b[1] and -a[0]*b[0]\r
+ sum0 = _daddsp(sum0, _lof2_128(reg128));\r
+ // accumulate a[1]*b[0] and a[1]*b[1]\r
+ sum1 = _daddsp(sum1, _hif2_128(reg128));\r
+\r
+ reg128 = _cmpysp(a01, b23);\r
+ // accumulate a[0]*b[3] and -a[0]*b[2]\r
+ sum2 = _daddsp(sum2, _lof2_128(reg128));\r
+ // accumulate a[1]*b[2] and a[1]*b[3]\r
+ sum3 = _daddsp(sum3, _hif2_128(reg128));\r
+\r
+ reg128 = _cmpysp(a23, b01);\r
+ // accumulate a[2]*b[1] and -a[2]*b[0]\r
+ sum4 = _daddsp(sum4, _lof2_128(reg128));\r
+ // accumulate a[3]*b[0] and a[3]*b[1]\r
+ sum5 = _daddsp(sum5, _hif2_128(reg128));\r
+\r
+ reg128 = _cmpysp(a23, b23);\r
+ // accumulate a[2]*b[3] and -a[2]*b[2]\r
+ sum6 = _daddsp(sum6, _lof2_128(reg128));\r
+ // accumulate a[3]*b[2] and a[3]*b[3]\r
+ sum7 = _daddsp(sum7, _hif2_128(reg128));\r
+\r
+ }\r
+\r
+ sum0 = _daddsp(sum0, sum8);\r
+ sum1 = _daddsp(sum1, sum9);\r
+ sum2 = _daddsp(sum2, suma);\r
+ sum3 = _daddsp(sum3, sumb);\r
+ sum4 = _daddsp(sum4, sumc);\r
+ sum5 = _daddsp(sum5, sumd);\r
+ sum6 = _daddsp(sum6, sume);\r
+ sum7 = _daddsp(sum7, sumf);\r
+\r
+\r
+ regA2 = _ftof2(*alpha, *alpha);\r
+ regB2 = _ftof2(*beta, *beta);\r
+ if (rs_c != 1)\r
+ {\r
+ __float2_t c0, c1, c2, c3, c4, c5, c6, c7;\r
+\r
+ ptrC = (__float2_t *) c;\r
+ c0 = *ptrC++; //c[0,0] and c[0,1]\r
+ c1 = *ptrC--; //c[0,2] and c[0,3]\r
+ ptrC += (rs_c>>1);\r
+ c2 = *ptrC++; //c[1,0] and c[1,1]\r
+ c3 = *ptrC--; //c[1,2] and c[1,3]\r
+ ptrC += (rs_c>>1);\r
+ c4 = *ptrC++; //c[2,0] and c[2,1]\r
+ c5 = *ptrC--; //c[2,2] and c[2,3]\r
+ ptrC += (rs_c>>1);\r
+ c6 = *ptrC++; //c[3,0] and c[3,1]\r
+ c7 = *ptrC--; //c[3,2] and c[3,3]\r
+\r
+ sum0 = _dmpysp(regA2, sum0);\r
+ sum1 = _dmpysp(regA2, sum1);\r
+ sum2 = _dmpysp(regA2, sum2);\r
+ sum3 = _dmpysp(regA2, sum3);\r
+ sum4 = _dmpysp(regA2, sum4);\r
+ sum5 = _dmpysp(regA2, sum5);\r
+ sum6 = _dmpysp(regA2, sum6);\r
+ sum7 = _dmpysp(regA2, sum7);\r
+\r
+ c0 = _dmpysp(c0, regB2);\r
+ c1 = _dmpysp(c1, regB2);\r
+ c2 = _dmpysp(c2, regB2);\r
+ c3 = _dmpysp(c3, regB2);\r
+ c4 = _dmpysp(c4, regB2);\r
+ c5 = _dmpysp(c5, regB2);\r
+ c6 = _dmpysp(c6, regB2);\r
+ c7 = _dmpysp(c7, regB2);\r
+\r
+ // update c[0,0] and c[0,1]\r
+ c0 = _daddsp(_ftof2(_lof2(sum0),-_hif2(sum0)),c0);\r
+ // update c[0,2] and c[0,3]\r
+ c1 = _daddsp(_ftof2(_lof2(sum2),-_hif2(sum2)),c1);\r
+ // update c[1,0] and c[1,1]\r
+ c2 = _daddsp(_ftof2(_hif2(sum1), _lof2(sum1)),c2);\r
+ // update c[1,2] and c[1,2]\r
+ c3 = _daddsp(_ftof2(_hif2(sum3),_lof2(sum3)),c3);\r
+ // update c[2,0] and c[2,1]\r
+ c4 = _daddsp(_ftof2(_lof2(sum4),-_hif2(sum4)),c4);\r
+ // update c[2,2] and c[2,3]\r
+ c5 = _daddsp(_ftof2(_lof2(sum6),-_hif2(sum6)),c5);\r
+ // update c[3,0] and c[3,1]\r
+ c6 = _daddsp(_ftof2(_hif2(sum5), _lof2(sum5)),c6);\r
+ // update c[3,2] and c[3,2]\r
+ c7 = _daddsp(_ftof2(_hif2(sum7),_lof2(sum7)),c7);\r
+\r
+ ptrC = (__float2_t *) c;\r
+ *ptrC++ = c0; //c[0,0] and c[0,1]\r
+ *ptrC-- = c1; //c[0,2] and c[0,3]\r
+ ptrC += (rs_c>>1);\r
+ *ptrC++ = c2; //c[1,0] and c[1,1]\r
+ *ptrC-- = c3; //c[1,2] and c[1,3]\r
+ ptrC += (rs_c>>1);\r
+ *ptrC++ = c4; //c[2,0] and c[2,1]\r
+ *ptrC-- = c5; //c[2,2] and c[2,3]\r
+ ptrC += (rs_c>>1);\r
+ *ptrC++ = c6; //c[3,0] and c[3,1]\r
+ *ptrC-- = c7; //c[3,2] and c[3,3]\r
+\r
+// // update c[0,0] and c[1,0]\r
+// c0 = (c + 0*rs_c + 0*cs_c);\r
+// c1 = (c + 1*rs_c + 0*cs_c);\r
+// regC = _ftof2(*c1,*c0);\r
+// regC = _dmpysp(regC, regB2);\r
+// regS = _ftof2(_lof2(sum1),-_hif2(sum0));\r
+// regR = _daddsp(_dmpysp(regA2, regS), regC);\r
+// *c0 = _lof2(regR);\r
+// *c1 = _hif2(regR);\r
+//\r
+// //update c[0,1] and c[1,1]\r
+// c0 = c0 + 1*cs_c;\r
+// c1 = c1 + 1*cs_c;\r
+// regC = _ftof2(*c1,*c0);\r
+// regC = _dmpysp(regC, regB2);\r
+// regS = _ftof2(_hif2(sum1),_lof2(sum0));\r
+// regR = _daddsp(_dmpysp(regA2, regS), regC);\r
+// *c0 = _lof2(regR);\r
+// *c1 = _hif2(regR);\r
+//\r
+// //update c[0,2] and c[1,2]\r
+// c0 = c0 + 1*cs_c;\r
+// c1 = c1 + 1*cs_c;\r
+// regC = _ftof2(*c1,*c0);\r
+// regC = _dmpysp(regC, regB2);\r
+// regS = _ftof2(_lof2(sum3),-_hif2(sum2));\r
+// regR = _daddsp(_dmpysp(regA2, regS), regC);\r
+// *c0 = _lof2(regR);\r
+// *c1 = _hif2(regR);\r
+//\r
+// //update c[0,3] and c[1,3]\r
+// c0 = c0 + 1*cs_c;\r
+// c1 = c1 + 1*cs_c;\r
+// regC = _ftof2(*c1,*c0);\r
+// regC = _dmpysp(regC, regB2);\r
+// regS = _ftof2(_hif2(sum3),_lof2(sum2));\r
+// regR = _daddsp(_dmpysp(regA2, regS), regC);\r
+// *c0 = _lof2(regR);\r
+// *c1 = _hif2(regR);\r
+//\r
+// // update c[2,0] and c[3,0]\r
+// c0 = (c + 2*rs_c + 0*cs_c);\r
+// c1 = (c + 3*rs_c + 0*cs_c);\r
+// regC = _ftof2(*c1,*c0);\r
+// regC = _dmpysp(regC, regB2);\r
+// regS = _ftof2(_lof2(sum5),-_hif2(sum4));\r
+// regR = _daddsp(_dmpysp(regA2, regS), regC);\r
+// *c0 = _lof2(regR);\r
+// *c1 = _hif2(regR);\r
+//\r
+// // update c[2,1] and c[3,1]\r
+// c0 = c0 + 1*cs_c;\r
+// c1 = c1 + 1*cs_c;\r
+// regC = _ftof2(*c1,*c0);\r
+// regC = _dmpysp(regC, regB2);\r
+// regS = _ftof2(_hif2(sum5),_lof2(sum4));\r
+// regR = _daddsp(_dmpysp(regA2, regS), regC);\r
+// *c0 = _lof2(regR);\r
+// *c1 = _hif2(regR);\r
+//\r
+// //update c[2,2] and c[3,2]\r
+// c0 = c0 + 1*cs_c;\r
+// c1 = c1 + 1*cs_c;\r
+// regC = _ftof2(*c1,*c0);\r
+// regC = _dmpysp(regC, regB2);\r
+// regS = _ftof2(_lof2(sum7),-_hif2(sum6));\r
+// regR = _daddsp(_dmpysp(regA2, regS), regC);\r
+// *c0 = _lof2(regR);\r
+// *c1 = _hif2(regR);\r
+//\r
+// //update c[2,3] and c[3,3]\r
+// c0 = c0 + 1*cs_c;\r
+// c1 = c1 + 1*cs_c;\r
+// regC = _ftof2(*c1,*c0);\r
+// regC = _dmpysp(regC, regB2);\r
+// regS = _ftof2(_hif2(sum7),_lof2(sum6));\r
+// regR = _daddsp(_dmpysp(regA2, regS), regC);\r
+// *c0 = _lof2(regR);\r
+// *c1 = _hif2(regR);\r
+\r
+ }\r
+ else\r
+ {\r
+ __float2_t c0, c1, c2, c3, c4, c5, c6, c7;\r
+\r
+ ptrC = (__float2_t *) c;\r
+ c0 = *ptrC++;\r
+ c1 = *ptrC--;\r
+ ptrC += (cs_c>>1);\r
+ c2 = *ptrC++;\r
+ c3 = *ptrC--;\r
+ ptrC += (cs_c>>1);\r
+ c4 = *ptrC++;\r
+ c5 = *ptrC--;\r
+ ptrC += (cs_c>>1);\r
+ c6 = *ptrC++;\r
+ c7 = *ptrC--;\r
+\r
+ sum0 = _dmpysp(regA2, sum0);\r
+ sum1 = _dmpysp(regA2, sum1);\r
+ sum2 = _dmpysp(regA2, sum2);\r
+ sum3 = _dmpysp(regA2, sum3);\r
+ sum4 = _dmpysp(regA2, sum4);\r
+ sum5 = _dmpysp(regA2, sum5);\r
+ sum6 = _dmpysp(regA2, sum6);\r
+ sum7 = _dmpysp(regA2, sum7);\r
+\r
+ c0 = _dmpysp(c0, regB2);\r
+ c1 = _dmpysp(c1, regB2);\r
+ c2 = _dmpysp(c2, regB2);\r
+ c3 = _dmpysp(c3, regB2);\r
+ c4 = _dmpysp(c4, regB2);\r
+ c5 = _dmpysp(c5, regB2);\r
+ c6 = _dmpysp(c6, regB2);\r
+ c7 = _dmpysp(c7, regB2);\r
+\r
+ // update c[0,0] and c[1,0]\r
+ c0 = _daddsp(_ftof2(_lof2(sum1),-_hif2(sum0)),c0);\r
+ // update c[2,0] and c[3,0]\r
+ c1 = _daddsp(_ftof2(_lof2(sum5),-_hif2(sum4)),c1);\r
+ //update c[0,1] and c[1,1]\r
+ c2 = _daddsp(_ftof2(_hif2(sum1),_lof2(sum0)),c2);\r
+ // update c[2,1] and c[3,1]\r
+ c3 = _daddsp(_ftof2(_hif2(sum5),_lof2(sum4)),c3);\r
+ //update c[0,2] and c[1,2]\r
+ c4 = _daddsp(_ftof2(_lof2(sum3),-_hif2(sum2)),c4);\r
+ //update c[2,2] and c[3,2]\r
+ c5 = _daddsp(_ftof2(_lof2(sum7),-_hif2(sum6)),c5);\r
+ //update c[0,3] and c[1,3]\r
+ c6 = _daddsp(_ftof2(_hif2(sum3),_lof2(sum2)),c6);\r
+ //update c[2,3] and c[3,3]\r
+ c7 = _daddsp(_ftof2(_hif2(sum7),_lof2(sum6)),c7);\r
+\r
+ ptrC = (__float2_t *) c;\r
+ *ptrC++ = c0;\r
+ *ptrC-- = c1;\r
+ ptrC += (cs_c>>1);\r
+ *ptrC++ = c2;\r
+ *ptrC-- = c3;\r
+ ptrC += (cs_c>>1);\r
+ *ptrC++ = c4;\r
+ *ptrC-- = c5;\r
+ ptrC += (cs_c>>1);\r
+ *ptrC++ = c6;\r
+ *ptrC-- = c7;\r
+\r
+ }\r
+}\r
+\r
+\r
+//void dgemmKernel(const double *pA, const double *pB, double *pC, const double a, const int k, const int stepC)\r
+void bli_dgemm_ukernel_4x4(\r
+ dim_t k,\r
+ double* restrict alpha,\r
+ double* restrict a,\r
+ double* restrict b,\r
+ double* restrict beta,\r
+ double* restrict c, inc_t rs_c, inc_t cs_c,\r
+ auxinfo_t* data\r
+)\r
+{\r
+ double sum00, sum01, sum02, sum03;\r
+ double sum10, sum11, sum12, sum13;\r
+ double sum20, sum21, sum22, sum23;\r
+ double sum30, sum31, sum32, sum33;\r
+ int index;\r
+ double al = *alpha;\r
+ double be = *beta;\r
+\r
+ //touch routine: both a & b\r
+ //Length of b = NR*K*size of double;\r
+ //Length of a = MR*K*size of double;\r
+#ifdef BLIS_ENABLE_PREFETCH\r
+ //touch(b, k*BLIS_DEFAULT_NR_D*8);\r
+ //touch(a, k*BLIS_DEFAULT_MR_D*8);\r
+#endif\r
+\r
+ sum00 = 0.0;\r
+ sum01 = 0.0;\r
+ sum02 = 0.0;\r
+ sum03 = 0.0;\r
+ sum10 = 0.0;\r
+ sum11 = 0.0;\r
+ sum12 = 0.0;\r
+ sum13 = 0.0;\r
+ sum20 = 0.0;\r
+ sum21 = 0.0;\r
+ sum22 = 0.0;\r
+ sum23 = 0.0;\r
+ sum30 = 0.0;\r
+ sum31 = 0.0;\r
+ sum32 = 0.0;\r
+ sum33 = 0.0;\r
+\r
+ for(index = 0; index < k; index++)\r
+ { // loop over k;\r
+ // each iteration performs rank one update of 4x1 by 1x4\r
+ // matrices of A and B respectively; result is\r
+ // accumulated over 4x4 matrix\r
+ register double a0, a1, a2, a3;\r
+ register double b0, b1, b2, b3;\r
+\r
+ a0 = *a++;\r
+ a1 = *a++;\r
+ a2 = *a++;\r
+ a3 = *a++;\r
+ b0 = *b++;\r
+ b1 = *b++;\r
+ b2 = *b++;\r
+ b3 = *b++;\r
+\r
+ // a[0]*b[0]\r
+ sum00 += a0*b0;\r
+ // a[0]*b[1]\r
+ sum01 += a0*b1;\r
+ // a[0]*b[2]\r
+ sum02 += a0*b2;\r
+ // a[0]*b[3]\r
+ sum03 += a0*b3;\r
+ // a[1]*b[0]\r
+ sum10 += a1*b0;\r
+ // a[1]*b[1]\r
+ sum11 += a1*b1;\r
+ // a[1]*b[2]\r
+ sum12 += a1*b2;\r
+ // a[1]*b[3]\r
+ sum13 += a1*b3;\r
+ // a[2]*b[0]\r
+ sum20 += a2*b0;\r
+ // a[2]*b[1]\r
+ sum21 += a2*b1;\r
+ // a[2]*b[2]\r
+ sum22 += a2*b2;\r
+ // a[2]*b[3]\r
+ sum23 += a2*b3;\r
+ // a[3]*b[0]\r
+ sum30 += a3*b0;\r
+ // a[3]*b[1]\r
+ sum31 += a3*b1;\r
+ // a[3]*b[2]\r
+ sum32 += a3*b2;\r
+ // a[3]*b[3]\r
+ sum33 += a3*b3;\r
+ }\r
+\r
+ double* restrict cptr;\r
+ // 0th Column\r
+ // updating C[00]\r
+ cptr = c;\r
+ *cptr = *cptr * be;\r
+ *cptr += sum00 * al;\r
+\r
+ // updating C[10]\r
+ cptr += rs_c;\r
+ *cptr = *cptr*be;\r
+ *cptr += sum10 * al;\r
+\r
+ // updating C[20]\r
+ cptr += rs_c;\r
+ *cptr = *cptr*be;\r
+ *cptr += sum20 * al;\r
+\r
+ // updating C[30]\r
+ cptr += rs_c;\r
+ *cptr = *cptr*be;\r
+ *cptr += sum30 * al;\r
+\r
+ // 1st column\r
+ // updating C[01]\r
+ cptr = c + cs_c;\r
+ *cptr = *cptr*be;\r
+ *cptr += sum01 * al;\r
+\r
+ // updating C[11]\r
+ cptr += rs_c;\r
+ *cptr = *cptr*be;\r
+ *cptr += sum11 * al;\r
+\r
+ // updating C[21]\r
+ cptr += rs_c;\r
+ *cptr = *cptr*be;\r
+ *cptr += sum21 * al;\r
+\r
+ // updating C[31]\r
+ cptr += rs_c;\r
+ *cptr = *cptr*be;\r
+ *cptr += sum31 * al;\r
+\r
+ // 2nd Column\r
+ // updating C[02]\r
+ cptr = c + 2*cs_c;\r
+ *cptr = *cptr*be;\r
+ *cptr += sum02 * al;\r
+\r
+ // updating C[12]\r
+ cptr += rs_c;\r
+ *cptr = *cptr*be;\r
+ *cptr += sum12 * al;\r
+\r
+ // updating C[22]\r
+ cptr += rs_c;\r
+ *cptr = *cptr*be;\r
+ *cptr += sum22 * al;\r
+\r
+ // updating C[32]\r
+ cptr += rs_c;\r
+ *cptr = *cptr*be;\r
+ *cptr += sum32 * al;\r
+\r
+ // 3rd Column\r
+ // updating C[03]\r
+ cptr = c + 3*cs_c;\r
+ *cptr = *cptr*be;\r
+ *cptr += sum03 * al;\r
+\r
+ // updating C[13]\r
+ cptr += rs_c;\r
+ *cptr = *cptr*be;\r
+ *cptr += sum13 * al;\r
+\r
+ // updating C[23]\r
+ cptr += rs_c;\r
+ *cptr = *cptr*be;\r
+ *cptr += sum23 * al;\r
+\r
+ // updating C[33]\r
+ cptr += rs_c;\r
+ *cptr = *cptr*be;\r
+ *cptr += sum33 * al;\r
+\r
+ return;\r
+}\r
+\r
+void bli_cgemm_ukernel_2x4(\r
+ dim_t k,\r
+ scomplex* restrict alpha,\r
+ scomplex* restrict a,\r
+ scomplex* restrict b,\r
+ scomplex* restrict beta,\r
+ scomplex* restrict c, inc_t rs_c, inc_t cs_c,\r
+ auxinfo_t* data\r
+)\r
+{\r
+ __float2_t sum00a, sum10a, sum00b, sum10b;\r
+ __float2_t sum01a, sum11a, sum01b, sum11b;\r
+ __float2_t sum02a, sum12a, sum02b, sum12b;\r
+ __float2_t sum03a, sum13a, sum03b, sum13b;\r
+ __float2_t * restrict ptrB = (__float2_t *) b;\r
+ __float2_t * restrict ptrA = (__float2_t *) a;\r
+ __float2_t * restrict ptrC;\r
+ __float2_t regA, regB, regC;\r
+ int_least16_t index;\r
+\r
+ // zero out accumulators\r
+ sum00a = 0.0;\r
+ sum10a = 0.0;\r
+ sum01a = 0.0;\r
+ sum11a = 0.0;\r
+ sum02a = 0.0;\r
+ sum12a = 0.0;\r
+ sum03a = 0.0;\r
+ sum13a = 0.0;\r
+ sum00b = 0.0;\r
+ sum10b = 0.0;\r
+ sum01b = 0.0;\r
+ sum11b = 0.0;\r
+ sum02b = 0.0;\r
+ sum12b = 0.0;\r
+ sum03b = 0.0;\r
+ sum13b = 0.0;\r
+\r
+ for (index = 0; index < k; index++)\r
+ { // loop over k;\r
+ // each iteration performs rank one update of 2x1 by 1x4\r
+ // matrices of A and B respectively; result is\r
+ // accumulated over 2x4 matrix\r
+ __float2_t b0, b1, b2, b3, a0, a1;\r
+ __x128_t reg128;\r
+\r
+ a0 = *ptrA++;\r
+ a1 = *ptrA++;\r
+\r
+ b0 = *ptrB++;\r
+ b1 = *ptrB++;\r
+ b2 = *ptrB++;\r
+ b3 = *ptrB++;\r
+\r
+ // the four partial sums are accumulated independently\r
+ // a[0]*b[0]\r
+ reg128 = _cmpysp(a0, b0);\r
+ sum00a = _daddsp(sum00a, _lof2_128(reg128));\r
+ sum00b = _daddsp(sum00b, _hif2_128(reg128));\r
+\r
+ // a[1]*b[0]\r
+ reg128 = _cmpysp(a1, b0);\r
+ sum10a = _daddsp(sum10a, _lof2_128(reg128));\r
+ sum10b = _daddsp(sum10b, _hif2_128(reg128));\r
+\r
+ // a[0]*b[1]\r
+ reg128 = _cmpysp(a0, b1);\r
+ sum01a = _daddsp(sum01a, _lof2_128(reg128));\r
+ sum01b = _daddsp(sum01b, _hif2_128(reg128));\r
+\r
+ // a[1]*b[1]\r
+ reg128 = _cmpysp(a1, b1);\r
+ sum11a = _daddsp(sum11a, _lof2_128(reg128));\r
+ sum11b = _daddsp(sum11b, _hif2_128(reg128));\r
+\r
+ // a[0]*b[2]\r
+ reg128 = _cmpysp(a0, b2);\r
+ sum02a = _daddsp(sum02a, _lof2_128(reg128));\r
+ sum02b = _daddsp(sum02b, _hif2_128(reg128));\r
+\r
+ // a[1]*b[2]\r
+ reg128 = _cmpysp(a1, b2);\r
+ sum12a = _daddsp(sum12a, _lof2_128(reg128));\r
+ sum12b = _daddsp(sum12b, _hif2_128(reg128));\r
+\r
+ // a[0]*b[3]\r
+ reg128 = _cmpysp(a0, b3);\r
+ sum03a = _daddsp(sum03a, _lof2_128(reg128));\r
+ sum03b = _daddsp(sum03b, _hif2_128(reg128));\r
+\r
+ // a[1]*b[3]\r
+ reg128 = _cmpysp(a1, b3);\r
+ sum13a = _daddsp(sum13a, _lof2_128(reg128));\r
+ sum13b = _daddsp(sum13b, _hif2_128(reg128));\r
+ }\r
+\r
+ {\r
+ __x128_t reg128;\r
+ ptrA = (__float2_t *) alpha;\r
+ ptrB = (__float2_t *) beta;\r
+ regA = *ptrA;\r
+ regB = *ptrB;\r
+\r
+ // the value of a and the final values need to be\r
+ // rearranged due to the specific way cmpysp assumes\r
+ // data arrangement\r
+ regA =_ftof2(-_lof(regA), _hif(regA));\r
+ //regB = _ftof2(_lof(regB),_hif(regB));\r
+ ptrC = (__float2_t *) c;\r
+\r
+ // update and save c[0,0]\r
+ sum00a = _daddsp(sum00a, sum00b);\r
+ reg128 = _cmpysp(regA, sum00a);\r
+ sum00a = _daddsp(_hif2_128(reg128),_lof2_128(reg128));\r
+ regC = *ptrC;\r
+ //regC = _ftof2(_lof(regC), _hif(regC));\r
+ reg128 = _cmpysp(regC,regB);\r
+ regC = _daddsp(_lof2_128(reg128),_hif2_128(reg128));\r
+ *ptrC = _daddsp(_ftof2(-_lof(sum00a),_hif(sum00a)),_ftof2(_lof(regC),-_hif(regC)));\r
+\r
+ ptrC = (__float2_t *) c + rs_c;\r
+\r
+ // update and save c[1,0]\r
+ sum10a = _daddsp(sum10a, sum10b);\r
+ reg128 = _cmpysp(regA, sum10a);\r
+ sum10a = _daddsp(_hif2_128(reg128),_lof2_128(reg128));\r
+ regC = *ptrC;\r
+ //regC = _ftof2(_lof(regC), _hif(regC));\r
+ reg128 = _cmpysp(regC,regB);\r
+ regC = _daddsp(_lof2_128(reg128),_hif2_128(reg128));\r
+ *ptrC = _daddsp(_ftof2(-_lof(sum10a),_hif(sum10a)),_ftof2(_lof(regC),-_hif(regC)));\r
+\r
+\r
+ ptrC = (__float2_t *) c + cs_c;\r
+\r
+ // update and save c[0,1]\r
+ sum01a = _daddsp(sum01a, sum01b);\r
+ reg128 = _cmpysp(regA, sum01a);\r
+ sum01a = _daddsp(_hif2_128(reg128),_lof2_128(reg128));\r
+ regC = *ptrC;\r
+ //regC = _ftof2(_lof(regC), _hif(regC));\r
+ reg128 = _cmpysp(regC,regB);\r
+ regC = _daddsp(_lof2_128(reg128),_hif2_128(reg128));\r
+ *ptrC = _daddsp(_ftof2(-_lof(sum01a),_hif(sum01a)),_ftof2(_lof(regC),-_hif(regC)));\r
+\r
+ ptrC = (__float2_t *) c + rs_c + cs_c;\r
+\r
+ // update and save c[1,1]\r
+ sum11a = _daddsp(sum11a, sum11b);\r
+ reg128 = _cmpysp(regA, sum11a);\r
+ sum11a = _daddsp(_hif2_128(reg128),_lof2_128(reg128));\r
+ regC = *ptrC;\r
+ //regC = _ftof2(_lof(regC), _hif(regC));\r
+ reg128 = _cmpysp(regC,regB);\r
+ regC = _daddsp(_lof2_128(reg128),_hif2_128(reg128));\r
+ *ptrC = _daddsp(_ftof2(-_lof(sum11a),_hif(sum11a)),_ftof2(_lof(regC),-_hif(regC)));\r
+\r
+ ptrC = (__float2_t *) c + 2 * cs_c;\r
+\r
+ // update and save c[0,2]\r
+ sum02a = _daddsp(sum02a, sum02b);\r
+ reg128 = _cmpysp(regA, sum02a);\r
+ sum02a = _daddsp(_hif2_128(reg128),_lof2_128(reg128));\r
+ regC = *ptrC;\r
+ //regC = _ftof2(_lof(regC), _hif(regC));\r
+ reg128 = _cmpysp(regC,regB);\r
+ regC = _daddsp(_lof2_128(reg128),_hif2_128(reg128));\r
+ *ptrC = _daddsp(_ftof2(-_lof(sum02a),_hif(sum02a)),_ftof2(_lof(regC),-_hif(regC)));\r
+\r
+ ptrC = (__float2_t *) c + rs_c + 2* cs_c;\r
+\r
+ // update and save c[1,2]\r
+ sum12a = _daddsp(sum12a, sum12b);\r
+ reg128 = _cmpysp(regA, sum12a);\r
+ sum12a = _daddsp(_hif2_128(reg128),_lof2_128(reg128));\r
+ regC = *ptrC;\r
+ //regC = _ftof2(_lof(regC), _hif(regC));\r
+ reg128 = _cmpysp(regC,regB);\r
+ regC = _daddsp(_lof2_128(reg128),_hif2_128(reg128));\r
+ *ptrC = _daddsp(_ftof2(-_lof(sum12a),_hif(sum12a)),_ftof2(_lof(regC),-_hif(regC)));\r
+\r
+ ptrC = (__float2_t *) c + 3 * cs_c;\r
+\r
+ // update and save c[0,3]\r
+ sum03a = _daddsp(sum03a, sum03b);\r
+ reg128 = _cmpysp(regA, sum03a);\r
+ sum03a = _daddsp(_hif2_128(reg128),_lof2_128(reg128));\r
+ regC = *ptrC;\r
+ //regC = _ftof2(_lof(regC), _hif(regC));\r
+ reg128 = _cmpysp(regC,regB);\r
+ regC = _daddsp(_lof2_128(reg128),_hif2_128(reg128));\r
+ *ptrC = _daddsp(_ftof2(-_lof(sum03a),_hif(sum03a)),_ftof2(_lof(regC),-_hif(regC)));\r
+\r
+ ptrC = (__float2_t *) c + rs_c + 3 * cs_c;\r
+\r
+ // update and save c[1,3]\r
+ sum13a = _daddsp(sum13a, sum13b);\r
+ reg128 = _cmpysp(regA, sum13a);\r
+ sum13a = _daddsp(_hif2_128(reg128),_lof2_128(reg128));\r
+ regC = *ptrC;\r
+ //regC = _ftof2(_lof(regC), _hif(regC));\r
+ reg128 = _cmpysp(regC,regB);\r
+ regC = _daddsp(_lof2_128(reg128),_hif2_128(reg128));\r
+ *ptrC = _daddsp(_ftof2(-_lof(sum13a),_hif(sum13a)),_ftof2(_lof(regC),-_hif(regC)));\r
+ }\r
+ return;\r
+}\r
+\r
+/*\r
+ * Double-precision complex GEMM micro-kernel written in plain C.\r
+ *\r
+ * Accumulates sum = SUM_{p<k} a[p] * b[p] (complex products, one element of\r
+ * a and one of b per iteration) and then updates a single element of C:\r
+ *\r
+ *     c[0] := (*beta) * c[0] + (*alpha) * sum\r
+ *\r
+ *   k          number of products to accumulate\r
+ *   alpha      scalar applied to the accumulated sum\r
+ *   a, b       read sequentially as (real, imag) pairs of doubles\r
+ *   beta       scalar applied to the old c[0]; if exactly zero, c[0] is\r
+ *              overwritten without being read\r
+ *   c          address of the element that is updated\r
+ *   rs_c, cs_c not used by this kernel\r
+ *   data       not used by this kernel\r
+ *\r
+ * NOTE(review): despite the "2x2" in the name, only one element of C is\r
+ * computed and stored here; presumably the register block configured for\r
+ * this datatype is 1x1 - confirm against the kernel configuration.\r
+ */\r
+void bli_zgemm_ukernel_2x2(\r
+    dim_t               k,\r
+    dcomplex*  restrict alpha,\r
+    dcomplex*  restrict a,\r
+    dcomplex*  restrict b,\r
+    dcomplex*  restrict beta,\r
+    dcomplex*  restrict c, inc_t rs_c, inc_t cs_c,\r
+    auxinfo_t*          data\r
+)\r
+{\r
+    double * restrict ptrA = (double *) a;\r
+    double * restrict ptrB = (double *) b;\r
+    //double * restrict ptrC = (double *) c;\r
+    double sum00r, sum00i;\r
+    int index;\r
+    // k rounded down to an even count. Only bit 0 is cleared; the previous\r
+    // mask (0xFFFE) also discarded every bit above bit 15.\r
+    int kEven = (int)(k & ~((dim_t)1));\r
+\r
+    sum00r = 0.0;\r
+    sum00i = 0.0;\r
+\r
+    if(k>4) // The loop is safe for k > 4\r
+    {\r
+#pragma UNROLL(2)\r
+        for(index = 0; index<kEven; index++)\r
+        {   // loop over k;\r
+            // each iteration performs rank one update of 1x1 by 1x1\r
+            // matrices of A and B respectively; result is\r
+            // accumulated over 1x1 matrix\r
+            double a0r, a0i;\r
+            double b0r, b0i;\r
+\r
+            a0r = *ptrA++;\r
+            a0i = *ptrA++;\r
+\r
+            b0r = *ptrB++;\r
+            b0i = *ptrB++;\r
+\r
+            sum00r += a0r*b0r;\r
+            sum00r -= a0i*b0i;\r
+            sum00i += a0r*b0i;\r
+            sum00i += a0i*b0r;\r
+\r
+        }\r
+        if(k&1) // odd k; one left to do\r
+        {\r
+            double a0r, a0i;\r
+            double b0r, b0i;\r
+\r
+            a0r = *ptrA++;\r
+            a0i = *ptrA++;\r
+\r
+            b0r = *ptrB++;\r
+            b0i = *ptrB++;\r
+\r
+            sum00r += a0r*b0r;\r
+            sum00r -= a0i*b0i;\r
+            sum00i += a0r*b0i;\r
+            sum00i += a0i*b0r;\r
+        }\r
+    }\r
+    else\r
+    {\r
+        // k <= 4: up to four products, fully unrolled instead of looped\r
+        if(k>0)\r
+        {\r
+            double a0r, a0i;\r
+            double b0r, b0i;\r
+\r
+            a0r = *ptrA++;\r
+            a0i = *ptrA++;\r
+\r
+            b0r = *ptrB++;\r
+            b0i = *ptrB++;\r
+\r
+            sum00r += a0r*b0r;\r
+            sum00r -= a0i*b0i;\r
+            sum00i += a0r*b0i;\r
+            sum00i += a0i*b0r;\r
+        }\r
+        if(k>1)\r
+        {\r
+            double a0r, a0i;\r
+            double b0r, b0i;\r
+\r
+            a0r = *ptrA++;\r
+            a0i = *ptrA++;\r
+\r
+            b0r = *ptrB++;\r
+            b0i = *ptrB++;\r
+\r
+            sum00r += a0r*b0r;\r
+            sum00r -= a0i*b0i;\r
+            sum00i += a0r*b0i;\r
+            sum00i += a0i*b0r;\r
+        }\r
+        if(k>2)\r
+        {\r
+            double a0r, a0i;\r
+            double b0r, b0i;\r
+\r
+            a0r = *ptrA++;\r
+            a0i = *ptrA++;\r
+\r
+            b0r = *ptrB++;\r
+            b0i = *ptrB++;\r
+\r
+            sum00r += a0r*b0r;\r
+            sum00r -= a0i*b0i;\r
+            sum00i += a0r*b0i;\r
+            sum00i += a0i*b0r;\r
+        }\r
+        if(k>3)\r
+        {\r
+            double a0r, a0i;\r
+            double b0r, b0i;\r
+\r
+            a0r = *ptrA++;\r
+            a0i = *ptrA++;\r
+\r
+            b0r = *ptrB++;\r
+            b0i = *ptrB++;\r
+\r
+            sum00r += a0r*b0r;\r
+            sum00r -= a0i*b0i;\r
+            sum00i += a0r*b0i;\r
+            sum00i += a0i*b0r;\r
+        }\r
+\r
+    }\r
+\r
+    {   // final saving\r
+        double alphar, alphai, betar, betai, cr, ci;\r
+        alphar = alpha->real;\r
+        alphai = alpha->imag;\r
+        betar = beta->real;\r
+        betai = beta->imag;\r
+\r
+        if(betar == 0.0 && betai == 0.0)\r
+        {\r
+            // beta == 0: overwrite C without reading it, so NaN/Inf in the\r
+            // output buffer cannot propagate through 0 * NaN\r
+            c->real = (alphar * sum00r - alphai * sum00i);\r
+            c->imag = (alphar * sum00i + alphai * sum00r);\r
+        }\r
+        else\r
+        {\r
+            // snapshot the old C first: both new components need both old ones\r
+            cr = c->real;\r
+            ci = c->imag;\r
+\r
+            c->imag = (betar * ci + betai * cr);\r
+            c->real = (betar * cr - betai * ci);\r
+            c->real += (alphar * sum00r - alphai * sum00i);\r
+            c->imag += (alphar * sum00i + alphai * sum00r);\r
+        }\r
+    }\r
+\r
+\r
+    return;\r
+}\r
+\r
+\r
index bf427249bf06b30450e279d495a961978e65920e..f80691f67c018a2502db322a8d6334cf4055f10f 100644 (file)
--- a/blis/testsuite/Makefile
+++ b/blis/testsuite/Makefile
#
# BLIS library and header path. This is simply wherever it was installed.
-#BLIS_LIB_PATH := $(INSTALL_PREFIX)/lib
-#BLIS_INC_PATH := $(INSTALL_PREFIX)/include/blis
+BLIS_LIB_PATH := $(INSTALL_PREFIX)/lib
+BLIS_INC_PATH := $(INSTALL_PREFIX)/include/blis
+BLIS_LIB := ../$(BLIS_LIB_PATH)/libblis.a
# BLIS library.
-BLIS_LIB_PATH := $(DIST_PATH)/$(LIB_DIR)/$(CONFIG_NAME)
-BLIS_LIB := $(BLIS_LIB_PATH)/libblis.a -lOpenCL -locl_util -lstdc++ -lrt
-LDFLAGS += -L$(TARGET_ROOTDIR)/lib -L$(TARGET_ROOTDIR)/usr/lib -Wl,-rpath-link,$(TARGET_ROOTDIR)/lib -Wl,-rpath-link,$(TARGET_ROOTDIR)/usr/lib
+#BLIS_LIB_PATH := $(DIST_PATH)/$(LIB_DIR)/$(CONFIG_NAME)
# BLAS library path(s). This is where the BLAS libraries reside.
ifeq ($(lib),CBLAS)
#CBLAS w/o OpenCL wrappers
+LDFLAGS += -L$(TARGET_ROOTDIR)/lib -L$(TARGET_ROOTDIR)/usr/lib -Wl,-rpath-link,$(TARGET_ROOTDIR)/lib -Wl,-rpath-link,$(TARGET_ROOTDIR)/usr/lib
+
CFLAGS += -I$(CBLAS_INC_PATH)
CFLAGS += -DCBLAS
-temp := $(CBLAS_LIB) $(BLIS_LIB)
+temp := $(CBLAS_LIB) $(BLIS_LIB) -lOpenCL -locl_util -lstdc++ -lrt
BLIS_LIB := $(temp)
else ifeq ($(lib),OpenCLCBLAS)
CFLAGS += -I$(CBLAS_INC_PATH)
CFLAGS += -DCBLAS
CFLAGS += -I$(TARGET_ROOTDIR)/usr/include -idirafter /usr/include
-BLIS_LIB := $(BLIS_BLAS_ACC_LIB_DIR)/libcblas_armplusdsp.a $(BLIS_ARM_LIB_DIR)/libblis.a -lOpenCL -locl_util -lstdc++ -lrt
+BLIS_LIB := $(BLIS_BLAS_ACC_LIB_DIR)/libcblas_armplusdsp.a $(BLIS_ARM_LIB_DIR)/libblis.a -lOpenCL -locl_util -lstdc++ -lrt -lpthread
LDFLAGS += -L$(TARGET_ROOTDIR)/lib -L$(TARGET_ROOTDIR)/usr/lib -Wl,-rpath-link,$(TARGET_ROOTDIR)/lib -Wl,-rpath-link,$(TARGET_ROOTDIR)/usr/lib
endif
diff --git a/blis/testsuite/dsponly/Makefile b/blis/testsuite/dsponly/Makefile
--- /dev/null
@@ -0,0 +1,216 @@
+
+#
+# Check if required environment variables are defined
+#
+# The checks are skipped when the only goal is 'clean', so cleaning works
+# without the build environment being set up.
+ifneq ($(MAKECMDGOALS),clean)
+
+# Variables that must be set before building (see README.txt).
+# CGTROOT is the path to the C6000 compiler tools.
+REQUIRED_ENV_VARS := CGTROOT XDC_DIR XDAIS_DIR BIOS_DIR IPC_DIR OMP_DIR \
+                     PDK_DIR FC_DIR EDMA3_DIR LIBARCH_DIR LINALG_DIR
+
+# Abort with a fatal error for the first variable that is empty or unset.
+# This has to be the built-in $(error ...) function: $(call error,...) would
+# expand a user variable named 'error', which does not exist, and therefore
+# silently produce nothing instead of stopping make.
+$(foreach var,$(REQUIRED_ENV_VARS),\
+  $(if $($(var)),,\
+    $(error ERROR - $(var) NOT DEFINED, PLEASE REFER TO README.txt)))
+
+endif
+
+DESTDIR = ../../
+
+#
+# --- Makefile initialization --------------------------------------------------
+#
+
+# Define the name of the configuration file.
+CONFIG_MK_FILE := config.mk
+
+# Define the name of the file containing build and architecture-specific
+# makefile definitions.
+MAKE_DEFS_FILE := make_defs.mk
+
+# All makefile fragments in the tree will have this name.
+FRAGMENT_MK := .fragment.mk
+
+# Locations of important files.
+CONFIG_DIR := config
+FRAME_DIR := frame
+LIB_DIR := lib
+
+
+
+#
+# --- Include makefile configuration file --------------------------------------
+#
+
+# Construct the path to the makefile configuration file that was generated by
+# the configure script.
+CONFIG_MK_PATH := ../../$(CONFIG_MK_FILE)
+
+# Include the configuration file.
+-include $(CONFIG_MK_PATH)
+
+# Detect whether we actually got the configuration file. If we didn't, then
+# it is likely that the user has not yet generated it (via configure).
+ifeq ($(strip $(CONFIG_MK_INCLUDED)),yes)
+CONFIG_MK_PRESENT := yes
+else
+CONFIG_MK_PRESENT := no
+endif
+
+# Override the DIST_PATH value obtained from config.mk, since it is relative
+# to the build directory.
+# NOTE(review): config.mk itself is included from ../../ (CONFIG_MK_PATH),
+# while DIST_PATH is set to '..' here, so CONFIG_PATH and FRAME_PATH resolve
+# one directory level below where config.mk was found. The '-include's that
+# use these paths fail silently when the files are missing - confirm that
+# '..' (rather than '../..') is what is intended for this directory.
+DIST_PATH := ..
+
+# Now we have access to CONFIG_NAME, which tells us which sub-directory of the
+# config directory to use as our configuration.
+CONFIG_PATH := $(DIST_PATH)/$(CONFIG_DIR)/$(CONFIG_NAME)
+FRAME_PATH := $(DIST_PATH)/$(FRAME_DIR)
+
+
+
+#
+# --- Include makefile definitions file ----------------------------------------
+#
+
+# Construct the path to the makefile definitions file residing inside of
+# the configuration sub-directory.
+MAKE_DEFS_MK_PATH := $(CONFIG_PATH)/$(MAKE_DEFS_FILE)
+
+# Include the makefile definitions file.
+-include $(MAKE_DEFS_MK_PATH)
+
+# Detect whether we actually got the make definitions file. If we didn't, then
+# it is likely that the configuration is invalid (or incomplete).
+ifeq ($(strip $(MAKE_DEFS_MK_INCLUDED)),yes)
+MAKE_DEFS_MK_PRESENT := yes
+else
+MAKE_DEFS_MK_PRESENT := no
+endif
+
+
+
+#
+# --- Include makefile fragments -----------------------------------------------
+#
+
+# Initialize our list of directory paths to makefile fragments with the empty
+# list. This variable will accumulate all of the directory paths in which
+# makefile fragments reside.
+FRAGMENT_DIR_PATHS :=
+
+# This variable is used by the include statements as they recursively include
+# one another. For the framework source tree ('frame' directory), we initialize
+# it to the top-level directory since that is its parent.
+PARENT_PATH := $(DIST_PATH)
+
+# Recursively include all the makefile fragments in the framework itself.
+-include $(addsuffix /$(FRAGMENT_MK), $(FRAME_PATH))
+
+# Now set PARENT_PATH to $(DIST_PATH)/config in preparation to include the
+# fragments in the configuration sub-directory.
+PARENT_PATH := $(DIST_PATH)/$(CONFIG_DIR)
+
+# Recursively include all the makefile fragments in the configuration
+# sub-directory.
+-include $(addsuffix /$(FRAGMENT_MK), $(CONFIG_PATH))
+
+# Create a list of the makefile fragments.
+MAKEFILE_FRAGMENTS := $(addsuffix /$(FRAGMENT_MK), $(FRAGMENT_DIR_PATHS))
+
+# Detect whether we actually got any makefile fragments. If we didn't, then it
+# is likely that the user has not yet generated them (via configure).
+ifeq ($(strip $(MAKEFILE_FRAGMENTS)),)
+MAKEFILE_FRAGMENTS_PRESENT := no
+else
+MAKEFILE_FRAGMENTS_PRESENT := yes
+endif
+
+
+
+#
+# --- Compiler include path definitions ----------------------------------------
+#
+
+# Expand the fragment paths that contain .h files to attain the set of header
+# files present in all fragment paths.
+MK_HEADER_FILES := $(foreach frag_path, $(FRAGMENT_DIR_PATHS), \
+ $(wildcard $(frag_path)/*.h))
+
+# Strip the leading, internal, and trailing whitespace from our list of header
+# files. This makes the "make install-headers" much more readable.
+MK_HEADER_FILES := $(strip $(MK_HEADER_FILES))
+
+# Expand the fragment paths that contain .h files, and take the first
+# expansion. Then, strip the header filename to leave the path to each header
+# location. Notice this process even weeds out duplicates! Add the config
+# directory manually since it contains FLA_config.h.
+MK_HEADER_DIR_PATHS := $(dir $(foreach frag_path, $(FRAGMENT_DIR_PATHS), \
+ $(firstword $(wildcard $(frag_path)/*.h))))
+
+# Add -I to each header path so we can specify our include search paths to the
+# C compiler.
+INCLUDE_PATHS := $(strip $(patsubst %, -I%, $(MK_HEADER_DIR_PATHS)))
+CFLAGS := $(CFLAGS) $(INCLUDE_PATHS)
+
+# BLIS library and header path. This is simply wherever it was installed.
+BLIS_INC_PATH := $(INSTALL_PREFIX)/include/blis
+
+# Options handed to $(CL) by the %.obj rule below: the installed BLIS header
+# directory plus the CBLAS and BLIS_TEST_DSP preprocessor defines.
+CL_OPTS = -I$(BLIS_INC_PATH) -DCBLAS -DBLIS_TEST_DSP
+
+# Directory that holds the Makefile.common included further down.
+COMMON_FOLDER = ../../../examples/dsponly/common
+
+# Object files of the test program: two configuration objects followed by one
+# object per test_*.c source. The test_*.obj files are built from ../src by
+# the %.obj rule below.
+# NOTE(review): this list and 'outfile' are not referenced anywhere else in
+# this file; presumably Makefile.common consumes them to link the test
+# executable - confirm there.
+testfiles_obj = ticblas_config.obj fc_config_c6678.obj \
+ test_addm.obj test_dotxaxpyf.obj test_her2.obj test_scal2v.obj test_syr2k.obj \
+ test_addv.obj test_dotxf.obj test_her2k.obj test_scalm.obj test_syr.obj \
+ test_axpy2v.obj test_dotxv.obj test_her.obj test_scalv.obj test_syrk.obj \
+ test_axpyf.obj test_gemm.obj test_herk.obj test_setm.obj test_trmm3.obj \
+ test_axpym.obj test_gemmtrsm_ukr.obj test_libblis.obj test_setv.obj test_trmm.obj \
+ test_axpyv.obj test_gemm_ukr.obj test_normfm.obj test_subm.obj test_trmv.obj \
+ test_copym.obj test_gemv.obj test_normfv.obj test_subv.obj test_trsm.obj \
+ test_copyv.obj test_ger.obj test_randm.obj test_symm.obj test_trsm_ukr.obj \
+ test_dotaxpyv.obj test_hemm.obj test_randv.obj test_symv.obj test_trsv.obj \
+ test_dotv.obj test_hemv.obj test_scal2m.obj test_syr2.obj
+
+# Output file name, one directory above this one.
+outfile = ../blistestDSP.out
+
+# Shared rules and definitions ($(CL) is not defined in this file, so it is
+# expected to come from here or from the environment).
+include $(COMMON_FOLDER)/Makefile.common
+
+# Compile one test-suite source from ../src into an object file.
+# The progress message is printed before the compiler runs (it used to appear
+# only after the compile had finished) and is silenced with '@' so that make
+# does not also echo the echo command itself.
+%.obj: ../src/%.c
+	@echo Compiling $<
+	$(CL) $(CL_OPTS) $<
index fa4d6ab3de6cdbb1a392c7144a7adde8d31cce7e..6a7608261008bd718b687acf364ffa12a720109e 100644 (file)
# 'r' = rowvec / unit stride; 'i' = rowvec / non-unit stride
0 # Test all combinations of storage schemes?
32 # General stride spacing (for cases when testing general stride)
-d # Datatype(s) to test:
+sdcz # Datatype(s) to test:
# 's' = single real; 'c' = single complex;
# 'd' = double real; 'z' = double complex
-1000 # Problem size: first to test
-5000 # Problem size: maximum to test
-500 # Problem size: increment between experiments
+1000 # Problem size: first to test
+4000 # Problem size: maximum to test
+500 # Problem size: increment between experiments
# Complex level-3 implementations
0 # 3mh ('1' = enable; '0' = disable)
0 # 3m ('1' = enable; '0' = disable)
diff --git a/blis/testsuite/parselog.pl b/blis/testsuite/parselog.pl
--- /dev/null
@@ -0,0 +1,15 @@
+#!/usr/bin/perl -sw
+
+# parselog.pl - copy every line of a log file that contains "blis_" into a
+# new file, dropping all other lines.
+#
+# Usage: parselog.pl <input_log> <output_file>
+
+use strict;
+
+die "Usage: $0 <input_log> <output_file>\n" unless @ARGV >= 2;
+
+my $input_file  = $ARGV[0];
+my $output_file = $ARGV[1];
+
+# Both opens are checked: an unwritable output path previously went unnoticed
+# and only surfaced as warnings about printing on an unopened filehandle.
+open( my $fh_in,  '<', $input_file )  or die "Can't open $input_file: $!";
+open( my $fh_out, '>', $output_file ) or die "Can't open $output_file: $!";
+
+while ( my $line = <$fh_in> ) {
+    print $fh_out $line if $line =~ /blis_/;
+}
+
+close $fh_in;
+# Buffered write errors (e.g. disk full) are only reported when closing.
+close $fh_out or die "Can't write $output_file: $!";
\ No newline at end of file
index da27a3a61bc52b4e96df980d205e0b6d68f58249..cd6bd167f99e1b7dd91506ba4be6946e47db05f3 100644 (file)
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid );
void libblis_test_addm_impl( iface_t iface,
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid )
{
double time_min = 1e9;
}
// Estimate the performance of the best experiment repeat.
- *perf = ( 1.0 * m * n ) / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( x ) ) *perf *= 2.0;
+ perf->gflops = ( 1.0 * m * n ) / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( x ) ) perf->gflops *= 2.0;
+ perf->time = time_min;
// Perform checks.
libblis_test_addm_check( &alpha, &beta, &x, &y, resid );
index b8909944a34965ae19b1a4eb57b7131fc920becd..afa54dc1960edd0ef77299be2dcd29eb9a41078c 100644 (file)
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid );
void libblis_test_addv_impl( iface_t iface,
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid )
{
double time_min = 1e9;
}
// Estimate the performance of the best experiment repeat.
- *perf = ( 2.0 * m ) / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( x ) ) *perf *= 2.0;
-
+ perf->gflops = ( 2.0 * m ) / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( x ) ) perf->gflops *= 2.0;
+ perf->time = time_min;
+
// Perform checks.
libblis_test_addv_check( &alpha, &beta, &x, &y, resid );
index a622a1b6e768ecb82fd58dbf9d4b80deafd14a48..155e2d4f948eaefef36daeacc66d9c9e115439b1 100644 (file)
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid );
void libblis_test_axpy2v_impl( iface_t iface,
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid )
{
unsigned int n_repeats = params->n_repeats;
}
// Estimate the performance of the best experiment repeat.
- *perf = ( 2.0 * m + 2.0 * m ) / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( z ) ) *perf *= 4.0;
+ perf->gflops = ( 2.0 * m + 2.0 * m ) / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( z ) ) perf->gflops *= 4.0;
+ perf->time = time_min;
// Perform checks.
libblis_test_axpy2v_check( &alpha1, &alpha2, &x, &y, &z, &z_save, resid );
index e85defc5302f441f7e1952cdc1245d3e2b4a5e16..6968708e32954c838ea4e50bdd1b285e981e1f70 100644 (file)
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid );
void libblis_test_axpyf_impl( iface_t iface,
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid )
{
unsigned int n_repeats = params->n_repeats;
}
// Estimate the performance of the best experiment repeat.
- *perf = ( 2.0 * m * b_n ) / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( y ) ) *perf *= 4.0;
+ perf->gflops = ( 2.0 * m * b_n ) / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( y ) ) perf->gflops *= 4.0;
+ perf->time = time_min;
// Perform checks.
libblis_test_axpyf_check( &alpha, &a, &x, &y, &y_save, resid );
index da5124bde493ca1f34acbe752c2c45031f8894d1..c1d2c045012e411389a5aef4bed834fbc60a1e83 100644 (file)
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid );
void libblis_test_axpym_impl( iface_t iface,
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid )
{
unsigned int n_repeats = params->n_repeats;
}
// Estimate the performance of the best experiment repeat.
- *perf = ( 2.0 * m * n ) / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( y ) ) *perf *= 4.0;
+ perf->gflops = ( 2.0 * m * n ) / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( y ) ) perf->gflops *= 4.0;
+ perf->time = time_min;
// Perform checks.
libblis_test_axpym_check( &alpha, &x, &y, &y_save, resid );
index ee237cf5aa447934c05b7227021f0e2d42083daa..76ff5707137582c8ad441e79d751b8f98b6a1af4 100644 (file)
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid );
void libblis_test_axpyv_impl( iface_t iface,
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid )
{
unsigned int n_repeats = params->n_repeats;
}
#ifdef BLIS_ENABLE_MULTITHREAD_TEST
// Estimate the performance of the best experiment repeat.
- *perf = ( 2.0 * m )*test_way / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( y[0] ) ) *perf *= 4.0;
+ perf->gflops = ( 2.0 * m )*test_way / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( y[0] ) ) perf->gflops *= 4.0;
#else
// Estimate the performance of the best experiment repeat.
- *perf = ( 2.0 * m ) / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( y ) ) *perf *= 4.0;
+ perf->gflops = ( 2.0 * m ) / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( y ) ) perf->gflops *= 4.0;
#endif
+ perf->time = time_min;
#ifdef BLIS_ENABLE_MULTITHREAD_TEST
// Check output of each thread, and send max residue to main
index 66a7bbd9617e06b13d5623183b5791212776966a..3a01242b715f15b1294a7f860365dd616792c24b 100644 (file)
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid );
void libblis_test_copym_impl( iface_t iface,
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid )
{
double time_min = 1e9;
}
// Estimate the performance of the best experiment repeat.
- *perf = ( 1.0 * m * n ) / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( x ) ) *perf *= 2.0;
+ perf->gflops = ( 1.0 * m * n ) / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( x ) ) perf->gflops *= 2.0;
+ perf->time = time_min;
// Perform checks.
libblis_test_copym_check( &x, &y, resid );
index e854d5da9e57e2367aeebd9c70841936fb64420d..fb6e2af47009b705f16bc1d805fb650fbbb52b2e 100644 (file)
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid );
void libblis_test_copyv_impl( iface_t iface,
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid )
{
double time_min = 1e9;
}
#ifdef BLIS_ENABLE_MULTITHREAD_TEST
// Estimate the performance of the best experiment repeat.
- *perf = ( 1.0 * m ) * test_way / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( x ) ) *perf *= 2.0;
+ perf->gflops = ( 1.0 * m ) * test_way / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( x ) ) perf->gflops *= 2.0;
#else
// Estimate the performance of the best experiment repeat.
- *perf = ( 1.0 * m ) / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( x ) ) *perf *= 2.0;
+ perf->gflops = ( 1.0 * m ) / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( x ) ) perf->gflops *= 2.0;
#endif
+ perf->time = time_min;
#ifdef BLIS_ENABLE_MULTITHREAD_TEST
// Check output of each thread, and send max residue to main
index 4fa0fbba91072f2e485cb01e5e8022d027c42ec6..dbce8defc8c53722876bfb98a55c30f89e64c8c9 100644 (file)
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid );
void libblis_test_dotaxpyv_impl( iface_t iface,
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid )
{
unsigned int n_repeats = params->n_repeats;
}
// Estimate the performance of the best experiment repeat.
- *perf = ( 2.0 * m + 2.0 * m ) / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( z ) ) *perf *= 4.0;
+ perf->gflops = ( 2.0 * m + 2.0 * m ) / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( z ) ) perf->gflops *= 4.0;
+ perf->time = time_min;
// Perform checks.
libblis_test_dotaxpyv_check( &alpha, &xt, &x, &y, &rho, &z, &z_save, resid );
index 0fac9b9f7812cd6b94c1dcf8ada464e13e4d85d3..f6b7ea9b2c29233e940f48a50ee85fa6f4d6c283 100644 (file)
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid );
void libblis_test_dotv_impl( iface_t iface,
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid )
{
unsigned int n_repeats = params->n_repeats;
}
#ifdef BLIS_ENABLE_MULTITHREAD_TEST
// Estimate the performance of the best experiment repeat.
- *perf = ( 2.0 * m ) *test_way / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( y ) ) *perf *= 4.0;
+ perf->gflops = ( 2.0 * m ) *test_way / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( y ) ) perf->gflops *= 4.0;
#else
// Estimate the performance of the best experiment repeat.
- *perf = ( 2.0 * m ) / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( y ) ) *perf *= 4.0;
+ perf->gflops = ( 2.0 * m ) / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( y ) ) perf->gflops *= 4.0;
#endif
+ perf->time = time_min;
#ifdef BLIS_ENABLE_MULTITHREAD_TEST
// Check output of each thread, and send max residue to main
index b4361470ad26ae1f0c62f65b518eac2a4b215cfc..6a7a55e8a3afde048fe4ea8f62dcd8b442c612ac 100644 (file)
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid );
void libblis_test_dotxaxpyf_impl( iface_t iface,
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid )
{
unsigned int n_repeats = params->n_repeats;
}
// Estimate the performance of the best experiment repeat.
- *perf = ( 2.0 * m * b_n + 2.0 * m * b_n ) / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( y ) ) *perf *= 4.0;
+ perf->gflops = ( 2.0 * m * b_n + 2.0 * m * b_n ) / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( y ) ) perf->gflops *= 4.0;
+ perf->time = time_min;
// Perform checks.
libblis_test_dotxaxpyf_check( &alpha, &at, &a, &w, &x, &beta, &y, &z, &y_save, &z_save, resid );
index d9a21c463c020e4c7104bb11d2dcbf0896a50f81..6167182176d2d409230220cf67bb8a06118cd9cf 100644 (file)
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid );
void libblis_test_dotxf_impl( iface_t iface,
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid )
{
unsigned int n_repeats = params->n_repeats;
}
// Estimate the performance of the best experiment repeat.
- *perf = ( 2.0 * m * b_n ) / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( y ) ) *perf *= 4.0;
+ perf->gflops = ( 2.0 * m * b_n ) / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( y ) ) perf->gflops *= 4.0;
+ perf->time = time_min;
// Perform checks.
libblis_test_dotxf_check( &alpha, &a, &x, &beta, &y, &y_save, resid );
index fc1aa0aa9797501204039fd66859de63d32494ff..95f6c5e1045072a10b728bef91b4c43d9b08f723 100644 (file)
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid );
void libblis_test_dotxv_impl( iface_t iface,
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid )
{
unsigned int n_repeats = params->n_repeats;
}
// Estimate the performance of the best experiment repeat.
- *perf = ( 2.0 * m ) / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( y ) ) *perf *= 4.0;
+ perf->gflops = ( 2.0 * m ) / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( y ) ) perf->gflops *= 4.0;
+ perf->time = time_min;
// Perform checks.
libblis_test_dotxv_check( &alpha, &x, &y, &beta, &rho, &rho_save, resid );
index 56399d4514b4da6be5c3f6c687a737ea22e82a23..8748928932171f4b26731d0f56a0d80eb7aadf97 100644 (file)
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid );
void libblis_test_gemm_impl( iface_t iface,
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid )
{
unsigned int n_repeats = params->n_repeats;
// Create test operands (vectors and/or matrices).
libblis_test_mobj_create( params, datatype, transa,
sc_str[0], m, k, &a );
+ //printf("Created object a, buffer address is 0x%x.\n", (unsigned int)bli_obj_buffer(a));
+
libblis_test_mobj_create( params, datatype, transb,
sc_str[1], k, n, &b );
+ //printf("Created object b, buffer address is 0x%x.\n", (unsigned int)bli_obj_buffer(b));
#ifdef BLIS_ENABLE_MULTITHREAD_TEST
for(i = 0; i < test_way; i++)
libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
sc_str[2], m, n, &c_save );
#endif
+ //printf("Created object c, buffer address is 0x%x.\n", (unsigned int)bli_obj_buffer(c[0]));
+ //printf("Created object c_save, buffer address is 0x%x.\n", (unsigned int)bli_obj_buffer(c_save[0]));
// Set alpha and beta.
#ifdef BLIS_ENABLE_MULTITHREAD_TEST
bli_obj_set_conjtrans( transa, a );
bli_obj_set_conjtrans( transb, b );
+ //bli_printm( "c_save = [", &c_save[0], "%f", "];" );
+
// Repeat the experiment n_repeats times and record results.
for ( i = 0; i < n_repeats; ++i )
{
#ifdef BLIS_ENABLE_MULTITHREAD_TEST
// Need only one call to initialize the CBLAS OpenCL kernel
bli_copym( &c_save[0], &c[0] );
-
libblis_test_gemm_impl( iface, &alpha, &a, &b, &beta, &c[0] );
//but need to re-initialize C for each iteration of n_repeats
#else
bli_copym( &c_save, &c );
-
libblis_test_gemm_impl( iface, &alpha, &a, &b, &beta, &c );
-
bli_copym( &c_save, &c );
#endif
-
time = bli_clock();
// bli_printm( "a = [", &a, "%f", "];" );
// Estimate the performance of the best experiment repeat.
#ifdef BLIS_ENABLE_MULTITHREAD_TEST
- *perf = ( 2.0 * m * n * k ) / time_min * test_way / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( c[0] ) ) *perf *= 4.0;
+ perf->gflops = ( 2.0 * m * n * k ) / time_min * test_way / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( c[0] ) ) perf->gflops *= 4.0;
#else
- *perf = ( 2.0 * m * n * k ) / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( c ) ) *perf *= 4.0;
+ perf->gflops = ( 2.0 * m * n * k ) / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( c ) ) perf->gflops *= 4.0;
#endif
+ perf->time = time_min;
// Perform checks.
#ifdef BLIS_ENABLE_MULTITHREAD_TEST
cblas_b = (float *) bli_obj_buffer( *b );
cblas_c = (float *) bli_obj_buffer( *c );
-// printf("test_gemm %d %d %d %d %d\n", order, transA, transB, lda, ldb);
+ //printf("test_gemm %d %d %d %d %d, 0x%x, 0x%x, 0x%x\n", order, transA, transB, lda, ldb, (unsigned int)cblas_a,(unsigned int)cblas_b,(unsigned int)cblas_c);
+ //printf("Start sgemm for (m,k,n) = (%d,%d,%d) \n", m, k, n);
cblas_sgemm(order, transA, transB, m, n, k, *cblas_alpha, cblas_a, lda, cblas_b, ldb, *cblas_beta, cblas_c, ldc);
+ //printf("sgemm for (m,k,n) = (%d,%d,%d) finished.\n", m, k, n);
}
else if (bli_obj_is_double( *a ))
cblas_b = (double *) bli_obj_buffer( *b );
cblas_c = (double *) bli_obj_buffer( *c );
+ //printf("test_gemm %d %d %d %d %d, 0x%x, 0x%x, 0x%x\n", order, transA, transB, lda, ldb, (unsigned int)cblas_a,(unsigned int)cblas_b,(unsigned int)cblas_c);
cblas_dgemm(order, transA, transB, m, n, k, *cblas_alpha, cblas_a, lda, cblas_b, ldb, *cblas_beta, cblas_c, ldc);
}
cblas_b = bli_obj_buffer( *b );
cblas_c = bli_obj_buffer( *c );
+ //printf("Start zgemm for (m,k,n) = (%d,%d,%d) \n", m, k, n);
cblas_zgemm(order, transA, transB, m, n, k, cblas_alpha, cblas_a, lda, cblas_b, ldb, cblas_beta, cblas_c, ldc);
+ //printf("zgemm for (m,k,n) = (%d,%d,%d) finished.\n", m, k, n);
}
#else
bli_gemm( alpha, a, b, beta, c );
index 5506bedb16f20eb5e8728b5cf256dd5d9a9e7102..9cf862391b5dda7803ba8af5cdd551f1abc0c0b7 100644 (file)
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid );
void libblis_test_gemm_ukr_impl( iface_t iface,
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid )
{
unsigned int n_repeats = params->n_repeats;
}
// Estimate the performance of the best experiment repeat.
- *perf = ( 2.0 * m * n * k ) / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( c ) ) *perf *= 4.0;
+ perf->gflops = ( 2.0 * m * n * k ) / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( c ) ) perf->gflops *= 4.0;
+ perf->time = time_min;
// Perform checks.
libblis_test_gemm_ukr_check( &alpha, &a, &b, &beta, &c, &c_save, resid );
index 87d7f1b942174ede33a55006e0c30ec8c0cfd0d0..eab4d44b69938e1ba1c396fc90a6e6895ff3e5e6 100644 (file)
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid );
void libblis_test_gemmtrsm_ukr_impl( iface_t iface,
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid )
{
unsigned int n_repeats = params->n_repeats;
}
// Estimate the performance of the best experiment repeat.
- *perf = ( 2.0 * m * n * k + 1.0 * m * m * n ) / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( b ) ) *perf *= 4.0;
+ perf->gflops = ( 2.0 * m * n * k + 1.0 * m * m * n ) / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( b ) ) perf->gflops *= 4.0;
+ perf->time = time_min;
// Perform checks.
libblis_test_gemmtrsm_ukr_check( side, &alpha,
index 7d611488702fdef46a7a27261753275cf64bb822..f59d1d49b661c2284919a67347f69b6c5da358a2 100644 (file)
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid );
void libblis_test_gemv_impl( iface_t iface,
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid )
{
unsigned int n_repeats = params->n_repeats;
}
#ifdef BLIS_ENABLE_MULTITHREAD_TEST
// Estimate the performance of the best experiment repeat.
- *perf = ( 2.0 * m * n ) * test_way/ time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( y[0] ) ) *perf *= 4.0;
+ perf->gflops = ( 2.0 * m * n ) * test_way/ time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( y[0] ) ) perf->gflops *= 4.0;
#else
// Estimate the performance of the best experiment repeat.
- *perf = ( 2.0 * m * n ) / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( y ) ) *perf *= 4.0;
+ perf->gflops = ( 2.0 * m * n ) / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( y ) ) perf->gflops *= 4.0;
#endif
+ perf->time = time_min;
#ifdef BLIS_ENABLE_MULTITHREAD_TEST
// Check output of each thread, and send max residue to main
index d93853331d7162ae40eb35f906171c1aa0cc9fba..2139260927a678a19628f8c5cf1947c4c396e8b4 100644 (file)
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid );
void libblis_test_ger_impl( iface_t iface,
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid )
{
unsigned int n_repeats = params->n_repeats;
}
#ifdef BLIS_ENABLE_MULTITHREAD_TEST
// Estimate the performance of the best experiment repeat.
- *perf = ( 2.0 * m * n ) * test_way / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( a[0] ) ) *perf *= 4.0;
+ perf->gflops = ( 2.0 * m * n ) * test_way / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( a[0] ) ) perf->gflops *= 4.0;
#else
// Estimate the performance of the best experiment repeat.
- *perf = ( 2.0 * m * n ) / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( a ) ) *perf *= 4.0;
+ perf->gflops = ( 2.0 * m * n ) / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( a ) ) perf->gflops *= 4.0;
#endif
+ perf->time = time_min;
// Perform checks.
#ifdef BLIS_ENABLE_MULTITHREAD_TEST
index a77cada7b928ed8472a9bbe9c640df0d10cc9f80..1e7a52898e66627a00922193ad6b02e9bf988c36 100644 (file)
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid );
void libblis_test_hemm_impl( iface_t iface,
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid )
{
unsigned int n_repeats = params->n_repeats;
// Estimate the performance of the best experiment repeat.
#ifdef BLIS_ENABLE_MULTITHREAD_TEST
- *perf = ( 2.0 * mn_side * m * n ) * test_way / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( c[0] ) ) *perf *= 4.0;
+ perf->gflops = ( 2.0 * mn_side * m * n ) * test_way / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( c[0] ) ) perf->gflops *= 4.0;
#else
- *perf = ( 2.0 * mn_side * m * n ) / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( c ) ) *perf *= 4.0;
+ perf->gflops = ( 2.0 * mn_side * m * n ) / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( c ) ) perf->gflops *= 4.0;
#endif
+ perf->time = time_min;
// Perform checks.
#ifdef BLIS_ENABLE_MULTITHREAD_TEST
index 682761cbde7d91e629ac7a88ac31c44018116b98..7b1a4477dcaa1e5711f36445aca48ea939ae185c 100644 (file)
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid );
void libblis_test_hemv_impl( iface_t iface,
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid )
{
unsigned int n_repeats = params->n_repeats;
}
#ifdef BLIS_ENABLE_MULTITHREAD_TEST
// Estimate the performance of the best experiment repeat.
- *perf = ( 1.0 * m * m ) *test_way / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( y[0] ) ) *perf *= 4.0;
+ perf->gflops = ( 1.0 * m * m ) *test_way / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( y[0] ) ) perf->gflops *= 4.0;
#else
// Estimate the performance of the best experiment repeat.
- *perf = ( 1.0 * m * m ) / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( y ) ) *perf *= 4.0;
+ perf->gflops = ( 1.0 * m * m ) / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( y ) ) perf->gflops *= 4.0;
#endif
+ perf->time = time_min;
#ifdef BLIS_ENABLE_MULTITHREAD_TEST
// Check output of each thread, and send max residue to main
index 592860c6039a97ce7dc8dc4fa2b7bd458b2d17ef..660c31fe494e9e1fc97708cb0e7675d7056d6b7a 100644 (file)
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid );
void libblis_test_her_impl( iface_t iface,
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid )
{
unsigned int n_repeats = params->n_repeats;
}
#ifdef BLIS_ENABLE_MULTITHREAD_TEST
// Estimate the performance of the best experiment repeat.
- *perf = ( 1.0 * m * m ) * test_way / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( a[0]) ) *perf *= 4.0;
+ perf->gflops = ( 1.0 * m * m ) * test_way / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( a[0]) ) perf->gflops *= 4.0;
#else
// Estimate the performance of the best experiment repeat.
- *perf = ( 1.0 * m * m ) / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( a ) ) *perf *= 4.0;
+ perf->gflops = ( 1.0 * m * m ) / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( a ) ) perf->gflops *= 4.0;
#endif
+ perf->time = time_min;
#ifdef BLIS_ENABLE_MULTITHREAD_TEST
// Check output of each thread, and send max residue to main
index 7bae1e1f5c807a7455cfb25f6a84f8e804825e92..c77f813db07f6c01836dbb295da4a0e1822203c0 100644 (file)
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid );
void libblis_test_her2_impl( iface_t iface,
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid )
{
unsigned int n_repeats = params->n_repeats;
}
#ifdef BLIS_ENABLE_MULTITHREAD_TEST
// Estimate the performance of the best experiment repeat.
- *perf = ( 2.0 * m * m ) * test_way / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( a[0] ) ) *perf *= 4.0;
+ perf->gflops = ( 2.0 * m * m ) * test_way / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( a[0] ) ) perf->gflops *= 4.0;
#else
// Estimate the performance of the best experiment repeat.
- *perf = ( 2.0 * m * m ) / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( a ) ) *perf *= 4.0;
+ perf->gflops = ( 2.0 * m * m ) / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( a ) ) perf->gflops *= 4.0;
#endif
+ perf->time = time_min;
#ifdef BLIS_ENABLE_MULTITHREAD_TEST
// Check output of each thread, and send max residue to main
index ff60f64bca5cfa392485194580ca84dea91d963b..e921367959de6d789afffcd71a3febb9c047ae7b 100644 (file)
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid );
void libblis_test_her2k_impl( iface_t iface,
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid )
{
unsigned int n_repeats = params->n_repeats;
#ifdef BLIS_ENABLE_MULTITHREAD_TEST
// Estimate the performance of the best experiment repeat.
- *perf = ( 2.0 * m * m * k ) * test_way / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( c[0] ) ) *perf *= 4.0;
+ perf->gflops = ( 2.0 * m * m * k ) * test_way / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( c[0] ) ) perf->gflops *= 4.0;
#else
// Estimate the performance of the best experiment repeat.
- *perf = ( 2.0 * m * m * k ) / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( c ) ) *perf *= 4.0;
+ perf->gflops = ( 2.0 * m * m * k ) / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( c ) ) perf->gflops *= 4.0;
#endif
-
+ perf->time = time_min;
// Perform checks.
#ifdef BLIS_ENABLE_MULTITHREAD_TEST
index a2e4bfc5f01769f4c3e368272daaba751cb9beba..56517fd76ccd9f8233a3d9963ef4c834c38022d9 100644 (file)
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid );
void libblis_test_herk_impl( iface_t iface,
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid )
{
unsigned int n_repeats = params->n_repeats;
// Estimate the performance of the best experiment repeat.
#ifdef BLIS_ENABLE_MULTITHREAD_TEST
- *perf = ( 1.0 * m * m * k ) * test_way/ time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( c[0] ) ) *perf *= 4.0;
+ perf->gflops = ( 1.0 * m * m * k ) * test_way/ time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( c[0] ) ) perf->gflops *= 4.0;
#else
- *perf = ( 1.0 * m * m * k ) / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( c ) ) *perf *= 4.0;
+ perf->gflops = ( 1.0 * m * m * k ) / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( c ) ) perf->gflops *= 4.0;
#endif
+ perf->time = time_min;
+
// Perform checks.
#ifdef BLIS_ENABLE_MULTITHREAD_TEST
// Check output of each thread, and send max residue to main
index 00249a59a44d7accfd7f117aa48a2db862eb84c4..1f7fe1df02e71ef1c5c74574e2c22e59f0dff650 100644 (file)
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
*/
#include "blis.h"
#include "test_libblis.h"
+/* ticblas init and finalize functions,
+ defined in ../../../examples/dsponly/common/ticblas_config.c */
+extern void cleanup_after_ticblas();
+extern void prepare_for_ticblas();
// Global variables.
char libblis_test_binary_name[ MAX_BINARY_NAME_LENGTH + 1 ];
test_ops_t ops;
// Initialize libblis.
- bli_init();
-
+ //bli_init();
+ /* Configure memory and initialize TI CBLAS */
+#ifdef BLIS_TEST_DSP
+ prepare_for_ticblas();
+#endif
+
// Initialize some strings.
libblis_test_init_strings();
libblis_test_level3_ops( ¶ms, &ops );
// Finalize libblis.
- bli_finalize();
-
+ //bli_finalize();
+ /* Finalize TI CBLAS and reconfigure memory */
+#ifdef BLIS_TEST_DSP
+ cleanup_after_ticblas();
+#endif
+
// Return peacefully.
return 0;
}
char*, // pc_str (current param string)
char*, // sc_str (current storage string)
unsigned int, // p_cur (current problem size)
- double*, // perf
+ perf_t*, // perf
double* ) ) // residual
{
unsigned int n_mstorage = params->n_mstorage;
unsigned int p_cur, pi;
unsigned int dt, pci, sci, i, j, o;
- double perf, resid;
+ perf_t perf;
+ double resid;
char* pass_str;
char blank_str[32];
char funcname_str[64];
n_spaces = MAX_FUNC_STRING_LENGTH - strlen( funcname_str );
fill_string_with_n_spaces( blank_str, n_spaces );
+ strcat(funcname_str,blank_str);
+
// Print all dimensions to a single string.
libblis_test_build_dims_string( op, p_cur, dims_str );
if ( params->output_matlab_format )
{
libblis_test_fprintf( stdout,
- "%s%s( %3u, 1:%u ) = [%s %7.3lf %8.2le ]; %c %s\n",
- funcname_str, blank_str, pi, n_dims_print + 2,
- dims_str, perf, resid,
+ "%s( %3u, 1:%u ) = [%s %8.2le %7.3lf %8.2le ]; %c %s\n",
+ funcname_str, pi, n_dims_print + 2,
+ dims_str, perf.time, perf.gflops, resid,
OUTPUT_COMMENT_CHAR,
pass_str );
// Also output to a file if requested (and successfully opened).
if ( output_stream )
libblis_test_fprintf( output_stream,
- "%s%s( %3u, 1:%u ) = [%s %7.3lf %8.2le ]; %c %s\n",
- funcname_str, blank_str, pi, n_dims_print + 2,
- dims_str, perf, resid,
+ "%s( %3u, 1:%u ) = [%s %8.2le %7.3lf %8.2le ]; %c %s\n",
+ funcname_str, pi, n_dims_print + 2,
+ dims_str, perf.time, perf.gflops, resid,
OUTPUT_COMMENT_CHAR,
pass_str );
}
else
{
libblis_test_fprintf( stdout,
- "%s%s %s %7.3lf %8.2le %s\n",
- funcname_str, blank_str,
- dims_str, perf, resid,
+ "%s %s %8.2le %7.3lf %8.2le %s\n",
+ funcname_str,
+ dims_str, perf.time, perf.gflops, resid,
pass_str );
// Also output to a file if requested (and successfully opened).
if ( output_stream )
libblis_test_fprintf( output_stream,
- "%s%s %s %7.3lf %8.2le %s\n",
- funcname_str, blank_str,
- dims_str, perf, resid,
+ "%s %s %8.2le %7.3lf %8.2le %s\n",
+ funcname_str,
+ dims_str, perf.time, perf.gflops, resid,
pass_str );
}
if ( op->dimset == BLIS_TEST_DIMS_MF )
{
//sprintf( &dims_str[strlen(dims_str)], " %5u %5u",
- sprintf( dims_str, " %5u %5u",
+ sprintf( dims_str, " %5u\t %5u\t",
( unsigned int )
libblis_test_get_dim_from_prob_size( op->dim_spec[0],
p_cur ),
else if ( op->dimset == BLIS_TEST_DIMS_K )
{
//sprintf( &dims_str[strlen(dims_str)], " %5u %5u %5u",
- sprintf( dims_str, " %5u %5u %5u",
+ sprintf( dims_str, " %5u\t %5u\t %5u\t",
( unsigned int ) op->dim_aux[0],
( unsigned int ) op->dim_aux[1],
( unsigned int )
else if ( op->dimset == BLIS_TEST_NO_DIMS )
{
//sprintf( &dims_str[strlen(dims_str)], " %5u %5u",
- sprintf( dims_str, " %5u %5u",
+ sprintf( dims_str, " %5u\t %5u\t",
( unsigned int ) op->dim_aux[0],
( unsigned int ) op->dim_aux[1] );
}
sprintf( dims_str, "%s", "" );
// Print all dimensions to a single string.
- for ( i = 0; i < op->n_dims; ++i )
- {
- sprintf( &dims_str[strlen(dims_str)], " %5u",
- ( unsigned int )
- libblis_test_get_dim_from_prob_size( op->dim_spec[i],
+ if(op->dimset == BLIS_TEST_DIMS_MN) {
+ sprintf( &dims_str[strlen(dims_str)], " %5u\t",
+ ( unsigned int )
+ libblis_test_get_dim_from_prob_size( op->dim_spec[0],
+ p_cur ) );
+ sprintf( &dims_str[strlen(dims_str)], " %5u\t",
+ ( unsigned int )
+ libblis_test_get_dim_from_prob_size( op->dim_spec[1],
p_cur ) );
+ sprintf( &dims_str[strlen(dims_str)], " \t");
+
+ }
+ else if(op->dimset == BLIS_TEST_DIMS_MK) {
+ sprintf( &dims_str[strlen(dims_str)], " %5u\t",
+ ( unsigned int )
+ libblis_test_get_dim_from_prob_size( op->dim_spec[0],
+ p_cur ) );
+ sprintf( &dims_str[strlen(dims_str)], " \t");
+ sprintf( &dims_str[strlen(dims_str)], " %5u\t",
+ ( unsigned int )
+ libblis_test_get_dim_from_prob_size( op->dim_spec[1],
+ p_cur ) );
+ }
+ else {
+ for ( i = 0; i < op->n_dims; ++i )
+ {
+ sprintf( &dims_str[strlen(dims_str)], " %5u\t",
+ ( unsigned int )
+ libblis_test_get_dim_from_prob_size( op->dim_spec[i],
+ p_cur ) );
+ }
}
+
+
}
}
n_spaces = 6;
fill_string_with_n_spaces( blank_str, n_spaces );
- sprintf( &l_str[strlen(l_str)], "%s", blank_str );
+ sprintf( &l_str[strlen(l_str)], "%s\t", blank_str );
if ( op->dimset == BLIS_TEST_DIMS_MNK ||
op->dimset == BLIS_TEST_DIMS_MN ||
op->dimset == BLIS_TEST_DIMS_K ||
op->dimset == BLIS_TEST_DIMS_MF ||
op->dimset == BLIS_TEST_NO_DIMS )
- sprintf( &l_str[strlen(l_str)], " %5s", "m" );
+ sprintf( &l_str[strlen(l_str)], " %5s", "m\t" );
+ else
+ sprintf( &l_str[strlen(l_str)], "\t" );
if ( op->dimset == BLIS_TEST_DIMS_MNK ||
op->dimset == BLIS_TEST_DIMS_MN ||
op->dimset == BLIS_TEST_DIMS_K ||
op->dimset == BLIS_TEST_DIMS_MF ||
op->dimset == BLIS_TEST_NO_DIMS )
- sprintf( &l_str[strlen(l_str)], " %5s", "n" );
+ sprintf( &l_str[strlen(l_str)], " %5s", "n\t" );
+ else
+ sprintf( &l_str[strlen(l_str)], "\t" );
if ( op->dimset == BLIS_TEST_DIMS_MNK ||
op->dimset == BLIS_TEST_DIMS_MK ||
op->dimset == BLIS_TEST_DIMS_K )
- sprintf( &l_str[strlen(l_str)], " %5s", "k" );
+ sprintf( &l_str[strlen(l_str)], " %5s", "k\t" );
+ else
+ sprintf( &l_str[strlen(l_str)], "\t" );
- sprintf( &l_str[strlen(l_str)], "%s", " gflops resid result" );
+ sprintf( &l_str[strlen(l_str)], "%s", " \t seconds\t gflops\t resid\t result" );
}
@@ -2037,7 +2084,8 @@ void libblis_test_parse_message( FILE* output_stream, char* message, va_list arg
// Add the final type specifier, and null-terminate the string.
format_spec[cf] = message[c];
- format_spec[cf+1] = '\0';
+ format_spec[cf+1] = '\t';
+ format_spec[cf+2] = '\0';
// Switch based on type, since we can't predict what
// va_arg() will return.
char opt_ch;
// Copy the binary name to a global string so we can use it later.
- strncpy( libblis_test_binary_name, argv[0], MAX_BINARY_NAME_LENGTH );
+ //strncpy( libblis_test_binary_name, argv[0], MAX_BINARY_NAME_LENGTH );
+ strncpy( libblis_test_binary_name, "BlisTestSuite", MAX_BINARY_NAME_LENGTH );
// Process all option arguments until we get a -1, which means we're done.
#ifndef BLIS_ENABLE_C66X_BUILD
-void libblis_test_check_empty_problem( obj_t* c, double* perf, double* resid )
+void libblis_test_check_empty_problem( obj_t* c, perf_t* perf, double* resid ) // If c has a zero dimension, report zero gflops and a zero residual.
{
if ( bli_obj_has_zero_dim( *c ) )
{
- *perf = 0.0;
+ perf->gflops = 0.0; // NOTE(review): perf->time (and perf->cycles) are left untouched here -- confirm callers set them before printing.
*resid = 0.0;
}
}
index 0afb19cea2f078c90425a10609f9c0055fc68345..4ebf9ce9e8b3383d367a158f021bbf0c20763b17 100644 (file)
#include <unistd.h>
#endif
-// To enable Multiple threads making BLAS calls
+#ifndef BLIS_TEST_DSP
+// To enable Multiple threads making BLAS calls (for ARM only)
#define BLIS_ENABLE_MULTITHREAD_TEST
-
+#endif
//
#define INPUT_BUFFER_SIZE 256
#define MAX_FILENAME_LENGTH 1000
#define MAX_BINARY_NAME_LENGTH 256
-#define MAX_FUNC_STRING_LENGTH 26
+#define MAX_FUNC_STRING_LENGTH 37
#define FLOPS_PER_UNIT_PERF 1e9
#define MAX_NUM_MSTORAGE 4
double warnpass;
} thresh_t;
+typedef struct // Per-experiment performance record handed back by the operation test drivers in place of a bare double.
+{
+ double time; // Test drivers store time_min here; printed in the "seconds" column of the results.
+ unsigned long cycles; // NOTE(review): not read or written anywhere in this excerpt -- presumably a cycle count; confirm.
+ double gflops; // Flop count / time_min / FLOPS_PER_UNIT_PERF (1e9); printed in the "gflops" column.
+} perf_t;
//
// --- Prototypes --------------------------------------------------------------
char*, // pc_str (current param string)
char*, // sc_str (current storage string)
unsigned int, // p_cur (current problem size)
- double*, // perf
+ perf_t*, // perf
double* ) ); // residual
// --- Generate experiment string labels ---
// --- Miscellaneous ---
-void libblis_test_check_empty_problem( obj_t* c, double* perf, double* resid );
+void libblis_test_check_empty_problem( obj_t* c, perf_t* perf, double* resid );
//
index 487735c2c4cd04613c3818fa9581770ebfd76671..1b08a8370c63969b16c605d93a6ac94a7556138a 100644 (file)
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid );
void libblis_test_normfm_impl( iface_t iface,
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid )
{
unsigned int n_repeats = params->n_repeats;
}
// Estimate the performance of the best experiment repeat.
- *perf = ( 2.0 * m * n ) / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( x ) ) *perf *= 2.0;
+ perf->gflops = ( 2.0 * m * n ) / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( x ) ) perf->gflops *= 2.0;
+ perf->time = time_min;
// Perform checks.
libblis_test_normfm_check( &beta, &x, &norm, resid );
index 9b35b9ff48d17e919f1507dec52ec3a1769a7973..49fb8ee12154b61ae22f0929104f743ebe7ce6d9 100644 (file)
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid );
void libblis_test_normfv_impl( iface_t iface,
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid )
{
unsigned int n_repeats = params->n_repeats;
}
#ifdef BLIS_ENABLE_MULTITHREAD_TEST
// Estimate the performance of the best experiment repeat.
- *perf = ( 2.0 * m ) * test_way / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( x ) ) *perf *= 2.0;
+ perf->gflops = ( 2.0 * m ) * test_way / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( x ) ) perf->gflops *= 2.0;
#else
// Estimate the performance of the best experiment repeat.
- *perf = ( 2.0 * m ) / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( x ) ) *perf *= 2.0;
+ perf->gflops = ( 2.0 * m ) / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( x ) ) perf->gflops *= 2.0;
#endif
+ perf->time = time_min;
#ifdef BLIS_ENABLE_MULTITHREAD_TEST
// Check output of each thread, and send max residue to main
index cef126d4c5cafa86715ba58de01d3308c4aff23e..a5962cbef9f63e1f00fa4c60171bf22e279f5190 100644 (file)
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid );
void libblis_test_randm_impl( iface_t iface,
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid )
{
unsigned int n_repeats = params->n_repeats;
}
// Estimate the performance of the best experiment repeat.
- *perf = ( 2.0 * m * n ) / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( x ) ) *perf *= 2.0;
+ perf->gflops = ( 2.0 * m * n ) / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( x ) ) perf->gflops *= 2.0;
+ perf->time = time_min;
// Perform checks.
// For randm(), we don't return a meaningful residual/diff, since we can't
index b74100f8f35be52e03eed6219b20cb61cd3b6662..1990ff2da955004c3f551dcf5327ad84513b6509 100644 (file)
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid );
void libblis_test_randv_impl( iface_t iface,
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid )
{
unsigned int n_repeats = params->n_repeats;
}
// Estimate the performance of the best experiment repeat.
- *perf = ( 2.0 * m ) / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( x ) ) *perf *= 2.0;
+ perf->gflops = ( 2.0 * m ) / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( x ) ) perf->gflops *= 2.0;
+ perf->time = time_min;
// Perform checks.
// For randv(), we don't return a meaningful residual/diff, since we can't
index 2816c8a08eafabafbd6a3c62653de9ec48b062d7..63d830bc2b9ab67254e835d41f3eb6703ad8edd4 100644 (file)
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid );
void libblis_test_scal2m_impl( iface_t iface,
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid )
{
unsigned int n_repeats = params->n_repeats;
}
// Estimate the performance of the best experiment repeat.
- *perf = ( 2.0 * m * n ) / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( y ) ) *perf *= 4.0;
+ perf->gflops = ( 2.0 * m * n ) / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( y ) ) perf->gflops *= 4.0;
+ perf->time = time_min;
// Perform checks.
libblis_test_scal2m_check( &alpha, &x, &y, &y_save, resid );
index 184b50067a38922d79f8e2e94521e942214b844d..06c3a4310ed5da1ba36c4e305c84e3058fb128be 100644 (file)
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid );
void libblis_test_scal2v_impl( iface_t iface,
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid )
{
unsigned int n_repeats = params->n_repeats;
}
// Estimate the performance of the best experiment repeat.
- *perf = ( 2.0 * m ) / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( y ) ) *perf *= 4.0;
+ perf->gflops = ( 2.0 * m ) / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( y ) ) perf->gflops *= 4.0;
+ perf->time = time_min;
// Perform checks.
libblis_test_scal2v_check( &alpha, &x, &y, &y_save, resid );
index 1c08b879cd8cc2d04f0637c09c467183b9123732..d0ce2a184c328d3ae002ed05bb18cb5480819500 100644 (file)
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid );
void libblis_test_scalm_impl( iface_t iface,
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid )
{
unsigned int n_repeats = params->n_repeats;
}
// Estimate the performance of the best experiment repeat.
- *perf = ( 1.0 * m * n ) / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( y ) ) *perf *= 6.0;
+ perf->gflops = ( 1.0 * m * n ) / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( y ) ) perf->gflops *= 6.0;
+ perf->time = time_min;
// Perform checks.
libblis_test_scalm_check( &beta, &y, &y_save, resid );
index e4559e28072561bd67ac410dfc1e607dc2534390..e99c2b91a7bfe2029611ad26f815905349b97156 100644 (file)
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid );
void libblis_test_scalv_impl( iface_t iface,
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid )
{
unsigned int n_repeats = params->n_repeats;
bli_copyv( &y[i], &y_save[i] );
}
#else
- bli_randv( &y[i] );
- bli_copyv( &y[i], &y_save[i] );
+ bli_randv( &y );
+ bli_copyv( &y, &y_save );
#endif
}
#ifdef BLIS_ENABLE_MULTITHREAD_TEST
// Estimate the performance of the best experiment repeat.
- *perf = ( 1.0 * m ) *test_way/ time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( y[0] ) ) *perf *= 6.0;
+ perf->gflops = ( 1.0 * m ) *test_way/ time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( y[0] ) ) perf->gflops *= 6.0;
#else
// Estimate the performance of the best experiment repeat.
- *perf = ( 1.0 * m ) / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( y ) ) *perf *= 6.0;
+ perf->gflops = ( 1.0 * m ) / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( y ) ) perf->gflops *= 6.0;
#endif
+ perf->time = time_min;
#ifdef BLIS_ENABLE_MULTITHREAD_TEST
// Check output of each thread, and send max residue to main
index 316e8edadb91d502835400f09bd254cb5f108fb5..6a71f37b840efcd8495e045611be2ef474984e3c 100644 (file)
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid );
void libblis_test_setm_impl( iface_t iface,
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid )
{
unsigned int n_repeats = params->n_repeats;
}
// Estimate the performance of the best experiment repeat.
- *perf = ( 1.0 * m * n ) / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( x ) ) *perf *= 2.0;
+ perf->gflops = ( 1.0 * m * n ) / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( x ) ) perf->gflops *= 2.0;
+ perf->time = time_min;
// Perform checks.
libblis_test_setm_check( &beta, &x, resid );
index dbf7023e0d00e050e67a694214a90d9bb297e4b7..e75d9b2bd2dd0ed585fe0f7f450cfef7b81fe290 100644 (file)
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid );
void libblis_test_setv_impl( iface_t iface,
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid )
{
unsigned int n_repeats = params->n_repeats;
}
// Estimate the performance of the best experiment repeat.
- *perf = ( 1.0 * m ) / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( x ) ) *perf *= 2.0;
+ perf->gflops = ( 1.0 * m ) / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( x ) ) perf->gflops *= 2.0;
+ perf->time = time_min;
// Perform checks.
libblis_test_setv_check( &beta, &x, resid );
index eca95e0f493265141b9e515c3bb129a4a1fd0a18..b662bbf3ff8ba96d87b2ab6abc09b155cc3a9cab 100644 (file)
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid );
void libblis_test_subm_impl( iface_t iface,
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid )
{
double time_min = 1e9;
}
// Estimate the performance of the best experiment repeat.
- *perf = ( 1.0 * m * n ) / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( x ) ) *perf *= 2.0;
+ perf->gflops = ( 1.0 * m * n ) / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( x ) ) perf->gflops *= 2.0;
+ perf->time = time_min;
// Perform checks.
libblis_test_subm_check( &alpha, &beta, &x, &y, resid );
index 2b7f8c3ceae76985264e5c934d891378c64e1642..633e6e7cde402bd9ee07b6afa96bcd202cea34bd 100644 (file)
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid );
void libblis_test_subv_impl( iface_t iface,
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid )
{
double time_min = 1e9;
}
// Estimate the performance of the best experiment repeat.
- *perf = ( 2.0 * m ) / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( x ) ) *perf *= 2.0;
+ perf->gflops = ( 2.0 * m ) / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( x ) ) perf->gflops *= 2.0;
+ perf->time = time_min;
// Perform checks.
libblis_test_subv_check( &alpha, &beta, &x, &y, resid );
index 165e64e076f1950417cb84cb8c39079155108f09..04e6dc7f31988fdcdce11eac298a3f8f1ac5ecd7 100644 (file)
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid );
void libblis_test_symm_impl( iface_t iface,
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid )
{
unsigned int n_repeats = params->n_repeats;
// Estimate the performance of the best experiment repeat.
#ifdef BLIS_ENABLE_MULTITHREAD_TEST
- *perf = ( 2.0 * mn_side * m * n ) *test_way / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( c[0] ) ) *perf *= 4.0;
+ perf->gflops = ( 2.0 * mn_side * m * n ) *test_way / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( c[0] ) ) perf->gflops *= 4.0;
#else
- *perf = ( 2.0 * mn_side * m * n ) / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( c ) ) *perf *= 4.0;
+ perf->gflops = ( 2.0 * mn_side * m * n ) / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( c ) ) perf->gflops *= 4.0;
#endif
+ perf->time = time_min;
+
// Perform checks.
#ifdef BLIS_ENABLE_MULTITHREAD_TEST
// Check output of each thread, and send max residue to main
index 89ec05d156876a959e39229c54152d2e4ace1e13..b2798a75a9f1f32aa57861b20b43cd49edd5115b 100644 (file)
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid );
void libblis_test_symv_impl( iface_t iface,
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid )
{
unsigned int n_repeats = params->n_repeats;
}
#ifdef BLIS_ENABLE_MULTITHREAD_TEST
// Estimate the performance of the best experiment repeat.
- *perf = ( 1.0 * m * m ) * test_way / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( y[0] ) ) *perf *= 4.0;
+ perf->gflops = ( 1.0 * m * m ) * test_way / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( y[0] ) ) perf->gflops *= 4.0;
#else
// Estimate the performance of the best experiment repeat.
- *perf = ( 1.0 * m * m ) / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( y ) ) *perf *= 4.0;
+ perf->gflops = ( 1.0 * m * m ) / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( y ) ) perf->gflops *= 4.0;
#endif
+ perf->time = time_min;
#ifdef BLIS_ENABLE_MULTITHREAD_TEST
// Check output of each thread, and send max residue to main
index 27b4c09ffe3188dc5d6a7024b3c0ab4de6694e3e..782dada3a3673ae91f8b53a082a059bade3ec355 100644 (file)
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid );
void libblis_test_syr_impl( iface_t iface,
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid )
{
unsigned int n_repeats = params->n_repeats;
}
#ifdef BLIS_ENABLE_MULTITHREAD_TEST
// Estimate the performance of the best experiment repeat.
- *perf = ( 1.0 * m * m ) * test_way / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( a[0] ) ) *perf *= 4.0;
+ perf->gflops = ( 1.0 * m * m ) * test_way / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( a[0] ) ) perf->gflops *= 4.0;
#else
// Estimate the performance of the best experiment repeat.
- *perf = ( 1.0 * m * m ) / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( a ) ) *perf *= 4.0;
+ perf->gflops = ( 1.0 * m * m ) / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( a ) ) perf->gflops *= 4.0;
#endif
+ perf->time = time_min;
#ifdef BLIS_ENABLE_MULTITHREAD_TEST
// Check output of each thread, and send max residue to main
index 5488bb42c4a80725c45760eb93671ff40c565979..45c2c79562922a3f4611e7572e3a701ed0f0980f 100644 (file)
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid );
void libblis_test_syr2_impl( iface_t iface,
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid )
{
unsigned int n_repeats = params->n_repeats;
}
#ifdef BLIS_ENABLE_MULTITHREAD_TEST
// Estimate the performance of the best experiment repeat.
- *perf = ( 2.0 * m * m ) * test_way / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( a[0] ) ) *perf *= 4.0;
+ perf->gflops = ( 2.0 * m * m ) * test_way / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( a[0] ) ) perf->gflops *= 4.0;
#else
// Estimate the performance of the best experiment repeat.
- *perf = ( 2.0 * m * m ) / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( a ) ) *perf *= 4.0;
+ perf->gflops = ( 2.0 * m * m ) / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( a ) ) perf->gflops *= 4.0;
#endif
+ perf->time = time_min;
#ifdef BLIS_ENABLE_MULTITHREAD_TEST
// Check output of each thread, and send max residue to main
index d590031e940bf4ebd9f0f9d92d74168182530a48..c0a0201e6270fb7c4aef63aaec8a73421e39ef33 100644 (file)
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid );
void libblis_test_syr2k_impl( iface_t iface,
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid )
{
unsigned int n_repeats = params->n_repeats;
// Estimate the performance of the best experiment repeat.
#ifdef BLIS_ENABLE_MULTITHREAD_TEST
- *perf = ( 2.0 * m * m * k ) * test_way / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( c[0] ) ) *perf *= 4.0;
+ perf->gflops = ( 2.0 * m * m * k ) * test_way / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( c[0] ) ) perf->gflops *= 4.0;
#else
- *perf = ( 2.0 * m * m * k ) / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( c ) ) *perf *= 4.0;
+ perf->gflops = ( 2.0 * m * m * k ) / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( c ) ) perf->gflops *= 4.0;
#endif
+ perf->time = time_min;
+
#ifdef BLIS_ENABLE_MULTITHREAD_TEST
// Check output of each thread, and send max residue to main
for(i = 0; i < test_way; i++)
index 8e072ce95cb597679ddad37f2e0efe60c1af0ba4..64473fd87d028536b75e358dc12b2ce44d3db1c0 100644 (file)
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid );
void libblis_test_syrk_impl( iface_t iface,
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid )
{
unsigned int n_repeats = params->n_repeats;
#ifdef BLIS_ENABLE_MULTITHREAD_TEST
// Estimate the performance of the best experiment repeat.
- *perf = ( 1.0 * m * m * k ) * test_way / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( c[0] ) ) *perf *= 4.0;
+ perf->gflops = ( 1.0 * m * m * k ) * test_way / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( c[0] ) ) perf->gflops *= 4.0;
#else
// Estimate the performance of the best experiment repeat.
- *perf = ( 1.0 * m * m * k ) / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( c ) ) *perf *= 4.0;
+ perf->gflops = ( 1.0 * m * m * k ) / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( c ) ) perf->gflops *= 4.0;
#endif
+ perf->time = time_min;
// Perform checks.
#ifdef BLIS_ENABLE_MULTITHREAD_TEST
index 8e59ac8ea10de6936f30c6ebf51596d5e29916cb..1c294e1ef8b18a018437f5f6cb57bb7a83ab714b 100644 (file)
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid );
void libblis_test_trmm_impl( iface_t iface,
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid )
{
unsigned int n_repeats = params->n_repeats;
}
#ifdef BLIS_ENABLE_MULTITHREAD_TEST
// Estimate the performance of the best experiment repeat.
- *perf = ( 1.0 * mn_side * m * n ) * test_way / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( b[0] ) ) *perf *= 4.0;
+ perf->gflops = ( 1.0 * mn_side * m * n ) * test_way / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( b[0] ) ) perf->gflops *= 4.0;
#else
// Estimate the performance of the best experiment repeat.
- *perf = ( 1.0 * mn_side * m * n ) / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( b ) ) *perf *= 4.0;
+ perf->gflops = ( 1.0 * mn_side * m * n ) / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( b ) ) perf->gflops *= 4.0;
#endif
+ perf->time = time_min;
+
#ifdef BLIS_ENABLE_MULTITHREAD_TEST
// Check output of each thread, and send max residue to main
for(i = 0; i < test_way; i++)
index a36069b5497c27a0554119feb2e6ff2a1c58d7ff..446ebcf5492ed663c05b1f6fc4d76b8233801fb8 100644 (file)
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid );
void libblis_test_trmm3_impl( iface_t iface,
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid )
{
unsigned int n_repeats = params->n_repeats;
}
// Estimate the performance of the best experiment repeat.
- *perf = ( 1.0 * mn_side * m * n ) / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( c ) ) *perf *= 4.0;
+ perf->gflops = ( 1.0 * mn_side * m * n ) / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( c ) ) perf->gflops *= 4.0;
+ perf->time = time_min;
// Perform checks.
libblis_test_trmm3_check( side, &alpha, &a, &b, &beta, &c, &c_save, resid );
index 4147a16814b4de5c4821d923a63cf9fb09fbcaac..b8fb0f07a65806db4faa095384381ceae144222a 100644 (file)
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid );
void libblis_test_trmv_impl( iface_t iface,
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid )
{
unsigned int n_repeats = params->n_repeats;
}
#ifdef BLIS_ENABLE_MULTITHREAD_TEST
// Estimate the performance of the best experiment repeat.
- *perf = ( 1.0 * m * m ) * test_way / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( x[0] ) ) *perf *= 4.0;
+ perf->gflops = ( 1.0 * m * m ) * test_way / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( x[0] ) ) perf->gflops *= 4.0;
#else
// Estimate the performance of the best experiment repeat.
- *perf = ( 1.0 * m * m ) / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( x ) ) *perf *= 4.0;
+ perf->gflops = ( 1.0 * m * m ) / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( x ) ) perf->gflops *= 4.0;
#endif
+ perf->time = time_min;
#ifdef BLIS_ENABLE_MULTITHREAD_TEST
// Check output of each thread, and send max residue to main
index 75cb9cc8d009aef38bee7be55e1a69893c013838..9658563a32e3863637c22d0dbc6de796f9960664 100644 (file)
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid );
void libblis_test_trsm_impl( iface_t iface,
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid )
{
unsigned int n_repeats = params->n_repeats;
}
#ifdef BLIS_ENABLE_MULTITHREAD_TEST
// Estimate the performance of the best experiment repeat.
- *perf = ( 1.0 * mn_side * m * n ) * test_way / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( b[0] ) ) *perf *= 4.0;
+ perf->gflops = ( 1.0 * mn_side * m * n ) * test_way / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( b[0] ) ) perf->gflops *= 4.0;
#else
// Estimate the performance of the best experiment repeat.
- *perf = ( 1.0 * mn_side * m * n ) / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( b ) ) *perf *= 4.0;
+ perf->gflops = ( 1.0 * mn_side * m * n ) / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( b ) ) perf->gflops *= 4.0;
#endif
+ perf->time = time_min;
+
#ifdef BLIS_ENABLE_MULTITHREAD_TEST
// Check output of each thread, and send max residue to main
for(i = 0; i < test_way; i++)
index 8d12d4435ff97f10ead01a56f449f17bd9b24dca..0740bfc9e7e066f72e6c17023ceeac1c1921d2d9 100644 (file)
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid );
void libblis_test_trsm_ukr_impl( iface_t iface,
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid )
{
unsigned int n_repeats = params->n_repeats;
}
// Estimate the performance of the best experiment repeat.
- *perf = ( 1.0 * m * m * n ) / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( b ) ) *perf *= 4.0;
+ perf->gflops = ( 1.0 * m * m * n ) / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( b ) ) perf->gflops *= 4.0;
+ perf->time = time_min;
// Perform checks.
libblis_test_trsm_ukr_check( side, &a, &c, &b, resid );
index ccd1a792e341190a068a60a7460aea220bffbdf2..8f0cb335d3cd798c8f589bdbffed7ea3d23dcf95 100644 (file)
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid );
void libblis_test_trsv_impl( iface_t iface,
char* pc_str,
char* sc_str,
unsigned int p_cur,
- double* perf,
+ perf_t* perf,
double* resid )
{
unsigned int n_repeats = params->n_repeats;
}
#ifdef BLIS_ENABLE_MULTITHREAD_TEST
// Estimate the performance of the best experiment repeat.
- *perf = ( 1.0 * m * m ) * test_way / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( x[0] ) ) *perf *= 4.0;
+ perf->gflops = ( 1.0 * m * m ) * test_way / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( x[0] ) ) perf->gflops *= 4.0;
#else
// Estimate the performance of the best experiment repeat.
- *perf = ( 1.0 * m * m ) / time_min / FLOPS_PER_UNIT_PERF;
- if ( bli_obj_is_complex( x ) ) *perf *= 4.0;
+ perf->gflops = ( 1.0 * m * m ) / time_min / FLOPS_PER_UNIT_PERF;
+ if ( bli_obj_is_complex( x ) ) perf->gflops *= 4.0;
#endif
+ perf->time = time_min;
#ifdef BLIS_ENABLE_MULTITHREAD_TEST
// Check output of each thread, and send max residue to main
libblis_test_trsv_check( &alpha, &a, &x, &x_save, resid );
// Zero out performance and residual if output vector is empty.
- libblis_test_check_empty_problem( &y, perf, resid );
+ libblis_test_check_empty_problem( &x, perf, resid );
#endif
// Free the test objects.
diff --git a/blis/version b/blis/version
index 4e632fe6172cd89b557687a532f9fde2f8241660..e62876a936f315312b542b0dff7047cc67c7e8e1 100644 (file)
--- a/blis/version
+++ b/blis/version
-DEV.LINALG.01.00.00.01
+DEV.LINALG.01.02.00.00-6
diff --git a/blis/windows/Makefile b/blis/windows/Makefile
index 6e8c1e0f0912dc5988d9952214a211ead8aafb36..b5c211c9938f166efa9df7c03bbf4334319c365c 100644 (file)
--- a/blis/windows/Makefile
+++ b/blis/windows/Makefile
-#
-#
-# BLIS
-# An object-based framework for developing high-performance BLAS-like
-# libraries.
-#
-# Copyright (C) 2014, The University of Texas at Austin
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are
-# met:
-# - Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-# - Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in the
-# documentation and/or other materials provided with the distribution.
-# - Neither the name of The University of Texas at Austin nor the names
-# of its contributors may be used to endorse or promote products
-# derived from this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#
-
-
-
-#
-# --- Include variables determined at configure-time --------------------------
-#
-CONFIGURE_DEFS = config\config.mk
-
-!if exist ( $(CONFIGURE_DEFS) )
-!include $(CONFIGURE_DEFS)
-!else
-!error nmake: $(CONFIGURE_DEFS) does not exist! Run configure.cmd first.
-!endif
-
-
-
-#
-# --- Include environment- and build-specific definitions ----------------------
-#
-
-MAKE_DEFS = build\defs.mk
-
-# Include build definitions
-!if exist ( $(MAKE_DEFS) )
-!include $(MAKE_DEFS)
-!else
-!error nmake: $(MAKE_DEFS) does not exist! Your libblis distribution may be incomplete.
-!endif
-
-
-
-#
-# --- Variable modifications ---------------------------------------------------
-#
-
-
-
-#
-# --- High-level rules ---------------------------------------------------------
-#
-
-all: libblis
-
-libblis: libblis-lib
-
-libblis-objs: $(BLIS_OBJS)
-
-libblis-lib: $(LIB_LIBBLIS_DIRPATH)\$(LIBBLIS_LIB)
-
-libblis-dll: $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS_DLL)
-
-lib: libblis-lib
-
-dll: libblis-dll
-
-install: install-lib install-headers
-
-install-lib: $(INSTALL_PREFIX_LIB)\$(LIBBLIS).lib
-
-install-dll: $(INSTALL_PREFIX_DLL)\$(LIBBLIS).dll \
- $(INSTALL_PREFIX_DLL)\$(LIBBLIS).lib \
- $(INSTALL_PREFIX_DLL)\$(LIBBLIS).exp
-
-install-headers: $(INSTALL_PREFIX_INC)\$(BLIS_H)
-
-clean: clean-build clean-log
-
-distclean: clean-config clean-build clean-log
-
-
-
-#
-# --- Source code (inference) rules --------------------------------------------
-#
-
-# --- C source files in flamec directory ---
-{$(SRC_BLI_DIRPATH)}.c{$(OBJ_BLI_DIRPATH)}.obj:
-!ifdef VERBOSE
- if not exist $(OBJ_BLI_DIRPATH) \
- ( $(MKDIR) $(OBJ_BLI_DIRPATH) )
- $(CC) $(CFLAGS) /c $< /Fo$@
-!else
- @if not exist $(OBJ_BLI_DIRPATH) \
- ( ( $(ECHO) nmake: Creating $(OBJ_BLI_DIRPATH) directory ) & \
- ( $(MKDIR) $(OBJ_BLI_DIRPATH) ) )
- @$(ECHO) nmake: Compiling $<
- @$(CC) $(CFLAGS) /c $< /Fo$@ >> $(CC_LOG_FILE)
-!endif
-
-
-
-#
-# --- Library generation rules -------------------------------------------------
-#
-
-# --- Static library ---
-$(LIB_LIBBLIS_DIRPATH)\$(LIBBLIS_LIB): libblis-objs
-!ifdef VERBOSE
- if not exist $(LIB_LIBBLIS_DIRPATH) \
- ( $(MKDIR) $(LIB_LIBBLIS_DIRPATH) )
- $(COPY) $(OBJ_BLI_DIRPATH)\*.obj $(LIB_LIBBLIS_DIRPATH)
- $(CD) $(LIB_LIBBLIS_DIRPATH)
- $(LIB) $(LIB_OPTIONS) $(LIB_BLI_OUTPUT_ARG) $(LIB_BLI_INPUT_ARGS)
- $(DEL) *.obj
- $(CD) $(TOP_BUILD_DIR_ABS)
-!else
- @if not exist $(LIB_LIBBLIS_DIRPATH) \
- ( ( $(ECHO) nmake: Creating $(LIB_LIBBLIS_DIRPATH) directory ) & \
- ( $(MKDIR) $(LIB_LIBBLIS_DIRPATH) ) )
- @$(ECHO) nmake: Creating static library $@
- @$(COPY) $(OBJ_BLI_DIRPATH)\*.obj $(LIB_LIBBLIS_DIRPATH) >> $(COPY_LOG_FILE)
- @$(CD) $(LIB_LIBBLIS_DIRPATH)
- @$(LIB) /VERBOSE $(LIB_OPTIONS) $(LIB_BLI_OUTPUT_ARG) $(LIB_BLI_INPUT_ARGS)
- @$(DEL) *.obj
- @$(CD) $(TOP_BUILD_DIR_ABS)
-!endif
-
-# --- Dynamic library (object code file, import library, and export file) ---
-$(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS_DLL): libblis-objs
-!ifdef VERBOSE
- if not exist $(DLL_LIBBLIS_DIRPATH) \
- ( $(MKDIR) $(DLL_LIBBLIS_DIRPATH) )
- $(COPY) $(OBJ_BLI_DIRPATH)\*.obj $(DLL_LIBBLIS_DIRPATH) >> $(COPY_LOG_FILE)
- $(CD) $(DLL_LIBBLIS_DIRPATH)
- $(DIR) /B *.obj > $(OBJ_LIST_FILE)
- $(GENDLL) $(LIBBLIS) $(LIBBLIS) $(CC) $(LINKARGS_FILEPATH) $(SYM_DEF_FILEPATH) /objlist $(OBJ_LIST_FILE)
- $(DEL) $(OBJ_LIST_FILE)
- $(DEL) *.obj
- $(CD) $(TOP_BUILD_DIR_ABS)
-!else
- @if not exist $(DLL_LIBBLIS_DIRPATH) \
- ( ( $(ECHO) nmake: Creating $(DLL_LIBBLIS_DIRPATH) directory ) & \
- ( $(MKDIR) $(DLL_LIBBLIS_DIRPATH) ) )
- @$(ECHO) nmake: Creating dynamic library $@
- @$(COPY) $(OBJ_BLI_DIRPATH)\*.obj $(DLL_LIBBLIS_DIRPATH) >> $(COPY_LOG_FILE)
- @$(CD) $(DLL_LIBBLIS_DIRPATH)
- @$(DIR) /B *.obj > $(OBJ_LIST_FILE)
- @$(GENDLL) $(LIBBLIS) $(LIBBLIS) $(CC) $(LINKARGS_FILEPATH) $(SYM_DEF_FILEPATH) /objlist $(OBJ_LIST_FILE)
- @$(DEL) $(OBJ_LIST_FILE)
- @$(DEL) *.obj
- @$(CD) $(TOP_BUILD_DIR_ABS)
-!endif
-
-
-
-#
-# --- Install rules ------------------------------------------------------------
-#
-
-# --- Header files ---
-$(INSTALL_PREFIX_INC)\$(BLIS_H): $(INC_BLI_DIRPATH)\$(BLIS_H) \
- $(BUILD_DIRNAME)\$(BLI_CONFIG_H)
-!ifdef VERBOSE
- if not exist $(INSTALL_PREFIX_INC) \
- ( $(MKDIR) $(INSTALL_PREFIX_INC) )
- $(COPY) $(BUILD_DIRNAME)\$(BLI_CONFIG_H) $(INSTALL_PREFIX_INC) >> $(COPY_LOG_FILE)
- $(COPY) $(INC_BLI_DIRPATH)\*.h $(INSTALL_PREFIX_INC) >> $(COPY_LOG_FILE)
-!else
- @if not exist $(INSTALL_PREFIX_INC) \
- ( $(MKDIR) $(INSTALL_PREFIX_INC) )
- @$(ECHO) nmake: Installing libblis header files to $(INSTALL_PREFIX_INC)
- @$(COPY) $(BUILD_DIRNAME)\$(BLI_CONFIG_H) $(INSTALL_PREFIX_INC) >> $(COPY_LOG_FILE)
- @$(COPY) $(INC_BLI_DIRPATH)\*.h $(INSTALL_PREFIX_INC) >> $(COPY_LOG_FILE)
-!endif
-
-# --- Static library ---
-$(INSTALL_PREFIX_LIB)\$(LIBBLIS).lib: $(LIB_LIBBLIS_DIRPATH)\$(LIBBLIS).lib
-!ifdef VERBOSE
- if not exist $(INSTALL_PREFIX_LIB) ( $(MKDIR) $(INSTALL_PREFIX_LIB) )
- if exist $(LIB_LIBBLIS_DIRPATH)\$(LIBBLIS).lib \
- ( $(COPY) $(LIB_LIBBLIS_DIRPATH)\$(LIBBLIS).lib $(INSTALL_PREFIX_LIB) >> $(COPY_LOG_FILE) )
-!else
- @if not exist $(INSTALL_PREFIX_LIB) ( $(MKDIR) $(INSTALL_PREFIX_LIB) )
- @if exist $(LIB_LIBBLIS_DIRPATH)\$(LIBBLIS).lib \
- ( ( $(ECHO) nmake: Installing $(LIB_LIBBLIS_DIRPATH)\$(LIBBLIS).lib to $(INSTALL_PREFIX_LIB) ) & \
- ( $(COPY) $(LIB_LIBBLIS_DIRPATH)\$(LIBBLIS).lib $(INSTALL_PREFIX_LIB) >> $(COPY_LOG_FILE) ) )
-!endif
-
-# --- Dynamic library (object code) ---
-$(INSTALL_PREFIX_DLL)\$(LIBBLIS).dll: $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).dll
-!ifdef VERBOSE
- if not exist $(INSTALL_PREFIX_DLL) ( $(MKDIR) $(INSTALL_PREFIX_DLL) )
- if exist $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).dll \
- ( $(COPY) $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).dll $(INSTALL_PREFIX_DLL) >> $(COPY_LOG_FILE) )
-!else
- @if not exist $(INSTALL_PREFIX_DLL) ( $(MKDIR) $(INSTALL_PREFIX_DLL) )
- @if exist $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).dll \
- ( ( $(ECHO) nmake: Installing $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).dll to $(INSTALL_PREFIX_DLL) ) & \
- ( $(COPY) $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).dll $(INSTALL_PREFIX_DLL) >> $(COPY_LOG_FILE) ) )
-!endif
-
-# --- Dynamic library (import library) ---
-$(INSTALL_PREFIX_DLL)\$(LIBBLIS).lib: $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).lib
-!ifdef VERBOSE
- if not exist $(INSTALL_PREFIX_DLL) ( $(MKDIR) $(INSTALL_PREFIX_DLL) )
- if exist $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).lib \
- ( $(COPY) $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).lib $(INSTALL_PREFIX_DLL) >> $(COPY_LOG_FILE) )
-!else
- @if not exist $(INSTALL_PREFIX_DLL) ( $(MKDIR) $(INSTALL_PREFIX_DLL) )
- @if exist $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).lib \
- ( ( $(ECHO) nmake: Installing $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).lib to $(INSTALL_PREFIX_DLL) ) & \
- ( $(COPY) $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).lib $(INSTALL_PREFIX_DLL) >> $(COPY_LOG_FILE) ) )
-!endif
-
-# --- Dynamic library (export file) ---
-$(INSTALL_PREFIX_DLL)\$(LIBBLIS).exp: $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).exp
-!ifdef VERBOSE
- if not exist $(INSTALL_PREFIX_DLL) ( $(MKDIR) $(INSTALL_PREFIX_DLL) )
- if exist $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).exp \
- ( $(COPY) $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).exp $(INSTALL_PREFIX_DLL) >> $(COPY_LOG_FILE) )
-!else
- @if not exist $(INSTALL_PREFIX_DLL) ( $(MKDIR) $(INSTALL_PREFIX_DLL) )
- @if exist $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).exp \
- ( ( $(ECHO) nmake: Installing $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).exp to $(INSTALL_PREFIX_DLL) ) & \
- ( $(COPY) $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).exp $(INSTALL_PREFIX_DLL) >> $(COPY_LOG_FILE) ) )
-!endif
-
-
-
-#
-# --- Clean rules --------------------------------------------------------------
-#
-
-clean-log:
-!ifdef VERBOSE
- if exist $(CC_LOG_FILE) \
- ( $(DEL) $(CC_LOG_FILE) )
- if exist $(FC_LOG_FILE) \
- ( $(DEL) $(FC_LOG_FILE) )
- if exist $(COPY_LOG_FILE) \
- ( $(DEL) $(COPY_LOG_FILE) )
-!else
- @if exist $(CC_LOG_FILE) \
- ( ( $(ECHO) nmake: Deleting $(CC_LOG_FILE) ) & \
- ( $(DEL) $(CC_LOG_FILE) ) )
- @if exist $(FC_LOG_FILE) \
- ( ( $(ECHO) nmake: Deleting $(FC_LOG_FILE) ) & \
- ( $(DEL) $(FC_LOG_FILE) ) )
- @if exist $(COPY_LOG_FILE) \
- ( ( $(ECHO) nmake: Deleting $(COPY_LOG_FILE) ) & \
- ( $(DEL) $(COPY_LOG_FILE) ) )
-!endif
-
-clean-config:
-!ifdef VERBOSE
- if exist $(CNF_DIRNAME) \
- ( $(RMDIR) $(CNF_DIRNAME) )
- if exist $(INC_DIRNAME) \
- ( $(RMDIR) $(INC_DIRNAME) )
- if exist $(SRC_DIRNAME) \
- ( $(RMDIR) $(SRC_DIRNAME) )
-!else
- @if exist $(CNF_DIRNAME) \
- ( ( $(ECHO) nmake: Deleting $(CNF_DIRNAME) directory ) & \
- ( $(RMDIR) $(CNF_DIRNAME) ) )
- @if exist $(INC_DIRNAME) \
- ( ( $(ECHO) nmake: Deleting $(INC_DIRNAME) directory ) & \
- ( $(RMDIR) $(INC_DIRNAME) ) )
- @if exist $(SRC_DIRNAME) \
- ( ( $(ECHO) nmake: Deleting $(SRC_DIRNAME) directory ) & \
- ( $(RMDIR) $(SRC_DIRNAME) ) )
-!endif
-
-clean-build:
-!ifdef VERBOSE
- if exist $(OBJ_DIRNAME) \
- ( $(RMDIR) $(OBJ_DIRNAME) )
- if exist $(LIB_DIRNAME) \
- ( $(RMDIR) $(LIB_DIRNAME) )
- if exist $(DLL_DIRNAME) \
- ( $(RMDIR) $(DLL_DIRNAME) )
-!else
- @if exist $(OBJ_DIRNAME) \
- ( ( $(ECHO) nmake: Deleting $(OBJ_DIRNAME) directory ) & \
- ( $(RMDIR) $(OBJ_DIRNAME) ) )
- @if exist $(LIB_DIRNAME) \
- ( ( $(ECHO) nmake: Deleting $(LIB_DIRNAME) directory ) & \
- ( $(RMDIR) $(LIB_DIRNAME) ) )
- @if exist $(DLL_DIRNAME) \
- ( ( $(ECHO) nmake: Deleting $(DLL_DIRNAME) directory ) & \
- ( $(RMDIR) $(DLL_DIRNAME) ) )
-!endif
-
-# Useful for developing when all we want to do is remove the library products.
-clean-lib:
-!ifdef VERBOSE
- if exist $(LIB_DIRNAME) \
- ( $(RMDIR) $(LIB_DIRNAME) )
- if exist $(DLL_DIRNAME) \
- ( $(RMDIR) $(DLL_DIRNAME) )
-!else
- @if exist $(LIB_DIRNAME) \
- ( ( $(ECHO) nmake: Deleting $(LIB_DIRNAME) directory ) & \
- ( $(RMDIR) $(LIB_DIRNAME) ) )
- @if exist $(DLL_DIRNAME) \
- ( ( $(ECHO) nmake: Deleting $(DLL_DIRNAME) directory ) & \
- ( $(RMDIR) $(DLL_DIRNAME) ) )
-!endif
-
-
-
-#
-# --- Help target --------------------------------------------------------------
-#
-
-help:
- @$(NMAKE_HELP)
-
+#\r
+#\r
+# BLIS \r
+# An object-based framework for developing high-performance BLAS-like\r
+# libraries.\r
+#\r
+# Copyright (C) 2014, The University of Texas at Austin\r
+#\r
+# Redistribution and use in source and binary forms, with or without\r
+# modification, are permitted provided that the following conditions are\r
+# met:\r
+# - Redistributions of source code must retain the above copyright\r
+# notice, this list of conditions and the following disclaimer.\r
+# - Redistributions in binary form must reproduce the above copyright\r
+# notice, this list of conditions and the following disclaimer in the\r
+# documentation and/or other materials provided with the distribution.\r
+# - Neither the name of The University of Texas at Austin nor the names\r
+# of its contributors may be used to endorse or promote products\r
+# derived from this software without specific prior written permission.\r
+#\r
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\r
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\r
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\r
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT\r
+# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\r
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\r
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\r
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\r
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\r
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\r
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
+#\r
+#\r
+\r
+\r
+\r
+#\r
+# --- Include variables determined at configure-time --------------------------\r
+#\r
+CONFIGURE_DEFS = config\config.mk\r
+\r
+!if exist ( $(CONFIGURE_DEFS) )\r
+!include $(CONFIGURE_DEFS)\r
+!else\r
+!error nmake: $(CONFIGURE_DEFS) does not exist! Run configure.cmd first.\r
+!endif\r
+\r
+\r
+\r
+#\r
+# --- Include environment- and build-specific definitions ----------------------\r
+#\r
+\r
+MAKE_DEFS = build\defs.mk\r
+\r
+# Include build definitions\r
+!if exist ( $(MAKE_DEFS) )\r
+!include $(MAKE_DEFS)\r
+!else\r
+!error nmake: $(MAKE_DEFS) does not exist! Your libblis distribution may be incomplete.\r
+!endif\r
+\r
+\r
+\r
+#\r
+# --- Variable modifications ---------------------------------------------------\r
+#\r
+\r
+\r
+\r
+#\r
+# --- High-level rules ---------------------------------------------------------\r
+#\r
+\r
+all: libblis\r
+\r
+libblis: libblis-lib\r
+\r
+libblis-objs: $(BLIS_OBJS)\r
+\r
+libblis-lib: $(LIB_LIBBLIS_DIRPATH)\$(LIBBLIS_LIB)\r
+\r
+libblis-dll: $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS_DLL)\r
+\r
+lib: libblis-lib\r
+\r
+dll: libblis-dll\r
+\r
+install: install-lib install-headers\r
+\r
+install-lib: $(INSTALL_PREFIX_LIB)\$(LIBBLIS).lib\r
+\r
+install-dll: $(INSTALL_PREFIX_DLL)\$(LIBBLIS).dll \\r
+ $(INSTALL_PREFIX_DLL)\$(LIBBLIS).lib \\r
+ $(INSTALL_PREFIX_DLL)\$(LIBBLIS).exp\r
+\r
+install-headers: $(INSTALL_PREFIX_INC)\$(BLIS_H)\r
+\r
+clean: clean-build clean-log\r
+\r
+distclean: clean-config clean-build clean-log\r
+\r
+\r
+\r
+#\r
+# --- Source code (inference) rules --------------------------------------------\r
+#\r
+\r
+# --- C source files in the BLIS source directory ---\r
+# Inference rule: build a .obj in $(OBJ_BLI_DIRPATH) from the .c file of the\r
+# same base name in $(SRC_BLI_DIRPATH). The object directory is created first\r
+# if it does not exist.\r
+# With VERBOSE defined, the commands are echoed and the compiler writes to the\r
+# console. Otherwise the commands are silenced with '@', a short progress\r
+# message is printed, and the compiler's standard output is appended to\r
+# $(CC_LOG_FILE).\r
+{$(SRC_BLI_DIRPATH)}.c{$(OBJ_BLI_DIRPATH)}.obj:\r
+!ifdef VERBOSE\r
+ if not exist $(OBJ_BLI_DIRPATH) \\r
+ ( $(MKDIR) $(OBJ_BLI_DIRPATH) )\r
+ $(CC) $(CFLAGS) /c $< /Fo$@\r
+!else\r
+ @if not exist $(OBJ_BLI_DIRPATH) \\r
+ ( ( $(ECHO) nmake: Creating $(OBJ_BLI_DIRPATH) directory ) & \\r
+ ( $(MKDIR) $(OBJ_BLI_DIRPATH) ) )\r
+ @$(ECHO) nmake: Compiling $<\r
+ @$(CC) $(CFLAGS) /c $< /Fo$@ >> $(CC_LOG_FILE)\r
+!endif\r
+\r
+\r
+\r
+#\r
+# --- Library generation rules -------------------------------------------------\r
+#\r
+\r
+# --- Static library ---\r
+# Builds the static library from the compiled objects. Sequence (identical in\r
+# both branches apart from echoing/logging):\r
+#   1. create $(LIB_LIBBLIS_DIRPATH) if it does not exist;\r
+#   2. copy every .obj from $(OBJ_BLI_DIRPATH) into it;\r
+#   3. change into that directory and run the librarian $(LIB);\r
+#   4. delete the copied .obj files and change back to $(TOP_BUILD_DIR_ABS).\r
+# The recipe depends on the $(CD) lines affecting the command lines that\r
+# follow them, so the statement order must be preserved.\r
+# NOTE(review): the non-VERBOSE branch passes /VERBOSE to $(LIB) and does not\r
+# redirect its output, while the VERBOSE branch does not pass it -- confirm\r
+# this is intentional.\r
+# NOTE(review): the VERBOSE branch does not redirect the $(COPY) output to\r
+# $(COPY_LOG_FILE), unlike the VERBOSE branch of the dynamic library rule.\r
+$(LIB_LIBBLIS_DIRPATH)\$(LIBBLIS_LIB): libblis-objs\r
+!ifdef VERBOSE\r
+ if not exist $(LIB_LIBBLIS_DIRPATH) \\r
+ ( $(MKDIR) $(LIB_LIBBLIS_DIRPATH) )\r
+ $(COPY) $(OBJ_BLI_DIRPATH)\*.obj $(LIB_LIBBLIS_DIRPATH)\r
+ $(CD) $(LIB_LIBBLIS_DIRPATH)\r
+ $(LIB) $(LIB_OPTIONS) $(LIB_BLI_OUTPUT_ARG) $(LIB_BLI_INPUT_ARGS)\r
+ $(DEL) *.obj\r
+ $(CD) $(TOP_BUILD_DIR_ABS)\r
+!else\r
+ @if not exist $(LIB_LIBBLIS_DIRPATH) \\r
+ ( ( $(ECHO) nmake: Creating $(LIB_LIBBLIS_DIRPATH) directory ) & \\r
+ ( $(MKDIR) $(LIB_LIBBLIS_DIRPATH) ) )\r
+ @$(ECHO) nmake: Creating static library $@\r
+ @$(COPY) $(OBJ_BLI_DIRPATH)\*.obj $(LIB_LIBBLIS_DIRPATH) >> $(COPY_LOG_FILE)\r
+ @$(CD) $(LIB_LIBBLIS_DIRPATH)\r
+ @$(LIB) /VERBOSE $(LIB_OPTIONS) $(LIB_BLI_OUTPUT_ARG) $(LIB_BLI_INPUT_ARGS)\r
+ @$(DEL) *.obj\r
+ @$(CD) $(TOP_BUILD_DIR_ABS)\r
+!endif\r
+\r
+# --- Dynamic library (object code file, import library, and export file) ---\r
+# Builds the DLL from the compiled objects. Sequence (identical in both\r
+# branches apart from echoing):\r
+#   1. create $(DLL_LIBBLIS_DIRPATH) if it does not exist;\r
+#   2. copy every .obj from $(OBJ_BLI_DIRPATH) into it (copy output is\r
+#      appended to $(COPY_LOG_FILE));\r
+#   3. change into that directory and write the bare object file names\r
+#      (dir /B) to $(OBJ_LIST_FILE);\r
+#   4. run the $(GENDLL) helper script, passing the library name twice, the\r
+#      compiler, the link-arguments file, the symbol definition file, and the\r
+#      object list;\r
+#   5. delete the object list and the copied .obj files, then change back to\r
+#      $(TOP_BUILD_DIR_ABS).\r
+# Only the .dll is named as the rule's target; the .lib and .exp files that\r
+# the install rules expect are presumably produced by $(GENDLL) as well --\r
+# TODO confirm against gendll.cmd.\r
+$(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS_DLL): libblis-objs\r
+!ifdef VERBOSE\r
+ if not exist $(DLL_LIBBLIS_DIRPATH) \\r
+ ( $(MKDIR) $(DLL_LIBBLIS_DIRPATH) )\r
+ $(COPY) $(OBJ_BLI_DIRPATH)\*.obj $(DLL_LIBBLIS_DIRPATH) >> $(COPY_LOG_FILE)\r
+ $(CD) $(DLL_LIBBLIS_DIRPATH)\r
+ $(DIR) /B *.obj > $(OBJ_LIST_FILE)\r
+ $(GENDLL) $(LIBBLIS) $(LIBBLIS) $(CC) $(LINKARGS_FILEPATH) $(SYM_DEF_FILEPATH) /objlist $(OBJ_LIST_FILE)\r
+ $(DEL) $(OBJ_LIST_FILE)\r
+ $(DEL) *.obj\r
+ $(CD) $(TOP_BUILD_DIR_ABS)\r
+!else\r
+ @if not exist $(DLL_LIBBLIS_DIRPATH) \\r
+ ( ( $(ECHO) nmake: Creating $(DLL_LIBBLIS_DIRPATH) directory ) & \\r
+ ( $(MKDIR) $(DLL_LIBBLIS_DIRPATH) ) )\r
+ @$(ECHO) nmake: Creating dynamic library $@\r
+ @$(COPY) $(OBJ_BLI_DIRPATH)\*.obj $(DLL_LIBBLIS_DIRPATH) >> $(COPY_LOG_FILE)\r
+ @$(CD) $(DLL_LIBBLIS_DIRPATH)\r
+ @$(DIR) /B *.obj > $(OBJ_LIST_FILE)\r
+ @$(GENDLL) $(LIBBLIS) $(LIBBLIS) $(CC) $(LINKARGS_FILEPATH) $(SYM_DEF_FILEPATH) /objlist $(OBJ_LIST_FILE)\r
+ @$(DEL) $(OBJ_LIST_FILE)\r
+ @$(DEL) *.obj\r
+ @$(CD) $(TOP_BUILD_DIR_ABS)\r
+!endif\r
+\r
+\r
+\r
+#\r
+# --- Install rules ------------------------------------------------------------\r
+#\r
+\r
+# --- Header files ---\r
+# Installs the headers into $(INSTALL_PREFIX_INC), creating that directory if\r
+# needed. Two copies are made: the configuration header $(BLI_CONFIG_H) from\r
+# $(BUILD_DIRNAME), and every *.h file from $(INC_BLI_DIRPATH). The rule's\r
+# target is the installed $(BLIS_H); it is rebuilt when either the source\r
+# $(BLIS_H) or the configuration header is newer. Copy output is appended to\r
+# $(COPY_LOG_FILE) in both branches.\r
+$(INSTALL_PREFIX_INC)\$(BLIS_H): $(INC_BLI_DIRPATH)\$(BLIS_H) \\r
+ $(BUILD_DIRNAME)\$(BLI_CONFIG_H)\r
+!ifdef VERBOSE\r
+ if not exist $(INSTALL_PREFIX_INC) \\r
+ ( $(MKDIR) $(INSTALL_PREFIX_INC) )\r
+ $(COPY) $(BUILD_DIRNAME)\$(BLI_CONFIG_H) $(INSTALL_PREFIX_INC) >> $(COPY_LOG_FILE)\r
+ $(COPY) $(INC_BLI_DIRPATH)\*.h $(INSTALL_PREFIX_INC) >> $(COPY_LOG_FILE)\r
+!else\r
+ @if not exist $(INSTALL_PREFIX_INC) \\r
+ ( $(MKDIR) $(INSTALL_PREFIX_INC) )\r
+ @$(ECHO) nmake: Installing libblis header files to $(INSTALL_PREFIX_INC)\r
+ @$(COPY) $(BUILD_DIRNAME)\$(BLI_CONFIG_H) $(INSTALL_PREFIX_INC) >> $(COPY_LOG_FILE)\r
+ @$(COPY) $(INC_BLI_DIRPATH)\*.h $(INSTALL_PREFIX_INC) >> $(COPY_LOG_FILE)\r
+!endif\r
+\r
+# --- Static library ---\r
+# Copies the built static library into $(INSTALL_PREFIX_LIB), creating that\r
+# directory if needed. The copy is wrapped in an 'if exist' guard, so nothing\r
+# is copied (and no error is raised by this command) when the built library\r
+# is absent. Copy output is appended to $(COPY_LOG_FILE).\r
+$(INSTALL_PREFIX_LIB)\$(LIBBLIS).lib: $(LIB_LIBBLIS_DIRPATH)\$(LIBBLIS).lib\r
+!ifdef VERBOSE\r
+ if not exist $(INSTALL_PREFIX_LIB) ( $(MKDIR) $(INSTALL_PREFIX_LIB) )\r
+ if exist $(LIB_LIBBLIS_DIRPATH)\$(LIBBLIS).lib \\r
+ ( $(COPY) $(LIB_LIBBLIS_DIRPATH)\$(LIBBLIS).lib $(INSTALL_PREFIX_LIB) >> $(COPY_LOG_FILE) )\r
+!else\r
+ @if not exist $(INSTALL_PREFIX_LIB) ( $(MKDIR) $(INSTALL_PREFIX_LIB) )\r
+ @if exist $(LIB_LIBBLIS_DIRPATH)\$(LIBBLIS).lib \\r
+ ( ( $(ECHO) nmake: Installing $(LIB_LIBBLIS_DIRPATH)\$(LIBBLIS).lib to $(INSTALL_PREFIX_LIB) ) & \\r
+ ( $(COPY) $(LIB_LIBBLIS_DIRPATH)\$(LIBBLIS).lib $(INSTALL_PREFIX_LIB) >> $(COPY_LOG_FILE) ) )\r
+!endif\r
+\r
+# --- Dynamic library (object code) ---\r
+# Copies the built .dll into $(INSTALL_PREFIX_DLL), creating that directory\r
+# if needed. The copy is guarded by 'if exist', so it is skipped when the\r
+# built file is absent. Copy output is appended to $(COPY_LOG_FILE).\r
+$(INSTALL_PREFIX_DLL)\$(LIBBLIS).dll: $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).dll\r
+!ifdef VERBOSE\r
+ if not exist $(INSTALL_PREFIX_DLL) ( $(MKDIR) $(INSTALL_PREFIX_DLL) )\r
+ if exist $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).dll \\r
+ ( $(COPY) $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).dll $(INSTALL_PREFIX_DLL) >> $(COPY_LOG_FILE) )\r
+!else\r
+ @if not exist $(INSTALL_PREFIX_DLL) ( $(MKDIR) $(INSTALL_PREFIX_DLL) )\r
+ @if exist $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).dll \\r
+ ( ( $(ECHO) nmake: Installing $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).dll to $(INSTALL_PREFIX_DLL) ) & \\r
+ ( $(COPY) $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).dll $(INSTALL_PREFIX_DLL) >> $(COPY_LOG_FILE) ) )\r
+!endif\r
+\r
+# --- Dynamic library (import library) ---\r
+# Copies the DLL's import library (.lib from the DLL build directory) into\r
+# $(INSTALL_PREFIX_DLL), creating that directory if needed. The copy is\r
+# guarded by 'if exist', so it is skipped when the built file is absent.\r
+# Copy output is appended to $(COPY_LOG_FILE).\r
+$(INSTALL_PREFIX_DLL)\$(LIBBLIS).lib: $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).lib\r
+!ifdef VERBOSE\r
+ if not exist $(INSTALL_PREFIX_DLL) ( $(MKDIR) $(INSTALL_PREFIX_DLL) )\r
+ if exist $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).lib \\r
+ ( $(COPY) $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).lib $(INSTALL_PREFIX_DLL) >> $(COPY_LOG_FILE) )\r
+!else\r
+ @if not exist $(INSTALL_PREFIX_DLL) ( $(MKDIR) $(INSTALL_PREFIX_DLL) )\r
+ @if exist $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).lib \\r
+ ( ( $(ECHO) nmake: Installing $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).lib to $(INSTALL_PREFIX_DLL) ) & \\r
+ ( $(COPY) $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).lib $(INSTALL_PREFIX_DLL) >> $(COPY_LOG_FILE) ) )\r
+!endif\r
+\r
+# --- Dynamic library (export file) ---\r
+# Copies the DLL's export file (.exp) into $(INSTALL_PREFIX_DLL), creating\r
+# that directory if needed. The copy is guarded by 'if exist', so it is\r
+# skipped when the built file is absent. Copy output is appended to\r
+# $(COPY_LOG_FILE).\r
+$(INSTALL_PREFIX_DLL)\$(LIBBLIS).exp: $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).exp\r
+!ifdef VERBOSE\r
+ if not exist $(INSTALL_PREFIX_DLL) ( $(MKDIR) $(INSTALL_PREFIX_DLL) )\r
+ if exist $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).exp \\r
+ ( $(COPY) $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).exp $(INSTALL_PREFIX_DLL) >> $(COPY_LOG_FILE) )\r
+!else\r
+ @if not exist $(INSTALL_PREFIX_DLL) ( $(MKDIR) $(INSTALL_PREFIX_DLL) )\r
+ @if exist $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).exp \\r
+ ( ( $(ECHO) nmake: Installing $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).exp to $(INSTALL_PREFIX_DLL) ) & \\r
+ ( $(COPY) $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).exp $(INSTALL_PREFIX_DLL) >> $(COPY_LOG_FILE) ) )\r
+!endif\r
+\r
+\r
+\r
+#\r
+# --- Clean rules --------------------------------------------------------------\r
+#\r
+\r
+# Deletes the three log files (compiler, Fortran compiler, copy) if they\r
+# exist. Each deletion is guarded by 'if exist', so a missing log is not an\r
+# error. The non-VERBOSE branch prints a message for each file it deletes.\r
+clean-log:\r
+!ifdef VERBOSE\r
+ if exist $(CC_LOG_FILE) \\r
+ ( $(DEL) $(CC_LOG_FILE) )\r
+ if exist $(FC_LOG_FILE) \\r
+ ( $(DEL) $(FC_LOG_FILE) )\r
+ if exist $(COPY_LOG_FILE) \\r
+ ( $(DEL) $(COPY_LOG_FILE) )\r
+!else\r
+ @if exist $(CC_LOG_FILE) \\r
+ ( ( $(ECHO) nmake: Deleting $(CC_LOG_FILE) ) & \\r
+ ( $(DEL) $(CC_LOG_FILE) ) )\r
+ @if exist $(FC_LOG_FILE) \\r
+ ( ( $(ECHO) nmake: Deleting $(FC_LOG_FILE) ) & \\r
+ ( $(DEL) $(FC_LOG_FILE) ) )\r
+ @if exist $(COPY_LOG_FILE) \\r
+ ( ( $(ECHO) nmake: Deleting $(COPY_LOG_FILE) ) & \\r
+ ( $(DEL) $(COPY_LOG_FILE) ) )\r
+!endif\r
+\r
+# Removes the $(CNF_DIRNAME), $(INC_DIRNAME) and $(SRC_DIRNAME) directories\r
+# with $(RMDIR) if they exist. Used by 'distclean'.\r
+# NOTE(review): this deletes whole directory trees, including the include and\r
+# source directories; presumably these are copies populated by the configure\r
+# step rather than the original sources -- confirm before running.\r
+clean-config:\r
+!ifdef VERBOSE\r
+ if exist $(CNF_DIRNAME) \\r
+ ( $(RMDIR) $(CNF_DIRNAME) )\r
+ if exist $(INC_DIRNAME) \\r
+ ( $(RMDIR) $(INC_DIRNAME) )\r
+ if exist $(SRC_DIRNAME) \\r
+ ( $(RMDIR) $(SRC_DIRNAME) )\r
+!else\r
+ @if exist $(CNF_DIRNAME) \\r
+ ( ( $(ECHO) nmake: Deleting $(CNF_DIRNAME) directory ) & \\r
+ ( $(RMDIR) $(CNF_DIRNAME) ) )\r
+ @if exist $(INC_DIRNAME) \\r
+ ( ( $(ECHO) nmake: Deleting $(INC_DIRNAME) directory ) & \\r
+ ( $(RMDIR) $(INC_DIRNAME) ) )\r
+ @if exist $(SRC_DIRNAME) \\r
+ ( ( $(ECHO) nmake: Deleting $(SRC_DIRNAME) directory ) & \\r
+ ( $(RMDIR) $(SRC_DIRNAME) ) )\r
+!endif\r
+\r
+# Removes the object, static-library and dynamic-library output directories\r
+# ($(OBJ_DIRNAME), $(LIB_DIRNAME), $(DLL_DIRNAME)) with $(RMDIR) if they\r
+# exist. The non-VERBOSE branch prints a message for each directory removed.\r
+clean-build:\r
+!ifdef VERBOSE\r
+ if exist $(OBJ_DIRNAME) \\r
+ ( $(RMDIR) $(OBJ_DIRNAME) )\r
+ if exist $(LIB_DIRNAME) \\r
+ ( $(RMDIR) $(LIB_DIRNAME) )\r
+ if exist $(DLL_DIRNAME) \\r
+ ( $(RMDIR) $(DLL_DIRNAME) )\r
+!else\r
+ @if exist $(OBJ_DIRNAME) \\r
+ ( ( $(ECHO) nmake: Deleting $(OBJ_DIRNAME) directory ) & \\r
+ ( $(RMDIR) $(OBJ_DIRNAME) ) )\r
+ @if exist $(LIB_DIRNAME) \\r
+ ( ( $(ECHO) nmake: Deleting $(LIB_DIRNAME) directory ) & \\r
+ ( $(RMDIR) $(LIB_DIRNAME) ) )\r
+ @if exist $(DLL_DIRNAME) \\r
+ ( ( $(ECHO) nmake: Deleting $(DLL_DIRNAME) directory ) & \\r
+ ( $(RMDIR) $(DLL_DIRNAME) ) )\r
+!endif\r
+\r
+# Useful for developing when all we want to do is remove the library products.\r
+# Same as clean-build except that the object directory is left in place, so\r
+# only $(LIB_DIRNAME) and $(DLL_DIRNAME) are removed.\r
+clean-lib:\r
+!ifdef VERBOSE\r
+ if exist $(LIB_DIRNAME) \\r
+ ( $(RMDIR) $(LIB_DIRNAME) )\r
+ if exist $(DLL_DIRNAME) \\r
+ ( $(RMDIR) $(DLL_DIRNAME) )\r
+!else\r
+ @if exist $(LIB_DIRNAME) \\r
+ ( ( $(ECHO) nmake: Deleting $(LIB_DIRNAME) directory ) & \\r
+ ( $(RMDIR) $(LIB_DIRNAME) ) )\r
+ @if exist $(DLL_DIRNAME) \\r
+ ( ( $(ECHO) nmake: Deleting $(DLL_DIRNAME) directory ) & \\r
+ ( $(RMDIR) $(DLL_DIRNAME) ) )\r
+!endif\r
+\r
+\r
+\r
+#\r
+# --- Help target --------------------------------------------------------------\r
+#\r
+\r
+# Runs the helper script named by $(NMAKE_HELP); the command line itself is\r
+# not echoed.\r
+help:\r
+ @$(NMAKE_HELP)\r
+\r
index 4e560a463f7b38dc69ee476a24b93d9888f3d49c..525eee039b03fe13f6e83f80dd2125cb54dcc32a 100644 (file)
-#
-#
-# BLIS
-# An object-based framework for developing high-performance BLAS-like
-# libraries.
-#
-# Copyright (C) 2014, The University of Texas at Austin
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are
-# met:
-# - Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-# - Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in the
-# documentation and/or other materials provided with the distribution.
-# - Neither the name of The University of Texas at Austin nor the names
-# of its contributors may be used to endorse or promote products
-# derived from this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#
-
-#
-# --- Configuration variable definitions ---------------------------------------
-#
-# Environment-related variables:
-# REVISION - The code's revision number.
-# PWD - The path to current working directory.
-# ARCH_STR - A string to identify the requested build architecture.
-# BUILD_STR - A string to identify the requested build type.
-# CCOMPILER_STR - A string to identify the requested C compiler.
-#
-# Target-related variables:
-# FLAMEC_OBJS - List of paths to flamec object files.
-# LAPACK2FLAMEC_OBJS - List of paths to lapack2flamec object files.
-#
-# Note: these variables are not present in the .in template file. Instead, they
-# are appended to the contents of the .in file by a build script and output to
-# a separate file (by the same name, without the .in extension).
-#
+#\r
+#\r
+# BLIS \r
+# An object-based framework for developing high-performance BLAS-like\r
+# libraries.\r
+#\r
+# Copyright (C) 2014, The University of Texas at Austin\r
+#\r
+# Redistribution and use in source and binary forms, with or without\r
+# modification, are permitted provided that the following conditions are\r
+# met:\r
+# - Redistributions of source code must retain the above copyright\r
+# notice, this list of conditions and the following disclaimer.\r
+# - Redistributions in binary form must reproduce the above copyright\r
+# notice, this list of conditions and the following disclaimer in the\r
+# documentation and/or other materials provided with the distribution.\r
+# - Neither the name of The University of Texas at Austin nor the names\r
+# of its contributors may be used to endorse or promote products\r
+# derived from this software without specific prior written permission.\r
+#\r
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\r
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\r
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\r
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT\r
+# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\r
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\r
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\r
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\r
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\r
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\r
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
+#\r
+#\r
+\r
+#\r
+# --- Configuration variable definitions ---------------------------------------\r
+#\r
+# Environment-related variables:\r
+# REVISION - The code's revision number.\r
+# PWD - The path to current working directory.\r
+# ARCH_STR - A string to identify the requested build architecture.\r
+# BUILD_STR - A string to identify the requested build type.\r
+# CCOMPILER_STR - A string to identify the requested C compiler.\r
+#\r
+# Target-related variables:\r
+# FLAMEC_OBJS - List of paths to flamec object files.\r
+# LAPACK2FLAMEC_OBJS - List of paths to lapack2flamec object files.\r
+#\r
+# Note: these variables are not present in the .in template file. Instead, they\r
+# are appended to the contents of the .in file by a build script and output to\r
+# a separate file (by the same name, without the .in extension).\r
+#\r
index af5b69e03cd59a360580f5b6d479dc2ce9a5a68a..2c7775b2114eac6049e0f823bf8e93661f32f5b5 100644 (file)
-#
-#
-# BLIS
-# An object-based framework for developing high-performance BLAS-like
-# libraries.
-#
-# Copyright (C) 2014, The University of Texas at Austin
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are
-# met:
-# - Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-# - Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in the
-# documentation and/or other materials provided with the distribution.
-# - Neither the name of The University of Texas at Austin nor the names
-# of its contributors may be used to endorse or promote products
-# derived from this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#
-
-
-#
-# --- General build system options --------------------------------------------
-#
-
-# Uncomment this for verbose output from nmake.
-# VERBOSE = 1
-
-# Assign this varible to be the full path to the directory to which you would
-# like the BLIS build products to be installed upon running "nmake install".
-# The nmake install target will create the install directory and all requisite
-# subdirectories if they do not already exist (in which case the user must have
-# permission to create these directories).
-INSTALL_PREFIX = c:\field\lib
-
-
-#
-# --- Important build system filenames ----------------------------------------
-#
-
-# DLL link arguments. The contents of this file should be customized when
-# building a dynamically-linked library. The lines of the file should contain
-# linker options, library names, and library paths. Note that the library
-# paths must be declared in the following form:
-#
-# /link /LIBPATH:<path1>
-# /link /LIBPATH:<path2>
-# /link /LIBPATH:<path3>
-#
-# where <path1>, <path2>, and <path3> are library paths to add to the list
-# of paths to search when the linker attempts to locate other libraries
-# listed in the file.
-LINKARGS_FILENAME = linkargs.txt
-LINKARGS_FILEPATH = $(PWD)\$(LINKARGS_FILENAME)
-
-# Various log file names that capture standard output when VERBOSE is undefined.
-CC_LOG_FILE = nmake-cc.log
-FC_LOG_FILE = nmake-fc.log
-COPY_LOG_FILE = nmake-copy.log
-
-
-#
-# --- General name and directory definitions -----------------------------------
-#
-
-# The relative and absolute locations of the top-level Windows build directory.
-# This is the directory in which nmake is run (not the directory named "build").
-TOP_BUILD_DIR_REL = .
-TOP_BUILD_DIR_ABS = $(PWD)
-
-# The revision string.
-REV_STR = r$(REVISION)
-
-# The names of the libraries.
-LIBBLIS_NAME_ONLY = libblis
-LIBBLIS = $(LIBBLIS_NAME_ONLY)-$(ARCH_STR)-$(REV_STR)
-
-# Directories that reside within the top-level Windows directory.
-CNF_DIRNAME = config
-INC_DIRNAME = include
-SRC_DIRNAME = frame
-OBJ_DIRNAME = obj
-LIB_DIRNAME = lib
-DLL_DIRNAME = dll
-
-# Leaves of interest for Windows.
-
-# Relative directory paths to each of the above subdirectories.
-INC_DIRPATH = $(TOP_BUILD_DIR_REL)\$(INC_DIRNAME)
-SRC_DIRPATH = $(TOP_BUILD_DIR_REL)\$(SRC_DIRNAME)
-OBJ_DIRPATH = $(TOP_BUILD_DIR_REL)\$(OBJ_DIRNAME)
-LIB_DIRPATH = $(TOP_BUILD_DIR_REL)\$(LIB_DIRNAME)
-DLL_DIRPATH = $(TOP_BUILD_DIR_REL)\$(DLL_DIRNAME)
-
-# We only have header files for flamec leaves.
-INC_BLI_DIRPATH = $(INC_DIRPATH)
-
-# We have source code for flamec and lapack2flamec leaves.
-SRC_BLI_DIRPATH = $(SRC_DIRPATH)
-
-
-# And we have object file paths corresponding to those source leaves defined
-# above.
-OBJ_BLI_DIRPATH = $(OBJ_DIRPATH)\$(ARCH_STR)\$(BUILD_STR)
-
-# Separate directories into which we'll move object files when we create the
-# static libraries.
-LIB_LIBBLIS_DIRPATH = $(LIB_DIRPATH)\$(ARCH_STR)\$(BUILD_STR)
-
-# Separate directories into which we'll move object files when we create the
-# dynamic libraries.
-DLL_LIBBLIS_DIRPATH = $(DLL_DIRPATH)\$(ARCH_STR)\$(BUILD_STR)
-
-# The install subdirectories.
-INSTALL_PREFIX_LIB = $(INSTALL_PREFIX)\libblis\lib
-INSTALL_PREFIX_DLL = $(INSTALL_PREFIX)\libblis\dll
-INSTALL_PREFIX_INC = $(INSTALL_PREFIX)\libblis\include-$(ARCH_STR)-$(REV_STR)
-
-# Definitions for important header files used in the install-headers rule.
-BUILD_DIRNAME = build
-BLIS_H = blis.h
-
-
-#
-# --- General shell definitions ------------------------------------------------
-#
-
-CD = cd
-DIR = dir
-COPY = copy
-DEL = del /F /Q
-MKDIR = mkdir
-RMDIR = rd /S /Q
-ECHO = echo
-
-
-#
-# --- Helper scripts -----------------------------------------------------------
-#
-
-NMAKE_HELP = .\build\nmake-help.cmd
-
-
-
-#
-# --- Compiler-related definitions ---------------------------------------------
-#
-
-#!include $(VERSION_FILE)
-
-# --- C compiler definitions ---
-
-WINDOWS_BUILD = BLIS_ENABLE_WINDOWS_BUILD
-VERS_STR = 0.0.9
-VERSION = BLIS_VERSION_STRING=\"$(VERS_STR)\"
-
-!if "$(CCOMPILER_STR)"=="icl"
-
-!if "$(BUILD_STR)"=="debug"
-CDEBUG = /Zi
-COPTIM = /Od
-!elseif "$(BUILD_STR)"=="release"
-CDEBUG =
-COPTIM = /Ox
-!endif
-
-CC = icl.exe
-CMISCFLAGS = /nologo
-CLANGFLAGS =
-CPPROCFLAGS = /I.\build /I$(INC_BLI_DIRPATH) /D$(WINDOWS_BUILD) /D$(VERSION)
-CWARNFLAGS = /w
-CDBGFLAGS = $(CDEBUG)
-COPTFLAGS = $(COPTIM)
-CRTIMEFLAGS = /MT
-CMTHREADFLAGS = /Qopenmp
-CFLAGS = $(CMISCFLAGS) $(CLANGFLAGS) $(CPPROCFLAGS) $(CWARNFLAGS) \
- $(CDBGFLAGS) $(COPTFLAGS) $(CRTIMEFLAGS) $(CMTHREADFLAGS)
-
-!elseif "$(CCOMPILER_STR)"=="cl"
-
-!if "$(BUILD_STR)"=="debug"
-CDEBUG = /Zi
-COPTIM = /Od
-!elseif "$(BUILD_STR)"=="release"
-CDEBUG =
-COPTIM = /Ox
-!endif
-
-CC = cl.exe
-CMISCFLAGS = /nologo
-CLANGFLAGS =
-CPPROCFLAGS = /I.\build /I$(INC_BLI_DIRPATH) /D$(WINDOWS_BUILD) /D$(VERSION)
-CWARNFLAGS = /w
-CDBGFLAGS = $(CDEBUG)
-COPTFLAGS = $(COPTIM)
-CRTIMEFLAGS = /MT
-CMTHREADFLAGS = /openmp
-CFLAGS = $(CMISCFLAGS) $(CLANGFLAGS) $(CPPROCFLAGS) $(CWARNFLAGS) \
- $(CDBGFLAGS) $(COPTFLAGS) $(CRTIMEFLAGS) $(CMTHREADFLAGS)
-
-!endif
-
-
-
-#
-# --- Library-related definitions ----------------------------------------------
-#
-
-# --- Static library definitions ---
-
-LIBBLIS_LIB = $(LIBBLIS).lib
-
-LIB = lib
-LIB_OPTIONS = /nologo
-LIB_BLI_OUTPUT_ARG = /out:$(LIBBLIS_LIB)
-LIB_BLI_INPUT_ARGS = *.obj
-
-# --- Dynamic library definitions ---
-
-LIBBLIS_DLL = $(LIBBLIS).dll
-
-GENDLL = $(TOP_BUILD_DIR_ABS)\gendll.cmd
-OBJ_LIST_FILE = libblis-objects.txt
-
-SYM_DEF_FILEPATH = $(TOP_BUILD_DIR_ABS)\$(BUILD_DIRNAME)\libblis-symbols.def
-
+#\r
+#\r
+# BLIS \r
+# An object-based framework for developing high-performance BLAS-like\r
+# libraries.\r
+#\r
+# Copyright (C) 2014, The University of Texas at Austin\r
+#\r
+# Redistribution and use in source and binary forms, with or without\r
+# modification, are permitted provided that the following conditions are\r
+# met:\r
+# - Redistributions of source code must retain the above copyright\r
+# notice, this list of conditions and the following disclaimer.\r
+# - Redistributions in binary form must reproduce the above copyright\r
+# notice, this list of conditions and the following disclaimer in the\r
+# documentation and/or other materials provided with the distribution.\r
+# - Neither the name of The University of Texas at Austin nor the names\r
+# of its contributors may be used to endorse or promote products\r
+# derived from this software without specific prior written permission.\r
+#\r
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\r
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\r
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\r
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT\r
+# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\r
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\r
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\r
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\r
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\r
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\r
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
+#\r
+#\r
+\r
+\r
+#\r
+# --- General build system options --------------------------------------------\r
+#\r
+\r
+# Uncomment this for verbose output from nmake.\r
+# VERBOSE = 1\r
+\r
+# Assign this variable to be the full path to the directory to which you would\r
+# like the BLIS build products to be installed upon running "nmake install".\r
+# The nmake install target will create the install directory and all requisite\r
+# subdirectories if they do not already exist (in which case the user must have\r
+# permission to create these directories).\r
+INSTALL_PREFIX = c:\field\lib\r
+\r
+\r
+#\r
+# --- Important build system filenames ----------------------------------------\r
+#\r
+\r
+# DLL link arguments. The contents of this file should be customized when\r
+# building a dynamically-linked library. The lines of the file should contain\r
+# linker options, library names, and library paths. Note that the library\r
+# paths must be declared in the following form:\r
+#\r
+# /link /LIBPATH:<path1>\r
+# /link /LIBPATH:<path2>\r
+# /link /LIBPATH:<path3>\r
+#\r
+# where <path1>, <path2>, and <path3> are library paths to add to the list\r
+# of paths to search when the linker attempts to locate other libraries\r
+# listed in the file.\r
+LINKARGS_FILENAME = linkargs.txt\r
+LINKARGS_FILEPATH = $(PWD)\$(LINKARGS_FILENAME)\r
+\r
+# Various log file names that capture standard output when VERBOSE is undefined.\r
+CC_LOG_FILE = nmake-cc.log\r
+FC_LOG_FILE = nmake-fc.log\r
+COPY_LOG_FILE = nmake-copy.log\r
+\r
+\r
+#\r
+# --- General name and directory definitions -----------------------------------\r
+#\r
+\r
+# The relative and absolute locations of the top-level Windows build directory.\r
+# This is the directory in which nmake is run (not the directory named "build").\r
+TOP_BUILD_DIR_REL = .\r
+TOP_BUILD_DIR_ABS = $(PWD)\r
+\r
+# The revision string.\r
+REV_STR = r$(REVISION)\r
+\r
+# The names of the libraries.\r
+LIBBLIS_NAME_ONLY = libblis\r
+LIBBLIS = $(LIBBLIS_NAME_ONLY)-$(ARCH_STR)-$(REV_STR)\r
+\r
+# Directories that reside within the top-level Windows directory.\r
+CNF_DIRNAME = config\r
+INC_DIRNAME = include\r
+SRC_DIRNAME = frame\r
+OBJ_DIRNAME = obj\r
+LIB_DIRNAME = lib\r
+DLL_DIRNAME = dll\r
+\r
+# Leaves of interest for Windows.\r
+\r
+# Relative directory paths to each of the above subdirectories.\r
+INC_DIRPATH = $(TOP_BUILD_DIR_REL)\$(INC_DIRNAME)\r
+SRC_DIRPATH = $(TOP_BUILD_DIR_REL)\$(SRC_DIRNAME)\r
+OBJ_DIRPATH = $(TOP_BUILD_DIR_REL)\$(OBJ_DIRNAME)\r
+LIB_DIRPATH = $(TOP_BUILD_DIR_REL)\$(LIB_DIRNAME)\r
+DLL_DIRPATH = $(TOP_BUILD_DIR_REL)\$(DLL_DIRNAME)\r
+\r
+# BLIS header files are taken directly from the include directory.\r
+INC_BLI_DIRPATH = $(INC_DIRPATH)\r
+\r
+# BLIS source files are taken directly from the source directory.\r
+SRC_BLI_DIRPATH = $(SRC_DIRPATH)\r
+\r
+\r
+# And we have object file paths corresponding to those source leaves defined\r
+# above.\r
+OBJ_BLI_DIRPATH = $(OBJ_DIRPATH)\$(ARCH_STR)\$(BUILD_STR)\r
+\r
+# Separate directories into which we'll move object files when we create the\r
+# static libraries.\r
+LIB_LIBBLIS_DIRPATH = $(LIB_DIRPATH)\$(ARCH_STR)\$(BUILD_STR)\r
+\r
+# Separate directories into which we'll move object files when we create the\r
+# dynamic libraries.\r
+DLL_LIBBLIS_DIRPATH = $(DLL_DIRPATH)\$(ARCH_STR)\$(BUILD_STR)\r
+\r
+# The install subdirectories.\r
+INSTALL_PREFIX_LIB = $(INSTALL_PREFIX)\libblis\lib\r
+INSTALL_PREFIX_DLL = $(INSTALL_PREFIX)\libblis\dll\r
+INSTALL_PREFIX_INC = $(INSTALL_PREFIX)\libblis\include-$(ARCH_STR)-$(REV_STR)\r
+\r
+# Definitions for important header files used in the install-headers rule.\r
+BUILD_DIRNAME = build\r
+BLIS_H = blis.h\r
+\r
+\r
+#\r
+# --- General shell definitions ------------------------------------------------\r
+#\r
+\r
+CD = cd\r
+DIR = dir\r
+COPY = copy\r
+DEL = del /F /Q\r
+MKDIR = mkdir\r
+RMDIR = rd /S /Q\r
+ECHO = echo\r
+\r
+\r
+#\r
+# --- Helper scripts -----------------------------------------------------------\r
+#\r
+\r
+NMAKE_HELP = .\build\nmake-help.cmd\r
+\r
+\r
+\r
+#\r
+# --- Compiler-related definitions ---------------------------------------------\r
+#\r
+\r
+#!include $(VERSION_FILE)\r
+\r
+# --- C compiler definitions ---\r
+\r
+# Preprocessor macro definitions handed to the compiler through /D in\r
+# CPPROCFLAGS below. VERSION defines BLIS_VERSION_STRING as a quoted string\r
+# built from the hard-coded VERS_STR.\r
+WINDOWS_BUILD = BLIS_ENABLE_WINDOWS_BUILD\r
+VERS_STR = 0.0.9\r
+VERSION = BLIS_VERSION_STRING=\"$(VERS_STR)\"\r
+\r
+# Select the compiler and its flags from CCOMPILER_STR ("icl" or "cl"). The\r
+# two branches are identical except for CC and the OpenMP switch in\r
+# CMTHREADFLAGS (/Qopenmp for icl, /openmp for cl). There is no branch for any\r
+# other value, in which case this section assigns neither CC nor CFLAGS.\r
+# Inside each branch, BUILD_STR selects debug (/Zi, /Od) or release (/Ox)\r
+# settings; any other value leaves CDEBUG and COPTIM unassigned here.\r
+!if "$(CCOMPILER_STR)"=="icl"\r
+\r
+!if "$(BUILD_STR)"=="debug"\r
+CDEBUG = /Zi\r
+COPTIM = /Od\r
+!elseif "$(BUILD_STR)"=="release"\r
+CDEBUG =\r
+COPTIM = /Ox\r
+!endif\r
+\r
+CC = icl.exe\r
+CMISCFLAGS = /nologo\r
+CLANGFLAGS =\r
+CPPROCFLAGS = /I.\build /I$(INC_BLI_DIRPATH) /D$(WINDOWS_BUILD) /D$(VERSION)\r
+CWARNFLAGS = /w\r
+CDBGFLAGS = $(CDEBUG)\r
+COPTFLAGS = $(COPTIM)\r
+CRTIMEFLAGS = /MT\r
+CMTHREADFLAGS = /Qopenmp\r
+CFLAGS = $(CMISCFLAGS) $(CLANGFLAGS) $(CPPROCFLAGS) $(CWARNFLAGS) \\r
+ $(CDBGFLAGS) $(COPTFLAGS) $(CRTIMEFLAGS) $(CMTHREADFLAGS)\r
+\r
+!elseif "$(CCOMPILER_STR)"=="cl"\r
+\r
+!if "$(BUILD_STR)"=="debug"\r
+CDEBUG = /Zi\r
+COPTIM = /Od\r
+!elseif "$(BUILD_STR)"=="release"\r
+CDEBUG =\r
+COPTIM = /Ox\r
+!endif\r
+\r
+CC = cl.exe\r
+CMISCFLAGS = /nologo\r
+CLANGFLAGS =\r
+CPPROCFLAGS = /I.\build /I$(INC_BLI_DIRPATH) /D$(WINDOWS_BUILD) /D$(VERSION)\r
+CWARNFLAGS = /w\r
+CDBGFLAGS = $(CDEBUG)\r
+COPTFLAGS = $(COPTIM)\r
+CRTIMEFLAGS = /MT\r
+CMTHREADFLAGS = /openmp\r
+CFLAGS = $(CMISCFLAGS) $(CLANGFLAGS) $(CPPROCFLAGS) $(CWARNFLAGS) \\r
+ $(CDBGFLAGS) $(COPTFLAGS) $(CRTIMEFLAGS) $(CMTHREADFLAGS)\r
+\r
+!endif\r
+\r
+\r
+\r
+#\r
+# --- Library-related definitions ----------------------------------------------\r
+#\r
+\r
+# --- Static library definitions ---\r
+\r
+# File name of the static library (no directory part).\r
+LIBBLIS_LIB = $(LIBBLIS).lib\r
+\r
+# Librarian command and its arguments. The output name and the *.obj input\r
+# pattern are both relative, so they are resolved against whatever directory\r
+# is current when $(LIB) is run.\r
+# NOTE(review): LIB is also the name of the environment variable the\r
+# Microsoft toolchain uses for its library search path, and NMAKE documents\r
+# that redefining an inherited environment-variable macro changes the\r
+# environment variable for the commands it runs -- confirm this assignment\r
+# does not break library lookup during the DLL link.\r
+LIB = lib\r
+LIB_OPTIONS = /nologo\r
+LIB_BLI_OUTPUT_ARG = /out:$(LIBBLIS_LIB)\r
+LIB_BLI_INPUT_ARGS = *.obj\r
+\r
+# --- Dynamic library definitions ---\r
+\r
+# File name of the dynamic library (no directory part).\r
+LIBBLIS_DLL = $(LIBBLIS).dll\r
+\r
+# Helper script that produces the DLL, and the name of the temporary file\r
+# that lists the object files passed to it.\r
+GENDLL = $(TOP_BUILD_DIR_ABS)\gendll.cmd\r
+OBJ_LIST_FILE = libblis-objects.txt\r
+\r
+# Absolute path of the symbol definition (.def) file passed to $(GENDLL).\r
+SYM_DEF_FILEPATH = $(TOP_BUILD_DIR_ABS)\$(BUILD_DIRNAME)\libblis-symbols.def\r
+\r
index ccdd18f6449eca2b9686d1cbc09109dee1610f11..a8230623ed348d0a0edb8e8a3598207f51456469 100644 (file)
-attic
-broken
-old
-other
-temp
-tmp
-test
+attic\r
+broken\r
+old\r
+other\r
+temp\r
+tmp\r
+test\r
index 6b8710a711f3b689885aa5c26c6c06bde348e82b..46f8b9aacc801dea0aa2f79a5e2ecc2219fd5806 100644 (file)
-.git
+.git\r
index 338b3b4b3d4ead60aec077ba31dde59c7bddc8ba..98e115e3f6c8286672a27d7ec86e76b189323235 100644 (file)
-c:h
+c:h\r
index 90e8ddca25820b3d53619c40e0c04bdbf2791fb7..fd953520681022305a39500ab3649522630b68cf 100644 (file)
-EXPORTS
-FLA_TWO
-FLA_ONE
-FLA_ONE_HALF
-FLA_ZERO
-FLA_MINUS_ONE_HALF
-FLA_MINUS_ONE
-FLA_MINUS_TWO
-fla_axpyt_cntl_blas
-fla_copyt_cntl_blas
-fla_gemm_cntl_blas
-fla_hemm_cntl_blas
-fla_herk_cntl_blas
-fla_her2k_cntl_blas
-fla_symm_cntl_blas
-fla_syrk_cntl_blas
-fla_syr2k_cntl_blas
-fla_trmm_cntl_blas
-fla_trsm_cntl_blas
-fla_appiv_cntl_unb
-bli_samax
-bli_damax
-bli_camax
-bli_zamax
-bli_sasum
-bli_dasum
-bli_casum
-bli_zasum
-bli_saxpy
-bli_daxpy
-bli_caxpy
-bli_zaxpy
-bli_saxpymt
-bli_daxpymt
-bli_caxpymt
-bli_zaxpymt
-bli_saxpysmt
-bli_daxpysmt
-bli_caxpysmt
-bli_zaxpysmt
-bli_saxpysv
-bli_daxpysv
-bli_caxpysv
-bli_zaxpysv
-bli_saxpyv
-bli_daxpyv
-bli_caxpyv
-bli_zaxpyv
-bli_cconjm
-bli_zconjm
-bli_cconjmr
-bli_zconjmr
-bli_cconjv
-bli_zconjv
-bli_scopy
-bli_dcopy
-bli_ccopy
-bli_zcopy
-bli_scopymr
-bli_dcopymr
-bli_ccopymr
-bli_zcopymr
-bli_scopymt
-bli_dcopymt
-bli_ccopymt
-bli_zcopymt
-bli_scopyv
-bli_dcopyv
-bli_ccopyv
-bli_zcopyv
-bli_sdot
-bli_ddot
-bli_cdot
-bli_zdot
-bli_sdot2s
-bli_ddot2s
-bli_cdot2s
-bli_zdot2s
-bli_sdots
-bli_ddots
-bli_cdots
-bli_zdots
-bli_sinverts
-bli_dinverts
-bli_cinverts
-bli_zinverts
-bli_sinvscalm
-bli_dinvscalm
-bli_csinvscalm
-bli_cinvscalm
-bli_zdinvscalm
-bli_zinvscalm
-bli_sinvscalv
-bli_dinvscalv
-bli_csinvscalv
-bli_cinvscalv
-bli_zdinvscalv
-bli_zinvscalv
-bli_snrm2
-bli_dnrm2
-bli_cnrm2
-bli_znrm2
-bli_sscal
-bli_dscal
-bli_csscal
-bli_cscal
-bli_zdscal
-bli_zscal
-bli_sscalm
-bli_dscalm
-bli_csscalm
-bli_cscalm
-bli_zdscalm
-bli_zscalm
-bli_sscalmr
-bli_dscalmr
-bli_csscalmr
-bli_cscalmr
-bli_zdscalmr
-bli_zscalmr
-bli_sscalv
-bli_dscalv
-bli_csscalv
-bli_cscalv
-bli_zdscalv
-bli_zscalv
-bli_sswap
-bli_dswap
-bli_cswap
-bli_zswap
-bli_sswapmt
-bli_dswapmt
-bli_cswapmt
-bli_zswapmt
-bli_sgemv
-bli_dgemv
-bli_cgemv
-bli_zgemv
-bli_sger
-bli_dger
-bli_cger
-bli_zger
-bli_chemv
-bli_zhemv
-bli_cher
-bli_zher
-bli_cher2
-bli_zher2
-bli_ssymv
-bli_dsymv
-bli_csymv
-bli_zsymv
-bli_ssyr
-bli_dsyr
-bli_csyr
-bli_zsyr
-bli_ssyr2
-bli_dsyr2
-bli_csyr2
-bli_zsyr2
-bli_strmv
-bli_dtrmv
-bli_ctrmv
-bli_ztrmv
-bli_strsv
-bli_dtrsv
-bli_ctrsv
-bli_ztrsv
-bli_sgemm
-bli_dgemm
-bli_cgemm
-bli_zgemm
-bli_chemm
-bli_zhemm
-bli_cherk
-bli_zherk
-bli_cher2k
-bli_zher2k
-bli_ssymm
-bli_dsymm
-bli_csymm
-bli_zsymm
-bli_ssyrk
-bli_dsyrk
-bli_csyrk
-bli_zsyrk
-bli_ssyr2k
-bli_dsyr2k
-bli_csyr2k
-bli_zsyr2k
-bli_strmm
-bli_dtrmm
-bli_ctrmm
-bli_ztrmm
-bli_strsm
-bli_dtrsm
-bli_ctrsm
-bli_ztrsm
-FLASH_Apply_pivots
-FLASH_Apply_pivots_cntl_init
-FLASH_Apply_pivots_cntl_finalize
-FLASH_Apply_Q_UT
-FLASH_Apply_Q_UT_cntl_init
-FLASH_Apply_Q_UT_cntl_finalize
-FLASH_Apply_Q_UT_inc
-FLASH_Apply_Q_UT_inc_cntl_init
-FLASH_Apply_Q_UT_inc_cntl_finalize
-FLASH_Apply_Q_UT_inc_create_workspace
-FLASH_Apply_Q2_UT
-FLASH_Apply_Q2_UT_cntl_init
-FLASH_Apply_Q2_UT_cntl_finalize
-FLASH_Axpy
-FLASH_Axpyt
-FLASH_Axpyt_cntl_init
-FLASH_Axpyt_cntl_finalize
-FLASH_Axpy_cntl_init
-FLASH_Axpy_cntl_finalize
-FLASH_Axpy_buffer_to_hier
-FLASH_Axpy_hier_to_buffer
-FLASH_Axpy_flat_to_hier
-FLASH_Axpy_hier_to_flat
-FLASH_Axpy_hierarchy
-FLASH_Axpy_hierarchy_r
-FLASH_Chol
-FLASH_Chol_cntl_init
-FLASH_Chol_cntl_finalize
-FLASH_Chol_solve
-FLASH_Copy
-FLASH_Copyt
-FLASH_Copyt_cntl_init
-FLASH_Copyt_cntl_finalize
-FLASH_Copy_cntl_init
-FLASH_Copy_cntl_finalize
-FLASH_Copy_buffer_to_hier
-FLASH_Copy_hier_to_buffer
-FLASH_Copy_flat_to_hier
-FLASH_Copy_hier_to_flat
-FLASH_Copy_hierarchy
-FLASH_Copy_hierarchy_r
-FLASH_FS_incpiv
-FLASH_FS_incpiv_aux1
-FLASH_FS_incpiv_aux2
-FLASH_Gemm
-FLASH_Gemm_cntl_init
-FLASH_Gemm_cntl_finalize
-FLASH_Gemv
-FLASH_Gemv_cntl_init
-FLASH_Gemv_cntl_finalize
-FLASH_Hemm
-FLASH_Hemm_cntl_init
-FLASH_Hemm_cntl_finalize
-FLASH_Her2k
-FLASH_Her2k_cntl_init
-FLASH_Her2k_cntl_finalize
-FLASH_Herk
-FLASH_Herk_cntl_init
-FLASH_Herk_cntl_finalize
-FLASH_LU_find_zero_on_diagonal
-FLASH_LU_incpiv
-FLASH_LU_incpiv_cntl_init
-FLASH_LU_incpiv_cntl_finalize
-FLASH_LU_incpiv_create_hier_matrices
-FLASH_LU_incpiv_determine_alg_blocksize
-FLASH_LU_incpiv_noopt
-FLASH_LU_incpiv_opt1
-FLASH_LU_incpiv_solve
-FLASH_LU_incpiv_var1
-FLASH_LU_incpiv_var2
-FLASH_LU_nopiv
-FLASH_LU_nopiv_cntl_init
-FLASH_LU_nopiv_cntl_finalize
-FLASH_LU_nopiv_solve
-FLASH_LU_piv
-FLASH_LU_piv_cntl_init
-FLASH_LU_piv_cntl_finalize
-FLASH_LU_piv_solve
-FLASH_Max_elemwise_diff
-FLASH_Norm1
-FLASH_Obj_datatype
-FLASH_Obj_depth
-FLASH_Obj_blocksizes
-FLASH_Obj_scalar_length
-FLASH_Obj_scalar_width
-FLASH_Obj_create
-FLASH_Obj_create_ext
-FLASH_Obj_create_without_buffer
-FLASH_Obj_create_without_buffer_ext
-FLASH_Obj_create_helper
-FLASH_Obj_create_hierarchy
-FLASH_Obj_create_conf_to
-FLASH_Obj_create_hier_conf_to_flat
-FLASH_Obj_create_hier_conf_to_flat_ext
-FLASH_Obj_create_flat_conf_to_hier
-FLASH_Obj_create_hier_copy_of_flat
-FLASH_Obj_create_hier_copy_of_flat_ext
-FLASH_Obj_create_flat_copy_of_hier
-FLASH_Obj_free
-FLASH_Obj_free_without_buffer
-FLASH_Obj_free_hierarchy
-FLASH_Obj_extract_buffer
-FLASH_Obj_flatten
-FLASH_Obj_hierarchify
-FLASH_Obj_show
-FLASH_Obj_attach_buffer
-FLASH_Obj_attach_buffer_hierarchy
-FLASH_print_struct
-FLASH_print_struct_helper
-FLASH_Obj_create_diag_panel
-FLASH_Obj_exec
-FLASH_Obj_exec_parallel
-FLASH_Obj_push
-FLASH_Set
-FLASH_Shift_diag
-FLASH_QR_UT_cntl_init
-FLASH_QR_UT_cntl_finalize
-FLASH_QR_UT_inc
-FLASH_QR_UT_inc_cntl_init
-FLASH_QR_UT_inc_cntl_finalize
-FLASH_QR_UT_inc_create_hier_matrices
-FLASH_QR_UT_inc_determine_alg_blocksize
-FLASH_QR_UT_inc_noopt
-FLASH_QR_UT_inc_opt1
-FLASH_QR_UT_inc_solve
-FLASH_QR2_UT
-FLASH_QR2_UT_cntl_init
-FLASH_QR2_UT_cntl_finalize
-FLASH_Queue_begin
-FLASH_Queue_end
-FLASH_Queue_stack_depth
-FLASH_Queue_enable
-FLASH_Queue_disable
-FLASH_Queue_get_enabled
-FLASH_Queue_set_num_threads
-FLASH_Queue_get_num_threads
-FLASH_Queue_init
-FLASH_Queue_finalize
-FLASH_Queue_get_num_tasks
-FLASH_Queue_set_verbose_output
-FLASH_Queue_get_verbose_output
-FLASH_Queue_set_sorting
-FLASH_Queue_get_sorting
-FLASH_Queue_set_caching
-FLASH_Queue_get_caching
-FLASH_Queue_set_work_stealing
-FLASH_Queue_get_work_stealing
-FLASH_Queue_set_data_affinity
-FLASH_Queue_get_data_affinity
-FLASH_Queue_get_total_time
-FLASH_Queue_get_parallel_time
-FLASH_Queue_set_parallel_time
-FLASH_Queue_get_num_blocks
-FLASH_Queue_set_block_size
-FLASH_Queue_get_block_size
-FLASH_Queue_set_cache_size
-FLASH_Queue_get_cache_size
-FLASH_Queue_set_cache_line_size
-FLASH_Queue_get_cache_line_size
-FLASH_Queue_set_cores_per_cache
-FLASH_Queue_get_cores_per_cache
-FLASH_Queue_set_cores_per_queue
-FLASH_Queue_get_cores_per_queue
-FLASH_Queue_reset
-FLASH_Queue_get_head_task
-FLASH_Queue_get_tail_task
-FLASH_Queue_push
-FLASH_Queue_push_input
-FLASH_Queue_push_output
-FLASH_Task_alloc
-FLASH_Task_free
-FLASH_Queue_exec_task
-FLASH_Queue_verbose_output
-FLASH_Queue_exec
-FLASH_Queue_init_tasks
-FLASH_Queue_wait_enqueue
-FLASH_Queue_wait_dequeue
-FLASH_Queue_wait_dequeue_block
-FLASH_Queue_reside_in_cache
-FLASH_Queue_update_cache
-FLASH_Queue_update_cache_block
-FLASH_Queue_prefetch
-FLASH_Queue_prefetch_block
-FLASH_Queue_work_stealing
-FLASH_Queue_exec_parallel
-FLASH_Queue_exec_parallel_function
-FLASH_Task_update_dependencies
-FLASH_Task_update_binding
-FLASH_Task_free_parallel
-FLASH_Random_matrix
-FLASH_Random_spd_matrix
-FLASH_SA_FS
-FLASH_SA_LU
-FLASH_SPDinv
-FLASH_SPDinv_cntl_init
-FLASH_SPDinv_cntl_finalize
-FLASH_Sylv
-FLASH_Sylv_cntl_init
-FLASH_Sylv_cntl_finalize
-FLASH_Symm
-FLASH_Symm_cntl_init
-FLASH_Symm_cntl_finalize
-FLASH_Syr2k
-FLASH_Syr2k_cntl_init
-FLASH_Syr2k_cntl_finalize
-FLASH_Syrk
-FLASH_Syrk_cntl_init
-FLASH_Syrk_cntl_finalize
-FLASH_Triangularize
-FLASH_Trinv
-FLASH_Trinv_cntl_init
-FLASH_Trinv_cntl_finalize
-FLASH_Trmm
-FLASH_Trmm_cntl_init
-FLASH_Trmm_cntl_finalize
-FLASH_Trsm
-FLASH_Trsm_cntl_init
-FLASH_Trsm_cntl_finalize
-FLASH_Trsm_piv
-FLASH_Trsv
-FLASH_Trsv_cntl_init
-FLASH_Trsv_cntl_finalize
-FLASH_Ttmm
-FLASH_Ttmm_cntl_init
-FLASH_Ttmm_cntl_finalize
-FLA_Absolute_square
-FLA_Accum_T_UT
-FLA_Accum_T_UT_fc_blk_var2
-FLA_Accum_T_UT_fc_opt_var1
-FLA_Accum_T_UT_fc_ops_var1
-FLA_Accum_T_UT_fc_opd_var1
-FLA_Accum_T_UT_fc_opc_var1
-FLA_Accum_T_UT_fc_opz_var1
-FLA_Accum_T_UT_fc_unb_var1
-FLA_Accum_T_UT_fr_blk_var2
-FLA_Accum_T_UT_fr_opt_var1
-FLA_Accum_T_UT_fr_ops_var1
-FLA_Accum_T_UT_fr_opd_var1
-FLA_Accum_T_UT_fr_opc_var1
-FLA_Accum_T_UT_fr_opz_var1
-FLA_Accum_T_UT_fr_unb_var1
-FLA_Accum_T_UT_internal
-FLA_Amax
-FLA_Amax_external
-FLA_Apply_H2_UT
-FLA_Apply_H2_UT_internal
-FLA_Apply_H2_UT_lh_opt_var1
-FLA_Apply_H2_UT_lh_ops_var1
-FLA_Apply_H2_UT_lh_opd_var1
-FLA_Apply_H2_UT_lh_opc_var1
-FLA_Apply_H2_UT_lh_opz_var1
-FLA_Apply_H2_UT_lh_unb_var1
-FLA_Apply_H2_UT_rh_opt_var1
-FLA_Apply_H2_UT_rh_ops_var1
-FLA_Apply_H2_UT_rh_opd_var1
-FLA_Apply_H2_UT_rh_opc_var1
-FLA_Apply_H2_UT_rh_opz_var1
-FLA_Apply_H2_UT_rh_unb_var1
-FLA_Apply_H2_UT_rn_opt_var1
-FLA_Apply_H2_UT_rn_ops_var1
-FLA_Apply_H2_UT_rn_opd_var1
-FLA_Apply_H2_UT_rn_opc_var1
-FLA_Apply_H2_UT_rn_opz_var1
-FLA_Apply_H2_UT_rn_unb_var1
-FLA_Apply_pivots
-FLA_Apply_pivots_cntl_init
-FLA_Apply_pivots_cntl_finalize
-FLA_Apply_pivots_internal
-FLA_Apply_pivots_ln
-FLA_Apply_pivots_ln_blk_var1
-FLA_Apply_pivots_ln_blk_var2
-FLA_Apply_pivots_ln_opt_var1
-FLA_Apply_pivots_ln_ops_var1
-FLA_Apply_pivots_ln_opd_var1
-FLA_Apply_pivots_ln_opc_var1
-FLA_Apply_pivots_ln_opz_var1
-FLA_Apply_pivots_macro_external
-FLA_Apply_pivots_macro_task
-FLA_Apply_pivots_task
-FLA_Apply_pivots_ln_task
-FLA_Apply_pivots_unb_external
-FLA_Apply_pivots_ln_unb_ext
-FLA_Apply_Q_blk_external
-FLA_Apply_Q_UT
-FLA_Apply_Q_UT_cntl_init
-FLA_Apply_Q_UT_cntl_finalize
-FLA_Apply_Q_UT_create_workspace
-FLA_Apply_Q_UT_inc_internal
-FLA_Apply_Q_UT_inc_lhfc
-FLA_Apply_Q_UT_inc_lhfc_blk_var1
-FLA_Apply_Q_UT_internal
-FLA_Apply_Q_UT_lhfc
-FLA_Apply_Q_UT_lhfc_blk_var1
-FLA_Apply_Q_UT_lhfc_blk_var2
-FLA_Apply_Q_UT_lnfr
-FLA_Apply_Q_UT_lnfr_blk_var1
-FLA_Apply_Q_UT_lnfr_blk_var2
-FLA_Apply_Q_UT_rnfr
-FLA_Apply_Q_UT_rnfr_blk_var1
-FLA_Apply_Q_UT_rnfr_blk_var2
-FLA_Apply_Q_UT_task
-FLA_Apply_Q_UT_lhfc_task
-FLA_Apply_Q_UT_lnfr_task
-FLA_Apply_Q_UT_rnfr_task
-FLA_Apply_Q2_UT_cntl_init
-FLA_Apply_Q2_UT_cntl_finalize
-FLA_Apply_Q2_UT_internal
-FLA_Apply_Q2_UT_lhfc
-FLA_Apply_Q2_UT_lhfc_blk_var1
-FLA_Apply_Q2_UT_lhfc_blk_var2
-FLA_Apply_Q2_UT_lhfc_blk_var3
-FLA_Apply_Q2_UT_task
-FLA_Apply_Q2_UT_lhfc_task
-FLA_Asum
-FLA_Asum_external
-FLA_Axpy
-FLA_Axpys
-FLA_Axpys_external
-FLA_Axpyt
-FLA_Axpyt_c
-FLA_Axpyt_cntl_init
-FLA_Axpyt_cntl_finalize
-FLA_Axpyt_c_blk_var1
-FLA_Axpyt_c_blk_var2
-FLA_Axpyt_c_blk_var3
-FLA_Axpyt_c_blk_var4
-FLA_Axpyt_external
-FLA_Axpyt_h
-FLA_Axpyt_h_blk_var1
-FLA_Axpyt_h_blk_var2
-FLA_Axpyt_h_blk_var3
-FLA_Axpyt_h_blk_var4
-FLA_Axpyt_internal
-FLA_Axpyt_n
-FLA_Axpyt_n_blk_var1
-FLA_Axpyt_n_blk_var2
-FLA_Axpyt_n_blk_var3
-FLA_Axpyt_n_blk_var4
-FLA_Axpyt_t
-FLA_Axpyt_task
-FLA_Axpyt_n_task
-FLA_Axpyt_t_task
-FLA_Axpyt_c_task
-FLA_Axpyt_h_task
-FLA_Axpyt_t_blk_var1
-FLA_Axpyt_t_blk_var2
-FLA_Axpyt_t_blk_var3
-FLA_Axpyt_t_blk_var4
-FLA_Axpy_blk_var1
-FLA_Axpy_blk_var2
-FLA_Axpy_blk_var3
-FLA_Axpy_blk_var4
-FLA_Axpy_cntl_init
-FLA_Axpy_cntl_finalize
-FLA_Axpy_external
-FLA_Axpy_internal
-FLA_Axpy_task
-FLA_Axpy_buffer_to_object
-FLA_Axpy_object_to_buffer
-FLA_Blocksize_create
-FLA_Blocksize_set
-FLA_Blocksize_scale
-FLA_Blocksize_create_copy
-FLA_Blocksize_free
-FLA_Blocksize_extract
-FLA_Query_blocksizes
-FLA_Query_blocksize
-FLA_Determine_blocksize
-FLA_determine_matrix_size
-FLA_Check_error_level
-FLA_Check_error_level_set
-FLA_Check_error_code_helper
-FLA_Check_valid_side
-FLA_Check_valid_uplo
-FLA_Check_valid_trans
-FLA_Check_valid_diag
-FLA_Check_valid_conj
-FLA_Check_valid_direct
-FLA_Check_valid_storev
-FLA_Check_valid_datatype
-FLA_Check_valid_object_datatype
-FLA_Check_floating_datatype
-FLA_Check_int_datatype
-FLA_Check_real_datatype
-FLA_Check_complex_datatype
-FLA_Check_floating_object
-FLA_Check_int_object
-FLA_Check_real_object
-FLA_Check_complex_object
-FLA_Check_identical_object_precision
-FLA_Check_consistent_object_datatype
-FLA_Check_consistent_datatype
-FLA_Check_square
-FLA_Check_if_scalar
-FLA_Check_if_vector
-FLA_Check_conformal_dims
-FLA_Check_matrix_matrix_dims
-FLA_Check_matrix_vector_dims
-FLA_Check_equal_vector_lengths
-FLA_Check_conj_trans_and_datatype
-FLA_Check_vector_length
-FLA_Check_null_pointer
-FLA_Check_object_dims
-FLA_Check_valid_pivot_type
-FLA_Check_malloc_pointer
-FLA_Check_base_buffer_mismatch
-FLA_Check_adjacent_objects_2x2
-FLA_Check_adjacent_objects_2x1
-FLA_Check_adjacent_objects_1x2
-FLA_Check_blocksize_value
-FLA_Check_blocksize_object
-FLA_Check_file_descriptor
-FLA_Check_lseek_result
-FLA_Check_close_result
-FLA_Check_unlink_result
-FLA_Check_read_result
-FLA_Check_write_result
-FLA_Check_valid_quadrant
-FLA_Check_vector_length_min
-FLA_Check_pthread_create_result
-FLA_Check_pthread_join_result
-FLA_Check_valid_isgn_value
-FLA_Check_sylv_matrix_dims
-FLA_Check_chol_failure
-FLA_Check_valid_elemtype
-FLA_Check_posix_memalign_failure
-FLA_Check_submatrix_dims_and_offset
-FLA_Check_object_scalar_elemtype
-FLA_Check_object_matrix_elemtype
-FLA_Check_num_threads
-FLA_Check_conj_and_datatype
-FLA_Check_valid_complex_trans
-FLA_Check_valid_real_trans
-FLA_Check_valid_blas_trans
-FLA_Check_nonconstant_datatype
-FLA_Check_nonconstant_object
-FLA_Check_identical_object_datatype
-FLA_Check_divide_by_zero
-FLA_Check_identical_object_elemtype
-FLA_Check_pivot_index_range
-FLA_Check_householder_panel_dims
-FLA_Check_object_length_equals
-FLA_Check_object_width_equals
-FLA_Check_object_length_min
-FLA_Check_object_width_min
-FLA_Check_valid_error_level
-FLA_Check_attempted_repart_2x2
-FLA_Check_attempted_repart_2x1
-FLA_Check_attempted_repart_1x2
-FLA_Check_valid_leftright_side
-FLA_Check_valid_topbottom_side
-FLA_Check_matrix_strides
-FLA_Chol
-FLA_Chol_blk_external
-FLA_Chol_cntl_init
-FLA_Chol_cntl_finalize
-FLA_Chol_internal
-FLA_Chol_l
-FLA_Chol_l_blk_var1
-FLA_Chol_l_blk_var2
-FLA_Chol_l_blk_var3
-FLA_Chol_l_opt_var1
-FLA_Chol_l_ops_var1
-FLA_Chol_l_opd_var1
-FLA_Chol_l_opc_var1
-FLA_Chol_l_opz_var1
-FLA_Chol_l_opt_var2
-FLA_Chol_l_ops_var2
-FLA_Chol_l_opd_var2
-FLA_Chol_l_opc_var2
-FLA_Chol_l_opz_var2
-FLA_Chol_l_opt_var3
-FLA_Chol_l_ops_var3
-FLA_Chol_l_opd_var3
-FLA_Chol_l_opc_var3
-FLA_Chol_l_opz_var3
-FLA_Chol_l_unb_var1
-FLA_Chol_l_unb_var2
-FLA_Chol_l_unb_var3
-FLA_Chol_solve
-FLA_Chol_task
-FLA_Chol_l_task
-FLA_Chol_u_task
-FLA_Chol_u
-FLA_Chol_unb_external
-FLA_Chol_l_unb_ext
-FLA_Chol_u_unb_ext
-FLA_Chol_u_blk_var1
-FLA_Chol_u_blk_var2
-FLA_Chol_u_blk_var3
-FLA_Chol_u_opt_var1
-FLA_Chol_u_ops_var1
-FLA_Chol_u_opd_var1
-FLA_Chol_u_opc_var1
-FLA_Chol_u_opz_var1
-FLA_Chol_u_opt_var2
-FLA_Chol_u_ops_var2
-FLA_Chol_u_opd_var2
-FLA_Chol_u_opc_var2
-FLA_Chol_u_opz_var2
-FLA_Chol_u_opt_var3
-FLA_Chol_u_ops_var3
-FLA_Chol_u_opd_var3
-FLA_Chol_u_opc_var3
-FLA_Chol_u_opz_var3
-FLA_Chol_u_unb_var1
-FLA_Chol_u_unb_var2
-FLA_Chol_u_unb_var3
-FLA_Clock
-FLA_Clock_helper
-FLA_Cntl_obj_free
-FLA_Cntl_axpy_obj_create
-FLA_Cntl_axpyt_obj_create
-FLA_Cntl_copy_obj_create
-FLA_Cntl_copyt_obj_create
-FLA_Cntl_swap_obj_create
-FLA_Cntl_tpose_obj_create
-FLA_Cntl_gemv_obj_create
-FLA_Cntl_trsv_obj_create
-FLA_Cntl_gemm_obj_create
-FLA_Cntl_hemm_obj_create
-FLA_Cntl_herk_obj_create
-FLA_Cntl_her2k_obj_create
-FLA_Cntl_symm_obj_create
-FLA_Cntl_syrk_obj_create
-FLA_Cntl_syr2k_obj_create
-FLA_Cntl_trmm_obj_create
-FLA_Cntl_trsm_obj_create
-FLA_Cntl_init
-FLA_Cntl_finalize
-FLA_Cntl_init_flamec
-FLA_Cntl_finalize_flamec
-FLA_Cntl_init_flash
-FLA_Cntl_finalize_flash
-FLA_Cntl_chol_obj_create
-FLA_Cntl_lu_obj_create
-FLA_Cntl_appiv_obj_create
-FLA_Cntl_qrut_obj_create
-FLA_Cntl_qrutud_obj_create
-FLA_Cntl_qrutinc_obj_create
-FLA_Cntl_lqut_obj_create
-FLA_Cntl_trinv_obj_create
-FLA_Cntl_ttmm_obj_create
-FLA_Cntl_sylv_obj_create
-FLA_Cntl_spdinv_obj_create
-FLA_Cntl_apqut_obj_create
-FLA_Cntl_apqutud_obj_create
-FLA_Cntl_apqutinc_obj_create
-FLA_Conjugate
-FLA_Conjugate_r
-FLA_Copy
-FLA_Copyr
-FLA_Copyr_external
-FLA_Copyt
-FLA_Copyt_c
-FLA_Copyt_cntl_init
-FLA_Copyt_cntl_finalize
-FLA_Copyt_c_blk_var1
-FLA_Copyt_c_blk_var2
-FLA_Copyt_c_blk_var3
-FLA_Copyt_c_blk_var4
-FLA_Copyt_external
-FLA_Copyt_h
-FLA_Copyt_h_blk_var1
-FLA_Copyt_h_blk_var2
-FLA_Copyt_h_blk_var3
-FLA_Copyt_h_blk_var4
-FLA_Copyt_internal
-FLA_Copyt_n
-FLA_Copyt_n_blk_var1
-FLA_Copyt_n_blk_var2
-FLA_Copyt_n_blk_var3
-FLA_Copyt_n_blk_var4
-FLA_Copyt_t
-FLA_Copyt_task
-FLA_Copyt_n_task
-FLA_Copyt_t_task
-FLA_Copyt_c_task
-FLA_Copyt_h_task
-FLA_Copyt_t_blk_var1
-FLA_Copyt_t_blk_var2
-FLA_Copyt_t_blk_var3
-FLA_Copyt_t_blk_var4
-FLA_Copy_blk_var1
-FLA_Copy_blk_var2
-FLA_Copy_blk_var3
-FLA_Copy_blk_var4
-FLA_Copy_cntl_init
-FLA_Copy_cntl_finalize
-FLA_Copy_external
-FLA_Copy_internal
-FLA_Copy_task
-FLA_Copy_buffer_to_object
-FLA_Copy_object_to_buffer
-FLA_Dot
-FLA_Dot2cs
-FLA_Dot2cs_external
-FLA_Dot2s
-FLA_Dot2s_external
-FLA_Dotc
-FLA_Dotcs
-FLA_Dotcs_external
-FLA_Dotc_external
-FLA_Dots
-FLA_Dots_external
-FLA_Dot_external
-FLA_Error_string_for_code
-FLA_Error_messages_init
-FLA_Print_message
-FLA_Abort
-FLA_Form_perm_matrix
-FLA_Gemm
-FLA_Gemm_cntl_init
-FLA_Gemm_cntl_finalize
-FLA_Gemm_external
-FLA_Gemm_hh
-FLA_Gemm_hh_blk_var1
-FLA_Gemm_hh_blk_var2
-FLA_Gemm_hh_blk_var3
-FLA_Gemm_hh_blk_var4
-FLA_Gemm_hh_blk_var5
-FLA_Gemm_hh_blk_var6
-FLA_Gemm_hh_unb_var1
-FLA_Gemm_hh_unb_var2
-FLA_Gemm_hh_unb_var3
-FLA_Gemm_hh_unb_var4
-FLA_Gemm_hh_unb_var5
-FLA_Gemm_hh_unb_var6
-FLA_Gemm_hn
-FLA_Gemm_hn_blk_var1
-FLA_Gemm_hn_blk_var2
-FLA_Gemm_hn_blk_var3
-FLA_Gemm_hn_blk_var4
-FLA_Gemm_hn_blk_var5
-FLA_Gemm_hn_blk_var6
-FLA_Gemm_hn_unb_var1
-FLA_Gemm_hn_unb_var2
-FLA_Gemm_hn_unb_var3
-FLA_Gemm_hn_unb_var4
-FLA_Gemm_hn_unb_var5
-FLA_Gemm_hn_unb_var6
-FLA_Gemm_ht
-FLA_Gemm_ht_blk_var1
-FLA_Gemm_ht_blk_var2
-FLA_Gemm_ht_blk_var3
-FLA_Gemm_ht_blk_var4
-FLA_Gemm_ht_blk_var5
-FLA_Gemm_ht_blk_var6
-FLA_Gemm_ht_unb_var1
-FLA_Gemm_ht_unb_var2
-FLA_Gemm_ht_unb_var3
-FLA_Gemm_ht_unb_var4
-FLA_Gemm_ht_unb_var5
-FLA_Gemm_ht_unb_var6
-FLA_Gemm_internal
-FLA_Gemm_nh
-FLA_Gemm_nh_blk_var1
-FLA_Gemm_nh_blk_var2
-FLA_Gemm_nh_blk_var3
-FLA_Gemm_nh_blk_var4
-FLA_Gemm_nh_blk_var5
-FLA_Gemm_nh_blk_var6
-FLA_Gemm_nh_unb_var1
-FLA_Gemm_nh_unb_var2
-FLA_Gemm_nh_unb_var3
-FLA_Gemm_nh_unb_var4
-FLA_Gemm_nh_unb_var5
-FLA_Gemm_nh_unb_var6
-FLA_Gemm_nn
-FLA_Gemm_nn_blk_var1
-FLA_Gemm_nn_blk_var2
-FLA_Gemm_nn_blk_var3
-FLA_Gemm_nn_blk_var4
-FLA_Gemm_nn_blk_var5
-FLA_Gemm_nn_blk_var6
-FLA_Gemm_nn_unb_var1
-FLA_Gemm_nn_unb_var2
-FLA_Gemm_nn_unb_var3
-FLA_Gemm_nn_unb_var4
-FLA_Gemm_nn_unb_var5
-FLA_Gemm_nn_unb_var6
-FLA_Gemm_nt
-FLA_Gemm_nt_blk_var1
-FLA_Gemm_nt_blk_var2
-FLA_Gemm_nt_blk_var3
-FLA_Gemm_nt_blk_var4
-FLA_Gemm_nt_blk_var5
-FLA_Gemm_nt_blk_var6
-FLA_Gemm_nt_unb_var1
-FLA_Gemm_nt_unb_var2
-FLA_Gemm_nt_unb_var3
-FLA_Gemm_nt_unb_var4
-FLA_Gemm_nt_unb_var5
-FLA_Gemm_nt_unb_var6
-FLA_Gemm_task
-FLA_Gemm_hh_task
-FLA_Gemm_hn_task
-FLA_Gemm_ht_task
-FLA_Gemm_nh_task
-FLA_Gemm_nn_task
-FLA_Gemm_nt_task
-FLA_Gemm_th_task
-FLA_Gemm_tn_task
-FLA_Gemm_tt_task
-FLA_Gemm_th
-FLA_Gemm_th_blk_var1
-FLA_Gemm_th_blk_var2
-FLA_Gemm_th_blk_var3
-FLA_Gemm_th_blk_var4
-FLA_Gemm_th_blk_var5
-FLA_Gemm_th_blk_var6
-FLA_Gemm_th_unb_var1
-FLA_Gemm_th_unb_var2
-FLA_Gemm_th_unb_var3
-FLA_Gemm_th_unb_var4
-FLA_Gemm_th_unb_var5
-FLA_Gemm_th_unb_var6
-FLA_Gemm_tn
-FLA_Gemm_tn_blk_var1
-FLA_Gemm_tn_blk_var2
-FLA_Gemm_tn_blk_var3
-FLA_Gemm_tn_blk_var4
-FLA_Gemm_tn_blk_var5
-FLA_Gemm_tn_blk_var6
-FLA_Gemm_tn_unb_var1
-FLA_Gemm_tn_unb_var2
-FLA_Gemm_tn_unb_var3
-FLA_Gemm_tn_unb_var4
-FLA_Gemm_tn_unb_var5
-FLA_Gemm_tn_unb_var6
-FLA_Gemm_tt
-FLA_Gemm_tt_blk_var1
-FLA_Gemm_tt_blk_var2
-FLA_Gemm_tt_blk_var3
-FLA_Gemm_tt_blk_var4
-FLA_Gemm_tt_blk_var5
-FLA_Gemm_tt_blk_var6
-FLA_Gemm_tt_unb_var1
-FLA_Gemm_tt_unb_var2
-FLA_Gemm_tt_unb_var3
-FLA_Gemm_tt_unb_var4
-FLA_Gemm_tt_unb_var5
-FLA_Gemm_tt_unb_var6
-FLA_Gemp
-FLA_Gemv
-FLA_Gemvc
-FLA_Gemvc_external
-FLA_Gemv_c
-FLA_Gemv_cntl_init
-FLA_Gemv_cntl_finalize
-FLA_Gemv_c_blk_var1
-FLA_Gemv_c_blk_var2
-FLA_Gemv_c_blk_var5
-FLA_Gemv_c_blk_var6
-FLA_Gemv_external
-FLA_Gemv_internal
-FLA_Gemv_n
-FLA_Gemv_n_blk_var1
-FLA_Gemv_n_blk_var2
-FLA_Gemv_n_blk_var5
-FLA_Gemv_n_blk_var6
-FLA_Gemv_t
-FLA_Gemv_task
-FLA_Gemv_c_task
-FLA_Gemv_n_task
-FLA_Gemv_t_task
-FLA_Gemv_t_blk_var1
-FLA_Gemv_t_blk_var2
-FLA_Gemv_t_blk_var5
-FLA_Gemv_t_blk_var6
-FLA_Gepm
-FLA_Gepp
-FLA_Ger
-FLA_Gerc
-FLA_Gerc_external
-FLA_Ger_external
-FLA_Hemm
-FLA_Hemm_cntl_init
-FLA_Hemm_cntl_finalize
-FLA_Hemm_external
-FLA_Hemm_internal
-FLA_Hemm_ll
-FLA_Hemm_ll_blk_var1
-FLA_Hemm_ll_blk_var10
-FLA_Hemm_ll_blk_var2
-FLA_Hemm_ll_blk_var3
-FLA_Hemm_ll_blk_var4
-FLA_Hemm_ll_blk_var5
-FLA_Hemm_ll_blk_var6
-FLA_Hemm_ll_blk_var7
-FLA_Hemm_ll_blk_var8
-FLA_Hemm_ll_blk_var9
-FLA_Hemm_ll_unb_var1
-FLA_Hemm_ll_unb_var10
-FLA_Hemm_ll_unb_var2
-FLA_Hemm_ll_unb_var3
-FLA_Hemm_ll_unb_var4
-FLA_Hemm_ll_unb_var5
-FLA_Hemm_ll_unb_var6
-FLA_Hemm_ll_unb_var7
-FLA_Hemm_ll_unb_var8
-FLA_Hemm_ll_unb_var9
-FLA_Hemm_lu
-FLA_Hemm_lu_blk_var1
-FLA_Hemm_lu_blk_var10
-FLA_Hemm_lu_blk_var2
-FLA_Hemm_lu_blk_var3
-FLA_Hemm_lu_blk_var4
-FLA_Hemm_lu_blk_var5
-FLA_Hemm_lu_blk_var6
-FLA_Hemm_lu_blk_var7
-FLA_Hemm_lu_blk_var8
-FLA_Hemm_lu_blk_var9
-FLA_Hemm_lu_unb_var1
-FLA_Hemm_lu_unb_var10
-FLA_Hemm_lu_unb_var2
-FLA_Hemm_lu_unb_var3
-FLA_Hemm_lu_unb_var4
-FLA_Hemm_lu_unb_var5
-FLA_Hemm_lu_unb_var6
-FLA_Hemm_lu_unb_var7
-FLA_Hemm_lu_unb_var8
-FLA_Hemm_lu_unb_var9
-FLA_Hemm_rl
-FLA_Hemm_rl_blk_var1
-FLA_Hemm_rl_blk_var10
-FLA_Hemm_rl_blk_var2
-FLA_Hemm_rl_blk_var3
-FLA_Hemm_rl_blk_var4
-FLA_Hemm_rl_blk_var5
-FLA_Hemm_rl_blk_var6
-FLA_Hemm_rl_blk_var7
-FLA_Hemm_rl_blk_var8
-FLA_Hemm_rl_blk_var9
-FLA_Hemm_rl_unb_var1
-FLA_Hemm_rl_unb_var10
-FLA_Hemm_rl_unb_var2
-FLA_Hemm_rl_unb_var3
-FLA_Hemm_rl_unb_var4
-FLA_Hemm_rl_unb_var5
-FLA_Hemm_rl_unb_var6
-FLA_Hemm_rl_unb_var7
-FLA_Hemm_rl_unb_var8
-FLA_Hemm_rl_unb_var9
-FLA_Hemm_ru
-FLA_Hemm_ru_blk_var1
-FLA_Hemm_ru_blk_var10
-FLA_Hemm_ru_blk_var2
-FLA_Hemm_ru_blk_var3
-FLA_Hemm_ru_blk_var4
-FLA_Hemm_ru_blk_var5
-FLA_Hemm_ru_blk_var6
-FLA_Hemm_ru_blk_var7
-FLA_Hemm_ru_blk_var8
-FLA_Hemm_ru_blk_var9
-FLA_Hemm_ru_unb_var1
-FLA_Hemm_ru_unb_var10
-FLA_Hemm_ru_unb_var2
-FLA_Hemm_ru_unb_var3
-FLA_Hemm_ru_unb_var4
-FLA_Hemm_ru_unb_var5
-FLA_Hemm_ru_unb_var6
-FLA_Hemm_ru_unb_var7
-FLA_Hemm_ru_unb_var8
-FLA_Hemm_ru_unb_var9
-FLA_Hemm_task
-FLA_Hemm_ll_task
-FLA_Hemm_lu_task
-FLA_Hemm_rl_task
-FLA_Hemm_ru_task
-FLA_Hemv
-FLA_Hemvc
-FLA_Hemvc_external
-FLA_Hemv_external
-FLA_Her
-FLA_Her2
-FLA_Her2c
-FLA_Her2c_external
-FLA_Her2k
-FLA_Her2k_cntl_init
-FLA_Her2k_cntl_finalize
-FLA_Her2k_external
-FLA_Her2k_internal
-FLA_Her2k_lh
-FLA_Her2k_lh_blk_var1
-FLA_Her2k_lh_blk_var10
-FLA_Her2k_lh_blk_var2
-FLA_Her2k_lh_blk_var3
-FLA_Her2k_lh_blk_var4
-FLA_Her2k_lh_blk_var5
-FLA_Her2k_lh_blk_var6
-FLA_Her2k_lh_blk_var7
-FLA_Her2k_lh_blk_var8
-FLA_Her2k_lh_blk_var9
-FLA_Her2k_lh_unb_var1
-FLA_Her2k_lh_unb_var10
-FLA_Her2k_lh_unb_var2
-FLA_Her2k_lh_unb_var3
-FLA_Her2k_lh_unb_var4
-FLA_Her2k_lh_unb_var5
-FLA_Her2k_lh_unb_var6
-FLA_Her2k_lh_unb_var7
-FLA_Her2k_lh_unb_var8
-FLA_Her2k_lh_unb_var9
-FLA_Her2k_ln
-FLA_Her2k_ln_blk_var1
-FLA_Her2k_ln_blk_var10
-FLA_Her2k_ln_blk_var2
-FLA_Her2k_ln_blk_var3
-FLA_Her2k_ln_blk_var4
-FLA_Her2k_ln_blk_var5
-FLA_Her2k_ln_blk_var6
-FLA_Her2k_ln_blk_var7
-FLA_Her2k_ln_blk_var8
-FLA_Her2k_ln_blk_var9
-FLA_Her2k_ln_unb_var1
-FLA_Her2k_ln_unb_var10
-FLA_Her2k_ln_unb_var2
-FLA_Her2k_ln_unb_var3
-FLA_Her2k_ln_unb_var4
-FLA_Her2k_ln_unb_var5
-FLA_Her2k_ln_unb_var6
-FLA_Her2k_ln_unb_var7
-FLA_Her2k_ln_unb_var8
-FLA_Her2k_ln_unb_var9
-FLA_Her2k_task
-FLA_Her2k_ln_task
-FLA_Her2k_lh_task
-FLA_Her2k_un_task
-FLA_Her2k_uh_task
-FLA_Her2k_uh
-FLA_Her2k_uh_blk_var1
-FLA_Her2k_uh_blk_var10
-FLA_Her2k_uh_blk_var2
-FLA_Her2k_uh_blk_var3
-FLA_Her2k_uh_blk_var4
-FLA_Her2k_uh_blk_var5
-FLA_Her2k_uh_blk_var6
-FLA_Her2k_uh_blk_var7
-FLA_Her2k_uh_blk_var8
-FLA_Her2k_uh_blk_var9
-FLA_Her2k_uh_unb_var1
-FLA_Her2k_uh_unb_var10
-FLA_Her2k_uh_unb_var2
-FLA_Her2k_uh_unb_var3
-FLA_Her2k_uh_unb_var4
-FLA_Her2k_uh_unb_var5
-FLA_Her2k_uh_unb_var6
-FLA_Her2k_uh_unb_var7
-FLA_Her2k_uh_unb_var8
-FLA_Her2k_uh_unb_var9
-FLA_Her2k_un
-FLA_Her2k_un_blk_var1
-FLA_Her2k_un_blk_var10
-FLA_Her2k_un_blk_var2
-FLA_Her2k_un_blk_var3
-FLA_Her2k_un_blk_var4
-FLA_Her2k_un_blk_var5
-FLA_Her2k_un_blk_var6
-FLA_Her2k_un_blk_var7
-FLA_Her2k_un_blk_var8
-FLA_Her2k_un_blk_var9
-FLA_Her2k_un_unb_var1
-FLA_Her2k_un_unb_var10
-FLA_Her2k_un_unb_var2
-FLA_Her2k_un_unb_var3
-FLA_Her2k_un_unb_var4
-FLA_Her2k_un_unb_var5
-FLA_Her2k_un_unb_var6
-FLA_Her2k_un_unb_var7
-FLA_Her2k_un_unb_var8
-FLA_Her2k_un_unb_var9
-FLA_Her2_external
-FLA_Herc
-FLA_Herc_external
-FLA_Herk
-FLA_Herk_cntl_init
-FLA_Herk_cntl_finalize
-FLA_Herk_external
-FLA_Herk_internal
-FLA_Herk_lh
-FLA_Herk_lh_blk_var1
-FLA_Herk_lh_blk_var2
-FLA_Herk_lh_blk_var3
-FLA_Herk_lh_blk_var4
-FLA_Herk_lh_blk_var5
-FLA_Herk_lh_blk_var6
-FLA_Herk_lh_unb_var1
-FLA_Herk_lh_unb_var2
-FLA_Herk_lh_unb_var3
-FLA_Herk_lh_unb_var4
-FLA_Herk_lh_unb_var5
-FLA_Herk_lh_unb_var6
-FLA_Herk_ln
-FLA_Herk_ln_blk_var1
-FLA_Herk_ln_blk_var2
-FLA_Herk_ln_blk_var3
-FLA_Herk_ln_blk_var4
-FLA_Herk_ln_blk_var5
-FLA_Herk_ln_blk_var6
-FLA_Herk_ln_unb_var1
-FLA_Herk_ln_unb_var2
-FLA_Herk_ln_unb_var3
-FLA_Herk_ln_unb_var4
-FLA_Herk_ln_unb_var5
-FLA_Herk_ln_unb_var6
-FLA_Herk_task
-FLA_Herk_ln_task
-FLA_Herk_lh_task
-FLA_Herk_un_task
-FLA_Herk_uh_task
-FLA_Herk_uh
-FLA_Herk_uh_blk_var1
-FLA_Herk_uh_blk_var2
-FLA_Herk_uh_blk_var3
-FLA_Herk_uh_blk_var4
-FLA_Herk_uh_blk_var5
-FLA_Herk_uh_blk_var6
-FLA_Herk_uh_unb_var1
-FLA_Herk_uh_unb_var2
-FLA_Herk_uh_unb_var3
-FLA_Herk_uh_unb_var4
-FLA_Herk_uh_unb_var5
-FLA_Herk_uh_unb_var6
-FLA_Herk_un
-FLA_Herk_un_blk_var1
-FLA_Herk_un_blk_var2
-FLA_Herk_un_blk_var3
-FLA_Herk_un_blk_var4
-FLA_Herk_un_blk_var5
-FLA_Herk_un_blk_var6
-FLA_Herk_un_unb_var1
-FLA_Herk_un_unb_var2
-FLA_Herk_un_unb_var3
-FLA_Herk_un_unb_var4
-FLA_Herk_un_unb_var5
-FLA_Herk_un_unb_var6
-FLA_Hermitianize
-FLA_Her_external
-FLA_Househ2_UT
-FLA_Househ2_UT_ops
-FLA_Househ2_UT_opd
-FLA_Househ2_UT_opc
-FLA_Househ2_UT_opz
-FLA_Init
-FLA_Finalize
-FLA_Init_safe
-FLA_Finalize_safe
-FLA_Initialized
-FLA_Init_constants
-FLA_Finalize_constants
-FLA_Invert
-FLA_Inv_scal
-FLA_Inv_scalc
-FLA_Inv_scalc_external
-FLA_Inv_scal_external
-FLA_Lock_init
-FLA_Lock_acquire
-FLA_Lock_release
-FLA_Lock_destroy
-FLA_LQ_blk_external
-FLA_LQ_unb_external
-FLA_LQ_UT
-FLA_LQ_UT_Accum_T_blk_var1
-FLA_LQ_UT_Accum_T_opt_var1
-FLA_LQ_UT_Accum_T_ops_var1
-FLA_LQ_UT_Accum_T_opd_var1
-FLA_LQ_UT_Accum_T_opc_var1
-FLA_LQ_UT_Accum_T_opz_var1
-FLA_LQ_UT_Accum_T_unb_var1
-FLA_LQ_UT_blk_var2
-FLA_LQ_UT_cntl_init
-FLA_LQ_UT_cntl_finalize
-FLA_LQ_UT_create_T
-FLA_LQ_UT_internal
-FLA_LQ_UT_opt_var2
-FLA_LQ_UT_ops_var2
-FLA_LQ_UT_opd_var2
-FLA_LQ_UT_opc_var2
-FLA_LQ_UT_opz_var2
-FLA_LQ_UT_recover_tau
-FLA_LQ_UT_recover_tau_submatrix
-FLA_LQ_UT_solve
-FLA_LQ_UT_task
-FLA_LQ_UT_unb_var2
-FLA_LU_find_zero_on_diagonal
-FLA_LU_nopiv
-FLA_LU_nopiv_blk_var1
-FLA_LU_nopiv_blk_var2
-FLA_LU_nopiv_blk_var3
-FLA_LU_nopiv_blk_var4
-FLA_LU_nopiv_blk_var5
-FLA_LU_nopiv_cntl_init
-FLA_LU_nopiv_cntl_finalize
-FLA_LU_nopiv_internal
-FLA_LU_nopiv_opt_var1
-FLA_LU_nopiv_ops_var1
-FLA_LU_nopiv_opd_var1
-FLA_LU_nopiv_opc_var1
-FLA_LU_nopiv_opz_var1
-FLA_LU_nopiv_opt_var2
-FLA_LU_nopiv_ops_var2
-FLA_LU_nopiv_opd_var2
-FLA_LU_nopiv_opc_var2
-FLA_LU_nopiv_opz_var2
-FLA_LU_nopiv_opt_var3
-FLA_LU_nopiv_ops_var3
-FLA_LU_nopiv_opd_var3
-FLA_LU_nopiv_opc_var3
-FLA_LU_nopiv_opz_var3
-FLA_LU_nopiv_opt_var4
-FLA_LU_nopiv_ops_var4
-FLA_LU_nopiv_opd_var4
-FLA_LU_nopiv_opc_var4
-FLA_LU_nopiv_opz_var4
-FLA_LU_nopiv_opt_var5
-FLA_LU_nopiv_ops_var5
-FLA_LU_nopiv_opd_var5
-FLA_LU_nopiv_opc_var5
-FLA_LU_nopiv_opz_var5
-FLA_LU_nopiv_solve
-FLA_LU_nopiv_task
-FLA_LU_nopiv_unb_var1
-FLA_LU_nopiv_unb_var2
-FLA_LU_nopiv_unb_var3
-FLA_LU_nopiv_unb_var4
-FLA_LU_nopiv_unb_var5
-FLA_LU_piv
-FLA_LU_piv_blk_external
-FLA_LU_piv_blk_var3
-FLA_LU_piv_blk_var4
-FLA_LU_piv_blk_var5
-FLA_LU_piv_cntl_init
-FLA_LU_piv_cntl_finalize
-FLA_LU_piv_copy_task
-FLA_LU_piv_internal
-FLA_LU_piv_macro_task
-FLA_LU_piv_opt_var3
-FLA_LU_piv_ops_var3
-FLA_LU_piv_opd_var3
-FLA_LU_piv_opc_var3
-FLA_LU_piv_opz_var3
-FLA_LU_piv_opt_var4
-FLA_LU_piv_ops_var4
-FLA_LU_piv_opd_var4
-FLA_LU_piv_opc_var4
-FLA_LU_piv_opz_var4
-FLA_LU_piv_opt_var5
-FLA_LU_piv_ops_var5
-FLA_LU_piv_opd_var5
-FLA_LU_piv_opc_var5
-FLA_LU_piv_opz_var5
-FLA_LU_piv_solve
-FLA_LU_piv_task
-FLA_LU_piv_unb_external
-FLA_LU_piv_unb_ext
-FLA_LU_piv_unb_var3
-FLA_LU_piv_unb_var3b
-FLA_LU_piv_unb_var4
-FLA_LU_piv_unb_var5
-FLA_Max_abs_value
-FLA_Max_elemwise_diff
-FLA_Memory_leak_counter_init
-FLA_Memory_leak_counter_finalize
-FLA_Memory_leak_counter_status
-FLA_Memory_leak_counter_set
-FLA_malloc
-FLA_realloc
-FLA_free
-FLA_Set
-FLA_Obj_extract_real_scalar
-FLA_Set_diag
-FLA_Set_to_identity
-FLA_Add_to_diag
-FLA_Shift_diag
-FLA_Scale_diag
-FLA_Obj_fshow
-FLA_Obj_show
-FLA_Mult_add
-FLA_Negate
-FLA_Norm1
-FLA_Norm_inf
-FLA_Nrm2
-FLA_Nrm2_external
-FLA_Obj_create
-FLA_Obj_create_ext
-FLA_align_ldim
-FLA_Obj_create_conf_to
-FLA_Obj_create_copy_of
-FLA_Obj_create_without_buffer
-FLA_Obj_create_constant
-FLA_Obj_create_complex_constant
-FLA_Obj_attach_buffer
-FLA_Obj_free
-FLA_Obj_free_without_buffer
-FLA_Param_map_flame_to_netlib_trans
-FLA_Param_map_flame_to_netlib_uplo
-FLA_Param_map_flame_to_netlib_side
-FLA_Param_map_flame_to_netlib_diag
-FLA_Param_map_flame_to_netlib_direct
-FLA_Param_map_flame_to_netlib_storev
-FLA_Param_map_flame_to_blis_trans
-FLA_Param_map_flame_to_blis_conj
-FLA_Param_map_flame_to_blis_uplo
-FLA_Param_map_flame_to_blis_side
-FLA_Param_map_flame_to_blis_diag
-FLA_Param_map_blis_to_netlib_trans
-FLA_Param_map_blis_to_netlib_uplo
-FLA_Param_map_blis_to_netlib_side
-FLA_Param_map_blis_to_netlib_diag
-FLA_Param_map_netlib_to_flame_trans
-FLA_Param_map_netlib_to_flame_uplo
-FLA_Param_map_netlib_to_flame_side
-FLA_Param_map_netlib_to_flame_diag
-FLA_Param_map_blislapack_to_flame_trans
-FLA_Param_map_blislapack_to_flame_uplo
-FLA_Param_map_blislapack_to_flame_side
-FLA_Param_map_blislapack_to_flame_diag
-FLA_QR_blk_external
-FLA_QR_unb_external
-FLA_QR_UT
-FLA_QR_UT_Accum_T_blk_var1
-FLA_QR_UT_Accum_T_opt_var1
-FLA_QR_UT_Accum_T_ops_var1
-FLA_QR_UT_Accum_T_opd_var1
-FLA_QR_UT_Accum_T_opc_var1
-FLA_QR_UT_Accum_T_opz_var1
-FLA_QR_UT_Accum_T_unb_var1
-FLA_QR_UT_blk_var2
-FLA_QR_UT_cntl_init
-FLA_QR_UT_cntl_finalize
-FLA_QR_UT_copy_internal
-FLA_QR_UT_copy_task
-FLA_QR_UT_create_T
-FLA_QR_UT_inc_blk_var1
-FLA_QR_UT_inc_blk_var2
-FLA_QR_UT_internal
-FLA_QR_UT_opt_var2
-FLA_QR_UT_ops_var2
-FLA_QR_UT_opd_var2
-FLA_QR_UT_opc_var2
-FLA_QR_UT_opz_var2
-FLA_QR_UT_recover_tau
-FLA_QR_UT_recover_tau_submatrix
-FLA_QR_UT_solve
-FLA_QR_UT_task
-FLA_QR2_UT_Accum_T_opt_var1
-FLA_QR2_UT_Accum_T_ops_var1
-FLA_QR2_UT_Accum_T_opd_var1
-FLA_QR2_UT_Accum_T_opc_var1
-FLA_QR2_UT_Accum_T_opz_var1
-FLA_QR2_UT_Accum_T_unb_var1
-FLA_QR2_UT_blk_var1
-FLA_QR2_UT_blk_var2
-FLA_QR2_UT_cntl_init
-FLA_QR2_UT_cntl_finalize
-FLA_QR2_UT_internal
-FLA_QR2_UT_task
-FLA_QR2_UT_unb_var2
-FLA_Obj_datatype
-FLA_Obj_datatype_proj_to_real
-FLA_Obj_elemtype
-FLA_Obj_datatype_size
-FLA_Obj_elem_size
-FLA_Obj_length
-FLA_Obj_width
-FLA_Obj_vector_dim
-FLA_Obj_vector_inc
-FLA_Obj_min_dim
-FLA_Obj_max_dim
-FLA_Obj_row_stride
-FLA_Obj_col_stride
-FLA_Obj_buffer
-FLA_Obj_is_int
-FLA_Obj_is_floating_point
-FLA_Obj_is_constant
-FLA_Obj_is_real
-FLA_Obj_is_complex
-FLA_Obj_is_single_precision
-FLA_Obj_is_double_precision
-FLA_Obj_is_scalar
-FLA_Obj_is_vector
-FLA_Obj_has_zero_dim
-FLA_Obj_is_col_major
-FLA_Obj_is_row_major
-FLA_Obj_is_conformal_to
-FLA_Obj_is
-FLA_Obj_equals
-FLA_Random_herm_matrix
-FLA_Random_matrix
-FLA_random_float
-FLA_random_double
-FLA_random_scomplex
-FLA_random_dcomplex
-FLA_Random_spd_matrix
-FLA_Random_tri_matrix
-FLA_SA_Apply_pivots
-FLA_SA_FS_blk
-FLA_SA_FS_task
-FLA_SA_LU_blk
-FLA_SA_LU_task
-FLA_SA_LU_unb
-FLA_Scal
-FLA_Scalc
-FLA_Scalc_external
-FLA_Scalr
-FLA_Scalr_external
-FLA_Scal_external
-FLA_Shift_pivots_to
-FLA_SPDinv
-FLA_SPDinv_blk_external
-FLA_SPDinv_cntl_init
-FLA_SPDinv_cntl_finalize
-FLA_SPDinv_internal
-FLA_Sqrt
-FLA_Swap
-FLA_Swapt
-FLA_Swapt_external
-FLA_Swap_external
-FLA_Swap_t_blk_var1
-FLA_Swap_t_blk_var2
-FLA_Sylv
-FLA_Sylv_blk_external
-FLA_Sylv_cntl_init
-FLA_Sylv_cntl_finalize
-FLA_Sylv_hh
-FLA_Sylv_hh_blk_var1
-FLA_Sylv_hh_blk_var10
-FLA_Sylv_hh_blk_var11
-FLA_Sylv_hh_blk_var12
-FLA_Sylv_hh_blk_var13
-FLA_Sylv_hh_blk_var14
-FLA_Sylv_hh_blk_var15
-FLA_Sylv_hh_blk_var16
-FLA_Sylv_hh_blk_var17
-FLA_Sylv_hh_blk_var18
-FLA_Sylv_hh_blk_var2
-FLA_Sylv_hh_blk_var3
-FLA_Sylv_hh_blk_var4
-FLA_Sylv_hh_blk_var5
-FLA_Sylv_hh_blk_var6
-FLA_Sylv_hh_blk_var7
-FLA_Sylv_hh_blk_var8
-FLA_Sylv_hh_blk_var9
-FLA_Sylv_hh_opt_var1
-FLA_Sylv_hh_ops_var1
-FLA_Sylv_hh_opd_var1
-FLA_Sylv_hh_opc_var1
-FLA_Sylv_hh_opz_var1
-FLA_Sylv_hh_opt_var10
-FLA_Sylv_hh_opt_var11
-FLA_Sylv_hh_opt_var12
-FLA_Sylv_hh_opt_var13
-FLA_Sylv_hh_opt_var14
-FLA_Sylv_hh_opt_var15
-FLA_Sylv_hh_opt_var16
-FLA_Sylv_hh_opt_var17
-FLA_Sylv_hh_opt_var18
-FLA_Sylv_hh_opt_var2
-FLA_Sylv_hh_opt_var3
-FLA_Sylv_hh_opt_var4
-FLA_Sylv_hh_opt_var5
-FLA_Sylv_hh_opt_var6
-FLA_Sylv_hh_opt_var7
-FLA_Sylv_hh_opt_var8
-FLA_Sylv_hh_opt_var9
-FLA_Sylv_hn
-FLA_Sylv_hn_blk_var1
-FLA_Sylv_hn_blk_var10
-FLA_Sylv_hn_blk_var11
-FLA_Sylv_hn_blk_var12
-FLA_Sylv_hn_blk_var13
-FLA_Sylv_hn_blk_var14
-FLA_Sylv_hn_blk_var15
-FLA_Sylv_hn_blk_var16
-FLA_Sylv_hn_blk_var17
-FLA_Sylv_hn_blk_var18
-FLA_Sylv_hn_blk_var2
-FLA_Sylv_hn_blk_var3
-FLA_Sylv_hn_blk_var4
-FLA_Sylv_hn_blk_var5
-FLA_Sylv_hn_blk_var6
-FLA_Sylv_hn_blk_var7
-FLA_Sylv_hn_blk_var8
-FLA_Sylv_hn_blk_var9
-FLA_Sylv_hn_opt_var1
-FLA_Sylv_hn_ops_var1
-FLA_Sylv_hn_opd_var1
-FLA_Sylv_hn_opc_var1
-FLA_Sylv_hn_opz_var1
-FLA_Sylv_hn_opt_var10
-FLA_Sylv_hn_opt_var11
-FLA_Sylv_hn_opt_var12
-FLA_Sylv_hn_opt_var13
-FLA_Sylv_hn_opt_var14
-FLA_Sylv_hn_opt_var15
-FLA_Sylv_hn_opt_var16
-FLA_Sylv_hn_opt_var17
-FLA_Sylv_hn_opt_var18
-FLA_Sylv_hn_opt_var2
-FLA_Sylv_hn_opt_var3
-FLA_Sylv_hn_opt_var4
-FLA_Sylv_hn_opt_var5
-FLA_Sylv_hn_opt_var6
-FLA_Sylv_hn_opt_var7
-FLA_Sylv_hn_opt_var8
-FLA_Sylv_hn_opt_var9
-FLA_Sylv_internal
-FLA_Sylv_nh
-FLA_Sylv_nh_blk_var1
-FLA_Sylv_nh_blk_var10
-FLA_Sylv_nh_blk_var11
-FLA_Sylv_nh_blk_var12
-FLA_Sylv_nh_blk_var13
-FLA_Sylv_nh_blk_var14
-FLA_Sylv_nh_blk_var15
-FLA_Sylv_nh_blk_var16
-FLA_Sylv_nh_blk_var17
-FLA_Sylv_nh_blk_var18
-FLA_Sylv_nh_blk_var2
-FLA_Sylv_nh_blk_var3
-FLA_Sylv_nh_blk_var4
-FLA_Sylv_nh_blk_var5
-FLA_Sylv_nh_blk_var6
-FLA_Sylv_nh_blk_var7
-FLA_Sylv_nh_blk_var8
-FLA_Sylv_nh_blk_var9
-FLA_Sylv_nh_opt_var1
-FLA_Sylv_nh_ops_var1
-FLA_Sylv_nh_opd_var1
-FLA_Sylv_nh_opc_var1
-FLA_Sylv_nh_opz_var1
-FLA_Sylv_nh_opt_var10
-FLA_Sylv_nh_opt_var11
-FLA_Sylv_nh_opt_var12
-FLA_Sylv_nh_opt_var13
-FLA_Sylv_nh_opt_var14
-FLA_Sylv_nh_opt_var15
-FLA_Sylv_nh_opt_var16
-FLA_Sylv_nh_opt_var17
-FLA_Sylv_nh_opt_var18
-FLA_Sylv_nh_opt_var2
-FLA_Sylv_nh_opt_var3
-FLA_Sylv_nh_opt_var4
-FLA_Sylv_nh_opt_var5
-FLA_Sylv_nh_opt_var6
-FLA_Sylv_nh_opt_var7
-FLA_Sylv_nh_opt_var8
-FLA_Sylv_nh_opt_var9
-FLA_Sylv_nn
-FLA_Sylv_nn_blk_var1
-FLA_Sylv_nn_blk_var10
-FLA_Sylv_nn_blk_var11
-FLA_Sylv_nn_blk_var12
-FLA_Sylv_nn_blk_var13
-FLA_Sylv_nn_blk_var14
-FLA_Sylv_nn_blk_var15
-FLA_Sylv_nn_blk_var16
-FLA_Sylv_nn_blk_var17
-FLA_Sylv_nn_blk_var18
-FLA_Sylv_nn_blk_var2
-FLA_Sylv_nn_blk_var3
-FLA_Sylv_nn_blk_var4
-FLA_Sylv_nn_blk_var5
-FLA_Sylv_nn_blk_var6
-FLA_Sylv_nn_blk_var7
-FLA_Sylv_nn_blk_var8
-FLA_Sylv_nn_blk_var9
-FLA_Sylv_nn_opt_var1
-FLA_Sylv_nn_ops_var1
-FLA_Sylv_nn_opd_var1
-FLA_Sylv_nn_opc_var1
-FLA_Sylv_nn_opz_var1
-FLA_Sylv_nn_opt_var10
-FLA_Sylv_nn_opt_var11
-FLA_Sylv_nn_opt_var12
-FLA_Sylv_nn_opt_var13
-FLA_Sylv_nn_opt_var14
-FLA_Sylv_nn_opt_var15
-FLA_Sylv_nn_opt_var16
-FLA_Sylv_nn_opt_var17
-FLA_Sylv_nn_opt_var18
-FLA_Sylv_nn_opt_var2
-FLA_Sylv_nn_opt_var3
-FLA_Sylv_nn_opt_var4
-FLA_Sylv_nn_opt_var5
-FLA_Sylv_nn_opt_var6
-FLA_Sylv_nn_opt_var7
-FLA_Sylv_nn_opt_var8
-FLA_Sylv_nn_opt_var9
-FLA_Sylv_task
-FLA_Sylv_nn_task
-FLA_Sylv_nh_task
-FLA_Sylv_hn_task
-FLA_Sylv_hh_task
-FLA_Sylv_unb_external
-FLA_Sylv_nn_unb_ext
-FLA_Sylv_nh_unb_ext
-FLA_Sylv_hn_unb_ext
-FLA_Sylv_hh_unb_ext
-FLA_Symm
-FLA_Symmetrize
-FLA_Symm_cntl_init
-FLA_Symm_cntl_finalize
-FLA_Symm_external
-FLA_Symm_internal
-FLA_Symm_ll
-FLA_Symm_ll_blk_var1
-FLA_Symm_ll_blk_var10
-FLA_Symm_ll_blk_var2
-FLA_Symm_ll_blk_var3
-FLA_Symm_ll_blk_var4
-FLA_Symm_ll_blk_var5
-FLA_Symm_ll_blk_var6
-FLA_Symm_ll_blk_var7
-FLA_Symm_ll_blk_var8
-FLA_Symm_ll_blk_var9
-FLA_Symm_ll_unb_var1
-FLA_Symm_ll_unb_var10
-FLA_Symm_ll_unb_var2
-FLA_Symm_ll_unb_var3
-FLA_Symm_ll_unb_var4
-FLA_Symm_ll_unb_var5
-FLA_Symm_ll_unb_var6
-FLA_Symm_ll_unb_var7
-FLA_Symm_ll_unb_var8
-FLA_Symm_ll_unb_var9
-FLA_Symm_lu
-FLA_Symm_lu_blk_var1
-FLA_Symm_lu_blk_var10
-FLA_Symm_lu_blk_var2
-FLA_Symm_lu_blk_var3
-FLA_Symm_lu_blk_var4
-FLA_Symm_lu_blk_var5
-FLA_Symm_lu_blk_var6
-FLA_Symm_lu_blk_var7
-FLA_Symm_lu_blk_var8
-FLA_Symm_lu_blk_var9
-FLA_Symm_lu_unb_var1
-FLA_Symm_lu_unb_var10
-FLA_Symm_lu_unb_var2
-FLA_Symm_lu_unb_var3
-FLA_Symm_lu_unb_var4
-FLA_Symm_lu_unb_var5
-FLA_Symm_lu_unb_var6
-FLA_Symm_lu_unb_var7
-FLA_Symm_lu_unb_var8
-FLA_Symm_lu_unb_var9
-FLA_Symm_rl
-FLA_Symm_rl_blk_var1
-FLA_Symm_rl_blk_var10
-FLA_Symm_rl_blk_var2
-FLA_Symm_rl_blk_var3
-FLA_Symm_rl_blk_var4
-FLA_Symm_rl_blk_var5
-FLA_Symm_rl_blk_var6
-FLA_Symm_rl_blk_var7
-FLA_Symm_rl_blk_var8
-FLA_Symm_rl_blk_var9
-FLA_Symm_rl_unb_var1
-FLA_Symm_rl_unb_var10
-FLA_Symm_rl_unb_var2
-FLA_Symm_rl_unb_var3
-FLA_Symm_rl_unb_var4
-FLA_Symm_rl_unb_var5
-FLA_Symm_rl_unb_var6
-FLA_Symm_rl_unb_var7
-FLA_Symm_rl_unb_var8
-FLA_Symm_rl_unb_var9
-FLA_Symm_ru
-FLA_Symm_ru_blk_var1
-FLA_Symm_ru_blk_var10
-FLA_Symm_ru_blk_var2
-FLA_Symm_ru_blk_var3
-FLA_Symm_ru_blk_var4
-FLA_Symm_ru_blk_var5
-FLA_Symm_ru_blk_var6
-FLA_Symm_ru_blk_var7
-FLA_Symm_ru_blk_var8
-FLA_Symm_ru_blk_var9
-FLA_Symm_ru_unb_var1
-FLA_Symm_ru_unb_var10
-FLA_Symm_ru_unb_var2
-FLA_Symm_ru_unb_var3
-FLA_Symm_ru_unb_var4
-FLA_Symm_ru_unb_var5
-FLA_Symm_ru_unb_var6
-FLA_Symm_ru_unb_var7
-FLA_Symm_ru_unb_var8
-FLA_Symm_ru_unb_var9
-FLA_Symm_task
-FLA_Symm_ll_task
-FLA_Symm_lu_task
-FLA_Symm_rl_task
-FLA_Symm_ru_task
-FLA_Symv
-FLA_Symv_external
-FLA_Syr
-FLA_Syr2
-FLA_Syr2k
-FLA_Syr2k_cntl_init
-FLA_Syr2k_cntl_finalize
-FLA_Syr2k_external
-FLA_Syr2k_internal
-FLA_Syr2k_ln
-FLA_Syr2k_ln_blk_var1
-FLA_Syr2k_ln_blk_var10
-FLA_Syr2k_ln_blk_var2
-FLA_Syr2k_ln_blk_var3
-FLA_Syr2k_ln_blk_var4
-FLA_Syr2k_ln_blk_var5
-FLA_Syr2k_ln_blk_var6
-FLA_Syr2k_ln_blk_var7
-FLA_Syr2k_ln_blk_var8
-FLA_Syr2k_ln_blk_var9
-FLA_Syr2k_ln_unb_var1
-FLA_Syr2k_ln_unb_var10
-FLA_Syr2k_ln_unb_var2
-FLA_Syr2k_ln_unb_var3
-FLA_Syr2k_ln_unb_var4
-FLA_Syr2k_ln_unb_var5
-FLA_Syr2k_ln_unb_var6
-FLA_Syr2k_ln_unb_var7
-FLA_Syr2k_ln_unb_var8
-FLA_Syr2k_ln_unb_var9
-FLA_Syr2k_lt
-FLA_Syr2k_lt_blk_var1
-FLA_Syr2k_lt_blk_var10
-FLA_Syr2k_lt_blk_var2
-FLA_Syr2k_lt_blk_var3
-FLA_Syr2k_lt_blk_var4
-FLA_Syr2k_lt_blk_var5
-FLA_Syr2k_lt_blk_var6
-FLA_Syr2k_lt_blk_var7
-FLA_Syr2k_lt_blk_var8
-FLA_Syr2k_lt_blk_var9
-FLA_Syr2k_lt_unb_var1
-FLA_Syr2k_lt_unb_var10
-FLA_Syr2k_lt_unb_var2
-FLA_Syr2k_lt_unb_var3
-FLA_Syr2k_lt_unb_var4
-FLA_Syr2k_lt_unb_var5
-FLA_Syr2k_lt_unb_var6
-FLA_Syr2k_lt_unb_var7
-FLA_Syr2k_lt_unb_var8
-FLA_Syr2k_lt_unb_var9
-FLA_Syr2k_task
-FLA_Syr2k_ln_task
-FLA_Syr2k_lt_task
-FLA_Syr2k_un_task
-FLA_Syr2k_ut_task
-FLA_Syr2k_un
-FLA_Syr2k_un_blk_var1
-FLA_Syr2k_un_blk_var10
-FLA_Syr2k_un_blk_var2
-FLA_Syr2k_un_blk_var3
-FLA_Syr2k_un_blk_var4
-FLA_Syr2k_un_blk_var5
-FLA_Syr2k_un_blk_var6
-FLA_Syr2k_un_blk_var7
-FLA_Syr2k_un_blk_var8
-FLA_Syr2k_un_blk_var9
-FLA_Syr2k_un_unb_var1
-FLA_Syr2k_un_unb_var10
-FLA_Syr2k_un_unb_var2
-FLA_Syr2k_un_unb_var3
-FLA_Syr2k_un_unb_var4
-FLA_Syr2k_un_unb_var5
-FLA_Syr2k_un_unb_var6
-FLA_Syr2k_un_unb_var7
-FLA_Syr2k_un_unb_var8
-FLA_Syr2k_un_unb_var9
-FLA_Syr2k_ut
-FLA_Syr2k_ut_blk_var1
-FLA_Syr2k_ut_blk_var10
-FLA_Syr2k_ut_blk_var2
-FLA_Syr2k_ut_blk_var3
-FLA_Syr2k_ut_blk_var4
-FLA_Syr2k_ut_blk_var5
-FLA_Syr2k_ut_blk_var6
-FLA_Syr2k_ut_blk_var7
-FLA_Syr2k_ut_blk_var8
-FLA_Syr2k_ut_blk_var9
-FLA_Syr2k_ut_unb_var1
-FLA_Syr2k_ut_unb_var10
-FLA_Syr2k_ut_unb_var2
-FLA_Syr2k_ut_unb_var3
-FLA_Syr2k_ut_unb_var4
-FLA_Syr2k_ut_unb_var5
-FLA_Syr2k_ut_unb_var6
-FLA_Syr2k_ut_unb_var7
-FLA_Syr2k_ut_unb_var8
-FLA_Syr2k_ut_unb_var9
-FLA_Syr2_external
-FLA_Syrk
-FLA_Syrk_cntl_init
-FLA_Syrk_cntl_finalize
-FLA_Syrk_external
-FLA_Syrk_internal
-FLA_Syrk_ln
-FLA_Syrk_ln_blk_var1
-FLA_Syrk_ln_blk_var2
-FLA_Syrk_ln_blk_var3
-FLA_Syrk_ln_blk_var4
-FLA_Syrk_ln_blk_var5
-FLA_Syrk_ln_blk_var6
-FLA_Syrk_ln_unb_var1
-FLA_Syrk_ln_unb_var2
-FLA_Syrk_ln_unb_var3
-FLA_Syrk_ln_unb_var4
-FLA_Syrk_ln_unb_var5
-FLA_Syrk_ln_unb_var6
-FLA_Syrk_lt
-FLA_Syrk_lt_blk_var1
-FLA_Syrk_lt_blk_var2
-FLA_Syrk_lt_blk_var3
-FLA_Syrk_lt_blk_var4
-FLA_Syrk_lt_blk_var5
-FLA_Syrk_lt_blk_var6
-FLA_Syrk_lt_unb_var1
-FLA_Syrk_lt_unb_var2
-FLA_Syrk_lt_unb_var3
-FLA_Syrk_lt_unb_var4
-FLA_Syrk_lt_unb_var5
-FLA_Syrk_lt_unb_var6
-FLA_Syrk_task
-FLA_Syrk_ln_task
-FLA_Syrk_lt_task
-FLA_Syrk_un_task
-FLA_Syrk_ut_task
-FLA_Syrk_un
-FLA_Syrk_un_blk_var1
-FLA_Syrk_un_blk_var2
-FLA_Syrk_un_blk_var3
-FLA_Syrk_un_blk_var4
-FLA_Syrk_un_blk_var5
-FLA_Syrk_un_blk_var6
-FLA_Syrk_un_unb_var1
-FLA_Syrk_un_unb_var2
-FLA_Syrk_un_unb_var3
-FLA_Syrk_un_unb_var4
-FLA_Syrk_un_unb_var5
-FLA_Syrk_un_unb_var6
-FLA_Syrk_ut
-FLA_Syrk_ut_blk_var1
-FLA_Syrk_ut_blk_var2
-FLA_Syrk_ut_blk_var3
-FLA_Syrk_ut_blk_var4
-FLA_Syrk_ut_blk_var5
-FLA_Syrk_ut_blk_var6
-FLA_Syrk_ut_unb_var1
-FLA_Syrk_ut_unb_var2
-FLA_Syrk_ut_unb_var3
-FLA_Syrk_ut_unb_var4
-FLA_Syrk_ut_unb_var5
-FLA_Syrk_ut_unb_var6
-FLA_Syr_external
-FLA_Transpose
-FLA_Transpose_blk_var1
-FLA_Transpose_blk_var2
-FLA_Transpose_cntl_init
-FLA_Transpose_cntl_finalize
-FLA_Transpose_unb_var1
-FLA_Transpose_unb_var2
-FLA_Triangularize
-FLA_Trinv
-FLA_Trinv_blk_external
-FLA_Trinv_cntl_init
-FLA_Trinv_cntl_finalize
-FLA_Trinv_internal
-FLA_Trinv_ln
-FLA_Trinv_ln_blk_var1
-FLA_Trinv_ln_blk_var2
-FLA_Trinv_ln_blk_var3
-FLA_Trinv_ln_blk_var4
-FLA_Trinv_ln_opt_var1
-FLA_Trinv_ln_ops_var1
-FLA_Trinv_ln_opd_var1
-FLA_Trinv_ln_opc_var1
-FLA_Trinv_ln_opz_var1
-FLA_Trinv_ln_opt_var2
-FLA_Trinv_ln_ops_var2
-FLA_Trinv_ln_opd_var2
-FLA_Trinv_ln_opc_var2
-FLA_Trinv_ln_opz_var2
-FLA_Trinv_ln_opt_var3
-FLA_Trinv_ln_ops_var3
-FLA_Trinv_ln_opd_var3
-FLA_Trinv_ln_opc_var3
-FLA_Trinv_ln_opz_var3
-FLA_Trinv_ln_opt_var4
-FLA_Trinv_ln_ops_var4
-FLA_Trinv_ln_opd_var4
-FLA_Trinv_ln_opc_var4
-FLA_Trinv_ln_opz_var4
-FLA_Trinv_ln_unb_var1
-FLA_Trinv_ln_unb_var2
-FLA_Trinv_ln_unb_var3
-FLA_Trinv_ln_unb_var4
-FLA_Trinv_lu
-FLA_Trinv_lu_blk_var1
-FLA_Trinv_lu_blk_var2
-FLA_Trinv_lu_blk_var3
-FLA_Trinv_lu_blk_var4
-FLA_Trinv_lu_opt_var1
-FLA_Trinv_lu_ops_var1
-FLA_Trinv_lu_opd_var1
-FLA_Trinv_lu_opc_var1
-FLA_Trinv_lu_opz_var1
-FLA_Trinv_lu_opt_var2
-FLA_Trinv_lu_ops_var2
-FLA_Trinv_lu_opd_var2
-FLA_Trinv_lu_opc_var2
-FLA_Trinv_lu_opz_var2
-FLA_Trinv_lu_opt_var3
-FLA_Trinv_lu_ops_var3
-FLA_Trinv_lu_opd_var3
-FLA_Trinv_lu_opc_var3
-FLA_Trinv_lu_opz_var3
-FLA_Trinv_lu_opt_var4
-FLA_Trinv_lu_ops_var4
-FLA_Trinv_lu_opd_var4
-FLA_Trinv_lu_opc_var4
-FLA_Trinv_lu_opz_var4
-FLA_Trinv_lu_unb_var1
-FLA_Trinv_lu_unb_var2
-FLA_Trinv_lu_unb_var3
-FLA_Trinv_lu_unb_var4
-FLA_Trinv_task
-FLA_Trinv_ln_task
-FLA_Trinv_lu_task
-FLA_Trinv_un_task
-FLA_Trinv_uu_task
-FLA_Trinv_un
-FLA_Trinv_unb_external
-FLA_Trinv_ln_unb_ext
-FLA_Trinv_lu_unb_ext
-FLA_Trinv_un_unb_ext
-FLA_Trinv_uu_unb_ext
-FLA_Trinv_un_blk_var1
-FLA_Trinv_un_blk_var2
-FLA_Trinv_un_blk_var3
-FLA_Trinv_un_blk_var4
-FLA_Trinv_un_opt_var1
-FLA_Trinv_un_ops_var1
-FLA_Trinv_un_opd_var1
-FLA_Trinv_un_opc_var1
-FLA_Trinv_un_opz_var1
-FLA_Trinv_un_opt_var2
-FLA_Trinv_un_ops_var2
-FLA_Trinv_un_opd_var2
-FLA_Trinv_un_opc_var2
-FLA_Trinv_un_opz_var2
-FLA_Trinv_un_opt_var3
-FLA_Trinv_un_ops_var3
-FLA_Trinv_un_opd_var3
-FLA_Trinv_un_opc_var3
-FLA_Trinv_un_opz_var3
-FLA_Trinv_un_opt_var4
-FLA_Trinv_un_ops_var4
-FLA_Trinv_un_opd_var4
-FLA_Trinv_un_opc_var4
-FLA_Trinv_un_opz_var4
-FLA_Trinv_un_unb_var1
-FLA_Trinv_un_unb_var2
-FLA_Trinv_un_unb_var3
-FLA_Trinv_un_unb_var4
-FLA_Trinv_uu
-FLA_Trinv_uu_blk_var1
-FLA_Trinv_uu_blk_var2
-FLA_Trinv_uu_blk_var3
-FLA_Trinv_uu_blk_var4
-FLA_Trinv_uu_opt_var1
-FLA_Trinv_uu_ops_var1
-FLA_Trinv_uu_opd_var1
-FLA_Trinv_uu_opc_var1
-FLA_Trinv_uu_opz_var1
-FLA_Trinv_uu_opt_var2
-FLA_Trinv_uu_ops_var2
-FLA_Trinv_uu_opd_var2
-FLA_Trinv_uu_opc_var2
-FLA_Trinv_uu_opz_var2
-FLA_Trinv_uu_opt_var3
-FLA_Trinv_uu_ops_var3
-FLA_Trinv_uu_opd_var3
-FLA_Trinv_uu_opc_var3
-FLA_Trinv_uu_opz_var3
-FLA_Trinv_uu_opt_var4
-FLA_Trinv_uu_ops_var4
-FLA_Trinv_uu_opd_var4
-FLA_Trinv_uu_opc_var4
-FLA_Trinv_uu_opz_var4
-FLA_Trinv_uu_unb_var1
-FLA_Trinv_uu_unb_var2
-FLA_Trinv_uu_unb_var3
-FLA_Trinv_uu_unb_var4
-FLA_Trmm
-FLA_Trmmsx_external
-FLA_Trmm_cntl_init
-FLA_Trmm_cntl_finalize
-FLA_Trmm_external
-FLA_Trmm_internal
-FLA_Trmm_llh
-FLA_Trmm_llh_blk_var1
-FLA_Trmm_llh_blk_var2
-FLA_Trmm_llh_blk_var3
-FLA_Trmm_llh_blk_var4
-FLA_Trmm_llh_unb_var1
-FLA_Trmm_llh_unb_var2
-FLA_Trmm_llh_unb_var3
-FLA_Trmm_llh_unb_var4
-FLA_Trmm_lln
-FLA_Trmm_lln_blk_var1
-FLA_Trmm_lln_blk_var2
-FLA_Trmm_lln_blk_var3
-FLA_Trmm_lln_blk_var4
-FLA_Trmm_lln_unb_var1
-FLA_Trmm_lln_unb_var2
-FLA_Trmm_lln_unb_var3
-FLA_Trmm_lln_unb_var4
-FLA_Trmm_llt
-FLA_Trmm_llt_blk_var1
-FLA_Trmm_llt_blk_var2
-FLA_Trmm_llt_blk_var3
-FLA_Trmm_llt_blk_var4
-FLA_Trmm_llt_unb_var1
-FLA_Trmm_llt_unb_var2
-FLA_Trmm_llt_unb_var3
-FLA_Trmm_llt_unb_var4
-FLA_Trmm_luh
-FLA_Trmm_luh_blk_var1
-FLA_Trmm_luh_blk_var2
-FLA_Trmm_luh_blk_var3
-FLA_Trmm_luh_blk_var4
-FLA_Trmm_luh_unb_var1
-FLA_Trmm_luh_unb_var2
-FLA_Trmm_luh_unb_var3
-FLA_Trmm_luh_unb_var4
-FLA_Trmm_lun
-FLA_Trmm_lun_blk_var1
-FLA_Trmm_lun_blk_var2
-FLA_Trmm_lun_blk_var3
-FLA_Trmm_lun_blk_var4
-FLA_Trmm_lun_unb_var1
-FLA_Trmm_lun_unb_var2
-FLA_Trmm_lun_unb_var3
-FLA_Trmm_lun_unb_var4
-FLA_Trmm_lut
-FLA_Trmm_lut_blk_var1
-FLA_Trmm_lut_blk_var2
-FLA_Trmm_lut_blk_var3
-FLA_Trmm_lut_blk_var4
-FLA_Trmm_lut_unb_var1
-FLA_Trmm_lut_unb_var2
-FLA_Trmm_lut_unb_var3
-FLA_Trmm_lut_unb_var4
-FLA_Trmm_rlh
-FLA_Trmm_rlh_blk_var1
-FLA_Trmm_rlh_blk_var2
-FLA_Trmm_rlh_blk_var3
-FLA_Trmm_rlh_blk_var4
-FLA_Trmm_rlh_unb_var1
-FLA_Trmm_rlh_unb_var2
-FLA_Trmm_rlh_unb_var3
-FLA_Trmm_rlh_unb_var4
-FLA_Trmm_rln
-FLA_Trmm_rln_blk_var1
-FLA_Trmm_rln_blk_var2
-FLA_Trmm_rln_blk_var3
-FLA_Trmm_rln_blk_var4
-FLA_Trmm_rln_unb_var1
-FLA_Trmm_rln_unb_var2
-FLA_Trmm_rln_unb_var3
-FLA_Trmm_rln_unb_var4
-FLA_Trmm_rlt
-FLA_Trmm_rlt_blk_var1
-FLA_Trmm_rlt_blk_var2
-FLA_Trmm_rlt_blk_var3
-FLA_Trmm_rlt_blk_var4
-FLA_Trmm_rlt_unb_var1
-FLA_Trmm_rlt_unb_var2
-FLA_Trmm_rlt_unb_var3
-FLA_Trmm_rlt_unb_var4
-FLA_Trmm_ruh
-FLA_Trmm_ruh_blk_var1
-FLA_Trmm_ruh_blk_var2
-FLA_Trmm_ruh_blk_var3
-FLA_Trmm_ruh_blk_var4
-FLA_Trmm_ruh_unb_var1
-FLA_Trmm_ruh_unb_var2
-FLA_Trmm_ruh_unb_var3
-FLA_Trmm_ruh_unb_var4
-FLA_Trmm_run
-FLA_Trmm_run_blk_var1
-FLA_Trmm_run_blk_var2
-FLA_Trmm_run_blk_var3
-FLA_Trmm_run_blk_var4
-FLA_Trmm_run_unb_var1
-FLA_Trmm_run_unb_var2
-FLA_Trmm_run_unb_var3
-FLA_Trmm_run_unb_var4
-FLA_Trmm_rut
-FLA_Trmm_rut_blk_var1
-FLA_Trmm_rut_blk_var2
-FLA_Trmm_rut_blk_var3
-FLA_Trmm_rut_blk_var4
-FLA_Trmm_rut_unb_var1
-FLA_Trmm_rut_unb_var2
-FLA_Trmm_rut_unb_var3
-FLA_Trmm_rut_unb_var4
-FLA_Trmm_task
-FLA_Trmm_llh_task
-FLA_Trmm_lln_task
-FLA_Trmm_llt_task
-FLA_Trmm_luh_task
-FLA_Trmm_lun_task
-FLA_Trmm_lut_task
-FLA_Trmm_rlh_task
-FLA_Trmm_rln_task
-FLA_Trmm_rlt_task
-FLA_Trmm_ruh_task
-FLA_Trmm_run_task
-FLA_Trmm_rut_task
-FLA_Trmv
-FLA_Trmvsx
-FLA_Trmvsx_external
-FLA_Trmv_external
-FLA_Trsm
-FLA_Trsmsx_external
-FLA_Trsm_cntl_init
-FLA_Trsm_cntl_finalize
-FLA_Trsm_external
-FLA_Trsm_internal
-FLA_Trsm_llh
-FLA_Trsm_llh_blk_var1
-FLA_Trsm_llh_blk_var2
-FLA_Trsm_llh_blk_var3
-FLA_Trsm_llh_blk_var4
-FLA_Trsm_llh_unb_var1
-FLA_Trsm_llh_unb_var2
-FLA_Trsm_llh_unb_var3
-FLA_Trsm_llh_unb_var4
-FLA_Trsm_lln
-FLA_Trsm_lln_blk_var1
-FLA_Trsm_lln_blk_var2
-FLA_Trsm_lln_blk_var3
-FLA_Trsm_lln_blk_var4
-FLA_Trsm_lln_unb_var1
-FLA_Trsm_lln_unb_var2
-FLA_Trsm_lln_unb_var3
-FLA_Trsm_lln_unb_var4
-FLA_Trsm_llt
-FLA_Trsm_llt_blk_var1
-FLA_Trsm_llt_blk_var2
-FLA_Trsm_llt_blk_var3
-FLA_Trsm_llt_blk_var4
-FLA_Trsm_llt_unb_var1
-FLA_Trsm_llt_unb_var2
-FLA_Trsm_llt_unb_var3
-FLA_Trsm_llt_unb_var4
-FLA_Trsm_luh
-FLA_Trsm_luh_blk_var1
-FLA_Trsm_luh_blk_var2
-FLA_Trsm_luh_blk_var3
-FLA_Trsm_luh_blk_var4
-FLA_Trsm_luh_unb_var1
-FLA_Trsm_luh_unb_var2
-FLA_Trsm_luh_unb_var3
-FLA_Trsm_luh_unb_var4
-FLA_Trsm_lun
-FLA_Trsm_lun_blk_var1
-FLA_Trsm_lun_blk_var2
-FLA_Trsm_lun_blk_var3
-FLA_Trsm_lun_blk_var4
-FLA_Trsm_lun_unb_var1
-FLA_Trsm_lun_unb_var2
-FLA_Trsm_lun_unb_var3
-FLA_Trsm_lun_unb_var4
-FLA_Trsm_lut
-FLA_Trsm_lut_blk_var1
-FLA_Trsm_lut_blk_var2
-FLA_Trsm_lut_blk_var3
-FLA_Trsm_lut_blk_var4
-FLA_Trsm_lut_unb_var1
-FLA_Trsm_lut_unb_var2
-FLA_Trsm_lut_unb_var3
-FLA_Trsm_lut_unb_var4
-FLA_Trsm_piv_task
-FLA_Trsm_rlh
-FLA_Trsm_rlh_blk_var1
-FLA_Trsm_rlh_blk_var2
-FLA_Trsm_rlh_blk_var3
-FLA_Trsm_rlh_blk_var4
-FLA_Trsm_rlh_unb_var1
-FLA_Trsm_rlh_unb_var2
-FLA_Trsm_rlh_unb_var3
-FLA_Trsm_rlh_unb_var4
-FLA_Trsm_rln
-FLA_Trsm_rln_blk_var1
-FLA_Trsm_rln_blk_var2
-FLA_Trsm_rln_blk_var3
-FLA_Trsm_rln_blk_var4
-FLA_Trsm_rln_unb_var1
-FLA_Trsm_rln_unb_var2
-FLA_Trsm_rln_unb_var3
-FLA_Trsm_rln_unb_var4
-FLA_Trsm_rlt
-FLA_Trsm_rlt_blk_var1
-FLA_Trsm_rlt_blk_var2
-FLA_Trsm_rlt_blk_var3
-FLA_Trsm_rlt_blk_var4
-FLA_Trsm_rlt_unb_var1
-FLA_Trsm_rlt_unb_var2
-FLA_Trsm_rlt_unb_var3
-FLA_Trsm_rlt_unb_var4
-FLA_Trsm_ruh
-FLA_Trsm_ruh_blk_var1
-FLA_Trsm_ruh_blk_var2
-FLA_Trsm_ruh_blk_var3
-FLA_Trsm_ruh_blk_var4
-FLA_Trsm_ruh_unb_var1
-FLA_Trsm_ruh_unb_var2
-FLA_Trsm_ruh_unb_var3
-FLA_Trsm_ruh_unb_var4
-FLA_Trsm_run
-FLA_Trsm_run_blk_var1
-FLA_Trsm_run_blk_var2
-FLA_Trsm_run_blk_var3
-FLA_Trsm_run_blk_var4
-FLA_Trsm_run_unb_var1
-FLA_Trsm_run_unb_var2
-FLA_Trsm_run_unb_var3
-FLA_Trsm_run_unb_var4
-FLA_Trsm_rut
-FLA_Trsm_rut_blk_var1
-FLA_Trsm_rut_blk_var2
-FLA_Trsm_rut_blk_var3
-FLA_Trsm_rut_blk_var4
-FLA_Trsm_rut_unb_var1
-FLA_Trsm_rut_unb_var2
-FLA_Trsm_rut_unb_var3
-FLA_Trsm_rut_unb_var4
-FLA_Trsm_task
-FLA_Trsm_llh_task
-FLA_Trsm_lln_task
-FLA_Trsm_llt_task
-FLA_Trsm_luh_task
-FLA_Trsm_lun_task
-FLA_Trsm_lut_task
-FLA_Trsm_rlh_task
-FLA_Trsm_rln_task
-FLA_Trsm_rlt_task
-FLA_Trsm_ruh_task
-FLA_Trsm_run_task
-FLA_Trsm_rut_task
-FLA_Trsv
-FLA_Trsvsx
-FLA_Trsvsx_external
-FLA_Trsv_cntl_init
-FLA_Trsv_cntl_finalize
-FLA_Trsv_external
-FLA_Trsv_internal
-FLA_Trsv_lc
-FLA_Trsv_lc_blk_var1
-FLA_Trsv_lc_blk_var2
-FLA_Trsv_ln
-FLA_Trsv_ln_blk_var1
-FLA_Trsv_ln_blk_var2
-FLA_Trsv_lt
-FLA_Trsv_lt_blk_var1
-FLA_Trsv_lt_blk_var2
-FLA_Trsv_task
-FLA_Trsv_lc_task
-FLA_Trsv_ln_task
-FLA_Trsv_lt_task
-FLA_Trsv_uc_task
-FLA_Trsv_un_task
-FLA_Trsv_ut_task
-FLA_Trsv_uc
-FLA_Trsv_uc_blk_var1
-FLA_Trsv_uc_blk_var2
-FLA_Trsv_un
-FLA_Trsv_un_blk_var1
-FLA_Trsv_un_blk_var2
-FLA_Trsv_ut
-FLA_Trsv_ut_blk_var1
-FLA_Trsv_ut_blk_var2
-FLA_Ttmm
-FLA_Ttmm_blk_external
-FLA_Ttmm_cntl_init
-FLA_Ttmm_cntl_finalize
-FLA_Ttmm_internal
-FLA_Ttmm_l
-FLA_Ttmm_l_blk_var1
-FLA_Ttmm_l_blk_var2
-FLA_Ttmm_l_blk_var3
-FLA_Ttmm_l_opt_var1
-FLA_Ttmm_l_ops_var1
-FLA_Ttmm_l_opd_var1
-FLA_Ttmm_l_opc_var1
-FLA_Ttmm_l_opz_var1
-FLA_Ttmm_l_opt_var2
-FLA_Ttmm_l_ops_var2
-FLA_Ttmm_l_opd_var2
-FLA_Ttmm_l_opc_var2
-FLA_Ttmm_l_opz_var2
-FLA_Ttmm_l_opt_var3
-FLA_Ttmm_l_ops_var3
-FLA_Ttmm_l_opd_var3
-FLA_Ttmm_l_opc_var3
-FLA_Ttmm_l_opz_var3
-FLA_Ttmm_l_unb_var1
-FLA_Ttmm_l_unb_var2
-FLA_Ttmm_l_unb_var3
-FLA_Ttmm_task
-FLA_Ttmm_l_task
-FLA_Ttmm_u_task
-FLA_Ttmm_u
-FLA_Ttmm_unb_external
-FLA_Ttmm_l_unb_ext
-FLA_Ttmm_u_unb_ext
-FLA_Ttmm_u_blk_var1
-FLA_Ttmm_u_blk_var2
-FLA_Ttmm_u_blk_var3
-FLA_Ttmm_u_opt_var1
-FLA_Ttmm_u_ops_var1
-FLA_Ttmm_u_opd_var1
-FLA_Ttmm_u_opc_var1
-FLA_Ttmm_u_opz_var1
-FLA_Ttmm_u_opt_var2
-FLA_Ttmm_u_ops_var2
-FLA_Ttmm_u_opd_var2
-FLA_Ttmm_u_opc_var2
-FLA_Ttmm_u_opz_var2
-FLA_Ttmm_u_opt_var3
-FLA_Ttmm_u_ops_var3
-FLA_Ttmm_u_opd_var3
-FLA_Ttmm_u_opc_var3
-FLA_Ttmm_u_opz_var3
-FLA_Ttmm_u_unb_var1
-FLA_Ttmm_u_unb_var2
-FLA_Ttmm_u_unb_var3
-FLA_Part_2x2
-FLA_Part_2x1
-FLA_Part_1x2
-FLA_Repart_2x2_to_3x3
-FLA_Repart_2x1_to_3x1
-FLA_Repart_1x2_to_1x3
-FLA_Cont_with_3x3_to_2x2
-FLA_Cont_with_3x1_to_2x1
-FLA_Cont_with_1x3_to_1x2
-FLA_Merge_2x2
-FLA_Merge_2x1
-FLA_Merge_1x2
+EXPORTS \r
+FLA_TWO\r
+FLA_ONE\r
+FLA_ONE_HALF\r
+FLA_ZERO\r
+FLA_MINUS_ONE_HALF\r
+FLA_MINUS_ONE\r
+FLA_MINUS_TWO\r
+fla_axpyt_cntl_blas\r
+fla_copyt_cntl_blas\r
+fla_gemm_cntl_blas\r
+fla_hemm_cntl_blas\r
+fla_herk_cntl_blas\r
+fla_her2k_cntl_blas\r
+fla_symm_cntl_blas\r
+fla_syrk_cntl_blas\r
+fla_syr2k_cntl_blas\r
+fla_trmm_cntl_blas\r
+fla_trsm_cntl_blas\r
+fla_appiv_cntl_unb\r
+bli_samax \r
+bli_damax \r
+bli_camax \r
+bli_zamax \r
+bli_sasum \r
+bli_dasum \r
+bli_casum \r
+bli_zasum \r
+bli_saxpy \r
+bli_daxpy \r
+bli_caxpy \r
+bli_zaxpy \r
+bli_saxpymt \r
+bli_daxpymt \r
+bli_caxpymt \r
+bli_zaxpymt \r
+bli_saxpysmt \r
+bli_daxpysmt \r
+bli_caxpysmt \r
+bli_zaxpysmt \r
+bli_saxpysv \r
+bli_daxpysv \r
+bli_caxpysv \r
+bli_zaxpysv \r
+bli_saxpyv \r
+bli_daxpyv \r
+bli_caxpyv \r
+bli_zaxpyv \r
+bli_cconjm \r
+bli_zconjm \r
+bli_cconjmr \r
+bli_zconjmr \r
+bli_cconjv \r
+bli_zconjv \r
+bli_scopy \r
+bli_dcopy \r
+bli_ccopy \r
+bli_zcopy \r
+bli_scopymr \r
+bli_dcopymr \r
+bli_ccopymr \r
+bli_zcopymr \r
+bli_scopymt \r
+bli_dcopymt \r
+bli_ccopymt \r
+bli_zcopymt \r
+bli_scopyv \r
+bli_dcopyv \r
+bli_ccopyv \r
+bli_zcopyv \r
+bli_sdot \r
+bli_ddot \r
+bli_cdot \r
+bli_zdot \r
+bli_sdot2s \r
+bli_ddot2s \r
+bli_cdot2s \r
+bli_zdot2s \r
+bli_sdots \r
+bli_ddots \r
+bli_cdots \r
+bli_zdots \r
+bli_sinverts \r
+bli_dinverts \r
+bli_cinverts \r
+bli_zinverts \r
+bli_sinvscalm \r
+bli_dinvscalm \r
+bli_csinvscalm \r
+bli_cinvscalm \r
+bli_zdinvscalm \r
+bli_zinvscalm \r
+bli_sinvscalv \r
+bli_dinvscalv \r
+bli_csinvscalv \r
+bli_cinvscalv \r
+bli_zdinvscalv \r
+bli_zinvscalv \r
+bli_snrm2 \r
+bli_dnrm2 \r
+bli_cnrm2 \r
+bli_znrm2 \r
+bli_sscal \r
+bli_dscal \r
+bli_csscal \r
+bli_cscal \r
+bli_zdscal \r
+bli_zscal \r
+bli_sscalm \r
+bli_dscalm \r
+bli_csscalm \r
+bli_cscalm \r
+bli_zdscalm \r
+bli_zscalm \r
+bli_sscalmr \r
+bli_dscalmr \r
+bli_csscalmr \r
+bli_cscalmr \r
+bli_zdscalmr \r
+bli_zscalmr \r
+bli_sscalv \r
+bli_dscalv \r
+bli_csscalv \r
+bli_cscalv \r
+bli_zdscalv \r
+bli_zscalv \r
+bli_sswap \r
+bli_dswap \r
+bli_cswap \r
+bli_zswap \r
+bli_sswapmt \r
+bli_dswapmt \r
+bli_cswapmt \r
+bli_zswapmt \r
+bli_sgemv \r
+bli_dgemv \r
+bli_cgemv \r
+bli_zgemv \r
+bli_sger \r
+bli_dger \r
+bli_cger \r
+bli_zger \r
+bli_chemv \r
+bli_zhemv \r
+bli_cher \r
+bli_zher \r
+bli_cher2 \r
+bli_zher2 \r
+bli_ssymv \r
+bli_dsymv \r
+bli_csymv \r
+bli_zsymv \r
+bli_ssyr \r
+bli_dsyr \r
+bli_csyr \r
+bli_zsyr \r
+bli_ssyr2 \r
+bli_dsyr2 \r
+bli_csyr2 \r
+bli_zsyr2 \r
+bli_strmv \r
+bli_dtrmv \r
+bli_ctrmv \r
+bli_ztrmv \r
+bli_strsv \r
+bli_dtrsv \r
+bli_ctrsv \r
+bli_ztrsv \r
+bli_sgemm \r
+bli_dgemm \r
+bli_cgemm \r
+bli_zgemm \r
+bli_chemm \r
+bli_zhemm \r
+bli_cherk \r
+bli_zherk \r
+bli_cher2k \r
+bli_zher2k \r
+bli_ssymm \r
+bli_dsymm \r
+bli_csymm \r
+bli_zsymm \r
+bli_ssyrk \r
+bli_dsyrk \r
+bli_csyrk \r
+bli_zsyrk \r
+bli_ssyr2k \r
+bli_dsyr2k \r
+bli_csyr2k \r
+bli_zsyr2k \r
+bli_strmm \r
+bli_dtrmm \r
+bli_ctrmm \r
+bli_ztrmm \r
+bli_strsm \r
+bli_dtrsm \r
+bli_ctrsm \r
+bli_ztrsm \r
+FLASH_Apply_pivots \r
+FLASH_Apply_pivots_cntl_init \r
+FLASH_Apply_pivots_cntl_finalize \r
+FLASH_Apply_Q_UT \r
+FLASH_Apply_Q_UT_cntl_init \r
+FLASH_Apply_Q_UT_cntl_finalize \r
+FLASH_Apply_Q_UT_inc \r
+FLASH_Apply_Q_UT_inc_cntl_init \r
+FLASH_Apply_Q_UT_inc_cntl_finalize \r
+FLASH_Apply_Q_UT_inc_create_workspace \r
+FLASH_Apply_Q2_UT \r
+FLASH_Apply_Q2_UT_cntl_init \r
+FLASH_Apply_Q2_UT_cntl_finalize \r
+FLASH_Axpy \r
+FLASH_Axpyt \r
+FLASH_Axpyt_cntl_init \r
+FLASH_Axpyt_cntl_finalize \r
+FLASH_Axpy_cntl_init \r
+FLASH_Axpy_cntl_finalize \r
+FLASH_Axpy_buffer_to_hier \r
+FLASH_Axpy_hier_to_buffer \r
+FLASH_Axpy_flat_to_hier \r
+FLASH_Axpy_hier_to_flat \r
+FLASH_Axpy_hierarchy \r
+FLASH_Axpy_hierarchy_r \r
+FLASH_Chol \r
+FLASH_Chol_cntl_init \r
+FLASH_Chol_cntl_finalize \r
+FLASH_Chol_solve \r
+FLASH_Copy \r
+FLASH_Copyt \r
+FLASH_Copyt_cntl_init \r
+FLASH_Copyt_cntl_finalize \r
+FLASH_Copy_cntl_init \r
+FLASH_Copy_cntl_finalize \r
+FLASH_Copy_buffer_to_hier \r
+FLASH_Copy_hier_to_buffer \r
+FLASH_Copy_flat_to_hier \r
+FLASH_Copy_hier_to_flat \r
+FLASH_Copy_hierarchy \r
+FLASH_Copy_hierarchy_r \r
+FLASH_FS_incpiv \r
+FLASH_FS_incpiv_aux1 \r
+FLASH_FS_incpiv_aux2 \r
+FLASH_Gemm \r
+FLASH_Gemm_cntl_init \r
+FLASH_Gemm_cntl_finalize \r
+FLASH_Gemv \r
+FLASH_Gemv_cntl_init \r
+FLASH_Gemv_cntl_finalize \r
+FLASH_Hemm \r
+FLASH_Hemm_cntl_init \r
+FLASH_Hemm_cntl_finalize \r
+FLASH_Her2k \r
+FLASH_Her2k_cntl_init \r
+FLASH_Her2k_cntl_finalize \r
+FLASH_Herk \r
+FLASH_Herk_cntl_init \r
+FLASH_Herk_cntl_finalize \r
+FLASH_LU_find_zero_on_diagonal \r
+FLASH_LU_incpiv \r
+FLASH_LU_incpiv_cntl_init \r
+FLASH_LU_incpiv_cntl_finalize \r
+FLASH_LU_incpiv_create_hier_matrices \r
+FLASH_LU_incpiv_determine_alg_blocksize \r
+FLASH_LU_incpiv_noopt \r
+FLASH_LU_incpiv_opt1 \r
+FLASH_LU_incpiv_solve \r
+FLASH_LU_incpiv_var1 \r
+FLASH_LU_incpiv_var2 \r
+FLASH_LU_nopiv \r
+FLASH_LU_nopiv_cntl_init \r
+FLASH_LU_nopiv_cntl_finalize \r
+FLASH_LU_nopiv_solve \r
+FLASH_LU_piv \r
+FLASH_LU_piv_cntl_init \r
+FLASH_LU_piv_cntl_finalize \r
+FLASH_LU_piv_solve \r
+FLASH_Max_elemwise_diff \r
+FLASH_Norm1 \r
+FLASH_Obj_datatype \r
+FLASH_Obj_depth \r
+FLASH_Obj_blocksizes \r
+FLASH_Obj_scalar_length \r
+FLASH_Obj_scalar_width \r
+FLASH_Obj_create \r
+FLASH_Obj_create_ext \r
+FLASH_Obj_create_without_buffer \r
+FLASH_Obj_create_without_buffer_ext \r
+FLASH_Obj_create_helper \r
+FLASH_Obj_create_hierarchy \r
+FLASH_Obj_create_conf_to \r
+FLASH_Obj_create_hier_conf_to_flat \r
+FLASH_Obj_create_hier_conf_to_flat_ext \r
+FLASH_Obj_create_flat_conf_to_hier \r
+FLASH_Obj_create_hier_copy_of_flat \r
+FLASH_Obj_create_hier_copy_of_flat_ext \r
+FLASH_Obj_create_flat_copy_of_hier \r
+FLASH_Obj_free \r
+FLASH_Obj_free_without_buffer \r
+FLASH_Obj_free_hierarchy \r
+FLASH_Obj_extract_buffer \r
+FLASH_Obj_flatten \r
+FLASH_Obj_hierarchify \r
+FLASH_Obj_show \r
+FLASH_Obj_attach_buffer \r
+FLASH_Obj_attach_buffer_hierarchy \r
+FLASH_print_struct \r
+FLASH_print_struct_helper \r
+FLASH_Obj_create_diag_panel \r
+FLASH_Obj_exec \r
+FLASH_Obj_exec_parallel \r
+FLASH_Obj_push \r
+FLASH_Set \r
+FLASH_Shift_diag \r
+FLASH_QR_UT_cntl_init \r
+FLASH_QR_UT_cntl_finalize \r
+FLASH_QR_UT_inc \r
+FLASH_QR_UT_inc_cntl_init \r
+FLASH_QR_UT_inc_cntl_finalize \r
+FLASH_QR_UT_inc_create_hier_matrices \r
+FLASH_QR_UT_inc_determine_alg_blocksize \r
+FLASH_QR_UT_inc_noopt \r
+FLASH_QR_UT_inc_opt1 \r
+FLASH_QR_UT_inc_solve \r
+FLASH_QR2_UT \r
+FLASH_QR2_UT_cntl_init \r
+FLASH_QR2_UT_cntl_finalize \r
+FLASH_Queue_begin \r
+FLASH_Queue_end \r
+FLASH_Queue_stack_depth \r
+FLASH_Queue_enable \r
+FLASH_Queue_disable \r
+FLASH_Queue_get_enabled \r
+FLASH_Queue_set_num_threads \r
+FLASH_Queue_get_num_threads \r
+FLASH_Queue_init \r
+FLASH_Queue_finalize \r
+FLASH_Queue_get_num_tasks \r
+FLASH_Queue_set_verbose_output \r
+FLASH_Queue_get_verbose_output \r
+FLASH_Queue_set_sorting \r
+FLASH_Queue_get_sorting \r
+FLASH_Queue_set_caching \r
+FLASH_Queue_get_caching \r
+FLASH_Queue_set_work_stealing \r
+FLASH_Queue_get_work_stealing \r
+FLASH_Queue_set_data_affinity \r
+FLASH_Queue_get_data_affinity \r
+FLASH_Queue_get_total_time \r
+FLASH_Queue_get_parallel_time \r
+FLASH_Queue_set_parallel_time \r
+FLASH_Queue_get_num_blocks \r
+FLASH_Queue_set_block_size \r
+FLASH_Queue_get_block_size \r
+FLASH_Queue_set_cache_size \r
+FLASH_Queue_get_cache_size \r
+FLASH_Queue_set_cache_line_size \r
+FLASH_Queue_get_cache_line_size \r
+FLASH_Queue_set_cores_per_cache \r
+FLASH_Queue_get_cores_per_cache \r
+FLASH_Queue_set_cores_per_queue \r
+FLASH_Queue_get_cores_per_queue \r
+FLASH_Queue_reset \r
+FLASH_Queue_get_head_task \r
+FLASH_Queue_get_tail_task \r
+FLASH_Queue_push \r
+FLASH_Queue_push_input \r
+FLASH_Queue_push_output \r
+FLASH_Task_alloc \r
+FLASH_Task_free \r
+FLASH_Queue_exec_task \r
+FLASH_Queue_verbose_output \r
+FLASH_Queue_exec \r
+FLASH_Queue_init_tasks \r
+FLASH_Queue_wait_enqueue \r
+FLASH_Queue_wait_dequeue \r
+FLASH_Queue_wait_dequeue_block \r
+FLASH_Queue_reside_in_cache \r
+FLASH_Queue_update_cache \r
+FLASH_Queue_update_cache_block \r
+FLASH_Queue_prefetch \r
+FLASH_Queue_prefetch_block \r
+FLASH_Queue_work_stealing \r
+FLASH_Queue_exec_parallel \r
+FLASH_Queue_exec_parallel_function \r
+FLASH_Task_update_dependencies \r
+FLASH_Task_update_binding \r
+FLASH_Task_free_parallel \r
+FLASH_Random_matrix \r
+FLASH_Random_spd_matrix \r
+FLASH_SA_FS \r
+FLASH_SA_LU \r
+FLASH_SPDinv \r
+FLASH_SPDinv_cntl_init \r
+FLASH_SPDinv_cntl_finalize \r
+FLASH_Sylv \r
+FLASH_Sylv_cntl_init \r
+FLASH_Sylv_cntl_finalize \r
+FLASH_Symm \r
+FLASH_Symm_cntl_init \r
+FLASH_Symm_cntl_finalize \r
+FLASH_Syr2k \r
+FLASH_Syr2k_cntl_init \r
+FLASH_Syr2k_cntl_finalize \r
+FLASH_Syrk \r
+FLASH_Syrk_cntl_init \r
+FLASH_Syrk_cntl_finalize \r
+FLASH_Triangularize \r
+FLASH_Trinv \r
+FLASH_Trinv_cntl_init \r
+FLASH_Trinv_cntl_finalize \r
+FLASH_Trmm \r
+FLASH_Trmm_cntl_init \r
+FLASH_Trmm_cntl_finalize \r
+FLASH_Trsm \r
+FLASH_Trsm_cntl_init \r
+FLASH_Trsm_cntl_finalize \r
+FLASH_Trsm_piv \r
+FLASH_Trsv \r
+FLASH_Trsv_cntl_init \r
+FLASH_Trsv_cntl_finalize \r
+FLASH_Ttmm \r
+FLASH_Ttmm_cntl_init \r
+FLASH_Ttmm_cntl_finalize \r
+FLA_Absolute_square \r
+FLA_Accum_T_UT \r
+FLA_Accum_T_UT_fc_blk_var2 \r
+FLA_Accum_T_UT_fc_opt_var1 \r
+FLA_Accum_T_UT_fc_ops_var1 \r
+FLA_Accum_T_UT_fc_opd_var1 \r
+FLA_Accum_T_UT_fc_opc_var1 \r
+FLA_Accum_T_UT_fc_opz_var1 \r
+FLA_Accum_T_UT_fc_unb_var1 \r
+FLA_Accum_T_UT_fr_blk_var2 \r
+FLA_Accum_T_UT_fr_opt_var1 \r
+FLA_Accum_T_UT_fr_ops_var1 \r
+FLA_Accum_T_UT_fr_opd_var1 \r
+FLA_Accum_T_UT_fr_opc_var1 \r
+FLA_Accum_T_UT_fr_opz_var1 \r
+FLA_Accum_T_UT_fr_unb_var1 \r
+FLA_Accum_T_UT_internal \r
+FLA_Amax \r
+FLA_Amax_external \r
+FLA_Apply_H2_UT \r
+FLA_Apply_H2_UT_internal \r
+FLA_Apply_H2_UT_lh_opt_var1 \r
+FLA_Apply_H2_UT_lh_ops_var1 \r
+FLA_Apply_H2_UT_lh_opd_var1 \r
+FLA_Apply_H2_UT_lh_opc_var1 \r
+FLA_Apply_H2_UT_lh_opz_var1 \r
+FLA_Apply_H2_UT_lh_unb_var1 \r
+FLA_Apply_H2_UT_rh_opt_var1 \r
+FLA_Apply_H2_UT_rh_ops_var1 \r
+FLA_Apply_H2_UT_rh_opd_var1 \r
+FLA_Apply_H2_UT_rh_opc_var1 \r
+FLA_Apply_H2_UT_rh_opz_var1 \r
+FLA_Apply_H2_UT_rh_unb_var1 \r
+FLA_Apply_H2_UT_rn_opt_var1 \r
+FLA_Apply_H2_UT_rn_ops_var1 \r
+FLA_Apply_H2_UT_rn_opd_var1 \r
+FLA_Apply_H2_UT_rn_opc_var1 \r
+FLA_Apply_H2_UT_rn_opz_var1 \r
+FLA_Apply_H2_UT_rn_unb_var1 \r
+FLA_Apply_pivots \r
+FLA_Apply_pivots_cntl_init \r
+FLA_Apply_pivots_cntl_finalize \r
+FLA_Apply_pivots_internal \r
+FLA_Apply_pivots_ln \r
+FLA_Apply_pivots_ln_blk_var1 \r
+FLA_Apply_pivots_ln_blk_var2 \r
+FLA_Apply_pivots_ln_opt_var1 \r
+FLA_Apply_pivots_ln_ops_var1 \r
+FLA_Apply_pivots_ln_opd_var1 \r
+FLA_Apply_pivots_ln_opc_var1 \r
+FLA_Apply_pivots_ln_opz_var1 \r
+FLA_Apply_pivots_macro_external \r
+FLA_Apply_pivots_macro_task \r
+FLA_Apply_pivots_task \r
+FLA_Apply_pivots_ln_task \r
+FLA_Apply_pivots_unb_external \r
+FLA_Apply_pivots_ln_unb_ext \r
+FLA_Apply_Q_blk_external \r
+FLA_Apply_Q_UT \r
+FLA_Apply_Q_UT_cntl_init \r
+FLA_Apply_Q_UT_cntl_finalize \r
+FLA_Apply_Q_UT_create_workspace \r
+FLA_Apply_Q_UT_inc_internal \r
+FLA_Apply_Q_UT_inc_lhfc \r
+FLA_Apply_Q_UT_inc_lhfc_blk_var1 \r
+FLA_Apply_Q_UT_internal \r
+FLA_Apply_Q_UT_lhfc \r
+FLA_Apply_Q_UT_lhfc_blk_var1 \r
+FLA_Apply_Q_UT_lhfc_blk_var2 \r
+FLA_Apply_Q_UT_lnfr \r
+FLA_Apply_Q_UT_lnfr_blk_var1 \r
+FLA_Apply_Q_UT_lnfr_blk_var2 \r
+FLA_Apply_Q_UT_rnfr \r
+FLA_Apply_Q_UT_rnfr_blk_var1 \r
+FLA_Apply_Q_UT_rnfr_blk_var2 \r
+FLA_Apply_Q_UT_task \r
+FLA_Apply_Q_UT_lhfc_task \r
+FLA_Apply_Q_UT_lnfr_task \r
+FLA_Apply_Q_UT_rnfr_task \r
+FLA_Apply_Q2_UT_cntl_init \r
+FLA_Apply_Q2_UT_cntl_finalize \r
+FLA_Apply_Q2_UT_internal \r
+FLA_Apply_Q2_UT_lhfc \r
+FLA_Apply_Q2_UT_lhfc_blk_var1 \r
+FLA_Apply_Q2_UT_lhfc_blk_var2 \r
+FLA_Apply_Q2_UT_lhfc_blk_var3 \r
+FLA_Apply_Q2_UT_task \r
+FLA_Apply_Q2_UT_lhfc_task \r
+FLA_Asum \r
+FLA_Asum_external \r
+FLA_Axpy \r
+FLA_Axpys \r
+FLA_Axpys_external \r
+FLA_Axpyt \r
+FLA_Axpyt_c \r
+FLA_Axpyt_cntl_init \r
+FLA_Axpyt_cntl_finalize \r
+FLA_Axpyt_c_blk_var1 \r
+FLA_Axpyt_c_blk_var2 \r
+FLA_Axpyt_c_blk_var3 \r
+FLA_Axpyt_c_blk_var4 \r
+FLA_Axpyt_external \r
+FLA_Axpyt_h \r
+FLA_Axpyt_h_blk_var1 \r
+FLA_Axpyt_h_blk_var2 \r
+FLA_Axpyt_h_blk_var3 \r
+FLA_Axpyt_h_blk_var4 \r
+FLA_Axpyt_internal \r
+FLA_Axpyt_n \r
+FLA_Axpyt_n_blk_var1 \r
+FLA_Axpyt_n_blk_var2 \r
+FLA_Axpyt_n_blk_var3 \r
+FLA_Axpyt_n_blk_var4 \r
+FLA_Axpyt_t \r
+FLA_Axpyt_task \r
+FLA_Axpyt_n_task \r
+FLA_Axpyt_t_task \r
+FLA_Axpyt_c_task \r
+FLA_Axpyt_h_task \r
+FLA_Axpyt_t_blk_var1 \r
+FLA_Axpyt_t_blk_var2 \r
+FLA_Axpyt_t_blk_var3 \r
+FLA_Axpyt_t_blk_var4 \r
+FLA_Axpy_blk_var1 \r
+FLA_Axpy_blk_var2 \r
+FLA_Axpy_blk_var3 \r
+FLA_Axpy_blk_var4 \r
+FLA_Axpy_cntl_init \r
+FLA_Axpy_cntl_finalize \r
+FLA_Axpy_external \r
+FLA_Axpy_internal \r
+FLA_Axpy_task \r
+FLA_Axpy_buffer_to_object \r
+FLA_Axpy_object_to_buffer \r
+FLA_Blocksize_create \r
+FLA_Blocksize_set \r
+FLA_Blocksize_scale \r
+FLA_Blocksize_create_copy \r
+FLA_Blocksize_free \r
+FLA_Blocksize_extract \r
+FLA_Query_blocksizes \r
+FLA_Query_blocksize \r
+FLA_Determine_blocksize \r
+FLA_determine_matrix_size \r
+FLA_Check_error_level \r
+FLA_Check_error_level_set \r
+FLA_Check_error_code_helper \r
+FLA_Check_valid_side \r
+FLA_Check_valid_uplo \r
+FLA_Check_valid_trans \r
+FLA_Check_valid_diag \r
+FLA_Check_valid_conj \r
+FLA_Check_valid_direct \r
+FLA_Check_valid_storev \r
+FLA_Check_valid_datatype \r
+FLA_Check_valid_object_datatype \r
+FLA_Check_floating_datatype \r
+FLA_Check_int_datatype \r
+FLA_Check_real_datatype \r
+FLA_Check_complex_datatype \r
+FLA_Check_floating_object \r
+FLA_Check_int_object \r
+FLA_Check_real_object \r
+FLA_Check_complex_object \r
+FLA_Check_identical_object_precision \r
+FLA_Check_consistent_object_datatype \r
+FLA_Check_consistent_datatype \r
+FLA_Check_square \r
+FLA_Check_if_scalar \r
+FLA_Check_if_vector \r
+FLA_Check_conformal_dims \r
+FLA_Check_matrix_matrix_dims \r
+FLA_Check_matrix_vector_dims \r
+FLA_Check_equal_vector_lengths \r
+FLA_Check_conj_trans_and_datatype \r
+FLA_Check_vector_length \r
+FLA_Check_null_pointer \r
+FLA_Check_object_dims \r
+FLA_Check_valid_pivot_type \r
+FLA_Check_malloc_pointer \r
+FLA_Check_base_buffer_mismatch \r
+FLA_Check_adjacent_objects_2x2 \r
+FLA_Check_adjacent_objects_2x1 \r
+FLA_Check_adjacent_objects_1x2 \r
+FLA_Check_blocksize_value \r
+FLA_Check_blocksize_object \r
+FLA_Check_file_descriptor \r
+FLA_Check_lseek_result \r
+FLA_Check_close_result \r
+FLA_Check_unlink_result \r
+FLA_Check_read_result \r
+FLA_Check_write_result \r
+FLA_Check_valid_quadrant \r
+FLA_Check_vector_length_min \r
+FLA_Check_pthread_create_result \r
+FLA_Check_pthread_join_result \r
+FLA_Check_valid_isgn_value \r
+FLA_Check_sylv_matrix_dims \r
+FLA_Check_chol_failure \r
+FLA_Check_valid_elemtype \r
+FLA_Check_posix_memalign_failure \r
+FLA_Check_submatrix_dims_and_offset \r
+FLA_Check_object_scalar_elemtype \r
+FLA_Check_object_matrix_elemtype \r
+FLA_Check_num_threads \r
+FLA_Check_conj_and_datatype \r
+FLA_Check_valid_complex_trans \r
+FLA_Check_valid_real_trans \r
+FLA_Check_valid_blas_trans \r
+FLA_Check_nonconstant_datatype \r
+FLA_Check_nonconstant_object \r
+FLA_Check_identical_object_datatype \r
+FLA_Check_divide_by_zero \r
+FLA_Check_identical_object_elemtype \r
+FLA_Check_pivot_index_range \r
+FLA_Check_householder_panel_dims \r
+FLA_Check_object_length_equals \r
+FLA_Check_object_width_equals \r
+FLA_Check_object_length_min \r
+FLA_Check_object_width_min \r
+FLA_Check_valid_error_level \r
+FLA_Check_attempted_repart_2x2 \r
+FLA_Check_attempted_repart_2x1 \r
+FLA_Check_attempted_repart_1x2 \r
+FLA_Check_valid_leftright_side \r
+FLA_Check_valid_topbottom_side \r
+FLA_Check_matrix_strides \r
+FLA_Chol \r
+FLA_Chol_blk_external \r
+FLA_Chol_cntl_init \r
+FLA_Chol_cntl_finalize \r
+FLA_Chol_internal \r
+FLA_Chol_l \r
+FLA_Chol_l_blk_var1 \r
+FLA_Chol_l_blk_var2 \r
+FLA_Chol_l_blk_var3 \r
+FLA_Chol_l_opt_var1 \r
+FLA_Chol_l_ops_var1 \r
+FLA_Chol_l_opd_var1 \r
+FLA_Chol_l_opc_var1 \r
+FLA_Chol_l_opz_var1 \r
+FLA_Chol_l_opt_var2 \r
+FLA_Chol_l_ops_var2 \r
+FLA_Chol_l_opd_var2 \r
+FLA_Chol_l_opc_var2 \r
+FLA_Chol_l_opz_var2 \r
+FLA_Chol_l_opt_var3 \r
+FLA_Chol_l_ops_var3 \r
+FLA_Chol_l_opd_var3 \r
+FLA_Chol_l_opc_var3 \r
+FLA_Chol_l_opz_var3 \r
+FLA_Chol_l_unb_var1 \r
+FLA_Chol_l_unb_var2 \r
+FLA_Chol_l_unb_var3 \r
+FLA_Chol_solve \r
+FLA_Chol_task \r
+FLA_Chol_l_task \r
+FLA_Chol_u_task \r
+FLA_Chol_u \r
+FLA_Chol_unb_external \r
+FLA_Chol_l_unb_ext \r
+FLA_Chol_u_unb_ext \r
+FLA_Chol_u_blk_var1 \r
+FLA_Chol_u_blk_var2 \r
+FLA_Chol_u_blk_var3 \r
+FLA_Chol_u_opt_var1 \r
+FLA_Chol_u_ops_var1 \r
+FLA_Chol_u_opd_var1 \r
+FLA_Chol_u_opc_var1 \r
+FLA_Chol_u_opz_var1 \r
+FLA_Chol_u_opt_var2 \r
+FLA_Chol_u_ops_var2 \r
+FLA_Chol_u_opd_var2 \r
+FLA_Chol_u_opc_var2 \r
+FLA_Chol_u_opz_var2 \r
+FLA_Chol_u_opt_var3 \r
+FLA_Chol_u_ops_var3 \r
+FLA_Chol_u_opd_var3 \r
+FLA_Chol_u_opc_var3 \r
+FLA_Chol_u_opz_var3 \r
+FLA_Chol_u_unb_var1 \r
+FLA_Chol_u_unb_var2 \r
+FLA_Chol_u_unb_var3 \r
+FLA_Clock \r
+FLA_Clock_helper \r
+FLA_Cntl_obj_free \r
+FLA_Cntl_axpy_obj_create \r
+FLA_Cntl_axpyt_obj_create \r
+FLA_Cntl_copy_obj_create \r
+FLA_Cntl_copyt_obj_create \r
+FLA_Cntl_swap_obj_create \r
+FLA_Cntl_tpose_obj_create \r
+FLA_Cntl_gemv_obj_create \r
+FLA_Cntl_trsv_obj_create \r
+FLA_Cntl_gemm_obj_create \r
+FLA_Cntl_hemm_obj_create \r
+FLA_Cntl_herk_obj_create \r
+FLA_Cntl_her2k_obj_create \r
+FLA_Cntl_symm_obj_create \r
+FLA_Cntl_syrk_obj_create \r
+FLA_Cntl_syr2k_obj_create \r
+FLA_Cntl_trmm_obj_create \r
+FLA_Cntl_trsm_obj_create \r
+FLA_Cntl_init \r
+FLA_Cntl_finalize \r
+FLA_Cntl_init_flamec \r
+FLA_Cntl_finalize_flamec \r
+FLA_Cntl_init_flash \r
+FLA_Cntl_finalize_flash \r
+FLA_Cntl_chol_obj_create \r
+FLA_Cntl_lu_obj_create \r
+FLA_Cntl_appiv_obj_create \r
+FLA_Cntl_qrut_obj_create \r
+FLA_Cntl_qrutud_obj_create \r
+FLA_Cntl_qrutinc_obj_create \r
+FLA_Cntl_lqut_obj_create \r
+FLA_Cntl_trinv_obj_create \r
+FLA_Cntl_ttmm_obj_create \r
+FLA_Cntl_sylv_obj_create \r
+FLA_Cntl_spdinv_obj_create \r
+FLA_Cntl_apqut_obj_create \r
+FLA_Cntl_apqutud_obj_create \r
+FLA_Cntl_apqutinc_obj_create \r
+FLA_Conjugate \r
+FLA_Conjugate_r \r
+FLA_Copy \r
+FLA_Copyr \r
+FLA_Copyr_external \r
+FLA_Copyt \r
+FLA_Copyt_c \r
+FLA_Copyt_cntl_init \r
+FLA_Copyt_cntl_finalize \r
+FLA_Copyt_c_blk_var1 \r
+FLA_Copyt_c_blk_var2 \r
+FLA_Copyt_c_blk_var3 \r
+FLA_Copyt_c_blk_var4 \r
+FLA_Copyt_external \r
+FLA_Copyt_h \r
+FLA_Copyt_h_blk_var1 \r
+FLA_Copyt_h_blk_var2 \r
+FLA_Copyt_h_blk_var3 \r
+FLA_Copyt_h_blk_var4 \r
+FLA_Copyt_internal \r
+FLA_Copyt_n \r
+FLA_Copyt_n_blk_var1 \r
+FLA_Copyt_n_blk_var2 \r
+FLA_Copyt_n_blk_var3 \r
+FLA_Copyt_n_blk_var4 \r
+FLA_Copyt_t \r
+FLA_Copyt_task \r
+FLA_Copyt_n_task \r
+FLA_Copyt_t_task \r
+FLA_Copyt_c_task \r
+FLA_Copyt_h_task \r
+FLA_Copyt_t_blk_var1 \r
+FLA_Copyt_t_blk_var2 \r
+FLA_Copyt_t_blk_var3 \r
+FLA_Copyt_t_blk_var4 \r
+FLA_Copy_blk_var1 \r
+FLA_Copy_blk_var2 \r
+FLA_Copy_blk_var3 \r
+FLA_Copy_blk_var4 \r
+FLA_Copy_cntl_init \r
+FLA_Copy_cntl_finalize \r
+FLA_Copy_external \r
+FLA_Copy_internal \r
+FLA_Copy_task \r
+FLA_Copy_buffer_to_object \r
+FLA_Copy_object_to_buffer \r
+FLA_Dot \r
+FLA_Dot2cs \r
+FLA_Dot2cs_external \r
+FLA_Dot2s \r
+FLA_Dot2s_external \r
+FLA_Dotc \r
+FLA_Dotcs \r
+FLA_Dotcs_external \r
+FLA_Dotc_external \r
+FLA_Dots \r
+FLA_Dots_external \r
+FLA_Dot_external \r
+FLA_Error_string_for_code \r
+FLA_Error_messages_init \r
+FLA_Print_message \r
+FLA_Abort \r
+FLA_Form_perm_matrix \r
+FLA_Gemm \r
+FLA_Gemm_cntl_init \r
+FLA_Gemm_cntl_finalize \r
+FLA_Gemm_external \r
+FLA_Gemm_hh \r
+FLA_Gemm_hh_blk_var1 \r
+FLA_Gemm_hh_blk_var2 \r
+FLA_Gemm_hh_blk_var3 \r
+FLA_Gemm_hh_blk_var4 \r
+FLA_Gemm_hh_blk_var5 \r
+FLA_Gemm_hh_blk_var6 \r
+FLA_Gemm_hh_unb_var1 \r
+FLA_Gemm_hh_unb_var2 \r
+FLA_Gemm_hh_unb_var3 \r
+FLA_Gemm_hh_unb_var4 \r
+FLA_Gemm_hh_unb_var5 \r
+FLA_Gemm_hh_unb_var6 \r
+FLA_Gemm_hn \r
+FLA_Gemm_hn_blk_var1 \r
+FLA_Gemm_hn_blk_var2 \r
+FLA_Gemm_hn_blk_var3 \r
+FLA_Gemm_hn_blk_var4 \r
+FLA_Gemm_hn_blk_var5 \r
+FLA_Gemm_hn_blk_var6 \r
+FLA_Gemm_hn_unb_var1 \r
+FLA_Gemm_hn_unb_var2 \r
+FLA_Gemm_hn_unb_var3 \r
+FLA_Gemm_hn_unb_var4 \r
+FLA_Gemm_hn_unb_var5 \r
+FLA_Gemm_hn_unb_var6 \r
+FLA_Gemm_ht \r
+FLA_Gemm_ht_blk_var1 \r
+FLA_Gemm_ht_blk_var2 \r
+FLA_Gemm_ht_blk_var3 \r
+FLA_Gemm_ht_blk_var4 \r
+FLA_Gemm_ht_blk_var5 \r
+FLA_Gemm_ht_blk_var6 \r
+FLA_Gemm_ht_unb_var1 \r
+FLA_Gemm_ht_unb_var2 \r
+FLA_Gemm_ht_unb_var3 \r
+FLA_Gemm_ht_unb_var4 \r
+FLA_Gemm_ht_unb_var5 \r
+FLA_Gemm_ht_unb_var6 \r
+FLA_Gemm_internal \r
+FLA_Gemm_nh \r
+FLA_Gemm_nh_blk_var1 \r
+FLA_Gemm_nh_blk_var2 \r
+FLA_Gemm_nh_blk_var3 \r
+FLA_Gemm_nh_blk_var4 \r
+FLA_Gemm_nh_blk_var5 \r
+FLA_Gemm_nh_blk_var6 \r
+FLA_Gemm_nh_unb_var1 \r
+FLA_Gemm_nh_unb_var2 \r
+FLA_Gemm_nh_unb_var3 \r
+FLA_Gemm_nh_unb_var4 \r
+FLA_Gemm_nh_unb_var5 \r
+FLA_Gemm_nh_unb_var6 \r
+FLA_Gemm_nn \r
+FLA_Gemm_nn_blk_var1 \r
+FLA_Gemm_nn_blk_var2 \r
+FLA_Gemm_nn_blk_var3 \r
+FLA_Gemm_nn_blk_var4 \r
+FLA_Gemm_nn_blk_var5 \r
+FLA_Gemm_nn_blk_var6 \r
+FLA_Gemm_nn_unb_var1 \r
+FLA_Gemm_nn_unb_var2 \r
+FLA_Gemm_nn_unb_var3 \r
+FLA_Gemm_nn_unb_var4 \r
+FLA_Gemm_nn_unb_var5 \r
+FLA_Gemm_nn_unb_var6 \r
+FLA_Gemm_nt \r
+FLA_Gemm_nt_blk_var1 \r
+FLA_Gemm_nt_blk_var2 \r
+FLA_Gemm_nt_blk_var3 \r
+FLA_Gemm_nt_blk_var4 \r
+FLA_Gemm_nt_blk_var5 \r
+FLA_Gemm_nt_blk_var6 \r
+FLA_Gemm_nt_unb_var1 \r
+FLA_Gemm_nt_unb_var2 \r
+FLA_Gemm_nt_unb_var3 \r
+FLA_Gemm_nt_unb_var4 \r
+FLA_Gemm_nt_unb_var5 \r
+FLA_Gemm_nt_unb_var6 \r
+FLA_Gemm_task \r
+FLA_Gemm_hh_task \r
+FLA_Gemm_hn_task \r
+FLA_Gemm_ht_task \r
+FLA_Gemm_nh_task \r
+FLA_Gemm_nn_task \r
+FLA_Gemm_nt_task \r
+FLA_Gemm_th_task \r
+FLA_Gemm_tn_task \r
+FLA_Gemm_tt_task \r
+FLA_Gemm_th \r
+FLA_Gemm_th_blk_var1 \r
+FLA_Gemm_th_blk_var2 \r
+FLA_Gemm_th_blk_var3 \r
+FLA_Gemm_th_blk_var4 \r
+FLA_Gemm_th_blk_var5 \r
+FLA_Gemm_th_blk_var6 \r
+FLA_Gemm_th_unb_var1 \r
+FLA_Gemm_th_unb_var2 \r
+FLA_Gemm_th_unb_var3 \r
+FLA_Gemm_th_unb_var4 \r
+FLA_Gemm_th_unb_var5 \r
+FLA_Gemm_th_unb_var6 \r
+FLA_Gemm_tn \r
+FLA_Gemm_tn_blk_var1 \r
+FLA_Gemm_tn_blk_var2 \r
+FLA_Gemm_tn_blk_var3 \r
+FLA_Gemm_tn_blk_var4 \r
+FLA_Gemm_tn_blk_var5 \r
+FLA_Gemm_tn_blk_var6 \r
+FLA_Gemm_tn_unb_var1 \r
+FLA_Gemm_tn_unb_var2 \r
+FLA_Gemm_tn_unb_var3 \r
+FLA_Gemm_tn_unb_var4 \r
+FLA_Gemm_tn_unb_var5 \r
+FLA_Gemm_tn_unb_var6 \r
+FLA_Gemm_tt \r
+FLA_Gemm_tt_blk_var1 \r
+FLA_Gemm_tt_blk_var2 \r
+FLA_Gemm_tt_blk_var3 \r
+FLA_Gemm_tt_blk_var4 \r
+FLA_Gemm_tt_blk_var5 \r
+FLA_Gemm_tt_blk_var6 \r
+FLA_Gemm_tt_unb_var1 \r
+FLA_Gemm_tt_unb_var2 \r
+FLA_Gemm_tt_unb_var3 \r
+FLA_Gemm_tt_unb_var4 \r
+FLA_Gemm_tt_unb_var5 \r
+FLA_Gemm_tt_unb_var6 \r
+FLA_Gemp \r
+FLA_Gemv \r
+FLA_Gemvc \r
+FLA_Gemvc_external \r
+FLA_Gemv_c \r
+FLA_Gemv_cntl_init \r
+FLA_Gemv_cntl_finalize \r
+FLA_Gemv_c_blk_var1 \r
+FLA_Gemv_c_blk_var2 \r
+FLA_Gemv_c_blk_var5 \r
+FLA_Gemv_c_blk_var6 \r
+FLA_Gemv_external \r
+FLA_Gemv_internal \r
+FLA_Gemv_n \r
+FLA_Gemv_n_blk_var1 \r
+FLA_Gemv_n_blk_var2 \r
+FLA_Gemv_n_blk_var5 \r
+FLA_Gemv_n_blk_var6 \r
+FLA_Gemv_t \r
+FLA_Gemv_task \r
+FLA_Gemv_c_task \r
+FLA_Gemv_n_task \r
+FLA_Gemv_t_task \r
+FLA_Gemv_t_blk_var1 \r
+FLA_Gemv_t_blk_var2 \r
+FLA_Gemv_t_blk_var5 \r
+FLA_Gemv_t_blk_var6 \r
+FLA_Gepm \r
+FLA_Gepp \r
+FLA_Ger \r
+FLA_Gerc \r
+FLA_Gerc_external \r
+FLA_Ger_external \r
+FLA_Hemm \r
+FLA_Hemm_cntl_init \r
+FLA_Hemm_cntl_finalize \r
+FLA_Hemm_external \r
+FLA_Hemm_internal \r
+FLA_Hemm_ll \r
+FLA_Hemm_ll_blk_var1 \r
+FLA_Hemm_ll_blk_var10 \r
+FLA_Hemm_ll_blk_var2 \r
+FLA_Hemm_ll_blk_var3 \r
+FLA_Hemm_ll_blk_var4 \r
+FLA_Hemm_ll_blk_var5 \r
+FLA_Hemm_ll_blk_var6 \r
+FLA_Hemm_ll_blk_var7 \r
+FLA_Hemm_ll_blk_var8 \r
+FLA_Hemm_ll_blk_var9 \r
+FLA_Hemm_ll_unb_var1 \r
+FLA_Hemm_ll_unb_var10 \r
+FLA_Hemm_ll_unb_var2 \r
+FLA_Hemm_ll_unb_var3 \r
+FLA_Hemm_ll_unb_var4 \r
+FLA_Hemm_ll_unb_var5 \r
+FLA_Hemm_ll_unb_var6 \r
+FLA_Hemm_ll_unb_var7 \r
+FLA_Hemm_ll_unb_var8 \r
+FLA_Hemm_ll_unb_var9 \r
+FLA_Hemm_lu \r
+FLA_Hemm_lu_blk_var1 \r
+FLA_Hemm_lu_blk_var10 \r
+FLA_Hemm_lu_blk_var2 \r
+FLA_Hemm_lu_blk_var3 \r
+FLA_Hemm_lu_blk_var4 \r
+FLA_Hemm_lu_blk_var5 \r
+FLA_Hemm_lu_blk_var6 \r
+FLA_Hemm_lu_blk_var7 \r
+FLA_Hemm_lu_blk_var8 \r
+FLA_Hemm_lu_blk_var9 \r
+FLA_Hemm_lu_unb_var1 \r
+FLA_Hemm_lu_unb_var10 \r
+FLA_Hemm_lu_unb_var2 \r
+FLA_Hemm_lu_unb_var3 \r
+FLA_Hemm_lu_unb_var4 \r
+FLA_Hemm_lu_unb_var5 \r
+FLA_Hemm_lu_unb_var6 \r
+FLA_Hemm_lu_unb_var7 \r
+FLA_Hemm_lu_unb_var8 \r
+FLA_Hemm_lu_unb_var9 \r
+FLA_Hemm_rl \r
+FLA_Hemm_rl_blk_var1 \r
+FLA_Hemm_rl_blk_var10 \r
+FLA_Hemm_rl_blk_var2 \r
+FLA_Hemm_rl_blk_var3 \r
+FLA_Hemm_rl_blk_var4 \r
+FLA_Hemm_rl_blk_var5 \r
+FLA_Hemm_rl_blk_var6 \r
+FLA_Hemm_rl_blk_var7 \r
+FLA_Hemm_rl_blk_var8 \r
+FLA_Hemm_rl_blk_var9 \r
+FLA_Hemm_rl_unb_var1 \r
+FLA_Hemm_rl_unb_var10 \r
+FLA_Hemm_rl_unb_var2 \r
+FLA_Hemm_rl_unb_var3 \r
+FLA_Hemm_rl_unb_var4 \r
+FLA_Hemm_rl_unb_var5 \r
+FLA_Hemm_rl_unb_var6 \r
+FLA_Hemm_rl_unb_var7 \r
+FLA_Hemm_rl_unb_var8 \r
+FLA_Hemm_rl_unb_var9 \r
+FLA_Hemm_ru \r
+FLA_Hemm_ru_blk_var1 \r
+FLA_Hemm_ru_blk_var10 \r
+FLA_Hemm_ru_blk_var2 \r
+FLA_Hemm_ru_blk_var3 \r
+FLA_Hemm_ru_blk_var4 \r
+FLA_Hemm_ru_blk_var5 \r
+FLA_Hemm_ru_blk_var6 \r
+FLA_Hemm_ru_blk_var7 \r
+FLA_Hemm_ru_blk_var8 \r
+FLA_Hemm_ru_blk_var9 \r
+FLA_Hemm_ru_unb_var1 \r
+FLA_Hemm_ru_unb_var10 \r
+FLA_Hemm_ru_unb_var2 \r
+FLA_Hemm_ru_unb_var3 \r
+FLA_Hemm_ru_unb_var4 \r
+FLA_Hemm_ru_unb_var5 \r
+FLA_Hemm_ru_unb_var6 \r
+FLA_Hemm_ru_unb_var7 \r
+FLA_Hemm_ru_unb_var8 \r
+FLA_Hemm_ru_unb_var9 \r
+FLA_Hemm_task \r
+FLA_Hemm_ll_task \r
+FLA_Hemm_lu_task \r
+FLA_Hemm_rl_task \r
+FLA_Hemm_ru_task \r
+FLA_Hemv \r
+FLA_Hemvc \r
+FLA_Hemvc_external \r
+FLA_Hemv_external \r
+FLA_Her \r
+FLA_Her2 \r
+FLA_Her2c \r
+FLA_Her2c_external \r
+FLA_Her2k \r
+FLA_Her2k_cntl_init \r
+FLA_Her2k_cntl_finalize \r
+FLA_Her2k_external \r
+FLA_Her2k_internal \r
+FLA_Her2k_lh \r
+FLA_Her2k_lh_blk_var1 \r
+FLA_Her2k_lh_blk_var10 \r
+FLA_Her2k_lh_blk_var2 \r
+FLA_Her2k_lh_blk_var3 \r
+FLA_Her2k_lh_blk_var4 \r
+FLA_Her2k_lh_blk_var5 \r
+FLA_Her2k_lh_blk_var6 \r
+FLA_Her2k_lh_blk_var7 \r
+FLA_Her2k_lh_blk_var8 \r
+FLA_Her2k_lh_blk_var9 \r
+FLA_Her2k_lh_unb_var1 \r
+FLA_Her2k_lh_unb_var10 \r
+FLA_Her2k_lh_unb_var2 \r
+FLA_Her2k_lh_unb_var3 \r
+FLA_Her2k_lh_unb_var4 \r
+FLA_Her2k_lh_unb_var5 \r
+FLA_Her2k_lh_unb_var6 \r
+FLA_Her2k_lh_unb_var7 \r
+FLA_Her2k_lh_unb_var8 \r
+FLA_Her2k_lh_unb_var9 \r
+FLA_Her2k_ln \r
+FLA_Her2k_ln_blk_var1 \r
+FLA_Her2k_ln_blk_var10 \r
+FLA_Her2k_ln_blk_var2 \r
+FLA_Her2k_ln_blk_var3 \r
+FLA_Her2k_ln_blk_var4 \r
+FLA_Her2k_ln_blk_var5 \r
+FLA_Her2k_ln_blk_var6 \r
+FLA_Her2k_ln_blk_var7 \r
+FLA_Her2k_ln_blk_var8 \r
+FLA_Her2k_ln_blk_var9 \r
+FLA_Her2k_ln_unb_var1 \r
+FLA_Her2k_ln_unb_var10 \r
+FLA_Her2k_ln_unb_var2 \r
+FLA_Her2k_ln_unb_var3 \r
+FLA_Her2k_ln_unb_var4 \r
+FLA_Her2k_ln_unb_var5 \r
+FLA_Her2k_ln_unb_var6 \r
+FLA_Her2k_ln_unb_var7 \r
+FLA_Her2k_ln_unb_var8 \r
+FLA_Her2k_ln_unb_var9 \r
+FLA_Her2k_task \r
+FLA_Her2k_ln_task \r
+FLA_Her2k_lh_task \r
+FLA_Her2k_un_task \r
+FLA_Her2k_uh_task \r
+FLA_Her2k_uh \r
+FLA_Her2k_uh_blk_var1 \r
+FLA_Her2k_uh_blk_var10 \r
+FLA_Her2k_uh_blk_var2 \r
+FLA_Her2k_uh_blk_var3 \r
+FLA_Her2k_uh_blk_var4 \r
+FLA_Her2k_uh_blk_var5 \r
+FLA_Her2k_uh_blk_var6 \r
+FLA_Her2k_uh_blk_var7 \r
+FLA_Her2k_uh_blk_var8 \r
+FLA_Her2k_uh_blk_var9 \r
+FLA_Her2k_uh_unb_var1 \r
+FLA_Her2k_uh_unb_var10 \r
+FLA_Her2k_uh_unb_var2 \r
+FLA_Her2k_uh_unb_var3 \r
+FLA_Her2k_uh_unb_var4 \r
+FLA_Her2k_uh_unb_var5 \r
+FLA_Her2k_uh_unb_var6 \r
+FLA_Her2k_uh_unb_var7 \r
+FLA_Her2k_uh_unb_var8 \r
+FLA_Her2k_uh_unb_var9 \r
+FLA_Her2k_un \r
+FLA_Her2k_un_blk_var1 \r
+FLA_Her2k_un_blk_var10 \r
+FLA_Her2k_un_blk_var2 \r
+FLA_Her2k_un_blk_var3 \r
+FLA_Her2k_un_blk_var4 \r
+FLA_Her2k_un_blk_var5 \r
+FLA_Her2k_un_blk_var6 \r
+FLA_Her2k_un_blk_var7 \r
+FLA_Her2k_un_blk_var8 \r
+FLA_Her2k_un_blk_var9 \r
+FLA_Her2k_un_unb_var1 \r
+FLA_Her2k_un_unb_var10 \r
+FLA_Her2k_un_unb_var2 \r
+FLA_Her2k_un_unb_var3 \r
+FLA_Her2k_un_unb_var4 \r
+FLA_Her2k_un_unb_var5 \r
+FLA_Her2k_un_unb_var6 \r
+FLA_Her2k_un_unb_var7 \r
+FLA_Her2k_un_unb_var8 \r
+FLA_Her2k_un_unb_var9 \r
+FLA_Her2_external \r
+FLA_Herc \r
+FLA_Herc_external \r
+FLA_Herk \r
+FLA_Herk_cntl_init \r
+FLA_Herk_cntl_finalize \r
+FLA_Herk_external \r
+FLA_Herk_internal \r
+FLA_Herk_lh \r
+FLA_Herk_lh_blk_var1 \r
+FLA_Herk_lh_blk_var2 \r
+FLA_Herk_lh_blk_var3 \r
+FLA_Herk_lh_blk_var4 \r
+FLA_Herk_lh_blk_var5 \r
+FLA_Herk_lh_blk_var6 \r
+FLA_Herk_lh_unb_var1 \r
+FLA_Herk_lh_unb_var2 \r
+FLA_Herk_lh_unb_var3 \r
+FLA_Herk_lh_unb_var4 \r
+FLA_Herk_lh_unb_var5 \r
+FLA_Herk_lh_unb_var6 \r
+FLA_Herk_ln \r
+FLA_Herk_ln_blk_var1 \r
+FLA_Herk_ln_blk_var2 \r
+FLA_Herk_ln_blk_var3 \r
+FLA_Herk_ln_blk_var4 \r
+FLA_Herk_ln_blk_var5 \r
+FLA_Herk_ln_blk_var6 \r
+FLA_Herk_ln_unb_var1 \r
+FLA_Herk_ln_unb_var2 \r
+FLA_Herk_ln_unb_var3 \r
+FLA_Herk_ln_unb_var4 \r
+FLA_Herk_ln_unb_var5 \r
+FLA_Herk_ln_unb_var6 \r
+FLA_Herk_task \r
+FLA_Herk_ln_task \r
+FLA_Herk_lh_task \r
+FLA_Herk_un_task \r
+FLA_Herk_uh_task \r
+FLA_Herk_uh \r
+FLA_Herk_uh_blk_var1 \r
+FLA_Herk_uh_blk_var2 \r
+FLA_Herk_uh_blk_var3 \r
+FLA_Herk_uh_blk_var4 \r
+FLA_Herk_uh_blk_var5 \r
+FLA_Herk_uh_blk_var6 \r
+FLA_Herk_uh_unb_var1 \r
+FLA_Herk_uh_unb_var2 \r
+FLA_Herk_uh_unb_var3 \r
+FLA_Herk_uh_unb_var4 \r
+FLA_Herk_uh_unb_var5 \r
+FLA_Herk_uh_unb_var6 \r
+FLA_Herk_un \r
+FLA_Herk_un_blk_var1 \r
+FLA_Herk_un_blk_var2 \r
+FLA_Herk_un_blk_var3 \r
+FLA_Herk_un_blk_var4 \r
+FLA_Herk_un_blk_var5 \r
+FLA_Herk_un_blk_var6 \r
+FLA_Herk_un_unb_var1 \r
+FLA_Herk_un_unb_var2 \r
+FLA_Herk_un_unb_var3 \r
+FLA_Herk_un_unb_var4 \r
+FLA_Herk_un_unb_var5 \r
+FLA_Herk_un_unb_var6 \r
+FLA_Hermitianize \r
+FLA_Her_external \r
+FLA_Househ2_UT \r
+FLA_Househ2_UT_ops \r
+FLA_Househ2_UT_opd \r
+FLA_Househ2_UT_opc \r
+FLA_Househ2_UT_opz \r
+FLA_Init \r
+FLA_Finalize \r
+FLA_Init_safe \r
+FLA_Finalize_safe \r
+FLA_Initialized \r
+FLA_Init_constants \r
+FLA_Finalize_constants \r
+FLA_Invert \r
+FLA_Inv_scal \r
+FLA_Inv_scalc \r
+FLA_Inv_scalc_external \r
+FLA_Inv_scal_external \r
+FLA_Lock_init \r
+FLA_Lock_acquire \r
+FLA_Lock_release \r
+FLA_Lock_destroy \r
+FLA_LQ_blk_external \r
+FLA_LQ_unb_external \r
+FLA_LQ_UT \r
+FLA_LQ_UT_Accum_T_blk_var1 \r
+FLA_LQ_UT_Accum_T_opt_var1 \r
+FLA_LQ_UT_Accum_T_ops_var1 \r
+FLA_LQ_UT_Accum_T_opd_var1 \r
+FLA_LQ_UT_Accum_T_opc_var1 \r
+FLA_LQ_UT_Accum_T_opz_var1 \r
+FLA_LQ_UT_Accum_T_unb_var1 \r
+FLA_LQ_UT_blk_var2 \r
+FLA_LQ_UT_cntl_init \r
+FLA_LQ_UT_cntl_finalize \r
+FLA_LQ_UT_create_T \r
+FLA_LQ_UT_internal \r
+FLA_LQ_UT_opt_var2 \r
+FLA_LQ_UT_ops_var2 \r
+FLA_LQ_UT_opd_var2 \r
+FLA_LQ_UT_opc_var2 \r
+FLA_LQ_UT_opz_var2 \r
+FLA_LQ_UT_recover_tau \r
+FLA_LQ_UT_recover_tau_submatrix \r
+FLA_LQ_UT_solve \r
+FLA_LQ_UT_task \r
+FLA_LQ_UT_unb_var2 \r
+FLA_LU_find_zero_on_diagonal \r
+FLA_LU_nopiv \r
+FLA_LU_nopiv_blk_var1 \r
+FLA_LU_nopiv_blk_var2 \r
+FLA_LU_nopiv_blk_var3 \r
+FLA_LU_nopiv_blk_var4 \r
+FLA_LU_nopiv_blk_var5 \r
+FLA_LU_nopiv_cntl_init \r
+FLA_LU_nopiv_cntl_finalize \r
+FLA_LU_nopiv_internal \r
+FLA_LU_nopiv_opt_var1 \r
+FLA_LU_nopiv_ops_var1 \r
+FLA_LU_nopiv_opd_var1 \r
+FLA_LU_nopiv_opc_var1 \r
+FLA_LU_nopiv_opz_var1 \r
+FLA_LU_nopiv_opt_var2 \r
+FLA_LU_nopiv_ops_var2 \r
+FLA_LU_nopiv_opd_var2 \r
+FLA_LU_nopiv_opc_var2 \r
+FLA_LU_nopiv_opz_var2 \r
+FLA_LU_nopiv_opt_var3 \r
+FLA_LU_nopiv_ops_var3 \r
+FLA_LU_nopiv_opd_var3 \r
+FLA_LU_nopiv_opc_var3 \r
+FLA_LU_nopiv_opz_var3 \r
+FLA_LU_nopiv_opt_var4 \r
+FLA_LU_nopiv_ops_var4 \r
+FLA_LU_nopiv_opd_var4 \r
+FLA_LU_nopiv_opc_var4 \r
+FLA_LU_nopiv_opz_var4 \r
+FLA_LU_nopiv_opt_var5 \r
+FLA_LU_nopiv_ops_var5 \r
+FLA_LU_nopiv_opd_var5 \r
+FLA_LU_nopiv_opc_var5 \r
+FLA_LU_nopiv_opz_var5 \r
+FLA_LU_nopiv_solve \r
+FLA_LU_nopiv_task \r
+FLA_LU_nopiv_unb_var1 \r
+FLA_LU_nopiv_unb_var2 \r
+FLA_LU_nopiv_unb_var3 \r
+FLA_LU_nopiv_unb_var4 \r
+FLA_LU_nopiv_unb_var5 \r
+FLA_LU_piv \r
+FLA_LU_piv_blk_external \r
+FLA_LU_piv_blk_var3 \r
+FLA_LU_piv_blk_var4 \r
+FLA_LU_piv_blk_var5 \r
+FLA_LU_piv_cntl_init \r
+FLA_LU_piv_cntl_finalize \r
+FLA_LU_piv_copy_task \r
+FLA_LU_piv_internal \r
+FLA_LU_piv_macro_task \r
+FLA_LU_piv_opt_var3 \r
+FLA_LU_piv_ops_var3 \r
+FLA_LU_piv_opd_var3 \r
+FLA_LU_piv_opc_var3 \r
+FLA_LU_piv_opz_var3 \r
+FLA_LU_piv_opt_var4 \r
+FLA_LU_piv_ops_var4 \r
+FLA_LU_piv_opd_var4 \r
+FLA_LU_piv_opc_var4 \r
+FLA_LU_piv_opz_var4 \r
+FLA_LU_piv_opt_var5 \r
+FLA_LU_piv_ops_var5 \r
+FLA_LU_piv_opd_var5 \r
+FLA_LU_piv_opc_var5 \r
+FLA_LU_piv_opz_var5 \r
+FLA_LU_piv_solve \r
+FLA_LU_piv_task \r
+FLA_LU_piv_unb_external \r
+FLA_LU_piv_unb_ext \r
+FLA_LU_piv_unb_var3 \r
+FLA_LU_piv_unb_var3b \r
+FLA_LU_piv_unb_var4 \r
+FLA_LU_piv_unb_var5 \r
+FLA_Max_abs_value \r
+FLA_Max_elemwise_diff \r
+FLA_Memory_leak_counter_init \r
+FLA_Memory_leak_counter_finalize \r
+FLA_Memory_leak_counter_status \r
+FLA_Memory_leak_counter_set \r
+FLA_malloc \r
+FLA_realloc \r
+FLA_free \r
+FLA_Set \r
+FLA_Obj_extract_real_scalar \r
+FLA_Set_diag \r
+FLA_Set_to_identity \r
+FLA_Add_to_diag \r
+FLA_Shift_diag \r
+FLA_Scale_diag\r
+FLA_Obj_fshow \r
+FLA_Obj_show \r
+FLA_Mult_add \r
+FLA_Negate \r
+FLA_Norm1 \r
+FLA_Norm_inf \r
+FLA_Nrm2 \r
+FLA_Nrm2_external \r
+FLA_Obj_create \r
+FLA_Obj_create_ext \r
+FLA_align_ldim \r
+FLA_Obj_create_conf_to \r
+FLA_Obj_create_copy_of \r
+FLA_Obj_create_without_buffer \r
+FLA_Obj_create_constant \r
+FLA_Obj_create_complex_constant\r
+FLA_Obj_attach_buffer \r
+FLA_Obj_free \r
+FLA_Obj_free_without_buffer \r
+FLA_Param_map_flame_to_netlib_trans \r
+FLA_Param_map_flame_to_netlib_uplo \r
+FLA_Param_map_flame_to_netlib_side \r
+FLA_Param_map_flame_to_netlib_diag \r
+FLA_Param_map_flame_to_netlib_direct \r
+FLA_Param_map_flame_to_netlib_storev \r
+FLA_Param_map_flame_to_blis_trans \r
+FLA_Param_map_flame_to_blis_conj \r
+FLA_Param_map_flame_to_blis_uplo \r
+FLA_Param_map_flame_to_blis_side \r
+FLA_Param_map_flame_to_blis_diag \r
+FLA_Param_map_blis_to_netlib_trans \r
+FLA_Param_map_blis_to_netlib_uplo \r
+FLA_Param_map_blis_to_netlib_side \r
+FLA_Param_map_blis_to_netlib_diag \r
+FLA_Param_map_netlib_to_flame_trans \r
+FLA_Param_map_netlib_to_flame_uplo \r
+FLA_Param_map_netlib_to_flame_side \r
+FLA_Param_map_netlib_to_flame_diag \r
+FLA_Param_map_blislapack_to_flame_trans \r
+FLA_Param_map_blislapack_to_flame_uplo \r
+FLA_Param_map_blislapack_to_flame_side \r
+FLA_Param_map_blislapack_to_flame_diag \r
+FLA_QR_blk_external \r
+FLA_QR_unb_external \r
+FLA_QR_UT \r
+FLA_QR_UT_Accum_T_blk_var1 \r
+FLA_QR_UT_Accum_T_opt_var1 \r
+FLA_QR_UT_Accum_T_ops_var1 \r
+FLA_QR_UT_Accum_T_opd_var1 \r
+FLA_QR_UT_Accum_T_opc_var1 \r
+FLA_QR_UT_Accum_T_opz_var1 \r
+FLA_QR_UT_Accum_T_unb_var1 \r
+FLA_QR_UT_blk_var2 \r
+FLA_QR_UT_cntl_init \r
+FLA_QR_UT_cntl_finalize \r
+FLA_QR_UT_copy_internal \r
+FLA_QR_UT_copy_task \r
+FLA_QR_UT_create_T \r
+FLA_QR_UT_inc_blk_var1 \r
+FLA_QR_UT_inc_blk_var2 \r
+FLA_QR_UT_internal \r
+FLA_QR_UT_opt_var2 \r
+FLA_QR_UT_ops_var2 \r
+FLA_QR_UT_opd_var2 \r
+FLA_QR_UT_opc_var2 \r
+FLA_QR_UT_opz_var2 \r
+FLA_QR_UT_recover_tau \r
+FLA_QR_UT_recover_tau_submatrix \r
+FLA_QR_UT_solve \r
+FLA_QR_UT_task \r
+FLA_QR2_UT_Accum_T_opt_var1 \r
+FLA_QR2_UT_Accum_T_ops_var1 \r
+FLA_QR2_UT_Accum_T_opd_var1 \r
+FLA_QR2_UT_Accum_T_opc_var1 \r
+FLA_QR2_UT_Accum_T_opz_var1 \r
+FLA_QR2_UT_Accum_T_unb_var1 \r
+FLA_QR2_UT_blk_var1 \r
+FLA_QR2_UT_blk_var2 \r
+FLA_QR2_UT_cntl_init \r
+FLA_QR2_UT_cntl_finalize \r
+FLA_QR2_UT_internal \r
+FLA_QR2_UT_task \r
+FLA_QR2_UT_unb_var2 \r
+FLA_Obj_datatype \r
+FLA_Obj_datatype_proj_to_real \r
+FLA_Obj_elemtype \r
+FLA_Obj_datatype_size \r
+FLA_Obj_elem_size \r
+FLA_Obj_length \r
+FLA_Obj_width \r
+FLA_Obj_vector_dim \r
+FLA_Obj_vector_inc \r
+FLA_Obj_min_dim \r
+FLA_Obj_max_dim \r
+FLA_Obj_row_stride \r
+FLA_Obj_col_stride \r
+FLA_Obj_buffer \r
+FLA_Obj_is_int \r
+FLA_Obj_is_floating_point \r
+FLA_Obj_is_constant \r
+FLA_Obj_is_real \r
+FLA_Obj_is_complex \r
+FLA_Obj_is_single_precision \r
+FLA_Obj_is_double_precision \r
+FLA_Obj_is_scalar \r
+FLA_Obj_is_vector \r
+FLA_Obj_has_zero_dim \r
+FLA_Obj_is_col_major \r
+FLA_Obj_is_row_major \r
+FLA_Obj_is_conformal_to \r
+FLA_Obj_is \r
+FLA_Obj_equals \r
+FLA_Random_herm_matrix \r
+FLA_Random_matrix \r
+FLA_random_float \r
+FLA_random_double \r
+FLA_random_scomplex \r
+FLA_random_dcomplex \r
+FLA_Random_spd_matrix \r
+FLA_Random_tri_matrix \r
+FLA_SA_Apply_pivots \r
+FLA_SA_FS_blk \r
+FLA_SA_FS_task \r
+FLA_SA_LU_blk \r
+FLA_SA_LU_task \r
+FLA_SA_LU_unb \r
+FLA_Scal \r
+FLA_Scalc \r
+FLA_Scalc_external \r
+FLA_Scalr \r
+FLA_Scalr_external \r
+FLA_Scal_external \r
+FLA_Shift_pivots_to \r
+FLA_SPDinv \r
+FLA_SPDinv_blk_external \r
+FLA_SPDinv_cntl_init \r
+FLA_SPDinv_cntl_finalize \r
+FLA_SPDinv_internal \r
+FLA_Sqrt \r
+FLA_Swap \r
+FLA_Swapt \r
+FLA_Swapt_external \r
+FLA_Swap_external \r
+FLA_Swap_t_blk_var1 \r
+FLA_Swap_t_blk_var2 \r
+FLA_Sylv \r
+FLA_Sylv_blk_external \r
+FLA_Sylv_cntl_init \r
+FLA_Sylv_cntl_finalize \r
+FLA_Sylv_hh \r
+FLA_Sylv_hh_blk_var1 \r
+FLA_Sylv_hh_blk_var10 \r
+FLA_Sylv_hh_blk_var11 \r
+FLA_Sylv_hh_blk_var12 \r
+FLA_Sylv_hh_blk_var13 \r
+FLA_Sylv_hh_blk_var14 \r
+FLA_Sylv_hh_blk_var15 \r
+FLA_Sylv_hh_blk_var16 \r
+FLA_Sylv_hh_blk_var17 \r
+FLA_Sylv_hh_blk_var18 \r
+FLA_Sylv_hh_blk_var2 \r
+FLA_Sylv_hh_blk_var3 \r
+FLA_Sylv_hh_blk_var4 \r
+FLA_Sylv_hh_blk_var5 \r
+FLA_Sylv_hh_blk_var6 \r
+FLA_Sylv_hh_blk_var7 \r
+FLA_Sylv_hh_blk_var8 \r
+FLA_Sylv_hh_blk_var9 \r
+FLA_Sylv_hh_opt_var1 \r
+FLA_Sylv_hh_ops_var1 \r
+FLA_Sylv_hh_opd_var1 \r
+FLA_Sylv_hh_opc_var1 \r
+FLA_Sylv_hh_opz_var1 \r
+FLA_Sylv_hh_opt_var10 \r
+FLA_Sylv_hh_opt_var11 \r
+FLA_Sylv_hh_opt_var12 \r
+FLA_Sylv_hh_opt_var13 \r
+FLA_Sylv_hh_opt_var14 \r
+FLA_Sylv_hh_opt_var15 \r
+FLA_Sylv_hh_opt_var16 \r
+FLA_Sylv_hh_opt_var17 \r
+FLA_Sylv_hh_opt_var18 \r
+FLA_Sylv_hh_opt_var2 \r
+FLA_Sylv_hh_opt_var3 \r
+FLA_Sylv_hh_opt_var4 \r
+FLA_Sylv_hh_opt_var5 \r
+FLA_Sylv_hh_opt_var6 \r
+FLA_Sylv_hh_opt_var7 \r
+FLA_Sylv_hh_opt_var8 \r
+FLA_Sylv_hh_opt_var9 \r
+FLA_Sylv_hn \r
+FLA_Sylv_hn_blk_var1 \r
+FLA_Sylv_hn_blk_var10 \r
+FLA_Sylv_hn_blk_var11 \r
+FLA_Sylv_hn_blk_var12 \r
+FLA_Sylv_hn_blk_var13 \r
+FLA_Sylv_hn_blk_var14 \r
+FLA_Sylv_hn_blk_var15 \r
+FLA_Sylv_hn_blk_var16 \r
+FLA_Sylv_hn_blk_var17 \r
+FLA_Sylv_hn_blk_var18 \r
+FLA_Sylv_hn_blk_var2 \r
+FLA_Sylv_hn_blk_var3 \r
+FLA_Sylv_hn_blk_var4 \r
+FLA_Sylv_hn_blk_var5 \r
+FLA_Sylv_hn_blk_var6 \r
+FLA_Sylv_hn_blk_var7 \r
+FLA_Sylv_hn_blk_var8 \r
+FLA_Sylv_hn_blk_var9 \r
+FLA_Sylv_hn_opt_var1 \r
+FLA_Sylv_hn_ops_var1 \r
+FLA_Sylv_hn_opd_var1 \r
+FLA_Sylv_hn_opc_var1 \r
+FLA_Sylv_hn_opz_var1 \r
+FLA_Sylv_hn_opt_var10 \r
+FLA_Sylv_hn_opt_var11 \r
+FLA_Sylv_hn_opt_var12 \r
+FLA_Sylv_hn_opt_var13 \r
+FLA_Sylv_hn_opt_var14 \r
+FLA_Sylv_hn_opt_var15 \r
+FLA_Sylv_hn_opt_var16 \r
+FLA_Sylv_hn_opt_var17 \r
+FLA_Sylv_hn_opt_var18 \r
+FLA_Sylv_hn_opt_var2 \r
+FLA_Sylv_hn_opt_var3 \r
+FLA_Sylv_hn_opt_var4 \r
+FLA_Sylv_hn_opt_var5 \r
+FLA_Sylv_hn_opt_var6 \r
+FLA_Sylv_hn_opt_var7 \r
+FLA_Sylv_hn_opt_var8 \r
+FLA_Sylv_hn_opt_var9 \r
+FLA_Sylv_internal \r
+FLA_Sylv_nh \r
+FLA_Sylv_nh_blk_var1 \r
+FLA_Sylv_nh_blk_var10 \r
+FLA_Sylv_nh_blk_var11 \r
+FLA_Sylv_nh_blk_var12 \r
+FLA_Sylv_nh_blk_var13 \r
+FLA_Sylv_nh_blk_var14 \r
+FLA_Sylv_nh_blk_var15 \r
+FLA_Sylv_nh_blk_var16 \r
+FLA_Sylv_nh_blk_var17 \r
+FLA_Sylv_nh_blk_var18 \r
+FLA_Sylv_nh_blk_var2 \r
+FLA_Sylv_nh_blk_var3 \r
+FLA_Sylv_nh_blk_var4 \r
+FLA_Sylv_nh_blk_var5 \r
+FLA_Sylv_nh_blk_var6 \r
+FLA_Sylv_nh_blk_var7 \r
+FLA_Sylv_nh_blk_var8 \r
+FLA_Sylv_nh_blk_var9 \r
+FLA_Sylv_nh_opt_var1 \r
+FLA_Sylv_nh_ops_var1 \r
+FLA_Sylv_nh_opd_var1 \r
+FLA_Sylv_nh_opc_var1 \r
+FLA_Sylv_nh_opz_var1 \r
+FLA_Sylv_nh_opt_var10 \r
+FLA_Sylv_nh_opt_var11 \r
+FLA_Sylv_nh_opt_var12 \r
+FLA_Sylv_nh_opt_var13 \r
+FLA_Sylv_nh_opt_var14 \r
+FLA_Sylv_nh_opt_var15 \r
+FLA_Sylv_nh_opt_var16 \r
+FLA_Sylv_nh_opt_var17 \r
+FLA_Sylv_nh_opt_var18 \r
+FLA_Sylv_nh_opt_var2 \r
+FLA_Sylv_nh_opt_var3 \r
+FLA_Sylv_nh_opt_var4 \r
+FLA_Sylv_nh_opt_var5 \r
+FLA_Sylv_nh_opt_var6 \r
+FLA_Sylv_nh_opt_var7 \r
+FLA_Sylv_nh_opt_var8 \r
+FLA_Sylv_nh_opt_var9 \r
+FLA_Sylv_nn \r
+FLA_Sylv_nn_blk_var1 \r
+FLA_Sylv_nn_blk_var10 \r
+FLA_Sylv_nn_blk_var11 \r
+FLA_Sylv_nn_blk_var12 \r
+FLA_Sylv_nn_blk_var13 \r
+FLA_Sylv_nn_blk_var14 \r
+FLA_Sylv_nn_blk_var15 \r
+FLA_Sylv_nn_blk_var16 \r
+FLA_Sylv_nn_blk_var17 \r
+FLA_Sylv_nn_blk_var18 \r
+FLA_Sylv_nn_blk_var2 \r
+FLA_Sylv_nn_blk_var3 \r
+FLA_Sylv_nn_blk_var4 \r
+FLA_Sylv_nn_blk_var5 \r
+FLA_Sylv_nn_blk_var6 \r
+FLA_Sylv_nn_blk_var7 \r
+FLA_Sylv_nn_blk_var8 \r
+FLA_Sylv_nn_blk_var9 \r
+FLA_Sylv_nn_opt_var1 \r
+FLA_Sylv_nn_ops_var1 \r
+FLA_Sylv_nn_opd_var1 \r
+FLA_Sylv_nn_opc_var1 \r
+FLA_Sylv_nn_opz_var1 \r
+FLA_Sylv_nn_opt_var10 \r
+FLA_Sylv_nn_opt_var11 \r
+FLA_Sylv_nn_opt_var12 \r
+FLA_Sylv_nn_opt_var13 \r
+FLA_Sylv_nn_opt_var14 \r
+FLA_Sylv_nn_opt_var15 \r
+FLA_Sylv_nn_opt_var16 \r
+FLA_Sylv_nn_opt_var17 \r
+FLA_Sylv_nn_opt_var18 \r
+FLA_Sylv_nn_opt_var2 \r
+FLA_Sylv_nn_opt_var3 \r
+FLA_Sylv_nn_opt_var4 \r
+FLA_Sylv_nn_opt_var5 \r
+FLA_Sylv_nn_opt_var6 \r
+FLA_Sylv_nn_opt_var7 \r
+FLA_Sylv_nn_opt_var8 \r
+FLA_Sylv_nn_opt_var9 \r
+FLA_Sylv_task \r
+FLA_Sylv_nn_task \r
+FLA_Sylv_nh_task \r
+FLA_Sylv_hn_task \r
+FLA_Sylv_hh_task \r
+FLA_Sylv_unb_external \r
+FLA_Sylv_nn_unb_ext \r
+FLA_Sylv_nh_unb_ext \r
+FLA_Sylv_hn_unb_ext \r
+FLA_Sylv_hh_unb_ext \r
+FLA_Symm \r
+FLA_Symmetrize \r
+FLA_Symm_cntl_init \r
+FLA_Symm_cntl_finalize \r
+FLA_Symm_external \r
+FLA_Symm_internal \r
+FLA_Symm_ll \r
+FLA_Symm_ll_blk_var1 \r
+FLA_Symm_ll_blk_var10 \r
+FLA_Symm_ll_blk_var2 \r
+FLA_Symm_ll_blk_var3 \r
+FLA_Symm_ll_blk_var4 \r
+FLA_Symm_ll_blk_var5 \r
+FLA_Symm_ll_blk_var6 \r
+FLA_Symm_ll_blk_var7 \r
+FLA_Symm_ll_blk_var8 \r
+FLA_Symm_ll_blk_var9 \r
+FLA_Symm_ll_unb_var1 \r
+FLA_Symm_ll_unb_var10 \r
+FLA_Symm_ll_unb_var2 \r
+FLA_Symm_ll_unb_var3 \r
+FLA_Symm_ll_unb_var4 \r
+FLA_Symm_ll_unb_var5 \r
+FLA_Symm_ll_unb_var6 \r
+FLA_Symm_ll_unb_var7 \r
+FLA_Symm_ll_unb_var8 \r
+FLA_Symm_ll_unb_var9 \r
+FLA_Symm_lu \r
+FLA_Symm_lu_blk_var1 \r
+FLA_Symm_lu_blk_var10 \r
+FLA_Symm_lu_blk_var2 \r
+FLA_Symm_lu_blk_var3 \r
+FLA_Symm_lu_blk_var4 \r
+FLA_Symm_lu_blk_var5 \r
+FLA_Symm_lu_blk_var6 \r
+FLA_Symm_lu_blk_var7 \r
+FLA_Symm_lu_blk_var8 \r
+FLA_Symm_lu_blk_var9 \r
+FLA_Symm_lu_unb_var1 \r
+FLA_Symm_lu_unb_var10 \r
+FLA_Symm_lu_unb_var2 \r
+FLA_Symm_lu_unb_var3 \r
+FLA_Symm_lu_unb_var4 \r
+FLA_Symm_lu_unb_var5 \r
+FLA_Symm_lu_unb_var6 \r
+FLA_Symm_lu_unb_var7 \r
+FLA_Symm_lu_unb_var8 \r
+FLA_Symm_lu_unb_var9 \r
+FLA_Symm_rl \r
+FLA_Symm_rl_blk_var1 \r
+FLA_Symm_rl_blk_var10 \r
+FLA_Symm_rl_blk_var2 \r
+FLA_Symm_rl_blk_var3 \r
+FLA_Symm_rl_blk_var4 \r
+FLA_Symm_rl_blk_var5 \r
+FLA_Symm_rl_blk_var6 \r
+FLA_Symm_rl_blk_var7 \r
+FLA_Symm_rl_blk_var8 \r
+FLA_Symm_rl_blk_var9 \r
+FLA_Symm_rl_unb_var1 \r
+FLA_Symm_rl_unb_var10 \r
+FLA_Symm_rl_unb_var2 \r
+FLA_Symm_rl_unb_var3 \r
+FLA_Symm_rl_unb_var4 \r
+FLA_Symm_rl_unb_var5 \r
+FLA_Symm_rl_unb_var6 \r
+FLA_Symm_rl_unb_var7 \r
+FLA_Symm_rl_unb_var8 \r
+FLA_Symm_rl_unb_var9 \r
+FLA_Symm_ru \r
+FLA_Symm_ru_blk_var1 \r
+FLA_Symm_ru_blk_var10 \r
+FLA_Symm_ru_blk_var2 \r
+FLA_Symm_ru_blk_var3 \r
+FLA_Symm_ru_blk_var4 \r
+FLA_Symm_ru_blk_var5 \r
+FLA_Symm_ru_blk_var6 \r
+FLA_Symm_ru_blk_var7 \r
+FLA_Symm_ru_blk_var8 \r
+FLA_Symm_ru_blk_var9 \r
+FLA_Symm_ru_unb_var1 \r
+FLA_Symm_ru_unb_var10 \r
+FLA_Symm_ru_unb_var2 \r
+FLA_Symm_ru_unb_var3 \r
+FLA_Symm_ru_unb_var4 \r
+FLA_Symm_ru_unb_var5 \r
+FLA_Symm_ru_unb_var6 \r
+FLA_Symm_ru_unb_var7 \r
+FLA_Symm_ru_unb_var8 \r
+FLA_Symm_ru_unb_var9 \r
+FLA_Symm_task \r
+FLA_Symm_ll_task \r
+FLA_Symm_lu_task \r
+FLA_Symm_rl_task \r
+FLA_Symm_ru_task \r
+FLA_Symv \r
+FLA_Symv_external \r
+FLA_Syr \r
+FLA_Syr2 \r
+FLA_Syr2k \r
+FLA_Syr2k_cntl_init \r
+FLA_Syr2k_cntl_finalize \r
+FLA_Syr2k_external \r
+FLA_Syr2k_internal \r
+FLA_Syr2k_ln \r
+FLA_Syr2k_ln_blk_var1 \r
+FLA_Syr2k_ln_blk_var10 \r
+FLA_Syr2k_ln_blk_var2 \r
+FLA_Syr2k_ln_blk_var3 \r
+FLA_Syr2k_ln_blk_var4 \r
+FLA_Syr2k_ln_blk_var5 \r
+FLA_Syr2k_ln_blk_var6 \r
+FLA_Syr2k_ln_blk_var7 \r
+FLA_Syr2k_ln_blk_var8 \r
+FLA_Syr2k_ln_blk_var9 \r
+FLA_Syr2k_ln_unb_var1 \r
+FLA_Syr2k_ln_unb_var10 \r
+FLA_Syr2k_ln_unb_var2 \r
+FLA_Syr2k_ln_unb_var3 \r
+FLA_Syr2k_ln_unb_var4 \r
+FLA_Syr2k_ln_unb_var5 \r
+FLA_Syr2k_ln_unb_var6 \r
+FLA_Syr2k_ln_unb_var7 \r
+FLA_Syr2k_ln_unb_var8 \r
+FLA_Syr2k_ln_unb_var9 \r
+FLA_Syr2k_lt \r
+FLA_Syr2k_lt_blk_var1 \r
+FLA_Syr2k_lt_blk_var10 \r
+FLA_Syr2k_lt_blk_var2 \r
+FLA_Syr2k_lt_blk_var3 \r
+FLA_Syr2k_lt_blk_var4 \r
+FLA_Syr2k_lt_blk_var5 \r
+FLA_Syr2k_lt_blk_var6 \r
+FLA_Syr2k_lt_blk_var7 \r
+FLA_Syr2k_lt_blk_var8 \r
+FLA_Syr2k_lt_blk_var9 \r
+FLA_Syr2k_lt_unb_var1 \r
+FLA_Syr2k_lt_unb_var10 \r
+FLA_Syr2k_lt_unb_var2 \r
+FLA_Syr2k_lt_unb_var3 \r
+FLA_Syr2k_lt_unb_var4 \r
+FLA_Syr2k_lt_unb_var5 \r
+FLA_Syr2k_lt_unb_var6 \r
+FLA_Syr2k_lt_unb_var7 \r
+FLA_Syr2k_lt_unb_var8 \r
+FLA_Syr2k_lt_unb_var9 \r
+FLA_Syr2k_task \r
+FLA_Syr2k_ln_task \r
+FLA_Syr2k_lt_task \r
+FLA_Syr2k_un_task \r
+FLA_Syr2k_ut_task \r
+FLA_Syr2k_un \r
+FLA_Syr2k_un_blk_var1 \r
+FLA_Syr2k_un_blk_var10 \r
+FLA_Syr2k_un_blk_var2 \r
+FLA_Syr2k_un_blk_var3 \r
+FLA_Syr2k_un_blk_var4 \r
+FLA_Syr2k_un_blk_var5 \r
+FLA_Syr2k_un_blk_var6 \r
+FLA_Syr2k_un_blk_var7 \r
+FLA_Syr2k_un_blk_var8 \r
+FLA_Syr2k_un_blk_var9 \r
+FLA_Syr2k_un_unb_var1 \r
+FLA_Syr2k_un_unb_var10 \r
+FLA_Syr2k_un_unb_var2 \r
+FLA_Syr2k_un_unb_var3 \r
+FLA_Syr2k_un_unb_var4 \r
+FLA_Syr2k_un_unb_var5 \r
+FLA_Syr2k_un_unb_var6 \r
+FLA_Syr2k_un_unb_var7 \r
+FLA_Syr2k_un_unb_var8 \r
+FLA_Syr2k_un_unb_var9 \r
+FLA_Syr2k_ut \r
+FLA_Syr2k_ut_blk_var1 \r
+FLA_Syr2k_ut_blk_var10 \r
+FLA_Syr2k_ut_blk_var2 \r
+FLA_Syr2k_ut_blk_var3 \r
+FLA_Syr2k_ut_blk_var4 \r
+FLA_Syr2k_ut_blk_var5 \r
+FLA_Syr2k_ut_blk_var6 \r
+FLA_Syr2k_ut_blk_var7 \r
+FLA_Syr2k_ut_blk_var8 \r
+FLA_Syr2k_ut_blk_var9 \r
+FLA_Syr2k_ut_unb_var1 \r
+FLA_Syr2k_ut_unb_var10 \r
+FLA_Syr2k_ut_unb_var2 \r
+FLA_Syr2k_ut_unb_var3 \r
+FLA_Syr2k_ut_unb_var4 \r
+FLA_Syr2k_ut_unb_var5 \r
+FLA_Syr2k_ut_unb_var6 \r
+FLA_Syr2k_ut_unb_var7 \r
+FLA_Syr2k_ut_unb_var8 \r
+FLA_Syr2k_ut_unb_var9 \r
+FLA_Syr2_external \r
+FLA_Syrk \r
+FLA_Syrk_cntl_init \r
+FLA_Syrk_cntl_finalize \r
+FLA_Syrk_external \r
+FLA_Syrk_internal \r
+FLA_Syrk_ln \r
+FLA_Syrk_ln_blk_var1 \r
+FLA_Syrk_ln_blk_var2 \r
+FLA_Syrk_ln_blk_var3 \r
+FLA_Syrk_ln_blk_var4 \r
+FLA_Syrk_ln_blk_var5 \r
+FLA_Syrk_ln_blk_var6 \r
+FLA_Syrk_ln_unb_var1 \r
+FLA_Syrk_ln_unb_var2 \r
+FLA_Syrk_ln_unb_var3 \r
+FLA_Syrk_ln_unb_var4 \r
+FLA_Syrk_ln_unb_var5 \r
+FLA_Syrk_ln_unb_var6 \r
+FLA_Syrk_lt \r
+FLA_Syrk_lt_blk_var1 \r
+FLA_Syrk_lt_blk_var2 \r
+FLA_Syrk_lt_blk_var3 \r
+FLA_Syrk_lt_blk_var4 \r
+FLA_Syrk_lt_blk_var5 \r
+FLA_Syrk_lt_blk_var6 \r
+FLA_Syrk_lt_unb_var1 \r
+FLA_Syrk_lt_unb_var2 \r
+FLA_Syrk_lt_unb_var3 \r
+FLA_Syrk_lt_unb_var4 \r
+FLA_Syrk_lt_unb_var5 \r
+FLA_Syrk_lt_unb_var6 \r
+FLA_Syrk_task \r
+FLA_Syrk_ln_task \r
+FLA_Syrk_lt_task \r
+FLA_Syrk_un_task \r
+FLA_Syrk_ut_task \r
+FLA_Syrk_un \r
+FLA_Syrk_un_blk_var1 \r
+FLA_Syrk_un_blk_var2 \r
+FLA_Syrk_un_blk_var3 \r
+FLA_Syrk_un_blk_var4 \r
+FLA_Syrk_un_blk_var5 \r
+FLA_Syrk_un_blk_var6 \r
+FLA_Syrk_un_unb_var1 \r
+FLA_Syrk_un_unb_var2 \r
+FLA_Syrk_un_unb_var3 \r
+FLA_Syrk_un_unb_var4 \r
+FLA_Syrk_un_unb_var5 \r
+FLA_Syrk_un_unb_var6 \r
+FLA_Syrk_ut \r
+FLA_Syrk_ut_blk_var1 \r
+FLA_Syrk_ut_blk_var2 \r
+FLA_Syrk_ut_blk_var3 \r
+FLA_Syrk_ut_blk_var4 \r
+FLA_Syrk_ut_blk_var5 \r
+FLA_Syrk_ut_blk_var6 \r
+FLA_Syrk_ut_unb_var1 \r
+FLA_Syrk_ut_unb_var2 \r
+FLA_Syrk_ut_unb_var3 \r
+FLA_Syrk_ut_unb_var4 \r
+FLA_Syrk_ut_unb_var5 \r
+FLA_Syrk_ut_unb_var6 \r
+FLA_Syr_external \r
+FLA_Transpose \r
+FLA_Transpose_blk_var1 \r
+FLA_Transpose_blk_var2 \r
+FLA_Transpose_cntl_init \r
+FLA_Transpose_cntl_finalize \r
+FLA_Transpose_unb_var1 \r
+FLA_Transpose_unb_var2 \r
+FLA_Triangularize \r
+FLA_Trinv \r
+FLA_Trinv_blk_external \r
+FLA_Trinv_cntl_init \r
+FLA_Trinv_cntl_finalize \r
+FLA_Trinv_internal \r
+FLA_Trinv_ln \r
+FLA_Trinv_ln_blk_var1 \r
+FLA_Trinv_ln_blk_var2 \r
+FLA_Trinv_ln_blk_var3 \r
+FLA_Trinv_ln_blk_var4 \r
+FLA_Trinv_ln_opt_var1 \r
+FLA_Trinv_ln_ops_var1 \r
+FLA_Trinv_ln_opd_var1 \r
+FLA_Trinv_ln_opc_var1 \r
+FLA_Trinv_ln_opz_var1 \r
+FLA_Trinv_ln_opt_var2 \r
+FLA_Trinv_ln_ops_var2 \r
+FLA_Trinv_ln_opd_var2 \r
+FLA_Trinv_ln_opc_var2 \r
+FLA_Trinv_ln_opz_var2 \r
+FLA_Trinv_ln_opt_var3 \r
+FLA_Trinv_ln_ops_var3 \r
+FLA_Trinv_ln_opd_var3 \r
+FLA_Trinv_ln_opc_var3 \r
+FLA_Trinv_ln_opz_var3 \r
+FLA_Trinv_ln_opt_var4 \r
+FLA_Trinv_ln_ops_var4 \r
+FLA_Trinv_ln_opd_var4 \r
+FLA_Trinv_ln_opc_var4 \r
+FLA_Trinv_ln_opz_var4 \r
+FLA_Trinv_ln_unb_var1 \r
+FLA_Trinv_ln_unb_var2 \r
+FLA_Trinv_ln_unb_var3 \r
+FLA_Trinv_ln_unb_var4 \r
+FLA_Trinv_lu \r
+FLA_Trinv_lu_blk_var1 \r
+FLA_Trinv_lu_blk_var2 \r
+FLA_Trinv_lu_blk_var3 \r
+FLA_Trinv_lu_blk_var4 \r
+FLA_Trinv_lu_opt_var1 \r
+FLA_Trinv_lu_ops_var1 \r
+FLA_Trinv_lu_opd_var1 \r
+FLA_Trinv_lu_opc_var1 \r
+FLA_Trinv_lu_opz_var1 \r
+FLA_Trinv_lu_opt_var2 \r
+FLA_Trinv_lu_ops_var2 \r
+FLA_Trinv_lu_opd_var2 \r
+FLA_Trinv_lu_opc_var2 \r
+FLA_Trinv_lu_opz_var2 \r
+FLA_Trinv_lu_opt_var3 \r
+FLA_Trinv_lu_ops_var3 \r
+FLA_Trinv_lu_opd_var3 \r
+FLA_Trinv_lu_opc_var3 \r
+FLA_Trinv_lu_opz_var3 \r
+FLA_Trinv_lu_opt_var4 \r
+FLA_Trinv_lu_ops_var4 \r
+FLA_Trinv_lu_opd_var4 \r
+FLA_Trinv_lu_opc_var4 \r
+FLA_Trinv_lu_opz_var4 \r
+FLA_Trinv_lu_unb_var1 \r
+FLA_Trinv_lu_unb_var2 \r
+FLA_Trinv_lu_unb_var3 \r
+FLA_Trinv_lu_unb_var4 \r
+FLA_Trinv_task \r
+FLA_Trinv_ln_task \r
+FLA_Trinv_lu_task \r
+FLA_Trinv_un_task \r
+FLA_Trinv_uu_task \r
+FLA_Trinv_un \r
+FLA_Trinv_unb_external \r
+FLA_Trinv_ln_unb_ext \r
+FLA_Trinv_lu_unb_ext \r
+FLA_Trinv_un_unb_ext \r
+FLA_Trinv_uu_unb_ext \r
+FLA_Trinv_un_blk_var1 \r
+FLA_Trinv_un_blk_var2 \r
+FLA_Trinv_un_blk_var3 \r
+FLA_Trinv_un_blk_var4 \r
+FLA_Trinv_un_opt_var1 \r
+FLA_Trinv_un_ops_var1 \r
+FLA_Trinv_un_opd_var1 \r
+FLA_Trinv_un_opc_var1 \r
+FLA_Trinv_un_opz_var1 \r
+FLA_Trinv_un_opt_var2 \r
+FLA_Trinv_un_ops_var2 \r
+FLA_Trinv_un_opd_var2 \r
+FLA_Trinv_un_opc_var2 \r
+FLA_Trinv_un_opz_var2 \r
+FLA_Trinv_un_opt_var3 \r
+FLA_Trinv_un_ops_var3 \r
+FLA_Trinv_un_opd_var3 \r
+FLA_Trinv_un_opc_var3 \r
+FLA_Trinv_un_opz_var3 \r
+FLA_Trinv_un_opt_var4 \r
+FLA_Trinv_un_ops_var4 \r
+FLA_Trinv_un_opd_var4 \r
+FLA_Trinv_un_opc_var4 \r
+FLA_Trinv_un_opz_var4 \r
+FLA_Trinv_un_unb_var1 \r
+FLA_Trinv_un_unb_var2 \r
+FLA_Trinv_un_unb_var3 \r
+FLA_Trinv_un_unb_var4 \r
+FLA_Trinv_uu \r
+FLA_Trinv_uu_blk_var1 \r
+FLA_Trinv_uu_blk_var2 \r
+FLA_Trinv_uu_blk_var3 \r
+FLA_Trinv_uu_blk_var4 \r
+FLA_Trinv_uu_opt_var1 \r
+FLA_Trinv_uu_ops_var1 \r
+FLA_Trinv_uu_opd_var1 \r
+FLA_Trinv_uu_opc_var1 \r
+FLA_Trinv_uu_opz_var1 \r
+FLA_Trinv_uu_opt_var2 \r
+FLA_Trinv_uu_ops_var2 \r
+FLA_Trinv_uu_opd_var2 \r
+FLA_Trinv_uu_opc_var2 \r
+FLA_Trinv_uu_opz_var2 \r
+FLA_Trinv_uu_opt_var3 \r
+FLA_Trinv_uu_ops_var3 \r
+FLA_Trinv_uu_opd_var3 \r
+FLA_Trinv_uu_opc_var3 \r
+FLA_Trinv_uu_opz_var3 \r
+FLA_Trinv_uu_opt_var4 \r
+FLA_Trinv_uu_ops_var4 \r
+FLA_Trinv_uu_opd_var4 \r
+FLA_Trinv_uu_opc_var4 \r
+FLA_Trinv_uu_opz_var4 \r
+FLA_Trinv_uu_unb_var1 \r
+FLA_Trinv_uu_unb_var2 \r
+FLA_Trinv_uu_unb_var3 \r
+FLA_Trinv_uu_unb_var4 \r
+FLA_Trmm \r
+FLA_Trmmsx_external \r
+FLA_Trmm_cntl_init \r
+FLA_Trmm_cntl_finalize \r
+FLA_Trmm_external \r
+FLA_Trmm_internal \r
+FLA_Trmm_llh \r
+FLA_Trmm_llh_blk_var1 \r
+FLA_Trmm_llh_blk_var2 \r
+FLA_Trmm_llh_blk_var3 \r
+FLA_Trmm_llh_blk_var4 \r
+FLA_Trmm_llh_unb_var1 \r
+FLA_Trmm_llh_unb_var2 \r
+FLA_Trmm_llh_unb_var3 \r
+FLA_Trmm_llh_unb_var4 \r
+FLA_Trmm_lln \r
+FLA_Trmm_lln_blk_var1 \r
+FLA_Trmm_lln_blk_var2 \r
+FLA_Trmm_lln_blk_var3 \r
+FLA_Trmm_lln_blk_var4 \r
+FLA_Trmm_lln_unb_var1 \r
+FLA_Trmm_lln_unb_var2 \r
+FLA_Trmm_lln_unb_var3 \r
+FLA_Trmm_lln_unb_var4 \r
+FLA_Trmm_llt \r
+FLA_Trmm_llt_blk_var1 \r
+FLA_Trmm_llt_blk_var2 \r
+FLA_Trmm_llt_blk_var3 \r
+FLA_Trmm_llt_blk_var4 \r
+FLA_Trmm_llt_unb_var1 \r
+FLA_Trmm_llt_unb_var2 \r
+FLA_Trmm_llt_unb_var3 \r
+FLA_Trmm_llt_unb_var4 \r
+FLA_Trmm_luh \r
+FLA_Trmm_luh_blk_var1 \r
+FLA_Trmm_luh_blk_var2 \r
+FLA_Trmm_luh_blk_var3 \r
+FLA_Trmm_luh_blk_var4 \r
+FLA_Trmm_luh_unb_var1 \r
+FLA_Trmm_luh_unb_var2 \r
+FLA_Trmm_luh_unb_var3 \r
+FLA_Trmm_luh_unb_var4 \r
+FLA_Trmm_lun \r
+FLA_Trmm_lun_blk_var1 \r
+FLA_Trmm_lun_blk_var2 \r
+FLA_Trmm_lun_blk_var3 \r
+FLA_Trmm_lun_blk_var4 \r
+FLA_Trmm_lun_unb_var1 \r
+FLA_Trmm_lun_unb_var2 \r
+FLA_Trmm_lun_unb_var3 \r
+FLA_Trmm_lun_unb_var4 \r
+FLA_Trmm_lut \r
+FLA_Trmm_lut_blk_var1 \r
+FLA_Trmm_lut_blk_var2 \r
+FLA_Trmm_lut_blk_var3 \r
+FLA_Trmm_lut_blk_var4 \r
+FLA_Trmm_lut_unb_var1 \r
+FLA_Trmm_lut_unb_var2 \r
+FLA_Trmm_lut_unb_var3 \r
+FLA_Trmm_lut_unb_var4 \r
+FLA_Trmm_rlh \r
+FLA_Trmm_rlh_blk_var1 \r
+FLA_Trmm_rlh_blk_var2 \r
+FLA_Trmm_rlh_blk_var3 \r
+FLA_Trmm_rlh_blk_var4 \r
+FLA_Trmm_rlh_unb_var1 \r
+FLA_Trmm_rlh_unb_var2 \r
+FLA_Trmm_rlh_unb_var3 \r
+FLA_Trmm_rlh_unb_var4 \r
+FLA_Trmm_rln \r
+FLA_Trmm_rln_blk_var1 \r
+FLA_Trmm_rln_blk_var2 \r
+FLA_Trmm_rln_blk_var3 \r
+FLA_Trmm_rln_blk_var4 \r
+FLA_Trmm_rln_unb_var1 \r
+FLA_Trmm_rln_unb_var2 \r
+FLA_Trmm_rln_unb_var3 \r
+FLA_Trmm_rln_unb_var4 \r
+FLA_Trmm_rlt \r
+FLA_Trmm_rlt_blk_var1 \r
+FLA_Trmm_rlt_blk_var2 \r
+FLA_Trmm_rlt_blk_var3 \r
+FLA_Trmm_rlt_blk_var4 \r
+FLA_Trmm_rlt_unb_var1 \r
+FLA_Trmm_rlt_unb_var2 \r
+FLA_Trmm_rlt_unb_var3 \r
+FLA_Trmm_rlt_unb_var4 \r
+FLA_Trmm_ruh \r
+FLA_Trmm_ruh_blk_var1 \r
+FLA_Trmm_ruh_blk_var2 \r
+FLA_Trmm_ruh_blk_var3 \r
+FLA_Trmm_ruh_blk_var4 \r
+FLA_Trmm_ruh_unb_var1 \r
+FLA_Trmm_ruh_unb_var2 \r
+FLA_Trmm_ruh_unb_var3 \r
+FLA_Trmm_ruh_unb_var4 \r
+FLA_Trmm_run \r
+FLA_Trmm_run_blk_var1 \r
+FLA_Trmm_run_blk_var2 \r
+FLA_Trmm_run_blk_var3 \r
+FLA_Trmm_run_blk_var4 \r
+FLA_Trmm_run_unb_var1 \r
+FLA_Trmm_run_unb_var2 \r
+FLA_Trmm_run_unb_var3 \r
+FLA_Trmm_run_unb_var4 \r
+FLA_Trmm_rut \r
+FLA_Trmm_rut_blk_var1 \r
+FLA_Trmm_rut_blk_var2 \r
+FLA_Trmm_rut_blk_var3 \r
+FLA_Trmm_rut_blk_var4 \r
+FLA_Trmm_rut_unb_var1 \r
+FLA_Trmm_rut_unb_var2 \r
+FLA_Trmm_rut_unb_var3 \r
+FLA_Trmm_rut_unb_var4 \r
+FLA_Trmm_task \r
+FLA_Trmm_llh_task \r
+FLA_Trmm_lln_task \r
+FLA_Trmm_llt_task \r
+FLA_Trmm_luh_task \r
+FLA_Trmm_lun_task \r
+FLA_Trmm_lut_task \r
+FLA_Trmm_rlh_task \r
+FLA_Trmm_rln_task \r
+FLA_Trmm_rlt_task \r
+FLA_Trmm_ruh_task \r
+FLA_Trmm_run_task \r
+FLA_Trmm_rut_task \r
+FLA_Trmv \r
+FLA_Trmvsx \r
+FLA_Trmvsx_external \r
+FLA_Trmv_external \r
+FLA_Trsm \r
+FLA_Trsmsx_external \r
+FLA_Trsm_cntl_init \r
+FLA_Trsm_cntl_finalize \r
+FLA_Trsm_external \r
+FLA_Trsm_internal \r
+FLA_Trsm_llh \r
+FLA_Trsm_llh_blk_var1 \r
+FLA_Trsm_llh_blk_var2 \r
+FLA_Trsm_llh_blk_var3 \r
+FLA_Trsm_llh_blk_var4 \r
+FLA_Trsm_llh_unb_var1 \r
+FLA_Trsm_llh_unb_var2 \r
+FLA_Trsm_llh_unb_var3 \r
+FLA_Trsm_llh_unb_var4 \r
+FLA_Trsm_lln \r
+FLA_Trsm_lln_blk_var1 \r
+FLA_Trsm_lln_blk_var2 \r
+FLA_Trsm_lln_blk_var3 \r
+FLA_Trsm_lln_blk_var4 \r
+FLA_Trsm_lln_unb_var1 \r
+FLA_Trsm_lln_unb_var2 \r
+FLA_Trsm_lln_unb_var3 \r
+FLA_Trsm_lln_unb_var4 \r
+FLA_Trsm_llt \r
+FLA_Trsm_llt_blk_var1 \r
+FLA_Trsm_llt_blk_var2 \r
+FLA_Trsm_llt_blk_var3 \r
+FLA_Trsm_llt_blk_var4 \r
+FLA_Trsm_llt_unb_var1 \r
+FLA_Trsm_llt_unb_var2 \r
+FLA_Trsm_llt_unb_var3 \r
+FLA_Trsm_llt_unb_var4 \r
+FLA_Trsm_luh \r
+FLA_Trsm_luh_blk_var1 \r
+FLA_Trsm_luh_blk_var2 \r
+FLA_Trsm_luh_blk_var3 \r
+FLA_Trsm_luh_blk_var4 \r
+FLA_Trsm_luh_unb_var1 \r
+FLA_Trsm_luh_unb_var2 \r
+FLA_Trsm_luh_unb_var3 \r
+FLA_Trsm_luh_unb_var4 \r
+FLA_Trsm_lun \r
+FLA_Trsm_lun_blk_var1 \r
+FLA_Trsm_lun_blk_var2 \r
+FLA_Trsm_lun_blk_var3 \r
+FLA_Trsm_lun_blk_var4 \r
+FLA_Trsm_lun_unb_var1 \r
+FLA_Trsm_lun_unb_var2 \r
+FLA_Trsm_lun_unb_var3 \r
+FLA_Trsm_lun_unb_var4 \r
+FLA_Trsm_lut \r
+FLA_Trsm_lut_blk_var1 \r
+FLA_Trsm_lut_blk_var2 \r
+FLA_Trsm_lut_blk_var3 \r
+FLA_Trsm_lut_blk_var4 \r
+FLA_Trsm_lut_unb_var1 \r
+FLA_Trsm_lut_unb_var2 \r
+FLA_Trsm_lut_unb_var3 \r
+FLA_Trsm_lut_unb_var4 \r
+FLA_Trsm_piv_task \r
+FLA_Trsm_rlh \r
+FLA_Trsm_rlh_blk_var1 \r
+FLA_Trsm_rlh_blk_var2 \r
+FLA_Trsm_rlh_blk_var3 \r
+FLA_Trsm_rlh_blk_var4 \r
+FLA_Trsm_rlh_unb_var1 \r
+FLA_Trsm_rlh_unb_var2 \r
+FLA_Trsm_rlh_unb_var3 \r
+FLA_Trsm_rlh_unb_var4 \r
+FLA_Trsm_rln \r
+FLA_Trsm_rln_blk_var1 \r
+FLA_Trsm_rln_blk_var2 \r
+FLA_Trsm_rln_blk_var3 \r
+FLA_Trsm_rln_blk_var4 \r
+FLA_Trsm_rln_unb_var1 \r
+FLA_Trsm_rln_unb_var2 \r
+FLA_Trsm_rln_unb_var3 \r
+FLA_Trsm_rln_unb_var4 \r
+FLA_Trsm_rlt \r
+FLA_Trsm_rlt_blk_var1 \r
+FLA_Trsm_rlt_blk_var2 \r
+FLA_Trsm_rlt_blk_var3 \r
+FLA_Trsm_rlt_blk_var4 \r
+FLA_Trsm_rlt_unb_var1 \r
+FLA_Trsm_rlt_unb_var2 \r
+FLA_Trsm_rlt_unb_var3 \r
+FLA_Trsm_rlt_unb_var4 \r
+FLA_Trsm_ruh \r
+FLA_Trsm_ruh_blk_var1 \r
+FLA_Trsm_ruh_blk_var2 \r
+FLA_Trsm_ruh_blk_var3 \r
+FLA_Trsm_ruh_blk_var4 \r
+FLA_Trsm_ruh_unb_var1 \r
+FLA_Trsm_ruh_unb_var2 \r
+FLA_Trsm_ruh_unb_var3 \r
+FLA_Trsm_ruh_unb_var4 \r
+FLA_Trsm_run \r
+FLA_Trsm_run_blk_var1 \r
+FLA_Trsm_run_blk_var2 \r
+FLA_Trsm_run_blk_var3 \r
+FLA_Trsm_run_blk_var4 \r
+FLA_Trsm_run_unb_var1 \r
+FLA_Trsm_run_unb_var2 \r
+FLA_Trsm_run_unb_var3 \r
+FLA_Trsm_run_unb_var4 \r
+FLA_Trsm_rut \r
+FLA_Trsm_rut_blk_var1 \r
+FLA_Trsm_rut_blk_var2 \r
+FLA_Trsm_rut_blk_var3 \r
+FLA_Trsm_rut_blk_var4 \r
+FLA_Trsm_rut_unb_var1 \r
+FLA_Trsm_rut_unb_var2 \r
+FLA_Trsm_rut_unb_var3 \r
+FLA_Trsm_rut_unb_var4 \r
+FLA_Trsm_task \r
+FLA_Trsm_llh_task \r
+FLA_Trsm_lln_task \r
+FLA_Trsm_llt_task \r
+FLA_Trsm_luh_task \r
+FLA_Trsm_lun_task \r
+FLA_Trsm_lut_task \r
+FLA_Trsm_rlh_task \r
+FLA_Trsm_rln_task \r
+FLA_Trsm_rlt_task \r
+FLA_Trsm_ruh_task \r
+FLA_Trsm_run_task \r
+FLA_Trsm_rut_task \r
+FLA_Trsv \r
+FLA_Trsvsx \r
+FLA_Trsvsx_external \r
+FLA_Trsv_cntl_init \r
+FLA_Trsv_cntl_finalize \r
+FLA_Trsv_external \r
+FLA_Trsv_internal \r
+FLA_Trsv_lc \r
+FLA_Trsv_lc_blk_var1 \r
+FLA_Trsv_lc_blk_var2 \r
+FLA_Trsv_ln \r
+FLA_Trsv_ln_blk_var1 \r
+FLA_Trsv_ln_blk_var2 \r
+FLA_Trsv_lt \r
+FLA_Trsv_lt_blk_var1 \r
+FLA_Trsv_lt_blk_var2 \r
+FLA_Trsv_task \r
+FLA_Trsv_lc_task \r
+FLA_Trsv_ln_task \r
+FLA_Trsv_lt_task \r
+FLA_Trsv_uc_task \r
+FLA_Trsv_un_task \r
+FLA_Trsv_ut_task \r
+FLA_Trsv_uc \r
+FLA_Trsv_uc_blk_var1 \r
+FLA_Trsv_uc_blk_var2 \r
+FLA_Trsv_un \r
+FLA_Trsv_un_blk_var1 \r
+FLA_Trsv_un_blk_var2 \r
+FLA_Trsv_ut \r
+FLA_Trsv_ut_blk_var1 \r
+FLA_Trsv_ut_blk_var2 \r
+FLA_Ttmm \r
+FLA_Ttmm_blk_external \r
+FLA_Ttmm_cntl_init \r
+FLA_Ttmm_cntl_finalize \r
+FLA_Ttmm_internal \r
+FLA_Ttmm_l \r
+FLA_Ttmm_l_blk_var1 \r
+FLA_Ttmm_l_blk_var2 \r
+FLA_Ttmm_l_blk_var3 \r
+FLA_Ttmm_l_opt_var1 \r
+FLA_Ttmm_l_ops_var1 \r
+FLA_Ttmm_l_opd_var1 \r
+FLA_Ttmm_l_opc_var1 \r
+FLA_Ttmm_l_opz_var1 \r
+FLA_Ttmm_l_opt_var2 \r
+FLA_Ttmm_l_ops_var2 \r
+FLA_Ttmm_l_opd_var2 \r
+FLA_Ttmm_l_opc_var2 \r
+FLA_Ttmm_l_opz_var2 \r
+FLA_Ttmm_l_opt_var3 \r
+FLA_Ttmm_l_ops_var3 \r
+FLA_Ttmm_l_opd_var3 \r
+FLA_Ttmm_l_opc_var3 \r
+FLA_Ttmm_l_opz_var3 \r
+FLA_Ttmm_l_unb_var1 \r
+FLA_Ttmm_l_unb_var2 \r
+FLA_Ttmm_l_unb_var3 \r
+FLA_Ttmm_task \r
+FLA_Ttmm_l_task \r
+FLA_Ttmm_u_task \r
+FLA_Ttmm_u \r
+FLA_Ttmm_unb_external \r
+FLA_Ttmm_l_unb_ext \r
+FLA_Ttmm_u_unb_ext \r
+FLA_Ttmm_u_blk_var1 \r
+FLA_Ttmm_u_blk_var2 \r
+FLA_Ttmm_u_blk_var3 \r
+FLA_Ttmm_u_opt_var1 \r
+FLA_Ttmm_u_ops_var1 \r
+FLA_Ttmm_u_opd_var1 \r
+FLA_Ttmm_u_opc_var1 \r
+FLA_Ttmm_u_opz_var1 \r
+FLA_Ttmm_u_opt_var2 \r
+FLA_Ttmm_u_ops_var2 \r
+FLA_Ttmm_u_opd_var2 \r
+FLA_Ttmm_u_opc_var2 \r
+FLA_Ttmm_u_opz_var2 \r
+FLA_Ttmm_u_opt_var3 \r
+FLA_Ttmm_u_ops_var3 \r
+FLA_Ttmm_u_opd_var3 \r
+FLA_Ttmm_u_opc_var3 \r
+FLA_Ttmm_u_opz_var3 \r
+FLA_Ttmm_u_unb_var1 \r
+FLA_Ttmm_u_unb_var2 \r
+FLA_Ttmm_u_unb_var3 \r
+FLA_Part_2x2 \r
+FLA_Part_2x1 \r
+FLA_Part_1x2 \r
+FLA_Repart_2x2_to_3x3 \r
+FLA_Repart_2x1_to_3x1 \r
+FLA_Repart_1x2_to_1x3 \r
+FLA_Cont_with_3x3_to_2x2 \r
+FLA_Cont_with_3x1_to_2x1 \r
+FLA_Cont_with_1x3_to_1x2 \r
+FLA_Merge_2x2 \r
+FLA_Merge_2x1 \r
+FLA_Merge_1x2 \r
index 2fa601b01f4723030d2940a8c06ff0222821b4d5..7ea6f83ab8cb1eff07f18e4259c07883f678a1a4 100644 (file)
-::
-::
-:: BLIS
-:: An object-based framework for developing high-performance BLAS-like
-:: libraries.
-::
-:: Copyright (C) 2014, The University of Texas at Austin
-::
-:: Redistribution and use in source and binary forms, with or without
-:: modification, are permitted provided that the following conditions are
-:: met:
-:: - Redistributions of source code must retain the above copyright
-:: notice, this list of conditions and the following disclaimer.
-:: - Redistributions in binary form must reproduce the above copyright
-:: notice, this list of conditions and the following disclaimer in the
-:: documentation and/or other materials provided with the distribution.
-:: - Neither the name of The University of Texas at Austin nor the names
-:: of its contributors may be used to endorse or promote products
-:: derived from this software without specific prior written permission.
-::
-:: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-:: "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-:: LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-:: A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-:: HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-:: SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-:: LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-:: DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-:: THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-:: (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-:: OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-::
-::
-
-@echo off
-
-echo.
-echo Makefile
-echo.
-echo Field G. Van Zee
-echo.
-echo nmake Makefile for building BLIS for Microsoft Windows. nmake targets
-echo may be invoked after running the configure.cmd script. Valid targets are:
-echo.
-echo all - Invoke the lib and dll targets.
-echo lib - Build BLIS as a static library.
-echo dll - Build BLIS as a dynamically-linked library.
-echo help - Output help and usage information.
-echo clean - Invoke clean-log and clean-build targets.
-echo clean-log - Remove any log files present.
-echo clean-config - Remove all products of configure.cmd. Namely, remove the
-echo config, include, and src directories.
-echo clean-build - Remove all products of the compilation portion of the build
-echo process. Namely, remove the obj, lib, and dll directories.
-echo distclean - Invoke clean-log, clean-config, and clean-build targets.
-echo.
-echo The Makefile also recognizes configuration options corresponding to the
-echo following Makefile variables:
-echo.
-echo VERBOSE - When defined, nmake outputs the actual commands
-echo executed instead of more concise one-line progress
-echo indicators. (Undefined by default.)
-echo.
-echo Typically, these options are specified by commenting or uncommenting the
-echo corresponding lines in the Makefile. However, if the Makefile currently does
-echo not define one of the options, and you wish to enable the corresponding
-echo feature without editing the Makefile, you may define the variable at the
-echo command line when nmake is invoked. For example, you may enable verboseness
-echo while invoking the lib target as follows:
-echo.
-echo nmake lib VERBOSE=1
-echo.
+::\r
+::\r
+:: BLIS \r
+:: An object-based framework for developing high-performance BLAS-like\r
+:: libraries.\r
+::\r
+:: Copyright (C) 2014, The University of Texas at Austin\r
+::\r
+:: Redistribution and use in source and binary forms, with or without\r
+:: modification, are permitted provided that the following conditions are\r
+:: met:\r
+:: - Redistributions of source code must retain the above copyright\r
+:: notice, this list of conditions and the following disclaimer.\r
+:: - Redistributions in binary form must reproduce the above copyright\r
+:: notice, this list of conditions and the following disclaimer in the\r
+:: documentation and/or other materials provided with the distribution.\r
+:: - Neither the name of The University of Texas at Austin nor the names\r
+:: of its contributors may be used to endorse or promote products\r
+:: derived from this software without specific prior written permission.\r
+::\r
+:: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\r
+:: "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\r
+:: LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\r
+:: A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT\r
+:: HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\r
+:: SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\r
+:: LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\r
+:: DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\r
+:: THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\r
+:: (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\r
+:: OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
+::\r
+::\r
+\r
+@echo off\r
+\r
+echo. \r
+echo Makefile\r
+echo. \r
+echo Field G. Van Zee\r
+echo. \r
+echo nmake Makefile for building BLIS for Microsoft Windows. nmake targets\r
+echo may be invoked after running the configure.cmd script. Valid targets are:\r
+echo. \r
+echo all - Invoke the lib and dll targets.\r
+echo lib - Build BLIS as a static library.\r
+echo dll - Build BLIS as a dynamically-linked library.\r
+echo help - Output help and usage information.\r
+echo clean - Invoke clean-log and clean-build targets.\r
+echo clean-log - Remove any log files present.\r
+echo clean-config - Remove all products of configure.cmd. Namely, remove the\r
+echo config, include, and src directories.\r
+echo clean-build - Remove all products of the compilation portion of the build\r
+echo process. Namely, remove the obj, lib, and dll directories.\r
+echo distclean - Invoke clean-log, clean-config, and clean-build targets.\r
+echo.\r
+echo The Makefile also recognizes configuration options corresponding to the\r
+echo following Makefile variables:\r
+echo.\r
+echo VERBOSE - When defined, nmake outputs the actual commands\r
+echo executed instead of more concise one-line progress\r
+echo indicators. (Undefined by default.)\r
+echo.\r
+echo Typically, these options are specified by commenting or uncommenting the\r
+echo corresponding lines in the Makefile. However, if the Makefile currently does\r
+echo not define one of the options, and you wish to enable the corresponding\r
+echo feature without editing the Makefile, you may define the variable at the\r
+echo command line when nmake is invoked. For example, you may enable verboseness\r
+echo while invoking the lib target as follows:\r
+echo.\r
+echo nmake lib VERBOSE=1\r
+echo.\r
index 37b965cb7bddb330c86eb25bf3d95693d0423f09..98115790e3fc839b38697a3a9c886b6ac75403be 100644 (file)
-::
-::
-:: BLIS
-:: An object-based framework for developing high-performance BLAS-like
-:: libraries.
-::
-:: Copyright (C) 2014, The University of Texas at Austin
-::
-:: Redistribution and use in source and binary forms, with or without
-:: modification, are permitted provided that the following conditions are
-:: met:
-:: - Redistributions of source code must retain the above copyright
-:: notice, this list of conditions and the following disclaimer.
-:: - Redistributions in binary form must reproduce the above copyright
-:: notice, this list of conditions and the following disclaimer in the
-:: documentation and/or other materials provided with the distribution.
-:: - Neither the name of The University of Texas at Austin nor the names
-:: of its contributors may be used to endorse or promote products
-:: derived from this software without specific prior written permission.
-::
-:: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-:: "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-:: LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-:: A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-:: HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-:: SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-:: LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-:: DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-:: THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-:: (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-:: OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-::
-::
-
-@echo off
-
-:ENVIRONMENT
- set GEN_CHECK_REV_FILE=.\build\gen-check-rev-file.py
- set GATHER_SRC=.\build\gather-src-for-windows.py
- set GEN_CONFIG_FILE=.\build\gen-config-file.py
- set CONFIG_DEFS_TEMPL=.\build\config.mk.in
- set SRC_TREE_DIR=..\frame
- set TOP_BUILD_DIR=.
-
-:PARAMS
- if "%1"=="" (goto USAGE)
- if "%2"=="" (goto USAGE)
- if "%3"=="" (goto USAGE)
-
- set ARCH=%1
- set BUILD=%2
- set CCOMPILER=%3
-
-:TASK_UNIT
- echo %0: Checking/updating revision file.
- %GEN_CHECK_REV_FILE% -v
- echo %0: Gathering source files into local flat directories.
- %GATHER_SRC% %SRC_TREE_DIR% %TOP_BUILD_DIR%
- echo %0: Creating configure definitions file.
- %GEN_CONFIG_FILE% %TOP_BUILD_DIR% %ARCH% %BUILD% %CCOMPILER% %CONFIG_DEFS_TEMPL%
- echo %0: Configuration and setup complete. You may now run nmake.
-
- goto END
-
-:USAGE
- echo.
- echo configure.cmd
- echo.
- echo A wrapper script for various configuration and setup scripts that need
- echo. to be run before nmake when building BLIS for Microsoft Windows.
- echo.
- echo USAGE:
- echo %0 [arch] [build] [cc]
- echo.
- echo arch -- The architecture string to build.
- echo Supported values: {x86,x64}
- echo build -- The kind of build.
- echo Supported values: {debug,release}
- echo cc -- The C compiler to use.
- echo Supported values: {icl,cl}
- echo.
- echo examples:
- echo %0 x86 debug icl
- echo %0 x64 release cl
- echo.
-
-:END
+::\r
+::\r
+:: BLIS \r
+:: An object-based framework for developing high-performance BLAS-like\r
+:: libraries.\r
+::\r
+:: Copyright (C) 2014, The University of Texas at Austin\r
+::\r
+:: Redistribution and use in source and binary forms, with or without\r
+:: modification, are permitted provided that the following conditions are\r
+:: met:\r
+:: - Redistributions of source code must retain the above copyright\r
+:: notice, this list of conditions and the following disclaimer.\r
+:: - Redistributions in binary form must reproduce the above copyright\r
+:: notice, this list of conditions and the following disclaimer in the\r
+:: documentation and/or other materials provided with the distribution.\r
+:: - Neither the name of The University of Texas at Austin nor the names\r
+:: of its contributors may be used to endorse or promote products\r
+:: derived from this software without specific prior written permission.\r
+::\r
+:: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\r
+:: "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\r
+:: LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\r
+:: A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT\r
+:: HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\r
+:: SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\r
+:: LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\r
+:: DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\r
+:: THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\r
+:: (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\r
+:: OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
+::\r
+::\r
+\r
+@echo off\r
+\r
+:ENVIRONMENT\r
+ set GEN_CHECK_REV_FILE=.\build\gen-check-rev-file.py\r
+ set GATHER_SRC=.\build\gather-src-for-windows.py\r
+ set GEN_CONFIG_FILE=.\build\gen-config-file.py\r
+ set CONFIG_DEFS_TEMPL=.\build\config.mk.in\r
+ set SRC_TREE_DIR=..\frame\r
+ set TOP_BUILD_DIR=.\r
+\r
+:PARAMS\r
+ if "%1"=="" (goto USAGE)\r
+ if "%2"=="" (goto USAGE)\r
+ if "%3"=="" (goto USAGE)\r
+\r
+ set ARCH=%1\r
+ set BUILD=%2\r
+ set CCOMPILER=%3\r
+ \r
+:TASK_UNIT\r
+ echo %0: Checking/updating revision file.\r
+ %GEN_CHECK_REV_FILE% -v\r
+ echo %0: Gathering source files into local flat directories.\r
+ %GATHER_SRC% %SRC_TREE_DIR% %TOP_BUILD_DIR%\r
+ echo %0: Creating configure definitions file.\r
+ %GEN_CONFIG_FILE% %TOP_BUILD_DIR% %ARCH% %BUILD% %CCOMPILER% %CONFIG_DEFS_TEMPL%\r
+ echo %0: Configuration and setup complete. You may now run nmake. \r
+\r
+ goto END\r
+\r
+:USAGE\r
+ echo. \r
+ echo configure.cmd\r
+ echo. \r
+ echo A wrapper script for various configuration and setup scripts that need\r
+ echo. to be run before nmake when building BLIS for Microsoft Windows.\r
+ echo. \r
+ echo USAGE:\r
+ echo %0 [arch] [build] [cc]\r
+ echo.\r
+ echo arch -- The architecture string to build.\r
+ echo Supported values: {x86,x64}\r
+ echo build -- The kind of build.\r
+ echo Supported values: {debug,release}\r
+ echo cc -- The C compiler to use.\r
+ echo Supported values: {icl,cl}\r
+ echo. \r
+ echo examples:\r
+ echo %0 x86 debug icl\r
+ echo %0 x64 release cl\r
+ echo.\r
+\r
+:END\r
index 0d73eb01e11df5879e4f62679b80eed5f1282f45..db0cdc1d26bdd664c89e973e97ae762582a1633a 100644 (file)
--- a/blis/windows/gendll.cmd
+++ b/blis/windows/gendll.cmd
-@echo off
-@setlocal enabledelayedexpansion
-
-rem --------------------------------------------------------------------
-rem Build a dll out of a set of object files specified by the
-rem argument /objlist.
-rem
-rem The .lib file thus created is an "import" library, which one links
-rem with, but the bulk of the code ends up in the associated .dll file.
-rem ---------------------------------------------------------------------
-
-set THIS_SCRIPT=%~dp0%~nx0
-
-if "%1"=="" goto USAGE
-if "%2"=="" goto USAGE
-if "%3"=="" goto USAGE
-if "%4"=="" goto USAGE
-if "%5"=="" goto USAGE
-
-set gd_lib_name=%1
-set gd_link=%gd_lib_name%-static.link
-set LINKER=%3
-set LINKARGSFILE=%4
-set gd_def=%5
-
-:PARSE_ARGS
-set IMPORT=
-set OBJLIST=
-:ARGLOOP
-if "%6"=="" goto ENDARGLOOP
-if /i not "%6"=="/import" goto OBJARG
-set IMPORT=!IMPORT! %7
-goto SHIFT
-:OBJARG
-if /i not "%6"=="/objlist" goto ENDARGLOOP
-set OBJLIST=%7
-:SHIFT
-shift /4
-shift /4
-goto ARGLOOP
-:ENDARGLOOP
-
-if defined OBJLIST goto COMPILER_SETUP
-echo Error: must supply /objlist <file with list of object names>
-goto USAGE
-
-:COMPILER_SETUP
-set gd_path=%2
-set gd_dll_path=%gd_path%.dll
-set gd_main_c=dll_main__%gd_lib_name%.c
-set gd_main_obj=dll_main__%gd_lib_name%.obj
-
-rem create C file for dll_main
-for /F "tokens=*" %%i in ("#include <windows.h>") do echo %%i >%gd_main_c%
-echo. >>%gd_main_c%
-echo BOOLEAN WINAPI DllMain( >>%gd_main_c%
-echo HINSTANCE hDllHandle, >>%gd_main_c%
-echo DWORD nReason, >>%gd_main_c%
-echo LPVOID Reserved){ >>%gd_main_c%
-echo. >>%gd_main_c%
-echo BOOLEAN bSuccess = TRUE;>>%gd_main_c%
-echo. >>%gd_main_c%
-echo switch (nReason){ >>%gd_main_c%
-echo case DLL_PROCESS_ATTACH: >>%gd_main_c%
-echo DisableThreadLibraryCalls( hDllHandle ); >>%gd_main_c%
-echo break; >>%gd_main_c%
-echo case DLL_PROCESS_DETACH: >>%gd_main_c%
-echo break; >>%gd_main_c%
-echo. >>%gd_main_c%
-echo }; >>%gd_main_c%
-echo. >>%gd_main_c%
-echo return bSuccess; >>%gd_main_c%
-echo }; >>%gd_main_c%
-echo.>>%gd_main_c%
-
-rem set up link file by specifying dll filepath and main object
-echo /Fe%gd_dll_path% > %gd_link%
-echo %gd_main_obj% >> %gd_link%
-
-rem add contents of linkargs file; most of the link argument action is
-rem in this file
-type %LINKARGSFILE% >> %gd_link%
-
-rem add command-line import libraries, if any
-if defined IMPORT echo !IMPORT! >> %gd_link%
-
-rem add export specification
-echo %gd_def% >> %gd_link%
-
-rem add contents of OBJLIST file
-type %OBJLIST% >> %gd_link%
-
-rem create dll, import lib, and export file
-%LINKER% /nologo /c /O2 /Fo%gd_main_obj% %gd_main_c% >> gendll-cl.log
-%LINKER% @%gd_link%
-
-:CLEANUP
-del /F /Q %gd_link% %gd_main_c% %gd_main_obj% gendll-cl.log
-goto END
-
-
-:USAGE
-echo.
-echo. gendll.cmd
-echo.
-echo. Generate a dynamically-linked library from a set of object files
-echo. specified in objlist_file.
-echo.
-echo. Usage:
-echo. %0 dllname dllpath linker linkargs_file symbols_file {/import importlib} /objlist objlist_file
-echo.
-echo. dllname -- the name of the DLL being created, with no extension.
-echo. dllpath -- the path to the DLL being created, with no extension.
-echo. linker -- the compiler to use to link the DLL.
-echo. linkargs_file -- the path to a file containing a list of all linker
-echo. arguments--link options, libraries, and library paths--
-echo. that that may be needed to successfully link the DLL
-echo. being created.
-echo. symbols_file -- the path to a file containing a list of symbols to
-echo. export in the DLL.
-echo. importlib -- the path to a .lib library that you wish to import into
-echo. the DLL being created. Optional.
-echo. objlist_file -- the path to a file containing the list of object files
-echo. that make up the bulk of the DLL being created.
-echo.
-
-:END
-endlocal
+@echo off\r
+@setlocal enabledelayedexpansion\r
+\r
+rem --------------------------------------------------------------------\r
+rem Build a dll out of a set of object files specified by the \r
+rem argument /objlist.\r
+rem\r
+rem The .lib file thus created is an "import" library, which one links\r
+rem with, but the bulk of the code ends up in the associated .dll file.\r
+rem ---------------------------------------------------------------------\r
+\r
+set THIS_SCRIPT=%~dp0%~nx0\r
+\r
+if "%1"=="" goto USAGE\r
+if "%2"=="" goto USAGE\r
+if "%3"=="" goto USAGE\r
+if "%4"=="" goto USAGE\r
+if "%5"=="" goto USAGE\r
+\r
+set gd_lib_name=%1\r
+set gd_link=%gd_lib_name%-static.link\r
+set LINKER=%3\r
+set LINKARGSFILE=%4\r
+set gd_def=%5\r
+\r
+:PARSE_ARGS\r
+set IMPORT=\r
+set OBJLIST=\r
+:ARGLOOP\r
+if "%6"=="" goto ENDARGLOOP\r
+if /i not "%6"=="/import" goto OBJARG\r
+set IMPORT=!IMPORT! %7\r
+goto SHIFT\r
+:OBJARG\r
+if /i not "%6"=="/objlist" goto ENDARGLOOP\r
+set OBJLIST=%7\r
+:SHIFT\r
+shift /4\r
+shift /4\r
+goto ARGLOOP\r
+:ENDARGLOOP\r
+\r
+if defined OBJLIST goto COMPILER_SETUP\r
+echo Error: must supply /objlist <file with list of object names>\r
+goto USAGE\r
+\r
+:COMPILER_SETUP\r
+set gd_path=%2\r
+set gd_dll_path=%gd_path%.dll\r
+set gd_main_c=dll_main__%gd_lib_name%.c\r
+set gd_main_obj=dll_main__%gd_lib_name%.obj\r
+\r
+rem create C file for dll_main\r
+for /F "tokens=*" %%i in ("#include <windows.h>") do echo %%i >%gd_main_c%\r
+echo. >>%gd_main_c%\r
+echo BOOLEAN WINAPI DllMain( >>%gd_main_c%\r
+echo HINSTANCE hDllHandle, >>%gd_main_c%\r
+echo DWORD nReason, >>%gd_main_c%\r
+echo LPVOID Reserved){ >>%gd_main_c%\r
+echo. >>%gd_main_c%\r
+echo BOOLEAN bSuccess = TRUE;>>%gd_main_c%\r
+echo. >>%gd_main_c%\r
+echo switch (nReason){ >>%gd_main_c%\r
+echo case DLL_PROCESS_ATTACH: >>%gd_main_c%\r
+echo DisableThreadLibraryCalls( hDllHandle ); >>%gd_main_c%\r
+echo break; >>%gd_main_c%\r
+echo case DLL_PROCESS_DETACH: >>%gd_main_c%\r
+echo break; >>%gd_main_c%\r
+echo. >>%gd_main_c%\r
+echo }; >>%gd_main_c%\r
+echo. >>%gd_main_c%\r
+echo return bSuccess; >>%gd_main_c%\r
+echo }; >>%gd_main_c%\r
+echo.>>%gd_main_c%\r
+\r
+rem set up link file by specifying dll filepath and main object\r
+echo /Fe%gd_dll_path% > %gd_link%\r
+echo %gd_main_obj% >> %gd_link%\r
+\r
+rem add contents of linkargs file; most of the link argument action is\r
+rem in this file\r
+type %LINKARGSFILE% >> %gd_link%\r
+\r
+rem add command-line import libraries, if any\r
+if defined IMPORT echo !IMPORT! >> %gd_link%\r
+\r
+rem add export specification\r
+echo %gd_def% >> %gd_link%\r
+\r
+rem add contents of OBJLIST file\r
+type %OBJLIST% >> %gd_link%\r
+\r
+rem create dll, import lib, and export file\r
+%LINKER% /nologo /c /O2 /Fo%gd_main_obj% %gd_main_c% >> gendll-cl.log\r
+%LINKER% @%gd_link%\r
+\r
+:CLEANUP\r
+del /F /Q %gd_link% %gd_main_c% %gd_main_obj% gendll-cl.log\r
+goto END\r
+\r
+\r
+:USAGE\r
+echo. \r
+echo. gendll.cmd\r
+echo. \r
+echo. Generate a dynamically-linked library from a set of object files\r
+echo. specified in objlist_file.\r
+echo. \r
+echo. Usage:\r
+echo. %0 dllname dllpath linker linkargs_file symbols_file {/import importlib} /objlist objlist_file\r
+echo.\r
+echo. dllname -- the name of the DLL being created, with no extension.\r
+echo. dllpath -- the path to the DLL being created, with no extension.\r
+echo. linker -- the compiler to use to link the DLL.\r
+echo. linkargs_file -- the path to a file containing a list of all linker\r
+echo. arguments--link options, libraries, and library paths--\r
+echo. that that may be needed to successfully link the DLL\r
+echo. being created.\r
+echo. symbols_file -- the path to a file containing a list of symbols to\r
+echo. export in the DLL.\r
+echo. importlib -- the path to a .lib library that you wish to import into\r
+echo. the DLL being created. Optional.\r
+echo. objlist_file -- the path to a file containing the list of object files\r
+echo. that make up the bulk of the DLL being created.\r
+echo.\r
+\r
+:END\r
+endlocal\r
index d21c4dd3ae735523578c2767001ab590fa4ee662..7110fabfcf341b0e1dbdba69291463a1e464bf08 100644 (file)
--- a/build/tar_files_list.txt
+++ b/build/tar_files_list.txt
cblas/README.TI
cblas/src
clapack
+ticblas/ticblas.h
+ticblas/src
diff --git a/build_opencl_k2h_large.sh b/build_opencl_k2h_large.sh
--- /dev/null
@@ -0,0 +1,19 @@
+#!/bin/bash
+# Build LINALG (ARM+DSP, OpenCL) for SOC_K2H with MEM_MODEL=Large, install it, and
+# build the matmpy example. All paths are relative to the directory the script is started from.
+source setup_env_devkit.sh
+# Rebuild libarch; the make commands run only if the cd succeeded, never in the wrong directory.
+cd ../../libarch_intgit/libarch && {
+    make clean
+    make TARGET=SOC_K2H LIBOS=LIB_OPENCL
+    cd -
+}
+make cleanARMplusDSP MEM_MODEL=Large TARGET=SOC_K2H LIBOS=LIB_OPENCL
+make ARMplusDSP MEM_MODEL=Large TARGET=SOC_K2H LIBOS=LIB_OPENCL
+make installARMplusDSPlib DESTDIR=~/proclibs/linalg_opencl_k2h_Large_install
+export LINALG_DIR=~/proclibs/linalg_opencl_k2h_Large_install
+# Build the example only from inside its own directory, so a failed cd cannot "make clean" the library tree.
+cd examples/arm+dsp/matmpy && {
+    make clean
+    make
+}
diff --git a/build_opencl_k2h_medium.sh b/build_opencl_k2h_medium.sh
--- /dev/null
@@ -0,0 +1,19 @@
+#!/bin/bash
+# Build LINALG (ARM+DSP, OpenCL) for SOC_K2H with MEM_MODEL=Medium, install it, and
+# build the matmpy example. All paths are relative to the directory the script is started from.
+source setup_env_devkit.sh
+# Rebuild libarch; the make commands run only if the cd succeeded, never in the wrong directory.
+cd ../../libarch_intgit/libarch && {
+    make clean
+    make TARGET=SOC_K2H LIBOS=LIB_OPENCL
+    cd -
+}
+make cleanARMplusDSP MEM_MODEL=Medium TARGET=SOC_K2H LIBOS=LIB_OPENCL
+make ARMplusDSP MEM_MODEL=Medium TARGET=SOC_K2H LIBOS=LIB_OPENCL
+make installARMplusDSPlib DESTDIR=~/proclibs/linalg_opencl_k2h_medium_install
+export LINALG_DIR=~/proclibs/linalg_opencl_k2h_medium_install
+# Build the example only from inside its own directory, so a failed cd cannot "make clean" the library tree.
+cd examples/arm+dsp/matmpy && {
+    make clean
+    make
+}
diff --git a/build_opencl_k2h_small.sh b/build_opencl_k2h_small.sh
--- /dev/null
@@ -0,0 +1,19 @@
+#!/bin/bash
+# Build LINALG (ARM+DSP, OpenCL) for SOC_K2H with MEM_MODEL=Small, install it, and
+# build the matmpy example. All paths are relative to the directory the script is started from.
+source setup_env_devkit.sh
+# Rebuild libarch; the make commands run only if the cd succeeded, never in the wrong directory.
+cd ../../libarch_intgit/libarch && {
+    make clean
+    make TARGET=SOC_K2H LIBOS=LIB_OPENCL
+    cd -
+}
+make cleanARMplusDSP MEM_MODEL=Small TARGET=SOC_K2H LIBOS=LIB_OPENCL
+make ARMplusDSP MEM_MODEL=Small TARGET=SOC_K2H LIBOS=LIB_OPENCL
+make installARMplusDSPlib DESTDIR=~/proclibs/linalg_opencl_k2h_small_install
+export LINALG_DIR=~/proclibs/linalg_opencl_k2h_small_install
+# Build the example only from inside its own directory, so a failed cd cannot "make clean" the library tree.
+cd examples/arm+dsp/matmpy && {
+    make clean
+    make
+}
diff --git a/build_rtos_c6678_small.sh b/build_rtos_c6678_small.sh
--- /dev/null
@@ -0,0 +1,18 @@
+#!/bin/bash
+# Build the DSP-only LINALG libraries (RTOS) for SOC_C6678 with MEM_MODEL=Small, install them,
+# and build the dgemm_test example. All paths are relative to the directory the script is started from.
+# Rebuild libarch; the make commands run only if the cd succeeded, never in the wrong directory.
+cd ../../libarch_intgit/libarch && {
+    make clean
+    make TARGET=SOC_C6678 LIBOS=LIB_RTOS
+    cd -
+}
+make cleanDSPlibs MEM_MODEL=Small TARGET=SOC_C6678 LIBOS=LIB_RTOS
+make DSPlibs MEM_MODEL=Small TARGET=SOC_C6678 LIBOS=LIB_RTOS
+make installDSPlib DESTDIR=~/proclibs/linalg_rtos_c6678_small_install
+export LINALG_DIR=~/proclibs/linalg_rtos_c6678_small_install
+# Build the example only from inside its own directory, so a failed cd cannot "make clean" the library tree.
+cd examples/dsponly/dgemm_test && {
+    make clean
+    make MEM_MODEL=Small TARGET=SOC_C6678
+}
diff --git a/cblas/Makefile b/cblas/Makefile
index 967e24e888c2c418e0ea0e0695da4b7926cb8e0d..d0e292b2ce3360492feb7030b392f68269e3ec1a 100644 (file)
--- a/cblas/Makefile
+++ b/cblas/Makefile
allprecision:
( cd src && make all)
libinstall:
-ifeq ($(arch), C66)
- (cp $(TEMPCBLIB) $(patsubst %.a, %.ae66, $(TEMPCBLIB) ))
-endif
stest1: link
( cd testing && make stest1 )
diff --git a/cblas/Makefile.C66 b/cblas/Makefile.C66
index ce713d075bb50a2eadc241a00780349b17ca4ae3..cbb8cdb00171252ce2074adf82c0b39d4b7acc1f 100644 (file)
--- a/cblas/Makefile.C66
+++ b/cblas/Makefile.C66
# Libraries and includes
#-----------------------------------------------------------------------------
-#BLLIB = ../blis-dsp/lib/c66x/libblis.a
-BLLIB = ../blis/install/c66x/lib/libblis.a
-CBLIB = ../lib/$(PLAT)/libcblas_$(PLAT).a
+CBLIB = ../lib/$(PLAT)/libcblas.ae66
LN_S = ln -sf
# Flags for Compilers
#-----------------------------------------------------------------------------
-#CFLAGS = -O3 --c99 --use_g2 -I$(TI_OCL_CGT_INSTALL)/include -mv6600 -DADD_ -eo .o -fr obj/$(PLAT)
-CFLAGS = -O3 --c99 --use_g2 -I$(LINUX_DEVKIT_ROOT)/usr/share/ti/cgt-c6x/include -mv6600 -DADD_ -eo .o -fr obj/$(PLAT)
-#FFLAGS = -O3
+CFLAGS = -O3 --c99 --use_g2 -I$(CGTROOT)/include -mv6600 -DADD_ -eo .o -fr obj/$(PLAT)
#-----------------------------------------------------------------------------
# Archive programs and flags
index b107da123d6348de208e6bf46eb70454e184eedc..9a1e05072d9ebf9de9531362b513af2f10f74d7b 100755 (executable)
mkdir results_opt;
export TI_CBLAS_OFFLOAD=000;
-make run_tests;
+#make run_tests;
+./run_tests_only.sh
mv *.out results_arm;
mv *.SNAP results_arm;
export TI_CBLAS_OFFLOAD=111;
-make run_tests;
+#make run_tests;
+./run_tests_only.sh
mv *.out results_dsp;
mv *.SNAP results_dsp;
export TI_CBLAS_OFFLOAD=002;
-make run_tests;
+#make run_tests;
+./run_tests_only.sh
mv *.out results_opt;
mv *.SNAP results_opt;
diff --git a/clapack/BLAS/run_tests_only.sh b/clapack/BLAS/run_tests_only.sh
--- /dev/null
@@ -0,0 +1,35 @@
+#!/bin/bash
+#
+# Run the prebuilt BLAS level 1/2/3 test drivers (x*_ARM) from the current directory
+# and print a pass/fail summary for each level.
+
+# Honor the offload mode exported by the caller and fall back to 000 only when none is set;
+# an unconditional assignment here would make every wrapper-selected mode run as 000.
+export TI_CBLAS_OFFLOAD="${TI_CBLAS_OFFLOAD:-000}"
+echo "Now testing BLAS Level 1"
+echo -n "single precision real.. " ; ./xblat1s_ARM > sblat1.out 2> /dev/null ; rm -f /tmp/opencl* ; echo done
+echo -n "double precision real.. " ; ./xblat1d_ARM > dblat1.out 2> /dev/null ; rm -f /tmp/opencl* ; echo done
+echo -n "single precision complex.. " ; ./xblat1c_ARM > cblat1.out 2> /dev/null ; rm -f /tmp/opencl* ; echo done
+echo -n "double precision complex.. " ; ./xblat1z_ARM > zblat1.out 2> /dev/null ; rm -f /tmp/opencl* ; echo done
+echo
+(grep FAIL [sdcz]blat1.out && echo "ABOVE LEVEL 1 TESTS FAILED") || (echo "ALL LEVEL 1 TESTS PASSED. Summary follows:";
+grep -E 'PASS|FAIL|Test' [scdz]blat1.out; echo "ALL LEVEL 1 TESTS PASSED.")
+echo
+
+echo "Now testing BLAS Level 2"
+echo -n "single precision real.. " ; ./xblat2s_ARM < sblat2.in > /dev/null 2>&1 ; rm -f /tmp/opencl* ; echo done
+echo -n "double precision real.. " ; ./xblat2d_ARM < dblat2.in > /dev/null 2>&1 ; rm -f /tmp/opencl* ; echo done
+echo -n "single precision complex.. " ; ./xblat2c_ARM < cblat2.in > /dev/null 2>&1 ; rm -f /tmp/opencl*; echo done
+echo -n "double precision complex.. " ; ./xblat2z_ARM < zblat2.in > /dev/null 2>&1 ; rm -f /tmp/opencl*; echo done
+echo
+(grep FAILED [sdcz]blat2.out && echo "ABOVE LEVEL 2 TESTS FAILED") || (echo "ALL LEVEL 2 TESTS PASSED. Summary follows:"; grep PASSED [scdz]blat2.out; echo "ALL LEVEL 2 TESTS PASSED.")
+echo
+
+echo "Now testing BLAS Level 3"
+echo -n "single precision real.. " ; ./xblat3s_ARM < sblat3.in > /dev/null 2>&1 ; rm -f /tmp/opencl* ; echo done
+echo -n "double precision real.. " ; ./xblat3d_ARM < dblat3.in > /dev/null 2>&1 ; rm -f /tmp/opencl* ; echo done
+echo -n "single precision complex.. " ; ./xblat3c_ARM < cblat3.in > /dev/null 2>&1 ; rm -f /tmp/opencl*; echo done
+echo -n "double precision complex.. " ; ./xblat3z_ARM < zblat3.in > /dev/null 2>&1 ; rm -f /tmp/opencl*; echo done
+echo
+(grep FAILED [sdcz]blat3.out && echo "ABOVE LEVEL 3 TESTS FAILED") || (echo "ALL LEVEL 3 TESTS PASSED. Summary follows:"; grep PASSED [scdz]blat3.out; echo "ALL LEVEL 3 TESTS PASSED.")
+
index b4e2096863d22fd04a5519816819139f0244bb48..bd02240dd461e1ddf65956bc2b7d3b7f9f176dbf 100644 (file)
mv libf2c$(PLAT).a ..
clean:
- rm -f libf2c$(PLAT).a *.o arith.h signal1.h sysdep1.h
+# rm -f libf2c$(PLAT).a *.o arith.h signal1.h sysdep1.h
+# arith.h is no longer generated when cross-compiling (its rule is commented out), so clean must not delete it
+ rm -f libf2c$(PLAT).a *.o signal1.h sysdep1.h
backspac.o: fio.h
close.o: fio.h
xwsne.o: lio.h
xwsne.o: fmt.h
-arith.h: arithchk.c
- $(CC) $(CFLAGS) -DNO_FPINIT arithchk.c -lm ||\
- $(CC) -DNO_LONG_LONG $(CFLAGS) -DNO_FPINIT arithchk.c -lm
- ./a.out >arith.h
- rm -f a.out arithchk.o
+# Don't generate arith.h when cross-compiling. When compiling F2CLIB for a platform whose arithmetic
+# differs from what is described in arith.h, uncomment the 4 lines below, build a.out, and run it on
+# that platform to regenerate arith.h.
+#arith.h: arithchk.c
+# $(CC) $(CFLAGS) -DNO_FPINIT arithchk.c -lm || $(CC) -DNO_LONG_LONG $(CFLAGS) -DNO_FPINIT arithchk.c -lm
+# ./a.out >arith.h
+# rm -f a.out arithchk.o
check:
xsum Notice README abort_.c arithchk.c backspac.c c_abs.c c_cos.c \
diff --git a/clapack/F2CLIBS/libf2c/signal1.h b/clapack/F2CLIBS/libf2c/signal1.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* You may need to adjust the definition of signal1 to supply a */
-/* cast to the correct argument type. This detail is system- and */
-/* compiler-dependent. The #define below assumes signal.h declares */
-/* type SIG_PF for the signal function's second argument. */
-
-/* For some C++ compilers, "#define Sigarg_t ..." may be appropriate. */
-
-#include <signal.h>
-
-#ifndef Sigret_t
-#define Sigret_t void
-#endif
-#ifndef Sigarg_t
-#ifdef KR_headers
-#define Sigarg_t
-#else
-#define Sigarg_t int
-#endif
-#endif /*Sigarg_t*/
-
-#ifdef USE_SIG_PF /* compile with -DUSE_SIG_PF under IRIX */
-#define sig_pf SIG_PF
-#else
-typedef Sigret_t (*sig_pf)(Sigarg_t);
-#endif
-
-#define signal1(a,b) signal(a,(sig_pf)b)
-
-#ifdef __cplusplus
-#define Sigarg ...
-#define Use_Sigarg
-#else
-#define Sigarg Int n
-#define Use_Sigarg n = n /* shut up compiler warning */
-#endif
diff --git a/clapack/F2CLIBS/libf2c/sysdep1.h b/clapack/F2CLIBS/libf2c/sysdep1.h
+++ /dev/null
@@ -1,66 +0,0 @@
-#ifndef SYSDEP_H_INCLUDED
-#define SYSDEP_H_INCLUDED
-#undef USE_LARGEFILE
-#ifndef NO_LONG_LONG
-
-#ifdef __sun__
-#define USE_LARGEFILE
-#define OFF_T off64_t
-#endif
-
-#ifdef __linux__
-#define USE_LARGEFILE
-#define OFF_T __off64_t
-#endif
-
-#ifdef _AIX43
-#define _LARGE_FILES
-#define _LARGE_FILE_API
-#define USE_LARGEFILE
-#endif /*_AIX43*/
-
-#ifdef __hpux
-#define _FILE64
-#define _LARGEFILE64_SOURCE
-#define USE_LARGEFILE
-#endif /*__hpux*/
-
-#ifdef __sgi
-#define USE_LARGEFILE
-#endif /*__sgi*/
-
-#ifdef __FreeBSD__
-#define OFF_T off_t
-#define FSEEK fseeko
-#define FTELL ftello
-#endif
-
-#ifdef USE_LARGEFILE
-#ifndef OFF_T
-#define OFF_T off64_t
-#endif
-#define _LARGEFILE_SOURCE
-#define _LARGEFILE64_SOURCE
-#include <sys/types.h>
-#include <sys/stat.h>
-#define FOPEN fopen64
-#define FREOPEN freopen64
-#define FSEEK fseeko64
-#define FSTAT fstat64
-#define FTELL ftello64
-#define FTRUNCATE ftruncate64
-#define STAT stat64
-#define STAT_ST stat64
-#endif /*USE_LARGEFILE*/
-#endif /*NO_LONG_LONG*/
-
-#ifndef NON_UNIX_STDIO
-#ifndef USE_LARGEFILE
-#define _INCLUDE_POSIX_SOURCE /* for HP-UX */
-#define _INCLUDE_XOPEN_SOURCE /* for HP-UX */
-#include "sys/types.h"
-#include "sys/stat.h"
-#endif
-#endif
-
-#endif /*SYSDEP_H_INCLUDED*/
diff --git a/clapack/TESTING/run_clapack_tests.sh b/clapack/TESTING/run_clapack_tests.sh
--- /dev/null
@@ -0,0 +1,19 @@
+#!/bin/bash
+#
+# Run the CLAPACK test suite once per TI_CBLAS_OFFLOAD setting and move the *.out
+# files of each run into that setting's result directory.
+
+# -p keeps a rerun quiet when the result directories already exist.
+mkdir -p results_arm results_dsp results_opt
+
+# run_suite OFFLOAD RESULT_DIR
+#   Export TI_CBLAS_OFFLOAD=OFFLOAD, run the suite, then collect its *.out files.
+run_suite() {
+    export TI_CBLAS_OFFLOAD=$1
+    ./run_testsuite.sh
+    mv *.out "$2"
+}
+
+run_suite 000 results_arm
+run_suite 001 results_dsp
+run_suite 002 results_opt
diff --git a/clapack/TESTING/run_testsuite.sh b/clapack/TESTING/run_testsuite.sh
--- /dev/null
@@ -0,0 +1,104 @@
+#!/bin/bash
+#
+# Drive the CLAPACK test executables (x*_ARM) in the current directory, one after
+# another, feeding each its .in file and capturing its output in a .out file.
+
+# run_case DRIVER INPUT OUTPUT BANNER
+#   Print BANNER, then run ./DRIVER with stdin taken from INPUT and with stdout
+#   and stderr both written to OUTPUT.
+run_case() {
+    echo "$4"
+    "./$1" < "$2" > "$3" 2>&1
+}
+
+# REAL drivers
+run_case xlintsts_ARM stest.in stest.out "Testing REAL LAPACK linear equation routines"
+run_case xeigtsts_ARM nep.in snep.out "NEP: Testing Nonsymmetric Eigenvalue Problem routines"
+run_case xeigtsts_ARM sep.in ssep.out "SEP: Testing Symmetric Eigenvalue Problem routines"
+run_case xeigtsts_ARM svd.in ssvd.out "SVD: Testing Singular Value Decomposition routines"
+run_case xeigtsts_ARM sec.in sec.out "SEC: Testing REAL Eigen Condition Routines"
+run_case xeigtsts_ARM sed.in sed.out "SEV: Testing REAL Nonsymmetric Eigenvalue Driver"
+run_case xeigtsts_ARM sgg.in sgg.out "SGG: Testing REAL Nonsymmetric Generalized Eigenvalue Problem routines"
+run_case xeigtsts_ARM sgd.in sgd.out "SGD: Testing REAL Nonsymmetric Generalized Eigenvalue Problem driver routines"
+run_case xeigtsts_ARM ssb.in ssb.out "SSB: Testing REAL Symmetric Eigenvalue Problem routines"
+run_case xeigtsts_ARM ssg.in ssg.out "SSG: Testing REAL Symmetric Generalized Eigenvalue Problem routines"
+run_case xeigtsts_ARM sbal.in sbal.out "SGEBAL: Testing the balancing of a REAL general matrix"
+run_case xeigtsts_ARM sbak.in sbak.out "SGEBAK: Testing the back transformation of a REAL balanced matrix"
+run_case xeigtsts_ARM sgbal.in sgbal.out "SGGBAL: Testing the balancing of a pair of REAL general matrices"
+run_case xeigtsts_ARM sgbak.in sgbak.out "SGGBAK: Testing the back transformation of a pair of REAL balanced matrices"
+run_case xeigtsts_ARM sbb.in sbb.out "SBB: Testing banded Singular Value Decomposition routines"
+run_case xeigtsts_ARM glm.in sglm.out "GLM: Testing Generalized Linear Regression Model routines"
+run_case xeigtsts_ARM gqr.in sgqr.out "GQR: Testing Generalized QR and RQ factorization routines"
+run_case xeigtsts_ARM gsv.in sgsv.out "GSV: Testing Generalized Singular Value Decomposition routines"
+run_case xeigtsts_ARM lse.in slse.out "LSE: Testing Constrained Linear Least Squares routines"
+
+# COMPLEX drivers
+run_case xlintstc_ARM ctest.in ctest.out "Testing COMPLEX LAPACK linear equation routines"
+run_case xeigtstc_ARM nep.in cnep.out "NEP: Testing Nonsymmetric Eigenvalue Problem routines"
+run_case xeigtstc_ARM sep.in csep.out "SEP: Testing Symmetric Eigenvalue Problem routines"
+run_case xeigtstc_ARM svd.in csvd.out "SVD: Testing Singular Value Decomposition routines"
+run_case xeigtstc_ARM cec.in cec.out "CEC: Testing COMPLEX Eigen Condition Routines"
+run_case xeigtstc_ARM ced.in ced.out "CES: Testing COMPLEX Nonsymmetric Schur Form Driver"
+run_case xeigtstc_ARM cgg.in cgg.out "CGG: Testing COMPLEX Nonsymmetric Generalized Eigenvalue Problem routines"
+run_case xeigtstc_ARM cgd.in cgd.out "CGD: Testing COMPLEX Nonsymmetric Generalized Eigenvalue Problem driver routines"
+run_case xeigtstc_ARM csb.in csb.out "CHB: Testing Hermitian Eigenvalue Problem routines"
+run_case xeigtstc_ARM csg.in csg.out "CSG: Testing Symmetric Generalized Eigenvalue Problem routines"
+run_case xeigtstc_ARM cbal.in cbal.out "CGEBAL: Testing the balancing of a COMPLEX general matrix"
+run_case xeigtstc_ARM cbak.in cbak.out "CGEBAK: Testing the back transformation of a COMPLEX balanced matrix"
+run_case xeigtstc_ARM cgbal.in cgbal.out "CGGBAL: Testing the balancing of a pair of COMPLEX general matrices"
+run_case xeigtstc_ARM cgbak.in cgbak.out "CGGBAK: Testing the back transformation of a pair of COMPLEX balanced matrices"
+run_case xeigtstc_ARM cbb.in cbb.out "CBB: Testing banded Singular Value Decomposition routines"
+run_case xeigtstc_ARM glm.in cglm.out "GLM: Testing Generalized Linear Regression Model routines"
+run_case xeigtstc_ARM gqr.in cgqr.out "GQR: Testing Generalized QR and RQ factorization routines"
+run_case xeigtstc_ARM gsv.in cgsv.out "GSV: Testing Generalized Singular Value Decomposition routines"
+run_case xeigtstc_ARM lse.in clse.out "LSE: Testing Constrained Linear Least Squares routines"
+
+# DOUBLE PRECISION drivers
+run_case xlintstd_ARM dtest.in dtest.out "Testing DOUBLE PRECISION LAPACK linear equation routines"
+run_case xeigtstd_ARM nep.in dnep.out "NEP: Testing Nonsymmetric Eigenvalue Problem routines"
+run_case xeigtstd_ARM sep.in dsep.out "SEP: Testing Symmetric Eigenvalue Problem routines"
+run_case xeigtstd_ARM svd.in dsvd.out "SVD: Testing Singular Value Decomposition routines"
+run_case xeigtstd_ARM dec.in dec.out "DEC: Testing DOUBLE PRECISION Eigen Condition Routines"
+run_case xeigtstd_ARM ded.in ded.out "DEV: Testing DOUBLE PRECISION Nonsymmetric Eigenvalue Driver"
+run_case xeigtstd_ARM dgg.in dgg.out "DGG: Testing DOUBLE PRECISION Nonsymmetric Generalized Eigenvalue Problem routines"
+run_case xeigtstd_ARM dgd.in dgd.out "DGD: Testing DOUBLE PRECISION Nonsymmetric Generalized Eigenvalue Problem driver routines"
+run_case xeigtstd_ARM dsb.in dsb.out "DSB: Testing DOUBLE PRECISION Symmetric Eigenvalue Problem routines"
+run_case xeigtstd_ARM dsg.in dsg.out "DSG: Testing DOUBLE PRECISION Symmetric Generalized Eigenvalue Problem routines"
+run_case xeigtstd_ARM dbal.in dbal.out "DGEBAL: Testing the balancing of a DOUBLE PRECISION general matrix"
+run_case xeigtstd_ARM dbak.in dbak.out "DGEBAK: Testing the back transformation of a DOUBLE PRECISION balanced matrix"
+run_case xeigtstd_ARM dgbal.in dgbal.out "DGGBAL: Testing the balancing of a pair of DOUBLE PRECISION general matrices"
+run_case xeigtstd_ARM dgbak.in dgbak.out "DGGBAK: Testing the back transformation of a pair of DOUBLE PRECISION balanced matrices"
+run_case xeigtstd_ARM dbb.in dbb.out "DBB: Testing banded Singular Value Decomposition routines"
+run_case xeigtstd_ARM glm.in dglm.out "GLM: Testing Generalized Linear Regression Model routines"
+run_case xeigtstd_ARM gqr.in dgqr.out "GQR: Testing Generalized QR and RQ factorization routines"
+run_case xeigtstd_ARM gsv.in dgsv.out "GSV: Testing Generalized Singular Value Decomposition routines"
+run_case xeigtstd_ARM lse.in dlse.out "LSE: Testing Constrained Linear Least Squares routines"
+
+# COMPLEX16 drivers
+run_case xlintstz_ARM ztest.in ztest.out "Testing COMPLEX16 LAPACK linear equation routines"
+run_case xeigtstz_ARM nep.in znep.out "NEP: Testing Nonsymmetric Eigenvalue Problem routines"
+run_case xeigtstz_ARM sep.in zsep.out "SEP: Testing Symmetric Eigenvalue Problem routines"
+run_case xeigtstz_ARM svd.in zsvd.out "SVD: Testing Singular Value Decomposition routines"
+run_case xeigtstz_ARM zec.in zec.out "ZEC: Testing COMPLEX16 Eigen Condition Routines"
+run_case xeigtstz_ARM zed.in zed.out "ZES: Testing COMPLEX16 Nonsymmetric Schur Form Driver"
+run_case xeigtstz_ARM zgg.in zgg.out "ZGG: Testing COMPLEX16 Nonsymmetric Generalized Eigenvalue Problem routines"
+run_case xeigtstz_ARM zgd.in zgd.out "ZGD: Testing COMPLEX16 Nonsymmetric Generalized Eigenvalue Problem driver routines"
+run_case xeigtstz_ARM zsb.in zsb.out "ZHB: Testing Hermitian Eigenvalue Problem routines"
+run_case xeigtstz_ARM zsg.in zsg.out "ZSG: Testing Symmetric Generalized Eigenvalue Problem routines"
+run_case xeigtstz_ARM zbal.in zbal.out "ZGEBAL: Testing the balancing of a COMPLEX16 general matrix"
+run_case xeigtstz_ARM zbak.in zbak.out "ZGEBAK: Testing the back transformation of a COMPLEX16 balanced matrix"
+run_case xeigtstz_ARM zgbal.in zgbal.out "ZGGBAL: Testing the balancing of a pair of COMPLEX general matrices"
+run_case xeigtstz_ARM zgbak.in zgbak.out "ZGGBAK: Testing the back transformation of a pair of COMPLEX16 balanced matrices"
+run_case xeigtstz_ARM zbb.in zbb.out "ZBB: Testing banded Singular Value Decomposition routines"
+run_case xeigtstz_ARM glm.in zglm.out "GLM: Testing Generalized Linear Regression Model routines"
+run_case xeigtstz_ARM gqr.in zgqr.out "GQR: Testing Generalized QR and RQ factorization routines"
+run_case xeigtstz_ARM gsv.in zgsv.out "GSV: Testing Generalized Singular Value Decomposition routines"
+run_case xeigtstz_ARM lse.in zlse.out "LSE: Testing Constrained Linear Least Squares routines"
+
+# Prototype drivers (RFP, SINGLE-DOUBLE, COMPLEX-COMPLEX16)
+run_case xlintstrfs_ARM stest_rfp.in stest_rfp.out "Testing REAL LAPACK RFP protoype linear equation routines"
+run_case xlintstds_ARM dstest.in dstest.out "Testing SINGLE-DOUBLE PRECISION LAPACK prototype linear equation routines"
+run_case xlintstrfd_ARM dtest_rfp.in dtest_rfp.out "Testing DOUBLE PRECISION LAPACK RFP protoype linear equation routines"
+run_case xlintstrfc_ARM ctest_rfp.in ctest_rfp.out "Testing COMPLEX LAPACK RFP protoype linear equation routines"
+run_case xlintstzc_ARM zctest.in zctest.out "Testing COMPLEX-COMPLEX16 LAPACK protoype linear equation routines"
+run_case xlintstrfz_ARM ztest_rfp.in ztest_rfp.out "Testing COMPLEX16 LAPACK RFP protoype linear equation routines"
similarity index 74%
rename from docs/LINALG_1.0.0_manifest.html
rename to docs/LINALG_1.2.0_manifest.html
index bcb97324247e71ea7a5ff5851c4d7d35a9d0d55a..b09a6882a22e976c9c196b18ced974981609218f 100644 (file)
rename from docs/LINALG_1.0.0_manifest.html
rename to docs/LINALG_1.2.0_manifest.html
index bcb97324247e71ea7a5ff5851c4d7d35a9d0d55a..b09a6882a22e976c9c196b18ced974981609218f 100644 (file)
-<!--\r\r
-Texas Instruments Manifest Format 2.0\r\r
--->\r\r
-\r\r
-<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">\r\r
-<html>\r\r
-\r\r
-<head>\r\r
-<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1" />\r\r
-<!-- @Start Style -->\r\r
-<!-- Default style in case someone doesnt have Internet Access -->\r\r
-<style type="text/css" id="internalStyle">\r\r
- body, div, p {\r\r
- font-family: Lucida Grande, Verdana, Geneva, Arial, sans-serif;\r\r
- font-size: 13px;\r\r
- line-height: 1.3;\r\r
- }\r\r
- body {\r\r
- margin: 20px; \r\r
- }\r\r
- h1 {\r\r
- font-size: 150%;\r\r
- }\r\r
- h2 {\r\r
- font-size: 120%;\r\r
- }\r\r
- h3 {\r\r
- font-size: 100%;\r\r
- }\r\r
- img {\r\r
- border: 0px;\r\r
- vertical-align: middle;\r\r
- }\r\r
- table, th, td, tr {\r\r
- border: 1px solid black; \r\r
- font-family: Lucida Grande, Verdana, Geneva, Arial, sans-serif;\r\r
- font-size: 13px;\r\r
- line-height: 1.3;\r\r
- empty-cells: show; \r\r
- padding: 5px;\r\r
- }\r\r
- table {\r\r
- border-collapse: collapse; \r\r
- width: 100%;\r\r
- }\r\r
- tr {\r\r
- page-break-inside: avoid;\r\r
- }\r\r
- #TIlogoLeft {\r\r
- background-color: black; \r\r
- padding: 0;\r\r
- width: 20%;\r\r
- }\r\r
- #TIlogoRight {\r\r
- background-color: red; \r\r
- padding: 0;\r\r
- }\r\r
- #ProductName {\r\r
- text-align: center;\r\r
- }\r\r
- #ReleaseDate {\r\r
- text-align: center;\r\r
- }\r\r
- .LogoSection {\r\r
- margin: 0;\r\r
- padding: 0;\r\r
- }\r\r
- .HeaderSection {\r\r
- margin: 25px 0 25px 0;\r\r
- padding: 0;\r\r
- }\r\r
- .LegendSection {\r\r
- margin: 25px 0 25px 0;\r\r
- }\r\r
- .ExportSection {\r\r
- margin: 25px 0 25px 0;\r\r
- }\r\r
- .DisclaimerSection {\r\r
- margin: 25px 0 25px 0; \r\r
- }\r\r
- .CreditSection {\r\r
- margin: 25px 0 25px 0; \r\r
- }\r\r
- .LicenseSection {\r\r
- margin: 25px 0 25px 0; \r\r
- }\r\r
- .ManifestTable {\r\r
- margin: 25px 0 25px 0; \r\r
- }\r\r
-</style> \r\r
-<!-- Override style from TI if they have Internet Access -->\r\r
-<link type="text/css" rel="stylesheet" href="timanifeststyle.css">\r\r
-<!-- @End Style -->\r\r
-<title>Texas Instruments Manifest</title>\r\r
-</head>\r\r
-\r\r
-<body><!-- Logo display, will need to fix up the URLs, this is just for testing.. Image alternate display not wporking well yet -->\r\r
-<div class="LogoSection">\r\r
-<table>\r\r
- <tbody>\r\r
- <tr>\r\r
- <td id="TIlogoLeft">\r\r
- <a href="http://www.ti.com/">\r\r
- <!-- img src="tilogo.gif" alt="Texas Instruments Incorporated" -->\r\r
- <img alt="" src="data:image/gif;base64,R0lGODlh3gA2AKIAAAAAAP///7u7u29vbz8/PwYGBujo6BgYGCH5BAAAAAAALAAAAADeADYAAAP/CLrc/jDKSau9OOvNu/9gKI5kaZ5oqq5s675wLM90bd94ru987//AoHBILBqPyKRyyWw6n9CodHorDALYLIHKJVqz2q44eAUHtoDB4DBu48rgLQErcNtnX7NhMDcICIB3gix5ZmtqAAZZew8EAo+QkQIDNVZqiIM1cHGKZ4YPAmaiAWw0c1gFmZqjB3SbZ6kNe6WhsAeOlDV0qjSFAXUAp7lwuREFtVsFgMvLB7fNAM+BCs+lDLd8BNYOuxfV22PL0RiWlwO1u3kDqejAEsjR6GB86FsHoYwA6gxWnVgGEegUuIelWJk6jswAGlXQ36J1xBSoQwfulIEDr/6l+VeK/+AehrAGOHRnAWRBbbWegckXAV6wk4AeRQtDQBEaBYsYlMl2hUCsBt0iKgilT9EfAlfO7SmzdKkrkQUT/fqZSECqLCSlntH375IAA1tqGUilLIBSNVnU+NmJNBRVChlF1QwAdlRWBy5P3QymwCLBYhs73cTHYBq3X33nDQ2wcWuBgef0FRD4GK3jU3VCZZUJAIw1OGg0P+4bFiubOWoOsEP1+KvZn3wurDbZ6lfcuw3yYkFjRSeYzRe7ARAbW0K3PmGIMi0OFDG1Mmha+RnufAHn3xL9ha6uTZ/rXagZ1GKAtTsHeWb+FEQvHILuX4+mLzj2j2r4TrFesTwMbE5Cuv8JzbTSGuRV1xgfUJFC3WbA0JWFalcItpgf8YU2yT/qATaedent5cBb8zk0DzIitgfKbonRFV9Wp2xl3UXq5Ccibp05598BnRigiAIJmrZAexkJQIuBwzX4CB3SQbeYQkPVAUco63DI2HzsAdYAiAvEZdYlaVQ5wXs3+bQAjovEUoBRR9LVAFLaPXCcY/KMqVRasQB5kiJgLcYgTkJiuCWKC2ZpIY/z/LRhYefkBAGW1HTyRy2UjObLHxSAOZ948EUVGCSC3SLZbB7iZKOLc2GRRgMH/VhdHnJwFCgD8iEGx0VKvpqbO+hoaCppEg3UiTES1CTkhNaQ+Qs4LQGql07/lET4mIQ6SvTSVGZ9Bmhz/bkYzK+PFKtpje6wumRm1wrLZzSdQASoZvyswdmSuk7p616HfkjBTxZBQucFgqXCFKdn1NpiUlQJhs8kteBWG0AbATbXS2tBlaeoVkmJRova4KkGPmhMFdiSYmq8cbTRYhrlkiHaNufJ9mIgVqEXnAOJM5JE4sgjudQ8bF82x+cKBP4Iiedecyjgx2/WtMNjjhcL9h+S4xq9RYJgsbeeUbmdrPTSQbPccsyijEXOfI8xyuinVJH1wdkS/MQ2Bc5Iq08DyHYwGglvPyCilbz0fa8GLV7r9+Btb7CJ14Qnzg8HpdKoOOF5Py752JNXvrblNphzEHnmnF/a+ecTbA465qKPXnnppkuOeuqKr8465K+z7nrsfc9Ouyq23z5I7rrfwXvvbhSQAAA7" />\r\r
- </a>\r\r
- </td>\r\r
- <td id="TILogoRight">\r\r
- <!-- img src="titagline.gif" alt="Technology for Innovators(tm)"-->\r\r
- <img alt="" src="data:image/gif;base64,R0lGODlhOgEaALMAAP8AAP////92dv+3t/+Njf/W1v/t7f8hIf/19f+jo//Hx/8/P/9cXP/j4//6+v/+/iH5BAAAAAAALAAAAAA6ARoAAAT/EMhJq7046827/2AojmRpnmiqrmzrvnAsz3Rt33iu73zv/8CgcEgsGo/IpHLJbDqft0NDMCBQodis1jcADBKE7nYcCpjPgU5AQBKkVYOHAeRudqtXsh60/vRHdSoBBCGBNAkLe4o4f2psgG8pjR6GM5OLmDB/DA0GBoQADAgICRIBBQUOYgwGCg2kEgudBgUHAIGcBg0MsZ0NCnMGYgsBtqEGAbCynrW3AQONgcIFBgiErK6wAAfUtLbCscWiowoAyLDczLZu0AIJCAYOoJn0G38ObAwPEvLEts/O1vUhsA8AAjGonEmA9W6hGAVpEjiQoKBAhT8HJSRkVyEQQAAJ//a5YeMPQIFyACqCnJjSIgFCB4oB+HOSokWOAB6wIWCxnk8MfYh5QsYg5sVHfQLVMSqhztJIxWIaC6QzJy8KfZgqrNT0zR+nUNl8fSMvZ6IDwJCJRfoI7IR4Cub9nDsha6RwR02xUZpGq1utUWUq9FKgYV6/abgOHjt45tquEgY0SDDHoJg+fxhXolKNrmfH/EoR5EdAKmjQfB1qvPmGIQIJ3g4gC2egVF7LqxtP8Ng2cViTKFUCIGbNFKEEmB/VbDlYdqLRn+du8oTg6jjbmfe+CbTM2+BcuySgbQVtQoOCt7s3U8wbsqGs3ZppZLnylwFe8Uql825ogANPckUnYDoOCogxQGXADajggjcw4AA8DSSyTQASMmjhhTQscBWGHHbo4YcghijiiCSWaOKJKKao4oostugiFBEAADs=" />\r\r
- </td>\r\r
- </tr>\r\r
- </tbody>\r\r
-</table>\r\r
-</div><div class="HeaderSection">\r\r
-<h1 id="ProductName">\r\r
-<!-- @Start Product -->\r\r
-LINALG Manifest\r\r
-<!-- @End Product -->\r\r
-</h1>\r\r
-\r\r
-<h2 id="ReleaseDate">\r\r
-<!-- @Start Date -->\r\r
-04-23-2015\r\r
-<!-- @End Date -->\r\r
-</h2>\r\r
-</div><div class="LegendSection">\r\r
-<h2>Legend</h2>\r\r
-<p>(explanation of the fields in the Manifest Table below)</p>\r\r
-<table>\r\r
-<tbody>\r\r
-<tr>\r\r
-<td>\r\r
-<b>Software Name </b>\r\r
-</td>\r\r
-<td>\r\r
-The name of the application or file\r\r
-</td>\r\r
-</tr>\r\r
-<tr>\r\r
-<td>\r\r
-<b>Version</b>\r\r
-</td>\r\r
-<td>\r\r
-Version of the application or file\r\r
-</td>\r\r
-</tr>\r\r
-<tr>\r\r
-<td>\r\r
-<b>License Type</b>\r\r
-</td>\r\r
-<td>\r\r
-Type of license(s) under which TI will be providing\r\r
-software to the licensee (e.g. BSD-3-Clause, GPL-2.0, TI TSPA License, TI\r\r
-Commercial License). The license could be under Commercial terms or Open Source. See Open Source Reference License Disclaimer in\r\r
-the Disclaimers Section. Whenever possible, TI will use an <a href="http://spdx.org/licenses/"> SPDX Short Identifier </a> for an Open Source\r\r
-License. TI Commercial license terms are not usually included in the manifest and are conveyed through a variety \r\r
-of means such as a clickwrap license upon install, \r\r
-a signed license agreement and so forth.\r\r
-</td>\r\r
-</tr>\r\r
-<tr>\r\r
-<td>\r\r
-<b>Location</b>\r\r
-</td>\r\r
-<td>\r\r
-The directory name and path on the media or a specific file where the Software is located. Typically fully qualified path names \r\r
-are not used and instead the relevant top level directory of the application is given. \r\r
-A notation often used in the manifests is [as installed]/directory/*. Note that the asterisk implies that all\r\r
-files under that directory are licensed as the License Type field denotes. Any exceptions to this will \r\r
-generally be denoted as [as installed]/directory/* except as noted below which means as shown in subsequent rows of \r\r
-the manifest.\r\r
-</td>\r\r
-</tr>\r\r
-<tr>\r\r
-<td>\r\r
-<b>Delivered As</b>\r\r
-</td>\r\r
-<td>\r\r
-This field will either be “Source”, “Binary” or “Source\r\r
-and Binary” and is the primary form the content of the Software is delivered\r\r
-in. If the Software is delivered in an archive format, this field\r\r
-applies to the contents of the archive. If the word Limited is used\r\r
-with Source, as in “Limited Source” or “Limited Source and Binary” then\r\r
-only portions of the Source for the application are provided.\r\r
-</td>\r\r
-</tr>\r\r
-<tr>\r\r
-<td>\r\r
-<b>Modified by TI</b>\r\r
-</td>\r\r
-<td>\r\r
-This field will either be “Yes” or “No”. A “Yes” means\r\r
-TI has made changes to the Software. A “No” means TI has not made any\r\r
-changes. Note: This field is not applicable for Software “Obtained\r\r
-from” TI.\r\r
-</td>\r\r
-</tr>\r\r
-<tr>\r\r
-<td>\r\r
-<b>Obtained from</b>\r\r
-</td>\r\r
-<td>\r\r
-This field specifies from where or from whom TI obtained\r\r
-the Software. It may be a URL to an Open Source site, a 3<sup>rd</sup>\r\r
-party licensor, or TI. See Links Disclaimer in the Disclaimers\r\r
-Section.\r\r
-</td>\r\r
-</tr>\r\r
-</tbody>\r\r
-</table>\r\r
-</div><div class="DisclaimerSection">\r\r
-<h2>Disclaimers</h2>\r\r
-<h3>Export Control Classification Number (ECCN)</h3>\r\r
-<p>Any use of ECCNs listed in the Manifest is at the user’s risk\r\r
-and without recourse to TI. Your\r\r
-company, as the exporter of record, is responsible for determining the\r\r
-correct classification of any item at\r\r
-the time of export. Any export classification by TI of Software is for\r\r
-TI’s internal use only and shall not be construed as a representation\r\r
-or warranty\r\r
-regarding the proper export classification for such Software or whether\r\r
-an export\r\r
-license or other documentation is required for exporting such Software</p>\r\r
-<h3>Links in the Manifest</h3>\r\r
-<p>Any\r\r
-links appearing on this Manifest\r\r
-(for example in the “Obtained from” field) were verified at the time\r\r
-the Manifest was created. TI makes no guarantee that any listed links\r\r
-will\r\r
-remain active in the future.</p>\r\r
-<h3>Open Source License References</h3>\r\r
-<p>Your company is responsible for confirming the\r\r
-applicable license terms for any open source Software\r\r
-listed in this Manifest that was not “Obtained from” TI. Any open\r\r
-source license\r\r
-specified in this Manifest for Software that was\r\r
-not “Obtained from” TI is for TI’s internal use only and shall not be\r\r
-construed as a representation or warranty regarding the proper open\r\r
-source license terms\r\r
-for such Software.</p>\r\r
-</div><div class="ExportSection">\r\r
-<h2>Export Information</h2>\r\r
-<p>ECCN for Software included in this release:</p>\r\r
-Publicly Available - Open Source or TI TSPA License\r\r
-</div><div class="ManifestTable">\r\r
-<!-- h2>Manifest Table</h2 -->\r\r
- \r
- <table> \r
- <tbody> \r
- \r
- <h2> \r
- LINALG LINALG 1.0.0 Manifest \r
- </h2> \r
- \r
- \r
- <p> \r
- \r
- See the Legend above for a description of these columns. \r
- \r
- </p> \r
- \r
- <table id="targetpackages" name="targetpackages"> \r
- <thead> \r
- <tr> \r
- <td><b>Software Name</b></td> \r
- <td><b>Version</b></td> \r
- <td><b>License Type</b></td> \r
- <td><b>Delivered As</b></td> \r
- <td><b>Modified by TI</b></td> \r
- <td></td> \r
- <td></td> \r
- </tr> \r
- </thead> \r
- \r
- \r
- <tbody> \r
- <tr> \r
- <td id="name" name="name" rowspan="2"> \r
- LINALG \r
- </td> \r
- <td id="version" name="version" rowspan="2"> \r
- 1.0.0 \r
- </td> \r
- <td id="license" name="license" rowspan="2"> \r
- BSD-3-CLAUSE \r
- </td> \r
- <td id="delivered" name="delivered" rowspan="2"> \r
- Source and binary \r
- </td> \r
- <td id="modified" name="modified" rowspan="2"> \r
- N/A \r
- </td> \r
- <td><b>Location</b></td> \r
- <td id="location" name="location"> \r
- [installation directory]/linalg \r
- </td> \r
- </tr> \r
- <tr> \r
- <td><b>Obtained from</b></td> \r
- <td id="obtained" name="obtained"> \r
- TI \r
- </td> \r
- </tr> \r
- \r
- <tbody> \r
- <tr> \r
- <td id="name" name="name" rowspan="2"> \r
- BLIS \r
- </td> \r
- <td id="version" name="version" rowspan="2"> \r
- 0.1.6 \r
- </td> \r
- <td id="license" name="license" rowspan="2"> \r
- BSD-3-CLAUSE \r
- </td> \r
- <td id="delivered" name="delivered" rowspan="2"> \r
- Source and binary \r
- </td> \r
- <td id="modified" name="modified" rowspan="2"> \r
- Yes \r
- </td> \r
- <td><b>Location</b></td> \r
- <td id="location" name="location"> \r
- [installation directory]/linalg/blis \r
- </td> \r
- </tr> \r
- <tr> \r
- <td><b>Obtained from</b></td> \r
- <td id="obtained" name="obtained"> \r
- https://github.com/flame/blis.git \r
- </td> \r
- </tr> \r
- \r
- <tbody> \r
- <tr> \r
- <td id="name" name="name" rowspan="2"> \r
- CBLAS \r
- </td> \r
- <td id="version" name="version" rowspan="2"> \r
- N/A \r
- </td> \r
- <td id="license" name="license" rowspan="2"> \r
- CBLAS \r
- </td> \r
- <td id="delivered" name="delivered" rowspan="2"> \r
- Source and binary \r
- </td> \r
- <td id="modified" name="modified" rowspan="2"> \r
- No \r
- </td> \r
- <td><b>Location</b></td> \r
- <td id="location" name="location"> \r
- [installation directory]/linalg/cblas \r
- </td> \r
- </tr> \r
- <tr> \r
- <td><b>Obtained from</b></td> \r
- <td id="obtained" name="obtained"> \r
- http://www.netlib.org/blas/#_cblas \r
- </td> \r
- </tr> \r
- \r
- <tbody> \r
- <tr> \r
- <td id="name" name="name" rowspan="2"> \r
- CLAPACK \r
- </td> \r
- <td id="version" name="version" rowspan="2"> \r
- 3.2.1 \r
- </td> \r
- <td id="license" name="license" rowspan="2"> \r
- BSD-3-CLAUSE \r
- </td> \r
- <td id="delivered" name="delivered" rowspan="2"> \r
- Source and binary \r
- </td> \r
- <td id="modified" name="modified" rowspan="2"> \r
- Yes \r
- </td> \r
- <td><b>Location</b></td> \r
- <td id="location" name="location"> \r
- [installation directory]/linalg/clapack \r
- </td> \r
- </tr> \r
- <tr> \r
- <td><b>Obtained from</b></td> \r
- <td id="obtained" name="obtained"> \r
- http://www.netlib.org/clapack/ \r
- </td> \r
- </tr> \r
- \r
- </tbody> \r
- </table> \r
- \r
- </p> \r
- </p> \r
- <p> \r
-\r\r
-</div><div class="CreditSection">\r\r
-<h2>Credits</h2>\r\r
-<BR> <BR><BR><BR><BR>\r\r
-</div><div class="LicenseSection">\r\r
-<h2>Licenses</h2>\r\r
-<BR><h3><b> LINALG Licenses </b></h3><BR> <BR><BR><BR>Texas Instruments License<BR><BR><BR>BERKELEY SOFTWARE DISTRIBUTION LICENSE <BR><BR>Copyright (C) 2014 Texas Instruments Incorporated - http://www.ti.com/<BR><BR>Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:<BR><BR>Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.<BR><BR>Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.<BR><BR>Neither the name of Texas Instruments Incorporated nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.<BR><BR>THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.<BR><BR>BLIS License<BR>The BLIS framework is licensed under the following license, typically<BR>known as the "new" or "modified" or "3-clause" BSD license.<BR><BR><BR>Copyright (C) 2014, The University of Texas at Austin<BR><BR>Redistribution and use in source and binary forms, with or without<BR>modification, are permitted provided that the following conditions are<BR>met:<BR> - Redistributions of source code must retain the above copyright<BR> notice, this list of conditions and the following disclaimer.<BR> - Redistributions in binary form must reproduce the above copyright<BR> notice, this list of conditions and the following disclaimer in the<BR> documentation and/or other materials provided with the distribution.<BR> - Neither the name of The University of Texas at Austin nor the names<BR> of its contributors may be used to endorse or promote products<BR> derived from this software without specific prior written permission.<BR><BR>THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS<BR>"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT<BR>LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR<BR>A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT<BR>HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,<BR>SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT<BR>LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,<BR>DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY<BR>THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT<BR>(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE<BR>OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.<BR><BR>CBLAS License<BR>The reference BLAS is a freely-available software package. It is available from netlib via anonymous ftp and the World Wide Web. Thus, it can be included in commercial software packages (and has been). We only ask that proper credit be given to the authors.<BR>Like all software, it is copyrighted. It is not trademarked, but we do ask the following:<BR>- If you modify the source for these routines we ask that you change the name of the routine and comment the changes made to the original. <BR>- We will gladly answer any questions regarding the software. If a modification is done, however, it is the responsibility of the person who modified the routine to provide support. <BR><BR>CLAPACK License <BR><BR>Copyright (c) 1992-2013 The University of Tennessee and The University<BR> of Tennessee Research Foundation. All rights<BR> reserved.<BR>Copyright (c) 2000-2013 The University of California Berkeley. All<BR> rights reserved.<BR>Copyright (c) 2006-2013 The University of Colorado Denver. 
All rights<BR> reserved.<BR><BR>$COPYRIGHT$<BR><BR>Additional copyrights may follow<BR><BR>$HEADER$<BR><BR>Redistribution and use in source and binary forms, with or without<BR>modification, are permitted provided that the following conditions are<BR>met:<BR><BR>- Redistributions of source code must retain the above copyright<BR> notice, this list of conditions and the following disclaimer.<BR><BR>- Redistributions in binary form must reproduce the above copyright<BR> notice, this list of conditions and the following disclaimer listed<BR> in this license in the documentation and/or other materials<BR> provided with the distribution.<BR><BR>- Neither the name of the copyright holders nor the names of its<BR> contributors may be used to endorse or promote products derived from<BR> this software without specific prior written permission.<BR><BR>The copyright holders provide no reassurances that the source code<BR>provided does not infringe any patent, copyright, or any other<BR>intellectual property rights of third parties. The copyright holders<BR>disclaim any liability to any recipient for claims brought against<BR>recipient by any third party for infringement of that parties<BR>intellectual property rights.<BR><BR>THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS<BR>"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT<BR>LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR<BR>A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT<BR>OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,<BR>SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT<BR>LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,<BR>DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY<BR>THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT<BR>(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE<BR>OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.<BR><BR><BR><BR>\r\r
-</div>\r\r
-\r\r
+<!--\r
+Texas Instruments Manifest Format 2.0\r
+-->\r
+\r
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">\r
+<html>\r
+\r
+<head>\r
+<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1" />\r
+<!-- @Start Style -->\r
+<!-- Default style in case someone doesn't have Internet Access -->\r
+<style type="text/css" id="internalStyle">\r
+ body, div, p {\r
+ font-family: Lucida Grande, Verdana, Geneva, Arial, sans-serif;\r
+ font-size: 13px;\r
+ line-height: 1.3;\r
+ }\r
+ body {\r
+ margin: 20px; \r
+ }\r
+ h1 {\r
+ font-size: 150%;\r
+ }\r
+ h2 {\r
+ font-size: 120%;\r
+ }\r
+ h3 {\r
+ font-size: 100%;\r
+ }\r
+ img {\r
+ border: 0px;\r
+ vertical-align: middle;\r
+ }\r
+ table, th, td, tr {\r
+ border: 1px solid black; \r
+ font-family: Lucida Grande, Verdana, Geneva, Arial, sans-serif;\r
+ font-size: 13px;\r
+ line-height: 1.3;\r
+ empty-cells: show; \r
+ padding: 5px;\r
+ }\r
+ table {\r
+ border-collapse: collapse; \r
+ width: 100%;\r
+ }\r
+ tr {\r
+ page-break-inside: avoid;\r
+ }\r
+ #TIlogoLeft {\r
+ background-color: black; \r
+ padding: 0;\r
+ width: 20%;\r
+ }\r
+ #TIlogoRight {\r
+ background-color: red; \r
+ padding: 0;\r
+ }\r
+ #ProductName {\r
+ text-align: center;\r
+ }\r
+ #ReleaseDate {\r
+ text-align: center;\r
+ }\r
+ .LogoSection {\r
+ margin: 0;\r
+ padding: 0;\r
+ }\r
+ .HeaderSection {\r
+ margin: 25px 0 25px 0;\r
+ padding: 0;\r
+ }\r
+ .LegendSection {\r
+ margin: 25px 0 25px 0;\r
+ }\r
+ .ExportSection {\r
+ margin: 25px 0 25px 0;\r
+ }\r
+ .DisclaimerSection {\r
+ margin: 25px 0 25px 0; \r
+ }\r
+ .CreditSection {\r
+ margin: 25px 0 25px 0; \r
+ }\r
+ .LicenseSection {\r
+ margin: 25px 0 25px 0; \r
+ }\r
+ .ManifestTable {\r
+ margin: 25px 0 25px 0; \r
+ }\r
+</style> \r
+<!-- Override style from TI if they have Internet Access -->\r
+<link type="text/css" rel="stylesheet" href="timanifeststyle.css">\r
+<!-- @End Style -->\r
+<title>Texas Instruments Manifest</title>\r
+</head>\r
+\r
+<body><!-- Logo display, will need to fix up the URLs, this is just for testing. Image alternate display not working well yet -->\r
+<div class="LogoSection">\r
+<table>\r
+ <tbody>\r
+ <tr>\r
+ <td id="TIlogoLeft">\r
+ <a href="http://www.ti.com/">\r
+ <!-- img src="tilogo.gif" alt="Texas Instruments Incorporated" -->\r
+ <img alt="" src="data:image/gif;base64,R0lGODlh3gA2AKIAAAAAAP///7u7u29vbz8/PwYGBujo6BgYGCH5BAAAAAAALAAAAADeADYAAAP/CLrc/jDKSau9OOvNu/9gKI5kaZ5oqq5s675wLM90bd94ru987//AoHBILBqPyKRyyWw6n9CodHorDALYLIHKJVqz2q44eAUHtoDB4DBu48rgLQErcNtnX7NhMDcICIB3gix5ZmtqAAZZew8EAo+QkQIDNVZqiIM1cHGKZ4YPAmaiAWw0c1gFmZqjB3SbZ6kNe6WhsAeOlDV0qjSFAXUAp7lwuREFtVsFgMvLB7fNAM+BCs+lDLd8BNYOuxfV22PL0RiWlwO1u3kDqejAEsjR6GB86FsHoYwA6gxWnVgGEegUuIelWJk6jswAGlXQ36J1xBSoQwfulIEDr/6l+VeK/+AehrAGOHRnAWRBbbWegckXAV6wk4AeRQtDQBEaBYsYlMl2hUCsBt0iKgilT9EfAlfO7SmzdKkrkQUT/fqZSECqLCSlntH375IAA1tqGUilLIBSNVnU+NmJNBRVChlF1QwAdlRWBy5P3QymwCLBYhs73cTHYBq3X33nDQ2wcWuBgef0FRD4GK3jU3VCZZUJAIw1OGg0P+4bFiubOWoOsEP1+KvZn3wurDbZ6lfcuw3yYkFjRSeYzRe7ARAbW0K3PmGIMi0OFDG1Mmha+RnufAHn3xL9ha6uTZ/rXagZ1GKAtTsHeWb+FEQvHILuX4+mLzj2j2r4TrFesTwMbE5Cuv8JzbTSGuRV1xgfUJFC3WbA0JWFalcItpgf8YU2yT/qATaedent5cBb8zk0DzIitgfKbonRFV9Wp2xl3UXq5Ccibp05598BnRigiAIJmrZAexkJQIuBwzX4CB3SQbeYQkPVAUco63DI2HzsAdYAiAvEZdYlaVQ5wXs3+bQAjovEUoBRR9LVAFLaPXCcY/KMqVRasQB5kiJgLcYgTkJiuCWKC2ZpIY/z/LRhYefkBAGW1HTyRy2UjObLHxSAOZ948EUVGCSC3SLZbB7iZKOLc2GRRgMH/VhdHnJwFCgD8iEGx0VKvpqbO+hoaCppEg3UiTES1CTkhNaQ+Qs4LQGql07/lET4mIQ6SvTSVGZ9Bmhz/bkYzK+PFKtpje6wumRm1wrLZzSdQASoZvyswdmSuk7p616HfkjBTxZBQucFgqXCFKdn1NpiUlQJhs8kteBWG0AbATbXS2tBlaeoVkmJRova4KkGPmhMFdiSYmq8cbTRYhrlkiHaNufJ9mIgVqEXnAOJM5JE4sgjudQ8bF82x+cKBP4Iiedecyjgx2/WtMNjjhcL9h+S4xq9RYJgsbeeUbmdrPTSQbPccsyijEXOfI8xyuinVJH1wdkS/MQ2Bc5Iq08DyHYwGglvPyCilbz0fa8GLV7r9+Btb7CJ14Qnzg8HpdKoOOF5Py752JNXvrblNphzEHnmnF/a+ecTbA465qKPXnnppkuOeuqKr8465K+z7nrsfc9Ouyq23z5I7rrfwXvvbhSQAAA7" />\r
+ </a>\r
+ </td>\r
+ <td id="TILogoRight">\r
+ <!-- img src="titagline.gif" alt="Technology for Innovators(tm)"-->\r
+ <img alt="" src="data:image/gif;base64,R0lGODlhOgEaALMAAP8AAP////92dv+3t/+Njf/W1v/t7f8hIf/19f+jo//Hx/8/P/9cXP/j4//6+v/+/iH5BAAAAAAALAAAAAA6ARoAAAT/EMhJq7046827/2AojmRpnmiqrmzrvnAsz3Rt33iu73zv/8CgcEgsGo/IpHLJbDqft0NDMCBQodis1jcADBKE7nYcCpjPgU5AQBKkVYOHAeRudqtXsh60/vRHdSoBBCGBNAkLe4o4f2psgG8pjR6GM5OLmDB/DA0GBoQADAgICRIBBQUOYgwGCg2kEgudBgUHAIGcBg0MsZ0NCnMGYgsBtqEGAbCynrW3AQONgcIFBgiErK6wAAfUtLbCscWiowoAyLDczLZu0AIJCAYOoJn0G38ObAwPEvLEts/O1vUhsA8AAjGonEmA9W6hGAVpEjiQoKBAhT8HJSRkVyEQQAAJ//a5YeMPQIFyACqCnJjSIgFCB4oB+HOSokWOAB6wIWCxnk8MfYh5QsYg5sVHfQLVMSqhztJIxWIaC6QzJy8KfZgqrNT0zR+nUNl8fSMvZ6IDwJCJRfoI7IR4Cub9nDsha6RwR02xUZpGq1utUWUq9FKgYV6/abgOHjt45tquEgY0SDDHoJg+fxhXolKNrmfH/EoR5EdAKmjQfB1qvPmGIQIJ3g4gC2egVF7LqxtP8Ng2cViTKFUCIGbNFKEEmB/VbDlYdqLRn+du8oTg6jjbmfe+CbTM2+BcuySgbQVtQoOCt7s3U8wbsqGs3ZppZLnylwFe8Uql825ogANPckUnYDoOCogxQGXADajggjcw4AA8DSSyTQASMmjhhTQscBWGHHbo4YcghijiiCSWaOKJKKao4oostugiFBEAADs=" />\r
+ </td>\r
+ </tr>\r
+ </tbody>\r
+</table>\r
+</div><div class="HeaderSection">\r
+<h1 id="ProductName">\r
+<!-- @Start Product -->\r
+LINALG Manifest\r
+<!-- @End Product -->\r
+</h1>\r
+\r
+<h2 id="ReleaseDate">\r
+<!-- @Start Date -->\r
+10-06-2015\r
+<!-- @End Date -->\r
+</h2>\r
+</div><div class="LegendSection">\r
+<h2>Legend</h2>\r
+<p>(explanation of the fields in the Manifest Table below)</p>\r
+<table>\r
+<tbody>\r
+<tr>\r
+<td>\r
+<b>Software Name </b>\r
+</td>\r
+<td>\r
+The name of the application or file\r
+</td>\r
+</tr>\r
+<tr>\r
+<td>\r
+<b>Version</b>\r
+</td>\r
+<td>\r
+Version of the application or file\r
+</td>\r
+</tr>\r
+<tr>\r
+<td>\r
+<b>License Type</b>\r
+</td>\r
+<td>\r
+Type of license(s) under which TI will be providing\r
+software to the licensee (e.g. BSD-3-Clause, GPL-2.0, TI TSPA License, TI\r
+Commercial License). The license could be under Commercial terms or Open Source. See Open Source Reference License Disclaimer in\r
+the Disclaimers Section. Whenever possible, TI will use an <a href="http://spdx.org/licenses/"> SPDX Short Identifier </a> for an Open Source\r
+License. TI Commercial license terms are not usually included in the manifest and are conveyed through a variety \r
+of means such as a clickwrap license upon install, \r
+a signed license agreement and so forth.\r
+</td>\r
+</tr>\r
+<tr>\r
+<td>\r
+<b>Location</b>\r
+</td>\r
+<td>\r
+The directory name and path on the media or a specific file where the Software is located. Typically fully qualified path names \r
+are not used and instead the relevant top level directory of the application is given. \r
+A notation often used in the manifests is [as installed]/directory/*. Note that the asterisk implies that all\r
+files under that directory are licensed as the License Type field denotes. Any exceptions to this will \r
+generally be denoted as [as installed]/directory/* except as noted below which means as shown in subsequent rows of \r
+the manifest.\r
+</td>\r
+</tr>\r
+<tr>\r
+<td>\r
+<b>Delivered As</b>\r
+</td>\r
+<td>\r
+This field will either be “Source”, “Binary” or “Source\r
+and Binary” and is the primary form the content of the Software is delivered\r
+in. If the Software is delivered in an archive format, this field\r
+applies to the contents of the archive. If the word Limited is used\r
+with Source, as in “Limited Source” or “Limited Source and Binary” then\r
+only portions of the Source for the application are provided.\r
+</td>\r
+</tr>\r
+<tr>\r
+<td>\r
+<b>Modified by TI</b>\r
+</td>\r
+<td>\r
+This field will either be “Yes” or “No”. A “Yes” means\r
+TI has made changes to the Software. A “No” means TI has not made any\r
+changes. Note: This field is not applicable for Software “Obtained\r
+from” TI.\r
+</td>\r
+</tr>\r
+<tr>\r
+<td>\r
+<b>Obtained from</b>\r
+</td>\r
+<td>\r
+This field specifies from where or from whom TI obtained\r
+the Software. It may be a URL to an Open Source site, a 3<sup>rd</sup>\r
+party licensor, or TI. See Links Disclaimer in the Disclaimers\r
+Section.\r
+</td>\r
+</tr>\r
+</tbody>\r
+</table>\r
+</div><div class="DisclaimerSection">\r
+<h2>Disclaimers</h2>\r
+<h3>Export Control Classification Number (ECCN)</h3>\r
+<p>Any use of ECCNs listed in the Manifest is at the user’s risk\r
+and without recourse to TI. Your\r
+company, as the exporter of record, is responsible for determining the\r
+correct classification of any item at\r
+the time of export. Any export classification by TI of Software is for\r
+TI’s internal use only and shall not be construed as a representation\r
+or warranty\r
+regarding the proper export classification for such Software or whether\r
+an export\r
+license or other documentation is required for exporting such Software.</p>\r
+<h3>Links in the Manifest</h3>\r
+<p>Any\r
+links appearing on this Manifest\r
+(for example in the “Obtained from” field) were verified at the time\r
+the Manifest was created. TI makes no guarantee that any listed links\r
+will\r
+remain active in the future.</p>\r
+<h3>Open Source License References</h3>\r
+<p>Your company is responsible for confirming the\r
+applicable license terms for any open source Software\r
+listed in this Manifest that was not “Obtained from” TI. Any open\r
+source license\r
+specified in this Manifest for Software that was\r
+not “Obtained from” TI is for TI’s internal use only and shall not be\r
+construed as a representation or warranty regarding the proper open\r
+source license terms\r
+for such Software.</p>\r
+</div><div class="ExportSection">\r
+<h2>Export Information</h2>\r
+<p>ECCN for Software included in this release:</p>\r
+Publicly Available - Open Source or TI TSPA License\r
+</div><div class="ManifestTable">\r
+<!-- h2>Manifest Table</h2 -->\r
+
+ <table>
+ <tbody>
+
+ <h2>
+ LINALG LINALG 1.2.0 Manifest
+ </h2>
+
+
+ <p>
+
+ See the Legend above for a description of these columns.
+
+ </p>
+
+ <table id="targetpackages" name="targetpackages">
+ <thead>
+ <tr>
+ <td><b>Software Name</b></td>
+ <td><b>Version</b></td>
+ <td><b>License Type</b></td>
+ <td><b>Delivered As</b></td>
+ <td><b>Modified by TI</b></td>
+ <td></td>
+ <td></td>
+ </tr>
+ </thead>
+
+
+ <tbody>
+ <tr>
+ <td id="name" name="name" rowspan="2">
+ LINALG
+ </td>
+ <td id="version" name="version" rowspan="2">
+ 1.2.0
+ </td>
+ <td id="license" name="license" rowspan="2">
+ BSD-3-CLAUSE
+ </td>
+ <td id="delivered" name="delivered" rowspan="2">
+ Source and binary
+ </td>
+ <td id="modified" name="modified" rowspan="2">
+ N/A
+ </td>
+ <td><b>Location</b></td>
+ <td id="location" name="location">
+ [installation directory]/linalg
+ </td>
+ </tr>
+ <tr>
+ <td><b>Obtained from</b></td>
+ <td id="obtained" name="obtained">
+ TI
+ </td>
+ </tr>
+
+ <tbody>
+ <tr>
+ <td id="name" name="name" rowspan="2">
+ BLIS
+ </td>
+ <td id="version" name="version" rowspan="2">
+ 0.1.6
+ </td>
+ <td id="license" name="license" rowspan="2">
+ BSD-3-CLAUSE
+ </td>
+ <td id="delivered" name="delivered" rowspan="2">
+ Source and binary
+ </td>
+ <td id="modified" name="modified" rowspan="2">
+ Yes
+ </td>
+ <td><b>Location</b></td>
+ <td id="location" name="location">
+ [installation directory]/linalg/blis
+ </td>
+ </tr>
+ <tr>
+ <td><b>Obtained from</b></td>
+ <td id="obtained" name="obtained">
+ https://github.com/flame/blis.git
+ </td>
+ </tr>
+
+ <tbody>
+ <tr>
+ <td id="name" name="name" rowspan="2">
+ CBLAS
+ </td>
+ <td id="version" name="version" rowspan="2">
+ N/A
+ </td>
+ <td id="license" name="license" rowspan="2">
+ CBLAS
+ </td>
+ <td id="delivered" name="delivered" rowspan="2">
+ Source and binary
+ </td>
+ <td id="modified" name="modified" rowspan="2">
+ No
+ </td>
+ <td><b>Location</b></td>
+ <td id="location" name="location">
+ [installation directory]/linalg/cblas
+ </td>
+ </tr>
+ <tr>
+ <td><b>Obtained from</b></td>
+ <td id="obtained" name="obtained">
+ http://www.netlib.org/blas/#_cblas
+ </td>
+ </tr>
+
+ <tbody>
+ <tr>
+ <td id="name" name="name" rowspan="2">
+ CLAPACK
+ </td>
+ <td id="version" name="version" rowspan="2">
+ 3.2.1
+ </td>
+ <td id="license" name="license" rowspan="2">
+ BSD-3-CLAUSE
+ </td>
+ <td id="delivered" name="delivered" rowspan="2">
+ Source and binary
+ </td>
+ <td id="modified" name="modified" rowspan="2">
+ Yes
+ </td>
+ <td><b>Location</b></td>
+ <td id="location" name="location">
+ [installation directory]/linalg/clapack
+ </td>
+ </tr>
+ <tr>
+ <td><b>Obtained from</b></td>
+ <td id="obtained" name="obtained">
+ http://www.netlib.org/clapack/
+ </td>
+ </tr>
+
+ </tbody>
+ </table>
+
+ </p>
+ </p>
+ <p>
+\r
+</div><div class="CreditSection">\r
+<h2>Credits</h2>\r
+<BR> <BR><BR><BR><BR>\r
+</div><div class="LicenseSection">\r
+<h2>Licenses</h2>\r
+<BR><h3><b> LINALG Licenses </b></h3><BR> <BR><BR><BR>Texas Instruments License<BR><BR><BR>BERKELEY SOFTWARE DISTRIBUTION LICENSE <BR><BR>Copyright (C) 2014 Texas Instruments Incorporated - http://www.ti.com/<BR><BR>Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:<BR><BR>Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.<BR><BR>Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.<BR><BR>Neither the name of Texas Instruments Incorporated nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.<BR><BR>THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.<BR><BR>BLIS License<BR>The BLIS framework is licensed under the following license, typically<BR>known as the "new" or "modified" or "3-clause" BSD license.<BR><BR><BR>Copyright (C) 2014, The University of Texas at Austin<BR><BR>Redistribution and use in source and binary forms, with or without<BR>modification, are permitted provided that the following conditions are<BR>met:<BR> - Redistributions of source code must retain the above copyright<BR> notice, this list of conditions and the following disclaimer.<BR> - Redistributions in binary form must reproduce the above copyright<BR> notice, this list of conditions and the following disclaimer in the<BR> documentation and/or other materials provided with the distribution.<BR> - Neither the name of The University of Texas at Austin nor the names<BR> of its contributors may be used to endorse or promote products<BR> derived from this software without specific prior written permission.<BR><BR>THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS<BR>"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT<BR>LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR<BR>A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT<BR>HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,<BR>SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT<BR>LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,<BR>DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY<BR>THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT<BR>(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE<BR>OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.<BR><BR>CBLAS License<BR>The reference BLAS is a freely-available software package. It is available from netlib via anonymous ftp and the World Wide Web. Thus, it can be included in commercial software packages (and has been). We only ask that proper credit be given to the authors.<BR>Like all software, it is copyrighted. It is not trademarked, but we do ask the following:<BR>- If you modify the source for these routines we ask that you change the name of the routine and comment the changes made to the original. <BR>- We will gladly answer any questions regarding the software. If a modification is done, however, it is the responsibility of the person who modified the routine to provide support. <BR><BR>CLAPACK License <BR><BR>Copyright (c) 1992-2013 The University of Tennessee and The University<BR> of Tennessee Research Foundation. All rights<BR> reserved.<BR>Copyright (c) 2000-2013 The University of California Berkeley. All<BR> rights reserved.<BR>Copyright (c) 2006-2013 The University of Colorado Denver. 
All rights<BR> reserved.<BR><BR>$COPYRIGHT$<BR><BR>Additional copyrights may follow<BR><BR>$HEADER$<BR><BR>Redistribution and use in source and binary forms, with or without<BR>modification, are permitted provided that the following conditions are<BR>met:<BR><BR>- Redistributions of source code must retain the above copyright<BR> notice, this list of conditions and the following disclaimer.<BR><BR>- Redistributions in binary form must reproduce the above copyright<BR> notice, this list of conditions and the following disclaimer listed<BR> in this license in the documentation and/or other materials<BR> provided with the distribution.<BR><BR>- Neither the name of the copyright holders nor the names of its<BR> contributors may be used to endorse or promote products derived from<BR> this software without specific prior written permission.<BR><BR>The copyright holders provide no reassurances that the source code<BR>provided does not infringe any patent, copyright, or any other<BR>intellectual property rights of third parties. The copyright holders<BR>disclaim any liability to any recipient for claims brought against<BR>recipient by any third party for infringement of that parties<BR>intellectual property rights.<BR><BR>THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS<BR>"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT<BR>LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR<BR>A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT<BR>OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,<BR>SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT<BR>LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,<BR>DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY<BR>THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT<BR>(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE<BR>OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.<BR><BR><BR><BR>\r
+</div>\r
+\r
</body></html>
\ No newline at end of file
diff --git a/docs/doxygen/doxycfg.txt b/docs/doxygen/doxycfg.txt
--- /dev/null
+++ b/docs/doxygen/doxycfg.txt
@@ -0,0 +1,1781 @@
+# Doxyfile 1.7.6.1
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project.
+#
+# All text after a hash (#) is considered a comment and will be ignored.
+# The format is:
+# TAG = value [value, ...]
+# For lists, items can also be appended using:
+# TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (" ").
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the config file
+# that follow. The default is UTF-8 which is also the encoding used for all
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the
+# iconv built into libc) for the transcoding. See
+# http://www.gnu.org/software/libiconv for the list of possible encodings.
+
+DOXYFILE_ENCODING = UTF-8
+
+# The PROJECT_NAME tag is a single word (or sequence of words) that should
+# identify the project. Note that if you do not use Doxywizard you need
+# to put quotes around the project name if it contains spaces.
+
+PROJECT_NAME = "LINALG "
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number.
+# This could be handy for archiving the generated documentation or
+# if some version control system is used.
+
+PROJECT_NUMBER =
+
+# Using the PROJECT_BRIEF tag one can provide an optional one line description
+# for a project that appears at the top of each page and should give viewer
+# a quick idea about the purpose of the project. Keep the description short.
+
+PROJECT_BRIEF = "Linear Algebra Library"
+
+# With the PROJECT_LOGO tag one can specify a logo or icon that is
+# included in the documentation. The maximum height of the logo should not
+# exceed 55 pixels and the maximum width should not exceed 200 pixels.
+# Doxygen will copy the logo to the output directory.
+
+PROJECT_LOGO =
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
+# base path where the generated documentation will be put.
+# If a relative path is entered, it will be relative to the location
+# where doxygen was started. If left blank the current directory will be used.
+
+OUTPUT_DIRECTORY = ./docs/doxygen
+
+# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create
+# 4096 sub-directories (in 2 levels) under the output directory of each output
+# format and will distribute the generated files over these directories.
+# Enabling this option can be useful when feeding doxygen a huge amount of
+# source files, where putting all generated files in the same directory would
+# otherwise cause performance problems for the file system.
+
+CREATE_SUBDIRS = YES
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# The default language is English, other supported languages are:
+# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional,
+# Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German,
+# Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English
+# messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian,
+# Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrillic, Slovak,
+# Slovene, Spanish, Swedish, Ukrainian, and Vietnamese.
+
+OUTPUT_LANGUAGE = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will
+# include brief member descriptions after the members that are listed in
+# the file and class documentation (similar to JavaDoc).
+# Set to NO to disable this.
+
+BRIEF_MEMBER_DESC = YES
+
+# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend
+# the brief description of a member or function before the detailed description.
+# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# brief descriptions will be completely suppressed.
+
+REPEAT_BRIEF = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator
+# that is used to form the text in various listings. Each string
+# in this list, if found as the leading text of the brief description, will be
+# stripped from the text and the result after processing the whole list, is
+# used as the annotated text. Otherwise, the brief description is used as-is.
+# If left blank, the following values are used ("$name" is automatically
+# replaced with the name of the entity): "The $name class" "The $name widget"
+# "The $name file" "is" "provides" "specifies" "contains"
+# "represents" "a" "an" "the"
+
+ABBREVIATE_BRIEF =
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
+# Doxygen will generate a detailed section even if there is only a brief
+# description.
+
+ALWAYS_DETAILED_SEC = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
+# inherited members of a class in the documentation of that class as if those
+# members were ordinary class members. Constructors, destructors and assignment
+# operators of the base classes will not be shown.
+
+INLINE_INHERITED_MEMB = NO
+
+# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full
+# path before file names in the file list and in the header files. If set
+# to NO the shortest path that makes the file name unique will be used.
+
+FULL_PATH_NAMES = YES
+
+# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag
+# can be used to strip a user-defined part of the path. Stripping is
+# only done if one of the specified strings matches the left-hand part of
+# the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the
+# path to strip.
+
+STRIP_FROM_PATH =
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of
+# the path mentioned in the documentation of a class, which tells
+# the reader which header file to include in order to use a class.
+# If left blank only the name of the header file containing the class
+# definition is used. Otherwise one should specify the include paths that
+# are normally passed to the compiler using the -I flag.
+
+STRIP_FROM_INC_PATH =
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter
+# (but less readable) file names. This can be useful if your file system
+# doesn't support long names like on DOS, Mac, or CD-ROM.
+
+SHORT_NAMES = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen
+# will interpret the first line (until the first dot) of a JavaDoc-style
+# comment as the brief description. If set to NO, the JavaDoc
+# comments will behave just like regular Qt-style comments
+# (thus requiring an explicit @brief command for a brief description.)
+
+JAVADOC_AUTOBRIEF = NO
+
+# If the QT_AUTOBRIEF tag is set to YES then Doxygen will
+# interpret the first line (until the first dot) of a Qt-style
+# comment as the brief description. If set to NO, the comments
+# will behave just like regular Qt-style comments (thus requiring
+# an explicit \brief command for a brief description.)
+
+QT_AUTOBRIEF = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen
+# treat a multi-line C++ special comment block (i.e. a block of //! or ///
+# comments) as a brief description. This used to be the default behaviour.
+# The new default is to treat a multi-line C++ comment block as a detailed
+# description. Set this tag to YES if you prefer the old behaviour instead.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented
+# member inherits the documentation from any documented member that it
+# re-implements.
+
+INHERIT_DOCS = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce
+# a new page for each member. If set to NO, the documentation of a member will
+# be part of the file/class/namespace that contains it.
+
+SEPARATE_MEMBER_PAGES = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab.
+# Doxygen uses this value to replace tabs by spaces in code fragments.
+
+TAB_SIZE = 8
+
+# This tag can be used to specify a number of aliases that acts
+# as commands in the documentation. An alias has the form "name=value".
+# For example adding "sideeffect=\par Side Effects:\n" will allow you to
+# put the command \sideeffect (or @sideeffect) in the documentation, which
+# will result in a user-defined paragraph with heading "Side Effects:".
+# You can put \n's in the value part of an alias to insert newlines.
+
+ALIASES =
+
+# This tag can be used to specify a number of word-keyword mappings (TCL only).
+# A mapping has the form "name=value". For example adding
+# "class=itcl::class" will allow you to use the command class in the
+# itcl::class meaning.
+
+TCL_SUBST =
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C
+# sources only. Doxygen will then generate output that is more tailored for C.
+# For instance, some of the names that are used will be different. The list
+# of all members will be omitted, etc.
+
+OPTIMIZE_OUTPUT_FOR_C = YES
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java
+# sources only. Doxygen will then generate output that is more tailored for
+# Java. For instance, namespaces will be presented as packages, qualified
+# scopes will look different, etc.
+
+OPTIMIZE_OUTPUT_JAVA = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
+# sources only. Doxygen will then generate output that is more tailored for
+# Fortran.
+
+OPTIMIZE_FOR_FORTRAN = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
+# sources. Doxygen will then generate output that is tailored for
+# VHDL.
+
+OPTIMIZE_OUTPUT_VHDL = NO
+
+# Doxygen selects the parser to use depending on the extension of the files it
+# parses. With this tag you can assign which parser to use for a given extension.
+# Doxygen has a built-in mapping, but you can override or extend it using this
+# tag. The format is ext=language, where ext is a file extension, and language
+# is one of the parsers supported by doxygen: IDL, Java, Javascript, CSharp, C,
+# C++, D, PHP, Objective-C, Python, Fortran, VHDL. For instance to make
+# doxygen treat .inc files as Fortran files (default is PHP), and .f files as C
+# (default is Fortran), use: inc=Fortran f=C. Note that for custom extensions
+# you also need to set FILE_PATTERNS otherwise the files are not read by doxygen.
+
+EXTENSION_MAPPING =
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should
+# set this tag to YES in order to let doxygen match function declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string); vs.
+# func(std::string) {}). This also makes the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+
+BUILTIN_STL_SUPPORT = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+
+CPP_CLI_SUPPORT = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only.
+# Doxygen will parse them like normal C++ but will assume all classes use public
+# instead of private inheritance when no explicit protection keyword is present.
+
+SIP_SUPPORT = NO
+
+# For Microsoft's IDL there are propget and propput attributes to indicate getter
+# and setter methods for a property. Setting this option to YES (the default)
+# will make doxygen replace the get and set methods by a property in the
+# documentation. This will only work if the methods are indeed getting or
+# setting a simple type. If this is not the case, or you want to show the
+# methods anyway, you should set this option to NO.
+
+IDL_PROPERTY_SUPPORT = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES, then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+
+DISTRIBUTE_GROUP_DOC = NO
+
+# Set the SUBGROUPING tag to YES (the default) to allow class member groups of
+# the same type (for instance a group of public functions) to be put as a
+# subgroup of that type (e.g. under the Public Functions section). Set it to
+# NO to prevent subgrouping. Alternatively, this can be done per class using
+# the \nosubgrouping command.
+
+SUBGROUPING = YES
+
+# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and
+# unions are shown inside the group in which they are included (e.g. using
+# @ingroup) instead of on a separate page (for HTML and Man pages) or
+# section (for LaTeX and RTF).
+
+INLINE_GROUPED_CLASSES = NO
+
+# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and
+# unions with only public data fields will be shown inline in the documentation
+# of the scope in which they are defined (i.e. file, namespace, or group
+# documentation), provided this scope is documented. If set to NO (the default),
+# structs, classes, and unions are shown on a separate page (for HTML and Man
+# pages) or section (for LaTeX and RTF).
+
+INLINE_SIMPLE_STRUCTS = NO
+
+# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum
+# is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named TypeS. This can typically
+# be useful for C code in case the coding convention dictates that all compound
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+
+TYPEDEF_HIDES_STRUCT = NO
+
+# The SYMBOL_CACHE_SIZE determines the size of the internal cache used to
+# determine which symbols to keep in memory and which to flush to disk.
+# When the cache is full, less often used symbols will be written to disk.
+# For small to medium size projects (<1000 input files) the default value is
+# probably good enough. For larger projects a too small cache size can cause
+# doxygen to be busy swapping symbols to and from disk most of the time
+# causing a significant performance penalty.
+# If the system has enough physical memory increasing the cache will improve the
+# performance by keeping more symbols in memory. Note that the value works on
+# a logarithmic scale so increasing the size by one will roughly double the
+# memory usage. The cache size is given by this formula:
+# 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0,
+# corresponding to a cache size of 2^16 = 65536 symbols.
+
+SYMBOL_CACHE_SIZE = 0
+
+# Similar to the SYMBOL_CACHE_SIZE the size of the symbol lookup cache can be
+# set using LOOKUP_CACHE_SIZE. This cache is used to resolve symbols given
+# their name and scope. Since this can be an expensive process and often the
+# same symbol appears multiple times in the code, doxygen keeps a cache of
+# pre-resolved symbols. If the cache is too small doxygen will become slower.
+# If the cache is too large, memory is wasted. The cache size is given by this
+# formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range is 0..9, the default is 0,
+# corresponding to a cache size of 2^16 = 65536 symbols.
+
+LOOKUP_CACHE_SIZE = 0
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in
+# documentation are documented, even if no documentation was available.
+# Private class members and static file members will be hidden unless
+# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES
+
+EXTRACT_ALL = YES
+
+# If the EXTRACT_PRIVATE tag is set to YES all private members of a class
+# will be included in the documentation.
+
+EXTRACT_PRIVATE = NO
+
+# If the EXTRACT_STATIC tag is set to YES all static members of a file
+# will be included in the documentation.
+
+EXTRACT_STATIC = YES
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs)
+# defined locally in source files will be included in the documentation.
+# If set to NO only classes defined in header files are included.
+
+EXTRACT_LOCAL_CLASSES = YES
+
+# This flag is only useful for Objective-C code. When set to YES local
+# methods, which are defined in the implementation section but not in
+# the interface are included in the documentation.
+# If set to NO (the default) only methods in the interface are included.
+
+EXTRACT_LOCAL_METHODS = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be
+# extracted and appear in the documentation as a namespace called
+# 'anonymous_namespace{file}', where file will be replaced with the base
+# name of the file that contains the anonymous namespace. By default
+# anonymous namespaces are hidden.
+
+EXTRACT_ANON_NSPACES = NO
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all
+# undocumented members of documented classes, files or namespaces.
+# If set to NO (the default) these members will be included in the
+# various overviews, but no documentation section is generated.
+# This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_MEMBERS = NO
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy.
+# If set to NO (the default) these classes will be included in the various
+# overviews. This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_CLASSES = NO
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all
+# friend (class|struct|union) declarations.
+# If set to NO (the default) these declarations will be included in the
+# documentation.
+
+HIDE_FRIEND_COMPOUNDS = NO
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any
+# documentation blocks found inside the body of a function.
+# If set to NO (the default) these blocks will be appended to the
+# function's detailed documentation block.
+
+HIDE_IN_BODY_DOCS = NO
+
+# The INTERNAL_DOCS tag determines if documentation
+# that is typed after a \internal command is included. If the tag is set
+# to NO (the default) then the documentation will be excluded.
+# Set it to YES to include the internal documentation.
+
+INTERNAL_DOCS = NO
+
+# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate
+# file names in lower-case letters. If set to YES upper-case letters are also
+# allowed. This is useful if you have classes or files whose names only differ
+# in case and if your file system supports case sensitive file names. Windows
+# and Mac users are advised to set this option to NO.
+
+CASE_SENSE_NAMES = YES
+
+# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen
+# will show members with their full class and namespace scopes in the
+# documentation. If set to YES the scope will be hidden.
+
+HIDE_SCOPE_NAMES = NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen
+# will put a list of the files that are included by a file in the documentation
+# of that file.
+
+SHOW_INCLUDE_FILES = YES
+
+# If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen
+# will list include files with double quotes in the documentation
+# rather than with sharp brackets.
+
+FORCE_LOCAL_INCLUDES = NO
+
+# If the INLINE_INFO tag is set to YES (the default) then a tag [inline]
+# is inserted in the documentation for inline members.
+
+INLINE_INFO = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen
+# will sort the (detailed) documentation of file and class members
+# alphabetically by member name. If set to NO the members will appear in
+# declaration order.
+
+SORT_MEMBER_DOCS = YES
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the
+# brief documentation of file, namespace and class members alphabetically
+# by member name. If set to NO (the default) the members will appear in
+# declaration order.
+
+SORT_BRIEF_DOCS = NO
+
+# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen
+# will sort the (brief and detailed) documentation of class members so that
+# constructors and destructors are listed first. If set to NO (the default)
+# the constructors will appear in the respective orders defined by
+# SORT_MEMBER_DOCS and SORT_BRIEF_DOCS.
+# This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO
+# and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO.
+
+SORT_MEMBERS_CTORS_1ST = NO
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the
+# hierarchy of group names into alphabetical order. If set to NO (the default)
+# the group names will appear in their defined order.
+
+SORT_GROUP_NAMES = NO
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be
+# sorted by fully-qualified names, including namespaces. If set to
+# NO (the default), the class list will be sorted only by class name,
+# not including the namespace part.
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the
+# alphabetical list.
+
+SORT_BY_SCOPE_NAME = NO
+
+# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to
+# do proper type resolution of all parameters of a function it will reject a
+# match between the prototype and the implementation of a member function even
+# if there is only one candidate or it is obvious which candidate to choose
+# by doing a simple string match. By disabling STRICT_PROTO_MATCHING doxygen
+# will still accept a match between prototype and implementation in such cases.
+
+STRICT_PROTO_MATCHING = NO
+
+# The GENERATE_TODOLIST tag can be used to enable (YES) or
+# disable (NO) the todo list. This list is created by putting \todo
+# commands in the documentation.
+
+GENERATE_TODOLIST = YES
+
+# The GENERATE_TESTLIST tag can be used to enable (YES) or
+# disable (NO) the test list. This list is created by putting \test
+# commands in the documentation.
+
+GENERATE_TESTLIST = YES
+
+# The GENERATE_BUGLIST tag can be used to enable (YES) or
+# disable (NO) the bug list. This list is created by putting \bug
+# commands in the documentation.
+
+GENERATE_BUGLIST = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or
+# disable (NO) the deprecated list. This list is created by putting
+# \deprecated commands in the documentation.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional
+# documentation sections, marked by \if sectionname ... \endif.
+
+ENABLED_SECTIONS =
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines
+# the initial value of a variable or macro consists of for it to appear in
+# the documentation. If the initializer consists of more lines than specified
+# here it will be hidden. Use a value of 0 to hide initializers completely.
+# The appearance of the initializer of individual variables and macros in the
+# documentation can be controlled using \showinitializer or \hideinitializer
+# command in the documentation regardless of this setting.
+
+MAX_INITIALIZER_LINES = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated
+# at the bottom of the documentation of classes and structs. If set to YES the
+# list will mention the files that were used to generate the documentation.
+
+SHOW_USED_FILES = YES
+
+# If the sources in your project are distributed over multiple directories
+# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy
+# in the documentation. The default is NO.
+
+SHOW_DIRECTORIES = NO
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page.
+# This will remove the Files entry from the Quick Index and from the
+# Folder Tree View (if specified). The default is YES.
+
+SHOW_FILES = YES
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the
+# Namespaces page.
+# This will remove the Namespaces entry from the Quick Index
+# and from the Folder Tree View (if specified). The default is YES.
+
+SHOW_NAMESPACES = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command <command> <input-file>, where <command> is the value of
+# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file
+# provided by doxygen. Whatever the program writes to standard output
+# is used as the file version. See the manual for examples.
+
+FILE_VERSION_FILTER =
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
+# by doxygen. The layout file controls the global structure of the generated
+# output files in an output format independent way. To create the layout file
+# that represents doxygen's defaults, run doxygen with the -l option.
+# You can optionally specify a file name after the option, if omitted
+# DoxygenLayout.xml will be used as the name of the layout file.
+
+LAYOUT_FILE =
+
+# The CITE_BIB_FILES tag can be used to specify one or more bib files
+# containing the references data. This must be a list of .bib files. The
+# .bib extension is automatically appended if omitted. Using this command
+# requires the bibtex tool to be installed. See also
+# http://en.wikipedia.org/wiki/BibTeX for more info. For LaTeX the style
+# of the bibliography can be controlled using LATEX_BIB_STYLE. To use this
+# feature you need bibtex and perl available in the search path.
+
+CITE_BIB_FILES =
+
+#---------------------------------------------------------------------------
+# configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated
+# by doxygen. Possible values are YES and NO. If left blank NO is used.
+
+QUIET = NO
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated by doxygen. Possible values are YES and NO. If left blank
+# NO is used.
+
+WARNINGS = YES
+
+# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings
+# for undocumented members. If EXTRACT_ALL is set to YES then this flag will
+# automatically be disabled.
+
+WARN_IF_UNDOCUMENTED = YES
+
+# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as not documenting some
+# parameters in a documented function, or documenting parameters that
+# don't exist or using markup commands wrongly.
+
+WARN_IF_DOC_ERROR = YES
+
+# The WARN_NO_PARAMDOC option can be enabled to get warnings for
+# functions that are documented, but have no documentation for their parameters
+# or return value. If set to NO (the default) doxygen will only warn about
+# wrong or incomplete parameter documentation, but not about the absence of
+# documentation.
+
+WARN_NO_PARAMDOC = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that
+# doxygen can produce. The string should contain the $file, $line, and $text
+# tags, which will be replaced by the file and line number from which the
+# warning originated and the warning text. Optionally the format may contain
+# $version, which will be replaced by the version of the file (if it could
+# be obtained via FILE_VERSION_FILTER)
+
+WARN_FORMAT = "$file:$line: $text"
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning
+# and error messages should be written. If left blank the output is written
+# to stderr.
+
+WARN_LOGFILE =
+
+#---------------------------------------------------------------------------
+# configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag can be used to specify the files and/or directories that contain
+# documented source files. You may enter file names like "myfile.cpp" or
+# directories like "/usr/src/myproject". Separate the files or directories
+# with spaces.
+
+INPUT = ./ticblas ./docs/doxygen
+
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is
+# also the default input encoding. Doxygen uses libiconv (or the iconv built
+# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for
+# the list of possible encodings.
+
+INPUT_ENCODING = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank the following patterns are tested:
+# *.c *.cc *.cxx *.cpp *.c++ *.d *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh
+# *.hxx *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.dox *.py
+# *.f90 *.f *.for *.vhd *.vhdl
+
+FILE_PATTERNS = *.c *.h *.dox
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories
+# should be searched for input files as well. Possible values are YES and NO.
+# If left blank NO is used.
+
+RECURSIVE = NO
+
+# The EXCLUDE tag can be used to specify files and/or directories that should be
+# excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+# Note that relative paths are relative to the directory from which doxygen is
+# run.
+
+EXCLUDE =
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix file system feature) are excluded
+# from the input.
+
+EXCLUDE_SYMLINKS = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories. Note that the wildcards are matched
+# against the file with absolute path, so to exclude all test directories
+# for example use the pattern */test/*
+
+EXCLUDE_PATTERNS =
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# AClass::ANamespace, ANamespace::*Test
+
+EXCLUDE_SYMBOLS =
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or
+# directories that contain example code fragments that are included (see
+# the \include command).
+
+EXAMPLE_PATH =
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank all files are included.
+
+EXAMPLE_PATTERNS =
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude
+# commands irrespective of the value of the RECURSIVE tag.
+# Possible values are YES and NO. If left blank NO is used.
+
+EXAMPLE_RECURSIVE = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or
+# directories that contain images that are included in the documentation (see
+# the \image command).
+
+IMAGE_PATH = ./docs/doxygen/images
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command <filter> <input-file>, where <filter>
+# is the value of the INPUT_FILTER tag, and <input-file> is the name of an
+# input file. Doxygen will then use the output that the filter program writes
+# to standard output.
+# If FILTER_PATTERNS is specified, this tag will be
+# ignored.
+
+INPUT_FILTER =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis.
+# Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match.
+# The filters are a list of the form:
+# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further
+# info on how filters are used. If FILTER_PATTERNS is empty or if
+# none of the patterns match the file name, INPUT_FILTER is applied.
+
+FILTER_PATTERNS =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will be used to filter the input files when producing source
+# files to browse (i.e. when SOURCE_BROWSER is set to YES).
+
+FILTER_SOURCE_FILES = NO
+
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
+# pattern. A pattern will override the setting for FILTER_PATTERNS (if any)
+# and it is also possible to disable source filtering for a specific pattern
+# using *.ext= (so without naming a filter). This option only has effect when
+# FILTER_SOURCE_FILES is enabled.
+
+FILTER_SOURCE_PATTERNS =
+
+#---------------------------------------------------------------------------
+# configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will
+# be generated. Documented entities will be cross-referenced with these sources.
+# Note: To get rid of all source code in the generated output, make sure also
+# VERBATIM_HEADERS is set to NO.
+
+SOURCE_BROWSER = NO
+
+# Setting the INLINE_SOURCES tag to YES will include the body
+# of functions and classes directly in the documentation.
+
+INLINE_SOURCES = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct
+# doxygen to hide any special comment blocks from generated source code
+# fragments. Normal C and C++ comments will always remain visible.
+
+STRIP_CODE_COMMENTS = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES
+# then for each documented function all documented
+# functions referencing it will be listed.
+
+REFERENCED_BY_RELATION = NO
+
+# If the REFERENCES_RELATION tag is set to YES
+# then for each documented function all documented entities
+# called/used by that function will be listed.
+
+REFERENCES_RELATION = NO
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES (the default)
+# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from
+# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will
+# link to the source code.
+# Otherwise they will link to the documentation.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code
+# will point to the HTML generated by the htags(1) tool instead of doxygen
+# built-in source browser. The htags tool is part of GNU's global source
+# tagging system (see http://www.gnu.org/software/global/global.html). You
+# will need version 4.8.6 or higher.
+
+USE_HTAGS = NO
+
+# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen
+# will generate a verbatim copy of the header file for each class for
+# which an include is specified. Set to NO to disable this.
+
+VERBATIM_HEADERS = YES
+
+#---------------------------------------------------------------------------
+# configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index
+# of all compounds will be generated. Enable this if the project
+# contains a lot of classes, structs, unions or interfaces.
+
+ALPHABETICAL_INDEX = YES
+
+# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then
+# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns
+# in which this list will be split (can be a number in the range [1..20])
+
+COLS_IN_ALPHA_INDEX = 5
+
+# In case all classes in a project start with a common prefix, all
+# classes will be put under the same header in the alphabetical index.
+# The IGNORE_PREFIX tag can be used to specify one or more prefixes that
+# should be ignored while generating the index headers.
+
+IGNORE_PREFIX =
+
+#---------------------------------------------------------------------------
+# configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES (the default) Doxygen will
+# generate HTML output.
+
+GENERATE_HTML = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `html' will be used as the default path.
+
+HTML_OUTPUT = html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for
+# each generated HTML page (for example: .htm,.php,.asp). If it is left blank
+# doxygen will generate files with .html extension.
+
+HTML_FILE_EXTENSION = .html
+
+# The HTML_HEADER tag can be used to specify a personal HTML header for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard header. Note that when using a custom header you are responsible
+# for the proper inclusion of any scripts and style sheets that doxygen
+# needs, which is dependent on the configuration options used.
+# It is advised to generate a default header using "doxygen -w html
+# header.html footer.html stylesheet.css YourConfigFile" and then modify
+# that header. Note that the header is subject to change so you typically
+# have to redo this when upgrading to a newer version of doxygen or when
+# changing the value of configuration settings such as GENERATE_TREEVIEW!
+
+HTML_HEADER =
+
+# The HTML_FOOTER tag can be used to specify a personal HTML footer for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard footer.
+
+HTML_FOOTER =
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading
+# style sheet that is used by each HTML page. It can be used to
+# fine-tune the look of the HTML output. If the tag is left blank doxygen
+# will generate a default style sheet. Note that doxygen will try to copy
+# the style sheet file to the HTML output directory, so don't put your own
+# style sheet in the HTML output directory as well, or it will be erased!
+
+HTML_STYLESHEET =
+
+# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the HTML output directory. Note
+# that these files will be copied to the base HTML output directory. Use the
+# $relpath$ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
+# files. In the HTML_STYLESHEET file, use the file name only. Also note that
+# the files will be copied as-is; there are no commands or markers available.
+
+HTML_EXTRA_FILES =
+
+# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output.
+# Doxygen will adjust the colors in the style sheet and background images
+# according to this color. Hue is specified as an angle on a colorwheel,
+# see http://en.wikipedia.org/wiki/Hue for more information.
+# For instance the value 0 represents red, 60 is yellow, 120 is green,
+# 180 is cyan, 240 is blue, 300 purple, and 360 is red again.
+# The allowed range is 0 to 359.
+
+HTML_COLORSTYLE_HUE = 220
+
+# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of
+# the colors in the HTML output. For a value of 0 the output will use
+# grayscales only. A value of 255 will produce the most vivid colors.
+
+HTML_COLORSTYLE_SAT = 100
+
+# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to
+# the luminance component of the colors in the HTML output. Values below
+# 100 gradually make the output lighter, whereas values above 100 make
+# the output darker. The value divided by 100 is the actual gamma applied,
+# so 80 represents a gamma of 0.8, the value 220 represents a gamma of 2.2,
+# and 100 does not change the gamma.
+
+HTML_COLORSTYLE_GAMMA = 80
+
+# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
+# page will contain the date and time when the page was generated. Setting
+# this to NO can help when comparing the output of multiple runs.
+
+HTML_TIMESTAMP = YES
+
+# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes,
+# files or namespaces will be aligned in HTML using tables. If set to
+# NO a bullet list will be used.
+
+HTML_ALIGN_MEMBERS = YES
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded. For this to work a browser that supports
+# JavaScript and DHTML is required (for instance Mozilla 1.0+, Firefox,
+# Netscape 6.0+, Internet explorer 5.0+, Konqueror, or Safari).
+
+HTML_DYNAMIC_SECTIONS = NO
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files
+# will be generated that can be used as input for Apple's Xcode 3
+# integrated development environment, introduced with OSX 10.5 (Leopard).
+# To create a documentation set, doxygen will generate a Makefile in the
+# HTML output directory. Running make will produce the docset in that
+# directory and running "make install" will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find
+# it at startup.
+# See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
+# for more information.
+
+GENERATE_DOCSET = NO
+
+# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the
+# feed. A documentation feed provides an umbrella under which multiple
+# documentation sets from a single provider (such as a company or product suite)
+# can be grouped.
+
+DOCSET_FEEDNAME = "Doxygen generated docs"
+
+# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that
+# should uniquely identify the documentation set bundle. This should be a
+# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen
+# will append .docset to the name.
+
+DOCSET_BUNDLE_ID = org.doxygen.Project
+
+# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
+# the documentation publisher. This should be a reverse domain-name style
+# string, e.g. com.mycompany.MyDocSet.documentation.
+
+DOCSET_PUBLISHER_ID = org.doxygen.Publisher
+
+# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
+
+DOCSET_PUBLISHER_NAME = Publisher
+
+# If the GENERATE_HTMLHELP tag is set to YES, additional index files
+# will be generated that can be used as input for tools like the
+# Microsoft HTML help workshop to generate a compiled HTML help file (.chm)
+# of the generated HTML documentation.
+
+GENERATE_HTMLHELP = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can
+# be used to specify the file name of the resulting .chm file. You
+# can add a path in front of the file if the result should not be
+# written to the html output directory.
+
+CHM_FILE =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can
+# be used to specify the location (absolute path including file name) of
+# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run
+# the HTML help compiler on the generated index.hhp.
+
+HHC_LOCATION =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag
+# controls if a separate .chi index file is generated (YES) or that
+# it should be included in the master .chm file (NO).
+
+GENERATE_CHI = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING
+# is used to encode HtmlHelp index (hhk), content (hhc) and project file
+# content.
+
+CHM_INDEX_ENCODING =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag
+# controls whether a binary table of contents is generated (YES) or a
+# normal table of contents (NO) in the .chm file.
+
+BINARY_TOC = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members
+# to the contents of the HTML help documentation and to the tree view.
+
+TOC_EXPAND = NO
+
+# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
+# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated
+# that can be used as input for Qt's qhelpgenerator to generate a
+# Qt Compressed Help (.qch) of the generated HTML documentation.
+
+GENERATE_QHP = NO
+
+# If the QHG_LOCATION tag is specified, the QCH_FILE tag can
+# be used to specify the file name of the resulting .qch file.
+# The path specified is relative to the HTML output folder.
+
+QCH_FILE =
+
+# The QHP_NAMESPACE tag specifies the namespace to use when generating
+# Qt Help Project output. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#namespace
+
+QHP_NAMESPACE = org.doxygen.Project
+
+# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating
+# Qt Help Project output. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#virtual-folders
+
+QHP_VIRTUAL_FOLDER = doc
+
+# If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to
+# add. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#custom-filters
+
+QHP_CUST_FILTER_NAME =
+
+# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
+# custom filter to add. For more information please see
+# <a href="http://doc.trolltech.com/qthelpproject.html#custom-filters">
+# Qt Help Project / Custom Filters</a>.
+
+QHP_CUST_FILTER_ATTRS =
+
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
+# project's
+# filter section matches.
+# <a href="http://doc.trolltech.com/qthelpproject.html#filter-attributes">
+# Qt Help Project / Filter Attributes</a>.
+
+QHP_SECT_FILTER_ATTRS =
+
+# If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can
+# be used to specify the location of Qt's qhelpgenerator.
+# If non-empty doxygen will try to run qhelpgenerator on the generated
+# .qhp file.
+
+QHG_LOCATION =
+
+# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files
+# will be generated, which together with the HTML files, form an Eclipse help
+# plugin. To install this plugin and make it available under the help contents
+# menu in Eclipse, the contents of the directory containing the HTML and XML
+# files needs to be copied into the plugins directory of eclipse. The name of
+# the directory within the plugins directory should be the same as
+# the ECLIPSE_DOC_ID value. After copying Eclipse needs to be restarted before
+# the help appears.
+
+GENERATE_ECLIPSEHELP = NO
+
+# A unique identifier for the eclipse help plugin. When installing the plugin
+# the directory name containing the HTML and XML files should also have
+# this name.
+
+ECLIPSE_DOC_ID = org.doxygen.Project
+
+# The DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs)
+# at top of each HTML page. The value NO (the default) enables the index and
+# the value YES disables it. Since the tabs have the same information as the
+# navigation tree you can set this option to YES if you already set
+# GENERATE_TREEVIEW to YES.
+
+DISABLE_INDEX = NO
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
+# structure should be generated to display hierarchical information.
+# If the tag value is set to YES, a side panel will be generated
+# containing a tree-like index structure (just like the one that
+# is generated for HTML Help). For this to work a browser that supports
+# JavaScript, DHTML, CSS and frames is required (i.e. any modern browser).
+# Windows users are probably better off using the HTML help feature.
+# Since the tree basically has the same information as the tab index you
+# could consider setting DISABLE_INDEX to YES when enabling this option.
+
+GENERATE_TREEVIEW = NO
+
+# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values
+# (range [0,1..20]) that doxygen will group on one line in the generated HTML
+# documentation. Note that a value of 0 will completely suppress the enum
+# values from appearing in the overview section.
+
+ENUM_VALUES_PER_LINE = 4
+
+# By enabling USE_INLINE_TREES, doxygen will generate the Groups, Directories,
+# and Class Hierarchy pages using a tree view instead of an ordered list.
+
+USE_INLINE_TREES = NO
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be
+# used to set the initial width (in pixels) of the frame in which the tree
+# is shown.
+
+TREEVIEW_WIDTH = 250
+
+# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open
+# links to external symbols imported via tag files in a separate window.
+
+EXT_LINKS_IN_WINDOW = NO
+
+# Use this tag to change the font size of Latex formulas included
+# as images in the HTML documentation. The default is 10. Note that
+# when you change the font size after a successful doxygen run you need
+# to manually remove any form_*.png images from the HTML output directory
+# to force them to be regenerated.
+
+FORMULA_FONTSIZE = 10
+
+# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
+# generated for formulas are transparent PNGs. Transparent PNGs are
+# not supported properly for IE 6.0, but are supported on all modern browsers.
+# Note that when changing this option you need to delete any form_*.png files
+# in the HTML output before the changes have effect.
+
+FORMULA_TRANSPARENT = YES
+
+# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax
+# (see http://www.mathjax.org) which uses client side Javascript for the
+# rendering instead of using prerendered bitmaps. Use this if you do not
+# have LaTeX installed or if you want formulas to look prettier in the HTML
+# output. When enabled you also need to install MathJax separately and
+# configure the path to it using the MATHJAX_RELPATH option.
+
+USE_MATHJAX = NO
+
+# When MathJax is enabled you need to specify the location relative to the
+# HTML output directory using the MATHJAX_RELPATH option. The destination
+# directory should contain the MathJax.js script. For instance, if the mathjax
+# directory is located at the same level as the HTML output directory, then
+# MATHJAX_RELPATH should be ../mathjax. The default value points to the
+# mathjax.org site, so you can quickly see the result without installing
+# MathJax, but it is strongly recommended to install a local copy of MathJax
+# before deployment.
+
+MATHJAX_RELPATH = http://www.mathjax.org/mathjax
+
+# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax extension
+# names that should be enabled during MathJax rendering.
+
+MATHJAX_EXTENSIONS =
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box
+# for the HTML output. The underlying search engine uses javascript
+# and DHTML and should work on any modern browser. Note that when using
+# HTML help (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets
+# (GENERATE_DOCSET) there is already a search function so this one should
+# typically be disabled. For large projects the javascript based search engine
+# can be slow, then enabling SERVER_BASED_SEARCH may provide a better solution.
+
+SEARCHENGINE = YES
+
+# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
+# implemented using a PHP enabled web server instead of at the web client
+# using Javascript. Doxygen will generate the search PHP script and index
+# file to put on the web server. The advantage of the server
+# based approach is that it scales better to large projects and allows
+# full text search. The disadvantages are that it is more difficult to setup
+# and does not have live searching capabilities.
+
+SERVER_BASED_SEARCH = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will
+# generate Latex output.
+
+GENERATE_LATEX = YES
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `latex' will be used as the default path.
+
+LATEX_OUTPUT = latex
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
+# invoked. If left blank `latex' will be used as the default command name.
+# Note that when enabling USE_PDFLATEX this option is only used for
+# generating bitmaps for formulas in the HTML output, but not in the
+# Makefile that is written to the output directory.
+
+LATEX_CMD_NAME = latex
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to
+# generate index for LaTeX. If left blank `makeindex' will be used as the
+# default command name.
+
+MAKEINDEX_CMD_NAME = makeindex
+
+# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact
+# LaTeX documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_LATEX = NO
+
+# The PAPER_TYPE tag can be used to set the paper type that is used
+# by the printer. Possible values are: a4, letter, legal and
+# executive. If left blank a4wide will be used.
+
+PAPER_TYPE = a4
+
+# The EXTRA_PACKAGES tag can be used to specify one or more names of LaTeX
+# packages that should be included in the LaTeX output.
+
+EXTRA_PACKAGES =
+
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for
+# the generated latex document. The header should contain everything until
+# the first chapter. If it is left blank doxygen will generate a
+# standard header. Notice: only use this tag if you know what you are doing!
+
+LATEX_HEADER =
+
+# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for
+# the generated latex document. The footer should contain everything after
+# the last chapter. If it is left blank doxygen will generate a
+# standard footer. Notice: only use this tag if you know what you are doing!
+
+LATEX_FOOTER =
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated
+# is prepared for conversion to pdf (using ps2pdf). The pdf file will
+# contain links (just like the HTML output) instead of page references.
+# This makes the output suitable for online browsing using a pdf viewer.
+
+PDF_HYPERLINKS = YES
+
+# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of
+# plain latex in the generated Makefile. Set this option to YES to get a
+# higher quality PDF documentation.
+
+USE_PDFLATEX = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode
+# command to the generated LaTeX files. This will instruct LaTeX to keep
+# running if errors occur, instead of asking the user for help.
+# This option is also used when generating formulas in HTML.
+
+LATEX_BATCHMODE = NO
+
+# If LATEX_HIDE_INDICES is set to YES then doxygen will not
+# include the index chapters (such as File Index, Compound Index, etc.)
+# in the output.
+
+LATEX_HIDE_INDICES = NO
+
+# If LATEX_SOURCE_CODE is set to YES then doxygen will include
+# source code with syntax highlighting in the LaTeX output.
+# Note that which sources are shown also depends on other settings
+# such as SOURCE_BROWSER.
+
+LATEX_SOURCE_CODE = NO
+
+# The LATEX_BIB_STYLE tag can be used to specify the style to use for the
+# bibliography, e.g. plainnat, or ieeetr. The default style is "plain". See
+# http://en.wikipedia.org/wiki/BibTeX for more info.
+
+LATEX_BIB_STYLE = plain
+
+#---------------------------------------------------------------------------
+# configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output.
+# The RTF output is optimized for Word 97 and may not look very pretty with
+# other RTF readers or editors.
+
+GENERATE_RTF = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `rtf' will be used as the default path.
+
+RTF_OUTPUT = rtf
+
+# If the COMPACT_RTF tag is set to YES Doxygen generates more compact
+# RTF documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_RTF = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated
+# will contain hyperlink fields. The RTF file will
+# contain links (just like the HTML output) instead of page references.
+# This makes the output suitable for online browsing using WORD or other
+# programs which support those fields.
+# Note: wordpad (write) and others do not support links.
+
+RTF_HYPERLINKS = NO
+
+# Load style sheet definitions from file. Syntax is similar to doxygen's
+# config file, i.e. a series of assignments. You only have to provide
+# replacements, missing definitions are set to their default value.
+
+RTF_STYLESHEET_FILE =
+
+# Set optional variables used in the generation of an rtf document.
+# Syntax is similar to doxygen's config file.
+
+RTF_EXTENSIONS_FILE =
+
+#---------------------------------------------------------------------------
+# configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES (the default) Doxygen will
+# generate man pages
+
+GENERATE_MAN = NO
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `man' will be used as the default path.
+
+MAN_OUTPUT = man
+
+# The MAN_EXTENSION tag determines the extension that is added to
+# the generated man pages (default is the subroutine's section .3)
+
+MAN_EXTENSION = .3
+
+# If the MAN_LINKS tag is set to YES and Doxygen generates man output,
+# then it will generate one additional man file for each entity
+# documented in the real man page(s). These additional files
+# only source the real man page, but without them the man command
+# would be unable to find the correct page. The default is NO.
+
+MAN_LINKS = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the XML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_XML tag is set to YES Doxygen will
+# generate an XML file that captures the structure of
+# the code including all documentation.
+
+GENERATE_XML = NO
+
+# The XML_OUTPUT tag is used to specify where the XML pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `xml' will be used as the default path.
+
+XML_OUTPUT = xml
+
+# The XML_SCHEMA tag can be used to specify an XML schema,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+XML_SCHEMA =
+
+# The XML_DTD tag can be used to specify an XML DTD,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+XML_DTD =
+
+# If the XML_PROGRAMLISTING tag is set to YES Doxygen will
+# dump the program listings (including syntax highlighting
+# and cross-referencing information) to the XML output. Note that
+# enabling this will significantly increase the size of the XML output.
+
+XML_PROGRAMLISTING = YES
+
+#---------------------------------------------------------------------------
+# configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will
+# generate an AutoGen Definitions (see autogen.sf.net) file
+# that captures the structure of the code including all
+# documentation. Note that this feature is still experimental
+# and incomplete at the moment.
+
+GENERATE_AUTOGEN_DEF = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_PERLMOD tag is set to YES Doxygen will
+# generate a Perl module file that captures the structure of
+# the code including all documentation. Note that this
+# feature is still experimental and incomplete at the
+# moment.
+
+GENERATE_PERLMOD = NO
+
+# If the PERLMOD_LATEX tag is set to YES Doxygen will generate
+# the necessary Makefile rules, Perl scripts and LaTeX code to be able
+# to generate PDF and DVI output from the Perl module output.
+
+PERLMOD_LATEX = NO
+
+# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be
+# nicely formatted so it can be parsed by a human reader.
+# This is useful
+# if you want to understand what is going on.
+# On the other hand, if this
+# tag is set to NO the size of the Perl module output will be much smaller
+# and Perl will parse it just the same.
+
+PERLMOD_PRETTY = YES
+
+# The names of the make variables in the generated doxyrules.make file
+# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX.
+# This is useful so different doxyrules.make files included by the same
+# Makefile don't overwrite each other's variables.
+
+PERLMOD_MAKEVAR_PREFIX =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will
+# evaluate all C-preprocessor directives found in the sources and include
+# files.
+
+ENABLE_PREPROCESSING = YES
+
+# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro
+# names in the source code. If set to NO (the default) only conditional
+# compilation will be performed. Macro expansion can be done in a controlled
+# way by setting EXPAND_ONLY_PREDEF to YES.
+
+MACRO_EXPANSION = NO
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES
+# then the macro expansion is limited to the macros specified with the
+# PREDEFINED and EXPAND_AS_DEFINED tags.
+
+EXPAND_ONLY_PREDEF = NO
+
+# If the SEARCH_INCLUDES tag is set to YES (the default) the include files
+# pointed to by INCLUDE_PATH will be searched when a #include is found.
+
+SEARCH_INCLUDES = YES
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that
+# contain include files that are not input files but should be processed by
+# the preprocessor.
+
+INCLUDE_PATH =
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
+# patterns (like *.h and *.hpp) to filter out the header-files in the
+# directories. If left blank, the patterns specified with FILE_PATTERNS will
+# be used.
+
+INCLUDE_FILE_PATTERNS =
+
+# The PREDEFINED tag can be used to specify one or more macro names that
+# are defined before the preprocessor is started (similar to the -D option of
+# gcc). The argument of the tag is a list of macros of the form: name
+# or name=definition (no spaces). If the definition and the = are
+# omitted =1 is assumed. To prevent a macro definition from being
+# undefined via #undef or recursively expanded use the := operator
+# instead of the = operator.
+
+PREDEFINED =
+
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then
+# this tag can be used to specify a list of macro names that should be expanded.
+# The macro definition that is found in the sources will be used.
+# Use the PREDEFINED tag if you want to use a different macro definition that
+# overrules the definition found in the source code.
+
+EXPAND_AS_DEFINED =
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then
+# doxygen's preprocessor will remove all references to function-like macros
+# that are alone on a line, have an all uppercase name, and do not end with a
+# semicolon, because these will confuse the parser if not removed.
+
+SKIP_FUNCTION_MACROS = YES
+
+#---------------------------------------------------------------------------
+# Configuration::additions related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES option can be used to specify one or more tagfiles.
+# Optionally an initial location of the external documentation
+# can be added for each tagfile. The format of a tag file without
+# this location is as follows:
+#
+# TAGFILES = file1 file2 ...
+# Adding location for the tag files is done as follows:
+#
+# TAGFILES = file1=loc1 "file2 = loc2" ...
+# where "loc1" and "loc2" can be relative or absolute paths or
+# URLs. If a location is present for each tag, the installdox tool
+# does not have to be run to correct the links.
+# Note that each tag file must have a unique name
+# (where the name does NOT include the path)
+# If a tag file is not located in the directory in which doxygen
+# is run, you must also specify the path to the tagfile here.
+
+TAGFILES =
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create
+# a tag file that is based on the input files it reads.
+
+GENERATE_TAGFILE =
+
+# If the ALLEXTERNALS tag is set to YES all external classes will be listed
+# in the class index. If set to NO only the inherited external classes
+# will be listed.
+
+ALLEXTERNALS = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed
+# in the modules index. If set to NO, only the current project's groups will
+# be listed.
+
+EXTERNAL_GROUPS = YES
+
+# The PERL_PATH should be the absolute path and name of the perl script
+# interpreter (i.e. the result of `which perl').
+
+PERL_PATH = /usr/bin/perl
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will
+# generate an inheritance diagram (in HTML, RTF and LaTeX) for classes with base
+# or super classes. Setting the tag to NO turns the diagrams off. Note that
+# this option also works with HAVE_DOT disabled, but it is recommended to
+# install and use dot, since it yields more powerful graphs.
+
+CLASS_DIAGRAMS = YES
+
+# You can define message sequence charts within doxygen comments using the \msc
+# command. Doxygen will then run the mscgen tool (see
+# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the
+# documentation. The MSCGEN_PATH tag allows you to specify the directory where
+# the mscgen tool resides. If left empty the tool is assumed to be found in the
+# default search path.
+
+MSCGEN_PATH =
+
+# If set to YES, the inheritance and collaboration graphs will hide
+# inheritance and usage relations if the target is undocumented
+# or is not a class.
+
+HIDE_UNDOC_RELATIONS = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
+# available from the path. This tool is part of Graphviz, a graph visualization
+# toolkit from AT&T and Lucent Bell Labs. The other options in this section
+# have no effect if this option is set to NO (the default)
+
+HAVE_DOT = NO
+
+# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is
+# allowed to run in parallel. When set to 0 (the default) doxygen will
+# base this on the number of processors available in the system. You can set it
+# explicitly to a value larger than 0 to get control over the balance
+# between CPU load and processing speed.
+
+DOT_NUM_THREADS = 0
+
+# By default doxygen will use the Helvetica font for all dot files that
+# doxygen generates. When you want a differently looking font you can specify
+# the font name using DOT_FONTNAME. You need to make sure dot is able to find
+# the font, which can be done by putting it in a standard location or by setting
+# the DOTFONTPATH environment variable or by setting DOT_FONTPATH to the
+# directory containing the font.
+
+DOT_FONTNAME = Helvetica
+
+# The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs.
+# The default size is 10pt.
+
+DOT_FONTSIZE = 10
+
+# By default doxygen will tell dot to use the Helvetica font.
+# If you specify a different font using DOT_FONTNAME you can use DOT_FONTPATH to
+# set the path where dot can find it.
+
+DOT_FONTPATH =
+
+# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect inheritance relations. Setting this tag to YES will force the
+# CLASS_DIAGRAMS tag to NO.
+
+CLASS_GRAPH = YES
+
+# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect implementation dependencies (inheritance, containment, and
+# class references variables) of the class with other documented classes.
+
+COLLABORATION_GRAPH = YES
+
+# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for groups, showing the direct group dependencies.
+
+GROUP_GRAPHS = YES
+
+# If the UML_LOOK tag is set to YES doxygen will generate inheritance and
+# collaboration diagrams in a style similar to the OMG's Unified Modeling
+# Language.
+
+UML_LOOK = NO
+
+# If set to YES, the inheritance and collaboration graphs will show the
+# relations between templates and their instances.
+
+TEMPLATE_RELATIONS = NO
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT
+# tags are set to YES then doxygen will generate a graph for each documented
+# file showing the direct and indirect include dependencies of the file with
+# other documented files.
+
+INCLUDE_GRAPH = YES
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and
+# HAVE_DOT tags are set to YES then doxygen will generate a graph for each
+# documented header file showing the documented files that directly or
+# indirectly include this file.
+
+INCLUDED_BY_GRAPH = YES
+
+# If the CALL_GRAPH and HAVE_DOT options are set to YES then
+# doxygen will generate a call dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable call graphs
+# for selected functions only using the \callgraph command.
+
+CALL_GRAPH = NO
+
+# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then
+# doxygen will generate a caller dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable caller
+# graphs for selected functions only using the \callergraph command.
+
+CALLER_GRAPH = NO
+
+# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen
+# will generate a graphical hierarchy of all classes instead of a textual one.
+
+GRAPHICAL_HIERARCHY = YES
+
+# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES
+# then doxygen will show the dependencies a directory has on other directories
+# in a graphical way. The dependency relations are determined by the #include
+# relations between the files in the directories.
+
+DIRECTORY_GRAPH = YES
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot. Possible values are svg, png, jpg, or gif.
+# If left blank png will be used. If you choose svg you need to set
+# HTML_FILE_EXTENSION to xhtml in order to make the SVG files
+# visible in IE 9+ (other browsers do not have this requirement).
+
+DOT_IMAGE_FORMAT = png
+
+# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to
+# enable generation of interactive SVG images that allow zooming and panning.
+# Note that this requires a modern browser other than Internet Explorer.
+# Tested and working are Firefox, Chrome, Safari, and Opera. For IE 9+ you
+# need to set HTML_FILE_EXTENSION to xhtml in order to make the SVG files
+# visible. Older versions of IE do not have SVG support.
+
+INTERACTIVE_SVG = NO
+
+# The tag DOT_PATH can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found in the path.
+
+DOT_PATH =
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the
+# \dotfile command).
+
+DOTFILE_DIRS =
+
+# The MSCFILE_DIRS tag can be used to specify one or more directories that
+# contain msc files that are included in the documentation (see the
+# \mscfile command).
+
+MSCFILE_DIRS =
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of
+# nodes that will be shown in the graph. If the number of nodes in a graph
+# becomes larger than this value, doxygen will truncate the graph, which is
+# visualized by representing a node as a red box. Note that if the
+# number of direct children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES, the graph will not be shown at all. Also note
+# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+
+DOT_GRAPH_MAX_NODES = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the
+# graphs generated by dot. A depth value of 3 means that only nodes reachable
+# from the root by following a path via at most 3 edges will be shown. Nodes
+# that lie farther from the root node will be omitted. Note that setting this
+# option to 1 or 2 may greatly reduce the computation time needed for large
+# code bases. Also note that the size of a graph can be further restricted by
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+
+MAX_DOT_GRAPH_DEPTH = 0
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
+# background. This is disabled by default, because dot on Windows does not
+# seem to support this out of the box. Warning: Depending on the platform used,
+# enabling this option may lead to badly anti-aliased labels on the edges of
+# a graph (i.e. they become hard to read).
+
+DOT_TRANSPARENT = NO
+
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This
+# makes dot run faster, but since only newer versions of dot (>1.8.10)
+# support this, this feature is disabled by default.
+
+DOT_MULTI_TARGETS = YES
+
+# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will
+# generate a legend page explaining the meaning of the various boxes and
+# arrows in the dot generated graphs.
+
+GENERATE_LEGEND = YES
+
+# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will
+# remove the intermediate dot files that are used to generate
+# the various graphs.
+
+DOT_CLEANUP = YES
diff --git a/docs/doxygen/mainpage.dox b/docs/doxygen/mainpage.dox
--- /dev/null
@@ -0,0 +1,25 @@
+/*
+ * This is a little header file which doxygen parses to generate the main
+ * documentation page
+ */
+
+/*! \mainpage LINALG User's Guide
+ *
+ * @section introduction Introduction
+ * LINALG is TI's Linear Algebra Library, supporting two types of TI platforms:
+ * - ARM+DSP platforms such as AM57xx and Keystone II devices
+ * - DSP-only platforms such as Keystone I devices
+ *
+ * LINALG includes BLAS and LAPACK:
+ * - BLAS is based on BLIS (https://github.com/flame/blis) and available on both ARM+DSP and DSP-only platforms.
+ * - LAPACK is based on CLAPACK 3.2.1 (http://www.netlib.org/clapack/) and available only on ARM+DSP platforms.
+ *
+ * @section linalgapi LINALG API
+ * For the ARM+DSP library, the LINALG API is the standard CBLAS and CLAPACK API, accessible through the ARM (host).
+ * For the DSP-only library, the LINALG API includes the standard CBLAS API as well as the TI CBLAS API extension.
+ *
+ * - CBLAS API: http://www.netlib.org/blas/#_cblas
+ * - CLAPACK API: http://www.netlib.org/clapack/
+ * - @ref ti_cblas_api
+ *
+ */
diff --git a/docs/linalg_user_guide.html b/docs/linalg_user_guide.html
--- /dev/null
@@ -0,0 +1 @@
+doxygen/html/index.html
\ No newline at end of file
diff --git a/examples/Makefile b/examples/arm+dsp/Makefile
similarity index 100%
rename from examples/dgemm_test/Makefile
rename to examples/arm+dsp/dgemm_test/Makefile
rename from examples/dgemm_test/Makefile
rename to examples/arm+dsp/dgemm_test/Makefile
similarity index 100%
rename from examples/dgemm_test/dgemm_test.c
rename to examples/arm+dsp/dgemm_test/dgemm_test.c
rename from examples/dgemm_test/dgemm_test.c
rename to examples/arm+dsp/dgemm_test/dgemm_test.c
similarity index 100%
rename from examples/dsyrk_test/Makefile
rename to examples/arm+dsp/dsyrk_test/Makefile
rename from examples/dsyrk_test/Makefile
rename to examples/arm+dsp/dsyrk_test/Makefile
similarity index 100%
rename from examples/dsyrk_test/dsyrk_test.c
rename to examples/arm+dsp/dsyrk_test/dsyrk_test.c
rename from examples/dsyrk_test/dsyrk_test.c
rename to examples/arm+dsp/dsyrk_test/dsyrk_test.c
similarity index 100%
rename from examples/gemm_bench/Makefile
rename to examples/arm+dsp/gemm_bench/Makefile
rename from examples/gemm_bench/Makefile
rename to examples/arm+dsp/gemm_bench/Makefile
similarity index 82%
rename from examples/gemm_bench/main.c
rename to examples/arm+dsp/gemm_bench/main.c
index e061337cf13d12635e80476e359f713b3cbea133..a0afa97c4c7e925430eb20f550195c85b5fe001f 100644 (file)
rename from examples/gemm_bench/main.c
rename to examples/arm+dsp/gemm_bench/main.c
index e061337cf13d12635e80476e359f713b3cbea133..a0afa97c4c7e925430eb20f550195c85b5fe001f 100644 (file)
#define NUM_TEST_RUN 5
+#define GFLOPS_MARGIN (1.1f)
/*-----------------------------------------------------------------------------
* Timing Setup
int num_size, gemm_err;
int M, N, K, m, n, k, test_idx;
float time_secs, gflops, gflops_ref, cpu_freq_GHz;
- cl_platform_id platform;
- cl_uint num_platforms;
- cl_device_id devices;
- cl_uint num_devices;
+ cl_platform_id platform;
+ cl_uint num_platforms;
+ cl_device_id devices;
+ cl_uint num_devices;
cl_uint cpu_freq;
- size_t cpu_freq_size;
+ size_t cpu_freq_size;
FILE *fp_time, *fp_gflops;
- if(clGetPlatformIDs(1, &platform, &num_platforms) != CL_SUCCESS) {
- printf("Error in clGetPlatformIDs\n.");
- exit(0);
- }
-
- if(clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &devices, &num_devices) != CL_SUCCESS) {
- printf("Error in clGetDeviceIDs\n.");
- exit(0);
- }
- if(clGetDeviceInfo(devices, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(cl_uint), (void *)&cpu_freq, &cpu_freq_size) != CL_SUCCESS) {
- printf("Error in clGetDeviceInfo\n.");
- exit(0);
- }
- cpu_freq_GHz = (float)cpu_freq/1e3; /* convert from MHz to GHz */
- printf("CPU frequency is %f GHz.\n", cpu_freq_GHz);
-
+ if(clGetPlatformIDs(1, &platform, &num_platforms) != CL_SUCCESS) {
+ printf("Error in clGetPlatformIDs\n.");
+ exit(0);
+ }
+
+ if(clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &devices, &num_devices) != CL_SUCCESS) {
+ printf("Error in clGetDeviceIDs\n.");
+ exit(0);
+ }
+ if(clGetDeviceInfo(devices, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(cl_uint), (void *)&cpu_freq, &cpu_freq_size) != CL_SUCCESS) {
+ printf("Error in clGetDeviceInfo\n.");
+ exit(0);
+ }
+ cpu_freq_GHz = (float)cpu_freq/1e3; /* convert from MHz to GHz */
+ printf("CPU frequency is %f GHz.\n", cpu_freq_GHz);
+
srand(12345);
- /* setting up TI CBLAS during first call */
- run_dgemm(1000, 1000, 1000, &time_secs, &gflops);
-
+ /* setting up TI CBLAS during first call */
+ run_dgemm(1000, 1000, 1000, &time_secs, &gflops);
+
/*------- benchmarking DGEMM ------- */
fp_time = fopen("dgemm_time.dat","w");
fp_gflops = fopen("dgemm_gflops.dat","w");
-
- test_idx = 0;
+
+ test_idx = 0;
for (M=GEMM_MATRIX_SIZE_START,m=0; m<NUM_MATRIX_SIZE_TO_BENCHMARK; m++,M*=2)
{
for (N=GEMM_MATRIX_SIZE_START,n=0; n<NUM_MATRIX_SIZE_TO_BENCHMARK; n++,N*=2)
for (K=GEMM_MATRIX_SIZE_START,k=0; k<NUM_MATRIX_SIZE_TO_BENCHMARK; k++,K*=2)
{
printf("Running DGEMM for (M,N,K) = (%d,%d,%d). ", M,N,K);
-
+
gemm_err = run_dgemm(M, N, K, &time_secs, &gflops);
-
- gflops_ref = dgemm_gflops_ref[test_idx++]; /* read reference GFLOPS */
- gflops_ref = gflops_ref * cpu_freq_GHz; /* scale ref GFLOPS by CPU freq */
+
+ if(gemm_err == -1) { /* out of memory for DSP offloading */
+ printf("DGEMM out of memory for (M,N,K) = (%d,%d,%d).\n", M,N,K);
+ exit(0);
+ }
+ else {
+ fprintf(fp_time, "%6d\t%6d\t%6d\t%10.8e\n", M, N, K, time_secs);
+ fprintf(fp_gflops, "%6d\t%6d\t%6d\t%10.8e\n", M, N, K, gflops);
+ }
+
+ gflops_ref = dgemm_gflops_ref[test_idx++]; /* read reference GFLOPS */
+ gflops_ref = gflops_ref * cpu_freq_GHz; /* scale ref GFLOPS by CPU freq */
printf("Measured %f GFLOPS, reference %f GFLOPS.\n", gflops, gflops_ref);
- if((gflops > gflops_ref*1.1) || (gflops < gflops_ref/1.1)) {
- printf("DGEMM test FAILED! GFLOPS deviates from reference unacceptably.");
- exit(0);
- }
+ if((gflops > gflops_ref*GFLOPS_MARGIN) || (gflops < gflops_ref/GFLOPS_MARGIN)) {
+ printf("DGEMM test FAILED! GFLOPS deviates from reference unacceptably.\n");
+ exit(0);
+ }
if(gemm_err == -1) { /* out of memory for DSP offloading */
printf("Out of memory for (M,N,K) = (%d,%d,%d).\n", M,N,K);
- exit(0);
+ exit(0);
}
else {
fprintf(fp_time, "%6d\t%6d\t%6d\t%10.8e\n", M, N, K, time_secs);
/*------- benchmarking SGEMM -------*/
fp_time = fopen("sgemm_time.dat","w");
fp_gflops = fopen("sgemm_gflops.dat","w");
-
- test_idx = 0;
+
+ test_idx = 0;
for (M=GEMM_MATRIX_SIZE_START,m=0; m<NUM_MATRIX_SIZE_TO_BENCHMARK; m++,M*=2)
{
for (N=GEMM_MATRIX_SIZE_START,n=0; n<NUM_MATRIX_SIZE_TO_BENCHMARK; n++,N*=2)
for (K=GEMM_MATRIX_SIZE_START,k=0; k<NUM_MATRIX_SIZE_TO_BENCHMARK; k++,K*=2)
{
printf("Running SGEMM for (M,N,K) = (%d,%d,%d). ", M,N,K);
-
+
gemm_err = run_sgemm(M, N, K, &time_secs, &gflops);
-
- gflops_ref = sgemm_gflops_ref[test_idx++]; /* read reference GFLOPS */
- gflops_ref = gflops_ref * cpu_freq_GHz; /* scale ref GFLOPS by CPU freq */
+
+ if(gemm_err == -1) { /* out of memory for DSP offloading */
+ printf("SGEMM out of memory for (M,N,K) = (%d,%d,%d).\n", M,N,K);
+ exit(0);
+ }
+ else {
+ fprintf(fp_time, "%6d\t%6d\t%6d\t%10.8e\n", M, N, K, time_secs);
+ fprintf(fp_gflops, "%6d\t%6d\t%6d\t%10.8e\n", M, N, K, gflops);
+ }
+
+ gflops_ref = sgemm_gflops_ref[test_idx++]; /* read reference GFLOPS */
+ gflops_ref = gflops_ref * cpu_freq_GHz; /* scale ref GFLOPS by CPU freq */
printf("Measured %f GFLOPS, reference %f GFLOPS.\n", gflops, gflops_ref);
- if((gflops > gflops_ref*1.1) || (gflops < gflops_ref/1.1)) {
- printf("SGEMM test FAILED! GFLOPS deviates from reference unacceptably.");
- exit(0);
- }
+ if((gflops > gflops_ref*GFLOPS_MARGIN) || (gflops < gflops_ref/GFLOPS_MARGIN)) {
+ printf("SGEMM test FAILED! GFLOPS deviates from reference unacceptably.\n");
+ exit(0);
+ }
if(gemm_err == -1) { /* out of memory for DSP offloading */
printf("Out of memory for (M,N,K) = (%d,%d,%d).\n", M,N,K);
- exit(0);
+ exit(0);
}
else {
fprintf(fp_time, "%6d\t%6d\t%6d\t%10.8e\n", M, N, K, time_secs);
/*------- benchmarking CGEMM -------*/
fp_time = fopen("cgemm_time.dat","w");
fp_gflops = fopen("cgemm_gflops.dat","w");
-
- test_idx = 0;
+
+ test_idx = 0;
for (M=GEMM_MATRIX_SIZE_START,m=0; m<NUM_MATRIX_SIZE_TO_BENCHMARK; m++,M*=2)
{
for (N=GEMM_MATRIX_SIZE_START,n=0; n<NUM_MATRIX_SIZE_TO_BENCHMARK; n++,N*=2)
for (K=GEMM_MATRIX_SIZE_START,k=0; k<NUM_MATRIX_SIZE_TO_BENCHMARK; k++,K*=2)
{
printf("Running CGEMM for (M,N,K) = (%d,%d,%d). ", M,N,K);
-
+
gemm_err = run_cgemm(M, N, K, &time_secs, &gflops);
-
- gflops_ref = cgemm_gflops_ref[test_idx++]; /* read reference GFLOPS */
- gflops_ref = gflops_ref * cpu_freq_GHz; /* scale ref GFLOPS by CPU freq */
- printf("Measured %f GFLOPS, reference %f GFLOPS.\n", gflops, gflops_ref);
- if((gflops > gflops_ref*1.1) || (gflops < gflops_ref/1.1)) {
- printf("CGEMM test FAILED! GFLOPS deviates from reference unacceptably.");
- exit(0);
- }
if(gemm_err == -1) { /* out of memory for DSP offloading */
- printf("Out of memory for (M,N,K) = (%d,%d,%d).\n", M,N,K);
- exit(0);
+ printf("CGEMM out of memory for (M,N,K) = (%d,%d,%d).\n", M,N,K);
+ exit(0);
}
else {
fprintf(fp_time, "%6d\t%6d\t%6d\t%10.8e\n", M, N, K, time_secs);
fprintf(fp_gflops, "%6d\t%6d\t%6d\t%10.8e\n", M, N, K, gflops);
}
+
+ gflops_ref = cgemm_gflops_ref[test_idx++]; /* read reference GFLOPS */
+ gflops_ref = gflops_ref * cpu_freq_GHz; /* scale ref GFLOPS by CPU freq */
+ printf("Measured %f GFLOPS, reference %f GFLOPS.\n", gflops, gflops_ref);
+ if((gflops > gflops_ref*GFLOPS_MARGIN) || (gflops < gflops_ref/GFLOPS_MARGIN)) {
+ printf("CGEMM test FAILED! GFLOPS deviates from reference unacceptably.\n");
+ exit(0);
+ }
}
}
}
/*------- benchmarking ZGEMM -------*/
fp_time = fopen("zgemm_time.dat","w");
fp_gflops = fopen("zgemm_gflops.dat","w");
-
- test_idx = 0;
+
+ test_idx = 0;
for (M=GEMM_MATRIX_SIZE_START,m=0; m<NUM_MATRIX_SIZE_TO_BENCHMARK; m++,M*=2)
{
for (N=GEMM_MATRIX_SIZE_START,n=0; n<NUM_MATRIX_SIZE_TO_BENCHMARK; n++,N*=2)
for (K=GEMM_MATRIX_SIZE_START,k=0; k<NUM_MATRIX_SIZE_TO_BENCHMARK; k++,K*=2)
{
printf("Running ZGEMM for (M,N,K) = (%d,%d,%d). ", M,N,K);
-
+
gemm_err = run_zgemm(M, N, K, &time_secs, &gflops);
-
- gflops_ref = zgemm_gflops_ref[test_idx++]; /* read reference GFLOPS */
- gflops_ref = gflops_ref * cpu_freq_GHz; /* scale ref GFLOPS by CPU freq */
- printf("Measured %f GFLOPS, reference %f GFLOPS.\n", gflops, gflops_ref);
- if((gflops > gflops_ref*1.1) || (gflops < gflops_ref/1.1)) {
- printf("ZGEMM test FAILED! GFLOPS deviates from reference unacceptably.");
- exit(0);
- }
-
if(gemm_err == -1) { /* out of memory for DSP offloading */
- printf("Out of memory for (M,N,K) = (%d,%d,%d).\n", M,N,K);
- exit(0);
+ printf("ZGEMM out of memory for (M,N,K) = (%d,%d,%d).\n", M,N,K);
+ exit(0);
}
else {
fprintf(fp_time, "%6d\t%6d\t%6d\t%10.8e\n", M, N, K, time_secs);
fprintf(fp_gflops, "%6d\t%6d\t%6d\t%10.8e\n", M, N, K, gflops);
}
+
+ gflops_ref = zgemm_gflops_ref[test_idx++]; /* read reference GFLOPS */
+ gflops_ref = gflops_ref * cpu_freq_GHz; /* scale ref GFLOPS by CPU freq */
+ printf("Measured %f GFLOPS, reference %f GFLOPS.\n", gflops, gflops_ref);
+ if((gflops > gflops_ref*GFLOPS_MARGIN) || (gflops < gflops_ref/GFLOPS_MARGIN)) {
+ printf("ZGEMM test FAILED! GFLOPS deviates from reference unacceptably.\n");
+ exit(0);
+ }
}
}
}
fclose(fp_time);
fclose(fp_gflops);
- printf("Passed.\n");
+ printf("Passed.\n");
return 0;
}
return (-1);
}
-
+
total_time = 0.0;
for (iter = 0; iter < NUM_TEST_RUN; iter++)
{
return (-1);
}
-
+
total_time = 0.0;
for (iter = 0; iter < NUM_TEST_RUN; iter++)
{
return (-1);
}
- total_time = 0.0;
+ total_time = 0.0;
for (iter = 0; iter < NUM_TEST_RUN; iter++)
{
/*----------------------------------------------------------------------
total_time += time_secs;
total_GFLOPS += operation_count/time_secs*1e-9;
}
-
+
__free_ddr(A);
__free_ddr(B);
__free_ddr(C);
return (-1);
}
- total_time = 0.0;
+ total_time = 0.0;
for (iter = 0; iter < NUM_TEST_RUN; iter++)
{
/*----------------------------------------------------------------------
total_time += time_secs;
total_GFLOPS += operation_count/time_secs*1e-9;
}
-
+
__free_ddr(A);
__free_ddr(B);
__free_ddr(C);
similarity index 100%
rename from examples/ludinv/Makefile
rename to examples/arm+dsp/ludinv/Makefile
rename from examples/ludinv/Makefile
rename to examples/arm+dsp/ludinv/Makefile
similarity index 100%
rename from examples/ludinv/dlaran.c
rename to examples/arm+dsp/ludinv/dlaran.c
rename from examples/ludinv/dlaran.c
rename to examples/arm+dsp/ludinv/dlaran.c
similarity index 100%
rename from examples/ludinv/dlarnd.c
rename to examples/arm+dsp/ludinv/dlarnd.c
rename from examples/ludinv/dlarnd.c
rename to examples/arm+dsp/ludinv/dlarnd.c
similarity index 100%
rename from examples/ludinv/dlatm1.c
rename to examples/arm+dsp/ludinv/dlatm1.c
rename from examples/ludinv/dlatm1.c
rename to examples/arm+dsp/ludinv/dlatm1.c
similarity index 100%
rename from examples/ludinv/dlatm2.c
rename to examples/arm+dsp/ludinv/dlatm2.c
rename from examples/ludinv/dlatm2.c
rename to examples/arm+dsp/ludinv/dlatm2.c
similarity index 100%
rename from examples/ludinv/dlatm3.c
rename to examples/arm+dsp/ludinv/dlatm3.c
rename from examples/ludinv/dlatm3.c
rename to examples/arm+dsp/ludinv/dlatm3.c
similarity index 100%
rename from examples/ludinv/dlatmr.c
rename to examples/arm+dsp/ludinv/dlatmr.c
rename from examples/ludinv/dlatmr.c
rename to examples/arm+dsp/ludinv/dlatmr.c
diff --git a/examples/make.inc b/examples/arm+dsp/make.inc
similarity index 66%
rename from examples/make.inc
rename to examples/arm+dsp/make.inc
index 7c9c5dfeb2be61fb8f39d58b42ab5371f9447bc3..de310ba794731e515d90a2810b6322321eafb2a9 100644 (file)
rename from examples/make.inc
rename to examples/arm+dsp/make.inc
index 7c9c5dfeb2be61fb8f39d58b42ab5371f9447bc3..de310ba794731e515d90a2810b6322321eafb2a9 100644 (file)
--- a/examples/make.inc
# gcc ARM cross compiler will not, by default, search the host's
# /usr/include. Explicitly specify here to find dependent vendor headers
-CC = arm-linux-gnueabihf-gcc
+ CC = arm-linux-gnueabihf-gcc
else
-CC = gcc
+ CC = gcc
endif
+CFLAGS = -g -O2 -I$(TARGET_ROOTDIR)/usr/include -I$(LINALG_DIR)/include
-CFLAGS = -g -O2 -I$(TARGET_ROOTDIR)/usr/include
-
-LIB_DIR = $(TARGET_ROOTDIR)/usr/lib/
+LIB_DIR = $(LINALG_DIR)/lib/
LD_FLAGS=-L$(TARGET_ROOTDIR)/lib -L$(TARGET_ROOTDIR)/usr/lib -Wl,-rpath-link,$(TARGET_ROOTDIR)/lib -Wl,-rpath-link,$(TARGET_ROOTDIR)/usr/lib
-BLASLIB = $(LIB_DIR)libcblas_armplusdsp.a $(LIB_DIR)libblis.a -lOpenCL -locl_util -lstdc++ -lrt -lm -lgomp
-LAPACKLIB = $(LIB_DIR)libcblaswr.a $(LIB_DIR)liblapack.a $(LIB_DIR)libf2c.a $(LIB_DIR)libcblas_armplusdsp.a $(LIB_DIR)libblis.a -lOpenCL -locl_util -lstdc++ -lrt -lm -lgomp
+#BLASLIB = $(LIB_DIR)libcblas_armplusdsp.a $(LIB_DIR)libblis.a -lOpenCL -locl_util -lstdc++ -lrt -lm -lgomp -lpthread
+BLASLIB = $(LIB_DIR)libcblas_armplusdsp.a -lOpenCL -locl_util -lstdc++ -lrt -lm -lgomp -lpthread
+LAPACKLIB = $(LIB_DIR)libcblaswr.a $(LIB_DIR)liblapack.a $(LIB_DIR)libf2c.a $(LIB_DIR)libcblas_armplusdsp.a -lOpenCL -locl_util -lstdc++ -lrt -lm -lgomp -lpthread
%.o: %.c
$(CC) -c $(CFLAGS) $<
similarity index 100%
rename from examples/matmpy/Makefile
rename to examples/arm+dsp/matmpy/Makefile
rename from examples/matmpy/Makefile
rename to examples/arm+dsp/matmpy/Makefile
diff --git a/examples/arm+dsp/run_tests_evm.sh b/examples/arm+dsp/run_tests_evm.sh
--- /dev/null
@@ -0,0 +1,10 @@
+./matmpy/matmpy > testlog.txt
+./dsyrk_test/dsyrk_test >> testlog.txt
+./ztrsm_test/ztrsm_test >> testlog.txt
+./dgemm_test/dgemm_test >> testlog.txt
+./eig/eig >> testlog.txt
+./ludinv/ludinv >> testlog.txt
+./ztrmm_test/ztrmm_test >> testlog.txt
+
+grep FAIL testlog.txt && echo "tests failed" || echo "All tests have passed."
+
similarity index 100%
rename from examples/ztrmm_test/Makefile
rename to examples/arm+dsp/ztrmm_test/Makefile
rename from examples/ztrmm_test/Makefile
rename to examples/arm+dsp/ztrmm_test/Makefile
similarity index 100%
rename from examples/ztrmm_test/ztrmm_test.c
rename to examples/arm+dsp/ztrmm_test/ztrmm_test.c
rename from examples/ztrmm_test/ztrmm_test.c
rename to examples/arm+dsp/ztrmm_test/ztrmm_test.c
similarity index 100%
rename from examples/ztrsm_test/Makefile
rename to examples/arm+dsp/ztrsm_test/Makefile
rename from examples/ztrsm_test/Makefile
rename to examples/arm+dsp/ztrsm_test/Makefile
similarity index 100%
rename from examples/ztrsm_test/ztrsm_test.c
rename to examples/arm+dsp/ztrsm_test/ztrsm_test.c
rename from examples/ztrsm_test/ztrsm_test.c
rename to examples/arm+dsp/ztrsm_test/ztrsm_test.c
diff --git a/examples/dsponly/Makefile b/examples/dsponly/Makefile
--- /dev/null
@@ -0,0 +1,21 @@
+# Do not echo recipe commands; only their own output is shown.
+.SILENT:
+
+# Every immediate subdirectory that contains a Makefile is treated as an
+# example to build. Simple (:=) assignment so that the wildcard is evaluated
+# once at parse time instead of on every reference to DIRS.
+MFS := $(wildcard */Makefile)
+DIRS := $(patsubst %/Makefile,%,$(MFS))
+
+# Build every example directory. All directories are attempted even when one
+# of them fails, but the recipe exits non-zero if any sub-make failed, so the
+# failure is not hidden by a later directory that builds successfully.
+.PHONY: all
+all:
+	status=0; \
+	for dir in $(DIRS); do \
+		echo "=============== " $$dir " =================" ; \
+		$(MAKE) -C $$dir || status=1; \
+	done; \
+	exit $$status
+
+# Invoke the "run" goal of every example directory. All directories are
+# attempted even when one of them fails, but the recipe exits non-zero if any
+# sub-make failed, so a failing example is reported through the exit status.
+.PHONY: test
+test:
+	status=0; \
+	for dir in $(DIRS); do \
+		echo "=============== " $$dir " =================" ; \
+		$(MAKE) -C $$dir run || status=1; \
+	done; \
+	exit $$status
+
+# Invoke the "clean" goal of every example directory. Every directory is
+# cleaned even when an earlier one fails; the recipe exits non-zero if any
+# sub-make failed. Declared phony so a file named "clean" cannot mask it.
+.PHONY: clean
+clean:
+	status=0; \
+	for dir in $(DIRS); do \
+		$(MAKE) -C $$dir clean || status=1; \
+	done; \
+	exit $$status
diff --git a/examples/dsponly/common/Makefile.common b/examples/dsponly/common/Makefile.common
--- /dev/null
@@ -0,0 +1,143 @@
+# Makefile to build OpenMP applications
+
+#
+# Check if required environment variables are defined
+#
+# The checks are skipped when "clean" is the only goal on the command line.
+ifneq ($(MAKECMDGOALS),clean)
+
+# Variables that must be defined (non-empty) before anything is built.
+# CGTROOT is the path to the C6000 compiler tools; see README.txt for the rest.
+REQUIRED_ENV_VARS := CGTROOT XDC_DIR XDAIS_DIR BIOS_DIR IPC_DIR OMP_DIR \
+                     PDK_DIR FC_DIR EDMA3_DIR LIBARCH_DIR LINALG_DIR
+
+# Stop at the first variable, in list order, that is undefined or empty.
+$(foreach v,$(REQUIRED_ENV_VARS),$(if $(strip $($(v))),,$(error ERROR - $(v) NOT DEFINED, PLEASE REFER TO README.txt)))
+
+endif
+
+
+# Convenience alias: "make default" builds the same thing as "make all".
+.PHONY: default
+default: all
+
+# Base path of the OpenMP configuration: $(CFGDIR).cfg is the configuration
+# file and files under $(CFGDIR)/ (linker.cmd, compiler.opt) are used by the
+# build. The .cfg extension is deliberately not part of this value.
+CFGDIR = $(COMMON_FOLDER)/omp_config
+
+# Set BUILD_TYPE to debug/release to pick appropriate libraries
+BUILD_TYPE = release
+
+# Platform and optimisation selection; skipped when "clean" is the only goal.
+ifneq ($(MAKECMDGOALS),clean)
+
+# Map TARGET to the XDC platform package and add the matching SOC_* define.
+# The comparison is case-sensitive.
+ifeq ($(TARGET),SOC_AM572x)
+XDCPLATFORM = ti.runtime.openmp.platforms.am57x
+CL_OPTS += -DSOC_AM572x
+else ifeq ($(TARGET),SOC_C6678)
+XDCPLATFORM = ti.runtime.openmp.platforms.evm6678
+CL_OPTS += -DSOC_C6678
+else ifeq ($(TARGET),SOC_K2H)
+XDCPLATFORM = ti.runtime.openmp.platforms.evmTCI6636K2H
+CL_OPTS += -DSOC_K2H
+else
+# The spellings listed here must match the ifeq tests above exactly.
+$(error ERROR - TARGET NOT DEFINED. Must specify one of: TARGET=SOC_K2H,TARGET=SOC_C6678,TARGET=SOC_AM572x)
+endif
+
+# Debug builds keep symbols; any other BUILD_TYPE value gets -o3.
+ifeq ($(BUILD_TYPE),debug)
+  CL_OPTS += -g --optimize_with_debug=on
+else
+  CL_OPTS += -o3
+endif
+
+endif
+
+# Objects built from the common-folder sources; they are appended to whatever
+# object list the including makefile has already put in testfiles_obj.
+testfiles_obj += ticblas_config.obj fc_config_c6678.obj
+
+# Compiler flags: options file, target/OpenMP switches, include paths, defines.
+# Kept ahead of the OPT_CMD definition, exactly as before, so the expansion
+# behaviour of $(OPT_CMD) does not change.
+CL_OPTS += -@ $(OPT_CMD) -mv6600 --omp \
+           -I $(OMP_DIR)/packages/ti/runtime/openmp \
+           -I$(FC_DIR)/packages \
+           -I$(XDAIS_DIR)/packages \
+           -I$(EDMA3_DIR)/packages \
+           -I$(LIBARCH_DIR)/include \
+           -I$(LINALG_DIR)/include \
+           -D$(TARGET) -DLIB_RTOS
+
+# Tool locations
+CL  = $(CGTROOT)/bin/cl6x
+XS  = $(XDC_DIR)/xs
+XDC = $(XDC_DIR)/xdc
+
+# XDC settings. The trailing "." on the package path is there so that
+# config.bld is found.
+XDCTARGET = ti.targets.elf.C66
+XDCPATH = $(PDK_DIR)/packages;$(OMP_DIR)/packages;$(BIOS_DIR)/packages;$(IPC_DIR)/packages;$(FC_DIR)/packages;$(XDAIS_DIR)/packages;$(EDMA3_DIR)/packages;.
+
+#
+# Compiler and linker command files
+# NOTE(review): linker.cmd and compiler.opt are presumably produced by the
+# libomp_config step inside $(CFGDIR)/ - confirm.
+#
+OPT_CMD    = $(CFGDIR)/compiler.opt
+LNK_CMD    = $(CFGDIR)/linker.cmd
+LNK_CMD_FC = $(COMMON_FOLDER)/linker_fc.cmd
+LNK_OPTS   = -x -c --priority -w
+
+# Libraries passed to the link step
+RTS_LIB     = $(CGTROOT)/lib/libc.a
+LIBARCH_LIB = $(LIBARCH_DIR)/lib/libArch.ae66
+LINALG_LIB  = $(LINALG_DIR)/lib/libcblas.ae66
+
+# None of these goals names a file that its recipe creates; declare them
+# phony so that a stray file with the same name cannot suppress them.
+.PHONY: all libomp_config libomp_clean
+
+# Build the executable named by $(outfile).
+all: $(outfile)
+
+# Generate and build libomp config packages
+# Runs xdc.tools.configuro on $(CFGDIR).cfg for the selected target, platform
+# and build type.
+libomp_config: $(CFGDIR).cfg
+	@echo making $(CFGDIR) files
+	$(XS) --xdcpath "$(XDCPATH)" xdc.tools.configuro -c $(CGTROOT) --cb -t $(XDCTARGET) -p $(XDCPLATFORM) -r $(BUILD_TYPE) $(CFGDIR).cfg
+
+# Remove the $(CFGDIR)/ directory.
+libomp_clean:
+	@echo Removing $(CFGDIR)
+	@rm -rf $(CFGDIR)/
+
+# Link the executable from the test objects and the runtime, libArch and
+# LINALG libraries. The map file is named after the target with its suffix
+# removed; $(basename $@) is used instead of $* because in an explicit rule
+# $* is empty whenever the target's suffix is not in the .SUFFIXES list.
+$(outfile): libomp_config $(testfiles_obj)
+	echo generating $@
+	$(CL) $(CL_OPTS) -z $(LNK_OPTS) -m $(basename $@).map $(LNK_CMD) $(LNK_CMD_FC) $(testfiles_obj) $(RTS_LIB) $(LIBARCH_LIB) $(LINALG_LIB) --output_file=$@
+
+# Compile the shared sources that live in $(COMMON_FOLDER).
+# $(CL_OPTS) passes "-@ $(OPT_CMD)", a file under $(CFGDIR)/, so libomp_config
+# is listed as an order-only prerequisite: it is run before these compiles
+# (also under "make -j") without forcing the objects to be rebuilt each time.
+# NOTE(review): assumes libomp_config is what creates $(OPT_CMD) - confirm.
+ticblas_config.obj fc_config_c6678.obj: %.obj: $(COMMON_FOLDER)/%.c | libomp_config
+	echo Compiling $<
+	$(CL) $(CL_OPTS) $<
+
+# Neither goal names a real file; declare them phony so that a file called
+# "clean" or "realclean" cannot prevent the recipe from running.
+.PHONY: clean realclean
+
+# Remove the generated configuration directory (via libomp_clean) and the
+# map, executable, object and .mak files in the current directory.
+clean: libomp_clean
+	@rm -rf *.map *.out *.obj *.mak
+
+#
+# Cleans libomp artifacts
+# Performs exactly the same steps as "clean"; kept as a separate goal name.
+#
+realclean: clean
diff --git a/examples/dsponly/common/fc_config_c6678.c b/examples/dsponly/common/fc_config_c6678.c
--- /dev/null
@@ -0,0 +1,834 @@
+/* ======================================================================= */
+/* TEXAS INSTRUMENTS, INC. */
+/* */
+/* FFTLIB FFT Library */
+/* */
+/* Copyright (C) 2013 Texas Instruments Incorporated - http://www.ti.com/ */
+/* */
+/* */
+/* Redistribution and use in source and binary forms, with or without */
+/* modification, are permitted provided that the following conditions */
+/* are met: */
+/* */
+/* Redistributions of source code must retain the above copyright */
+/* notice, this list of conditions and the following disclaimer. */
+/* */
+/* Redistributions in binary form must reproduce the above copyright */
+/* notice, this list of conditions and the following disclaimer in the */
+/* documentation and/or other materials provided with the */
+/* distribution. */
+/* */
+/* Neither the name of Texas Instruments Incorporated nor the names of */
+/* its contributors may be used to endorse or promote products derived */
+/* from this software without specific prior written permission. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS */
+/* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT */
+/* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR */
+/* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT */
+/* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, */
+/* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT */
+/* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, */
+/* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY */
+/* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE */
+/* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* ======================================================================= */
+
+#include <xdc/std.h>
+#include <ti/sdo/edma3/rm/edma3_rm.h>
+#include <ti/sdo/fc/edma3/edma3_config.h>
+
+#define EDMA_MGR_NUM_EDMA_INSTANCES 3
+
+/* In the arrays below, each bit of a 32-bit word corresponds to a single */
+/* PaRAMSet/EDMAChannel/QDMAChannel/TCC owned by the corresponding region, */
+/* i.e., can be used for general purpose EDMA transfers, or reserved for */
+/* EDMA transfers by hardware peripherals (cannot be used for general */
+/* purpose EDMA transfers) */
+
+#define DMA_CHANNEL_TO_EVENT_MAPPING_0 (0x00000000u)
+#define DMA_CHANNEL_TO_EVENT_MAPPING_1 (0x00000000u)
+
+/* EDMA3_InstanceInitConfig sample0: all owned and reserved masks are zero, */
+/* so the region neither owns nor reserves any EDMA resources. */
+#define regionSample0 \
+{ \
+ /* Resources owned by Region */ \
+ /* ownPaRAMSets */ \
+ {0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, \
+ 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, \
+ 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, \
+ 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u}, \
+ \
+ /* ownDmaChannels */ \
+ {0x00000000u, 0x00000000u}, \
+ \
+ /* ownQdmaChannels */ \
+ {0x00000000u}, \
+ \
+ /* ownTccs */ \
+ {0x00000000u, 0x00000000u}, \
+ \
+ /* Resources reserved by Region */ \
+ /* resvdPaRAMSets */ \
+ {0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, \
+ 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, \
+ 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, \
+ 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u}, \
+ \
+ /* resvdDmaChannels (both mapping words are defined as 0x00000000u) */ \
+ {DMA_CHANNEL_TO_EVENT_MAPPING_0, DMA_CHANNEL_TO_EVENT_MAPPING_1}, \
+ \
+ /* resvdQdmaChannels */ \
+ {0x00000000u}, \
+ \
+ /* resvdTccs (same two mapping words as resvdDmaChannels) */ \
+ {DMA_CHANNEL_TO_EVENT_MAPPING_0, DMA_CHANNEL_TO_EVENT_MAPPING_1} \
+}
+
+/* EDMA3_InstanceInitConfig sample1 with region owning PaRAM sets 64-159, */
+/* and EDMA channels/TCCs 0-15, but not reserving any EDMA resources. */
+/* Note that the first N PaRAM sets (N=number of EDMA channels available */
+/* on an EDMA instance) are reserved in the EDMA3 LLD. */
+#define regionSample1 \
+{ \
+ /* Resources owned by Region */ \
+ /* ownPaRAMSets: words 2-4 set => PaRAM sets 64-159 */ \
+ {0x00000000u, 0x00000000u, 0xFFFFFFFFu, 0xFFFFFFFFu, \
+ 0xFFFFFFFFu, 0x00000000u, 0x00000000u, 0x00000000u, \
+ 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, \
+ 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u}, \
+ \
+ /* ownDmaChannels: channels 0-15 */ \
+ {0x0000FFFFu, 0x00000000u}, \
+ \
+ /* ownQdmaChannels */ \
+ {0x00000000u}, \
+ \
+ /* ownTccs: TCCs 0-15 */ \
+ {0x0000FFFFu, 0x00000000u}, \
+ \
+ /* Resources reserved by Region */ \
+ /* resvdPaRAMSets */ \
+ {0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, \
+ 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, \
+ 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, \
+ 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u}, \
+ \
+ /* resvdDmaChannels */ \
+ {DMA_CHANNEL_TO_EVENT_MAPPING_0, DMA_CHANNEL_TO_EVENT_MAPPING_1}, \
+ \
+ /* resvdQdmaChannels */ \
+ {0x00000000u}, \
+ \
+ /* resvdTccs */ \
+ {DMA_CHANNEL_TO_EVENT_MAPPING_0, DMA_CHANNEL_TO_EVENT_MAPPING_1} \
+}
+
+
+/* EDMA3_InstanceInitConfig sample2 with region owning PaRAM sets 160-255, */
+/* and EDMA channels/TCCs 16-31, but not reserving any EDMA resources */
+#define regionSample2 \
+{ \
+ /* Resources owned by Region */ \
+ /* ownPaRAMSets: words 5-7 set => PaRAM sets 160-255 */ \
+ {0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, \
+ 0x00000000u, 0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu, \
+ 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, \
+ 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u}, \
+ \
+ /* ownDmaChannels: channels 16-31 */ \
+ {0xFFFF0000u, 0x00000000u}, \
+ \
+ /* ownQdmaChannels */ \
+ {0x00000000u}, \
+ \
+ /* ownTccs: TCCs 16-31 */ \
+ {0xFFFF0000u, 0x00000000u}, \
+ \
+ /* Resources reserved by Region */ \
+ /* resvdPaRAMSets */ \
+ {0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, \
+ 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, \
+ 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, \
+ 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u}, \
+ \
+ /* resvdDmaChannels */ \
+ {DMA_CHANNEL_TO_EVENT_MAPPING_0, DMA_CHANNEL_TO_EVENT_MAPPING_1}, \
+ \
+ /* resvdQdmaChannels */ \
+ {0x00000000u}, \
+ \
+ /* resvdTccs */ \
+ {DMA_CHANNEL_TO_EVENT_MAPPING_0, DMA_CHANNEL_TO_EVENT_MAPPING_1} \
+}
+
+/* EDMA3_InstanceInitConfig sample3 with region owning PaRAM sets 256-351, */
+/* and EDMA channels/TCCs 32-47, but not reserving any EDMA resources */
+#define regionSample3 \
+{ \
+ /* Resources owned by Region */ \
+ /* ownPaRAMSets: words 8-10 set => PaRAM sets 256-351 */ \
+ {0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, \
+ 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, \
+ 0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu, 0x00000000u, \
+ 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u}, \
+ \
+ /* ownDmaChannels: channels 32-47 */ \
+ {0x00000000u, 0x0000FFFFu}, \
+ \
+ /* ownQdmaChannels */ \
+ {0x00000000u}, \
+ \
+ /* ownTccs: TCCs 32-47 */ \
+ {0x00000000u, 0x0000FFFFu}, \
+ \
+ /* Resources reserved by Region */ \
+ /* resvdPaRAMSets */ \
+ {0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, \
+ 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, \
+ 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, \
+ 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u}, \
+ \
+ /* resvdDmaChannels */ \
+ {DMA_CHANNEL_TO_EVENT_MAPPING_0, DMA_CHANNEL_TO_EVENT_MAPPING_1}, \
+ \
+ /* resvdQdmaChannels */ \
+ {0x00000000u}, \
+ \
+ /* resvdTccs */ \
+ {DMA_CHANNEL_TO_EVENT_MAPPING_0, DMA_CHANNEL_TO_EVENT_MAPPING_1} \
+}
+
+/* EDMA3_InstanceInitConfig sample4 with region owning PaRAM sets 352-447, */
+/* and EDMA channels/TCCs 48-63, but not reserving any EDMA resources */
+#define regionSample4 \
+{ \
+ /* Resources owned by Region */ \
+ /* ownPaRAMSets: words 11-13 set => PaRAM sets 352-447 */ \
+ {0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, \
+ 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, \
+ 0x00000000u, 0x00000000u, 0x00000000u, 0xFFFFFFFFu, \
+ 0xFFFFFFFFu, 0xFFFFFFFFu, 0x00000000u, 0x00000000u}, \
+ \
+ /* ownDmaChannels: channels 48-63 */ \
+ {0x00000000u, 0xFFFF0000u}, \
+ \
+ /* ownQdmaChannels */ \
+ {0x00000000u}, \
+ \
+ /* ownTccs: TCCs 48-63 */ \
+ {0x00000000u, 0xFFFF0000u}, \
+ \
+ /* Resources reserved by Region */ \
+ /* resvdPaRAMSets */ \
+ {0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, \
+ 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, \
+ 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, \
+ 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u}, \
+ \
+ /* resvdDmaChannels */ \
+ {DMA_CHANNEL_TO_EVENT_MAPPING_0, DMA_CHANNEL_TO_EVENT_MAPPING_1}, \
+ \
+ /* resvdQdmaChannels */ \
+ {0x00000000u}, \
+ \
+ /* resvdTccs */ \
+ {DMA_CHANNEL_TO_EVENT_MAPPING_0, DMA_CHANNEL_TO_EVENT_MAPPING_1} \
+}
+
+#define NUM_EDMA_INSTANCES 3
+const EDMA3_InstanceInitConfig C6678_config[NUM_EDMA_INSTANCES][EDMA3_MAX_REGIONS] =
+{
+ /* EDMA3 INSTANCE# 0: every region uses regionSample0 (owns nothing) */
+ { regionSample0, regionSample0, regionSample0, regionSample0,
+ regionSample0, regionSample0, regionSample0, regionSample0
+ },
+ /* EDMA3 INSTANCE# 1: regions 0-3 use regionSample1-4, regions 4-7 own nothing */
+ { regionSample1, regionSample2, regionSample3, regionSample4,
+ regionSample0, regionSample0, regionSample0, regionSample0
+ },
+ /* EDMA3 INSTANCE# 2: regions 0-3 own nothing, regions 4-7 use regionSample1-4 */
+ { regionSample0, regionSample0, regionSample0, regionSample0,
+ regionSample1, regionSample2, regionSample3, regionSample4
+ }
+};
+
+const EDMA3_InstanceInitConfig edmaMgrInstanceInitConfig[EDMA_MGR_NUM_EDMA_INSTANCES][EDMA3_MAX_REGIONS] =
+{
+ /* EDMA3 INSTANCE# 0: every region uses regionSample0 (owns nothing) */
+ { regionSample0, regionSample0, regionSample0, regionSample0,
+ regionSample0, regionSample0, regionSample0, regionSample0
+ },
+ /* EDMA3 INSTANCE# 1: regions 0-3 use regionSample1-4, regions 4-7 own nothing */
+ { regionSample1, regionSample2, regionSample3, regionSample4,
+ regionSample0, regionSample0, regionSample0, regionSample0
+ },
+ /* EDMA3 INSTANCE# 2: regions 0-3 own nothing, regions 4-7 use regionSample1-4 */
+ { regionSample0, regionSample0, regionSample0, regionSample0,
+ regionSample1, regionSample2, regionSample3, regionSample4
+ }
+};
+
+int32_t edmaMgrRegion2Instance[EDMA3_MAX_REGIONS] = {1,1,1,1,2,2,2,2}; /* indexed by region: regions 0-3 -> instance 1, regions 4-7 -> instance 2 */
+
+/* Driver Object Initialization Configuration */
+EDMA3_GblConfigParams edmaMgrGblConfigParams [EDMA_MGR_NUM_EDMA_INSTANCES] =
+ {
+ {
+ /* EDMA3 INSTANCE# 0 */
+ /** Total number of DMA Channels supported by the EDMA3 Controller */
+ 16u,
+ /** Total number of QDMA Channels supported by the EDMA3 Controller */
+ 8u,
+ /** Total number of TCCs supported by the EDMA3 Controller */
+ 16u,
+ /** Total number of PaRAM Sets supported by the EDMA3 Controller */
+ 128u,
+ /** Total number of Event Queues in the EDMA3 Controller */
+ 2u,
+ /** Total number of Transfer Controllers (TCs) in the EDMA3 Controller */
+ 2u,
+ /** Number of Regions on this EDMA3 controller */
+ 8u,
+
+ /**
+ * \brief Channel mapping existence
+ * A value of 0 (No channel mapping) implies that there is fixed association
+ * for a channel number to a parameter entry number or, in other words,
+ * PaRAM entry n corresponds to channel n.
+ */
+ 1u,
+
+ /** Existence of memory protection feature */
+ 1u,
+
+ /** Global Register Region of CC Registers */
+ (void *)0x02700000u,
+ /** Transfer Controller (TC) Registers */
+ {
+ (void *)0x02760000u,
+ (void *)0x02768000u,
+ (void *)NULL,
+ (void *)NULL,
+ (void *)NULL,
+ (void *)NULL,
+ (void *)NULL,
+ (void *)NULL
+ },
+ /** Interrupt no. for Transfer Completion */
+ 38u,
+ /** Interrupt no. for CC Error */
+ 32u,
+ /** Interrupt no. for TCs Error */
+ {
+ 34u,
+ 35u,
+ 0u,
+ 0u,
+ 0u,
+ 0u,
+ 0u,
+ 0u,
+ },
+
+ /**
+ * \brief EDMA3 TC priority setting
+ *
+ * User can program the priority of the Event Queues
+ * at a system-wide level. This means that the user can set the
+ * priority of an IO initiated by either of the TCs (Transfer Controllers)
+ * relative to IO initiated by the other bus masters on the
+ * device (ARM, DSP, USB, etc)
+ */
+ {
+ 0u,
+ 1u,
+ 0u,
+ 0u,
+ 0u,
+ 0u,
+ 0u,
+ 0u
+ },
+ /**
+ * \brief To Configure the Threshold level of number of events
+ * that can be queued up in the Event queues. EDMA3CC error register
+ * (CCERR) will indicate whether or not at any instant of time the
+ * number of events queued up in any of the event queues exceeds
+ * or equals the threshold/watermark value that is set
+ * in the queue watermark threshold register (QWMTHRA).
+ */
+ {
+ 16u,
+ 16u,
+ 0u,
+ 0u,
+ 0u,
+ 0u,
+ 0u,
+ 0u
+ },
+
+ /**
+ * \brief To Configure the Default Burst Size (DBS) of TCs.
+ * An optimally-sized command is defined by the transfer controller
+ * default burst size (DBS). Different TCs can have different
+ * DBS values. It is defined in Bytes.
+ */
+ {
+ 128u,
+ 128u,
+ 0u,
+ 0u,
+ 0u,
+ 0u,
+ 0u,
+ 0u
+ },
+
+ /**
+ * \brief Mapping from each DMA channel to a Parameter RAM set,
+ * if it exists, otherwise of no use.
+ */
+ {
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ /* DMA channels 16-63 do not exist on this instance */
+ EDMA3_MAX_PARAM_SETS, EDMA3_MAX_PARAM_SETS,
+ EDMA3_MAX_PARAM_SETS, EDMA3_MAX_PARAM_SETS,
+ EDMA3_MAX_PARAM_SETS, EDMA3_MAX_PARAM_SETS,
+ EDMA3_MAX_PARAM_SETS, EDMA3_MAX_PARAM_SETS,
+ EDMA3_MAX_PARAM_SETS, EDMA3_MAX_PARAM_SETS,
+ EDMA3_MAX_PARAM_SETS, EDMA3_MAX_PARAM_SETS,
+ EDMA3_MAX_PARAM_SETS, EDMA3_MAX_PARAM_SETS,
+ EDMA3_MAX_PARAM_SETS, EDMA3_MAX_PARAM_SETS,
+ EDMA3_MAX_PARAM_SETS, EDMA3_MAX_PARAM_SETS,
+ EDMA3_MAX_PARAM_SETS, EDMA3_MAX_PARAM_SETS,
+ EDMA3_MAX_PARAM_SETS, EDMA3_MAX_PARAM_SETS,
+ EDMA3_MAX_PARAM_SETS, EDMA3_MAX_PARAM_SETS,
+ EDMA3_MAX_PARAM_SETS, EDMA3_MAX_PARAM_SETS,
+ EDMA3_MAX_PARAM_SETS, EDMA3_MAX_PARAM_SETS,
+ EDMA3_MAX_PARAM_SETS, EDMA3_MAX_PARAM_SETS,
+ EDMA3_MAX_PARAM_SETS, EDMA3_MAX_PARAM_SETS,
+ EDMA3_MAX_PARAM_SETS, EDMA3_MAX_PARAM_SETS,
+ EDMA3_MAX_PARAM_SETS, EDMA3_MAX_PARAM_SETS,
+ EDMA3_MAX_PARAM_SETS, EDMA3_MAX_PARAM_SETS,
+ EDMA3_MAX_PARAM_SETS, EDMA3_MAX_PARAM_SETS,
+ EDMA3_MAX_PARAM_SETS, EDMA3_MAX_PARAM_SETS,
+ EDMA3_MAX_PARAM_SETS, EDMA3_MAX_PARAM_SETS,
+ EDMA3_MAX_PARAM_SETS, EDMA3_MAX_PARAM_SETS,
+ EDMA3_MAX_PARAM_SETS, EDMA3_MAX_PARAM_SETS
+ },
+
+ /**
+ * \brief Mapping from each DMA channel to a TCC. This specific
+ * TCC code will be returned when the transfer is completed
+ * on the mapped channel.
+ */
+ {
+ EDMA3_RM_CH_NO_TCC_MAP, EDMA3_RM_CH_NO_TCC_MAP, EDMA3_RM_CH_NO_TCC_MAP, EDMA3_RM_CH_NO_TCC_MAP,
+ EDMA3_RM_CH_NO_TCC_MAP, EDMA3_RM_CH_NO_TCC_MAP, EDMA3_RM_CH_NO_TCC_MAP, EDMA3_RM_CH_NO_TCC_MAP,
+ EDMA3_RM_CH_NO_TCC_MAP, EDMA3_RM_CH_NO_TCC_MAP, EDMA3_RM_CH_NO_TCC_MAP, EDMA3_RM_CH_NO_TCC_MAP,
+ EDMA3_RM_CH_NO_TCC_MAP, EDMA3_RM_CH_NO_TCC_MAP, EDMA3_RM_CH_NO_TCC_MAP, EDMA3_RM_CH_NO_TCC_MAP,
+ /* DMA channels 16-63 do not exist on this instance */
+ EDMA3_MAX_TCC, EDMA3_MAX_TCC, EDMA3_MAX_TCC, EDMA3_MAX_TCC,
+ EDMA3_MAX_TCC, EDMA3_MAX_TCC, EDMA3_MAX_TCC, EDMA3_MAX_TCC,
+ EDMA3_MAX_TCC, EDMA3_MAX_TCC, EDMA3_MAX_TCC, EDMA3_MAX_TCC,
+ EDMA3_MAX_TCC, EDMA3_MAX_TCC, EDMA3_MAX_TCC, EDMA3_MAX_TCC,
+ EDMA3_MAX_TCC, EDMA3_MAX_TCC, EDMA3_MAX_TCC, EDMA3_MAX_TCC,
+ EDMA3_MAX_TCC, EDMA3_MAX_TCC, EDMA3_MAX_TCC, EDMA3_MAX_TCC,
+ EDMA3_MAX_TCC, EDMA3_MAX_TCC, EDMA3_MAX_TCC, EDMA3_MAX_TCC,
+ EDMA3_MAX_TCC, EDMA3_MAX_TCC, EDMA3_MAX_TCC, EDMA3_MAX_TCC,
+ EDMA3_MAX_TCC, EDMA3_MAX_TCC, EDMA3_MAX_TCC, EDMA3_MAX_TCC,
+ EDMA3_MAX_TCC, EDMA3_MAX_TCC, EDMA3_MAX_TCC, EDMA3_MAX_TCC,
+ EDMA3_MAX_TCC, EDMA3_MAX_TCC, EDMA3_MAX_TCC, EDMA3_MAX_TCC,
+ EDMA3_MAX_TCC, EDMA3_MAX_TCC, EDMA3_MAX_TCC, EDMA3_MAX_TCC
+ },
+
+ /**
+ * \brief Mapping of DMA channels to Hardware Events from
+ * various peripherals, which use EDMA for data transfer.
+ * All channels need not be mapped, some can be free also.
+ */
+ {
+ 0x00000000u,
+ 0x00000000u
+ }
+ },
+
+ {
+ /* EDMA3 INSTANCE# 1 */
+ /** Total number of DMA Channels supported by the EDMA3 Controller */
+ 64u,
+ /** Total number of QDMA Channels supported by the EDMA3 Controller */
+ 8u,
+ /** Total number of TCCs supported by the EDMA3 Controller */
+ 64u,
+ /** Total number of PaRAM Sets supported by the EDMA3 Controller */
+ 512u,
+ /** Total number of Event Queues in the EDMA3 Controller */
+ 4u,
+ /** Total number of Transfer Controllers (TCs) in the EDMA3 Controller */
+ 4u,
+ /** Number of Regions on this EDMA3 controller */
+ 8u,
+
+ /**
+ * \brief Channel mapping existence
+ * A value of 0 (No channel mapping) implies that there is fixed association
+ * for a channel number to a parameter entry number or, in other words,
+ * PaRAM entry n corresponds to channel n.
+ */
+ 1u,
+
+ /** Existence of memory protection feature */
+ 1u,
+
+ /** Global Register Region of CC Registers */
+ (void *)0x02720000u,
+ /** Transfer Controller (TC) Registers */
+ {
+ (void *)0x02770000u,
+ (void *)0x02778000u,
+ (void *)0x02780000u,
+ (void *)0x02788000u,
+ (void *)NULL,
+ (void *)NULL,
+ (void *)NULL,
+ (void *)NULL
+ },
+ /** Interrupt no. for Transfer Completion */
+ 8u,
+ /** Interrupt no. for CC Error */
+ 0u,
+ /** Interrupt no. for TCs Error */
+ {
+ 2u,
+ 3u,
+ 4u,
+ 5u,
+ 0u,
+ 0u,
+ 0u,
+ 0u,
+ },
+
+ /**
+ * \brief EDMA3 TC priority setting
+ *
+ * User can program the priority of the Event Queues
+ * at a system-wide level. This means that the user can set the
+ * priority of an IO initiated by either of the TCs (Transfer Controllers)
+ * relative to IO initiated by the other bus masters on the
+ * device (ARM, DSP, USB, etc)
+ */
+ {
+ 0u,
+ 1u,
+ 2u,
+ 3u,
+ 0u,
+ 0u,
+ 0u,
+ 0u
+ },
+ /**
+ * \brief To Configure the Threshold level of number of events
+ * that can be queued up in the Event queues. EDMA3CC error register
+ * (CCERR) will indicate whether or not at any instant of time the
+ * number of events queued up in any of the event queues exceeds
+ * or equals the threshold/watermark value that is set
+ * in the queue watermark threshold register (QWMTHRA).
+ */
+ {
+ 16u,
+ 16u,
+ 16u,
+ 16u,
+ 0u,
+ 0u,
+ 0u,
+ 0u
+ },
+
+ /**
+ * \brief To Configure the Default Burst Size (DBS) of TCs.
+ * An optimally-sized command is defined by the transfer controller
+ * default burst size (DBS). Different TCs can have different
+ * DBS values. It is defined in Bytes.
+ */
+ {
+ 128u,
+ 64u,
+ 128u,
+ 64u,
+ 0u,
+ 0u,
+ 0u,
+ 0u
+ },
+
+ /**
+ * \brief Mapping from each DMA channel to a Parameter RAM set,
+ * if it exists, otherwise of no use.
+ */
+ {
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP
+ },
+
+ /**
+ * \brief Mapping from each DMA channel to a TCC. This specific
+ * TCC code will be returned when the transfer is completed
+ * on the mapped channel.
+ */
+ {
+ 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u,
+ 8u, 9u, 10u, 11u, 12u, 13u, 14u, 15u,
+ 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u,
+ 24u, 25u, 26u, 27u, 28u, 29u, 30u, 31u,
+ 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u,
+ EDMA3_RM_CH_NO_TCC_MAP, EDMA3_RM_CH_NO_TCC_MAP, EDMA3_RM_CH_NO_TCC_MAP, EDMA3_RM_CH_NO_TCC_MAP,
+ EDMA3_RM_CH_NO_TCC_MAP, EDMA3_RM_CH_NO_TCC_MAP, EDMA3_RM_CH_NO_TCC_MAP, EDMA3_RM_CH_NO_TCC_MAP,
+ EDMA3_RM_CH_NO_TCC_MAP, EDMA3_RM_CH_NO_TCC_MAP, EDMA3_RM_CH_NO_TCC_MAP, EDMA3_RM_CH_NO_TCC_MAP,
+ EDMA3_RM_CH_NO_TCC_MAP, EDMA3_RM_CH_NO_TCC_MAP, EDMA3_RM_CH_NO_TCC_MAP, EDMA3_RM_CH_NO_TCC_MAP,
+ EDMA3_RM_CH_NO_TCC_MAP, EDMA3_RM_CH_NO_TCC_MAP, EDMA3_RM_CH_NO_TCC_MAP, EDMA3_RM_CH_NO_TCC_MAP,
+ EDMA3_RM_CH_NO_TCC_MAP, EDMA3_RM_CH_NO_TCC_MAP, EDMA3_RM_CH_NO_TCC_MAP, EDMA3_RM_CH_NO_TCC_MAP
+ },
+
+ /**
+ * \brief Mapping of DMA channels to Hardware Events from
+ * various peripherals, which use EDMA for data transfer.
+ * All channels need not be mapped, some can be free also.
+ */
+ {
+ 0xFFFFFFFFu,
+ 0xFF000000u
+ }
+ },
+
+ {
+ /* EDMA3 INSTANCE# 2 */
+ /** Total number of DMA Channels supported by the EDMA3 Controller */
+ 64u,
+ /** Total number of QDMA Channels supported by the EDMA3 Controller */
+ 8u,
+ /** Total number of TCCs supported by the EDMA3 Controller */
+ 64u,
+ /** Total number of PaRAM Sets supported by the EDMA3 Controller */
+ 512u,
+ /** Total number of Event Queues in the EDMA3 Controller */
+ 4u,
+ /** Total number of Transfer Controllers (TCs) in the EDMA3 Controller */
+ 4u,
+ /** Number of Regions on this EDMA3 controller */
+ 8u,
+
+ /**
+ * \brief Channel mapping existence
+ * A value of 0 (No channel mapping) implies that there is fixed association
+ * for a channel number to a parameter entry number or, in other words,
+ * PaRAM entry n corresponds to channel n.
+ */
+ 1u,
+
+ /** Existence of memory protection feature */
+ 1u,
+
+ /** Global Register Region of CC Registers */
+ (void *)0x02740000u,
+ /** Transfer Controller (TC) Registers */
+ {
+ (void *)0x02790000u,
+ (void *)0x02798000u,
+ (void *)0x027A0000u,
+ (void *)0x027A8000u,
+ (void *)NULL,
+ (void *)NULL,
+ (void *)NULL,
+ (void *)NULL
+ },
+ /** Interrupt no. for Transfer Completion */
+ 24u,
+ /** Interrupt no. for CC Error */
+ 16u,
+ /** Interrupt no. for TCs Error */
+ {
+ 18u,
+ 19u,
+ 20u,
+ 21u,
+ 0u,
+ 0u,
+ 0u,
+ 0u,
+ },
+
+ /**
+ * \brief EDMA3 TC priority setting
+ *
+ * User can program the priority of the Event Queues
+ * at a system-wide level. This means that the user can set the
+ * priority of an IO initiated by either of the TCs (Transfer Controllers)
+ * relative to IO initiated by the other bus masters on the
+ * device (ARM, DSP, USB, etc)
+ */
+ {
+ 0u,
+ 1u,
+ 2u,
+ 3u,
+ 0u,
+ 0u,
+ 0u,
+ 0u
+ },
+ /**
+ * \brief To Configure the Threshold level of number of events
+ * that can be queued up in the Event queues. EDMA3CC error register
+ * (CCERR) will indicate whether or not at any instant of time the
+ * number of events queued up in any of the event queues exceeds
+ * or equals the threshold/watermark value that is set
+ * in the queue watermark threshold register (QWMTHRA).
+ */
+ {
+ 16u,
+ 16u,
+ 16u,
+ 16u,
+ 0u,
+ 0u,
+ 0u,
+ 0u
+ },
+
+ /**
+ * \brief To Configure the Default Burst Size (DBS) of TCs.
+ * An optimally-sized command is defined by the transfer controller
+ * default burst size (DBS). Different TCs can have different
+ * DBS values. It is defined in Bytes.
+ */
+ {
+ 128u,
+ 64u,
+ 64u,
+ 128u,
+ 0u,
+ 0u,
+ 0u,
+ 0u
+ },
+
+ /**
+ * \brief Mapping from each DMA channel to a Parameter RAM set,
+ * if it exists, otherwise of no use.
+ */
+ {
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP,
+ EDMA3_RM_CH_NO_PARAM_MAP, EDMA3_RM_CH_NO_PARAM_MAP
+ },
+
+ /**
+ * \brief Mapping from each DMA channel to a TCC. This specific
+ * TCC code will be returned when the transfer is completed
+ * on the mapped channel.
+ */
+ {
+ 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u,
+ 8u, 9u, 10u, 11u, 12u, 13u, 14u, 15u,
+ 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u,
+ 24u, 25u, 26u, 27u, 28u, 29u, 30u, 31u,
+ 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u,
+ EDMA3_RM_CH_NO_TCC_MAP, EDMA3_RM_CH_NO_TCC_MAP, EDMA3_RM_CH_NO_TCC_MAP, EDMA3_RM_CH_NO_TCC_MAP,
+ EDMA3_RM_CH_NO_TCC_MAP, EDMA3_RM_CH_NO_TCC_MAP, EDMA3_RM_CH_NO_TCC_MAP, EDMA3_RM_CH_NO_TCC_MAP,
+ EDMA3_RM_CH_NO_TCC_MAP, EDMA3_RM_CH_NO_TCC_MAP, EDMA3_RM_CH_NO_TCC_MAP, EDMA3_RM_CH_NO_TCC_MAP,
+ EDMA3_RM_CH_NO_TCC_MAP, EDMA3_RM_CH_NO_TCC_MAP, EDMA3_RM_CH_NO_TCC_MAP, EDMA3_RM_CH_NO_TCC_MAP,
+ EDMA3_RM_CH_NO_TCC_MAP, EDMA3_RM_CH_NO_TCC_MAP, EDMA3_RM_CH_NO_TCC_MAP, EDMA3_RM_CH_NO_TCC_MAP,
+ EDMA3_RM_CH_NO_TCC_MAP, EDMA3_RM_CH_NO_TCC_MAP, EDMA3_RM_CH_NO_TCC_MAP, EDMA3_RM_CH_NO_TCC_MAP
+ },
+
+ /**
+ * \brief Mapping of DMA channels to Hardware Events from
+ * various peripherals, which use EDMA for data transfer.
+ * All channels need not be mapped, some can be free also.
+ */
+ {
+ 0xFFFFFFFFu,
+ 0xFF000000u
+ }
+ },
+ };
+
+
+int32_t *ti_sdo_fc_edmamgr_region2Instance = (int32_t*)&edmaMgrRegion2Instance[0];
+EDMA3_GblConfigParams *ti_sdo_fc_edmamgr_edma3GblConfigParams = (EDMA3_GblConfigParams*)&edmaMgrGblConfigParams[0];
+EDMA3_InstanceInitConfig *ti_sdo_fc_edmamgr_edma3RegionConfig = (EDMA3_InstanceInitConfig*)&edmaMgrInstanceInitConfig[0][0]; /* NOTE(review): this cast drops the const qualifier of edmaMgrInstanceInitConfig - confirm nothing writes through the pointer */
diff --git a/examples/dsponly/common/linker_fc.cmd b/examples/dsponly/common/linker_fc.cmd
--- /dev/null
@@ -0,0 +1,24 @@
+/* Gather the .fardata and .far input sections of the listed libraries into .fclocalfar in L2SRAM. */
+SECTIONS
+{
+ .fclocalfar :
+ {
+ "edmamgr.ae66" (.fardata)
+ "ecpy.ae66" (.fardata)
+ "edma3Chan.ae66" (.fardata)
+ "edma3.ae66" (.fardata)
+ "rman.ae66" (.fardata)
+ "nullres.ae66" (.fardata)
+ "fcsettings.ae66" (.fardata)
+ "edma3_lld_rm.ae66" (.fardata)
+ /* NOTE(review): ecpy.ae66 has a .fardata entry above but no .far entry below - confirm intentional */
+ "edmamgr.ae66" (.far)
+ "edma3Chan.ae66" (.far)
+ "edma3.ae66" (.far)
+ "rman.ae66" (.far)
+ "nullres.ae66" (.far)
+ "fcsettings.ae66" (.far)
+ "edma3_lld_rm.ae66" (.far)
+ } > L2SRAM
+
+}
diff --git a/examples/dsponly/common/omp_config.cfg b/examples/dsponly/common/omp_config.cfg
--- /dev/null
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2012-2015, Texas Instruments Incorporated
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * * Neither the name of Texas Instruments Incorporated nor the names of
+ * its contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Import configuration for using FC EDMA.
+ * Pulls the ti.sdo.fc RMAN and EdmaMgr modules into the build. RMAN is told
+ * not to use DSKT2 and is given the names of the functions it should call
+ * for persistent allocation and release.
+ * NOTE(review): EdmaMgr_heap_alloc / EdmaMgr_heap_free are referenced by name
+ * only; presumably the EdmaMgr library provides them -- confirm.
+ */
+var RMAN = xdc.useModule('ti.sdo.fc.rman.RMAN');
+RMAN.useDSKT2 = false;
+RMAN.persistentAllocFxn = "EdmaMgr_heap_alloc";
+RMAN.persistentFreeFxn = "EdmaMgr_heap_free";
+var EdmaMgr = xdc.useModule('ti.sdo.fc.edmamgr.EdmaMgr');
+
+/***************************/
+/* SECTION MAPPING */
+/***************************/
+var program = xdc.useModule('xdc.cfg.Program');
+
+// Every section placed by this configuration gets an empty SectionSpec first.
+var placedSections = [
+    ".args", ".bss", ".cinit", ".cio", ".const", ".data", ".far", ".fardata",
+    ".neardata", ".rodata", ".stack", ".switch", ".sysmem", ".text",
+    ".blas_msmc", ".blas_l2"
+];
+for (var sectIdx = 0; sectIdx < placedSections.length; sectIdx++) {
+    program.sectMap[placedSections[sectIdx]] = new Program.SectionSpec();
+}
+
+// Assigns the same load segment to each of the named sections.
+function placeSections(segment, names) {
+    for (var nameIdx = 0; nameIdx < names.length; nameIdx++) {
+        program.sectMap[names[nameIdx]].loadSegment = segment;
+    }
+}
+
+// Core-local memory: .args and .cio must live here, and the BLAS L2 buffer
+// and the default stack are placed here as well.
+placeSections("L2SRAM", [".args", ".cio", ".blas_l2", ".stack"]);
+
+// Data sections whose variables can potentially be 'shared' in OpenMP must be
+// in shared memory. The code sections (.switch, .text) are shared by the
+// cores too and go to the same segment to avoid duplication.
+placeSections("DDR3", [".bss", ".cinit", ".const", ".data", ".far", ".fardata",
+                       ".neardata", ".rodata", ".sysmem", ".switch", ".text"]);
+
+// BLAS buffer kept in MSMC SRAM
+placeSections("MSMCSRAM", [".blas_msmc"]);
+
+// Default stack size: 0x8000 on DRA7XX devices, 0x10000 on everything else
+var deviceName = String(Program.cpu.deviceName);
+program.stack = (deviceName.search("DRA7XX") == -1) ? 0x10000 : 0x8000;
+
+// main() receives no arguments, so .args needs no space
+program.argSize = 0;
+
+/********************************/
+/* OPENMP RUNTIME CONFIGURATION */
+/********************************/
+
+// Include OMP runtime in the build
+var ompSettings = xdc.useModule("ti.runtime.openmp.Settings");
+
+// Set to true if the application uses or has dependencies on BIOS components
+ompSettings.usingRtsc = true;
+
+// Size of the heap shared by all the cores (non-DRA7XX devices and the
+// non-RTSC case). Declared before the usingRtsc test so that both of its
+// branches read a defined value.
+var sharedHeapSize = 0x08000000;
+
+if (ompSettings.usingRtsc)
+{
+    /* Configure OpenMP for BIOS: set the index of the master core, the number
+     * of cores available to the runtime, the memory ranges the runtime may
+     * use, and the heaps.
+     */
+
+    var OpenMP = xdc.useModule('ti.runtime.ompbios.OpenMP');
+
+    // Configure the index of the master core and the number of cores available
+    // to the runtime. The cores are contiguous.
+    OpenMP.masterCoreIdx = 0;
+
+    // Setup number of cores based on the device
+    if (deviceName.search("DRA7XX") != -1) { OpenMP.numCores = 2; }
+    else if (deviceName.search("6670") != -1) { OpenMP.numCores = 4; }
+    else if (deviceName.search("6657") != -1) { OpenMP.numCores = 2; }
+    else { OpenMP.numCores = 8; }
+
+    // Pull in memory ranges described in Platform.xdc to configure the runtime
+    var ddr3 = Program.cpu.memoryMap["DDR3"];
+    var ddr3_nc = Program.cpu.memoryMap["DDR3_NC"];
+    var msmc = Program.cpu.memoryMap["MSMCSRAM"];
+    var msmcNcVirt = Program.cpu.memoryMap["OMP_MSMC_NC_VIRT"];
+    var msmcNcPhy = Program.cpu.memoryMap["OMP_MSMC_NC_PHY"];
+
+    // Initialize the runtime with memory range information
+    if (deviceName.search("DRA7XX") == -1) {
+        OpenMP.msmcBase = msmc.base;
+        OpenMP.msmcSize = msmc.len;
+
+        OpenMP.msmcNoCacheVirtualBase = msmcNcVirt.base;
+        OpenMP.msmcNoCacheVirtualSize = msmcNcVirt.len;
+
+        OpenMP.msmcNoCachePhysicalBase = msmcNcPhy.base;
+    }
+    else
+    {
+        // DRA7XX: no MSMC; stacks come from the heap and the non-cached
+        // region is the DDR3_NC range
+        OpenMP.allocateStackFromHeap = true;
+        OpenMP.allocateStackFromHeapSize = 0x010000;
+
+        OpenMP.hasMsmc = false;
+        OpenMP.ddrNoCacheBase = ddr3_nc.base;
+        OpenMP.ddrNoCacheSize = ddr3_nc.len;
+    }
+
+    OpenMP.ddrBase = ddr3.base;
+    OpenMP.ddrSize = ddr3.len;
+
+    // Configure memory allocation using HeapOMP
+    // HeapOMP handles
+    // - Memory allocation requests from BIOS components (core local memory)
+    // - Shared memory allocation by utilizing the IPC module to enable
+    //   multiple cores to allocate memory out of the same heap - used by malloc
+    if (deviceName.search("DRA7XX") == -1) {
+        var HeapOMP = xdc.useModule('ti.runtime.ompbios.HeapOMP');
+
+        // Shared Region 0 must be initialized for IPC
+        var sharedRegionId = 0;
+
+        // Size of the core local heap
+        var localHeapSize = 0x8000;
+
+        // Initialize a Shared Region & create a heap in the DDR3 memory region
+        var SharedRegion = xdc.useModule('ti.sdo.ipc.SharedRegion');
+        SharedRegion.setEntryMeta( sharedRegionId,
+                                   { base: ddr3.base,
+                                     len: sharedHeapSize,
+                                     ownerProcId: OpenMP.masterCoreIdx,
+                                     cacheEnable: true,
+                                     createHeap: true,
+                                     isValid: true,
+                                     name: "DDR3_SR0",
+                                   });
+
+        // Configure and setup HeapOMP
+        HeapOMP.configure(sharedRegionId, localHeapSize);
+    }
+    else
+    {
+        // DRA7XX: heaps are sized directly on the OpenMP module instead of
+        // going through an IPC shared region
+        OpenMP.useIpcSharedHeap = false;
+        OpenMP.allocateLocalHeapSize = 0x8000;
+        OpenMP.allocateSharedHeapSize = 0x00800000;
+    }
+
+
+    var Startup = xdc.useModule('xdc.runtime.Startup');
+    Startup.lastFxns.$add('&__TI_omp_initialize_rtsc_mode');
+}
+else
+{
+    /* Size the heap. It must be placed in shared memory */
+    program.heap = sharedHeapSize;
+}
diff --git a/examples/dsponly/common/ticblas_config.c b/examples/dsponly/common/ticblas_config.c
--- /dev/null
@@ -0,0 +1,179 @@
+/******************************************************************************
+ * Copyright (c) 2015, Texas Instruments Incorporated - http://www.ti.com
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include <omp.h>
+#include <string.h>
+#include <stdio.h>
+#include <libarch.h>
+#include <ticblas.h>
+#include <cblas.h>
+
+/*
+ * use small memory model of BLAS
+ *
+ * Sizes of the buffers handed to TI CBLAS, chosen by the SOC_* macro the
+ * build defines. Both targets are tested with defined(), so a macro that is
+ * defined without a value (e.g. "#define SOC_K2H") is accepted as well.
+ */
+#if defined(SOC_C6678)
+#  define BLAS_L2_BUF_SIZE (220*1024UL) /* 220KB SRAM is available in L2 for C6678 EVM */
+#  define BLAS_MSMC_BUF_SIZE (2*1024*1024UL) /* reserve 2MB for BLAS */
+#  define BLAS_L3_DDR_SIZE (5120)
+#elif defined(SOC_K2H)
+#  define BLAS_L2_BUF_SIZE (768*1024UL)
+#  define BLAS_MSMC_BUF_SIZE (4608*1024UL) /* 4.5MB */
+#  define BLAS_L3_DDR_SIZE (5120)
+#else
+#  error "Target undefined! Must be one of SOC_C6678 or SOC_K2H"
+#endif
+
+/*
+ * l1D_SRAM_size_orig: L1D SRAM size recorded by config_mem_for_ticblas() and
+ * restored by reconfig_mem_after_ticblas().
+ * l2_SRAM_size_orig: not referenced anywhere else in this file.
+ */
+size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+/* define MSMC memory for BLAS - can be shared with other libraries */
+/* placed in the .blas_msmc section with 32-byte alignment */
+#pragma DATA_SECTION(blas_msmc_buf, ".blas_msmc")
+#pragma DATA_ALIGN(blas_msmc_buf,32)
+char blas_msmc_buf[BLAS_MSMC_BUF_SIZE];
+
+/* define L2 memory for BLAS - can be shared with other libraries */
+/* placed in the .blas_l2 section with 32-byte alignment */
+#pragma DATA_SECTION(blas_l2_buf, ".blas_l2")
+#pragma DATA_ALIGN(blas_l2_buf,32)
+char blas_l2_buf[BLAS_L2_BUF_SIZE];
+
+/*
+ * Buffer passed to config_mem_for_ticblas() as the DDR buffer. It has no
+ * DATA_SECTION or DATA_ALIGN pragma, so its placement and alignment are the
+ * toolchain defaults.
+ * NOTE(review): prepare_for_ticblas() casts this char array to double * --
+ * confirm the default alignment is sufficient.
+ */
+char blas_ddr_buf[BLAS_L3_DDR_SIZE];
+
+
+/*==============================================================================
+ * This function configures and initializes memory for BLAS calls
+ *
+ * Steps:
+ *   1. query the sizes TI CBLAS requires (tiCblasGetSizes) and compare them
+ *      with the total L1D size and the sizes of the supplied buffers;
+ *   2. record the current L1D SRAM size in l1D_SRAM_size_orig and, when it is
+ *      smaller than required, reconfigure L1D SRAM on every OpenMP thread;
+ *   3. pass the L1D SRAM region and the three buffers to tiCblasInit().
+ *
+ * Parameters:
+ *   l2_buf,   l2_buf_size   - buffer checked against the "fast" size
+ *   msmc_buf, msmc_buf_size - buffer checked against the "medium" size
+ *   ddr_buf,  ddr_buf_size  - buffer checked against the "slow" size
+ *
+ * Returns the value returned by tiCblasInit().
+ * Terminates the program with exit(1) when the provided memory is too small
+ * or when the L1D configuration fails on any thread.
+ *============================================================================*/
+int config_mem_for_ticblas(double *l2_buf, size_t l2_buf_size,
+                           double *msmc_buf, size_t msmc_buf_size,
+                           double *ddr_buf, size_t ddr_buf_size)
+{
+    size_t smem_size_vfast, smem_size_fast, smem_size_med, smem_size_slow;
+    void *l1d_SRAM_ptr;
+
+    /* First, verify the provided/available memory meet requirements */
+    tiCblasGetSizes(&smem_size_vfast, &smem_size_fast, &smem_size_med, &smem_size_slow);
+
+    /* size_t values are converted to unsigned long for printing; %d expects int */
+    printf("BLAS memory requirements - vfast size: %lu, fast size: %lu, medium size: %lu, slow size: %lu.\n",
+           (unsigned long)smem_size_vfast, (unsigned long)smem_size_fast,
+           (unsigned long)smem_size_med, (unsigned long)smem_size_slow);
+
+    if( (smem_size_vfast> lib_get_L1D_total_size()) /* total available L1D */
+      ||(smem_size_fast > l2_buf_size)              /* provided L2 size */
+      ||(smem_size_med  > msmc_buf_size)            /* provided MSMC memory */
+      ||(smem_size_slow > ddr_buf_size)             /* provided DDR memory */
+      ) {
+        printf("Provided memory is not enough for BLAS!\n");
+        exit(1); /* non-zero status: this is a failure, like the other error exits here */
+    }
+
+    /* Configure L1D if necessary */
+    l1D_SRAM_size_orig = lib_get_L1D_SRAM_size(); /* get current L1D SRAM size */
+    printf("Original L1D SRAM size is: %lu\n", (unsigned long)l1D_SRAM_size_orig);
+    printf("Required L1D SRAM size is: %lu\n", (unsigned long)smem_size_vfast);
+    if(l1D_SRAM_size_orig < smem_size_vfast) { /* configure L1D if needs more SRAM */
+        #pragma omp parallel
+        {
+            /* declared inside the region so every thread checks its own
+               result instead of racing on one shared variable */
+            int l1d_cfg_err = lib_L1D_config_SRAM(smem_size_vfast);
+            if(l1d_cfg_err) {
+                printf("L1D configuration fails on core %d!\n", lib_get_coreID());
+                exit(1);
+            }
+        }
+    }
+
+    #pragma omp parallel
+    {
+        printf("New L1D SRAM size on core %d is: %d\n", lib_get_coreID(), lib_get_L1D_SRAM_size());
+    }
+
+    /* get L1D SRAM base address */
+    l1d_SRAM_ptr = lib_get_L1D_SRAM_base();
+    printf("L1D SRAM base address is 0x%x.\n", (unsigned int)l1d_SRAM_ptr);
+
+    /* pass allocated memories for heap initialization */
+    return(tiCblasInit(l1d_SRAM_ptr, lib_get_L1D_SRAM_size(),
+                       l2_buf, l2_buf_size,
+                       msmc_buf, msmc_buf_size,
+                       ddr_buf, ddr_buf_size));
+} /* config_mem_for_ticblas */
+
+/*==============================================================================
+ * This function reconfigures L1D after processing is finished
+ *
+ * When the current L1D SRAM size differs from the size recorded in
+ * l1D_SRAM_size_orig, every OpenMP thread sets its L1D SRAM back to the
+ * recorded size. Terminates the program with exit(2) if that fails on any
+ * thread.
+ *
+ * Returns TICBLAS_SUCCESS.
+ *============================================================================*/
+int reconfig_mem_after_ticblas()
+{
+    /* configure L1D back */
+    if(l1D_SRAM_size_orig!=lib_get_L1D_SRAM_size()) {
+        #pragma omp parallel
+        {
+            /* declared inside the region so every thread checks its own
+               result instead of racing on one shared variable */
+            int l1d_cfg_err = lib_L1D_config_SRAM(l1D_SRAM_size_orig);
+            if(l1d_cfg_err) {
+                printf("L1D reconfiguration fails on core %d!\n", lib_get_coreID());
+                exit(2);
+            }
+        }
+    }
+
+    printf("L1D SRAM size reconfigured to: %d\n", lib_get_L1D_SRAM_size());
+
+    return(TICBLAS_SUCCESS);
+} /* reconfig_mem_after_ticblas */
+
+
+/*==============================================================================
+ * This function prepares for calling TI's CBLAS
+ *
+ * Prints the L2 SRAM and total L2 sizes, creates the TI CBLAS instance with
+ * tiCblasNew(), and configures memory through config_mem_for_ticblas() using
+ * the statically defined blas_l2_buf, blas_msmc_buf and blas_ddr_buf buffers.
+ * Terminates the program with exit(1) when the memory configuration reports
+ * an error.
+ *============================================================================*/
+void prepare_for_ticblas()
+{
+    int err;
+
+    printf("L2 SRAM size is %d, total L2 size is %d.\n", lib_get_L2_SRAM_size(), lib_get_L2_total_size());
+
+    /* Call TI CBLAS API to create new CBLAS instance */
+    tiCblasNew();
+
+    /* Configure memory for TI CBLAS if necessary */
+    err = config_mem_for_ticblas((double *)blas_l2_buf, (size_t)BLAS_L2_BUF_SIZE,
+                                 (double *)blas_msmc_buf, (size_t)BLAS_MSMC_BUF_SIZE,
+                                 (double *)blas_ddr_buf, (size_t)BLAS_L3_DDR_SIZE);
+
+    if(err) {
+        printf("Memory configuration for CBLAS failed with error code %d.\n", err);
+        exit(1); /* non-zero status so the failure is visible to the caller */
+    }
+}
+
+/*
+ * Counterpart of prepare_for_ticblas(): restores the memory configuration,
+ * reports it when the restore call returns TICBLAS_SUCCESS, and then deletes
+ * the TI CBLAS instance.
+ */
+void cleanup_after_ticblas()
+{
+    int reconfig_status;
+
+    /* Reconfigure memory if necessary */
+    reconfig_status = reconfig_mem_after_ticblas();
+    if(TICBLAS_SUCCESS == reconfig_status) {
+        printf("Memory reconfiguration after BLAS call finished.\n");
+    }
+
+    tiCblasDelete();
+}
diff --git a/examples/dsponly/dgemm_test/Makefile b/examples/dsponly/dgemm_test/Makefile
--- /dev/null
@@ -0,0 +1,12 @@
+
+# Build settings for the dgemm_test example.
+# COMMON_FOLDER locates the shared makefile included below; CL_OPTS is passed
+# to the compiler by the pattern rule at the end of this file.
+# NOTE(review): testfiles_obj and outfile are not used in this file;
+# presumably Makefile.common consumes them -- confirm.
+COMMON_FOLDER = ../common
+testfiles_obj = dgemm_test.obj
+outfile = dgemm_test.out
+CL_OPTS =
+
+include $(COMMON_FOLDER)/Makefile.common
+
+# Compile each .c file with $(CL) $(CL_OPTS); the "Compiling" message is
+# echoed after the compiler has run.
+# NOTE(review): $(CL) is not set in this file; presumably Makefile.common
+# defines it -- confirm.
+%.obj: %.c
+ $(CL) $(CL_OPTS) $<
+ echo Compiling $<
+
diff --git a/examples/dsponly/dgemm_test/dgemm_test.c b/examples/dsponly/dgemm_test/dgemm_test.c
--- /dev/null
@@ -0,0 +1,214 @@
+/******************************************************************************
+ * Copyright (c) 2015, Texas Instruments Incorporated - http://www.ti.com
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+/******************************************************************************
+* FILE: dgemm_test.c
+******************************************************************************/
+#include <omp.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <libarch.h>
+#include <ticblas.h>
+#include <cblas.h>
+
+/* Divisor main() applies to a flop/s figure to report it in GFLOPS */
+#define FLOPS_PER_UNIT_PERF 1e9
+
+/* TI CBLAS setup/teardown helpers, defined outside this file */
+extern void cleanup_after_ticblas();
+extern void prepare_for_ticblas();
+/* Wall-clock time source used by main() to time the dgemm call */
+extern double omp_get_wtime(void);
+
+/* Local helpers, defined after main() */
+void matrix_gen(double *A, double *B, double *C, int m, int k, int n);
+void mat_mpy(const double * A, const double * B, double * C, int mat_N,
+ int mat_K, int mat_M, double alpha, double beta);
+double dotprod(const double * A, const double * B, int n);
+void print_matrix(double *mat, int m, int n);
+double diff_matrix(double *mat1, double * mat2, int m, int n);
+
+/*
+ * dgemm example: checks that OpenMP threads start, multiplies two random
+ * matrices with cblas_dgemm (row-major, no transposes), reports the elapsed
+ * time and GFLOPS, and prints the largest absolute difference against the
+ * straightforward reference implementation mat_mpy().
+ *
+ * A is m x k, B is k x n, C and C_copy are m x n.
+ * Returns 0 on completion and 1 when the matrices cannot be allocated.
+ */
+int main (int argc, char *argv[])
+{
+    double *A, *B, *C, *C_copy;
+    int m, n, k;
+    double alpha, beta, precision_diff, time, time_diff, gflops;
+
+    int nthreads, tid;
+
+    /* Verify OpenMP working properly */
+    #pragma omp parallel private(nthreads, tid)
+    {
+        tid = omp_get_thread_num(); /* Obtain thread number */
+        printf("Hello World from thread = %d\n", tid);
+
+        /* Only master thread does this */
+        if (tid == 0) {
+            nthreads = omp_get_num_threads();
+            printf("Number of threads = %d\n", nthreads);
+        }
+
+    } /* All threads join master thread and disband */
+
+    /* hard code dgemm parameters */
+    m = k = n = 1000;
+    alpha = 0.7;
+    beta = 1.3;
+
+    /* Allocate memory for matrices (sizes computed in size_t) */
+    A = (double *)malloc( (size_t)m*k*sizeof( double ) );
+    B = (double *)malloc( (size_t)k*n*sizeof( double ) );
+    C = (double *)malloc( (size_t)m*n*sizeof( double ) );
+    C_copy = (double *)malloc( (size_t)m*n*sizeof( double ) );
+    if (A == NULL || B == NULL || C == NULL || C_copy == NULL) {
+        printf( "\nERROR: Can't allocate memory for matrices. Aborting... \n\n");
+        /* free(NULL) is a no-op, so every pointer can be released here */
+        free(A);
+        free(B);
+        free(C);
+        free(C_copy);
+        return 1;
+    }
+
+    /* Initialize random number generator */
+    srand(123456789);
+
+    /* Configure memory and initialize TI CBLAS */
+    prepare_for_ticblas();
+
+    /* Generate matrices */
+    matrix_gen(A, B, C, m, k, n);
+    memcpy(C_copy, C, (size_t)m*n*sizeof(double));
+
+    /* Call standard CBLAS API for dgemm */
+    time = omp_get_wtime();
+    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, alpha, A, k, B, n, beta, C, n);
+    time_diff = omp_get_wtime() - time;
+    gflops = ( 2.0 * m * n * k ) / time_diff / FLOPS_PER_UNIT_PERF;
+    printf("DGEMM time for (m,n,k) = (%d,%d,%d) is %e, GFLOPS is %e.\n", m,n,k, time_diff, gflops);
+
+    /* Straightforward matrix multiplication as reference.
+       mat_mpy takes (rows of A, inner dimension, columns of B) = (m, k, n). */
+    mat_mpy(A, B, C_copy, m, k, n, alpha, beta);
+
+    /* Find the difference between dgemm and reference; C is m x n */
+    precision_diff = diff_matrix(C, C_copy, m, n);
+    printf("Precision error is %e.\n", precision_diff);
+
+    /* Finalize TI CBLAS and reconfigure memory */
+    cleanup_after_ticblas();
+
+    free(A);
+    free(B);
+    free(C);
+    free(C_copy);
+
+    return 0;
+}
+
+/*==============================================================================
+ * Fills A (m*k values), then B (k*n values), then C (m*n values) with
+ * successive rand() results scaled by RAND_MAX and shifted down by 0.5.
+ *============================================================================*/
+void matrix_gen(double *A, double *B, double *C, int m, int k, int n)
+{
+    double *target[3];
+    int     length[3];
+    int     t, idx;
+
+    /* the fill order A, B, C fixes which random values each matrix receives */
+    target[0] = A;  length[0] = m*k;
+    target[1] = B;  length[1] = k*n;
+    target[2] = C;  length[2] = m*n;
+
+    for (t = 0; t < 3; ++t) {
+        for (idx = 0; idx < length[t]; ++idx) {
+            target[t][idx] = (double)rand()/RAND_MAX - 0.5;
+        }
+    }
+}
+
+
+/******************************************************************************
+* Reference row-major matrix multiply: C = alpha*A*B + beta*C.
+* A has mat_N rows of mat_K values, B has mat_K rows of mat_M values and
+* C has mat_N rows of mat_M values. Each column of B is first gathered into
+* a contiguous scratch array so dotprod() can be applied to it.
+******************************************************************************/
+void mat_mpy(const double * A, const double * B, double * C, int mat_N,
+             int mat_K, int mat_M, double alpha, double beta)
+{
+    double gathered[mat_K];   /* current column of B, made contiguous */
+    int c, r;
+
+    c = 0;
+    while (c < mat_M)
+    {
+        for (r = 0; r < mat_K; r++)
+        {
+            gathered[r] = B[r*mat_M+c];
+        }
+
+        for (r = 0; r < mat_N; r++)
+        {
+            int at = r*mat_M+c;
+            C[at] = alpha*dotprod(A + (r * mat_K), gathered, mat_K)
+                    + beta*C[at];
+        }
+
+        c++;
+    }
+}
+
+/******************************************************************************
+* dot product for matrix multiplication
+*
+* Returns the sum of A[i]*B[i] for i in [0, n); 0 when n <= 0.
+* The sum is accumulated in double precision so the reference result is not
+* truncated to single precision before it is compared with dgemm.
+******************************************************************************/
+double dotprod(const double * A, const double * B, int n)
+{
+    int i;
+    double result = 0.0;
+
+    for (i = 0; i < n; ++i) result += A[i] * B[i];
+
+    return result;
+}
+
+/******************************************************************************
+* Writes an m x n row-major matrix to stdout, one matrix row per output
+* line, each value formatted with " %10.5f ".
+******************************************************************************/
+void print_matrix(double *mat, int m, int n)
+{
+    int r, c;
+
+    for (r = 0; r < m; ++r) {
+        int row_start = r*n;   /* index of the first value of row r */
+
+        c = 0;
+        while (c < n) {
+            printf( " %10.5f ", mat[row_start + c]);
+            ++c;
+        }
+        printf( "\n" );
+    }
+}
+
+/******************************************************************************
+* Returns the largest absolute element-wise difference between two m x n
+* row-major matrices; 0.0 when no element is visited.
+******************************************************************************/
+double diff_matrix(double *mat1, double * mat2, int m, int n)
+{
+    double worst = 0.0;
+    int r, c;
+
+    for (r = 0; r < m; ++r)
+    {
+        int row_start = r*n;
+
+        for (c = 0; c < n; ++c)
+        {
+            double delta = fabs(mat1[row_start + c] - mat2[row_start + c]);
+
+            if (delta > worst) {
+                worst = delta;
+            }
+        }
+    }
+
+    return worst;
+}
+
+/* Nothing past this point */
diff --git a/examples/dsponly/readme.txt b/examples/dsponly/readme.txt
--- /dev/null
@@ -0,0 +1,17 @@
+To build the linalg examples, follow the instructions below:
+
+1. A target must be defined, e.g. make TARGET=SOC_C6678 or make TARGET=SOC_K2H or make TARGET=SOC_AM572x
+2. The following environment variables must be defined (the version numbers only serve as examples):
+export BIOS_DIR=<installation_folder>/bios_6_45_00_17
+export IPC_DIR=<installation_folder>/ipc_3_41_00_06_eng
+export XDC_DIR=<installation_folder>/xdctools_3_31_02_38_core
+export OMP_DIR=<installation_folder>/openmp_dsp_c667x_2_02_00_02
+export C6636_PDK_DIR=<installation_folder>/pdk_k2hk_4_0_0
+export C6678_PDK_DIR=<installation_folder>/pdk_c667x_2_0_0
+export PDK_DIR=<installation_folder>/pdk_c667x_2_0_0
+export CGTROOT=<installation_folder>/ti-cgt-c6000_8.0.3
+export XDAIS_DIR=<installation_folder>/xdais_7_24_00_04
+export FC_DIR=<installation_folder>/framework_components_3_40_01_04
+export LIBARCH_DIR=<installation_folder>/libarch
+export EDMA3_DIR=<installation_folder>/edma3_lld_02_12_01_22
+export PATH=<installation_folder>/ti-cgt-c6000_8.0.3/bin:$PATH
\ No newline at end of file
diff --git a/examples/dsponly/setup_env_rtos_yocto.sh b/examples/dsponly/setup_env_rtos_yocto.sh
--- /dev/null
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+# Exports the tool and component locations (code generation tools, XDC,
+# XDAIS, BIOS, IPC, OpenMP, PDK, Framework Components, EDMA3 LLD, libarch,
+# linalg) as environment variables.
+# Every value is an absolute path below /home/a0869574local, so the paths
+# must be edited to match the local installation before use.
+# NOTE(review): the exports only reach the caller's shell if this file is
+# sourced; presumably that is the intended use -- confirm.
+export CGTROOT="/home/a0869574local/yocoto/tisdk-rtos/build/arago-tmp-external-linaro-toolchain/sysroots/x86_64-linux/usr/share/ti/cgt-c6x"
+export XDC_DIR="/home/a0869574local/yocoto/tisdk-rtos/build/arago-tmp-external-linaro-toolchain/sysroots/c667x-evm/usr/share/ti/ti-xdctools-tree"
+export XDAIS_DIR="/home/a0869574local/yocoto/tisdk-rtos/build/arago-tmp-external-linaro-toolchain/sysroots/c667x-evm/usr/share/ti/ti-xdais-tree"
+export BIOS_DIR="/home/a0869574local/yocoto/tisdk-rtos/build/arago-tmp-external-linaro-toolchain/sysroots/c667x-evm/usr/share/ti/ti-sysbios-tree"
+export IPC_DIR="/home/a0869574local/yocoto/tisdk-rtos/build/arago-tmp-external-linaro-toolchain/sysroots/c667x-evm/usr/share/ti/ti-ipc-tree"
+export OMP_DIR="/home/a0869574local/yocoto/tisdk-rtos/build/arago-tmp-external-linaro-toolchain/sysroots/c667x-evm/usr/share/ti/ti-omp-tree"
+export C6678_PDK_DIR="/home/a0869574local/yocoto/tisdk-rtos/build/arago-tmp-external-linaro-toolchain/sysroots/c667x-evm/usr/share/ti/ti-pdk-tree"
+export FC_DIR="/home/a0869574local/yocoto/tisdk-rtos/build/arago-tmp-external-linaro-toolchain/sysroots/c667x-evm/usr/share/ti/ti-framework-components-tree"
+export EDMA3_DIR="/home/a0869574local/yocoto/tisdk-rtos/build/arago-tmp-external-linaro-toolchain/sysroots/c667x-evm/usr/share/ti/ti-edma3lld-tree"
+export LIBARCH_DIR="/home/a0869574local/yocoto/tisdk-rtos/build/arago-tmp-external-linaro-toolchain/sysroots/c667x-evm/usr/share/ti/ti-libarch-tree"
+export LINALG_DIR="/home/a0869574local/yocoto/tisdk-rtos/build/arago-tmp-external-linaro-toolchain/sysroots/c667x-evm/usr/share/ti/ti-linalg-tree"
+# PATH is replaced by a fixed list of directories; the value does not
+# include the caller's existing $PATH.
+export PATH="/home/a0869574local/yocoto/tisdk-rtos/sources/oe-core/scripts:/home/a0869574local/yocoto/tisdk-rtos/build/arago-tmp-external-linaro-toolchain/sysroots/x86_64-linux/usr/bin/arm-linux-gnueabi:/home/a0869574local/yocoto/tisdk-rtos/build/arago-tmp-external-linaro-toolchain/sysroots/c667x-evm/usr/bin/crossscripts:/home/a0869574local/yocoto/tisdk-rtos/build/arago-tmp-external-linaro-toolchain/sysroots/x86_64-linux/usr/sbin:/home/a0869574local/yocoto/tisdk-rtos/build/arago-tmp-external-linaro-toolchain/sysroots/x86_64-linux/usr/bin:/home/a0869574local/yocoto/tisdk-rtos/build/arago-tmp-external-linaro-toolchain/sysroots/x86_64-linux/sbin:/home/a0869574local/yocoto/tisdk-rtos/build/arago-tmp-external-linaro-toolchain/sysroots/x86_64-linux/bin:/home/a0869574local/gcc-linaro-4.9-2015.05-x86_64_arm-linux-gnueabihf/bin:/home/a0869574local/gcc-linaro-4.9-2015.05-x86_64_arm-linux-gnueabihf/bin:/home/a0869574local/yocoto/tisdk-rtos/sources/oe-core/scripts:/home/a0869574local/yocoto/tisdk-rtos/sources/bitbake/bin:/home/a0869574local/gcc-linaro-4.9-2015.05-x86_64_arm-linux-gnueabihf/bin:/home/a0869574local/yocoto/tisdk-rtos/sources/oe-core/scripts:/home/a0869574local/yocoto/tisdk-rtos/sources/bitbake/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games"
diff --git a/git_add.sh b/git_add.sh
--- /dev/null
+++ b/git_add.sh
@@ -0,0 +1,403 @@
+#!/bin/bash
+#
+# Stages a fixed list of paths, one git command per entry.  Each line of
+# the list below is "<subcommand> <path>": "add" runs "git add <path>" and
+# "rm" runs "git rm <path>".  Entries run in list order, a failing entry
+# does not stop the ones after it, and the script's exit status is that of
+# the last entry.
+#
+# The list arrives on file descriptor 3 so that the git commands keep the
+# caller's standard input.
+
+while read -r action path <&3; do
+    # Blank lines only separate groups in the list.
+    [ -n "$action" ] || continue
+    git "$action" "$path"
+done 3<<'EOF'
+add Makefile
+add blasblisacc/Makefile
+add blasblisacc/src/Makefile
+add blasblisacc/src/Makefile.ARM
+rm blasblisacc/src/blas_wrap_gen.sh
+add blasblisacc/src/facade.c
+rm blasblisacc/src/oclgen.pl
+add blasblisacc/src/ofld_tbl_cgemm.c
+add blasblisacc/src/ofld_tbl_csyrk.c
+add blasblisacc/src/ofld_tbl_ctrmm.c
+add blasblisacc/src/ofld_tbl_ctrsm.c
+add blasblisacc/src/ofld_tbl_dgemm.c
+add blasblisacc/src/ofld_tbl_dsyrk.c
+add blasblisacc/src/ofld_tbl_dtrmm.c
+add blasblisacc/src/ofld_tbl_dtrsm.c
+add blasblisacc/src/ofld_tbl_sgemm.c
+add blasblisacc/src/ofld_tbl_ssyrk.c
+add blasblisacc/src/ofld_tbl_strmm.c
+add blasblisacc/src/ofld_tbl_strsm.c
+add blasblisacc/src/ofld_tbl_zgemm.c
+add blasblisacc/src/ofld_tbl_zsyrk.c
+add blasblisacc/src/ofld_tbl_ztrmm.c
+add blasblisacc/src/ofld_tbl_ztrsm.c
+rm blasblisacc/src/ti_cblas.h
+add blasblisacc/src/ti_cblas_cblas_caxpy.c
+add blasblisacc/src/ti_cblas_cblas_ccopy.c
+add blasblisacc/src/ti_cblas_cblas_cdotc_sub.c
+add blasblisacc/src/ti_cblas_cblas_cdotu_sub.c
+add blasblisacc/src/ti_cblas_cblas_cgbmv.c
+add blasblisacc/src/ti_cblas_cblas_cgemm.c
+add blasblisacc/src/ti_cblas_cblas_cgemv.c
+add blasblisacc/src/ti_cblas_cblas_cgerc.c
+add blasblisacc/src/ti_cblas_cblas_cgeru.c
+add blasblisacc/src/ti_cblas_cblas_chbmv.c
+add blasblisacc/src/ti_cblas_cblas_chemm.c
+add blasblisacc/src/ti_cblas_cblas_chemv.c
+add blasblisacc/src/ti_cblas_cblas_cher.c
+add blasblisacc/src/ti_cblas_cblas_cher2.c
+add blasblisacc/src/ti_cblas_cblas_cher2k.c
+add blasblisacc/src/ti_cblas_cblas_cherk.c
+add blasblisacc/src/ti_cblas_cblas_chpmv.c
+add blasblisacc/src/ti_cblas_cblas_chpr.c
+add blasblisacc/src/ti_cblas_cblas_chpr2.c
+add blasblisacc/src/ti_cblas_cblas_crotg.c
+add blasblisacc/src/ti_cblas_cblas_cscal.c
+add blasblisacc/src/ti_cblas_cblas_csscal.c
+add blasblisacc/src/ti_cblas_cblas_cswap.c
+add blasblisacc/src/ti_cblas_cblas_csymm.c
+add blasblisacc/src/ti_cblas_cblas_csyr2k.c
+add blasblisacc/src/ti_cblas_cblas_csyrk.c
+add blasblisacc/src/ti_cblas_cblas_ctbmv.c
+add blasblisacc/src/ti_cblas_cblas_ctbsv.c
+add blasblisacc/src/ti_cblas_cblas_ctpmv.c
+add blasblisacc/src/ti_cblas_cblas_ctpsv.c
+add blasblisacc/src/ti_cblas_cblas_ctrmm.c
+add blasblisacc/src/ti_cblas_cblas_ctrmv.c
+add blasblisacc/src/ti_cblas_cblas_ctrsm.c
+add blasblisacc/src/ti_cblas_cblas_ctrsv.c
+add blasblisacc/src/ti_cblas_cblas_dasum.c
+add blasblisacc/src/ti_cblas_cblas_daxpy.c
+add blasblisacc/src/ti_cblas_cblas_dcopy.c
+add blasblisacc/src/ti_cblas_cblas_ddot.c
+add blasblisacc/src/ti_cblas_cblas_dgbmv.c
+add blasblisacc/src/ti_cblas_cblas_dgemm.c
+add blasblisacc/src/ti_cblas_cblas_dgemv.c
+add blasblisacc/src/ti_cblas_cblas_dger.c
+add blasblisacc/src/ti_cblas_cblas_dnrm2.c
+add blasblisacc/src/ti_cblas_cblas_drot.c
+add blasblisacc/src/ti_cblas_cblas_drotg.c
+add blasblisacc/src/ti_cblas_cblas_drotm.c
+add blasblisacc/src/ti_cblas_cblas_drotmg.c
+add blasblisacc/src/ti_cblas_cblas_dsbmv.c
+add blasblisacc/src/ti_cblas_cblas_dscal.c
+add blasblisacc/src/ti_cblas_cblas_dsdot.c
+add blasblisacc/src/ti_cblas_cblas_dspmv.c
+add blasblisacc/src/ti_cblas_cblas_dspr.c
+add blasblisacc/src/ti_cblas_cblas_dspr2.c
+add blasblisacc/src/ti_cblas_cblas_dswap.c
+add blasblisacc/src/ti_cblas_cblas_dsymm.c
+add blasblisacc/src/ti_cblas_cblas_dsymv.c
+add blasblisacc/src/ti_cblas_cblas_dsyr.c
+add blasblisacc/src/ti_cblas_cblas_dsyr2.c
+add blasblisacc/src/ti_cblas_cblas_dsyr2k.c
+add blasblisacc/src/ti_cblas_cblas_dsyrk.c
+add blasblisacc/src/ti_cblas_cblas_dtbmv.c
+add blasblisacc/src/ti_cblas_cblas_dtbsv.c
+add blasblisacc/src/ti_cblas_cblas_dtpmv.c
+add blasblisacc/src/ti_cblas_cblas_dtpsv.c
+add blasblisacc/src/ti_cblas_cblas_dtrmm.c
+add blasblisacc/src/ti_cblas_cblas_dtrmv.c
+add blasblisacc/src/ti_cblas_cblas_dtrsm.c
+add blasblisacc/src/ti_cblas_cblas_dtrsv.c
+add blasblisacc/src/ti_cblas_cblas_dzasum.c
+add blasblisacc/src/ti_cblas_cblas_dznrm2.c
+add blasblisacc/src/ti_cblas_cblas_icamax.c
+add blasblisacc/src/ti_cblas_cblas_idamax.c
+add blasblisacc/src/ti_cblas_cblas_isamax.c
+add blasblisacc/src/ti_cblas_cblas_izamax.c
+add blasblisacc/src/ti_cblas_cblas_sasum.c
+add blasblisacc/src/ti_cblas_cblas_saxpy.c
+add blasblisacc/src/ti_cblas_cblas_scasum.c
+add blasblisacc/src/ti_cblas_cblas_scnrm2.c
+add blasblisacc/src/ti_cblas_cblas_scopy.c
+add blasblisacc/src/ti_cblas_cblas_sdot.c
+add blasblisacc/src/ti_cblas_cblas_sdsdot.c
+add blasblisacc/src/ti_cblas_cblas_sgbmv.c
+add blasblisacc/src/ti_cblas_cblas_sgemm.c
+add blasblisacc/src/ti_cblas_cblas_sgemv.c
+add blasblisacc/src/ti_cblas_cblas_sger.c
+add blasblisacc/src/ti_cblas_cblas_snrm2.c
+add blasblisacc/src/ti_cblas_cblas_srot.c
+add blasblisacc/src/ti_cblas_cblas_srotg.c
+add blasblisacc/src/ti_cblas_cblas_srotm.c
+add blasblisacc/src/ti_cblas_cblas_srotmg.c
+add blasblisacc/src/ti_cblas_cblas_ssbmv.c
+add blasblisacc/src/ti_cblas_cblas_sscal.c
+add blasblisacc/src/ti_cblas_cblas_sspmv.c
+add blasblisacc/src/ti_cblas_cblas_sspr.c
+add blasblisacc/src/ti_cblas_cblas_sspr2.c
+add blasblisacc/src/ti_cblas_cblas_sswap.c
+add blasblisacc/src/ti_cblas_cblas_ssymm.c
+add blasblisacc/src/ti_cblas_cblas_ssymv.c
+add blasblisacc/src/ti_cblas_cblas_ssyr.c
+add blasblisacc/src/ti_cblas_cblas_ssyr2.c
+add blasblisacc/src/ti_cblas_cblas_ssyr2k.c
+add blasblisacc/src/ti_cblas_cblas_ssyrk.c
+add blasblisacc/src/ti_cblas_cblas_stbmv.c
+add blasblisacc/src/ti_cblas_cblas_stbsv.c
+add blasblisacc/src/ti_cblas_cblas_stpmv.c
+add blasblisacc/src/ti_cblas_cblas_stpsv.c
+add blasblisacc/src/ti_cblas_cblas_strmm.c
+add blasblisacc/src/ti_cblas_cblas_strmv.c
+add blasblisacc/src/ti_cblas_cblas_strsm.c
+add blasblisacc/src/ti_cblas_cblas_strsv.c
+add blasblisacc/src/ti_cblas_cblas_xerbla.c
+add blasblisacc/src/ti_cblas_cblas_zaxpy.c
+add blasblisacc/src/ti_cblas_cblas_zcopy.c
+add blasblisacc/src/ti_cblas_cblas_zdotc_sub.c
+add blasblisacc/src/ti_cblas_cblas_zdotu_sub.c
+add blasblisacc/src/ti_cblas_cblas_zdscal.c
+add blasblisacc/src/ti_cblas_cblas_zgbmv.c
+add blasblisacc/src/ti_cblas_cblas_zgemm.c
+add blasblisacc/src/ti_cblas_cblas_zgemv.c
+add blasblisacc/src/ti_cblas_cblas_zgerc.c
+add blasblisacc/src/ti_cblas_cblas_zgeru.c
+add blasblisacc/src/ti_cblas_cblas_zhbmv.c
+add blasblisacc/src/ti_cblas_cblas_zhemm.c
+add blasblisacc/src/ti_cblas_cblas_zhemv.c
+add blasblisacc/src/ti_cblas_cblas_zher.c
+add blasblisacc/src/ti_cblas_cblas_zher2.c
+add blasblisacc/src/ti_cblas_cblas_zher2k.c
+add blasblisacc/src/ti_cblas_cblas_zherk.c
+add blasblisacc/src/ti_cblas_cblas_zhpmv.c
+add blasblisacc/src/ti_cblas_cblas_zhpr.c
+add blasblisacc/src/ti_cblas_cblas_zhpr2.c
+add blasblisacc/src/ti_cblas_cblas_zrotg.c
+add blasblisacc/src/ti_cblas_cblas_zscal.c
+add blasblisacc/src/ti_cblas_cblas_zswap.c
+add blasblisacc/src/ti_cblas_cblas_zsymm.c
+add blasblisacc/src/ti_cblas_cblas_zsyr2k.c
+add blasblisacc/src/ti_cblas_cblas_zsyrk.c
+add blasblisacc/src/ti_cblas_cblas_ztbmv.c
+add blasblisacc/src/ti_cblas_cblas_ztbsv.c
+add blasblisacc/src/ti_cblas_cblas_ztpmv.c
+add blasblisacc/src/ti_cblas_cblas_ztpsv.c
+add blasblisacc/src/ti_cblas_cblas_ztrmm.c
+add blasblisacc/src/ti_cblas_cblas_ztrmv.c
+add blasblisacc/src/ti_cblas_cblas_ztrsm.c
+add blasblisacc/src/ti_cblas_cblas_ztrsv.c
+add blasblisacc/src/ti_cblas_initfini.c
+add blasblisacc/src/ti_cblas_kernel.cl
+add blasblisacc/src/ti_l3_offload.c
+add blis/Makefile
+add blis/config/c66x/bli_config.h
+add blis/config/c66x/bli_kernel.h
+add blis/config/c66x/make_defs.mk
+add blis/config/c66x/touch.h
+add blis/config/cortex-a15/bli_kernel.h
+add blis/frame/1m/packm/bli_packm_threading.c
+add blis/frame/3/gemm/bli_gemm.c
+add blis/frame/3/gemm/bli_gemm_blk_var1f.c
+add blis/frame/3/gemm/bli_gemm_blk_var2f.c
+add blis/frame/3/gemm/bli_gemm_blk_var3f.c
+add blis/frame/3/gemm/bli_gemm_cntl.c
+add blis/frame/3/gemm/bli_gemm_front.c
+add blis/frame/3/gemm/bli_gemm_int.c
+add blis/frame/3/gemm/bli_gemm_ker_var2.c
+add blis/frame/3/gemm/bli_gemm_threading.c
+add blis/frame/3/hemm/bli_hemm.c
+add blis/frame/3/hemm/bli_hemm_front.c
+add blis/frame/3/her2k/bli_her2k.c
+add blis/frame/3/her2k/bli_her2k_front.c
+add blis/frame/3/herk/bli_herk.c
+add blis/frame/3/herk/bli_herk_blk_var1f.c
+add blis/frame/3/herk/bli_herk_blk_var2f.c
+add blis/frame/3/herk/bli_herk_blk_var3f.c
+add blis/frame/3/herk/bli_herk_front.c
+add blis/frame/3/herk/bli_herk_int.c
+add blis/frame/3/herk/bli_herk_l_ker_var2.c
+add blis/frame/3/herk/bli_herk_threading.c
+add blis/frame/3/herk/bli_herk_u_ker_var2.c
+add blis/frame/3/symm/bli_symm.c
+add blis/frame/3/symm/bli_symm_front.c
+add blis/frame/3/syr2k/bli_syr2k.c
+add blis/frame/3/syr2k/bli_syr2k_front.c
+add blis/frame/3/syrk/bli_syrk.c
+add blis/frame/3/syrk/bli_syrk_front.c
+add blis/frame/3/trmm/bli_trmm.c
+add blis/frame/3/trmm/bli_trmm_blk_var1f.c
+add blis/frame/3/trmm/bli_trmm_blk_var2b.c
+add blis/frame/3/trmm/bli_trmm_blk_var2f.c
+add blis/frame/3/trmm/bli_trmm_blk_var3b.c
+add blis/frame/3/trmm/bli_trmm_blk_var3f.c
+add blis/frame/3/trmm/bli_trmm_front.c
+add blis/frame/3/trmm/bli_trmm_int.c
+add blis/frame/3/trmm/bli_trmm_ll_ker_var2.c
+add blis/frame/3/trmm/bli_trmm_lu_ker_var2.c
+add blis/frame/3/trmm/bli_trmm_rl_ker_var2.c
+add blis/frame/3/trmm/bli_trmm_ru_ker_var2.c
+add blis/frame/3/trmm/bli_trmm_threading.c
+add blis/frame/3/trsm/bli_trsm.c
+add blis/frame/3/trsm/bli_trsm_blk_var1b.c
+add blis/frame/3/trsm/bli_trsm_blk_var1f.c
+add blis/frame/3/trsm/bli_trsm_blk_var2b.c
+add blis/frame/3/trsm/bli_trsm_blk_var2f.c
+add blis/frame/3/trsm/bli_trsm_blk_var3b.c
+add blis/frame/3/trsm/bli_trsm_blk_var3f.c
+add blis/frame/3/trsm/bli_trsm_cntl.c
+add blis/frame/3/trsm/bli_trsm_front.c
+add blis/frame/3/trsm/bli_trsm_int.c
+add blis/frame/3/trsm/bli_trsm_rl_ker_var2.c
+add blis/frame/3/trsm/bli_trsm_ru_ker_var2.c
+add blis/frame/3/trsm/bli_trsm_threading.c
+add blis/frame/base/bli_dma.c
+add blis/frame/base/bli_dma.h
+add blis/frame/base/bli_error.c
+add blis/frame/base/bli_malloc.c
+add blis/frame/base/bli_malloc.h
+add blis/frame/base/bli_mem.c
+add blis/frame/base/bli_obj.c
+add blis/frame/base/bli_threading.c
+add blis/frame/base/bli_threading.h
+add blis/frame/include/bli_mem_pool_macro_defs.h
+add blis/frame/include/bli_obj_macro_defs.h
+add blis/frame/include/bli_type_defs.h
+add blis/frame/include/blis.h
+add blis/frame/include/level0/bli_fprints.h
+add blis/frame/util/printm/bli_fprintm.c
+add blis/frame/util/printv/bli_fprintv.c
+add blis/kernels/c66x/1m/bli_packm_cxk_ukernels.c
+add blis/kernels/c66x/1m/bli_packm_cxk_ukernels.h
+add blis/kernels/c66x/3/bli_gemm_ukernels.c
+add blis/testsuite/Makefile
+add blis/testsuite/input.general
+add blis/testsuite/src/test_addm.c
+add blis/testsuite/src/test_addv.c
+add blis/testsuite/src/test_axpy2v.c
+add blis/testsuite/src/test_axpyf.c
+add blis/testsuite/src/test_axpym.c
+add blis/testsuite/src/test_axpyv.c
+add blis/testsuite/src/test_copym.c
+add blis/testsuite/src/test_copyv.c
+add blis/testsuite/src/test_dotaxpyv.c
+add blis/testsuite/src/test_dotv.c
+add blis/testsuite/src/test_dotxaxpyf.c
+add blis/testsuite/src/test_dotxf.c
+add blis/testsuite/src/test_dotxv.c
+add blis/testsuite/src/test_gemm.c
+add blis/testsuite/src/test_gemm_ukr.c
+add blis/testsuite/src/test_gemmtrsm_ukr.c
+add blis/testsuite/src/test_gemv.c
+add blis/testsuite/src/test_ger.c
+add blis/testsuite/src/test_hemm.c
+add blis/testsuite/src/test_hemv.c
+add blis/testsuite/src/test_her.c
+add blis/testsuite/src/test_her2.c
+add blis/testsuite/src/test_her2k.c
+add blis/testsuite/src/test_herk.c
+add blis/testsuite/src/test_libblis.c
+add blis/testsuite/src/test_libblis.h
+add blis/testsuite/src/test_normfm.c
+add blis/testsuite/src/test_normfv.c
+add blis/testsuite/src/test_randm.c
+add blis/testsuite/src/test_randv.c
+add blis/testsuite/src/test_scal2m.c
+add blis/testsuite/src/test_scal2v.c
+add blis/testsuite/src/test_scalm.c
+add blis/testsuite/src/test_scalv.c
+add blis/testsuite/src/test_setm.c
+add blis/testsuite/src/test_setv.c
+add blis/testsuite/src/test_subm.c
+add blis/testsuite/src/test_subv.c
+add blis/testsuite/src/test_symm.c
+add blis/testsuite/src/test_symv.c
+add blis/testsuite/src/test_syr.c
+add blis/testsuite/src/test_syr2.c
+add blis/testsuite/src/test_syr2k.c
+add blis/testsuite/src/test_syrk.c
+add blis/testsuite/src/test_trmm.c
+add blis/testsuite/src/test_trmm3.c
+add blis/testsuite/src/test_trmv.c
+add blis/testsuite/src/test_trsm.c
+add blis/testsuite/src/test_trsm_ukr.c
+add blis/testsuite/src/test_trsv.c
+add blis/version
+add blis/windows/Makefile
+add blis/windows/build/config.mk.in
+add blis/windows/build/defs.mk
+add blis/windows/build/ignore_list
+add blis/windows/build/ignore_list.windows
+add blis/windows/build/leaf_list
+add blis/windows/build/libblis-symbols.def
+add blis/windows/build/nmake-help.cmd
+add blis/windows/configure.cmd
+add blis/windows/gendll.cmd
+add build/tar_files_list.txt
+add cblas/Makefile
+add cblas/Makefile.C66
+add clapack/BLAS/run_blas_tests.sh
+add clapack/F2CLIBS/libf2c/Makefile
+rm clapack/F2CLIBS/libf2c/signal1.h
+rm clapack/F2CLIBS/libf2c/sysdep1.h
+rm docs/LINALG_1.0.0_manifest.html
+rm examples/Makefile
+rm examples/dgemm_test/Makefile
+rm examples/dgemm_test/dgemm_test.c
+rm examples/dsyrk_test/Makefile
+rm examples/dsyrk_test/dsyrk_test.c
+rm examples/eig/Makefile
+rm examples/eig/dlaran.c
+rm examples/eig/dlarnd.c
+rm examples/eig/dlatm1.c
+rm examples/eig/dlatm2.c
+rm examples/eig/dlatm3.c
+rm examples/eig/dlatmr.c
+rm examples/eig/main.c
+rm examples/gemm_bench/Makefile
+rm examples/gemm_bench/main.c
+rm examples/ludinv/Makefile
+rm examples/ludinv/dlaran.c
+rm examples/ludinv/dlarnd.c
+rm examples/ludinv/dlatm1.c
+rm examples/ludinv/dlatm2.c
+rm examples/ludinv/dlatm3.c
+rm examples/ludinv/dlatmr.c
+rm examples/ludinv/main.c
+rm examples/make.inc
+rm examples/matmpy/Makefile
+rm examples/matmpy/main.c
+rm examples/ztrmm_test/Makefile
+rm examples/ztrmm_test/ztrmm_test.c
+rm examples/ztrsm_test/Makefile
+rm examples/ztrsm_test/ztrsm_test.c
+add make.inc
+
+add blasblisacc/src/ti_cblas_acc.h
+add blasblisacc/src/ti_cblas_mem_config.c
+add blasblisacc/src/wrap_gen/blas_wrap_gen.sh
+add blasblisacc/src/wrap_gen/oclgen.pl
+add blis/frame/base/bli_profile.c
+add blis/frame/base/bli_profile.h
+add blis/kernels/armv7a/3/bli_cgemm_kernel_2x2.S
+add blis/kernels/armv7a/3/bli_dgemm_kernel_4x4.S
+add blis/kernels/armv7a/3/bli_sgemm_kernel_4x4.S
+add blis/kernels/armv7a/3/bli_zgemm_kernel_2x2.S
+add blis/testsuite/dsponly
+add blis/testsuite/parselog.pl
+add build_opencl_k2h_large.sh
+add build_opencl_k2h_medium.sh
+add build_opencl_k2h_small.sh
+add build_rtos_c6678_small.sh
+add clapack/BLAS/run_tests_only.sh
+add clapack/TESTING/run_clapack_tests.sh
+add clapack/TESTING/run_testsuite.sh
+add docs/LINALG_1.2.0_manifest.html
+add docs/doxygen
+add docs/linalg_user_guide.html
+add examples/arm+dsp
+add examples/dsponly
+add git_add.sh
+add setup_env.sh
+add setup_env_C6678_rtos.sh
+add setup_env_k2h_ocl.sh
+add setup_env_k2h_rtos.sh
+add setup_env_rtos_yocto.sh
+add ticblas
+EOF
diff --git a/make.inc b/make.inc
index be81de2881ecd1d20990041a2f53f23a951a0fb5..b2e1afec899fc27de0980f28c6d0d6af27e19070 100644 (file)
--- a/make.inc
+++ b/make.inc
-#DSP_INCLUDE = -I$(TI_OCL_CGT_INSTALL)/include
-DSP_INCLUDE += -I$(TARGET_ROOTDIR)/usr/share/ti/cgt-c6x/include
-DSP_INCLUDE += -I$(TARGET_ROOTDIR)/usr/share/ti/opencl
-#DSP_INCLUDE += -I$(TI_OCL_INSTALL_DIR)/include
CPP = g++
CL6X = cl6x -mv6600 --abi=eabi $(DSP_INCLUDE)
# is located on the build host and necessary ARM libraries are installed
# on that file system.
#
# The environment checks below are skipped when the only goal is "clean".
# TARGET_ROOTDIR is only checked when LIBOS is not LIB_RTOS; OMP_DIR is
# checked for every LIBOS value.
ifneq ($(MAKECMDGOALS),clean)
- ifeq ($(TARGET_ROOTDIR),)
- $(error Environment variable TARGET_ROOTDIR must be defined. Set it to point at the EVM root file system)
- endif
- ifeq ($(OMP_DIR),)
- $(error Environment variable OMP_DIR must be defined. Set it to point at the OpenMP for DSP Installation Directory, or run the setup_hpc_env script available in the MCSDK-HPC installation)
- endif
+ ifneq ($(LIBOS),LIB_RTOS)
+ ifeq ($(TARGET_ROOTDIR),)
+ $(error Environment variable TARGET_ROOTDIR must be defined. Set it to point at the EVM root file system)
+ endif
+ endif
+
+ ifeq ($(OMP_DIR),)
+ $(error Environment variable OMP_DIR must be defined. Set it to point at the OpenMP for DSP Installation Directory, or run the setup_hpc_env script available in the MCSDK-HPC installation)
+ endif
endif
# gcc ARM cross compiler will not, by default, search the host's
diff --git a/setup_env.sh b/setup_env.sh
--- /dev/null
+++ b/setup_env.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+#
+# Exports the tool and SDK locations used by the LINALG makefiles, taken
+# from a TI SDK tree built for the k2hk-evm target.  The script only sets
+# environment variables, so source it (". setup_env.sh") instead of
+# executing it.
+#
+# Every location hangs off LINALG_DEV_HOME.  It defaults to the original
+# author's home directory; set it before sourcing to relocate the whole set.
+
+LINALG_DEV_HOME="${LINALG_DEV_HOME:-/home/a0869574local}"
+
+# Scratch variables, unset again at the end of the script.
+_tisdk="$LINALG_DEV_HOME/tisdk"
+_sysroots="$_tisdk/build/arago-tmp-external-linaro-toolchain/sysroots"
+_target="$_sysroots/k2hk-evm"
+_host="$_sysroots/x86_64-linux"
+_linaro="$LINALG_DEV_HOME/gcc-linaro-4.9-2015.05-x86_64_arm-linux-gnueabihf"
+
+export TI_OCL_INSTALL="$_target"
+export TI_OCL_INSTALL_DIR="$_target/usr/share/ti/opencl"
+export TI_OCL_CGT_INSTALL="$_host/usr/share/ti/cgt-c6x"
+export PDK_DIR="$_target/usr/share/ti/ti-pdk-tree"
+export FC_DIR="$_target/usr/share/ti/ti-framework-components-tree"
+export XDAIS_DIR="$_target/usr/share/ti/ti-xdais-tree"
+export XDC_DIR="$_target/usr/share/ti/ti-xdctools-tree"
+export BIOS_DIR="$_target/usr/share/ti/ti-sysbios-tree"
+# OMP and LIBARCH point at stand-alone trees instead of the SDK sysroot
+# copies (ti-omp-tree, ti-libarch-tree).
+export OMP_DIR="$LINALG_DEV_HOME/ti/openmp_dsp_2_02_00_01"
+export LIBARCH_DIR="$LINALG_DEV_HOME/proclibs/libarch_intgit/libarch"
+export TARGET_ROOTDIR="$_target"
+
+# PATH is replaced outright (the caller's PATH is not kept).  Entries the
+# old value listed more than once appear once, at their first position, so
+# command lookup is unchanged.
+export PATH="$_tisdk/sources/oe-core/scripts:\
+$_host/usr/bin/arm-linux-gnueabi:\
+$_target/usr/bin/crossscripts:\
+$_host/usr/sbin:\
+$_host/usr/bin:\
+$_host/sbin:\
+$_host/bin:\
+$_linaro/bin:\
+$_tisdk/sources/bitbake/bin:\
+/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games"
+
+unset _tisdk _sysroots _target _host _linaro
diff --git a/setup_env_C6678_rtos.sh b/setup_env_C6678_rtos.sh
--- /dev/null
+++ b/setup_env_C6678_rtos.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+#
+# Exports the tool and SDK locations for a C6678 RTOS build of LINALG,
+# taken mostly from a Processor SDK RTOS (c667x-evm 02.00.01.07) install.
+# The script only sets environment variables, so source it instead of
+# executing it.
+#
+# Every location except LINALG_DIR hangs off LINALG_DEV_HOME.  It defaults
+# to the original author's home directory; set it before sourcing to
+# relocate the whole set.
+
+LINALG_DEV_HOME="${LINALG_DEV_HOME:-/home/a0869574local}"
+
+# Scratch variables, unset again at the end of the script.
+_psdk="$LINALG_DEV_HOME/ti/processor-sdk-rtos-c667x-evm-02.00.01.07"
+_yocto="$LINALG_DEV_HOME/yocoto/tisdk-rtos"
+_sysroots="$_yocto/build/arago-tmp-external-linaro-toolchain/sysroots"
+_host="$_sysroots/x86_64-linux"
+_linaro="$LINALG_DEV_HOME/gcc-linaro-4.9-2015.05-x86_64_arm-linux-gnueabihf"
+
+export CGTROOT="$_host/usr/share/ti/cgt-c6x"
+export C6678_PDK_DIR="$_psdk/pdk_c667x_2_0_0"
+export PDK_DIR="$_psdk/pdk_c667x_2_0_0"
+export FC_DIR="$_psdk/framework_components_3_40_01_04"
+export XDAIS_DIR="$_psdk/xdais_7_24_00_04"
+export BIOS_DIR="$_psdk/bios_6_45_00_19"
+export OMP_DIR="$_psdk/openmp_dsp_c667x_2_02_00_02"
+export LIBARCH_DIR="$LINALG_DEV_HOME/proclibs/libarch_intgit/libarch"
+export XDC_DIR="$LINALG_DEV_HOME/ti-rtos-sdk-12-08/xdctools_3_31_02_38_core"
+export IPC_DIR="$_psdk/ipc_3_41_00_08"
+export EDMA3_DIR="$_psdk/edma3_lld_02_12_01_22"
+
+# PATH is replaced outright (the caller's PATH is not kept).  Entries the
+# old value listed more than once appear once, at their first position, so
+# command lookup is unchanged.
+export PATH="$_yocto/sources/oe-core/scripts:\
+$_host/usr/bin/arm-linux-gnueabi:\
+$_sysroots/c667x-evm/usr/bin/crossscripts:\
+$_host/usr/sbin:\
+$_host/usr/bin:\
+$_host/sbin:\
+$_host/bin:\
+$_linaro/bin:\
+$_yocto/sources/bitbake/bin:\
+/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games"
+
+# LINALG_DIR lives under the invoking user's home directory ("~"), unlike
+# the locations above.
+export LINALG_DIR=~/proclibs/linalg_rtos_c6678_small_install
+
+unset _psdk _yocto _sysroots _host _linaro
diff --git a/setup_env_k2h_ocl.sh b/setup_env_k2h_ocl.sh
--- /dev/null
+++ b/setup_env_k2h_ocl.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+#
+# Exports the tool and SDK locations for a K2H OpenCL build of LINALG,
+# taken mostly from a Processor SDK Linux (k2hk-evm 02.00.02.03) devkit.
+# The script only sets environment variables, so source it instead of
+# executing it.
+#
+# Every location hangs off LINALG_DEV_HOME.  It defaults to the original
+# author's home directory; set it before sourcing to relocate the whole set.
+
+LINALG_DEV_HOME="${LINALG_DEV_HOME:-/home/a0869574local}"
+
+# Scratch variables, unset again at the end of the script.
+_sysroots="$LINALG_DEV_HOME/ti/processor-sdk-linux-k2hk-evm-02.00.02.03/linux-devkit/sysroots"
+_target="$_sysroots/cortexa15hf-vfp-neon-linux-gnueabi"
+_host="$_sysroots/x86_64-arago-linux"
+_ti="$_target/usr/share/ti"
+
+export TI_OCL_INSTALL_DIR="$_ti/opencl"
+export CGTROOT="$_host/usr/share/ti/cgt-c6x"
+export TI_OCL_CGT_INSTALL="$_host/usr/share/ti/cgt-c6x"
+export XDC_DIR="$LINALG_DEV_HOME/ti-rtos-sdk-12-08/xdctools_3_31_02_38_core"
+export BIOS_DIR="$_ti/ti-sysbios-tree"
+export XDAIS_DIR="$_ti/ti-xdais-tree"
+export FC_DIR="$_ti/ti-framework-components-tree"
+export PDK_DIR="$_ti/ti-pdk-tree"
+export OMP_DIR="$_ti/ti-omp-tree"
+export LIBARCH_DIR="$LINALG_DEV_HOME/proclibs/libarch_intgit/libarch"
+export TARGET_ROOTDIR="$_target"
+
+# PATH is extended here: the cgt-c6x bin directory and the host sysroot's
+# usr/bin go in front of the caller's PATH.
+export PATH="$_host/usr/share/ti/cgt-c6x/bin:$_host/usr/bin:$PATH"
+
+unset _sysroots _target _host _ti
diff --git a/setup_env_k2h_rtos.sh b/setup_env_k2h_rtos.sh
--- /dev/null
+++ b/setup_env_k2h_rtos.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+#
+# Exports the tool and SDK locations for a K2H RTOS build of LINALG, taken
+# mostly from a Processor SDK RTOS (k2hk-evm 02.00.02.02) install.  The
+# script only sets environment variables, so source it instead of
+# executing it.
+#
+# Every location hangs off LINALG_DEV_HOME.  It defaults to the original
+# author's home directory; set it before sourcing to relocate the whole set.
+
+LINALG_DEV_HOME="${LINALG_DEV_HOME:-/home/a0869574local}"
+
+# Scratch variables, unset again at the end of the script.
+_psdk="$LINALG_DEV_HOME/ti/processor-sdk-rtos-k2hk-evm-02.00.02.02"
+_yocto="$LINALG_DEV_HOME/yocoto/tisdk-rtos"
+_sysroots="$_yocto/build/arago-tmp-external-linaro-toolchain/sysroots"
+_host="$_sysroots/x86_64-linux"
+_linaro="$LINALG_DEV_HOME/gcc-linaro-4.9-2015.05-x86_64_arm-linux-gnueabihf"
+
+export CGTROOT="$_host/usr/share/ti/cgt-c6x"
+export XDC_DIR="$_psdk/xdctools_3_32_00_06_core"
+export BIOS_DIR="$_psdk/bios_6_45_00_20"
+export XDAIS_DIR="$_psdk/xdais_7_24_00_04"
+export IPC_DIR="$_psdk/ipc_3_42_00_01_eng"
+export EDMA3_DIR="$_psdk/edma3_lld_02_12_01_22"
+export FC_DIR="$_psdk/framework_components_3_40_01_04"
+export PDK_DIR="$_psdk/pdk_k2hk_4_0_1"
+export C6636_PDK_DIR="$_psdk/pdk_k2hk_4_0_1"
+export OMP_DIR="$_psdk/openmp_dsp_k2x_2_02_00_02"
+export LIBARCH_DIR="$LINALG_DEV_HOME/proclibs/libarch_intgit/libarch"
+
+# PATH is replaced outright (the caller's PATH is not kept).  Entries the
+# old value listed more than once appear once, at their first position, so
+# command lookup is unchanged.
+# NOTE(review): crossscripts is taken from the c667x-evm sysroot although
+# the other settings here are for k2hk -- confirm this is intended.
+export PATH="$_yocto/sources/oe-core/scripts:\
+$_host/usr/bin/arm-linux-gnueabi:\
+$_sysroots/c667x-evm/usr/bin/crossscripts:\
+$_host/usr/sbin:\
+$_host/usr/bin:\
+$_host/sbin:\
+$_host/bin:\
+$_linaro/bin:\
+$_yocto/sources/bitbake/bin:\
+/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games"
+
+unset _psdk _yocto _sysroots _host _linaro
diff --git a/setup_env_rtos_yocto.sh b/setup_env_rtos_yocto.sh
--- /dev/null
+++ b/setup_env_rtos_yocto.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+#
+# Exports the tool and SDK locations for an RTOS build of LINALG, taken
+# from the c667x-evm sysroot of the tisdk-rtos build tree.  The script only
+# sets environment variables, so source it instead of executing it.
+#
+# Every location hangs off LINALG_DEV_HOME.  It defaults to the original
+# author's home directory; set it before sourcing to relocate the whole set.
+
+LINALG_DEV_HOME="${LINALG_DEV_HOME:-/home/a0869574local}"
+
+# Scratch variables, unset again at the end of the script.
+_yocto="$LINALG_DEV_HOME/yocoto/tisdk-rtos"
+_sysroots="$_yocto/build/arago-tmp-external-linaro-toolchain/sysroots"
+_host="$_sysroots/x86_64-linux"
+_ti="$_sysroots/c667x-evm/usr/share/ti"
+_linaro="$LINALG_DEV_HOME/gcc-linaro-4.9-2015.05-x86_64_arm-linux-gnueabihf"
+
+export BIOS_DIR="$_ti/ti-sysbios-tree"
+export IPC_DIR="$_ti/ti-ipc-tree"
+export XDC_DIR="$_ti/ti-xdctools-tree"
+export OMP_DIR="$_ti/ti-omp-tree"
+export C6678_PDK_DIR="$_ti/ti-pdk-tree"
+export PDK_DIR="$_ti/ti-pdk-tree"
+export CGTROOT="$_host/usr/share/ti/cgt-c6x"
+export XDAIS_DIR="$_ti/ti-xdais-tree"
+export FC_DIR="$_ti/ti-framework-components-tree"
+# LIBARCH points at a stand-alone tree instead of the sysroot copy
+# (ti-libarch-tree).
+export LIBARCH_DIR="$LINALG_DEV_HOME/proclibs/libarch_intgit/libarch"
+export EDMA3_DIR="$_ti/ti-edma3lld-tree"
+
+# PATH is replaced outright (the caller's PATH is not kept).  Entries the
+# old value listed more than once appear once, at their first position, so
+# command lookup is unchanged.
+export PATH="$_yocto/sources/oe-core/scripts:\
+$_host/usr/bin/arm-linux-gnueabi:\
+$_sysroots/c667x-evm/usr/bin/crossscripts:\
+$_host/usr/sbin:\
+$_host/usr/bin:\
+$_host/sbin:\
+$_host/bin:\
+$_linaro/bin:\
+$_yocto/sources/bitbake/bin:\
+/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games"
+
+unset _yocto _sysroots _host _ti _linaro
diff --git a/ticblas/src/Makefile b/ticblas/src/Makefile
--- /dev/null
+++ b/ticblas/src/Makefile
@@ -0,0 +1,51 @@
+# Builds the TI CBLAS extension library (libticblas.ae66) from ticblas.obj.
+include ../../make.inc
+
+INCDIR := $(CGTROOT)/include
+INCDIR += -I$(OMP_DIR)/packages/ti/runtime/openmp
+INCDIR += -I$(FC_DIR)/packages
+INCDIR += -I$(XDC_DIR)/packages
+INCDIR += -I$(BIOS_DIR)/packages
+INCDIR += -I$(XDAIS_DIR)/packages
+INCDIR += -I$(LIBARCH_DIR)/include
+INCDIR += -I$(PDK_DIR)/packages
+# The OpenCL install tree is added only for LIBOS=LIB_OPENCL builds.
+ifeq ($(LIBOS),LIB_OPENCL)
+INCDIR += -I$(TI_OCL_INSTALL_DIR)
+endif
+# MEM_MODEL selects the installed BLIS header tree and the matching -D macro.
+ifeq ($(MEM_MODEL),Large)
+BLIS_INC = ../../blis/install/c66xLarge/include/blis/
+BLIS_MEM_MODEL = MEM_MODEL_LARGE
+else ifeq ($(MEM_MODEL),Medium)
+BLIS_INC = ../../blis/install/c66xMedium/include/blis/
+BLIS_MEM_MODEL = MEM_MODEL_MEDIUM
+else ifeq ($(MEM_MODEL),Small)
+BLIS_INC = ../../blis/install/c66xSmall/include/blis/
+BLIS_MEM_MODEL = MEM_MODEL_SMALL
+else ifeq ($(MEM_MODEL),Tiny)
+BLIS_INC = ../../blis/install/c66xTiny/include/blis/
+BLIS_MEM_MODEL = MEM_MODEL_TINY
+endif
+
+INCDIR += -I$(BLIS_INC)
+# INCDIR's first entry has no -I, so one is prefixed; any ';' becomes ' -I'.
+INCS = -I. -I$(strip $(subst ;, -I,$(subst $(space),$(space),$(INCDIR))))
+
+CL6X_FLAGS = $(INCS) --openmp --use_g2 -D$(TARGET) -D$(LIBOS) -D$(BLIS_MEM_MODEL)
+
+DSP_LIB_DIR = ../lib
+DSP_LIB = $(DSP_LIB_DIR)/libticblas.ae66
+
+OBJS = ticblas.obj
+.PHONY: all cross lib clean
+all: lib
+cross: lib
+
+lib: $(OBJS)
+	@echo; echo "Building DSP lib: $(DSP_LIB)"
+	mkdir -p $(DSP_LIB_DIR)
+	$(AR) -cr $(DSP_LIB) $(OBJS)
+
+clean::
+	rm -rf $(DSP_LIB_DIR)
\ No newline at end of file
diff --git a/ticblas/src/make.inc b/ticblas/src/make.inc
--- /dev/null
+++ b/ticblas/src/make.inc
@@ -0,0 +1,17 @@
+# Rules for archiving ticblas.obj into the DSP library libticblas.ae66.
+# Defines
+DSP_LIB_DIR = ../lib
+DSP_LIB = $(DSP_LIB_DIR)/libticblas.ae66
+
+OBJS = ticblas.obj
+.PHONY: all cross lib clean
+all: lib
+cross: lib
+
+lib: $(OBJS)
+	@echo; echo "Building DSP lib: $(DSP_LIB)"
+	mkdir -p $(DSP_LIB_DIR)
+	$(AR) -cr $(DSP_LIB) $(OBJS)
+
+clean::
+	rm -f $(DSP_LIB)
\ No newline at end of file
diff --git a/ticblas/src/ticblas.c b/ticblas/src/ticblas.c
--- /dev/null
+++ b/ticblas/src/ticblas.c
@@ -0,0 +1,217 @@
+/******************************************************************************\r
+ * Copyright (c) 2015, Texas Instruments Incorporated - http://www.ti.com\r
+ * All rights reserved.\r
+ *\r
+ * Redistribution and use in source and binary forms, with or without\r
+ * modification, are permitted provided that the following conditions are met:\r
+ * * Redistributions of source code must retain the above copyright\r
+ * notice, this list of conditions and the following disclaimer.\r
+ * * Redistributions in binary form must reproduce the above copyright\r
+ * notice, this list of conditions and the following disclaimer in the\r
+ * documentation and/or other materials provided with the distribution.\r
+ * * Neither the name of Texas Instruments Incorporated nor the\r
+ * names of its contributors may be used to endorse or promote products\r
+ * derived from this software without specific prior written permission.\r
+ *\r
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"\r
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\r
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE\r
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE\r
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR\r
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF\r
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS\r
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN\r
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)\r
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF\r
+ * THE POSSIBILITY OF SUCH DAMAGE.\r
+ *****************************************************************************/\r
+#include "libarch.h"\r
+#include "../ticblas.h"\r
+#include "blis.h"\r
+\r
+#define getNextMultiple(x, y) ( ( ((x)+(y)-1)/(y) )* (y) ) /* x rounded up to a multiple of y (non-negative integers, y > 0) */\r
+\r
+#if 0 /* disabled: fixed per-memory-model sizes, never compiled */\r
+#ifdef MEM_MODEL_LARGE\r
+#define BLAS_LEVEL3_L1DSRAM_SIZE (28*1024UL)\r
+#define BLAS_LEVEL3_L2SRAM_SIZE (767*1024UL) /* 767KB */\r
+#define BLAS_LEVEL3_MSMC_SIZE (0x47FDC0) /* 4.5MB */\r
+#else\r
+# ifdef MEM_MODEL_MEDIUM\r
+# define BLAS_LEVEL3_L1DSRAM_SIZE (28*1024UL)\r
+# define BLAS_LEVEL3_L2SRAM_SIZE (384*1024UL) /* 384KB */\r
+# define BLAS_LEVEL3_MSMC_SIZE (0x380000) /* 3.5MB */\r
+# else\r
+# ifdef MEM_MODEL_SMALL\r
+# define BLAS_LEVEL3_L1DSRAM_SIZE (18*1024UL)\r
+# define BLAS_LEVEL3_L2SRAM_SIZE (183*1024UL) /* 183KB; NOTE(review): comment used to say 187KB - confirm which was intended */\r
+# define BLAS_LEVEL3_MSMC_SIZE (1520*1024UL)/* 1.5MB */\r
+# else\r
+# error "Unsupported memory model"\r
+# endif\r
+# endif\r
+#endif\r
+#endif\r
+/* Earlier mapping onto the fixed BLAS_LEVEL3_* sizes, kept for reference:\r
+#define BLAS_MEM_SIZE_VFAST BLAS_LEVEL3_L1DSRAM_SIZE \r
+#define BLAS_MEM_SIZE_FAST BLAS_LEVEL3_L2SRAM_SIZE\r
+#define BLAS_MEM_SIZE_MEDIUM BLAS_LEVEL3_MSMC_SIZE\r
+   Each size below sums the MK, KN and MN pool sizes, each rounded up to BLIS_CACHE_LINE_SIZE. */\r
+#define BLAS_MEM_SIZE_VFAST ( getNextMultiple(BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE) \\r
+ + getNextMultiple(BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE) \\r
+ + getNextMultiple(BLIS_MN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE) )\r
+#define BLAS_MEM_SIZE_FAST ( getNextMultiple(BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE) \\r
+ + getNextMultiple(BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE) \\r
+ + getNextMultiple(BLIS_MN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE) )\r
+#define BLAS_MEM_SIZE_MEDIUM ( getNextMultiple(BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE) \\r
+ + getNextMultiple(BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE) \\r
+ + getNextMultiple(BLIS_MN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE) )\r
+#define BLAS_MEM_SIZE_SLOW (4804) /* NOTE(review): origin of this constant is not visible in this file - confirm */\r
+\r
+/* One memory descriptor per memory type: very fast, fast, medium and slow */\r
+lib_memdscr_t blas_mem_vfast;\r
+lib_memdscr_t blas_mem_fast;\r
+lib_memdscr_t blas_mem_medium;\r
+lib_memdscr_t blas_mem_slow;\r
+\r
+/* Descriptor table handed out by blasGetMemHandle(); NOTE(review): entry order presumably has to follow the LibArch memory-type numbering - confirm */\r
+lib_memdscr_t * blas_memdscr_tab[LIB_MEMTYPE_N] = {\r
+ &blas_mem_vfast,\r
+ &blas_mem_fast,\r
+ &blas_mem_medium,\r
+ &blas_mem_slow\r
+};\r
+\r
+// BLIS pool base pointers; tiCblasInit() assigns all nine before it calls bli_mem_init()\r
+char *pool_mk_mem_L1;\r
+char *pool_kn_mem_L1;\r
+char *pool_mn_mem_L1;\r
+\r
+char *pool_mk_mem_L2;\r
+char *pool_kn_mem_L2;\r
+char *pool_mn_mem_L2;\r
+\r
+char *pool_mk_mem_L3;\r
+char *pool_kn_mem_L3;\r
+char *pool_mn_mem_L3;\r
+\r
+extern void bli_mem_init(); /* defined outside this file; presumably consumes the pool pointers above - confirm */\r
+\r
+/*==============================================================================\r
+ * Hands back the start of blas_memdscr_tab as an untyped memory handle.\r
+ *============================================================================*/\r
+void * blasGetMemHandle()\r
+{\r
+  return (void *)blas_memdscr_tab; /* array name decays to &blas_memdscr_tab[0] */\r
+} /* blasGetMemHandle */\r
+\r
+/*==============================================================================\r
+ * Writes, through the four output pointers, the size required for each of the \r
+ * 4 memory types defined in the library framework (the BLAS_MEM_SIZE_* macros).\r
+ *============================================================================*/\r
+void tiCblasGetSizes(size_t *smem_size_vfast, size_t *smem_size_fast, \r
+ size_t *smem_size_medium, size_t *smem_size_slow)\r
+{\r
+ *smem_size_vfast = BLAS_MEM_SIZE_VFAST; // very fast scratch memory\r
+ *smem_size_fast = BLAS_MEM_SIZE_FAST; // fast scratch memory\r
+ *smem_size_medium = BLAS_MEM_SIZE_MEDIUM; // medium speed scratch memory\r
+ *smem_size_slow = BLAS_MEM_SIZE_SLOW; // slow scratch memory\r
+/* Disabled debug output of the individual pool sizes:\r
+ printf("BLIS_MK_POOL_SIZE_L1 is %d.\n", BLIS_MK_POOL_SIZE_L1);\r
+ printf("BLIS_KN_POOL_SIZE_L1 is %d.\n", BLIS_KN_POOL_SIZE_L1);\r
+ printf("BLIS_MN_POOL_SIZE_L1 is %d.\n", BLIS_MN_POOL_SIZE_L1);\r
+ printf("BLIS_MK_POOL_SIZE_L2 is %d.\n", BLIS_MK_POOL_SIZE_L2);\r
+ printf("BLIS_KN_POOL_SIZE_L2 is %d.\n", BLIS_KN_POOL_SIZE_L2);\r
+ printf("BLIS_MN_POOL_SIZE_L2 is %d.\n", BLIS_MN_POOL_SIZE_L2);\r
+ printf("BLIS_MK_POOL_SIZE_L3 is %d.\n", BLIS_MK_POOL_SIZE_L3);\r
+ printf("BLIS_KN_POOL_SIZE_L3 is %d.\n", BLIS_KN_POOL_SIZE_L3);\r
+ printf("BLIS_MN_POOL_SIZE_L3 is %d.\n", BLIS_MN_POOL_SIZE_L3);\r
+*/\r
+} /* tiCblasGetSizes */\r
+\r
+/*==============================================================================\r
+ * It performs necessary initialization through library framework API in order\r
+ * to do memory allocations. \r
+ *============================================================================*/\r
+int tiCblasInit(void * mem_vfast_base, size_t mem_vfast_size,\r
+ void * mem_fast_base, size_t mem_fast_size,\r
+ void * mem_medium_base, size_t mem_medium_size,\r
+ void * mem_slow_base, size_t mem_slow_size)\r
+{\r
+ lib_memdscr_t **blas_mem_handle = blasGetMemHandle();\r
+\r
+ /* Verify supplied memories meet requirements */ \r
+ if( ((mem_vfast_base == NULL) || (mem_vfast_size < BLAS_MEM_SIZE_VFAST) )\r
+ ||((mem_fast_base == NULL) || (mem_fast_size < BLAS_MEM_SIZE_FAST) )\r
+ ||((mem_medium_base == NULL) || (mem_medium_size < BLAS_MEM_SIZE_MEDIUM) )\r
+ ||((mem_slow_base == NULL) || (mem_slow_size < BLAS_MEM_SIZE_SLOW) )\r
+ ) {\r
+ return(TICBLAS_ERROR);\r
+ }\r
+ else {\r
+ lib_smem_vinit(blas_mem_handle, mem_vfast_base, mem_vfast_size);\r
+ lib_smem_finit(blas_mem_handle, mem_fast_base, mem_fast_size);\r
+ lib_smem_minit(blas_mem_handle, mem_medium_base, mem_medium_size);\r
+ lib_smem_sinit(blas_mem_handle, mem_slow_base, mem_slow_size); \r
+ \r
+ pool_mk_mem_L1 = lib_smem_valloc(blas_mem_handle, BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);\r
+ pool_kn_mem_L1 = lib_smem_valloc(blas_mem_handle, BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);\r
+ pool_mn_mem_L1 = lib_smem_valloc(blas_mem_handle, BLIS_MN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);\r
+\r
+ pool_mk_mem_L2 = lib_smem_falloc(blas_mem_handle, BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);\r
+ pool_kn_mem_L2 = lib_smem_falloc(blas_mem_handle, BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);\r
+ pool_mn_mem_L2 = lib_smem_falloc(blas_mem_handle, BLIS_MN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);\r
+ \r
+ pool_mk_mem_L3 = lib_smem_malloc(blas_mem_handle, BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);\r
+ pool_kn_mem_L3 = lib_smem_malloc(blas_mem_handle, BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);\r
+ pool_mn_mem_L3 = lib_smem_malloc(blas_mem_handle, BLIS_MN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);\r
+/*\r
+ printf("BLIS_MK_POOL_SIZE_L1 is %d, pool_mk_mem_L1 is 0x%x.\n", BLIS_MK_POOL_SIZE_L1, (unsigned int)pool_mk_mem_L1);\r
+ printf("BLIS_KN_POOL_SIZE_L1 is %d, pool_kn_mem_L1 is 0x%x.\n", BLIS_KN_POOL_SIZE_L1, (unsigned int)pool_kn_mem_L1);\r
+ printf("BLIS_MN_POOL_SIZE_L1 is %d, pool_mn_mem_L1 is 0x%x.\n", BLIS_MN_POOL_SIZE_L1, (unsigned int)pool_mn_mem_L1);\r
+ printf("BLIS_MK_POOL_SIZE_L2 is %d, pool_mk_mem_L2 is 0x%x.\n", BLIS_MK_POOL_SIZE_L2, (unsigned int)pool_mk_mem_L2);\r
+ printf("BLIS_KN_POOL_SIZE_L2 is %d, pool_kn_mem_L2 is 0x%x.\n", BLIS_KN_POOL_SIZE_L2, (unsigned int)pool_kn_mem_L2);\r
+ printf("BLIS_MN_POOL_SIZE_L2 is %d, pool_mn_mem_L2 is 0x%x.\n", BLIS_MN_POOL_SIZE_L2, (unsigned int)pool_mn_mem_L2);\r
+ printf("BLIS_MK_POOL_SIZE_L3 is %d, pool_mk_mem_L3 is 0x%x.\n", BLIS_MK_POOL_SIZE_L3, (unsigned int)pool_mk_mem_L3);\r
+ printf("BLIS_KN_POOL_SIZE_L3 is %d, pool_kn_mem_L3 is 0x%x.\n", BLIS_KN_POOL_SIZE_L3, (unsigned int)pool_kn_mem_L3);\r
+ printf("BLIS_MN_POOL_SIZE_L3 is %d, pool_mn_mem_L3 is 0x%x.\n", BLIS_MN_POOL_SIZE_L3, (unsigned int)pool_mn_mem_L3);\r
+*/ \r
+ if( (pool_mk_mem_L1 == NULL)\r
+ ||(pool_kn_mem_L1 == NULL) \r
+ ||(pool_mn_mem_L1 == NULL) \r
+ ||(pool_mk_mem_L2 == NULL) \r
+ ||(pool_kn_mem_L2 == NULL) \r
+ ||(pool_mn_mem_L2 == NULL) \r
+ ||(pool_mk_mem_L3 == NULL) \r
+ ||(pool_kn_mem_L3 == NULL) \r
+ ||(pool_mn_mem_L3 == NULL) ) {\r
+ return(TICBLAS_ERROR); \r
+ } \r
+ else {\r
+ bli_mem_init();\r
+ return(TICBLAS_SUCCESS);\r
+ }\r
+ }\r
+} /* tiCblasInit */\r
+\r
+int tiCblasNew()\r
+{\r
+ if(bli_init() == BLIS_SUCCESS) {\r
+ return(TICBLAS_SUCCESS);\r
+ }\r
+ else {\r
+ return(TICBLAS_ERROR);\r
+ }\r
+}\r
+\r
+int tiCblasDelete()\r
+{\r
+ if(bli_finalize() == BLIS_SUCCESS) {\r
+ return(TICBLAS_SUCCESS);\r
+ }\r
+ else {\r
+ return(TICBLAS_ERROR);\r
+ }\r
+}\r
+\r
+/* Nothing after this line */\r
diff --git a/ticblas/ticblas.h b/ticblas/ticblas.h
--- /dev/null
+++ b/ticblas/ticblas.h
@@ -0,0 +1,108 @@
+/******************************************************************************\r
+ * Copyright (c) 2015, Texas Instruments Incorporated - http://www.ti.com\r
+ * All rights reserved.\r
+ *\r
+ * Redistribution and use in source and binary forms, with or without\r
+ * modification, are permitted provided that the following conditions are met:\r
+ * * Redistributions of source code must retain the above copyright\r
+ * notice, this list of conditions and the following disclaimer.\r
+ * * Redistributions in binary form must reproduce the above copyright\r
+ * notice, this list of conditions and the following disclaimer in the\r
+ * documentation and/or other materials provided with the distribution.\r
+ * * Neither the name of Texas Instruments Incorporated nor the\r
+ * names of its contributors may be used to endorse or promote products\r
+ * derived from this software without specific prior written permission.\r
+ *\r
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"\r
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\r
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE\r
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE\r
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR\r
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF\r
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS\r
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN\r
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)\r
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF\r
+ * THE POSSIBILITY OF SUCH DAMAGE.\r
+ *****************************************************************************/\r
+#ifndef _TICBLAS_H\r
+#define _TICBLAS_H\r
+\r
+#include <stddef.h> \r
+\r
+/** @defgroup ti_cblas_api CBLAS API Extension for TI-DSP \r
+ * @{\r
+ */\r
+/** @} */\r
+\r
+/** @addtogroup ti_cblas_api \r
+ * @{\r
+ * @name Error Return Codes\r
+ */\r
+/*@{*/\r
+#define TICBLAS_SUCCESS (0) /**< Success. No error. */\r
+#define TICBLAS_ERROR (-1) /**< Failure: rejected buffer, failed pool allocation, or BLIS init/finalize error. */ \r
+/*@}*/\r
+/** @} */\r
+\r
+/**\r
+ * @ingroup ti_cblas_api\r
+ * @brief Function tiCblasGetSizes() returns the required size of each of the\r
+ * memory types defined by the Library Architecture and Framework \r
+ * (LibArch).\r
+ *\r
+ * @param[out] smem_size_vfast required size of very fast shared memory \r
+ * @param[out] smem_size_fast required size of fast shared memory \r
+ * @param[out] smem_size_medium required size of medium speed shared memory \r
+ * @param[out] smem_size_slow required size of slow shared memory \r
+ * @remarks tiCblasInit() fails when given a buffer smaller than the size reported here.\r
+ */\r
+void tiCblasGetSizes(size_t *smem_size_vfast, size_t *smem_size_fast, \r
+ size_t *smem_size_medium, size_t *smem_size_slow);\r
+\r
+/**\r
+ * @ingroup ti_cblas_api\r
+ * @brief Function tiCblasNew() creates an instance for CBLAS.\r
+ *\r
+ * @remarks tiCblasNew() MUST be called before tiCblasInit().\r
+ * @remarks The implementation succeeds only when bli_init() returns BLIS_SUCCESS.\r
+ * @retval TICBLAS_SUCCESS @copydoc TICBLAS_SUCCESS\r
+ * @retval TICBLAS_ERROR @copydoc TICBLAS_ERROR\r
+ */\r
+int tiCblasNew();\r
+\r
+/**\r
+ * @ingroup ti_cblas_api\r
+ * @brief Function tiCblasInit() performs heap initialization for CBLAS \r
+ * to do memory allocations.\r
+ * \r
+ * @remarks tiCblasInit() MUST be called only after tiCblasNew(). \r
+ *\r
+ * @param[in] mem_vfast_base base of very fast shared memory \r
+ * @param[in] mem_vfast_size size of very fast shared memory \r
+ * @param[in] mem_fast_base base of fast shared memory \r
+ * @param[in] mem_fast_size size of fast shared memory \r
+ * @param[in] mem_medium_base base of medium speed shared memory \r
+ * @param[in] mem_medium_size size of medium speed shared memory \r
+ * @param[in] mem_slow_base base of slow shared memory \r
+ * @param[in] mem_slow_size size of slow shared memory \r
+ * @remarks Fails if any base is NULL or any size is below the tiCblasGetSizes() value.\r
+ * @retval TICBLAS_SUCCESS @copydoc TICBLAS_SUCCESS\r
+ * @retval TICBLAS_ERROR @copydoc TICBLAS_ERROR\r
+ */\r
+int tiCblasInit(void * mem_vfast_base, size_t mem_vfast_size,\r
+ void * mem_fast_base, size_t mem_fast_size,\r
+ void * mem_medium_base, size_t mem_medium_size,\r
+ void * mem_slow_base, size_t mem_slow_size);\r
+\r
+/**\r
+ * @ingroup ti_cblas_api\r
+ * @brief Function tiCblasDelete() deletes the instance of CBLAS created by\r
+ * tiCblasNew(). \r
+ * @remarks The implementation succeeds only when bli_finalize() returns BLIS_SUCCESS.\r
+ * @retval TICBLAS_SUCCESS @copydoc TICBLAS_SUCCESS\r
+ * @retval TICBLAS_ERROR @copydoc TICBLAS_ERROR\r
+ */\r
+int tiCblasDelete();\r
+\r
+#endif /* _TICBLAS_H */\r