summary | shortlog | log | commit | commitdiff | tree
raw | patch | inline | side by side (parent: 4823f27)
raw | patch | inline | side by side (parent: 4823f27)
author | Jianzhong Xu <xuj@ti.com> | |
Fri, 4 Mar 2016 20:26:22 +0000 (15:26 -0500) | ||
committer | Jianzhong Xu <xuj@ti.com> | |
Fri, 4 Mar 2016 20:26:22 +0000 (15:26 -0500) |
394 files changed:
diff --git a/Makefile b/Makefile
index d7de1721ad201466f0f87f353b355ddc5c2efa6e..ca06b9e084f594a56ba67a83787f30b0579a427f 100644 (file)
--- a/Makefile
+++ b/Makefile
LINALG_BLIS_DIR = blis
LINALG_CBLAS_DIR = cblas
+LINALG_TICBLAS_DIR = ticblas
LINALG_BLASACC_DIR = blasblisacc
LINALG_CLAPACK_DIR = clapack
BLIS_VERSION = $(shell cat $(LINALG_BLIS_DIR)/version)
-LINALG_HEADERS =$(LINALG_CLAPACK_DIR)/INCLUDE/blaswrap.h
-LINALG_HEADERS+=$(LINALG_CLAPACK_DIR)/INCLUDE/clapack.h
-LINALG_HEADERS+=$(LINALG_CLAPACK_DIR)/INCLUDE/f2c.h
-LINALG_HEADERS+=$(LINALG_CBLAS_DIR)/include/cblas.h
+CBLAS_HEADERS =$(LINALG_CBLAS_DIR)/include/cblas.h
+CBLAS_HEADERS +=$(LINALG_TICBLAS_DIR)/ticblas.h
+CLAPACK_HEADERS =$(LINALG_CLAPACK_DIR)/INCLUDE/blaswrap.h
+CLAPACK_HEADERS+=$(LINALG_CLAPACK_DIR)/INCLUDE/clapack.h
+CLAPACK_HEADERS+=$(LINALG_CLAPACK_DIR)/INCLUDE/f2c.h
-build: ARMonly
-prebuild: DSPonly
-linalg: ARMplusDSP
+ifeq ($(MEM_MODEL),Large)
+BLIS_CFG = c66xLarge
+else ifeq ($(MEM_MODEL),Medium)
+BLIS_CFG = c66xMedium
+else ifeq ($(MEM_MODEL),Small)
+BLIS_CFG = c66xSmall
+endif
-ARMonly:
+DSPlibs:
+ cd $(LINALG_CBLAS_DIR); make arch=C66 alllib; \
+ cd ../$(LINALG_BLIS_DIR); ./configure -p install/$(BLIS_CFG) c66x; make -j8 MEM_MODEL=$(MEM_MODEL) TARGET=$(TARGET) LIBOS=$(LIBOS); make install; \
+ cd ../$(LINALG_TICBLAS_DIR)/src; make MEM_MODEL=$(MEM_MODEL) TARGET=$(TARGET) LIBOS=$(LIBOS); cd ../lib; \
+ echo "combining BLIS, CBLAS, and TICBLAS libraries into one: libcblas.ae66"; \
+ mkdir -p objs; cd objs; rm -f *; ar x ../../../blis/install/$(BLIS_CFG)/lib/libblis.ae66; mmv 'cblas*.o' 'blis_cblas#1.o'; \
+ ar -x ../../../cblas/lib/C66/libcblas.ae66; ar -x ../libticblas.ae66; chmod +rw *;cd ../../..; \
+ mkdir -p lib; cd lib; rm -f *; ar -cr libcblas.ae66 ../ticblas/lib/objs/*; cd ..
+
+ARMlibs:
cd $(LINALG_CBLAS_DIR); make arch=ARM alllib; \
cd ../$(LINALG_BLIS_DIR); ./configure -p install/arm cortex-a15; make -j8; make install; \
- cd ../$(LINALG_BLASACC_DIR)/src; make -f Makefile.ARM; \
- cd ../../$(LINALG_CLAPACK_DIR); make f2clib; make cblaswrap; cd SRC; make
+ cd ../$(LINALG_CLAPACK_DIR); make f2clib; make cblaswrap; cd SRC; make -j8; cd ..
-ARMplusDSP:
- cd $(LINALG_CBLAS_DIR); make arch=ARM alllib; make arch=C66 alllib; \
- cd ../$(LINALG_BLIS_DIR); ./configure -p install/c66x c66x; make -j8; make install; \
- ./configure -p install/arm cortex-a15; make -j8; make install; \
- cd ../$(LINALG_BLASACC_DIR); make cross; \
- cd ../$(LINALG_CLAPACK_DIR); make f2clib; make cblaswrap; cd SRC; make
+ARMplusDSP: DSPlibs ARMlibs
+ cd $(LINALG_BLASACC_DIR)/src; make MEM_MODEL=$(MEM_MODEL) TARGET=$(TARGET); cd ../..; \
+ cd lib; rm -f *; \
+ cp ../$(LINALG_BLASACC_DIR)/lib/libcblas_armplusdsp.a .; \
+ cp ../$(LINALG_BLIS_DIR)/install/arm/lib/libblis.a .; \
+ ar -x libblis.a; mmv "cblas_*.o" "blis_cblas_#1.o"; ar -x libcblas_armplusdsp.a; rm *.a; \
+ ar -cr libcblas_armplusdsp.a *.o; rm *.o; cd ..; \
+ cp $(LINALG_CLAPACK_DIR)/lapack_ARM.a ./lib/liblapack.a; \
+ cp $(LINALG_CLAPACK_DIR)/libcblaswr_ARM.a ./lib/libcblaswr.a; \
+ cp $(LINALG_CLAPACK_DIR)/F2CLIBS/libf2c_ARM.a ./lib/libf2c.a
-BLIStest:
- cd $(LINALG_BLIS_DIR)/testsuite; make lib=OpenCLCBLAS -j8
+cleanDSPlibs:
+ cd $(LINALG_CBLAS_DIR); make arch=C66 clean; \
+ cd ../$(LINALG_BLIS_DIR); ./configure -p install/$(BLIS_CFG) c66x; make -j8 clean; rm install/$(BLIS_CFG)/lib/*; rm install/$(BLIS_CFG)/include/blis/*; \
+ cd ../$(LINALG_TICBLAS_DIR)/src; make clean; \
+ cd ../..; rm lib/*
-cleanARMplusDSP:
- cd $(LINALG_CBLAS_DIR); make arch=ARM clean; make arch=C66 clean; \
- cd ../$(LINALG_BLIS_DIR); ./configure -p install/c66x c66x; make clean; \
- ./configure -p install/arm cortex-a15; make clean; \
+cleanARMlibs:
+ cd $(LINALG_CBLAS_DIR); make arch=ARM clean; \
+ cd ../$(LINALG_BLIS_DIR); ./configure -p install/arm cortex-a15; make clean; \
cd ../$(LINALG_BLASACC_DIR); make clean; \
cd ../$(LINALG_BLIS_DIR)/testsuite; make clean; \
cd ../../$(LINALG_CLAPACK_DIR); make clean
-clean:
- cd $(LINALG_CBLAS_DIR)/src; make arch=ARM clean; \
- cd ../../$(LINALG_BLIS_DIR); ./configure -p install/arm cortex-a15; make clean; \
- cd ../$(LINALG_BLASACC_DIR)/src; make -f Makefile.ARM cleanARM; \
- cd ../../$(LINALG_CLAPACK_DIR); make clean
+BLIStest:
+ cd $(LINALG_BLIS_DIR); ./configure -p install/arm cortex-a15; \
+ cd testsuite; make lib=OpenCLCBLAS -j8
+
+BLIStestDSP:
+ cd $(LINALG_BLIS_DIR); ./configure -p install/$(BLIS_CFG) c66x; \
+ cd testsuite/dsponly; make MEM_MODEL=Small TARGET=SOC_C6678 LIBOS=LIB_RTOS
+
+BLAStest:
+ cd $(LINALG_CLAPACK_DIR)/BLAS/TESTING; make -f Makeblat1; make -f Makeblat2; make -f Makeblat3
+
+CLAPACKtest:
+ cd $(LINALG_CLAPACK_DIR)/TESTING/MATGEN; make
+ cd $(LINALG_CLAPACK_DIR)/TESTING/LIN; make
+ cd $(LINALG_CLAPACK_DIR)/TESTING/EIG; make
+
+
+cleanARMplusDSP: cleanDSPlibs cleanARMlibs
+
+docs: ./docs/doxygen/doxycfg.txt ./docs/doxygen/mainpage.dox
+ doxygen ./docs/doxygen/doxycfg.txt
+
+installDSPlib:
+ install -m 755 -d ${DESTDIR}/include
+ install -m 755 -d ${DESTDIR}/lib
+ cp $(CBLAS_HEADERS) ${DESTDIR}/include
+ cp ./lib/libcblas.ae66 ${DESTDIR}/lib
+ cp -r docs ${DESTDIR}
+
+installARMplusDSPlib:
+ install -m 755 -d ${DESTDIR}/include
+ install -m 755 -d ${DESTDIR}/lib
+ cp $(CBLAS_HEADERS) ${DESTDIR}/include
+ cp $(CLAPACK_HEADERS) ${DESTDIR}/include
+ cp ./lib/libcblas_armplusdsp.a ${DESTDIR}/lib
+ cp ./lib/liblapack.a ${DESTDIR}/lib
+ cp ./lib/libcblaswr.a ${DESTDIR}/lib
+ cp ./lib/libf2c.a ${DESTDIR}/lib
+ cp -r docs ${DESTDIR}
-DSPonly:
- cd $(LINALG_CBLAS_DIR); make arch=C66 alllib; \
- cd ../$(LINALG_BLIS_DIR); ./configure -p install/c66x c66x; make -j8; make install; \
- cd ../$(LINALG_BLASACC_DIR)/src; make ti_cblas_kernel.dsp_h
-
-install:
- install -m 755 -d ${DESTDIR}/usr/include
- install -m 755 -d ${DESTDIR}/usr/lib
- install -m 755 -d ${DESTDIR}/usr/share/doc/ti-linalg
- install -m 755 -d ${DESTDIR}/usr/share/ti/examples/linalg
- cp $(LINALG_HEADERS) ${DESTDIR}/usr/include
- cp $(LINALG_BLASACC_DIR)/lib/libcblas_armplusdsp.a ${DESTDIR}/usr/lib
- cp $(LINALG_BLIS_DIR)/install/arm/lib/libblis-$(BLIS_VERSION)-cortex-a15.a ${DESTDIR}/usr/lib/libblis.a
- cp -r ./examples/* ${DESTDIR}/usr/share/ti/examples/linalg
- cp $(LINALG_CLAPACK_DIR)/lapack_ARM.a ${DESTDIR}/usr/lib/liblapack.a
- cp $(LINALG_CLAPACK_DIR)/libcblaswr_ARM.a ${DESTDIR}/usr/lib/libcblaswr.a
- cp $(LINALG_CLAPACK_DIR)/F2CLIBS/libf2c_ARM.a ${DESTDIR}/usr/lib/libf2c.a
- cp docs/* ${DESTDIR}/usr/share/doc/ti-linalg
diff --git a/blasblisacc/Makefile b/blasblisacc/Makefile
index 8d02c6746ca589e283737cfdc86e2d002fcc83a4..1c6e202d87b96753837a5ea321a4f314be6047a8 100644 (file)
--- a/blasblisacc/Makefile
+++ b/blasblisacc/Makefile
include ../make.inc
# use all for cross compilation
-cross: all
# build library
all:
cd src; $(MAKE)
+crossC66x:
+ cd src; $(MAKE) crossC66x
+
+crossAM57x:
+ cd src; $(MAKE) crossAM57x
+
+crossShannon:
+ cd src; $(MAKE) crossShannon
+
debug:
cd src; $(MAKE) debug
index 0431d63aa62f52e0d6115bc8670a21d1a53a4d62..1946377c397f40d394014ad3b3564c96e7f30836 100644 (file)
--- a/blasblisacc/src/Makefile
+++ b/blasblisacc/src/Makefile
include ../../make.inc
-TI_INSTALL_DIR?=/usr/src/dsp
-
-PATH:=$(TI_OCL_CGT_INSTALL)/bin:$(PATH)
-
-define FIND_DSP_PKG
- export $(1)?=$$(patsubst %/$(3),%,$$(lastword $$(sort $$(wildcard $$(TI_INSTALL_DIR)/$(2)/$(3)))))
- ifeq ($$($(1)),)
- $$(error ERROR - $(1) is not defined and could not be found in $(TI_INSTALL_DIR)/ )
- else
- ifeq ($$(wildcard $$($(1))/$(3)),)
- $$(error ERROR - "$(1) = $$($(1))" Is not valid!)
- endif
- endif
- $$(info Using $(1) = $$($(1)))
-endef
-
-UNAME_M :=$(shell uname -m)
-
-MAKEFLAGS += -j1
+PATH:=$(CGTROOT)/bin:$(PATH)
# Defines
TI_CBLAS_FAT_BINARY = 1
ARM_PLUS_DSP_LIB_DIR = ../lib
-# INCLUDE Directory
-$(eval $(call FIND_DSP_PKG,OMP_DIR,openmp_dsp*,packages))
-ifneq (,$(findstring 86, $(UNAME_M)))
-$(eval $(call FIND_DSP_PKG,C6636_PDK_DIR,pdk_keystone2*,packages))
-endif
-INCDIR := $(TI_OCL_CGT_INSTALL)/include;$(TARGET_ROOTDIR)/usr/include;../../blis/install/c66x/include/blis/;$(OMP_DIR)/packages/ti/runtime/openmp;$(PDK_DIR)/packages
+INCDIR := $(CGTROOT)/include
+INCDIR += -I$(OMP_DIR)/packages/ti/runtime/openmp
+INCDIR += -I$(FC_DIR)/packages
+INCDIR += -I$(XDC_DIR)/packages
+INCDIR += -I$(BIOS_DIR)/packages
+INCDIR += -I$(XDAIS_DIR)/packages
+INCDIR += -I$(LIBARCH_DIR)/include
+INCDIR += -I$(PDK_DIR)/packages
+INCDIR += -I$(TI_OCL_INSTALL_DIR)
INCS = -I. -I$(strip $(subst ;, -I,$(subst $(space),$(space),$(INCDIR))))
OBJS = ti_cblas_initfini.o
+# CBLAS and BLIS directories
+CBLAS_DSP_LIB = ../../cblas/lib/C66/libcblas.ae66
+TICBLAS_DSP_LIB = ../../ticblas/lib/libticblas.ae66
+CBLAS_ARM_LIB = ../../cblas/lib/ARM/libcblas_ARM.a
+LIBARCH_LIB = $(LIBARCH_DIR)/lib/libArch.ae66
+
+ifeq ($(MEM_MODEL),Large)
+BLIS_DSP_LIB = ../../blis/install/c66xLarge/lib/libblis.ae66
+else ifeq ($(MEM_MODEL),Medium)
+BLIS_DSP_LIB = ../../blis/install/c66xMedium/lib/libblis.ae66
+else ifeq ($(MEM_MODEL),Small)
+BLIS_DSP_LIB = ../../blis/install/c66xSmall/lib/libblis.ae66
+#else ifeq ($(MEM_MODEL),Tiny)
+endif
CPP_DEBUG = -g
-CPP_FLAGS = -D_LITTLE_ENDIAN -D__ARMv7 -DDEVICE_K2H -I../../cblas/include -I../../blis/install/arm/include/blis/ -I$(TI_OCL_INSTALL_DIR)/include -fopenmp
-CL6X_FLAGS = $(INCS) --openmp --use_g2
+CPP_FLAGS = -D_LITTLE_ENDIAN -D__ARMv7 -D$(TARGET) -I../../cblas/include -I../../blis/install/arm/include/blis/ -I$(TI_OCL_INSTALL_DIR)/include -fopenmp
+CL6X_FLAGS = $(INCS) --openmp --use_g2 -D$(TARGET) -DLIB_OPENCL
CLOCL_FLAGS =
OBJCOPY_ARGS=
ARM_PLUS_DSP_LIB= $(ARM_PLUS_DSP_LIB_DIR)/libcblas_armplusdsp.a
# OpenCL libraries included in make.inc
LIBS += -lpthread
-# CBLAS and BLIS directories
-CBLAS_DSP_LIB = ../../cblas/lib/C66/libcblas_C66.ae66
-BLIS_DSP_LIB = ../../blis/install/c66x/lib/libblis.ae66
-CBLAS_ARM_LIB = ../../cblas/lib/ARM/libcblas_ARM.a
-
OCL_BIN = ti_cblas_kernel.out
ifeq ($(TI_CBLAS_FAT_BINARY), 1)
OBJS += ofld_tbl_strsm.o
OBJS += ofld_tbl_ztrsm.o
-
all: armplusdsp
cross: armplusdsp
+crossC66x: BLIS_DSP_LIB = ../../blis/install/c66x/lib/libblis.ae66
+#crossC66x: CL6X_FLAGS+= -I../../blis/install/c66x/include/blis/
+crossC66x: cross $(OCL_BIN) $(OBJS)
+
+crossAM57x: BLIS_DSP_LIB = ../../blis/install/am57x/lib/libblis.ae66
+#crossAM57x: CL6X_FLAGS+= -I../../blis/install/am57x/include/blis/
+crossAM57x: cross $(OCL_BIN) $(OBJS)
+
+crossShannon: BLIS_DSP_LIB = ../../blis/install/shannon/lib/libblis.ae66
+#crossShannon: CL6X_FLAGS+= -I../../blis/install/shannon/include/blis/
+crossShannon: cross $(OCL_BIN) $(OBJS)
+
debug: CPP_FLAGS += -DTI_CBLAS_DEBUG $(CPP_DEBUG) #-DTI_CBLAS_PROFILE
debug: cross
+debug: CL6X_FLAGS += -DTI_CBLAS_DEBUG
profile: CPP_FLAGS += -DTI_CBLAS_PROFILE
profile: armplusdsp
ti_cblas_initfini.o: $(OCL_BIN)
# target for fat binary
-ti_cblas_kernel.dsp_h: ti_cblas_kernel.cl facade.obj $(CBLAS_DSP_LIB) $(BLIS_DSP_LIB)
+ti_cblas_kernel.dsp_h: ti_cblas_kernel.cl facade.obj ti_cblas_mem_config.obj $(CBLAS_DSP_LIB) $(BLIS_DSP_LIB) $(TICBLAS_DSP_LIB) $(LIBARCH_LIB)
@echo; echo Building $@
@rm -f ti_cblas_kernel.out
@echo Building fat binary header
index a2a7f1202d163beee946bac735023dbf197ff0bf..a4e8255c1be7b6fd9da86d6bd9b7ead1405007a9 100644 (file)
ARM_PLUS_DSP_LIB_DIR = ../lib
# INCLUDE Directory
-INCDIR := $(TI_OCL_CGT_INSTALL)/include;$(TI_OCL_INSTALL_DIR)/opencl/include;$(LINUX_DEVKIT_ROOT)/usr/include;../../blis/install/c66x/include/blis/;$(OMP_DIR)/packages/ti/runtime/openmp
+INCDIR := $(CGTROOT)/include;$(TI_OCL_INSTALL_DIR)/opencl/include;$(LINUX_DEVKIT_ROOT)/usr/include;../../blis/install/c66x/include/blis/;$(OMP_DIR)/packages/ti/runtime/openmp
INCS = -I. -I$(strip $(subst ;, -I,$(subst $(space),$(space),$(INCDIR))))
OBJS = ti_cblas_initfini.o
CPP_DEBUG = -g
-CPP_FLAGS = -D_LITTLE_ENDIAN -D__ARMv7 -DDEVICE_K2H -I../../cblas/include -I../../blis/install/arm/include/blis/ -I$(TI_OCL_INSTALL_DIR)/include -fopenmp
+CPP_FLAGS = -D_LITTLE_ENDIAN -D__ARMv7 -DSOC_K2H -I../../cblas/include -I../../blis/install/arm/include/blis/ -I$(TI_OCL_INSTALL_DIR)/include -fopenmp
CL6X_FLAGS = $(INCS) --openmp --use_g2
CLOCL_FLAGS =
OBJCOPY_ARGS=
ARM_PLUS_DSP_LIB= $(ARM_PLUS_DSP_LIB_DIR)/libcblas_armplusdsp.a
-# CBLAS and BLIS directories
-CBLAS_DSP_LIB = ../../cblas/lib/C66/libcblas_C66.ae66
-BLIS_DSP_LIB = ../../blis/install/c66x/lib/libblis.ae66
-CBLAS_ARM_LIB = ../../cblas/lib/ARM/libcblas_ARM.a
-
OCL_BIN = ti_cblas_kernel.out
ifeq ($(TI_CBLAS_FAT_BINARY), 1)
index b4f117b66c2811b621db1166e5c297aaafc97161..547d98328ae13fad4ada8e194bedd8736449e132 100644 (file)
--- a/blasblisacc/src/facade.c
+++ b/blasblisacc/src/facade.c
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include <stdio.h>
#include "../../cblas/include/cblas.h"
-#include "blis.h"
-#define DEVICE_K2H
+#include "../../ticblas/ticblas.h"
-#include <dsp_c.h>
+#ifdef TI_CBLAS_DEBUG
+#include "stdio.h"
-#define getNextMultiple(x, y) ( ( ((x)+(y)-1)/(y) )* (y) )
-// L1 buffer is hardwared here
-#define L1_BUF_LOC 0x00F00000
+extern char *pool_mk_mem_L1;
+extern char *pool_kn_mem_L1;
+extern char *pool_mn_mem_L1;
+extern char *pool_mk_mem_L2;
+extern char *pool_kn_mem_L2;
+extern char *pool_mn_mem_L2;
+extern char *pool_mk_mem_L3;
+extern char *pool_kn_mem_L3;
+extern char *pool_mn_mem_L3;
+#endif
-// note these pointers must be filled if used functions
-char *pool_mk_mem_L1;
-char *pool_kn_mem_L1;
-char *pool_mn_mem_L1;
+extern int bli_l3_mem_config(void *msmc_buf, size_t msmc_buf_size, void *ddr_buf, size_t ddr_buf_size, size_t *l1D_SRAM_size_orig, size_t *l2_SRAM_size_orig);
+extern int bli_l3_mem_reconfig(size_t l1D_SRAM_size_orig, size_t l2_SRAM_size_orig);
-char *pool_mk_mem_L2;
-char *pool_kn_mem_L2;
-char *pool_mn_mem_L2;
-
-char *pool_mk_mem_L3;
-char *pool_kn_mem_L3;
-char *pool_mn_mem_L3;
-
-void ti_bli_init_dsp(char *l3_buf, char *l2_buf)
-{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- bli_init();
-}
-
-void ti_bli_finalize_dsp(void)
-{
- bli_finalize();
-}
void cblas_caxpy_facade(const int N, const void *alpha, const void *X, const int incX, void *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_caxpy(N, alpha, X, incX, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_ccopy_facade(const int N, const void *X, const int incX, void *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_ccopy(N, X, incX, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_cdotc_sub_facade(const int N, const void *X, const int incX, const void *Y, const int incY, void *dotc)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_cdotc_sub(N, X, incX, Y, incY, dotc);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_cdotu_sub_facade(const int N, const void *X, const int incX, const void *Y, const int incY, void *dotu)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_cdotu_sub(N, X, incX, Y, incY, dotu);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_cgbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, const int KL, const int KU, const void *alpha, const void *A, const int lda, const void *X, const int incX, const void *beta, void *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_cgbmv(order, TransA, M, N, KL, KU, alpha, A, lda, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
-void cblas_cgemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc, float *l3_buf, float *l2_buf_loc)
+void cblas_cgemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
-
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_cgemm(Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
void cblas_cgemv_facade(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, const void *alpha, const void *A, const int lda, const void *X, const int incX, const void *beta, void *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_cgemv(order, TransA, M, N, alpha, A, lda, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_cgerc_facade(const enum CBLAS_ORDER order, const int M, const int N, const void *alpha, const void *X, const int incX, const void *Y, const int incY, void *A, const int lda)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_cgerc(order, M, N, alpha, X, incX, Y, incY, A, lda);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_cgeru_facade(const enum CBLAS_ORDER order, const int M, const int N, const void *alpha, const void *X, const int incX, const void *Y, const int incY, void *A, const int lda)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_cgeru(order, M, N, alpha, X, incX, Y, incY, A, lda);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_chbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const int K, const void *alpha, const void *A, const int lda, const void *X, const int incX, const void *beta, void *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_chbmv(order, Uplo, N, K, alpha, A, lda, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
-void cblas_chemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc, float *l3_buf, float *l2_buf_loc)
+void cblas_chemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_chemm(Order, Side, Uplo, M, N, alpha, A, lda, B, ldb, beta, C, ldc);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
void cblas_chemv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const void *alpha, const void *A, const int lda, const void *X, const int incX, const void *beta, void *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_chemv(order, Uplo, N, alpha, A, lda, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_cher_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const float alpha, const void *X, const int incX, void *A, const int lda)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_cher(order, Uplo, N, alpha, X, incX, A, lda);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_cher2_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const void *alpha, const void *X, const int incX, const void *Y, const int incY, void *A, const int lda)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_cher2(order, Uplo, N, alpha, X, incX, Y, incY, A, lda);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
-void cblas_cher2k_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const float beta, void *C, const int ldc, float *l3_buf, float *l2_buf_loc)
+void cblas_cher2k_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const float beta, void *C, const int ldc, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_cher2k(Order, Uplo, Trans, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
-void cblas_cherk_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const float alpha, const void *A, const int lda, const float beta, void *C, const int ldc, float *l3_buf, float *l2_buf_loc)
+void cblas_cherk_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const float alpha, const void *A, const int lda, const float beta, void *C, const int ldc, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_cherk(Order, Uplo, Trans, N, K, alpha, A, lda, beta, C, ldc);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
void cblas_chpmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const void *alpha, const void *Ap, const void *X, const int incX, const void *beta, void *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_chpmv(order, Uplo, N, alpha, Ap, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_chpr_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const float alpha, const void *X, const int incX, void *A)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_chpr(order, Uplo, N, alpha, X, incX, A);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_chpr2_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const void *alpha, const void *X, const int incX, const void *Y, const int incY, void *Ap)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_chpr2(order, Uplo, N, alpha, X, incX, Y, incY, Ap);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_crotg_facade(void *a, void *b, float *c, void *s)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_crotg(a, b, c, s);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_cscal_facade(const int N, const void *alpha, void *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_cscal(N, alpha, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_csscal_facade(const int N, const float alpha, void *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_csscal(N, alpha, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_cswap_facade(const int N, void *X, const int incX, void *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_cswap(N, X, incX, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
-void cblas_csymm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc, float *l3_buf, float *l2_buf_loc)
+void cblas_csymm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_csymm(Order, Side, Uplo, M, N, alpha, A, lda, B, ldb, beta, C, ldc);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
-void cblas_csyr2k_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc, float *l3_buf, float *l2_buf_loc)
+void cblas_csyr2k_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_csyr2k(Order, Uplo, Trans, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
-void cblas_csyrk_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const void *alpha, const void *A, const int lda, const void *beta, void *C, const int ldc, float *l3_buf, float *l2_buf_loc)
+void cblas_csyrk_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const void *alpha, const void *A, const int lda, const void *beta, void *C, const int ldc, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_csyrk(Order, Uplo, Trans, N, K, alpha, A, lda, beta, C, ldc);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
void cblas_ctbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const int K, const void *A, const int lda, void *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_ctbmv(order, Uplo, TransA, Diag, N, K, A, lda, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_ctbsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const int K, const void *A, const int lda, void *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_ctbsv(order, Uplo, TransA, Diag, N, K, A, lda, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_ctpmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const void *Ap, void *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_ctpmv(order, Uplo, TransA, Diag, N, Ap, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_ctpsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const void *Ap, void *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_ctpsv(order, Uplo, TransA, Diag, N, Ap, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
-void cblas_ctrmm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const void *alpha, const void *A, const int lda, void *B, const int ldb, float *l3_buf, float *l2_buf_loc)
+void cblas_ctrmm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const void *alpha, const void *A, const int lda, void *B, const int ldb, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_ctrmm(Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
void cblas_ctrmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const void *A, const int lda, void *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_ctrmv(order, Uplo, TransA, Diag, N, A, lda, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
-void cblas_ctrsm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const void *alpha, const void *A, const int lda, void *B, const int ldb, float *l3_buf, float *l2_buf_loc)
+void cblas_ctrsm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const void *alpha, const void *A, const int lda, void *B, const int ldb, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_ctrsm(Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
void cblas_ctrsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const void *A, const int lda, void *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_ctrsv(order, Uplo, TransA, Diag, N, A, lda, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dasum_facade(const int N, const double *X, const int incX, double *retval)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
*retval = cblas_dasum(N, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_daxpy_facade(const int N, const double alpha, const double *X, const int incX, double *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_daxpy(N, alpha, X, incX, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dcopy_facade(const int N, const double *X, const int incX, double *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dcopy(N, X, incX, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_ddot_facade(const int N, const double *X, const int incX, const double *Y, const int incY, double *retval)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
*retval = cblas_ddot(N, X, incX, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dgbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, const int KL, const int KU, const double alpha, const double *A, const int lda, const double *X, const int incX, const double beta, double *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dgbmv(order, TransA, M, N, KL, KU, alpha, A, lda, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
-void cblas_dgemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const double alpha, const double *A, const int lda, const double *B, const int ldb, const double beta, double *C, const int ldc, float *l3_buf, float *l2_buf_loc)
+void cblas_dgemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const double alpha, const double *A, const int lda, const double *B, const int ldb, const double beta, double *C, const int ldc, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
-// printf("dgemm facade A: %x, B: %x\n", A, B);
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_dgemm(Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
void cblas_dgemv_facade(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, const double alpha, const double *A, const int lda, const double *X, const int incX, const double beta, double *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dgemv(order, TransA, M, N, alpha, A, lda, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dger_facade(const enum CBLAS_ORDER order, const int M, const int N, const double alpha, const double *X, const int incX, const double *Y, const int incY, double *A, const int lda)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dger(order, M, N, alpha, X, incX, Y, incY, A, lda);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dnrm2_facade(const int N, const double *X, const int incX, double *retval)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
*retval = cblas_dnrm2(N, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_drot_facade(const int N, double *X, const int incX, double *Y, const int incY, const double c, const double s)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_drot(N, X, incX, Y, incY, c, s);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_drotg_facade(double *a, double *b, double *c, double *s)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_drotg(a, b, c, s);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_drotm_facade(const int N, double *X, const int incX, double *Y, const int incY, const double *P)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_drotm(N, X, incX, Y, incY, P);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_drotmg_facade(double *d1, double *d2, double *b1, const double b2, double *P)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_drotmg(d1, d2, b1, b2, P);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dsbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const int K, const double alpha, const double *A, const int lda, const double *X, const int incX, const double beta, double *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dsbmv(order, Uplo, N, K, alpha, A, lda, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dscal_facade(const int N, const double alpha, double *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dscal(N, alpha, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dsdot_facade(const int N, const float *X, const int incX, const float *Y, const int incY, double *retval)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
*retval = cblas_dsdot(N, X, incX, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dspmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const double alpha, const double *Ap, const double *X, const int incX, const double beta, double *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dspmv(order, Uplo, N, alpha, Ap, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dspr_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const double alpha, const double *X, const int incX, double *Ap)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dspr(order, Uplo, N, alpha, X, incX, Ap);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dspr2_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const double alpha, const double *X, const int incX, const double *Y, const int incY, double *A)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dspr2(order, Uplo, N, alpha, X, incX, Y, incY, A);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dswap_facade(const int N, double *X, const int incX, double *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dswap(N, X, incX, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
-void cblas_dsymm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, const double alpha, const double *A, const int lda, const double *B, const int ldb, const double beta, double *C, const int ldc, float *l3_buf, float *l2_buf_loc)
+void cblas_dsymm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, const double alpha, const double *A, const int lda, const double *B, const int ldb, const double beta, double *C, const int ldc, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_dsymm(Order, Side, Uplo, M, N, alpha, A, lda, B, ldb, beta, C, ldc);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
void cblas_dsymv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const double alpha, const double *A, const int lda, const double *X, const int incX, const double beta, double *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dsymv(order, Uplo, N, alpha, A, lda, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dsyr_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const double alpha, const double *X, const int incX, double *A, const int lda)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dsyr(order, Uplo, N, alpha, X, incX, A, lda);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dsyr2_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const double alpha, const double *X, const int incX, const double *Y, const int incY, double *A, const int lda)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dsyr2(order, Uplo, N, alpha, X, incX, Y, incY, A, lda);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
-void cblas_dsyr2k_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const double alpha, const double *A, const int lda, const double *B, const int ldb, const double beta, double *C, const int ldc, float *l3_buf, float *l2_buf_loc)
+void cblas_dsyr2k_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const double alpha, const double *A, const int lda, const double *B, const int ldb, const double beta, double *C, const int ldc, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_dsyr2k(Order, Uplo, Trans, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
-void cblas_dsyrk_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const double alpha, const double *A, const int lda, const double beta, double *C, const int ldc, float *l3_buf, float *l2_buf_loc)
+void cblas_dsyrk_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const double alpha, const double *A, const int lda, const double beta, double *C, const int ldc, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_dsyrk(Order, Uplo, Trans, N, K, alpha, A, lda, beta, C, ldc);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
void cblas_dtbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const int K, const double *A, const int lda, double *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dtbmv(order, Uplo, TransA, Diag, N, K, A, lda, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dtbsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const int K, const double *A, const int lda, double *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dtbsv(order, Uplo, TransA, Diag, N, K, A, lda, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dtpmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const double *Ap, double *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dtpmv(order, Uplo, TransA, Diag, N, Ap, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dtpsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const double *Ap, double *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dtpsv(order, Uplo, TransA, Diag, N, Ap, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
-void cblas_dtrmm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const double alpha, const double *A, const int lda, double *B, const int ldb, float *l3_buf, float *l2_buf_loc)
+void cblas_dtrmm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const double alpha, const double *A, const int lda, double *B, const int ldb, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_dtrmm(Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
void cblas_dtrmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const double *A, const int lda, double *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dtrmv(order, Uplo, TransA, Diag, N, A, lda, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
-void cblas_dtrsm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const double alpha, const double *A, const int lda, double *B, const int ldb, float *l3_buf, float *l2_buf_loc)
+void cblas_dtrsm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const double alpha, const double *A, const int lda, double *B, const int ldb, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_dtrsm(Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
void cblas_dtrsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const double *A, const int lda, double *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_dtrsv(order, Uplo, TransA, Diag, N, A, lda, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dzasum_facade(const int N, const void *X, const int incX, double *retval)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
*retval = cblas_dzasum(N, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_dznrm2_facade(const int N, const void *X, const int incX, double *retval)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
*retval = cblas_dznrm2(N, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_icamax_facade(const int N, const void *X, const int incX, CBLAS_INDEX *retval)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
*retval = cblas_icamax(N, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_idamax_facade(const int N, const double *X, const int incX, CBLAS_INDEX *retval)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
*retval = cblas_idamax(N, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_isamax_facade(const int N, const float *X, const int incX, CBLAS_INDEX *retval)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
*retval = cblas_isamax(N, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_izamax_facade(const int N, const void *X, const int incX, CBLAS_INDEX *retval)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
*retval = cblas_izamax(N, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_sasum_facade(const int N, const float *X, const int incX, float *retval)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
*retval = cblas_sasum(N, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_saxpy_facade(const int N, const float alpha, const float *X, const int incX, float *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_saxpy(N, alpha, X, incX, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_scasum_facade(const int N, const void *X, const int incX, float *retval)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
*retval = cblas_scasum(N, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_scnrm2_facade(const int N, const void *X, const int incX, float *retval)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
*retval = cblas_scnrm2(N, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_scopy_facade(const int N, const float *X, const int incX, float *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_scopy(N, X, incX, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_sdot_facade(const int N, const float *X, const int incX, const float *Y, const int incY, float *retval)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
*retval = cblas_sdot(N, X, incX, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_sdsdot_facade(const int N, const float alpha, const float *X, const int incX, const float *Y, const int incY, float *retval)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
*retval = cblas_sdsdot(N, alpha, X, incX, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_sgbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, const int KL, const int KU, const float alpha, const float *A, const int lda, const float *X, const int incX, const float beta, float *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_sgbmv(order, TransA, M, N, KL, KU, alpha, A, lda, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
-void cblas_sgemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const float alpha, const float *A, const int lda, const float *B, const int ldb, const float beta, float *C, const int ldc, float *l3_buf, float *l2_buf_loc)
+void cblas_sgemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const float alpha, const float *A, const int lda, const float *B, const int ldb, const float beta, float *C, const int ldc, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_sgemm(Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
void cblas_sgemv_facade(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, const float alpha, const float *A, const int lda, const float *X, const int incX, const float beta, float *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_sgemv(order, TransA, M, N, alpha, A, lda, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_sger_facade(const enum CBLAS_ORDER order, const int M, const int N, const float alpha, const float *X, const int incX, const float *Y, const int incY, float *A, const int lda)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_sger(order, M, N, alpha, X, incX, Y, incY, A, lda);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_snrm2_facade(const int N, const float *X, const int incX, float *retval)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
*retval = cblas_snrm2(N, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_srot_facade(const int N, float *X, const int incX, float *Y, const int incY, const float c, const float s)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_srot(N, X, incX, Y, incY, c, s);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_srotg_facade(float *a, float *b, float *c, float *s)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_srotg(a, b, c, s);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_srotm_facade(const int N, float *X, const int incX, float *Y, const int incY, const float *P)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_srotm(N, X, incX, Y, incY, P);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_srotmg_facade(float *d1, float *d2, float *b1, const float b2, float *P)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_srotmg(d1, d2, b1, b2, P);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_ssbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const int K, const float alpha, const float *A, const int lda, const float *X, const int incX, const float beta, float *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_ssbmv(order, Uplo, N, K, alpha, A, lda, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_sscal_facade(const int N, const float alpha, float *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_sscal(N, alpha, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_sspmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const float alpha, const float *Ap, const float *X, const int incX, const float beta, float *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_sspmv(order, Uplo, N, alpha, Ap, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_sspr_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const float alpha, const float *X, const int incX, float *Ap)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_sspr(order, Uplo, N, alpha, X, incX, Ap);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_sspr2_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const float alpha, const float *X, const int incX, const float *Y, const int incY, float *A)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_sspr2(order, Uplo, N, alpha, X, incX, Y, incY, A);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_sswap_facade(const int N, float *X, const int incX, float *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_sswap(N, X, incX, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
-void cblas_ssymm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, const float alpha, const float *A, const int lda, const float *B, const int ldb, const float beta, float *C, const int ldc, float *l3_buf, float *l2_buf_loc)
+void cblas_ssymm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, const float alpha, const float *A, const int lda, const float *B, const int ldb, const float beta, float *C, const int ldc, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_ssymm(Order, Side, Uplo, M, N, alpha, A, lda, B, ldb, beta, C, ldc);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
void cblas_ssymv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const float alpha, const float *A, const int lda, const float *X, const int incX, const float beta, float *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_ssymv(order, Uplo, N, alpha, A, lda, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_ssyr_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const float alpha, const float *X, const int incX, float *A, const int lda)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_ssyr(order, Uplo, N, alpha, X, incX, A, lda);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_ssyr2_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const float alpha, const float *X, const int incX, const float *Y, const int incY, float *A, const int lda)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_ssyr2(order, Uplo, N, alpha, X, incX, Y, incY, A, lda);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
-void cblas_ssyr2k_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const float alpha, const float *A, const int lda, const float *B, const int ldb, const float beta, float *C, const int ldc, float *l3_buf, float *l2_buf_loc)
+void cblas_ssyr2k_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const float alpha, const float *A, const int lda, const float *B, const int ldb, const float beta, float *C, const int ldc, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_ssyr2k(Order, Uplo, Trans, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
-void cblas_ssyrk_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const float alpha, const float *A, const int lda, const float beta, float *C, const int ldc, float *l3_buf, float *l2_buf_loc)
+void cblas_ssyrk_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const float alpha, const float *A, const int lda, const float beta, float *C, const int ldc, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_ssyrk(Order, Uplo, Trans, N, K, alpha, A, lda, beta, C, ldc);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
void cblas_stbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const int K, const float *A, const int lda, float *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_stbmv(order, Uplo, TransA, Diag, N, K, A, lda, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_stbsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const int K, const float *A, const int lda, float *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_stbsv(order, Uplo, TransA, Diag, N, K, A, lda, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_stpmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const float *Ap, float *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_stpmv(order, Uplo, TransA, Diag, N, Ap, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_stpsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const float *Ap, float *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_stpsv(order, Uplo, TransA, Diag, N, Ap, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
-void cblas_strmm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const float alpha, const float *A, const int lda, float *B, const int ldb, float *l3_buf, float *l2_buf_loc)
+void cblas_strmm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const float alpha, const float *A, const int lda, float *B, const int ldb, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_strmm(Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
void cblas_strmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const float *A, const int lda, float *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_strmv(order, Uplo, TransA, Diag, N, A, lda, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
-void cblas_strsm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const float alpha, const float *A, const int lda, float *B, const int ldb, float *l3_buf, float *l2_buf_loc)
+void cblas_strsm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const float alpha, const float *A, const int lda, float *B, const int ldb, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_strsm(Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
void cblas_strsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const float *A, const int lda, float *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_strsv(order, Uplo, TransA, Diag, N, A, lda, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_xerbla_facade(int p, const char *rout, const char *form)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_xerbla(p, rout, form);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_zaxpy_facade(const int N, const void *alpha, const void *X, const int incX, void *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zaxpy(N, alpha, X, incX, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_zcopy_facade(const int N, const void *X, const int incX, void *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zcopy(N, X, incX, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_zdotc_sub_facade(const int N, const void *X, const int incX, const void *Y, const int incY, void *dotc)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zdotc_sub(N, X, incX, Y, incY, dotc);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_zdotu_sub_facade(const int N, const void *X, const int incX, const void *Y, const int incY, void *dotu)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zdotu_sub(N, X, incX, Y, incY, dotu);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_zdscal_facade(const int N, const double alpha, void *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zdscal(N, alpha, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_zgbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, const int KL, const int KU, const void *alpha, const void *A, const int lda, const void *X, const int incX, const void *beta, void *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zgbmv(order, TransA, M, N, KL, KU, alpha, A, lda, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
-void cblas_zgemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc, float *l3_buf, float *l2_buf_loc)
+void cblas_zgemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_zgemm(Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
void cblas_zgemv_facade(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, const void *alpha, const void *A, const int lda, const void *X, const int incX, const void *beta, void *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zgemv(order, TransA, M, N, alpha, A, lda, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_zgerc_facade(const enum CBLAS_ORDER order, const int M, const int N, const void *alpha, const void *X, const int incX, const void *Y, const int incY, void *A, const int lda)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zgerc(order, M, N, alpha, X, incX, Y, incY, A, lda);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_zgeru_facade(const enum CBLAS_ORDER order, const int M, const int N, const void *alpha, const void *X, const int incX, const void *Y, const int incY, void *A, const int lda)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zgeru(order, M, N, alpha, X, incX, Y, incY, A, lda);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_zhbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const int K, const void *alpha, const void *A, const int lda, const void *X, const int incX, const void *beta, void *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zhbmv(order, Uplo, N, K, alpha, A, lda, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
-void cblas_zhemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc, float *l3_buf, float *l2_buf_loc)
+void cblas_zhemm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_zhemm(Order, Side, Uplo, M, N, alpha, A, lda, B, ldb, beta, C, ldc);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
void cblas_zhemv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const void *alpha, const void *A, const int lda, const void *X, const int incX, const void *beta, void *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zhemv(order, Uplo, N, alpha, A, lda, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_zher_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const double alpha, const void *X, const int incX, void *A, const int lda)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zher(order, Uplo, N, alpha, X, incX, A, lda);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_zher2_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const void *alpha, const void *X, const int incX, const void *Y, const int incY, void *A, const int lda)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zher2(order, Uplo, N, alpha, X, incX, Y, incY, A, lda);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
-void cblas_zher2k_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const double beta, void *C, const int ldc, float *l3_buf, float *l2_buf_loc)
+void cblas_zher2k_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const double beta, void *C, const int ldc, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_zher2k(Order, Uplo, Trans, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
-void cblas_zherk_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const double alpha, const void *A, const int lda, const double beta, void *C, const int ldc, float *l3_buf, float *l2_buf_loc)
+void cblas_zherk_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const double alpha, const void *A, const int lda, const double beta, void *C, const int ldc, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_zherk(Order, Uplo, Trans, N, K, alpha, A, lda, beta, C, ldc);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
void cblas_zhpmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const void *alpha, const void *Ap, const void *X, const int incX, const void *beta, void *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zhpmv(order, Uplo, N, alpha, Ap, X, incX, beta, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_zhpr_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const double alpha, const void *X, const int incX, void *A)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zhpr(order, Uplo, N, alpha, X, incX, A);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_zhpr2_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const int N, const void *alpha, const void *X, const int incX, const void *Y, const int incY, void *Ap)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zhpr2(order, Uplo, N, alpha, X, incX, Y, incY, Ap);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_zrotg_facade(void *a, void *b, double *c, void *s)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zrotg(a, b, c, s);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_zscal_facade(const int N, const void *alpha, void *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zscal(N, alpha, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_zswap_facade(const int N, void *X, const int incX, void *Y, const int incY)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_zswap(N, X, incX, Y, incY);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
-void cblas_zsymm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc, float *l3_buf, float *l2_buf_loc)
+void cblas_zsymm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_zsymm(Order, Side, Uplo, M, N, alpha, A, lda, B, ldb, beta, C, ldc);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
-void cblas_zsyr2k_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc, float *l3_buf, float *l2_buf_loc)
+void cblas_zsyr2k_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_zsyr2k(Order, Uplo, Trans, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
-void cblas_zsyrk_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const void *alpha, const void *A, const int lda, const void *beta, void *C, const int ldc, float *l3_buf, float *l2_buf_loc)
+void cblas_zsyrk_facade(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const void *alpha, const void *A, const int lda, const void *beta, void *C, const int ldc, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_zsyrk(Order, Uplo, Trans, N, K, alpha, A, lda, beta, C, ldc);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
void cblas_ztbmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const int K, const void *A, const int lda, void *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_ztbmv(order, Uplo, TransA, Diag, N, K, A, lda, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_ztbsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const int K, const void *A, const int lda, void *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_ztbsv(order, Uplo, TransA, Diag, N, K, A, lda, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_ztpmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const void *Ap, void *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_ztpmv(order, Uplo, TransA, Diag, N, Ap, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
void cblas_ztpsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const void *Ap, void *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_ztpsv(order, Uplo, TransA, Diag, N, Ap, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
-void cblas_ztrmm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const void *alpha, const void *A, const int lda, void *B, const int ldb, float *l3_buf, float *l2_buf_loc)
+void cblas_ztrmm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const void *alpha, const void *A, const int lda, void *B, const int ldb, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_ztrmm(Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
void cblas_ztrmv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const void *A, const int lda, void *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_ztrmv(order, Uplo, TransA, Diag, N, A, lda, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
-void cblas_ztrsm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const void *alpha, const void *A, const int lda, void *B, const int ldb, float *l3_buf, float *l2_buf_loc)
+void cblas_ztrsm_facade(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const void *alpha, const void *A, const int lda, void *B, const int ldb, void *l3_buf, size_t l3_buf_size, void *ddr_buf, size_t ddr_buf_size, int *err_code)
{
- pool_mk_mem_L1 = (char *) getNextMultiple((int) L1_BUF_LOC, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L1 = (char *) getNextMultiple(((int) pool_mk_mem_L1) + BLIS_MK_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L1 = (char *) getNextMultiple(((int) pool_kn_mem_L1) + BLIS_KN_POOL_SIZE_L1, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L2 = (char *) getNextMultiple((int) l2_buf_loc, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L2 = (char *) getNextMultiple(((int) pool_mk_mem_L2) + BLIS_MK_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L2 = (char *) getNextMultiple(((int) pool_kn_mem_L2) + BLIS_KN_POOL_SIZE_L2, BLIS_CACHE_LINE_SIZE);
-
- pool_mk_mem_L3 = (char *) getNextMultiple((int) l3_buf, BLIS_CACHE_LINE_SIZE);
- pool_kn_mem_L3 = (char *) getNextMultiple(((int) pool_mk_mem_L3) + BLIS_MK_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
- pool_mn_mem_L3 = (char *) getNextMultiple(((int) pool_kn_mem_L3) + BLIS_KN_POOL_SIZE_L3, BLIS_CACHE_LINE_SIZE);
-
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_4k();
- }
+ size_t l1D_SRAM_size_orig, l2_SRAM_size_orig;
+
+ *err_code = bli_l3_mem_config(l3_buf, l3_buf_size, ddr_buf, ddr_buf_size, &l1D_SRAM_size_orig, &l2_SRAM_size_orig);
+ if(*err_code != TICBLAS_SUCCESS) {
+ return;
+ }
+
cblas_ztrsm(Order, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb);
- #pragma omp parallel
- {
- __cache_l1d_flush();
- __cache_l1d_all();
- }
+ *err_code = bli_l3_mem_reconfig(l1D_SRAM_size_orig, l2_SRAM_size_orig);
}
void cblas_ztrsv_facade(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int N, const void *A, const int lda, void *X, const int incX)
{
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_512k();
- }
cblas_ztrsv(order, Uplo, TransA, Diag, N, A, lda, X, incX);
-
- // return default L2 cache (128 K)
- #pragma omp parallel
- {
- __cache_l2_flush();
- __cache_l2_128k();
- }
}
index 512b3de01ec805dcc16adf503ac940db972b1786..e9ca10dcfb969a338d9680fcc12c4c85cb24a35c 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
char ofld_tbl_cgemm[GEMM_OFFLOAD_TBL_SIZE] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
index 33e7a384d61192c03d14b038302493ed2c0d6d33..74c637b8c29f1af78277048e32a85d8e0b9e30b4 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
char ofld_tbl_csyrk[SYRK_OFFLOAD_TBL_SIZE] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
index 959136b0435c881f0215cfe95d0e268f74ce064d..1d0522cfe906a8ce216d6182ca89afcdc4da839a 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
char ofld_tbl_ctrmm[TRMM_OFFLOAD_TBL_SIZE] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
index d817eb843661e4042b93db86ed79762f83ffd4d9..21dcdaf7e2a7a418860ef9ce237f86261e913bf6 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
char ofld_tbl_ctrsm[TRMM_OFFLOAD_TBL_SIZE] = {
0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,
index f24fc228aa363b2fde5a0123d55a7ee8da8ba607..75819ee81081f97ab372f0f4087fb7844240f1be 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
char ofld_tbl_dgemm[GEMM_OFFLOAD_TBL_SIZE] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
index ee3a346a32154d9156b9261ee4b27ca9ace25bc2..776469b243fcba5789a8c3a4eb7169419754351e 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
char ofld_tbl_dsyrk[SYRK_OFFLOAD_TBL_SIZE] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
index f96dec82c700530e4b02a7ff75f6cc9dda04c444..446189f43978c51e8a802b985670cda0ed2a9ad7 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
char ofld_tbl_dtrmm[TRMM_OFFLOAD_TBL_SIZE] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
index 29cfd611247f54943909be96d0f6b874cd273d19..daeb38ba53c4633d809a81df30524e28aff58e81 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
char ofld_tbl_dtrsm[TRMM_OFFLOAD_TBL_SIZE] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
index 4ecf176f0d708a3447bc4afa063dc603347edc52..b058b2fc71543a14d0c789e1a2cd0032bfb7accd 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
char ofld_tbl_sgemm[GEMM_OFFLOAD_TBL_SIZE] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
index 0600ebb476a2ccb6a6310810265da06947b03880..5b89cc96949345af939f91043e4f3667e16cb38c 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
char ofld_tbl_ssyrk[SYRK_OFFLOAD_TBL_SIZE] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
index e108f6561fc2c6ccf09a352b6e50826eab609a36..22f14e7ad1f5d14889ff04d2ffb6187e05ecfae5 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
char ofld_tbl_strmm[TRMM_OFFLOAD_TBL_SIZE] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
index 9cbd4848161076093d95d09f4aa42a3803e757cd..68cc3ae8c6e40b9571c82dc91920a6d7d805fc96 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
char ofld_tbl_strsm[TRMM_OFFLOAD_TBL_SIZE] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
index 594dee1d142bbc22e54a1dc76a450385a6ec7105..f7259284446da61c46a6b666b5c75e5cc6a7746a 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
char ofld_tbl_zgemm[GEMM_OFFLOAD_TBL_SIZE] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
index e713dea37beddd640da49d37be12db9ba2bca51b..8df383eabbb90e63e03754c186af49196c64c714 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
char ofld_tbl_zsyrk[SYRK_OFFLOAD_TBL_SIZE] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
index 6aa24fb6936eaae5fdfd2d310971736966e647b7..0d9caa8943f49acd5e7c1da22b0eb6ee20febfc2 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
char ofld_tbl_ztrmm[TRMM_OFFLOAD_TBL_SIZE] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
index 012cec7311442a93c98fc9c2267eb9e836fa6565..2d630e8e4c0e0dfc0fc5caa0fb2c34dfe49e42ce 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
char ofld_tbl_ztrsm[TRMM_OFFLOAD_TBL_SIZE] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
similarity index 99%
rename from blasblisacc/src/ti_cblas.h
rename to blasblisacc/src/ti_cblas_acc.h
index fdea549af1fc8564b93fe57071c73b12b391c815..70a040c7758529e14f487b8e89a092300dd61e3c 100644 (file)
rename from blasblisacc/src/ti_cblas.h
rename to blasblisacc/src/ti_cblas_acc.h
index fdea549af1fc8564b93fe57071c73b12b391c815..70a040c7758529e14f487b8e89a092300dd61e3c 100644 (file)
#define MSMC_BUF_SIZE 0x47FDC0
//#define MSMC_BUF_SIZE 0x47F100 // MR=NR=4 for S
+#define DDR_BUF_SIZE (16384)
//DSPBLIS
//#define MSMC_BUF_SIZE 0x400000
index 2af9c66bb289f3c7b7fd3fb5ebecebede04e22af..9f20b26c5c959bd22f81091306731494796cd593 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_caxpy");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_caxpy");
- __real_cblas_caxpy(N,alpha,X,incX,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_caxpy", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_caxpy");
+ __real_cblas_caxpy(N,alpha,X,incX,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_caxpy", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_caxpy");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_caxpy");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CAXPY_IDX, "ocl_cblas_caxpy");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CAXPY_IDX, "ocl_cblas_caxpy");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_caxpy");
index 4458a8428c256bcc5ec52ac49f23b716fb4c8907..bf48d8435b8ebb2d106c242cd1e87bb2797835e3 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_ccopy");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ccopy");
- __real_cblas_ccopy(N,X,incX,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ccopy", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_ccopy");
+ __real_cblas_ccopy(N,X,incX,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_ccopy", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ccopy");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_ccopy");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CCOPY_IDX, "ocl_cblas_ccopy");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CCOPY_IDX, "ocl_cblas_ccopy");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_ccopy");
diff --git a/blasblisacc/src/ti_cblas_cblas_cdotc_sub.c b/blasblisacc/src/ti_cblas_cblas_cdotc_sub.c
index 730493ab1d33db83f82e64e998bdecd9f979192c..c54530ef6cebdcdc221d47d613b5b5b405ba2ce3 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_cdotc_sub");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cdotc_sub");
- __real_cblas_cdotc_sub(N,X,incX,Y,incY,dotc);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cdotc_sub", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cdotc_sub");
+ __real_cblas_cdotc_sub(N,X,incX,Y,incY,dotc);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cdotc_sub", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cdotc_sub");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cdotc_sub");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CDOTC_SUB_IDX, "ocl_cblas_cdotc_sub");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CDOTC_SUB_IDX, "ocl_cblas_cdotc_sub");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_cdotc_sub");
diff --git a/blasblisacc/src/ti_cblas_cblas_cdotu_sub.c b/blasblisacc/src/ti_cblas_cblas_cdotu_sub.c
index 8f795c7306faa23fef1358d10c2d73c51855ee14..4070c52189bb4baecbceff2636f44f78a3583df5 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_cdotu_sub");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cdotu_sub");
- __real_cblas_cdotu_sub(N,X,incX,Y,incY,dotu);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cdotu_sub", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cdotu_sub");
+ __real_cblas_cdotu_sub(N,X,incX,Y,incY,dotu);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cdotu_sub", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cdotu_sub");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cdotu_sub");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CDOTU_SUB_IDX, "ocl_cblas_cdotu_sub");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CDOTU_SUB_IDX, "ocl_cblas_cdotu_sub");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
@@ -144,9 +144,10 @@ void cblas_cdotu_sub(const int N, const void *X, const int incX, const void *Y,
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_cdotu_sub");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_cdotu_sub", (float) clock_diff);
return ;
index 954148f3cd5a81443707ac8cf6dc37fe80a79a3f..f425fa1f73742d0754dbdc86339d717072ac2e0f 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_cgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_cgbmv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cgbmv");
- __real_cblas_cgbmv(order,TransA,M,N,KL,KU,alpha,A,lda,X,incX,beta,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cgbmv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cgbmv");
+ __real_cblas_cgbmv(order,TransA,M,N,KL,KU,alpha,A,lda,X,incX,beta,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cgbmv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cgbmv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cgbmv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CGBMV_IDX, "ocl_cblas_cgbmv");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CGBMV_IDX, "ocl_cblas_cgbmv");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -206,8 +206,8 @@ void cblas_cgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_cgbmv");
index a8edae8b8dbce36fd6af685df6deff9a5b800d3b..3eb73efaa015ccac8fb7a71cb33dd5c1b3daf95c 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_cgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_cgemm");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!cgemm_offload_dsp(Order,M,N,K)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cgemm");
- __real_cblas_cgemm(Order,TransA,TransB,M,N,K,alpha,A,lda,B,ldb,beta,C,ldc);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cgemm", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!cgemm_offload_dsp(Order,M,N,K)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cgemm");
+ __real_cblas_cgemm(Order,TransA,TransB,M,N,K,alpha,A,lda,B,ldb,beta,C,ldc);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cgemm", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cgemm");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cgemm");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CGEMM_IDX, "ocl_cblas_cgemm");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CGEMM_IDX, "ocl_cblas_cgemm");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -196,6 +196,7 @@ void cblas_cgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
#endif
void *msmc_ptr;
+ size_t msmc_size = MSMC_BUF_SIZE;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
@@ -211,9 +212,28 @@ void cblas_cgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
#endif
#ifdef __cplusplus
- __K->setArg(15, __local(L2_BUF_SIZE));
+ __K->setArg(15, msmc_size);
#else
- err |= clSetKernelArg(__K, 15, L2_BUF_SIZE, NULL);
+ err |= clSetKernelArg(__K, 15, sizeof(msmc_size), &msmc_size);
+#endif
+
+ void *ddr_ptr;
+ size_t ddr_size = DDR_BUF_SIZE;
+ ddr_ptr = __malloc_ddr(DDR_BUF_SIZE);
+ Buffer buf_DDR(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, DDR_BUF_SIZE, ddr_ptr);
+ __K->setArg(16, buf_DDR);
+ __K->setArg(17, ddr_size);
+
+ /* create a buffer argument to get the return error code from the DSP */
+ int err_code;
+#ifdef __cplusplus
+ Buffer buf_err(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code);
+ __K->setArg(18, buf_err);
+#else
+ cl_mem buf_err = clCreateBuffer(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code, &err);
+ TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
+ err |= clSetKernelArg(__K, 18, sizeof(buf_err), &buf_err);
+ TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
#endif
#ifdef __cplusplus
@@ -227,9 +247,14 @@ void cblas_cgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
+ if(err_code != TICBLAS_SUCCESS) {
+ printf("Error code returned by offloaded cblas_cgemm is %d\n.", err_code);
+ }
+
ti_cblas_mem_free(msmc_ptr);
+ __free_ddr(ddr_ptr);
ti_cblas_delete_kernel(__K);
index f6d7f9b0f4bc663bc6945bcda65a5d291131e9c4..05fe4cfa38d6ea3c18874a5be894dc2632ef0b4c 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_cgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_cgemv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cgemv");
- __real_cblas_cgemv(order,TransA,M,N,alpha,A,lda,X,incX,beta,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cgemv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cgemv");
+ __real_cblas_cgemv(order,TransA,M,N,alpha,A,lda,X,incX,beta,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cgemv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cgemv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cgemv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CGEMV_IDX, "ocl_cblas_cgemv");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CGEMV_IDX, "ocl_cblas_cgemv");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -194,8 +194,8 @@ void cblas_cgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_cgemv");
index f8776e95f76acbee95d0bfdd7ba28277edece224..d16e435a1c22065338c6d2de95c0db1ee822aac5 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_cgerc(const enum CBLAS_ORDER order, const int M, const int N, const v
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_cgerc");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cgerc");
- __real_cblas_cgerc(order,M,N,alpha,X,incX,Y,incY,A,lda);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cgerc", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cgerc");
+ __real_cblas_cgerc(order,M,N,alpha,X,incX,Y,incY,A,lda);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cgerc", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cgerc");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cgerc");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CGERC_IDX, "ocl_cblas_cgerc");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CGERC_IDX, "ocl_cblas_cgerc");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -177,8 +177,8 @@ void cblas_cgerc(const enum CBLAS_ORDER order, const int M, const int N, const v
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_cgerc");
index 087b519c40b6d7f7ac6dd4f1af24e23d5dc20668..ebf8c199e75cd4246eedc8f70ac34187efc691e0 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_cgeru(const enum CBLAS_ORDER order, const int M, const int N, const v
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_cgeru");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cgeru");
- __real_cblas_cgeru(order,M,N,alpha,X,incX,Y,incY,A,lda);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cgeru", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cgeru");
+ __real_cblas_cgeru(order,M,N,alpha,X,incX,Y,incY,A,lda);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cgeru", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cgeru");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cgeru");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CGERU_IDX, "ocl_cblas_cgeru");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CGERU_IDX, "ocl_cblas_cgeru");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -177,9 +177,10 @@ void cblas_cgeru(const enum CBLAS_ORDER order, const int M, const int N, const v
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_cgeru");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_cgeru", (float) clock_diff);
return ;
index 8c264a868db44bee88fd041e28550001b25dc381..98ad84a78b75120d2ef3dbddf55405c452afc3fc 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_chbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_chbmv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_chbmv");
- __real_cblas_chbmv(order,Uplo,N,K,alpha,A,lda,X,incX,beta,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_chbmv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_chbmv");
+ __real_cblas_chbmv(order,Uplo,N,K,alpha,A,lda,X,incX,beta,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_chbmv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_chbmv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_chbmv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CHBMV_IDX, "ocl_cblas_chbmv");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CHBMV_IDX, "ocl_cblas_chbmv");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -194,9 +194,10 @@ void cblas_chbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_chbmv");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_chbmv", (float) clock_diff);
return ;
index d48091a5f5f2170178aba7278739b7f960354a54..f8348692443c0900bd911592622e6b64326ad181 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_chemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_chemm");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!chemm_offload_dsp(Order,Side,M,N)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_chemm");
- __real_cblas_chemm(Order,Side,Uplo,M,N,alpha,A,lda,B,ldb,beta,C,ldc);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_chemm", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!chemm_offload_dsp(Order,Side,M,N)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_chemm");
+ __real_cblas_chemm(Order,Side,Uplo,M,N,alpha,A,lda,B,ldb,beta,C,ldc);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_chemm", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_chemm");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_chemm");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CHEMM_IDX, "ocl_cblas_chemm");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CHEMM_IDX, "ocl_cblas_chemm");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -190,12 +190,11 @@ void cblas_chemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
#endif
void *msmc_ptr;
-
+ size_t msmc_size = MSMC_BUF_SIZE;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
-
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(13, buf_MSMC);
#else
@@ -207,9 +206,28 @@ void cblas_chemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
#endif
#ifdef __cplusplus
- __K->setArg(14, __local(L2_BUF_SIZE));
+ __K->setArg(14, msmc_size);
#else
- err |= clSetKernelArg(__K, 14, L2_BUF_SIZE, NULL);
+ err |= clSetKernelArg(__K, 14, sizeof(msmc_size), &msmc_size);
+#endif
+
+ void *ddr_ptr;
+ size_t ddr_size = DDR_BUF_SIZE;
+ ddr_ptr = __malloc_ddr(DDR_BUF_SIZE);
+ Buffer buf_DDR(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, DDR_BUF_SIZE, ddr_ptr);
+ __K->setArg(15, buf_DDR);
+ __K->setArg(16, ddr_size);
+
+ /* create a buffer argument to get the return error code from the DSP */
+ int err_code;
+#ifdef __cplusplus
+ Buffer buf_err(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code);
+ __K->setArg(17, buf_err);
+#else
+ cl_mem buf_err = clCreateBuffer(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code, &err);
+ TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
+ err |= clSetKernelArg(__K, 17, sizeof(buf_err), &buf_err);
+ TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
#endif
#ifdef __cplusplus
@@ -223,9 +241,14 @@ void cblas_chemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
+ if(err_code != TICBLAS_SUCCESS) {
+ printf("Error code returned by offloaded cblas_chemm is %d\n.", err_code);
+ }
+
ti_cblas_mem_free(msmc_ptr);
+ __free_ddr(ddr_ptr);
ti_cblas_delete_kernel(__K);
index f5a16d6bc462786122a657b43809c944dac7416b..b81a5ca893498ff1851356a8dc0588dee46b23fd 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_chemv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_chemv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_chemv");
- __real_cblas_chemv(order,Uplo,N,alpha,A,lda,X,incX,beta,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_chemv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_chemv");
+ __real_cblas_chemv(order,Uplo,N,alpha,A,lda,X,incX,beta,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_chemv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_chemv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_chemv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CHEMV_IDX, "ocl_cblas_chemv");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CHEMV_IDX, "ocl_cblas_chemv");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -188,8 +188,8 @@ void cblas_chemv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_chemv");
index f42e4bcbb6a156f1df643d0b0697c24cd6edaa5a..f0ea8143e8f473bd6e4efb3d7bbc585a0329057e 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_cher");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cher");
- __real_cblas_cher(order,Uplo,N,alpha,X,incX,A,lda);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cher", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cher");
+ __real_cblas_cher(order,Uplo,N,alpha,X,incX,A,lda);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cher", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cher");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cher");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CHER_IDX, "ocl_cblas_cher");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CHER_IDX, "ocl_cblas_cher");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -151,9 +151,10 @@ void cblas_cher(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_cher");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_cher", (float) clock_diff);
return ;
index 00eb4aa2a31e311d01caa36a4b262bd7434d77b7..12ba685746c159533dc326afb43a520f3641febe 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_cher2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_cher2");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cher2");
- __real_cblas_cher2(order,Uplo,N,alpha,X,incX,Y,incY,A,lda);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cher2", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cher2");
+ __real_cblas_cher2(order,Uplo,N,alpha,X,incX,Y,incY,A,lda);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cher2", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cher2");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cher2");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CHER2_IDX, "ocl_cblas_cher2");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CHER2_IDX, "ocl_cblas_cher2");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -177,9 +177,10 @@ void cblas_cher2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_cher2");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_cher2", (float) clock_diff);
return ;
index b36127567e6967788e3b0d0592ca3837236873a8..ac6e37f4e20db12efe3e74d170d9e7c791a2b4bd 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_cher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, cons
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_cher2k");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!cher2k_offload_dsp(Order,N,K)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cher2k");
- __real_cblas_cher2k(Order,Uplo,Trans,N,K,alpha,A,lda,B,ldb,beta,C,ldc);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cher2k", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!cher2k_offload_dsp(Order,N,K)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cher2k");
+ __real_cblas_cher2k(Order,Uplo,Trans,N,K,alpha,A,lda,B,ldb,beta,C,ldc);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cher2k", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cher2k");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cher2k");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CHER2K_IDX, "ocl_cblas_cher2k");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CHER2K_IDX, "ocl_cblas_cher2k");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -183,11 +183,13 @@ void cblas_cher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, cons
#else
err |= clSetKernelArg(__K, 12, sizeof(ldc), &ldc);
#endif
+
void *msmc_ptr;
+ size_t msmc_size = MSMC_BUF_SIZE;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(13, buf_MSMC);
#else
@@ -199,9 +201,28 @@ void cblas_cher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, cons
#endif
#ifdef __cplusplus
- __K->setArg(14, __local(L2_BUF_SIZE));
+ __K->setArg(14, msmc_size);
#else
- err |= clSetKernelArg(__K, 14, L2_BUF_SIZE, NULL);
+ err |= clSetKernelArg(__K, 14, sizeof(msmc_size), &msmc_size);
+#endif
+
+ void *ddr_ptr;
+ size_t ddr_size = DDR_BUF_SIZE;
+ ddr_ptr = __malloc_ddr(DDR_BUF_SIZE);
+ Buffer buf_DDR(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, DDR_BUF_SIZE, ddr_ptr);
+ __K->setArg(15, buf_DDR);
+ __K->setArg(16, ddr_size);
+
+ /* create a buffer argument to get the return error code from the DSP */
+ int err_code;
+#ifdef __cplusplus
+ Buffer buf_err(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code);
+ __K->setArg(17, buf_err);
+#else
+ cl_mem buf_err = clCreateBuffer(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code, &err);
+ TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
+ err |= clSetKernelArg(__K, 17, sizeof(buf_err), &buf_err);
+ TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
#endif
#ifdef __cplusplus
@@ -215,10 +236,15 @@ void cblas_cher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, cons
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+ if(err_code != TICBLAS_SUCCESS) {
+ printf("Error code returned by offloaded cblas_cher2k is %d\n.", err_code);
+ }
+
ti_cblas_mem_free(msmc_ptr);
+ __free_ddr(ddr_ptr);
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_cher2k");
index ed3f9e82c70bba64fd195c10e3bffd2faea7e484..bce49b1cd305abee7baa64eefc6f4b064cf26b26 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_cherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_cherk");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!cherk_offload_dsp(Order,N,K)))) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cherk");
- __real_cblas_cherk(Order,Uplo,Trans,N,K,alpha,A,lda,beta,C,ldc);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cherk", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_NONE) || ((TI_CBLAS_L3_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE) && (!cherk_offload_dsp(Order,N,K)))) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cherk");
+ __real_cblas_cherk(Order,Uplo,Trans,N,K,alpha,A,lda,beta,C,ldc);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cherk", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cherk");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cherk");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CHERK_IDX, "ocl_cblas_cherk");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CHERK_IDX, "ocl_cblas_cherk");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, Order);
#else
@@ -159,10 +159,11 @@ void cblas_cherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const
#endif
void *msmc_ptr;
+ size_t msmc_size = MSMC_BUF_SIZE;
msmc_ptr = ti_cblas_mem_alloc(MSMC_BUF_SIZE);
#ifdef __cplusplus
- //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
Buffer buf_MSMC(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, MSMC_BUF_SIZE, (void *)msmc_ptr);
+ //Buffer buf_MSMC(ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_MSMC_TI, MSMC_BUF_SIZE);
__K->setArg(11, buf_MSMC);
#else
@@ -174,9 +175,28 @@ void cblas_cherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const
#endif
#ifdef __cplusplus
- __K->setArg(12, __local(L2_BUF_SIZE));
+ __K->setArg(12, msmc_size);
#else
- err |= clSetKernelArg(__K, 12, L2_BUF_SIZE, NULL);
+ err |= clSetKernelArg(__K, 12, sizeof(msmc_size), &msmc_size);
+#endif
+
+ void *ddr_ptr;
+ size_t ddr_size = DDR_BUF_SIZE;
+ ddr_ptr = __malloc_ddr(DDR_BUF_SIZE);
+ Buffer buf_DDR(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, DDR_BUF_SIZE, ddr_ptr);
+ __K->setArg(13, buf_DDR);
+ __K->setArg(14, ddr_size);
+
+ /* create a buffer argument to get the return error code from the DSP */
+ int err_code;
+#ifdef __cplusplus
+ Buffer buf_err(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code);
+ __K->setArg(15, buf_err);
+#else
+ cl_mem buf_err = clCreateBuffer(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code, &err);
+ TI_CBLAS_OCL_CHKERROR("clCreateBuffer",err);
+ err |= clSetKernelArg(__K, 15, sizeof(buf_err), &buf_err);
+ TI_CBLAS_OCL_CHKERROR("clSetKernelArg",err);
#endif
#ifdef __cplusplus
@@ -190,9 +210,14 @@ void cblas_cherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
+ if(err_code != TICBLAS_SUCCESS) {
+ printf("Error code returned by offloaded cblas_cherk is %d\n.", err_code);
+ }
+
ti_cblas_mem_free(msmc_ptr);
+ __free_ddr(ddr_ptr);
ti_cblas_delete_kernel(__K);
index 1440c48809a0f924c01cae29c1684b153ab4cc85..3de67e443f5714b7acf185940575112751242127 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_chpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_chpmv");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_chpmv");
- __real_cblas_chpmv(order,Uplo,N,alpha,Ap,X,incX,beta,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_chpmv", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_chpmv");
+ __real_cblas_chpmv(order,Uplo,N,alpha,Ap,X,incX,beta,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_chpmv", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_chpmv");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_chpmv");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CHPMV_IDX, "ocl_cblas_chpmv");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CHPMV_IDX, "ocl_cblas_chpmv");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -182,9 +182,10 @@ void cblas_chpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
+
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_chpmv");
TI_CBLAS_PROFILE_REPORT(" Entire %s call (DSP) took %8.2f us\n","cblas_chpmv", (float) clock_diff);
return ;
index 3209139569cc6768189af8591af8abaf5925f177..ec3155a58e59e5df7d40026b3d1ddece8e422b29 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_chpr");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_chpr");
- __real_cblas_chpr(order,Uplo,N,alpha,X,incX,A);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_chpr", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_chpr");
+ __real_cblas_chpr(order,Uplo,N,alpha,X,incX,A);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_chpr", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_chpr");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_chpr");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CHPR_IDX, "ocl_cblas_chpr");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CHPR_IDX, "ocl_cblas_chpr");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_chpr");
index 21111f2a03aaef1469068ad3c6da249f2db5a136..e64057cbf75f07fab9af75620d3e99c189f82c17 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
@@ -44,20 +45,19 @@ void cblas_chpr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_chpr2");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_chpr2");
- __real_cblas_chpr2(order,Uplo,N,alpha,X,incX,Y,incY,Ap);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_chpr2", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_chpr2");
+ __real_cblas_chpr2(order,Uplo,N,alpha,X,incX,Y,incY,Ap);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_chpr2", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_chpr2");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_chpr2");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CHPR2_IDX, "ocl_cblas_chpr2");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CHPR2_IDX, "ocl_cblas_chpr2");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, order);
#else
@@ -171,8 +171,8 @@ void cblas_chpr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_chpr2");
index 1b09cfb517b53af7edbdcbf393e24e1ac9b9686b..190ec8f1cca21f5d78802f88b28ec476ebdc5533 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_crotg");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_crotg");
- __real_cblas_crotg(a,b,c,s);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_crotg", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_crotg");
+ __real_cblas_crotg(a,b,c,s);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_crotg", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_crotg");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_crotg");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CROTG_IDX, "ocl_cblas_crotg");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CROTG_IDX, "ocl_cblas_crotg");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
Buffer buf_a(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(float), (void *)a);
__K->setArg(0, buf_a);
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_crotg");
index 1c5e49ad4f03bf0d121637bb786e6bf7165c1d3e..4c930606f994d222ea6a9cbd085763678db343b0 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_cscal");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cscal");
- __real_cblas_cscal(N,alpha,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cscal", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cscal");
+ __real_cblas_cscal(N,alpha,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cscal", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cscal");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cscal");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CSCAL_IDX, "ocl_cblas_cscal");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CSCAL_IDX, "ocl_cblas_cscal");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_cscal");
index 1e28a64d0419d6bdeffa67add4ec526c94cae935..a548c62aca96cd7f9884405a665c336488c6c2b5 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_csscal");
- TI_CBLAS_PROFILE_START();
-
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_csscal");
- __real_cblas_csscal(N,alpha,X,incX);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_csscal", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_csscal");
+ __real_cblas_csscal(N,alpha,X,incX);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_csscal", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_csscal");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_csscal");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CSSCAL_IDX, "ocl_cblas_csscal");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CSSCAL_IDX, "ocl_cblas_csscal");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
TI_CBLAS_DEBUG_PRINT("Finished executing %s\n", "cblas_csscal");
index 9c2f0dd4275fe1d14e7c3bee794855ef9eaaed7b..e6d206a42c89e5be55b324d02633db0abc9cae19 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include "ti_cblas.h"
+#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#ifdef __cplusplus
extern "C" {
/* Do an init on first use */
if (!ti_cblas_init_done) ti_cblas_init();
TI_CBLAS_DEBUG_PRINT("Intercepted call to %s\n", "cblas_cswap");
+
/* OpenCL cannot deal with overlapping memory regions. This is an issue when you
* are trying to swap two rows of a matrix, where the matrix is column major. Hence,
* the offload of this routine to the DSP is disabled.
*/
#ifndef TI_CBLAS_SWAP_ENABLE_OFFLOAD
TI_CBLAS_PROFILE_START();
-
TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cswap");
- __real_cblas_cswap(N,X,incX,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cswap", (float) clock_diff);
+ __real_cblas_cswap(N,X,incX,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cswap", (float) clock_diff);
return ;
#else
- TI_CBLAS_PROFILE_START();
+ TI_CBLAS_PROFILE_START();
/* Dynamic condtional offload to ARM */
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
- TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cswap");
- __real_cblas_cswap(N,X,incX,Y,incY);
- TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cswap", (float) clock_diff);
- return ;
- }
- /* End ARM offload */
-
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_NONE)) {
+ TI_CBLAS_DEBUG_PRINT("Executing ARM %s\n", "cblas_cswap");
+ __real_cblas_cswap(N,X,incX,Y,incY);
+ TI_CBLAS_PROFILE_REPORT(" Entire %s call (ARM) took %8.2f us\n","cblas_cswap", (float) clock_diff);
+ return ;
+ }
+ /* End ARM offload */
/******************************************************************/
/* DSP offload WILL be done if control reaches here */
- TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cswap");
+ TI_CBLAS_DEBUG_PRINT("Offloading to DSP %s\n", "cblas_cswap");
/* Lookup kernel pointer from global table */
#ifdef __cplusplus
#else
cl_kernel __K;
#endif
- __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CSWAP_IDX, "ocl_cblas_cswap");
+ __K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CSWAP_IDX, "ocl_cblas_cswap");
+
#ifdef __cplusplus
try
#else
#endif
{
-
#ifdef __cplusplus
__K->setArg(0, N);
#else
@@ -144,9 +144,10 @@ void cblas_cswap(const int N, void *X, const int incX, void *Y, const int incY)
TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
err |= clReleaseEvent(e);
TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-
#endif
+
ti_cblas_delete_kernel(__K);
+