summary | shortlog | log | commit | commitdiff | tree
raw | patch | inline | side by side (parent: c2bcb77)
raw | patch | inline | side by side (parent: c2bcb77)
author | Jianzhong Xu <a0869574local@uda0869574> | |
Fri, 30 Oct 2015 20:52:50 +0000 (16:52 -0400) | ||
committer | Jianzhong Xu <a0869574local@uda0869574> | |
Fri, 30 Oct 2015 20:52:50 +0000 (16:52 -0400) |
110 files changed:
diff --git a/Makefile b/Makefile
index 9b9b298ebdad5136a902b6c1b2e581087d069170..9fe02c9c4d1b43491361362ee3cf12f4810502eb 100644 (file)
--- a/Makefile
+++ b/Makefile
LINALG_HEADERS+=$(LINALG_CLAPACK_DIR)/INCLUDE/f2c.h
LINALG_HEADERS+=$(LINALG_CBLAS_DIR)/include/cblas.h
+
build: ARMonly
prebuild: DSPonly
linalg: ARMplusDSP
cd ../$(LINALG_BLASACC_DIR)/src; make -f Makefile.ARM; \
cd ../../$(LINALG_CLAPACK_DIR); make f2clib; make cblaswrap; cd SRC; make
-ARMplusDSP:
+ARMplusDSP_K2H:
cd $(LINALG_CBLAS_DIR); make arch=ARM alllib; make arch=C66 alllib; \
- cd ../$(LINALG_TICBLAS_DIR)/src; make; cd ..; \
+ cd ../$(LINALG_TICBLAS_DIR)/src; make MEM_MODEL=Large; cd ..; \
cd ../$(LINALG_BLIS_DIR); ./configure -p install/c66x c66x; make -j8; make install; \
./configure -p install/arm cortex-a15; make -j8; make install; \
- cd ../$(LINALG_BLASACC_DIR); make cross; \
- cd ../$(LINALG_CLAPACK_DIR); make f2clib; make cblaswrap; cd SRC; make
+ cd ../$(LINALG_BLASACC_DIR); make crossC66x; \
+ cd ../$(LINALG_CLAPACK_DIR); make f2clib; make cblaswrap; cd SRC; make -j8
+
+# AM57x build: CBLAS for arch=ARM and arch=C66, TI CBLAS with MEM_MODEL=Small,
+# BLIS configured/built/installed for am57x and then for cortex-a15, the
+# crossAM57x target in $(LINALG_BLASACC_DIR), and finally CLAPACK (f2clib,
+# cblaswrap, SRC).
+# The whole recipe is one shell command list joined with ';', so a step that
+# fails does not stop the steps after it.
+ARMplusDSP_AM57x:
+ cd $(LINALG_CBLAS_DIR); make arch=ARM alllib; make arch=C66 alllib; \
+ cd ../$(LINALG_TICBLAS_DIR)/src; make MEM_MODEL=Small; cd ..; \
+ cd ../$(LINALG_BLIS_DIR); ./configure -p install/am57x am57x; make -j8; make install; \
+ ./configure -p install/arm cortex-a15; make -j8; make install; \
+ cd ../$(LINALG_BLASACC_DIR); make crossAM57x; \
+ cd ../$(LINALG_CLAPACK_DIR); make f2clib; make cblaswrap; cd SRC; make -j8
+
+# Shannon build: same sequence as the other platform targets, with TI CBLAS
+# built with MEM_MODEL=Medium, BLIS configured for shannon, and the
+# crossShannon target in $(LINALG_BLASACC_DIR).
+# Despite the "DSPonly" name, the recipe also builds CBLAS with arch=ARM and
+# BLIS for cortex-a15.
+# The steps are joined with ';', so a failing step does not stop later ones.
+DSPonly_Shannon:
+ cd $(LINALG_CBLAS_DIR); make arch=ARM alllib; make arch=C66 alllib; \
+ cd ../$(LINALG_TICBLAS_DIR)/src; make MEM_MODEL=Medium; cd ..; \
+ cd ../$(LINALG_BLIS_DIR); ./configure -p install/shannon shannon; make -j8; make install; \
+ ./configure -p install/arm cortex-a15; make -j8; make install; \
+ cd ../$(LINALG_BLASACC_DIR); make crossShannon; \
+ cd ../$(LINALG_CLAPACK_DIR); make f2clib; make cblaswrap; cd SRC; make -j8
BLIStest:
cd $(LINALG_BLIS_DIR)/testsuite; make lib=OpenCLCBLAS -j8
-cleanARMplusDSP:
+# Run the Makeblat1, Makeblat2 and Makeblat3 makefiles, in that order, from
+# the BLAS/TESTING directory under $(LINALG_CLAPACK_DIR).
+BLAStest:
+ cd $(LINALG_CLAPACK_DIR)/BLAS/TESTING; make -f Makeblat1; make -f Makeblat2; make -f Makeblat3
+
+# Run make in the TESTING/MATGEN, TESTING/LIN and TESTING/EIG directories under
+# $(LINALG_CLAPACK_DIR), in that order. Each recipe line runs in its own shell,
+# so every line starts again from the directory of this Makefile.
+CLAPACKtest:
+ cd $(LINALG_CLAPACK_DIR)/TESTING/MATGEN; make
+ cd $(LINALG_CLAPACK_DIR)/TESTING/LIN; make
+ cd $(LINALG_CLAPACK_DIR)/TESTING/EIG; make
+
+
+cleanARMplusDSP_K2H:
cd $(LINALG_CBLAS_DIR); make arch=ARM clean; make arch=C66 clean; \
cd ../$(LINALG_TICBLAS_DIR)/src; make clean; cd ..; \
cd ../$(LINALG_BLIS_DIR); ./configure -p install/c66x c66x; make clean; \
cd ../$(LINALG_BLASACC_DIR); make clean; \
cd ../$(LINALG_BLIS_DIR)/testsuite; make clean; \
cd ../../$(LINALG_CLAPACK_DIR); make clean
+
+# Clean the AM57x build: CBLAS (arch=ARM and arch=C66), TI CBLAS, BLIS, the
+# $(LINALG_BLASACC_DIR) tree, the BLIS testsuite and CLAPACK.
+# BLIS is re-configured for am57x and then for cortex-a15, with a
+# "make clean" after each configure.
+cleanARMplusDSP_AM57x:
+ cd $(LINALG_CBLAS_DIR); make arch=ARM clean; make arch=C66 clean; \
+ cd ../$(LINALG_TICBLAS_DIR)/src; make clean; cd ..; \
+ cd ../$(LINALG_BLIS_DIR); ./configure -p install/am57x am57x; make clean; \
+ ./configure -p install/arm cortex-a15; make clean; \
+ cd ../$(LINALG_BLASACC_DIR); make clean; \
+ cd ../$(LINALG_BLIS_DIR)/testsuite; make clean; \
+ cd ../../$(LINALG_CLAPACK_DIR); make clean
+
+# Clean the Shannon build: CBLAS (arch=ARM and arch=C66), TI CBLAS, BLIS, the
+# $(LINALG_BLASACC_DIR) tree, the BLIS testsuite and CLAPACK.
+# BLIS is re-configured for shannon and then for cortex-a15, with a
+# "make clean" after each configure.
+cleanShannon:
+ cd $(LINALG_CBLAS_DIR); make arch=ARM clean; make arch=C66 clean; \
+ cd ../$(LINALG_TICBLAS_DIR)/src; make clean; cd ..; \
+ cd ../$(LINALG_BLIS_DIR); ./configure -p install/shannon shannon; make clean; \
+ ./configure -p install/arm cortex-a15; make clean; \
+ cd ../$(LINALG_BLASACC_DIR); make clean; \
+ cd ../$(LINALG_BLIS_DIR)/testsuite; make clean; \
+ cd ../../$(LINALG_CLAPACK_DIR); make clean
clean:
cd $(LINALG_CBLAS_DIR)/src; make arch=ARM clean; \
diff --git a/blasblisacc/Makefile b/blasblisacc/Makefile
index 8d02c6746ca589e283737cfdc86e2d002fcc83a4..1c6e202d87b96753837a5ea321a4f314be6047a8 100644 (file)
--- a/blasblisacc/Makefile
+++ b/blasblisacc/Makefile
include ../make.inc
# use all for cross compilation
-cross: all
# build library
all:
cd src; $(MAKE)
+# Per-platform cross builds: each target changes into src/ and runs the
+# target of the same name there through $(MAKE).
+crossC66x:
+ cd src; $(MAKE) crossC66x
+
+crossAM57x:
+ cd src; $(MAKE) crossAM57x
+
+crossShannon:
+ cd src; $(MAKE) crossShannon
+
debug:
cd src; $(MAKE) debug
index edc23a8f528e0855c663626cdd7805aa6c5efb9f..07fd4dd3f671ef1c83d69d3dab64482134b6f26e 100644 (file)
--- a/blasblisacc/src/Makefile
+++ b/blasblisacc/src/Makefile
$(eval $(call FIND_DSP_PKG,XDAIS_DIR,xdais*,packages))
$(eval $(call FIND_DSP_PKG,XDC_DIR,xdc*,packages))
-INCDIR := $(TI_OCL_CGT_INSTALL)/include;../../blis/install/c66x/include/blis/;$(OMP_DIR)/packages/ti/runtime/openmp;$(PDK_INC)
+#INCDIR := $(TI_OCL_CGT_INSTALL)/include;../../blis/install/c66x/include/blis/;$(OMP_DIR)/packages/ti/runtime/openmp;$(PDK_INC)
+INCDIR := $(TI_OCL_CGT_INSTALL)/include;$(OMP_DIR)/packages/ti/runtime/openmp;$(PDK_INC)
INCDIR += -I$(FC_DIR)/packages
INCDIR += -I$(XDC_DIR)/packages
INCDIR += -I$(XDAIS_DIR)/packages
# CBLAS and BLIS directories
CBLAS_DSP_LIB = ../../cblas/lib/C66/libcblas_C66.ae66
TICBLAS_DSP_LIB = ../../ticblas/lib/libticblas.a66x
-BLIS_DSP_LIB = ../../blis/install/c66x/lib/libblis.ae66
CBLAS_ARM_LIB = ../../cblas/lib/ARM/libcblas_ARM.a
LIBARCH_LIB = $(LIBARCH_DIR)/packages/ti/libarch/lib/libArch.a66x
+# Pick the DSP BLIS library from MEM_MODEL:
+#   Large  -> blis/install/c66x
+#   Medium -> blis/install/shannon
+#   Small  -> blis/install/am57x
+# There is no default branch: for any other value, or when MEM_MODEL is not
+# set, this conditional leaves BLIS_DSP_LIB unassigned.
+ifeq ($(MEM_MODEL),Large)
+BLIS_DSP_LIB = ../../blis/install/c66x/lib/libblis.ae66
+else ifeq ($(MEM_MODEL),Medium)
+BLIS_DSP_LIB = ../../blis/install/shannon/lib/libblis.ae66
+else ifeq ($(MEM_MODEL),Small)
+BLIS_DSP_LIB = ../../blis/install/am57x/lib/libblis.ae66
+#else ifeq ($(MEM_MODEL),Tiny)
+endif
+
OCL_BIN = ti_cblas_kernel.out
ifeq ($(TI_CBLAS_FAT_BINARY), 1)
all: armplusdsp
cross: armplusdsp
+# Per-platform cross targets. Each one sets BLIS_DSP_LIB as a target-specific
+# variable (in GNU make this value is also in effect while the target's
+# prerequisites are built) and then depends on cross, $(OCL_BIN) and $(OBJS).
+crossC66x: BLIS_DSP_LIB = ../../blis/install/c66x/lib/libblis.ae66
+#crossC66x: CL6X_FLAGS+= -I../../blis/install/c66x/include/blis/
+crossC66x: cross $(OCL_BIN) $(OBJS)
+
+crossAM57x: BLIS_DSP_LIB = ../../blis/install/am57x/lib/libblis.ae66
+#crossAM57x: CL6X_FLAGS+= -I../../blis/install/am57x/include/blis/
+crossAM57x: cross $(OCL_BIN) $(OBJS)
+
+crossShannon: BLIS_DSP_LIB = ../../blis/install/shannon/lib/libblis.ae66
+#crossShannon: CL6X_FLAGS+= -I../../blis/install/shannon/include/blis/
+crossShannon: cross $(OCL_BIN) $(OBJS)
+
debug: CPP_FLAGS += -DTI_CBLAS_DEBUG $(CPP_DEBUG) #-DTI_CBLAS_PROFILE
debug: cross
index c88afc9e9db7c02b78b7b93a391b9d2a958142b3..fdbaef071facd58c0f3b1ddf7a2486988223d74b 100644 (file)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-//#include "stdio.h"
-#include "blis.h"
#include "../../ticblas/ticblas.h"
#include <ti/libarch/libarch.h>
#define BLIS_L3_DDR_SIZE_ZERO (0)
+extern void bli_init();
+extern void bli_finalize();
+
int bli_l3_mem_config(double *msmc_buf, size_t msmc_buf_size, size_t *l1D_SRAM_size_orig, size_t *l2_SRAM_size_orig)
{
size_t smem_size_vfast, smem_size_fast, smem_size_med, smem_size_slow;
@@ -41,20 +42,20 @@ int bli_l3_mem_config(double *msmc_buf, size_t msmc_buf_size, size_t *l1D_SRAM_s
/* First, verify the provided/available memory meet requirements */
tiCblasGetSizes(&smem_size_vfast, &smem_size_fast, &smem_size_med, &smem_size_slow);
- if( (smem_size_vfast> lib_get_L1D_total_size()) // total available L1D
- ||(smem_size_fast > lib_get_L2_total_size()) // total available L2
- ||(smem_size_med > msmc_buf_size) // provided MSMC memory
- ||(smem_size_slow > BLIS_L3_DDR_SIZE_ZERO) // DDR not used
+ if( (smem_size_vfast> lib_get_L1D_total_size()) /* total available L1D */
+ ||(smem_size_fast > lib_get_L2_total_size()) /* total available L2 */
+ ||(smem_size_med > msmc_buf_size) /* provided MSMC memory */
+ ||(smem_size_slow > BLIS_L3_DDR_SIZE_ZERO) /* DDR not used */
) {
return(-2);
}
/* Configure L1D if necessary */
- *l1D_SRAM_size_orig = lib_get_L1D_SRAM_size(); // get current L1D SRAM size
+ *l1D_SRAM_size_orig = lib_get_L1D_SRAM_size(); /* get current L1D SRAM size */
l1d_cfg_err = LIB_CACHE_SUCCESS;
- //printf("Original L1D SRAM size is: %d\n", *l1D_SRAM_size_orig);
- //printf("Required L1D SRAM size is: %d\n", smem_size_vfast);
- if(*l1D_SRAM_size_orig < smem_size_vfast) { // configure L1D if needs more SRAM
+ /*printf("Original L1D SRAM size is: %d\n", *l1D_SRAM_size_orig);
+ printf("Required L1D SRAM size is: %d\n", smem_size_vfast);*/
+ if(*l1D_SRAM_size_orig < smem_size_vfast) { /* configure L1D if needs more SRAM */
#pragma omp parallel
{
l1d_cfg_err = lib_L1D_config_SRAM(smem_size_vfast);
@@ -62,11 +63,11 @@ int bli_l3_mem_config(double *msmc_buf, size_t msmc_buf_size, size_t *l1D_SRAM_s
}
/* Configure L2 if necessary */
- *l2_SRAM_size_orig = lib_get_L2_SRAM_size(); // get current L2 SRAM size
+ *l2_SRAM_size_orig = lib_get_L2_SRAM_size(); /* get current L2 SRAM size */
l2_cfg_err = LIB_CACHE_SUCCESS;
- //printf("Original L2 SRAM size is: %d\n", *l2_SRAM_size_orig);
- //printf("Required L2 SRAM size is: %d\n", smem_size_fast);
- if(*l2_SRAM_size_orig < smem_size_fast) { // configure L2 if needs more SRAM
+ /*printf("Original L2 SRAM size is: %d\n", *l2_SRAM_size_orig);
+ printf("Required L2 SRAM size is: %d\n", smem_size_fast);*/
+ if(*l2_SRAM_size_orig < smem_size_fast) { /* configure L2 if needs more SRAM */
#pragma omp parallel
{
l2_cfg_err = lib_L2_config_SRAM(smem_size_fast);
l2_SRAM_ptr, smem_size_fast,
msmc_buf, msmc_buf_size,
NULL, BLIS_L3_DDR_SIZE_ZERO));
-}
+} /* bli_l3_mem_config */
/*==============================================================================
* This function reconfigures L1D and L2 after processing is finished
}
return(TICBLAS_SUCCESS);
-} /* test_reconfig_memory */
+} /* bli_l3_mem_reconfig */
+/* Calls bli_init(); the l3_buf and l2_buf arguments are not used by the body. This function will be removed. Function tiCblasNew() will be used instead. */
void ti_bli_init_dsp(char *l3_buf, char *l2_buf)
{
-// printf("In function ti_bli_init_dsp, l3_buff is 0x%x, l2_buf is 0x%x.\n", (unsigned int)l3_buf, (unsigned int)l2_buf);
+/* printf("In function ti_bli_init_dsp, l3_buff is 0x%x, l2_buf is 0x%x.\n", (unsigned int)l3_buf, (unsigned int)l2_buf); */
bli_init();
}
+/* This function will be removed. Function tiCblasDelete() will be used instead. */
void ti_bli_finalize_dsp(void)
{
bli_finalize();
diff --git a/blis/Makefile b/blis/Makefile
index b7ee56fbe1b153b485bf4ca8ff13dd274074bb7a..c41726c53e2e59a9dce2f159970a65c198d36c41 100644 (file)
--- a/blis/Makefile
+++ b/blis/Makefile
endif # pnacl
# --- Install rules ---
+# The DSP configurations (c66x, am57x, shannon) additionally install the
+# library under a .ae66 name. $(filter ...) expands to nothing for every other
+# CONFIG_NAME, including an empty one, so only those three take this branch.
install-libs: check-env $(MK_LIBS_INST_W_VERS_CONF)
-ifeq ($(CONFIG_NAME),c66x)
+ifneq ($(filter c66x am57x shannon,$(CONFIG_NAME)),)
@echo "Installing as DSP Binary"
@cp $(MK_BLIS_LIB_INST_W_VERS_CONF) $(patsubst %.a, %.ae66, $(MK_BLIS_LIB_INST_W_VERS_CONF))
@$(SYMLINK) $(notdir $(patsubst %.a, %.ae66, $(MK_BLIS_LIB_INST_W_VERS_CONF))) $(INSTALL_PREFIX)/lib/$(BLIS_LIB_BASE_NAME).ae66
diff --git a/blis/config/am57x/bli_config.h b/blis/config/am57x/bli_config.h
--- /dev/null
@@ -0,0 +1,291 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2014, The University of Texas
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name of The University of Texas nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_CONFIG_H
+#define BLIS_CONFIG_H
+
+#define BLIS_ENABLE_C66X_BUILD
+
+#define BLIS_ENABLE_C66X_AM57X
+
+#define BLIS_ENABLE_C66X_MEM_POOLS
+
+#define BLIS_ENABLE_C66X_OPENCL
+
+#ifdef BLIS_ENABLE_C66X_OPENCL
+// clocl creates a cio section in L2 when fprintf is used. Redefining fprintf to map to printf.
+#define fprintf ti_printf
+#endif
+
+
+
+
+// -- OPERATING SYSTEM ---------------------------------------------------------
+
+// -- INTEGER PROPERTIES -------------------------------------------------------
+
+// The bit size of the integer type used to track values such as dimensions,
+// strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed
+// integers while 64 results in 64-bit integers. Any other value results in use
+// of the C99 type "long int". Note that this ONLY affects integers used
+// internally within BLIS as well as those exposed in the native BLAS-like BLIS
+// interface.
+#define BLIS_INT_TYPE_SIZE 32
+
+
+
+// -- FLOATING-POINT PROPERTIES ------------------------------------------------
+
+// Define the number of floating-point types supported, and the size of the
+// largest type.
+#define BLIS_NUM_FP_TYPES 4
+#define BLIS_MAX_TYPE_SIZE sizeof(dcomplex)
+
+// Enable use of built-in C99 "float complex" and "double complex" types and
+// associated overloaded operations and functions? Disabling results in
+// scomplex and dcomplex being defined in terms of simple structs.
+//#define BLIS_ENABLE_C99_COMPLEX
+
+// -- c66x headers -------------------------------------------------------------
+#include "c6x.h"
+
+// NOTE(review): this header comes from the k2h device directory although this
+// file is the am57x configuration — confirm that is intended.
+#include <ti/csl/device/k2h/src/cslr_device.h>
+
+#include <ti/libarch/libarch.h>
+//#include <ti/csl/csl_chipAux.h> // CSL_chipReadDNUM -> to read coreID
+//#include <ti/csl/csl_cacheAux.h> // CACHE_invL1d
+
+// for __clock64()
+#include <dsp_c.h>
+
+
+// -- EDMA ---------------------------------------------------------------------
+#define BLIS_ENABLE_C66X_EDMA
+
+#ifdef BLIS_ENABLE_C66X_EDMA
+
+#define BLIS_GEMM_DMAB_CNTL NULL
+#define BLIS_GEMM_DMAA_CNTL NULL
+
+/*
+#if USING_FC_EDMAMGR
+#include <xdc/std.h>
+
+#define ECPY_INLINE_ALL 1
+#define EDMAMGR_INLINE_ALL 1
+#include <ti/sdo/fc/edmamgr/edmamgr.h>
+#else
+#include "edmamgr.h"
+#endif
+*/
+#define BLIS_C66X_MAXDMASTRIDE 0x7FFF
+
+#define BLIS_C66X_EDMA_MAX_NUM_CHANNELS 6
+#endif
+
+
+// -- PREFETCH -----------------------------------------------------------------
+//#define BLIS_ENABLE_C66X_PREFETCH
+
+#ifdef BLIS_ENABLE_C66X_PREFETCH
+#include "touch.h"
+#endif
+
+// -- IDMA -----------------------------------------------------------------
+#define BLIS_ENABLE_C66X_IDMA
+
+#ifdef BLIS_ENABLE_C66X_IDMA
+#include "idma.h"
+#endif
+
+// -- PROFILE -----------------------------------------------------------------
+//uncomment to Profile performance
+//#define BLIS_ENABLE_PROFILE
+
+// -- MULTITHREADING -----------------------------------------------------------
+
+// The maximum number of BLIS threads that will run concurrently.
+#define BLIS_ENABLE_MULTITHREADING
+#define BLIS_ENABLE_OPENMP
+/* When testing this code on Hawking, BLIS_MAX_NUM_THREADS needs to be 8. OpenMP
+assigns its threads to the cores randomly, so the value must be 8 to make sure
+all the cores are initialized before the OpenMP region begins.
+*/
+
+#define BLIS_MAX_NUM_THREADS 8
+
+#define BLIS_C66X_IC_NT 2
+#define BLIS_C66X_JC_NT 1
+#define BLIS_C66X_JR_NT 1
+#define BLIS_C66X_IR_NT 1
+
+
+
+
+
+// -- MEMORY ALLOCATION --------------------------------------------------------
+
+// -- Contiguous (static) memory allocator --
+
+// The number of MC x KC, KC x NC, and MC x NC blocks to reserve in the
+// contiguous memory pools.
+
+// Block counts for the MC x KC and KC x NC pools. The two values derived from
+// BLIS_MAX_NUM_THREADS are parenthesized so that each expands as a single
+// value wherever the macro is used (e.g. when multiplied by a block size).
+#define BLIS_NUM_MC_X_KC_BLOCKS_L3 0
+#define BLIS_NUM_MC_X_KC_BLOCKS_L2 1 //2 // Each L2 RAM is local to its DSP; only one packed buffer per thread is needed
+#define BLIS_NUM_MC_X_KC_BLOCKS_L1 0
+#define BLIS_NUM_MR_X_KC_BLOCKS_L1 2 // To transfer A to L1 in a ping-pong manner
+#define BLIS_NUM_MC_X_KC_BLOCKS (2*BLIS_MAX_NUM_THREADS + 1) //To test w/o DMA and L2, L3 memory, all memory must be in DDR3 now
+
+#define BLIS_NUM_KC_X_NC_BLOCKS_L3 1 //2
+#define BLIS_NUM_KC_X_NC_BLOCKS_L2 0
+#define BLIS_NUM_KC_X_NC_BLOCKS_L1 0
+#define BLIS_NUM_KC_X_NR_BLOCKS_L1 1
+#define BLIS_NUM_KC_X_NC_BLOCKS (2*BLIS_MAX_NUM_THREADS) //To test w/o DMA and L2, L3 memory, all memory must be in DDR3 now
+
+#define BLIS_NUM_MC_X_NC_BLOCKS_L3 0
+#define BLIS_NUM_MC_X_NC_BLOCKS_L2 0
+#define BLIS_NUM_MC_X_NR_BLOCKS_L2 3 //Bringing C into the L2 memory. We need 3 buffers, one to read, one to compute and one to write.
+#define BLIS_NUM_MC_X_NC_BLOCKS_L1 0
+#define BLIS_NUM_MR_X_NR_BLOCKS_L1 0
+#define BLIS_NUM_MC_X_NC_BLOCKS 0
+
+
+// The maximum preload byte offset is used to pad the end of the contiguous
+// memory pools so that the micro-kernel, when computing with the end of the
+// last block, can exceed the bounds of the usable portion of the memory
+// region without causing a segmentation fault.
+#define BLIS_MAX_PRELOAD_BYTE_OFFSET 128
+
+// -- Memory alignment --
+
+// It is sometimes useful to define the various memory alignments in terms
+// of some other characteristics of the system, such as the cache line size
+// and the page size.
+#define BLIS_CACHE_LINE_SIZE 64
+#define BLIS_PAGE_SIZE 4096
+
+// Alignment size needed by the instruction set for aligned SIMD/vector
+// instructions.
+#define BLIS_SIMD_ALIGN_SIZE 16
+
+// Alignment size used to align local stack buffers within macro-kernel
+// functions.
+#define BLIS_STACK_BUF_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
+
+// Alignment size used when allocating memory dynamically from the operating
+// system (eg: posix_memalign()). To disable heap alignment and just use
+// malloc() instead, set this to 1.
+#define BLIS_HEAP_ADDR_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
+
+// Alignment size used when sizing leading dimensions of dynamically
+// allocated memory.
+#define BLIS_HEAP_STRIDE_ALIGN_SIZE BLIS_CACHE_LINE_SIZE
+
+// Alignment size used when allocating entire blocks of contiguous memory
+// from the contiguous memory allocator.
+#define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_CACHE_LINE_SIZE//BLIS_PAGE_SIZE
+
+// Extra buffer space in each block in L1 to account for bank conflicts
+/* There are 2 buffers of size MRK and 1 buffer of size KNR in L1. The
+ * extra buffer space in each block in L1 is computed based on the
+ * remaining space available in L1. L1DSRAM is configured to size 28K.
+ * The total size of the blocks in L1 = 2*MR*KC*(size of datatype) + KC*NR*(size of datatype).
+ * The remaining available space in L1 is divided such that
+ * 2*BLIS_MRK_BLOCK_BUFFER_L1 + 1*BLIS_KNR_PANEL_BUFFER_L1 + 0*BLIS_MRNR_BLOCK_BUFFER_L1 = remaining available space.
+ */
+#define BLIS_MRK_BLOCK_BUFFER_L1_S 128 //
+#define BLIS_MRK_BLOCK_BUFFER_L1_D 64 //
+#define BLIS_MRK_BLOCK_BUFFER_L1_C 64 //
+#define BLIS_MRK_BLOCK_BUFFER_L1_Z 32 //
+#define BLIS_KNR_PANEL_BUFFER_L1_S 256 //
+#define BLIS_KNR_PANEL_BUFFER_L1_D 128
+#define BLIS_KNR_PANEL_BUFFER_L1_C 128
+#define BLIS_KNR_PANEL_BUFFER_L1_Z 64
+#define BLIS_MRNR_BLOCK_BUFFER_L1_S 128 //
+#define BLIS_MRNR_BLOCK_BUFFER_L1_D 64 //
+#define BLIS_MRNR_BLOCK_BUFFER_L1_C 64 //
+#define BLIS_MRNR_BLOCK_BUFFER_L1_Z 32 //
+
+//
+
+#define bli_sbank 8
+#define bli_dbank 16
+#define bli_cbank 16
+#define bli_zbank 24
+
+
+
+// -- MIXED DATATYPE SUPPORT ---------------------------------------------------
+
+// Basic (homogeneous) datatype support always enabled.
+
+// Enable mixed domain operations?
+//#define BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
+
+// Enable extra mixed precision operations?
+//#define BLIS_ENABLE_MIXED_PRECISION_SUPPORT
+
+
+
+// -- MISCELLANEOUS OPTIONS ----------------------------------------------------
+
+// Stay initialized after auto-initialization, unless and until the user
+// explicitly calls bli_finalize().
+#define BLIS_ENABLE_STAY_AUTO_INITIALIZED
+
+
+
+// -- BLAS-to-BLIS COMPATIBILITY LAYER -----------------------------------------
+
+// Enable the BLAS compatibility layer?
+#define BLIS_ENABLE_BLAS2BLIS
+
+// The bit size of the integer type used to track values such as dimensions and
+// leading dimensions (ie: column strides) within the BLAS compatibility layer.
+// A value of 32 results in the compatibility layer using 32-bit signed integers
+// while 64 results in 64-bit integers. Any other value results in use of the
+// C99 type "long int". Note that this ONLY affects integers used within the
+// BLAS compatibility layer.
+#define BLIS_BLAS2BLIS_INT_TYPE_SIZE 32
+
+// Fortran-77 name-mangling macros.
+#define PASTEF770(name) name ## _
+#define PASTEF77(ch1,name) ch1 ## name ## _
+#define PASTEF772(ch1,ch2,name) ch1 ## ch2 ## name ## _
+
+
+
+
+#endif
+
diff --git a/blis/config/am57x/bli_kernel.h b/blis/config/am57x/bli_kernel.h
--- /dev/null
@@ -0,0 +1,247 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2014, The University of Texas
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name of The University of Texas nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_KERNEL_H
+#define BLIS_KERNEL_H
+
+// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
+
+// -- Cache blocksizes --
+
+//
+// Constraints:
+//
+// (1) MC must be a multiple of:
+// (a) MR (for zero-padding purposes)
+// (b) NR (for zero-padding purposes when MR and NR are "swapped")
+// (2) NC must be a multiple of
+// (a) NR (for zero-padding purposes)
+// (b) MR (for zero-padding purposes when MR and NR are "swapped")
+// (3) KC must be a multiple of
+// (a) MR and
+// (b) NR (for triangular operations such as trmm and trsm).
+//
+
+// Single-precision real cache blocksizes; the values after "//" are alternatives left by the author.
+// NOTE(review): KC_S = 428 is not a multiple of BLIS_DEFAULT_NR_S (8), which
+// constraint (3) in the list above asks for — confirm this is intended.
+#define BLIS_DEFAULT_MC_S 144 //432
+#define BLIS_DEFAULT_KC_S 428
+#define BLIS_DEFAULT_NC_S 1224 //2752 //
+
+#define BLIS_DEFAULT_MC_D 140 //420 //
+#define BLIS_DEFAULT_KC_D 220
+#define BLIS_DEFAULT_NC_D 1184 //2672 //
+
+#define BLIS_DEFAULT_MC_C 116
+#define BLIS_DEFAULT_KC_C 260
+#define BLIS_DEFAULT_NC_C 1008
+
+#define BLIS_DEFAULT_MC_Z 86
+#define BLIS_DEFAULT_KC_Z 178
+#define BLIS_DEFAULT_NC_Z 736
+
+#define BLIS_DEFAULT_4M_MC_C 140
+#define BLIS_DEFAULT_4M_KC_C 220
+#define BLIS_DEFAULT_4M_NC_C 1184
+
+#define BLIS_DEFAULT_4M_MC_Z 86
+#define BLIS_DEFAULT_4M_KC_Z 178
+#define BLIS_DEFAULT_4M_NC_Z 736
+
+#define BLIS_DEFAULT_3M_MC_C 88
+#define BLIS_DEFAULT_3M_KC_C 220
+#define BLIS_DEFAULT_3M_NC_C 792
+
+#define BLIS_DEFAULT_3M_MC_Z 56
+#define BLIS_DEFAULT_3M_KC_Z 178
+#define BLIS_DEFAULT_3M_NC_Z 488
+
+// -- Register blocksizes --
+
+#define BLIS_DEFAULT_MR_S 4
+#define BLIS_DEFAULT_NR_S 8
+
+#define BLIS_DEFAULT_MR_D 4
+#define BLIS_DEFAULT_NR_D 4
+
+#define BLIS_DEFAULT_MR_C 2
+#define BLIS_DEFAULT_NR_C 4
+
+#define BLIS_DEFAULT_MR_Z 1
+#define BLIS_DEFAULT_NR_Z 1
+
+
+// NOTE: If the micro-kernel, which is typically unrolled to a factor
+// of f, handles leftover edge cases (ie: when k % f > 0) then these
+// register blocksizes in the k dimension can be defined to 1.
+
+//#define BLIS_DEFAULT_KR_S 1
+//#define BLIS_DEFAULT_KR_D 1
+//#define BLIS_DEFAULT_KR_C 1
+//#define BLIS_DEFAULT_KR_Z 1
+
+// -- Cache blocksize extensions (for optimizing edge cases) --
+
+// NOTE: These cache blocksize "extensions" have the same constraints as
+// the corresponding default blocksizes above. When these values are
+// non-zero, blocksizes used at edge cases are extended (enlarged) if
+// such an extension would encompass the remaining portion of the
+// matrix dimension.
+
+//#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
+//#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
+//#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
+
+//#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
+//#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
+//#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
+
+//#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
+//#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
+//#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
+
+//#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
+//#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
+//#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
+
+// -- Register blocksize extensions (for packed micro-panels) --
+
+// NOTE: These register blocksize "extensions" determine whether the
+// leading dimensions used within the packed micro-panels are equal to
+// or greater than their corresponding register blocksizes above.
+
+//#define BLIS_EXTEND_MR_S 0
+//#define BLIS_EXTEND_NR_S 0
+
+//#define BLIS_EXTEND_MR_D 0
+//#define BLIS_EXTEND_NR_D 0
+
+//#define BLIS_EXTEND_MR_C 0
+//#define BLIS_EXTEND_NR_C 0
+
+//#define BLIS_EXTEND_MR_Z 0
+//#define BLIS_EXTEND_NR_Z 0
+
+
+
+// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
+
+
+
+
+// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
+
+
+
+
+// -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------
+
+// -- gemm --
+// Maps each datatype's gemm micro-kernel macro to a kernel function name.
+// NOTE(review): the s/d/c kernel names match BLIS_DEFAULT_MR_x x BLIS_DEFAULT_NR_x
+// in this header (4x8, 4x4, 2x4), but the z kernel is named 2x2 while
+// BLIS_DEFAULT_MR_Z and BLIS_DEFAULT_NR_Z are both 1 — confirm.
+#define BLIS_SGEMM_UKERNEL bli_sgemm_ukernel_4x8
+#define BLIS_DGEMM_UKERNEL bli_dgemm_ukernel_4x4
+#define BLIS_CGEMM_UKERNEL bli_cgemm_ukernel_2x4
+#define BLIS_ZGEMM_UKERNEL bli_zgemm_ukernel_2x2
+
+// -- trsm-related --
+#define BLIS_SGEMMTRSM_U_UKERNEL bli_sgemmtrsm_u_ukernel_4x4
+#define BLIS_SGEMMTRSM_L_UKERNEL bli_sgemmtrsm_l_ukernel_4x4
+
+#define BLIS_STRSM_U_UKERNEL bli_strsm_u_ukernel_4x4
+#define BLIS_STRSM_L_UKERNEL bli_strsm_l_ukernel_4x4
+
+
+// -- LEVEL-1M KERNEL DEFINITIONS ----------------------------------------------
+
+// -- packm --
+#define BLIS_SPACKM_4XK_KERNEL bli_spackm_4xk_ukernel
+#define BLIS_SPACKM_8XK_KERNEL bli_spackm_8xk_ukernel
+
+// -- unpackm --
+
+
+
+
+// -- LEVEL-1F KERNEL DEFINITIONS ----------------------------------------------
+
+// -- axpy2v --
+
+// -- dotaxpyv --
+
+// -- axpyf --
+
+// -- dotxf --
+
+// -- dotxaxpyf --
+
+
+
+
+// -- LEVEL-1V KERNEL DEFINITIONS ----------------------------------------------
+
+// -- addv --
+
+// -- axpyv --
+
+// -- copyv --
+
+// -- dotv --
+
+// -- dotxv --
+
+// -- invertv --
+
+// -- scal2v --
+
+// -- scalv --
+
+// -- setv --
+
+// -- subv --
+
+// -- swapv --
+
+// adding packm micro kernel prototypes
+#include "bli_packm_cxk_ukernels.h"
+
+// Declaration for bli_sgemm_ukernel_4x4, which is used by the gemmtrsm ukernel.
+// BLIS_SGEMM_UKERNEL in this header names the 4x8 kernel, so this 4x4 variant
+// presumably needs its own declaration here.
+// NOTE(review): a, b and c are presumably the packed A and B micro-panels and
+// the C tile with row/column strides rs_c/cs_c — confirm against the kernel source.
+void bli_sgemm_ukernel_4x4(
+ dim_t k,
+ float* restrict alpha,
+ float* restrict a,
+ float* restrict b,
+ float* restrict beta,
+ float* restrict c, inc_t rs_c, inc_t cs_c,
+ auxinfo_t* data
+ );
+
+#endif
+
diff --git a/blis/config/am57x/corepack_regs.h b/blis/config/am57x/corepack_regs.h
--- /dev/null
@@ -0,0 +1,82 @@
+/**
+ Structures for CorePack registers.
+*/
+
+#ifndef RT_COREPACK_REGS_H
+#define RT_COREPACK_REGS_H
+
+#include <stdint.h>
+
+/**
+ IDMA registers placed at 0x0182 0000.
+ The comment after each member is its byte offset from that base address.
+ The res* members are padding: they keep the named registers at those
+ offsets and extend the structure to 0x10000 bytes.
+*/
+struct corepack_idma_regs {
+ uint32_t idma0_stat; /* 0000 */
+ uint32_t idma0_mask; /* 0004 */
+ uint32_t idma0_source; /* 0008 */
+ uint32_t idma0_dest; /* 000C */
+ uint32_t idma0_count; /* 0010 */
+ uint32_t res1[59]; /* 0014 - 00ff */
+ uint32_t idma1_stat; /* 0100 */
+ uint32_t res2; /* 0104 */
+ uint32_t idma1_source; /* 0108 */
+ uint32_t idma1_dest; /* 010c */
+ uint32_t idma1_count; /* 0110 */
+ uint32_t res3[16315]; /* 0114 - ffff */
+};
+
+/**
+ Cache registers placed at 0x0184 0000.
+ The comment after each member is its byte offset from that base address.
+ The res* members are padding: they keep the named registers at those
+ offsets and extend the structure to 0x10000 bytes.
+*/
+struct corepack_cache_regs {
+ uint32_t l2cfg; /* 0000 */
+ uint32_t res1[7]; /* 0004 - 001f */
+ uint32_t l1pcfg; /* 0020 */
+ uint32_t l1pcc; /* 0024 */
+ uint32_t res2[6]; /* 0028 - 003f */
+ uint32_t l1dcfg; /* 0040 */
+ uint32_t l1dcc; /* 0044 */
+ uint32_t res3[4078]; /* 0048 - 3fff */
+ uint32_t l2wbar; /* 4000 */
+ uint32_t l2wwc; /* 4004 */
+ uint32_t res4[2]; /* 4008 - 400f */
+ uint32_t l2wibar; /* 4010 */
+ uint32_t l2wiwc; /* 4014 */
+ uint32_t l2ibar; /* 4018 */
+ uint32_t l2iwc; /* 401c */
+ uint32_t l1pibar; /* 4020 */
+ uint32_t l1piwc; /* 4024 */
+ uint32_t res5[2]; /* 4028 - 402f */
+ uint32_t l1dwibar; /* 4030 */
+ uint32_t l1dwiwc; /* 4034 */
+ uint32_t res6[2]; /* 4038 - 403f */
+ uint32_t l1dwbar; /* 4040 */
+ uint32_t l1dwwc; /* 4044 */
+ uint32_t l1dibar; /* 4048 */
+ uint32_t l1diwc; /* 404c */
+ uint32_t res7[1004]; /* 4050 - 4fff */
+ uint32_t l2wb; /* 5000 */
+ uint32_t l2wbinv; /* 5004 */
+ uint32_t l2inv; /* 5008 */
+ uint32_t res8[7]; /* 500c - 5027 */
+ uint32_t l1pinv; /* 5028 */
+ uint32_t res9[5]; /* 502c - 503f */
+ uint32_t l1dwb; /* 5040 */
+ uint32_t l1dwbinv; /* 5044 */
+ uint32_t l1dinv; /* 5048 */
+ uint32_t res10[3053]; /* 504c - 7fff */
+ uint32_t mar[256]; /* 8000 - 83ff */
+ uint32_t res11[7936]; /* 8400 - ffff */
+};
+
+/* Layout of the register space from 0x0180 0000 through 0x0184 ffff: the IDMA
+   and cache register blocks, separated by reserved padding. The comment after
+   each member is the address range it covers. */
+struct corepack_regs {
+ uint32_t res1[32768]; /* 0180 0000 - 0181 ffff */
+ struct corepack_idma_regs idma_regs; /* 0182 0000 - 0182 ffff */
+ uint32_t res2[16384]; /* 0183 0000 - 0183 ffff */
+ struct corepack_cache_regs cache_regs; /* 0184 0000 - 0184 ffff */
+};
+
+/* Constant pointer to the register block at address 0x01800000; all accesses
+   through it are volatile. It is declared static in this header, so every
+   source file that includes the header gets its own copy of the pointer. */
+static volatile struct corepack_regs * const corepack_regs =
+ (struct corepack_regs *)(0x01800000ul);
+
+#endif
diff --git a/blis/config/am57x/edmamgr.h b/blis/config/am57x/edmamgr.h
--- /dev/null
@@ -0,0 +1,104 @@
+#ifndef _EdmaMgr_h
+#define _EdmaMgr_h
+#include <stdint.h>
+
+/* Opaque handle returned by EdmaMgr_alloc() and taken by every wait, free
+   and copy call declared below. */
+typedef void *EdmaMgr_Handle;
+
+/* NOTE(review): presumably one-time initialisation for processor proc_id;
+   the int32_t result is presumably one of the EdmaMgr_* status codes at the
+   end of this header - confirm against the EdmaMgr implementation. */
+int32_t EdmaMgr_init (int32_t proc_id, void* edma3_config);
+
+/* Returns a handle; max_linked_transfers presumably bounds num_transfers in
+   the *Linked calls made with that handle - TODO confirm. */
+EdmaMgr_Handle EdmaMgr_alloc (int32_t max_linked_transfers);
+
+int32_t EdmaMgr_free (EdmaMgr_Handle h);
+
+/* NOTE(review): presumably blocks until the transfer(s) issued on h have
+   completed - confirm. */
+void EdmaMgr_wait (EdmaMgr_Handle h);
+
+/*
+ * Single-transfer copy calls.
+ *
+ * NOTE(review): the names appear to encode source/destination shape
+ * (1D = one contiguous run of num_bytes, 2D = num_lines runs of num_bytes
+ * spaced pitch apart); the "Sep" variant takes separate source and
+ * destination pitches.  src and dst are restrict-qualified, so the two
+ * buffers must not overlap.  Confirm details against the EdmaMgr docs.
+ */
+int32_t EdmaMgr_copy1D1D (EdmaMgr_Handle h,
+ void *restrict src,
+ void *restrict dst,
+ int32_t num_bytes);
+
+int32_t EdmaMgr_copy1D2D (EdmaMgr_Handle h,
+ void *restrict src,
+ void *restrict dst,
+ int32_t num_bytes,
+ int32_t num_lines,
+ int32_t pitch);
+
+int32_t EdmaMgr_copy2D1D (EdmaMgr_Handle h,
+ void *restrict src,
+ void *restrict dst,
+ int32_t num_bytes,
+ int32_t num_lines,
+ int32_t pitch);
+
+int32_t EdmaMgr_copy2D2D (EdmaMgr_Handle h,
+ void *restrict src,
+ void *restrict dst,
+ int32_t num_bytes,
+ int32_t num_lines,
+ int32_t pitch);
+
+int32_t EdmaMgr_copy2D2DSep (EdmaMgr_Handle h,
+ void *restrict src,
+ void *restrict dst,
+ int32_t num_bytes,
+ int32_t num_lines,
+ int32_t src_pitch,
+ int32_t dst_pitch);
+
+/*
+ * Linked variants: same parameters as the calls above, but each one is an
+ * array and num_transfers is passed alongside.
+ * NOTE(review): presumably every array holds num_transfers entries, one per
+ * linked transfer - confirm.
+ */
+int32_t EdmaMgr_copy1D1DLinked (EdmaMgr_Handle h,
+ void *restrict src[],
+ void *restrict dst[],
+ int32_t num_bytes[],
+ int32_t num_transfers);
+
+int32_t EdmaMgr_copy1D2DLinked (EdmaMgr_Handle h,
+ void *restrict src[],
+ void *restrict dst[],
+ int32_t num_bytes[],
+ int32_t num_lines[],
+ int32_t pitch[],
+ int32_t num_transfers);
+
+int32_t EdmaMgr_copy2D1DLinked (EdmaMgr_Handle h,
+ void *restrict src[],
+ void *restrict dst[],
+ int32_t num_bytes[],
+ int32_t num_lines[],
+ int32_t pitch[],
+ int32_t num_transfers);
+
+int32_t EdmaMgr_copy2D2DLinked (EdmaMgr_Handle h,
+ void *restrict src[],
+ void *restrict dst[],
+ int32_t num_bytes[],
+ int32_t num_lines[],
+ int32_t pitch[],
+ int32_t num_transfers);
+
+int32_t EdmaMgr_copy2D2DSepLinked(EdmaMgr_Handle h,
+ void *restrict src[],
+ void *restrict dst[],
+ int32_t num_bytes[],
+ int32_t num_lines[],
+ int32_t src_pitch[],
+ int32_t dst_pitch[],
+ int32_t num_transfers);
+
+/*
+ * "Fast" variants take only the addresses.
+ * NOTE(review): presumably they reuse the sizes/pitches programmed by an
+ * earlier full copy call on the same handle - confirm before use.
+ */
+int32_t EdmaMgr_copyFast (EdmaMgr_Handle h,
+ void *restrict src,
+ void *restrict dst);
+
+int32_t EdmaMgr_copyLinkedFast (EdmaMgr_Handle h,
+ void *restrict src[],
+ void *restrict dst[],
+ int32_t num_transfers);
+
+/* Status codes; presumably the values returned by the int32_t EdmaMgr_*
+   functions (confirm against the implementation).  Negative replacement
+   lists are parenthesised so each macro expands as a single operand in any
+   expression; the values themselves are unchanged. */
+#define EdmaMgr_SUCCESS 0
+#define EdmaMgr_ERROR_INVARG (-1)
+#define EdmaMgr_ERROR_INVCFG (-2)
+#define EdmaMgr_ERROR_RMANINIT (-3)
+#define EdmaMgr_ERROR_INVHANDLE (-4)
+#define EdmaMgr_ERROR_FREE (-5)
+
+#endif
diff --git a/blis/config/am57x/idma.h b/blis/config/am57x/idma.h
--- /dev/null
+++ b/blis/config/am57x/idma.h
@@ -0,0 +1,70 @@
+/**
+ Simple IDMA helper functions.
+*/
+
+/*
+
+Copyright (c) 2012 Kungliga Tekniska Högskolan
+(Royal Institute of Technology, Stockholm, Sweden).
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGE.
+
+*/
+
+#ifndef RT_IDMA_H
+#define RT_IDMA_H
+
+#include "corepack_regs.h"
+
+/*
+ * Write the three IDMA1 transfer registers: source, destination, count.
+ *
+ * dest, source  addresses, each truncated to 32 bits by the cast
+ * size          OR-ed into the low bits of the count word unmasked
+ * fill          shifted to bit 16 of the count word
+ * inter         shifted to bit 28 of the count word
+ * priority      shifted to bit 29 and above
+ *
+ * None of the fields is masked, so an out-of-range argument spills into the
+ * neighbouring field.
+ * NOTE(review): the count register is written last; presumably that write
+ * is what starts the transfer - confirm against the CorePac user guide.
+ */
+static inline void
+idma1_setup(void *dest, const void *source, unsigned size,
+            unsigned fill, unsigned inter, unsigned priority)
+{
+    const uint32_t count_word = (priority << 29) | (inter << 28) | (fill << 16) | size;
+
+    /* register write order is kept: source, destination, then count */
+    corepack_regs->idma_regs.idma1_source = (uint32_t)source;
+    corepack_regs->idma_regs.idma1_dest   = (uint32_t)dest;
+    corepack_regs->idma_regs.idma1_count  = count_word;
+}
+
+/*
+ * Return the current raw value of the IDMA1 status register (idma1_stat).
+ *
+ * Declared with (void): in C an empty parameter list "()" is a non-prototype
+ * declaration, which would let callers pass arguments unchecked.
+ */
+static inline uint32_t
+idma1_status(void)
+{
+    return corepack_regs->idma_regs.idma1_stat;
+}
+
+/*
+ * Return 1 when the IDMA1 status register reads back as 0, otherwise 0.
+ * NOTE(review): a zero status presumably means no transfer is active or
+ * pending on channel 1 - confirm against the CorePac user guide.
+ *
+ * Declared with (void) so the compiler rejects calls that pass arguments.
+ */
+static inline int
+idma1_done(void)
+{
+    return idma1_status() == 0;
+}
+
+#endif
diff --git a/blis/config/am57x/kernels/1m/bli_packm_cxk_ukernels.c b/blis/config/am57x/kernels/1m/bli_packm_cxk_ukernels.c
--- /dev/null
@@ -0,0 +1,193 @@
+/*\r
+\r
+ BLIS\r
+ An object-based framework for developing high-performance BLAS-like\r
+ libraries.\r
+\r
+ Copyright (C) 2014, The University of Texas\r
+\r
+ Redistribution and use in source and binary forms, with or without\r
+ modification, are permitted provided that the following conditions are\r
+ met:\r
+ - Redistributions of source code must retain the above copyright\r
+ notice, this list of conditions and the following disclaimer.\r
+ - Redistributions in binary form must reproduce the above copyright\r
+ notice, this list of conditions and the following disclaimer in the\r
+ documentation and/or other materials provided with the distribution.\r
+ - Neither the name of The University of Texas nor the names of its\r
+ contributors may be used to endorse or promote products derived\r
+ from this software without specific prior written permission.\r
+\r
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\r
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\r
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\r
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT\r
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\r
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\r
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\r
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\r
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\r
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\r
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
+\r
+*/\r
+\r
+#include "blis.h"\r
+\r
+/* Need to implement optimization for various cases */\r
+\r
+/*\r
+ * Pack a 4 x n single-precision micro-panel from a into p.\r
+ *\r
+ * conja   only forwarded to the reference routine; the fast paths ignore it\r
+ * n       number of columns to pack\r
+ * kappa   points to a float scale factor; fast paths run only when it is 1.0f\r
+ * a       source; inca is the stride between the 4 elements of one column,\r
+ *         lda the stride between consecutive columns (both in floats)\r
+ * p       destination; the 4 elements of a column are stored contiguously\r
+ *         and columns are ldp floats apart\r
+ *\r
+ * Fast path 1 (inca == 1, lda and ldp even): each column is copied as two\r
+ * __float2_t pairs.\r
+ * Fast path 2 (lda == 1, inca and ldp even): two columns are handled per\r
+ * iteration by reading one pair from each of the 4 rows and regrouping the\r
+ * low/high halves; an odd last column is copied element by element.\r
+ * Everything else is delegated to bli_spackm_ref_4xk().\r
+ *\r
+ * The halves of a __float2_t are extracted with _lof2()/_hif2(), the\r
+ * accessors that belong to that type, rather than the double-typed\r
+ * _lof()/_hif(); this keeps the kernel independent of how the compiler\r
+ * typedefs __float2_t.\r
+ *\r
+ * NOTE(review): both fast paths access a and p through __float2_t pointers,\r
+ * so they presumably need 8-byte aligned panels - confirm the callers\r
+ * guarantee that.\r
+ */\r
+void bli_spackm_4xk_ukernel(\r
+                             conj_t  conja,\r
+                             dim_t   n,\r
+                             void*   kappa,\r
+                             void*   a, inc_t inca, inc_t lda,\r
+                             void*   p, inc_t ldp\r
+                           )\r
+{\r
+    float* restrict kappa_cast = kappa;\r
+    dim_t index;\r
+\r
+    if(*kappa_cast == 1.0f)\r
+    {\r
+        if((inca==1) && ((lda&1)==0) && ((ldp&1)==0))\r
+        {\r
+            __float2_t *restrict ptrA0 = (__float2_t *) a;\r
+            __float2_t *restrict ptrP0 = (__float2_t *) p;\r
+\r
+            for(index=0;index<n;index++)\r
+            {\r
+                *ptrP0++ = *ptrA0++;\r
+                *ptrP0++ = *ptrA0++;\r
+                /* two pairs consumed; advance the rest of the column stride */\r
+                ptrA0 += ((lda>>1)-2);\r
+                ptrP0 += ((ldp>>1)-2);\r
+            }\r
+            return;\r
+        }\r
+        else if((lda==1) && ((ldp&1) == 0) && ((inca&1)==0))\r
+        {\r
+            __float2_t *restrict ptrA0 = (__float2_t *) a;\r
+            __float2_t *restrict ptrA1 = (__float2_t *) (((float *) a)+inca);\r
+            __float2_t *restrict ptrA2 = (__float2_t *) (((float *) a)+2*inca);\r
+            __float2_t *restrict ptrA3 = (__float2_t *) (((float *) a)+3*inca);\r
+            __float2_t *restrict ptrP0 = (__float2_t *) p;                 /* column j   */\r
+            __float2_t *restrict ptrP1 = (__float2_t *) (((float *) p)+ldp); /* column j+1 */\r
+            __float2_t val0, val1;\r
+            dim_t n_iter = n >> 1;\r
+            dim_t n_left = n & 1;\r
+\r
+            for(index=0;index<n_iter;index++)\r
+            {\r
+                /* rows 0 and 1: low halves go to column j, high halves to j+1 */\r
+                val0 = *ptrA0++;\r
+                val1 = *ptrA1++;\r
+                *ptrP0++ = _ftof2(_lof2(val1),_lof2(val0));\r
+                *ptrP1++ = _ftof2(_hif2(val1),_hif2(val0));\r
+                /* rows 2 and 3 */\r
+                val0 = *ptrA2++;\r
+                val1 = *ptrA3++;\r
+                *ptrP0++ = _ftof2(_lof2(val1),_lof2(val0));\r
+                *ptrP1++ = _ftof2(_hif2(val1),_hif2(val0));\r
+                /* skip two columns: 2*ldp floats = ldp pairs, 2 already consumed */\r
+                ptrP0 += ((ldp)-2);\r
+                ptrP1 += ((ldp)-2);\r
+            }\r
+            if(n_left)\r
+            {\r
+                float *restrict ptrA = ((float *) a+2*n_iter);\r
+                float *restrict ptrP = ((float *) p+2*n_iter*ldp);\r
+                ptrP[0] = ptrA[0];\r
+                ptrP[1] = ptrA[inca];\r
+                ptrP[2] = ptrA[2*inca];\r
+                ptrP[3] = ptrA[3*inca];\r
+            }\r
+            return;\r
+        }\r
+    }\r
+\r
+    /* handle unoptimized case using default packing routine */\r
+    bli_spackm_ref_4xk(conja, n, kappa, a, inca, lda, p, ldp);\r
+}\r
+\r
+/*\r
+ * Pack an 8 x n single-precision micro-panel from a into p.\r
+ *\r
+ * conja   only forwarded to the reference routine; the fast paths ignore it\r
+ * n       number of columns to pack\r
+ * kappa   points to a float scale factor; fast paths run only when it is 1.0f\r
+ * a       source; inca is the stride between the 8 elements of one column,\r
+ *         lda the stride between consecutive columns (both in floats)\r
+ * p       destination; the 8 elements of a column are stored contiguously\r
+ *         and columns are ldp floats apart\r
+ *\r
+ * Fast path 1 (inca == 1, lda and ldp even): each column is copied as four\r
+ * __float2_t pairs.\r
+ * Fast path 2 (lda == 1, inca and ldp even): two columns are handled per\r
+ * iteration by reading one pair from each of the 8 rows and regrouping the\r
+ * low/high halves; an odd last column is copied element by element.\r
+ * Everything else is delegated to bli_spackm_ref_8xk().\r
+ *\r
+ * The halves of a __float2_t are extracted with _lof2()/_hif2(), the\r
+ * accessors that belong to that type, rather than the double-typed\r
+ * _lof()/_hif(); this keeps the kernel independent of how the compiler\r
+ * typedefs __float2_t.\r
+ *\r
+ * NOTE(review): both fast paths access a and p through __float2_t pointers,\r
+ * so they presumably need 8-byte aligned panels - confirm the callers\r
+ * guarantee that.\r
+ */\r
+void bli_spackm_8xk_ukernel(\r
+                             conj_t  conja,\r
+                             dim_t   n,\r
+                             void*   kappa,\r
+                             void*   a, inc_t inca, inc_t lda,\r
+                             void*   p, inc_t ldp\r
+                           )\r
+{\r
+    float* restrict kappa_cast = kappa;\r
+    dim_t index;\r
+\r
+    if(*kappa_cast == 1.0f)\r
+    {\r
+        if((inca==1) && ((lda&1)==0) && ((ldp&1)==0))\r
+        {\r
+            __float2_t *restrict ptrA0 = (__float2_t *) a;\r
+            __float2_t *restrict ptrP0 = (__float2_t *) p;\r
+            for(index=0;index<n;index++)\r
+            {\r
+                *ptrP0++ = *ptrA0++;\r
+                *ptrP0++ = *ptrA0++;\r
+                *ptrP0++ = *ptrA0++;\r
+                *ptrP0++ = *ptrA0++;\r
+                /* four pairs consumed; advance the rest of the column stride */\r
+                ptrA0 += ((lda>>1)-4);\r
+                ptrP0 += ((ldp>>1)-4);\r
+            }\r
+            return;\r
+        }\r
+        else if((lda==1) && ((ldp&1) == 0) && ((inca&1)==0))\r
+        {\r
+            __float2_t *restrict ptrA0 = (__float2_t *) a;\r
+            __float2_t *restrict ptrA1 = (__float2_t *) (((float *) a)+inca);\r
+            __float2_t *restrict ptrA2 = (__float2_t *) (((float *) a)+2*inca);\r
+            __float2_t *restrict ptrA3 = (__float2_t *) (((float *) a)+3*inca);\r
+            __float2_t *restrict ptrA4 = (__float2_t *) (((float *) a)+4*inca);\r
+            __float2_t *restrict ptrA5 = (__float2_t *) (((float *) a)+5*inca);\r
+            __float2_t *restrict ptrA6 = (__float2_t *) (((float *) a)+6*inca);\r
+            __float2_t *restrict ptrA7 = (__float2_t *) (((float *) a)+7*inca);\r
+            __float2_t *restrict ptrP0 = (__float2_t *) p;                 /* column j   */\r
+            __float2_t *restrict ptrP1 = (__float2_t *) (((float *) p)+ldp); /* column j+1 */\r
+            __float2_t val0, val1;\r
+            dim_t n_iter = n >> 1;\r
+            dim_t n_left = n & 1;\r
+            for(index=0;index<n_iter;index++)\r
+            {\r
+                /* rows 0 and 1: low halves go to column j, high halves to j+1 */\r
+                val0 = *ptrA0++;\r
+                val1 = *ptrA1++;\r
+                *ptrP0++ = _ftof2(_lof2(val1),_lof2(val0));\r
+                *ptrP1++ = _ftof2(_hif2(val1),_hif2(val0));\r
+                /* rows 2 and 3 */\r
+                val0 = *ptrA2++;\r
+                val1 = *ptrA3++;\r
+                *ptrP0++ = _ftof2(_lof2(val1),_lof2(val0));\r
+                *ptrP1++ = _ftof2(_hif2(val1),_hif2(val0));\r
+                /* rows 4 and 5 */\r
+                val0 = *ptrA4++;\r
+                val1 = *ptrA5++;\r
+                *ptrP0++ = _ftof2(_lof2(val1),_lof2(val0));\r
+                *ptrP1++ = _ftof2(_hif2(val1),_hif2(val0));\r
+                /* rows 6 and 7 */\r
+                val0 = *ptrA6++;\r
+                val1 = *ptrA7++;\r
+                *ptrP0++ = _ftof2(_lof2(val1),_lof2(val0));\r
+                *ptrP1++ = _ftof2(_hif2(val1),_hif2(val0));\r
+                /* skip two columns: 2*ldp floats = ldp pairs, 4 already consumed */\r
+                ptrP0 += ((ldp)-4);\r
+                ptrP1 += ((ldp)-4);\r
+            }\r
+            if(n_left)\r
+            {\r
+                float *restrict ptrA = ((float *) a+2*n_iter);\r
+                float *restrict ptrP = ((float *) p+2*n_iter*ldp);\r
+                ptrP[0] = ptrA[0];\r
+                ptrP[1] = ptrA[inca];\r
+                ptrP[2] = ptrA[2*inca];\r
+                ptrP[3] = ptrA[3*inca];\r
+                ptrP[4] = ptrA[4*inca];\r
+                ptrP[5] = ptrA[5*inca];\r
+                ptrP[6] = ptrA[6*inca];\r
+                ptrP[7] = ptrA[7*inca];\r
+            }\r
+            return;\r
+        }\r
+    }\r
+    /* handle unoptimized case using default packing routine */\r
+    bli_spackm_ref_8xk(conja, n, kappa, a, inca, lda, p, ldp);\r
+}\r
+\r
diff --git a/blis/config/am57x/kernels/1m/bli_packm_cxk_ukernels.h b/blis/config/am57x/kernels/1m/bli_packm_cxk_ukernels.h
--- /dev/null
@@ -0,0 +1,51 @@
+/*\r
+\r
+ BLIS\r
+ An object-based framework for developing high-performance BLAS-like\r
+ libraries.\r
+\r
+ Copyright (C) 2014, The University of Texas\r
+\r
+ Redistribution and use in source and binary forms, with or without\r
+ modification, are permitted provided that the following conditions are\r
+ met:\r
+ - Redistributions of source code must retain the above copyright\r
+ notice, this list of conditions and the following disclaimer.\r
+ - Redistributions in binary form must reproduce the above copyright\r
+ notice, this list of conditions and the following disclaimer in the\r
+ documentation and/or other materials provided with the distribution.\r
+ - Neither the name of The University of Texas nor the names of its\r
+ contributors may be used to endorse or promote products derived\r
+ from this software without specific prior written permission.\r
+\r
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\r
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\r
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\r
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT\r
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\r
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\r
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\r
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\r
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\r
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\r
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
+\r
+*/\r
+\r
+/*\r
+ * Prototypes of the single-precision packing micro-kernels defined in\r
+ * bli_packm_cxk_ukernels.c.  Both take the same parameter list:\r
+ *   conja        conjugation selector\r
+ *   n            number of columns to pack\r
+ *   kappa        pointer to the scale factor (passed as void*)\r
+ *   a, inca, lda source buffer and its two strides\r
+ *   p, ldp       destination buffer and its column stride\r
+ */\r
+\r
+/* Packs 4-row micro-panels. */\r
+void bli_spackm_4xk_ukernel(\r
+ conj_t conja,\r
+ dim_t n,\r
+ void* kappa,\r
+ void* a, inc_t inca, inc_t lda, \r
+ void* p, inc_t ldp\r
+ );\r
+\r
+/* Packs 8-row micro-panels. */\r
+void bli_spackm_8xk_ukernel(\r
+ conj_t conja,\r
+ dim_t n,\r
+ void* kappa,\r
+ void* a, inc_t inca, inc_t lda, \r
+ void* p, inc_t ldp\r
+ );\r
+\r
+\r
diff --git a/blis/config/am57x/kernels/3/bli_gemm_ukernels.c b/blis/config/am57x/kernels/3/bli_gemm_ukernels.c
--- /dev/null
@@ -0,0 +1,1451 @@
+/*\r
+\r
+ BLIS\r
+ An object-based framework for developing high-performance BLAS-like\r
+ libraries.\r
+\r
+ Copyright (C) 2014, The University of Texas\r
+\r
+ Redistribution and use in source and binary forms, with or without\r
+ modification, are permitted provided that the following conditions are\r
+ met:\r
+ - Redistributions of source code must retain the above copyright\r
+ notice, this list of conditions and the following disclaimer.\r
+ - Redistributions in binary form must reproduce the above copyright\r
+ notice, this list of conditions and the following disclaimer in the\r
+ documentation and/or other materials provided with the distribution.\r
+ - Neither the name of The University of Texas nor the names of its\r
+ contributors may be used to endorse or promote products derived\r
+ from this software without specific prior written permission.\r
+\r
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\r
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\r
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\r
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT\r
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\r
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\r
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\r
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\r
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\r
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\r
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
+\r
+ */\r
+#include "blis.h"\r
+\r
+/*\r
+ * Single-precision 4x8 GEMM micro-kernel: C := beta*C + alpha*A*B.\r
+ *\r
+ * k            number of rank-1 updates to accumulate\r
+ * alpha, beta  pointers to the two scalars\r
+ * a            A micro-panel, consumed as 4 floats per k step\r
+ * b            B micro-panel, consumed as 8 floats per k step\r
+ * c            4x8 output tile; rs_c / cs_c are its row / column strides in floats\r
+ * data         not used by this kernel\r
+ *\r
+ * The storage format of C is chosen from rs_c alone:\r
+ *   rs_c == 1  column-major, columns cs_c floats apart\r
+ *   rs_c != 1  row-major with contiguous rows (cs_c is not consulted),\r
+ *              rows rs_c floats apart\r
+ * C is accessed as __float2_t pairs and the stride is halved with >> 1, so\r
+ * the relevant stride has to be even.\r
+ *\r
+ * Accumulator layout, as consumed by the update code below: for A rows\r
+ * (r, r+1) and B columns (j, j+1) one _cmpysp() feeds two accumulators,\r
+ *   first  = { hi: -a[r]*b[j],      lo: a[r]*b[j+1] }\r
+ *   second = { hi:  a[r+1]*b[j+1],  lo: a[r+1]*b[j] }\r
+ * so the a[r]*b[j] term has to be negated when it is written to C.\r
+ *\r
+ * The row-major branch stores the updated tile back to C.  Earlier it loaded\r
+ * and updated the 16 pairs but never wrote them, leaving C unmodified.\r
+ *\r
+ * NOTE(review): index is an int_least16_t, so k is presumably bounded well\r
+ * below its range by the blocking parameters - confirm.\r
+ * NOTE(review): C is always read and scaled by beta, even for beta == 0, so\r
+ * NaN/Inf already present in C propagate into the result.\r
+ * NOTE(review): C is presumably 8-byte aligned, since it is accessed through\r
+ * __float2_t pointers - confirm.\r
+ */\r
+void bli_sgemm_ukernel_4x8(\r
+        dim_t k,\r
+        float* restrict alpha,\r
+        float* restrict a,\r
+        float* restrict b,\r
+        float* restrict beta,\r
+        float* restrict c, inc_t rs_c, inc_t cs_c,\r
+        auxinfo_t* data\r
+)\r
+{\r
+    __float2_t sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7;\r
+    __float2_t sum8, sum9, suma, sumb, sumc, sumd, sume, sumf;\r
+    __float2_t * restrict ptrB = (__float2_t *) b;\r
+    __float2_t * restrict ptrA = (__float2_t *) a;\r
+    __float2_t * restrict ptrC;\r
+    __float2_t regA2;\r
+    int_least16_t index;\r
+    __float2_t regB2;\r
+\r
+    // zero out accumulators\r
+    sum0 = 0.0;\r
+    sum1 = 0.0;\r
+    sum2 = 0.0;\r
+    sum3 = 0.0;\r
+    sum4 = 0.0;\r
+    sum5 = 0.0;\r
+    sum6 = 0.0;\r
+    sum7 = 0.0;\r
+    sum8 = 0.0;\r
+    sum9 = 0.0;\r
+    suma = 0.0;\r
+    sumb = 0.0;\r
+    sumc = 0.0;\r
+    sumd = 0.0;\r
+    sume = 0.0;\r
+    sumf = 0.0;\r
+\r
+    for (index = 0; index < k; index++)\r
+    { // loop over k;\r
+        // each iteration performs rank one update of 4x1 by 1x8\r
+        // matrices of A and B respectively; result is\r
+        // accumulated over 4x8 matrix\r
+        __float2_t b01, b23, b45, b67, a01, a23;\r
+        __x128_t reg128;\r
+\r
+        a01 = *ptrA++;\r
+        a23 = *ptrA++;\r
+\r
+        b01 = *ptrB++;\r
+        b23 = *ptrB++;\r
+        b45 = *ptrB++;\r
+        b67 = *ptrB++;\r
+\r
+        reg128 = _cmpysp(a01, b01);\r
+        // accumulate a[0]*b[1] and -a[0]*b[0]\r
+        sum0 = _daddsp(sum0, _lof2_128(reg128));\r
+        // accumulate a[1]*b[0] and a[1]*b[1]\r
+        sum1 = _daddsp(sum1, _hif2_128(reg128));\r
+\r
+        reg128 = _cmpysp(a01, b23);\r
+        // accumulate a[0]*b[3] and -a[0]*b[2]\r
+        sum2 = _daddsp(sum2, _lof2_128(reg128));\r
+        // accumulate a[1]*b[2] and a[1]*b[3]\r
+        sum3 = _daddsp(sum3, _hif2_128(reg128));\r
+\r
+        reg128 = _cmpysp(a01, b45);\r
+        // accumulate a[0]*b[5] and -a[0]*b[4]\r
+        sum4 = _daddsp(sum4, _lof2_128(reg128));\r
+        // accumulate a[1]*b[4] and a[1]*b[5]\r
+        sum5 = _daddsp(sum5, _hif2_128(reg128));\r
+\r
+        reg128 = _cmpysp(a01, b67);\r
+        // accumulate a[0]*b[7] and -a[0]*b[6]\r
+        sum6 = _daddsp(sum6, _lof2_128(reg128));\r
+        // accumulate a[1]*b[6] and a[1]*b[7]\r
+        sum7 = _daddsp(sum7, _hif2_128(reg128));\r
+\r
+        reg128 = _cmpysp(a23, b01);\r
+        // accumulate a[2]*b[1] and -a[2]*b[0]\r
+        sum8 = _daddsp(sum8, _lof2_128(reg128));\r
+        // accumulate a[3]*b[0] and a[3]*b[1]\r
+        sum9 = _daddsp(sum9, _hif2_128(reg128));\r
+\r
+        reg128 = _cmpysp(a23, b23);\r
+        // accumulate a[2]*b[3] and -a[2]*b[2]\r
+        suma = _daddsp(suma, _lof2_128(reg128));\r
+        // accumulate a[3]*b[2] and a[3]*b[3]\r
+        sumb = _daddsp(sumb, _hif2_128(reg128));\r
+\r
+        reg128 = _cmpysp(a23, b45);\r
+        // accumulate a[2]*b[5] and -a[2]*b[4]\r
+        sumc = _daddsp(sumc, _lof2_128(reg128));\r
+        // accumulate a[3]*b[4] and a[3]*b[5]\r
+        sumd = _daddsp(sumd, _hif2_128(reg128));\r
+\r
+        reg128 = _cmpysp(a23, b67);\r
+        // accumulate a[2]*b[7] and -a[2]*b[6]\r
+        sume = _daddsp(sume, _lof2_128(reg128));\r
+        // accumulate a[3]*b[6] and a[3]*b[7]\r
+        sumf = _daddsp(sumf, _hif2_128(reg128));\r
+    }\r
+\r
+    // broadcast the scalars into both halves of a pair\r
+    regA2 = _ftof2(*alpha, *alpha);\r
+    regB2 = _ftof2(*beta, *beta);\r
+\r
+    // scale the accumulated products by alpha (same for both storage formats)\r
+    sum0 = _dmpysp(regA2, sum0);\r
+    sum1 = _dmpysp(regA2, sum1);\r
+    sum2 = _dmpysp(regA2, sum2);\r
+    sum3 = _dmpysp(regA2, sum3);\r
+    sum4 = _dmpysp(regA2, sum4);\r
+    sum5 = _dmpysp(regA2, sum5);\r
+    sum6 = _dmpysp(regA2, sum6);\r
+    sum7 = _dmpysp(regA2, sum7);\r
+    sum8 = _dmpysp(regA2, sum8);\r
+    sum9 = _dmpysp(regA2, sum9);\r
+    suma = _dmpysp(regA2, suma);\r
+    sumb = _dmpysp(regA2, sumb);\r
+    sumc = _dmpysp(regA2, sumc);\r
+    sumd = _dmpysp(regA2, sumd);\r
+    sume = _dmpysp(regA2, sume);\r
+    sumf = _dmpysp(regA2, sumf);\r
+\r
+    if (rs_c != 1)\r
+    {\r
+        // row-major C: each pair holds two adjacent columns of one row\r
+        __float2_t c0, c1, c2, c3, c4, c5, c6, c7;\r
+        __float2_t c8, c9, ca, cb, cc, cd, ce, cf;\r
+\r
+        // load columns 0-3 of every row\r
+        ptrC = (__float2_t *) c;\r
+        c0 = *ptrC++; //c[0,0] and c[0,1]\r
+        c1 = *ptrC--; //c[0,2] and c[0,3]\r
+        ptrC += (rs_c>>1); // divide by 2 because ptrC is __float2_t, and rs_c is the stride for floats\r
+        c4 = *ptrC++; //c[1,0] and c[1,1]\r
+        c5 = *ptrC--; //c[1,2] and c[1,3]\r
+        ptrC += (rs_c>>1);\r
+        c8 = *ptrC++; //c[2,0] and c[2,1]\r
+        c9 = *ptrC--; //c[2,2] and c[2,3]\r
+        ptrC += (rs_c>>1);\r
+        cc = *ptrC++; //c[3,0] and c[3,1]\r
+        cd = *ptrC--; //c[3,2] and c[3,3]\r
+\r
+        // load columns 4-7 of every row\r
+        ptrC = (__float2_t *) c + 2;\r
+        c2 = *ptrC++; //c[0,4] and c[0,5]\r
+        c3 = *ptrC--; //c[0,6] and c[0,7]\r
+        ptrC += (rs_c>>1);\r
+        c6 = *ptrC++; //c[1,4] and c[1,5]\r
+        c7 = *ptrC--; //c[1,6] and c[1,7]\r
+        ptrC += (rs_c>>1);\r
+        ca = *ptrC++; //c[2,4] and c[2,5]\r
+        cb = *ptrC--; //c[2,6] and c[2,7]\r
+        ptrC += (rs_c>>1);\r
+        ce = *ptrC++; //c[3,4] and c[3,5]\r
+        cf = *ptrC; //c[3,6] and c[3,7]\r
+\r
+        // scale C by beta\r
+        c0 = _dmpysp(c0, regB2);\r
+        c1 = _dmpysp(c1, regB2);\r
+        c2 = _dmpysp(c2, regB2);\r
+        c3 = _dmpysp(c3, regB2);\r
+        c4 = _dmpysp(c4, regB2);\r
+        c5 = _dmpysp(c5, regB2);\r
+        c6 = _dmpysp(c6, regB2);\r
+        c7 = _dmpysp(c7, regB2);\r
+        c8 = _dmpysp(c8, regB2);\r
+        c9 = _dmpysp(c9, regB2);\r
+        ca = _dmpysp(ca, regB2);\r
+        cb = _dmpysp(cb, regB2);\r
+        cc = _dmpysp(cc, regB2);\r
+        cd = _dmpysp(cd, regB2);\r
+        ce = _dmpysp(ce, regB2);\r
+        cf = _dmpysp(cf, regB2);\r
+\r
+        // update c[0,0] and c[0,1]\r
+        c0 = _daddsp(_ftof2(_lof2(sum0),-_hif2(sum0)),c0);\r
+        // update c[0,2] and c[0,3]\r
+        c1 = _daddsp(_ftof2(_lof2(sum2),-_hif2(sum2)),c1);\r
+        // update c[0,4] and c[0,5]\r
+        c2 = _daddsp(_ftof2(_lof2(sum4),-_hif2(sum4)),c2);\r
+        // update c[0,6] and c[0,7]\r
+        c3 = _daddsp(_ftof2(_lof2(sum6),-_hif2(sum6)),c3);\r
+\r
+        // update c[1,0] and c[1,1]\r
+        c4 = _daddsp(_ftof2(_hif2(sum1),_lof2(sum1)),c4);\r
+        // update c[1,2] and c[1,3]\r
+        c5 = _daddsp(_ftof2(_hif2(sum3),_lof2(sum3)),c5);\r
+        // update c[1,4] and c[1,5]\r
+        c6 = _daddsp(_ftof2(_hif2(sum5),_lof2(sum5)),c6);\r
+        // update c[1,6] and c[1,7]\r
+        c7 = _daddsp(_ftof2(_hif2(sum7),_lof2(sum7)),c7);\r
+\r
+        // update c[2,0] and c[2,1]\r
+        c8 = _daddsp(_ftof2(_lof2(sum8),-_hif2(sum8)),c8);\r
+        // update c[2,2] and c[2,3]\r
+        c9 = _daddsp(_ftof2(_lof2(suma),-_hif2(suma)),c9);\r
+        // update c[2,4] and c[2,5]\r
+        ca = _daddsp(_ftof2(_lof2(sumc),-_hif2(sumc)),ca);\r
+        // update c[2,6] and c[2,7]\r
+        cb = _daddsp(_ftof2(_lof2(sume),-_hif2(sume)),cb);\r
+\r
+        // update c[3,0] and c[3,1]\r
+        cc = _daddsp(_ftof2(_hif2(sum9),_lof2(sum9)),cc);\r
+        // update c[3,2] and c[3,3]\r
+        cd = _daddsp(_ftof2(_hif2(sumb),_lof2(sumb)),cd);\r
+        // update c[3,4] and c[3,5]\r
+        ce = _daddsp(_ftof2(_hif2(sumd),_lof2(sumd)),ce);\r
+        // update c[3,6] and c[3,7]\r
+        cf = _daddsp(_ftof2(_hif2(sumf),_lof2(sumf)),cf);\r
+\r
+        // write the tile back, walking C exactly as the loads above did:\r
+        // columns 0-3 of every row first ...\r
+        ptrC = (__float2_t *) c;\r
+        *ptrC++ = c0;\r
+        *ptrC-- = c1;\r
+        ptrC += (rs_c>>1);\r
+        *ptrC++ = c4;\r
+        *ptrC-- = c5;\r
+        ptrC += (rs_c>>1);\r
+        *ptrC++ = c8;\r
+        *ptrC-- = c9;\r
+        ptrC += (rs_c>>1);\r
+        *ptrC++ = cc;\r
+        *ptrC-- = cd;\r
+\r
+        // ... then columns 4-7 of every row\r
+        ptrC = (__float2_t *) c + 2;\r
+        *ptrC++ = c2;\r
+        *ptrC-- = c3;\r
+        ptrC += (rs_c>>1);\r
+        *ptrC++ = c6;\r
+        *ptrC-- = c7;\r
+        ptrC += (rs_c>>1);\r
+        *ptrC++ = ca;\r
+        *ptrC-- = cb;\r
+        ptrC += (rs_c>>1);\r
+        *ptrC++ = ce;\r
+        *ptrC = cf;\r
+    }\r
+    else\r
+    {\r
+        // column-major C: each pair holds two adjacent rows of one column\r
+        __float2_t c0, c1, c2, c3, c4, c5, c6, c7;\r
+        __float2_t c8, c9, ca, cb, cc, cd, ce, cf;\r
+\r
+        ptrC = (__float2_t *) c;\r
+        c0 = *ptrC++; //c[0,0] and c[1,0]\r
+        c1 = *ptrC--; //c[2,0] and c[3,0]\r
+        ptrC += (cs_c>>1); // divide by 2 because ptrC is __float2_t, and cs_c is the stride for floats\r
+        c2 = *ptrC++; //c[0,1] and c[1,1]\r
+        c3 = *ptrC--; //c[2,1] and c[3,1]\r
+        ptrC += (cs_c>>1);\r
+        c4 = *ptrC++; //c[0,2] and c[1,2]\r
+        c5 = *ptrC--; //c[2,2] and c[3,2]\r
+        ptrC += (cs_c>>1);\r
+        c6 = *ptrC++; //c[0,3] and c[1,3]\r
+        c7 = *ptrC--; //c[2,3] and c[3,3]\r
+        ptrC += (cs_c>>1);\r
+        c8 = *ptrC++; //c[0,4] and c[1,4]\r
+        c9 = *ptrC--; //c[2,4] and c[3,4]\r
+        ptrC += (cs_c>>1);\r
+        ca = *ptrC++; //c[0,5] and c[1,5]\r
+        cb = *ptrC--; //c[2,5] and c[3,5]\r
+        ptrC += (cs_c>>1);\r
+        cc = *ptrC++; //c[0,6] and c[1,6]\r
+        cd = *ptrC--; //c[2,6] and c[3,6]\r
+        ptrC += (cs_c>>1);\r
+        ce = *ptrC++; //c[0,7] and c[1,7]\r
+        cf = *ptrC; //c[2,7] and c[3,7]\r
+\r
+        // scale C by beta\r
+        c0 = _dmpysp(c0, regB2);\r
+        c1 = _dmpysp(c1, regB2);\r
+        c2 = _dmpysp(c2, regB2);\r
+        c3 = _dmpysp(c3, regB2);\r
+        c4 = _dmpysp(c4, regB2);\r
+        c5 = _dmpysp(c5, regB2);\r
+        c6 = _dmpysp(c6, regB2);\r
+        c7 = _dmpysp(c7, regB2);\r
+        c8 = _dmpysp(c8, regB2);\r
+        c9 = _dmpysp(c9, regB2);\r
+        ca = _dmpysp(ca, regB2);\r
+        cb = _dmpysp(cb, regB2);\r
+        cc = _dmpysp(cc, regB2);\r
+        cd = _dmpysp(cd, regB2);\r
+        ce = _dmpysp(ce, regB2);\r
+        cf = _dmpysp(cf, regB2);\r
+\r
+        // update c[0,0] and c[1,0]\r
+        c0 = _daddsp(_ftof2(_lof2(sum1),-_hif2(sum0)),c0);\r
+        // update c[2,0] and c[3,0]\r
+        c1 = _daddsp(_ftof2(_lof2(sum9),-_hif2(sum8)),c1);\r
+        // update c[0,1] and c[1,1]\r
+        c2 = _daddsp(_ftof2(_hif2(sum1),_lof2(sum0)),c2);\r
+        // update c[2,1] and c[3,1]\r
+        c3 = _daddsp(_ftof2(_hif2(sum9),_lof2(sum8)),c3);\r
+        // update c[0,2] and c[1,2]\r
+        c4 = _daddsp(_ftof2(_lof2(sum3),-_hif2(sum2)),c4);\r
+        // update c[2,2] and c[3,2]\r
+        c5 = _daddsp(_ftof2(_lof2(sumb),-_hif2(suma)),c5);\r
+        // update c[0,3] and c[1,3]\r
+        c6 = _daddsp(_ftof2(_hif2(sum3),_lof2(sum2)),c6);\r
+        // update c[2,3] and c[3,3]\r
+        c7 = _daddsp(_ftof2(_hif2(sumb),_lof2(suma)),c7);\r
+        // update c[0,4] and c[1,4]\r
+        c8 = _daddsp(_ftof2(_lof2(sum5),-_hif2(sum4)),c8);\r
+        // update c[2,4] and c[3,4]\r
+        c9 = _daddsp(_ftof2(_lof2(sumd),-_hif2(sumc)),c9);\r
+        // update c[0,5] and c[1,5]\r
+        ca = _daddsp(_ftof2(_hif2(sum5),_lof2(sum4)),ca);\r
+        // update c[2,5] and c[3,5]\r
+        cb = _daddsp(_ftof2(_hif2(sumd),_lof2(sumc)),cb);\r
+        // update c[0,6] and c[1,6]\r
+        cc = _daddsp(_ftof2(_lof2(sum7),-_hif2(sum6)),cc);\r
+        // update c[2,6] and c[3,6]\r
+        cd = _daddsp(_ftof2(_lof2(sumf),-_hif2(sume)),cd);\r
+        // update c[0,7] and c[1,7]\r
+        ce = _daddsp(_ftof2(_hif2(sum7),_lof2(sum6)),ce);\r
+        // update c[2,7] and c[3,7]\r
+        cf = _daddsp(_ftof2(_hif2(sumf),_lof2(sume)),cf);\r
+\r
+        // write the tile back, one column (two pairs) at a time\r
+        ptrC = (__float2_t *) c;\r
+        *ptrC++ = c0;\r
+        *ptrC-- = c1;\r
+        ptrC += (cs_c>>1);\r
+        *ptrC++ = c2;\r
+        *ptrC-- = c3;\r
+        ptrC += (cs_c>>1);\r
+        *ptrC++ = c4;\r
+        *ptrC-- = c5;\r
+        ptrC += (cs_c>>1);\r
+        *ptrC++ = c6;\r
+        *ptrC-- = c7;\r
+        ptrC += (cs_c>>1);\r
+        *ptrC++ = c8;\r
+        *ptrC-- = c9;\r
+        ptrC += (cs_c>>1);\r
+        *ptrC++ = ca;\r
+        *ptrC-- = cb;\r
+        ptrC += (cs_c>>1);\r
+        *ptrC++ = cc;\r
+        *ptrC-- = cd;\r
+        ptrC += (cs_c>>1);\r
+        *ptrC++ = ce;\r
+        *ptrC = cf;\r
+    }\r
+}\r
+\r
+void bli_sgemm_ukernel_4x4(\r
+ dim_t k,\r
+ float* restrict alpha,\r
+ float* restrict a,\r
+ float* restrict b,\r
+ float* restrict beta,\r
+ float* restrict c, inc_t rs_c, inc_t cs_c,\r
+ auxinfo_t* data\r
+)\r
+{\r
+ __float2_t sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7;\r
+ __float2_t sum8, sum9, suma, sumb, sumc, sumd, sume, sumf;\r
+ __float2_t * restrict ptrB = (__float2_t *) b;\r
+ __float2_t * restrict ptrA = (__float2_t *) a;\r
+ __float2_t * restrict ptrC;\r
+ __float2_t regA2, regB2;\r
+ int_least16_t index;\r
+ int kEven, kLeft;\r
+\r
+ // zero out accumulators\r
+ sum0 = 0.0;\r
+ sum1 = 0.0;\r
+ sum2 = 0.0;\r
+ sum3 = 0.0;\r
+ sum4 = 0.0;\r
+ sum5 = 0.0;\r
+ sum6 = 0.0;\r
+ sum7 = 0.0;\r
+ sum8 = 0.0;\r
+ sum9 = 0.0;\r
+ suma = 0.0;\r
+ sumb = 0.0;\r
+ sumc = 0.0;\r
+ sumd = 0.0;\r
+ sume = 0.0;\r
+ sumf = 0.0;\r
+\r
+ kEven=k>>1;\r
+ kLeft=k&1;\r
+\r
+ for (index = 0; index < kEven; index++)\r
+ { // loop over k;\r
+ // each iteration performs rank one update of 4x1 by 1x4\r
+ // matrices of A and B respectively; result is\r
+ // accumulated over 4x4 matrix\r
+ __float2_t b01, b23, a01, a23;\r
+ __x128_t reg128;\r
+\r
+ // for even k\r
+ a01 = *ptrA++;\r
+ a23 = *ptrA++;\r
+\r
+ b01 = *ptrB++;\r
+ b23 = *ptrB++;\r
+\r
+ reg128 = _cmpysp(a01, b01);\r
+ // accumulate a[0]*b[1] and -a[0]*b[0]\r
+ sum0 = _daddsp(sum0, _lof2_128(reg128));\r
+ // accumulate a[1]*b[0] and a[1]*b[1]\r
+ sum1 = _daddsp(sum1, _hif2_128(reg128));\r
+\r
+ reg128 = _cmpysp(a01, b23);\r
+ // accumulate a[0]*b[3] and -a[0]*b[2]\r
+ sum2 = _daddsp(sum2, _lof2_128(reg128));\r
+ // accumulate a[1]*b[2] and a[1]*b[3]\r
+ sum3 = _daddsp(sum3, _hif2_128(reg128));\r
+\r
+ reg128 = _cmpysp(a23, b01);\r
+ // accumulate a[2]*b[1] and -a[2]*b[0]\r
+ sum4 = _daddsp(sum4, _lof2_128(reg128));\r
+ // accumulate a[3]*b[0] and a[3]*b[1]\r
+ sum5 = _daddsp(sum5, _hif2_128(reg128));\r
+\r
+ reg128 = _cmpysp(a23, b23);\r
+ // accumulate a[2]*b[3] and -a[2]*b[2]\r
+ sum6 = _daddsp(sum6, _lof2_128(reg128));\r
+ // accumulate a[3]*b[2] and a[3]*b[3]\r
+ sum7 = _daddsp(sum7, _hif2_128(reg128));\r
+\r
+\r
+\r
+ // for odd k\r
+ a01 = *ptrA++;\r
+ a23 = *ptrA++;\r
+\r
+ b01 = *ptrB++;\r
+ b23 = *ptrB++;\r
+\r
+ reg128 = _cmpysp(a01, b01);\r
+ // accumulate a[0]*b[1] and -a[0]*b[0]\r
+ sum8 = _daddsp(sum8, _lof2_128(reg128));\r
+ // accumulate a[1]*b[0] and a[1]*b[1]\r
+ sum9 = _daddsp(sum9, _hif2_128(reg128));\r
+\r
+ reg128 = _cmpysp(a01, b23);\r
+ // accumulate a[0]*b[3] and -a[0]*b[2]\r
+ suma = _daddsp(suma, _lof2_128(reg128));\r
+ // accumulate a[1]*b[2] and a[1]*b[3]\r
+ sumb = _daddsp(sumb, _hif2_128(reg128));\r
+\r
+ reg128 = _cmpysp(a23, b01);\r
+ // accumulate a[2]*b[1] and -a[2]*b[0]\r
+ sumc = _daddsp(sumc, _lof2_128(reg128));\r
+ // accumulate a[3]*b[0] and a[3]*b[1]\r
+ sumd = _daddsp(sumd, _hif2_128(reg128));\r
+\r
+ reg128 = _cmpysp(a23, b23);\r
+ // accumulate a[2]*b[3] and -a[2]*b[2]\r
+ sume = _daddsp(sume, _lof2_128(reg128));\r
+ // accumulate a[3]*b[2] and a[3]*b[3]\r
+ sumf = _daddsp(sumf, _hif2_128(reg128));\r
+\r
+ }\r
+ if(kLeft)\r
+ { // last k if left;\r
+ __float2_t b01, b23, a01, a23;\r
+ __x128_t reg128;\r
+\r
+ a01 = *ptrA++;\r
+ a23 = *ptrA++;\r
+\r
+ b01 = *ptrB++;\r
+ b23 = *ptrB++;\r
+\r
+ reg128 = _cmpysp(a01, b01);\r
+ // accumulate a[0]*b[1] and -a[0]*b[0]\r
+ sum0 = _daddsp(sum0, _lof2_128(reg128));\r
+ // accumulate a[1]*b[0] and a[1]*b[1]\r
+ sum1 = _daddsp(sum1, _hif2_128(reg128));\r
+\r
+ reg128 = _cmpysp(a01, b23);\r
+ // accumulate a[0]*b[3] and -a[0]*b[2]\r
+ sum2 = _daddsp(sum2, _lof2_128(reg128));\r
+ // accumulate a[1]*b[2] and a[1]*b[3]\r
+ sum3 = _daddsp(sum3, _hif2_128(reg128));\r
+\r
+ reg128 = _cmpysp(a23, b01);\r
+ // accumulate a[2]*b[1] and -a[2]*b[0]\r
+ sum4 = _daddsp(sum4, _lof2_128(reg128));\r
+ // accumulate a[3]*b[0] and a[3]*b[1]\r
+ sum5 = _daddsp(sum5, _hif2_128(reg128));\r
+\r
+ reg128 = _cmpysp(a23, b23);\r
+ // accumulate a[2]*b[3] and -a[2]*b[2]\r
+ sum6 = _daddsp(sum6, _lof2_128(reg128));\r
+ // accumulate a[3]*b[2] and a[3]*b[3]\r
+ sum7 = _daddsp(sum7, _hif2_128(reg128));\r
+\r
+ }\r
+\r
+ sum0 = _daddsp(sum0, sum8);\r
+ sum1 = _daddsp(sum1, sum9);\r
+ sum2 = _daddsp(sum2, suma);\r
+ sum3 = _daddsp(sum3, sumb);\r
+ sum4 = _daddsp(sum4, sumc);\r
+ sum5 = _daddsp(sum5, sumd);\r
+ sum6 = _daddsp(sum6, sume);\r
+ sum7 = _daddsp(sum7, sumf);\r
+\r
+\r
+ regA2 = _ftof2(*alpha, *alpha);\r
+ regB2 = _ftof2(*beta, *beta);\r
+ if (rs_c != 1)\r
+ {\r
+ __float2_t c0, c1, c2, c3, c4, c5, c6, c7;\r
+\r
+ ptrC = (__float2_t *) c;\r
+ c0 = *ptrC++; //c[0,0] and c[0,1]\r
+ c1 = *ptrC--; //c[0,2] and c[0,3]\r
+ ptrC += (rs_c>>1);\r
+ c2 = *ptrC++; //c[1,0] and c[1,1]\r
+ c3 = *ptrC--; //c[1,2] and c[1,3]\r
+ ptrC += (rs_c>>1);\r
+ c4 = *ptrC++; //c[2,0] and c[2,1]\r
+ c5 = *ptrC--; //c[2,2] and c[2,3]\r
+ ptrC += (rs_c>>1);\r
+ c6 = *ptrC++; //c[3,0] and c[3,1]\r
+ c7 = *ptrC--; //c[3,2] and c[3,3]\r
+\r
+ sum0 = _dmpysp(regA2, sum0);\r
+ sum1 = _dmpysp(regA2, sum1);\r
+ sum2 = _dmpysp(regA2, sum2);\r
+ sum3 = _dmpysp(regA2, sum3);\r
+ sum4 = _dmpysp(regA2, sum4);\r
+ sum5 = _dmpysp(regA2, sum5);\r
+ sum6 = _dmpysp(regA2, sum6);\r
+ sum7 = _dmpysp(regA2, sum7);\r
+\r
+ c0 = _dmpysp(c0, regB2);\r
+ c1 = _dmpysp(c1, regB2);\r
+ c2 = _dmpysp(c2, regB2);\r
+ c3 = _dmpysp(c3, regB2);\r
+ c4 = _dmpysp(c4, regB2);\r
+ c5 = _dmpysp(c5, regB2);\r
+ c6 = _dmpysp(c6, regB2);\r
+ c7 = _dmpysp(c7, regB2);\r
+\r
+ // update c[0,0] and c[0,1]\r
+ c0 = _daddsp(_ftof2(_lof2(sum0),-_hif2(sum0)),c0);\r
+ // update c[0,2] and c[0,3]\r
+ c1 = _daddsp(_ftof2(_lof2(sum2),-_hif2(sum2)),c1);\r
+ // update c[1,0] and c[1,1]\r
+ c2 = _daddsp(_ftof2(_hif2(sum1), _lof2(sum1)),c2);\r
+ // update c[1,2] and c[1,2]\r
+ c3 = _daddsp(_ftof2(_hif2(sum3),_lof2(sum3)),c3);\r
+ // update c[2,0] and c[2,1]\r
+ c4 = _daddsp(_ftof2(_lof2(sum4),-_hif2(sum4)),c4);\r
+ // update c[2,2] and c[2,3]\r
+ c5 = _daddsp(_ftof2(_lof2(sum6),-_hif2(sum6)),c5);\r
+ // update c[3,0] and c[3,1]\r
+ c6 = _daddsp(_ftof2(_hif2(sum5), _lof2(sum5)),c6);\r
+ // update c[3,2] and c[3,2]\r
+ c7 = _daddsp(_ftof2(_hif2(sum7),_lof2(sum7)),c7);\r
+\r
+ ptrC = (__float2_t *) c;\r
+ *ptrC++ = c0; //c[0,0] and c[0,1]\r
+ *ptrC-- = c1; //c[0,2] and c[0,3]\r
+ ptrC += (rs_c>>1);\r
+ *ptrC++ = c2; //c[1,0] and c[1,1]\r
+ *ptrC-- = c3; //c[1,2] and c[1,3]\r
+ ptrC += (rs_c>>1);\r
+ *ptrC++ = c4; //c[2,0] and c[2,1]\r
+ *ptrC-- = c5; //c[2,2] and c[2,3]\r
+ ptrC += (rs_c>>1);\r
+ *ptrC++ = c6; //c[3,0] and c[3,1]\r
+ *ptrC-- = c7; //c[3,2] and c[3,3]\r
+\r
+// // update c[0,0] and c[1,0]\r
+// c0 = (c + 0*rs_c + 0*cs_c);\r
+// c1 = (c + 1*rs_c + 0*cs_c);\r
+// regC = _ftof2(*c1,*c0);\r
+// regC = _dmpysp(regC, regB2);\r
+// regS = _ftof2(_lof2(sum1),-_hif2(sum0));\r
+// regR = _daddsp(_dmpysp(regA2, regS), regC);\r
+// *c0 = _lof2(regR);\r
+// *c1 = _hif2(regR);\r
+//\r
+// //update c[0,1] and c[1,1]\r
+// c0 = c0 + 1*cs_c;\r
+// c1 = c1 + 1*cs_c;\r
+// regC = _ftof2(*c1,*c0);\r
+// regC = _dmpysp(regC, regB2);\r
+// regS = _ftof2(_hif2(sum1),_lof2(sum0));\r
+// regR = _daddsp(_dmpysp(regA2, regS), regC);\r
+// *c0 = _lof2(regR);\r
+// *c1 = _hif2(regR);\r
+//\r
+// //update c[0,2] and c[1,2]\r
+// c0 = c0 + 1*cs_c;\r
+// c1 = c1 + 1*cs_c;\r
+// regC = _ftof2(*c1,*c0);\r
+// regC = _dmpysp(regC, regB2);\r
+// regS = _ftof2(_lof2(sum3),-_hif2(sum2));\r
+// regR = _daddsp(_dmpysp(regA2, regS), regC);\r
+// *c0 = _lof2(regR);\r
+// *c1 = _hif2(regR);\r
+//\r
+// //update c[0,3] and c[1,3]\r
+// c0 = c0 + 1*cs_c;\r
+// c1 = c1 + 1*cs_c;\r
+// regC = _ftof2(*c1,*c0);\r
+// regC = _dmpysp(regC, regB2);\r
+// regS = _ftof2(_hif2(sum3),_lof2(sum2));\r
+// regR = _daddsp(_dmpysp(regA2, regS), regC);\r
+// *c0 = _lof2(regR);\r
+// *c1 = _hif2(regR);\r
+//\r
+// // update c[2,0] and c[3,0]\r
+// c0 = (c + 2*rs_c + 0*cs_c);\r
+// c1 = (c + 3*rs_c + 0*cs_c);\r
+// regC = _ftof2(*c1,*c0);\r
+// regC = _dmpysp(regC, regB2);\r
+// regS = _ftof2(_lof2(sum5),-_hif2(sum4));\r
+// regR = _daddsp(_dmpysp(regA2, regS), regC);\r
+// *c0 = _lof2(regR);\r
+// *c1 = _hif2(regR);\r
+//\r
+// // update c[2,1] and c[3,1]\r
+// c0 = c0 + 1*cs_c;\r
+// c1 = c1 + 1*cs_c;\r
+// regC = _ftof2(*c1,*c0);\r
+// regC = _dmpysp(regC, regB2);\r
+// regS = _ftof2(_hif2(sum5),_lof2(sum4));\r
+// regR = _daddsp(_dmpysp(regA2, regS), regC);\r
+// *c0 = _lof2(regR);\r
+// *c1 = _hif2(regR);\r
+//\r
+// //update c[2,2] and c[3,2]\r
+// c0 = c0 + 1*cs_c;\r
+// c1 = c1 + 1*cs_c;\r
+// regC = _ftof2(*c1,*c0);\r
+// regC = _dmpysp(regC, regB2);\r
+// regS = _ftof2(_lof2(sum7),-_hif2(sum6));\r
+// regR = _daddsp(_dmpysp(regA2, regS), regC);\r
+// *c0 = _lof2(regR);\r
+// *c1 = _hif2(regR);\r
+//\r
+// //update c[2,3] and c[3,3]\r
+// c0 = c0 + 1*cs_c;\r
+// c1 = c1 + 1*cs_c;\r
+// regC = _ftof2(*c1,*c0);\r
+// regC = _dmpysp(regC, regB2);\r
+// regS = _ftof2(_hif2(sum7),_lof2(sum6));\r
+// regR = _daddsp(_dmpysp(regA2, regS), regC);\r
+// *c0 = _lof2(regR);\r
+// *c1 = _hif2(regR);\r
+\r
+ }\r
+ else\r
+ {\r
+ __float2_t c0, c1, c2, c3, c4, c5, c6, c7;\r
+\r
+ ptrC = (__float2_t *) c;\r
+ c0 = *ptrC++;\r
+ c1 = *ptrC--;\r
+ ptrC += (cs_c>>1);\r
+ c2 = *ptrC++;\r
+ c3 = *ptrC--;\r
+ ptrC += (cs_c>>1);\r
+ c4 = *ptrC++;\r
+ c5 = *ptrC--;\r
+ ptrC += (cs_c>>1);\r
+ c6 = *ptrC++;\r
+ c7 = *ptrC--;\r
+\r
+ sum0 = _dmpysp(regA2, sum0);\r
+ sum1 = _dmpysp(regA2, sum1);\r
+ sum2 = _dmpysp(regA2, sum2);\r
+ sum3 = _dmpysp(regA2, sum3);\r
+ sum4 = _dmpysp(regA2, sum4);\r
+ sum5 = _dmpysp(regA2, sum5);\r
+ sum6 = _dmpysp(regA2, sum6);\r
+ sum7 = _dmpysp(regA2, sum7);\r
+\r
+ c0 = _dmpysp(c0, regB2);\r
+ c1 = _dmpysp(c1, regB2);\r
+ c2 = _dmpysp(c2, regB2);\r
+ c3 = _dmpysp(c3, regB2);\r
+ c4 = _dmpysp(c4, regB2);\r
+ c5 = _dmpysp(c5, regB2);\r
+ c6 = _dmpysp(c6, regB2);\r
+ c7 = _dmpysp(c7, regB2);\r
+\r
+ // update c[0,0] and c[1,0]\r
+ c0 = _daddsp(_ftof2(_lof2(sum1),-_hif2(sum0)),c0);\r
+ // update c[2,0] and c[3,0]\r
+ c1 = _daddsp(_ftof2(_lof2(sum5),-_hif2(sum4)),c1);\r
+ //update c[0,1] and c[1,1]\r
+ c2 = _daddsp(_ftof2(_hif2(sum1),_lof2(sum0)),c2);\r
+ // update c[2,1] and c[3,1]\r
+ c3 = _daddsp(_ftof2(_hif2(sum5),_lof2(sum4)),c3);\r
+ //update c[0,2] and c[1,2]\r
+ c4 = _daddsp(_ftof2(_lof2(sum3),-_hif2(sum2)),c4);\r
+ //update c[2,2] and c[3,2]\r
+ c5 = _daddsp(_ftof2(_lof2(sum7),-_hif2(sum6)),c5);\r
+ //update c[0,3] and c[1,3]\r
+ c6 = _daddsp(_ftof2(_hif2(sum3),_lof2(sum2)),c6);\r
+ //update c[2,3] and c[3,3]\r
+ c7 = _daddsp(_ftof2(_hif2(sum7),_lof2(sum6)),c7);\r
+\r
+ ptrC = (__float2_t *) c;\r
+ *ptrC++ = c0;\r
+ *ptrC-- = c1;\r
+ ptrC += (cs_c>>1);\r
+ *ptrC++ = c2;\r
+ *ptrC-- = c3;\r
+ ptrC += (cs_c>>1);\r
+ *ptrC++ = c4;\r
+ *ptrC-- = c5;\r
+ ptrC += (cs_c>>1);\r
+ *ptrC++ = c6;\r
+ *ptrC-- = c7;\r
+\r
+ }\r
+}\r
+\r
+\r
+//void dgemmKernel(const double *pA, const double *pB, double *pC, const double a, const int k, const int stepC)\r
+void bli_dgemm_ukernel_4x4(\r
+ dim_t k,\r
+ double* restrict alpha,\r
+ double* restrict a,\r
+ double* restrict b,\r
+ double* restrict beta,\r
+ double* restrict c, inc_t rs_c, inc_t cs_c,\r
+ auxinfo_t* data\r
+)\r
+{\r
+ double sum00, sum01, sum02, sum03;\r
+ double sum10, sum11, sum12, sum13;\r
+ double sum20, sum21, sum22, sum23;\r
+ double sum30, sum31, sum32, sum33;\r
+ int index;\r
+ double al = *alpha;\r
+ double be = *beta;\r
+\r
+ //touch routine: both a & b\r
+ //Length of b = NR*K*size of double;\r
+ //Length of a = MR*K*size of double;\r
+#ifdef BLIS_ENABLE_PREFETCH\r
+ //touch(b, k*BLIS_DEFAULT_NR_D*8);\r
+ //touch(a, k*BLIS_DEFAULT_MR_D*8);\r
+#endif\r
+\r
+ sum00 = 0.0;\r
+ sum01 = 0.0;\r
+ sum02 = 0.0;\r
+ sum03 = 0.0;\r
+ sum10 = 0.0;\r
+ sum11 = 0.0;\r
+ sum12 = 0.0;\r
+ sum13 = 0.0;\r
+ sum20 = 0.0;\r
+ sum21 = 0.0;\r
+ sum22 = 0.0;\r
+ sum23 = 0.0;\r
+ sum30 = 0.0;\r
+ sum31 = 0.0;\r
+ sum32 = 0.0;\r
+ sum33 = 0.0;\r
+\r
+ for(index = 0; index < k; index++)\r
+ { // loop over k;\r
+ // each iteration performs rank one update of 4x1 by 1x4\r
+ // matrices of A and B respectively; result is\r
+ // accumulated over 4x4 matrix\r
+ register double a0, a1, a2, a3;\r
+ register double b0, b1, b2, b3;\r
+\r
+ a0 = *a++;\r
+ a1 = *a++;\r
+ a2 = *a++;\r
+ a3 = *a++;\r
+ b0 = *b++;\r
+ b1 = *b++;\r
+ b2 = *b++;\r
+ b3 = *b++;\r
+\r
+ // a[0]*b[0]\r
+ sum00 += a0*b0;\r
+ // a[0]*b[1]\r
+ sum01 += a0*b1;\r
+ // a[0]*b[2]\r
+ sum02 += a0*b2;\r
+ // a[0]*b[3]\r
+ sum03 += a0*b3;\r
+ // a[1]*b[0]\r
+ sum10 += a1*b0;\r
+ // a[1]*b[1]\r
+ sum11 += a1*b1;\r
+ // a[1]*b[2]\r
+ sum12 += a1*b2;\r
+ // a[1]*b[3]\r
+ sum13 += a1*b3;\r
+ // a[2]*b[0]\r
+ sum20 += a2*b0;\r
+ // a[2]*b[1]\r
+ sum21 += a2*b1;\r
+ // a[2]*b[2]\r
+ sum22 += a2*b2;\r
+ // a[2]*b[3]\r
+ sum23 += a2*b3;\r
+ // a[3]*b[0]\r
+ sum30 += a3*b0;\r
+ // a[3]*b[1]\r
+ sum31 += a3*b1;\r
+ // a[3]*b[2]\r
+ sum32 += a3*b2;\r
+ // a[3]*b[3]\r
+ sum33 += a3*b3;\r
+ }\r
+\r
+ double* restrict cptr;\r
+ // 0th Column\r
+ // updating C[00]\r
+ cptr = c;\r
+ *cptr = *cptr * be;\r
+ *cptr += sum00 * al;\r
+\r
+ // updating C[10]\r
+ cptr += rs_c;\r
+ *cptr = *cptr*be;\r
+ *cptr += sum10 * al;\r
+\r
+ // updating C[20]\r
+ cptr += rs_c;\r
+ *cptr = *cptr*be;\r
+ *cptr += sum20 * al;\r
+\r
+ // updating C[30]\r
+ cptr += rs_c;\r
+ *cptr = *cptr*be;\r
+ *cptr += sum30 * al;\r
+\r
+ // 1st column\r
+ // updating C[01]\r
+ cptr = c + cs_c;\r
+ *cptr = *cptr*be;\r
+ *cptr += sum01 * al;\r
+\r
+ // updating C[11]\r
+ cptr += rs_c;\r
+ *cptr = *cptr*be;\r
+ *cptr += sum11 * al;\r
+\r
+ // updating C[21]\r
+ cptr += rs_c;\r
+ *cptr = *cptr*be;\r
+ *cptr += sum21 * al;\r
+\r
+ // updating C[31]\r
+ cptr += rs_c;\r
+ *cptr = *cptr*be;\r
+ *cptr += sum31 * al;\r
+\r
+ // 2nd Column\r
+ // updating C[02]\r
+ cptr = c + 2*cs_c;\r
+ *cptr = *cptr*be;\r
+ *cptr += sum02 * al;\r
+\r
+ // updating C[12]\r
+ cptr += rs_c;\r
+ *cptr = *cptr*be;\r
+ *cptr += sum12 * al;\r
+\r
+ // updating C[22]\r
+ cptr += rs_c;\r
+ *cptr = *cptr*be;\r
+ *cptr += sum22 * al;\r
+\r
+ // updating C[32]\r
+ cptr += rs_c;\r
+ *cptr = *cptr*be;\r
+ *cptr += sum32 * al;\r
+\r
+ // 3rd Column\r
+ // updating C[03]\r
+ cptr = c + 3*cs_c;\r
+ *cptr = *cptr*be;\r
+ *cptr += sum03 * al;\r
+\r
+ // updating C[13]\r
+ cptr += rs_c;\r
+ *cptr = *cptr*be;\r
+ *cptr += sum13 * al;\r
+\r
+ // updating C[23]\r
+ cptr += rs_c;\r
+ *cptr = *cptr*be;\r
+ *cptr += sum23 * al;\r
+\r
+ // updating C[33]\r
+ cptr += rs_c;\r
+ *cptr = *cptr*be;\r
+ *cptr += sum33 * al;\r
+\r
+ return;\r
+}\r
+\r
+void bli_cgemm_ukernel_2x4(\r
+ dim_t k,\r
+ scomplex* restrict alpha,\r
+ scomplex* restrict a,\r
+ scomplex* restrict b,\r
+ scomplex* restrict beta,\r
+ scomplex* restrict c, inc_t rs_c, inc_t cs_c,\r
+ auxinfo_t* data\r
+)\r
+{\r
+ __float2_t sum00a, sum10a, sum00b, sum10b;\r
+ __float2_t sum01a, sum11a, sum01b, sum11b;\r
+ __float2_t sum02a, sum12a, sum02b, sum12b;\r
+ __float2_t sum03a, sum13a, sum03b, sum13b;\r
+ __float2_t * restrict ptrB = (__float2_t *) b;\r
+ __float2_t * restrict ptrA = (__float2_t *) a;\r
+ __float2_t * restrict ptrC;\r
+ __float2_t regA, regB, regC;\r
+ int_least16_t index;\r
+\r
+ // zero out accumulators\r
+ sum00a = 0.0;\r
+ sum10a = 0.0;\r
+ sum01a = 0.0;\r
+ sum11a = 0.0;\r
+ sum02a = 0.0;\r
+ sum12a = 0.0;\r
+ sum03a = 0.0;\r
+ sum13a = 0.0;\r
+ sum00b = 0.0;\r
+ sum10b = 0.0;\r
+ sum01b = 0.0;\r
+ sum11b = 0.0;\r
+ sum02b = 0.0;\r
+ sum12b = 0.0;\r
+ sum03b = 0.0;\r
+ sum13b = 0.0;\r
+\r
+ for (index = 0; index < k; index++)\r
+ { // loop over k;\r
+ // each iteration performs rank one update of 2x1 by 1x4\r
+ // matrices of A and B respectively; result is\r
+ // accumulated over 2x4 matrix\r
+ __float2_t b0, b1, b2, b3, a0, a1;\r
+ __x128_t reg128;\r
+\r
+ a0 = *ptrA++;\r
+ a1 = *ptrA++;\r
+\r
+ b0 = *ptrB++;\r
+ b1 = *ptrB++;\r
+ b2 = *ptrB++;\r
+ b3 = *ptrB++;\r
+\r
+ // the four partial sums are accumulated independently\r
+ // a[0]*b[0]\r
+ reg128 = _cmpysp(a0, b0);\r
+ sum00a = _daddsp(sum00a, _lof2_128(reg128));\r
+ sum00b = _daddsp(sum00b, _hif2_128(reg128));\r
+\r
+ // a[1]*b[0]\r
+ reg128 = _cmpysp(a1, b0);\r
+ sum10a = _daddsp(sum10a, _lof2_128(reg128));\r
+ sum10b = _daddsp(sum10b, _hif2_128(reg128));\r
+\r
+ // a[0]*b[1]\r
+ reg128 = _cmpysp(a0, b1);\r
+ sum01a = _daddsp(sum01a, _lof2_128(reg128));\r
+ sum01b = _daddsp(sum01b, _hif2_128(reg128));\r
+\r
+ // a[1]*b[1]\r
+ reg128 = _cmpysp(a1, b1);\r
+ sum11a = _daddsp(sum11a, _lof2_128(reg128));\r
+ sum11b = _daddsp(sum11b, _hif2_128(reg128));\r
+\r
+ // a[0]*b[2]\r
+ reg128 = _cmpysp(a0, b2);\r
+ sum02a = _daddsp(sum02a, _lof2_128(reg128));\r
+ sum02b = _daddsp(sum02b, _hif2_128(reg128));\r
+\r
+ // a[1]*b[2]\r
+ reg128 = _cmpysp(a1, b2);\r
+ sum12a = _daddsp(sum12a, _lof2_128(reg128));\r
+ sum12b = _daddsp(sum12b, _hif2_128(reg128));\r
+\r
+ // a[0]*b[3]\r
+ reg128 = _cmpysp(a0, b3);\r
+ sum03a = _daddsp(sum03a, _lof2_128(reg128));\r
+ sum03b = _daddsp(sum03b, _hif2_128(reg128));\r
+\r
+ // a[1]*b[3]\r
+ reg128 = _cmpysp(a1, b3);\r
+ sum13a = _daddsp(sum13a, _lof2_128(reg128));\r
+ sum13b = _daddsp(sum13b, _hif2_128(reg128));\r
+ }\r
+\r
+ {\r
+ __x128_t reg128;\r
+ ptrA = (__float2_t *) alpha;\r
+ ptrB = (__float2_t *) beta;\r
+ regA = *ptrA;\r
+ regB = *ptrB;\r
+\r
+ // the value of a and the final values need to be\r
+ // rearranged due to the specific way cmpysp assumes\r
+ // data arrangement\r
+ regA =_ftof2(-_lof(regA), _hif(regA));\r
+ //regB = _ftof2(_lof(regB),_hif(regB));\r
+ ptrC = (__float2_t *) c;\r
+\r
+ // update and save c[0,0]\r
+ sum00a = _daddsp(sum00a, sum00b);\r
+ reg128 = _cmpysp(regA, sum00a);\r
+ sum00a = _daddsp(_hif2_128(reg128),_lof2_128(reg128));\r
+ regC = *ptrC;\r
+ //regC = _ftof2(_lof(regC), _hif(regC));\r
+ reg128 = _cmpysp(regC,regB);\r
+ regC = _daddsp(_lof2_128(reg128),_hif2_128(reg128));\r
+ *ptrC = _daddsp(_ftof2(-_lof(sum00a),_hif(sum00a)),_ftof2(_lof(regC),-_hif(regC)));\r
+\r
+ ptrC = (__float2_t *) c + rs_c;\r
+\r
+ // update and save c[1,0]\r
+ sum10a = _daddsp(sum10a, sum10b);\r
+ reg128 = _cmpysp(regA, sum10a);\r
+ sum10a = _daddsp(_hif2_128(reg128),_lof2_128(reg128));\r
+ regC = *ptrC;\r
+ //regC = _ftof2(_lof(regC), _hif(regC));\r
+ reg128 = _cmpysp(regC,regB);\r
+ regC = _daddsp(_lof2_128(reg128),_hif2_128(reg128));\r
+ *ptrC = _daddsp(_ftof2(-_lof(sum10a),_hif(sum10a)),_ftof2(_lof(regC),-_hif(regC)));\r
+\r
+\r
+ ptrC = (__float2_t *) c + cs_c;\r
+\r
+ // update and save c[0,1]\r
+ sum01a = _daddsp(sum01a, sum01b);\r
+ reg128 = _cmpysp(regA, sum01a);\r
+ sum01a = _daddsp(_hif2_128(reg128),_lof2_128(reg128));\r
+ regC = *ptrC;\r
+ //regC = _ftof2(_lof(regC), _hif(regC));\r
+ reg128 = _cmpysp(regC,regB);\r
+ regC = _daddsp(_lof2_128(reg128),_hif2_128(reg128));\r
+ *ptrC = _daddsp(_ftof2(-_lof(sum01a),_hif(sum01a)),_ftof2(_lof(regC),-_hif(regC)));\r
+\r
+ ptrC = (__float2_t *) c + rs_c + cs_c;\r
+\r
+ // update and save c[1,1]\r
+ sum11a = _daddsp(sum11a, sum11b);\r
+ reg128 = _cmpysp(regA, sum11a);\r
+ sum11a = _daddsp(_hif2_128(reg128),_lof2_128(reg128));\r
+ regC = *ptrC;\r
+ //regC = _ftof2(_lof(regC), _hif(regC));\r
+ reg128 = _cmpysp(regC,regB);\r
+ regC = _daddsp(_lof2_128(reg128),_hif2_128(reg128));\r
+ *ptrC = _daddsp(_ftof2(-_lof(sum11a),_hif(sum11a)),_ftof2(_lof(regC),-_hif(regC)));\r
+\r
+ ptrC = (__float2_t *) c + 2 * cs_c;\r
+\r
+ // update and save c[0,2]\r
+ sum02a = _daddsp(sum02a, sum02b);\r
+ reg128 = _cmpysp(regA, sum02a);\r
+ sum02a = _daddsp(_hif2_128(reg128),_lof2_128(reg128));\r
+ regC = *ptrC;\r
+ //regC = _ftof2(_lof(regC), _hif(regC));\r
+ reg128 = _cmpysp(regC,regB);\r
+ regC = _daddsp(_lof2_128(reg128),_hif2_128(reg128));\r
+ *ptrC = _daddsp(_ftof2(-_lof(sum02a),_hif(sum02a)),_ftof2(_lof(regC),-_hif(regC)));\r
+\r
+ ptrC = (__float2_t *) c + rs_c + 2* cs_c;\r
+\r
+ // update and save c[1,2]\r
+ sum12a = _daddsp(sum12a, sum12b);\r
+ reg128 = _cmpysp(regA, sum12a);\r
+ sum12a = _daddsp(_hif2_128(reg128),_lof2_128(reg128));\r
+ regC = *ptrC;\r
+ //regC = _ftof2(_lof(regC), _hif(regC));\r
+ reg128 = _cmpysp(regC,regB);\r
+ regC = _daddsp(_lof2_128(reg128),_hif2_128(reg128));\r
+ *ptrC = _daddsp(_ftof2(-_lof(sum12a),_hif(sum12a)),_ftof2(_lof(regC),-_hif(regC)));\r
+\r
+ ptrC = (__float2_t *) c + 3 * cs_c;\r
+\r
+ // update and save c[0,3]\r
+ sum03a = _daddsp(sum03a, sum03b);\r
+ reg128 = _cmpysp(regA, sum03a);\r
+ sum03a = _daddsp(_hif2_128(reg128),_lof2_128(reg128));\r
+ regC = *ptrC;\r
+ //regC = _ftof2(_lof(regC), _hif(regC));\r
+ reg128 = _cmpysp(regC,regB);\r
+ regC = _daddsp(_lof2_128(reg128),_hif2_128(reg128));\r
+ *ptrC = _daddsp(_ftof2(-_lof(sum03a),_hif(sum03a)),_ftof2(_lof(regC),-_hif(regC)));\r
+\r
+ ptrC = (__float2_t *) c + rs_c + 3 * cs_c;\r
+\r
+ // update and save c[1,3]\r
+ sum13a = _daddsp(sum13a, sum13b);\r
+ reg128 = _cmpysp(regA, sum13a);\r
+ sum13a = _daddsp(_hif2_128(reg128),_lof2_128(reg128));\r
+ regC = *ptrC;\r
+ //regC = _ftof2(_lof(regC), _hif(regC));\r
+ reg128 = _cmpysp(regC,regB);\r
+ regC = _daddsp(_lof2_128(reg128),_hif2_128(reg128));\r
+ *ptrC = _daddsp(_ftof2(-_lof(sum13a),_hif(sum13a)),_ftof2(_lof(regC),-_hif(regC)));\r
+ }\r
+ return;\r
+}\r
+\r
+// Double-precision complex GEMM micro-kernel.\r
+//\r
+// Accumulates sum = a[0]*b[0] + ... + a[k-1]*b[k-1], reading one complex\r
+// value from a and one from b per step, and then updates one element:\r
+//     c[0] := beta * c[0] + alpha * sum\r
+//\r
+//   k        number of complex multiply-accumulate steps\r
+//   alpha    complex scalar applied to the accumulated sum\r
+//   a, b     packed operands, read as (real, imag) pairs of doubles\r
+//   beta     complex scalar applied to c[0]; when it is exactly zero the\r
+//            old value of c[0] is not read\r
+//   c        output element\r
+//   rs_c, cs_c, data   not used here\r
+//\r
+// NOTE(review): despite the "2x2" suffix only a single element of C is\r
+// produced and rs_c / cs_c are never used -- presumably this configuration\r
+// runs the dcomplex kernel with MR = NR = 1; confirm against the register\r
+// block sizes of the configuration.\r
+void bli_zgemm_ukernel_2x2(\r
+    dim_t k,\r
+    dcomplex* restrict alpha,\r
+    dcomplex* restrict a,\r
+    dcomplex* restrict b,\r
+    dcomplex* restrict beta,\r
+    dcomplex* restrict c, inc_t rs_c, inc_t cs_c,\r
+    auxinfo_t* data\r
+)\r
+{\r
+    double * restrict ptrA = (double *) a;\r
+    double * restrict ptrB = (double *) b;\r
+    double sum00r, sum00i;\r
+    int index;\r
+    // largest even value <= k: clear bit 0 only, keeping all upper bits so\r
+    // k is not truncated to 16 bits\r
+    int kEven = (int)(k & ~((dim_t)1));\r
+\r
+    sum00r = 0.0;\r
+    sum00i = 0.0;\r
+\r
+    if(k>4) // The loop is safe for k > 4\r
+    {\r
+        // trip count kEven is even, as UNROLL(2) is requested\r
+#pragma UNROLL(2)\r
+        for(index = 0; index<kEven; index++)\r
+        { // loop over k;\r
+          // each iteration performs rank one update of 1x1 by 1x1\r
+          // matrices of A and B respectively; result is\r
+          // accumulated over 1x1 matrix\r
+            double a0r, a0i;\r
+            double b0r, b0i;\r
+\r
+            a0r = *ptrA++;\r
+            a0i = *ptrA++;\r
+\r
+            b0r = *ptrB++;\r
+            b0i = *ptrB++;\r
+\r
+            sum00r += a0r*b0r;\r
+            sum00r -= a0i*b0i;\r
+            sum00i += a0r*b0i;\r
+            sum00i += a0i*b0r;\r
+\r
+        }\r
+        if(k&1) // odd k; one left to do\r
+        {\r
+            double a0r, a0i;\r
+            double b0r, b0i;\r
+\r
+            a0r = *ptrA++;\r
+            a0i = *ptrA++;\r
+\r
+            b0r = *ptrB++;\r
+            b0i = *ptrB++;\r
+\r
+            sum00r += a0r*b0r;\r
+            sum00r -= a0i*b0i;\r
+            sum00i += a0r*b0i;\r
+            sum00i += a0i*b0r;\r
+        }\r
+    }\r
+    else\r
+    {\r
+        // k <= 4: up to four steps written out without a loop\r
+        if(k>0)\r
+        {\r
+            double a0r, a0i;\r
+            double b0r, b0i;\r
+\r
+            a0r = *ptrA++;\r
+            a0i = *ptrA++;\r
+\r
+            b0r = *ptrB++;\r
+            b0i = *ptrB++;\r
+\r
+            sum00r += a0r*b0r;\r
+            sum00r -= a0i*b0i;\r
+            sum00i += a0r*b0i;\r
+            sum00i += a0i*b0r;\r
+        }\r
+        if(k>1)\r
+        {\r
+            double a0r, a0i;\r
+            double b0r, b0i;\r
+\r
+            a0r = *ptrA++;\r
+            a0i = *ptrA++;\r
+\r
+            b0r = *ptrB++;\r
+            b0i = *ptrB++;\r
+\r
+            sum00r += a0r*b0r;\r
+            sum00r -= a0i*b0i;\r
+            sum00i += a0r*b0i;\r
+            sum00i += a0i*b0r;\r
+        }\r
+        if(k>2)\r
+        {\r
+            double a0r, a0i;\r
+            double b0r, b0i;\r
+\r
+            a0r = *ptrA++;\r
+            a0i = *ptrA++;\r
+\r
+            b0r = *ptrB++;\r
+            b0i = *ptrB++;\r
+\r
+            sum00r += a0r*b0r;\r
+            sum00r -= a0i*b0i;\r
+            sum00i += a0r*b0i;\r
+            sum00i += a0i*b0r;\r
+        }\r
+        if(k>3)\r
+        {\r
+            double a0r, a0i;\r
+            double b0r, b0i;\r
+\r
+            a0r = *ptrA++;\r
+            a0i = *ptrA++;\r
+\r
+            b0r = *ptrB++;\r
+            b0i = *ptrB++;\r
+\r
+            sum00r += a0r*b0r;\r
+            sum00r -= a0i*b0i;\r
+            sum00i += a0r*b0i;\r
+            sum00i += a0i*b0r;\r
+        }\r
+\r
+    }\r
+\r
+    { // final saving\r
+        double alphar, alphai, betar, betai, cr, ci;\r
+        alphar = alpha->real;\r
+        alphai = alpha->imag;\r
+        betar = beta->real;\r
+        betai = beta->imag;\r
+\r
+        if(betar == 0.0 && betai == 0.0)\r
+        {\r
+            // beta == 0: overwrite c without reading it, so NaN/Inf or\r
+            // uninitialised data in c cannot propagate\r
+            c->real = (alphar * sum00r - alphai * sum00i);\r
+            c->imag = (alphar * sum00i + alphai * sum00r);\r
+        }\r
+        else\r
+        {\r
+            cr = c->real;\r
+            ci = c->imag;\r
+\r
+            c->imag = (betar * ci + betai * cr);\r
+            c->real = (betar * cr - betai * ci);\r
+            c->real += (alphar * sum00r - alphai * sum00i);\r
+            c->imag += (alphar * sum00i + alphai * sum00r);\r
+        }\r
+    }\r
+\r
+\r
+    return;\r
+}\r
+\r
+\r
diff --git a/blis/config/am57x/kernels/3/bli_gemmtrsm_l_ukernels.c b/blis/config/am57x/kernels/3/bli_gemmtrsm_l_ukernels.c
--- /dev/null
@@ -0,0 +1,70 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2014, The University of Texas at Austin
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name of The University of Texas at Austin nor the names
+ of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#define bli_spacknr_trsm 4
+
+/*
+ * Fused gemm + lower-triangular solve micro-kernel (single precision, 4x4).
+ *
+ * Step 1 (gemm):  b11 = alpha * b11 - a10 * b01
+ * Step 2 (trsm):  b11 = inv(a11) * b11;  c11 = b11
+ *
+ * k is the inner dimension of the a10 * b01 product, rs_c / cs_c are the
+ * strides of c11, and data is forwarded untouched to both sub-kernels.
+ */
+void bli_sgemmtrsm_l_ukernel_4x4(
+    dim_t k,
+    float* restrict alpha,
+    float* restrict a10,
+    float* restrict a11,
+    float* restrict b01,
+    float* restrict b11,
+    float* restrict c11, inc_t rs_c, inc_t cs_c,
+    auxinfo_t* data
+    )
+{
+    /* Scalar passed as the gemm "alpha" so the product is subtracted. */
+    float* restrict neg_one = PASTEMAC(s,m1);
+
+    /* b11 is addressed with unit column stride and the packed row stride. */
+    const inc_t b11_cs = 1;
+    const inc_t b11_rs = PASTEMAC(s,packnr_trsm);
+
+    /* Step 1: the caller's alpha takes the role of the gemm "beta". */
+    bli_sgemm_ukernel_4x4( k, neg_one, a10, b01, alpha,
+                           b11, b11_rs, b11_cs, data );
+
+    /* Step 2: triangular solve on the updated b11, result also to c11. */
+    BLIS_STRSM_L_UKERNEL( a11, b11, c11, rs_c, cs_c, data );
+}
diff --git a/blis/config/am57x/kernels/3/bli_gemmtrsm_u_ukernels.c b/blis/config/am57x/kernels/3/bli_gemmtrsm_u_ukernels.c
--- /dev/null
@@ -0,0 +1,74 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2014, The University of Texas at Austin
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name of The University of Texas at Austin nor the names
+ of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#define bli_spacknr_trsm 4
+
+
+/*
+ * Fused gemm + upper-triangular solve micro-kernel (single precision, 4x4).
+ *
+ * First updates b11 = alpha * b11 - a12 * b21 with the gemm micro-kernel,
+ * then computes b11 = inv(a11) * b11 and copies the result to c11 with the
+ * upper trsm micro-kernel.
+ *
+ * k is the inner dimension of the a12 * b21 product, rs_c / cs_c are the
+ * strides of c11, and data is forwarded untouched to both sub-kernels.
+ */
+void bli_sgemmtrsm_u_ukernel_4x4(
+    dim_t k,
+    float* restrict alpha,
+    float* restrict a12,
+    float* restrict a11,
+    float* restrict b21,
+    float* restrict b11,
+    float* restrict c11, inc_t rs_c, inc_t cs_c,
+    auxinfo_t* data
+    )
+{
+    /* b11 is addressed with the packed row stride and unit column stride. */
+    const inc_t rs_b = PASTEMAC(s,packnr_trsm);
+    const inc_t cs_b = 1;
+
+    float* restrict minus_one = PASTEMAC(s,m1);
+
+    /* b11 = alpha * b11 - a12 * b21; */
+    bli_sgemm_ukernel_4x4( k,
+                           minus_one,
+                           a12,
+                           b21,
+                           alpha,
+                           b11, rs_b, cs_b,
+                           data );
+
+    /* b11 = inv(a11) * b11;
+       c11 = b11; */
+    BLIS_STRSM_U_UKERNEL( a11,
+                          b11,
+                          c11, rs_c, cs_c,
+                          data );
+}
+
diff --git a/blis/config/am57x/kernels/3/bli_trsm_l_ukernels.c b/blis/config/am57x/kernels/3/bli_trsm_l_ukernels.c
--- /dev/null
@@ -0,0 +1,107 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2014, The University of Texas at Austin
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name of The University of Texas at Austin nor the names
+ of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#define bli_smr_trsm 4
+#define bli_snr_trsm 4
+
+#define bli_spacknr_trsm 4
+#define bli_spackmr_trsm 4
+
+/*
+ * Lower-triangular solve micro-kernel (single precision, 4x4).
+ *
+ * Solves for b in place, one row at a time from the top, and writes every
+ * finished element to c as well:
+ *
+ *     b(i,j) = ( b(i,j) - sum_{p<i} a(i,p) * b(p,j) ) * a(i,i)
+ *
+ * The diagonal entries of a hold the INVERSE of the true diagonal, so the
+ * final step is a multiplication rather than a division.
+ *
+ * a is read with unit row stride and the packed column stride, b with the
+ * packed row stride and unit column stride; c uses rs_c / cs_c. data is
+ * not used.
+ */
+void bli_strsm_l_ukernel_4x4( float* restrict a,
+                              float* restrict b,
+                              float* restrict c, inc_t rs_c, inc_t cs_c,
+                              auxinfo_t* data
+                            )
+{
+    /* Tile dimensions. */
+    const dim_t mr = PASTEMAC(s,mr_trsm);
+    const dim_t nr = PASTEMAC(s,nr_trsm);
+
+    /* Strides of the packed operands. */
+    const inc_t a_rs = 1;
+    const inc_t a_cs = PASTEMAC(s,packmr_trsm);
+    const inc_t b_rs = PASTEMAC(s,packnr_trsm);
+    const inc_t b_cs = 1;
+
+    dim_t row, col, p;
+
+    for ( row = 0; row < mr; ++row )
+    {
+        /* Start of row 'row' of a, its (inverted) diagonal entry, and the
+           row of b being solved. */
+        float* restrict a_row    = a + row*a_rs;
+        float* restrict inv_diag = a_row + row*a_cs;
+        float* restrict b_row    = b + row*b_rs;
+
+        for ( col = 0; col < nr; ++col )
+        {
+            float* restrict b_col_top = b + col*b_cs;
+            float* restrict b_elem    = b_row + col*b_cs;
+            float* restrict c_elem    = c + row*rs_c + col*cs_c;
+            float           x         = *b_elem;
+            float           dot;
+
+            /* dot = a(row, 0:row-1) * b(0:row-1, col), using the rows of b
+               that are already solved. */
+            PASTEMAC(s,set0s)( dot );
+            for ( p = 0; p < row; ++p )
+            {
+                float* restrict a_elem   = a_row + p*a_cs;
+                float* restrict b_solved = b_col_top + p*b_rs;
+
+                PASTEMAC(s,axpys)( *a_elem, *b_solved, dot );
+            }
+
+            /* x = ( x - dot ) * (1 / diagonal) */
+            PASTEMAC(s,subs)( dot, x );
+            PASTEMAC(s,scals)( *inv_diag, x );
+
+            /* Publish the result to c, then store it back into b so later
+               rows can use it. */
+            PASTEMAC(s,copys)( x, *c_elem );
+            PASTEMAC(s,copys)( x, *b_elem );
+        }
+    }
+}
+
diff --git a/blis/config/am57x/kernels/3/bli_trsm_u_ukernels.c b/blis/config/am57x/kernels/3/bli_trsm_u_ukernels.c
--- /dev/null
@@ -0,0 +1,106 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2014, The University of Texas at Austin
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name of The University of Texas at Austin nor the names
+ of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#define bli_smr_trsm 4
+#define bli_snr_trsm 4
+
+#define bli_spacknr_trsm 4
+#define bli_spackmr_trsm 4
+
+void bli_strsm_u_ukernel_4x4( float* restrict a, /* packed triangular block; element (r,c) is read at a[ r*rs_a + c*cs_a ] */
+ float* restrict b, /* packed right-hand-side block; element (r,c) at b[ r*rs_b + c*cs_b ], overwritten with the result */
+ float* restrict c, inc_t rs_c, inc_t cs_c, /* output matrix with caller-supplied row and column strides */
+ auxinfo_t* data /* not referenced anywhere in this kernel */
+ )
+{ /* Upper-triangular solve by back substitution: rows are processed bottom to top and each result is stored to both b and c. */
+ const dim_t m = PASTEMAC(s,mr_trsm);
+ const dim_t n = PASTEMAC(s,nr_trsm);
+
+ const inc_t rs_a = 1; /* consecutive rows of a are adjacent in memory */
+ const inc_t cs_a = PASTEMAC(s,packmr_trsm);
+
+ const inc_t rs_b = PASTEMAC(s,packnr_trsm);
+ const inc_t cs_b = 1; /* consecutive columns of b are adjacent in memory */
+
+ dim_t iter, i, j, l;
+ dim_t n_behind;
+
+ for ( iter = 0; iter < m; ++iter )
+ {
+ i = m - iter - 1; /* rows are visited in decreasing order, last row first */
+ n_behind = iter; /* number of rows below row i, i.e. terms in the dot product below */
+
+ float* restrict alpha11 = a + (i )*rs_a + (i )*cs_a; /* diagonal element of row i */
+ float* restrict a12t = a + (i )*rs_a + (i+1)*cs_a; /* row i of a, starting just right of the diagonal; not dereferenced when n_behind is 0 */
+ float* restrict b1 = b + (i )*rs_b + (0 )*cs_b; /* row i of b, the row being solved */
+ float* restrict B2 = b + (i+1)*rs_b + (0 )*cs_b; /* rows of b below row i; not dereferenced when n_behind is 0 */
+
+ /* b1 = b1 - a12t * B2; */
+ /* b1 = b1 / alpha11; */
+ for ( j = 0; j < n; ++j )
+ {
+ float* restrict beta11 = b1 + (0 )*rs_b + (j )*cs_b;
+ float* restrict b21 = B2 + (0 )*rs_b + (j )*cs_b;
+ float* restrict gamma11 = c + (i )*rs_c + (j )*cs_c;
+ float beta11c = *beta11; /* local working copy of b(i,j) */
+ float rho11;
+
+ /* beta11 = beta11 - a12t * b21; */
+ PASTEMAC(s,set0s)( rho11 );
+ for ( l = 0; l < n_behind; ++l )
+ {
+ float* restrict alpha12 = a12t + (l )*cs_a;
+ float* restrict beta21 = b21 + (l )*rs_b;
+
+ PASTEMAC(s,axpys)( *alpha12, *beta21, rho11 );
+ }
+ PASTEMAC(s,subs)( rho11, beta11c );
+
+ /* beta11 = beta11 / alpha11; */
+ /* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead
+ of alpha11, so we can multiply rather than divide. We store
+ the inverse of alpha11 intentionally to avoid expensive
+ division instructions within the micro-kernel. */
+ PASTEMAC(s,scals)( *alpha11, beta11c );
+
+ /* Output final result to matrix c. */
+ PASTEMAC(s,copys)( beta11c, *gamma11 );
+
+ /* Store the local value back to b11. */
+ PASTEMAC(s,copys)( beta11c, *beta11 );
+ }
+ }
+}
diff --git a/blis/config/am57x/make_defs.mk b/blis/config/am57x/make_defs.mk
--- /dev/null
@@ -0,0 +1,155 @@
+#!/bin/bash
+#
+# BLIS
+# An object-based framework for developing high-performance BLAS-like
+# libraries.
+#
+# Copyright (C) 2014, The University of Texas
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+# - Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# - Neither the name of The University of Texas nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#
+
+# Only include this block of code once.
+ifndef MAKE_DEFS_MK_INCLUDED
+MAKE_DEFS_MK_INCLUDED := yes
+
+
+TI_INSTALL_DIR?=/usr/src/dsp
+
+PATH:=$(TI_OCL_CGT_INSTALL)/bin:$(PATH)
+# FIND_DSP_PKG(var,dir-glob,subdir), used via eval+call: if var is unset, export it as the last sorted TI_INSTALL_DIR/dir-glob that contains subdir; abort if none is found or if var's directory lacks subdir; print the chosen value.
+define FIND_DSP_PKG
+ export $(1)?=$$(patsubst %/$(3),%,$$(lastword $$(sort $$(wildcard $$(TI_INSTALL_DIR)/$(2)/$(3)))))
+ ifeq ($$($(1)),)
+ $$(error ERROR - $(1) is not defined and could not be found in $(TI_INSTALL_DIR)/ )
+ else
+ ifeq ($$(wildcard $$($(1))/$(3)),)
+ $$(error ERROR - "$(1) = $$($(1))" Is not valid!)
+ endif
+ endif
+ $$(info Using $(1) = $$($(1)))
+endef
+
+UNAME_M :=$(shell uname -m)
+
+ifneq (,$(findstring 86, $(UNAME_M)))
+$(eval $(call FIND_DSP_PKG,C6636_PDK_DIR,pdk_keystone2*,packages))
+endif
+
+$(eval $(call FIND_DSP_PKG,FC_DIR,framework_components*,packages))
+$(eval $(call FIND_DSP_PKG,OMP_DIR,openmp_dsp*,packages))
+$(eval $(call FIND_DSP_PKG,LIBARCH_DIR,libarch*,packages))
+$(eval $(call FIND_DSP_PKG,XDAIS_DIR,xdais*,packages))
+$(eval $(call FIND_DSP_PKG,XDC_DIR,xdc*,packages))
+
+#
+# --- Build definitions --------------------------------------------------------
+#
+
+# Variables corresponding to other configure-time options.
+BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
+BLIS_ENABLE_STATIC_BUILD := yes
+BLIS_ENABLE_DYNAMIC_BUILD := no
+
+
+
+#
+# --- Utility program definitions ----------------------------------------------
+#
+
+SH := /bin/sh
+MV := mv
+MKDIR := mkdir -p
+RM_F := rm -f
+RM_RF := rm -rf
+SYMLINK := ln -sf
+FIND := find
+GREP := grep
+XARGS := xargs
+RANLIB := ranlib
+INSTALL := install -c
+
+# Used to refresh CHANGELOG.
+GIT := git
+GIT_LOG := $(GIT) log --decorate
+
+
+
+#
+# --- Development tools definitions --------------------------------------------
+#
+
+# --- Determine the C compiler and related flags ---
+CC := cl6x
+# Enable IEEE Standard 1003.1-2004 (POSIX.1d).
+# NOTE: This is needed to enable posix_memalign().
+CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
+CMISCFLAGS := --c99
+#CMISCFLAGS += -I$(TI_OCL_CGT_INSTALL)/include
+CMISCFLAGS += -I$(OMP_DIR)/packages/ti/runtime/openmp
+CMISCFLAGS += -I$(FC_DIR)/packages
+CMISCFLAGS += -I$(XDC_DIR)/packages
+CMISCFLAGS += -I$(XDAIS_DIR)/packages
+CMISCFLAGS += -I$(LIBARCH_DIR)/packages
+CMISCFLAGS += -I$(LINUX_DEVKIT_ROOT)/usr/share/ti/cgt-c6x/include
+CMISCFLAGS += -I$(TARGET_ROOTDIR)/usr/share/ti/opencl
+
+ifneq (,$(findstring 86, $(UNAME_M)))
+CMISCFLAGS += -I$(C6636_PDK_DIR)/packages
+$(info Using $(UNAME_M))
+else
+CMISCFLAGS += -I$(LINUX_DEVKIT_ROOT)/usr/include
+$(info Using $(UNAME_M))
+endif
+#CMISCFLAGS += -mv6600 --use_g2 --omp #-std=c99 # -fopenmp -pg
+CMISCFLAGS += -mv6600 --use_g2 --omp -DDEVICE_K2H -DLIB_OPENCL #-std=c99 # -fopenmp -pg
+
+
+CDBGFLAGS := -s -k -mw
+CWARNFLAGS :=
+COPTFLAGS := -O2
+CKOPTFLAGS := $(COPTFLAGS)
+CVECFLAGS := #-msse3 -march=native # -mfpmath=sse
+
+COMPILER_OUTPUT_FLAG := -fs=$(BASE_OBJ_PATH) -fe
+
+# Aggregate all of the flags into multiple groups: one for standard
+# compilation, and one for each of the supported "special" compilation
+# modes.
+CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+
+# --- Determine the archiver and related flags ---
+AR := ar6x
+ARFLAGS := -ur
+
+# --- Determine the linker and related flags ---
+LINKER := $(CC)
+LDFLAGS := #-lm
+
+# end of ifndef MAKE_DEFS_MK_INCLUDED conditional block
+endif
diff --git a/blis/config/am57x/touch.h b/blis/config/am57x/touch.h
--- /dev/null
@@ -0,0 +1,60 @@
+#ifndef _TOUCH_H_\r
+#define _TOUCH_H_\r
+\r
+/* Copyright (C) 2012 Texas Instruments Incorporated - http://www.ti.com/\r
+*\r
+* Redistribution and use in source and binary forms, with or without\r
+* modification, are permitted provided that the following conditions\r
+* are met:\r
+*\r
+* Redistributions of source code must retain the above copyright\r
+* notice, this list of conditions and the following disclaimer.\r
+*\r
+* Redistributions in binary form must reproduce the above copyright\r
+* notice, this list of conditions and the following disclaimer in the\r
+* documentation and/or other materials provided with the\r
+* distribution.\r
+*\r
+* Neither the name of Texas Instruments Incorporated nor the names of\r
+* its contributors may be used to endorse or promote products derived\r
+* from this software without specific prior written permission.\r
+*\r
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\r
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\r
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\r
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT\r
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\r
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\r
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\r
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\r
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\r
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\r
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
+*\r
+*/\r
+\r
+/**\r
+ * @file touch.h\r
+ * @brief Contains interface to cache optimization utilities\r
+ *\r
+ */\r
+\r
+/** @defgroup util util */\r
+\r
+/** @ingroup util */\r
+/* @{ */\r
+\r
+/** \r
+ * @brief touches an array to bring it into cache\r
+ * \r
+ * @param[in] array Pointer to array to touch\r
+ * @param[in] length Length array in bytes\r
+ *\r
+ */\r
+void touch (const void *array, int length); \r
+\r
+#endif\r
+\r
+/* @} */ /* ingroup */\r
+\r
+/* Nothing past this point */\r
index 6cd8a3e48450e3b710f6ab7cb1e0ff65e86f0898..872c4bc3c345ceb9904d9d2b0fb92fd3fdc50294 100755 (executable)
#define BLIS_CONFIG_H
#define BLIS_ENABLE_C66X_BUILD
+
+#define BLIS_ENABLE_C66X_K2H
+
#define BLIS_ENABLE_C66X_MEM_POOLS
#define BLIS_ENABLE_C66X_OPENCL
+#ifdef BLIS_ENABLE_C66X_OPENCL
+// clocl creates a cio section in L2 when fprintf is used. Redefining fprintf to map to printf.
+#define fprintf ti_printf
+#endif
-// -- OPERATING SYSTEM ---------------------------------------------------------
+// -- OPERATING SYSTEM ---------------------------------------------------------
+
// -- INTEGER PROPERTIES -------------------------------------------------------
// The bit size of the integer type used to track values such as dimensions,
// -- c66x headers -------------------------------------------------------------
#include "c6x.h"
+
#include <ti/csl/device/k2h/src/cslr_device.h>
#include <ti/libarch/libarch.h>
+//#include <ti/csl/csl_chipAux.h> // CSL_chipReadDNUM -> to read coreID
+//#include <ti/csl/csl_cacheAux.h> // CACHE_invL1d
+
+// for __clock64()
+#include <dsp_c.h>
-#include <ti/csl/csl_chipAux.h> // lib_get_coreID -> to read coreID
-#include <ti/csl/csl_cacheAux.h> // CACHE_invL1d
// -- EDMA ---------------------------------------------------------------------
#define BLIS_ENABLE_C66X_EDMA
#ifdef BLIS_ENABLE_C66X_EDMA
-//#include "edmamgr.h"
-//#include <ti/sdo/fc/edmamgr/edmamgr.h>
+
+#define BLIS_GEMM_DMAA_CNTL gemm_dmaa_cntl
+#define BLIS_GEMM_DMAB_CNTL gemm_dmab_cntl
+
+/*
+#if USING_FC_EDMAMGR
+#include <xdc/std.h>
+
+#define ECPY_INLINE_ALL 1
+#define EDMAMGR_INLINE_ALL 1
+#include <ti/sdo/fc/edmamgr/edmamgr.h>
+#else
+#include "edmamgr.h"
+#endif
+*/
#define BLIS_C66X_MAXDMASTRIDE 0x7FFF
#include "idma.h"
#endif
-
+// -- PROFILE -----------------------------------------------------------------
+//uncomment to Profile performance
+//#define BLIS_ENABLE_PROFILE
// -- MULTITHREADING -----------------------------------------------------------
#define BLIS_ENABLE_OPENMP
#define BLIS_MAX_NUM_THREADS 8
+#define BLIS_C66X_IC_NT BLIS_MAX_NUM_THREADS
+#define BLIS_C66X_JC_NT 1
+#define BLIS_C66X_JR_NT 1
+#define BLIS_C66X_IR_NT 1
+
+
+
// -- MEMORY ALLOCATION --------------------------------------------------------
index f2eb785811367a4d093676f8bec0d99980b17812..264e1df2b3d692d7cdda691b5f8402447735c787 100755 (executable)
//#define BLIS_DEFAULT_NC_Z 584
//Values for 2 buffers of KN in L3
+
+#if 1
#define BLIS_DEFAULT_MC_S 144
#define BLIS_DEFAULT_KC_S 428
#define BLIS_DEFAULT_NC_S 944
+#else
+#define BLIS_DEFAULT_MC_S 216
+#define BLIS_DEFAULT_KC_S 428
+#define BLIS_DEFAULT_NC_S 1376
+#endif
// MR = 4, NR = 4
//#define BLIS_DEFAULT_MC_S 40
index 3c3c2a727338fb4163d911cb93f78880f47e95a1..0fb4f41e669b47779d9adef9a334691d590b7cc4 100755 (executable)
$(eval $(call FIND_DSP_PKG,XDAIS_DIR,xdais*,packages))
$(eval $(call FIND_DSP_PKG,XDC_DIR,xdc*,packages))
-
#
# --- Build definitions --------------------------------------------------------
#
#CMISCFLAGS += -I$(TI_OCL_CGT_INSTALL)/include
CMISCFLAGS += -I$(OMP_DIR)/packages/ti/runtime/openmp
CMISCFLAGS += -I$(FC_DIR)/packages
-CMISCFLAGS += -I$(C6636_PDK_DIR)/packages
CMISCFLAGS += -I$(XDC_DIR)/packages
CMISCFLAGS += -I$(XDAIS_DIR)/packages
CMISCFLAGS += -I$(LIBARCH_DIR)/packages
-#CMISCFLAGS += -I$(C6636_PDK_DIR)/packages
CMISCFLAGS += -I$(LINUX_DEVKIT_ROOT)/usr/share/ti/cgt-c6x/include
+CMISCFLAGS += -I$(TARGET_ROOTDIR)/usr/share/ti/opencl
+
ifneq (,$(findstring 86, $(UNAME_M)))
+CMISCFLAGS += -I$(C6636_PDK_DIR)/packages
$(info Using $(UNAME_M))
else
CMISCFLAGS += -I$(LINUX_DEVKIT_ROOT)/usr/include
CMISCFLAGS += -mv6600 --use_g2 --omp -DDEVICE_K2H -DLIB_OPENCL #-std=c99 # -fopenmp -pg
-CDBGFLAGS := -s
+CDBGFLAGS := -s -k -mw
CWARNFLAGS :=
COPTFLAGS := -O2
CKOPTFLAGS := $(COPTFLAGS)
index 9771f1d617655459bfb446d0ffafb36ef5649313..19f88259463519e2d74ff7211a4daf0e709510b7 100644 (file)
--- a/blis/config/c66x/touch.h
+++ b/blis/config/c66x/touch.h
-#ifndef _TOUCH_H_
-#define _TOUCH_H_
-
-/* Copyright (C) 2012 Texas Instruments Incorporated - http://www.ti.com/
-*
-* Redistribution and use in source and binary forms, with or without
-* modification, are permitted provided that the following conditions
-* are met:
-*
-* Redistributions of source code must retain the above copyright
-* notice, this list of conditions and the following disclaimer.
-*
-* Redistributions in binary form must reproduce the above copyright
-* notice, this list of conditions and the following disclaimer in the
-* documentation and/or other materials provided with the
-* distribution.
-*
-* Neither the name of Texas Instruments Incorporated nor the names of
-* its contributors may be used to endorse or promote products derived
-* from this software without specific prior written permission.
-*
-* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*
-*/
-
-/**
- * @file touch.h
- * @brief Contains interface to cache optimization utilities
- *
- */
-
-/** @defgroup util util */
-
-/** @ingroup util */
-/* @{ */
-
-/**
- * @brief touches an array to bring it into cache
- *
- * @param[in] array Pointer to array to touch
- * @param[in] length Length array in bytes
- *
- */
-void touch (const void *array, int length);
-
-#endif
-
-/* @} */ /* ingroup */
-
-/* Nothing past this point */
+#ifndef _TOUCH_H_\r
+#define _TOUCH_H_\r
+\r
+/* Copyright (C) 2012 Texas Instruments Incorporated - http://www.ti.com/\r
+*\r
+* Redistribution and use in source and binary forms, with or without\r
+* modification, are permitted provided that the following conditions\r
+* are met:\r
+*\r
+* Redistributions of source code must retain the above copyright\r
+* notice, this list of conditions and the following disclaimer.\r
+*\r
+* Redistributions in binary form must reproduce the above copyright\r
+* notice, this list of conditions and the following disclaimer in the\r
+* documentation and/or other materials provided with the\r
+* distribution.\r
+*\r
+* Neither the name of Texas Instruments Incorporated nor the names of\r
+* its contributors may be used to endorse or promote products derived\r
+* from this software without specific prior written permission.\r
+*\r
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\r
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\r
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\r
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT\r
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\r
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\r
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\r
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\r
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\r
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\r
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
+*\r
+*/\r
+\r
+/**\r
+ * @file touch.h\r
+ * @brief Contains interface to cache optimization utilities\r
+ *\r
+ */\r
+\r
+/** @defgroup util util */\r
+\r
+/** @ingroup util */\r
+/* @{ */\r
+\r
+/** \r
+ * @brief touches an array to bring it into cache\r
+ * \r
+ * @param[in] array Pointer to array to touch\r
+ * @param[in] length Length array in bytes\r
+ *\r
+ */\r
+void touch (const void *array, int length); \r
+\r
+#endif\r
+\r
+/* @} */ /* ingroup */\r
+\r
+/* Nothing past this point */\r
index dbf9bb439a76d2925cb54b83847c602c18cdc2b7..76d521f40c3c955d8b3ebb7e1d829b79af5ddc81 100644 (file)
#define BLIS_SGEMM_UKERNEL bli_sgemm_opt_4x4
#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_4x4
-//#define BLIS_CGEMM_UKERNEL bli_cgemm_opt_4x4
-//#define BLIS_ZGEMM_UKERNEL bli_zgemm_opt_4x4
+#define BLIS_CGEMM_UKERNEL bli_cgemm_opt_4x4
+#define BLIS_ZGEMM_UKERNEL bli_zgemm_opt_4x4
// -- trsm-related --
diff --git a/blis/config/shannon/bli_config.h b/blis/config/shannon/bli_config.h
--- /dev/null
@@ -0,0 +1,286 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2014, The University of Texas
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name of The University of Texas nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_CONFIG_H
+#define BLIS_CONFIG_H
+
+#define BLIS_ENABLE_C66X_BUILD
+
+#define BLIS_ENABLE_C66X_K2H
+
+#define BLIS_ENABLE_C66X_MEM_POOLS
+
+#define BLIS_ENABLE_C66X_OPENCL
+
+#ifdef BLIS_ENABLE_C66X_OPENCL
+// clocl creates a cio section in L2 when fprintf is used. Redefining fprintf to map to printf.
+#define fprintf ti_printf
+#endif
+
+
+
+
+// -- OPERATING SYSTEM ---------------------------------------------------------
+
+// -- INTEGER PROPERTIES -------------------------------------------------------
+
+// The bit size of the integer type used to track values such as dimensions,
+// strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed
+// integers while 64 results in 64-bit integers. Any other value results in use
+// of the C99 type "long int". Note that this ONLY affects integers used
+// internally within BLIS as well as those exposed in the native BLAS-like BLIS
+// interface.
+#define BLIS_INT_TYPE_SIZE 32
+
+
+
+// -- FLOATING-POINT PROPERTIES ------------------------------------------------
+
+// Define the number of floating-point types supported, and the size of the
+// largest type.
+#define BLIS_NUM_FP_TYPES 4
+#define BLIS_MAX_TYPE_SIZE sizeof(dcomplex)
+
+// Enable use of built-in C99 "float complex" and "double complex" types and
+// associated overloaded operations and functions? Disabling results in
+// scomplex and dcomplex being defined in terms of simple structs.
+//#define BLIS_ENABLE_C99_COMPLEX
+
+// -- c66x headers -------------------------------------------------------------
+#include "c6x.h"
+
+#include <ti/csl/device/k2h/src/cslr_device.h>
+
+#include <ti/libarch/libarch.h>
+//#include <ti/csl/csl_chipAux.h> // CSL_chipReadDNUM -> to read coreID
+//#include <ti/csl/csl_cacheAux.h> // CACHE_invL1d
+
+// for __clock64()
+#include <dsp_c.h>
+
+
+// -- EDMA ---------------------------------------------------------------------
+#define BLIS_ENABLE_C66X_EDMA
+
+#ifdef BLIS_ENABLE_C66X_EDMA
+
+#define BLIS_GEMM_DMAA_CNTL gemm_dmaa_cntl
+#define BLIS_GEMM_DMAB_CNTL gemm_dmab_cntl
+
+/*
+#if USING_FC_EDMAMGR
+#include <xdc/std.h>
+
+#define ECPY_INLINE_ALL 1
+#define EDMAMGR_INLINE_ALL 1
+#include <ti/sdo/fc/edmamgr/edmamgr.h>
+#else
+#include "edmamgr.h"
+#endif
+*/
+
+#define BLIS_C66X_MAXDMASTRIDE 0x7FFF
+
+#define BLIS_C66X_EDMA_MAX_NUM_CHANNELS 6
+#endif
+
+// -- PREFETCH -----------------------------------------------------------------
+//#define BLIS_ENABLE_C66X_PREFETCH
+
+#ifdef BLIS_ENABLE_C66X_PREFETCH
+#include "touch.h"
+#endif
+
+// -- IDMA -----------------------------------------------------------------
+#define BLIS_ENABLE_C66X_IDMA
+
+#ifdef BLIS_ENABLE_C66X_IDMA
+#include "idma.h"
+#endif
+
+// -- PROFILE -----------------------------------------------------------------
+//uncomment to Profile performance
+//#define BLIS_ENABLE_PROFILE
+
+// -- MULTITHREADING -----------------------------------------------------------
+
+// The maximum number of BLIS threads that will run concurrently.
+#define BLIS_ENABLE_MULTITHREADING
+#define BLIS_ENABLE_OPENMP
+#define BLIS_MAX_NUM_THREADS 8
+
+#define BLIS_C66X_IC_NT BLIS_MAX_NUM_THREADS
+#define BLIS_C66X_JC_NT 1
+#define BLIS_C66X_JR_NT 1
+#define BLIS_C66X_IR_NT 1
+
+
+
+
+
+// -- MEMORY ALLOCATION --------------------------------------------------------
+
+// -- Contiguous (static) memory allocator --
+
+// The number of MC x KC, KC x NC, and MC x NC blocks to reserve in the
+// contiguous memory pools.
+
+#define BLIS_NUM_MC_X_KC_BLOCKS_L3 0
+#define BLIS_NUM_MC_X_KC_BLOCKS_L2 2 //Each L2 ram is local to the DSP Just need one buffer per thread that is packed
+#define BLIS_NUM_MC_X_KC_BLOCKS_L1 0
+#define BLIS_NUM_MR_X_KC_BLOCKS_L1 2 // To transfer A to L1 in a ping-pong manner
+#define BLIS_NUM_MC_X_KC_BLOCKS ( 2*BLIS_MAX_NUM_THREADS + 1 ) //To test w/o DMA and L2, L3 memory, all memory must be in DDR3 now. Parenthesized so the count stays a single value when used inside a larger expression (e.g. count * block size).
+
+#define BLIS_NUM_KC_X_NC_BLOCKS_L3 2 // Each thread shares a B block, so do not need 8 buffers *BLIS_MAX_NUM_THREADS // One for the partitioned B1, and one for the packed B1
+#define BLIS_NUM_KC_X_NC_BLOCKS_L2 0
+#define BLIS_NUM_KC_X_NC_BLOCKS_L1 0
+#define BLIS_NUM_KC_X_NR_BLOCKS_L1 1
+#define BLIS_NUM_KC_X_NC_BLOCKS ( 2*BLIS_MAX_NUM_THREADS ) //To test w/o DMA and L2, L3 memory, all memory must be in DDR3 now. Parenthesized for the same reason as BLIS_NUM_MC_X_KC_BLOCKS.
+
+#define BLIS_NUM_MC_X_NC_BLOCKS_L3 0
+#define BLIS_NUM_MC_X_NC_BLOCKS_L2 0
+#define BLIS_NUM_MC_X_NR_BLOCKS_L2 3 //Bringing C into the L2 memory. We need 3 buffers, one to read, one to compute and one to write.
+#define BLIS_NUM_MC_X_NC_BLOCKS_L1 0
+#define BLIS_NUM_MR_X_NR_BLOCKS_L1 0
+#define BLIS_NUM_MC_X_NC_BLOCKS 0
+
+
+// The maximum preload byte offset is used to pad the end of the contiguous
+// memory pools so that the micro-kernel, when computing with the end of the
+// last block, can exceed the bounds of the usable portion of the memory
+// region without causing a segmentation fault.
+#define BLIS_MAX_PRELOAD_BYTE_OFFSET 128
+
+// -- Memory alignment --
+
+// It is sometimes useful to define the various memory alignments in terms
+// of some other characteristics of the system, such as the cache line size
+// and the page size.
+#define BLIS_CACHE_LINE_SIZE 64
+#define BLIS_PAGE_SIZE 4096
+
+// Alignment size needed by the instruction set for aligned SIMD/vector
+// instructions.
+#define BLIS_SIMD_ALIGN_SIZE 16
+
+// Alignment size used to align local stack buffers within macro-kernel
+// functions.
+#define BLIS_STACK_BUF_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
+
+// Alignment size used when allocating memory dynamically from the operating
+// system (eg: posix_memalign()). To disable heap alignment and just use
+// malloc() instead, set this to 1.
+#define BLIS_HEAP_ADDR_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
+
+// Alignment size used when sizing leading dimensions of dynamically
+// allocated memory.
+#define BLIS_HEAP_STRIDE_ALIGN_SIZE BLIS_CACHE_LINE_SIZE
+
+// Alignment size used when allocating entire blocks of contiguous memory
+// from the contiguous memory allocator.
+#define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_CACHE_LINE_SIZE//BLIS_PAGE_SIZE
+
+// Extra buffer space in each block in L1 to account for bank conflict
+/* There are 2 buffers of size MRK and 1 buffer of size KNR in L1. The
+ * extra buffer space in each block in L1 is computed based on the
+ * remaining space available in L1. L1DSRAM is configured to size 28K.
+ * The total size of the blocks in L1 = 2*MR*KC*size of datatype + KC*NR* size of datatype.
+ * The remaining available space in L1 is divided such that
+ * 2*BLIS_MRK_BLOCK_BUFFER_L1+1*BLIS_KNR_PANEL_BUFFER_L1+0*BLIS_MRNR_BLOCK_BUFFER_L1_S = remaining available space.
+ */
+#define BLIS_MRK_BLOCK_BUFFER_L1_S 128 //
+#define BLIS_MRK_BLOCK_BUFFER_L1_D 64 //
+#define BLIS_MRK_BLOCK_BUFFER_L1_C 64 //
+#define BLIS_MRK_BLOCK_BUFFER_L1_Z 32 //
+#define BLIS_KNR_PANEL_BUFFER_L1_S 256 //
+#define BLIS_KNR_PANEL_BUFFER_L1_D 128
+#define BLIS_KNR_PANEL_BUFFER_L1_C 128
+#define BLIS_KNR_PANEL_BUFFER_L1_Z 64
+#define BLIS_MRNR_BLOCK_BUFFER_L1_S 128 //
+#define BLIS_MRNR_BLOCK_BUFFER_L1_D 64 //
+#define BLIS_MRNR_BLOCK_BUFFER_L1_C 64 //
+#define BLIS_MRNR_BLOCK_BUFFER_L1_Z 32 //
+
+//
+
+#define bli_sbank 8
+#define bli_dbank 16
+#define bli_cbank 16
+#define bli_zbank 24
+
+
+
+// -- MIXED DATATYPE SUPPORT ---------------------------------------------------
+
+// Basic (homogeneous) datatype support always enabled.
+
+// Enable mixed domain operations?
+//#define BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
+
+// Enable extra mixed precision operations?
+//#define BLIS_ENABLE_MIXED_PRECISION_SUPPORT
+
+
+
+// -- MISCELLANEOUS OPTIONS ----------------------------------------------------
+
+// Stay initialized after auto-initialization, unless and until the user
+// explicitly calls bli_finalize().
+#define BLIS_ENABLE_STAY_AUTO_INITIALIZED
+
+
+
+// -- BLAS-to-BLIS COMPATIBILITY LAYER -----------------------------------------
+
+// Enable the BLAS compatibility layer?
+#define BLIS_ENABLE_BLAS2BLIS
+
+// The bit size of the integer type used to track values such as dimensions and
+// leading dimensions (ie: column strides) within the BLAS compatibility layer.
+// A value of 32 results in the compatibility layer using 32-bit signed integers
+// while 64 results in 64-bit integers. Any other value results in use of the
+// C99 type "long int". Note that this ONLY affects integers used within the
+// BLAS compatibility layer.
+#define BLIS_BLAS2BLIS_INT_TYPE_SIZE 32
+
+// Fortran-77 name-mangling macros.
+#define PASTEF770(name) name ## _
+#define PASTEF77(ch1,name) ch1 ## name ## _
+#define PASTEF772(ch1,ch2,name) ch1 ## ch2 ## name ## _
+
+
+
+
+#endif
+
diff --git a/blis/config/shannon/bli_kernel.h b/blis/config/shannon/bli_kernel.h
--- /dev/null
@@ -0,0 +1,247 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2014, The University of Texas
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name of The University of Texas nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_KERNEL_H
+#define BLIS_KERNEL_H
+
+// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
+
+// -- Cache blocksizes --
+
+//
+// Constraints:
+//
+// (1) MC must be a multiple of:
+// (a) MR (for zero-padding purposes)
+// (b) NR (for zero-padding purposes when MR and NR are "swapped")
+// (2) NC must be a multiple of
+// (a) NR (for zero-padding purposes)
+// (b) MR (for zero-padding purposes when MR and NR are "swapped")
+// (3) KC must be a multiple of
+// (a) MR and
+// (b) NR (for triangular operations such as trmm and trsm).
+//
+
+#define BLIS_DEFAULT_MC_S 128
+#define BLIS_DEFAULT_KC_S 240
+#define BLIS_DEFAULT_NC_S 1288
+
+#define BLIS_DEFAULT_MC_D 68
+#define BLIS_DEFAULT_KC_D 240
+#define BLIS_DEFAULT_NC_D 844
+
+#define BLIS_DEFAULT_MC_C 68
+#define BLIS_DEFAULT_KC_C 240
+#define BLIS_DEFAULT_NC_C 844
+
+#define BLIS_DEFAULT_MC_Z 60
+#define BLIS_DEFAULT_KC_Z 136
+#define BLIS_DEFAULT_NC_Z 631
+
+#define BLIS_DEFAULT_4M_MC_C 68
+#define BLIS_DEFAULT_4M_KC_C 240
+#define BLIS_DEFAULT_4M_NC_C 844
+
+#define BLIS_DEFAULT_4M_MC_Z 60
+#define BLIS_DEFAULT_4M_KC_Z 136
+#define BLIS_DEFAULT_4M_NC_Z 628
+
+#define BLIS_DEFAULT_3M_MC_C 68
+#define BLIS_DEFAULT_3M_KC_C 160
+#define BLIS_DEFAULT_3M_NC_C 720
+
+#define BLIS_DEFAULT_3M_MC_Z 52
+#define BLIS_DEFAULT_3M_KC_Z 100
+#define BLIS_DEFAULT_3M_NC_Z 524
+
+// -- Register blocksizes --
+// For s, d and c these MR/NR pairs match the MRxNR suffix of the BLIS_?GEMM_UKERNEL names selected further down (4x8, 4x4, 2x4).
+#define BLIS_DEFAULT_MR_S 4
+#define BLIS_DEFAULT_NR_S 8 //4 //
+
+#define BLIS_DEFAULT_MR_D 4
+#define BLIS_DEFAULT_NR_D 4
+
+#define BLIS_DEFAULT_MR_C 2
+#define BLIS_DEFAULT_NR_C 4
+
+#define BLIS_DEFAULT_MR_Z 1 // NOTE(review): BLIS_ZGEMM_UKERNEL below is bli_zgemm_ukernel_2x2, yet MR_Z and NR_Z are 1 -- confirm which is intended.
+#define BLIS_DEFAULT_NR_Z 1
+
+
+// NOTE: If the micro-kernel, which is typically unrolled to a factor
+// of f, handles leftover edge cases (ie: when k % f > 0) then these
+// register blocksizes in the k dimension can be defined to 1.
+
+//#define BLIS_DEFAULT_KR_S 1
+//#define BLIS_DEFAULT_KR_D 1
+//#define BLIS_DEFAULT_KR_C 1
+//#define BLIS_DEFAULT_KR_Z 1
+
+// -- Cache blocksize extensions (for optimizing edge cases) --
+
+// NOTE: These cache blocksize "extensions" have the same constraints as
+// the corresponding default blocksizes above. When these values are
+// non-zero, blocksizes used at edge cases are extended (enlarged) if
+// such an extension would encompass the remaining portion of the
+// matrix dimension.
+
+//#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
+//#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
+//#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
+
+//#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
+//#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
+//#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
+
+//#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
+//#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
+//#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
+
+//#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
+//#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
+//#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
+
+// -- Register blocksize extensions (for packed micro-panels) --
+
+// NOTE: These register blocksize "extensions" determine whether the
+// leading dimensions used within the packed micro-panels are equal to
+// or greater than their corresponding register blocksizes above.
+
+//#define BLIS_EXTEND_MR_S 0
+//#define BLIS_EXTEND_NR_S 0
+
+//#define BLIS_EXTEND_MR_D 0
+//#define BLIS_EXTEND_NR_D 0
+
+//#define BLIS_EXTEND_MR_C 0
+//#define BLIS_EXTEND_NR_C 0
+
+//#define BLIS_EXTEND_MR_Z 0
+//#define BLIS_EXTEND_NR_Z 0
+
+
+
+// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
+
+
+
+
+// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
+
+
+
+
+// -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------
+
+// -- gemm -- (kernel-name suffix is MRxNR; NOTE(review): z selects a 2x2 kernel although BLIS_DEFAULT_MR_Z/NR_Z are 1 -- confirm)
+#define BLIS_SGEMM_UKERNEL bli_sgemm_ukernel_4x8
+#define BLIS_DGEMM_UKERNEL bli_dgemm_ukernel_4x4
+#define BLIS_CGEMM_UKERNEL bli_cgemm_ukernel_2x4
+#define BLIS_ZGEMM_UKERNEL bli_zgemm_ukernel_2x2
+
+// -- trsm-related -- (single precision only; NOTE(review): these are 4x4 kernels while BLIS_DEFAULT_NR_S is 8 -- confirm the panel width they expect)
+#define BLIS_SGEMMTRSM_U_UKERNEL bli_sgemmtrsm_u_ukernel_4x4
+#define BLIS_SGEMMTRSM_L_UKERNEL bli_sgemmtrsm_l_ukernel_4x4
+
+#define BLIS_STRSM_U_UKERNEL bli_strsm_u_ukernel_4x4
+#define BLIS_STRSM_L_UKERNEL bli_strsm_l_ukernel_4x4
+
+
+// -- LEVEL-1M KERNEL DEFINITIONS ----------------------------------------------
+
+// -- packm --
+#define BLIS_SPACKM_4XK_KERNEL bli_spackm_4xk_ukernel
+#define BLIS_SPACKM_8XK_KERNEL bli_spackm_8xk_ukernel
+
+// -- unpackm --
+
+
+
+
+// -- LEVEL-1F KERNEL DEFINITIONS ----------------------------------------------
+
+// -- axpy2v --
+
+// -- dotaxpyv --
+
+// -- axpyf --
+
+// -- dotxf --
+
+// -- dotxaxpyf --
+
+
+
+
+// -- LEVEL-1V KERNEL DEFINITIONS ----------------------------------------------
+
+// -- addv --
+
+// -- axpyv --
+
+// -- copyv --
+
+// -- dotv --
+
+// -- dotxv --
+
+// -- invertv --
+
+// -- scal2v --
+
+// -- scalv --
+
+// -- setv --
+
+// -- subv --
+
+// -- swapv --
+
+// adding packm micro kernel prototypes
+#include "bli_packm_cxk_ukernels.h"
+
+// Prototype for the 4x4 single-precision gemm micro-kernel. BLIS_SGEMM_UKERNEL above selects the 4x8 kernel, so the 4x4 one is declared here by hand because the gemmtrsm micro-kernel uses it.
+void bli_sgemm_ukernel_4x4(
+ dim_t k,
+ float* restrict alpha,
+ float* restrict a,
+ float* restrict b,
+ float* restrict beta,
+ float* restrict c, inc_t rs_c, inc_t cs_c,
+ auxinfo_t* data
+ );
+
+#endif
+
diff --git a/blis/config/shannon/corepack_regs.h b/blis/config/shannon/corepack_regs.h
--- /dev/null
@@ -0,0 +1,82 @@
+/**
+ Structures for CorePack registers.
+*/
+
+#ifndef RT_COREPACK_REGS_H
+#define RT_COREPACK_REGS_H
+
+#include <stdint.h>
+
+/**
+ Memory-mapped IDMA register block, placed at 0x0182 0000. Trailing comments are byte offsets (hex) from that base; the resN members are padding that keeps the named registers at those offsets and rounds the block up to 0x10000 bytes.
+*/
+struct corepack_idma_regs {
+ uint32_t idma0_stat; /* 0000 */
+ uint32_t idma0_mask; /* 0004 */
+ uint32_t idma0_source; /* 0008 */
+ uint32_t idma0_dest; /* 000c */
+ uint32_t idma0_count; /* 0010 */
+ uint32_t res1[59]; /* 0014 - 00ff (padding) */
+ uint32_t idma1_stat; /* 0100 */
+ uint32_t res2; /* 0104 (reserved; channel 1 has no mask register in this layout) */
+ uint32_t idma1_source; /* 0108 */
+ uint32_t idma1_dest; /* 010c */
+ uint32_t idma1_count; /* 0110 */
+ uint32_t res3[16315]; /* 0114 - ffff (padding) */
+};
+
+/**
+ Memory-mapped cache control register block, placed at 0x0184 0000. Trailing comments are byte offsets (hex) from that base; resN members pad the reserved gaps so the block spans exactly 0x10000 bytes. Member names presumably mirror the TI CorePac register names -- confirm against the CorePac user guide.
+*/
+struct corepack_cache_regs {
+ uint32_t l2cfg; /* 0000 */
+ uint32_t res1[7]; /* 0004 - 001f */
+ uint32_t l1pcfg; /* 0020 */
+ uint32_t l1pcc; /* 0024 */
+ uint32_t res2[6]; /* 0028 - 003f */
+ uint32_t l1dcfg; /* 0040 */
+ uint32_t l1dcc; /* 0044 */
+ uint32_t res3[4078]; /* 0048 - 3fff */
+ uint32_t l2wbar; /* 4000 */
+ uint32_t l2wwc; /* 4004 */
+ uint32_t res4[2]; /* 4008 - 400f */
+ uint32_t l2wibar; /* 4010 */
+ uint32_t l2wiwc; /* 4014 */
+ uint32_t l2ibar; /* 4018 */
+ uint32_t l2iwc; /* 401c */
+ uint32_t l1pibar; /* 4020 */
+ uint32_t l1piwc; /* 4024 */
+ uint32_t res5[2]; /* 4028 - 402f */
+ uint32_t l1dwibar; /* 4030 */
+ uint32_t l1dwiwc; /* 4034 */
+ uint32_t res6[2]; /* 4038 - 403f */
+ uint32_t l1dwbar; /* 4040 */
+ uint32_t l1dwwc; /* 4044 */
+ uint32_t l1dibar; /* 4048 */
+ uint32_t l1diwc; /* 404c */
+ uint32_t res7[1004]; /* 4050 - 4fff */
+ uint32_t l2wb; /* 5000 */
+ uint32_t l2wbinv; /* 5004 */
+ uint32_t l2inv; /* 5008 */
+ uint32_t res8[7]; /* 500c - 5027 */
+ uint32_t l1pinv; /* 5028 */
+ uint32_t res9[5]; /* 502c - 503f */
+ uint32_t l1dwb; /* 5040 */
+ uint32_t l1dwbinv; /* 5044 */
+ uint32_t l1dinv; /* 5048 */
+ uint32_t res10[3053]; /* 504c - 7fff */
+ uint32_t mar[256]; /* 8000 - 83ff */
+ uint32_t res11[7936]; /* 8400 - ffff */
+};
+
+struct corepack_regs { /* whole register window; trailing comments are absolute address ranges when the struct is based at 0x0180 0000 */
+ uint32_t res1[32768]; /* 0180 0000 - 0181 ffff (padding up to the IDMA block) */
+ struct corepack_idma_regs idma_regs; /* 0182 0000 - 0182 ffff */
+ uint32_t res2[16384]; /* 0183 0000 - 0183 ffff (padding up to the cache block) */
+ struct corepack_cache_regs cache_regs; /* 0184 0000 - 0184 ffff */
+};
+/* Fixed base pointer to the register window at 0x01800000; volatile-qualified so every access is a real load/store. Being static in a header, each including file gets its own copy of the pointer. */
+static volatile struct corepack_regs * const corepack_regs =
+ (struct corepack_regs *)(0x01800000ul);
+
+#endif
diff --git a/blis/config/shannon/edmamgr.h b/blis/config/shannon/edmamgr.h
--- /dev/null
@@ -0,0 +1,104 @@
+#ifndef _EdmaMgr_h
+#define _EdmaMgr_h
+#include <stdint.h>
+
+typedef void *EdmaMgr_Handle;
+
+int32_t EdmaMgr_init (int32_t proc_id, void* edma3_config);
+
+EdmaMgr_Handle EdmaMgr_alloc (int32_t max_linked_transfers);
+
+int32_t EdmaMgr_free (EdmaMgr_Handle h);
+
+void EdmaMgr_wait (EdmaMgr_Handle h);
+
+int32_t EdmaMgr_copy1D1D (EdmaMgr_Handle h,
+ void *restrict src,
+ void *restrict dst,
+ int32_t num_bytes);
+
+int32_t EdmaMgr_copy1D2D (EdmaMgr_Handle h,
+ void *restrict src,
+ void *restrict dst,
+ int32_t num_bytes,
+ int32_t num_lines,
+ int32_t pitch);
+
+int32_t EdmaMgr_copy2D1D (EdmaMgr_Handle h,
+ void *restrict src,
+ void *restrict dst,
+ int32_t num_bytes,
+ int32_t num_lines,
+ int32_t pitch);
+
+int32_t EdmaMgr_copy2D2D (EdmaMgr_Handle h,
+ void *restrict src,
+ void *restrict dst,
+ int32_t num_bytes,
+ int32_t num_lines,
+ int32_t pitch);
+
+int32_t EdmaMgr_copy2D2DSep (EdmaMgr_Handle h,
+ void *restrict src,
+ void *restrict dst,
+ int32_t num_bytes,
+ int32_t num_lines,
+ int32_t src_pitch,
+ int32_t dst_pitch);
+
+int32_t EdmaMgr_copy1D1DLinked (EdmaMgr_Handle h,
+ void *restrict src[],
+ void *restrict dst[],
+ int32_t num_bytes[],
+ int32_t num_transfers);
+
+int32_t EdmaMgr_copy1D2DLinked (EdmaMgr_Handle h,
+ void *restrict src[],
+ void *restrict dst[],
+ int32_t num_bytes[],
+ int32_t num_lines[],
+ int32_t pitch[],
+ int32_t num_transfers);
+
+int32_t EdmaMgr_copy2D1DLinked (EdmaMgr_Handle h,
+ void *restrict src[],
+ void *restrict dst[],
+ int32_t num_bytes[],
+ int32_t num_lines[],
+ int32_t pitch[],
+ int32_t num_transfers);
+
+int32_t EdmaMgr_copy2D2DLinked (EdmaMgr_Handle h,
+ void *restrict src[],
+ void *restrict dst[],
+ int32_t num_bytes[],
+ int32_t num_lines[],
+ int32_t pitch[],
+ int32_t num_transfers);
+
+int32_t EdmaMgr_copy2D2DSepLinked(EdmaMgr_Handle h,
+ void *restrict src[],
+ void *restrict dst[],
+ int32_t num_bytes[],
+ int32_t num_lines[],
+ int32_t src_pitch[],
+ int32_t dst_pitch[],
+ int32_t num_transfers);
+
+int32_t EdmaMgr_copyFast (EdmaMgr_Handle h,
+ void *restrict src,
+ void *restrict dst);
+
+int32_t EdmaMgr_copyLinkedFast (EdmaMgr_Handle h,
+ void *restrict src[],
+ void *restrict dst[],
+ int32_t num_transfers);
+/* Status codes (presumably the int32_t return values of the EdmaMgr_* calls above): 0 is success, negatives are errors. Negative values are parenthesized so they expand safely inside larger expressions. */
+#define EdmaMgr_SUCCESS 0
+#define EdmaMgr_ERROR_INVARG (-1)
+#define EdmaMgr_ERROR_INVCFG (-2)
+#define EdmaMgr_ERROR_RMANINIT (-3)
+#define EdmaMgr_ERROR_INVHANDLE (-4)
+#define EdmaMgr_ERROR_FREE (-5)
+
+#endif
diff --git a/blis/config/shannon/idma.h b/blis/config/shannon/idma.h
--- /dev/null
@@ -0,0 +1,70 @@
+/**
+ Simple IDMA helper functions.
+*/
+
+/*
+
+Copyright (c) 2012 Kungliga Tekniska Högskolan
+(Royal Institute of Technology, Stockholm, Sweden).
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGE.
+
+*/
+
+#ifndef RT_IDMA_H
+#define RT_IDMA_H
+
+#include "corepack_regs.h"
+/* Program IDMA channel 1: pack size, fill, inter and priority into one count word, then write the source, destination and count registers. */
+static inline void
+idma1_setup(void *dest,
+ const void *source,
+ unsigned size,
+ unsigned fill,
+ unsigned inter,
+ unsigned priority)
+{
+ uint32_t count;
+ /* Fields are OR-ed together unmasked: size in the low bits, fill from bit 16, inter at bit 28, priority from bit 29; callers must keep each value inside its field. */
+ count = size | (fill << 16) | (inter << 28) | (priority << 29);
+ /* Count is written last. NOTE(review): presumably the count write is what starts the transfer -- confirm against the CorePac IDMA documentation. */
+ corepack_regs->idma_regs.idma1_source = (uint32_t)source; /* pointer value is truncated to 32 bits */
+ corepack_regs->idma_regs.idma1_dest = (uint32_t)dest;
+ corepack_regs->idma_regs.idma1_count = count;
+}
+/* Return the raw contents of the IDMA channel 1 status register. Declared with (void) so the function has a real prototype. */
+static inline uint32_t
+idma1_status(void)
+{
+ return corepack_regs->idma_regs.idma1_stat;
+}
+/* Return nonzero when the channel 1 status register reads zero (presumably: no transfer active or pending -- confirm), else 0. */
+static inline int
+idma1_done(void)
+{
+ return idma1_status() == 0;
+}
+
+#endif
diff --git a/blis/config/shannon/kernels b/blis/config/shannon/kernels
--- /dev/null
@@ -0,0 +1 @@
+../../kernels/c66x
\ No newline at end of file
diff --git a/blis/config/shannon/make_defs.mk b/blis/config/shannon/make_defs.mk
--- /dev/null
@@ -0,0 +1,156 @@
+#!/bin/bash
+#
+# BLIS
+# An object-based framework for developing high-performance BLAS-like
+# libraries.
+#
+# Copyright (C) 2014, The University of Texas
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+# - Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# - Neither the name of The University of Texas nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#
+
+# Only include this block of code once.
+ifndef MAKE_DEFS_MK_INCLUDED
+MAKE_DEFS_MK_INCLUDED := yes
+
+
+TI_INSTALL_DIR?=/usr/src/dsp
+
+PATH:=$(TI_OCL_CGT_INSTALL)/bin:$(PATH)
+# FIND_DSP_PKG(var,dir-glob,subdir), used via eval/call: exports var, defaulting it (if unset) to the last sorted TI_INSTALL_DIR/dir-glob match that contains subdir; errors out if var is empty or var/subdir is missing; otherwise prints var.
+define FIND_DSP_PKG
+ export $(1)?=$$(patsubst %/$(3),%,$$(lastword $$(sort $$(wildcard $$(TI_INSTALL_DIR)/$(2)/$(3)))))
+ ifeq ($$($(1)),)
+ $$(error ERROR - $(1) is not defined and could not be found in $(TI_INSTALL_DIR)/ )
+ else
+ ifeq ($$(wildcard $$($(1))/$(3)),)
+ $$(error ERROR - "$(1) = $$($(1))" Is not valid!)
+ endif
+ endif
+ $$(info Using $(1) = $$($(1)))
+endef
+
+UNAME_M :=$(shell uname -m)
+# The pdk_keystone2* package is only looked up when the "uname -m" string contains "86".
+ifneq (,$(findstring 86, $(UNAME_M)))
+$(eval $(call FIND_DSP_PKG,C6636_PDK_DIR,pdk_keystone2*,packages))
+endif
+# The remaining packages are located regardless of the host machine type.
+$(eval $(call FIND_DSP_PKG,FC_DIR,framework_components*,packages))
+$(eval $(call FIND_DSP_PKG,OMP_DIR,openmp_dsp*,packages))
+$(eval $(call FIND_DSP_PKG,LIBARCH_DIR,libarch*,packages))
+$(eval $(call FIND_DSP_PKG,XDAIS_DIR,xdais*,packages))
+$(eval $(call FIND_DSP_PKG,XDC_DIR,xdc*,packages))
+
+
+
+#
+# --- Build definitions --------------------------------------------------------
+#
+
+# Variables corresponding to other configure-time options.
+BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
+BLIS_ENABLE_STATIC_BUILD := yes
+BLIS_ENABLE_DYNAMIC_BUILD := no
+
+
+
+#
+# --- Utility program definitions ----------------------------------------------
+#
+
+SH := /bin/sh
+MV := mv
+MKDIR := mkdir -p
+RM_F := rm -f
+RM_RF := rm -rf
+SYMLINK := ln -sf
+FIND := find
+GREP := grep
+XARGS := xargs
+RANLIB := ranlib
+INSTALL := install -c
+
+# Used to refresh CHANGELOG.
+GIT := git
+GIT_LOG := $(GIT) log --decorate
+
+
+
+#
+# --- Development tools definitions --------------------------------------------
+#
+
+# --- Determine the C compiler and related flags ---
+CC := cl6x
+# Request the POSIX.1-2001 feature set (_POSIX_C_SOURCE=200112L).
+# NOTE: This is needed to enable posix_memalign().
+CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
+CMISCFLAGS := --c99
+#CMISCFLAGS += -I$(TI_OCL_CGT_INSTALL)/include
+CMISCFLAGS += -I$(OMP_DIR)/packages/ti/runtime/openmp
+CMISCFLAGS += -I$(FC_DIR)/packages
+CMISCFLAGS += -I$(XDC_DIR)/packages
+CMISCFLAGS += -I$(XDAIS_DIR)/packages
+CMISCFLAGS += -I$(LIBARCH_DIR)/packages
+CMISCFLAGS += -I$(LINUX_DEVKIT_ROOT)/usr/share/ti/cgt-c6x/include
+CMISCFLAGS += -I$(TARGET_ROOTDIR)/usr/share/ti/opencl
+# Host-dependent include path: hosts whose "uname -m" contains "86" add the PDK packages, all others add the devkit's usr/include.
+ifneq (,$(findstring 86, $(UNAME_M)))
+CMISCFLAGS += -I$(C6636_PDK_DIR)/packages
+$(info Using $(UNAME_M))
+else
+CMISCFLAGS += -I$(LINUX_DEVKIT_ROOT)/usr/include
+$(info Using $(UNAME_M))
+endif
+CMISCFLAGS += -mv6600 --use_g2 --omp -DDEVICE_K2H -DLIB_OPENCL #-std=c99 # -fopenmp -pg
+
+# NOTE(review): despite its name, CDBGFLAGS carries cl6x-specific options (-s -k -mw) -- confirm their meaning in the compiler manual.
+CDBGFLAGS := -s -k -mw
+CWARNFLAGS :=
+COPTFLAGS := -O2
+CKOPTFLAGS := $(COPTFLAGS)
+CVECFLAGS := #-msse3 -march=native # -mfpmath=sse
+# NOTE(review): ":=" expands BASE_OBJ_PATH right here, so it must already be set when this file is included -- confirm.
+COMPILER_OUTPUT_FLAG := -fs=$(BASE_OBJ_PATH) -fe
+
+# Aggregate all of the flags into multiple groups: one for standard
+# compilation, and one for each of the supported "special" compilation
+# modes.
+CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+
+# --- Determine the archiver and related flags ---
+AR := ar6x
+ARFLAGS := -ur
+
+# --- Determine the linker and related flags ---
+LINKER := $(CC)
+LDFLAGS := #-lm
+
+# end of ifndef MAKE_DEFS_MK_INCLUDED conditional block
+endif
index da5ce77d8a6de486a1d28c026743b9003dd0b9cb..f739d56fcdff38e6e247dd089f30539c4ef03c76 100644 (file)
// Query dimension in partitioning direction.
m_trans = bli_obj_length_after_trans( *a );
dim_t start, end;
- bli_get_range( thread, 0, m_trans,
- bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
- &start, &end );
+// bli_get_range( thread, 0, m_trans,
+// bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
+// &start, &end );
+
+ bli_get_range_t2b( thread, 0, m_trans,
+ bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
+ &start, &end );
// Partition along the m dimension.
#ifdef BLIS_ENABLE_C66X_EDMA
index 1738b125707fd70850bdb5cf89516206885988bc..8da0f33c5f39ea547c61216f673e20127564270f 100644 (file)
// Query dimension in partitioning direction.
n_trans = bli_obj_width_after_trans( *b );
dim_t start, end;
- bli_get_range( thread, 0, n_trans,
- bli_determine_reg_blocksize( b, cntl_blocksize( cntl ) ),
- &start, &end );
+// bli_get_range( thread, 0, n_trans,
+// bli_determine_reg_blocksize( b, cntl_blocksize( cntl ) ),
+// &start, &end );
+
+ bli_get_range_l2r( thread, 0, n_trans,
+ bli_determine_reg_blocksize( b, cntl_blocksize( cntl ) ),
+ &start, &end );
// Partition along the n dimension.
for ( i = start; i < end; i += b_alg )
index 05fb033532e2da57908e93401b147cf601681ee5..acd632ee57254c7a87ce9bae3c94d5892a9afe25 100644 (file)
BLIS_UPANEL_B_ALIGN_SIZE_Z, 0 );
- // Attach the register blksz_t objects as sub-blocksizes to the cache
+ // Attach the register blksz_t objects as blocksize multiples to the cache
// blksz_t objects.
bli_blksz_obj_attach_to( gemm_mr, gemm_mc );
bli_blksz_obj_attach_to( gemm_nr, gemm_nc );
bli_blksz_obj_attach_to( gemm_kr, gemm_kc );
+ //bli_blksz_obj_attach_mult_to( gemm_mr, gemm_mc );
+ //bli_blksz_obj_attach_mult_to( gemm_nr, gemm_nc );
+ //bli_blksz_obj_attach_mult_to( gemm_kr, gemm_kc );
+
+
+ // Attach the mr and nr blksz_t objects to each cache blksz_t object.
+ // The primary example of why this is needed relates to nudging kc.
+ // In hemm, symm, trmm, or trmm3, we need to know both mr and nr,
+ // since the multiple we target in nudging depends on whether the
+ // structured matrix is on the left or the right.
+ //bli_blksz_obj_attach_mr_nr_to( gemm_mr, gemm_nr, gemm_mc );
+ //bli_blksz_obj_attach_mr_nr_to( gemm_mr, gemm_nr, gemm_nc );
+ //bli_blksz_obj_attach_mr_nr_to( gemm_mr, gemm_nr, gemm_kc );
// Create function pointer object for each datatype-specific gemm
// micro-kernel.
gemm_packa_cntl,
NULL, //gemm_packb_cntl, //
NULL,
- gemm_dmaa_cntl, //NULL, //
+ BLIS_GEMM_DMAA_CNTL, //gemm_dmaa_cntl, //
NULL, //gemm_dmab_cntl, //
NULL,
gemm_cntl_bp_ke,
gemm_packb_cntl, //NULL, //
NULL,
NULL,
- gemm_dmab_cntl, //NULL, //
+ BLIS_GEMM_DMAB_CNTL, // gemm_dmab_cntl, //NULL, //
NULL,
gemm_cntl_op_bp,
NULL );
index 060c5bddf76d209c11c59c3b6b4543de15be2f0c..712dec8a9d91eee028767bb56379d8e7223702f3 100644 (file)
#include "blis.h"
+#define CLOCK 1.2 // In GHz
+#define OPS_PER_CYCLE_S 16
+#define OPS_PER_CYCLE_D 4
+#define NUM_THREADS 8
+
+#ifdef BLIS_ENABLE_PROFILE
+ profile_data_t *bli_gemm_profile_data;
+#endif
void bli_gemm_front( obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t b_local;
obj_t c_local;
+#ifdef BLIS_ENABLE_PROFILE
+ dim_t m_profile, n_profile, k_profile;
+#endif
+
+
+
// Check parameters.
if ( bli_error_checking_is_enabled() )
bli_gemm_check( alpha, a, b, beta, c );
gemm_thrinfo_t** infos = bli_create_gemm_thrinfo_paths();
dim_t n_threads = thread_num_threads( infos[0] );
+#ifdef BLIS_ENABLE_PROFILE
+ m_profile = bli_obj_length( *c );
+ n_profile = bli_obj_width( *c );
+ k_profile = bli_obj_width_after_trans( *a );
+ bli_gemm_profile_data = bli_profile_data_init(BLIS_MAX_NUM_THREADS*BLIS_PROFILE_NUM_REPORTS); //One struct for each thread for each variant
+#endif
+
// Invoke the internal back-end.
bli_level3_thread_decorator( n_threads,
(level3_int_t) bli_gemm_int,
bli_gemm_thrinfo_free_paths( infos, n_threads );
+#ifdef BLIS_ENABLE_PROFILE
+ bli_profile_data_print (bli_gemm_profile_data,
+ m_profile, n_profile, k_profile,
+ bli_obj_execution_datatype( *c ), 2, n_threads);
+
+ bli_profile_data_free(bli_gemm_profile_data);
+
+#endif
+
}
index c46b814b312d290ef7dde5603d72508c5983d484..b09d5288fec222022bbefa67bbb84895154a3f73 100644 (file)
#include "blis.h"
-#ifdef BLIS_ENABLE_C66X_BUILD
-//#define BLIS_ENABLE_CYCLE_COUNT_BLIS_GEMM_INT
-#endif
-
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)( obj_t* a,
varnum_t n;
impl_t i;
FUNCPTR_T f;
-#ifdef BLIS_ENABLE_CYCLE_COUNT_BLIS_GEMM_INT
- volatile int counter_start;
- volatile int counter_end;
+#if defined(BLIS_ENABLE_PROFILE)
+ volatile uint64_t counter_start;
+ volatile uint64_t counter_end;
+ extern profile_data_t *bli_gemm_profile_data;
+ dim_t m_var, k_var, n_var;
+ dim_t index;
#endif
// Extract the variant number and implementation type.
// Index into the variant array to extract the correct function pointer.
f = vars[n][i];
- //printf("gemm_int %d %d\n", n, i);
// Invoke the variant.
-#ifdef BLIS_ENABLE_CYCLE_COUNT_BLIS_GEMM_INT
- TSCL = 0;
- counter_start = TSCL;
+#if defined(BLIS_ENABLE_PROFILE)
+ m_var = bli_obj_length( c_local );
+ k_var = bli_obj_width_after_trans( a_local );
+ n_var = bli_obj_width( c_local );
+
+#if defined(BLIS_ENABLE_C66X_BUILD)
+ TSCL = 0;
+ counter_start = __clock64();
+#else
+ counter_start = (uint64_t) (bli_clock()*1.2e9);
+#endif
#endif
f( &a_local,
&b_local,
&c_local,
cntl,
- thread );
-#ifdef BLIS_ENABLE_CYCLE_COUNT_BLIS_GEMM_INT
- counter_end = TSCL;
- if(lib_get_coreID()==0) printf("xxxxx bli_gemm_int %d %d %d\n", n, i, counter_end-counter_start);
+ thread );
+
+#if defined(BLIS_ENABLE_PROFILE)
+#if defined(BLIS_ENABLE_C66X_BUILD)
+ counter_end = __clock64();
+#else
+ counter_end = (uint64_t) (bli_clock()*1.2e9);
+#endif
+ bli_profile_get_index(n, i, index);
+ bli_profile_data_update(bli_gemm_profile_data[index], (counter_end-counter_start), m_var*k_var*n_var);
#endif
}
index b76494cb44e60ebd1fadf008df5f968211934ffc..1ffd0a8eee489bb843ebe49d6ea7de8b57765945 100644 (file)
#define FUNCPTR_T gemm_fp
-/* move this memory allocation to memory pool on L2 */
-/* buffer size needed is max(NR*sizeof(ctype)*MC*3) */
-/* c data movement currently works for rs_c=1 only; */
-/* need to carry this for cs_c=1 as well ensuring it works */
-/* for non unity rs_c and cs_c */
-/*#define CNEWBUFSIZE (104*8*4)
-
-#pragma DATA_SECTION(cNew,".mem_l2")
-#pragma DATA_ALIGN(cNew,8)
-char cNew[3*CNEWBUFSIZE];
-*/
+#ifdef BLIS_ENABLE_C66X_MEM_POOLS
+#define BLIS_ENABLE_C66X_C_L2
+#endif
+
typedef void (*FUNCPTR_T)(
pack_t schema_a,
pack_t schema_b,
}
#ifdef BLIS_ENABLE_C66X_MEM_POOLS
-
-#if defined (BLIS_ENABLE_C66X_EDMA) && defined (BLIS_ENABLE_C66X_IDMA)
+#ifdef BLIS_ENABLE_C66X_C_L2
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname, ukrtype ) \
lib_emt_Handle emt_handle_c1 = NULL; \
\
/*For DSP timing*/ \
- /*volatile int counter_start; \
- volatile int counter_end;*/ \
+ volatile uint64_t counter_start_ker, counter_start_nr, counter_start_mr; \
+ volatile uint64_t counter_end_ker, counter_end_nr, counter_end_mr; \
+ extern profile_data_t *bli_gemm_profile_data; \
/*
Assumptions/assertions:
rs_a == 1
bli_mem_acquire_m( k*MR*sizeof(ctype), BLIS_BUFFER_FOR_A_BLOCK_L1, &a2_L1_mem); \
a2_L1 = bli_mem_buffer( &a2_L1_mem ); \
\
- /*Acquiring buffers for C (MC_x_NR) in L2 */\
+ /*Acquiring buffers for C (MC_x_NR) in L2 */\
bli_mem_acquire_m( cs_c11*NR*sizeof(ctype), BLIS_BUFFER_FOR_C_PANEL_L2, &c0_L2_mem); \
cNew0 = bli_mem_buffer( &c0_L2_mem ); \
\
} \
\
/*For DSP Timing*/ \
- /*TSCL=0; \
- counter_start = TSCL;*/ \
/* initiate first c transfer */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_nr = __clock64(); \
+ } \
n_cur = ( bli_is_not_edge_f( jr_thread_id, n_iter, n_left ) ? NR : n_left ); \
if(cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
- lib_emt_copy2D2D(emt_handle_c0, c_cast+jr_thread_id*cstep_c, \
+ { \
+ lib_emt_copy2D2D(emt_handle_c0, c_cast+jr_thread_id*cstep_c, \
cNew1, m*sizeof(ctype), /*(m_iter-ir_thread_id)*sizeof(ctype)*MR,*/ \
n_cur, cs_c*sizeof(ctype), cs_c11*sizeof(ctype)); \
- else \
+ } \
+ else \
{ \
- dim_t ii; \
- ctype *ptr_source; \
- ctype *ptr_dest; \
- ptr_source = c_cast+jr_thread_id*cstep_c; \
- ptr_dest = cNew1; \
- for(ii = 0; ii < n_cur; ii++) \
- { \
- memcpy(ptr_dest, ptr_source, m*sizeof(ctype)); \
- ptr_source += cs_c; \
- ptr_dest += cs_c11; \
- } \
+ dim_t ii; \
+ ctype *ptr_source; \
+ ctype *ptr_dest; \
+ ptr_source = c_cast+jr_thread_id*cstep_c; \
+ ptr_dest = cNew1; \
+ for(ii = 0; ii < n_cur; ii++) \
+ { \
+ memcpy(ptr_dest, ptr_source, m*sizeof(ctype)); \
+ ptr_source += cs_c; \
+ ptr_dest += cs_c11; \
+ } \
} \
+ lib_emt_wait(emt_handle_c0); \
\
for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \
{ \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
- n_next = ( bli_is_not_edge_f( j+jr_num_threads, n_iter, n_left ) ? NR : n_left ); \
\
if (j == jr_thread_id) \
{ \
lib_emt_copy1D1D(emt_handle_b, b1, b1_L1, k*NR*sizeof(ctype)); \
lib_imt_copy(a1 = a_cast + ir_thread_id * rstep_a, a2_L1, k*MR*sizeof(ctype));\
} \
- /* wait for previous c transfer to complete and initiate next transfer */ \
+\
+ n_next = ( bli_is_not_edge_f( j+jr_num_threads, n_iter, n_left ) ? NR : n_left ); \
+ /* wait for previous c transfer to complete and initiate next transfer */ \
lib_emt_wait(emt_handle_c0); \
+\
if(j < (n_iter-jr_num_threads)) /* no transfer for last iteration */ \
- {\
- if (cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
+ { \
+ if (cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
+ {\
lib_emt_copy2D2D(emt_handle_c0, c1+jr_num_threads*cstep_c, \
- cNew0, m*sizeof(ctype), /*(m_iter-ir_thread_id)*sizeof(ctype)*MR,*/ \
- n_next, cs_c*sizeof(ctype), \
- cs_c11*sizeof(ctype)); \
- else \
- { \
- dim_t ii; \
- ctype *ptr_source; \
- ctype *ptr_dest; \
- ptr_source = c1+jr_num_threads*cstep_c; \
- ptr_dest = cNew0; \
- for(ii = 0; ii < n_next; ii++) \
- { \
- memcpy(ptr_dest, ptr_source, m*sizeof(ctype)); \
- ptr_source += cs_c; \
- ptr_dest += cs_c11; \
- } \
- } \
- }\
+ cNew0, m*sizeof(ctype), /*(m_iter-ir_thread_id)*sizeof(ctype)*MR,*/ \
+ n_next, cs_c*sizeof(ctype), \
+ cs_c11*sizeof(ctype)); \
+ /*Testing if performance improves with fast API*/ \
+ /*lib_emt_copyFast(emt_handle_c0, c1+jr_num_threads*cstep_c, \
+ cNew0);*/ \
+ }\
+ else \
+ { \
+ dim_t ii; \
+ ctype *ptr_source; \
+ ctype *ptr_dest; \
+ ptr_source = c1+jr_num_threads*cstep_c; \
+ ptr_dest = cNew0; \
+ for(ii = 0; ii < n_next; ii++) \
+ { \
+ memcpy(ptr_dest, ptr_source, m*sizeof(ctype)); \
+ ptr_source += cs_c; \
+ ptr_dest += cs_c11; \
+ } \
+ } \
+ } \
/* Loop over the m dimension (MR rows at a time). */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_mr = __clock64(); \
+ } \
for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \
{ \
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
- /*c11 = c1 + i * rstep_c;*/ \
- c11 = cNew1 + i * rstep_c11; \
+ c11 = cNew1 + i * rstep_c11; \
+ /*c11 = c1 + i * rstep_c; */\
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
lib_emt_wait(emt_handle_b); \
} \
/* Handle interior and edge cases separately. */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_ker = __clock64(); \
+ } \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr_cast( k, \
- alpha_cast, \
- a1_L1, /*a1_L1,*/ \
- b1_L1, /*b1_L1,*/ \
- beta_cast, \
- c11, rs_c11, cs_c11, /*rs_c, cs_c,*/ \
- &aux ); \
+ alpha_cast, \
+ a1_L1, /*a1_L1,*/ \
+ b1_L1, /*b1_L1,*/ \
+ beta_cast, \
+ c11, rs_c11, cs_c11, /*rs_c, cs_c,*/ \
+ &aux ); \
} \
else \
{ \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c11, cs_c11 /*rs_c, cs_c */); \
- } \
+ } \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_ker = __clock64(); \
+ bli_profile_data_update(bli_gemm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_KER_LOOP_IND],\
+ (counter_end_ker-counter_start_ker), 2*m_cur*k*n_cur); \
+ } \
if(!bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) && bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads )) \
{ \
/* Start the EDMA of the next panel (K*NR) of B*/ \
lib_emt_copy1D1D(emt_handle_b, b2, b1_L1, k*NR*sizeof(ctype)); \
+ /*Testing if performance improves with fast API*/ \
+ /*lib_emt_copyFast(emt_handle_b, b2, b1_L1);*/ \
} \
} \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_mr = __clock64(); \
+ bli_profile_data_update(bli_gemm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_IR_LOOP_IND],\
+ (counter_end_mr-counter_start_mr), (uint64_t) 2*m*k*n_cur); \
+ } \
/* circularly shift buffers */ \
cNewTemp = cNew0; \
cNew0 = cNew2; \
} \
\
} \
- /*For DSP timing*/ \
- /*counter_end=TSCL; \
- if (lib_get_coreID () == 0) */\
- /*printf("%d %d %d\t%d\n",n_iter, m_iter, k, counter_end-counter_start); */\
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_nr = __clock64(); \
+ bli_profile_data_update(bli_gemm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_JR_LOOP_IND], \
+ (counter_end_nr-counter_start_nr), 2*m*k*n); \
+ } \
/* Loop over the n dimension (NR columns at a time). */ \
bli_mem_release( &c2_L2_mem ); \
bli_mem_release( &c1_L2_mem ); \
INSERT_GENTFUNC_BASIC( gemm_ker_var2, gemm_ukr_t )
-#else //If EDMA IDMA is not enabled use memcpy
+#else //No Data movement for C
+
+
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname, ukrtype ) \
\
void PASTEMAC(ch,varname)( \
- pack_t schema_a, \
+ pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
) \
{ \
/* Cast the micro-kernel address to its function pointer type. */ \
- PASTECH(ch,ukrtype) gemm_ukr_cast = gemm_ukr; \
+ PASTECH(ch,ukrtype) gemm_ukr_cast = (PASTECH(ch,ukrtype) ) gemm_ukr; \
\
/* Temporary C buffer for edge cases. */ \
ctype ct[ PASTEMAC(ch,maxmr) * \
dim_t i, j; \
dim_t m_cur; \
dim_t n_cur; \
+ dim_t n_next; \
inc_t rstep_a; \
inc_t cstep_b; \
- inc_t rstep_c, cstep_c; \
+ inc_t rstep_c; \
+ inc_t cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
auxinfo_t aux; \
+\
+ inc_t rstep_c11, rs_c11, cs_c11; \
\
gemm_thrinfo_t* caucus = gemm_thread_sub_gemm( thread ); \
dim_t jr_num_threads = thread_n_way( thread ); \
\
mem_t a1_L1_mem, a2_L1_mem; \
ctype *a1_L1, *a2_L1, *temp; \
+\
+ mem_t c0_L2_mem, c1_L2_mem, c2_L2_mem; \
+ ctype *cNew0, *cNew1, *cNew2, *cNewTemp; \
+ /*EDMA Declarations */ \
+\
+ lib_emt_Handle emt_handle_b = NULL; \
+\
+ /*For DSP timing*/ \
+ volatile uint64_t counter_start_ker, counter_start_nr, counter_start_mr; \
+ volatile uint64_t counter_end_ker, counter_end_nr, counter_end_mr; \
+ extern profile_data_t *bli_gemm_profile_data; \
/*
Assumptions/assertions:
rs_a == 1
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
-\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
+\
+ rstep_c11 = MR; /*stride to get to next panel of MRxNR in a panel of MCxNR*/\
+ rs_c11 = 1; \
+ cs_c11 = (m%2 == 0) ? m : m+1 ; \
\
istep_a = PACKMR * k; \
istep_b = PACKNR * k; \
+\
+ /*printf("n %d m %d n_left %d m_left = %d rstep_c = %d, cstep_c = %d rs_c = %d, cs_c = %d, cs_c11 = %d, NR = %d\n", n, m, n_left, m_left, rstep_c, cstep_c, rs_c, cs_c, (m%2 == 0) ? m : m+1, NR ); */\
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, aux ); \
/*Acquiring a buffer for B in L1*/ \
bli_mem_acquire_m( k*NR*sizeof(ctype), BLIS_BUFFER_FOR_B_PANEL_L1, &b1_L1_mem); \
b1_L1 = bli_mem_buffer( &b1_L1_mem ); \
- b1_L1 = (ctype *) (b1_L1_mem.buf + PASTEMAC(ch,bank)); \
+ /* Type casting pointer to char to avoid warning "arithmetic on pointer to void or function type"*/ \
+ b1_L1 = (ctype *) ((char *) b1_L1_mem.buf + PASTEMAC(ch,bank)); \
\
/*Acquiring a buffer for A in L1*/ \
- /*printf("Acquire A k %d, MR %d, size of ctype %d, A size requested %d\n", k, MR, sizeof(ctype), k*MR*sizeof(ctype));*/ \
bli_mem_acquire_m( k*MR*sizeof(ctype), BLIS_BUFFER_FOR_A_BLOCK_L1, &a1_L1_mem); \
a1_L1 = bli_mem_buffer( &a1_L1_mem ); \
- a1_L1 = a1_L1; \
\
bli_mem_acquire_m( k*MR*sizeof(ctype), BLIS_BUFFER_FOR_A_BLOCK_L1, &a2_L1_mem); \
a2_L1 = bli_mem_buffer( &a2_L1_mem ); \
\
- /* Loop over the n dimension (NR columns at a time). */ \
+ /*Acquiring buffers for C (MC_x_NR) in L2 */\
+ bli_mem_acquire_m( cs_c11*NR*sizeof(ctype), BLIS_BUFFER_FOR_C_PANEL_L2, &c0_L2_mem); \
+ cNew0 = bli_mem_buffer( &c0_L2_mem ); \
+\
+ bli_mem_acquire_m( cs_c11*NR*sizeof(ctype), BLIS_BUFFER_FOR_C_PANEL_L2, &c1_L2_mem); \
+ cNew1 = bli_mem_buffer( &c1_L2_mem ); \
+\
+ bli_mem_acquire_m( cs_c11*NR*sizeof(ctype), BLIS_BUFFER_FOR_C_PANEL_L2, &c2_L2_mem); \
+ cNew2 = bli_mem_buffer( &c2_L2_mem ); \
+\
+ /* Acquire an EDMA handle for this core from the pool; it is passed to the lib_emt_* calls that move panels of B into b1_L1. */ \
+ bli_dma_channel_acquire(&(emt_handle_b), lib_get_coreID()); \
+ if(emt_handle_b == NULL) \
+ { /* NOTE(review): the failure is only reported here; execution continues and emt_handle_b is still handed to lib_emt_copy1D1D/lib_emt_wait later on -- confirm a NULL handle is tolerated there. */ \
+ printf("ker_var2 Failed to alloc edma handle B CoreID %d \n", lib_get_coreID()); \
+ } \
+\
+ /*For DSP Timing*/ \
+ /* initiate first c transfer */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_nr = __clock64(); \
+ } \
+\
for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
- \
+\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
- n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
+ n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
- /* Initialize our next panel of B to be the current panel of B. */ \
- b2 = b1; \
- memcpy(b1_L1, b1, k*NR*sizeof(ctype)); \
+ /* First column panel visited by this thread: start the initial B-panel and A-micropanel transfers. */ \
+ if ( jr_thread_id == j ) \
+ { \
+ /* Always use MR and NR while transferring a packed panel. */ \
+ lib_emt_copy1D1D( emt_handle_b, b1, b1_L1, k*NR*sizeof(ctype) ); \
+ /* Point a1 at this thread's first micro-panel of A, then start copying it into a2_L1. */ \
+ a1 = a_cast + ir_thread_id * rstep_a; \
+ lib_imt_copy( a1, a2_L1, k*MR*sizeof(ctype), 0, 0, 7 ); \
+ } \
\
- /* Loop over the m dimension (MR rows at a time). */ \
- for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \
+ /* Loop over the m dimension (MR rows at a time). */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_mr = __clock64(); \
+ } \
+ for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \
{ \
ctype* restrict a2; \
\
- c11 = c1 + i * rstep_c; \
+ a1 = a_cast + i * rstep_a; \
+ c11 = c1 + i * rstep_c; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
-\
- if(i == ir_thread_id) \
- { \
- a1 = a_cast + i * rstep_a; \
- memcpy(a2_L1, a1, k*MR*sizeof(ctype)); \
- } \
\
/* Compute the addresses of the next panels of A and B. */ \
- a2 = gemm_get_next_a_micropanel( caucus, a1, rstep_a ); \
+ a2 = gemm_get_next_a_micropanel( caucus, a1, rstep_a ); \
temp = a1_L1; \
a1_L1 = a2_L1; \
a2_L1 = temp; \
- a1 = a2; /*Make the next panel the current panel for the next iteration*/ \
- /*Start next panel*/ \
- memcpy(a2_L1, a2, k*MR*sizeof(ctype)); \
\
- /* Save addresses of next panels of A and B to the auxinfo_t
- object. */ \
- /*if ( bli_is_last_iter( i, m_iter ) ) \
+ /*Wait for the panel to finish transferring*/ \
+ lib_imt_wait(); \
+ if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \
{ \
+ lib_imt_copy(a1 = a_cast + ir_thread_id * rstep_a, a2_L1, k*MR*sizeof(ctype), 0, 0, 7); \
a2 = a_cast; \
b2 = gemm_get_next_b_micropanel( thread, b1, cstep_b ); \
- if ( bli_is_last_iter( j, n_iter ) ) \
- b2 = b_cast; \
+ if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
+ b2 = b_cast; \
+ } \
+ else \
+ { \
+ lib_imt_copy(a2, a2_L1, k*MR*sizeof(ctype), 0, 0, 7); \
} \
- bli_auxinfo_set_next_a( a2, aux ); \
- bli_auxinfo_set_next_b( b2, aux );*/ \
\
- /* Handle interior and edge cases separately. */ \
- if ( m_cur == MR && n_cur == NR ) \
+ if(i == ir_thread_id) \
{ \
- /* Invoke the gemm micro-kernel. */ \
- gemm_ukr_cast( k, \
- alpha_cast, \
- a1_L1, \
- b1_L1, \
- beta_cast, \
- c11, rs_c, cs_c, \
- &aux ); \
+ lib_emt_wait(emt_handle_b); \
+ } \
+ /* Handle interior and edge cases separately. */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_ker = __clock64(); \
+ } \
+ if ( m_cur == MR && n_cur == NR ) \
+ { \
+ /* Invoke the gemm micro-kernel. */ \
+ gemm_ukr_cast( k, \
+ alpha_cast, \
+ a1_L1, /*a1_L1,*/ \
+ b1_L1, /*b1_L1,*/ \
+ beta_cast, \
+ c11, rs_c, cs_c, \
+ &aux ); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr_cast( k, \
alpha_cast, \
- a1_L1, \
- b1_L1, \
+ a1_L1, /*a1_L1,*/ \
+ b1_L1, /*b1_L1,*/ \
zero, \
- ct, rs_ct, cs_ct, \
+ ct, rs_ct, cs_ct, \
&aux ); \
\
/* Scale the bottom edge of C and add the result from above. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
- ct, rs_ct, cs_ct, \
- beta_cast, \
- c11, rs_c, cs_c ); \
+ ct, rs_ct, cs_ct, \
+ beta_cast, \
+ c11, rs_c, cs_c); \
+ } \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_ker = __clock64(); \
+ bli_profile_data_update(bli_gemm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_KER_LOOP_IND], \
+ (counter_end_ker-counter_start_ker), 2*m_cur*k*n_cur); \
+ } \
+ if(!bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) && bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads )) \
+ { \
+ /* Start the EDMA of the next panel (K*NR) of B*/ \
+ lib_emt_copy1D1D(emt_handle_b, b2, b1_L1, k*NR*sizeof(ctype)); \
} \
} \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ /* Close the IR-level timing window that counter_start_mr opened before the m loop. */ \
+ /* The work estimate (third argument) is computed in 64 bits so the product cannot overflow if dim_t is narrower. */ \
+ counter_end_mr = __clock64(); \
+ bli_profile_data_update(bli_gemm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_IR_LOOP_IND], \
+ (counter_end_mr-counter_start_mr), (uint64_t) 2*m*k*n_cur); \
+ } \
} \
- \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ /* Close the JR-level timing window that counter_start_nr opened before the n loop. */ \
+ /* The work estimate (third argument) is computed in 64 bits so the product cannot overflow if dim_t is narrower. */ \
+ counter_end_nr = __clock64(); \
+ bli_profile_data_update(bli_gemm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_JR_LOOP_IND], \
+ (counter_end_nr-counter_start_nr), (uint64_t) 2*m*k*n); \
+ } \
+ /* All panels are done: release the C, A and B scratch buffers in the reverse of the order in which they were acquired. */ \
+ bli_mem_release( &c2_L2_mem ); \
+ bli_mem_release( &c1_L2_mem ); \
+ bli_mem_release( &c0_L2_mem ); \
bli_mem_release( &a2_L1_mem ); \
bli_mem_release( &a1_L1_mem ); \
bli_mem_release( &b1_L1_mem ); \
-/*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
-PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" );*/ \
+ if ( emt_handle_b != NULL ) \
+ { \
+ bli_dma_channel_release(emt_handle_b, lib_get_coreID()); \
+ emt_handle_b = NULL; \
+ } \
+ /*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
+ PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" );*/ \
}
INSERT_GENTFUNC_BASIC( gemm_ker_var2, gemm_ukr_t )
+
+
#endif
#else
index 91ae16d4743e955300f3329bf95af794ecb00be9..ff8a557c508172496fa04aec1e17f396c063d303 100644 (file)
obj_t b_local;
obj_t c_local;
+#ifdef BLIS_ENABLE_PROFILE
+ extern profile_data_t *bli_gemm_profile_data;
+ dim_t m_profile, n_profile, k_profile;
+#endif
+
// Check parameters.
if ( bli_error_checking_is_enabled() )
bli_hemm_check( side, alpha, a, b, beta, c );
gemm_thrinfo_t** infos = bli_create_gemm_thrinfo_paths();
dim_t n_threads = thread_num_threads( infos[0] );
+#ifdef BLIS_ENABLE_PROFILE
+ m_profile = bli_obj_length( *c );
+ n_profile = bli_obj_width( *c );
+ k_profile = bli_obj_width_after_trans( *a );
+ bli_gemm_profile_data = bli_profile_data_init(BLIS_MAX_NUM_THREADS*BLIS_PROFILE_NUM_REPORTS); //One struct for each thread for each variant
+#endif
+
// Invoke the internal back-end.
bli_level3_thread_decorator( n_threads,
(level3_int_t) bli_gemm_int,
(void*) cntl,
(void**) infos );
+#ifdef BLIS_ENABLE_PROFILE
+ bli_profile_data_print (bli_gemm_profile_data,
+ m_profile, n_profile, k_profile,
+ bli_obj_execution_datatype( *c ), 2, n_threads);
+
+ bli_profile_data_free(bli_gemm_profile_data);
+
+#endif
+
bli_gemm_thrinfo_free_paths( infos, n_threads );
}
index 74e1613df1345260a72d61f4ba718164b5544a0e..5d78c83a13d161a57c794e97e9a8e355fefaa5ef 100644 (file)
else if ( bli_3m_is_enabled_dt( dt ) ) bli_her2k3m_entry( alpha, a, b, beta, c );
else if ( bli_4mh_is_enabled_dt( dt ) ) bli_her2k4mh_entry( alpha, a, b, beta, c );
else if ( bli_4m_is_enabled_dt( dt ) ) bli_her2k4m_entry( alpha, a, b, beta, c );
- else bli_her2k_entry( alpha, a, b, beta, c );
+ else
+ bli_her2k_entry( alpha, a, b, beta, c );
}
index 7753e6b1e5c45dcb659485fba03b30cc25353430..863cfbe5ab1fba6d24b3a065dcb6b1171bdc43e0 100644 (file)
obj_t b_local;
obj_t ah_local;
+#ifdef BLIS_ENABLE_PROFILE
+ dim_t m_profile, k_profile;
+ extern profile_data_t *bli_herk_profile_data;
+#endif
+
+#ifdef BLIS_ENABLE_PROFILE
+ m_profile = bli_obj_length( *c );
+ k_profile = bli_obj_width_after_trans( *a );
+#endif
+
// Check parameters.
if ( bli_error_checking_is_enabled() )
bli_her2k_check( alpha, a, b, beta, c );
herk_thrinfo_t** infos = bli_create_herk_thrinfo_paths();
dim_t n_threads = thread_num_threads( infos[0] );
+#ifdef BLIS_ENABLE_PROFILE
+ bli_herk_profile_data = bli_profile_data_init(BLIS_MAX_NUM_THREADS*BLIS_PROFILE_NUM_REPORTS);
+#endif
+
+
// Invoke the internal back-end.
bli_level3_thread_decorator( n_threads,
(level3_int_t) bli_herk_int,
(void*) cntl,
(void**) infos );
+#ifdef BLIS_ENABLE_PROFILE
+ bli_profile_data_print (bli_herk_profile_data,
+ m_profile, m_profile, k_profile,
+ bli_obj_execution_datatype( *c ), 1.0, n_threads);
+
+ bli_profile_data_free(bli_herk_profile_data);
+#endif
+
bli_herk_thrinfo_free_paths( infos, n_threads );
#endif
index 3a34f9de36351991d307ddec054c5664226eff20..7817434a7ac1b1d9e480a526f2e9240247cb2799 100644 (file)
// Query dimension in partitioning direction.
m_trans = bli_obj_length_after_trans( *c );
dim_t start, end;
- bli_get_range_weighted( thread, 0, m_trans,
- bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
- bli_obj_is_upper( *c ), &start, &end );
+
+// bli_get_range_weighted( thread, 0, m_trans,
+// bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
+// bli_obj_is_upper( *c ), &start, &end );
+
+#if 0
+ if(bli_is_lower( bli_obj_root_uplo( *c ) ))
+ {
+ dim_t n_trans;
+ n_trans = bli_obj_width_after_trans(*c);
+
+ {
+ // At: size of the n_trans x n_trans triangular part, rounded up.
+ // Ar: size of the remaining rectangular part (indices n_trans..m_trans).
+ // X : how many times larger the rectangle is than the triangle, rounded up.
+ // All operands are integers, so the rounding is done with integer
+ // arithmetic; ceil() on an already-truncated quotient would be a no-op.
+ dim_t At, Ar, X;
+ dim_t num_threads_At, num_threads_Ar;
+ At = ( n_trans*n_trans + 1 ) / 2;
+ Ar = bli_max(0, m_trans - n_trans)*n_trans;
+ // At is zero only when n_trans is zero; avoid dividing by it. X == 0 then
+ // selects the unsplit bli_get_range_weighted_t2b() path further down.
+ X = ( At > 0 ) ? ( Ar + At - 1 ) / At : 0;
+
+ if (X > 0)
+ {
+ num_threads_At = thread->n_way / (1 + X);
+ num_threads_Ar = thread->n_way - num_threads_At;
+
+ printf("n_trans = %d At = %d, Ar = %d, X = %d, num_threads_At = %d, num_threads_Ar = %d\n", n_trans*n_trans, At, Ar, X, num_threads_At, num_threads_Ar);
+
+ if(thread->work_id < num_threads_At)
+ {
+ dim_t all_start = 0; // this branch splits the index range [0, n_trans)
+ dim_t all_end = n_trans;
+ dim_t block_factor = bli_determine_reg_blocksize( a, cntl_blocksize( cntl ));
+ uplo_t uplo = bli_obj_root_uplo( *c );
+ bool_t handle_edge_low = FALSE;
+ dim_t n_way = num_threads_At; // number of threads sharing this range
+ dim_t work_id = thread->work_id;
+ dim_t size = all_end - all_start; // partitioning only the triangular part
+ dim_t width;
+ dim_t block_fac_leftover = size % block_factor;
+ dim_t i;
+ double num; // set below to size*size / n_way
+
+ bli_toggle_uplo(uplo);
+
+ //printf("Triangle: work_id = %d \n", thread->work_id);
+
+ start = 0;
+ end = all_end - all_start;
+ num = size * size / ( double )n_way;
+
+ for ( i = 0; TRUE; ++i )
+ {
+ width = ceil( sqrt( start * start + num ) ) - start;
+
+ if ( i == 0 && handle_edge_low )
+ {
+ if ( width % block_factor != block_fac_leftover )
+ width += block_fac_leftover - ( width % block_factor );
+ }
+ else
+ {
+ if ( width % block_factor != 0 )
+ width += block_factor - ( width % block_factor );
+ }
+
+ if ( work_id == 0 )
+ {
+ start = start + all_start;
+ end = bli_min( start + width, all_end );
+ break;
+ }
+ else
+ {
+ start = start + width;
+ work_id--;
+ }
+ }
+
+ }
+ else
+ {
+ dim_t all_start = n_trans;
+ dim_t all_end = m_trans;
+ dim_t block_factor = bli_determine_reg_blocksize( a, cntl_blocksize( cntl ));
+ bool_t handle_edge_low = FALSE;
+
+ dim_t n_way = num_threads_Ar;
+ dim_t work_id = thread->work_id - num_threads_At;
+
+ dim_t size = all_end - all_start;
+
+ dim_t n_bf_whole = size / block_factor;
+ dim_t n_bf_left = size % block_factor;
+
+ dim_t n_bf_lo = n_bf_whole / n_way;
+ dim_t n_bf_hi = n_bf_whole / n_way;
+
+ //printf("Rectangle: work_id = %d \n", thread->work_id);
+
+
+ if ( handle_edge_low == FALSE )
+ {
+ // Notice that if all threads receive the same number of
+ // block_factors, those threads are considered "high" and
+ // the "low" thread group is empty.
+ dim_t n_th_lo = n_bf_whole % n_way;
+ //dim_t n_th_hi = n_way - n_th_lo;
+
+
+ // If some partitions must have more block_factors than others
+ // assign the slightly larger partitions to lower index threads.
+ if ( n_th_lo != 0 ) n_bf_lo += 1;
+
+ // Compute the actual widths (in units of rows/columns) of
+ // individual threads in the low and high groups.
+ dim_t size_lo = n_bf_lo * block_factor;
+ dim_t size_hi = n_bf_hi * block_factor;
+
+ // Precompute the starting indices of the low and high groups.
+ dim_t lo_start = all_start;
+ dim_t hi_start = all_start + n_th_lo * size_lo;
+
+ // Compute the start and end of individual threads' ranges
+ // as a function of their work_ids and also the group to which
+ // they belong (low or high).
+ if ( work_id < n_th_lo )
+ {
+ start = lo_start + (work_id ) * size_lo;
+ end = lo_start + (work_id+1) * size_lo;
+ }
+ else // if ( n_th_lo <= work_id )
+ {
+ start = hi_start + (work_id-n_th_lo ) * size_hi;
+ end = hi_start + (work_id-n_th_lo+1) * size_hi;
+
+ // Since the edge case is being allocated to the high
+ // end of the index range, we have to advance the last
+ // thread's end.
+ if ( work_id == n_way - 1 ) end += n_bf_left;
+ }
+ }
+ else // if ( handle_edge_low == TRUE )
+ {
+ // Notice that if all threads receive the same number of
+ // block_factors, those threads are considered "low" and
+ // the "high" thread group is empty.
+ dim_t n_th_hi = n_bf_whole % n_way;
+ dim_t n_th_lo = n_way - n_th_hi;
+
+ // If some partitions must have more block_factors than others
+ // assign the slightly larger partitions to higher index threads.
+ if ( n_th_hi != 0 ) n_bf_hi += 1;
+
+ // Compute the actual widths (in units of rows/columns) of
+ // individual threads in the low and high groups.
+ dim_t size_lo = n_bf_lo * block_factor;
+ dim_t size_hi = n_bf_hi * block_factor;
+
+ // Precompute the starting indices of the low and high groups.
+ dim_t lo_start = all_start;
+ dim_t hi_start = all_start + n_th_lo * size_lo
+ + n_bf_left;
+
+ // Compute the start and end of individual threads' ranges
+ // as a function of their work_ids and also the group to which
+ // they belong (low or high).
+ if ( work_id < n_th_lo )
+ {
+ start = lo_start + (work_id ) * size_lo;
+ end = lo_start + (work_id+1) * size_lo;
+
+ // Since the edge case is being allocated to the low
+ // end of the index range, we have to advance the
+ // starts/ends accordingly.
+ if ( work_id == 0 ) end += n_bf_left;
+ else { start += n_bf_left;
+ end += n_bf_left; }
+ }
+ else // if ( n_th_lo <= work_id )
+ {
+ start = hi_start + (work_id-n_th_lo ) * size_hi;
+ end = hi_start + (work_id-n_th_lo+1) * size_hi;
+ }
+ }
+ }
+ }
+ else
+ {
+ bli_get_range_weighted_t2b( thread, 0, m_trans,
+ bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
+ bli_obj_root_uplo( *c ), &start, &end );
+ }
+ }
+ }
+ else
+ bli_get_range_weighted_t2b( thread, 0, m_trans,
+ bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
+ bli_obj_root_uplo( *c ), &start, &end );
+#else
+ bli_get_range_weighted_t2b( thread, 0, m_trans,
+ bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
+ bli_obj_root_uplo( *c ), &start, &end );
+#endif
#ifdef BLIS_ENABLE_C66X_EDMA
if(start < end)
bli_dmam_init( &a1, a1_dma, cntl_sub_dmam_a( cntl ) );
bli_dmam_init( &c1, c1_dma, cntl_sub_dmam_c( cntl ) );
}
- //bli_obj_print("subpart", &a1);
- //bli_obj_print("subpart", a1_dma);
bli_dmam_int( &a1, a1_dma, cntl_sub_dmam_a( cntl ), (dmam_thrinfo_t *) gemm_thread_sub_ipackm( thread ) );
bli_dmam_int( &c1, c1_dma, cntl_sub_dmam_c( cntl ), (dmam_thrinfo_t *) gemm_thread_sub_ipackm( thread ) );
index caf2d2f7d278da5c61ea50bb95a1ad50e6fb7a3e..35737049cda564aed09ffbc59e2e95e122d95cc5 100644 (file)
dim_t start, end;
// Needs to be replaced with a weighted range because triangle
- bli_get_range_weighted( thread, 0, n_trans,
- bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
- bli_obj_is_lower( *c ), &start, &end );
+// bli_get_range_weighted( thread, 0, n_trans,
+// bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
+// bli_obj_is_lower( *c ), &start, &end );
+
+ bli_get_range_weighted_l2r( thread, 0, n_trans,
+ bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
+ bli_obj_root_uplo( *c ), &start, &end );
+
// Partition along the n dimension.
for ( i = start; i < end; i += b_alg )
bli_acquire_mpart_t2b( stored_part,
i, b_alg, a_pack, &aS_pack );
+ //printf("c1S: blk_var2: m = %d, n = %d, diagoff = %d \nc1: blk_var2: m = %d, n = %d, diagoff = %d\n", bli_obj_length( c1S ), bli_obj_width( c1S ), bli_obj_diag_offset(c1S), bli_obj_length( c1 ), bli_obj_width( c1 ), bli_obj_diag_offset(c1));
+
// Initialize objects for packing A1' and C1.
if( thread_am_ichief( thread ) ) {
bli_packm_init( &ah1, ah1_pack,
index c8ce5ed0414942aae1d9e66e2ba99f003586cafd..6cb44ce7247abcc167d677e7b716fd1c75b64c19 100644 (file)
#include "blis.h"
+#ifdef BLIS_ENABLE_PROFILE // Only defined in profiling builds.
+ profile_data_t *bli_herk_profile_data; // herk profiling records: allocated and freed in bli_herk_front() below; other files reach it through extern declarations.
+#endif
+
void bli_herk_front( obj_t* alpha,
obj_t* a,
obj_t* beta,
obj_t a_local;
obj_t ah_local;
obj_t c_local;
+#ifdef BLIS_ENABLE_PROFILE
+ dim_t m_profile, k_profile;
+
+ m_profile = bli_obj_length( *c );
+ k_profile = bli_obj_width_after_trans( *a );
+#endif
// Check parameters.
if ( bli_error_checking_is_enabled() )
herk_thrinfo_t** infos = bli_create_herk_thrinfo_paths();
dim_t n_threads = thread_num_threads( infos[0] );
+#ifdef BLIS_ENABLE_PROFILE
+ bli_herk_profile_data = bli_profile_data_init(BLIS_MAX_NUM_THREADS*BLIS_PROFILE_NUM_REPORTS);
+#endif
+
// Invoke the internal back-end.
bli_level3_thread_decorator( n_threads,
(level3_int_t) bli_herk_int,
bli_herk_thrinfo_free_paths( infos, n_threads );
+#ifdef BLIS_ENABLE_PROFILE
+ bli_profile_data_print (bli_herk_profile_data,
+ m_profile, m_profile, k_profile,
+ bli_obj_execution_datatype( *c ), 1.0, n_threads);
+
+ bli_profile_data_free(bli_herk_profile_data);
+#endif
+
// The Hermitian rank-k product was computed as A*A', even for the
// diagonal elements. Mathematically, the imaginary components of
// diagonal elements of a Hermitian rank-k product should always be
index ac0744ff062c009fbd77dca695876a50960bccf1..f8fdce5e3bfbb90d9a3fe1deb0acbb3af53bd0ec 100644 (file)
#include "blis.h"
-#ifdef BLIS_ENABLE_C66X_BUILD
-//#define BLIS_ENABLE_CYCLE_COUNT_BLIS_GEMM_INT
-#endif
-
#define FUNCPTR_T herk_fp
typedef void (*FUNCPTR_T)( obj_t* a,
impl_t i;
bool_t uplo;
FUNCPTR_T f;
-#ifdef BLIS_ENABLE_CYCLE_COUNT_BLIS_GEMM_INT
- volatile int counter_start;
- volatile int counter_end;
+#if defined(BLIS_ENABLE_PROFILE)
+ volatile uint64_t counter_start;
+ volatile uint64_t counter_end;
+ extern profile_data_t *bli_herk_profile_data;
+ dim_t m_var, k_var, n_var;
+ dim_t index;
#endif
// Check parameters.
// Index into the variant array to extract the correct function pointer.
f = vars[uplo][n][i];
-#ifdef BLIS_ENABLE_CYCLE_COUNT_BLIS_GEMM_INT
- TSCL = 0;
- counter_start = TSCL;
+#if defined(BLIS_ENABLE_PROFILE) // Profiling only: capture the dimensions and a start timestamp before the variant is invoked.
+ m_var = bli_obj_length( c_local );
+ k_var = bli_obj_width_after_trans( a_local );
+ n_var = bli_obj_width_after_trans( ah_local );
+ // The timestamp source depends on the build target.
+#if defined(BLIS_ENABLE_C66X_BUILD)
+ TSCL = 0; // NOTE(review): presumably starts the C66x time-stamp counter read by __clock64() -- confirm
+ counter_start = __clock64();
+#else
+ counter_start = (uint64_t) (bli_clock()*1.2e9); // NOTE(review): 1.2e9 looks like an assumed clock rate in Hz used to turn bli_clock() into cycles -- confirm
+#endif
#endif
// Invoke the variant.
f( &a_local,
&c_local,
cntl,
thread );
-#ifdef BLIS_ENABLE_CYCLE_COUNT_BLIS_GEMM_INT
- counter_end = TSCL;
- if(lib_get_coreID ()==0) printf("xxxxx bli_gemm_int \t %d %d %d \t %d\n", uplo, n, i, counter_end-counter_start);
+#if defined(BLIS_ENABLE_PROFILE)
+
+#if defined(BLIS_ENABLE_C66X_BUILD)
+ counter_end = __clock64();
+#else // if not DSP
+ counter_end = (uint64_t) (bli_clock()*1.2e9);
#endif
+ bli_profile_get_index(n, i, index);
+ bli_profile_data_update(bli_herk_profile_data[index], (counter_end-counter_start), m_var*k_var*n_var);
+
+#endif // if defined profile
}
index 9e2815b10d1f8b531874b827ee0f8ad232b10a6a..2ced5caee22dd2fa4ff385d4729ceb42d7cef540 100644 (file)
#define FUNCPTR_T herk_fp
+/* BLIS_ENABLE_PROFILE_KERVAR2 is always defined, as 1 or 0, so the kernel macros can test it with an ordinary if(). */
+#if defined(BLIS_ENABLE_PROFILE)
+ #define BLIS_ENABLE_PROFILE_KERVAR2 1
+#else /* !BLIS_ENABLE_PROFILE */
+ #define BLIS_ENABLE_PROFILE_KERVAR2 0
+#endif /* BLIS_ENABLE_PROFILE */
+
typedef void (*FUNCPTR_T)(
doff_t diagoffc,
pack_t schema_a,
lib_emt_Handle emt_handle_c0 = NULL; \
lib_emt_Handle emt_handle_c1 = NULL; \
\
- /*volatile int counter_start; \
- volatile int counter_end;*/ \
+ /*For DSP timing*/ \
+ volatile uint64_t counter_start_ker, counter_start_nr, counter_start_mr; \
+ volatile uint64_t counter_end_ker, counter_end_nr, counter_end_mr; \
+ extern profile_data_t *bli_herk_profile_data; \
+\
/*
Assumptions/assertions:
rs_a == 1
* If it does, update the new m and move the pointers of C and A accordingly
* Logic is not working as of now.*/\
diagoffc_j = diagoffc - (doff_t) jr_thread_id * NR; \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_nr = __clock64(); \
+ } \
/*if ( diagoffc_j < 0 ) \
{ \
dim_t ii; \
n_next, cs_c*sizeof(ctype), \
cs_c11*sizeof(ctype));*/ \
}\
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_mr = __clock64(); \
+ } \
/* Interior loop over the m dimension (MR rows at a time). */ \
for ( i = ir_thread_id; i < m_iter_new; i += ir_num_threads ) \
{ \
continue. */ \
if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
{ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_ker = __clock64(); \
+ } \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr_cast( k, \
alpha_cast, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c11, cs_c11 /* rs_c, cs_c */); \
+\
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_ker = __clock64(); \
+ bli_profile_data_update(bli_herk_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_KER_LOOP_IND],\
+ (counter_end_ker-counter_start_ker), 2*k*m_cur*n_cur); \
+ } \
} \
else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
{ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_ker = __clock64(); \
+ } \
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
beta_cast, \
c11, rs_c11, cs_c11 /* rs_c, cs_c */); \
} \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_ker = __clock64(); \
+ bli_profile_data_update(bli_herk_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_KER_LOOP_IND], \
+ (counter_end_ker-counter_start_ker), 2*k*m_cur*n_cur); \
+ } \
} \
- /*counter_end=TSCL; \
- if (lib_get_coreID () == 0) \
- printf("%d %d %d\t%d\n",n_cur, m_cur, k, counter_end-counter_start); */\
+ } \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ /* Close the IR-level timing window (counter_start_mr .. counter_end_mr). */ \
+ /* The work estimate (third argument) is computed in 64 bits so the product cannot overflow if dim_t is narrower. */ \
+ counter_end_mr = __clock64(); \
+ bli_profile_data_update(bli_herk_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_IR_LOOP_IND], \
+ (counter_end_mr-counter_start_mr), (uint64_t) 2*mc_new*k*n_cur); \
} \
/* circularly shift buffers */ \
cNewTemp = cNew0; \
diagoffc_j = diagoffc_j_next; \
cs_c11 = cs_c11_next; \
} \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ /* Close the JR-level timing window (counter_start_nr .. counter_end_nr). */ \
+ /* The work estimate (third argument) is computed in 64 bits so the product cannot overflow if dim_t is narrower. */ \
+ counter_end_nr = __clock64(); \
+ bli_profile_data_update(bli_herk_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_JR_LOOP_IND], \
+ (counter_end_nr-counter_start_nr), (uint64_t) 2*mc_new*k*n); \
+ } \
}\
\
bli_mem_release( &c2_L2_mem ); \
}
INSERT_GENTFUNC_BASIC( herk_l_ker_var2, gemm_ukr_t )
-
-#else
-
#endif
#else
inc_t istep_a; \
inc_t istep_b; \
auxinfo_t aux; \
+\
+/*For DSP timing*/ \
+ volatile uint64_t counter_start_ker, counter_start_nr, counter_start_mr; \
+ volatile uint64_t counter_end_ker, counter_end_nr, counter_end_mr; \
+ extern profile_data_t *bli_herk_profile_data; \
\
herk_thrinfo_t* caucus = herk_thread_sub_herk( thread ); \
dim_t jr_num_threads = thread_n_way( thread ); \
c1 = c_cast; \
\
/* Loop over the n dimension (NR columns at a time). */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_nr = (uint64_t) (bli_clock()*1.2e9); \
+ } \
for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \
{ \
ctype* restrict a1; \
b2 = b1; \
\
/* Interior loop over the m dimension (MR rows at a time). */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_mr = (uint64_t) (bli_clock()*1.2e9); \
+ } \
for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \
{ \
ctype* restrict a2; \
continue. */ \
if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
{ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_ker = (uint64_t) (bli_clock()*1.2e9); \
+ } \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr_cast( k, \
alpha_cast, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_ker = (uint64_t) (bli_clock()*1.2e9); \
+ bli_profile_data_update(bli_herk_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_KER_LOOP_IND], \
+ ( (long int) (counter_end_ker-counter_start_ker)), 2*k*m_cur*n_cur); \
+ } \
} \
else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
{ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_ker = (uint64_t) (bli_clock()*1.2e9); \
+ } \
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
beta_cast, \
c11, rs_c, cs_c ); \
} \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_ker = (uint64_t) (bli_clock()*1.2e9); \
+ bli_profile_data_update(bli_herk_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_KER_LOOP_IND], \
+ ( (long int) (counter_end_ker-counter_start_ker)), 2*k*m_cur*n_cur); \
+ } \
} \
} \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ /* Close the IR-level timing window (counter_start_mr .. counter_end_mr). */ \
+ /* The elapsed count is passed as the full 64-bit difference, without a narrowing cast, */ \
+ /* and the work estimate (third argument) is computed in 64 bits. */ \
+ counter_end_mr = (uint64_t) (bli_clock()*1.2e9); \
+ bli_profile_data_update(bli_herk_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_IR_LOOP_IND],\
+ (counter_end_mr-counter_start_mr), (uint64_t) 2*m*k*n_cur); \
+ } \
+ } \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ /* Close the JR-level timing window (counter_start_nr .. counter_end_nr). */ \
+ /* The elapsed count is passed as the full 64-bit difference, without a narrowing cast, */ \
+ /* and the work estimate (third argument) is computed in 64 bits. */ \
+ counter_end_nr = (uint64_t) (bli_clock()*1.2e9); \
+ bli_profile_data_update(bli_herk_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_JR_LOOP_IND], \
+ (counter_end_nr-counter_start_nr), (uint64_t) 2*m*k*n); \
} \
}
index 04e1902eb56a005b61576b23e75ae637a2699824..d72fd63caf1ae4ad39ab09a7f844639f8d494d4d 100644 (file)
lib_emt_Handle emt_handle_b = NULL; \
lib_emt_Handle emt_handle_c0 = NULL; \
lib_emt_Handle emt_handle_c1 = NULL; \
+\
+ /*For DSP timing*/ \
+ volatile uint64_t counter_start_ker, counter_start_nr, counter_start_mr; \
+ volatile uint64_t counter_end_ker, counter_end_nr, counter_end_mr; \
+ extern profile_data_t *bli_herk_profile_data; \
\
/*
Assumptions/assertions:
* (m_iter-ir_thread_id)*MR is not equal to m which would lead to incorrect
* values of C written back.*/ \
n_cur = ( bli_is_not_edge_f( jr_thread_id, n_iter, n_left ) ? NR : n_left ); \
+\
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_nr = __clock64(); \
+ } \
+\
if (cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
{ \
lib_emt_copy2D2D(emt_handle_c0, c_cast+jr_thread_id*cstep_c, \
} \
}\
/* Interior loop over the m dimension (MR rows at a time). */ \
+\
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_mr = __clock64(); \
+ } \
+\
for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \
{ \
ctype* restrict a2; \
continue. */ \
if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
{ \
+\
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_ker = __clock64(); \
+ } \
+\
/* Invoke the gemm micro-kernel. */ \
gemm_ukr_cast( k, \
alpha_cast, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c11, cs_c11 /*rs_c, cs_c */); \
+\
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_ker = __clock64(); \
+ bli_profile_data_update(bli_herk_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_KER_LOOP_IND], \
+ (counter_end_ker-counter_start_ker), 2*k*m_cur*n_cur); \
+ } \
+\
} \
else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
{ \
+\
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_ker = __clock64(); \
+ } \
+\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
beta_cast, \
c11, rs_c11, cs_c11 /*rs_c, cs_c */); \
} \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_ker = __clock64(); \
+ bli_profile_data_update(bli_herk_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_KER_LOOP_IND], \
+ (counter_end_ker-counter_start_ker), 2*k*m_cur*n_cur); \
+ } \
} \
} \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ /* Close the IR-level timing window (counter_start_mr .. counter_end_mr). */ \
+ /* The work estimate (third argument) is computed in 64 bits so the product cannot overflow if dim_t is narrower. */ \
+ counter_end_mr = __clock64(); \
+ bli_profile_data_update(bli_herk_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_IR_LOOP_IND], \
+ (counter_end_mr-counter_start_mr), (uint64_t) 2*m*k*n_cur); \
+ } \
/* circularly shift buffers */ \
cNewTemp = cNew0; \
cNew0 = cNew2; \
} \
} \
} \
+\
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ /* Close the JR-level timing window (counter_start_nr .. counter_end_nr). */ \
+ /* The work estimate (third argument) is computed in 64 bits so the product cannot overflow if dim_t is narrower. */ \
+ counter_end_nr = __clock64(); \
+ bli_profile_data_update(bli_herk_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_JR_LOOP_IND],\
+ (counter_end_nr-counter_start_nr), (uint64_t) 2*m*k*n); \
+ } \
\
bli_mem_release( &c2_L2_mem ); \
bli_mem_release( &c1_L2_mem ); \
index 9d2aeaed2c80a7ef4e507139aba6ffd2eb386af7..3bf8f99f34a03892ed8fb9e66b87576d2b4ec0ca 100644 (file)
obj_t b_local;
obj_t c_local;
+#ifdef BLIS_ENABLE_PROFILE
+ extern profile_data_t *bli_gemm_profile_data;
+ dim_t m_profile, n_profile, k_profile;
+#endif
+
// Check parameters.
if ( bli_error_checking_is_enabled() )
bli_symm_check( side, alpha, a, b, beta, c );
gemm_thrinfo_t** infos = bli_create_gemm_thrinfo_paths();
dim_t n_threads = thread_num_threads( infos[0] );
+#ifdef BLIS_ENABLE_PROFILE
+ m_profile = bli_obj_length( *c );
+ n_profile = bli_obj_width( *c );
+ k_profile = bli_obj_width_after_trans( *a );
+ bli_gemm_profile_data = bli_profile_data_init(BLIS_MAX_NUM_THREADS*BLIS_PROFILE_NUM_REPORTS); //One struct for each thread for each variant
+#endif
+
// Invoke the internal back-end.
bli_level3_thread_decorator( n_threads,
(level3_int_t) bli_gemm_int,
(void*) cntl,
(void**) infos );
+#ifdef BLIS_ENABLE_PROFILE
+ bli_profile_data_print (bli_gemm_profile_data,
+ m_profile, n_profile, k_profile,
+ bli_obj_execution_datatype( *c ), 2, n_threads);
+
+ bli_profile_data_free(bli_gemm_profile_data);
+
+#endif
+
bli_gemm_thrinfo_free_paths( infos, n_threads );
}
index fdb131c9c56dd808df1544e35cc6fac31719502d..f42e1bb9c3966333c932c49c8b91cdbb5b6ca779 100644 (file)
obj_t b_local;
obj_t at_local;
+#ifdef BLIS_ENABLE_PROFILE
+ dim_t m_profile, k_profile;
+ extern profile_data_t *bli_herk_profile_data;
+#endif
+
+#ifdef BLIS_ENABLE_PROFILE
+ m_profile = bli_obj_length( *c );
+ k_profile = bli_obj_width_after_trans( *a );
+#endif
+
// Check parameters.
if ( bli_error_checking_is_enabled() )
bli_syr2k_check( alpha, a, b, beta, c );
herk_thrinfo_t** infos = bli_create_herk_thrinfo_paths();
dim_t n_threads = thread_num_threads( infos[0] );
+#ifdef BLIS_ENABLE_PROFILE
+ bli_herk_profile_data = bli_profile_data_init(BLIS_MAX_NUM_THREADS*BLIS_PROFILE_NUM_REPORTS);
+#endif
+
// Invoke the internal back-end.
bli_level3_thread_decorator( n_threads,
(level3_int_t) bli_herk_int,
(void*) cntl,
(void**) infos );
+#ifdef BLIS_ENABLE_PROFILE
+ bli_profile_data_print (bli_herk_profile_data,
+ m_profile, m_profile, k_profile,
+ bli_obj_execution_datatype( *c ), 1.0, n_threads);
+
+ bli_profile_data_free(bli_herk_profile_data);
+#endif
+
bli_herk_thrinfo_free_paths( infos, n_threads );
#endif
}
index ab50f9b2dd3eb3acf7ffb8b715b9cdf60e50b78f..047cf2dbcdd432f0287666a27918a8ac0820fba3 100644 (file)
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
+ */
#include "blis.h"
+
+#define CLOCK 1.2 // In GHz
+#define OPS_PER_CYCLE_S 16
+#define OPS_PER_CYCLE_D 4
+#define NUM_THREADS 8
+
void bli_syrk_front( obj_t* alpha,
- obj_t* a,
- obj_t* beta,
- obj_t* c,
- gemm_t* cntl )
+ obj_t* a,
+ obj_t* beta,
+ obj_t* c,
+ gemm_t* cntl )
{
obj_t a_local;
obj_t at_local;
obj_t c_local;
+#ifdef BLIS_ENABLE_PROFILE
+ dim_t m_profile, k_profile;
+ extern profile_data_t *bli_herk_profile_data;
+#endif
+
+#ifdef BLIS_ENABLE_PROFILE
+ m_profile = bli_obj_length( *c );
+ k_profile = bli_obj_width_after_trans( *a );
+#endif
+
// Check parameters.
if ( bli_error_checking_is_enabled() )
bli_syrk_check( alpha, a, beta, c );
// prefers contiguous rows, transpose the entire operation to allow the
// micro-kernel to access elements of C in its preferred manner.
if (
- ( bli_obj_is_row_stored( c_local ) &&
- bli_func_prefers_contig_cols( bli_obj_datatype( c_local ),
- cntl_gemm_ukrs( cntl ) ) ) ||
- ( bli_obj_is_col_stored( c_local ) &&
- bli_func_prefers_contig_rows( bli_obj_datatype( c_local ),
- cntl_gemm_ukrs( cntl ) ) )
- )
+ ( bli_obj_is_row_stored( c_local ) &&
+ bli_func_prefers_contig_cols( bli_obj_datatype( c_local ),
+ cntl_gemm_ukrs( cntl ) ) ) ||
+ ( bli_obj_is_col_stored( c_local ) &&
+ bli_func_prefers_contig_rows( bli_obj_datatype( c_local ),
+ cntl_gemm_ukrs( cntl ) ) )
+ )
{
bli_obj_induce_trans( c_local );
}
-
- herk_thrinfo_t** infos = bli_create_herk_thrinfo_paths();
- dim_t n_threads = thread_num_threads( infos[0] );
-
- // Invoke the internal back-end.
- bli_level3_thread_decorator( n_threads,
- (level3_int_t) bli_herk_int,
- alpha,
- &a_local,
- &at_local,
- beta,
- &c_local,
- (void*) cntl,
- (void**) infos );
-
- bli_herk_thrinfo_free_paths( infos, n_threads );
+
+ herk_thrinfo_t** infos = bli_create_herk_thrinfo_paths();
+ dim_t n_threads = thread_num_threads( infos[0] );
+
+#ifdef BLIS_ENABLE_PROFILE
+ bli_herk_profile_data = bli_profile_data_init(BLIS_MAX_NUM_THREADS*BLIS_PROFILE_NUM_REPORTS);
+#endif
+
+ // Invoke the internal back-end.
+ bli_level3_thread_decorator( n_threads,
+ (level3_int_t) bli_herk_int,
+ alpha,
+ &a_local,
+ &at_local,
+ beta,
+ &c_local,
+ (void*) cntl,
+ (void**) infos );
+
+#ifdef BLIS_ENABLE_PROFILE
+ bli_profile_data_print (bli_herk_profile_data,
+ m_profile, m_profile, k_profile,
+ bli_obj_execution_datatype( *c ), 1.0, n_threads);
+
+ bli_profile_data_free(bli_herk_profile_data);
+#endif
+
+ bli_herk_thrinfo_free_paths( infos, n_threads );
}
index 899dd768ff96de8dbb49e2e2259afa6a39dd6706..32ae4af9930bb8d4e0a8215f9644467f5a9b7429 100644 (file)
bli_obj_width_after_trans( *a );
dim_t start, end;
- bli_get_range_weighted( thread, offA, m_trans,
- bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
- bli_obj_is_upper( *c ), &start, &end );
+// bli_get_range_weighted( thread, offA, m_trans,
+// bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
+// bli_obj_is_upper( *c ), &start, &end );
+
+#if 0
+ dim_t n_trans;
+ n_trans = bli_obj_width_after_trans(*a);
+
+ {
+ dim_t At, Ar, X;
+ dim_t num_threads_At, num_threads_Ar;
+ At = ceil(n_trans*n_trans/2);
+ Ar = bli_max(0, m_trans - n_trans)*n_trans;
+ X = ceil(Ar/At);
+
+ printf("offA = %d, m_trans = %d, n_trans = %d At = %d, Ar = %d, X = %d\n", offA, m_trans, n_trans, At, Ar, X);
+
+ if (X > 0)
+ {
+ num_threads_At = thread->n_way / (1 + X);
+ num_threads_Ar = thread->n_way - num_threads_At;
+
+ printf("offA = %d, m_trans = %d, n_trans = %d At = %d, Ar = %d, X = %d, num_threads_At = %d, num_threads_Ar = %d\n", offA, m_trans, n_trans, At, Ar, X, num_threads_At, num_threads_Ar);
+
+ if(thread->work_id < num_threads_At)
+ {
+ dim_t all_start = offA;
+ dim_t all_end = offA + n_trans;
+ dim_t block_factor = bli_determine_reg_blocksize( a, cntl_blocksize( cntl ));
+ uplo_t uplo = bli_obj_root_uplo( *a );
+ bool_t handle_edge_low = FALSE;
+ dim_t n_way = num_threads_At;
+ dim_t work_id = thread->work_id;
+ dim_t size = all_end - all_start; // partioning only the triangular part
+ dim_t width;
+ dim_t block_fac_leftover = size % block_factor;
+ dim_t i;
+ double num;
+
+ bli_toggle_uplo(uplo);
+
+ //printf("Triangle: work_id = %d \n", thread->work_id);
+
+ start = 0;
+ end = all_end - all_start;
+ num = size * size / ( double )n_way;
+
+ for ( i = 0; TRUE; ++i )
+ {
+ width = ceil( sqrt( start * start + num ) ) - start;
+
+ if ( i == 0 && handle_edge_low )
+ {
+ if ( width % block_factor != block_fac_leftover )
+ width += block_fac_leftover - ( width % block_factor );
+ }
+ else
+ {
+ if ( width % block_factor != 0 )
+ width += block_factor - ( width % block_factor );
+ }
+
+ if ( work_id == 0 )
+ {
+ start = start + all_start;
+ end = bli_min( start + width, all_end );
+ break;
+ }
+ else
+ {
+ start = start + width;
+ work_id--;
+ }
+ }
+
+ }
+ else
+ {
+ dim_t all_start = offA + n_trans;
+ dim_t all_end = m_trans;
+ dim_t block_factor = bli_determine_reg_blocksize( a, cntl_blocksize( cntl ));
+ bool_t handle_edge_low = FALSE;
+
+ dim_t n_way = num_threads_Ar;
+ dim_t work_id = thread->work_id - num_threads_At;
+
+ dim_t size = all_end - all_start;
+
+ dim_t n_bf_whole = size / block_factor;
+ dim_t n_bf_left = size % block_factor;
+
+ dim_t n_bf_lo = n_bf_whole / n_way;
+ dim_t n_bf_hi = n_bf_whole / n_way;
+
+ //printf("Rectangle: work_id = %d \n", thread->work_id);
+
+
+ if ( handle_edge_low == FALSE )
+ {
+ // Notice that if all threads receive the same number of
+ // block_factors, those threads are considered "high" and
+ // the "low" thread group is empty.
+ dim_t n_th_lo = n_bf_whole % n_way;
+ //dim_t n_th_hi = n_way - n_th_lo;
+
+
+ // If some partitions must have more block_factors than others
+ // assign the slightly larger partitions to lower index threads.
+ if ( n_th_lo != 0 ) n_bf_lo += 1;
+
+ // Compute the actual widths (in units of rows/columns) of
+ // individual threads in the low and high groups.
+ dim_t size_lo = n_bf_lo * block_factor;
+ dim_t size_hi = n_bf_hi * block_factor;
+
+ // Precompute the starting indices of the low and high groups.
+ dim_t lo_start = all_start;
+ dim_t hi_start = all_start + n_th_lo * size_lo;
+
+ // Compute the start and end of individual threads' ranges
+ // as a function of their work_ids and also the group to which
+ // they belong (low or high).
+ if ( work_id < n_th_lo )
+ {
+ start = lo_start + (work_id ) * size_lo;
+ end = lo_start + (work_id+1) * size_lo;
+ }
+ else // if ( n_th_lo <= work_id )
+ {
+ start = hi_start + (work_id-n_th_lo ) * size_hi;
+ end = hi_start + (work_id-n_th_lo+1) * size_hi;
+
+ // Since the edge case is being allocated to the high
+ // end of the index range, we have to advance the last
+ // thread's end.
+ if ( work_id == n_way - 1 ) end += n_bf_left;
+ }
+ }
+ else // if ( handle_edge_low == TRUE )
+ {
+ // Notice that if all threads receive the same number of
+ // block_factors, those threads are considered "low" and
+ // the "high" thread group is empty.
+ dim_t n_th_hi = n_bf_whole % n_way;
+ dim_t n_th_lo = n_way - n_th_hi;
+
+ // If some partitions must have more block_factors than others
+ // assign the slightly larger partitions to higher index threads.
+ if ( n_th_hi != 0 ) n_bf_hi += 1;
+
+ // Compute the actual widths (in units of rows/columns) of
+ // individual threads in the low and high groups.
+ dim_t size_lo = n_bf_lo * block_factor;
+ dim_t size_hi = n_bf_hi * block_factor;
+
+ // Precompute the starting indices of the low and high groups.
+ dim_t lo_start = all_start;
+ dim_t hi_start = all_start + n_th_lo * size_lo
+ + n_bf_left;
+
+ // Compute the start and end of individual threads' ranges
+ // as a function of their work_ids and also the group to which
+ // they belong (low or high).
+ if ( work_id < n_th_lo )
+ {
+ start = lo_start + (work_id ) * size_lo;
+ end = lo_start + (work_id+1) * size_lo;
+
+ // Since the edge case is being allocated to the low
+ // end of the index range, we have to advance the
+ // starts/ends accordingly.
+ if ( work_id == 0 ) end += n_bf_left;
+ else { start += n_bf_left;
+ end += n_bf_left; }
+ }
+ else // if ( n_th_lo <= work_id )
+ {
+ start = hi_start + (work_id-n_th_lo ) * size_hi;
+ end = hi_start + (work_id-n_th_lo+1) * size_hi;
+ }
+ }
+ }
+ }
+ else
+ {
+ bli_get_range_weighted_t2b( thread, offA, m_trans,
+ bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
+ bli_obj_root_uplo( *a ), &start, &end );
+ }
+ }
+#else
+ bli_get_range_weighted_t2b( thread, offA, m_trans,
+ bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
+ bli_obj_root_uplo( *a ), &start, &end );
+#endif
+
#ifdef BLIS_ENABLE_C66X_EDMA
if(start<end)
index 3a4716b036d82d0a713b1a76ad86e825f9e24c81..e4e696b93d6ad1edc6751df54e91bad34478bc18 100644 (file)
// Query dimension in partitioning direction.
n_trans = bli_obj_width_after_trans( *b );
dim_t start, end;
- bli_get_range_weighted( thread, 0, n_trans,
- bli_determine_reg_blocksize( b, cntl_blocksize( cntl ) ),
- bli_obj_is_upper( *c ), &start, &end );
+// bli_get_range_weighted( thread, 0, n_trans,
+// bli_determine_reg_blocksize( b, cntl_blocksize( cntl ) ),
+// bli_obj_is_upper( *c ), &start, &end );
+
+ bli_get_range_weighted_r2l( thread, 0, n_trans,
+ bli_determine_reg_blocksize( b, cntl_blocksize( cntl ) ),
+ bli_obj_root_uplo( *b ), &start, &end );
// Partition along the n dimension.
for ( i = start; i < end; i += b_alg )
index c341cb449da9c9228f9c77577faf2a42c32c6b2d..6ddd1dc543a181ff5f927bbc688e83e385b9d192 100644 (file)
// Query dimension in partitioning direction.
n_trans = bli_obj_width_after_trans( *b );
dim_t start, end;
- bli_get_range_weighted( thread, 0, n_trans,
- bli_determine_reg_blocksize( b, cntl_blocksize( cntl ) ),
- bli_obj_is_lower( *c ), &start, &end );
+// bli_get_range_weighted( thread, 0, n_trans,
+// bli_determine_reg_blocksize( b, cntl_blocksize( cntl ) ),
+// bli_obj_is_lower( *c ), &start, &end );
+
+ bli_get_range_weighted_l2r( thread, 0, n_trans,
+ bli_determine_reg_blocksize( b, cntl_blocksize( cntl ) ),
+ bli_obj_root_uplo( *b ), &start, &end );
// Partition along the n dimension.
for ( i = start; i < end; i += b_alg )
index 61bbfa4612dbcdd8e0328ab6d32c1f0be2fe90af..39a9815954edab4255f95464dc76ced443aa55cc 100644 (file)
#include "blis.h"
+
+#ifdef BLIS_ENABLE_PROFILE
+ profile_data_t *bli_trmm_profile_data;
+#endif
+
void bli_trmm_front( side_t side,
obj_t* alpha,
obj_t* a,
obj_t b_local;
obj_t c_local;
+#ifdef BLIS_ENABLE_PROFILE
+ dim_t mn_side, m_profile, n_profile;
+#endif
+
+#ifdef BLIS_ENABLE_PROFILE
+ m_profile = bli_obj_length( *b );
+ n_profile = bli_obj_width( *b );
+ bli_set_dim_with_side( side, m_profile, n_profile, mn_side );
+#endif
+
// Check parameters.
if ( bli_error_checking_is_enabled() )
bli_trmm_check( side, alpha, a, b );
trmm_thrinfo_t** infos = bli_create_trmm_thrinfo_paths( bli_is_right( side ) );
dim_t n_threads = thread_num_threads( infos[0] );
+#ifdef BLIS_ENABLE_PROFILE
+ // Allocate the trmm profiling records before the threaded back-end is invoked.
+ // NOTE(review): the literal 7 presumably stands in for BLIS_PROFILE_NUM_REPORTS,
+ // which the gemm/herk/syrk front-ends pass to bli_profile_data_init -- confirm they agree.
+ bli_trmm_profile_data = bli_profile_data_init(BLIS_MAX_NUM_THREADS*7);
+#endif
// Invoke the internal back-end.
bli_level3_thread_decorator( n_threads,
(level3_int_t) bli_trmm_int,
(void**) infos );
bli_trmm_thrinfo_free_paths( infos, n_threads );
+
+#ifdef BLIS_ENABLE_PROFILE
+ {
+ bli_profile_data_print (bli_trmm_profile_data,
+ m_profile, n_profile, mn_side,
+ bli_obj_execution_datatype( *b ), 1, n_threads);
+
+
+ bli_profile_data_free(bli_trmm_profile_data);
+ }
+
+#endif
}
index 4878aefea5661b0ddec9377b697b71622d182bef..d3a3e3648bff5ebe846c105b66d159ab5ec3437e 100644 (file)
impl_t i;
FUNCPTR_T f;
+#if defined(BLIS_ENABLE_PROFILE)
+ volatile uint64_t counter_start;
+ volatile uint64_t counter_end;
+ extern profile_data_t *bli_trmm_profile_data;
+ dim_t m_var, k_var, n_var;
+ dim_t index;
+#endif
+
// Check parameters.
if ( bli_error_checking_is_enabled() )
bli_trmm_int_check( alpha, a, b, beta, c, cntl );
// Index into the variant array to extract the correct function pointer.
f = vars[side][uplo][n][i];
+#if defined(BLIS_ENABLE_PROFILE)
+ // Record the dimensions of B and the side-dependent third dimension; their
+ // product is reported as the work estimate once the variant returns.
+ m_var = bli_obj_length( *b );
+ n_var = bli_obj_width( *b );
+ bli_set_dim_with_side( side, m_var, n_var, k_var );
+
+ // Take the start timestamp for the variant call below.
+#if defined(BLIS_ENABLE_C66X_BUILD)
+ // C66x build: write TSCL, then read the 64-bit counter.
+ // NOTE(review): the TSCL write presumably starts the free-running counter -- confirm.
+ TSCL = 0;
+ counter_start = __clock64();
+#else
+ // Other builds: scale bli_clock() by 1.2e9.
+ // NOTE(review): presumably seconds converted to cycles at an assumed 1.2 GHz -- confirm.
+ counter_start = (uint64_t) (bli_clock()*1.2e9);
+#endif
+#endif
+
// Invoke the variant.
f( &a_local,
&b_local,
&c_local,
cntl,
thread );
+
+#if defined(BLIS_ENABLE_PROFILE)
+#if defined(BLIS_ENABLE_C66X_BUILD)
+ counter_end = __clock64();
+#else
+ counter_end = (uint64_t) (bli_clock()*1.2e9);
+#endif
+ bli_profile_get_index(n, i, index);
+
+ bli_profile_data_update(bli_trmm_profile_data[index], (counter_end-counter_start), m_var*k_var*n_var);
+#endif
}
index b48d0f9f4277a76ffa59e845d87533c3affb3184..1d67404ecaedff6ef3ef2ab5aab6282b59d432c6 100644 (file)
lib_emt_Handle emt_handle_c0 = NULL; \
lib_emt_Handle emt_handle_c1 = NULL; \
\
+ /*For DSP timing*/ \
+ volatile uint64_t counter_start_ker, counter_start_nr, counter_start_mr; \
+ volatile uint64_t counter_end_ker, counter_end_nr, counter_end_mr; \
+ extern profile_data_t *bli_trmm_profile_data; \
/*
Assumptions/assertions:
rs_a == 1
{ \
printf("ker_var2 Failed to alloc edma handle for C1 CoreID %d \n", lib_get_coreID()); \
} \
+\
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_nr = __clock64(); \
+ } \
\
n_cur = ( bli_is_not_edge_f( 0, n_iter, n_left ) ? NR : n_left ); \
/* Transfering MC(=m)xNR*/ \
} \
} \
c11 = cNew1;/*c11 = c1;*/ \
+\
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_mr = __clock64(); \
+ } \
+\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
bli_auxinfo_set_is_a( PACKMR * k_a1011, aux ); \
\
/* Handle interior and edge cases separately. */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_ker = __clock64(); \
+ } \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
ct, rs_ct, cs_ct, \
c11, rs_c11, cs_c11/* rs_c, cs_c */); \
} \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_ker = __clock64(); \
+ bli_profile_data_update(bli_trmm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_KER_LOOP_IND], \
+ (counter_end_ker-counter_start_ker), 2*k_a1011*m_cur*n_cur); \
+ } \
} /*if ( trmm_l_ir_my_iter( i, ir_thread ) ) */ \
\
a1 += ps_a_cur; \
bli_auxinfo_set_is_a( istep_a, aux ); \
\
/* Handle interior and edge cases separately. */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_ker = __clock64(); \
+ } \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
ct, rs_ct, cs_ct, \
c11, rs_c11, cs_c11 /*rs_c, cs_c */); \
} \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_ker = __clock64(); \
+ bli_profile_data_update(bli_trmm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_KER_LOOP_IND],\
+ (counter_end_ker-counter_start_ker), 2*k*m_cur*n_cur); \
+ } \
} /*if ( trmm_l_ir_my_iter( i, ir_thread ) )*/ \
\
a1 += rstep_a; \
/*c11 += rstep_c;*/ \
c11 += rstep_c11; \
} /*for ( i = 0; i < m_iter; ++i )*/\
+\
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_mr = __clock64(); \
+ bli_profile_data_update(bli_trmm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_IR_LOOP_IND],\
+ (counter_end_mr-counter_start_mr), 2*k*m*n_cur); \
+ } \
+\
/* circularly shift buffers */ \
cNewTemp = cNew0; \
cNew0 = cNew2; \
} \
\
} \
+\
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_nr = __clock64(); \
+ bli_profile_data_update(bli_trmm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_JR_LOOP_IND],\
+ (counter_end_nr-counter_start_nr), 2*k*m*n); \
+ } \
\
bli_mem_release( &c2_L2_mem ); \
bli_mem_release( &c1_L2_mem ); \
index 3360f36d6494740cbce5ea501edd5cc4bde350d3..2182d55765bf7433219b6371bf6ada7434ffdf72 100644 (file)
lib_emt_Handle emt_handle_b = NULL; \
lib_emt_Handle emt_handle_c0 = NULL; \
lib_emt_Handle emt_handle_c1 = NULL; \
+\
+ /*For DSP timing*/ \
+ volatile uint64_t counter_start_ker, counter_start_nr, counter_start_mr; \
+ volatile uint64_t counter_end_ker, counter_end_nr, counter_end_mr; \
+ extern profile_data_t *bli_trmm_profile_data; \
\
/*
Assumptions/assertions:
{ \
printf("ker_var2 Failed to alloc edma handle for C1 CoreID %d \n", lib_get_coreID()); \
} \
+\
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_nr = __clock64(); \
+ } \
\
n_cur = ( bli_is_not_edge_f( 0, n_iter, n_left ) ? NR : n_left ); \
/* Loop over the n dimension (NR columns at a time). */ \
} \
} \
} \
+\
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_mr = __clock64(); \
+ } \
+\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
/* Save the 4m/3m imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( PACKMR * k_a1112, aux ); \
+\
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_ker = __clock64(); \
+ } \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
ct, rs_ct, cs_ct, \
c11, rs_c11, cs_c11 /*rs_c, cs_c*/ ); \
} \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_ker = __clock64(); \
+ bli_profile_data_update(bli_trmm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_KER_LOOP_IND],\
+ (counter_end_ker-counter_start_ker), 2*k_a1112*m_cur*n_cur); \
+ } \
} /*if ( trmm_l_ir_my_iter( i, ir_thread ) )*/ \
\
a1 += ps_a_cur; \
/* Save the 4m/3m imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( istep_a, aux ); \
+\
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_ker = __clock64(); \
+ } \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
ct, rs_ct, cs_ct, \
c11, rs_c11, cs_c11 /*rs_c, cs_c*/ ); \
} \
+\
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_ker = __clock64(); \
+ bli_profile_data_update(bli_trmm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_KER_LOOP_IND],\
+ (counter_end_ker-counter_start_ker), 2*k*m_cur*n_cur); \
+ } \
+\
} /*if ( trmm_l_ir_my_iter( i, ir_thread ) )*/\
\
a1 += rstep_a; \
c11 += rstep_c11; \
/*c11 += rstep_c;*/ \
} /*for ( i = 0; i < m_iter; ++i )*/\
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_mr = __clock64(); \
+ bli_profile_data_update(bli_trmm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_IR_LOOP_IND], \
+ (counter_end_mr-counter_start_mr), 2*k*m*n_cur); \
+ } \
/* circularly shift buffers */ \
cNewTemp = cNew0; \
cNew0 = cNew2; \
b1 += cstep_b; \
c1 += cstep_c; \
} \
+\
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_nr = __clock64(); \
+ bli_profile_data_update(bli_trmm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_JR_LOOP_IND], \
+ (counter_end_nr-counter_start_nr), 2*k*m*n); \
+ } \
\
bli_mem_release( &c2_L2_mem ); \
bli_mem_release( &c1_L2_mem ); \
index 0486cc2974d9fd46ea44a0a13d6311989eb6d726..c23083569b744ce94c14e96e13a8f7e0bc823fcc 100644 (file)
#define FUNCPTR_T gemm_fp
+#ifdef BLIS_ENABLE_PROFILE
+#define BLIS_ENABLE_PROFILE_KERVAR2 1
+#else
+#define BLIS_ENABLE_PROFILE_KERVAR2 0
+#endif
+
typedef void (*FUNCPTR_T)(
doff_t diagoffb,
pack_t schema_a,
func_t* gemm_ukrs;
void* gemm_ukr;
-
// Detach and multiply the scalars attached to A and B.
bli_obj_scalar_detach( a, &scalar_a );
bli_obj_scalar_detach( b, &scalar_b );
lib_emt_Handle emt_handle_b = NULL; \
lib_emt_Handle emt_handle_c0 = NULL; \
lib_emt_Handle emt_handle_c1 = NULL; \
+\
+ /*For DSP timing*/ \
+ volatile uint64_t counter_start_ker, counter_start_nr, counter_start_mr; \
+ volatile uint64_t counter_end_ker, counter_end_nr, counter_end_mr; \
+ extern profile_data_t *bli_trmm_profile_data; \
\
/*
Assumptions/assertions:
\
n_cur = ( bli_is_not_edge_f( 0, n_iter, n_left ) ? NR : n_left ); \
/* Loop over the n dimension (NR columns at a time). */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_nr = __clock64(); \
+ } \
/* Transfering MC(=m)xNR*/ \
if (cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
{ \
bli_auxinfo_set_is_b( PACKNR * k_b1121, aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_mr = __clock64(); \
+ } \
for ( i = 0; i < m_iter; ++i ) \
{ \
if ( trmm_r_ir_my_iter( i, ir_thread ) ) { \
bli_auxinfo_set_next_b( b2, aux ); \
\
/* Handle interior and edge cases separately. */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ /* Start the micro-kernel interval with __clock64(), the same counter */ \
+ /* source that ends it, so (counter_end_ker-counter_start_ker) is a */ \
+ /* valid cycle count. */ \
+ counter_start_ker = __clock64(); \
+ } \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
ct, rs_ct, cs_ct, \
c11, rs_c11, cs_c11 ); \
} \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_ker = __clock64(); \
+ bli_profile_data_update(bli_trmm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_KER_LOOP_IND],\
+ (counter_end_ker-counter_start_ker),2*k_b1121*m_cur*n_cur); \
+ } \
} \
\
a1 += rstep_a; \
c11 += rstep_c11; \
} \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ /* Close the IR-loop interval with __clock64(), the same counter source */ \
+ /* used for counter_start_mr, so the unsigned difference cannot wrap */ \
+ /* because of mismatched counter reads. */ \
+ counter_end_mr = __clock64(); \
+ /* Accumulate elapsed cycles and the work estimate into this thread's IR-loop record. */ \
+ bli_profile_data_update(bli_trmm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_IR_LOOP_IND], \
+ (counter_end_mr-counter_start_mr), 2*k_b1121*m*n_cur); \
+ } \
} \
\
b1 += ps_b_cur; \
bli_auxinfo_set_is_b( istep_b, aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_mr = __clock64(); \
+ } \
for ( i = 0; i < m_iter; ++i ) \
{ \
if ( trmm_r_ir_my_iter( i, ir_thread ) ) { \
bli_auxinfo_set_next_b( b2, aux ); \
\
/* Handle interior and edge cases separately. */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_ker = __clock64(); \
+ } \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
ct, rs_ct, cs_ct, \
c11, rs_c11, cs_c11 ); \
} \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_ker = __clock64(); \
+ bli_profile_data_update(bli_trmm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_KER_LOOP_IND], \
+ (counter_end_ker-counter_start_ker), 2*k*m_cur*n_cur); \
+ /*printf("gemm %d %d %d %ld\n", MR, NR, k, (counter_end_ker-counter_start_ker));*/ \
+ } \
} /*if ( trmm_r_ir_my_iter( i, ir_thread ) )*/\
\
a1 += rstep_a; \
c11 += rstep_c11; \
} /*for i*/\
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_mr = __clock64(); \
+ bli_profile_data_update(bli_trmm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_IR_LOOP_IND], \
+ (counter_end_mr-counter_start_mr), 2*k*m*n_cur); \
+ } \
} /*j thread*/\
\
b1 += cstep_b; \
} \
c1 += cstep_c; \
} \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_nr = __clock64(); \
+ bli_profile_data_update(bli_trmm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_JR_LOOP_IND], \
+ (counter_end_nr-counter_start_nr), 2*k*m*n); \
+ } \
\
bli_mem_release( &c2_L2_mem ); \
bli_mem_release( &c1_L2_mem ); \
index 3731fecc21daf9ccecbe60c3de43ab5a7ec6844f..6707817ce3c7c9effbe31111ce73439cec7c114d 100644 (file)
lib_emt_Handle emt_handle_b = NULL; \
lib_emt_Handle emt_handle_c0 = NULL; \
lib_emt_Handle emt_handle_c1 = NULL; \
+\
+ /*For DSP timing*/ \
+ volatile uint64_t counter_start_ker, counter_start_nr, counter_start_mr; \
+ volatile uint64_t counter_end_ker, counter_end_nr, counter_end_mr; \
+ extern profile_data_t *bli_trmm_profile_data; \
\
/*
Assumptions/assertions:
\
n_cur = ( bli_is_not_edge_f( 0, n_iter, n_left ) ? NR : n_left ); \
/* Loop over the n dimension (NR columns at a time). */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_nr = __clock64(); \
+ } \
/* Transfering MC(=m)xNR*/ \
if (cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
{ \
bli_auxinfo_set_is_b( PACKNR * k_b0111, aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_mr = __clock64(); \
+ } \
for ( i = 0; i < m_iter; ++i ) \
{ \
if ( trmm_r_ir_my_iter( i, ir_thread ) ) { \
bli_auxinfo_set_next_b( b2, aux ); \
\
/* Handle interior and edge cases separately. */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_ker = __clock64(); \
+ } \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
ct, rs_ct, cs_ct, \
c11, rs_c11, cs_c11 ); \
} \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_ker = __clock64(); \
+ bli_profile_data_update(bli_trmm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_KER_LOOP_IND],\
+ (counter_end_ker-counter_start_ker), 2*k_b0111*m_cur*n_cur); \
+ } \
} \
\
a1 += rstep_a; \
c11 += rstep_c11; \
} \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_mr = __clock64(); \
+ bli_profile_data_update(bli_trmm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_IR_LOOP_IND], \
+ (counter_end_mr-counter_start_mr), 2*k_b0111*m*n_cur); \
+ } \
} \
\
b1 += ps_b_cur; \
bli_auxinfo_set_is_b( istep_b, aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_mr = __clock64(); \
+ } \
for ( i = 0; i < m_iter; ++i ) \
{ \
if ( trmm_r_ir_my_iter( i, ir_thread ) ) { \
bli_auxinfo_set_next_b( b2, aux ); \
\
/* Handle interior and edge cases separately. */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_ker = __clock64(); \
+ } \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
ct, rs_ct, cs_ct, \
c11, rs_c11, cs_c11 ); \
} \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_ker = __clock64(); \
+ bli_profile_data_update(bli_trmm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_KER_LOOP_IND], \
+ (counter_end_ker-counter_start_ker), 2*k*m_cur*n_cur); \
+ } \
} \
\
a1 += rstep_a; \
c11 += rstep_c11; \
} \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_mr = __clock64(); \
+ bli_profile_data_update(bli_trmm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_IR_LOOP_IND], \
+ (counter_end_mr-counter_start_mr), 2*k*m*n_cur); \
+ } \
} \
\
b1 += cstep_b; \
\
c1 += cstep_c; \
} \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_nr = __clock64(); \
+ bli_profile_data_update(bli_trmm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_JR_LOOP_IND], \
+ (counter_end_nr-counter_start_nr), 2*k*m*n); \
+ } \
\
bli_mem_release( &c2_L2_mem ); \
bli_mem_release( &c1_L2_mem ); \
index c81b63c308040958c1b692b24d44d9e37da40193..3319b7700070617068ab622b9467690c88a549f7 100644 (file)
dim_t ic_nt = jr_way * ir_way;
dim_t jr_nt = ir_way;
dim_t ir_nt = 1;
-
trmm_thrinfo_t** paths = (trmm_thrinfo_t**) bli_malloc( global_num_threads * sizeof( trmm_thrinfo_t* ) );
index f6145a2310a525540a773e4787e8b87eef238ba1..9dc5bead6acd9ebaa9a5f334fd8c1d229e3e2352 100644 (file)
#ifdef BLIS_ENABLE_C66X_EDMA
dim_t b_alg_next;
#endif
+ volatile int counter_start;
+ volatile int counter_end;
// printf("blk_var1b\n");
// Initialize object for packing B.
bli_obj_width_after_trans( *a );
dim_t start, end;
- num_t datatype = bli_obj_execution_datatype( *a );
- bli_get_range( thread, offA, m_trans,
- //bli_lcm( bli_info_get_default_nr( datatype ), bli_info_get_default_mr( datatype ) ),
- bli_info_get_default_mc( datatype ),
- &start, &end );
+ num_t dt = bli_obj_execution_datatype( *a );
+// bli_get_range( thread, offA, m_trans,
+// //bli_lcm( bli_info_get_default_nr( datatype ), bli_info_get_default_mr( datatype ) ),
+// bli_info_get_default_mc( datatype ),
+// &start, &end );
+
+ bli_get_range_b2t( thread, offA, m_trans,
+ //bli_lcm( bli_info_get_default_nr( BLIS_TRSM, dt ), bli_info_get_default_mr( BLIS_TRSM, dt ) ),
+ bli_info_get_default_mc( dt ),
+ &start, &end );
// Partition along the remaining portion of the m dimension.
for ( i = start; i < end; i += b_alg )
if( thread_am_ichief( thread ) )
{
// DMA control leaf unrolling for A
- //printf("Init DMA A2 %x ", bli_mem_buffer(&(a1_dma->dma_mem)));
bli_dmam_init( &a1, a1_dma, cntl_sub_dmam_a( cntl ) );
- //printf("after DMA init %x %x %x\n", bli_mem_buffer(&(a1_dma->dma_mem)), bli_mem_buffer(&(a1_dma->pack_mem)), a1_dma->emt_handle);
}
bli_dmam_int( &a1, a1_dma, cntl_sub_dmam_a( cntl ), (dmam_thrinfo_t *) trsm_thread_sub_ipackm( thread ) );
}
#endif
// Perform trsm subproblem.
+ //TSCL = 0;
+ //counter_start = TSCL;
bli_trsm_int( &BLIS_ONE,
a1_pack,
b_pack,
cntl_sub_trsm( cntl ),
trsm_thread_sub_trsm( thread ) );
+ //counter_end = TSCL;
+ //if(lib_get_coreID()==0)
+ {
+ //printf("%d\n", (counter_end-counter_start));
+ }
+
#ifdef BLIS_ENABLE_C66X_EDMA
bli_obj_alias_to(c2, c1);
#endif
index 35b8a8fba019972af08d07f03c41a745052184a8..8296455ea8ee682cd642f279391b4a59f586edbf 100644 (file)
offA = bli_abs( bli_obj_diag_offset_after_trans( *a ) );
dim_t start, end;
- num_t datatype = bli_obj_execution_datatype( *a );
- bli_get_range( thread, offA, m_trans,
- //bli_lcm( bli_info_get_default_nr( datatype ), bli_info_get_default_mr( datatype ) ),
- bli_info_get_default_mc( datatype ),
- &start, &end );
+ num_t dt = bli_obj_execution_datatype( *a );
+ bli_get_range_t2b( thread, offA, m_trans,
+ //bli_lcm( bli_info_get_default_nr( BLIS_TRSM, dt ), bli_info_get_default_mr( BLIS_TRSM, dt ) ),
+ bli_info_get_default_mc( dt ),
+ &start, &end );
// Partition along the remaining portion of the m dimension.
for ( i = start; i < end; i += b_alg )
index 201cf792218e3eab7803dc48a90d1a2f28dc14eb..69550811b9689f2aaf3176f04fb18651b4f9db21 100644 (file)
// Query dimension in partitioning direction.
n_trans = bli_obj_width_after_trans( *b );
dim_t start, end;
- num_t datatype = bli_obj_execution_datatype( *a );
- bli_get_range( thread, 0, n_trans,
- bli_lcm( bli_info_get_default_nr( datatype ), bli_info_get_default_mr( datatype ) ),
+ num_t dt = bli_obj_execution_datatype( *a );
+
+ bli_get_range_r2l( thread, 0, n_trans,
+ bli_lcm( bli_info_get_default_nr( dt ), bli_info_get_default_mr( dt ) ),
&start, &end );
// Partition along the n dimension.
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_b( i, end, b,
cntl_blocksize( cntl ) );
- //printf("blk_var2 b_alg %d\n", b_alg);
+
+
// Acquire partitions for B1 and C1.
bli_acquire_mpart_r2l( BLIS_SUBPART1,
i, b_alg, b, &b1 );
index 9e3f82733f63a42c80c1981546666f169159bfc7..78ed4f6f80034f40fa5344e41e04b290f987105f 100644 (file)
// Query dimension in partitioning direction.
n_trans = bli_obj_width_after_trans( *b );
dim_t start, end;
- num_t datatype = bli_obj_execution_datatype( *a );
- bli_get_range( thread, 0, n_trans,
- bli_lcm( bli_info_get_default_nr( datatype ), bli_info_get_default_mr( datatype ) ),
+ num_t dt = bli_obj_execution_datatype( *a );
+// bli_get_range_l2r( thread, 0, n_trans,
+// //bli_lcm( bli_info_get_default_nr( BLIS_TRSM, dt ),
+// // bli_info_get_default_mr( BLIS_TRSM, dt ) ),
+// bli_lcm( bli_blksz_get_nr( dt, cntl_blocksize( cntl ) ),
+// bli_blksz_get_mr( dt, cntl_blocksize( cntl ) ) ),
+// &start, &end );
+
+ bli_get_range_l2r( thread, 0, n_trans,
+ bli_lcm( bli_info_get_default_nr( dt ), bli_info_get_default_mr( dt ) ),
&start, &end );
+ //printf("blk_var2f n = %d end = %d\n", n_trans, end);
+
// Partition along the n dimension.
for ( i = start; i < end; i += b_alg )
{
b_alg = bli_determine_blocksize_f( i, end, b,
cntl_blocksize( cntl ) );
+ //printf("start %d b_alg %d\n", i, b_alg);
+
// Acquire partitions for B1 and C1.
bli_acquire_mpart_l2r( BLIS_SUBPART1,
i, b_alg, b, &b1 );
index 811ddeb5260cf532db503718dee6b116f2e83526..f09b16965da240ba937d0a42d1002c6d41f1fcd4 100644 (file)
trsm_r_packa_cntl,
NULL, //trsm_r_packb_cntl, //
NULL,
- gemm_dmaa_cntl, //NULL, //
+ BLIS_GEMM_DMAA_CNTL, //NULL, //
NULL,
NULL,
trsm_cntl_bp_ke,
trsm_r_packb_cntl, //NULL, //
NULL,
NULL,
- gemm_dmab_cntl, //NULL, //
+ BLIS_GEMM_DMAB_CNTL, //NULL, //
NULL,
trsm_r_cntl_op_bp,
NULL,
index c6269218569ab51358bc178782a0cfc481ea6724..d971528a63d18385b2a4d62f66649f9a8df7163f 100644 (file)
#include "blis.h"
+
+
+#ifdef BLIS_ENABLE_PROFILE
+ profile_data_t *bli_trsm_profile_data;
+ profile_data_t *bli_trsm_kervar2_profile_data;
+#endif
+
void bli_trsm_front( side_t side,
obj_t* alpha,
obj_t* a,
obj_t b_local;
obj_t c_local;
+#ifdef BLIS_ENABLE_PROFILE
+ dim_t mn_side, m_profile, n_profile;
+#endif
+
+#ifdef BLIS_ENABLE_PROFILE
+ m_profile = bli_obj_length( *b );
+ n_profile = bli_obj_width( *b );
+ bli_set_dim_with_side( side, m_profile, n_profile, mn_side );
+
+#endif
+
// Check parameters.
if ( bli_error_checking_is_enabled() )
bli_trsm_check( side, alpha, a, b );
trsm_thrinfo_t** infos = bli_create_trsm_thrinfo_paths( bli_is_right( side ) );
dim_t n_threads = thread_num_threads( infos[0] );
+#ifdef BLIS_ENABLE_PROFILE
+ bli_trsm_profile_data = bli_profile_data_init(BLIS_MAX_NUM_THREADS*BLIS_PROFILE_NUM_REPORTS);
+#endif
+
// Invoke the internal back-end.
bli_level3_thread_decorator( n_threads,
(level3_int_t) bli_trsm_int,
(void**) infos );
bli_trsm_thrinfo_free_paths( infos, n_threads );
+
+#ifdef BLIS_ENABLE_PROFILE
+ {
+
+ bli_profile_data_print (bli_trsm_profile_data,
+ m_profile, n_profile, mn_side,
+ bli_obj_execution_datatype( *b ), 1, n_threads);
+
+
+ bli_profile_data_free(bli_trsm_profile_data);
+ }
+
+#endif
}
index 99efa46c27bc9b77bd49dd270be04d76bbde238d..06cb179a7cd30020f754a980fa513fbc6c8274b0 100644 (file)
impl_t i;
FUNCPTR_T f;
+#if defined(BLIS_ENABLE_PROFILE)
+ volatile uint64_t counter_start;
+ volatile uint64_t counter_end;
+ extern profile_data_t *bli_trsm_profile_data;
+ dim_t m_var, k_var, n_var;
+ dim_t index;
+#endif
+
// Check parameters.
if ( bli_error_checking_is_enabled() )
bli_trsm_int_check( alpha, a, b, beta, c, cntl );
// Index into the variant array to extract the correct function pointer.
f = vars[side][uplo][n][i];
+#if defined(BLIS_ENABLE_PROFILE)
+ m_var = bli_obj_length( *b );
+ n_var = bli_obj_width( *b );
+ bli_set_dim_with_side( side, m_var, n_var, k_var );
+
+#if defined(BLIS_ENABLE_C66X_BUILD)
+ TSCL = 0;
+ counter_start = __clock64();
+#else
+ counter_start = (uint64_t) (bli_clock()*1.2e9);
+#endif
+#endif
+
// Invoke the variant.
f( &a_local,
&b_local,
&c_local,
cntl,
thread );
+
+#if defined(BLIS_ENABLE_PROFILE)
+#if defined(BLIS_ENABLE_C66X_BUILD)
+ counter_end = __clock64();
+#else
+ counter_end = (uint64_t) (bli_clock()*1.2e9);
+#endif
+ bli_profile_get_index(n, i, index);
+
+ bli_profile_data_update(bli_trsm_profile_data[index], (counter_end-counter_start), m_var*k_var*n_var);
+#endif
}
index bea7aee266f8dd157bbafe013706594be19c3136..d6352535a093307d0851d96d0ef654d04bf8fdf8 100644 (file)
#define FUNCPTR_T gemm_fp
+/* Selects the IDMA-based A-panel prefetch path in this kernel variant. */
+#ifdef BLIS_ENABLE_C66X_IDMA
+#define BLIS_ENABLE_C66X_IDMA_KERVAR2 1
+#endif
+/* The kernel body tests this macro in ordinary C `if` statements, so it must
+   always expand to a value; default to 0 (direct, non-IDMA path) when IDMA
+   support is not enabled. */
+#ifndef BLIS_ENABLE_C66X_IDMA_KERVAR2
+#define BLIS_ENABLE_C66X_IDMA_KERVAR2 0
+#endif
+
+
#ifdef BLIS_ENABLE_C66X_MEM_POOLS
extern char *pool_mk_mem_L1;
dim_t k_b11; \
dim_t k_b21; \
dim_t off_b11; \
- /*dim_t off_b21;*/ \
+ dim_t off_b21; \
dim_t i, j, jb; \
inc_t rstep_a; \
inc_t cstep_b; \
- inc_t rstep_c, cstep_c; \
+ /*inc_t rstep_c;*/ \
+ inc_t cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t off_scl; \
lib_emt_Handle emt_handle_c0 = NULL; \
lib_emt_Handle emt_handle_c1 = NULL; \
\
+ /*For DSP timing*/ \
+ volatile uint64_t counter_start_ker, counter_start_nr, counter_start_mr; \
+ volatile uint64_t counter_end_ker, counter_end_nr, counter_end_mr; \
+ extern profile_data_t *bli_trsm_profile_data; \
/*
Assumptions/assertions:
rs_a == 1
\
cstep_b = ps_b; \
\
- rstep_c = rs_c * MR; \
+ /*rstep_c = rs_c * MR;*/ \
cstep_c = cs_c * NR; \
\
/* When C (MC*NR) is moved to L2 the stride to get to the next panel of MRxNR*/ \
rs_c11 = NR; /* stride to get to next row in MRxNR panel*/\
cs_c11 = 1; /*stride to get to next column in a panel of MRxNR*/\
\
- rstep_c11 = rstep_c; \
+ /*rstep_c11 = rstep_c; \
rs_c11 = rs_c; \
- cs_c11 = cs_c; \
+ cs_c11 = cs_c;*/ \
} \
\
istep_a = PACKMR * k_full; \
printf("ker_var2 Failed to alloc edma handle for C1 CoreID %d \n", lib_get_coreID()); \
} \
\
- /*for(jb = 0; jb < 16; jb ++) \
- printf("%f \t %f\n", a_cast[jb], b_cast[jb]);*/ \
- /* initiate first c transfer */ \
- /*printf("cstep_c %d rstep_c %d rs_c %d cs_c %d rstep_c11 %d rs_c11 %d cs_c11 %d\n", cstep_c, rstep_c, rs_c, cs_c, rstep_c11, rs_c11, cs_c11);*/\
n_cur = ( bli_is_not_edge_f( 0, n_iter, n_left ) ? NR : n_left ); \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_nr = __clock64(); \
+ } \
+ \
if(rs_c == 1) \
{\
c1 = c1 + (n_iter-1)*cstep_c; \
+ /*printf("rs_c = %d, cs_c = %d, rs_c11 = %d, cs_c11 = %d, cstep_c = %d, rstep_c11 = %d, n_cur = %d, m = %d\n", rs_c, cs_c, rs_c11, cs_c11, cstep_c,rstep_c11, n_cur, m);*/ \
if (cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
{ \
lib_emt_copy2D2D(emt_handle_c0, c1, \
} \
}\
}\
- /*else \
- lib_emt_copy2D2D(emt_handle_c0, c1 = c1 + (n_iter-1)*cstep_c, \
- cNew1, n_cur*sizeof(ctype), \
- m, rs_c*sizeof(ctype), rs_c11*sizeof(ctype));*/ \
+ else \
+ { \
+ /*printf("rs_c = %d, cs_c = %d, rs_c11 = %d, cs_c11 = %d, cstep_c = %d, rstep_c11 = %d, n_cur = %d, m = %d\n", rs_c, cs_c, rs_c11, cs_c11, cstep_c,rstep_c11, n_cur, m);*/ \
+ c1 = c1 + (n_iter-1)*cstep_c; \
+ if (rs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
+ { \
+ lib_emt_copy2D2D(emt_handle_c0, c1, \
+ cNew1, n_cur*sizeof(ctype), \
+ m, rs_c*sizeof(ctype), rs_c11*sizeof(ctype)); \
+ } \
+ else \
+ { \
+ dim_t ii; \
+ ctype *ptr_source; \
+ ctype *ptr_dest; \
+ ptr_source = c1; \
+ ptr_dest = cNew1; \
+ for(ii = 0; ii < m; ii++) \
+ { \
+ memcpy(ptr_dest, ptr_source, n_cur*sizeof(ctype)); \
+ ptr_source += rs_c; \
+ ptr_dest += rs_c11; \
+ } \
+ }\
+ }\
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( jb = 0; jb < n_iter; ++jb ) \
ctype* restrict b11; \
ctype* restrict b21; \
ctype* restrict b2; \
-\
+ \
j = n_iter - 1 - jb; \
diagoffb_j = diagoffb - ( doff_t )j*NR; \
a1 = a_cast; \
- if(rs_c == 1) \
- c11 = cNew1; \
- else \
- c11 = c1 + (n_iter-1)*cstep_c; \
-\
+ /*if(rs_c == 1)*/ \
+ c11 = cNew1; \
+ /*else \
+ c11 = c1 + (n_iter-1)*cstep_c; */\
+ \
n_cur = ( bli_is_not_edge_b( jb, n_iter, n_left ) ? NR : n_left ); \
n_next = ( bli_is_not_edge_b( jb+1, n_iter, n_left ) ? NR : n_left ); \
-\
+ \
lib_emt_copy1D1D(emt_handle_b, b1, b1_L1, k*NR*sizeof(ctype)); \
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
- if(rs_c == 1) \
- lib_emt_wait(emt_handle_c0); \
+ /*if(rs_c == 1)*/ \
+ lib_emt_wait(emt_handle_c0); \
if(jb < n_iter-1) /* no transfer for last iteration */ \
{ \
if (rs_c == 1) \
if (cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
{ \
lib_emt_copy2D2D(emt_handle_c0, c1-cstep_c, \
- cNew0, m*sizeof(ctype), \
- n_next, cs_c*sizeof(ctype), \
- cs_c11*sizeof(ctype)); \
+ cNew0, m*sizeof(ctype), \
+ n_next, cs_c*sizeof(ctype), \
+ cs_c11*sizeof(ctype)); \
} \
else \
{ \
} \
}\
}\
- /*else \
- { \
- lib_emt_copy2D2D(emt_handle_c0, c1 - cstep_c, \
- cNew0, n_next*sizeof(ctype), \
- m, rs_c*sizeof(ctype), rs_c11*sizeof(ctype)); \
- }*/ \
+ else \
+ { \
+ if (rs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
+ { \
+ lib_emt_copy2D2D(emt_handle_c0, c1 - cstep_c, \
+ cNew0, n_next*sizeof(ctype), \
+ m, rs_c*sizeof(ctype), rs_c11*sizeof(ctype)); \
+ } \
+ else \
+ { \
+ dim_t ii; \
+ ctype *ptr_source; \
+ ctype *ptr_dest; \
+ ptr_source = c1 - cstep_c; \
+ ptr_dest = cNew0; \
+ for(ii = 0; ii < m; ii++) \
+ { \
+ memcpy(ptr_dest, ptr_source, n_next*sizeof(ctype)); \
+ ptr_source += rs_c; \
+ ptr_dest += rs_c11; \
+ } \
+ }\
+ } \
} \
\
/* If the current panel of B intersects the diagonal, use a
k_b1121 = k - off_b11; \
k_b11 = NR; \
k_b21 = k_b1121 - NR; \
- /*off_b21 = off_b11 + k_b11;*/ \
+ off_b21 = off_b11 + k_b11; \
\
/* Compute the addresses of the triangular block B11 and the
panel B21. */ \
bli_auxinfo_set_is_a( PACKNR * k_b1121, aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
+\
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_mr = __clock64(); \
+ } \
for ( i = 0; i < m_iter; ++i ) \
{ \
- if( trsm_my_iter( i, thread ) ){ \
+ if( trsm_my_iter( i, thread ) ) \
+ { \
\
ctype* restrict a11; \
ctype* restrict a12; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
- /*printf("%d %d %d %d \n", k, MR, off_b11, PACKMR);*/ \
- if (i == 0) \
+ if (BLIS_ENABLE_C66X_IDMA_KERVAR2 == 1) \
{ \
+ if (i == 0) \
+ { \
lib_imt_copy(a1 + ( off_b11 * PACKMR ) / off_scl, a2_L1, k_b1121*MR*sizeof(ctype)); \
+ } \
} \
\
- /*ORIG TRSM*/ \
/* Compute the addresses of the next panels of A and B. */ \
- /*a2 = a1;*/ \
+ if (BLIS_ENABLE_C66X_IDMA_KERVAR2 == 1) \
+ { \
+ a2 = a1 + rstep_a; \
+ lib_imt_wait(); \
+ temp = a1_L1; \
+ a1_L1 = a2_L1; \
+ a2_L1 = temp; \
+ } \
+ else \
+ { \
+ a2 = a1; \
+ } \
\
/* Compute the addresses of the next panels of A and B. */ \
- a2 = a1 + rstep_a; \
- lib_imt_wait(); \
- temp = a1_L1; \
- a1_L1 = a2_L1; \
- a2_L1 = temp; \
if(i == 0) \
{ \
lib_emt_wait(emt_handle_b);\
} \
else \
{ \
- /*Start next panel*/ \
+ if (BLIS_ENABLE_C66X_IDMA_KERVAR2 == 1) \
+ { \
+ /*Start next panel*/ \
lib_imt_copy(a2 + ( off_b11 * PACKMR ) / off_scl, a2_L1, k_b1121*MR*sizeof(ctype)); \
+ } \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
bli_auxinfo_set_next_b( a2, aux ); \
\
/* Compute the addresses of the A11 block and A12 panel. */ \
- /*a11 = a1 + ( off_b11 * PACKMR ) / off_scl; */\
- /*a12 = a1 + ( off_b21 * PACKMR ) / off_scl;*/ \
- a11 = a1_L1;\
- a12 = a1_L1 + ( k_b11 * PACKMR ) / off_scl; \
+ /* If IDMA enabled*/ \
+ if (BLIS_ENABLE_C66X_IDMA_KERVAR2 == 1) \
+ { \
+ a11 = a1_L1;\
+ a12 = a1_L1 + ( k_b11 * PACKMR ) / off_scl; \
+ } \
+ else \
+ { \
+ a11 = a1 + ( off_b11 * PACKMR ) / off_scl; \
+ a12 = a1 + ( off_b21 * PACKMR ) / off_scl; \
+ } \
/* Handle interior and edge cases separately. */ \
- /*printf("Calling GEMMTRSM ukernel\n");*/\
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_ker = __clock64(); \
+ } \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the fused gemm/trsm micro-kernel. */ \
ct, rs_ct, cs_ct, \
c11, rs_c11, cs_c11 /* rs_c, cs_c **** 1, cstep_c11*/); \
} \
- lib_imt_wait(); \
- /*printf("%d %d \n", k_b11, PACKMR);*/ \
- lib_imt_copy(a11, a1 + ( off_b11 * PACKMR ) / off_scl, k_b11*PACKMR*sizeof(ctype)); \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_ker = __clock64(); \
+ bli_profile_data_update(bli_trsm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_KER_LOOP_IND],\
+ (counter_end_ker-counter_start_ker), 2*k_b21*m_cur*n_cur); \
} \
+ if (BLIS_ENABLE_C66X_IDMA_KERVAR2 == 1) \
+ { \
+ /*Storing the value back*/\
+ /*lib_imt_wait(); \
+ lib_imt_copy(a11, a1 + ( off_b11 * PACKMR ) / off_scl, k_b11*PACKMR*sizeof(ctype), 0,0,7);*/ \
+ { \
+ ctype *ptr_source; \
+ ctype *ptr_dest; \
+ ptr_source = a11; \
+ ptr_dest = a1 + ( off_b11 * PACKMR ) / off_scl; \
+ memcpy(ptr_dest, ptr_source, k_b11*PACKMR*sizeof(ctype)); \
+ } \
+ } \
+ } /*trsm_my_iter( i, thread ) */\
\
a1 += rstep_a; \
/*c11 += rstep_c;*/ \
c11 += rstep_c11; \
+ } /*MR loop*/\
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_mr = __clock64(); \
+ bli_profile_data_update(bli_trsm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_IR_LOOP_IND], \
+ (counter_end_mr-counter_start_mr), 2*k*m*n_cur); \
} \
\
b1 += ps_b_cur; \
bli_auxinfo_set_is_a( istep_b, aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_mr = __clock64(); \
+ } \
+\
for ( i = 0; i < m_iter; ++i ) \
{ \
if( trsm_my_iter( i, thread ) ){ \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
- if(i == 0) \
+ if (BLIS_ENABLE_C66X_IDMA_KERVAR2 == 1) \
{ \
+ if(i == 0) \
+ { \
lib_imt_copy(a1, a2_L1, k*MR*sizeof(ctype)); \
+ } \
+ /* Compute the addresses of the next panels of A and B. */ \
+ a2 = a1 + rstep_a; \
+ lib_imt_wait(); \
+ temp = a1_L1; \
+ a1_L1 = a2_L1; \
+ a2_L1 = temp; \
+ } \
+ else \
+ { \
+ /* Compute the addresses of the next panels of A and B. */ \
+ a2 = a1;\
} \
\
- /*ORIG TRSM*/ \
- /* Compute the addresses of the next panels of A and B. */ \
- /*a2 = a1;*/\
- /* Compute the addresses of the next panels of A and B. */ \
- a2 = a1 + rstep_a; \
- lib_imt_wait(); \
- temp = a1_L1; \
- a1_L1 = a2_L1; \
- a2_L1 = temp; \
if(i == 0) \
{ \
lib_emt_wait(emt_handle_b);\
} \
else \
{ \
- /*Start next panel*/ \
+ if (BLIS_ENABLE_C66X_IDMA_KERVAR2 == 1) \
+ { \
+ /*Start next panel*/ \
lib_imt_copy(a2, a2_L1, k*MR*sizeof(ctype)); \
+ } \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
bli_auxinfo_set_next_b( a2, aux ); \
\
/* Handle interior and edge cases separately. */ \
- /*printf("Calling GEMM ukernel\n");*/\
- if ( m_cur == MR && n_cur == NR ) \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
{ \
- /* Invoke the gemm micro-kernel. */ \
- gemm_ukr_cast( k, \
- minus_one, \
- b1_L1, \
- a1_L1, \
- alpha2_cast, \
- c11, cs_c11, rs_c11, /*cs_c, rs_c, * cstep_c11, 1,*/ \
- &aux ); \
+ counter_start_ker = __clock64(); \
+ } \
+ if (BLIS_ENABLE_C66X_IDMA_KERVAR2 == 1) \
+ { \
+ if ( m_cur == MR && n_cur == NR ) \
+ { \
+ /* Invoke the gemm micro-kernel. */ \
+ gemm_ukr_cast( k, \
+ minus_one, \
+ b1_L1, \
+ a1_L1, \
+ alpha2_cast, \
+ c11, cs_c11, rs_c11, /*cs_c, rs_c, * cstep_c11, 1,*/ \
+ &aux ); \
+ } \
+ else \
+ { \
+ /* Invoke the gemm micro-kernel. */ \
+ gemm_ukr_cast( k, \
+ minus_one, \
+ b1_L1, \
+ a1_L1, \
+ zero, \
+ ct, cs_ct, rs_ct, \
+ &aux ); \
+\
+ /* Add the result to the edge of C. */ \
+ PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
+ ct, rs_ct, cs_ct, \
+ alpha2_cast, \
+ c11, rs_c11, cs_c11 /*rs_c, cs_c *1, cstep_c11 */); \
+ } \
} \
else \
{ \
- /* Invoke the gemm micro-kernel. */ \
- gemm_ukr_cast( k, \
- minus_one, \
- b1_L1, \
- a1_L1, \
- zero, \
- ct, cs_ct, rs_ct, \
- &aux ); \
-\
- /* Add the result to the edge of C. */ \
- PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
- ct, rs_ct, cs_ct, \
- alpha2_cast, \
- c11, rs_c11, cs_c11 /*rs_c, cs_c *1, cstep_c11 */); \
+ if ( m_cur == MR && n_cur == NR ) \
+ { \
+ /* Invoke the gemm micro-kernel. */ \
+ gemm_ukr_cast( k, \
+ minus_one, \
+ b1_L1, \
+ a1, \
+ alpha2_cast, \
+ c11, cs_c11, rs_c11, /*cs_c, rs_c, * cstep_c11, 1,*/ \
+ &aux ); \
+ } \
+ else \
+ { \
+ /* Invoke the gemm micro-kernel. */ \
+ gemm_ukr_cast( k, \
+ minus_one, \
+ b1_L1, \
+ a1, \
+ zero, \
+ ct, cs_ct, rs_ct, \
+ &aux ); \
+\
+ /* Add the result to the edge of C. */ \
+ PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
+ ct, rs_ct, cs_ct, \
+ alpha2_cast, \
+ c11, rs_c11, cs_c11 /*rs_c, cs_c *1, cstep_c11 */); \
+ } \
} \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_ker = __clock64(); \
+ bli_profile_data_update(bli_trsm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_KER_LOOP_IND],\
+ (counter_end_ker-counter_start_ker), 2*k*m_cur*n_cur); \
} \
+ } /*myiter*/\
\
a1 += rstep_a; \
/*c11 += rstep_c;*/ \
c11 += rstep_c11; \
+ } /*MR loop if b does not intersect diagonal*/\
+ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_mr = __clock64(); \
+ bli_profile_data_update(bli_trsm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_IR_LOOP_IND], \
+ (counter_end_mr-counter_start_mr), 2*k*m*n_cur); \
} \
\
b1 += cstep_b; \
} \
\
/* circularly shift buffers */ \
- if(rs_c==1) \
- { \
cNewTemp = cNew0; \
cNew0 = cNew2; \
cNew2 = cNew1; \
{ \
lib_emt_wait(emt_handle_c1); \
} \
- } \
/* save updated c*/ \
if(rs_c==1) \
{ \
} \
} \
} \
- /*else \
- lib_emt_copy2D2D(emt_handle_c1, cNew1, c1, n_cur*sizeof(ctype), \
- m, rs_c11*sizeof(ctype), rs_c*sizeof(ctype)); */\
+ else \
+ { \
+ if (rs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
+ { \
+ lib_emt_copy2D2D(emt_handle_c1, cNew2, c1, n_cur*sizeof(ctype), \
+ m, rs_c11*sizeof(ctype), rs_c*sizeof(ctype)); \
+ } \
+ else \
+ { \
+ dim_t ii; \
+ ctype *ptr_source; \
+ ctype *ptr_dest; \
+ ptr_source = cNew2; \
+ ptr_dest = c1; \
+ for(ii = 0; ii < m; ii++) \
+ { \
+ memcpy(ptr_dest, ptr_source, n_cur*sizeof(ctype)); \
+ ptr_source += rs_c11; \
+ ptr_dest += rs_c; \
+ } \
+ }\
+ }\
\
c1 -= cstep_c; \
+ } /*NR loop*/\
+ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_nr = __clock64(); \
+ bli_profile_data_update(bli_trsm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_JR_LOOP_IND], \
+ (counter_end_nr-counter_start_nr), 2*k*m*n); \
} \
\
bli_mem_release( &c2_L2_mem ); \
index f126f6ab63f434c6d7164ef29f79e06ed9f15347..c2b44f8801711969911b70c942a8f2bd69c654a7 100644 (file)
#define FUNCPTR_T gemm_fp
-
#ifdef BLIS_ENABLE_C66X_MEM_POOLS
extern char *pool_mk_mem_L1;
dim_t i, j; \
inc_t rstep_a; \
inc_t cstep_b; \
- inc_t rstep_c, cstep_c; \
+ /*inc_t rstep_c; */\
+ inc_t cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t off_scl; \
inc_t ps_b_cur; \
auxinfo_t aux; \
\
- /*inc_t rstep_c11, rs_c11, cs_c11;*/ \
+ dim_t n_next; \
+ inc_t rstep_c11, rs_c11, cs_c11; \
\
mem_t b1_L1_mem; \
/*memcpy does not like b1_L1 if it is restrict. The resid of gemm is non zero if this is changed to ctype* restrict*/ \
mem_t a1_L1_mem, a2_L1_mem; \
ctype *a1_L1, *a2_L1, *temp; \
\
-/* mem_t c0_L2_mem, c1_L2_mem, c2_L2_mem; \
- ctype *cNew0, *cNew1, *cNew2, *cNewTemp;*/ \
+ mem_t c0_L2_mem, c1_L2_mem, c2_L2_mem; \
+ ctype *cNew0, *cNew1, *cNew2, *cNewTemp; \
/*EDMA Declarations */ \
\
lib_emt_Handle emt_handle_b = NULL; \
-/* lib_emt_Handle emt_handle_c0 = NULL; \
- lib_emt_Handle emt_handle_c1 = NULL;*/ \
+ lib_emt_Handle emt_handle_c0 = NULL; \
+ lib_emt_Handle emt_handle_c1 = NULL; \
\
+ /*For DSP timing*/ \
+ volatile uint64_t counter_start_ker, counter_start_nr, counter_start_mr; \
+ volatile uint64_t counter_end_ker, counter_end_nr, counter_end_mr; \
+ extern profile_data_t *bli_trsm_profile_data; \
/*
Assumptions/assertions:
rs_a == 1
last column and the next multiple of NR zero-padded accordingly. */ \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
- PASTEMAC(ch,set0s_mxn)( MR, NR, \
- ct, rs_ct, cs_ct ); \
+ /*PASTEMAC(ch,set0s_mxn)( MR, NR, \
+ ct, rs_ct, cs_ct );*/ \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
\
cstep_b = ps_b; \
\
- rstep_c = rs_c * MR; \
+ /*rstep_c = rs_c * MR;*/ \
cstep_c = cs_c * NR; \
\
/* When C (MC*NR) is moved to L2 the stride to get to the next panel of MRxNR*/ \
-/* if(rs_c == 1) \
+ if(rs_c == 1) \
{ \
- rstep_c11 = MR; / *stride to get to next panel of MRxNR in a panel of MCxNR* /\
+ rstep_c11 = MR; /*stride to get to next panel of MRxNR in a panel of MCxNR*/ \
rs_c11 = 1; \
- cs_c11 = (m%2 == 0) ? m : m+1; / *stride to get to next column in a panel of MRxNR* /\
+ cs_c11 = (m%2 == 0) ? m : m+1; /*stride to get to next column in a panel of MRxNR*/ \
} \
else\
{ \
- rstep_c11 = NR*MR; / *stride to get to next panel of MRxNR in a panel of MCxNR* /\
- rs_c11 = NR; / * stride to get to next row in MRxNR panel* /\
- cs_c11 = 1; / *stride to get to next column in a panel of MRxNR* /\
- } */\
+ rstep_c11 = NR*MR; /*stride to get to next panel of MRxNR in a panel of MCxNR*/\
+ rs_c11 = NR; /* stride to get to next row in MRxNR panel*/ \
+ cs_c11 = 1; /*stride to get to next column in a panel of MRxNR*/\
+ } \
\
istep_a = PACKMR * k_full; \
istep_b = PACKNR * k; \
a1_L1 = (ctype*) (pool_mk_mem_L1 ); \
a2_L1 = (ctype*) (pool_mk_mem_L1 + k*MR*sizeof(ctype) ) ;\
b1_L1 = (ctype*) (pool_mk_mem_L1 + PASTEMAC(ch,bank) + 2 * k*MR*sizeof(ctype)) ;\
- /*printf("%x %x %x \n", a1_L1, a2_L1, b1_L1);*/\
}\
else { \
- /*Acquiring a buffer for B in L1*/ \
- bli_mem_acquire_m( k*NR*sizeof(ctype), BLIS_BUFFER_FOR_B_PANEL_L1, &b1_L1_mem); \
- b1_L1 = bli_mem_buffer( &b1_L1_mem ); \
- b1_L1 = (ctype *) ((char *) b1_L1_mem.buf + PASTEMAC(ch,bank)); \
+ /*Acquiring a buffer for B in L1*/ \
+ bli_mem_acquire_m( k*NR*sizeof(ctype), BLIS_BUFFER_FOR_B_PANEL_L1, &b1_L1_mem); \
+ b1_L1 = bli_mem_buffer( &b1_L1_mem ); \
+ b1_L1 = (ctype *) ((char *) b1_L1_mem.buf + PASTEMAC(ch,bank)); \
\
- /*Acquiring a buffer for A in L1*/ \
- bli_mem_acquire_m( k*MR*sizeof(ctype), BLIS_BUFFER_FOR_A_BLOCK_L1, &a1_L1_mem); \
- a1_L1 = bli_mem_buffer( &a1_L1_mem ); \
+ /*Acquiring a buffer for A in L1*/ \
+ bli_mem_acquire_m( k*MR*sizeof(ctype), BLIS_BUFFER_FOR_A_BLOCK_L1, &a1_L1_mem); \
+ a1_L1 = bli_mem_buffer( &a1_L1_mem ); \
\
- bli_mem_acquire_m( k*MR*sizeof(ctype), BLIS_BUFFER_FOR_A_BLOCK_L1, &a2_L1_mem); \
- a2_L1 = bli_mem_buffer( &a2_L1_mem ); \
+ bli_mem_acquire_m( k*MR*sizeof(ctype), BLIS_BUFFER_FOR_A_BLOCK_L1, &a2_L1_mem); \
+ a2_L1 = bli_mem_buffer( &a2_L1_mem ); \
}\
\
/*Acquiring buffers for C (MC_x_NR) in L2 */\
-/* bli_mem_acquire_m( m*NR*sizeof(ctype), BLIS_BITVAL_BUFFER_FOR_C_PANEL_L2, &c0_L2_mem); \
+ bli_mem_acquire_m( m*NR*sizeof(ctype), BLIS_BUFFER_FOR_C_PANEL_L2, &c0_L2_mem); \
cNew0 = bli_mem_buffer( &c0_L2_mem ); \
\
- bli_mem_acquire_m( m*NR*sizeof(ctype), BLIS_BITVAL_BUFFER_FOR_C_PANEL_L2, &c1_L2_mem); \
+ bli_mem_acquire_m( m*NR*sizeof(ctype), BLIS_BUFFER_FOR_C_PANEL_L2, &c1_L2_mem); \
cNew1 = bli_mem_buffer( &c1_L2_mem ); \
\
- bli_mem_acquire_m( m*NR*sizeof(ctype), BLIS_BITVAL_BUFFER_FOR_C_PANEL_L2, &c2_L2_mem); \
+ bli_mem_acquire_m( m*NR*sizeof(ctype), BLIS_BUFFER_FOR_C_PANEL_L2, &c2_L2_mem); \
cNew2 = bli_mem_buffer( &c2_L2_mem ); \
-*/ \
\
/*Acquiring an EDMA handle from the pool*/ \
bli_dma_channel_acquire(&(emt_handle_b), lib_get_coreID()); \
{ \
printf("ker_var2 Failed to alloc edma handle CoreID %d \n", lib_get_coreID()); \
} \
-/* bli_dma_channel_acquire(&(emt_handle_c0), lib_get_coreID()); \
+ bli_dma_channel_acquire(&(emt_handle_c0), lib_get_coreID()); \
if(emt_handle_c0 == NULL) \
{ \
printf("ker_var2 Failed to alloc edma handle for C0 CoreID %d \n", lib_get_coreID()); \
{ \
printf("ker_var2 Failed to alloc edma handle for C1 CoreID %d \n", lib_get_coreID()); \
} \
-*/ \
+ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_nr = __clock64(); \
+ } \
+ n_cur = ( bli_is_not_edge_f( 0, n_iter, n_left ) ? NR : n_left ); \
+ if(rs_c == 1) \
+ {\
+ if (cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
+ { \
+ lib_emt_copy2D2D(emt_handle_c0, c1, \
+ cNew1, m*sizeof(ctype), \
+ n_cur, cs_c*sizeof(ctype), cs_c11*sizeof(ctype)); \
+ } \
+ else \
+ { \
+ dim_t ii; \
+ ctype *ptr_source; \
+ ctype *ptr_dest; \
+ ptr_source = c1; \
+ ptr_dest = cNew1; \
+ for(ii = 0; ii < n_cur; ii++) \
+ { \
+ memcpy(ptr_dest, ptr_source, m*sizeof(ctype)); \
+ ptr_source += cs_c; \
+ ptr_dest += cs_c11; \
+ } \
+ }\
+ }\
+ else \
+ { \
+ if (rs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
+ { \
+ lib_emt_copy2D2D(emt_handle_c0, c1, \
+ cNew1, n_cur*sizeof(ctype), \
+ m, rs_c*sizeof(ctype), rs_c11*sizeof(ctype)); \
+ } \
+ else \
+ { \
+ dim_t ii; \
+ ctype *ptr_source; \
+ ctype *ptr_dest; \
+ ptr_source = c1; \
+ ptr_dest = cNew1; \
+ for(ii = 0; ii < m; ii++) \
+ { \
+ memcpy(ptr_dest, ptr_source, n_cur*sizeof(ctype)); \
+ ptr_source += rs_c; \
+ ptr_dest += rs_c11; \
+ } \
+ }\
+ }\
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
\
diagoffb_j = diagoffb - ( doff_t )j*NR; \
a1 = a_cast; \
- c11 = c1; \
+ /*c11 = c1; */\
+ c11 = cNew1; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
+ n_next = ( bli_is_not_edge_f( j+1, n_iter, n_left ) ? NR : n_left ); \
\
lib_emt_copy1D1D(emt_handle_b, b1, b1_L1, k*NR*sizeof(ctype)); \
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
+\
+ lib_emt_wait(emt_handle_c0); \
+ if(j < n_iter-1) /* no transfer for last iteration */ \
+ { \
+ if (rs_c == 1) \
+ { \
+ if (cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
+ { \
+ lib_emt_copy2D2D(emt_handle_c0, c1+cstep_c, \
+ cNew0, m*sizeof(ctype), \
+ n_next, cs_c*sizeof(ctype), \
+ cs_c11*sizeof(ctype)); \
+ } \
+ else \
+ { \
+ dim_t ii; \
+ ctype *ptr_source; \
+ ctype *ptr_dest; \
+ ptr_source = c1+cstep_c; \
+ ptr_dest = cNew0; \
+ for(ii = 0; ii < n_next; ii++) \
+ { \
+ memcpy(ptr_dest, ptr_source, m*sizeof(ctype)); \
+ ptr_source += cs_c; \
+ ptr_dest += cs_c11; \
+ } \
+ }\
+ }\
+ else \
+ { \
+ if (rs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
+ { \
+ lib_emt_copy2D2D(emt_handle_c0, c1 + cstep_c, \
+ cNew0, n_next*sizeof(ctype), \
+ m, rs_c*sizeof(ctype), rs_c11*sizeof(ctype)); \
+ } \
+ else \
+ { \
+ dim_t ii; \
+ ctype *ptr_source; \
+ ctype *ptr_dest; \
+ ptr_source = c1 + cstep_c; \
+ ptr_dest = cNew0; \
+ for(ii = 0; ii < m; ii++) \
+ { \
+ memcpy(ptr_dest, ptr_source, n_next*sizeof(ctype)); \
+ ptr_source += rs_c; \
+ ptr_dest += rs_c11; \
+ } \
+ }\
+ } \
+ } \
\
/* If the current panel of B intersects the diagonal, use a
special micro-kernel that performs a fused gemm and trsm.
bli_auxinfo_set_is_a( PACKNR * k_b0111, aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_mr = __clock64(); \
+ } \
for ( i = 0; i < m_iter; ++i ) \
{ \
if( trsm_my_iter( i, thread ) ){ \
bli_auxinfo_set_next_b( a2, aux ); \
\
/* Handle interior and edge cases separately. */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_ker = __clock64(); \
+ } \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the fused gemm/trsm micro-kernel. */ \
b11, \
a10, \
a11, \
- c11, cs_c, rs_c, \
+ c11, cs_c11, rs_c11, /*cs_c, rs_c,*/ \
&aux ); \
} \
else \
/* Copy the result to the bottom edge of C. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
- c11, rs_c, cs_c ); \
+ c11, rs_c11, cs_c11 /*rs_c, cs_c*/ ); \
+ } \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_ker = __clock64(); \
+ bli_profile_data_update(bli_trsm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_KER_LOOP_IND],\
+ (counter_end_ker-counter_start_ker), 2*k_b01*m_cur*n_cur); \
} \
lib_imt_wait(); \
lib_imt_copy(a11, a1 + ( off_b11 * PACKMR ) / off_scl, NR*PACKMR*sizeof(ctype)); \
} \
\
a1 += rstep_a; \
- c11 += rstep_c; \
+ /*c11 += rstep_c;*/ \
+ c11 += rstep_c11; \
+ } \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_mr = __clock64(); \
+ bli_profile_data_update(bli_trsm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_IR_LOOP_IND], \
+ (counter_end_mr-counter_start_mr), 2*k*m*n_cur); \
} \
\
b1 += ps_b_cur; \
bli_auxinfo_set_is_a( istep_b, aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_mr = __clock64(); \
+ } \
for ( i = 0; i < m_iter; ++i ) \
{ \
if( trsm_my_iter( i, thread ) ){ \
bli_auxinfo_set_next_b( a2, aux ); \
\
/* Handle interior and edge cases separately. */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_ker = __clock64(); \
+ } \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
b1_L1, \
a1_L1, \
alpha2_cast, \
- c11, cs_c, rs_c, \
+ c11, cs_c11, rs_c11, /*cs_c, rs_c,*/ \
&aux ); \
} \
else \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
alpha2_cast, \
- c11, rs_c, cs_c ); \
+ c11, rs_c11, cs_c11 /*rs_c, cs_c*/ ); \
+ } \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_ker = __clock64(); \
+ bli_profile_data_update(bli_trsm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_KER_LOOP_IND],\
+ (counter_end_ker-counter_start_ker), 2*k*m_cur*n_cur); \
} \
} \
\
a1 += rstep_a; \
- c11 += rstep_c; \
+ /*c11 += rstep_c;*/ \
+ c11 += rstep_c11; \
+ } \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_mr = __clock64(); \
+ bli_profile_data_update(bli_trsm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_IR_LOOP_IND], \
+ (counter_end_mr-counter_start_mr), 2*k*m*n_cur); \
} \
\
b1 += cstep_b; \
} \
\
+ /* circularly shift buffers */ \
+ cNewTemp = cNew0; \
+ cNew0 = cNew2; \
+ cNew2 = cNew1; \
+ cNew1 = cNewTemp; \
+ if(j != 0) /* wait for save c to complete; skip first iteration */ \
+ { \
+ lib_emt_wait(emt_handle_c1); \
+ } \
+ /* save updated c*/ \
+ if(rs_c==1) \
+ { \
+ if (cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
+ { \
+ lib_emt_copy2D2D(emt_handle_c1, cNew2, c1, m*sizeof(ctype), \
+ n_cur, cs_c11*sizeof(ctype), cs_c*sizeof(ctype)); \
+ } \
+ else \
+ { \
+ dim_t ii; \
+ ctype *ptr_source; \
+ ctype *ptr_dest; \
+ ptr_source = cNew2; \
+ ptr_dest = c1; \
+ for(ii = 0; ii < n_cur; ii++) \
+ { \
+ memcpy(ptr_dest, ptr_source, m*sizeof(ctype)); \
+ ptr_source += cs_c11; \
+ ptr_dest += cs_c; \
+ } \
+ } \
+ } \
+ else \
+ { \
+ if (rs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
+ { \
+ lib_emt_copy2D2D(emt_handle_c1, cNew2, c1, n_cur*sizeof(ctype), \
+ m, rs_c11*sizeof(ctype), rs_c*sizeof(ctype)); \
+ } \
+ else \
+ { \
+ dim_t ii; \
+ ctype *ptr_source; \
+ ctype *ptr_dest; \
+ ptr_source = cNew2; \
+ ptr_dest = c1; \
+ for(ii = 0; ii < m; ii++) \
+ { \
+ memcpy(ptr_dest, ptr_source, n_cur*sizeof(ctype)); \
+ ptr_source += rs_c11; \
+ ptr_dest += rs_c; \
+ } \
+ }\
+ }\
c1 += cstep_c; \
} \
-/* bli_mem_release( &c2_L2_mem ); \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_nr = __clock64(); \
+ bli_profile_data_update(bli_trsm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_JR_LOOP_IND], \
+ (counter_end_nr-counter_start_nr), 2*k*m*n); \
+ } \
+ bli_mem_release( &c2_L2_mem ); \
bli_mem_release( &c1_L2_mem ); \
bli_mem_release( &c0_L2_mem ); \
- */ \
+\
if((MKSTR(ch)=="c")==0) \
{\
- bli_mem_release( &a2_L1_mem ); \
- bli_mem_release( &a1_L1_mem ); \
- bli_mem_release( &b1_L1_mem ); \
+ bli_mem_release( &a2_L1_mem ); \
+ bli_mem_release( &a1_L1_mem ); \
+ bli_mem_release( &b1_L1_mem ); \
}\
if ( emt_handle_b != NULL ) \
{ \
bli_dma_channel_release(emt_handle_b, lib_get_coreID()); \
emt_handle_b = NULL; \
} \
-/* if ( emt_handle_c0 != NULL ) \
+ if ( emt_handle_c0 != NULL ) \
{ \
bli_dma_channel_release(emt_handle_c0, lib_get_coreID()); \
emt_handle_c0 = NULL; \
lib_emt_wait(emt_handle_c1); \
bli_dma_channel_release(emt_handle_c1, lib_get_coreID()); \
emt_handle_c1 = NULL; \
- } \
-*/ \
+ }\
+ \
}
INSERT_GENTFUNC_BASIC2( trsm_ru_ker_var2, gemmtrsm_ukr_t, gemm_ukr_t )
index faf1e3b3d2ecd87fb69d17dfb0808a759e37e858..c83b6d063659eb0c28dd1fd5b77a918d43c35281 100755 (executable)
*/
#include "blis.h"
+
+#if defined(BLIS_ENABLE_C66X_OPENCL)
+/*
+ * fprintf()-shaped wrapper used by the C66x OpenCL build.
+ *
+ * _fp     - accepted only for signature compatibility; it is never used and
+ *           all output goes to stdout.
+ * _format - printf-style format string, followed by its arguments.
+ *
+ * Always returns 0.
+ */
+int ti_printf(FILE* _fp, const char *_format, ...)
+{
+    va_list argptr;
+
+    (void)_fp; /* stream argument is intentionally ignored */
+
+    va_start(argptr,_format);
+    /* A va_list must be consumed by the v* variant; handing it to printf()
+       is undefined behaviour and prints garbage for every conversion. */
+    vprintf(_format, argptr);
+    va_end(argptr);
+    return 0;
+}
+#endif
+
#if defined(BLIS_ENABLE_C66X_BUILD) && defined(BLIS_ENABLE_C66X_MEM_POOLS) && defined(BLIS_ENABLE_C66X_EDMA)
//#define BLIS_EDMA_DEBUG
//#define BLIS_ENABLE_CYCLE_COUNT
if ( bli_obj_is_zeros( *a ) )
{
- //printf("zeros\n");
- //bli_obj_release_dma( p, cntl );
bli_obj_alias_for_dma( *a, *p );
return;
}
index d4b26f7e47b0640d9db39dca89008d58625468b4..0e4928ce5a44fd5e93df6f69c32dfe12e0f4d8dd 100644 (file)
#ifndef BLIS_DMA_H
#define BLIS_DMA_H
+
+int ti_printf(FILE *_fp, const char *_format, ...);
+int ti_sprintf(char *str, const char *_format, ...);
+
+
/*
* EDMA Pool
*/
index 8638a23eae933ea7296e199634157c8ec2c072ae..d61f8e8d1e487a358b8629a5c5889a6d6f3da1a7 100644 (file)
{
fprintf( stderr, "libblis: Aborting.\n" );
//raise( SIGABRT );
-#ifdef BLIS_ENABLE_C66X_OPENCL
- exit(1);
-#else
abort();
-#endif
}
void bli_print_msg( char* str, char* file, guint_t line )
{
-#ifdef BLIS_ENABLE_C66X_OPENCL
+ //fprintf( stderr, "\n" );
+ //fprintf( stderr, "libblis: %s (line %lu):\n", file, ( long unsigned int )line );
+ //fprintf( stderr, "libblis: %s\n", str );
printf( "\n" );
printf( "libblis: %s (line %lu):\n", file, ( long unsigned int )line );
printf( "libblis: %s\n", str );
-
-#else
- fprintf( stderr, "\n" );
- fprintf( stderr, "libblis: %s (line %lu):\n", file, ( long unsigned int )line );
- fprintf( stderr, "libblis: %s\n", str );
+#ifndef BLIS_ENABLE_C66X_OPENCL
fflush( stderr );
#endif
}
index fc8bfe27ba88d41ee37d54b877b1047f1dddcdac..40e50571ce39143cb1c8aeedb3c997a7feccad5a 100644 (file)
#pragma DATA_ALIGN(pool_kn_mem_L2, BLIS_CACHE_LINE_SIZE);
static char pool_kn_mem_L2[ BLIS_KN_POOL_SIZE_L2 ];
-static void* pool_mn_blk_ptrs_L2[ BLIS_NUM_MC_X_NC_BLOCKS_L2 ];
+static void* pool_mn_blk_ptrs_L2[ BLIS_NUM_MC_X_NR_BLOCKS_L2 ];
#pragma DATA_SECTION( pool_mn_mem_L2, ".myL2");
#pragma DATA_ALIGN(pool_mn_mem_L2, BLIS_CACHE_LINE_SIZE);
-static char pool_mn_mem_L2[ BLIS_MN_POOL_SIZE_L2 ];
+static char pool_mn_mem_L2[ BLIS_MNR_POOL_SIZE_L2 ];
//
//L3 Pools
if (bli_buf_type_is_shared(buf_type))
core_id = 0;
else
- core_id = lib_get_coreID ();
+ core_id = omp_get_thread_num ();
#endif
bli_mem_set_buffer( block, mem );
bli_mem_set_buf_type( buf_type, mem );
bli_mem_set_size( req_size, mem );
+
+ //printf("Acquire: block %x \n", block);
}
else
{
pool = &pools[ pool_index ];
#ifdef BLIS_ENABLE_C66X_MEM_DEBUG
- printf("Acquire: buf_type %x pool size %d req size %d ", buf_type, bli_pool_block_size( pool ), req_size);
+ printf("Acquire: core_id %d buf_type %x block size size %d req size %d ", core_id, buf_type, bli_pool_block_size( pool ), req_size);
#endif
// Unconditionally perform error checking on the memory pool.
if (bli_buf_type_is_shared(buf_type))
core_id = 0;
else
- core_id = lib_get_coreID ();
+ core_id = omp_get_thread_num ();
#endif
if ( buf_type == BLIS_BUFFER_FOR_GEN_USE )
#ifdef BLIS_ENABLE_C66X_MEM_DEBUG
printf("L2 Cache\n");
- printf("MK: Pool Size %d, Num Blocks %d, Block size %d\n",BLIS_MK_POOL_SIZE_L2, BLIS_NUM_MC_X_KC_BLOCKS_L2, BLIS_MK_BLOCK_SIZE);
- printf("KN: Pool Size %d, Num Blocks %d, Block size %d\n",BLIS_KN_POOL_SIZE_L2, BLIS_NUM_KC_X_NC_BLOCKS_L2, BLIS_KN_BLOCK_SIZE);
- printf("MN: Pool Size %d, Num Blocks %d, Block size %d\n",BLIS_MN_POOL_SIZE_L2, BLIS_NUM_MC_X_NC_BLOCKS_L2, BLIS_MN_BLOCK_SIZE);
-
- printf("BLIS_POOL_MC_S: %d BLIS_POOL_KC_S: %d\n", BLIS_POOL_MC_S, BLIS_POOL_KC_S);
-
- printf("max :%d\n", ( (BLIS_POOL_MC_S + BLIS_POOL_KC_S)^2));
-
- printf("MK: Block Size %d, %d, %d, %d, %d, %d, %d, %d\n",BLIS_MK_BLOCK_SIZE_S ,BLIS_MK_BLOCK_SIZE_D ,BLIS_MK_BLOCK_SIZE_C,
+ printf("MK: Pool Size %d, Num Blocks %d, Block size %d\n",BLIS_MK_POOL_SIZE_L2, BLIS_NUM_MC_X_KC_BLOCKS_L2, BLIS_MK_BLOCK_SIZE);
+ printf("KN: Pool Size %d, Num Blocks %d, Block size %d\n",BLIS_KN_POOL_SIZE_L2, BLIS_NUM_KC_X_NC_BLOCKS_L2, BLIS_KN_BLOCK_SIZE);
+ printf("MN: Pool Size %d, Num Blocks %d, Block size %d\n",BLIS_MN_POOL_SIZE_L2, BLIS_NUM_MC_X_NC_BLOCKS_L2, BLIS_MN_BLOCK_SIZE);
+ printf("MK: Block Size %d, %d, %d, %d, %d, %d, %d, %d\n",BLIS_MK_BLOCK_SIZE_S ,BLIS_MK_BLOCK_SIZE_D ,BLIS_MK_BLOCK_SIZE_C,
BLIS_MK_BLOCK_SIZE_Z,BLIS_MK_BLOCK_SIZE_4M_C,BLIS_MK_BLOCK_SIZE_4M_Z,BLIS_MK_BLOCK_SIZE_3M_C,BLIS_MK_BLOCK_SIZE_3M_Z );
#endif
//L2 Cache
printf("MK: Pool Size %d, Num Blocks %d, Block size %d\n",BLIS_MK_POOL_SIZE_L3, BLIS_NUM_MC_X_KC_BLOCKS_L3, BLIS_MK_BLOCK_SIZE);
printf("KN: Pool Size %d, Num Blocks %d, Block size %d\n",BLIS_KN_POOL_SIZE_L3, BLIS_NUM_KC_X_NC_BLOCKS_L3, BLIS_KN_BLOCK_SIZE);
printf("MN: Pool Size %d, Num Blocks %d, Block size %d\n",BLIS_MN_POOL_SIZE_L3, BLIS_NUM_MC_X_NC_BLOCKS_L3, BLIS_MN_BLOCK_SIZE);
+ printf("BLIS_POOL_KC_S: %d BLIS_POOL_NC_S: %d\n", BLIS_POOL_KC_S, BLIS_POOL_NC_S);
printf("KN: Block Size %d, %d, %d, %d, %d, %d, %d, %d\n",BLIS_KN_BLOCK_SIZE_S ,BLIS_KN_BLOCK_SIZE_D ,BLIS_KN_BLOCK_SIZE_C,
BLIS_KN_BLOCK_SIZE_Z,BLIS_KN_BLOCK_SIZE_4M_C,BLIS_KN_BLOCK_SIZE_4M_Z,BLIS_KN_BLOCK_SIZE_3M_C,BLIS_KN_BLOCK_SIZE_3M_Z );
index a3359de7d905cb9f6b49ba7aaaf579e8392713ce..13922c8297083c0a70b7abf4033124457ff8535c 100644 (file)
void bli_obj_print( char* label, obj_t* obj )
{
-#ifndef BLIS_ENABLE_C66X_OPENCL
FILE* file = stdout;
-#endif
mem_t* pack_mem = bli_obj_pack_mem( *obj );
//mem_t* cast_mem = bli_obj_cast_mem( *obj );
if ( bli_error_checking_is_enabled() )
bli_obj_print_check( label, obj );
-#ifdef BLIS_ENABLE_C66X_OPENCL
- printf( "\n" );
- printf( "%s\n", label );
- printf( "\n" );
-
- printf( " m x n %lu x %lu\n", ( unsigned long int )bli_obj_length( *obj ),
- ( unsigned long int )bli_obj_width( *obj ) );
- printf( "\n" );
-
- printf( " offm, offn %lu, %lu\n", ( unsigned long int )bli_obj_row_offset( *obj ),
- ( unsigned long int )bli_obj_col_offset( *obj ) );
- printf( " diagoff %ld\n", ( signed long int )bli_obj_diag_offset( *obj ) );
- printf( "\n" );
- printf( " buf %p\n", ( void* )bli_obj_buffer( *obj ) );
- printf( " elem size %lu\n", ( unsigned long int )bli_obj_elem_size( *obj ) );
- printf( " rs, cs %ld, %ld\n", ( signed long int )bli_obj_row_stride( *obj ),
- ( signed long int )bli_obj_col_stride( *obj ) );
- printf( " pack_mem \n" );
- printf( " - buf %p\n", ( void* )bli_mem_buffer( pack_mem ) );
- printf( " - buf_type %lu\n", ( unsigned long int )bli_mem_buf_type( pack_mem ) );
- printf( " - size %lu\n", ( unsigned long int )bli_mem_size( pack_mem ) );
- printf( " m_padded %lu\n", ( unsigned long int )bli_obj_padded_length( *obj ) );
- printf( " n_padded %lu\n", ( unsigned long int )bli_obj_padded_width( *obj ) );
- printf( " ps %lu\n", ( unsigned long int )bli_obj_panel_stride( *obj ) );
- printf( " pd %lu\n", ( unsigned long int )bli_obj_panel_dim( *obj ) );
- printf( " m_panel %lu\n", ( unsigned long int )bli_obj_panel_length( *obj ) );
- printf( " n_panel %lu\n", ( unsigned long int )bli_obj_panel_width( *obj ) );
- printf( "\n" );
-
- printf( " info %lX\n", ( unsigned long int )(*obj).info );
- printf( " - is complex %lu\n", ( unsigned long int )bli_obj_is_complex( *obj ) );
- printf( " - is d. prec %lu\n", ( unsigned long int )bli_obj_is_double_precision( *obj ) );
- printf( " - datatype %lu\n", ( unsigned long int )bli_obj_datatype( *obj ) );
- printf( " - target dt %lu\n", ( unsigned long int )bli_obj_target_datatype( *obj ) );
- printf( " - exec dt %lu\n", ( unsigned long int )bli_obj_execution_datatype( *obj ) );
- printf( " - has trans %lu\n", ( unsigned long int )bli_obj_has_trans( *obj ) );
- printf( " - has conj %lu\n", ( unsigned long int )bli_obj_has_conj( *obj ) );
- printf( " - unit diag? %lu\n", ( unsigned long int )bli_obj_has_unit_diag( *obj ) );
- printf( " - struc type %lu\n", ( unsigned long int )bli_obj_struc( *obj ) >> BLIS_STRUC_SHIFT );
- printf( " - uplo type %lu\n", ( unsigned long int )bli_obj_uplo( *obj ) >> BLIS_UPLO_SHIFT );
- printf( " - is upper %lu\n", ( unsigned long int )bli_obj_is_upper( *obj ) );
- printf( " - is lower %lu\n", ( unsigned long int )bli_obj_is_lower( *obj ) );
- printf( " - is dense %lu\n", ( unsigned long int )bli_obj_is_dense( *obj ) );
- printf( " - pack schema %lu\n", ( unsigned long int )bli_obj_pack_schema( *obj ) >> BLIS_PACK_SCHEMA_SHIFT );
- printf( " - packinv diag? %lu\n", ( unsigned long int )bli_obj_has_inverted_diag( *obj ) );
- printf( " - pack ordifup %lu\n", ( unsigned long int )bli_obj_is_pack_rev_if_upper( *obj ) );
- printf( " - pack ordiflo %lu\n", ( unsigned long int )bli_obj_is_pack_rev_if_lower( *obj ) );
- printf( " - packbuf type %lu\n", ( unsigned long int )bli_obj_pack_buffer_type( *obj ) >> BLIS_PACK_BUFFER_SHIFT );
- printf( "\n" );
-
-#else
fprintf( file, "\n" );
fprintf( file, "%s\n", label );
fprintf( file, "\n" );
fprintf( file, " - pack ordiflo %lu\n", ( unsigned long int )bli_obj_is_pack_rev_if_lower( *obj ) );
fprintf( file, " - packbuf type %lu\n", ( unsigned long int )bli_obj_pack_buffer_type( *obj ) >> BLIS_PACK_BUFFER_SHIFT );
fprintf( file, "\n" );
-#endif
+
}
diff --git a/blis/frame/base/bli_profile.c b/blis/frame/base/bli_profile.c
--- /dev/null
@@ -0,0 +1,507 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2014, The University of Texas at Austin
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name of The University of Texas at Austin nor the names
+ of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+#include "blis.h"
+#ifdef BLIS_ENABLE_PROFILE
+
+/*
+ * Releases a profile array previously obtained from bli_profile_data_init().
+ *
+ * bli_profile_data - array to release. A NULL pointer is reported on stdout
+ *                    and otherwise ignored.
+ */
+void bli_profile_data_free(profile_data_t *bli_profile_data)
+{
+    if(bli_profile_data == NULL)
+    {
+        printf("not allocated\n");
+        /* Nothing to release; do not hand a NULL pointer to bli_free(). */
+        return;
+    }
+
+    bli_free(bli_profile_data);
+}
+
+
+/*
+ * Allocates an array of num_objects profile records with bli_malloc() and
+ * zeroes the counters (total_cycles, num_iter, num_comp) of every record.
+ *
+ * Returns the new array, or NULL if the allocation failed (a message is
+ * printed to stdout in that case). Release with bli_profile_data_free().
+ */
+profile_data_t * bli_profile_data_init(dim_t num_objects)
+{
+    dim_t i;
+    profile_data_t *bli_profile_data;
+
+    bli_profile_data = (profile_data_t *)bli_malloc(num_objects*sizeof(profile_data_t));
+    if(bli_profile_data == NULL)
+    {
+        printf("not allocated\n");
+        /* Bail out instead of writing through the NULL pointer below. */
+        return NULL;
+    }
+
+    for (i = 0; i < num_objects; i++)
+    {
+        (bli_profile_data[i]).total_cycles = 0;
+        (bli_profile_data[i]).num_iter = 0;
+        (bli_profile_data[i]).num_comp = 0;
+    }
+
+    return bli_profile_data;
+}
+
+
+/* Prints the horizontal rule that closes a section of the profile report. */
+static void bli_profile_print_rule( void )
+{
+    printf("%-10s", "----------");
+    printf("%2s", "--");
+    printf("%10s", "----------");
+    printf("%15s", "---------------");
+    printf("%10s", "----------");
+    printf("%12s", "------------");
+    printf("\n");
+}
+
+/*
+ * Prints a profiling report to stdout.
+ *
+ * bli_profile_data - array indexed as [thread + BLIS_MAX_NUM_THREADS * report],
+ *                    where report is one of the BLIS_PROFILE_*_IND values.
+ * m, n, k          - problem dimensions; the operation count is
+ *                    comp_scale * m * n * k.
+ * datatype         - values 0 and 2 are counted as real, any other value as
+ *                    complex (operation count multiplied by 4); values 0 and 1
+ *                    use OPS_PER_CYCLE_S, any other value OPS_PER_CYCLE_D.
+ *                    NOTE(review): presumably the num_t enumerators for
+ *                    float/scomplex/double/dcomplex - confirm against num_t.
+ * comp_scale       - multiplier applied to m*n*k to obtain the operation count.
+ * num_threads      - number of per-thread entries to aggregate per report.
+ *
+ * The first table has one row per report index that recorded at least one
+ * iteration; the second table lists the BLIS_PROFILE_KER_LOOP_IND entry of
+ * every thread.
+ */
+void bli_profile_data_print (profile_data_t *bli_profile_data, dim_t m, dim_t n, dim_t k, num_t datatype, dim_t comp_scale, dim_t num_threads)
+{
+    dim_t n_ind, j;
+    dim_t index;
+    uint64_t num_iter;
+    int ops_per_cycle;
+    double ideal_gflops, gflops, total_cycles, num_ops, time_taken, num_ops_core;
+
+    if(datatype == 0 || datatype == 2)
+    {
+        num_ops = ( (double) comp_scale * m * n * k );
+    }
+    else
+    {
+        // for complex numbers
+        num_ops = ( (double) 4.0*comp_scale * m * n * k );
+    }
+
+    ops_per_cycle = (datatype == 0 || datatype == 1) ? OPS_PER_CYCLE_S : OPS_PER_CYCLE_D;
+    ideal_gflops = CLOCK*ops_per_cycle*num_threads;
+
+    // print intro
+    // (dim_t/num_t/gint_t may be wider than int, so every integer argument is
+    //  cast to the exact type its conversion specifier expects)
+    printf("\n");
+    printf("Clock Frequency %4.1f GHz\n", CLOCK);
+    printf("Number of Threads %d\n", (int) num_threads);
+    printf("Datatype %d Number of Operations: %f\n", (int) datatype, num_ops);
+    printf("Operations per cycle %d\n", ops_per_cycle);
+    printf("Peak GFLOPS %5.1f GFLOPS\n", ideal_gflops);
+    printf("\n");
+
+    // print table header
+    printf("%-10s", "Variant");
+    printf("%2s", "|");
+    printf("%5s", "Cores");
+    printf("%10s", "Num Iter");
+    printf("%15s", "Total Cycles");
+    printf("%10s", "GFLOPS");
+    printf("%12s", "Efficiency");
+
+    // terminate the header line and print the separator
+    printf("\n");
+    bli_profile_print_rule();
+
+    for (n_ind = 0; n_ind < BLIS_PROFILE_NUM_REPORTS; n_ind++)
+    {
+        if(n_ind < BLIS_PROFILE_KER_VAR2_IND)
+        {
+            /* Reports below ker_var2: only the entry of thread 0 is shown. */
+            index = 0 + BLIS_MAX_NUM_THREADS*n_ind;
+            if((bli_profile_data[index]).num_iter != 0)
+            {
+                total_cycles = ((double)(bli_profile_data[index]).total_cycles);
+                time_taken = total_cycles / CLOCK ;
+                gflops = num_ops / time_taken; // in 10^9
+                printf("%-10d", (int) n_ind);
+                printf("%2s", "|");
+                printf("%5s", "1");
+                printf("%10lld", (long long) (bli_profile_data[index]).num_iter);
+                printf("%15llu", (unsigned long long) (bli_profile_data[index]).total_cycles);
+                printf("%10.4f", gflops );
+                printf("%11.4f%%", gflops/ideal_gflops*100);
+                printf("\n");
+            }
+        }
+        else if(n_ind == BLIS_PROFILE_KER_VAR2_IND)
+        {
+            /*The total performance of the operation depends on the slowest thread in kervar2.
+             * Hence, reporting the max cycles of the thread for ker_var2*/
+
+            num_iter = 0;
+            total_cycles = 0;
+
+            for (j = 0; j<num_threads; j++)
+            {
+                num_iter = bli_max(num_iter, (uint64_t) bli_profile_data[j+BLIS_MAX_NUM_THREADS*n_ind].num_iter) ;
+                total_cycles = bli_max(total_cycles, (double) bli_profile_data[j+BLIS_MAX_NUM_THREADS*n_ind].total_cycles);
+            }
+            if(num_iter != 0)
+            {
+                time_taken = total_cycles / CLOCK ;
+                gflops = num_ops / time_taken; // in 10^9
+                printf("%-10d", (int) n_ind);
+                printf("%2s", "|");
+                printf("%5d", 1);
+                printf("%10llu", (unsigned long long) num_iter);
+                printf("%15llu", (unsigned long long) total_cycles);
+                printf("%10.4f", gflops );
+                printf("%11.4f%%", gflops/ideal_gflops*100);
+                printf("\n");
+            }
+        }
+        else
+        {
+            /* Reports above ker_var2: iterations and cycles are summed over all threads. */
+            num_iter = 0;
+            total_cycles = 0;
+
+            for (j = 0; j<num_threads; j++)
+            {
+                num_iter += (uint64_t) bli_profile_data[j+BLIS_MAX_NUM_THREADS*n_ind].num_iter ;
+                total_cycles += (double) bli_profile_data[j+BLIS_MAX_NUM_THREADS*n_ind].total_cycles;
+            }
+            if(num_iter != 0)
+            {
+                // total_cycles is the sum over all threads, so divide by
+                // num_threads to get the time taken by one core
+                time_taken = total_cycles / CLOCK / num_threads ;
+                gflops = num_ops / time_taken; // in 10^9
+                printf("%-10d", (int) n_ind);
+                printf("%2s", "|");
+                printf("%5d", (int) num_threads);
+                printf("%10llu", (unsigned long long) num_iter);
+                printf("%15llu", (unsigned long long) total_cycles);
+                printf("%10.4f", gflops );
+                printf("%11.4f%%", gflops/ideal_gflops*100);
+                printf("\n");
+            }
+        }
+    }
+
+    bli_profile_print_rule();
+
+    printf("uKernel details for each core\n");
+
+    printf("%-10s", "Core #");
+    printf("%2s", "|");
+    printf("%15s", "Num. Computes");
+    printf("%15s", "Total Cycles");
+    printf("%10s", "GFLOPS");
+    printf("%12s", "Efficiency");
+    printf("\n");
+
+    /* Per-core rows are measured against the peak of a single core. */
+    ideal_gflops = CLOCK*ops_per_cycle;
+
+    n_ind = BLIS_PROFILE_KER_LOOP_IND;
+    for (j = 0; j<num_threads; j++)
+    {
+        num_iter = (uint64_t) bli_profile_data[j+BLIS_MAX_NUM_THREADS*n_ind].num_iter ;
+        total_cycles = (double) bli_profile_data[j+BLIS_MAX_NUM_THREADS*n_ind].total_cycles;
+        if(datatype == 0 || datatype == 2)
+            num_ops_core = (double) bli_profile_data[j+BLIS_MAX_NUM_THREADS*n_ind].num_comp;
+        else
+            num_ops_core = (double) 4.0*bli_profile_data[j+BLIS_MAX_NUM_THREADS*n_ind].num_comp;
+
+        if(num_iter != 0)
+        {
+            time_taken = total_cycles / CLOCK ;
+            gflops = num_ops_core / time_taken; // in 10^9
+            printf("%-10d", (int) j);
+            printf("%2s", "|");
+            printf("%15llu", (unsigned long long) num_ops_core);
+            printf("%15llu", (unsigned long long) total_cycles);
+            printf("%10.4f", gflops );
+            printf("%11.4f%%", gflops/ideal_gflops*100);
+            printf("\n");
+        }
+    }
+
+    bli_profile_print_rule();
+}
+
+
+/*
+ * Currently a no-op: the whole body is commented out, so none of the
+ * parameters is read and nothing is printed.
+ *
+ * NOTE(review): the disabled code below reads min_cycles / max_cycles members,
+ * which profile_data_t (bli_profile.h) does not declare, so it cannot simply
+ * be re-enabled - presumably it predates the current record layout.
+ */
+void bli_kervar2_profile_data_print (profile_data_t *bli_kervar2_profile_data, dim_t m, dim_t n, dim_t k, num_t datatype, dim_t comp_scale)
+{
+// dim_t i,j;
+//
+// gint_t num_iter[3];
+// long int total_cycles[3];
+// gint_t min_cycles[3];
+// gint_t max_cycles[3];
+//
+//
+// double ideal_gflops, gflops, total_cycles_j, num_ops, time_taken;
+//
+// num_ops = ( (float) comp_scale * (float) m * (float) n * (float) k );
+//
+// if(datatype == 0 || datatype == 1)
+// {
+// ideal_gflops = CLOCK*OPS_PER_CYCLE_S*NUM_THREADS;
+// }
+// else
+// {
+// ideal_gflops = CLOCK*OPS_PER_CYCLE_D*NUM_THREADS;
+// }
+//
+// for(i = 0; i<3; i++)
+// {
+// num_iter[i] = 0;
+// total_cycles[i] = 0;
+// min_cycles[i] = 0;
+// max_cycles[i] = 0;
+// for (j = 0; j<NUM_THREADS; j++)
+// {
+// num_iter[i] += bli_kervar2_profile_data[j*3+i].num_iter ;
+//
+// total_cycles[i] += bli_kervar2_profile_data[j*3+i].total_cycles;
+// if(i == 0 )
+// {
+// min_cycles[i] = bli_kervar2_profile_data[j*3+i].min_cycles;
+// max_cycles[i] = bli_kervar2_profile_data[j*3+i].max_cycles;
+// }
+// else
+// {
+// if(bli_kervar2_profile_data[j*3+i].max_cycles > max_cycles[i])
+// max_cycles[i] = bli_kervar2_profile_data[j*3+i].max_cycles;
+// if(bli_kervar2_profile_data[j*3+i].min_cycles < min_cycles[i])
+// min_cycles[i] = bli_kervar2_profile_data[j*3+i].min_cycles;
+// }
+//
+// }
+// }
+//
+// for(i = 0; i<3; i++)
+// {
+// num_iter[i] = 0;
+// total_cycles[i] = 0;
+// min_cycles[i] = 0;
+// max_cycles[i] = 0;
+// for (j = 0; j<8; j++)
+// {
+// num_iter[i] += bli_kervar2_profile_data[j*3+i].num_iter ;
+//
+// total_cycles[i] += bli_kervar2_profile_data[j*3+i].total_cycles;
+// if(i == 0 )
+// {
+// min_cycles[i] = bli_kervar2_profile_data[j*3+i].min_cycles;
+// max_cycles[i] = bli_kervar2_profile_data[j*3+i].max_cycles;
+// }
+// else
+// {
+// if(bli_kervar2_profile_data[j*3+i].max_cycles > max_cycles[i])
+// max_cycles[i] = bli_kervar2_profile_data[j*3+i].max_cycles;
+// if(bli_kervar2_profile_data[j*3+i].min_cycles < min_cycles[i])
+// min_cycles[i] = bli_kervar2_profile_data[j*3+i].min_cycles;
+// }
+//
+// }
+// }
+//
+// for(j = 0; j < 3; j++)
+// {
+// if(num_iter[j] != 0)
+// {
+//
+// total_cycles_j = (double) total_cycles[j];
+// time_taken = total_cycles_j / CLOCK / NUM_THREADS;
+// gflops = num_ops / time_taken; // in 10^9
+// if(j == 0)
+// printf("%-10s","kernel");
+// else if(j ==1)
+// printf("%-10s","mr");
+// else
+// printf("%-10s","nr");
+//
+// printf("%2s", "|");
+// printf("%10d", num_iter[j]);
+// printf("%15ld", total_cycles[j]);
+// printf("%10.4f", gflops);
+// printf("%11.4f%%", gflops/ideal_gflops*100);
+// printf("\n");
+// }
+// }
+//#if 1
+// for(j = 0; j < 3; j++)
+// {
+// if(j == 0)
+// printf("%-10s\n","kernel");
+// else if(j ==1)
+// printf("%-10s\n","mr");
+// else
+// printf("%-10s\n","nr");
+//
+// for(i = 0; i < 8; i++)
+// {
+// printf("core %d\t",i);
+// printf("%15ld\t", bli_kervar2_profile_data[i*3+j].total_cycles);
+// printf("%15ld\t", bli_kervar2_profile_data[i*3+j].num_comp);
+// printf("\n");
+// }
+// }
+//#endif
+}
+
+/*
+ * Currently a no-op: the whole body is commented out, so none of the
+ * parameters is read and nothing is printed.
+ *
+ * NOTE(review): the disabled code below reads min_cycles / max_cycles members,
+ * which profile_data_t (bli_profile.h) does not declare, and hard-codes 8
+ * cores in several loops - it cannot simply be re-enabled as is.
+ */
+void bli_trsm_kervar2_profile_data_print (profile_data_t *bli_kervar2_profile_data, dim_t m, dim_t n, dim_t k, num_t datatype)
+{
+// dim_t i,j;
+//
+// long int num_iter[3];
+// long int total_cycles[3];
+// gint_t min_cycles[3];
+// gint_t max_cycles[3];
+//
+//
+// double ideal_gflops, gflops, total_cycles_j, num_ops, time_taken;
+//
+// num_ops = ( 1.0 * m * n * k );
+//
+// if(datatype == 0 || datatype == 1)
+// {
+// ideal_gflops = CLOCK*OPS_PER_CYCLE_S*NUM_THREADS;
+// }
+// else
+// {
+// ideal_gflops = CLOCK*OPS_PER_CYCLE_D*NUM_THREADS;
+// }
+//
+// for(i = 0; i<3; i++)
+// {
+// num_iter[i] = 0;
+// total_cycles[i] = 0;
+// min_cycles[i] = 0;
+// max_cycles[i] = 0;
+// for (j = 0; j<8; j++)
+// {
+// num_iter[i] += bli_kervar2_profile_data[j*3+i].num_iter ;
+//
+// total_cycles[i] += bli_kervar2_profile_data[j*3+i].total_cycles;
+// if(i == 0 )
+// {
+// min_cycles[i] = bli_kervar2_profile_data[j*3+i].min_cycles;
+// max_cycles[i] = bli_kervar2_profile_data[j*3+i].max_cycles;
+// }
+// else
+// {
+// if(bli_kervar2_profile_data[j*3+i].max_cycles > max_cycles[i])
+// max_cycles[i] = bli_kervar2_profile_data[j*3+i].max_cycles;
+// if(bli_kervar2_profile_data[j*3+i].min_cycles < min_cycles[i])
+// min_cycles[i] = bli_kervar2_profile_data[j*3+i].min_cycles;
+// }
+//
+// }
+// }
+//
+// for(j = 0; j < 3; j++)
+// {
+// if(num_iter[j] != 0)
+// {
+// total_cycles_j = (double) total_cycles[j];
+// time_taken = total_cycles_j / CLOCK / NUM_THREADS;
+// gflops = num_ops / time_taken; // in 10^9
+// if(j == 0)
+// printf("%-10s","gemmtrsm");
+// else if(j ==1)
+// printf("%-10s","MR loop");
+// else
+// printf("%-10s","nr");
+//
+// printf("%2s", "|");
+// printf("%10ld", num_iter[j]);
+// printf("%15ld", total_cycles[j]);
+// printf("%10.4f", gflops);
+// printf("%11.4f%%", gflops/ideal_gflops*100);
+// printf("\n");
+// }
+// }
+// printf("%-10s", "----------");
+// printf("%2s", "--");
+// printf("%10s", "----------");
+// printf("%15s", "---------------");
+// printf("%10s", "----------");
+// printf("%12s", "------------");
+//
+// printf("\nNumber of loop iterations across cores %d\n", NUM_THREADS);
+//
+// // print table header
+// printf("%-10s", " ");
+// printf("%2s", "|");
+// printf("%12s", "Core 0");
+// printf("%12s", "Core 1");
+// printf("%12s", "Core 2");
+// printf("%12s", "Core 3");
+// printf("%12s", "Core 4");
+// printf("%12s", "Core 5");
+// printf("%12s", "Core 6");
+// printf("%12s\n", "Core 7");
+//
+// for(j = 0; j < 3; j++)
+// {
+// if(j == 0)
+// printf("%-10s","kernel");
+// else if(j ==1)
+// printf("%-10s","MR loop");
+// else
+// printf("%-10s","NR loop");
+//
+// printf("%2s", "|");
+//
+// for(i = 0; i < 8; i++)
+// printf("%12ld", bli_kervar2_profile_data[i*3+j].num_comp);
+//
+// printf("\n");
+// }
+//
+// for(j = 0; j < 3; j++)
+// {
+// if(j == 0)
+// printf("%-10s","kernel");
+// else if(j ==1)
+// printf("%-10s","MR loop");
+// else
+// printf("%-10s","NR loop");
+//
+// printf("%2s", "|");
+//
+// for(i = 0; i < 8; i++)
+// printf("%15ld", bli_kervar2_profile_data[i*3+j].total_cycles);
+//
+// printf("\n");
+// }
+}
+
+
+
+
+#endif
diff --git a/blis/frame/base/bli_profile.h b/blis/frame/base/bli_profile.h
--- /dev/null
@@ -0,0 +1,134 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2014, The University of Texas at Austin
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name of The University of Texas at Austin nor the names
+ of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_PROFILE_H
+#define BLIS_PROFILE_H
+
+//#ifdef BLIS_ENABLE_C66X_BUILD
+/*
+ * Report indices. A profile array holds one group of BLIS_MAX_NUM_THREADS
+ * records per report; the record of a thread is found at
+ *     thread + BLIS_MAX_NUM_THREADS * <report index>.
+ * BLIS_PROFILE_NUM_REPORTS is the number of report groups.
+ */
+#define BLIS_PROFILE_BLK_VAR2_IND 0
+#define BLIS_PROFILE_BLK_VAR3_IND 1
+#define BLIS_PROFILE_BLK_VAR1_IND 2
+#define BLIS_PROFILE_KER_VAR2_IND 3
+#define BLIS_PROFILE_JR_LOOP_IND 4
+#define BLIS_PROFILE_IR_LOOP_IND 5
+#define BLIS_PROFILE_KER_LOOP_IND 6
+#define BLIS_PROFILE_NUM_REPORTS 7
+
+/* NOTE(review): the indexing code in this header uses BLIS_MAX_NUM_THREADS,
+   not MAX_THREADS - confirm whether this macro is still needed. */
+#define MAX_THREADS 8
+
+#ifdef BLIS_ENABLE_PROFILE
+
+/* Core clock used to convert cycle counts into time and peak GFLOPS. */
+#define CLOCK 1.2 // In GHz
+
+/* Peak operations per cycle per core, single (_S) and double (_D) variants. */
+#ifdef BLIS_ENABLE_C66X_BUILD
+#define OPS_PER_CYCLE_S 16
+#define OPS_PER_CYCLE_D 4
+#else
+#define OPS_PER_CYCLE_S 8
+#define OPS_PER_CYCLE_D 2
+#endif
+
+
+/* Compile-time 0/1 switch tested by the kernel macros (e.g.
+   "if (BLIS_ENABLE_PROFILE_KERVAR2 == 1)") to include or drop cycle counting. */
+#define BLIS_ENABLE_PROFILE_KERVAR2 1
+#else
+#define BLIS_ENABLE_PROFILE_KERVAR2 0
+#endif
+
+
+/* One profiling record; updated through bli_profile_data_update(). */
+struct profile_data_s
+{
+ uint64_t total_cycles; /* sum of the cycle counts passed to every update */
+ gint_t num_iter;       /* number of updates applied to this record */
+ uint64_t num_comp;     /* sum of the computation counts passed to every update */
+};
+typedef struct profile_data_s profile_data_t;
+
+/*
+ * Per-call detail record handled by the bli_profile_details_*() prototypes.
+ * NOTE(review): presumably the m/n/k problem dimensions and the cycles spent
+ * on that problem - no user of this struct is visible here, confirm.
+ */
+struct profile_details_s
+{
+ dim_t m;
+ dim_t n;
+ dim_t k;
+ long int cycles;
+};
+typedef struct profile_details_s profile_details_t;
+
+
+/*
+ * Accumulates one measurement into a profile record.
+ *
+ * bli_profile_data - the record itself (an lvalue of type profile_data_t,
+ *                    e.g. array[i] or *ptr), not a pointer to it.
+ * cycles           - cycle count to add to total_cycles.
+ * comps            - computation count to add to num_comp.
+ *
+ * num_iter is incremented by one. Every argument is parenthesised so that
+ * expressions such as *ptr expand correctly; bli_profile_data is evaluated
+ * three times, so it must be free of side effects.
+ */
+#define bli_profile_data_update( bli_profile_data, cycles, comps) \
+{ \
+ (bli_profile_data).total_cycles += (cycles); \
+ (bli_profile_data).num_comp += (comps); \
+ (bli_profile_data).num_iter++; \
+}
+
+/*
+ * Maps an (n, i) pair to the calling thread's record index in a profile array:
+ *
+ *     n == 1, i == 2  ->  BLIS_PROFILE_BLK_VAR2_IND
+ *     n == 2, i == 2  ->  BLIS_PROFILE_BLK_VAR3_IND
+ *     n == 0, i == 2  ->  BLIS_PROFILE_BLK_VAR1_IND
+ *     n == 1, i == 1  ->  BLIS_PROFILE_KER_VAR2_IND
+ *
+ * The result, bli_get_thread_num() + BLIS_MAX_NUM_THREADS * <report index>,
+ * is stored in index. For any other (n, i) pair index is left untouched, so
+ * callers should initialise it beforehand. n and i are parenthesised so that
+ * arbitrary expressions can be passed; both may be evaluated several times.
+ */
+#define bli_profile_get_index(n, i, index) \
+{ \
+ if((n) == 1 && (i) == 2) \
+ (index) = bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_BLK_VAR2_IND; \
+ else if((n) == 2 && (i) == 2) \
+ (index) = bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_BLK_VAR3_IND; \
+ else if((n) == 0 && (i) == 2) \
+ (index) = bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_BLK_VAR1_IND; \
+ else if((n) == 1 && (i) == 1) \
+ (index) = bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_KER_VAR2_IND; \
+}
+
+/* Allocates and zeroes num_objects profile records; release with bli_profile_data_free(). */
+profile_data_t* bli_profile_data_init (dim_t num_objects);
+
+/* Releases an array obtained from bli_profile_data_init(). */
+void bli_profile_data_free (profile_data_t *bli_profile_data);
+
+//void bli_profile_data_update (profile_data_t *bli_profile_data, long int cycles, long int num_comp);
+
+/* NOTE(review): bli_profile.c does not define the next three functions -
+   confirm they are implemented elsewhere or drop the declarations. */
+void bli_gemm_profile_data_print (profile_data_t *bli_gemm_profile_data, dim_t m, dim_t n, dim_t k, num_t datatype);
+
+void bli_trsm_profile_data_print (profile_data_t *bli_trsm_profile_data, dim_t m, dim_t n, dim_t mn_side, num_t datatype);
+
+void bli_gemm_kervar2_profile_data_print (profile_data_t *bli_kervar2_profile_data, dim_t m, dim_t n, dim_t k, num_t datatype);
+/* Defined in bli_profile.c with an empty (commented-out) body. */
+void bli_trsm_kervar2_profile_data_print (profile_data_t *bli_kervar2_profile_data, dim_t m, dim_t n, dim_t k, num_t datatype);
+
+/* Prints the profiling report for an operation of comp_scale*m*n*k computations run on num_threads threads. */
+void bli_profile_data_print (profile_data_t *bli_gemm_profile_data, dim_t m, dim_t n, dim_t k, num_t datatype, dim_t comp_scale, dim_t num_threads);
+
+/* Defined in bli_profile.c with an empty (commented-out) body. */
+void bli_kervar2_profile_data_print (profile_data_t *bli_kervar2_profile_data, dim_t m, dim_t n, dim_t k, num_t datatype, dim_t comp_scale);
+
+
+
+/* NOTE(review): bli_profile.c does not define the bli_profile_details_*()
+   functions below - confirm they are implemented elsewhere. */
+profile_details_t* bli_profile_details_init (long int num_objects);
+
+void bli_profile_details_free (profile_details_t *bli_profile_details);
+
+void bli_profile_details_update (profile_details_t *bli_profile_details, dim_t m, dim_t n, dim_t k, long int cycles);
+
+
+
+
+#endif
index 48e8974601b5dc4d50aa21b4c98735cb5f0f7487..e3c4cb77fbae2d6cc57ffe507fa96d0a613a6328 100644 (file)
@@ -275,68 +275,386 @@ void bli_setup_thread_info( thrinfo_t* thr, thread_comm_t* ocomm, dim_t ocomm_id
thr->work_id = work_id;
}
-void bli_get_range( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end )
+//void bli_get_range( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end )
+//{
+// thrinfo_t* thread = (thrinfo_t*) thr;
+// dim_t n_way = thread->n_way;
+// dim_t work_id = thread->work_id;
+//
+// dim_t size = all_end - all_start;
+// dim_t n_pt = size / n_way;
+// n_pt = (n_pt * n_way < size) ? n_pt + 1 : n_pt;
+// n_pt = (n_pt % block_factor == 0) ? n_pt : n_pt + block_factor - (n_pt % block_factor);
+// *start = work_id * n_pt + all_start;
+// *end = bli_min( *start + n_pt, size + all_start );
+//}
+//
+//void bli_get_range_weighted( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, bool_t forward, dim_t* start, dim_t* end)
+//{
+// thrinfo_t* thread = (thrinfo_t*) thr;
+// dim_t n_way = thread->n_way;
+// dim_t work_id = thread->work_id;
+// dim_t size = all_end - all_start;
+// double num;
+//
+// *start = 0;
+// *end = all_end - all_start;
+// num = size*size / (double) n_way; // 2xArea per thread?
+//
+// //printf("bli_threading %d %d %f %d\n", *start, *end, num, work_id);
+//
+// if( forward ) {
+// dim_t curr_caucus = n_way - 1;
+// dim_t len = 0;
+// while(1){
+// dim_t width = ceil(sqrt( len*len + num )) - len; // The width of the current caucus
+// width = (width % block_factor == 0) ? width : width + block_factor - (width % block_factor);
+// if( curr_caucus == work_id ) {
+// *start = bli_max( 0 , *end - width ) + all_start;
+// *end = *end + all_start;
+// return;
+// }
+// else{
+// *end -= width;
+// len += width;
+// curr_caucus--;
+// }
+// }
+// }
+// else{
+// while(1){
+// dim_t width = ceil(sqrt(*start * *start + num)) - *start;
+// width = (width % block_factor == 0) ? width : width + block_factor - (width % block_factor);
+// printf("bli_threading %d %d %d\n", *start, width, work_id);
+//
+// if( work_id == 0 ) {
+// *start = *start + all_start;
+// *end = bli_min( *start + width, all_end );
+// return;
+// }
+// else{
+// *start = *start + width;
+// }
+// work_id--;
+// }
+// }
+//}
+
+
+// bli_get_range: compute the sub-range of [all_start, all_end) that is
+// assigned to one thread, writing its bounds to *start and *end.
+//
+//   thr             - pointer that is cast to thrinfo_t*; its n_way (number
+//                     of partitions) and work_id (this thread's partition
+//                     index) fields are read.
+//   all_start,
+//   all_end         - bounds of the whole index range being partitioned.
+//   block_factor    - every partition is a multiple of this value, except
+//                     the one that also receives the leftover
+//                     (size % block_factor) elements.
+//   handle_edge_low - FALSE: the leftover is added to the last partition
+//                     (work_id == n_way - 1) and the larger partitions go
+//                     to the lower work_ids.
+//                     TRUE: the leftover is added to partition 0 and the
+//                     larger partitions go to the higher work_ids.
+//   start, end      - outputs. Consecutive partitions abut: the *end of
+//                     work_id w equals the *start of work_id w+1.
+//
+// No argument validation is performed here; block_factor and n_way are
+// used as divisors, so both must be non-zero.
+void bli_get_range( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, bool_t handle_edge_low, dim_t* start, dim_t* end )
{
- thrinfo_t* thread = (thrinfo_t*) thr;
- dim_t n_way = thread->n_way;
- dim_t work_id = thread->work_id;
-
- dim_t size = all_end - all_start;
- dim_t n_pt = size / n_way;
- n_pt = (n_pt * n_way < size) ? n_pt + 1 : n_pt;
- n_pt = (n_pt % block_factor == 0) ? n_pt : n_pt + block_factor - (n_pt % block_factor);
- *start = work_id * n_pt + all_start;
- *end = bli_min( *start + n_pt, size + all_start );
+ thrinfo_t* thread = ( thrinfo_t* )thr;
+ dim_t n_way = thread->n_way;
+ dim_t work_id = thread->work_id;
+
+ dim_t size = all_end - all_start;
+
+ // Number of whole block_factors in the range, and the leftover ("edge")
+ // elements that do not fill a whole block_factor.
+ dim_t n_bf_whole = size / block_factor;
+ dim_t n_bf_left = size % block_factor;
+
+ // Both groups start with the same number of block_factors per thread;
+ // one of the two is incremented below when the division is not exact.
+ dim_t n_bf_lo = n_bf_whole / n_way;
+ dim_t n_bf_hi = n_bf_whole / n_way;
+
+ // In this function, we partition the space between all_start and
+ // all_end into n_way partitions, each a multiple of block_factor
+ // with the exception of the one partition that receives the
+ // "edge" case (if applicable).
+ //
+ // Here are examples of various thread partitionings, in units of
+ // the block_factor, when n_way = 4. (A '+' indicates the thread
+ // that receives the leftover edge case (ie: n_bf_left extra
+ // rows/columns in its sub-range).
+ // (all_start ... all_end)
+ // n_bf_whole _left hel n_th_lo _hi thr0 thr1 thr2 thr3
+ // 12 =0 f 0 4 3 3 3 3
+ // 12 >0 f 0 4 3 3 3 3+
+ // 13 >0 f 1 3 4 3 3 3+
+ // 14 >0 f 2 2 4 4 3 3+
+ // 15 >0 f 3 1 4 4 4 3+
+ // 15 =0 f 3 1 4 4 4 3
+ //
+ // 12 =0 t 4 0 3 3 3 3
+ // 12 >0 t 4 0 3+ 3 3 3
+ // 13 >0 t 3 1 3+ 3 3 4
+ // 14 >0 t 2 2 3+ 3 4 4
+ // 15 >0 t 1 3 3+ 4 4 4
+ // 15 =0 t 1 3 3 4 4 4
+
+ // As indicated by the table above, load is balanced as equally
+ // as possible, even in the presence of an edge case.
+
+ // First, we must differentiate between cases where the leftover
+ // "edge" case (n_bf_left) should be allocated to a thread partition
+ // at the low end of the index range or the high end.
+
+ if ( handle_edge_low == FALSE )
+ {
+ // Notice that if all threads receive the same number of
+ // block_factors, those threads are considered "high" and
+ // the "low" thread group is empty.
+ dim_t n_th_lo = n_bf_whole % n_way;
+ //dim_t n_th_hi = n_way - n_th_lo;
+
+ // If some partitions must have more block_factors than others
+ // assign the slightly larger partitions to lower index threads.
+ if ( n_th_lo != 0 ) n_bf_lo += 1;
+
+ // Compute the actual widths (in units of rows/columns) of
+ // individual threads in the low and high groups.
+ dim_t size_lo = n_bf_lo * block_factor;
+ dim_t size_hi = n_bf_hi * block_factor;
+
+ // Precompute the starting indices of the low and high groups.
+ dim_t lo_start = all_start;
+ dim_t hi_start = all_start + n_th_lo * size_lo;
+
+ // Compute the start and end of individual threads' ranges
+ // as a function of their work_ids and also the group to which
+ // they belong (low or high).
+ if ( work_id < n_th_lo )
+ {
+ *start = lo_start + (work_id ) * size_lo;
+ *end = lo_start + (work_id+1) * size_lo;
+ }
+ else // if ( n_th_lo <= work_id )
+ {
+ *start = hi_start + (work_id-n_th_lo ) * size_hi;
+ *end = hi_start + (work_id-n_th_lo+1) * size_hi;
+
+ // Since the edge case is being allocated to the high
+ // end of the index range, we have to advance the last
+ // thread's end.
+ if ( work_id == n_way - 1 ) *end += n_bf_left;
+ }
+ }
+ else // if ( handle_edge_low == TRUE )
+ {
+ // Notice that if all threads receive the same number of
+ // block_factors, those threads are considered "low" and
+ // the "high" thread group is empty.
+ dim_t n_th_hi = n_bf_whole % n_way;
+ dim_t n_th_lo = n_way - n_th_hi;
+
+ // If some partitions must have more block_factors than others
+ // assign the slightly larger partitions to higher index threads.
+ if ( n_th_hi != 0 ) n_bf_hi += 1;
+
+ // Compute the actual widths (in units of rows/columns) of
+ // individual threads in the low and high groups.
+ dim_t size_lo = n_bf_lo * block_factor;
+ dim_t size_hi = n_bf_hi * block_factor;
+
+ // Precompute the starting indices of the low and high groups.
+ // The high group is shifted by n_bf_left because the edge case
+ // sits in front of it, inside thread 0's partition.
+ dim_t lo_start = all_start;
+ dim_t hi_start = all_start + n_th_lo * size_lo
+ + n_bf_left;
+
+ // Compute the start and end of individual threads' ranges
+ // as a function of their work_ids and also the group to which
+ // they belong (low or high).
+ if ( work_id < n_th_lo )
+ {
+ *start = lo_start + (work_id ) * size_lo;
+ *end = lo_start + (work_id+1) * size_lo;
+
+ // Since the edge case is being allocated to the low
+ // end of the index range, we have to advance the
+ // starts/ends accordingly.
+ if ( work_id == 0 ) *end += n_bf_left;
+ else { *start += n_bf_left;
+ *end += n_bf_left; }
+ }
+ else // if ( n_th_lo <= work_id )
+ {
+ *start = hi_start + (work_id-n_th_lo ) * size_hi;
+ *end = hi_start + (work_id-n_th_lo+1) * size_hi;
+ }
+ }
}
-void bli_get_range_weighted( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, bool_t forward, dim_t* start, dim_t* end)
+// bli_get_range_l2r: thin wrapper that forwards all arguments to
+// bli_get_range() with handle_edge_low = FALSE.
+void bli_get_range_l2r( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end )
{
- thrinfo_t* thread = (thrinfo_t*) thr;
- dim_t n_way = thread->n_way;
- dim_t work_id = thread->work_id;
- dim_t size = all_end - all_start;
- double num;
-
- *start = 0;
- *end = all_end - all_start;
- num = size*size / (double) n_way; // 2xArea per thread?
-
- if( forward ) {
- dim_t curr_caucus = n_way - 1;
- dim_t len = 0;
- while(1){
- dim_t width = ceil(sqrt( len*len + num )) - len; // The width of the current caucus
- width = (width % block_factor == 0) ? width : width + block_factor - (width % block_factor);
- if( curr_caucus == work_id ) {
- *start = bli_max( 0 , *end - width ) + all_start;
- *end = *end + all_start;
- return;
- }
- else{
- *end -= width;
- len += width;
- curr_caucus--;
- }
- }
- }
- else{
- while(1){
- dim_t width = ceil(sqrt(*start * *start + num)) - *start;
- width = (width % block_factor == 0) ? width : width + block_factor - (width % block_factor);
-
- if( work_id == 0 ) {
- *start = *start + all_start;
- *end = bli_min( *start + width, all_end );
- return;
- }
- else{
- *start = *start + width;
- }
- work_id--;
- }
- }
+ bli_get_range( thr, all_start, all_end, block_factor,
+ FALSE, start, end );
+}
+
+// bli_get_range_r2l: thin wrapper that forwards all arguments to
+// bli_get_range() with handle_edge_low = TRUE.
+void bli_get_range_r2l( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end )
+{
+ bli_get_range( thr, all_start, all_end, block_factor,
+ TRUE, start, end );
+}
+
+// bli_get_range_t2b: thin wrapper that forwards all arguments to
+// bli_get_range() with handle_edge_low = FALSE.
+void bli_get_range_t2b( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end )
+{
+ bli_get_range( thr, all_start, all_end, block_factor,
+ FALSE, start, end );
+}
+
+// bli_get_range_b2t: thin wrapper that forwards all arguments to
+// bli_get_range() with handle_edge_low = TRUE.
+void bli_get_range_b2t( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end )
+{
+ bli_get_range( thr, all_start, all_end, block_factor,
+ TRUE, start, end );
}
+// bli_get_range_weighted: compute the sub-range of [all_start, all_end)
+// assigned to one thread, using partition widths that vary with position
+// instead of being uniform.
+//
+//   thr             - pointer that is cast to thrinfo_t*; its n_way and
+//                     work_id fields are read.
+//   all_start,
+//   all_end         - bounds of the whole index range being partitioned.
+//   block_factor    - widths are rounded up to a multiple of this value,
+//                     except for the one width that is adjusted to carry
+//                     the leftover (size % block_factor).
+//   uplo            - if bli_is_lower( uplo ), widths are generated from
+//                     the high end of the range towards the low end;
+//                     otherwise from the low end towards the high end.
+//   handle_edge_low - selects whether the leftover is carried by the
+//                     partition at the low end (TRUE) or the high end
+//                     (FALSE) of the range.
+//   start, end      - outputs, expressed relative to the caller's indices
+//                     (all_start is added back before returning).
+//
+// Each width is ceil( sqrt( len^2 + num ) ) - len, where len is the extent
+// already handed out and num = size^2 / n_way.
+//
+// The loops have no explicit bound: they terminate when the running
+// partition counter reaches work_id, so work_id must lie in [0, n_way).
+// block_factor and n_way are used as divisors and must be non-zero.
+void bli_get_range_weighted( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, uplo_t uplo, bool_t handle_edge_low, dim_t* start, dim_t* end )
+{
+ thrinfo_t* thread = ( thrinfo_t* )thr;
+ dim_t n_way = thread->n_way;
+ dim_t work_id = thread->work_id;
+ dim_t size = all_end - all_start;
+ dim_t width;
+ dim_t block_fac_leftover = size % block_factor;
+ dim_t i;
+ double num;
+
+ *start = 0;
+ *end = all_end - all_start;
+ // Form the square in double precision. Multiplying in dim_t first can
+ // overflow (e.g. when dim_t is a 32-bit integer and size > 46340)
+ // before the division converts the result to double.
+ num = ( double )size * ( double )size / ( double )n_way;
+
+ if ( bli_is_lower( uplo ) )
+ {
+ dim_t cur_caucus = n_way - 1;
+ dim_t len = 0;
+
+ // This loop computes subpartitions backwards, from the high end
+ // of the index range to the low end. If the low end is assumed
+ // to be on the left and the high end the right, this assignment
+ // of widths is appropriate for n dimension partitioning of a
+ // lower triangular matrix.
+ for ( i = 0; TRUE; ++i )
+ {
+ // Square len in double precision for the same overflow reason
+ // as num above.
+ width = ceil( sqrt( ( double )len * ( double )len + num ) ) - len;
+
+ // If we need to allocate the edge case (assuming it exists)
+ // to the high thread subpartition, adjust width so that it
+ // contains the exact amount of leftover edge dimension so that
+ // all remaining subpartitions can be multiples of block_factor.
+ // If the edge case is to be allocated to the low subpartition,
+ // or if there is no edge case, it is implicitly allocated to
+ // the low subpartition by virtue of the fact that all other
+ // subpartitions already assigned will be multiples of
+ // block_factor.
+ if ( i == 0 && !handle_edge_low )
+ {
+ if ( width % block_factor != block_fac_leftover )
+ width += block_fac_leftover - ( width % block_factor );
+ }
+ else
+ {
+ if ( width % block_factor != 0 )
+ width += block_factor - ( width % block_factor );
+ }
+
+ if ( cur_caucus == work_id )
+ {
+ // Clamp at 0 so the lowest partition absorbs whatever
+ // remains of the range.
+ *start = bli_max( 0, *end - width ) + all_start;
+ *end = *end + all_start;
+ return;
+ }
+ else
+ {
+ *end -= width;
+ len += width;
+ cur_caucus--;
+ }
+ }
+ }
+ else // if ( bli_is_upper( uplo ) )
+ {
+ // This loop computes subpartitions forwards, from the low end
+ // of the index range to the high end. If the low end is assumed
+ // to be on the left and the high end the right, this assignment
+ // of widths is appropriate for n dimension partitioning of an
+ // upper triangular matrix.
+ for ( i = 0; TRUE; ++i )
+ {
+ // *start doubles as the extent already handed out; square it
+ // in double precision to avoid integer overflow.
+ width = ceil( sqrt( ( double )*start * ( double )*start + num ) ) - *start;
+
+ if ( i == 0 && handle_edge_low )
+ {
+ if ( width % block_factor != block_fac_leftover )
+ width += block_fac_leftover - ( width % block_factor );
+ }
+ else
+ {
+ if ( width % block_factor != 0 )
+ width += block_factor - ( width % block_factor );
+ }
+
+ if ( work_id == 0 )
+ {
+ // Clamp at all_end so the highest partition does not run
+ // past the end of the range.
+ *start = *start + all_start;
+ *end = bli_min( *start + width, all_end );
+ return;
+ }
+ else
+ {
+ *start = *start + width;
+ work_id--;
+ }
+ }
+ }
+}
+
+// bli_get_range_weighted_l2r: dispatch on uplo.
+//  - upper or lower: call bli_get_range_weighted() with uplo unchanged and
+//    handle_edge_low = FALSE.
+//  - anything else (dense or zeros): fall back to the unweighted
+//    bli_get_range_l2r(); uplo is not used in that case.
+void bli_get_range_weighted_l2r( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, uplo_t uplo, dim_t* start, dim_t* end )
+{
+ if ( bli_is_upper_or_lower( uplo ) )
+ {
+ bli_get_range_weighted( thr, all_start, all_end, block_factor,
+ uplo, FALSE, start, end );
+ }
+ else // if dense or zeros
+ {
+ bli_get_range_l2r( thr, all_start, all_end, block_factor,
+ start, end );
+ }
+}
+
+// bli_get_range_weighted_r2l: dispatch on uplo.
+//  - upper or lower: apply bli_toggle_uplo() to the local uplo, then call
+//    bli_get_range_weighted() with handle_edge_low = TRUE.
+//  - anything else (dense or zeros): fall back to the unweighted
+//    bli_get_range_r2l(); uplo is not used in that case.
+void bli_get_range_weighted_r2l( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, uplo_t uplo, dim_t* start, dim_t* end )
+{
+ if ( bli_is_upper_or_lower( uplo ) )
+ {
+ //printf( "bli_get_range_weighted_r2l: is upper or lower\n" );
+ // NOTE(review): bli_toggle_uplo is presumably a macro that flips
+ // uplo in place (upper <-> lower); its definition is not shown
+ // here -- confirm.
+ bli_toggle_uplo( uplo );
+ bli_get_range_weighted( thr, all_start, all_end, block_factor,
+ uplo, TRUE, start, end );
+ }
+ else // if dense or zeros
+ {
+ //printf( "bli_get_range_weighted_r2l: is dense or zeros\n" );
+ bli_get_range_r2l( thr, all_start, all_end, block_factor,
+ start, end );
+ }
+}
+
+// bli_get_range_weighted_t2b: dispatch on uplo.
+//  - upper or lower: apply bli_toggle_uplo() to the local uplo, then call
+//    bli_get_range_weighted() with handle_edge_low = FALSE.
+//  - anything else (dense or zeros): fall back to the unweighted
+//    bli_get_range_t2b(); uplo is not used in that case.
+void bli_get_range_weighted_t2b( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, uplo_t uplo, dim_t* start, dim_t* end )
+{
+ if ( bli_is_upper_or_lower( uplo ) )
+ {
+ // NOTE(review): bli_toggle_uplo is presumably a macro that flips
+ // uplo in place (upper <-> lower); its definition is not shown
+ // here -- confirm.
+ bli_toggle_uplo( uplo );
+ bli_get_range_weighted( thr, all_start, all_end, block_factor,
+ uplo, FALSE, start, end );
+ }
+ else // if dense or zeros
+ {
+ bli_get_range_t2b( thr, all_start, all_end, block_factor,
+ start, end );
+ }
+}
+
+// bli_get_range_weighted_b2t: dispatch on uplo.
+//  - upper or lower: call bli_get_range_weighted() with uplo unchanged and
+//    handle_edge_low = TRUE.
+//  - anything else (dense or zeros): fall back to the unweighted
+//    bli_get_range_b2t(); uplo is not used in that case.
+void bli_get_range_weighted_b2t( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, uplo_t uplo, dim_t* start, dim_t* end )
+{
+ if ( bli_is_upper_or_lower( uplo ) )
+ {
+ bli_get_range_weighted( thr, all_start, all_end, block_factor,
+ uplo, TRUE, start, end );
+ }
+ else // if dense or zeros
+ {
+ bli_get_range_b2t( thr, all_start, all_end, block_factor,
+ start, end );
+ }
+ }
void bli_level3_thread_decorator( dim_t n_threads,
level3_int_t func,
obj_t* alpha,
dim_t number = 1;
#ifdef BLIS_ENABLE_C66X_BUILD
if(strcmp(env,"BLIS_JC_NT")==0)
- number = 1;
+ number = BLIS_C66X_JC_NT;
if(strcmp(env,"BLIS_IC_NT")==0)
- number = 8;
+ number = BLIS_C66X_IC_NT;
if(strcmp(env,"BLIS_JR_NT")==0)
- number = 1;
+ number = BLIS_C66X_JR_NT;
if(strcmp(env,"BLIS_IR_NT")==0)
- number = 1;
+ number = BLIS_C66X_IR_NT;
return number;
#else
char* str = getenv( env );
index 7ca163d8108b3cbe71c0acc705e27c48d250cda2..19c8118fca5bc24fbdb85528caf4962735c9c44f 100644 (file)
};
typedef struct thrinfo_s thrinfo_t;
+// bli_get_thread_num: compile-time alias for the function that returns the
+// calling thread's number. It expands to lib_get_coreID when
+// BLIS_ENABLE_C66X_BUILD is defined and to omp_get_thread_num otherwise.
+// Only the function name is substituted, so call it as bli_get_thread_num().
+#ifdef BLIS_ENABLE_C66X_BUILD
+#define bli_get_thread_num lib_get_coreID
+#else
+#define bli_get_thread_num omp_get_thread_num
+#endif
+
+
// Thread Info Interface Definitions
#define thread_ocomm( thread ) (thread->ocomm)
#define thread_icomm( thread ) (thread->icomm)
#define thread_obarrier( thread ) bli_barrier( thread->ocomm, thread->ocomm_id )
#define thread_ibarrier( thread ) bli_barrier( thread->icomm, thread->icomm_id )
-void bli_get_range( void* thread, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end );
-void bli_get_range_weighted( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, bool_t forward, dim_t* start, dim_t* end);
+//void bli_get_range( void* thread, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end );
+//void bli_get_range_weighted( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, bool_t forward, dim_t* start, dim_t* end);
+
+void bli_get_range( void* thr, dim_t all_start, dim_t all_end,
+ dim_t block_factor,
+ bool_t handle_edge_low,
+ dim_t* start, dim_t* end );
+void bli_get_range_l2r( void* thr, dim_t all_start, dim_t all_end,
+ dim_t block_factor,
+ dim_t* start, dim_t* end );
+void bli_get_range_r2l( void* thr, dim_t all_start, dim_t all_end,
+ dim_t block_factor,
+ dim_t* start, dim_t* end );
+void bli_get_range_t2b( void* thr, dim_t all_start, dim_t all_end,
+ dim_t block_factor,
+ dim_t* start, dim_t* end );
+void bli_get_range_b2t( void* thr, dim_t all_start, dim_t all_end,
+ dim_t block_factor,
+ dim_t* start, dim_t* end );
+
+void bli_get_range_weighted( void* thr, dim_t all_start, dim_t all_end,
+ dim_t block_factor, uplo_t uplo,
+ bool_t handle_edge_low,
+ dim_t* start, dim_t* end );
+void bli_get_range_weighted_l2r( void* thr, dim_t all_start, dim_t all_end,
+ dim_t block_factor, uplo_t uplo,
+ dim_t* start, dim_t* end );
+void bli_get_range_weighted_r2l( void* thr, dim_t all_start, dim_t all_end,
+ dim_t block_factor, uplo_t uplo,
+ dim_t* start, dim_t* end );
+void bli_get_range_weighted_t2b( void* thr, dim_t all_start, dim_t all_end,
+ dim_t block_factor, uplo_t uplo,
+ dim_t* start, dim_t* end );
+void bli_get_range_weighted_b2t( void* thr, dim_t all_start, dim_t all_end,
+ dim_t block_factor, uplo_t uplo,
+ dim_t* start, dim_t* end );
+
thrinfo_t* bli_create_thread_info( thread_comm_t* ocomm, dim_t ocomm_id,
thread_comm_t* icomm, dim_t icomm_id,
dim_t n_way, dim_t work_id );
index 24ebbf187fb6ce1a5fd2935fd90a10baf70375b2..da2dc4726582d1553e30b1f81865402d1e32e89e 100644 (file)
+#include "bli_config.h"
+#include "bli_system.h"
+#include "bli_type_defs.h"
+#include "bli_cblas.h"
#ifdef BLIS_ENABLE_CBLAS
/*
* cblas_caxpy.c
index 8afee1552d2e07d7cb71a6aa91c63741535ae7d1..5a9a3dd1d20ae50aed6b073604323378eeb962c9 100644 (file)
+#include "bli_config.h"
+#include "bli_system.h"
+#include "bli_type_defs.h"
+#include "bli_cblas.h"
#ifdef BLIS_ENABLE_CBLAS
/*
* cblas_ccopy.c
diff --git a/blis/frame/include/bli_mem_pool_macro_defs.h b/blis/frame/include/bli_mem_pool_macro_defs.h
index d371d63bda455caee126eebe768e31a419ec5cf8..c79f438ce87a2b266657a4047c7b2a9391435603 100644 (file)
#ifdef BLIS_ENABLE_C66X_MEM_POOLS
-/*#define BLIS_MK_BLOCK_SIZE_S ( BLIS_POOL_MC_L2_S * \
- ( BLIS_POOL_KC_L2_S + \
- ( BLIS_UPANEL_A_ALIGN_SIZE_S / \
- BLIS_SIZEOF_S ) \
- ) * \
- BLIS_SIZEOF_S \
- )
-#define BLIS_KN_BLOCK_SIZE_S ( \
- ( BLIS_POOL_KC_L3_S + \
- ( BLIS_UPANEL_B_ALIGN_SIZE_S / \
- BLIS_SIZEOF_S ) \
- ) * \
- BLIS_POOL_NC_L3_S * \
- BLIS_SIZEOF_S \
- )
-*/
+
+#if defined (BLIS_ENABLE_C66X_K2H)
#define BLIS_MK_BLOCK_SIZE_S ( bli_max( BLIS_POOL_MC_S*(BLIS_POOL_MC_S + BLIS_POOL_KC_S), \
(BLIS_POOL_MC_S + BLIS_POOL_KC_S)*(BLIS_POOL_MC_S + BLIS_POOL_KC_S)/4 \
BLIS_SIZEOF_S \
)
+#elif defined (BLIS_ENABLE_C66X_AM57X)
+
+// DMA is not used, so we do not need to account for the extra memory that would have to be DMA'ed to rebuild symmetric matrices.
+
+#define BLIS_MK_BLOCK_SIZE_S ( BLIS_POOL_MC_S * \
+ ( BLIS_POOL_KC_S \
+ ) * \
+ BLIS_SIZEOF_S \
+ )
+
+#define BLIS_KN_BLOCK_SIZE_S ( \
+ ( BLIS_POOL_KC_S \
+ ) * \
+ BLIS_POOL_NC_S * \
+ BLIS_SIZEOF_S \
+ )
+
+#endif
+
#define BLIS_MN_BLOCK_SIZE_S ( BLIS_POOL_MC_S * \
BLIS_POOL_NC_S * \
BLIS_SIZEOF_S \
#ifdef BLIS_ENABLE_C66X_MEM_POOLS
-/*
-#define BLIS_MK_BLOCK_SIZE_D ( BLIS_POOL_MC_L2_D * \
- ( BLIS_POOL_KC_L2_D + \
- ( BLIS_UPANEL_A_ALIGN_SIZE_D / \
- BLIS_SIZEOF_D ) \
- ) * \
- BLIS_SIZEOF_D \
- )
-#define BLIS_KN_BLOCK_SIZE_D ( \
- ( BLIS_POOL_KC_L3_D + \
- ( BLIS_UPANEL_B_ALIGN_SIZE_D / \
- BLIS_SIZEOF_D ) \
- ) * \
- BLIS_POOL_NC_L3_D * \
- BLIS_SIZEOF_D \
- )
-*/
+#if defined (BLIS_ENABLE_C66X_K2H)
+
+
#define BLIS_MK_BLOCK_SIZE_D ( bli_max( BLIS_POOL_MC_D*(BLIS_POOL_MC_D + BLIS_POOL_KC_D), \
(BLIS_POOL_MC_D + BLIS_POOL_KC_D)*(BLIS_POOL_MC_D + BLIS_POOL_KC_D)/4 \
) * \
BLIS_SIZEOF_D \
)
+#elif defined (BLIS_ENABLE_C66X_AM57X)
+#define BLIS_MK_BLOCK_SIZE_D ( BLIS_POOL_MC_D * \
+ ( BLIS_POOL_KC_D \
+ ) * \
+ BLIS_SIZEOF_D \
+ )
+
+#define BLIS_KN_BLOCK_SIZE_D ( \
+ ( BLIS_POOL_KC_D \
+ ) * \
+ BLIS_POOL_NC_D * \
+ BLIS_SIZEOF_D \
+ )
+
+#endif
#define BLIS_MN_BLOCK_SIZE_D ( BLIS_POOL_MC_D * \
BLIS_POOL_NC_D * \
#ifdef BLIS_ENABLE_C66X_MEM_POOLS
-/*
-#define BLIS_MK_BLOCK_SIZE_C ( BLIS_POOL_MC_L2_C * \
- ( BLIS_POOL_KC_L2_C + \
- ( BLIS_UPANEL_A_ALIGN_SIZE_C / \
- BLIS_SIZEOF_C ) \
- ) * \
- BLIS_SIZEOF_C \
- )
-#define BLIS_KN_BLOCK_SIZE_C ( \
- ( BLIS_POOL_KC_L3_C + \
- ( BLIS_UPANEL_B_ALIGN_SIZE_C / \
- BLIS_SIZEOF_C ) \
- ) * \
- BLIS_POOL_NC_L3_C * \
- BLIS_SIZEOF_C \
- )
-*/
+
+#if defined (BLIS_ENABLE_C66X_K2H)
+
+
#define BLIS_MK_BLOCK_SIZE_C ( bli_max( BLIS_POOL_MC_C*(BLIS_POOL_MC_C + BLIS_POOL_KC_C), \
(BLIS_POOL_MC_C + BLIS_POOL_KC_C)*(BLIS_POOL_MC_C + BLIS_POOL_KC_C)/4 \
) * \
BLIS_SIZEOF_D \
)
+#elif defined (BLIS_ENABLE_C66X_AM57X)
+
+#define BLIS_MK_BLOCK_SIZE_C ( BLIS_POOL_MC_C * \
+ ( BLIS_POOL_KC_C \
+ ) * \
+ BLIS_SIZEOF_C \
+ )
+
+#define BLIS_KN_BLOCK_SIZE_C ( \
+ ( BLIS_POOL_KC_C \
+ ) * \
+ BLIS_POOL_NC_C * \
+ BLIS_SIZEOF_C \
+ )
+#endif
+
#define BLIS_MN_BLOCK_SIZE_C ( BLIS_POOL_MC_C * \
BLIS_POOL_NC_C * \
BLIS_SIZEOF_C \
#ifdef BLIS_ENABLE_C66X_MEM_POOLS
-/*
-#define BLIS_MK_BLOCK_SIZE_Z ( BLIS_POOL_MC_L2_Z * \
- ( BLIS_POOL_KC_L2_Z + \
- ( BLIS_UPANEL_A_ALIGN_SIZE_Z / \
- BLIS_SIZEOF_Z ) \
- ) * \
- BLIS_SIZEOF_Z \
- )
-#define BLIS_KN_BLOCK_SIZE_Z ( \
- ( BLIS_POOL_KC_L3_Z + \
- ( BLIS_UPANEL_B_ALIGN_SIZE_Z / \
- BLIS_SIZEOF_Z ) \
- ) * \
- BLIS_POOL_NC_L3_Z * \
- BLIS_SIZEOF_Z \
- )
-*/
+#if defined (BLIS_ENABLE_C66X_K2H)
#define BLIS_MK_BLOCK_SIZE_Z ( bli_max( BLIS_POOL_MC_Z * (BLIS_POOL_MC_Z + BLIS_POOL_KC_Z), \
(BLIS_POOL_MC_Z + BLIS_POOL_KC_Z)*(BLIS_POOL_MC_Z + BLIS_POOL_KC_Z)/4 \
BLIS_SIZEOF_Z \
)
+#elif defined (BLIS_ENABLE_C66X_AM57X)
+
+#define BLIS_MK_BLOCK_SIZE_Z ( BLIS_POOL_MC_Z * \
+ ( BLIS_POOL_KC_Z \
+ ) * \
+ BLIS_SIZEOF_Z \
+ )
+
+#define BLIS_KN_BLOCK_SIZE_Z ( \
+ ( BLIS_POOL_KC_Z \
+ ) * \
+ BLIS_POOL_NC_Z * \
+ BLIS_SIZEOF_Z \
+ )
+
+#endif
+
#define BLIS_MN_BLOCK_SIZE_Z ( BLIS_POOL_MC_Z * \
BLIS_POOL_NC_Z * \
BLIS_SIZEOF_Z \
// Compute memory pool block sizes for single complex (4m).
//
#ifdef BLIS_ENABLE_C66X_MEM_POOLS
-/*
-#define BLIS_MK_BLOCK_SIZE_4M_C ( BLIS_POOL_4M_MC_L2_C * \
- ( BLIS_POOL_4M_KC_L2_C + \
- ( BLIS_UPANEL_A_ALIGN_SIZE_C / \
- BLIS_SIZEOF_C ) \
- ) * \
- BLIS_SIZEOF_C \
- )
-#define BLIS_KN_BLOCK_SIZE_4M_C ( \
- ( BLIS_POOL_4M_KC_L3_C + \
- ( BLIS_UPANEL_B_ALIGN_SIZE_C / \
- BLIS_SIZEOF_C ) \
- ) * \
- BLIS_POOL_4M_NC_L3_C * \
- BLIS_SIZEOF_C \
- )
-*/
+#if defined (BLIS_ENABLE_C66X_K2H)
#define BLIS_MK_BLOCK_SIZE_4M_C ( bli_max( BLIS_POOL_4M_MC_C*(BLIS_POOL_4M_MC_C + BLIS_POOL_4M_KC_C), \
(BLIS_POOL_4M_MC_C + BLIS_POOL_4M_KC_C)*(BLIS_POOL_4M_MC_C + BLIS_POOL_4M_KC_C)/4 \
) * \
) * \
BLIS_SIZEOF_C \
)
+#elif defined (BLIS_ENABLE_C66X_AM57X)
+#define BLIS_MK_BLOCK_SIZE_4M_C ( BLIS_POOL_4M_MC_C * \
+ ( BLIS_POOL_4M_KC_C \
+ ) * \
+ BLIS_SIZEOF_C \
+ )
+
+#define BLIS_KN_BLOCK_SIZE_4M_C ( \
+ ( BLIS_POOL_4M_KC_C \
+ ) * \
+ BLIS_POOL_4M_NC_C * \
+ BLIS_SIZEOF_C \
+ )
+#endif
+
#define BLIS_MN_BLOCK_SIZE_4M_C ( BLIS_POOL_4M_MC_C * \
// Compute memory pool block sizes for double complex (4m).
//
#ifdef BLIS_ENABLE_C66X_MEM_POOLS
-/*
-#define BLIS_MK_BLOCK_SIZE_4M_Z ( BLIS_POOL_4M_MC_L2_Z * \
- ( BLIS_POOL_4M_KC_L2_Z + \
- ( BLIS_UPANEL_A_ALIGN_SIZE_Z / \
- BLIS_SIZEOF_Z ) \
- ) * \
- BLIS_SIZEOF_Z \
- )
-#define BLIS_KN_BLOCK_SIZE_4M_Z ( \
- ( BLIS_POOL_4M_KC_L3_Z + \
- ( BLIS_UPANEL_B_ALIGN_SIZE_Z / \
- BLIS_SIZEOF_Z ) \
- ) * \
- BLIS_POOL_4M_NC_L3_Z * \
- BLIS_SIZEOF_Z \
- )
-*/
+
+#if defined (BLIS_ENABLE_C66X_K2H)
#define BLIS_MK_BLOCK_SIZE_4M_Z ( bli_max( BLIS_POOL_4M_MC_Z*(BLIS_POOL_4M_MC_Z + BLIS_POOL_4M_KC_Z), \
(BLIS_POOL_4M_MC_Z + BLIS_POOL_4M_KC_Z)*(BLIS_POOL_4M_MC_Z + BLIS_POOL_4M_KC_Z)/4 \
) * \
BLIS_SIZEOF_Z \
)
+#elif defined (BLIS_ENABLE_C66X_AM57X)
+
+#define BLIS_MK_BLOCK_SIZE_4M_Z ( BLIS_POOL_4M_MC_Z * \
+ ( BLIS_POOL_4M_KC_Z \
+ ) * \
+ BLIS_SIZEOF_Z \
+ )
+
+#define BLIS_KN_BLOCK_SIZE_4M_Z ( \
+ ( BLIS_POOL_4M_KC_Z \
+ ) * \
+ BLIS_POOL_4M_NC_Z * \
+ BLIS_SIZEOF_Z \
+ )
+#endif
+
#define BLIS_MN_BLOCK_SIZE_4M_Z ( BLIS_POOL_4M_MC_Z * \
BLIS_POOL_4M_NC_Z * \
// NOTE: We scale by 3/2 because 3m requires 50% more space than 4m.
#ifdef BLIS_ENABLE_C66X_MEM_POOLS
-/*
-#define BLIS_MK_BLOCK_SIZE_3M_C ( BLIS_POOL_3M_MC_L2_C * \
- ( BLIS_POOL_3M_KC_L2_C + \
- ( BLIS_UPANEL_A_ALIGN_SIZE_C / \
- BLIS_SIZEOF_C ) \
- ) * \
- ( BLIS_SIZEOF_C * \
- 3 \
- ) / 2 \
- )
-#define BLIS_KN_BLOCK_SIZE_3M_C ( \
- ( BLIS_POOL_3M_KC_L3_C + \
- ( BLIS_UPANEL_B_ALIGN_SIZE_C / \
- BLIS_SIZEOF_C ) \
- ) * \
- BLIS_POOL_3M_NC_L3_C * \
- ( BLIS_SIZEOF_C * \
- 3 \
- ) / 2 \
- )
-*/
+
+#if defined(BLIS_ENABLE_C66X_K2H)
+
#define BLIS_MK_BLOCK_SIZE_3M_C ( bli_max( BLIS_POOL_3M_MC_C*(BLIS_POOL_3M_MC_C + BLIS_POOL_3M_KC_C), \
(BLIS_POOL_3M_MC_C + BLIS_POOL_3M_KC_C)*(BLIS_POOL_3M_MC_C + BLIS_POOL_3M_KC_C)/4 \
) * \
3 / 2 \
)
+#elif defined (BLIS_ENABLE_C66X_AM57X)
+
+#define BLIS_MK_BLOCK_SIZE_3M_C ( BLIS_POOL_3M_MC_C * \
+ ( BLIS_POOL_3M_KC_C \
+ ) * \
+ ( BLIS_SIZEOF_C * \
+ 3 \
+ ) / 2 \
+ )
+
+#define BLIS_KN_BLOCK_SIZE_3M_C ( \
+ ( BLIS_POOL_3M_KC_C \
+ ) * \
+ BLIS_POOL_3M_NC_C * \
+ ( BLIS_SIZEOF_C * \
+ 3 \
+ ) / 2 \
+ )
+#endif
#define BLIS_MN_BLOCK_SIZE_3M_C ( BLIS_POOL_3M_MC_C * \
BLIS_POOL_3M_NC_C * \
// NOTE: We scale by 3/2 because 3m requires 50% more space than 4m.
#ifdef BLIS_ENABLE_C66X_MEM_POOLS
-/*
-#define BLIS_MK_BLOCK_SIZE_3M_Z ( BLIS_POOL_3M_MC_L2_Z * \
- ( BLIS_POOL_3M_KC_L2_Z + \
- ( BLIS_UPANEL_A_ALIGN_SIZE_Z / \
- BLIS_SIZEOF_Z ) \
- ) * \
- ( BLIS_SIZEOF_Z * \
- 3 \
- ) / 2 \
- )
-#define BLIS_KN_BLOCK_SIZE_3M_Z ( \
- ( BLIS_POOL_3M_KC_L3_Z + \
- ( BLIS_UPANEL_B_ALIGN_SIZE_Z / \
- BLIS_SIZEOF_Z ) \
- ) * \
- BLIS_POOL_3M_NC_L3_Z * \
- ( BLIS_SIZEOF_Z * \
- 3 \
- ) / 2 \
- )
-*/
+#if defined(BLIS_ENABLE_C66X_K2H)
+
#define BLIS_MK_BLOCK_SIZE_3M_Z ( bli_max( BLIS_POOL_3M_MC_Z*(BLIS_POOL_3M_MC_Z + BLIS_POOL_3M_KC_Z), \
(BLIS_POOL_3M_MC_Z + BLIS_POOL_3M_KC_Z)*(BLIS_POOL_3M_MC_Z + BLIS_POOL_3M_KC_Z) / 4 \
) * \
3 / 2 \
)
+#elif defined (BLIS_ENABLE_C66X_AM57X)
+#define BLIS_MK_BLOCK_SIZE_3M_Z ( BLIS_POOL_3M_MC_Z * \
+ ( BLIS_POOL_3M_KC_Z \
+ ) * \
+ ( BLIS_SIZEOF_Z * \
+ 3 \
+ ) / 2 \
+ )
+
+
+#define BLIS_KN_BLOCK_SIZE_3M_Z ( \
+ ( BLIS_POOL_3M_KC_Z \
+ ) * \
+ BLIS_POOL_3M_NC_Z * \
+ ( BLIS_SIZEOF_Z * \
+ 3 \
+ ) / 2 \
+ )
+#endif
+
#define BLIS_MN_BLOCK_SIZE_3M_Z ( BLIS_POOL_3M_MC_Z * \
BLIS_POOL_3M_NC_Z * \
)
-/*#define BLIS_MN_POOL_SIZE_L1 ( \
- BLIS_NUM_MC_X_NC_BLOCKS_L1 * \
- ( BLIS_MN_BLOCK_SIZE + \
- BLIS_CONTIG_ADDR_ALIGN_SIZE \
- ) + \
- BLIS_MAX_PRELOAD_BYTE_OFFSET \
+#define BLIS_MNR_POOL_SIZE_L2 ( \
+ BLIS_NUM_MC_X_NR_BLOCKS_L1 * \
+ ( BLIS_MNR_BLOCK_SIZE ) \
)
-*/
+
#define BLIS_MN_POOL_SIZE_L1 ( \
BLIS_NUM_MC_X_NC_BLOCKS_L1 * \
( BLIS_MN_BLOCK_SIZE + \
index 2b01a4d62f286c2f6a817514a1f9f42f4fbb9ddd..7f2ec1885c6150af596a033bf2c8be0139aec5e2 100644 (file)
\
bli_obj_is_lower( *bli_obj_root( obj ) ) \
+// bli_obj_root_uplo( obj ): expands to bli_obj_uplo() applied to the
+// dereferenced result of bli_obj_root( obj ), i.e. it queries the uplo
+// property on the root object of obj rather than on obj itself.
+#define bli_obj_root_uplo( obj ) \
+\
+ bli_obj_uplo( *bli_obj_root( obj ) )
// Root matrix modification
index c597d3f2386bab722a1c27988870376e5ce81545..363e59ec53ed28cd14c5908600892c8ca6ada819 100644 (file)
#ifndef BLIS_TYPE_DEFS_H
#define BLIS_TYPE_DEFS_H
-//#include <ti/libarch/libarch.h>
//
// -- BLIS basic types ---------------------------------------------------------
index 7c94e2b2223a2fae3503cb6b6bad8be46376fe42..d0926536fa8df0b040b5495a27f7b08ee3a627a9 100644 (file)
//DMA include
#include "bli_dma.h" //Has to be after bli_cntl, because bli_dma.h uses typedefs from bli_cntl.h
#endif
+#include "bli_profile.h"
+
// -- Level-0 operations --
index a4c2779bd8d9e4822991c30451b8451930962f60..dc810df29ff33c2aa62d1d4f75718a3d18aaef4d 100644 (file)
#define BLIS_FPRINTS_H
// prints
-#ifdef BLIS_ENABLE_C66X_OPENCL
-#define bli_sfprints( file, spec, x ) \
-{ \
- printf( spec, (x) ); \
-}
-#define bli_dfprints( file, spec, x ) \
-{ \
- printf(spec, (x) ); \
-}
-#define bli_cfprints( file, spec, x ) \
-{ \
- printf( spec, bli_creal(x) ); \
- printf( " + " ); \
- printf( spec, bli_cimag(x) ); \
- printf( " " ); \
-}
-#define bli_zfprints( file, spec, x ) \
-{ \
- printf( spec, bli_zreal(x) ); \
- printf( " + " ); \
- printf( spec, bli_zimag(x) ); \
- printf( " " ); \
-}
-#define bli_ifprints( file, spec, x ) \
-{ \
- printf( spec, (x) ); \
-}
-#else
+
#define bli_sfprints( file, spec, x ) \
{ \
fprintf( file, spec, (x) ); \
{ \
fprintf( file, spec, (x) ); \
}
-#endif
#endif
index 5097ab0dcfe6418ccfe8edc172b5a6ed84ba0528..d6afa34b575b02725c3a2da4d4b6876364c21f97 100644 (file)
dcomplex* zp = bli_obj_buffer_for_const( BLIS_DCOMPLEX, *x );
gint_t* ip = bli_obj_buffer_for_const( BLIS_INT, *x );
-#ifdef BLIS_ENABLE_C66X_OPENCL
- printf( "%s\n", s1 );
- printf( " float: %9.2e\n", bli_sreal( *sp ) );
- printf( " double: %9.2e\n", bli_dreal( *dp ) );
- printf( " scomplex: %9.2e + %9.2e\n", bli_creal( *cp ), bli_cimag( *cp ) );
- printf( " dcomplex: %9.2e + %9.2e\n", bli_zreal( *zp ), bli_zimag( *zp ) );
- printf( " int: %ld\n", (long int)*ip );
- printf( "\n" );
- return;
-#else
fprintf( file, "%s\n", s1 );
fprintf( file, " float: %9.2e\n", bli_sreal( *sp ) );
fprintf( file, " double: %9.2e\n", bli_dreal( *dp ) );
fprintf( file, " int: %ld\n", (long int)*ip );
fprintf( file, "\n" );
return;
-#endif
}
// Index into the type combination array to extract the correct
s2 );
}
-#ifdef BLIS_ENABLE_C66X_OPENCL
-#undef GENTFUNC
-#define GENTFUNC( ctype, ch, opname, varname ) \
-\
-void PASTEMAC(ch,opname)( \
- FILE* file, \
- char* s1, \
- dim_t m, \
- dim_t n, \
- void* x, inc_t rs_x, inc_t cs_x, \
- char* format, \
- char* s2 \
- ) \
-{ \
- dim_t i, j; \
- ctype* chi1; \
- char default_spec[32] = PASTEMAC(ch,formatspec)(); \
-\
- if ( format == NULL ) format = default_spec; \
-\
- printf( "%s\n", s1 ); \
-\
- for ( i = 0; i < m; ++i ) \
- { \
- for ( j = 0; j < n; ++j ) \
- { \
- chi1 = (( ctype* ) x) + i*rs_x + j*cs_x; \
-\
- PASTEMAC(ch,fprints)( file, format, *chi1 ); \
- printf( " " ); \
- } \
-\
- printf( ";\n" ); \
- } \
-\
- printf( "%s\n", s2 ); \
-}
-
-INSERT_GENTFUNC_BASIC_I( fprintm, fprintm )
-#else
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, varname ) \
\
}
INSERT_GENTFUNC_BASIC_I( fprintm, fprintm )
-#endif
+
index e3629615c07d5ea6232ec07e81488a793722de4b..c4387366b7ef4d9bb1231c7dc2730c4785e3c06d 100644 (file)
format,
s2 );
}
-#ifdef BLIS_ENABLE_C66X_OPENCL
-
-#undef GENTFUNC
-#define GENTFUNC( ctype, ch, opname, varname ) \
-\
-void PASTEMAC(ch,opname)( \
- FILE* file, \
- char* s1, \
- dim_t n, \
- void* x, inc_t incx, \
- char* format, \
- char* s2 \
- ) \
-{ \
- dim_t i; \
- ctype* chi1; \
- char default_spec[32] = PASTEMAC(ch,formatspec)(); \
-\
- if ( format == NULL ) format = default_spec; \
-\
- chi1 = x; \
-\
- printf( "%s\n", s1 ); \
-\
- for ( i = 0; i < n; ++i ) \
- { \
- PASTEMAC(ch,fprints)( file, format, *chi1 ); \
- printf( "\n" ); \
-\
- chi1 += incx; \
- } \
-\
- printf( "\n" ); \
- printf( "%s\n", s2 ); \
-}
-
-INSERT_GENTFUNC_BASIC_I( fprintv, fprintv )
-
-#else
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, varname ) \
}
INSERT_GENTFUNC_BASIC_I( fprintv, fprintv )
-#endif
diff --git a/blis/kernels/armv7a/3/bli_cgemm_kernel_2x2.S b/blis/kernels/armv7a/3/bli_cgemm_kernel_2x2.S
--- /dev/null
@@ -0,0 +1,502 @@
+
+// Exported symbol name of this kernel (complex single precision, 2x2 tile).
+#define REALNAME bli_cgemm_kernel_2x2
+
+// Bytes of scratch stack reserved by the prologue below the pushed registers.
+#define STACKSIZE 256
+
+// Arguments that arrive in core registers.
+#define K r0
+#define PTR_ALPHA r1
+#define OLD_A r2
+#define OLD_B r3
+// Arguments that arrive on the stack. The prologue pushes seven registers and
+// sets fp = sp + 28, so [fp, #0] is the first stack-passed word.
+#define PTR_BETA [fp, #0 ]
+#define OLD_C [fp, #4 ]
+#define OLD_RSC [fp, #8 ]
+#define OLD_CSC [fp, #12 ]
+// AUX is defined for completeness; no instruction in this kernel reads it.
+#define AUX [fp, #16 ]
+
+/******************************************************
+* [fp, #-128] - [fp, #-32] is reserved
+* for store and restore of the floating point
+* registers s8 - s31 (24 registers, 96 bytes)
+*******************************************************/
+
+// Loop counter. Shares r2 with OLD_A, which is copied to AO before L is written.
+#define L r2
+
+// Running pointers into A and B.
+#define AO r5
+#define BO r6
+
+// Pointers to the two parts of the C tile that are OLD_CSC elements apart.
+#define CO1 r7
+#define CO2 r8
+
+
+// Prefetch distances in bytes used by the pld instructions.
+#define A_PRE 96
+#define B_PRE 96
+#define C_PRE 0
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+// Instruction selectors used by SAVE2x2 to combine partial products.
+//   fmacs  Sd, Sn, Sm : Sd = Sd + Sn * Sm
+//   fnmacs Sd, Sn, Sm : Sd = Sd - Sn * Sm
+// FMAC_BR / FMAC_BI apply the imaginary part of beta to the values read from C.
+#define FMAC_BR fnmacs
+#define FMAC_BI fmacs
+
+// NN is defined unconditionally here, so the first branch of the chain below
+// is always the one that is assembled; the remaining branches are kept as
+// alternative sign conventions but are never selected as written.
+#define NN 1
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+
+ #define FADD_R fsubs
+ #define FADD_I fadds
+
+ #define FMAC_R1 fnmacs
+ #define FMAC_R2 fnmacs
+ #define FMAC_I1 fmacs
+ #define FMAC_I2 fnmacs
+
+#elif defined(CN) || defined(CT)
+
+ #define FADD_R fadds
+ #define FADD_I fsubs
+
+ #define FMAC_R1 fmacs
+ #define FMAC_R2 fmacs
+ #define FMAC_I1 fnmacs
+ #define FMAC_I2 fmacs
+
+#elif defined(NC) || defined(TC)
+
+ #define FADD_R fadds
+ #define FADD_I fsubs
+
+ #define FMAC_R1 fmacs
+ #define FMAC_R2 fnmacs
+ #define FMAC_I1 fmacs
+ #define FMAC_I2 fmacs
+
+#else
+
+ #define FADD_R fsubs
+ #define FADD_I fadds
+
+ #define FMAC_R1 fnmacs
+ #define FMAC_R2 fmacs
+ #define FMAC_I1 fnmacs
+ #define FMAC_I2 fnmacs
+
+#endif
+
+
+
+// INIT2x2: set the sixteen accumulators s16 - s31 to +0.0.
+// The zero is moved in from a core register instead of being computed as
+// s16 - s16: x - x is NaN whenever the stale content of s16 is NaN or
+// infinity, which would propagate into every accumulator and into C.
+// Clobbers r3. r3 is dead at the single call site (after the C pointers have
+// been set up) and is reloaded at the start of SAVE2x2.
+.macro INIT2x2
+
+ mov r3, #0 // integer zero has the same bit pattern as +0.0f
+ vmov s16, r3
+ vmov.f32 s17, s16
+ vmov.f32 s18, s16
+ vmov.f32 s19, s16
+ vmov.f32 s20, s16
+ vmov.f32 s21, s16
+ vmov.f32 s22, s16
+ vmov.f32 s23, s16
+ vmov.f32 s24, s16
+ vmov.f32 s25, s16
+ vmov.f32 s26, s16
+ vmov.f32 s27, s16
+ vmov.f32 s28, s16
+ vmov.f32 s29, s16
+ vmov.f32 s30, s16
+ vmov.f32 s31, s16
+
+.endm
+
+// KERNEL2x2_I: first step of the software-pipelined loop.
+// Loads one k-step of A into s0 - s3 and of B into s8 - s11, and initialises
+// all accumulators s16 - s31 with fmuls (so no prior zeroing is needed).
+// While multiplying it preloads the next k-step into s4 - s7 / s12 - s15.
+// AO and BO are advanced twice by 16 bytes (32 bytes in total).
+// Assuming interleaved (re, im) storage, s16 - s23 collect the products taken
+// with the first word of each A element and s24 - s31 those taken with the
+// second word; SAVE2x2 combines the two halves.
+.macro KERNEL2x2_I
+ pld [ AO , #A_PRE ]
+ pld [ BO , #B_PRE ]
+ flds s0 , [ AO ]
+ flds s1 , [ AO, #4 ]
+ flds s8 , [ BO ]
+ flds s9 , [ BO, #4 ]
+
+ fmuls s16 , s0, s8
+ flds s2 , [ AO, #8 ]
+ fmuls s24 , s1, s9
+ flds s3 , [ AO, #12 ]
+ fmuls s17 , s0, s9
+ flds s10, [ BO, #8 ]
+ fmuls s25 , s1, s8
+
+ flds s11, [ BO, #12 ]
+ fmuls s18 , s2, s8
+ add BO , BO, #16
+ fmuls s26 , s3, s9
+ add AO , AO, #16
+ fmuls s19 , s2, s9
+ pld [ BO , #B_PRE ]
+ fmuls s27 , s3, s8
+
+ pld [ AO , #A_PRE ]
+ fmuls s20 , s0, s10
+ flds s4 , [ AO, #0 ]
+ fmuls s28 , s1, s11
+ flds s5 , [ AO, #4 ]
+ fmuls s21 , s0, s11
+ flds s12, [ BO ]
+ fmuls s29 , s1, s10
+
+ flds s13, [ BO, #4 ]
+ fmuls s22 , s2, s10
+ flds s6 , [ AO, #8 ]
+ fmuls s30 , s3, s11
+ flds s7 , [ AO, #12 ]
+ fmuls s23 , s2, s11
+ flds s14, [ BO, #8 ]
+ fmuls s31 , s3, s10
+ flds s15, [ BO, #12 ]
+
+ add BO , BO, #16
+ add AO , AO, #16
+.endm
+
+
+
+// KERNEL2x2_M1: pipelined middle step, "even" phase.
+// Accumulates the k-step held in s0 - s3 (A) and s8 - s11 (B) into s16 - s31
+// while loading the following k-step into s4 - s7 / s12 - s15.
+// AO and BO each advance by 16 bytes.
+.macro KERNEL2x2_M1
+ pld [ AO , #A_PRE ]
+
+ fmacs s16 , s0, s8
+ pld [ BO , #B_PRE ]
+ fmacs s24 , s1, s9
+ flds s4 , [ AO, #0 ]
+ fmacs s17 , s0, s9
+ flds s5 , [ AO, #4 ]
+ fmacs s25 , s1, s8
+
+ flds s12, [ BO ]
+ fmacs s18 , s2, s8
+ flds s13, [ BO, #4 ]
+ fmacs s26 , s3, s9
+ flds s6 , [ AO, #8 ]
+ fmacs s19 , s2, s9
+ flds s7 , [ AO, #12 ]
+ fmacs s27 , s3, s8
+
+ fmacs s20 , s0, s10
+ flds s14, [ BO, #8 ]
+ fmacs s28 , s1, s11
+ fmacs s21 , s0, s11
+ flds s15, [ BO, #12 ]
+ fmacs s29 , s1, s10
+
+ fmacs s22 , s2, s10
+ add BO , BO, #16
+ fmacs s30 , s3, s11
+ fmacs s23 , s2, s11
+ add AO , AO, #16
+ fmacs s31 , s3, s10
+
+.endm
+
+// KERNEL2x2_M2: pipelined middle step, "odd" phase.
+// Accumulates the k-step held in s4 - s7 (A) and s12 - s15 (B) into s16 - s31
+// while loading the following k-step into s0 - s3 / s8 - s11.
+// AO and BO each advance by 16 bytes.
+.macro KERNEL2x2_M2
+
+ fmacs s16 , s4, s12
+ fmacs s24 , s5, s13
+ flds s0 , [ AO, #0 ]
+ fmacs s17 , s4, s13
+ flds s1 , [ AO, #4 ]
+ fmacs s25 , s5, s12
+
+ fmacs s18 , s6, s12
+ flds s8 , [ BO ]
+ fmacs s26 , s7, s13
+ flds s9 , [ BO, #4 ]
+ fmacs s19 , s6, s13
+ fmacs s27 , s7, s12
+
+ flds s2 , [ AO, #8 ]
+ fmacs s20 , s4, s14
+ flds s3 , [ AO, #12 ]
+ fmacs s28 , s5, s15
+ fmacs s21 , s4, s15
+ flds s10, [ BO, #8 ]
+ fmacs s29 , s5, s14
+
+ flds s11, [ BO, #12 ]
+ fmacs s22 , s6, s14
+ fmacs s30 , s7, s15
+ add BO , BO, #16
+ fmacs s23 , s6, s15
+ add AO , AO, #16
+ fmacs s31 , s7, s14
+
+.endm
+
+
+// KERNEL2x2_E: last step of the pipelined loop.
+// Accumulates the k-step already held in s4 - s7 (A) and s12 - s15 (B) into
+// s16 - s31. It performs no loads and leaves AO and BO unchanged.
+.macro KERNEL2x2_E
+
+ fmacs s16 , s4, s12
+ fmacs s24 , s5, s13
+ fmacs s17 , s4, s13
+ fmacs s25 , s5, s12
+
+ fmacs s18 , s6, s12
+ fmacs s26 , s7, s13
+ fmacs s19 , s6, s13
+ fmacs s27 , s7, s12
+
+ fmacs s20 , s4, s14
+ fmacs s28 , s5, s15
+ fmacs s21 , s4, s15
+ fmacs s29 , s5, s14
+
+ fmacs s22 , s6, s14
+ fmacs s30 , s7, s15
+ fmacs s23 , s6, s15
+ fmacs s31 , s7, s14
+
+.endm
+
+// KERNEL2x2_SUB: one self-contained k-step, used for the K % 8 remainder.
+// Loads A into s0 - s3 and B into s8 - s11, accumulates into s16 - s31 and
+// advances AO and BO by 16 bytes each. Unlike KERNEL2x2_I it uses fmacs, so
+// the accumulators must already hold valid values.
+.macro KERNEL2x2_SUB
+
+ flds s0 , [ AO ]
+ flds s1 , [ AO, #4 ]
+ flds s8 , [ BO ]
+ flds s9 , [ BO, #4 ]
+
+ fmacs s16 , s0, s8
+ flds s2 , [ AO, #8 ]
+ fmacs s24 , s1, s9
+ flds s3 , [ AO, #12 ]
+ fmacs s17 , s0, s9
+ flds s10, [ BO, #8 ]
+ fmacs s25 , s1, s8
+
+ flds s11, [ BO, #12 ]
+ fmacs s18 , s2, s8
+ fmacs s26 , s3, s9
+ fmacs s19 , s2, s9
+ fmacs s27 , s3, s8
+
+ fmacs s20 , s0, s10
+ fmacs s28 , s1, s11
+ fmacs s21 , s0, s11
+ fmacs s29 , s1, s10
+
+ fmacs s22 , s2, s10
+ add BO , BO, #16
+ fmacs s30 , s3, s11
+ fmacs s23 , s2, s11
+ add AO , AO, #16
+ fmacs s31 , s3, s10
+
+.endm
+
+
+
+
+// SAVE2x2: write the 2x2 tile back, computing beta * C + alpha * (accumulated
+// product) for each of the four complex entries.
+//   s0/s1 = alpha (re/im), s2/s3 = beta (re/im)
+//   r3    = OLD_RSC * 8, the byte distance between the two entries reached
+//           through the same CO pointer
+//   r4/r2 = saved copies of CO1/CO2 (r4 is first used to hold PTR_BETA)
+// Step 1 folds the two partial-product halves (s16-s23 with s24-s31) using
+// FADD_R / FADD_I. Step 2 scales the old C values by beta into s24 - s31 and
+// then adds the alpha-scaled products with the FMAC_* selectors.
+// Clobbers r2, r3, r4, s0 - s5, s8, s9 and s16 - s31. On exit CO1 and CO2
+// point at their second entry (start + r3).
+.macro SAVE2x2
+
+ ldr r3, OLD_RSC // Row stride size
+ lsl r3, r3, #3 // multiply with size of complex float
+
+ flds s0, [ PTR_ALPHA ] // load real part of alpha
+ flds s1, [ PTR_ALPHA, #4 ] // load imag part of alpha
+ ldr r4, PTR_BETA
+ flds s2, [ r4 ] // load real part of beta
+ flds s3, [ r4, #4 ] // load imag part of beta
+
+ // Add/Sub the real and the imag parts
+ FADD_R s16, s24 , s16
+ FADD_I s17, s25 , s17
+ FADD_R s18, s26 , s18
+ FADD_I s19, s27 , s19
+ FADD_R s20, s28 , s20
+ FADD_I s21, s29 , s21
+ FADD_R s22, s30 , s22
+ FADD_I s23, s31 , s23
+
+ mov r4, CO1 // save pointer
+ fldmias CO1, { s4 - s5 } // read real and imag part from C
+ add CO1, CO1, r3
+
+ mov r2, CO2 // save pointer
+ fldmias CO2, { s8 - s9 } // read real and imag part from C
+ add CO2, CO2, r3
+
+ fmuls s24, s4, s2 // multiply Beta-real with C-real
+ fmuls s25, s5, s2 // multiply Beta-real with C-imag
+ fmuls s28, s8, s2 // multiply Beta-real with C-real
+ fmuls s29, s9, s2 // multiply Beta-real with C-imag
+
+ FMAC_BR s24, s3, s5 // multiply beta-imag with C-imag and add
+ FMAC_BI s25, s3, s4 // multiply beta-imag with C-real and add
+ FMAC_BR s28, s3, s9 // multiply beta-imag with C-imag and add
+ FMAC_BI s29, s3, s8 // multiply beta-imag with C-real and add
+
+ FMAC_R1 s24 , s0 , s16
+ FMAC_I1 s25 , s0 , s17
+ FMAC_R2 s24 , s1 , s17
+ FMAC_I2 s25 , s1 , s16
+
+ FMAC_R1 s28 , s0 , s20
+ FMAC_I1 s29 , s0 , s21
+ FMAC_R2 s28 , s1 , s21
+ FMAC_I2 s29 , s1 , s20
+
+ fldmias CO1, { s4 - s5 } // read real and imag part from C
+ fldmias CO2, { s8 - s9 } // read real and imag part from C
+
+ fmuls s26, s4, s2 // multiply Beta-real with C-real
+ fmuls s27, s5, s2 // multiply Beta-real with C-imag
+ fmuls s30, s8, s2 // multiply Beta-real with C-real
+ fmuls s31, s9, s2 // multiply Beta-real with C-imag
+
+ FMAC_BR s26, s3, s5 // multiply beta-imag with C-imag and add
+ FMAC_BI s27, s3, s4 // multiply beta-imag with C-real and add
+ FMAC_BR s30, s3, s9 // multiply beta-imag with C-imag and add
+ FMAC_BI s31, s3, s8 // multiply beta-imag with C-real and add
+
+ FMAC_R1 s26 , s0 , s18
+ FMAC_I1 s27 , s0 , s19
+ FMAC_R2 s26 , s1 , s19
+ FMAC_I2 s27 , s1 , s18
+
+ FMAC_R1 s30, s0 , s22
+ FMAC_I1 s31, s0 , s23
+ FMAC_R2 s30, s1 , s23
+ FMAC_I2 s31, s1 , s22
+
+ mov CO1, r4 // restore pointer
+ mov CO2, r2 // restore pointer
+ fstmias CO1, { s24 - s25 }
+ fstmias CO2, { s28 - s29 }
+ add CO1, CO1, r3
+ add CO2, CO2, r3
+ fstmias CO1, { s26 - s27 }
+ fstmias CO2, { s30 - s31 }
+
+
+.endm
+
+
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+ .arm
+ .global REALNAME
+ .func REALNAME
+
+// bli_cgemm_kernel_2x2
+//   r0 = K (number of k-steps), r1 = pointer to alpha,
+//   r2 = pointer to A, r3 = pointer to B,
+//   stack: pointer to beta, pointer to C, row stride of C, column stride of C
+//   (strides are in elements and are scaled by 8 bytes here and in SAVE2x2).
+// The K loop is unrolled by 8 and software pipelined (KERNEL2x2_I/M1/M2/E);
+// the K % 8 remainder runs through KERNEL2x2_SUB, and SAVE2x2 writes the tile.
+// s8 - s31 are saved to and restored from the area starting at fp - 128.
+REALNAME:
+
+ push {r4 - r9, fp} // save register
+ add fp, sp, #28 // add number of saved register multiplied by size of int
+ sub sp, sp, #STACKSIZE // reserve stack
+
+ mov AO, OLD_A // pointer matrix A
+ mov BO, OLD_B // pointer matrix B
+
+ sub r3, fp, #128
+ vstm r3, { s8 - s31} // store floating point registers
+
+ ldr r2, OLD_C // pointer matrix C
+ ldr r3, OLD_CSC // Col stride size of C
+ lsl r3, r3, #3 // multiply with size of complex float
+
+ mov CO1, r2 // C
+ add CO2, CO1, r3 // C advanced by one column stride
+
+ pld [ CO1, #C_PRE ] // prefetch C at CO1
+ pld [ CO2, #C_PRE ] // prefetch C at CO2
+
+cgemm_kernel_L2_M2_20:
+
+ asrs L , K, #3 // L = K / 8
+ cmp L , #2
+ blt cgemm_kernel_L2_M2_32 // fewer than two unrolled blocks
+
+ KERNEL2x2_I
+ KERNEL2x2_M2
+ KERNEL2x2_M1
+ KERNEL2x2_M2
+
+ KERNEL2x2_M1
+ KERNEL2x2_M2
+ KERNEL2x2_M1
+ KERNEL2x2_M2
+
+ subs L, L, #2 // first and last block are handled outside the loop
+ ble cgemm_kernel_L2_M2_22a
+ .align 5
+
+cgemm_kernel_L2_M2_22:
+
+ KERNEL2x2_M1
+ KERNEL2x2_M2
+ KERNEL2x2_M1
+ KERNEL2x2_M2
+
+ KERNEL2x2_M1
+ KERNEL2x2_M2
+ KERNEL2x2_M1
+ KERNEL2x2_M2
+
+ subs L, L, #1
+ bgt cgemm_kernel_L2_M2_22
+
+cgemm_kernel_L2_M2_22a:
+
+ KERNEL2x2_M1
+ KERNEL2x2_M2
+ KERNEL2x2_M1
+ KERNEL2x2_M2
+
+ KERNEL2x2_M1
+ KERNEL2x2_M2
+ KERNEL2x2_M1
+ KERNEL2x2_E
+
+ b cgemm_kernel_L2_M2_44
+
+cgemm_kernel_L2_M2_32:
+
+ tst L, #1
+ // tst leaves V untouched, so test the Z flag directly instead of using a
+ // signed "le" condition that depends on a stale V.
+ beq cgemm_kernel_L2_M2_40 // no complete block of 8: just clear accumulators
+
+ KERNEL2x2_I
+ KERNEL2x2_M2
+ KERNEL2x2_M1
+ KERNEL2x2_M2
+
+ KERNEL2x2_M1
+ KERNEL2x2_M2
+ KERNEL2x2_M1
+ KERNEL2x2_E
+
+ b cgemm_kernel_L2_M2_44
+
+cgemm_kernel_L2_M2_40:
+
+ INIT2x2
+
+cgemm_kernel_L2_M2_44:
+
+ ands L , K, #7 // L = K % 8
+ beq cgemm_kernel_L2_M2_100 // ands does not update V either: test Z only
+
+cgemm_kernel_L2_M2_46:
+
+ KERNEL2x2_SUB
+
+ subs L, L, #1
+ bne cgemm_kernel_L2_M2_46
+
+cgemm_kernel_L2_M2_100:
+
+ SAVE2x2
+
+cgemm_kernel_L999:
+
+ sub r3, fp, #128
+ vldm r3, { s8 - s31} // restore floating point registers
+
+ sub sp, fp, #28
+ pop {r4 - r9, fp}
+ bx lr
+
diff --git a/blis/kernels/armv7a/3/bli_dgemm_kernel_4x4.S b/blis/kernels/armv7a/3/bli_dgemm_kernel_4x4.S
--- /dev/null
@@ -0,0 +1,503 @@
+
+// Exported symbol name of this kernel (real double precision, 4x4 tile).
+#define REALNAME bli_dgemm_kernel_4x4
+
+// Bytes of scratch stack reserved by the prologue below the pushed registers.
+#define STACKSIZE 256
+
+// Arguments that arrive in core registers.
+#define K r0
+#define PTR_ALPHA r1
+#define OLD_A r2
+#define OLD_B r3
+// Arguments that arrive on the stack. The prologue pushes seven registers and
+// sets fp = sp + 28, so [fp, #0] is the first stack-passed word.
+#define PTR_BETA [fp, #0 ]
+#define OLD_C [fp, #4 ]
+#define OLD_RSC [fp, #8 ]
+#define OLD_CSC [fp, #12 ]
+// AUX is defined for completeness; no instruction in this kernel reads it.
+#define AUX [fp, #16 ]
+
+/******************************************************
+* [fp, #-128] - [fp, #-64] is reserved
+* for store and restore of the floating point
+* registers d8 - d15 (8 registers, 64 bytes)
+*******************************************************/
+
+// Loop counter. Shares r2 with OLD_A, which is copied to AO before L is written.
+#define L r2
+
+// Running pointers into A and B.
+#define AO r5
+#define BO r6
+
+// Pointers to the four parts of the C tile, spaced OLD_CSC elements apart.
+// CO4 lives in r12, which the prologue does not need to save.
+#define CO1 r7
+#define CO2 r8
+#define CO3 r9
+#define CO4 r12
+
+
+// Prefetch distances in bytes used by the pld instructions.
+#define A_PRE 96
+#define B_PRE 96
+#define C_PRE 0
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+// INIT4x4: set the sixteen accumulators d16 - d31 to +0.0.
+// The zero is moved in from a core register instead of being computed as
+// d16 - d16: x - x is NaN whenever the stale content of d16 is NaN or
+// infinity, which would propagate into every accumulator and into C.
+// Clobbers r3. r3 is dead at the single call site (after the C pointers have
+// been set up) and is reloaded at the start of SAVE4x4.
+.macro INIT4x4
+
+ mov r3, #0 // two zero words form the bit pattern of +0.0
+ vmov d16, r3, r3
+ vmov.f64 d17, d16
+ vmov.f64 d18, d16
+ vmov.f64 d19, d16
+ vmov.f64 d20, d16
+ vmov.f64 d21, d16
+ vmov.f64 d22, d16
+ vmov.f64 d23, d16
+ vmov.f64 d24, d16
+ vmov.f64 d25, d16
+ vmov.f64 d26, d16
+ vmov.f64 d27, d16
+ vmov.f64 d28, d16
+ vmov.f64 d29, d16
+ vmov.f64 d30, d16
+ vmov.f64 d31, d16
+
+.endm
+
+// KERNEL4x4_I: first step of the software-pipelined loop.
+// Loads one k-step of A into d0 - d3 and of B into d8 - d11, and initialises
+// the accumulators d16 - d31 with fmuld (so no prior zeroing is needed).
+// AO and BO are advanced once by 32 bytes; the next k-step is then preloaded
+// into d4 - d7 / d12 - d15 from the new AO / BO without advancing again.
+// Accumulator layout: d16-d19 = A[0..3]*B[0], d20-d23 = A[0..3]*B[1],
+// d24-d27 = A[0..3]*B[2], d28-d31 = A[0..3]*B[3].
+.macro KERNEL4x4_I
+ pld [ BO , #B_PRE ]
+ fldd d8 , [ BO ]
+ fldd d0 , [ AO ]
+ pld [ AO , #A_PRE ]
+
+ fldd d1 , [ AO, #8 ]
+ fmuld d16 , d0, d8
+ fldd d2 , [ AO, #16 ]
+ fmuld d17 , d1, d8
+ fldd d3 , [ AO, #24 ]
+ fmuld d18 , d2, d8
+ fldd d9 , [ BO, #8 ]
+ fmuld d19 , d3, d8
+
+ fldd d10, [ BO, #16 ]
+ fmuld d20 , d0, d9
+ fldd d11, [ BO, #24 ]
+ fmuld d21 , d1, d9
+ add BO , BO, #32
+ add AO , AO, #32
+ fmuld d22 , d2, d9
+
+ pld [ BO , #B_PRE ]
+ fldd d12, [ BO ]
+ fmuld d23 , d3, d9
+
+ pld [ AO , #A_PRE ]
+ fldd d4 , [ AO, #0 ]
+ fmuld d24 , d0, d10
+ fldd d5 , [ AO, #8 ]
+ fmuld d25 , d1, d10
+ fldd d6 , [ AO, #16 ]
+ fmuld d26 , d2, d10
+ fldd d7 , [ AO, #24 ]
+ fmuld d27 , d3, d10
+
+ fldd d13, [ BO, #8 ]
+ fmuld d28 , d0, d11
+ fldd d14, [ BO, #16 ]
+ fmuld d29 , d1, d11
+ fldd d15, [ BO, #24 ]
+ fmuld d30 , d2, d11
+ fmuld d31 , d3, d11
+
+.endm
+
+// KERNEL4x4_M2: pipelined middle step, "odd" phase.
+// Accumulates the k-step held in d4 - d7 (A) and d12 - d15 (B) into d16 - d31.
+// AO / BO still point at that k-step, so the following one is loaded from
+// offset +32 into d0 - d3 / d8 - d11, and both pointers then advance by 64
+// bytes (two k-steps).
+.macro KERNEL4x4_M2
+
+ fmacd d16 , d4, d12
+ pld [ AO , #A_PRE+32 ]
+ fmacd d17 , d5, d12
+ fldd d0 , [ AO , #32 ]
+ fmacd d18 , d6, d12
+ pld [ BO , #B_PRE+32 ]
+ fmacd d19 , d7, d12
+
+ fldd d8 , [ BO , #32 ]
+ fmacd d20 , d4, d13
+ fldd d1 , [ AO, #40 ]
+ fmacd d21 , d5, d13
+ fldd d2 , [ AO, #48 ]
+ fmacd d22 , d6, d13
+ fldd d3 , [ AO, #56 ]
+ fmacd d23 , d7, d13
+
+ fmacd d24 , d4, d14
+ fmacd d25 , d5, d14
+ fldd d9 , [ BO, #40 ]
+ fmacd d26 , d6, d14
+ fldd d10, [ BO, #48 ]
+ fmacd d27 , d7, d14
+
+ fldd d11, [ BO, #56 ]
+ fmacd d28 , d4, d15
+ fmacd d29 , d5, d15
+ add AO , AO, #64
+ fmacd d30 , d6, d15
+ add BO , BO, #64
+ fmacd d31 , d7, d15
+
+.endm
+
+// KERNEL4x4_M1: pipelined middle step, "even" phase.
+// Accumulates the k-step held in d0 - d3 (A) and d8 - d11 (B) into d16 - d31
+// while loading the following k-step from [AO] / [BO] into d4 - d7 /
+// d12 - d15. AO and BO are not advanced here; KERNEL4x4_M2 or KERNEL4x4_E
+// performs the pointer update.
+.macro KERNEL4x4_M1
+
+ fmacd d16 , d0, d8
+ pld [ AO , #A_PRE ]
+ fmacd d17 , d1, d8
+ fldd d4 , [ AO ]
+ fmacd d18 , d2, d8
+ pld [ BO , #B_PRE ]
+ fmacd d19 , d3, d8
+
+ fldd d12, [ BO ]
+ fmacd d20 , d0, d9
+ fldd d5 , [ AO, #8 ]
+ fmacd d21 , d1, d9
+ fldd d6 , [ AO, #16 ]
+ fmacd d22 , d2, d9
+ fldd d7 , [ AO, #24 ]
+ fmacd d23 , d3, d9
+
+ fmacd d24 , d0, d10
+ fmacd d25 , d1, d10
+ fldd d13, [ BO, #8 ]
+ fmacd d26 , d2, d10
+ fldd d14, [ BO, #16 ]
+ fmacd d27 , d3, d10
+
+ fldd d15, [ BO, #24 ]
+ fmacd d28 , d0, d11
+ fmacd d29 , d1, d11
+ fmacd d30 , d2, d11
+ fmacd d31 , d3, d11
+
+.endm
+
+// KERNEL4x4_E: last step of the pipelined loop.
+// Accumulates the k-step already held in d4 - d7 (A) and d12 - d15 (B) into
+// d16 - d31 and advances AO and BO by 32 bytes past that k-step. No loads.
+.macro KERNEL4x4_E
+
+ fmacd d16 , d4, d12
+ fmacd d17 , d5, d12
+ add BO , BO, #32
+ fmacd d18 , d6, d12
+ add AO , AO, #32
+ fmacd d19 , d7, d12
+
+ fmacd d20 , d4, d13
+ fmacd d21 , d5, d13
+ fmacd d22 , d6, d13
+ fmacd d23 , d7, d13
+
+ fmacd d24 , d4, d14
+ fmacd d25 , d5, d14
+ fmacd d26 , d6, d14
+ fmacd d27 , d7, d14
+
+ fmacd d28 , d4, d15
+ fmacd d29 , d5, d15
+ fmacd d30 , d6, d15
+ fmacd d31 , d7, d15
+
+.endm
+
+// KERNEL4x4_SUB: one self-contained k-step, used for the K % 8 remainder.
+// Loads A into d0 - d3 and B into d8 - d11, accumulates into d16 - d31 and
+// advances AO and BO by 32 bytes each. It uses fmacd, so the accumulators
+// must already hold valid values.
+.macro KERNEL4x4_SUB
+
+ fldd d8 , [ BO ]
+ pld [ BO , #B_PRE ]
+
+ fldd d0 , [ AO ]
+ pld [ AO , #A_PRE ]
+ fldd d1 , [ AO, #8 ]
+
+ fmacd d16 , d0, d8
+ fldd d2 , [ AO, #16 ]
+ fmacd d17 , d1, d8
+ fldd d3 , [ AO, #24 ]
+ fmacd d18 , d2, d8
+ fldd d9 , [ BO, #8 ]
+ fmacd d19 , d3, d8
+
+ fldd d10, [ BO, #16 ]
+ fmacd d20 , d0, d9
+ fldd d11, [ BO, #24 ]
+ fmacd d21 , d1, d9
+ fmacd d22 , d2, d9
+ fmacd d23 , d3, d9
+
+ fmacd d24 , d0, d10
+ fmacd d25 , d1, d10
+ fmacd d26 , d2, d10
+ fmacd d27 , d3, d10
+
+ fmacd d28 , d0, d11
+ fmacd d29 , d1, d11
+ add AO , AO, #32
+ fmacd d30 , d2, d11
+ add BO , BO, #32
+ fmacd d31 , d3, d11
+
+.endm
+
+// SAVE4x4: write the 4x4 tile back, computing beta * C + alpha * accumulator
+// for each of the sixteen entries.
+//   d0 = alpha, d1 = beta
+//   r3 = OLD_RSC * 8, the byte distance between the four entries reached
+//        through the same CO pointer
+//   d16-d19 go to CO1, d20-d23 to CO2, d24-d27 to CO3, d28-d31 to CO4
+// Loads of the next pointer group are interleaved with the arithmetic and the
+// stores of the previous one; r2 and r4 hold saved pointer values (r4 is
+// first used for PTR_BETA). Clobbers r2, r3, r4, d0, d1 and d8 - d15.
+// On exit each CO pointer addresses its fourth entry (start + 3 * r3).
+.macro SAVE4x4
+
+ ldr r3, OLD_RSC // Row stride size
+ lsl r3, r3, #3 // multiply with size of double
+
+ fldd d0, [ PTR_ALPHA ] // load alpha
+ ldr r4, PTR_BETA
+ fldd d1, [ r4 ] // load beta
+
+//-----------------------------------------------------------
+ mov r2, CO1 // save pointer
+ mov r4, CO2 // save pointer
+ fldd d8, [ CO1 ] // load value from C
+ fldd d12, [ CO2 ] // load value from C
+ fmuld d8, d8, d1 // multiply with beta
+ add CO1, CO1, r3 // compute next pointer
+ fmacd d8, d0, d16 // multiply sum with alpha and add to value of C
+ add CO2, CO2, r3 // compute next pointer
+
+ fldd d9, [ CO1 ] // load value from C
+ fldd d13, [ CO2 ] // load value from C
+ fmuld d9, d9, d1 // multiply with beta
+ add CO1, CO1, r3 // compute next pointer
+ fmacd d9, d0, d17 // multiply sum with alpha and add to value of C
+ add CO2, CO2, r3 // compute next pointer
+
+ fldd d10, [ CO1 ] // load value from C
+ fldd d14, [ CO2 ] // load value from C
+ fmuld d10, d10, d1 // multiply with beta
+ add CO1, CO1, r3 // compute next pointer
+ fmacd d10, d0, d18 // multiply sum with alpha and add to value of C
+ add CO2, CO2, r3 // compute next pointer
+
+ fldd d11, [ CO1 ] // load value from C
+ fldd d15, [ CO2 ] // load value from C
+ fmuld d11, d11, d1 // multiply with beta
+ mov CO1, r2 // restore pointer
+ fmacd d11, d0, d19 // multiply sum with alpha and add to value of C
+ mov CO2, r4 // restore pointer
+
+ fstd d8, [ CO1 ] // store value in C
+ add CO1 , CO1, r3 // compute next pointer
+ fstd d9, [ CO1 ] // store value in C
+ add CO1 , CO1, r3 // compute next pointer
+ fstd d10, [ CO1 ] // store value in C
+ add CO1 , CO1, r3 // compute next pointer
+ fstd d11, [ CO1 ] // store value in C
+
+//-----------------------------------------------------------
+ mov r2, CO3 // save pointer
+ fldd d8, [ CO3 ] // load value from C
+ fmuld d12, d12, d1 // multiply with beta
+ add CO3, CO3, r3 // compute next pointer
+ fmacd d12, d0, d20 // multiply sum with alpha and add to value of C
+
+ fldd d9, [ CO3 ] // load value from C
+ fmuld d13, d13, d1 // multiply with beta
+ add CO3, CO3, r3 // compute next pointer
+ fmacd d13, d0, d21 // multiply sum with alpha and add to value of C
+
+ fldd d10, [ CO3 ] // load value from C
+ fmuld d14, d14, d1 // multiply with beta
+ add CO3, CO3, r3 // compute next pointer
+ fmacd d14, d0, d22 // multiply sum with alpha and add to value of C
+
+ fldd d11, [ CO3 ] // load value from C
+ fmuld d15, d15, d1 // multiply with beta
+ mov CO3, r2 // restore pointer
+ fmacd d15, d0, d23 // multiply sum with alpha and add to value of C
+
+ fstd d12, [ CO2 ] // store value in C
+ add CO2 , CO2, r3 // compute next pointer
+ fstd d13, [ CO2 ] // store value in C
+ add CO2 , CO2, r3 // compute next pointer
+ fstd d14, [ CO2 ] // store value in C
+ add CO2 , CO2, r3 // compute next pointer
+ fstd d15, [ CO2 ] // store value in C
+
+//-----------------------------------------------------------
+ mov r4, CO4 // save pointer
+ fldd d12, [ CO4 ] // load value from C
+ fmuld d8, d8, d1 // multiply with beta
+ add CO4, CO4, r3 // compute next pointer
+ fmacd d8, d0, d24 // multiply sum with alpha and add to value of C
+
+ fldd d13, [ CO4 ] // load value from C
+ fmuld d9, d9, d1 // multiply with beta
+ add CO4, CO4, r3 // compute next pointer
+ fmacd d9, d0, d25 // multiply sum with alpha and add to value of C
+
+ fldd d14, [ CO4 ] // load value from C
+ fmuld d10, d10, d1 // multiply with beta
+ add CO4, CO4, r3 // compute next pointer
+ fmacd d10, d0, d26 // multiply sum with alpha and add to value of C
+
+ fldd d15, [ CO4 ] // load value from C
+ fmuld d11, d11, d1 // multiply with beta
+ mov CO4, r4 // restore pointer
+ fmacd d11, d0, d27 // multiply sum with alpha and add to value of C
+
+
+//-----------------------------------------------------------
+ fstd d8, [ CO3 ] // store value in C
+ fmuld d12, d12, d1 // multiply with beta
+ add CO3 , CO3, r3 // compute next pointer
+ fmacd d12, d0, d28 // multiply sum with alpha and add to value of C
+
+ fstd d9, [ CO3 ] // store value in C
+ fmuld d13, d13, d1 // multiply with beta
+ add CO3 , CO3, r3 // compute next pointer
+ fmacd d13, d0, d29 // multiply sum with alpha and add to value of C
+
+ fstd d10, [ CO3 ] // store value in C
+ fmuld d14, d14, d1 // multiply with beta
+ add CO3 , CO3, r3 // compute next pointer
+ fmacd d14, d0, d30 // multiply sum with alpha and add to value of C
+
+ fstd d11, [ CO3 ] // store value in C
+ fmuld d15, d15, d1 // multiply with beta
+ fstd d12, [ CO4 ] // store value in C
+ fmacd d15, d0, d31 // multiply sum with alpha and add to value of C
+
+ add CO4 , CO4, r3 // compute next pointer
+ fstd d13, [ CO4 ] // store value in C
+ add CO4 , CO4, r3 // compute next pointer
+ fstd d14, [ CO4 ] // store value in C
+ add CO4 , CO4, r3 // compute next pointer
+ fstd d15, [ CO4 ] // store value in C
+
+.endm
+
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+ .arm
+ .global REALNAME
+ .func REALNAME
+
+// bli_dgemm_kernel_4x4
+//   r0 = K (number of k-steps), r1 = pointer to alpha,
+//   r2 = pointer to A, r3 = pointer to B,
+//   stack: pointer to beta, pointer to C, row stride of C, column stride of C
+//   (strides are in elements and are scaled by 8 bytes here and in SAVE4x4).
+// The K loop is unrolled by 8 and software pipelined (KERNEL4x4_I/M1/M2/E);
+// the K % 8 remainder runs through KERNEL4x4_SUB, and SAVE4x4 writes the tile.
+// d8 - d15 are saved to and restored from the area starting at fp - 128.
+REALNAME:
+
+ push {r4 - r9, fp} // save register
+ add fp, sp, #28 // add number of saved register multiplied by size of int
+ sub sp, sp, #STACKSIZE // reserve stack
+
+ mov AO, OLD_A // pointer matrix A
+ mov BO, OLD_B // pointer matrix B
+
+ sub r3, fp, #128
+ vstm r3, { d8 - d15} // store floating point registers
+
+ ldr r2, OLD_C // pointer matrix C
+ ldr r3, OLD_CSC // Col stride size of C
+ lsl r3, r3, #3 // multiply with size of double
+
+ mov CO1, r2 // C
+ add CO2, CO1, r3 // C + 1 column stride
+ add CO3, CO2, r3 // C + 2 column strides
+ add CO4, CO3, r3 // C + 3 column strides
+
+ pld [ CO1, #C_PRE ] // prefetch C at CO1
+ pld [ CO2, #C_PRE ] // prefetch C at CO2
+ pld [ CO3, #C_PRE ] // prefetch C at CO3
+ pld [ CO4, #C_PRE ] // prefetch C at CO4 (was a second prefetch of CO3)
+
+dgemm_kernel_L4_M4_20:
+
+ asrs L , K, #3 // L = K / 8
+ cmp L , #2
+ blt dgemm_kernel_L4_M4_32 // fewer than two unrolled blocks
+
+ KERNEL4x4_I
+ KERNEL4x4_M2
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+
+ subs L, L, #2 // first and last block are handled outside the loop
+ ble dgemm_kernel_L4_M4_22a
+ .align 5
+
+dgemm_kernel_L4_M4_22:
+
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+
+ subs L, L, #1
+ bgt dgemm_kernel_L4_M4_22
+
+dgemm_kernel_L4_M4_22a:
+
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+ KERNEL4x4_M1
+ KERNEL4x4_E
+
+ b dgemm_kernel_L4_M4_44
+
+dgemm_kernel_L4_M4_32:
+
+ tst L, #1
+ // tst leaves V untouched, so test the Z flag directly instead of using a
+ // signed "le" condition that depends on a stale V.
+ beq dgemm_kernel_L4_M4_40 // no complete block of 8: just clear accumulators
+
+ KERNEL4x4_I
+ KERNEL4x4_M2
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+ KERNEL4x4_M1
+ KERNEL4x4_E
+
+ b dgemm_kernel_L4_M4_44
+
+dgemm_kernel_L4_M4_40:
+
+ INIT4x4
+
+dgemm_kernel_L4_M4_44:
+
+ ands L , K, #7 // L = K % 8
+ beq dgemm_kernel_L4_M4_100 // ands does not update V either: test Z only
+
+dgemm_kernel_L4_M4_46:
+
+ KERNEL4x4_SUB
+
+ subs L, L, #1
+ bne dgemm_kernel_L4_M4_46
+
+dgemm_kernel_L4_M4_100:
+
+ SAVE4x4
+
+dgemm_kernel_L999:
+
+ sub r3, fp, #128
+ vldm r3, { d8 - d15} // restore floating point registers
+
+ sub sp, fp, #28
+ pop {r4 - r9, fp}
+ bx lr
+
diff --git a/blis/kernels/armv7a/3/bli_sgemm_kernel_4x4.S b/blis/kernels/armv7a/3/bli_sgemm_kernel_4x4.S
--- /dev/null
@@ -0,0 +1,483 @@
+
+// Exported symbol name of this kernel (real single precision, 4x4 tile).
+#define REALNAME bli_sgemm_kernel_4x4
+
+// Bytes of scratch stack reserved by the prologue below the pushed registers.
+#define STACKSIZE 256
+
+// Arguments that arrive in core registers.
+#define K r0
+#define PTR_ALPHA r1
+#define OLD_A r2
+#define OLD_B r3
+// Arguments that arrive on the stack. The prologue pushes seven registers and
+// sets fp = sp + 28, so [fp, #0] is the first stack-passed word.
+#define PTR_BETA [fp, #0 ]
+#define OLD_C [fp, #4 ]
+#define OLD_RSC [fp, #8 ]
+#define OLD_CSC [fp, #12 ]
+// AUX is defined for completeness; no instruction in this kernel reads it.
+#define AUX [fp, #16 ]
+
+/******************************************************
+* [fp, #-128] - [fp, #-32] is reserved
+* for store and restore of the floating point
+* registers s8 - s31 (24 registers, 96 bytes)
+*******************************************************/
+
+// Loop counter. Shares r2 with OLD_A, which is copied to AO before L is written.
+#define L r2
+
+// Running pointers into A and B.
+#define AO r5
+#define BO r6
+
+// Pointers to the four parts of the C tile, spaced OLD_CSC elements apart.
+// CO4 lives in r12, which the prologue does not need to save.
+#define CO1 r7
+#define CO2 r8
+#define CO3 r9
+#define CO4 r12
+
+
+// Prefetch distances in bytes used by the pld instructions.
+#define A_PRE 96
+#define B_PRE 96
+#define C_PRE 0
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+// INIT4x4: set the sixteen accumulators s16 - s31 to +0.0.
+// The zero is moved in from a core register instead of being computed as
+// s16 - s16: x - x is NaN whenever the stale content of s16 is NaN or
+// infinity, which would propagate into every accumulator and into C.
+// Clobbers r3. r3 is dead at the single call site (after the C pointers have
+// been set up) and is reloaded at the start of SAVE4x4.
+.macro INIT4x4
+
+ mov r3, #0 // integer zero has the same bit pattern as +0.0f
+ vmov s16, r3
+ vmov.f32 s17, s16
+ vmov.f32 s18, s16
+ vmov.f32 s19, s16
+ vmov.f32 s20, s16
+ vmov.f32 s21, s16
+ vmov.f32 s22, s16
+ vmov.f32 s23, s16
+ vmov.f32 s24, s16
+ vmov.f32 s25, s16
+ vmov.f32 s26, s16
+ vmov.f32 s27, s16
+ vmov.f32 s28, s16
+ vmov.f32 s29, s16
+ vmov.f32 s30, s16
+ vmov.f32 s31, s16
+
+.endm
+
+// KERNEL4x4_I: first step of the software-pipelined loop.
+// Loads one k-step of A into s0 - s3 and of B into s8 - s11, and initialises
+// the accumulators s16 - s31 with fmuls (so no prior zeroing is needed).
+// The next k-step is preloaded into s4 - s7 / s12 - s15. All loads use
+// fldmias with writeback, so AO and BO end up 32 bytes further on.
+// Accumulator layout: s16-s19 = A[0..3]*B[0], s20-s23 = A[0..3]*B[1],
+// s24-s27 = A[0..3]*B[2], s28-s31 = A[0..3]*B[3].
+.macro KERNEL4x4_I
+
+ pld [ AO , #A_PRE ]
+ fldmias AO!, { s0 - s1 }
+ pld [ BO , #B_PRE ]
+ fldmias BO!, { s8 - s9 }
+
+ fmuls s16 , s0, s8
+ fldmias AO!, { s2 - s3 }
+ fmuls s17 , s1, s8
+ fmuls s18 , s2, s8
+ fldmias BO!, { s10 - s11 }
+ fmuls s19 , s3, s8
+
+ fmuls s20 , s0, s9
+ fldmias AO!, { s4 - s5 }
+ fmuls s21 , s1, s9
+ fmuls s22 , s2, s9
+ fldmias AO!, { s6 - s7 }
+ fmuls s23 , s3, s9
+
+ fmuls s24 , s0, s10
+ fldmias BO!, { s12 - s13 }
+ fmuls s25 , s1, s10
+ fmuls s26 , s2, s10
+ fldmias BO!, { s14 - s15 }
+ fmuls s27 , s3, s10
+
+ fmuls s28 , s0, s11
+ fmuls s29 , s1, s11
+ fmuls s30 , s2, s11
+ fmuls s31 , s3, s11
+
+.endm
+
+
+// KERNEL4x4_M2: pipelined middle step, "odd" phase.
+// Accumulates the k-step held in s4 - s7 (A) and s12 - s15 (B) into s16 - s31
+// while loading the following k-step into s0 - s3 / s8 - s11 with one
+// four-register fldmias each (AO and BO advance by 16 bytes through
+// writeback). The two commented-out fldmias lines are disabled two-register
+// loads whose registers are already covered by the four-register loads.
+.macro KERNEL4x4_M2
+
+ pld [ AO , #A_PRE ]
+ fmacs s16 , s4, s12
+ fmacs s17 , s5, s12
+ fldmias AO!, { s0 - s3 }
+ fmacs s18 , s6, s12
+ pld [ BO , #B_PRE ]
+ fmacs s19 , s7, s12
+
+ fmacs s20 , s4, s13
+ fldmias BO!, { s8 - s11 }
+ fmacs s21 , s5, s13
+ fmacs s22 , s6, s13
+ //fldmias AO!, { s2 - s3 }
+ fmacs s23 , s7, s13
+
+ fmacs s24 , s4, s14
+ //fldmias BO!, { s10 - s11 }
+ fmacs s25 , s5, s14
+ fmacs s26 , s6, s14
+ fmacs s27 , s7, s14
+
+ fmacs s28 , s4, s15
+ fmacs s29 , s5, s15
+ fmacs s30 , s6, s15
+ fmacs s31 , s7, s15
+
+.endm
+
+
+// KERNEL4x4_M1: pipelined middle step, "even" phase.
+// Accumulates the k-step held in s0 - s3 (A) and s8 - s11 (B) into s16 - s31
+// while loading the following k-step into s4 - s7 / s12 - s15 with one
+// four-register fldmias each (AO and BO advance by 16 bytes through
+// writeback). The two commented-out fldmias lines are disabled two-register
+// loads whose registers are already covered by the four-register loads.
+.macro KERNEL4x4_M1
+
+ fmacs s16 , s0, s8
+ fldmias AO!, { s4 - s7 }
+ fmacs s17 , s1, s8
+ fmacs s18 , s2, s8
+ fldmias BO!, { s12 - s15 }
+ //fldmias AO!, { s6 - s7 }
+ fmacs s19 , s3, s8
+
+ fmacs s20 , s0, s9
+ fmacs s21 , s1, s9
+ fmacs s22 , s2, s9
+ //fldmias BO!, { s14 - s15 }
+ fmacs s23 , s3, s9
+
+ fmacs s24 , s0, s10
+ fmacs s25 , s1, s10
+ fmacs s26 , s2, s10
+ fmacs s27 , s3, s10
+
+ fmacs s28 , s0, s11
+ fmacs s29 , s1, s11
+ fmacs s30 , s2, s11
+ fmacs s31 , s3, s11
+
+.endm
+
+
+
+// KERNEL4x4_E: last step of the pipelined loop.
+// Accumulates the k-step already held in s4 - s7 (A) and s12 - s15 (B) into
+// s16 - s31. It performs no loads and leaves AO and BO unchanged.
+.macro KERNEL4x4_E
+
+ fmacs s16 , s4, s12
+ fmacs s17 , s5, s12
+ fmacs s18 , s6, s12
+ fmacs s19 , s7, s12
+
+ fmacs s20 , s4, s13
+ fmacs s21 , s5, s13
+ fmacs s22 , s6, s13
+ fmacs s23 , s7, s13
+
+ fmacs s24 , s4, s14
+ fmacs s25 , s5, s14
+ fmacs s26 , s6, s14
+ fmacs s27 , s7, s14
+
+ fmacs s28 , s4, s15
+ fmacs s29 , s5, s15
+ fmacs s30 , s6, s15
+ fmacs s31 , s7, s15
+
+.endm
+
+
+
+
+// KERNEL4x4_SUB: one self-contained k-step, used for the K % 8 remainder.
+// Loads A into s0 - s3 and B into s8 - s11, accumulates into s16 - s31 and
+// advances AO and BO by 16 bytes each. It uses fmacs, so the accumulators
+// must already hold valid values.
+.macro KERNEL4x4_SUB
+
+ flds s8 , [ BO ]
+
+ flds s0 , [ AO ]
+ flds s1 , [ AO, #4 ]
+
+ fmacs s16 , s0, s8
+ flds s2 , [ AO, #8 ]
+ fmacs s17 , s1, s8
+ flds s3 , [ AO, #12 ]
+ fmacs s18 , s2, s8
+ flds s9 , [ BO, #4 ]
+ fmacs s19 , s3, s8
+
+ flds s10, [ BO, #8 ]
+ fmacs s20 , s0, s9
+ flds s11, [ BO, #12 ]
+ fmacs s21 , s1, s9
+ fmacs s22 , s2, s9
+ fmacs s23 , s3, s9
+
+ fmacs s24 , s0, s10
+ fmacs s25 , s1, s10
+ fmacs s26 , s2, s10
+ fmacs s27 , s3, s10
+
+ fmacs s28 , s0, s11
+ fmacs s29 , s1, s11
+ add AO , AO, #16
+ fmacs s30 , s2, s11
+ add BO , BO, #16
+ fmacs s31 , s3, s11
+
+.endm
+
+
+// SAVE4x4: write the 4x4 tile back, computing beta * C + alpha * accumulator
+// for each of the sixteen entries.
+//   s0 = alpha, s1 = beta
+//   r3 = OLD_RSC * 4, the byte distance between the four entries reached
+//        through the same CO pointer
+//   s16-s19 go to CO1, s20-s23 to CO2, s24-s27 to CO3, s28-s31 to CO4
+// Loads of the next pointer group are interleaved with the arithmetic and the
+// stores of the previous one; r2 and r4 hold saved pointer values (r4 is
+// first used for PTR_BETA). Clobbers r2, r3, r4, s0, s1 and s8 - s15.
+// On exit each CO pointer addresses its fourth entry (start + 3 * r3).
+.macro SAVE4x4
+
+ ldr r3, OLD_RSC // Row stride size
+ lsl r3, r3, #2 // multiply with size of float
+
+ flds s0, [ PTR_ALPHA ] // load alpha
+ ldr r4, PTR_BETA
+ flds s1, [ r4 ] // load beta
+
+//-----------------------------------------------------------
+ mov r2, CO1 // save pointer
+ mov r4, CO2 // save pointer
+ flds s8, [ CO1 ] // load value from C
+ flds s12, [ CO2 ] // load value from C
+ fmuls s8, s8, s1 // multiply with beta
+ add CO1, CO1, r3 // compute next pointer
+ fmacs s8, s0, s16 // multiply sum with alpha and add to value of C
+ add CO2, CO2, r3 // compute next pointer
+
+ flds s9, [ CO1 ] // load value from C
+ flds s13, [ CO2 ] // load value from C
+ fmuls s9, s9, s1 // multiply with beta
+ add CO1, CO1, r3 // compute next pointer
+ fmacs s9, s0, s17 // multiply sum with alpha and add to value of C
+ add CO2, CO2, r3 // compute next pointer
+
+ flds s10, [ CO1 ] // load value from C
+ flds s14, [ CO2 ] // load value from C
+ fmuls s10, s10, s1 // multiply with beta
+ add CO1, CO1, r3 // compute next pointer
+ fmacs s10, s0, s18 // multiply sum with alpha and add to value of C
+ add CO2, CO2, r3 // compute next pointer
+
+ flds s11, [ CO1 ] // load value from C
+ flds s15, [ CO2 ] // load value from C
+ fmuls s11, s11, s1 // multiply with beta
+ mov CO1, r2 // restore pointer
+ fmacs s11, s0, s19 // multiply sum with alpha and add to value of C
+ mov CO2, r4 // restore pointer
+
+ fsts s8, [ CO1 ] // store value in C
+ add CO1 , CO1, r3 // compute next pointer
+ fsts s9, [ CO1 ] // store value in C
+ add CO1 , CO1, r3 // compute next pointer
+ fsts s10, [ CO1 ] // store value in C
+ add CO1 , CO1, r3 // compute next pointer
+ fsts s11, [ CO1 ] // store value in C
+
+//-----------------------------------------------------------
+ mov r2, CO3 // save pointer
+ flds s8, [ CO3 ] // load value from C
+ fmuls s12, s12, s1 // multiply with beta
+ add CO3, CO3, r3 // compute next pointer
+ fmacs s12, s0, s20 // multiply sum with alpha and add to value of C
+
+ flds s9, [ CO3 ] // load value from C
+ fmuls s13, s13, s1 // multiply with beta
+ add CO3, CO3, r3 // compute next pointer
+ fmacs s13, s0, s21 // multiply sum with alpha and add to value of C
+
+ flds s10, [ CO3 ] // load value from C
+ fmuls s14, s14, s1 // multiply with beta
+ add CO3, CO3, r3 // compute next pointer
+ fmacs s14, s0, s22 // multiply sum with alpha and add to value of C
+
+ flds s11, [ CO3 ] // load value from C
+ fmuls s15, s15, s1 // multiply with beta
+ mov CO3, r2 // restore pointer
+ fmacs s15, s0, s23 // multiply sum with alpha and add to value of C
+
+ fsts s12, [ CO2 ] // store value in C
+ add CO2 , CO2, r3 // compute next pointer
+ fsts s13, [ CO2 ] // store value in C
+ add CO2 , CO2, r3 // compute next pointer
+ fsts s14, [ CO2 ] // store value in C
+ add CO2 , CO2, r3 // compute next pointer
+ fsts s15, [ CO2 ] // store value in C
+
+//-----------------------------------------------------------
+ mov r4, CO4 // save pointer
+ flds s12, [ CO4 ] // load value from C
+ fmuls s8, s8, s1 // multiply with beta
+ add CO4, CO4, r3 // compute next pointer
+ fmacs s8, s0, s24 // multiply sum with alpha and add to value of C
+
+ flds s13, [ CO4 ] // load value from C
+ fmuls s9, s9, s1 // multiply with beta
+ add CO4, CO4, r3 // compute next pointer
+ fmacs s9, s0, s25 // multiply sum with alpha and add to value of C
+
+ flds s14, [ CO4 ] // load value from C
+ fmuls s10, s10, s1 // multiply with beta
+ add CO4, CO4, r3 // compute next pointer
+ fmacs s10, s0, s26 // multiply sum with alpha and add to value of C
+
+ flds s15, [ CO4 ] // load value from C
+ fmuls s11, s11, s1 // multiply with beta
+ mov CO4, r4 // restore pointer
+ fmacs s11, s0, s27 // multiply sum with alpha and add to value of C
+
+
+//-----------------------------------------------------------
+ fsts s8, [ CO3 ] // store value in C
+ fmuls s12, s12, s1 // multiply with beta
+ add CO3 , CO3, r3 // compute next pointer
+ fmacs s12, s0, s28 // multiply sum with alpha and add to value of C
+
+ fsts s9, [ CO3 ] // store value in C
+ fmuls s13, s13, s1 // multiply with beta
+ add CO3 , CO3, r3 // compute next pointer
+ fmacs s13, s0, s29 // multiply sum with alpha and add to value of C
+
+ fsts s10, [ CO3 ] // store value in C
+ fmuls s14, s14, s1 // multiply with beta
+ add CO3 , CO3, r3 // compute next pointer
+ fmacs s14, s0, s30 // multiply sum with alpha and add to value of C
+
+ fsts s11, [ CO3 ] // store value in C
+ fmuls s15, s15, s1 // multiply with beta
+ fsts s12, [ CO4 ] // store value in C
+ fmacs s15, s0, s31 // multiply sum with alpha and add to value of C
+
+ add CO4 , CO4, r3 // compute next pointer
+ fsts s13, [ CO4 ] // store value in C
+ add CO4 , CO4, r3 // compute next pointer
+ fsts s14, [ CO4 ] // store value in C
+ add CO4 , CO4, r3 // compute next pointer
+ fsts s15, [ CO4 ] // store value in C
+
+.endm
+
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+ .arm
+ .global REALNAME
+ .func REALNAME
+
+// bli_sgemm_kernel_4x4
+//   r0 = K (number of k-steps), r1 = pointer to alpha,
+//   r2 = pointer to A, r3 = pointer to B,
+//   stack: pointer to beta, pointer to C, row stride of C, column stride of C
+//   (strides are in elements and are scaled by 4 bytes here and in SAVE4x4).
+// The K loop is unrolled by 8 and software pipelined (KERNEL4x4_I/M1/M2/E);
+// the K % 8 remainder runs through KERNEL4x4_SUB, and SAVE4x4 writes the tile.
+// s8 - s31 are saved to and restored from the area starting at fp - 128.
+REALNAME:
+
+ push {r4 - r9, fp} // save register
+ add fp, sp, #28 // add number of saved register multiplied by size of int
+ sub sp, sp, #STACKSIZE // reserve stack
+
+ mov AO, OLD_A // pointer matrix A
+ mov BO, OLD_B // pointer matrix B
+
+ sub r3, fp, #128
+ vstm r3, { s8 - s31 } // store floating point registers
+
+ ldr r2, OLD_C // pointer matrix C
+ ldr r3, OLD_CSC // Col stride size of C
+ lsl r3, r3, #2 // multiply with size of float
+
+ mov CO1, r2 // C
+ add CO2, CO1, r3 // C + 1 column stride
+ add CO3, CO2, r3 // C + 2 column strides
+ add CO4, CO3, r3 // C + 3 column strides
+
+ pld [ CO1, #C_PRE ] // prefetch C at CO1
+ pld [ CO2, #C_PRE ] // prefetch C at CO2
+ pld [ CO3, #C_PRE ] // prefetch C at CO3
+ pld [ CO4, #C_PRE ] // prefetch C at CO4 (was a second prefetch of CO3)
+
+sgemm_kernel_L4_M4_20:
+
+ asrs L , K, #3 // L = K / 8
+ cmp L , #2
+ blt sgemm_kernel_L4_M4_32 // fewer than two unrolled blocks
+
+ KERNEL4x4_I
+ KERNEL4x4_M2
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+
+ subs L, L, #2 // first and last block are handled outside the loop
+ ble sgemm_kernel_L4_M4_22a
+ .align 5
+
+sgemm_kernel_L4_M4_22:
+
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+
+ subs L, L, #1
+ bgt sgemm_kernel_L4_M4_22
+
+sgemm_kernel_L4_M4_22a:
+
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+ KERNEL4x4_M1
+ KERNEL4x4_E
+
+ b sgemm_kernel_L4_M4_44
+
+sgemm_kernel_L4_M4_32:
+
+ tst L, #1
+ // tst leaves V untouched, so test the Z flag directly instead of using a
+ // signed "le" condition that depends on a stale V.
+ beq sgemm_kernel_L4_M4_40 // no complete block of 8: just clear accumulators
+
+ KERNEL4x4_I
+ KERNEL4x4_M2
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+ KERNEL4x4_M1
+ KERNEL4x4_E
+
+ b sgemm_kernel_L4_M4_44
+
+sgemm_kernel_L4_M4_40:
+
+ INIT4x4
+
+sgemm_kernel_L4_M4_44:
+
+ ands L , K, #7 // L = K % 8
+ beq sgemm_kernel_L4_M4_100 // ands does not update V either: test Z only
+
+sgemm_kernel_L4_M4_46:
+
+ KERNEL4x4_SUB
+
+ subs L, L, #1
+ bne sgemm_kernel_L4_M4_46
+
+sgemm_kernel_L4_M4_100:
+
+ SAVE4x4
+
+sgemm_kernel_L999:
+
+ sub r3, fp, #128
+ vldm r3, { s8 - s31 } // restore floating point registers
+
+ sub sp, fp, #28
+ pop {r4 - r9, fp}
+ bx lr
+
diff --git a/blis/kernels/armv7a/3/bli_zgemm_kernel_2x2.S b/blis/kernels/armv7a/3/bli_zgemm_kernel_2x2.S
--- /dev/null
@@ -0,0 +1,506 @@
+
+#define REALNAME bli_zgemm_kernel_2x2
+
+#define STACKSIZE 256
+
+#define K r0 /* first register argument: used below as the k loop count */
+#define PTR_ALPHA r1 /* address of alpha; read as two doubles in SAVE2x2 */
+#define OLD_A r2 /* incoming pointer to A; copied to AO on entry */
+#define OLD_B r3 /* incoming pointer to B; copied to BO on entry */
+#define PTR_BETA [fp, #0 ] /* stack argument: address of beta */
+#define OLD_C [fp, #4 ] /* stack argument: pointer to C */
+#define OLD_RSC [fp, #8 ] /* stack argument: row stride of C, in elements */
+#define OLD_CSC [fp, #12 ] /* stack argument: column stride of C, in elements */
+#define AUX [fp, #16 ] /* stack argument; not referenced in this file */
+
+/******************************************************
+* [fp, #-128] - [fp, #-64] is reserved
+* for store and restore of floating point
+* register
+* (d8 - d15, i.e. 8 x 8 = 64 bytes, are stored there)
+*******************************************************/
+
+#define L r2 /* loop counter; shares r2 with OLD_A, which is copied to AO first */
+
+#define AO r5 /* running pointer into A */
+#define BO r6 /* running pointer into B */
+
+#define CO1 r7 /* C pointer at column offset 0 */
+#define CO2 r8 /* C pointer at column offset 1 (CO1 + column stride) */
+
+
+#define A_PRE 96 /* pld byte offset ahead of AO */
+#define B_PRE 96 /* pld byte offset ahead of BO */
+#define C_PRE 0 /* pld byte offset applied to CO1/CO2 */
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+#define FMAC_BR fnmacd /* beta step, real part: acc -= beta_im * c_im */
+#define FMAC_BI fmacd /* beta step, imag part: acc += beta_im * c_re */
+
+#define NN 1 /* defined unconditionally, so the first #if branch below is always the one selected */
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+
+ #define FADD_R fsubd /* SAVE2x2 uses it as d = (x_im*y_im) - (x_re*y_re) */
+ #define FADD_I faddd /* SAVE2x2 uses it as d = (x_im*y_re) + (x_re*y_im) */
+
+ #define FMAC_R1 fnmacd /* the subtraction undoes the negated real sum left by FADD_R */
+ #define FMAC_R2 fnmacd
+ #define FMAC_I1 fmacd
+ #define FMAC_I2 fnmacd
+
+#elif defined(CN) || defined(CT) /* not reachable while NN is defined above */
+
+ #define FADD_R faddd
+ #define FADD_I fsubd
+
+ #define FMAC_R1 fmacd
+ #define FMAC_R2 fmacd
+ #define FMAC_I1 fnmacd
+ #define FMAC_I2 fmacd
+
+#elif defined(NC) || defined(TC) /* not reachable while NN is defined above */
+
+ #define FADD_R faddd
+ #define FADD_I fsubd
+
+ #define FMAC_R1 fmacd
+ #define FMAC_R2 fnmacd
+ #define FMAC_I1 fmacd
+ #define FMAC_I2 fmacd
+
+#else /* not reachable while NN is defined above */
+
+ #define FADD_R fsubd
+ #define FADD_I faddd
+
+ #define FMAC_R1 fnmacd
+ #define FMAC_R2 fmacd
+ #define FMAC_I1 fnmacd
+ #define FMAC_I2 fnmacd
+
+#endif
+
+
+
+.macro INIT2x2 // clear the 16 accumulators d16-d31; clobbers r3, leaves the flags alone
+
+ mov r3, #0 // integer zero; r3 is scratch here (SAVE2x2 reloads it before use)
+ vmov d16, r3, r3 // d16 = +0.0 from the bit pattern; d16 - d16 would give NaN for a stale NaN/Inf
+ vmov.f64 d17, d16 // replicate the zero into the remaining accumulators
+ vmov.f64 d18, d16
+ vmov.f64 d19, d16
+ vmov.f64 d20, d16
+ vmov.f64 d21, d16
+ vmov.f64 d22, d16
+ vmov.f64 d23, d16
+ vmov.f64 d24, d16
+ vmov.f64 d25, d16
+ vmov.f64 d26, d16
+ vmov.f64 d27, d16
+ vmov.f64 d28, d16
+ vmov.f64 d29, d16
+ vmov.f64 d30, d16
+ vmov.f64 d31, d16
+
+.endm
+
+.macro KERNEL2x2_I // first k-step: fmuld overwrites d16-d31 (no INIT needed), then preloads the next step
+ pld [ AO , #A_PRE ]
+ pld [ BO , #B_PRE ]
+ fldd d0 , [ AO ] // d0/d1: first A element (two doubles)
+ fldd d1 , [ AO, #8 ]
+ fldd d8 , [ BO ] // d8/d9: first B element (two doubles)
+ fldd d9 , [ BO, #8 ]
+
+ fmuld d16 , d0, d8 // d16..d19 / d24..d27: partial products of A element 0,1 with B element 0
+ fldd d2 , [ AO, #16 ] // d2/d3: second A element
+ fmuld d24 , d1, d9
+ fldd d3 , [ AO, #24 ]
+ fmuld d17 , d0, d9
+ fldd d10, [ BO, #16 ] // d10/d11: second B element
+ fmuld d25 , d1, d8
+
+ fldd d11, [ BO, #24 ]
+ fmuld d18 , d2, d8
+ add BO , BO, #32 // step BO past the two B elements just loaded
+ fmuld d26 , d3, d9
+ add AO , AO, #32 // step AO past the two A elements just loaded
+ fmuld d19 , d2, d9
+ pld [ BO , #B_PRE ]
+ fmuld d27 , d3, d8
+
+ pld [ AO , #A_PRE ]
+ fmuld d20 , d0, d10 // d20..d23 / d28..d31: partial products with B element 1
+ fldd d4 , [ AO, #0 ] // d4-d7 / d12-d15: operands for the next k-step (consumed by KERNEL2x2_M2 / _E)
+ fmuld d28 , d1, d11
+ fldd d5 , [ AO, #8 ]
+ fmuld d21 , d0, d11
+ fldd d12, [ BO ]
+ fmuld d29 , d1, d10
+
+ fldd d13, [ BO, #8 ]
+ fmuld d22 , d2, d10
+ fldd d6 , [ AO, #16 ]
+ fmuld d30 , d3, d11
+ fldd d7 , [ AO, #24 ]
+ fmuld d23 , d2, d11
+ fldd d14, [ BO, #16 ]
+ fmuld d31 , d3, d10
+ fldd d15, [ BO, #24 ]
+
+ add BO , BO, #32 // pointers end up 64 bytes past their entry value
+ add AO , AO, #32
+.endm
+
+
+
+.macro KERNEL2x2_M1 // accumulate from d0-d3 / d8-d11 while loading d4-d7 / d12-d15 for the following step
+ pld [ AO , #A_PRE ]
+
+ fmacd d16 , d0, d8
+ pld [ BO , #B_PRE ]
+ fmacd d24 , d1, d9
+ fldd d4 , [ AO, #0 ] // loads are interleaved with the multiply-accumulates
+ fmacd d17 , d0, d9
+ fldd d5 , [ AO, #8 ]
+ fmacd d25 , d1, d8
+
+ fldd d12, [ BO ]
+ fmacd d18 , d2, d8
+ fldd d13, [ BO, #8 ]
+ fmacd d26 , d3, d9
+ fldd d6 , [ AO, #16 ]
+ fmacd d19 , d2, d9
+ fldd d7 , [ AO, #24 ]
+ fmacd d27 , d3, d8
+
+ fmacd d20 , d0, d10
+ fldd d14, [ BO, #16 ]
+ fmacd d28 , d1, d11
+ fmacd d21 , d0, d11
+ fldd d15, [ BO, #24 ]
+ fmacd d29 , d1, d10
+
+ fmacd d22 , d2, d10
+ add BO , BO, #32 // advance both pointers by the 32 bytes just loaded
+ fmacd d30 , d3, d11
+ fmacd d23 , d2, d11
+ add AO , AO, #32
+ fmacd d31 , d3, d10
+
+.endm
+
+.macro KERNEL2x2_M2 // mirror of KERNEL2x2_M1: accumulate from d4-d7 / d12-d15 while loading d0-d3 / d8-d11
+ pld [ AO , #A_PRE ]
+
+ fmacd d16 , d4, d12
+ pld [ BO , #B_PRE ]
+ fmacd d24 , d5, d13
+ fldd d0 , [ AO, #0 ] // loads are interleaved with the multiply-accumulates
+ fmacd d17 , d4, d13
+ fldd d1 , [ AO, #8 ]
+ fmacd d25 , d5, d12
+
+ fmacd d18 , d6, d12
+ fldd d8 , [ BO ]
+ fmacd d26 , d7, d13
+ fldd d9 , [ BO, #8 ]
+ fmacd d19 , d6, d13
+ fmacd d27 , d7, d12
+
+ fldd d2 , [ AO, #16 ]
+ fmacd d20 , d4, d14
+ fldd d3 , [ AO, #24 ]
+ fmacd d28 , d5, d15
+ fmacd d21 , d4, d15
+ fldd d10, [ BO, #16 ]
+ fmacd d29 , d5, d14
+
+ fldd d11, [ BO, #24 ]
+ fmacd d22 , d6, d14
+ fmacd d30 , d7, d15
+ add BO , BO, #32 // advance both pointers by the 32 bytes just loaded
+ fmacd d23 , d6, d15
+ add AO , AO, #32
+ fmacd d31 , d7, d14
+
+.endm
+
+
+.macro KERNEL2x2_E // final step of an unrolled run: consumes d4-d7 / d12-d15, loads nothing, leaves AO/BO unchanged
+
+ fmacd d16 , d4, d12
+ fmacd d24 , d5, d13
+ fmacd d17 , d4, d13
+ fmacd d25 , d5, d12
+
+ fmacd d18 , d6, d12
+ fmacd d26 , d7, d13
+ fmacd d19 , d6, d13
+ fmacd d27 , d7, d12
+
+ fmacd d20 , d4, d14
+ fmacd d28 , d5, d15
+ fmacd d21 , d4, d15
+ fmacd d29 , d5, d14
+
+ fmacd d22 , d6, d14
+ fmacd d30 , d7, d15
+ fmacd d23 , d6, d15
+ fmacd d31 , d7, d14
+
+.endm
+
+.macro KERNEL2x2_SUB // one self-contained k-step: load d0-d3 / d8-d11, accumulate into d16-d31, advance AO/BO by 32
+
+ pld [ AO , #A_PRE ]
+ pld [ BO , #B_PRE ]
+ fldd d0 , [ AO ]
+ fldd d1 , [ AO, #8 ]
+ fldd d8 , [ BO ]
+ fldd d9 , [ BO, #8 ]
+
+ fmacd d16 , d0, d8 // fmacd adds into the accumulators, so they must already be initialised
+ fldd d2 , [ AO, #16 ]
+ fmacd d24 , d1, d9
+ fldd d3 , [ AO, #24 ]
+ fmacd d17 , d0, d9
+ fldd d10, [ BO, #16 ]
+ fmacd d25 , d1, d8
+
+ fldd d11, [ BO, #24 ]
+ fmacd d18 , d2, d8
+ fmacd d26 , d3, d9
+ fmacd d19 , d2, d9
+ fmacd d27 , d3, d8
+
+ fmacd d20 , d0, d10
+ fmacd d28 , d1, d11
+ fmacd d21 , d0, d11
+ fmacd d29 , d1, d10
+
+ fmacd d22 , d2, d10
+ add BO , BO, #32
+ fmacd d30 , d3, d11
+ fmacd d23 , d2, d11
+ add AO , AO, #32
+ fmacd d31 , d3, d10
+
+.endm
+
+
+
+
+.macro SAVE2x2 // C = beta*C + alpha*(accumulated product) for the 2x2 tile; clobbers r2, r3, r4 and d0-d5, d8-d9
+
+ ldr r3, OLD_RSC // Row stride size
+ lsl r3, r3, #4 // multiply with size of complex double
+
+ fldd d0, [ PTR_ALPHA ] // load real part of alpha
+ fldd d1, [ PTR_ALPHA, #8 ] // load imag part of alpha
+ ldr r4, PTR_BETA // r4 = address of beta (reused as a scratch pointer further down)
+ fldd d2, [ r4 ] // load real part of beta
+ fldd d3, [ r4, #8 ] // load imag part of beta
+
+ // Add/Sub the real and the imag parts
+ // (folds the d24-d31 partial products into d16-d23; d24-d31 are reused for the results below)
+ FADD_R d16, d24 , d16
+ FADD_I d17, d25 , d17
+ FADD_R d18, d26 , d18
+ FADD_I d19, d27 , d19
+ FADD_R d20, d28 , d20
+ FADD_I d21, d29 , d21
+ FADD_R d22, d30 , d22
+ FADD_I d23, d31 , d23
+
+ mov r4, CO1 // save pointer
+ fldmiad CO1, { d4 - d5 } // read real and imag part from C
+ add CO1, CO1, r3 // step CO1 by one row stride
+
+ mov r2, CO2 // save pointer (overwrites L, which is no longer needed)
+ fldmiad CO2, { d8 - d9 } // read real and imag part from C
+ add CO2, CO2, r3 // step CO2 by one row stride
+
+ fmuld d24, d4, d2 // multiply Beta-real with C-real
+ fmuld d25, d5, d2 // multiply Beta-real with C-imag
+ fmuld d28, d8, d2 // multiply Beta-real with C-real
+ fmuld d29, d9, d2 // multiply Beta-real with C-imag
+
+ FMAC_BR d24, d3, d5 // multiply beta-imag with C-imag and subtract (FMAC_BR is fnmacd)
+ FMAC_BI d25, d3, d4 // multiply beta-imag with C-real and add
+ FMAC_BR d28, d3, d9 // multiply beta-imag with C-imag and subtract (FMAC_BR is fnmacd)
+ FMAC_BI d29, d3, d8 // multiply beta-imag with C-real and add
+
+ FMAC_R1 d24 , d0 , d16 // add alpha times the d16/d17 product (stored to the saved CO1 address)
+ FMAC_I1 d25 , d0 , d17
+ FMAC_R2 d24 , d1 , d17
+ FMAC_I2 d25 , d1 , d16
+
+ FMAC_R1 d28 , d0 , d20 // add alpha times the d20/d21 product (stored to the saved CO2 address)
+ FMAC_I1 d29 , d0 , d21
+ FMAC_R2 d28 , d1 , d21
+ FMAC_I2 d29 , d1 , d20
+
+ fldmiad CO1, { d4 - d5 } // read real and imag part from C
+ fldmiad CO2, { d8 - d9 } // read real and imag part from C
+
+ fmuld d26, d4, d2 // multiply Beta-real with C-real
+ fmuld d27, d5, d2 // multiply Beta-real with C-imag
+ fmuld d30, d8, d2 // multiply Beta-real with C-real
+ fmuld d31, d9, d2 // multiply Beta-real with C-imag
+
+ FMAC_BR d26, d3, d5 // multiply beta-imag with C-imag and subtract (FMAC_BR is fnmacd)
+ FMAC_BI d27, d3, d4 // multiply beta-imag with C-real and add
+ FMAC_BR d30, d3, d9 // multiply beta-imag with C-imag and subtract (FMAC_BR is fnmacd)
+ FMAC_BI d31, d3, d8 // multiply beta-imag with C-real and add
+
+ FMAC_R1 d26 , d0 , d18 // add alpha times the d18/d19 product (stored one row stride after CO1)
+ FMAC_I1 d27 , d0 , d19
+ FMAC_R2 d26 , d1 , d19
+ FMAC_I2 d27 , d1 , d18
+
+ FMAC_R1 d30, d0 , d22 // add alpha times the d22/d23 product (stored one row stride after CO2)
+ FMAC_I1 d31, d0 , d23
+ FMAC_R2 d30, d1 , d23
+ FMAC_I2 d31, d1 , d22
+
+ mov CO1, r4 // restore pointer
+ mov CO2, r2 // restore pointer
+ fstmiad CO1, { d24 - d25 } // write back the element at CO1
+ fstmiad CO2, { d28 - d29 } // write back the element at CO2
+ add CO1, CO1, r3
+ add CO2, CO2, r3
+ fstmiad CO1, { d26 - d27 } // write back the element one row stride after CO1
+ fstmiad CO2, { d30 - d31 } // write back the element one row stride after CO2
+
+
+.endm
+
+
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+ .arm
+ .global REALNAME
+ .func REALNAME
+
+REALNAME: // entry point; the preprocessor expands REALNAME to bli_zgemm_kernel_2x2
+
+ push {r4 - r9, fp} // save register
+ add fp, sp, #28 // add number of saved register multiplied by size of int
+ sub sp, sp, #STACKSIZE // reserve stack
+
+ mov AO, OLD_A // pointer matrix A
+ mov BO, OLD_B // pointer matrix B
+
+ sub r3, fp, #128
+ vstm r3, { d8 - d15} // store floating point registers
+
+ ldr r2, OLD_C // pointer matrix C
+ ldr r3, OLD_CSC // Col stride size of C
+ lsl r3, r3, #4 // multiply with size of complex double
+
+ mov CO1, r2 // first line of C
+ add CO2, CO1, r3 // second line of C
+
+ pld [ CO1, #C_PRE ] // prefetch the lines of C
+ pld [ CO2, #C_PRE ] // prefetch the lines of C
+
+zgemm_kernel_L2_M2_20: // choose a path from the number of 8-step groups in K
+
+ asrs L , K, #3 // L = K / 8
+ cmp L , #2
+ blt zgemm_kernel_L2_M2_32 // fewer than two groups of 8
+
+ KERNEL2x2_I // first group of 8 k-steps; the last M2 leaves operands loaded for the next group
+ KERNEL2x2_M2
+ KERNEL2x2_M1
+ KERNEL2x2_M2
+
+ KERNEL2x2_M1
+ KERNEL2x2_M2
+ KERNEL2x2_M1
+ KERNEL2x2_M2
+
+ subs L, L, #2 // the first and the last group are handled outside the loop
+ ble zgemm_kernel_L2_M2_22a
+ .align 5
+
+zgemm_kernel_L2_M2_22: // steady-state loop: one group of 8 k-steps per pass
+
+ KERNEL2x2_M1
+ KERNEL2x2_M2
+ KERNEL2x2_M1
+ KERNEL2x2_M2
+
+ KERNEL2x2_M1
+ KERNEL2x2_M2
+ KERNEL2x2_M1
+ KERNEL2x2_M2
+
+ subs L, L, #1
+ bgt zgemm_kernel_L2_M2_22
+
+zgemm_kernel_L2_M2_22a: // last group of 8: ends with KERNEL2x2_E, which loads nothing further
+
+ KERNEL2x2_M1
+ KERNEL2x2_M2
+ KERNEL2x2_M1
+ KERNEL2x2_M2
+
+ KERNEL2x2_M1
+ KERNEL2x2_M2
+ KERNEL2x2_M1
+ KERNEL2x2_E
+
+ b zgemm_kernel_L2_M2_44
+
+zgemm_kernel_L2_M2_32: // reached when L < 2
+
+ tst L, #1 // NOTE(review): tst leaves V as set by the cmp above, so the ble acts as "branch if bit 0 is clear" only while V is clear; beq would state that directly - confirm
+ ble zgemm_kernel_L2_M2_40
+
+ KERNEL2x2_I // exactly one group of 8 k-steps
+ KERNEL2x2_M2
+ KERNEL2x2_M1
+ KERNEL2x2_M2
+
+ KERNEL2x2_M1
+ KERNEL2x2_M2
+ KERNEL2x2_M1
+ KERNEL2x2_E
+
+ b zgemm_kernel_L2_M2_44
+
+zgemm_kernel_L2_M2_40: // no full group of 8 was run, so the accumulators must be cleared explicitly
+
+ INIT2x2
+
+zgemm_kernel_L2_M2_44: // remainder handling
+
+ ands L , K, #7 // L = K % 8
+ ble zgemm_kernel_L2_M2_100
+
+zgemm_kernel_L2_M2_46: // one k-step per pass for the K % 8 leftover steps
+
+ KERNEL2x2_SUB
+
+ subs L, L, #1
+ bne zgemm_kernel_L2_M2_46
+
+zgemm_kernel_L2_M2_100: // scale by alpha/beta and write the tile back to C
+
+ SAVE2x2
+
+zgemm_kernel_L999: // epilogue
+
+ sub r3, fp, #128
+ vldm r3, { d8 - d15} // restore floating point registers
+
+ sub sp, fp, #28 // drop the reserved stack; sp points at the saved r4 again
+ pop {r4 - r9, fp}
+ bx lr
+
diff --git a/blis/kernels/c66x/1m/bli_packm_cxk_ukernels.c b/blis/kernels/c66x/1m/bli_packm_cxk_ukernels.c
index 3ab63325d4151d1cf95d191d2ae669e01aad283a..b91349092fa2fbdda4616932eba59367b42fc9f0 100755 (executable)
-/*
-
- BLIS
- An object-based framework for developing high-performance BLAS-like
- libraries.
-
- Copyright (C) 2014, The University of Texas
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are
- met:
- - Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- - Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- - Neither the name of The University of Texas nor the names of its
- contributors may be used to endorse or promote products derived
- from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-/* Need to implement optimization for various cases */
-
-void bli_spackm_4xk_ukernel(
- conj_t conja,
- dim_t n,
- void* kappa,
- void* a, inc_t inca, inc_t lda,
- void* p, inc_t ldp
- )
-{
-
- float* restrict kappa_cast = kappa;
- dim_t index;
-
- if(*kappa_cast == 1.0f)
- {
- if((inca==1) && ((lda&1)==0) && ((ldp&1)==0))
- {
- __float2_t *restrict ptrA0 = (__float2_t *) a;
- __float2_t *restrict ptrP0 = (__float2_t *) p;
-
- for(index=0;index<n;index++)
- {
- *ptrP0++ = *ptrA0++;
- *ptrP0++ = *ptrA0++;
- ptrA0 += ((lda>>1)-2);
- ptrP0 += ((ldp>>1)-2);
- }
- return;
- }
- else if((lda==1) && ((ldp&1) == 0) && ((inca&1)==0))
- {
- __float2_t *restrict ptrA0 = (__float2_t *) a;
- __float2_t *restrict ptrA1 = (__float2_t *) (((float *) a)+inca);
- __float2_t *restrict ptrA2 = (__float2_t *) (((float *) a)+2*inca);
- __float2_t *restrict ptrA3 = (__float2_t *) (((float *) a)+3*inca);
- __float2_t *restrict ptrP0 = (__float2_t *) p;
- __float2_t *restrict ptrP1 = (__float2_t *) (((float *) p)+ldp);
- __float2_t val0, val1;
- dim_t n_iter = n >> 1;
- dim_t n_left = n & 1;
-
- for(index=0;index<n_iter;index++)
- {
- val0 = *ptrA0++;
- val1 = *ptrA1++;
- *ptrP0++ = _ftof2(_lof(val1),_lof(val0));
- *ptrP1++ = _ftof2(_hif(val1),_hif(val0));
- val0 = *ptrA2++;
- val1 = *ptrA3++;
- *ptrP0++ = _ftof2(_lof(val1),_lof(val0));
- *ptrP1++ = _ftof2(_hif(val1),_hif(val0));
- ptrP0 += ((ldp)-2);
- ptrP1 += ((ldp)-2);
- }
- if(n_left)
- {
- float *restrict ptrA = ((float *) a+2*n_iter);
- float *restrict ptrP = ((float *) p+2*n_iter*ldp);
- ptrP[0] = ptrA[0];
- ptrP[1] = ptrA[inca];
- ptrP[2] = ptrA[2*inca];
- ptrP[3] = ptrA[3*inca];
- }
- return;
- }
- }
-
- /* handle unoptimized case using default packing routine */
- bli_spackm_ref_4xk(conja, n, kappa, a, inca, lda, p, ldp);
-}
-
-void bli_spackm_8xk_ukernel(
- conj_t conja,
- dim_t n,
- void* kappa,
- void* a, inc_t inca, inc_t lda,
- void* p, inc_t ldp
- )
-{
-
- float* restrict kappa_cast = kappa;
- dim_t index;
-
- if(*kappa_cast == 1.0f)
- {
- if((inca==1) && ((lda&1)==0) && ((ldp&1)==0))
- {
- __float2_t *restrict ptrA0 = (__float2_t *) a;
- __float2_t *restrict ptrP0 = (__float2_t *) p;
- for(index=0;index<n;index++)
- {
- *ptrP0++ = *ptrA0++;
- *ptrP0++ = *ptrA0++;
- *ptrP0++ = *ptrA0++;
- *ptrP0++ = *ptrA0++;
- ptrA0 += ((lda>>1)-4);
- ptrP0 += ((ldp>>1)-4);
- }
- return;
- }
- else if((lda==1) && ((ldp&1) == 0) && ((inca&1)==0))
- {
- __float2_t *restrict ptrA0 = (__float2_t *) a;
- __float2_t *restrict ptrA1 = (__float2_t *) (((float *) a)+inca);
- __float2_t *restrict ptrA2 = (__float2_t *) (((float *) a)+2*inca);
- __float2_t *restrict ptrA3 = (__float2_t *) (((float *) a)+3*inca);
- __float2_t *restrict ptrA4 = (__float2_t *) (((float *) a)+4*inca);
- __float2_t *restrict ptrA5 = (__float2_t *) (((float *) a)+5*inca);
- __float2_t *restrict ptrA6 = (__float2_t *) (((float *) a)+6*inca);
- __float2_t *restrict ptrA7 = (__float2_t *) (((float *) a)+7*inca);
- __float2_t *restrict ptrP0 = (__float2_t *) p;
- __float2_t *restrict ptrP1 = (__float2_t *) (((float *) p)+ldp);
- __float2_t val0, val1;
- dim_t n_iter = n >> 1;
- dim_t n_left = n & 1;
- for(index=0;index<n_iter;index++)
- {
- val0 = *ptrA0++;
- val1 = *ptrA1++;
- *ptrP0++ = _ftof2(_lof(val1),_lof(val0));
- *ptrP1++ = _ftof2(_hif(val1),_hif(val0));
- val0 = *ptrA2++;
- val1 = *ptrA3++;
- *ptrP0++ = _ftof2(_lof(val1),_lof(val0));
- *ptrP1++ = _ftof2(_hif(val1),_hif(val0));
- val0 = *ptrA4++;
- val1 = *ptrA5++;
- *ptrP0++ = _ftof2(_lof(val1),_lof(val0));
- *ptrP1++ = _ftof2(_hif(val1),_hif(val0));
- val0 = *ptrA6++;
- val1 = *ptrA7++;
- *ptrP0++ = _ftof2(_lof(val1),_lof(val0));
- *ptrP1++ = _ftof2(_hif(val1),_hif(val0));
- ptrP0 += ((ldp)-4);
- ptrP1 += ((ldp)-4);
- }
- if(n_left)
- {
- float *restrict ptrA = ((float *) a+2*n_iter);
- float *restrict ptrP = ((float *) p+2*n_iter*ldp);
- ptrP[0] = ptrA[0];
- ptrP[1] = ptrA[inca];
- ptrP[2] = ptrA[2*inca];
- ptrP[3] = ptrA[3*inca];
- ptrP[4] = ptrA[4*inca];
- ptrP[5] = ptrA[5*inca];
- ptrP[6] = ptrA[6*inca];
- ptrP[7] = ptrA[7*inca];
- }
- return;
- }
- }
- /* handle unoptimized case using default packing routine */
- bli_spackm_ref_8xk(conja, n, kappa, a, inca, lda, p, ldp);
-}
-
+/*\r
+\r
+ BLIS\r
+ An object-based framework for developing high-performance BLAS-like\r
+ libraries.\r
+\r
+ Copyright (C) 2014, The University of Texas\r
+\r
+ Redistribution and use in source and binary forms, with or without\r
+ modification, are permitted provided that the following conditions are\r
+ met:\r
+ - Redistributions of source code must retain the above copyright\r
+ notice, this list of conditions and the following disclaimer.\r
+ - Redistributions in binary form must reproduce the above copyright\r
+ notice, this list of conditions and the following disclaimer in the\r
+ documentation and/or other materials provided with the distribution.\r
+ - Neither the name of The University of Texas nor the names of its\r
+ contributors may be used to endorse or promote products derived\r
+ from this software without specific prior written permission.\r
+\r
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\r
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\r
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\r
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT\r
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\r
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\r
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\r
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\r
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\r
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\r
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
+\r
+*/\r
+\r
+#include "blis.h"\r
+\r
+/* Need to implement optimization for various cases */\r
+\r
+void bli_spackm_4xk_ukernel(\r
+ conj_t conja,\r
+ dim_t n,\r
+ void* kappa,\r
+ void* a, inc_t inca, inc_t lda, \r
+ void* p, inc_t ldp\r
+ )\r
+{\r
+\r
+ float* restrict kappa_cast = kappa;\r
+ dim_t index; \r
+\r
+ if(*kappa_cast == 1.0f)\r
+ {\r
+ if((inca==1) && ((lda&1)==0) && ((ldp&1)==0))\r
+ {\r
+ __float2_t *restrict ptrA0 = (__float2_t *) a;\r
+ __float2_t *restrict ptrP0 = (__float2_t *) p;\r
+\r
+ for(index=0;index<n;index++)\r
+ {\r
+ *ptrP0++ = *ptrA0++;\r
+ *ptrP0++ = *ptrA0++;\r
+ ptrA0 += ((lda>>1)-2);\r
+ ptrP0 += ((ldp>>1)-2);\r
+ }\r
+ return;\r
+ }\r
+ else if((lda==1) && ((ldp&1) == 0) && ((inca&1)==0))\r
+ {\r
+ __float2_t *restrict ptrA0 = (__float2_t *) a;\r
+ __float2_t *restrict ptrA1 = (__float2_t *) (((float *) a)+inca);\r
+ __float2_t *restrict ptrA2 = (__float2_t *) (((float *) a)+2*inca);\r
+ __float2_t *restrict ptrA3 = (__float2_t *) (((float *) a)+3*inca);\r
+ __float2_t *restrict ptrP0 = (__float2_t *) p;\r
+ __float2_t *restrict ptrP1 = (__float2_t *) (((float *) p)+ldp);\r
+ __float2_t val0, val1;\r
+ dim_t n_iter = n >> 1;\r
+ dim_t n_left = n & 1;\r
+\r
+ for(index=0;index<n_iter;index++)\r
+ {\r
+ val0 = *ptrA0++;\r
+ val1 = *ptrA1++;\r
+ *ptrP0++ = _ftof2(_lof(val1),_lof(val0));\r
+ *ptrP1++ = _ftof2(_hif(val1),_hif(val0));\r
+ val0 = *ptrA2++;\r
+ val1 = *ptrA3++;\r
+ *ptrP0++ = _ftof2(_lof(val1),_lof(val0));\r
+ *ptrP1++ = _ftof2(_hif(val1),_hif(val0));\r
+ ptrP0 += ((ldp)-2);\r
+ ptrP1 += ((ldp)-2);\r
+ }\r
+ if(n_left)\r
+ {\r
+ float *restrict ptrA = ((float *) a+2*n_iter);\r
+ float *restrict ptrP = ((float *) p+2*n_iter*ldp);\r
+ ptrP[0] = ptrA[0];\r
+ ptrP[1] = ptrA[inca];\r
+ ptrP[2] = ptrA[2*inca];\r
+ ptrP[3] = ptrA[3*inca];\r
+ } \r
+ return; \r
+ }\r
+ }\r
+\r
+ /* handle unoptimized case using default packing routine */\r
+ bli_spackm_ref_4xk(conja, n, kappa, a, inca, lda, p, ldp);\r
+}\r
+\r
+void bli_spackm_8xk_ukernel(\r
+                             conj_t  conja,\r
+                             dim_t   n,\r
+                             void*   kappa,\r
+                             void*   a, inc_t inca, inc_t lda,\r
+                             void*   p, inc_t ldp\r
+                           )\r
+{\r
+    /*\r
+     * Copy an 8-row by n-column float panel from a (row stride inca,\r
+     * column stride lda) into p (column stride ldp).\r
+     *\r
+     * Two paired-float fast paths exist, both only when *kappa == 1.0f\r
+     * and ldp is even:\r
+     *   - inca == 1 with even lda: each column is moved as four pairs;\r
+     *   - lda == 1 with even inca: two columns are interleaved per pass,\r
+     *     plus a scalar copy of the last column when n is odd.\r
+     * Every other combination is handed to bli_spackm_ref_8xk().\r
+     */\r
+    const float* restrict scale    = kappa;\r
+    const int             no_scale = ( *scale == 1.0f );\r
+    const int             ldp_even = ( ( ldp & 1 ) == 0 );\r
+    dim_t                 j;\r
+\r
+    if ( no_scale && ( inca == 1 ) && ( ( lda & 1 ) == 0 ) && ldp_even )\r
+    {\r
+        /* Rows are contiguous: one column is four float pairs. */\r
+        __float2_t *restrict src = (__float2_t *) a;\r
+        __float2_t *restrict dst = (__float2_t *) p;\r
+\r
+        for ( j = 0; j < n; j++ )\r
+        {\r
+            dst[0] = src[0];\r
+            dst[1] = src[1];\r
+            dst[2] = src[2];\r
+            dst[3] = src[3];\r
+            src += ( lda >> 1 );   /* column strides counted in pairs */\r
+            dst += ( ldp >> 1 );\r
+        }\r
+        return;\r
+    }\r
+\r
+    if ( no_scale && ( lda == 1 ) && ldp_even && ( ( inca & 1 ) == 0 ) )\r
+    {\r
+        /* Columns are contiguous: read two columns of each row at once. */\r
+        __float2_t *restrict row0 = (__float2_t *) a;\r
+        __float2_t *restrict row1 = (__float2_t *) (((float *) a)+inca);\r
+        __float2_t *restrict row2 = (__float2_t *) (((float *) a)+2*inca);\r
+        __float2_t *restrict row3 = (__float2_t *) (((float *) a)+3*inca);\r
+        __float2_t *restrict row4 = (__float2_t *) (((float *) a)+4*inca);\r
+        __float2_t *restrict row5 = (__float2_t *) (((float *) a)+5*inca);\r
+        __float2_t *restrict row6 = (__float2_t *) (((float *) a)+6*inca);\r
+        __float2_t *restrict row7 = (__float2_t *) (((float *) a)+7*inca);\r
+        __float2_t *restrict col_lo = (__float2_t *) p;\r
+        __float2_t *restrict col_hi = (__float2_t *) (((float *) p)+ldp);\r
+        dim_t n_pairs = n >> 1;\r
+\r
+        for ( j = 0; j < n_pairs; j++ )\r
+        {\r
+            __float2_t r0 = row0[j];\r
+            __float2_t r1 = row1[j];\r
+            __float2_t r2 = row2[j];\r
+            __float2_t r3 = row3[j];\r
+            __float2_t r4 = row4[j];\r
+            __float2_t r5 = row5[j];\r
+            __float2_t r6 = row6[j];\r
+            __float2_t r7 = row7[j];\r
+\r
+            col_lo[0] = _ftof2(_lof(r1),_lof(r0));\r
+            col_hi[0] = _ftof2(_hif(r1),_hif(r0));\r
+            col_lo[1] = _ftof2(_lof(r3),_lof(r2));\r
+            col_hi[1] = _ftof2(_hif(r3),_hif(r2));\r
+            col_lo[2] = _ftof2(_lof(r5),_lof(r4));\r
+            col_hi[2] = _ftof2(_hif(r5),_hif(r4));\r
+            col_lo[3] = _ftof2(_lof(r7),_lof(r6));\r
+            col_hi[3] = _ftof2(_hif(r7),_hif(r6));\r
+\r
+            /* ldp pairs == 2*ldp floats: skip the two columns just written */\r
+            col_lo += ldp;\r
+            col_hi += ldp;\r
+        }\r
+\r
+        if ( n & 1 )\r
+        {\r
+            /* odd n: the last column is copied element by element */\r
+            float *restrict last_src = ((float *) a+2*n_pairs);\r
+            float *restrict last_dst = ((float *) p+2*n_pairs*ldp);\r
+            inc_t row_off = 0;\r
+            int   r;\r
+\r
+            for ( r = 0; r < 8; r++ )\r
+            {\r
+                last_dst[r] = last_src[row_off];\r
+                row_off += inca;\r
+            }\r
+        }\r
+        return;\r
+    }\r
+\r
+    /* handle unoptimized case using default packing routine */\r
+    bli_spackm_ref_8xk(conja, n, kappa, a, inca, lda, p, ldp);\r
+}\r
+\r
diff --git a/blis/kernels/c66x/1m/bli_packm_cxk_ukernels.h b/blis/kernels/c66x/1m/bli_packm_cxk_ukernels.h
index 6849f5d7214e1f943222f462d49bd49d7b089625..7a01798df897f22df8dadf7a838e6b43b68315bf 100755 (executable)
-/*
-
- BLIS
- An object-based framework for developing high-performance BLAS-like
- libraries.
-
- Copyright (C) 2014, The University of Texas
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are
- met:
- - Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- - Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- - Neither the name of The University of Texas nor the names of its
- contributors may be used to endorse or promote products derived
- from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-void bli_spackm_4xk_ukernel(
- conj_t conja,
- dim_t n,
- void* kappa,
- void* a, inc_t inca, inc_t lda,
- void* p, inc_t ldp
- );
-
-void bli_spackm_8xk_ukernel(
- conj_t conja,
- dim_t n,
- void* kappa,
- void* a, inc_t inca, inc_t lda,
- void* p, inc_t ldp
- );
-
-
+/*\r
+\r
+ BLIS\r
+ An object-based framework for developing high-performance BLAS-like\r
+ libraries.\r
+\r
+ Copyright (C) 2014, The University of Texas\r
+\r
+ Redistribution and use in source and binary forms, with or without\r
+ modification, are permitted provided that the following conditions are\r
+ met:\r
+ - Redistributions of source code must retain the above copyright\r
+ notice, this list of conditions and the following disclaimer.\r
+ - Redistributions in binary form must reproduce the above copyright\r
+ notice, this list of conditions and the following disclaimer in the\r
+ documentation and/or other materials provided with the distribution.\r
+ - Neither the name of The University of Texas nor the names of its\r
+ contributors may be used to endorse or promote products derived\r
+ from this software without specific prior written permission.\r
+\r
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\r
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\r
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\r
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT\r
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\r
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\r
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\r
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\r
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\r
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\r
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
+\r
+*/\r
+\r
+void bli_spackm_4xk_ukernel( /* packs a 4-row float panel of a into p */\r
+ conj_t conja, /* only forwarded to the reference packing routine */\r
+ dim_t n, /* number of columns to pack */\r
+ void* kappa, /* read as float*; the fast paths require the value 1.0f */\r
+ void* a, inc_t inca, inc_t lda, /* source; row and column strides in floats */ \r
+ void* p, inc_t ldp /* destination; column stride in floats */\r
+ );\r
+\r
+void bli_spackm_8xk_ukernel( /* packs an 8-row float panel of a into p */\r
+ conj_t conja, /* only forwarded to the reference packing routine */\r
+ dim_t n, /* number of columns to pack */\r
+ void* kappa, /* read as float*; the fast paths require the value 1.0f */\r
+ void* a, inc_t inca, inc_t lda, /* source; row and column strides in floats */ \r
+ void* p, inc_t ldp /* destination; column stride in floats */\r
+ );\r
+\r
+\r
index c1e82594cab1f30502a95633bed5b21fb046f6f9..7361cef7a4245c388bbdf1e1779b211a41e48a5f 100755 (executable)
-/*
-
- BLIS
- An object-based framework for developing high-performance BLAS-like
- libraries.
-
- Copyright (C) 2014, The University of Texas
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are
- met:
- - Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- - Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- - Neither the name of The University of Texas nor the names of its
- contributors may be used to endorse or promote products derived
- from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-#include "blis.h"
-
-//#define BLIS_ENABLE_CYCLE_COUNT
-
-void bli_sgemm_ukernel_4x8(
- dim_t k,
- float* restrict alpha,
- float* restrict a,
- float* restrict b,
- float* restrict beta,
- float* restrict c, inc_t rs_c, inc_t cs_c,
- auxinfo_t* data
- )
-{
- __float2_t sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
- __float2_t sum8, sum9, suma, sumb, sumc, sumd, sume, sumf;
- __float2_t * restrict ptrB = (__float2_t *) b;
- __float2_t * restrict ptrA = (__float2_t *) a;
- __float2_t * restrict ptrC;
- __float2_t regA2, regC, regS, regR;
- int_least16_t index;
- float* restrict c0, * restrict c1;
- __float2_t regB2;
-
-#ifdef BLIS_ENABLE_CYCLE_COUNT
- volatile int counter_start;
- volatile int counter_end;
-#endif
-
-#ifdef BLIS_ENABLE_CYCLE_COUNT
- TSCL=0;
- counter_start = TSCL;
-#endif
- //touch routine: both a & b
- //Length of b = NR*K*size of float;
-#ifdef BLIS_ENABLE_PREFETCH
- //touch(a, k*BLIS_DEFAULT_MR_S*4);
-#endif
-
- // zero out accumulators
- sum0 = 0.0;
- sum1 = 0.0;
- sum2 = 0.0;
- sum3 = 0.0;
- sum4 = 0.0;
- sum5 = 0.0;
- sum6 = 0.0;
- sum7 = 0.0;
- sum8 = 0.0;
- sum9 = 0.0;
- suma = 0.0;
- sumb = 0.0;
- sumc = 0.0;
- sumd = 0.0;
- sume = 0.0;
- sumf = 0.0;
-
-
- for (index = 0; index < k; index++)
- { // loop over k;
- // each iteration performs rank one update of 4x1 by 1x8
- // matrices of A and B respectively; result is
- // accumulated over 4x8 matrix
- __float2_t b01, b23, b45, b67, a01, a23;
- __x128_t reg128;
-
- a01 = *ptrA++;
- a23 = *ptrA++;
-
- b01 = *ptrB++;
- b23 = *ptrB++;
- b45 = *ptrB++;
- b67 = *ptrB++;
-
- reg128 = _cmpysp(a01, b01);
- // accumulate a[0]*b[1] and -a[0]*b[0]
- sum0 = _daddsp(sum0, _lof2_128(reg128));
- // accumulate a[1]*b[0] and a[1]*b[1]
- sum1 = _daddsp(sum1, _hif2_128(reg128));
-
- reg128 = _cmpysp(a01, b23);
- // accumulate a[0]*b[3] and -a[0]*b[2]
- sum2 = _daddsp(sum2, _lof2_128(reg128));
- // accumulate a[1]*b[2] and a[1]*b[3]
- sum3 = _daddsp(sum3, _hif2_128(reg128));
-
- reg128 = _cmpysp(a01, b45);
- // accumulate a[0]*b[5] and -a[0]*b[4]
- sum4 = _daddsp(sum4, _lof2_128(reg128));
- // accumulate a[1]*b[4] and a[1]*b[5]
- sum5 = _daddsp(sum5, _hif2_128(reg128));
-
- reg128 = _cmpysp(a01, b67);
- // accumulate a[0]*b[7] and -a[0]*b[6]
- sum6 = _daddsp(sum6, _lof2_128(reg128));
- // accumulate a[1]*b[6] and a[1]*b[7]
- sum7 = _daddsp(sum7, _hif2_128(reg128));
-
- reg128 = _cmpysp(a23, b01);
- // accumulate a[2]*b[1] and -a[2]*b[0]
- sum8 = _daddsp(sum8, _lof2_128(reg128));
- // accumulate a[3]*b[0] and a[3]*b[1]
- sum9 = _daddsp(sum9, _hif2_128(reg128));
-
- reg128 = _cmpysp(a23, b23);
- // accumulate a[2]*b[3] and -a[2]*b[2]
- suma = _daddsp(suma, _lof2_128(reg128));
- // accumulate a[3]*b[2] and a[3]*b[3]
- sumb = _daddsp(sumb, _hif2_128(reg128));
-
- reg128 = _cmpysp(a23, b45);
- // accumulate a[2]*b[5] and -a[2]*b[4]
- sumc = _daddsp(sumc, _lof2_128(reg128));
- // accumulate a[3]*b[4] and a[3]*b[5]
- sumd = _daddsp(sumd, _hif2_128(reg128));
-
- reg128 = _cmpysp(a23, b67);
- // accumulate a[2]*b[7] and -a[2]*b[6]
- sume = _daddsp(sume, _lof2_128(reg128));
- // accumulate a[3]*b[6] and a[3]*b[7]
- sumf = _daddsp(sumf, _hif2_128(reg128));
-
- }
-
-#ifdef BLIS_ENABLE_CYCLE_COUNT
- counter_end=TSCL;
- if (lib_get_coreID () == 0)
- printf("%d\t%d\t",k, counter_end-counter_start);
-#endif
-
-#ifdef BLIS_ENABLE_CYCLE_COUNT
- TSCL=0;
- counter_start = TSCL;
-#endif
- regA2 = _ftof2(*alpha, *alpha);
- regB2 = _ftof2(*beta, *beta);
- if (rs_c != 1)
- {
- // update c[0,0] and c[1,0]
- c0 = (c + 0*rs_c + 0*cs_c);
- c1 = (c + 1*rs_c + 0*cs_c);
- regC = _ftof2(*c1,*c0);
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_lof2(sum1),-_hif2(sum0));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *c0 = _lof2(regR);
- *c1 = _hif2(regR);
-
- //update c[0,1] and c[1,1]
- c0 = c0 + 1*cs_c;
- c1 = c1 + 1*cs_c;
- regC = _ftof2(*c1,*c0);
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_hif2(sum1),_lof2(sum0));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *c0 = _lof2(regR);
- *c1 = _hif2(regR);
-
- //update c[0,2] and c[1,2]
- c0 = c0 + 1*cs_c;
- c1 = c1 + 1*cs_c;
- regC = _ftof2(*c1,*c0);
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_lof2(sum3),-_hif2(sum2));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *c0 = _lof2(regR);
- *c1 = _hif2(regR);
-
- //update c[0,3] and c[1,3]
- c0 = c0 + 1*cs_c;
- c1 = c1 + 1*cs_c;
- regC = _ftof2(*c1,*c0);
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_hif2(sum3),_lof2(sum2));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *c0 = _lof2(regR);
- *c1 = _hif2(regR);
-
- //update c[0,4] and c[1,4]
- c0 = c0 + 1*cs_c;
- c1 = c1 + 1*cs_c;
- regC = _ftof2(*c1,*c0);
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_lof2(sum5),-_hif2(sum4));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *c0 = _lof2(regR);
- *c1 = _hif2(regR);
-
- //update c[0,5] and c[1,5]
- c0 = c0 + 1*cs_c;
- c1 = c1 + 1*cs_c;
- regC = _ftof2(*c1,*c0);
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_hif2(sum5),_lof2(sum4));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *c0 = _lof2(regR);
- *c1 = _hif2(regR);
-
- //update c[0,6] and c[1,6]
- c0 = c0 + 1*cs_c;
- c1 = c1 + 1*cs_c;
- regC = _ftof2(*c1,*c0);
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_lof2(sum7),-_hif2(sum6));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *c0 = _lof2(regR);
- *c1 = _hif2(regR);
-
- //update c[0,7] and c[1,7]
- c0 = c0 + 1*cs_c;
- c1 = c1 + 1*cs_c;
- regC = _ftof2(*c1,*c0);
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_hif2(sum7),_lof2(sum6));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *c0 = _lof2(regR);
- *c1 = _hif2(regR);
-
- // update c[2,0] and c[3,0]
- c0 = (c + 2*rs_c + 0*cs_c);
- c1 = (c + 3*rs_c + 0*cs_c);
- regC = _ftof2(*c1,*c0);
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_lof2(sum9),-_hif2(sum8));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *c0 = _lof2(regR);
- *c1 = _hif2(regR);
-
- // update c[2,1] and c[3,1]
- c0 = c0 + 1*cs_c;
- c1 = c1 + 1*cs_c;
- regC = _ftof2(*c1,*c0);
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_hif2(sum9),_lof2(sum8));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *c0 = _lof2(regR);
- *c1 = _hif2(regR);
-
- //update c[2,2] and c[3,2]
- c0 = c0 + 1*cs_c;
- c1 = c1 + 1*cs_c;
- regC = _ftof2(*c1,*c0);
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_lof2(sumb),-_hif2(suma));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *c0 = _lof2(regR);
- *c1 = _hif2(regR);
-
- //update c[2,3] and c[3,3]
- c0 = c0 + 1*cs_c;
- c1 = c1 + 1*cs_c;
- regC = _ftof2(*c1,*c0);
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_hif2(sumb),_lof2(suma));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *c0 = _lof2(regR);
- *c1 = _hif2(regR);
-
- //update c[2,4] and c[3,4]
- c0 = c0 + 1*cs_c;
- c1 = c1 + 1*cs_c;
- regC = _ftof2(*c1,*c0);
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_lof2(sumd),-_hif2(sumc));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *c0 = _lof2(regR);
- *c1 = _hif2(regR);
-
- //update c[2,5] and c[3,5]
- c0 = c0 + 1*cs_c;
- c1 = c1 + 1*cs_c;
- regC = _ftof2(*c1,*c0);
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_hif2(sumd),_lof2(sumc));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *c0 = _lof2(regR);
- *c1 = _hif2(regR);
-
- //update c[2,6] and c[2,6]
- c0 = c0 + 1*cs_c;
- c1 = c1 + 1*cs_c;
- regC = _ftof2(*c1,*c0);
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_lof2(sumf),-_hif2(sume));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *c0 = _lof2(regR);
- *c1 = _hif2(regR);
-
- //update c[2,7] and c[2,7]
- c0 = c0 + 1*cs_c;
- c1 = c1 + 1*cs_c;
- regC = _ftof2(*c1,*c0);
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_hif2(sumf),_lof2(sume));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *c0 = _lof2(regR);
- *c1 = _hif2(regR);
-
- }
- else
- {
-#if 0
- // update c[0,0] and c[1,0]
- ptrC = (__float2_t *) c;
- regC = *ptrC;
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_lof2(sum1),-_hif2(sum0));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *ptrC = regR;
-
- //update c[0,1] and c[1,1]
- ptrC += (cs_c>>1);
- regC = *ptrC;
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_hif2(sum1),_lof2(sum0));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *ptrC = regR;
-
- //update c[0,2] and c[1,2]
- ptrC += (cs_c>>1);
- regC = *ptrC;
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_lof2(sum3),-_hif2(sum2));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *ptrC = regR;
-
- //update c[0,3] and c[1,3]
- ptrC += (cs_c>>1);
- regC = *ptrC;
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_hif2(sum3),_lof2(sum2));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *ptrC = regR;
-
- //update c[0,4] and c[1,4]
- ptrC += (cs_c>>1);
- regC = *ptrC;
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_lof2(sum5),-_hif2(sum4));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *ptrC = regR;
-
- //update c[0,5] and c[1,5]
- ptrC += (cs_c>>1);
- regC = *ptrC;
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_hif2(sum5),_lof2(sum4));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *ptrC = regR;
-
- //update c[0,6] and c[1,6]
- ptrC += (cs_c>>1);
- regC = *ptrC;
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_lof2(sum7),-_hif2(sum6));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *ptrC = regR;
-
- //update c[0,7] and c[1,7]
- ptrC += (cs_c>>1);
- regC = *ptrC;
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_hif2(sum7),_lof2(sum6));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *ptrC = regR;
-
- // update c[2,0] and c[3,0]
- ptrC = (__float2_t *) (c+2);
- regC = *ptrC;
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_lof2(sum9),-_hif2(sum8));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *ptrC = regR;
-
- // update c[2,1] and c[3,1]
- ptrC += (cs_c>>1);
- regC = *ptrC;
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_hif2(sum9),_lof2(sum8));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *ptrC = regR;
-
- //update c[2,2] and c[3,2]
- ptrC += (cs_c>>1);
- regC = *ptrC;
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_lof2(sumb),-_hif2(suma));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *ptrC = regR;
-
- //update c[2,3] and c[3,3]
- ptrC += (cs_c>>1);
- regC = *ptrC;
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_hif2(sumb),_lof2(suma));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *ptrC = regR;
-
- //update c[2,4] and c[3,4]
- ptrC += (cs_c>>1);
- regC = *ptrC;
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_lof2(sumd),-_hif2(sumc));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *ptrC = regR;
-
- //update c[2,5] and c[3,5]
- ptrC += (cs_c>>1);
- regC = *ptrC;
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_hif2(sumd),_lof2(sumc));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *ptrC = regR;
-
- //update c[2,6] and c[2,6]
- ptrC += (cs_c>>1);
- regC = *ptrC;
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_lof2(sumf),-_hif2(sume));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *ptrC = regR;
-
- //update c[2,7] and c[2,7]
- ptrC += (cs_c>>1);
- regC = *ptrC;
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_hif2(sumf),_lof2(sume));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *ptrC = regR;
-#else
-/* __float2_t c0, c1, c2, c3, c4, c5, c6, c7;
- __float2_t c8, c9, ca, cb, cc, cd, ce, cf;
-
- ptrC = (__float2_t *) c;
- c0 = *ptrC++;
- sum0 = _dmpysp(regA2, sum0);
- c1 = *ptrC--;
- sum1 = _dmpysp(regA2, sum1);
- ptrC += (cs_c>>1);
- c2 = *ptrC++;
- sum2 = _dmpysp(regA2, sum2);
- c3 = *ptrC--;
- sum3 = _dmpysp(regA2, sum3);
- ptrC += (cs_c>>1);
- c4 = *ptrC++;
- sum8 = _dmpysp(regA2, sum8);
- c5 = *ptrC--;
- sum9 = _dmpysp(regA2, sum9);
- ptrC += (cs_c>>1);
- c6 = *ptrC++;
- suma = _dmpysp(regA2, suma);
- c7 = *ptrC--;
- sumb = _dmpysp(regA2, sumb);
-
- c0 = _dmpysp(c0, regB2);
- c1 = _dmpysp(c1, regB2);
- c2 = _dmpysp(c2, regB2);
- c3 = _dmpysp(c3, regB2);
- c4 = _dmpysp(c4, regB2);
- c5 = _dmpysp(c5, regB2);
- c6 = _dmpysp(c6, regB2);
- c7 = _dmpysp(c7, regB2);
-
- // update c[0,0] and c[1,0]
- c0 = _daddsp(_ftof2(_lof2(sum1),-_hif2(sum0)),c0);
- // update c[2,0] and c[3,0]
- c1 = _daddsp(_ftof2(_lof2(sum9),-_hif2(sum8)),c1);
- //update c[0,1] and c[1,1]
- c2 = _daddsp(_ftof2(_hif2(sum1),_lof2(sum0)),c2);
- // update c[2,1] and c[3,1]
- c3 = _daddsp(_ftof2(_hif2(sum9),_lof2(sum8)),c3);
- //update c[0,2] and c[1,2]
- c4 = _daddsp(_ftof2(_lof2(sum3),-_hif2(sum2)),c4);
- //update c[2,2] and c[3,2]
- c5 = _daddsp(_ftof2(_lof2(sumb),-_hif2(suma)),c5);
- //update c[0,3] and c[1,3]
- c6 = _daddsp(_ftof2(_hif2(sum3),_lof2(sum2)),c6);
- //update c[2,3] and c[3,3]
- c7 = _daddsp(_ftof2(_hif2(sumb),_lof2(suma)),c7);
- //update c[0,4] and c[1,4]
-
- ptrC = (__float2_t *) c;
- *ptrC++ = c0;
- *ptrC-- = c1;
- ptrC += (cs_c>>1);
- *ptrC++ = c2;
- *ptrC-- = c3;
- ptrC += (cs_c>>1);
- *ptrC++ = c4;
- *ptrC-- = c5;
- ptrC += (cs_c>>1);
- *ptrC++ = c6;
- *ptrC-- = c7;
-
- ptrC = (__float2_t *) (c+(cs_c<<2));
- c8 = *ptrC++;
- c9 = *ptrC--;
- ptrC += (cs_c>>1);
- ca = *ptrC++;
- cb = *ptrC--;
- ptrC += (cs_c>>1);
- cc = *ptrC++;
- cd = *ptrC--;
- ptrC += (cs_c>>1);
- ce = *ptrC++;
- cf = *ptrC;
-
- sum4 = _dmpysp(regA2, sum4);
- sum5 = _dmpysp(regA2, sum5);
- sum6 = _dmpysp(regA2, sum6);
- sum7 = _dmpysp(regA2, sum7);
- sumc = _dmpysp(regA2, sumc);
- sumd = _dmpysp(regA2, sumd);
- sume = _dmpysp(regA2, sume);
- sumf = _dmpysp(regA2, sumf);
-
- c8 = _dmpysp(c8, regB2);
- c9 = _dmpysp(c9, regB2);
- ca = _dmpysp(ca, regB2);
- cb = _dmpysp(cb, regB2);
- cc = _dmpysp(cc, regB2);
- cd = _dmpysp(cd, regB2);
- ce = _dmpysp(ce, regB2);
- cf = _dmpysp(cf, regB2);
-
- c8 = _daddsp(_ftof2(_lof2(sum5),-_hif2(sum4)),c8);
- //update c[2,4] and c[3,4]
- c9 = _daddsp(_ftof2(_lof2(sumd),-_hif2(sumc)),c9);
- //update c[0,5] and c[1,5]
- ca = _daddsp(_ftof2(_hif2(sum5),_lof2(sum4)),ca);
- //update c[2,5] and c[3,5]
- cb = _daddsp(_ftof2(_hif2(sumd),_lof2(sumc)),cb);
- //update c[0,6] and c[1,6]
- cc = _daddsp(_ftof2(_lof2(sum7),-_hif2(sum6)),cc);
- //update c[2,6] and c[3,6]
- cd = _daddsp(_ftof2(_lof2(sumf),-_hif2(sume)),cd);
- //update c[0,7] and c[1,7]
- ce = _daddsp(_ftof2(_hif2(sum7),_lof2(sum6)),ce);
- //update c[2,7] and c[3,7]
- cf = _daddsp(_ftof2(_hif2(sumf),_lof2(sume)),cf);
-
- ptrC = (__float2_t *) (c+(cs_c<<2));
- *ptrC++ = c8;
- *ptrC-- = c9;
- ptrC += (cs_c>>1);
- *ptrC++ = ca;
- *ptrC-- = cb;
- ptrC += (cs_c>>1);
- *ptrC++ = cc;
- *ptrC-- = cd;
- ptrC += (cs_c>>1);
- *ptrC++ = ce;
- *ptrC = cf;*/
-
- __float2_t c0, c1, c2, c3, c4, c5, c6, c7;
- __float2_t c8, c9, ca, cb, cc, cd, ce, cf;
-
- ptrC = (__float2_t *) c;
- c0 = *ptrC++;
- c1 = *ptrC--;
- ptrC += (cs_c>>1);
- c2 = *ptrC++;
- c3 = *ptrC--;
- ptrC += (cs_c>>1);
- c4 = *ptrC++;
- c5 = *ptrC--;
- ptrC += (cs_c>>1);
- c6 = *ptrC++;
- c7 = *ptrC--;
- ptrC += (cs_c>>1);
- c8 = *ptrC++;
- c9 = *ptrC--;
- ptrC += (cs_c>>1);
- ca = *ptrC++;
- cb = *ptrC--;
- ptrC += (cs_c>>1);
- cc = *ptrC++;
- cd = *ptrC--;
- ptrC += (cs_c>>1);
- ce = *ptrC++;
- cf = *ptrC;
-
- sum0 = _dmpysp(regA2, sum0);
- sum1 = _dmpysp(regA2, sum1);
- sum2 = _dmpysp(regA2, sum2);
- sum3 = _dmpysp(regA2, sum3);
- sum4 = _dmpysp(regA2, sum4);
- sum5 = _dmpysp(regA2, sum5);
- sum6 = _dmpysp(regA2, sum6);
- sum7 = _dmpysp(regA2, sum7);
- sum8 = _dmpysp(regA2, sum8);
- sum9 = _dmpysp(regA2, sum9);
- suma = _dmpysp(regA2, suma);
- sumb = _dmpysp(regA2, sumb);
- sumc = _dmpysp(regA2, sumc);
- sumd = _dmpysp(regA2, sumd);
- sume = _dmpysp(regA2, sume);
- sumf = _dmpysp(regA2, sumf);
-
- c0 = _dmpysp(c0, regB2);
- c1 = _dmpysp(c1, regB2);
- c2 = _dmpysp(c2, regB2);
- c3 = _dmpysp(c3, regB2);
- c4 = _dmpysp(c4, regB2);
- c5 = _dmpysp(c5, regB2);
- c6 = _dmpysp(c6, regB2);
- c7 = _dmpysp(c7, regB2);
- c8 = _dmpysp(c8, regB2);
- c9 = _dmpysp(c9, regB2);
- ca = _dmpysp(ca, regB2);
- cb = _dmpysp(cb, regB2);
- cc = _dmpysp(cc, regB2);
- cd = _dmpysp(cd, regB2);
- ce = _dmpysp(ce, regB2);
- cf = _dmpysp(cf, regB2);
-
- // update c[0,0] and c[1,0]
- c0 = _daddsp(_ftof2(_lof2(sum1),-_hif2(sum0)),c0);
- // update c[2,0] and c[3,0]
- c1 = _daddsp(_ftof2(_lof2(sum9),-_hif2(sum8)),c1);
- //update c[0,1] and c[1,1]
- c2 = _daddsp(_ftof2(_hif2(sum1),_lof2(sum0)),c2);
- // update c[2,1] and c[3,1]
- c3 = _daddsp(_ftof2(_hif2(sum9),_lof2(sum8)),c3);
- //update c[0,2] and c[1,2]
- c4 = _daddsp(_ftof2(_lof2(sum3),-_hif2(sum2)),c4);
- //update c[2,2] and c[3,2]
- c5 = _daddsp(_ftof2(_lof2(sumb),-_hif2(suma)),c5);
- //update c[0,3] and c[1,3]
- c6 = _daddsp(_ftof2(_hif2(sum3),_lof2(sum2)),c6);
- //update c[2,3] and c[3,3]
- c7 = _daddsp(_ftof2(_hif2(sumb),_lof2(suma)),c7);
- //update c[0,4] and c[1,4]
- c8 = _daddsp(_ftof2(_lof2(sum5),-_hif2(sum4)),c8);
- //update c[2,4] and c[3,4]
- c9 = _daddsp(_ftof2(_lof2(sumd),-_hif2(sumc)),c9);
- //update c[0,5] and c[1,5]
- ca = _daddsp(_ftof2(_hif2(sum5),_lof2(sum4)),ca);
- //update c[2,5] and c[3,5]
- cb = _daddsp(_ftof2(_hif2(sumd),_lof2(sumc)),cb);
- //update c[0,6] and c[1,6]
- cc = _daddsp(_ftof2(_lof2(sum7),-_hif2(sum6)),cc);
- //update c[2,6] and c[3,6]
- cd = _daddsp(_ftof2(_lof2(sumf),-_hif2(sume)),cd);
- //update c[0,7] and c[1,7]
- ce = _daddsp(_ftof2(_hif2(sum7),_lof2(sum6)),ce);
- //update c[2,7] and c[3,7]
- cf = _daddsp(_ftof2(_hif2(sumf),_lof2(sume)),cf);
-
- ptrC = (__float2_t *) c;
- *ptrC++ = c0;
- *ptrC-- = c1;
- ptrC += (cs_c>>1);
- *ptrC++ = c2;
- *ptrC-- = c3;
- ptrC += (cs_c>>1);
- *ptrC++ = c4;
- *ptrC-- = c5;
- ptrC += (cs_c>>1);
- *ptrC++ = c6;
- *ptrC-- = c7;
- ptrC += (cs_c>>1);
- *ptrC++ = c8;
- *ptrC-- = c9;
- ptrC += (cs_c>>1);
- *ptrC++ = ca;
- *ptrC-- = cb;
- ptrC += (cs_c>>1);
- *ptrC++ = cc;
- *ptrC-- = cd;
- ptrC += (cs_c>>1);
- *ptrC++ = ce;
- *ptrC = cf;
-
-
-#endif
- }
-#ifdef BLIS_ENABLE_CYCLE_COUNT
- counter_end=TSCL;
- if (lib_get_coreID () == 0)
- printf("%d\n", counter_end-counter_start);
-#endif
-
-}
-
-void bli_sgemm_ukernel_4x4(
- dim_t k,
- float* restrict alpha,
- float* restrict a,
- float* restrict b,
- float* restrict beta,
- float* restrict c, inc_t rs_c, inc_t cs_c,
- auxinfo_t* data
- )
-{
- __float2_t sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
- __float2_t sum8, sum9, suma, sumb, sumc, sumd, sume, sumf;
- __float2_t * restrict ptrB = (__float2_t *) b;
- __float2_t * restrict ptrA = (__float2_t *) a;
- __float2_t * restrict ptrC;
- __float2_t regA2, regB2, regC, regS, regR;
- float* restrict c0, * restrict c1;
- int_least16_t index;
- int kEven, kLeft;
-
-#ifdef BLIS_ENABLE_CYCLE_COUNT
- volatile int counter_start;
- volatile int counter_end;
-#endif
-
-#ifdef BLIS_ENABLE_CYCLE_COUNT
- TSCL=0;
- counter_start = TSCL;
-#endif
-
- // zero out accumulators
- sum0 = 0.0;
- sum1 = 0.0;
- sum2 = 0.0;
- sum3 = 0.0;
- sum4 = 0.0;
- sum5 = 0.0;
- sum6 = 0.0;
- sum7 = 0.0;
- sum8 = 0.0;
- sum9 = 0.0;
- suma = 0.0;
- sumb = 0.0;
- sumc = 0.0;
- sumd = 0.0;
- sume = 0.0;
- sumf = 0.0;
-
- kEven=k>>1;
- kLeft=k&1;
- //TSCL = 0;
- //cycles = TSCL;
-
-
- for (index = 0; index < kEven; index++)
- { // loop over k;
- // each iteration performs rank one update of 4x1 by 1x4
- // matrices of A and B respectively; result is
- // accumulated over 4x4 matrix
- __float2_t b01, b23, a01, a23;
- __x128_t reg128;
-
- // for even k
- a01 = *ptrA++;
- a23 = *ptrA++;
-
- b01 = *ptrB++;
- b23 = *ptrB++;
-
- reg128 = _cmpysp(a01, b01);
- // accumulate a[0]*b[1] and -a[0]*b[0]
- sum0 = _daddsp(sum0, _lof2_128(reg128));
- // accumulate a[1]*b[0] and a[1]*b[1]
- sum1 = _daddsp(sum1, _hif2_128(reg128));
-
- reg128 = _cmpysp(a01, b23);
- // accumulate a[0]*b[3] and -a[0]*b[2]
- sum2 = _daddsp(sum2, _lof2_128(reg128));
- // accumulate a[1]*b[2] and a[1]*b[3]
- sum3 = _daddsp(sum3, _hif2_128(reg128));
-
- reg128 = _cmpysp(a23, b01);
- // accumulate a[2]*b[1] and -a[2]*b[0]
- sum4 = _daddsp(sum4, _lof2_128(reg128));
- // accumulate a[3]*b[0] and a[3]*b[1]
- sum5 = _daddsp(sum5, _hif2_128(reg128));
-
- reg128 = _cmpysp(a23, b23);
- // accumulate a[2]*b[3] and -a[2]*b[2]
- sum6 = _daddsp(sum6, _lof2_128(reg128));
- // accumulate a[3]*b[2] and a[3]*b[3]
- sum7 = _daddsp(sum7, _hif2_128(reg128));
-
-
-
- // for odd k
- a01 = *ptrA++;
- a23 = *ptrA++;
-
- b01 = *ptrB++;
- b23 = *ptrB++;
-
- reg128 = _cmpysp(a01, b01);
- // accumulate a[0]*b[1] and -a[0]*b[0]
- sum8 = _daddsp(sum8, _lof2_128(reg128));
- // accumulate a[1]*b[0] and a[1]*b[1]
- sum9 = _daddsp(sum9, _hif2_128(reg128));
-
- reg128 = _cmpysp(a01, b23);
- // accumulate a[0]*b[3] and -a[0]*b[2]
- suma = _daddsp(suma, _lof2_128(reg128));
- // accumulate a[1]*b[2] and a[1]*b[3]
- sumb = _daddsp(sumb, _hif2_128(reg128));
-
- reg128 = _cmpysp(a23, b01);
- // accumulate a[2]*b[1] and -a[2]*b[0]
- sumc = _daddsp(sumc, _lof2_128(reg128));
- // accumulate a[3]*b[0] and a[3]*b[1]
- sumd = _daddsp(sumd, _hif2_128(reg128));
-
- reg128 = _cmpysp(a23, b23);
- // accumulate a[2]*b[3] and -a[2]*b[2]
- sume = _daddsp(sume, _lof2_128(reg128));
- // accumulate a[3]*b[2] and a[3]*b[3]
- sumf = _daddsp(sumf, _hif2_128(reg128));
-
- }
- if(kLeft)
- { // last k if left;
- __float2_t b01, b23, a01, a23;
- __x128_t reg128;
-
- a01 = *ptrA++;
- a23 = *ptrA++;
-
- b01 = *ptrB++;
- b23 = *ptrB++;
-
- reg128 = _cmpysp(a01, b01);
- // accumulate a[0]*b[1] and -a[0]*b[0]
- sum0 = _daddsp(sum0, _lof2_128(reg128));
- // accumulate a[1]*b[0] and a[1]*b[1]
- sum1 = _daddsp(sum1, _hif2_128(reg128));
-
- reg128 = _cmpysp(a01, b23);
- // accumulate a[0]*b[3] and -a[0]*b[2]
- sum2 = _daddsp(sum2, _lof2_128(reg128));
- // accumulate a[1]*b[2] and a[1]*b[3]
- sum3 = _daddsp(sum3, _hif2_128(reg128));
-
- reg128 = _cmpysp(a23, b01);
- // accumulate a[2]*b[1] and -a[2]*b[0]
- sum4 = _daddsp(sum4, _lof2_128(reg128));
- // accumulate a[3]*b[0] and a[3]*b[1]
- sum5 = _daddsp(sum5, _hif2_128(reg128));
-
- reg128 = _cmpysp(a23, b23);
- // accumulate a[2]*b[3] and -a[2]*b[2]
- sum6 = _daddsp(sum6, _lof2_128(reg128));
- // accumulate a[3]*b[2] and a[3]*b[3]
- sum7 = _daddsp(sum7, _hif2_128(reg128));
-
- }
-#ifdef BLIS_ENABLE_CYCLE_COUNT
- counter_end=TSCL;
- if (lib_get_coreID () == 0)
- printf("%d\t%d\t",k, counter_end-counter_start);
-#endif
-
-#ifdef BLIS_ENABLE_CYCLE_COUNT
- TSCL=0;
- counter_start = TSCL;
-#endif
- sum0 = _daddsp(sum0, sum8);
- sum1 = _daddsp(sum1, sum9);
- sum2 = _daddsp(sum2, suma);
- sum3 = _daddsp(sum3, sumb);
- sum4 = _daddsp(sum4, sumc);
- sum5 = _daddsp(sum5, sumd);
- sum6 = _daddsp(sum6, sume);
- sum7 = _daddsp(sum7, sumf);
-
-
- regA2 = _ftof2(*alpha, *alpha);
- regB2 = _ftof2(*beta, *beta);
- if (rs_c != 1)
- {
- // update c[0,0] and c[1,0]
- c0 = (c + 0*rs_c + 0*cs_c);
- c1 = (c + 1*rs_c + 0*cs_c);
- regC = _ftof2(*c1,*c0);
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_lof2(sum1),-_hif2(sum0));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *c0 = _lof2(regR);
- *c1 = _hif2(regR);
-
- //update c[0,1] and c[1,1]
- c0 = c0 + 1*cs_c;
- c1 = c1 + 1*cs_c;
- regC = _ftof2(*c1,*c0);
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_hif2(sum1),_lof2(sum0));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *c0 = _lof2(regR);
- *c1 = _hif2(regR);
-
- //update c[0,2] and c[1,2]
- c0 = c0 + 1*cs_c;
- c1 = c1 + 1*cs_c;
- regC = _ftof2(*c1,*c0);
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_lof2(sum3),-_hif2(sum2));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *c0 = _lof2(regR);
- *c1 = _hif2(regR);
-
- //update c[0,3] and c[1,3]
- c0 = c0 + 1*cs_c;
- c1 = c1 + 1*cs_c;
- regC = _ftof2(*c1,*c0);
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_hif2(sum3),_lof2(sum2));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *c0 = _lof2(regR);
- *c1 = _hif2(regR);
-
- // update c[2,0] and c[3,0]
- c0 = (c + 2*rs_c + 0*cs_c);
- c1 = (c + 3*rs_c + 0*cs_c);
- regC = _ftof2(*c1,*c0);
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_lof2(sum5),-_hif2(sum4));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *c0 = _lof2(regR);
- *c1 = _hif2(regR);
-
- // update c[2,1] and c[3,1]
- c0 = c0 + 1*cs_c;
- c1 = c1 + 1*cs_c;
- regC = _ftof2(*c1,*c0);
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_hif2(sum5),_lof2(sum4));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *c0 = _lof2(regR);
- *c1 = _hif2(regR);
-
- //update c[2,2] and c[3,2]
- c0 = c0 + 1*cs_c;
- c1 = c1 + 1*cs_c;
- regC = _ftof2(*c1,*c0);
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_lof2(sum7),-_hif2(sum6));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *c0 = _lof2(regR);
- *c1 = _hif2(regR);
-
- //update c[2,3] and c[3,3]
- c0 = c0 + 1*cs_c;
- c1 = c1 + 1*cs_c;
- regC = _ftof2(*c1,*c0);
- regC = _dmpysp(regC, regB2);
- regS = _ftof2(_hif2(sum7),_lof2(sum6));
- regR = _daddsp(_dmpysp(regA2, regS), regC);
- *c0 = _lof2(regR);
- *c1 = _hif2(regR);
-
- }
- else
- {
- __float2_t c0, c1, c2, c3, c4, c5, c6, c7;
-
- ptrC = (__float2_t *) c;
- c0 = *ptrC++;
- c1 = *ptrC--;
- ptrC += (cs_c>>1);
- c2 = *ptrC++;
- c3 = *ptrC--;
- ptrC += (cs_c>>1);
- c4 = *ptrC++;
- c5 = *ptrC--;
- ptrC += (cs_c>>1);
- c6 = *ptrC++;
- c7 = *ptrC--;
-
- sum0 = _dmpysp(regA2, sum0);
- sum1 = _dmpysp(regA2, sum1);
- sum2 = _dmpysp(regA2, sum2);
- sum3 = _dmpysp(regA2, sum3);
- sum4 = _dmpysp(regA2, sum4);
- sum5 = _dmpysp(regA2, sum5);
- sum6 = _dmpysp(regA2, sum6);
- sum7 = _dmpysp(regA2, sum7);
-
- c0 = _dmpysp(c0, regB2);
- c1 = _dmpysp(c1, regB2);
- c2 = _dmpysp(c2, regB2);
- c3 = _dmpysp(c3, regB2);
- c4 = _dmpysp(c4, regB2);
- c5 = _dmpysp(c5, regB2);
- c6 = _dmpysp(c6, regB2);
- c7 = _dmpysp(c7, regB2);
-
- // update c[0,0] and c[1,0]
- c0 = _daddsp(_ftof2(_lof2(sum1),-_hif2(sum0)),c0);
- // update c[2,0] and c[3,0]
- c1 = _daddsp(_ftof2(_lof2(sum5),-_hif2(sum4)),c1);
- //update c[0,1] and c[1,1]
- c2 = _daddsp(_ftof2(_hif2(sum1),_lof2(sum0)),c2);
- // update c[2,1] and c[3,1]
- c3 = _daddsp(_ftof2(_hif2(sum5),_lof2(sum4)),c3);
- //update c[0,2] and c[1,2]
- c4 = _daddsp(_ftof2(_lof2(sum3),-_hif2(sum2)),c4);
- //update c[2,2] and c[3,2]
- c5 = _daddsp(_ftof2(_lof2(sum7),-_hif2(sum6)),c5);
- //update c[0,3] and c[1,3]
- c6 = _daddsp(_ftof2(_hif2(sum3),_lof2(sum2)),c6);
- //update c[2,3] and c[3,3]
- c7 = _daddsp(_ftof2(_hif2(sum7),_lof2(sum6)),c7);
-
- ptrC = (__float2_t *) c;
- *ptrC++ = c0;
- *ptrC-- = c1;
- ptrC += (cs_c>>1);
- *ptrC++ = c2;
- *ptrC-- = c3;
- ptrC += (cs_c>>1);
- *ptrC++ = c4;
- *ptrC-- = c5;
- ptrC += (cs_c>>1);
- *ptrC++ = c6;
- *ptrC-- = c7;
-
- }
-#ifdef BLIS_ENABLE_CYCLE_COUNT
- counter_end=TSCL;
- if (lib_get_coreID () == 0)
- printf("%d\n", counter_end-counter_start);
-#endif
-}
-
-
-//void dgemmKernel(const double *pA, const double *pB, double *pC, const double a, const int k, const int stepC)
-void bli_dgemm_ukernel_4x4(
- dim_t k,
- double* restrict alpha,
- double* restrict a,
- double* restrict b,
- double* restrict beta,
- double* restrict c, inc_t rs_c, inc_t cs_c,
- auxinfo_t* data
- )
-{
- double sum00, sum01, sum02, sum03;
- double sum10, sum11, sum12, sum13;
- double sum20, sum21, sum22, sum23;
- double sum30, sum31, sum32, sum33;
- int index;
- double al = *alpha;
- double be = *beta;
-#ifdef BLIS_ENABLE_CYCLE_COUNT
- volatile int counter_start;
- volatile int counter_end;
-#endif
-
-
-#ifdef BLIS_ENABLE_CYCLE_COUNT
- TSCL = 0;
- counter_start = TSCL;
-#endif
- //touch routine: both a & b
- //Length of b = NR*K*size of double;
- //Length of a = MR*K*size of double;
-#ifdef BLIS_ENABLE_PREFETCH
- //touch(b, k*BLIS_DEFAULT_NR_D*8);
- //touch(a, k*BLIS_DEFAULT_MR_D*8);
-#endif
-
-
-
-
- sum00 = 0.0;
- sum01 = 0.0;
- sum02 = 0.0;
- sum03 = 0.0;
- sum10 = 0.0;
- sum11 = 0.0;
- sum12 = 0.0;
- sum13 = 0.0;
- sum20 = 0.0;
- sum21 = 0.0;
- sum22 = 0.0;
- sum23 = 0.0;
- sum30 = 0.0;
- sum31 = 0.0;
- sum32 = 0.0;
- sum33 = 0.0;
-
-
- for(index = 0; index < k; index++)
- { // loop over k;
- // each iteration performs rank one update of 4x1 by 1x4
- // matrices of A and B respectively; result is
- // accumulated over 4x4 matrix
- register double a0, a1, a2, a3;
- register double b0, b1, b2, b3;
-
- a0 = *a++;
- a1 = *a++;
- a2 = *a++;
- a3 = *a++;
- b0 = *b++;
- b1 = *b++;
- b2 = *b++;
- b3 = *b++;
-
- // a[0]*b[0]
- sum00 += a0*b0;
- // a[0]*b[1]
- sum01 += a0*b1;
- // a[0]*b[2]
- sum02 += a0*b2;
- // a[0]*b[3]
- sum03 += a0*b3;
- // a[1]*b[0]
- sum10 += a1*b0;
- // a[1]*b[1]
- sum11 += a1*b1;
- // a[1]*b[2]
- sum12 += a1*b2;
- // a[1]*b[3]
- sum13 += a1*b3;
- // a[2]*b[0]
- sum20 += a2*b0;
- // a[2]*b[1]
- sum21 += a2*b1;
- // a[2]*b[2]
- sum22 += a2*b2;
- // a[2]*b[3]
- sum23 += a2*b3;
- // a[3]*b[0]
- sum30 += a3*b0;
- // a[3]*b[1]
- sum31 += a3*b1;
- // a[3]*b[2]
- sum32 += a3*b2;
- // a[3]*b[3]
- sum33 += a3*b3;
- }
-#ifdef BLIS_ENABLE_CYCLE_COUNT
- counter_end=TSCL;
- if (lib_get_coreID () == 0)
- printf("%d\t%d\t",k, counter_end-counter_start);
-#endif
-#ifdef BLIS_ENABLE_CYCLE_COUNT
- TSCL = 0;
- counter_start = TSCL;
-#endif
-
- double* restrict cptr;
- // 0th Column
- // updating C[00]
- cptr = c;
- *cptr = *cptr * be;
- *cptr += sum00 * al;
-
- // updating C[10]
- cptr += rs_c;
- *cptr = *cptr*be;
- *cptr += sum10 * al;
-
- // updating C[20]
- cptr += rs_c;
- *cptr = *cptr*be;
- *cptr += sum20 * al;
-
- // updating C[30]
- cptr += rs_c;
- *cptr = *cptr*be;
- *cptr += sum30 * al;
-
- // 1st column
- // updating C[01]
- cptr = c + cs_c;
- *cptr = *cptr*be;
- *cptr += sum01 * al;
-
- // updating C[11]
- cptr += rs_c;
- *cptr = *cptr*be;
- *cptr += sum11 * al;
-
- // updating C[21]
- cptr += rs_c;
- *cptr = *cptr*be;
- *cptr += sum21 * al;
-
- // updating C[31]
- cptr += rs_c;
- *cptr = *cptr*be;
- *cptr += sum31 * al;
-
- // 2nd Column
- // updating C[02]
- cptr = c + 2*cs_c;
- *cptr = *cptr*be;
- *cptr += sum02 * al;
-
- // updating C[12]
- cptr += rs_c;
- *cptr = *cptr*be;
- *cptr += sum12 * al;
-
- // updating C[22]
- cptr += rs_c;
- *cptr = *cptr*be;
- *cptr += sum22 * al;
-
- // updating C[32]
- cptr += rs_c;
- *cptr = *cptr*be;
- *cptr += sum32 * al;
-
- // 3rd Column
- // updating C[03]
- cptr = c + 3*cs_c;
- *cptr = *cptr*be;
- *cptr += sum03 * al;
-
- // updating C[13]
- cptr += rs_c;
- *cptr = *cptr*be;
- *cptr += sum13 * al;
-
- // updating C[23]
- cptr += rs_c;
- *cptr = *cptr*be;
- *cptr += sum23 * al;
-
- // updating C[33]
- cptr += rs_c;
- *cptr = *cptr*be;
- *cptr += sum33 * al;
-#ifdef BLIS_ENABLE_CYCLE_COUNT
- counter_end=TSCL;
- if (lib_get_coreID () == 0)
- printf("%d\n", counter_end-counter_start);
-#endif
- return;
-}
-
-void bli_cgemm_ukernel_2x4(
- dim_t k,
- scomplex* restrict alpha,
- scomplex* restrict a,
- scomplex* restrict b,
- scomplex* restrict beta,
- scomplex* restrict c, inc_t rs_c, inc_t cs_c,
- auxinfo_t* data
- )
-{
- __float2_t sum00a, sum10a, sum00b, sum10b;
- __float2_t sum01a, sum11a, sum01b, sum11b;
- __float2_t sum02a, sum12a, sum02b, sum12b;
- __float2_t sum03a, sum13a, sum03b, sum13b;
- __float2_t * restrict ptrB = (__float2_t *) b;
- __float2_t * restrict ptrA = (__float2_t *) a;
- __float2_t * restrict ptrC;
- __float2_t regA, regB, regC;
- int_least16_t index;
-
-#ifdef BLIS_ENABLE_CYCLE_COUNT
- volatile int counter_start;
- volatile int counter_end;
-#endif
-
- // zero out accumulators
- sum00a = 0.0;
- sum10a = 0.0;
- sum01a = 0.0;
- sum11a = 0.0;
- sum02a = 0.0;
- sum12a = 0.0;
- sum03a = 0.0;
- sum13a = 0.0;
- sum00b = 0.0;
- sum10b = 0.0;
- sum01b = 0.0;
- sum11b = 0.0;
- sum02b = 0.0;
- sum12b = 0.0;
- sum03b = 0.0;
- sum13b = 0.0;
-
-#ifdef BLIS_ENABLE_CYCLE_COUNT
- TSCL=0;
- counter_start = TSCL;
-#endif
-
- for (index = 0; index < k; index++)
- { // loop over k;
- // each iteration performs rank one update of 2x1 by 1x4
- // matrices of A and B respectively; result is
- // accumulated over 2x4 matrix
- __float2_t b0, b1, b2, b3, a0, a1;
- __x128_t reg128;
-
- a0 = *ptrA++;
- a1 = *ptrA++;
-
- b0 = *ptrB++;
- b1 = *ptrB++;
- b2 = *ptrB++;
- b3 = *ptrB++;
-
- // the four partial sums are accumulated independently
- // a[0]*b[0]
- reg128 = _cmpysp(a0, b0);
- sum00a = _daddsp(sum00a, _lof2_128(reg128));
- sum00b = _daddsp(sum00b, _hif2_128(reg128));
-
- // a[1]*b[0]
- reg128 = _cmpysp(a1, b0);
- sum10a = _daddsp(sum10a, _lof2_128(reg128));
- sum10b = _daddsp(sum10b, _hif2_128(reg128));
-
- // a[0]*b[1]
- reg128 = _cmpysp(a0, b1);
- sum01a = _daddsp(sum01a, _lof2_128(reg128));
- sum01b = _daddsp(sum01b, _hif2_128(reg128));
-
- // a[1]*b[1]
- reg128 = _cmpysp(a1, b1);
- sum11a = _daddsp(sum11a, _lof2_128(reg128));
- sum11b = _daddsp(sum11b, _hif2_128(reg128));
-
- // a[0]*b[2]
- reg128 = _cmpysp(a0, b2);
- sum02a = _daddsp(sum02a, _lof2_128(reg128));
- sum02b = _daddsp(sum02b, _hif2_128(reg128));
-
- // a[1]*b[2]
- reg128 = _cmpysp(a1, b2);
- sum12a = _daddsp(sum12a, _lof2_128(reg128));
- sum12b = _daddsp(sum12b, _hif2_128(reg128));
-
- // a[0]*b[3]
- reg128 = _cmpysp(a0, b3);
- sum03a = _daddsp(sum03a, _lof2_128(reg128));
- sum03b = _daddsp(sum03b, _hif2_128(reg128));
-
- // a[1]*b[3]
- reg128 = _cmpysp(a1, b3);
- sum13a = _daddsp(sum13a, _lof2_128(reg128));
- sum13b = _daddsp(sum13b, _hif2_128(reg128));
- }
-
-#ifdef BLIS_ENABLE_CYCLE_COUNT
- counter_end=TSCL;
- if (lib_get_coreID () == 0)
- printf("%d\t%d\t",k, counter_end-counter_start);
-#endif
-
-#ifdef BLIS_ENABLE_CYCLE_COUNT
- TSCL=0;
- counter_start = TSCL;
-#endif
- {
- __x128_t reg128;
- ptrA = (__float2_t *) alpha;
- ptrB = (__float2_t *) beta;
- regA = *ptrA;
- regB = *ptrB;
-
- // the value of a and the final values need to be
- // rearranged due to the specific way cmpysp assumes
- // data arrangement
- regA =_ftof2(-_lof(regA), _hif(regA));
- //regB = _ftof2(_lof(regB),_hif(regB));
- ptrC = (__float2_t *) c;
-
- // update and save c[0,0]
- sum00a = _daddsp(sum00a, sum00b);
- reg128 = _cmpysp(regA, sum00a);
- sum00a = _daddsp(_hif2_128(reg128),_lof2_128(reg128));
- regC = *ptrC;
- //regC = _ftof2(_lof(regC), _hif(regC));
- reg128 = _cmpysp(regC,regB);
- regC = _daddsp(_lof2_128(reg128),_hif2_128(reg128));
- *ptrC = _daddsp(_ftof2(-_lof(sum00a),_hif(sum00a)),_ftof2(_lof(regC),-_hif(regC)));
-
- ptrC = (__float2_t *) c + rs_c;
-
- // update and save c[1,0]
- sum10a = _daddsp(sum10a, sum10b);
- reg128 = _cmpysp(regA, sum10a);
- sum10a = _daddsp(_hif2_128(reg128),_lof2_128(reg128));
- regC = *ptrC;
- //regC = _ftof2(_lof(regC), _hif(regC));
- reg128 = _cmpysp(regC,regB);
- regC = _daddsp(_lof2_128(reg128),_hif2_128(reg128));
- *ptrC = _daddsp(_ftof2(-_lof(sum10a),_hif(sum10a)),_ftof2(_lof(regC),-_hif(regC)));
-
-
- ptrC = (__float2_t *) c + cs_c;
-
- // update and save c[0,1]
- sum01a = _daddsp(sum01a, sum01b);
- reg128 = _cmpysp(regA, sum01a);
- sum01a = _daddsp(_hif2_128(reg128),_lof2_128(reg128));
- regC = *ptrC;
- //regC = _ftof2(_lof(regC), _hif(regC));
- reg128 = _cmpysp(regC,regB);
- regC = _daddsp(_lof2_128(reg128),_hif2_128(reg128));
- *ptrC = _daddsp(_ftof2(-_lof(sum01a),_hif(sum01a)),_ftof2(_lof(regC),-_hif(regC)));
-
- ptrC = (__float2_t *) c + rs_c + cs_c;
-
- // update and save c[1,1]
- sum11a = _daddsp(sum11a, sum11b);
- reg128 = _cmpysp(regA, sum11a);
- sum11a = _daddsp(_hif2_128(reg128),_lof2_128(reg128));
- regC = *ptrC;
- //regC = _ftof2(_lof(regC), _hif(regC));
- reg128 = _cmpysp(regC,regB);
- regC = _daddsp(_lof2_128(reg128),_hif2_128(reg128));
- *ptrC = _daddsp(_ftof2(-_lof(sum11a),_hif(sum11a)),_ftof2(_lof(regC),-_hif(regC)));
-
- ptrC = (__float2_t *) c + 2 * cs_c;
-
- // update and save c[0,2]
- sum02a = _daddsp(sum02a, sum02b);
- reg128 = _cmpysp(regA, sum02a);
- sum02a = _daddsp(_hif2_128(reg128),_lof2_128(reg128));
- regC = *ptrC;
- //regC = _ftof2(_lof(regC), _hif(regC));
- reg128 = _cmpysp(regC,regB);
- regC = _daddsp(_lof2_128(reg128),_hif2_128(reg128));
- *ptrC = _daddsp(_ftof2(-_lof(sum02a),_hif(sum02a)),_ftof2(_lof(regC),-_hif(regC)));
-
- ptrC = (__float2_t *) c + rs_c + 2* cs_c;
-
- // update and save c[1,2]
- sum12a = _daddsp(sum12a, sum12b);
- reg128 = _cmpysp(regA, sum12a);
- sum12a = _daddsp(_hif2_128(reg128),_lof2_128(reg128));
- regC = *ptrC;
- //regC = _ftof2(_lof(regC), _hif(regC));
- reg128 = _cmpysp(regC,regB);
- regC = _daddsp(_lof2_128(reg128),_hif2_128(reg128));
- *ptrC = _daddsp(_ftof2(-_lof(sum12a),_hif(sum12a)),_ftof2(_lof(regC),-_hif(regC)));
-
- ptrC = (__float2_t *) c + 3 * cs_c;
-
- // update and save c[0,3]
- sum03a = _daddsp(sum03a, sum03b);
- reg128 = _cmpysp(regA, sum03a);
- sum03a = _daddsp(_hif2_128(reg128),_lof2_128(reg128));
- regC = *ptrC;
- //regC = _ftof2(_lof(regC), _hif(regC));
- reg128 = _cmpysp(regC,regB);
- regC = _daddsp(_lof2_128(reg128),_hif2_128(reg128));
- *ptrC = _daddsp(_ftof2(-_lof(sum03a),_hif(sum03a)),_ftof2(_lof(regC),-_hif(regC)));
-
- ptrC = (__float2_t *) c + rs_c + 3 * cs_c;
-
- // update and save c[1,3]
- sum13a = _daddsp(sum13a, sum13b);
- reg128 = _cmpysp(regA, sum13a);
- sum13a = _daddsp(_hif2_128(reg128),_lof2_128(reg128));
- regC = *ptrC;
- //regC = _ftof2(_lof(regC), _hif(regC));
- reg128 = _cmpysp(regC,regB);
- regC = _daddsp(_lof2_128(reg128),_hif2_128(reg128));
- *ptrC = _daddsp(_ftof2(-_lof(sum13a),_hif(sum13a)),_ftof2(_lof(regC),-_hif(regC)));
-
- }
-
-#ifdef BLIS_ENABLE_CYCLE_COUNT
- counter_end=TSCL;
- if (lib_get_coreID () == 0)
- printf("%d\n", counter_end-counter_start);
-#endif
- return;
-}
-
-/*
- * Double-precision complex GEMM micro-kernel:
- *     c[0,0] := beta * c[0,0] + alpha * (sum over p in [0,k) of a[p] * b[p])
- * a and b are each read as k consecutive (real, imag) pairs of doubles.
- *
- * NOTE(review): despite the "2x2" in the name, only the single element that
- * c points at is read and written; rs_c, cs_c and data are never used.
- * Confirm the intended tile size against the kernel configuration.
- */
-void bli_zgemm_ukernel_2x2(
- dim_t k,
- dcomplex* restrict alpha,
- dcomplex* restrict a,
- dcomplex* restrict b,
- dcomplex* restrict beta,
- dcomplex* restrict c, inc_t rs_c, inc_t cs_c,
- auxinfo_t* data
- )
-{
-    double * restrict ptrA = (double *) a;
-    double * restrict ptrB = (double *) b;
-    double sum00r = 0.0;
-    double sum00i = 0.0;
-    dim_t index;
-    // Even-length bulk for the unrolled loop. As before it is only used when
-    // k > 4; shorter products go through the remainder loop alone.
-    // The count is kept in dim_t: the previous "k & 0xFFFE" mask silently
-    // dropped every bit of k above bit 15.
-    dim_t kBulk = (k > 4) ? (k & ~(dim_t) 1) : 0;
-#ifdef BLIS_ENABLE_CYCLE_COUNT
-    volatile int counter_start;
-    volatile int counter_end;
-#endif
-
-#ifdef BLIS_ENABLE_CYCLE_COUNT
-    TSCL = 0;
-    counter_start = TSCL;
-#endif
-
-#pragma UNROLL(2)
-    for (index = 0; index < kBulk; index++)
-    {   // one complex multiply-accumulate per step of k
-        double a0r, a0i;
-        double b0r, b0i;
-
-        a0r = *ptrA++;
-        a0i = *ptrA++;
-
-        b0r = *ptrB++;
-        b0i = *ptrB++;
-
-        sum00r += a0r*b0r;
-        sum00r -= a0i*b0i;
-        sum00i += a0r*b0i;
-        sum00i += a0i*b0r;
-    }
-
-    // Remainder: the odd last step of a long product, or all of a short one
-    // (k <= 4). Does nothing when k <= 0.
-    for (; index < k; index++)
-    {
-        double a0r, a0i;
-        double b0r, b0i;
-
-        a0r = *ptrA++;
-        a0i = *ptrA++;
-
-        b0r = *ptrB++;
-        b0i = *ptrB++;
-
-        sum00r += a0r*b0r;
-        sum00r -= a0i*b0i;
-        sum00i += a0r*b0i;
-        sum00i += a0i*b0r;
-    }
-
-#ifdef BLIS_ENABLE_CYCLE_COUNT
-    counter_end=TSCL;
-    if (lib_get_coreID () == 0)
-        printf("%d\t%d\t",(int) k, counter_end-counter_start); // cast: k is a dim_t, the format expects int
-#endif
-
-#ifdef BLIS_ENABLE_CYCLE_COUNT
-    TSCL=0;
-    counter_start = TSCL;
-#endif
-
-    {   // final saving: c = beta*c + alpha*sum, both as complex products
-        double alphar, alphai, betar, betai, cr, ci;
-        alphar = alpha->real;
-        alphai = alpha->imag;
-        betar = beta->real;
-        betai = beta->imag;
-
-        // Snapshot c first: both parts of beta*c need the old real and imag.
-        cr = c->real;
-        ci = c->imag;
-
-        c->imag = (betar * ci + betai * cr);
-        c->real = (betar * cr - betai * ci);
-        c->real += (alphar * sum00r - alphai * sum00i);
-        c->imag += (alphar * sum00i + alphai * sum00r);
-    }
-#ifdef BLIS_ENABLE_CYCLE_COUNT
-    counter_end=TSCL;
-    if (lib_get_coreID () == 0)
-        printf("%d\n", counter_end-counter_start);
-#endif
-
-    return;
-}
-
-
+/*\r
+\r
+ BLIS\r
+ An object-based framework for developing high-performance BLAS-like\r
+ libraries.\r
+\r
+ Copyright (C) 2014, The University of Texas\r
+\r
+ Redistribution and use in source and binary forms, with or without\r
+ modification, are permitted provided that the following conditions are\r
+ met:\r
+ - Redistributions of source code must retain the above copyright\r
+ notice, this list of conditions and the following disclaimer.\r
+ - Redistributions in binary form must reproduce the above copyright\r
+ notice, this list of conditions and the following disclaimer in the\r
+ documentation and/or other materials provided with the distribution.\r
+ - Neither the name of The University of Texas nor the names of its\r
+ contributors may be used to endorse or promote products derived\r
+ from this software without specific prior written permission.\r
+\r
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\r
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\r
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\r
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT\r
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\r
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\r
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\r
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\r
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\r
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\r
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
+\r
+ */\r
+#include "blis.h"\r
+\r
+/*\r
+ * Single-precision GEMM micro-kernel for a 4x8 tile:\r
+ *     C := beta * C + alpha * A * B\r
+ * Each step of k reads four floats from a and eight floats from b and adds\r
+ * their 4x8 outer product to the accumulators.\r
+ *\r
+ * Two layouts of C are handled. With rs_c == 1 the tile is addressed as\r
+ * eight columns that are cs_c floats apart. Otherwise it is addressed as\r
+ * four rows that are rs_c floats apart with adjacent columns contiguous\r
+ * (cs_c is not consulted in that case). C is accessed as __float2_t pairs\r
+ * and the stride is halved with >> 1, so the stride in use has to be even.\r
+ * NOTE(review): presumably c must also be suitably aligned for __float2_t\r
+ * accesses -- confirm against the caller.\r
+ *\r
+ * data is unused.\r
+ */\r
+void bli_sgemm_ukernel_4x8(\r
+ dim_t k,\r
+ float* restrict alpha,\r
+ float* restrict a,\r
+ float* restrict b,\r
+ float* restrict beta,\r
+ float* restrict c, inc_t rs_c, inc_t cs_c,\r
+ auxinfo_t* data\r
+)\r
+{\r
+    // Sixteen two-lane accumulators hold the 32 products of the tile.\r
+    __float2_t sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7;\r
+    __float2_t sum8, sum9, suma, sumb, sumc, sumd, sume, sumf;\r
+    // Register copies of the C tile, two floats each.\r
+    __float2_t c0, c1, c2, c3, c4, c5, c6, c7;\r
+    __float2_t c8, c9, ca, cb, cc, cd, ce, cf;\r
+    __float2_t * restrict ptrB = (__float2_t *) b;\r
+    __float2_t * restrict ptrA = (__float2_t *) a;\r
+    __float2_t * restrict ptrC;\r
+    __float2_t regA2; // alpha in both lanes\r
+    __float2_t regB2; // beta in both lanes\r
+    // NOTE(review): this counter may be as narrow as 16 bits while k is a\r
+    // dim_t; presumably k always stays below 32768 -- confirm against the\r
+    // blocking parameters.\r
+    int_least16_t index;\r
+\r
+    // zero out accumulators\r
+    sum0 = 0.0;\r
+    sum1 = 0.0;\r
+    sum2 = 0.0;\r
+    sum3 = 0.0;\r
+    sum4 = 0.0;\r
+    sum5 = 0.0;\r
+    sum6 = 0.0;\r
+    sum7 = 0.0;\r
+    sum8 = 0.0;\r
+    sum9 = 0.0;\r
+    suma = 0.0;\r
+    sumb = 0.0;\r
+    sumc = 0.0;\r
+    sumd = 0.0;\r
+    sume = 0.0;\r
+    sumf = 0.0;\r
+\r
+    for (index = 0; index < k; index++)\r
+    {   // loop over k;\r
+        // each iteration performs rank one update of 4x1 by 1x8\r
+        // matrices of A and B respectively; result is\r
+        // accumulated over 4x8 matrix\r
+        __float2_t b01, b23, b45, b67, a01, a23;\r
+        __x128_t reg128;\r
+\r
+        a01 = *ptrA++;\r
+        a23 = *ptrA++;\r
+\r
+        b01 = *ptrB++;\r
+        b23 = *ptrB++;\r
+        b45 = *ptrB++;\r
+        b67 = *ptrB++;\r
+\r
+        // One _cmpysp yields all four cross products of an a pair and a b\r
+        // pair; the lane holding the first-by-first product carries a minus\r
+        // sign, which is undone when C is updated below.\r
+        reg128 = _cmpysp(a01, b01);\r
+        // accumulate a[0]*b[1] and -a[0]*b[0]\r
+        sum0 = _daddsp(sum0, _lof2_128(reg128));\r
+        // accumulate a[1]*b[0] and a[1]*b[1]\r
+        sum1 = _daddsp(sum1, _hif2_128(reg128));\r
+\r
+        reg128 = _cmpysp(a01, b23);\r
+        // accumulate a[0]*b[3] and -a[0]*b[2]\r
+        sum2 = _daddsp(sum2, _lof2_128(reg128));\r
+        // accumulate a[1]*b[2] and a[1]*b[3]\r
+        sum3 = _daddsp(sum3, _hif2_128(reg128));\r
+\r
+        reg128 = _cmpysp(a01, b45);\r
+        // accumulate a[0]*b[5] and -a[0]*b[4]\r
+        sum4 = _daddsp(sum4, _lof2_128(reg128));\r
+        // accumulate a[1]*b[4] and a[1]*b[5]\r
+        sum5 = _daddsp(sum5, _hif2_128(reg128));\r
+\r
+        reg128 = _cmpysp(a01, b67);\r
+        // accumulate a[0]*b[7] and -a[0]*b[6]\r
+        sum6 = _daddsp(sum6, _lof2_128(reg128));\r
+        // accumulate a[1]*b[6] and a[1]*b[7]\r
+        sum7 = _daddsp(sum7, _hif2_128(reg128));\r
+\r
+        reg128 = _cmpysp(a23, b01);\r
+        // accumulate a[2]*b[1] and -a[2]*b[0]\r
+        sum8 = _daddsp(sum8, _lof2_128(reg128));\r
+        // accumulate a[3]*b[0] and a[3]*b[1]\r
+        sum9 = _daddsp(sum9, _hif2_128(reg128));\r
+\r
+        reg128 = _cmpysp(a23, b23);\r
+        // accumulate a[2]*b[3] and -a[2]*b[2]\r
+        suma = _daddsp(suma, _lof2_128(reg128));\r
+        // accumulate a[3]*b[2] and a[3]*b[3]\r
+        sumb = _daddsp(sumb, _hif2_128(reg128));\r
+\r
+        reg128 = _cmpysp(a23, b45);\r
+        // accumulate a[2]*b[5] and -a[2]*b[4]\r
+        sumc = _daddsp(sumc, _lof2_128(reg128));\r
+        // accumulate a[3]*b[4] and a[3]*b[5]\r
+        sumd = _daddsp(sumd, _hif2_128(reg128));\r
+\r
+        reg128 = _cmpysp(a23, b67);\r
+        // accumulate a[2]*b[7] and -a[2]*b[6]\r
+        sume = _daddsp(sume, _lof2_128(reg128));\r
+        // accumulate a[3]*b[6] and a[3]*b[7]\r
+        sumf = _daddsp(sumf, _hif2_128(reg128));\r
+    }\r
+\r
+    regA2 = _ftof2(*alpha, *alpha);\r
+    regB2 = _ftof2(*beta, *beta);\r
+\r
+    // Scale the products by alpha; this is the same for either layout of C.\r
+    sum0 = _dmpysp(regA2, sum0);\r
+    sum1 = _dmpysp(regA2, sum1);\r
+    sum2 = _dmpysp(regA2, sum2);\r
+    sum3 = _dmpysp(regA2, sum3);\r
+    sum4 = _dmpysp(regA2, sum4);\r
+    sum5 = _dmpysp(regA2, sum5);\r
+    sum6 = _dmpysp(regA2, sum6);\r
+    sum7 = _dmpysp(regA2, sum7);\r
+    sum8 = _dmpysp(regA2, sum8);\r
+    sum9 = _dmpysp(regA2, sum9);\r
+    suma = _dmpysp(regA2, suma);\r
+    sumb = _dmpysp(regA2, sumb);\r
+    sumc = _dmpysp(regA2, sumc);\r
+    sumd = _dmpysp(regA2, sumd);\r
+    sume = _dmpysp(regA2, sume);\r
+    sumf = _dmpysp(regA2, sumf);\r
+\r
+    if (rs_c != 1)\r
+    {\r
+        // Rows of C are rs_c floats apart; each __float2_t covers two\r
+        // adjacent columns of one row.\r
+\r
+        // left half of the tile (columns 0-3)\r
+        ptrC = (__float2_t *) c;\r
+        c0 = *ptrC++; //c[0,0] and c[0,1]\r
+        c1 = *ptrC--; //c[0,2] and c[0,3]\r
+        ptrC += (rs_c>>1); // divide by 2 because ptrC is __float2_t, and rs_c is the stride for floats\r
+        c4 = *ptrC++; //c[1,0] and c[1,1]\r
+        c5 = *ptrC--; //c[1,2] and c[1,3]\r
+        ptrC += (rs_c>>1);\r
+        c8 = *ptrC++; //c[2,0] and c[2,1]\r
+        c9 = *ptrC--; //c[2,2] and c[2,3]\r
+        ptrC += (rs_c>>1);\r
+        cc = *ptrC++; //c[3,0] and c[3,1]\r
+        cd = *ptrC--; //c[3,2] and c[3,3]\r
+\r
+        // right half of the tile (columns 4-7)\r
+        ptrC = (__float2_t *) c + 2;\r
+        c2 = *ptrC++; //c[0,4] and c[0,5]\r
+        c3 = *ptrC--; //c[0,6] and c[0,7]\r
+        ptrC += (rs_c>>1);\r
+        c6 = *ptrC++; //c[1,4] and c[1,5]\r
+        c7 = *ptrC--; //c[1,6] and c[1,7]\r
+        ptrC += (rs_c>>1);\r
+        ca = *ptrC++; //c[2,4] and c[2,5]\r
+        cb = *ptrC--; //c[2,6] and c[2,7]\r
+        ptrC += (rs_c>>1);\r
+        ce = *ptrC++; //c[3,4] and c[3,5]\r
+        cf = *ptrC; //c[3,6] and c[3,7]\r
+\r
+        c0 = _dmpysp(c0, regB2);\r
+        c1 = _dmpysp(c1, regB2);\r
+        c2 = _dmpysp(c2, regB2);\r
+        c3 = _dmpysp(c3, regB2);\r
+        c4 = _dmpysp(c4, regB2);\r
+        c5 = _dmpysp(c5, regB2);\r
+        c6 = _dmpysp(c6, regB2);\r
+        c7 = _dmpysp(c7, regB2);\r
+        c8 = _dmpysp(c8, regB2);\r
+        c9 = _dmpysp(c9, regB2);\r
+        ca = _dmpysp(ca, regB2);\r
+        cb = _dmpysp(cb, regB2);\r
+        cc = _dmpysp(cc, regB2);\r
+        cd = _dmpysp(cd, regB2);\r
+        ce = _dmpysp(ce, regB2);\r
+        cf = _dmpysp(cf, regB2);\r
+\r
+        // update c[0,0] and c[0,1]\r
+        c0 = _daddsp(_ftof2(_lof2(sum0),-_hif2(sum0)),c0);\r
+        // update c[0,2] and c[0,3]\r
+        c1 = _daddsp(_ftof2(_lof2(sum2),-_hif2(sum2)),c1);\r
+        // update c[0,4] and c[0,5]\r
+        c2 = _daddsp(_ftof2(_lof2(sum4),-_hif2(sum4)),c2);\r
+        // update c[0,6] and c[0,7]\r
+        c3 = _daddsp(_ftof2(_lof2(sum6),-_hif2(sum6)),c3);\r
+\r
+        // update c[1,0] and c[1,1]\r
+        c4 = _daddsp(_ftof2(_hif2(sum1),_lof2(sum1)),c4);\r
+        // update c[1,2] and c[1,3]\r
+        c5 = _daddsp(_ftof2(_hif2(sum3),_lof2(sum3)),c5);\r
+        // update c[1,4] and c[1,5]\r
+        c6 = _daddsp(_ftof2(_hif2(sum5),_lof2(sum5)),c6);\r
+        // update c[1,6] and c[1,7]\r
+        c7 = _daddsp(_ftof2(_hif2(sum7),_lof2(sum7)),c7);\r
+\r
+        // update c[2,0] and c[2,1]\r
+        c8 = _daddsp(_ftof2(_lof2(sum8),-_hif2(sum8)),c8);\r
+        // update c[2,2] and c[2,3]\r
+        c9 = _daddsp(_ftof2(_lof2(suma),-_hif2(suma)),c9);\r
+        // update c[2,4] and c[2,5]\r
+        ca = _daddsp(_ftof2(_lof2(sumc),-_hif2(sumc)),ca);\r
+        // update c[2,6] and c[2,7]\r
+        cb = _daddsp(_ftof2(_lof2(sume),-_hif2(sume)),cb);\r
+\r
+        // update c[3,0] and c[3,1]\r
+        cc = _daddsp(_ftof2(_hif2(sum9),_lof2(sum9)),cc);\r
+        // update c[3,2] and c[3,3]\r
+        cd = _daddsp(_ftof2(_hif2(sumb),_lof2(sumb)),cd);\r
+        // update c[3,4] and c[3,5]\r
+        ce = _daddsp(_ftof2(_hif2(sumd),_lof2(sumd)),ce);\r
+        // update c[3,6] and c[3,7]\r
+        cf = _daddsp(_ftof2(_hif2(sumf),_lof2(sumf)),cf);\r
+\r
+        // Write the tile back, mirroring the load pattern above. Without\r
+        // these stores the updated values never reach C.\r
+        ptrC = (__float2_t *) c;\r
+        *ptrC++ = c0; //c[0,0] and c[0,1]\r
+        *ptrC-- = c1; //c[0,2] and c[0,3]\r
+        ptrC += (rs_c>>1);\r
+        *ptrC++ = c4; //c[1,0] and c[1,1]\r
+        *ptrC-- = c5; //c[1,2] and c[1,3]\r
+        ptrC += (rs_c>>1);\r
+        *ptrC++ = c8; //c[2,0] and c[2,1]\r
+        *ptrC-- = c9; //c[2,2] and c[2,3]\r
+        ptrC += (rs_c>>1);\r
+        *ptrC++ = cc; //c[3,0] and c[3,1]\r
+        *ptrC-- = cd; //c[3,2] and c[3,3]\r
+\r
+        ptrC = (__float2_t *) c + 2;\r
+        *ptrC++ = c2; //c[0,4] and c[0,5]\r
+        *ptrC-- = c3; //c[0,6] and c[0,7]\r
+        ptrC += (rs_c>>1);\r
+        *ptrC++ = c6; //c[1,4] and c[1,5]\r
+        *ptrC-- = c7; //c[1,6] and c[1,7]\r
+        ptrC += (rs_c>>1);\r
+        *ptrC++ = ca; //c[2,4] and c[2,5]\r
+        *ptrC-- = cb; //c[2,6] and c[2,7]\r
+        ptrC += (rs_c>>1);\r
+        *ptrC++ = ce; //c[3,4] and c[3,5]\r
+        *ptrC = cf; //c[3,6] and c[3,7]\r
+    }\r
+    else\r
+    {\r
+        // Columns of C are cs_c floats apart; each __float2_t covers two\r
+        // adjacent rows of one column.\r
+        ptrC = (__float2_t *) c;\r
+        c0 = *ptrC++; //c[0,0] and c[1,0]\r
+        c1 = *ptrC--; //c[2,0] and c[3,0]\r
+        ptrC += (cs_c>>1); // divide by 2 because ptrC is __float2_t, and cs_c is the stride for floats\r
+        c2 = *ptrC++; //c[0,1] and c[1,1]\r
+        c3 = *ptrC--; //c[2,1] and c[3,1]\r
+        ptrC += (cs_c>>1);\r
+        c4 = *ptrC++; //c[0,2] and c[1,2]\r
+        c5 = *ptrC--; //c[2,2] and c[3,2]\r
+        ptrC += (cs_c>>1);\r
+        c6 = *ptrC++; //c[0,3] and c[1,3]\r
+        c7 = *ptrC--; //c[2,3] and c[3,3]\r
+        ptrC += (cs_c>>1);\r
+        c8 = *ptrC++; //c[0,4] and c[1,4]\r
+        c9 = *ptrC--; //c[2,4] and c[3,4]\r
+        ptrC += (cs_c>>1);\r
+        ca = *ptrC++; //c[0,5] and c[1,5]\r
+        cb = *ptrC--; //c[2,5] and c[3,5]\r
+        ptrC += (cs_c>>1);\r
+        cc = *ptrC++; //c[0,6] and c[1,6]\r
+        cd = *ptrC--; //c[2,6] and c[3,6]\r
+        ptrC += (cs_c>>1);\r
+        ce = *ptrC++; //c[0,7] and c[1,7]\r
+        cf = *ptrC; //c[2,7] and c[3,7]\r
+\r
+        c0 = _dmpysp(c0, regB2);\r
+        c1 = _dmpysp(c1, regB2);\r
+        c2 = _dmpysp(c2, regB2);\r
+        c3 = _dmpysp(c3, regB2);\r
+        c4 = _dmpysp(c4, regB2);\r
+        c5 = _dmpysp(c5, regB2);\r
+        c6 = _dmpysp(c6, regB2);\r
+        c7 = _dmpysp(c7, regB2);\r
+        c8 = _dmpysp(c8, regB2);\r
+        c9 = _dmpysp(c9, regB2);\r
+        ca = _dmpysp(ca, regB2);\r
+        cb = _dmpysp(cb, regB2);\r
+        cc = _dmpysp(cc, regB2);\r
+        cd = _dmpysp(cd, regB2);\r
+        ce = _dmpysp(ce, regB2);\r
+        cf = _dmpysp(cf, regB2);\r
+\r
+        // update c[0,0] and c[1,0]\r
+        c0 = _daddsp(_ftof2(_lof2(sum1),-_hif2(sum0)),c0);\r
+        // update c[2,0] and c[3,0]\r
+        c1 = _daddsp(_ftof2(_lof2(sum9),-_hif2(sum8)),c1);\r
+        // update c[0,1] and c[1,1]\r
+        c2 = _daddsp(_ftof2(_hif2(sum1),_lof2(sum0)),c2);\r
+        // update c[2,1] and c[3,1]\r
+        c3 = _daddsp(_ftof2(_hif2(sum9),_lof2(sum8)),c3);\r
+        // update c[0,2] and c[1,2]\r
+        c4 = _daddsp(_ftof2(_lof2(sum3),-_hif2(sum2)),c4);\r
+        // update c[2,2] and c[3,2]\r
+        c5 = _daddsp(_ftof2(_lof2(sumb),-_hif2(suma)),c5);\r
+        // update c[0,3] and c[1,3]\r
+        c6 = _daddsp(_ftof2(_hif2(sum3),_lof2(sum2)),c6);\r
+        // update c[2,3] and c[3,3]\r
+        c7 = _daddsp(_ftof2(_hif2(sumb),_lof2(suma)),c7);\r
+        // update c[0,4] and c[1,4]\r
+        c8 = _daddsp(_ftof2(_lof2(sum5),-_hif2(sum4)),c8);\r
+        // update c[2,4] and c[3,4]\r
+        c9 = _daddsp(_ftof2(_lof2(sumd),-_hif2(sumc)),c9);\r
+        // update c[0,5] and c[1,5]\r
+        ca = _daddsp(_ftof2(_hif2(sum5),_lof2(sum4)),ca);\r
+        // update c[2,5] and c[3,5]\r
+        cb = _daddsp(_ftof2(_hif2(sumd),_lof2(sumc)),cb);\r
+        // update c[0,6] and c[1,6]\r
+        cc = _daddsp(_ftof2(_lof2(sum7),-_hif2(sum6)),cc);\r
+        // update c[2,6] and c[3,6]\r
+        cd = _daddsp(_ftof2(_lof2(sumf),-_hif2(sume)),cd);\r
+        // update c[0,7] and c[1,7]\r
+        ce = _daddsp(_ftof2(_hif2(sum7),_lof2(sum6)),ce);\r
+        // update c[2,7] and c[3,7]\r
+        cf = _daddsp(_ftof2(_hif2(sumf),_lof2(sume)),cf);\r
+\r
+        // write the tile back column by column\r
+        ptrC = (__float2_t *) c;\r
+        *ptrC++ = c0;\r
+        *ptrC-- = c1;\r
+        ptrC += (cs_c>>1);\r
+        *ptrC++ = c2;\r
+        *ptrC-- = c3;\r
+        ptrC += (cs_c>>1);\r
+        *ptrC++ = c4;\r
+        *ptrC-- = c5;\r
+        ptrC += (cs_c>>1);\r
+        *ptrC++ = c6;\r
+        *ptrC-- = c7;\r
+        ptrC += (cs_c>>1);\r
+        *ptrC++ = c8;\r
+        *ptrC-- = c9;\r
+        ptrC += (cs_c>>1);\r
+        *ptrC++ = ca;\r
+        *ptrC-- = cb;\r
+        ptrC += (cs_c>>1);\r
+        *ptrC++ = cc;\r
+        *ptrC-- = cd;\r
+        ptrC += (cs_c>>1);\r
+        *ptrC++ = ce;\r
+        *ptrC = cf;\r
+    }\r
+}\r
+\r
+/*\r
+ * Single-precision GEMM micro-kernel for a 4x4 tile:\r
+ *     C := beta * C + alpha * A * B\r
+ * Each step of k reads four floats from a and four floats from b. Steps are\r
+ * taken two at a time into two separate accumulator sets that are summed\r
+ * after the loop; an odd final step is added to the first set.\r
+ *\r
+ * With rs_c == 1 the tile is addressed as four columns cs_c floats apart;\r
+ * otherwise as four rows rs_c floats apart. C is accessed as __float2_t\r
+ * pairs, so the stride in use is halved with >> 1. data is unused.\r
+ */\r
+void bli_sgemm_ukernel_4x4(\r
+ dim_t k,\r
+ float* restrict alpha,\r
+ float* restrict a,\r
+ float* restrict b,\r
+ float* restrict beta,\r
+ float* restrict c, inc_t rs_c, inc_t cs_c,\r
+ auxinfo_t* data\r
+)\r
+{\r
+    // acc*: first step of every pair plus the odd tail; alt*: second step.\r
+    __float2_t acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;\r
+    __float2_t alt0, alt1, alt2, alt3, alt4, alt5, alt6, alt7;\r
+    // t(2n) and t(2n+1) are the two halves of line n of the C tile, where a\r
+    // line is a row or a column depending on the layout.\r
+    __float2_t t0, t1, t2, t3, t4, t5, t6, t7;\r
+    __float2_t a01, a23, b01, b23;\r
+    __float2_t alpha2, beta2;\r
+    __x128_t prod;\r
+    __float2_t * restrict panelB = (__float2_t *) b;\r
+    __float2_t * restrict panelA = (__float2_t *) a;\r
+    __float2_t * restrict line;\r
+    int_least16_t step;\r
+    int pairs, tail;\r
+    inc_t lineStep;\r
+\r
+    acc0 = 0.0; acc1 = 0.0; acc2 = 0.0; acc3 = 0.0;\r
+    acc4 = 0.0; acc5 = 0.0; acc6 = 0.0; acc7 = 0.0;\r
+    alt0 = 0.0; alt1 = 0.0; alt2 = 0.0; alt3 = 0.0;\r
+    alt4 = 0.0; alt5 = 0.0; alt6 = 0.0; alt7 = 0.0;\r
+\r
+    pairs = k>>1; // number of two-step iterations\r
+    tail = k&1;   // non-zero when one step is left over\r
+\r
+    for (step = 0; step < pairs; step++)\r
+    {\r
+        // first step of the pair -> acc*\r
+        a01 = *panelA++;\r
+        a23 = *panelA++;\r
+        b01 = *panelB++;\r
+        b23 = *panelB++;\r
+\r
+        prod = _cmpysp(a01, b01);\r
+        acc0 = _daddsp(acc0, _lof2_128(prod)); // a[0]*b[1] and -a[0]*b[0]\r
+        acc1 = _daddsp(acc1, _hif2_128(prod)); // a[1]*b[0] and a[1]*b[1]\r
+        prod = _cmpysp(a01, b23);\r
+        acc2 = _daddsp(acc2, _lof2_128(prod)); // a[0]*b[3] and -a[0]*b[2]\r
+        acc3 = _daddsp(acc3, _hif2_128(prod)); // a[1]*b[2] and a[1]*b[3]\r
+        prod = _cmpysp(a23, b01);\r
+        acc4 = _daddsp(acc4, _lof2_128(prod)); // a[2]*b[1] and -a[2]*b[0]\r
+        acc5 = _daddsp(acc5, _hif2_128(prod)); // a[3]*b[0] and a[3]*b[1]\r
+        prod = _cmpysp(a23, b23);\r
+        acc6 = _daddsp(acc6, _lof2_128(prod)); // a[2]*b[3] and -a[2]*b[2]\r
+        acc7 = _daddsp(acc7, _hif2_128(prod)); // a[3]*b[2] and a[3]*b[3]\r
+\r
+        // second step of the pair -> alt*\r
+        a01 = *panelA++;\r
+        a23 = *panelA++;\r
+        b01 = *panelB++;\r
+        b23 = *panelB++;\r
+\r
+        prod = _cmpysp(a01, b01);\r
+        alt0 = _daddsp(alt0, _lof2_128(prod));\r
+        alt1 = _daddsp(alt1, _hif2_128(prod));\r
+        prod = _cmpysp(a01, b23);\r
+        alt2 = _daddsp(alt2, _lof2_128(prod));\r
+        alt3 = _daddsp(alt3, _hif2_128(prod));\r
+        prod = _cmpysp(a23, b01);\r
+        alt4 = _daddsp(alt4, _lof2_128(prod));\r
+        alt5 = _daddsp(alt5, _hif2_128(prod));\r
+        prod = _cmpysp(a23, b23);\r
+        alt6 = _daddsp(alt6, _lof2_128(prod));\r
+        alt7 = _daddsp(alt7, _hif2_128(prod));\r
+    }\r
+\r
+    if (tail)\r
+    {\r
+        // leftover step of an odd k goes into the first accumulator set\r
+        a01 = *panelA++;\r
+        a23 = *panelA++;\r
+        b01 = *panelB++;\r
+        b23 = *panelB++;\r
+\r
+        prod = _cmpysp(a01, b01);\r
+        acc0 = _daddsp(acc0, _lof2_128(prod));\r
+        acc1 = _daddsp(acc1, _hif2_128(prod));\r
+        prod = _cmpysp(a01, b23);\r
+        acc2 = _daddsp(acc2, _lof2_128(prod));\r
+        acc3 = _daddsp(acc3, _hif2_128(prod));\r
+        prod = _cmpysp(a23, b01);\r
+        acc4 = _daddsp(acc4, _lof2_128(prod));\r
+        acc5 = _daddsp(acc5, _hif2_128(prod));\r
+        prod = _cmpysp(a23, b23);\r
+        acc6 = _daddsp(acc6, _lof2_128(prod));\r
+        acc7 = _daddsp(acc7, _hif2_128(prod));\r
+    }\r
+\r
+    // fold the second accumulator set into the first\r
+    acc0 = _daddsp(acc0, alt0);\r
+    acc1 = _daddsp(acc1, alt1);\r
+    acc2 = _daddsp(acc2, alt2);\r
+    acc3 = _daddsp(acc3, alt3);\r
+    acc4 = _daddsp(acc4, alt4);\r
+    acc5 = _daddsp(acc5, alt5);\r
+    acc6 = _daddsp(acc6, alt6);\r
+    acc7 = _daddsp(acc7, alt7);\r
+\r
+    // alpha and beta replicated into both lanes\r
+    alpha2 = _ftof2(*alpha, *alpha);\r
+    beta2 = _ftof2(*beta, *beta);\r
+\r
+    // distance between two lines of the tile, counted in __float2_t units\r
+    if (rs_c != 1)\r
+        lineStep = (rs_c>>1);\r
+    else\r
+        lineStep = (cs_c>>1);\r
+\r
+    // fetch the tile, two pairs per line\r
+    line = (__float2_t *) c;\r
+    t0 = line[0];\r
+    t1 = line[1];\r
+    line += lineStep;\r
+    t2 = line[0];\r
+    t3 = line[1];\r
+    line += lineStep;\r
+    t4 = line[0];\r
+    t5 = line[1];\r
+    line += lineStep;\r
+    t6 = line[0];\r
+    t7 = line[1];\r
+\r
+    // alpha * (A*B)\r
+    acc0 = _dmpysp(alpha2, acc0);\r
+    acc1 = _dmpysp(alpha2, acc1);\r
+    acc2 = _dmpysp(alpha2, acc2);\r
+    acc3 = _dmpysp(alpha2, acc3);\r
+    acc4 = _dmpysp(alpha2, acc4);\r
+    acc5 = _dmpysp(alpha2, acc5);\r
+    acc6 = _dmpysp(alpha2, acc6);\r
+    acc7 = _dmpysp(alpha2, acc7);\r
+\r
+    // beta * C\r
+    t0 = _dmpysp(t0, beta2);\r
+    t1 = _dmpysp(t1, beta2);\r
+    t2 = _dmpysp(t2, beta2);\r
+    t3 = _dmpysp(t3, beta2);\r
+    t4 = _dmpysp(t4, beta2);\r
+    t5 = _dmpysp(t5, beta2);\r
+    t6 = _dmpysp(t6, beta2);\r
+    t7 = _dmpysp(t7, beta2);\r
+\r
+    // Regroup the accumulator lanes to match the layout of C. The lane that\r
+    // was accumulated with a minus sign is negated on the way.\r
+    if (rs_c != 1)\r
+    {\r
+        // lines are rows: each pair is two adjacent columns of one row\r
+        t0 = _daddsp(_ftof2(_lof2(acc0),-_hif2(acc0)),t0); // c[0,0] and c[0,1]\r
+        t1 = _daddsp(_ftof2(_lof2(acc2),-_hif2(acc2)),t1); // c[0,2] and c[0,3]\r
+        t2 = _daddsp(_ftof2(_hif2(acc1), _lof2(acc1)),t2); // c[1,0] and c[1,1]\r
+        t3 = _daddsp(_ftof2(_hif2(acc3),_lof2(acc3)),t3); // c[1,2] and c[1,3]\r
+        t4 = _daddsp(_ftof2(_lof2(acc4),-_hif2(acc4)),t4); // c[2,0] and c[2,1]\r
+        t5 = _daddsp(_ftof2(_lof2(acc6),-_hif2(acc6)),t5); // c[2,2] and c[2,3]\r
+        t6 = _daddsp(_ftof2(_hif2(acc5), _lof2(acc5)),t6); // c[3,0] and c[3,1]\r
+        t7 = _daddsp(_ftof2(_hif2(acc7),_lof2(acc7)),t7); // c[3,2] and c[3,3]\r
+    }\r
+    else\r
+    {\r
+        // lines are columns: each pair is two adjacent rows of one column\r
+        t0 = _daddsp(_ftof2(_lof2(acc1),-_hif2(acc0)),t0); // c[0,0] and c[1,0]\r
+        t1 = _daddsp(_ftof2(_lof2(acc5),-_hif2(acc4)),t1); // c[2,0] and c[3,0]\r
+        t2 = _daddsp(_ftof2(_hif2(acc1),_lof2(acc0)),t2); // c[0,1] and c[1,1]\r
+        t3 = _daddsp(_ftof2(_hif2(acc5),_lof2(acc4)),t3); // c[2,1] and c[3,1]\r
+        t4 = _daddsp(_ftof2(_lof2(acc3),-_hif2(acc2)),t4); // c[0,2] and c[1,2]\r
+        t5 = _daddsp(_ftof2(_lof2(acc7),-_hif2(acc6)),t5); // c[2,2] and c[3,2]\r
+        t6 = _daddsp(_ftof2(_hif2(acc3),_lof2(acc2)),t6); // c[0,3] and c[1,3]\r
+        t7 = _daddsp(_ftof2(_hif2(acc7),_lof2(acc6)),t7); // c[2,3] and c[3,3]\r
+    }\r
+\r
+    // write the tile back in the order it was fetched\r
+    line = (__float2_t *) c;\r
+    line[0] = t0;\r
+    line[1] = t1;\r
+    line += lineStep;\r
+    line[0] = t2;\r
+    line[1] = t3;\r
+    line += lineStep;\r
+    line[0] = t4;\r
+    line[1] = t5;\r
+    line += lineStep;\r
+    line[0] = t6;\r
+    line[1] = t7;\r
+}\r
+\r
+\r
+//void dgemmKernel(const double *pA, const double *pB, double *pC, const double a, const int k, const int stepC)\r
+// Double-precision real GEMM micro-kernel for one 4x4 tile of C.\r
+//\r
+// Each of the k steps reads four consecutive doubles from a and four from b\r
+// and adds their outer product into sixteen scalar accumulators. Afterwards\r
+// every tile element is multiplied by *beta and then receives the matching\r
+// accumulator times *alpha. Tile element (i,j) is addressed as\r
+// c + i*rs_c + j*cs_c. The data argument is not referenced.\r
+//\r
+// NOTE(review): *beta is always applied by multiplication, so NaN/Inf values\r
+// already stored in C survive even when *beta is zero - confirm callers never\r
+// rely on overwrite semantics for beta == 0.\r
+void bli_dgemm_ukernel_4x4(\r
+    dim_t k,\r
+    double* restrict alpha,\r
+    double* restrict a,\r
+    double* restrict b,\r
+    double* restrict beta,\r
+    double* restrict c, inc_t rs_c, inc_t cs_c,\r
+    auxinfo_t* data\r
+)\r
+{\r
+    // accRC accumulates tile element (row R, column C)\r
+    double acc00 = 0.0, acc01 = 0.0, acc02 = 0.0, acc03 = 0.0;\r
+    double acc10 = 0.0, acc11 = 0.0, acc12 = 0.0, acc13 = 0.0;\r
+    double acc20 = 0.0, acc21 = 0.0, acc22 = 0.0, acc23 = 0.0;\r
+    double acc30 = 0.0, acc31 = 0.0, acc32 = 0.0, acc33 = 0.0;\r
+    const double alphaVal = *alpha;\r
+    const double betaVal = *beta;\r
+    int p;\r
+\r
+    for (p = 0; p < k; p++)\r
+    {\r
+        // rank-one update: 4x1 slice of a times 1x4 slice of b\r
+        const double a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3];\r
+        const double b0 = b[0], b1 = b[1], b2 = b[2], b3 = b[3];\r
+\r
+        a += 4;\r
+        b += 4;\r
+\r
+        acc00 += a0 * b0;\r
+        acc01 += a0 * b1;\r
+        acc02 += a0 * b2;\r
+        acc03 += a0 * b3;\r
+\r
+        acc10 += a1 * b0;\r
+        acc11 += a1 * b1;\r
+        acc12 += a1 * b2;\r
+        acc13 += a1 * b3;\r
+\r
+        acc20 += a2 * b0;\r
+        acc21 += a2 * b1;\r
+        acc22 += a2 * b2;\r
+        acc23 += a2 * b3;\r
+\r
+        acc30 += a3 * b0;\r
+        acc31 += a3 * b1;\r
+        acc32 += a3 * b2;\r
+        acc33 += a3 * b3;\r
+    }\r
+\r
+    // Scale C(i,j) by beta in place, then add alpha times its accumulator.\r
+    // The two steps stay separate statements so the intermediate value is\r
+    // stored exactly as before.\r
+#define DGEMM_4X4_UPDATE(i, j, acc) do { double *elem = c + (i) * rs_c + (j) * cs_c; *elem = *elem * betaVal; *elem += (acc) * alphaVal; } while (0)\r
+\r
+    // column 0\r
+    DGEMM_4X4_UPDATE(0, 0, acc00);\r
+    DGEMM_4X4_UPDATE(1, 0, acc10);\r
+    DGEMM_4X4_UPDATE(2, 0, acc20);\r
+    DGEMM_4X4_UPDATE(3, 0, acc30);\r
+\r
+    // column 1\r
+    DGEMM_4X4_UPDATE(0, 1, acc01);\r
+    DGEMM_4X4_UPDATE(1, 1, acc11);\r
+    DGEMM_4X4_UPDATE(2, 1, acc21);\r
+    DGEMM_4X4_UPDATE(3, 1, acc31);\r
+\r
+    // column 2\r
+    DGEMM_4X4_UPDATE(0, 2, acc02);\r
+    DGEMM_4X4_UPDATE(1, 2, acc12);\r
+    DGEMM_4X4_UPDATE(2, 2, acc22);\r
+    DGEMM_4X4_UPDATE(3, 2, acc32);\r
+\r
+    // column 3\r
+    DGEMM_4X4_UPDATE(0, 3, acc03);\r
+    DGEMM_4X4_UPDATE(1, 3, acc13);\r
+    DGEMM_4X4_UPDATE(2, 3, acc23);\r
+    DGEMM_4X4_UPDATE(3, 3, acc33);\r
+\r
+#undef DGEMM_4X4_UPDATE\r
+}\r
+\r
+// Finishes one element of the 2x4 tile handled by bli_cgemm_ukernel_2x4.\r
+//\r
+//   ptrC        element of C to update in place\r
+//   accA, accB  the two partial accumulators kept for this element\r
+//   regA        alpha, already re-arranged by the caller for _cmpysp\r
+//   regB        beta, as loaded from memory\r
+//\r
+// The intrinsic sequence is the one the kernel previously repeated inline for\r
+// every element; only the variable names differ.\r
+static inline void bli_cgemm_2x4_update_elem(\r
+    __float2_t *ptrC,\r
+    __float2_t accA,\r
+    __float2_t accB,\r
+    __float2_t regA,\r
+    __float2_t regB\r
+)\r
+{\r
+    __x128_t reg128;\r
+    __float2_t acc, regC;\r
+\r
+    // fold the two partial sums into one value\r
+    acc = _daddsp(accA, accB);\r
+\r
+    // multiply the accumulated product by alpha\r
+    reg128 = _cmpysp(regA, acc);\r
+    acc = _daddsp(_hif2_128(reg128), _lof2_128(reg128));\r
+\r
+    // multiply the current C element by beta\r
+    regC = *ptrC;\r
+    reg128 = _cmpysp(regC, regB);\r
+    regC = _daddsp(_lof2_128(reg128), _hif2_128(reg128));\r
+\r
+    // the halves are swapped in sign before the final add because of the\r
+    // data arrangement _cmpysp assumes\r
+    *ptrC = _daddsp(_ftof2(-_lof(acc), _hif(acc)), _ftof2(_lof(regC), -_hif(regC)));\r
+}\r
+\r
+// Single-precision complex GEMM micro-kernel for one 2x4 tile of C.\r
+//\r
+// Each of the k steps reads two complex values from a and four from b (every\r
+// scomplex is accessed as one __float2_t) and accumulates the eight _cmpysp\r
+// products. Every tile element is then combined with alpha and beta by\r
+// bli_cgemm_2x4_update_elem and written back. Tile element (i,j) is addressed\r
+// as c + i*rs_c + j*cs_c, in units of one complex value. The data argument is\r
+// not referenced.\r
+//\r
+// NOTE(review): beta is always applied by multiplication, so NaN/Inf values\r
+// already stored in C survive even when beta is zero - confirm callers never\r
+// rely on overwrite semantics for beta == 0.\r
+void bli_cgemm_ukernel_2x4(\r
+    dim_t k,\r
+    scomplex* restrict alpha,\r
+    scomplex* restrict a,\r
+    scomplex* restrict b,\r
+    scomplex* restrict beta,\r
+    scomplex* restrict c, inc_t rs_c, inc_t cs_c,\r
+    auxinfo_t* data\r
+)\r
+{\r
+    // sumIJa / sumIJb: partial accumulators for tile element (row I, column J);\r
+    // they collect the low and high halves of each _cmpysp result\r
+    __float2_t sum00a, sum10a, sum00b, sum10b;\r
+    __float2_t sum01a, sum11a, sum01b, sum11b;\r
+    __float2_t sum02a, sum12a, sum02b, sum12b;\r
+    __float2_t sum03a, sum13a, sum03b, sum13b;\r
+    __float2_t * restrict ptrB = (__float2_t *) b;\r
+    __float2_t * restrict ptrA = (__float2_t *) a;\r
+    __float2_t * ptrC = (__float2_t *) c;\r
+    __float2_t regA, regB;\r
+    // plain int, like the other kernels in this file: the former\r
+    // int_least16_t counter could overflow before reaching k once k > 32767\r
+    int index;\r
+\r
+    // zero out accumulators\r
+    sum00a = 0.0;\r
+    sum10a = 0.0;\r
+    sum01a = 0.0;\r
+    sum11a = 0.0;\r
+    sum02a = 0.0;\r
+    sum12a = 0.0;\r
+    sum03a = 0.0;\r
+    sum13a = 0.0;\r
+    sum00b = 0.0;\r
+    sum10b = 0.0;\r
+    sum01b = 0.0;\r
+    sum11b = 0.0;\r
+    sum02b = 0.0;\r
+    sum12b = 0.0;\r
+    sum03b = 0.0;\r
+    sum13b = 0.0;\r
+\r
+    for (index = 0; index < k; index++)\r
+    {   // loop over k;\r
+        // each iteration performs rank one update of 2x1 by 1x4\r
+        // matrices of A and B respectively; result is\r
+        // accumulated over 2x4 matrix\r
+        __float2_t b0, b1, b2, b3, a0, a1;\r
+        __x128_t reg128;\r
+\r
+        a0 = *ptrA++;\r
+        a1 = *ptrA++;\r
+\r
+        b0 = *ptrB++;\r
+        b1 = *ptrB++;\r
+        b2 = *ptrB++;\r
+        b3 = *ptrB++;\r
+\r
+        // the partial sums are accumulated independently\r
+        // a[0]*b[0]\r
+        reg128 = _cmpysp(a0, b0);\r
+        sum00a = _daddsp(sum00a, _lof2_128(reg128));\r
+        sum00b = _daddsp(sum00b, _hif2_128(reg128));\r
+\r
+        // a[1]*b[0]\r
+        reg128 = _cmpysp(a1, b0);\r
+        sum10a = _daddsp(sum10a, _lof2_128(reg128));\r
+        sum10b = _daddsp(sum10b, _hif2_128(reg128));\r
+\r
+        // a[0]*b[1]\r
+        reg128 = _cmpysp(a0, b1);\r
+        sum01a = _daddsp(sum01a, _lof2_128(reg128));\r
+        sum01b = _daddsp(sum01b, _hif2_128(reg128));\r
+\r
+        // a[1]*b[1]\r
+        reg128 = _cmpysp(a1, b1);\r
+        sum11a = _daddsp(sum11a, _lof2_128(reg128));\r
+        sum11b = _daddsp(sum11b, _hif2_128(reg128));\r
+\r
+        // a[0]*b[2]\r
+        reg128 = _cmpysp(a0, b2);\r
+        sum02a = _daddsp(sum02a, _lof2_128(reg128));\r
+        sum02b = _daddsp(sum02b, _hif2_128(reg128));\r
+\r
+        // a[1]*b[2]\r
+        reg128 = _cmpysp(a1, b2);\r
+        sum12a = _daddsp(sum12a, _lof2_128(reg128));\r
+        sum12b = _daddsp(sum12b, _hif2_128(reg128));\r
+\r
+        // a[0]*b[3]\r
+        reg128 = _cmpysp(a0, b3);\r
+        sum03a = _daddsp(sum03a, _lof2_128(reg128));\r
+        sum03b = _daddsp(sum03b, _hif2_128(reg128));\r
+\r
+        // a[1]*b[3]\r
+        reg128 = _cmpysp(a1, b3);\r
+        sum13a = _daddsp(sum13a, _lof2_128(reg128));\r
+        sum13b = _daddsp(sum13b, _hif2_128(reg128));\r
+    }\r
+\r
+    regA = *((__float2_t *) alpha);\r
+    regB = *((__float2_t *) beta);\r
+\r
+    // the value of alpha and the final values need to be\r
+    // rearranged due to the specific way cmpysp assumes\r
+    // data arrangement\r
+    regA = _ftof2(-_lof(regA), _hif(regA));\r
+\r
+    // update and save c[0,0] and c[1,0]\r
+    bli_cgemm_2x4_update_elem(ptrC, sum00a, sum00b, regA, regB);\r
+    bli_cgemm_2x4_update_elem(ptrC + rs_c, sum10a, sum10b, regA, regB);\r
+\r
+    // update and save c[0,1] and c[1,1]\r
+    bli_cgemm_2x4_update_elem(ptrC + cs_c, sum01a, sum01b, regA, regB);\r
+    bli_cgemm_2x4_update_elem(ptrC + rs_c + cs_c, sum11a, sum11b, regA, regB);\r
+\r
+    // update and save c[0,2] and c[1,2]\r
+    bli_cgemm_2x4_update_elem(ptrC + 2 * cs_c, sum02a, sum02b, regA, regB);\r
+    bli_cgemm_2x4_update_elem(ptrC + rs_c + 2 * cs_c, sum12a, sum12b, regA, regB);\r
+\r
+    // update and save c[0,3] and c[1,3]\r
+    bli_cgemm_2x4_update_elem(ptrC + 3 * cs_c, sum03a, sum03b, regA, regB);\r
+    bli_cgemm_2x4_update_elem(ptrC + rs_c + 3 * cs_c, sum13a, sum13b, regA, regB);\r
+\r
+    return;\r
+}\r
+\r
+// Double-precision complex GEMM micro-kernel.\r
+//\r
+// Each of the k steps reads one complex value from a and one from b (as\r
+// real/imaginary pairs of doubles) and adds their complex product to a single\r
+// accumulator. The element pointed to by c is then replaced by\r
+//     beta * c + alpha * accumulator\r
+// using ordinary complex arithmetic.\r
+//\r
+// NOTE(review): despite the "2x2" in the name, only one element of C is\r
+// computed and rs_c, cs_c and data are never referenced - presumably the\r
+// register block is 1x1 for dcomplex in this configuration; confirm.\r
+// NOTE(review): beta is always applied by multiplication, so NaN/Inf values\r
+// already stored in C survive even when beta is zero - confirm callers never\r
+// rely on overwrite semantics for beta == 0.\r
+void bli_zgemm_ukernel_2x2(\r
+    dim_t k,\r
+    dcomplex* restrict alpha,\r
+    dcomplex* restrict a,\r
+    dcomplex* restrict b,\r
+    dcomplex* restrict beta,\r
+    dcomplex* restrict c, inc_t rs_c, inc_t cs_c,\r
+    auxinfo_t* data\r
+)\r
+{\r
+    double * restrict ptrA = (double *) a;\r
+    double * restrict ptrB = (double *) b;\r
+    double sum00r, sum00i;\r
+    int index;\r
+    // Largest even value not above k. Only bit 0 is cleared: the previous\r
+    // 0xFFFE mask also threw away every bit above bit 15, so k >= 65536\r
+    // silently lost iterations.\r
+    int kEven = k & ~1;\r
+\r
+    sum00r = 0.0;\r
+    sum00i = 0.0;\r
+\r
+    if(k>4) // The loop is safe for k > 4\r
+    {\r
+#pragma UNROLL(2)\r
+        for(index = 0; index<kEven; index++)\r
+        {   // loop over k;\r
+            // each iteration performs rank one update of 1x1 by 1x1\r
+            // matrices of A and B respectively; result is\r
+            // accumulated over 1x1 matrix\r
+            double a0r, a0i;\r
+            double b0r, b0i;\r
+\r
+            a0r = *ptrA++;\r
+            a0i = *ptrA++;\r
+\r
+            b0r = *ptrB++;\r
+            b0i = *ptrB++;\r
+\r
+            sum00r += a0r*b0r;\r
+            sum00r -= a0i*b0i;\r
+            sum00i += a0r*b0i;\r
+            sum00i += a0i*b0r;\r
+\r
+        }\r
+        if(k&1) // odd k; one left to do\r
+        {\r
+            double a0r, a0i;\r
+            double b0r, b0i;\r
+\r
+            a0r = *ptrA++;\r
+            a0i = *ptrA++;\r
+\r
+            b0r = *ptrB++;\r
+            b0i = *ptrB++;\r
+\r
+            sum00r += a0r*b0r;\r
+            sum00r -= a0i*b0i;\r
+            sum00i += a0r*b0i;\r
+            sum00i += a0i*b0r;\r
+        }\r
+    }\r
+    else\r
+    {\r
+        // k <= 4: the steps are written out one by one instead of using\r
+        // the unrolled loop above\r
+        if(k>0)\r
+        {\r
+            double a0r, a0i;\r
+            double b0r, b0i;\r
+\r
+            a0r = *ptrA++;\r
+            a0i = *ptrA++;\r
+\r
+            b0r = *ptrB++;\r
+            b0i = *ptrB++;\r
+\r
+            sum00r += a0r*b0r;\r
+            sum00r -= a0i*b0i;\r
+            sum00i += a0r*b0i;\r
+            sum00i += a0i*b0r;\r
+        }\r
+        if(k>1)\r
+        {\r
+            double a0r, a0i;\r
+            double b0r, b0i;\r
+\r
+            a0r = *ptrA++;\r
+            a0i = *ptrA++;\r
+\r
+            b0r = *ptrB++;\r
+            b0i = *ptrB++;\r
+\r
+            sum00r += a0r*b0r;\r
+            sum00r -= a0i*b0i;\r
+            sum00i += a0r*b0i;\r
+            sum00i += a0i*b0r;\r
+        }\r
+        if(k>2)\r
+        {\r
+            double a0r, a0i;\r
+            double b0r, b0i;\r
+\r
+            a0r = *ptrA++;\r
+            a0i = *ptrA++;\r
+\r
+            b0r = *ptrB++;\r
+            b0i = *ptrB++;\r
+\r
+            sum00r += a0r*b0r;\r
+            sum00r -= a0i*b0i;\r
+            sum00i += a0r*b0i;\r
+            sum00i += a0i*b0r;\r
+        }\r
+        if(k>3)\r
+        {\r
+            double a0r, a0i;\r
+            double b0r, b0i;\r
+\r
+            a0r = *ptrA++;\r
+            a0i = *ptrA++;\r
+\r
+            b0r = *ptrB++;\r
+            b0i = *ptrB++;\r
+\r
+            sum00r += a0r*b0r;\r
+            sum00r -= a0i*b0i;\r
+            sum00i += a0r*b0i;\r
+            sum00i += a0i*b0r;\r
+        }\r
+\r
+    }\r
+\r
+    {   // final saving\r
+        double alphar, alphai, betar, betai, cr, ci;\r
+        alphar = alpha->real;\r
+        alphai = alpha->imag;\r
+        betar = beta->real;\r
+        betai = beta->imag;\r
+\r
+        // read both parts of c first: each output part needs both inputs\r
+        cr = c->real;\r
+        ci = c->imag;\r
+\r
+        // c = beta * c\r
+        c->imag = (betar * ci + betai * cr);\r
+        c->real = (betar * cr - betai * ci);\r
+        // c += alpha * sum\r
+        c->real += (alphar * sum00r - alphai * sum00i);\r
+        c->imag += (alphar * sum00i + alphai * sum00r);\r
+    }\r
+\r
+\r
+    return;\r
+}\r
+\r
+\r
index bf427249bf06b30450e279d495a961978e65920e..23d6896ce0d78968dffeda64c9de7cc528980037 100644 (file)
--- a/blis/testsuite/Makefile
+++ b/blis/testsuite/Makefile
#
# BLIS library and header path. This is simply wherever it was installed.
-#BLIS_LIB_PATH := $(INSTALL_PREFIX)/lib
-#BLIS_INC_PATH := $(INSTALL_PREFIX)/include/blis
+BLIS_LIB_PATH := $(INSTALL_PREFIX)/lib
+BLIS_INC_PATH := $(INSTALL_PREFIX)/include/blis
+BLIS_LIB := ../$(BLIS_LIB_PATH)/libblis.a
# BLIS library.
-BLIS_LIB_PATH := $(DIST_PATH)/$(LIB_DIR)/$(CONFIG_NAME)
-BLIS_LIB := $(BLIS_LIB_PATH)/libblis.a -lOpenCL -locl_util -lstdc++ -lrt
-LDFLAGS += -L$(TARGET_ROOTDIR)/lib -L$(TARGET_ROOTDIR)/usr/lib -Wl,-rpath-link,$(TARGET_ROOTDIR)/lib -Wl,-rpath-link,$(TARGET_ROOTDIR)/usr/lib
+#BLIS_LIB_PATH := $(DIST_PATH)/$(LIB_DIR)/$(CONFIG_NAME)
# BLAS library path(s). This is where the BLAS libraries reside.
ifeq ($(lib),CBLAS)
#CBLAS w/o OpenCL wrappers
+LDFLAGS += -L$(TARGET_ROOTDIR)/lib -L$(TARGET_ROOTDIR)/usr/lib -Wl,-rpath-link,$(TARGET_ROOTDIR)/lib -Wl,-rpath-link,$(TARGET_ROOTDIR)/usr/lib
+
CFLAGS += -I$(CBLAS_INC_PATH)
CFLAGS += -DCBLAS
-temp := $(CBLAS_LIB) $(BLIS_LIB)
+temp := $(CBLAS_LIB) $(BLIS_LIB) -lOpenCL -locl_util -lstdc++ -lrt
BLIS_LIB := $(temp)
else ifeq ($(lib),OpenCLCBLAS)
index fa4d6ab3de6cdbb1a392c7144a7adde8d31cce7e..6a7608261008bd718b687acf364ffa12a720109e 100644 (file)
# 'r' = rowvec / unit stride; 'i' = rowvec / non-unit stride
0 # Test all combinations of storage schemes?
32 # General stride spacing (for cases when testing general stride)
-d # Datatype(s) to test:
+sdcz # Datatype(s) to test:
# 's' = single real; 'c' = single complex;
# 'd' = double real; 'z' = double complex
-1000 # Problem size: first to test
-5000 # Problem size: maximum to test
-500 # Problem size: increment between experiments
+1000 # Problem size: first to test
+4000 # Problem size: maximum to test
+500 # Problem size: increment between experiments
# Complex level-3 implementations
0 # 3mh ('1' = enable; '0' = disable)
0 # 3m ('1' = enable; '0' = disable)
index 56399d4514b4da6be5c3f6c687a737ea22e82a23..f38449040c1162175d159e61efcfd90f00a76a9f 100644 (file)
bli_obj_set_conjtrans( transa, a );
bli_obj_set_conjtrans( transb, b );
+ //bli_printm( "c_save = [", &c_save[0], "%f", "];" );
+
// Repeat the experiment n_repeats times and record results.
for ( i = 0; i < n_repeats; ++i )
{
#ifdef BLIS_ENABLE_MULTITHREAD_TEST
// Need only one call to initialize the CBLAS OpenCL kernel
bli_copym( &c_save[0], &c[0] );
-
libblis_test_gemm_impl( iface, &alpha, &a, &b, &beta, &c[0] );
//but need to re-initialize C for each of iteration of n_repeats
#else
bli_copym( &c_save, &c );
-
libblis_test_gemm_impl( iface, &alpha, &a, &b, &beta, &c );
-
bli_copym( &c_save, &c );
#endif
-
time = bli_clock();
// bli_printm( "a = [", &a, "%f", "];" );
diff --git a/blis/version b/blis/version
index 4e632fe6172cd89b557687a532f9fde2f8241660..e62876a936f315312b542b0dff7047cc67c7e8e1 100644 (file)
--- a/blis/version
+++ b/blis/version
-DEV.LINALG.01.00.00.01
+DEV.LINALG.01.02.00.00-6
diff --git a/blis/windows/Makefile b/blis/windows/Makefile
index 6e8c1e0f0912dc5988d9952214a211ead8aafb36..b5c211c9938f166efa9df7c03bbf4334319c365c 100644 (file)
--- a/blis/windows/Makefile
+++ b/blis/windows/Makefile
-#
-#
-# BLIS
-# An object-based framework for developing high-performance BLAS-like
-# libraries.
-#
-# Copyright (C) 2014, The University of Texas at Austin
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are
-# met:
-# - Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-# - Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in the
-# documentation and/or other materials provided with the distribution.
-# - Neither the name of The University of Texas at Austin nor the names
-# of its contributors may be used to endorse or promote products
-# derived from this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#
-
-
-
-#
-# --- Include variables determined at configure-time --------------------------
-#
-CONFIGURE_DEFS = config\config.mk
-
-!if exist ( $(CONFIGURE_DEFS) )
-!include $(CONFIGURE_DEFS)
-!else
-!error nmake: $(CONFIGURE_DEFS) does not exist! Run configure.cmd first.
-!endif
-
-
-
-#
-# --- Include environment- and build-specific definitions ----------------------
-#
-
-MAKE_DEFS = build\defs.mk
-
-# Include build definitions
-!if exist ( $(MAKE_DEFS) )
-!include $(MAKE_DEFS)
-!else
-!error nmake: $(MAKE_DEFS) does not exist! Your libblis distribution may be incomplete.
-!endif
-
-
-
-#
-# --- Variable modifications ---------------------------------------------------
-#
-
-
-
-#
-# --- High-level rules ---------------------------------------------------------
-#
-
-all: libblis
-
-libblis: libblis-lib
-
-libblis-objs: $(BLIS_OBJS)
-
-libblis-lib: $(LIB_LIBBLIS_DIRPATH)\$(LIBBLIS_LIB)
-
-libblis-dll: $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS_DLL)
-
-lib: libblis-lib
-
-dll: libblis-dll
-
-install: install-lib install-headers
-
-install-lib: $(INSTALL_PREFIX_LIB)\$(LIBBLIS).lib
-
-install-dll: $(INSTALL_PREFIX_DLL)\$(LIBBLIS).dll \
- $(INSTALL_PREFIX_DLL)\$(LIBBLIS).lib \
- $(INSTALL_PREFIX_DLL)\$(LIBBLIS).exp
-
-install-headers: $(INSTALL_PREFIX_INC)\$(BLIS_H)
-
-clean: clean-build clean-log
-
-distclean: clean-config clean-build clean-log
-
-
-
-#
-# --- Source code (inference) rules --------------------------------------------
-#
-
-# --- C source files in flamec directory ---
-{$(SRC_BLI_DIRPATH)}.c{$(OBJ_BLI_DIRPATH)}.obj:
-!ifdef VERBOSE
- if not exist $(OBJ_BLI_DIRPATH) \
- ( $(MKDIR) $(OBJ_BLI_DIRPATH) )
- $(CC) $(CFLAGS) /c $< /Fo$@
-!else
- @if not exist $(OBJ_BLI_DIRPATH) \
- ( ( $(ECHO) nmake: Creating $(OBJ_BLI_DIRPATH) directory ) & \
- ( $(MKDIR) $(OBJ_BLI_DIRPATH) ) )
- @$(ECHO) nmake: Compiling $<
- @$(CC) $(CFLAGS) /c $< /Fo$@ >> $(CC_LOG_FILE)
-!endif
-
-
-
-#
-# --- Library generation rules -------------------------------------------------
-#
-
-# --- Static library ---
-$(LIB_LIBBLIS_DIRPATH)\$(LIBBLIS_LIB): libblis-objs
-!ifdef VERBOSE
- if not exist $(LIB_LIBBLIS_DIRPATH) \
- ( $(MKDIR) $(LIB_LIBBLIS_DIRPATH) )
- $(COPY) $(OBJ_BLI_DIRPATH)\*.obj $(LIB_LIBBLIS_DIRPATH)
- $(CD) $(LIB_LIBBLIS_DIRPATH)
- $(LIB) $(LIB_OPTIONS) $(LIB_BLI_OUTPUT_ARG) $(LIB_BLI_INPUT_ARGS)
- $(DEL) *.obj
- $(CD) $(TOP_BUILD_DIR_ABS)
-!else
- @if not exist $(LIB_LIBBLIS_DIRPATH) \
- ( ( $(ECHO) nmake: Creating $(LIB_LIBBLIS_DIRPATH) directory ) & \
- ( $(MKDIR) $(LIB_LIBBLIS_DIRPATH) ) )
- @$(ECHO) nmake: Creating static library $@
- @$(COPY) $(OBJ_BLI_DIRPATH)\*.obj $(LIB_LIBBLIS_DIRPATH) >> $(COPY_LOG_FILE)
- @$(CD) $(LIB_LIBBLIS_DIRPATH)
- @$(LIB) /VERBOSE $(LIB_OPTIONS) $(LIB_BLI_OUTPUT_ARG) $(LIB_BLI_INPUT_ARGS)
- @$(DEL) *.obj
- @$(CD) $(TOP_BUILD_DIR_ABS)
-!endif
-
-# --- Dynamic library (object code file, import library, and export file) ---
-$(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS_DLL): libblis-objs
-!ifdef VERBOSE
- if not exist $(DLL_LIBBLIS_DIRPATH) \
- ( $(MKDIR) $(DLL_LIBBLIS_DIRPATH) )
- $(COPY) $(OBJ_BLI_DIRPATH)\*.obj $(DLL_LIBBLIS_DIRPATH) >> $(COPY_LOG_FILE)
- $(CD) $(DLL_LIBBLIS_DIRPATH)
- $(DIR) /B *.obj > $(OBJ_LIST_FILE)
- $(GENDLL) $(LIBBLIS) $(LIBBLIS) $(CC) $(LINKARGS_FILEPATH) $(SYM_DEF_FILEPATH) /objlist $(OBJ_LIST_FILE)
- $(DEL) $(OBJ_LIST_FILE)
- $(DEL) *.obj
- $(CD) $(TOP_BUILD_DIR_ABS)
-!else
- @if not exist $(DLL_LIBBLIS_DIRPATH) \
- ( ( $(ECHO) nmake: Creating $(DLL_LIBBLIS_DIRPATH) directory ) & \
- ( $(MKDIR) $(DLL_LIBBLIS_DIRPATH) ) )
- @$(ECHO) nmake: Creating dynamic library $@
- @$(COPY) $(OBJ_BLI_DIRPATH)\*.obj $(DLL_LIBBLIS_DIRPATH) >> $(COPY_LOG_FILE)
- @$(CD) $(DLL_LIBBLIS_DIRPATH)
- @$(DIR) /B *.obj > $(OBJ_LIST_FILE)
- @$(GENDLL) $(LIBBLIS) $(LIBBLIS) $(CC) $(LINKARGS_FILEPATH) $(SYM_DEF_FILEPATH) /objlist $(OBJ_LIST_FILE)
- @$(DEL) $(OBJ_LIST_FILE)
- @$(DEL) *.obj
- @$(CD) $(TOP_BUILD_DIR_ABS)
-!endif
-
-
-
-#
-# --- Install rules ------------------------------------------------------------
-#
-
-# --- Header files ---
-$(INSTALL_PREFIX_INC)\$(BLIS_H): $(INC_BLI_DIRPATH)\$(BLIS_H) \
- $(BUILD_DIRNAME)\$(BLI_CONFIG_H)
-!ifdef VERBOSE
- if not exist $(INSTALL_PREFIX_INC) \
- ( $(MKDIR) $(INSTALL_PREFIX_INC) )
- $(COPY) $(BUILD_DIRNAME)\$(BLI_CONFIG_H) $(INSTALL_PREFIX_INC) >> $(COPY_LOG_FILE)
- $(COPY) $(INC_BLI_DIRPATH)\*.h $(INSTALL_PREFIX_INC) >> $(COPY_LOG_FILE)
-!else
- @if not exist $(INSTALL_PREFIX_INC) \
- ( $(MKDIR) $(INSTALL_PREFIX_INC) )
- @$(ECHO) nmake: Installing libblis header files to $(INSTALL_PREFIX_INC)
- @$(COPY) $(BUILD_DIRNAME)\$(BLI_CONFIG_H) $(INSTALL_PREFIX_INC) >> $(COPY_LOG_FILE)
- @$(COPY) $(INC_BLI_DIRPATH)\*.h $(INSTALL_PREFIX_INC) >> $(COPY_LOG_FILE)
-!endif
-
-# --- Static library ---
-$(INSTALL_PREFIX_LIB)\$(LIBBLIS).lib: $(LIB_LIBBLIS_DIRPATH)\$(LIBBLIS).lib
-!ifdef VERBOSE
- if not exist $(INSTALL_PREFIX_LIB) ( $(MKDIR) $(INSTALL_PREFIX_LIB) )
- if exist $(LIB_LIBBLIS_DIRPATH)\$(LIBBLIS).lib \
- ( $(COPY) $(LIB_LIBBLIS_DIRPATH)\$(LIBBLIS).lib $(INSTALL_PREFIX_LIB) >> $(COPY_LOG_FILE) )
-!else
- @if not exist $(INSTALL_PREFIX_LIB) ( $(MKDIR) $(INSTALL_PREFIX_LIB) )
- @if exist $(LIB_LIBBLIS_DIRPATH)\$(LIBBLIS).lib \
- ( ( $(ECHO) nmake: Installing $(LIB_LIBBLIS_DIRPATH)\$(LIBBLIS).lib to $(INSTALL_PREFIX_LIB) ) & \
- ( $(COPY) $(LIB_LIBBLIS_DIRPATH)\$(LIBBLIS).lib $(INSTALL_PREFIX_LIB) >> $(COPY_LOG_FILE) ) )
-!endif
-
-# --- Dynamic library (object code) ---
-$(INSTALL_PREFIX_DLL)\$(LIBBLIS).dll: $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).dll
-!ifdef VERBOSE
- if not exist $(INSTALL_PREFIX_DLL) ( $(MKDIR) $(INSTALL_PREFIX_DLL) )
- if exist $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).dll \
- ( $(COPY) $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).dll $(INSTALL_PREFIX_DLL) >> $(COPY_LOG_FILE) )
-!else
- @if not exist $(INSTALL_PREFIX_DLL) ( $(MKDIR) $(INSTALL_PREFIX_DLL) )
- @if exist $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).dll \
- ( ( $(ECHO) nmake: Installing $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).dll to $(INSTALL_PREFIX_DLL) ) & \
- ( $(COPY) $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).dll $(INSTALL_PREFIX_DLL) >> $(COPY_LOG_FILE) ) )
-!endif
-
-# --- Dynamic library (import library) ---
-$(INSTALL_PREFIX_DLL)\$(LIBBLIS).lib: $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).lib
-!ifdef VERBOSE
- if not exist $(INSTALL_PREFIX_DLL) ( $(MKDIR) $(INSTALL_PREFIX_DLL) )
- if exist $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).lib \
- ( $(COPY) $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).lib $(INSTALL_PREFIX_DLL) >> $(COPY_LOG_FILE) )
-!else
- @if not exist $(INSTALL_PREFIX_DLL) ( $(MKDIR) $(INSTALL_PREFIX_DLL) )
- @if exist $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).lib \
- ( ( $(ECHO) nmake: Installing $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).lib to $(INSTALL_PREFIX_DLL) ) & \
- ( $(COPY) $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).lib $(INSTALL_PREFIX_DLL) >> $(COPY_LOG_FILE) ) )
-!endif
-
-# --- Dynamic library (export file) ---
-$(INSTALL_PREFIX_DLL)\$(LIBBLIS).exp: $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).exp
-!ifdef VERBOSE
- if not exist $(INSTALL_PREFIX_DLL) ( $(MKDIR) $(INSTALL_PREFIX_DLL) )
- if exist $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).exp \
- ( $(COPY) $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).exp $(INSTALL_PREFIX_DLL) >> $(COPY_LOG_FILE) )
-!else
- @if not exist $(INSTALL_PREFIX_DLL) ( $(MKDIR) $(INSTALL_PREFIX_DLL) )
- @if exist $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).exp \
- ( ( $(ECHO) nmake: Installing $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).exp to $(INSTALL_PREFIX_DLL) ) & \
- ( $(COPY) $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).exp $(INSTALL_PREFIX_DLL) >> $(COPY_LOG_FILE) ) )
-!endif
-
-
-
-#
-# --- Clean rules --------------------------------------------------------------
-#
-
-clean-log:
-!ifdef VERBOSE
- if exist $(CC_LOG_FILE) \
- ( $(DEL) $(CC_LOG_FILE) )
- if exist $(FC_LOG_FILE) \
- ( $(DEL) $(FC_LOG_FILE) )
- if exist $(COPY_LOG_FILE) \
- ( $(DEL) $(COPY_LOG_FILE) )
-!else
- @if exist $(CC_LOG_FILE) \
- ( ( $(ECHO) nmake: Deleting $(CC_LOG_FILE) ) & \
- ( $(DEL) $(CC_LOG_FILE) ) )
- @if exist $(FC_LOG_FILE) \
- ( ( $(ECHO) nmake: Deleting $(FC_LOG_FILE) ) & \
- ( $(DEL) $(FC_LOG_FILE) ) )
- @if exist $(COPY_LOG_FILE) \
- ( ( $(ECHO) nmake: Deleting $(COPY_LOG_FILE) ) & \
- ( $(DEL) $(COPY_LOG_FILE) ) )
-!endif
-
-clean-config:
-!ifdef VERBOSE
- if exist $(CNF_DIRNAME) \
- ( $(RMDIR) $(CNF_DIRNAME) )
- if exist $(INC_DIRNAME) \
- ( $(RMDIR) $(INC_DIRNAME) )
- if exist $(SRC_DIRNAME) \
- ( $(RMDIR) $(SRC_DIRNAME) )
-!else
- @if exist $(CNF_DIRNAME) \
- ( ( $(ECHO) nmake: Deleting $(CNF_DIRNAME) directory ) & \
- ( $(RMDIR) $(CNF_DIRNAME) ) )
- @if exist $(INC_DIRNAME) \
- ( ( $(ECHO) nmake: Deleting $(INC_DIRNAME) directory ) & \
- ( $(RMDIR) $(INC_DIRNAME) ) )
- @if exist $(SRC_DIRNAME) \
- ( ( $(ECHO) nmake: Deleting $(SRC_DIRNAME) directory ) & \
- ( $(RMDIR) $(SRC_DIRNAME) ) )
-!endif
-
-clean-build:
-!ifdef VERBOSE
- if exist $(OBJ_DIRNAME) \
- ( $(RMDIR) $(OBJ_DIRNAME) )
- if exist $(LIB_DIRNAME) \
- ( $(RMDIR) $(LIB_DIRNAME) )
- if exist $(DLL_DIRNAME) \
- ( $(RMDIR) $(DLL_DIRNAME) )
-!else
- @if exist $(OBJ_DIRNAME) \
- ( ( $(ECHO) nmake: Deleting $(OBJ_DIRNAME) directory ) & \
- ( $(RMDIR) $(OBJ_DIRNAME) ) )
- @if exist $(LIB_DIRNAME) \
- ( ( $(ECHO) nmake: Deleting $(LIB_DIRNAME) directory ) & \
- ( $(RMDIR) $(LIB_DIRNAME) ) )
- @if exist $(DLL_DIRNAME) \
- ( ( $(ECHO) nmake: Deleting $(DLL_DIRNAME) directory ) & \
- ( $(RMDIR) $(DLL_DIRNAME) ) )
-!endif
-
-# Useful for developing when all we want to do is remove the library products.
-clean-lib:
-!ifdef VERBOSE
- if exist $(LIB_DIRNAME) \
- ( $(RMDIR) $(LIB_DIRNAME) )
- if exist $(DLL_DIRNAME) \
- ( $(RMDIR) $(DLL_DIRNAME) )
-!else
- @if exist $(LIB_DIRNAME) \
- ( ( $(ECHO) nmake: Deleting $(LIB_DIRNAME) directory ) & \
- ( $(RMDIR) $(LIB_DIRNAME) ) )
- @if exist $(DLL_DIRNAME) \
- ( ( $(ECHO) nmake: Deleting $(DLL_DIRNAME) directory ) & \
- ( $(RMDIR) $(DLL_DIRNAME) ) )
-!endif
-
-
-
-#
-# --- Help target --------------------------------------------------------------
-#
-
-help:
- @$(NMAKE_HELP)
-
+#\r
+#\r
+# BLIS \r
+# An object-based framework for developing high-performance BLAS-like\r
+# libraries.\r
+#\r
+# Copyright (C) 2014, The University of Texas at Austin\r
+#\r
+# Redistribution and use in source and binary forms, with or without\r
+# modification, are permitted provided that the following conditions are\r
+# met:\r
+# - Redistributions of source code must retain the above copyright\r
+# notice, this list of conditions and the following disclaimer.\r
+# - Redistributions in binary form must reproduce the above copyright\r
+# notice, this list of conditions and the following disclaimer in the\r
+# documentation and/or other materials provided with the distribution.\r
+# - Neither the name of The University of Texas at Austin nor the names\r
+# of its contributors may be used to endorse or promote products\r
+# derived from this software without specific prior written permission.\r
+#\r
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\r
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\r
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\r
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT\r
+# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\r
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\r
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\r
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\r
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\r
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\r
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
+#\r
+#\r
+\r
+\r
+\r
+#\r
+# --- Include variables determined at configure-time --------------------------\r
+#\r
+CONFIGURE_DEFS = config\config.mk\r
+\r
+!if exist ( $(CONFIGURE_DEFS) )\r
+!include $(CONFIGURE_DEFS)\r
+!else\r
+!error nmake: $(CONFIGURE_DEFS) does not exist! Run configure.cmd first.\r
+!endif\r
+\r
+\r
+\r
+#\r
+# --- Include environment- and build-specific definitions ----------------------\r
+#\r
+\r
+MAKE_DEFS = build\defs.mk\r
+\r
+# Include build definitions\r
+!if exist ( $(MAKE_DEFS) )\r
+!include $(MAKE_DEFS)\r
+!else\r
+!error nmake: $(MAKE_DEFS) does not exist! Your libblis distribution may be incomplete.\r
+!endif\r
+\r
+\r
+\r
+#\r
+# --- Variable modifications ---------------------------------------------------\r
+#\r
+\r
+\r
+\r
+#\r
+# --- High-level rules ---------------------------------------------------------\r
+#\r
+\r
+all: libblis\r
+\r
+libblis: libblis-lib\r
+\r
+libblis-objs: $(BLIS_OBJS)\r
+\r
+libblis-lib: $(LIB_LIBBLIS_DIRPATH)\$(LIBBLIS_LIB)\r
+\r
+libblis-dll: $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS_DLL)\r
+\r
+lib: libblis-lib\r
+\r
+dll: libblis-dll\r
+\r
+install: install-lib install-headers\r
+\r
+install-lib: $(INSTALL_PREFIX_LIB)\$(LIBBLIS).lib\r
+\r
+install-dll: $(INSTALL_PREFIX_DLL)\$(LIBBLIS).dll \\r
+ $(INSTALL_PREFIX_DLL)\$(LIBBLIS).lib \\r
+ $(INSTALL_PREFIX_DLL)\$(LIBBLIS).exp\r
+\r
+install-headers: $(INSTALL_PREFIX_INC)\$(BLIS_H)\r
+\r
+clean: clean-build clean-log\r
+\r
+distclean: clean-config clean-build clean-log\r
+\r
+\r
+\r
+#\r
+# --- Source code (inference) rules --------------------------------------------\r
+#\r
+\r
+# --- C source files in flamec directory ---\r
+{$(SRC_BLI_DIRPATH)}.c{$(OBJ_BLI_DIRPATH)}.obj:\r
+!ifdef VERBOSE\r
+ if not exist $(OBJ_BLI_DIRPATH) \\r
+ ( $(MKDIR) $(OBJ_BLI_DIRPATH) )\r
+ $(CC) $(CFLAGS) /c $< /Fo$@\r
+!else\r
+ @if not exist $(OBJ_BLI_DIRPATH) \\r
+ ( ( $(ECHO) nmake: Creating $(OBJ_BLI_DIRPATH) directory ) & \\r
+ ( $(MKDIR) $(OBJ_BLI_DIRPATH) ) )\r
+ @$(ECHO) nmake: Compiling $<\r
+ @$(CC) $(CFLAGS) /c $< /Fo$@ >> $(CC_LOG_FILE)\r
+!endif\r
+\r
+\r
+\r
+#\r
+# --- Library generation rules -------------------------------------------------\r
+#\r
+\r
+# --- Static library ---\r
+$(LIB_LIBBLIS_DIRPATH)\$(LIBBLIS_LIB): libblis-objs\r
+!ifdef VERBOSE\r
+ if not exist $(LIB_LIBBLIS_DIRPATH) \\r
+ ( $(MKDIR) $(LIB_LIBBLIS_DIRPATH) )\r
+ $(COPY) $(OBJ_BLI_DIRPATH)\*.obj $(LIB_LIBBLIS_DIRPATH)\r
+ $(CD) $(LIB_LIBBLIS_DIRPATH)\r
+ $(LIB) $(LIB_OPTIONS) $(LIB_BLI_OUTPUT_ARG) $(LIB_BLI_INPUT_ARGS)\r
+ $(DEL) *.obj\r
+ $(CD) $(TOP_BUILD_DIR_ABS)\r
+!else\r
+ @if not exist $(LIB_LIBBLIS_DIRPATH) \\r
+ ( ( $(ECHO) nmake: Creating $(LIB_LIBBLIS_DIRPATH) directory ) & \\r
+ ( $(MKDIR) $(LIB_LIBBLIS_DIRPATH) ) )\r
+ @$(ECHO) nmake: Creating static library $@\r
+ @$(COPY) $(OBJ_BLI_DIRPATH)\*.obj $(LIB_LIBBLIS_DIRPATH) >> $(COPY_LOG_FILE)\r
+ @$(CD) $(LIB_LIBBLIS_DIRPATH)\r
+ @$(LIB) /VERBOSE $(LIB_OPTIONS) $(LIB_BLI_OUTPUT_ARG) $(LIB_BLI_INPUT_ARGS)\r
+ @$(DEL) *.obj\r
+ @$(CD) $(TOP_BUILD_DIR_ABS)\r
+!endif\r
+\r
+# --- Dynamic library (object code file, import library, and export file) ---\r
+# Builds the DLL from the compiled objects: creates $(DLL_LIBBLIS_DIRPATH) if\r
+# it does not exist, copies every .obj file from $(OBJ_BLI_DIRPATH) into it,\r
+# changes into that directory, writes the output of "$(DIR) /B *.obj" to\r
+# $(OBJ_LIST_FILE), and invokes $(GENDLL) with the library name (passed twice),\r
+# the compiler, the link-arguments file, the symbol definition file, and the\r
+# object list. Afterwards the object list and the copied .obj files are deleted\r
+# and the recipe changes back to $(TOP_BUILD_DIR_ABS).\r
+# Unlike the static library rule, the VERBOSE branch here still appends the\r
+# copy output to $(COPY_LOG_FILE).\r
+# NOTE(review): the recipe relies on the directory change made by $(CD) still\r
+# being in effect for the command lines that follow it -- confirm for the nmake\r
+# version in use.\r
+$(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS_DLL): libblis-objs\r
+!ifdef VERBOSE\r
+ if not exist $(DLL_LIBBLIS_DIRPATH) \\r
+ ( $(MKDIR) $(DLL_LIBBLIS_DIRPATH) )\r
+ $(COPY) $(OBJ_BLI_DIRPATH)\*.obj $(DLL_LIBBLIS_DIRPATH) >> $(COPY_LOG_FILE)\r
+ $(CD) $(DLL_LIBBLIS_DIRPATH)\r
+ $(DIR) /B *.obj > $(OBJ_LIST_FILE)\r
+ $(GENDLL) $(LIBBLIS) $(LIBBLIS) $(CC) $(LINKARGS_FILEPATH) $(SYM_DEF_FILEPATH) /objlist $(OBJ_LIST_FILE)\r
+ $(DEL) $(OBJ_LIST_FILE)\r
+ $(DEL) *.obj\r
+ $(CD) $(TOP_BUILD_DIR_ABS)\r
+!else\r
+ @if not exist $(DLL_LIBBLIS_DIRPATH) \\r
+ ( ( $(ECHO) nmake: Creating $(DLL_LIBBLIS_DIRPATH) directory ) & \\r
+ ( $(MKDIR) $(DLL_LIBBLIS_DIRPATH) ) )\r
+ @$(ECHO) nmake: Creating dynamic library $@\r
+ @$(COPY) $(OBJ_BLI_DIRPATH)\*.obj $(DLL_LIBBLIS_DIRPATH) >> $(COPY_LOG_FILE)\r
+ @$(CD) $(DLL_LIBBLIS_DIRPATH)\r
+ @$(DIR) /B *.obj > $(OBJ_LIST_FILE)\r
+ @$(GENDLL) $(LIBBLIS) $(LIBBLIS) $(CC) $(LINKARGS_FILEPATH) $(SYM_DEF_FILEPATH) /objlist $(OBJ_LIST_FILE)\r
+ @$(DEL) $(OBJ_LIST_FILE)\r
+ @$(DEL) *.obj\r
+ @$(CD) $(TOP_BUILD_DIR_ABS)\r
+!endif\r
+\r
+\r
+\r
+#\r
+# --- Install rules ------------------------------------------------------------\r
+#\r
+\r
+# --- Header files ---\r
+# Installs the headers into $(INSTALL_PREFIX_INC), creating that directory if\r
+# it does not exist. Although the target names only $(BLIS_H), the recipe\r
+# copies $(BLI_CONFIG_H) from $(BUILD_DIRNAME) and every .h file from\r
+# $(INC_BLI_DIRPATH). Copy output is appended to $(COPY_LOG_FILE) in both the\r
+# VERBOSE and non-VERBOSE branches.\r
+# NOTE(review): no definition of BLI_CONFIG_H is visible alongside BLIS_H --\r
+# confirm it is defined before this rule is used.\r
+$(INSTALL_PREFIX_INC)\$(BLIS_H): $(INC_BLI_DIRPATH)\$(BLIS_H) \\r
+ $(BUILD_DIRNAME)\$(BLI_CONFIG_H)\r
+!ifdef VERBOSE\r
+ if not exist $(INSTALL_PREFIX_INC) \\r
+ ( $(MKDIR) $(INSTALL_PREFIX_INC) )\r
+ $(COPY) $(BUILD_DIRNAME)\$(BLI_CONFIG_H) $(INSTALL_PREFIX_INC) >> $(COPY_LOG_FILE)\r
+ $(COPY) $(INC_BLI_DIRPATH)\*.h $(INSTALL_PREFIX_INC) >> $(COPY_LOG_FILE)\r
+!else\r
+ @if not exist $(INSTALL_PREFIX_INC) \\r
+ ( $(MKDIR) $(INSTALL_PREFIX_INC) )\r
+ @$(ECHO) nmake: Installing libblis header files to $(INSTALL_PREFIX_INC)\r
+ @$(COPY) $(BUILD_DIRNAME)\$(BLI_CONFIG_H) $(INSTALL_PREFIX_INC) >> $(COPY_LOG_FILE)\r
+ @$(COPY) $(INC_BLI_DIRPATH)\*.h $(INSTALL_PREFIX_INC) >> $(COPY_LOG_FILE)\r
+!endif\r
+\r
+# --- Static library ---\r
+# Installs the static library: creates $(INSTALL_PREFIX_LIB) if it does not\r
+# exist, then copies $(LIBBLIS).lib from $(LIB_LIBBLIS_DIRPATH) into it. The\r
+# copy is guarded by "if exist", so a missing library is skipped silently\r
+# rather than reported as an error. Copy output is appended to\r
+# $(COPY_LOG_FILE) in both branches.\r
+$(INSTALL_PREFIX_LIB)\$(LIBBLIS).lib: $(LIB_LIBBLIS_DIRPATH)\$(LIBBLIS).lib\r
+!ifdef VERBOSE\r
+ if not exist $(INSTALL_PREFIX_LIB) ( $(MKDIR) $(INSTALL_PREFIX_LIB) )\r
+ if exist $(LIB_LIBBLIS_DIRPATH)\$(LIBBLIS).lib \\r
+ ( $(COPY) $(LIB_LIBBLIS_DIRPATH)\$(LIBBLIS).lib $(INSTALL_PREFIX_LIB) >> $(COPY_LOG_FILE) )\r
+!else\r
+ @if not exist $(INSTALL_PREFIX_LIB) ( $(MKDIR) $(INSTALL_PREFIX_LIB) )\r
+ @if exist $(LIB_LIBBLIS_DIRPATH)\$(LIBBLIS).lib \\r
+ ( ( $(ECHO) nmake: Installing $(LIB_LIBBLIS_DIRPATH)\$(LIBBLIS).lib to $(INSTALL_PREFIX_LIB) ) & \\r
+ ( $(COPY) $(LIB_LIBBLIS_DIRPATH)\$(LIBBLIS).lib $(INSTALL_PREFIX_LIB) >> $(COPY_LOG_FILE) ) )\r
+!endif\r
+\r
+# --- Dynamic library (object code) ---\r
+# Installs the DLL: creates $(INSTALL_PREFIX_DLL) if it does not exist, then\r
+# copies $(LIBBLIS).dll from $(DLL_LIBBLIS_DIRPATH) into it. The copy is\r
+# guarded by "if exist", so a missing file is skipped silently. Copy output is\r
+# appended to $(COPY_LOG_FILE) in both branches.\r
+$(INSTALL_PREFIX_DLL)\$(LIBBLIS).dll: $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).dll\r
+!ifdef VERBOSE\r
+ if not exist $(INSTALL_PREFIX_DLL) ( $(MKDIR) $(INSTALL_PREFIX_DLL) )\r
+ if exist $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).dll \\r
+ ( $(COPY) $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).dll $(INSTALL_PREFIX_DLL) >> $(COPY_LOG_FILE) )\r
+!else\r
+ @if not exist $(INSTALL_PREFIX_DLL) ( $(MKDIR) $(INSTALL_PREFIX_DLL) )\r
+ @if exist $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).dll \\r
+ ( ( $(ECHO) nmake: Installing $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).dll to $(INSTALL_PREFIX_DLL) ) & \\r
+ ( $(COPY) $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).dll $(INSTALL_PREFIX_DLL) >> $(COPY_LOG_FILE) ) )\r
+!endif\r
+\r
+# --- Dynamic library (import library) ---\r
+# Installs the DLL's import library: creates $(INSTALL_PREFIX_DLL) if it does\r
+# not exist, then copies $(LIBBLIS).lib from $(DLL_LIBBLIS_DIRPATH) into it.\r
+# The copy is guarded by "if exist", so a missing file is skipped silently.\r
+# Copy output is appended to $(COPY_LOG_FILE) in both branches.\r
+$(INSTALL_PREFIX_DLL)\$(LIBBLIS).lib: $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).lib\r
+!ifdef VERBOSE\r
+ if not exist $(INSTALL_PREFIX_DLL) ( $(MKDIR) $(INSTALL_PREFIX_DLL) )\r
+ if exist $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).lib \\r
+ ( $(COPY) $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).lib $(INSTALL_PREFIX_DLL) >> $(COPY_LOG_FILE) )\r
+!else\r
+ @if not exist $(INSTALL_PREFIX_DLL) ( $(MKDIR) $(INSTALL_PREFIX_DLL) )\r
+ @if exist $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).lib \\r
+ ( ( $(ECHO) nmake: Installing $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).lib to $(INSTALL_PREFIX_DLL) ) & \\r
+ ( $(COPY) $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).lib $(INSTALL_PREFIX_DLL) >> $(COPY_LOG_FILE) ) )\r
+!endif\r
+\r
+# --- Dynamic library (export file) ---\r
+# Installs the DLL's export file: creates $(INSTALL_PREFIX_DLL) if it does not\r
+# exist, then copies $(LIBBLIS).exp from $(DLL_LIBBLIS_DIRPATH) into it. The\r
+# copy is guarded by "if exist", so a missing file is skipped silently. Copy\r
+# output is appended to $(COPY_LOG_FILE) in both branches.\r
+$(INSTALL_PREFIX_DLL)\$(LIBBLIS).exp: $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).exp\r
+!ifdef VERBOSE\r
+ if not exist $(INSTALL_PREFIX_DLL) ( $(MKDIR) $(INSTALL_PREFIX_DLL) )\r
+ if exist $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).exp \\r
+ ( $(COPY) $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).exp $(INSTALL_PREFIX_DLL) >> $(COPY_LOG_FILE) )\r
+!else\r
+ @if not exist $(INSTALL_PREFIX_DLL) ( $(MKDIR) $(INSTALL_PREFIX_DLL) )\r
+ @if exist $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).exp \\r
+ ( ( $(ECHO) nmake: Installing $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).exp to $(INSTALL_PREFIX_DLL) ) & \\r
+ ( $(COPY) $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).exp $(INSTALL_PREFIX_DLL) >> $(COPY_LOG_FILE) ) )\r
+!endif\r
+\r
+\r
+\r
+#\r
+# --- Clean rules --------------------------------------------------------------\r
+#\r
+\r
+# Deletes the compiler, Fortran, and copy log files ($(CC_LOG_FILE),\r
+# $(FC_LOG_FILE), $(COPY_LOG_FILE)), each only if it exists. Without VERBOSE,\r
+# the commands are silenced and a "Deleting ..." message is printed per file.\r
+clean-log:\r
+!ifdef VERBOSE\r
+ if exist $(CC_LOG_FILE) \\r
+ ( $(DEL) $(CC_LOG_FILE) )\r
+ if exist $(FC_LOG_FILE) \\r
+ ( $(DEL) $(FC_LOG_FILE) )\r
+ if exist $(COPY_LOG_FILE) \\r
+ ( $(DEL) $(COPY_LOG_FILE) )\r
+!else\r
+ @if exist $(CC_LOG_FILE) \\r
+ ( ( $(ECHO) nmake: Deleting $(CC_LOG_FILE) ) & \\r
+ ( $(DEL) $(CC_LOG_FILE) ) )\r
+ @if exist $(FC_LOG_FILE) \\r
+ ( ( $(ECHO) nmake: Deleting $(FC_LOG_FILE) ) & \\r
+ ( $(DEL) $(FC_LOG_FILE) ) )\r
+ @if exist $(COPY_LOG_FILE) \\r
+ ( ( $(ECHO) nmake: Deleting $(COPY_LOG_FILE) ) & \\r
+ ( $(DEL) $(COPY_LOG_FILE) ) )\r
+!endif\r
+\r
+# Removes the $(CNF_DIRNAME), $(INC_DIRNAME), and $(SRC_DIRNAME) directories\r
+# with $(RMDIR), each only if it exists. Without VERBOSE, the commands are\r
+# silenced and a "Deleting ... directory" message is printed per directory.\r
+clean-config:\r
+!ifdef VERBOSE\r
+ if exist $(CNF_DIRNAME) \\r
+ ( $(RMDIR) $(CNF_DIRNAME) )\r
+ if exist $(INC_DIRNAME) \\r
+ ( $(RMDIR) $(INC_DIRNAME) )\r
+ if exist $(SRC_DIRNAME) \\r
+ ( $(RMDIR) $(SRC_DIRNAME) )\r
+!else\r
+ @if exist $(CNF_DIRNAME) \\r
+ ( ( $(ECHO) nmake: Deleting $(CNF_DIRNAME) directory ) & \\r
+ ( $(RMDIR) $(CNF_DIRNAME) ) )\r
+ @if exist $(INC_DIRNAME) \\r
+ ( ( $(ECHO) nmake: Deleting $(INC_DIRNAME) directory ) & \\r
+ ( $(RMDIR) $(INC_DIRNAME) ) )\r
+ @if exist $(SRC_DIRNAME) \\r
+ ( ( $(ECHO) nmake: Deleting $(SRC_DIRNAME) directory ) & \\r
+ ( $(RMDIR) $(SRC_DIRNAME) ) )\r
+!endif\r
+\r
+# Removes the $(OBJ_DIRNAME), $(LIB_DIRNAME), and $(DLL_DIRNAME) directories\r
+# with $(RMDIR), each only if it exists. Without VERBOSE, the commands are\r
+# silenced and a "Deleting ... directory" message is printed per directory.\r
+clean-build:\r
+!ifdef VERBOSE\r
+ if exist $(OBJ_DIRNAME) \\r
+ ( $(RMDIR) $(OBJ_DIRNAME) )\r
+ if exist $(LIB_DIRNAME) \\r
+ ( $(RMDIR) $(LIB_DIRNAME) )\r
+ if exist $(DLL_DIRNAME) \\r
+ ( $(RMDIR) $(DLL_DIRNAME) )\r
+!else\r
+ @if exist $(OBJ_DIRNAME) \\r
+ ( ( $(ECHO) nmake: Deleting $(OBJ_DIRNAME) directory ) & \\r
+ ( $(RMDIR) $(OBJ_DIRNAME) ) )\r
+ @if exist $(LIB_DIRNAME) \\r
+ ( ( $(ECHO) nmake: Deleting $(LIB_DIRNAME) directory ) & \\r
+ ( $(RMDIR) $(LIB_DIRNAME) ) )\r
+ @if exist $(DLL_DIRNAME) \\r
+ ( ( $(ECHO) nmake: Deleting $(DLL_DIRNAME) directory ) & \\r
+ ( $(RMDIR) $(DLL_DIRNAME) ) )\r
+!endif\r
+\r
+# Useful for developing when all we want to do is remove the library products.\r
+# Removes only the $(LIB_DIRNAME) and $(DLL_DIRNAME) directories (each only if\r
+# it exists); unlike clean-build, $(OBJ_DIRNAME) is left in place.\r
+clean-lib:\r
+!ifdef VERBOSE\r
+ if exist $(LIB_DIRNAME) \\r
+ ( $(RMDIR) $(LIB_DIRNAME) )\r
+ if exist $(DLL_DIRNAME) \\r
+ ( $(RMDIR) $(DLL_DIRNAME) )\r
+!else\r
+ @if exist $(LIB_DIRNAME) \\r
+ ( ( $(ECHO) nmake: Deleting $(LIB_DIRNAME) directory ) & \\r
+ ( $(RMDIR) $(LIB_DIRNAME) ) )\r
+ @if exist $(DLL_DIRNAME) \\r
+ ( ( $(ECHO) nmake: Deleting $(DLL_DIRNAME) directory ) & \\r
+ ( $(RMDIR) $(DLL_DIRNAME) ) )\r
+!endif\r
+\r
+\r
+\r
+#\r
+# --- Help target --------------------------------------------------------------\r
+#\r
+\r
+# Runs the helper script named by $(NMAKE_HELP); the command itself is not\r
+# echoed.\r
+help:\r
+ @$(NMAKE_HELP)\r
+\r
index 4e560a463f7b38dc69ee476a24b93d9888f3d49c..525eee039b03fe13f6e83f80dd2125cb54dcc32a 100644 (file)
-#
-#
-# BLIS
-# An object-based framework for developing high-performance BLAS-like
-# libraries.
-#
-# Copyright (C) 2014, The University of Texas at Austin
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are
-# met:
-# - Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-# - Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in the
-# documentation and/or other materials provided with the distribution.
-# - Neither the name of The University of Texas at Austin nor the names
-# of its contributors may be used to endorse or promote products
-# derived from this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#
-
-#
-# --- Configuration variable definitions ---------------------------------------
-#
-# Environment-related variables:
-# REVISION - The code's revision number.
-# PWD - The path to current working directory.
-# ARCH_STR - A string to identify the requested build architecture.
-# BUILD_STR - A string to identify the requested build type.
-# CCOMPILER_STR - A string to identify the requested C compiler.
-#
-# Target-related variables:
-# FLAMEC_OBJS - List of paths to flamec object files.
-# LAPACK2FLAMEC_OBJS - List of paths to lapack2flamec object files.
-#
-# Note: these variables are not present in the .in template file. Instead, they
-# are appended to the contents of the .in file by a build script and output to
-# a separate file (by the same name, without the .in extension).
-#
+#\r
+#\r
+# BLIS \r
+# An object-based framework for developing high-performance BLAS-like\r
+# libraries.\r
+#\r
+# Copyright (C) 2014, The University of Texas at Austin\r
+#\r
+# Redistribution and use in source and binary forms, with or without\r
+# modification, are permitted provided that the following conditions are\r
+# met:\r
+# - Redistributions of source code must retain the above copyright\r
+# notice, this list of conditions and the following disclaimer.\r
+# - Redistributions in binary form must reproduce the above copyright\r
+# notice, this list of conditions and the following disclaimer in the\r
+# documentation and/or other materials provided with the distribution.\r
+# - Neither the name of The University of Texas at Austin nor the names\r
+# of its contributors may be used to endorse or promote products\r
+# derived from this software without specific prior written permission.\r
+#\r
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\r
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\r
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\r
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT\r
+# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\r
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\r
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\r
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\r
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\r
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\r
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
+#\r
+#\r
+\r
+#\r
+# --- Configuration variable definitions ---------------------------------------\r
+#\r
+# Environment-related variables:\r
+# REVISION - The code's revision number.\r
+# PWD - The path to current working directory.\r
+# ARCH_STR - A string to identify the requested build architecture.\r
+# BUILD_STR - A string to identify the requested build type.\r
+# CCOMPILER_STR - A string to identify the requested C compiler.\r
+#\r
+# Target-related variables:\r
+# FLAMEC_OBJS - List of paths to flamec object files.\r
+# LAPACK2FLAMEC_OBJS - List of paths to lapack2flamec object files.\r
+#\r
+# Note: these variables are not present in the .in template file. Instead, they\r
+# are appended to the contents of the .in file by a build script and output to\r
+# a separate file (by the same name, without the .in extension).\r
+#\r
index af5b69e03cd59a360580f5b6d479dc2ce9a5a68a..2c7775b2114eac6049e0f823bf8e93661f32f5b5 100644 (file)
-#
-#
-# BLIS
-# An object-based framework for developing high-performance BLAS-like
-# libraries.
-#
-# Copyright (C) 2014, The University of Texas at Austin
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are
-# met:
-# - Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-# - Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in the
-# documentation and/or other materials provided with the distribution.
-# - Neither the name of The University of Texas at Austin nor the names
-# of its contributors may be used to endorse or promote products
-# derived from this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#
-
-
-#
-# --- General build system options --------------------------------------------
-#
-
-# Uncomment this for verbose output from nmake.
-# VERBOSE = 1
-
-# Assign this varible to be the full path to the directory to which you would
-# like the BLIS build products to be installed upon running "nmake install".
-# The nmake install target will create the install directory and all requisite
-# subdirectories if they do not already exist (in which case the user must have
-# permission to create these directories).
-INSTALL_PREFIX = c:\field\lib
-
-
-#
-# --- Important build system filenames ----------------------------------------
-#
-
-# DLL link arguments. The contents of this file should be customized when
-# building a dynamically-linked library. The lines of the file should contain
-# linker options, library names, and library paths. Note that the library
-# paths must be declared in the following form:
-#
-# /link /LIBPATH:<path1>
-# /link /LIBPATH:<path2>
-# /link /LIBPATH:<path3>
-#
-# where <path1>, <path2>, and <path3> are library paths to add to the list
-# of paths to search when the linker attempts to locate other libraries
-# listed in the file.
-LINKARGS_FILENAME = linkargs.txt
-LINKARGS_FILEPATH = $(PWD)\$(LINKARGS_FILENAME)
-
-# Various log file names that capture standard output when VERBOSE is undefined.
-CC_LOG_FILE = nmake-cc.log
-FC_LOG_FILE = nmake-fc.log
-COPY_LOG_FILE = nmake-copy.log
-
-
-#
-# --- General name and directory definitions -----------------------------------
-#
-
-# The relative and absolute locations of the top-level Windows build directory.
-# This is the directory in which nmake is run (not the directory named "build").
-TOP_BUILD_DIR_REL = .
-TOP_BUILD_DIR_ABS = $(PWD)
-
-# The revision string.
-REV_STR = r$(REVISION)
-
-# The names of the libraries.
-LIBBLIS_NAME_ONLY = libblis
-LIBBLIS = $(LIBBLIS_NAME_ONLY)-$(ARCH_STR)-$(REV_STR)
-
-# Directories that reside within the top-level Windows directory.
-CNF_DIRNAME = config
-INC_DIRNAME = include
-SRC_DIRNAME = frame
-OBJ_DIRNAME = obj
-LIB_DIRNAME = lib
-DLL_DIRNAME = dll
-
-# Leaves of interest for Windows.
-
-# Relative directory paths to each of the above subdirectories.
-INC_DIRPATH = $(TOP_BUILD_DIR_REL)\$(INC_DIRNAME)
-SRC_DIRPATH = $(TOP_BUILD_DIR_REL)\$(SRC_DIRNAME)
-OBJ_DIRPATH = $(TOP_BUILD_DIR_REL)\$(OBJ_DIRNAME)
-LIB_DIRPATH = $(TOP_BUILD_DIR_REL)\$(LIB_DIRNAME)
-DLL_DIRPATH = $(TOP_BUILD_DIR_REL)\$(DLL_DIRNAME)
-
-# We only have header files for flamec leaves.
-INC_BLI_DIRPATH = $(INC_DIRPATH)
-
-# We have source code for flamec and lapack2flamec leaves.
-SRC_BLI_DIRPATH = $(SRC_DIRPATH)
-
-
-# And we have object file paths corresponding to those source leaves defined
-# above.
-OBJ_BLI_DIRPATH = $(OBJ_DIRPATH)\$(ARCH_STR)\$(BUILD_STR)
-
-# Separate directories into which we'll move object files when we create the
-# static libraries.
-LIB_LIBBLIS_DIRPATH = $(LIB_DIRPATH)\$(ARCH_STR)\$(BUILD_STR)
-
-# Separate directories into which we'll move object files when we create the
-# dynamic libraries.
-DLL_LIBBLIS_DIRPATH = $(DLL_DIRPATH)\$(ARCH_STR)\$(BUILD_STR)
-
-# The install subdirectories.
-INSTALL_PREFIX_LIB = $(INSTALL_PREFIX)\libblis\lib
-INSTALL_PREFIX_DLL = $(INSTALL_PREFIX)\libblis\dll
-INSTALL_PREFIX_INC = $(INSTALL_PREFIX)\libblis\include-$(ARCH_STR)-$(REV_STR)
-
-# Definitions for important header files used in the install-headers rule.
-BUILD_DIRNAME = build
-BLIS_H = blis.h
-
-
-#
-# --- General shell definitions ------------------------------------------------
-#
-
-CD = cd
-DIR = dir
-COPY = copy
-DEL = del /F /Q
-MKDIR = mkdir
-RMDIR = rd /S /Q
-ECHO = echo
-
-
-#
-# --- Helper scripts -----------------------------------------------------------
-#
-
-NMAKE_HELP = .\build\nmake-help.cmd
-
-
-
-#
-# --- Compiler-related definitions ---------------------------------------------
-#
-
-#!include $(VERSION_FILE)
-
-# --- C compiler definitions ---
-
-WINDOWS_BUILD = BLIS_ENABLE_WINDOWS_BUILD
-VERS_STR = 0.0.9
-VERSION = BLIS_VERSION_STRING=\"$(VERS_STR)\"
-
-!if "$(CCOMPILER_STR)"=="icl"
-
-!if "$(BUILD_STR)"=="debug"
-CDEBUG = /Zi
-COPTIM = /Od
-!elseif "$(BUILD_STR)"=="release"
-CDEBUG =
-COPTIM = /Ox
-!endif
-
-CC = icl.exe
-CMISCFLAGS = /nologo
-CLANGFLAGS =
-CPPROCFLAGS = /I.\build /I$(INC_BLI_DIRPATH) /D$(WINDOWS_BUILD) /D$(VERSION)
-CWARNFLAGS = /w
-CDBGFLAGS = $(CDEBUG)
-COPTFLAGS = $(COPTIM)
-CRTIMEFLAGS = /MT
-CMTHREADFLAGS = /Qopenmp
-CFLAGS = $(CMISCFLAGS) $(CLANGFLAGS) $(CPPROCFLAGS) $(CWARNFLAGS) \
- $(CDBGFLAGS) $(COPTFLAGS) $(CRTIMEFLAGS) $(CMTHREADFLAGS)
-
-!elseif "$(CCOMPILER_STR)"=="cl"
-
-!if "$(BUILD_STR)"=="debug"
-CDEBUG = /Zi
-COPTIM = /Od
-!elseif "$(BUILD_STR)"=="release"
-CDEBUG =
-COPTIM = /Ox
-!endif
-
-CC = cl.exe
-CMISCFLAGS = /nologo
-CLANGFLAGS =
-CPPROCFLAGS = /I.\build /I$(INC_BLI_DIRPATH) /D$(WINDOWS_BUILD) /D$(VERSION)
-CWARNFLAGS = /w
-CDBGFLAGS = $(CDEBUG)
-COPTFLAGS = $(COPTIM)
-CRTIMEFLAGS = /MT
-CMTHREADFLAGS = /openmp
-CFLAGS = $(CMISCFLAGS) $(CLANGFLAGS) $(CPPROCFLAGS) $(CWARNFLAGS) \
- $(CDBGFLAGS) $(COPTFLAGS) $(CRTIMEFLAGS) $(CMTHREADFLAGS)
-
-!endif
-
-
-
-#
-# --- Library-related definitions ----------------------------------------------
-#
-
-# --- Static library definitions ---
-
-LIBBLIS_LIB = $(LIBBLIS).lib
-
-LIB = lib
-LIB_OPTIONS = /nologo
-LIB_BLI_OUTPUT_ARG = /out:$(LIBBLIS_LIB)
-LIB_BLI_INPUT_ARGS = *.obj
-
-# --- Dynamic library definitions ---
-
-LIBBLIS_DLL = $(LIBBLIS).dll
-
-GENDLL = $(TOP_BUILD_DIR_ABS)\gendll.cmd
-OBJ_LIST_FILE = libblis-objects.txt
-
-SYM_DEF_FILEPATH = $(TOP_BUILD_DIR_ABS)\$(BUILD_DIRNAME)\libblis-symbols.def
-
+#\r
+#\r
+# BLIS \r
+# An object-based framework for developing high-performance BLAS-like\r
+# libraries.\r
+#\r
+# Copyright (C) 2014, The University of Texas at Austin\r
+#\r
+# Redistribution and use in source and binary forms, with or without\r
+# modification, are permitted provided that the following conditions are\r
+# met:\r
+# - Redistributions of source code must retain the above copyright\r
+# notice, this list of conditions and the following disclaimer.\r
+# - Redistributions in binary form must reproduce the above copyright\r
+# notice, this list of conditions and the following disclaimer in the\r
+# documentation and/or other materials provided with the distribution.\r
+# - Neither the name of The University of Texas at Austin nor the names\r
+# of its contributors may be used to endorse or promote products\r
+# derived from this software without specific prior written permission.\r
+#\r
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\r
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\r
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\r
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT\r
+# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\r
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\r
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\r
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\r
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\r
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\r
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
+#\r
+#\r
+\r
+\r
+#\r
+# --- General build system options --------------------------------------------\r
+#\r
+\r
+# Uncomment this for verbose output from nmake.\r
+# VERBOSE = 1\r
+\r
+# Assign this variable to be the full path to the directory to which you would\r
+# like the BLIS build products to be installed upon running "nmake install".\r
+# The nmake install target will create the install directory and all requisite\r
+# subdirectories if they do not already exist (in which case the user must have\r
+# permission to create these directories).\r
+INSTALL_PREFIX = c:\field\lib\r
+\r
+\r
+#\r
+# --- Important build system filenames ----------------------------------------\r
+#\r
+\r
+# DLL link arguments. The contents of this file should be customized when\r
+# building a dynamically-linked library. The lines of the file should contain\r
+# linker options, library names, and library paths. Note that the library\r
+# paths must be declared in the following form:\r
+#\r
+# /link /LIBPATH:<path1>\r
+# /link /LIBPATH:<path2>\r
+# /link /LIBPATH:<path3>\r
+#\r
+# where <path1>, <path2>, and <path3> are library paths to add to the list\r
+# of paths to search when the linker attempts to locate other libraries\r
+# listed in the file.\r
+LINKARGS_FILENAME = linkargs.txt\r
+LINKARGS_FILEPATH = $(PWD)\$(LINKARGS_FILENAME)\r
+\r
+# Various log file names that capture standard output when VERBOSE is undefined.\r
+CC_LOG_FILE = nmake-cc.log\r
+FC_LOG_FILE = nmake-fc.log\r
+COPY_LOG_FILE = nmake-copy.log\r
+\r
+\r
+#\r
+# --- General name and directory definitions -----------------------------------\r
+#\r
+\r
+# The relative and absolute locations of the top-level Windows build directory.\r
+# This is the directory in which nmake is run (not the directory named "build").\r
+TOP_BUILD_DIR_REL = .\r
+TOP_BUILD_DIR_ABS = $(PWD)\r
+\r
+# The revision string.\r
+REV_STR = r$(REVISION)\r
+\r
+# The names of the libraries.\r
+LIBBLIS_NAME_ONLY = libblis\r
+LIBBLIS = $(LIBBLIS_NAME_ONLY)-$(ARCH_STR)-$(REV_STR)\r
+\r
+# Directories that reside within the top-level Windows directory.\r
+CNF_DIRNAME = config\r
+INC_DIRNAME = include\r
+SRC_DIRNAME = frame\r
+OBJ_DIRNAME = obj\r
+LIB_DIRNAME = lib\r
+DLL_DIRNAME = dll\r
+\r
+# Leaves of interest for Windows.\r
+\r
+# Relative directory paths to each of the above subdirectories.\r
+INC_DIRPATH = $(TOP_BUILD_DIR_REL)\$(INC_DIRNAME)\r
+SRC_DIRPATH = $(TOP_BUILD_DIR_REL)\$(SRC_DIRNAME)\r
+OBJ_DIRPATH = $(TOP_BUILD_DIR_REL)\$(OBJ_DIRNAME)\r
+LIB_DIRPATH = $(TOP_BUILD_DIR_REL)\$(LIB_DIRNAME)\r
+DLL_DIRPATH = $(TOP_BUILD_DIR_REL)\$(DLL_DIRNAME)\r
+\r
+# BLIS header files are taken directly from the include directory.\r
+INC_BLI_DIRPATH = $(INC_DIRPATH)\r
+\r
+# BLIS source files are taken directly from the source directory.\r
+SRC_BLI_DIRPATH = $(SRC_DIRPATH)\r
+\r
+\r
+# And we have object file paths corresponding to those source leaves defined\r
+# above.\r
+OBJ_BLI_DIRPATH = $(OBJ_DIRPATH)\$(ARCH_STR)\$(BUILD_STR)\r
+\r
+# Separate directories into which we'll move object files when we create the\r
+# static libraries.\r
+LIB_LIBBLIS_DIRPATH = $(LIB_DIRPATH)\$(ARCH_STR)\$(BUILD_STR)\r
+\r
+# Separate directories into which we'll move object files when we create the\r
+# dynamic libraries.\r
+DLL_LIBBLIS_DIRPATH = $(DLL_DIRPATH)\$(ARCH_STR)\$(BUILD_STR)\r
+\r
+# The install subdirectories.\r
+INSTALL_PREFIX_LIB = $(INSTALL_PREFIX)\libblis\lib\r
+INSTALL_PREFIX_DLL = $(INSTALL_PREFIX)\libblis\dll\r
+INSTALL_PREFIX_INC = $(INSTALL_PREFIX)\libblis\include-$(ARCH_STR)-$(REV_STR)\r
+\r
+# Definitions for important header files used in the install-headers rule.\r
+BUILD_DIRNAME = build\r
+BLIS_H = blis.h\r
+\r
+\r
+#\r
+# --- General shell definitions ------------------------------------------------\r
+#\r
+\r
+CD = cd\r
+DIR = dir\r
+COPY = copy\r
+DEL = del /F /Q\r
+MKDIR = mkdir\r
+RMDIR = rd /S /Q\r
+ECHO = echo\r
+\r
+\r
+#\r
+# --- Helper scripts -----------------------------------------------------------\r
+#\r
+\r
+NMAKE_HELP = .\build\nmake-help.cmd\r
+\r
+\r
+\r
+#\r
+# --- Compiler-related definitions ---------------------------------------------\r
+#\r
+\r
+#!include $(VERSION_FILE)\r
+\r
+# --- C compiler definitions ---\r
+\r
+# Preprocessor macros handed to the compiler through the /D options in\r
+# CPPROCFLAGS below: WINDOWS_BUILD names a macro to define, and VERSION defines\r
+# BLIS_VERSION_STRING as the quoted string $(VERS_STR).\r
+WINDOWS_BUILD = BLIS_ENABLE_WINDOWS_BUILD\r
+VERS_STR = 0.0.9\r
+VERSION = BLIS_VERSION_STRING=\"$(VERS_STR)\"\r
+\r
+# Select the compiler and its flags from CCOMPILER_STR. The "icl" and "cl"\r
+# branches are identical except for CC and the OpenMP switch in CMTHREADFLAGS.\r
+# If CCOMPILER_STR is neither value, this block sets neither CC nor CFLAGS.\r
+# Within each branch, CDEBUG and COPTIM are assigned only when BUILD_STR is\r
+# "debug" or "release"; for any other value this block leaves them unset.\r
+!if "$(CCOMPILER_STR)"=="icl"\r
+\r
+!if "$(BUILD_STR)"=="debug"\r
+CDEBUG = /Zi\r
+COPTIM = /Od\r
+!elseif "$(BUILD_STR)"=="release"\r
+CDEBUG =\r
+COPTIM = /Ox\r
+!endif\r
+\r
+CC = icl.exe\r
+CMISCFLAGS = /nologo\r
+CLANGFLAGS =\r
+CPPROCFLAGS = /I.\build /I$(INC_BLI_DIRPATH) /D$(WINDOWS_BUILD) /D$(VERSION)\r
+CWARNFLAGS = /w\r
+CDBGFLAGS = $(CDEBUG)\r
+COPTFLAGS = $(COPTIM)\r
+CRTIMEFLAGS = /MT\r
+CMTHREADFLAGS = /Qopenmp\r
+CFLAGS = $(CMISCFLAGS) $(CLANGFLAGS) $(CPPROCFLAGS) $(CWARNFLAGS) \\r
+ $(CDBGFLAGS) $(COPTFLAGS) $(CRTIMEFLAGS) $(CMTHREADFLAGS)\r
+\r
+!elseif "$(CCOMPILER_STR)"=="cl"\r
+\r
+!if "$(BUILD_STR)"=="debug"\r
+CDEBUG = /Zi\r
+COPTIM = /Od\r
+!elseif "$(BUILD_STR)"=="release"\r
+CDEBUG =\r
+COPTIM = /Ox\r
+!endif\r
+\r
+CC = cl.exe\r
+CMISCFLAGS = /nologo\r
+CLANGFLAGS =\r
+CPPROCFLAGS = /I.\build /I$(INC_BLI_DIRPATH) /D$(WINDOWS_BUILD) /D$(VERSION)\r
+CWARNFLAGS = /w\r
+CDBGFLAGS = $(CDEBUG)\r
+COPTFLAGS = $(COPTIM)\r
+CRTIMEFLAGS = /MT\r
+CMTHREADFLAGS = /openmp\r
+CFLAGS = $(CMISCFLAGS) $(CLANGFLAGS) $(CPPROCFLAGS) $(CWARNFLAGS) \\r
+ $(CDBGFLAGS) $(COPTFLAGS) $(CRTIMEFLAGS) $(CMTHREADFLAGS)\r
+\r
+!endif\r
+\r
+\r
+\r
+#\r
+# --- Library-related definitions ----------------------------------------------\r
+#\r
+\r
+# --- Static library definitions ---\r
+\r
+LIBBLIS_LIB = $(LIBBLIS).lib\r
+\r
+# The output name and the *.obj input pattern carry no directory component, so\r
+# they are resolved relative to whatever directory the librarian is run from.\r
+LIB = lib\r
+LIB_OPTIONS = /nologo\r
+LIB_BLI_OUTPUT_ARG = /out:$(LIBBLIS_LIB)\r
+LIB_BLI_INPUT_ARGS = *.obj\r
+\r
+# --- Dynamic library definitions ---\r
+\r
+LIBBLIS_DLL = $(LIBBLIS).dll\r
+\r
+# GENDLL is given as an absolute path rooted at TOP_BUILD_DIR_ABS, while\r
+# OBJ_LIST_FILE is a bare file name relative to the current directory.\r
+GENDLL = $(TOP_BUILD_DIR_ABS)\gendll.cmd\r
+OBJ_LIST_FILE = libblis-objects.txt\r
+\r
+# Absolute path of the symbol definition file inside $(BUILD_DIRNAME).\r
+SYM_DEF_FILEPATH = $(TOP_BUILD_DIR_ABS)\$(BUILD_DIRNAME)\libblis-symbols.def\r
+\r
index ccdd18f6449eca2b9686d1cbc09109dee1610f11..a8230623ed348d0a0edb8e8a3598207f51456469 100644 (file)
-attic
-broken
-old
-other
-temp
-tmp
-test
+attic\r
+broken\r
+old\r
+other\r
+temp\r
+tmp\r
+test\r
index 6b8710a711f3b689885aa5c26c6c06bde348e82b..46f8b9aacc801dea0aa2f79a5e2ecc2219fd5806 100644 (file)
-.git
+.git\r
index 338b3b4b3d4ead60aec077ba31dde59c7bddc8ba..98e115e3f6c8286672a27d7ec86e76b189323235 100644 (file)
-c:h
+c:h\r
index 90e8ddca25820b3d53619c40e0c04bdbf2791fb7..fd953520681022305a39500ab3649522630b68cf 100644 (file)
-EXPORTS
-FLA_TWO
-FLA_ONE
-FLA_ONE_HALF
-FLA_ZERO
-FLA_MINUS_ONE_HALF
-FLA_MINUS_ONE
-FLA_MINUS_TWO
-fla_axpyt_cntl_blas
-fla_copyt_cntl_blas
-fla_gemm_cntl_blas
-fla_hemm_cntl_blas
-fla_herk_cntl_blas
-fla_her2k_cntl_blas
-fla_symm_cntl_blas
-fla_syrk_cntl_blas
-fla_syr2k_cntl_blas
-fla_trmm_cntl_blas
-fla_trsm_cntl_blas
-fla_appiv_cntl_unb
-bli_samax
-bli_damax
-bli_camax
-bli_zamax
-bli_sasum
-bli_dasum
-bli_casum
-bli_zasum
-bli_saxpy
-bli_daxpy
-bli_caxpy
-bli_zaxpy
-bli_saxpymt
-bli_daxpymt
-bli_caxpymt
-bli_zaxpymt
-bli_saxpysmt
-bli_daxpysmt
-bli_caxpysmt
-bli_zaxpysmt
-bli_saxpysv
-bli_daxpysv
-bli_caxpysv
-bli_zaxpysv
-bli_saxpyv
-bli_daxpyv
-bli_caxpyv
-bli_zaxpyv
-bli_cconjm
-bli_zconjm
-bli_cconjmr
-bli_zconjmr
-bli_cconjv
-bli_zconjv
-bli_scopy
-bli_dcopy
-bli_ccopy
-bli_zcopy
-bli_scopymr
-bli_dcopymr
-bli_ccopymr
-bli_zcopymr
-bli_scopymt
-bli_dcopymt
-bli_ccopymt
-bli_zcopymt
-bli_scopyv
-bli_dcopyv
-bli_ccopyv
-bli_zcopyv
-bli_sdot
-bli_ddot
-bli_cdot
-bli_zdot
-bli_sdot2s
-bli_ddot2s
-bli_cdot2s
-bli_zdot2s
-bli_sdots
-bli_ddots
-bli_cdots
-bli_zdots
-bli_sinverts
-bli_dinverts
-bli_cinverts
-bli_zinverts
-bli_sinvscalm
-bli_dinvscalm
-bli_csinvscalm
-bli_cinvscalm
-bli_zdinvscalm
-bli_zinvscalm
-bli_sinvscalv
-bli_dinvscalv
-bli_csinvscalv
-bli_cinvscalv
-bli_zdinvscalv
-bli_zinvscalv
-bli_snrm2
-bli_dnrm2
-bli_cnrm2
-bli_znrm2
-bli_sscal
-bli_dscal
-bli_csscal
-bli_cscal
-bli_zdscal
-bli_zscal
-bli_sscalm
-bli_dscalm
-bli_csscalm
-bli_cscalm
-bli_zdscalm
-bli_zscalm
-bli_sscalmr
-bli_dscalmr
-bli_csscalmr
-bli_cscalmr
-bli_zdscalmr
-bli_zscalmr
-bli_sscalv
-bli_dscalv
-bli_csscalv
-bli_cscalv
-bli_zdscalv
-bli_zscalv
-bli_sswap
-bli_dswap
-bli_cswap
-bli_zswap
-bli_sswapmt
-bli_dswapmt
-bli_cswapmt
-bli_zswapmt
-bli_sgemv
-bli_dgemv
-bli_cgemv
-bli_zgemv
-bli_sger
-bli_dger
-bli_cger
-bli_zger
-bli_chemv
-bli_zhemv
-bli_cher
-bli_zher
-bli_cher2
-bli_zher2
-bli_ssymv
-bli_dsymv
-bli_csymv
-bli_zsymv
-bli_ssyr
-bli_dsyr
-bli_csyr
-bli_zsyr
-bli_ssyr2
-bli_dsyr2
-bli_csyr2
-bli_zsyr2
-bli_strmv
-bli_dtrmv
-bli_ctrmv
-bli_ztrmv
-bli_strsv
-bli_dtrsv
-bli_ctrsv
-bli_ztrsv
-bli_sgemm
-bli_dgemm
-bli_cgemm
-bli_zgemm
-bli_chemm
-bli_zhemm
-bli_cherk
-bli_zherk
-bli_cher2k
-bli_zher2k
-bli_ssymm
-bli_dsymm
-bli_csymm
-bli_zsymm
-bli_ssyrk
-bli_dsyrk
-bli_csyrk
-bli_zsyrk
-bli_ssyr2k
-bli_dsyr2k
-bli_csyr2k
-bli_zsyr2k
-bli_strmm
-bli_dtrmm
-bli_ctrmm
-bli_ztrmm
-bli_strsm
-bli_dtrsm
-bli_ctrsm
-bli_ztrsm
-FLASH_Apply_pivots
-FLASH_Apply_pivots_cntl_init
-FLASH_Apply_pivots_cntl_finalize
-FLASH_Apply_Q_UT
-FLASH_Apply_Q_UT_cntl_init
-FLASH_Apply_Q_UT_cntl_finalize
-FLASH_Apply_Q_UT_inc
-FLASH_Apply_Q_UT_inc_cntl_init
-FLASH_Apply_Q_UT_inc_cntl_finalize
-FLASH_Apply_Q_UT_inc_create_workspace
-FLASH_Apply_Q2_UT
-FLASH_Apply_Q2_UT_cntl_init
-FLASH_Apply_Q2_UT_cntl_finalize
-FLASH_Axpy
-FLASH_Axpyt
-FLASH_Axpyt_cntl_init
-FLASH_Axpyt_cntl_finalize
-FLASH_Axpy_cntl_init
-FLASH_Axpy_cntl_finalize
-FLASH_Axpy_buffer_to_hier
-FLASH_Axpy_hier_to_buffer
-FLASH_Axpy_flat_to_hier
-FLASH_Axpy_hier_to_flat
-FLASH_Axpy_hierarchy
-FLASH_Axpy_hierarchy_r
-FLASH_Chol
-FLASH_Chol_cntl_init
-FLASH_Chol_cntl_finalize
-FLASH_Chol_solve
-FLASH_Copy
-FLASH_Copyt
-FLASH_Copyt_cntl_init
-FLASH_Copyt_cntl_finalize
-FLASH_Copy_cntl_init
-FLASH_Copy_cntl_finalize
-FLASH_Copy_buffer_to_hier
-FLASH_Copy_hier_to_buffer
-FLASH_Copy_flat_to_hier
-FLASH_Copy_hier_to_flat
-FLASH_Copy_hierarchy
-FLASH_Copy_hierarchy_r
-FLASH_FS_incpiv
-FLASH_FS_incpiv_aux1
-FLASH_FS_incpiv_aux2
-FLASH_Gemm
-FLASH_Gemm_cntl_init
-FLASH_Gemm_cntl_finalize
-FLASH_Gemv
-FLASH_Gemv_cntl_init
-FLASH_Gemv_cntl_finalize
-FLASH_Hemm
-FLASH_Hemm_cntl_init
-FLASH_Hemm_cntl_finalize
-FLASH_Her2k
-FLASH_Her2k_cntl_init
-FLASH_Her2k_cntl_finalize
-FLASH_Herk
-FLASH_Herk_cntl_init
-FLASH_Herk_cntl_finalize
-FLASH_LU_find_zero_on_diagonal
-FLASH_LU_incpiv
-FLASH_LU_incpiv_cntl_init
-FLASH_LU_incpiv_cntl_finalize
-FLASH_LU_incpiv_create_hier_matrices
-FLASH_LU_incpiv_determine_alg_blocksize
-FLASH_LU_incpiv_noopt
-FLASH_LU_incpiv_opt1
-FLASH_LU_incpiv_solve
-FLASH_LU_incpiv_var1
-FLASH_LU_incpiv_var2
-FLASH_LU_nopiv
-FLASH_LU_nopiv_cntl_init
-FLASH_LU_nopiv_cntl_finalize
-FLASH_LU_nopiv_solve
-FLASH_LU_piv
-FLASH_LU_piv_cntl_init
-FLASH_LU_piv_cntl_finalize
-FLASH_LU_piv_solve
-FLASH_Max_elemwise_diff
-FLASH_Norm1
-FLASH_Obj_datatype
-FLASH_Obj_depth
-FLASH_Obj_blocksizes
-FLASH_Obj_scalar_length
-FLASH_Obj_scalar_width
-FLASH_Obj_create
-FLASH_Obj_create_ext
-FLASH_Obj_create_without_buffer
-FLASH_Obj_create_without_buffer_ext
-FLASH_Obj_create_helper
-FLASH_Obj_create_hierarchy
-FLASH_Obj_create_conf_to
-FLASH_Obj_create_hier_conf_to_flat
-FLASH_Obj_create_hier_conf_to_flat_ext
-FLASH_Obj_create_flat_conf_to_hier
-FLASH_Obj_create_hier_copy_of_flat
-FLASH_Obj_create_hier_copy_of_flat_ext
-FLASH_Obj_create_flat_copy_of_hier
-FLASH_Obj_free
-FLASH_Obj_free_without_buffer
-FLASH_Obj_free_hierarchy
-FLASH_Obj_extract_buffer
-FLASH_Obj_flatten
-FLASH_Obj_hierarchify
-FLASH_Obj_show
-FLASH_Obj_attach_buffer
-FLASH_Obj_attach_buffer_hierarchy
-FLASH_print_struct
-FLASH_print_struct_helper
-FLASH_Obj_create_diag_panel
-FLASH_Obj_exec
-FLASH_Obj_exec_parallel
-FLASH_Obj_push
-FLASH_Set
-FLASH_Shift_diag
-FLASH_QR_UT_cntl_init
-FLASH_QR_UT_cntl_finalize
-FLASH_QR_UT_inc
-FLASH_QR_UT_inc_cntl_init
-FLASH_QR_UT_inc_cntl_finalize
-FLASH_QR_UT_inc_create_hier_matrices
-FLASH_QR_UT_inc_determine_alg_blocksize
-FLASH_QR_UT_inc_noopt
-FLASH_QR_UT_inc_opt1
-FLASH_QR_UT_inc_solve
-FLASH_QR2_UT
-FLASH_QR2_UT_cntl_init
-FLASH_QR2_UT_cntl_finalize
-FLASH_Queue_begin
-FLASH_Queue_end
-FLASH_Queue_stack_depth
-FLASH_Queue_enable
-FLASH_Queue_disable
-FLASH_Queue_get_enabled
-FLASH_Queue_set_num_threads
-FLASH_Queue_get_num_threads
-FLASH_Queue_init
-FLASH_Queue_finalize
-FLASH_Queue_get_num_tasks
-FLASH_Queue_set_verbose_output
-FLASH_Queue_get_verbose_output
-FLASH_Queue_set_sorting
-FLASH_Queue_get_sorting
-FLASH_Queue_set_caching
-FLASH_Queue_get_caching
-FLASH_Queue_set_work_stealing
-FLASH_Queue_get_work_stealing
-FLASH_Queue_set_data_affinity
-FLASH_Queue_get_data_affinity
-FLASH_Queue_get_total_time
-FLASH_Queue_get_parallel_time
-FLASH_Queue_set_parallel_time
-FLASH_Queue_get_num_blocks
-FLASH_Queue_set_block_size
-FLASH_Queue_get_block_size
-FLASH_Queue_set_cache_size
-FLASH_Queue_get_cache_size
-FLASH_Queue_set_cache_line_size
-FLASH_Queue_get_cache_line_size
-FLASH_Queue_set_cores_per_cache
-FLASH_Queue_get_cores_per_cache
-FLASH_Queue_set_cores_per_queue
-FLASH_Queue_get_cores_per_queue
-FLASH_Queue_reset
-FLASH_Queue_get_head_task
-FLASH_Queue_get_tail_task
-FLASH_Queue_push
-FLASH_Queue_push_input
-FLASH_Queue_push_output
-FLASH_Task_alloc
-FLASH_Task_free
-FLASH_Queue_exec_task
-FLASH_Queue_verbose_output
-FLASH_Queue_exec
-FLASH_Queue_init_tasks
-FLASH_Queue_wait_enqueue
-FLASH_Queue_wait_dequeue
-FLASH_Queue_wait_dequeue_block
-FLASH_Queue_reside_in_cache
-FLASH_Queue_update_cache
-FLASH_Queue_update_cache_block
-FLASH_Queue_prefetch
-FLASH_Queue_prefetch_block
-FLASH_Queue_work_stealing
-FLASH_Queue_exec_parallel
-FLASH_Queue_exec_parallel_function
-FLASH_Task_update_dependencies
-FLASH_Task_update_binding
-FLASH_Task_free_parallel
-FLASH_Random_matrix
-FLASH_Random_spd_matrix
-FLASH_SA_FS
-FLASH_SA_LU
-FLASH_SPDinv
-FLASH_SPDinv_cntl_init
-FLASH_SPDinv_cntl_finalize
-FLASH_Sylv
-FLASH_Sylv_cntl_init
-FLASH_Sylv_cntl_finalize
-FLASH_Symm
-FLASH_Symm_cntl_init
-FLASH_Symm_cntl_finalize
-FLASH_Syr2k
-FLASH_Syr2k_cntl_init
-FLASH_Syr2k_cntl_finalize
-FLASH_Syrk
-FLASH_Syrk_cntl_init
-FLASH_Syrk_cntl_finalize
-FLASH_Triangularize
-FLASH_Trinv
-FLASH_Trinv_cntl_init
-FLASH_Trinv_cntl_finalize
-FLASH_Trmm
-FLASH_Trmm_cntl_init
-FLASH_Trmm_cntl_finalize
-FLASH_Trsm
-FLASH_Trsm_cntl_init
-FLASH_Trsm_cntl_finalize
-FLASH_Trsm_piv
-FLASH_Trsv
-FLASH_Trsv_cntl_init
-FLASH_Trsv_cntl_finalize
-FLASH_Ttmm
-FLASH_Ttmm_cntl_init
-FLASH_Ttmm_cntl_finalize
-FLA_Absolute_square
-FLA_Accum_T_UT
-FLA_Accum_T_UT_fc_blk_var2
-FLA_Accum_T_UT_fc_opt_var1
-FLA_Accum_T_UT_fc_ops_var1
-FLA_Accum_T_UT_fc_opd_var1
-FLA_Accum_T_UT_fc_opc_var1
-FLA_Accum_T_UT_fc_opz_var1
-FLA_Accum_T_UT_fc_unb_var1
-FLA_Accum_T_UT_fr_blk_var2
-FLA_Accum_T_UT_fr_opt_var1
-FLA_Accum_T_UT_fr_ops_var1
-FLA_Accum_T_UT_fr_opd_var1
-FLA_Accum_T_UT_fr_opc_var1
-FLA_Accum_T_UT_fr_opz_var1
-FLA_Accum_T_UT_fr_unb_var1
-FLA_Accum_T_UT_internal
-FLA_Amax
-FLA_Amax_external
-FLA_Apply_H2_UT
-FLA_Apply_H2_UT_internal
-FLA_Apply_H2_UT_lh_opt_var1
-FLA_Apply_H2_UT_lh_ops_var1
-FLA_Apply_H2_UT_lh_opd_var1
-FLA_Apply_H2_UT_lh_opc_var1
-FLA_Apply_H2_UT_lh_opz_var1
-FLA_Apply_H2_UT_lh_unb_var1
-FLA_Apply_H2_UT_rh_opt_var1
-FLA_Apply_H2_UT_rh_ops_var1
-FLA_Apply_H2_UT_rh_opd_var1
-FLA_Apply_H2_UT_rh_opc_var1
-FLA_Apply_H2_UT_rh_opz_var1
-FLA_Apply_H2_UT_rh_unb_var1
-FLA_Apply_H2_UT_rn_opt_var1
-FLA_Apply_H2_UT_rn_ops_var1
-FLA_Apply_H2_UT_rn_opd_var1
-FLA_Apply_H2_UT_rn_opc_var1
-FLA_Apply_H2_UT_rn_opz_var1
-FLA_Apply_H2_UT_rn_unb_var1
-FLA_Apply_pivots
-FLA_Apply_pivots_cntl_init
-FLA_Apply_pivots_cntl_finalize
-FLA_Apply_pivots_internal
-FLA_Apply_pivots_ln
-FLA_Apply_pivots_ln_blk_var1
-FLA_Apply_pivots_ln_blk_var2
-FLA_Apply_pivots_ln_opt_var1
-FLA_Apply_pivots_ln_ops_var1
-FLA_Apply_pivots_ln_opd_var1
-FLA_Apply_pivots_ln_opc_var1
-FLA_Apply_pivots_ln_opz_var1
-FLA_Apply_pivots_macro_external
-FLA_Apply_pivots_macro_task
-FLA_Apply_pivots_task
-FLA_Apply_pivots_ln_task
-FLA_Apply_pivots_unb_external
-FLA_Apply_pivots_ln_unb_ext
-FLA_Apply_Q_blk_external
-FLA_Apply_Q_UT
-FLA_Apply_Q_UT_cntl_init
-FLA_Apply_Q_UT_cntl_finalize
-FLA_Apply_Q_UT_create_workspace
-FLA_Apply_Q_UT_inc_internal
-FLA_Apply_Q_UT_inc_lhfc
-FLA_Apply_Q_UT_inc_lhfc_blk_var1
-FLA_Apply_Q_UT_internal
-FLA_Apply_Q_UT_lhfc
-FLA_Apply_Q_UT_lhfc_blk_var1
-FLA_Apply_Q_UT_lhfc_blk_var2
-FLA_Apply_Q_UT_lnfr
-FLA_Apply_Q_UT_lnfr_blk_var1
-FLA_Apply_Q_UT_lnfr_blk_var2
-FLA_Apply_Q_UT_rnfr
-FLA_Apply_Q_UT_rnfr_blk_var1
-FLA_Apply_Q_UT_rnfr_blk_var2
-FLA_Apply_Q_UT_task
-FLA_Apply_Q_UT_lhfc_task
-FLA_Apply_Q_UT_lnfr_task
-FLA_Apply_Q_UT_rnfr_task
-FLA_Apply_Q2_UT_cntl_init
-FLA_Apply_Q2_UT_cntl_finalize
-FLA_Apply_Q2_UT_internal
-FLA_Apply_Q2_UT_lhfc
-FLA_Apply_Q2_UT_lhfc_blk_var1
-FLA_Apply_Q2_UT_lhfc_blk_var2
-FLA_Apply_Q2_UT_lhfc_blk_var3
-FLA_Apply_Q2_UT_task
-FLA_Apply_Q2_UT_lhfc_task
-FLA_Asum
-FLA_Asum_external
-FLA_Axpy
-FLA_Axpys
-FLA_Axpys_external
-FLA_Axpyt
-FLA_Axpyt_c
-FLA_Axpyt_cntl_init
-FLA_Axpyt_cntl_finalize
-FLA_Axpyt_c_blk_var1
-FLA_Axpyt_c_blk_var2
-FLA_Axpyt_c_blk_var3
-FLA_Axpyt_c_blk_var4
-FLA_Axpyt_external
-FLA_Axpyt_h
-FLA_Axpyt_h_blk_var1
-FLA_Axpyt_h_blk_var2
-FLA_Axpyt_h_blk_var3
-FLA_Axpyt_h_blk_var4
-FLA_Axpyt_internal
-FLA_Axpyt_n
-FLA_Axpyt_n_blk_var1
-FLA_Axpyt_n_blk_var2
-FLA_Axpyt_n_blk_var3
-FLA_Axpyt_n_blk_var4
-FLA_Axpyt_t
-FLA_Axpyt_task
-FLA_Axpyt_n_task
-FLA_Axpyt_t_task
-FLA_Axpyt_c_task
-FLA_Axpyt_h_task
-FLA_Axpyt_t_blk_var1
-FLA_Axpyt_t_blk_var2
-FLA_Axpyt_t_blk_var3
-FLA_Axpyt_t_blk_var4
-FLA_Axpy_blk_var1
-FLA_Axpy_blk_var2
-FLA_Axpy_blk_var3
-FLA_Axpy_blk_var4
-FLA_Axpy_cntl_init
-FLA_Axpy_cntl_finalize
-FLA_Axpy_external
-FLA_Axpy_internal
-FLA_Axpy_task
-FLA_Axpy_buffer_to_object
-FLA_Axpy_object_to_buffer
-FLA_Blocksize_create
-FLA_Blocksize_set
-FLA_Blocksize_scale
-FLA_Blocksize_create_copy
-FLA_Blocksize_free
-FLA_Blocksize_extract
-FLA_Query_blocksizes
-FLA_Query_blocksize
-FLA_Determine_blocksize
-FLA_determine_matrix_size
-FLA_Check_error_level
-FLA_Check_error_level_set
-FLA_Check_error_code_helper
-FLA_Check_valid_side
-FLA_Check_valid_uplo
-FLA_Check_valid_trans
-FLA_Check_valid_diag
-FLA_Check_valid_conj
-FLA_Check_valid_direct
-FLA_Check_valid_storev
-FLA_Check_valid_datatype
-FLA_Check_valid_object_datatype
-FLA_Check_floating_datatype
-FLA_Check_int_datatype
-FLA_Check_real_datatype
-FLA_Check_complex_datatype
-FLA_Check_floating_object
-FLA_Check_int_object
-FLA_Check_real_object
-FLA_Check_complex_object
-FLA_Check_identical_object_precision
-FLA_Check_consistent_object_datatype
-FLA_Check_consistent_datatype
-FLA_Check_square
-FLA_Check_if_scalar
-FLA_Check_if_vector
-FLA_Check_conformal_dims
-FLA_Check_matrix_matrix_dims
-FLA_Check_matrix_vector_dims
-FLA_Check_equal_vector_lengths
-FLA_Check_conj_trans_and_datatype
-FLA_Check_vector_length
-FLA_Check_null_pointer
-FLA_Check_object_dims
-FLA_Check_valid_pivot_type
-FLA_Check_malloc_pointer
-FLA_Check_base_buffer_mismatch
-FLA_Check_adjacent_objects_2x2
-FLA_Check_adjacent_objects_2x1
-FLA_Check_adjacent_objects_1x2
-FLA_Check_blocksize_value
-FLA_Check_blocksize_object
-FLA_Check_file_descriptor
-FLA_Check_lseek_result
-FLA_Check_close_result
-FLA_Check_unlink_result
-FLA_Check_read_result
-FLA_Check_write_result
-FLA_Check_valid_quadrant
-FLA_Check_vector_length_min
-FLA_Check_pthread_create_result
-FLA_Check_pthread_join_result
-FLA_Check_valid_isgn_value
-FLA_Check_sylv_matrix_dims
-FLA_Check_chol_failure
-FLA_Check_valid_elemtype
-FLA_Check_posix_memalign_failure
-FLA_Check_submatrix_dims_and_offset
-FLA_Check_object_scalar_elemtype
-FLA_Check_object_matrix_elemtype
-FLA_Check_num_threads
-FLA_Check_conj_and_datatype
-FLA_Check_valid_complex_trans
-FLA_Check_valid_real_trans
-FLA_Check_valid_blas_trans
-FLA_Check_nonconstant_datatype
-FLA_Check_nonconstant_object
-FLA_Check_identical_object_datatype
-FLA_Check_divide_by_zero
-FLA_Check_identical_object_elemtype
-FLA_Check_pivot_index_range
-FLA_Check_householder_panel_dims
-FLA_Check_object_length_equals
-FLA_Check_object_width_equals
-FLA_Check_object_length_min
-FLA_Check_object_width_min
-FLA_Check_valid_error_level
-FLA_Check_attempted_repart_2x2
-FLA_Check_attempted_repart_2x1
-FLA_Check_attempted_repart_1x2
-FLA_Check_valid_leftright_side
-FLA_Check_valid_topbottom_side
-FLA_Check_matrix_strides
-FLA_Chol
-FLA_Chol_blk_external
-FLA_Chol_cntl_init
-FLA_Chol_cntl_finalize
-FLA_Chol_internal
-FLA_Chol_l
-FLA_Chol_l_blk_var1
-FLA_Chol_l_blk_var2
-FLA_Chol_l_blk_var3
-FLA_Chol_l_opt_var1
-FLA_Chol_l_ops_var1
-FLA_Chol_l_opd_var1
-FLA_Chol_l_opc_var1
-FLA_Chol_l_opz_var1
-FLA_Chol_l_opt_var2
-FLA_Chol_l_ops_var2
-FLA_Chol_l_opd_var2
-FLA_Chol_l_opc_var2
-FLA_Chol_l_opz_var2
-FLA_Chol_l_opt_var3
-FLA_Chol_l_ops_var3
-FLA_Chol_l_opd_var3
-FLA_Chol_l_opc_var3
-FLA_Chol_l_opz_var3
-FLA_Chol_l_unb_var1
-FLA_Chol_l_unb_var2
-FLA_Chol_l_unb_var3
-FLA_Chol_solve
-FLA_Chol_task
-FLA_Chol_l_task
-FLA_Chol_u_task
-FLA_Chol_u
-FLA_Chol_unb_external
-FLA_Chol_l_unb_ext
-FLA_Chol_u_unb_ext
-FLA_Chol_u_blk_var1
-FLA_Chol_u_blk_var2
-FLA_Chol_u_blk_var3
-FLA_Chol_u_opt_var1
-FLA_Chol_u_ops_var1
-FLA_Chol_u_opd_var1
-FLA_Chol_u_opc_var1
-FLA_Chol_u_opz_var1
-FLA_Chol_u_opt_var2
-FLA_Chol_u_ops_var2
-FLA_Chol_u_opd_var2
-FLA_Chol_u_opc_var2
-FLA_Chol_u_opz_var2
-FLA_Chol_u_opt_var3
-FLA_Chol_u_ops_var3
-FLA_Chol_u_opd_var3
-FLA_Chol_u_opc_var3
-FLA_Chol_u_opz_var3
-FLA_Chol_u_unb_var1
-FLA_Chol_u_unb_var2
-FLA_Chol_u_unb_var3
-FLA_Clock
-FLA_Clock_helper
-FLA_Cntl_obj_free
-FLA_Cntl_axpy_obj_create
-FLA_Cntl_axpyt_obj_create
-FLA_Cntl_copy_obj_create
-FLA_Cntl_copyt_obj_create
-FLA_Cntl_swap_obj_create
-FLA_Cntl_tpose_obj_create
-FLA_Cntl_gemv_obj_create
-FLA_Cntl_trsv_obj_create
-FLA_Cntl_gemm_obj_create
-FLA_Cntl_hemm_obj_create
-FLA_Cntl_herk_obj_create
-FLA_Cntl_her2k_obj_create
-FLA_Cntl_symm_obj_create
-FLA_Cntl_syrk_obj_create
-FLA_Cntl_syr2k_obj_create
-FLA_Cntl_trmm_obj_create
-FLA_Cntl_trsm_obj_create
-FLA_Cntl_init
-FLA_Cntl_finalize
-FLA_Cntl_init_flamec
-FLA_Cntl_finalize_flamec
-FLA_Cntl_init_flash
-FLA_Cntl_finalize_flash
-FLA_Cntl_chol_obj_create
-FLA_Cntl_lu_obj_create
-FLA_Cntl_appiv_obj_create
-FLA_Cntl_qrut_obj_create
-FLA_Cntl_qrutud_obj_create
-FLA_Cntl_qrutinc_obj_create
-FLA_Cntl_lqut_obj_create
-FLA_Cntl_trinv_obj_create
-FLA_Cntl_ttmm_obj_create
-FLA_Cntl_sylv_obj_create
-FLA_Cntl_spdinv_obj_create
-FLA_Cntl_apqut_obj_create
-FLA_Cntl_apqutud_obj_create
-FLA_Cntl_apqutinc_obj_create
-FLA_Conjugate
-FLA_Conjugate_r
-FLA_Copy
-FLA_Copyr
-FLA_Copyr_external
-FLA_Copyt
-FLA_Copyt_c
-FLA_Copyt_cntl_init
-FLA_Copyt_cntl_finalize
-FLA_Copyt_c_blk_var1
-FLA_Copyt_c_blk_var2
-FLA_Copyt_c_blk_var3
-FLA_Copyt_c_blk_var4
-FLA_Copyt_external
-FLA_Copyt_h
-FLA_Copyt_h_blk_var1
-FLA_Copyt_h_blk_var2
-FLA_Copyt_h_blk_var3
-FLA_Copyt_h_blk_var4
-FLA_Copyt_internal
-FLA_Copyt_n
-FLA_Copyt_n_blk_var1
-FLA_Copyt_n_blk_var2
-FLA_Copyt_n_blk_var3
-FLA_Copyt_n_blk_var4
-FLA_Copyt_t
-FLA_Copyt_task
-FLA_Copyt_n_task
-FLA_Copyt_t_task
-FLA_Copyt_c_task
-FLA_Copyt_h_task
-FLA_Copyt_t_blk_var1
-FLA_Copyt_t_blk_var2
-FLA_Copyt_t_blk_var3
-FLA_Copyt_t_blk_var4
-FLA_Copy_blk_var1
-FLA_Copy_blk_var2
-FLA_Copy_blk_var3
-FLA_Copy_blk_var4
-FLA_Copy_cntl_init
-FLA_Copy_cntl_finalize
-FLA_Copy_external
-FLA_Copy_internal
-FLA_Copy_task
-FLA_Copy_buffer_to_object
-FLA_Copy_object_to_buffer
-FLA_Dot
-FLA_Dot2cs
-FLA_Dot2cs_external
-FLA_Dot2s
-FLA_Dot2s_external
-FLA_Dotc
-FLA_Dotcs
-FLA_Dotcs_external
-FLA_Dotc_external
-FLA_Dots
-FLA_Dots_external
-FLA_Dot_external
-FLA_Error_string_for_code
-FLA_Error_messages_init
-FLA_Print_message
-FLA_Abort
-FLA_Form_perm_matrix
-FLA_Gemm
-FLA_Gemm_cntl_init
-FLA_Gemm_cntl_finalize
-FLA_Gemm_external
-FLA_Gemm_hh
-FLA_Gemm_hh_blk_var1
-FLA_Gemm_hh_blk_var2
-FLA_Gemm_hh_blk_var3
-FLA_Gemm_hh_blk_var4
-FLA_Gemm_hh_blk_var5
-FLA_Gemm_hh_blk_var6
-FLA_Gemm_hh_unb_var1
-FLA_Gemm_hh_unb_var2
-FLA_Gemm_hh_unb_var3
-FLA_Gemm_hh_unb_var4
-FLA_Gemm_hh_unb_var5
-FLA_Gemm_hh_unb_var6
-FLA_Gemm_hn
-FLA_Gemm_hn_blk_var1
-FLA_Gemm_hn_blk_var2
-FLA_Gemm_hn_blk_var3
-FLA_Gemm_hn_blk_var4
-FLA_Gemm_hn_blk_var5
-FLA_Gemm_hn_blk_var6
-FLA_Gemm_hn_unb_var1
-FLA_Gemm_hn_unb_var2
-FLA_Gemm_hn_unb_var3
-FLA_Gemm_hn_unb_var4
-FLA_Gemm_hn_unb_var5
-FLA_Gemm_hn_unb_var6
-FLA_Gemm_ht
-FLA_Gemm_ht_blk_var1
-FLA_Gemm_ht_blk_var2
-FLA_Gemm_ht_blk_var3
-FLA_Gemm_ht_blk_var4
-FLA_Gemm_ht_blk_var5
-FLA_Gemm_ht_blk_var6
-FLA_Gemm_ht_unb_var1
-FLA_Gemm_ht_unb_var2
-FLA_Gemm_ht_unb_var3
-FLA_Gemm_ht_unb_var4
-FLA_Gemm_ht_unb_var5
-FLA_Gemm_ht_unb_var6
-FLA_Gemm_internal
-FLA_Gemm_nh
-FLA_Gemm_nh_blk_var1
-FLA_Gemm_nh_blk_var2
-FLA_Gemm_nh_blk_var3
-FLA_Gemm_nh_blk_var4
-FLA_Gemm_nh_blk_var5
-FLA_Gemm_nh_blk_var6
-FLA_Gemm_nh_unb_var1
-FLA_Gemm_nh_unb_var2
-FLA_Gemm_nh_unb_var3
-FLA_Gemm_nh_unb_var4
-FLA_Gemm_nh_unb_var5
-FLA_Gemm_nh_unb_var6
-FLA_Gemm_nn
-FLA_Gemm_nn_blk_var1
-FLA_Gemm_nn_blk_var2
-FLA_Gemm_nn_blk_var3
-FLA_Gemm_nn_blk_var4
-FLA_Gemm_nn_blk_var5
-FLA_Gemm_nn_blk_var6
-FLA_Gemm_nn_unb_var1
-FLA_Gemm_nn_unb_var2
-FLA_Gemm_nn_unb_var3
-FLA_Gemm_nn_unb_var4
-FLA_Gemm_nn_unb_var5
-FLA_Gemm_nn_unb_var6
-FLA_Gemm_nt
-FLA_Gemm_nt_blk_var1
-FLA_Gemm_nt_blk_var2
-FLA_Gemm_nt_blk_var3
-FLA_Gemm_nt_blk_var4
-FLA_Gemm_nt_blk_var5
-FLA_Gemm_nt_blk_var6
-FLA_Gemm_nt_unb_var1
-FLA_Gemm_nt_unb_var2
-FLA_Gemm_nt_unb_var3
-FLA_Gemm_nt_unb_var4
-FLA_Gemm_nt_unb_var5
-FLA_Gemm_nt_unb_var6
-FLA_Gemm_task
-FLA_Gemm_hh_task
-FLA_Gemm_hn_task
-FLA_Gemm_ht_task
-FLA_Gemm_nh_task
-FLA_Gemm_nn_task
-FLA_Gemm_nt_task
-FLA_Gemm_th_task
-FLA_Gemm_tn_task
-FLA_Gemm_tt_task
-FLA_Gemm_th
-FLA_Gemm_th_blk_var1
-FLA_Gemm_th_blk_var2
-FLA_Gemm_th_blk_var3
-FLA_Gemm_th_blk_var4
-FLA_Gemm_th_blk_var5
-FLA_Gemm_th_blk_var6
-FLA_Gemm_th_unb_var1
-FLA_Gemm_th_unb_var2
-FLA_Gemm_th_unb_var3
-FLA_Gemm_th_unb_var4
-FLA_Gemm_th_unb_var5
-FLA_Gemm_th_unb_var6
-FLA_Gemm_tn
-FLA_Gemm_tn_blk_var1
-FLA_Gemm_tn_blk_var2
-FLA_Gemm_tn_blk_var3
-FLA_Gemm_tn_blk_var4
-FLA_Gemm_tn_blk_var5
-FLA_Gemm_tn_blk_var6
-FLA_Gemm_tn_unb_var1
-FLA_Gemm_tn_unb_var2
-FLA_Gemm_tn_unb_var3
-FLA_Gemm_tn_unb_var4
-FLA_Gemm_tn_unb_var5
-FLA_Gemm_tn_unb_var6
-FLA_Gemm_tt
-FLA_Gemm_tt_blk_var1
-FLA_Gemm_tt_blk_var2
-FLA_Gemm_tt_blk_var3
-FLA_Gemm_tt_blk_var4
-FLA_Gemm_tt_blk_var5
-FLA_Gemm_tt_blk_var6
-FLA_Gemm_tt_unb_var1
-FLA_Gemm_tt_unb_var2
-FLA_Gemm_tt_unb_var3
-FLA_Gemm_tt_unb_var4
-FLA_Gemm_tt_unb_var5
-FLA_Gemm_tt_unb_var6
-FLA_Gemp
-FLA_Gemv
-FLA_Gemvc
-FLA_Gemvc_external
-FLA_Gemv_c
-FLA_Gemv_cntl_init
-FLA_Gemv_cntl_finalize
-FLA_Gemv_c_blk_var1
-FLA_Gemv_c_blk_var2
-FLA_Gemv_c_blk_var5
-FLA_Gemv_c_blk_var6
-FLA_Gemv_external
-FLA_Gemv_internal
-FLA_Gemv_n
-FLA_Gemv_n_blk_var1
-FLA_Gemv_n_blk_var2
-FLA_Gemv_n_blk_var5
-FLA_Gemv_n_blk_var6
-FLA_Gemv_t
-FLA_Gemv_task
-FLA_Gemv_c_task
-FLA_Gemv_n_task
-FLA_Gemv_t_task
-FLA_Gemv_t_blk_var1
-FLA_Gemv_t_blk_var2
-FLA_Gemv_t_blk_var5
-FLA_Gemv_t_blk_var6
-FLA_Gepm
-FLA_Gepp
-FLA_Ger
-FLA_Gerc
-FLA_Gerc_external
-FLA_Ger_external
-FLA_Hemm
-FLA_Hemm_cntl_init
-FLA_Hemm_cntl_finalize
-FLA_Hemm_external
-FLA_Hemm_internal
-FLA_Hemm_ll
-FLA_Hemm_ll_blk_var1
-FLA_Hemm_ll_blk_var10
-FLA_Hemm_ll_blk_var2
-FLA_Hemm_ll_blk_var3
-FLA_Hemm_ll_blk_var4
-FLA_Hemm_ll_blk_var5
-FLA_Hemm_ll_blk_var6
-FLA_Hemm_ll_blk_var7
-FLA_Hemm_ll_blk_var8
-FLA_Hemm_ll_blk_var9
-FLA_Hemm_ll_unb_var1
-FLA_Hemm_ll_unb_var10
-FLA_Hemm_ll_unb_var2
-FLA_Hemm_ll_unb_var3
-FLA_Hemm_ll_unb_var4
-FLA_Hemm_ll_unb_var5
-FLA_Hemm_ll_unb_var6
-FLA_Hemm_ll_unb_var7
-FLA_Hemm_ll_unb_var8
-FLA_Hemm_ll_unb_var9
-FLA_Hemm_lu
-FLA_Hemm_lu_blk_var1
-FLA_Hemm_lu_blk_var10
-FLA_Hemm_lu_blk_var2
-FLA_Hemm_lu_blk_var3
-FLA_Hemm_lu_blk_var4
-FLA_Hemm_lu_blk_var5
-FLA_Hemm_lu_blk_var6
-FLA_Hemm_lu_blk_var7
-FLA_Hemm_lu_blk_var8
-FLA_Hemm_lu_blk_var9
-FLA_Hemm_lu_unb_var1
-FLA_Hemm_lu_unb_var10
-FLA_Hemm_lu_unb_var2
-FLA_Hemm_lu_unb_var3
-FLA_Hemm_lu_unb_var4
-FLA_Hemm_lu_unb_var5
-FLA_Hemm_lu_unb_var6
-FLA_Hemm_lu_unb_var7
-FLA_Hemm_lu_unb_var8
-FLA_Hemm_lu_unb_var9
-FLA_Hemm_rl
-FLA_Hemm_rl_blk_var1
-FLA_Hemm_rl_blk_var10
-FLA_Hemm_rl_blk_var2
-FLA_Hemm_rl_blk_var3
-FLA_Hemm_rl_blk_var4
-FLA_Hemm_rl_blk_var5
-FLA_Hemm_rl_blk_var6
-FLA_Hemm_rl_blk_var7
-FLA_Hemm_rl_blk_var8
-FLA_Hemm_rl_blk_var9
-FLA_Hemm_rl_unb_var1
-FLA_Hemm_rl_unb_var10
-FLA_Hemm_rl_unb_var2
-FLA_Hemm_rl_unb_var3
-FLA_Hemm_rl_unb_var4
-FLA_Hemm_rl_unb_var5
-FLA_Hemm_rl_unb_var6
-FLA_Hemm_rl_unb_var7
-FLA_Hemm_rl_unb_var8
-FLA_Hemm_rl_unb_var9
-FLA_Hemm_ru
-FLA_Hemm_ru_blk_var1
-FLA_Hemm_ru_blk_var10
-FLA_Hemm_ru_blk_var2
-FLA_Hemm_ru_blk_var3
-FLA_Hemm_ru_blk_var4
-FLA_Hemm_ru_blk_var5
-FLA_Hemm_ru_blk_var6
-FLA_Hemm_ru_blk_var7
-FLA_Hemm_ru_blk_var8
-FLA_Hemm_ru_blk_var9
-FLA_Hemm_ru_unb_var1
-FLA_Hemm_ru_unb_var10
-FLA_Hemm_ru_unb_var2
-FLA_Hemm_ru_unb_var3
-FLA_Hemm_ru_unb_var4
-FLA_Hemm_ru_unb_var5
-FLA_Hemm_ru_unb_var6
-FLA_Hemm_ru_unb_var7
-FLA_Hemm_ru_unb_var8
-FLA_Hemm_ru_unb_var9
-FLA_Hemm_task
-FLA_Hemm_ll_task
-FLA_Hemm_lu_task
-FLA_Hemm_rl_task
-FLA_Hemm_ru_task
-FLA_Hemv
-FLA_Hemvc
-FLA_Hemvc_external
-FLA_Hemv_external
-FLA_Her
-FLA_Her2
-FLA_Her2c
-FLA_Her2c_external
-FLA_Her2k
-FLA_Her2k_cntl_init
-FLA_Her2k_cntl_finalize
-FLA_Her2k_external
-FLA_Her2k_internal
-FLA_Her2k_lh
-FLA_Her2k_lh_blk_var1
-FLA_Her2k_lh_blk_var10
-FLA_Her2k_lh_blk_var2
-FLA_Her2k_lh_blk_var3
-FLA_Her2k_lh_blk_var4
-FLA_Her2k_lh_blk_var5
-FLA_Her2k_lh_blk_var6
-FLA_Her2k_lh_blk_var7
-FLA_Her2k_lh_blk_var8
-FLA_Her2k_lh_blk_var9
-FLA_Her2k_lh_unb_var1
-FLA_Her2k_lh_unb_var10
-FLA_Her2k_lh_unb_var2
-FLA_Her2k_lh_unb_var3
-FLA_Her2k_lh_unb_var4
-FLA_Her2k_lh_unb_var5
-FLA_Her2k_lh_unb_var6
-FLA_Her2k_lh_unb_var7
-FLA_Her2k_lh_unb_var8
-FLA_Her2k_lh_unb_var9
-FLA_Her2k_ln
-FLA_Her2k_ln_blk_var1
-FLA_Her2k_ln_blk_var10
-FLA_Her2k_ln_blk_var2
-FLA_Her2k_ln_blk_var3
-FLA_Her2k_ln_blk_var4
-FLA_Her2k_ln_blk_var5
-FLA_Her2k_ln_blk_var6
-FLA_Her2k_ln_blk_var7
-FLA_Her2k_ln_blk_var8
-FLA_Her2k_ln_blk_var9
-FLA_Her2k_ln_unb_var1
-FLA_Her2k_ln_unb_var10
-FLA_Her2k_ln_unb_var2
-FLA_Her2k_ln_unb_var3
-FLA_Her2k_ln_unb_var4
-FLA_Her2k_ln_unb_var5
-FLA_Her2k_ln_unb_var6
-FLA_Her2k_ln_unb_var7
-FLA_Her2k_ln_unb_var8
-FLA_Her2k_ln_unb_var9
-FLA_Her2k_task
-FLA_Her2k_ln_task
-FLA_Her2k_lh_task
-FLA_Her2k_un_task
-FLA_Her2k_uh_task
-FLA_Her2k_uh
-FLA_Her2k_uh_blk_var1
-FLA_Her2k_uh_blk_var10
-FLA_Her2k_uh_blk_var2
-FLA_Her2k_uh_blk_var3
-FLA_Her2k_uh_blk_var4
-FLA_Her2k_uh_blk_var5
-FLA_Her2k_uh_blk_var6
-FLA_Her2k_uh_blk_var7
-FLA_Her2k_uh_blk_var8
-FLA_Her2k_uh_blk_var9
-FLA_Her2k_uh_unb_var1
-FLA_Her2k_uh_unb_var10
-FLA_Her2k_uh_unb_var2
-FLA_Her2k_uh_unb_var3
-FLA_Her2k_uh_unb_var4
-FLA_Her2k_uh_unb_var5
-FLA_Her2k_uh_unb_var6
-FLA_Her2k_uh_unb_var7
-FLA_Her2k_uh_unb_var8
-FLA_Her2k_uh_unb_var9
-FLA_Her2k_un
-FLA_Her2k_un_blk_var1
-FLA_Her2k_un_blk_var10
-FLA_Her2k_un_blk_var2
-FLA_Her2k_un_blk_var3
-FLA_Her2k_un_blk_var4
-FLA_Her2k_un_blk_var5
-FLA_Her2k_un_blk_var6
-FLA_Her2k_un_blk_var7
-FLA_Her2k_un_blk_var8
-FLA_Her2k_un_blk_var9
-FLA_Her2k_un_unb_var1
-FLA_Her2k_un_unb_var10
-FLA_Her2k_un_unb_var2
-FLA_Her2k_un_unb_var3
-FLA_Her2k_un_unb_var4
-FLA_Her2k_un_unb_var5
-FLA_Her2k_un_unb_var6
-FLA_Her2k_un_unb_var7
-FLA_Her2k_un_unb_var8
-FLA_Her2k_un_unb_var9
-FLA_Her2_external
-FLA_Herc
-FLA_Herc_external
-FLA_Herk
-FLA_Herk_cntl_init
-FLA_Herk_cntl_finalize
-FLA_Herk_external
-FLA_Herk_internal
-FLA_Herk_lh
-FLA_Herk_lh_blk_var1
-FLA_Herk_lh_blk_var2
-FLA_Herk_lh_blk_var3
-FLA_Herk_lh_blk_var4
-FLA_Herk_lh_blk_var5
-FLA_Herk_lh_blk_var6
-FLA_Herk_lh_unb_var1
-FLA_Herk_lh_unb_var2
-FLA_Herk_lh_unb_var3
-FLA_Herk_lh_unb_var4
-FLA_Herk_lh_unb_var5
-FLA_Herk_lh_unb_var6
-FLA_Herk_ln
-FLA_Herk_ln_blk_var1
-FLA_Herk_ln_blk_var2
-FLA_Herk_ln_blk_var3
-FLA_Herk_ln_blk_var4
-FLA_Herk_ln_blk_var5
-FLA_Herk_ln_blk_var6
-FLA_Herk_ln_unb_var1
-FLA_Herk_ln_unb_var2
-FLA_Herk_ln_unb_var3
-FLA_Herk_ln_unb_var4
-FLA_Herk_ln_unb_var5
-FLA_Herk_ln_unb_var6
-FLA_Herk_task
-FLA_Herk_ln_task
-FLA_Herk_lh_task
-FLA_Herk_un_task
-FLA_Herk_uh_task
-FLA_Herk_uh
-FLA_Herk_uh_blk_var1
-FLA_Herk_uh_blk_var2
-FLA_Herk_uh_blk_var3
-FLA_Herk_uh_blk_var4
-FLA_Herk_uh_blk_var5
-FLA_Herk_uh_blk_var6
-FLA_Herk_uh_unb_var1
-FLA_Herk_uh_unb_var2
-FLA_Herk_uh_unb_var3
-FLA_Herk_uh_unb_var4
-FLA_Herk_uh_unb_var5
-FLA_Herk_uh_unb_var6
-FLA_Herk_un
-FLA_Herk_un_blk_var1
-FLA_Herk_un_blk_var2
-FLA_Herk_un_blk_var3
-FLA_Herk_un_blk_var4
-FLA_Herk_un_blk_var5
-FLA_Herk_un_blk_var6
-FLA_Herk_un_unb_var1
-FLA_Herk_un_unb_var2
-FLA_Herk_un_unb_var3
-FLA_Herk_un_unb_var4
-FLA_Herk_un_unb_var5
-FLA_Herk_un_unb_var6
-FLA_Hermitianize
-FLA_Her_external
-FLA_Househ2_UT
-FLA_Househ2_UT_ops
-FLA_Househ2_UT_opd
-FLA_Househ2_UT_opc
-FLA_Househ2_UT_opz
-FLA_Init
-FLA_Finalize
-FLA_Init_safe
-FLA_Finalize_safe
-FLA_Initialized
-FLA_Init_constants
-FLA_Finalize_constants
-FLA_Invert
-FLA_Inv_scal
-FLA_Inv_scalc
-FLA_Inv_scalc_external
-FLA_Inv_scal_external
-FLA_Lock_init
-FLA_Lock_acquire
-FLA_Lock_release
-FLA_Lock_destroy
-FLA_LQ_blk_external
-FLA_LQ_unb_external
-FLA_LQ_UT
-FLA_LQ_UT_Accum_T_blk_var1
-FLA_LQ_UT_Accum_T_opt_var1
-FLA_LQ_UT_Accum_T_ops_var1
-FLA_LQ_UT_Accum_T_opd_var1
-FLA_LQ_UT_Accum_T_opc_var1
-FLA_LQ_UT_Accum_T_opz_var1
-FLA_LQ_UT_Accum_T_unb_var1
-FLA_LQ_UT_blk_var2
-FLA_LQ_UT_cntl_init
-FLA_LQ_UT_cntl_finalize
-FLA_LQ_UT_create_T
-FLA_LQ_UT_internal
-FLA_LQ_UT_opt_var2
-FLA_LQ_UT_ops_var2
-FLA_LQ_UT_opd_var2
-FLA_LQ_UT_opc_var2
-FLA_LQ_UT_opz_var2
-FLA_LQ_UT_recover_tau
-FLA_LQ_UT_recover_tau_submatrix
-FLA_LQ_UT_solve
-FLA_LQ_UT_task
-FLA_LQ_UT_unb_var2
-FLA_LU_find_zero_on_diagonal
-FLA_LU_nopiv
-FLA_LU_nopiv_blk_var1
-FLA_LU_nopiv_blk_var2
-FLA_LU_nopiv_blk_var3
-FLA_LU_nopiv_blk_var4
-FLA_LU_nopiv_blk_var5
-FLA_LU_nopiv_cntl_init
-FLA_LU_nopiv_cntl_finalize
-FLA_LU_nopiv_internal
-FLA_LU_nopiv_opt_var1
-FLA_LU_nopiv_ops_var1
-FLA_LU_nopiv_opd_var1
-FLA_LU_nopiv_opc_var1
-FLA_LU_nopiv_opz_var1
-FLA_LU_nopiv_opt_var2
-FLA_LU_nopiv_ops_var2
-FLA_LU_nopiv_opd_var2
-FLA_LU_nopiv_opc_var2
-FLA_LU_nopiv_opz_var2
-FLA_LU_nopiv_opt_var3
-FLA_LU_nopiv_ops_var3
-FLA_LU_nopiv_opd_var3
-FLA_LU_nopiv_opc_var3
-FLA_LU_nopiv_opz_var3
-FLA_LU_nopiv_opt_var4
-FLA_LU_nopiv_ops_var4
-FLA_LU_nopiv_opd_var4
-FLA_LU_nopiv_opc_var4
-FLA_LU_nopiv_opz_var4
-FLA_LU_nopiv_opt_var5
-FLA_LU_nopiv_ops_var5
-FLA_LU_nopiv_opd_var5
-FLA_LU_nopiv_opc_var5
-FLA_LU_nopiv_opz_var5
-FLA_LU_nopiv_solve
-FLA_LU_nopiv_task
-FLA_LU_nopiv_unb_var1
-FLA_LU_nopiv_unb_var2
-FLA_LU_nopiv_unb_var3
-FLA_LU_nopiv_unb_var4
-FLA_LU_nopiv_unb_var5
-FLA_LU_piv
-FLA_LU_piv_blk_external
-FLA_LU_piv_blk_var3
-FLA_LU_piv_blk_var4
-FLA_LU_piv_blk_var5
-FLA_LU_piv_cntl_init
-FLA_LU_piv_cntl_finalize
-FLA_LU_piv_copy_task
-FLA_LU_piv_internal
-FLA_LU_piv_macro_task
-FLA_LU_piv_opt_var3
-FLA_LU_piv_ops_var3
-FLA_LU_piv_opd_var3
-FLA_LU_piv_opc_var3
-FLA_LU_piv_opz_var3
-FLA_LU_piv_opt_var4
-FLA_LU_piv_ops_var4
-FLA_LU_piv_opd_var4
-FLA_LU_piv_opc_var4
-FLA_LU_piv_opz_var4
-FLA_LU_piv_opt_var5
-FLA_LU_piv_ops_var5
-FLA_LU_piv_opd_var5
-FLA_LU_piv_opc_var5
-FLA_LU_piv_opz_var5
-FLA_LU_piv_solve
-FLA_LU_piv_task
-FLA_LU_piv_unb_external
-FLA_LU_piv_unb_ext
-FLA_LU_piv_unb_var3
-FLA_LU_piv_unb_var3b
-FLA_LU_piv_unb_var4
-FLA_LU_piv_unb_var5
-FLA_Max_abs_value
-FLA_Max_elemwise_diff
-FLA_Memory_leak_counter_init
-FLA_Memory_leak_counter_finalize
-FLA_Memory_leak_counter_status
-FLA_Memory_leak_counter_set
-FLA_malloc
-FLA_realloc
-FLA_free
-FLA_Set
-FLA_Obj_extract_real_scalar
-FLA_Set_diag
-FLA_Set_to_identity
-FLA_Add_to_diag
-FLA_Shift_diag
-FLA_Scale_diag
-FLA_Obj_fshow
-FLA_Obj_show
-FLA_Mult_add
-FLA_Negate
-FLA_Norm1
-FLA_Norm_inf
-FLA_Nrm2
-FLA_Nrm2_external
-FLA_Obj_create
-FLA_Obj_create_ext
-FLA_align_ldim
-FLA_Obj_create_conf_to
-FLA_Obj_create_copy_of
-FLA_Obj_create_without_buffer
-FLA_Obj_create_constant
-FLA_Obj_create_complex_constant
-FLA_Obj_attach_buffer
-FLA_Obj_free
-FLA_Obj_free_without_buffer
-FLA_Param_map_flame_to_netlib_trans
-FLA_Param_map_flame_to_netlib_uplo
-FLA_Param_map_flame_to_netlib_side
-FLA_Param_map_flame_to_netlib_diag
-FLA_Param_map_flame_to_netlib_direct
-FLA_Param_map_flame_to_netlib_storev
-FLA_Param_map_flame_to_blis_trans
-FLA_Param_map_flame_to_blis_conj
-FLA_Param_map_flame_to_blis_uplo
-FLA_Param_map_flame_to_blis_side
-FLA_Param_map_flame_to_blis_diag
-FLA_Param_map_blis_to_netlib_trans
-FLA_Param_map_blis_to_netlib_uplo
-FLA_Param_map_blis_to_netlib_side
-FLA_Param_map_blis_to_netlib_diag
-FLA_Param_map_netlib_to_flame_trans
-FLA_Param_map_netlib_to_flame_uplo
-FLA_Param_map_netlib_to_flame_side
-FLA_Param_map_netlib_to_flame_diag
-FLA_Param_map_blislapack_to_flame_trans
-FLA_Param_map_blislapack_to_flame_uplo
-FLA_Param_map_blislapack_to_flame_side
-FLA_Param_map_blislapack_to_flame_diag
-FLA_QR_blk_external
-FLA_QR_unb_external
-FLA_QR_UT
-FLA_QR_UT_Accum_T_blk_var1
-FLA_QR_UT_Accum_T_opt_var1
-FLA_QR_UT_Accum_T_ops_var1
-FLA_QR_UT_Accum_T_opd_var1
-FLA_QR_UT_Accum_T_opc_var1
-FLA_QR_UT_Accum_T_opz_var1
-FLA_QR_UT_Accum_T_unb_var1
-FLA_QR_UT_blk_var2
-FLA_QR_UT_cntl_init
-FLA_QR_UT_cntl_finalize
-FLA_QR_UT_copy_internal
-FLA_QR_UT_copy_task
-FLA_QR_UT_create_T
-FLA_QR_UT_inc_blk_var1
-FLA_QR_UT_inc_blk_var2
-FLA_QR_UT_internal
-FLA_QR_UT_opt_var2
-FLA_QR_UT_ops_var2
-FLA_QR_UT_opd_var2
-FLA_QR_UT_opc_var2
-FLA_QR_UT_opz_var2
-FLA_QR_UT_recover_tau
-FLA_QR_UT_recover_tau_submatrix
-FLA_QR_UT_solve
-FLA_QR_UT_task
-FLA_QR2_UT_Accum_T_opt_var1
-FLA_QR2_UT_Accum_T_ops_var1
-FLA_QR2_UT_Accum_T_opd_var1
-FLA_QR2_UT_Accum_T_opc_var1
-FLA_QR2_UT_Accum_T_opz_var1
-FLA_QR2_UT_Accum_T_unb_var1
-FLA_QR2_UT_blk_var1
-FLA_QR2_UT_blk_var2
-FLA_QR2_UT_cntl_init
-FLA_QR2_UT_cntl_finalize
-FLA_QR2_UT_internal
-FLA_QR2_UT_task
-FLA_QR2_UT_unb_var2
-FLA_Obj_datatype
-FLA_Obj_datatype_proj_to_real
-FLA_Obj_elemtype
-FLA_Obj_datatype_size
-FLA_Obj_elem_size
-FLA_Obj_length
-FLA_Obj_width
-FLA_Obj_vector_dim
-FLA_Obj_vector_inc
-FLA_Obj_min_dim
-FLA_Obj_max_dim
-FLA_Obj_row_stride
-FLA_Obj_col_stride
-FLA_Obj_buffer
-FLA_Obj_is_int
-FLA_Obj_is_floating_point
-FLA_Obj_is_constant
-FLA_Obj_is_real
-FLA_Obj_is_complex
-FLA_Obj_is_single_precision
-FLA_Obj_is_double_precision
-FLA_Obj_is_scalar
-FLA_Obj_is_vector
-FLA_Obj_has_zero_dim
-FLA_Obj_is_col_major
-FLA_Obj_is_row_major
-FLA_Obj_is_conformal_to
-FLA_Obj_is
-FLA_Obj_equals
-FLA_Random_herm_matrix
-FLA_Random_matrix
-FLA_random_float
-FLA_random_double
-FLA_random_scomplex
-FLA_random_dcomplex
-FLA_Random_spd_matrix
-FLA_Random_tri_matrix
-FLA_SA_Apply_pivots
-FLA_SA_FS_blk
-FLA_SA_FS_task
-FLA_SA_LU_blk
-FLA_SA_LU_task
-FLA_SA_LU_unb
-FLA_Scal
-FLA_Scalc
-FLA_Scalc_external
-FLA_Scalr
-FLA_Scalr_external
-FLA_Scal_external
-FLA_Shift_pivots_to
-FLA_SPDinv
-FLA_SPDinv_blk_external
-FLA_SPDinv_cntl_init
-FLA_SPDinv_cntl_finalize
-FLA_SPDinv_internal
-FLA_Sqrt
-FLA_Swap
-FLA_Swapt
-FLA_Swapt_external
-FLA_Swap_external
-FLA_Swap_t_blk_var1
-FLA_Swap_t_blk_var2
-FLA_Sylv
-FLA_Sylv_blk_external
-FLA_Sylv_cntl_init
-FLA_Sylv_cntl_finalize
-FLA_Sylv_hh
-FLA_Sylv_hh_blk_var1
-FLA_Sylv_hh_blk_var10
-FLA_Sylv_hh_blk_var11
-FLA_Sylv_hh_blk_var12
-FLA_Sylv_hh_blk_var13
-FLA_Sylv_hh_blk_var14
-FLA_Sylv_hh_blk_var15
-FLA_Sylv_hh_blk_var16
-FLA_Sylv_hh_blk_var17
-FLA_Sylv_hh_blk_var18
-FLA_Sylv_hh_blk_var2
-FLA_Sylv_hh_blk_var3
-FLA_Sylv_hh_blk_var4
-FLA_Sylv_hh_blk_var5
-FLA_Sylv_hh_blk_var6
-FLA_Sylv_hh_blk_var7
-FLA_Sylv_hh_blk_var8
-FLA_Sylv_hh_blk_var9
-FLA_Sylv_hh_opt_var1
-FLA_Sylv_hh_ops_var1
-FLA_Sylv_hh_opd_var1
-FLA_Sylv_hh_opc_var1
-FLA_Sylv_hh_opz_var1
-FLA_Sylv_hh_opt_var10
-FLA_Sylv_hh_opt_var11
-FLA_Sylv_hh_opt_var12
-FLA_Sylv_hh_opt_var13
-FLA_Sylv_hh_opt_var14
-FLA_Sylv_hh_opt_var15
-FLA_Sylv_hh_opt_var16
-FLA_Sylv_hh_opt_var17
-FLA_Sylv_hh_opt_var18
-FLA_Sylv_hh_opt_var2
-FLA_Sylv_hh_opt_var3
-FLA_Sylv_hh_opt_var4
-FLA_Sylv_hh_opt_var5
-FLA_Sylv_hh_opt_var6
-FLA_Sylv_hh_opt_var7
-FLA_Sylv_hh_opt_var8
-FLA_Sylv_hh_opt_var9
-FLA_Sylv_hn
-FLA_Sylv_hn_blk_var1
-FLA_Sylv_hn_blk_var10
-FLA_Sylv_hn_blk_var11
-FLA_Sylv_hn_blk_var12
-FLA_Sylv_hn_blk_var13
-FLA_Sylv_hn_blk_var14
-FLA_Sylv_hn_blk_var15
-FLA_Sylv_hn_blk_var16
-FLA_Sylv_hn_blk_var17
-FLA_Sylv_hn_blk_var18
-FLA_Sylv_hn_blk_var2
-FLA_Sylv_hn_blk_var3
-FLA_Sylv_hn_blk_var4
-FLA_Sylv_hn_blk_var5
-FLA_Sylv_hn_blk_var6
-FLA_Sylv_hn_blk_var7
-FLA_Sylv_hn_blk_var8
-FLA_Sylv_hn_blk_var9
-FLA_Sylv_hn_opt_var1
-FLA_Sylv_hn_ops_var1
-FLA_Sylv_hn_opd_var1
-FLA_Sylv_hn_opc_var1
-FLA_Sylv_hn_opz_var1
-FLA_Sylv_hn_opt_var10
-FLA_Sylv_hn_opt_var11
-FLA_Sylv_hn_opt_var12
-FLA_Sylv_hn_opt_var13
-FLA_Sylv_hn_opt_var14
-FLA_Sylv_hn_opt_var15
-FLA_Sylv_hn_opt_var16
-FLA_Sylv_hn_opt_var17
-FLA_Sylv_hn_opt_var18
-FLA_Sylv_hn_opt_var2
-FLA_Sylv_hn_opt_var3
-FLA_Sylv_hn_opt_var4
-FLA_Sylv_hn_opt_var5
-FLA_Sylv_hn_opt_var6
-FLA_Sylv_hn_opt_var7
-FLA_Sylv_hn_opt_var8
-FLA_Sylv_hn_opt_var9
-FLA_Sylv_internal
-FLA_Sylv_nh
-FLA_Sylv_nh_blk_var1
-FLA_Sylv_nh_blk_var10
-FLA_Sylv_nh_blk_var11
-FLA_Sylv_nh_blk_var12
-FLA_Sylv_nh_blk_var13
-FLA_Sylv_nh_blk_var14
-FLA_Sylv_nh_blk_var15
-FLA_Sylv_nh_blk_var16
-FLA_Sylv_nh_blk_var17
-FLA_Sylv_nh_blk_var18
-FLA_Sylv_nh_blk_var2
-FLA_Sylv_nh_blk_var3
-FLA_Sylv_nh_blk_var4
-FLA_Sylv_nh_blk_var5
-FLA_Sylv_nh_blk_var6
-FLA_Sylv_nh_blk_var7
-FLA_Sylv_nh_blk_var8
-FLA_Sylv_nh_blk_var9
-FLA_Sylv_nh_opt_var1
-FLA_Sylv_nh_ops_var1
-FLA_Sylv_nh_opd_var1
-FLA_Sylv_nh_opc_var1
-FLA_Sylv_nh_opz_var1
-FLA_Sylv_nh_opt_var10
-FLA_Sylv_nh_opt_var11
-FLA_Sylv_nh_opt_var12
-FLA_Sylv_nh_opt_var13
-FLA_Sylv_nh_opt_var14
-FLA_Sylv_nh_opt_var15
-FLA_Sylv_nh_opt_var16
-FLA_Sylv_nh_opt_var17
-FLA_Sylv_nh_opt_var18
-FLA_Sylv_nh_opt_var2
-FLA_Sylv_nh_opt_var3
-FLA_Sylv_nh_opt_var4
-FLA_Sylv_nh_opt_var5
-FLA_Sylv_nh_opt_var6
-FLA_Sylv_nh_opt_var7
-FLA_Sylv_nh_opt_var8
-FLA_Sylv_nh_opt_var9
-FLA_Sylv_nn
-FLA_Sylv_nn_blk_var1
-FLA_Sylv_nn_blk_var10
-FLA_Sylv_nn_blk_var11
-FLA_Sylv_nn_blk_var12
-FLA_Sylv_nn_blk_var13
-FLA_Sylv_nn_blk_var14
-FLA_Sylv_nn_blk_var15
-FLA_Sylv_nn_blk_var16
-FLA_Sylv_nn_blk_var17
-FLA_Sylv_nn_blk_var18
-FLA_Sylv_nn_blk_var2
-FLA_Sylv_nn_blk_var3
-FLA_Sylv_nn_blk_var4
-FLA_Sylv_nn_blk_var5
-FLA_Sylv_nn_blk_var6
-FLA_Sylv_nn_blk_var7
-FLA_Sylv_nn_blk_var8
-FLA_Sylv_nn_blk_var9
-FLA_Sylv_nn_opt_var1
-FLA_Sylv_nn_ops_var1
-FLA_Sylv_nn_opd_var1
-FLA_Sylv_nn_opc_var1
-FLA_Sylv_nn_opz_var1
-FLA_Sylv_nn_opt_var10
-FLA_Sylv_nn_opt_var11
-FLA_Sylv_nn_opt_var12
-FLA_Sylv_nn_opt_var13
-FLA_Sylv_nn_opt_var14
-FLA_Sylv_nn_opt_var15
-FLA_Sylv_nn_opt_var16
-FLA_Sylv_nn_opt_var17
-FLA_Sylv_nn_opt_var18
-FLA_Sylv_nn_opt_var2
-FLA_Sylv_nn_opt_var3
-FLA_Sylv_nn_opt_var4
-FLA_Sylv_nn_opt_var5
-FLA_Sylv_nn_opt_var6
-FLA_Sylv_nn_opt_var7
-FLA_Sylv_nn_opt_var8
-FLA_Sylv_nn_opt_var9
-FLA_Sylv_task
-FLA_Sylv_nn_task
-FLA_Sylv_nh_task
-FLA_Sylv_hn_task
-FLA_Sylv_hh_task
-FLA_Sylv_unb_external
-FLA_Sylv_nn_unb_ext
-FLA_Sylv_nh_unb_ext
-FLA_Sylv_hn_unb_ext
-FLA_Sylv_hh_unb_ext
-FLA_Symm
-FLA_Symmetrize
-FLA_Symm_cntl_init
-FLA_Symm_cntl_finalize
-FLA_Symm_external
-FLA_Symm_internal
-FLA_Symm_ll
-FLA_Symm_ll_blk_var1
-FLA_Symm_ll_blk_var10
-FLA_Symm_ll_blk_var2
-FLA_Symm_ll_blk_var3
-FLA_Symm_ll_blk_var4
-FLA_Symm_ll_blk_var5
-FLA_Symm_ll_blk_var6
-FLA_Symm_ll_blk_var7
-FLA_Symm_ll_blk_var8
-FLA_Symm_ll_blk_var9
-FLA_Symm_ll_unb_var1
-FLA_Symm_ll_unb_var10
-FLA_Symm_ll_unb_var2
-FLA_Symm_ll_unb_var3
-FLA_Symm_ll_unb_var4
-FLA_Symm_ll_unb_var5
-FLA_Symm_ll_unb_var6
-FLA_Symm_ll_unb_var7
-FLA_Symm_ll_unb_var8
-FLA_Symm_ll_unb_var9
-FLA_Symm_lu
-FLA_Symm_lu_blk_var1
-FLA_Symm_lu_blk_var10
-FLA_Symm_lu_blk_var2
-FLA_Symm_lu_blk_var3
-FLA_Symm_lu_blk_var4
-FLA_Symm_lu_blk_var5
-FLA_Symm_lu_blk_var6
-FLA_Symm_lu_blk_var7
-FLA_Symm_lu_blk_var8
-FLA_Symm_lu_blk_var9
-FLA_Symm_lu_unb_var1
-FLA_Symm_lu_unb_var10
-FLA_Symm_lu_unb_var2
-FLA_Symm_lu_unb_var3
-FLA_Symm_lu_unb_var4
-FLA_Symm_lu_unb_var5
-FLA_Symm_lu_unb_var6
-FLA_Symm_lu_unb_var7
-FLA_Symm_lu_unb_var8
-FLA_Symm_lu_unb_var9
-FLA_Symm_rl
-FLA_Symm_rl_blk_var1
-FLA_Symm_rl_blk_var10
-FLA_Symm_rl_blk_var2
-FLA_Symm_rl_blk_var3
-FLA_Symm_rl_blk_var4
-FLA_Symm_rl_blk_var5
-FLA_Symm_rl_blk_var6
-FLA_Symm_rl_blk_var7
-FLA_Symm_rl_blk_var8
-FLA_Symm_rl_blk_var9
-FLA_Symm_rl_unb_var1
-FLA_Symm_rl_unb_var10
-FLA_Symm_rl_unb_var2
-FLA_Symm_rl_unb_var3
-FLA_Symm_rl_unb_var4
-FLA_Symm_rl_unb_var5
-FLA_Symm_rl_unb_var6
-FLA_Symm_rl_unb_var7
-FLA_Symm_rl_unb_var8
-FLA_Symm_rl_unb_var9
-FLA_Symm_ru
-FLA_Symm_ru_blk_var1
-FLA_Symm_ru_blk_var10
-FLA_Symm_ru_blk_var2
-FLA_Symm_ru_blk_var3
-FLA_Symm_ru_blk_var4
-FLA_Symm_ru_blk_var5
-FLA_Symm_ru_blk_var6
-FLA_Symm_ru_blk_var7
-FLA_Symm_ru_blk_var8
-FLA_Symm_ru_blk_var9
-FLA_Symm_ru_unb_var1
-FLA_Symm_ru_unb_var10
-FLA_Symm_ru_unb_var2
-FLA_Symm_ru_unb_var3
-FLA_Symm_ru_unb_var4
-FLA_Symm_ru_unb_var5
-FLA_Symm_ru_unb_var6
-FLA_Symm_ru_unb_var7
-FLA_Symm_ru_unb_var8
-FLA_Symm_ru_unb_var9
-FLA_Symm_task
-FLA_Symm_ll_task
-FLA_Symm_lu_task
-FLA_Symm_rl_task
-FLA_Symm_ru_task
-FLA_Symv
-FLA_Symv_external
-FLA_Syr
-FLA_Syr2
-FLA_Syr2k
-FLA_Syr2k_cntl_init
-FLA_Syr2k_cntl_finalize
-FLA_Syr2k_external
-FLA_Syr2k_internal
-FLA_Syr2k_ln
-FLA_Syr2k_ln_blk_var1
-FLA_Syr2k_ln_blk_var10
-FLA_Syr2k_ln_blk_var2
-FLA_Syr2k_ln_blk_var3
-FLA_Syr2k_ln_blk_var4
-FLA_Syr2k_ln_blk_var5
-FLA_Syr2k_ln_blk_var6
-FLA_Syr2k_ln_blk_var7
-FLA_Syr2k_ln_blk_var8
-FLA_Syr2k_ln_blk_var9
-FLA_Syr2k_ln_unb_var1
-FLA_Syr2k_ln_unb_var10
-FLA_Syr2k_ln_unb_var2
-FLA_Syr2k_ln_unb_var3
-FLA_Syr2k_ln_unb_var4
-FLA_Syr2k_ln_unb_var5
-FLA_Syr2k_ln_unb_var6
-FLA_Syr2k_ln_unb_var7
-FLA_Syr2k_ln_unb_var8
-FLA_Syr2k_ln_unb_var9
-FLA_Syr2k_lt
-FLA_Syr2k_lt_blk_var1
-FLA_Syr2k_lt_blk_var10
-FLA_Syr2k_lt_blk_var2
-FLA_Syr2k_lt_blk_var3
-FLA_Syr2k_lt_blk_var4
-FLA_Syr2k_lt_blk_var5
-FLA_Syr2k_lt_blk_var6
-FLA_Syr2k_lt_blk_var7
-FLA_Syr2k_lt_blk_var8
-FLA_Syr2k_lt_blk_var9
-FLA_Syr2k_lt_unb_var1
-FLA_Syr2k_lt_unb_var10
-FLA_Syr2k_lt_unb_var2
-FLA_Syr2k_lt_unb_var3
-FLA_Syr2k_lt_unb_var4
-FLA_Syr2k_lt_unb_var5
-FLA_Syr2k_lt_unb_var6
-FLA_Syr2k_lt_unb_var7
-FLA_Syr2k_lt_unb_var8
-FLA_Syr2k_lt_unb_var9
-FLA_Syr2k_task
-FLA_Syr2k_ln_task
-FLA_Syr2k_lt_task
-FLA_Syr2k_un_task
-FLA_Syr2k_ut_task
-FLA_Syr2k_un
-FLA_Syr2k_un_blk_var1
-FLA_Syr2k_un_blk_var10
-FLA_Syr2k_un_blk_var2
-FLA_Syr2k_un_blk_var3
-FLA_Syr2k_un_blk_var4
-FLA_Syr2k_un_blk_var5
-FLA_Syr2k_un_blk_var6
-FLA_Syr2k_un_blk_var7
-FLA_Syr2k_un_blk_var8
-FLA_Syr2k_un_blk_var9
-FLA_Syr2k_un_unb_var1
-FLA_Syr2k_un_unb_var10
-FLA_Syr2k_un_unb_var2
-FLA_Syr2k_un_unb_var3
-FLA_Syr2k_un_unb_var4
-FLA_Syr2k_un_unb_var5
-FLA_Syr2k_un_unb_var6
-FLA_Syr2k_un_unb_var7
-FLA_Syr2k_un_unb_var8
-FLA_Syr2k_un_unb_var9
-FLA_Syr2k_ut
-FLA_Syr2k_ut_blk_var1
-FLA_Syr2k_ut_blk_var10
-FLA_Syr2k_ut_blk_var2
-FLA_Syr2k_ut_blk_var3
-FLA_Syr2k_ut_blk_var4
-FLA_Syr2k_ut_blk_var5
-FLA_Syr2k_ut_blk_var6
-FLA_Syr2k_ut_blk_var7
-FLA_Syr2k_ut_blk_var8
-FLA_Syr2k_ut_blk_var9
-FLA_Syr2k_ut_unb_var1
-FLA_Syr2k_ut_unb_var10
-FLA_Syr2k_ut_unb_var2
-FLA_Syr2k_ut_unb_var3
-FLA_Syr2k_ut_unb_var4
-FLA_Syr2k_ut_unb_var5
-FLA_Syr2k_ut_unb_var6
-FLA_Syr2k_ut_unb_var7
-FLA_Syr2k_ut_unb_var8
-FLA_Syr2k_ut_unb_var9
-FLA_Syr2_external
-FLA_Syrk
-FLA_Syrk_cntl_init
-FLA_Syrk_cntl_finalize
-FLA_Syrk_external
-FLA_Syrk_internal
-FLA_Syrk_ln
-FLA_Syrk_ln_blk_var1
-FLA_Syrk_ln_blk_var2
-FLA_Syrk_ln_blk_var3
-FLA_Syrk_ln_blk_var4
-FLA_Syrk_ln_blk_var5
-FLA_Syrk_ln_blk_var6
-FLA_Syrk_ln_unb_var1
-FLA_Syrk_ln_unb_var2
-FLA_Syrk_ln_unb_var3
-FLA_Syrk_ln_unb_var4
-FLA_Syrk_ln_unb_var5
-FLA_Syrk_ln_unb_var6
-FLA_Syrk_lt
-FLA_Syrk_lt_blk_var1
-FLA_Syrk_lt_blk_var2
-FLA_Syrk_lt_blk_var3
-FLA_Syrk_lt_blk_var4
-FLA_Syrk_lt_blk_var5
-FLA_Syrk_lt_blk_var6
-FLA_Syrk_lt_unb_var1
-FLA_Syrk_lt_unb_var2
-FLA_Syrk_lt_unb_var3
-FLA_Syrk_lt_unb_var4
-FLA_Syrk_lt_unb_var5
-FLA_Syrk_lt_unb_var6
-FLA_Syrk_task
-FLA_Syrk_ln_task
-FLA_Syrk_lt_task
-FLA_Syrk_un_task
-FLA_Syrk_ut_task
-FLA_Syrk_un
-FLA_Syrk_un_blk_var1
-FLA_Syrk_un_blk_var2
-FLA_Syrk_un_blk_var3
-FLA_Syrk_un_blk_var4
-FLA_Syrk_un_blk_var5
-FLA_Syrk_un_blk_var6
-FLA_Syrk_un_unb_var1
-FLA_Syrk_un_unb_var2
-FLA_Syrk_un_unb_var3
-FLA_Syrk_un_unb_var4
-FLA_Syrk_un_unb_var5
-FLA_Syrk_un_unb_var6
-FLA_Syrk_ut
-FLA_Syrk_ut_blk_var1
-FLA_Syrk_ut_blk_var2
-FLA_Syrk_ut_blk_var3
-FLA_Syrk_ut_blk_var4
-FLA_Syrk_ut_blk_var5
-FLA_Syrk_ut_blk_var6
-FLA_Syrk_ut_unb_var1
-FLA_Syrk_ut_unb_var2
-FLA_Syrk_ut_unb_var3
-FLA_Syrk_ut_unb_var4
-FLA_Syrk_ut_unb_var5
-FLA_Syrk_ut_unb_var6
-FLA_Syr_external
-FLA_Transpose
-FLA_Transpose_blk_var1
-FLA_Transpose_blk_var2
-FLA_Transpose_cntl_init
-FLA_Transpose_cntl_finalize
-FLA_Transpose_unb_var1
-FLA_Transpose_unb_var2
-FLA_Triangularize
-FLA_Trinv
-FLA_Trinv_blk_external
-FLA_Trinv_cntl_init
-FLA_Trinv_cntl_finalize
-FLA_Trinv_internal
-FLA_Trinv_ln
-FLA_Trinv_ln_blk_var1
-FLA_Trinv_ln_blk_var2
-FLA_Trinv_ln_blk_var3
-FLA_Trinv_ln_blk_var4
-FLA_Trinv_ln_opt_var1
-FLA_Trinv_ln_ops_var1
-FLA_Trinv_ln_opd_var1
-FLA_Trinv_ln_opc_var1
-FLA_Trinv_ln_opz_var1
-FLA_Trinv_ln_opt_var2
-FLA_Trinv_ln_ops_var2
-FLA_Trinv_ln_opd_var2
-FLA_Trinv_ln_opc_var2
-FLA_Trinv_ln_opz_var2
-FLA_Trinv_ln_opt_var3
-FLA_Trinv_ln_ops_var3
-FLA_Trinv_ln_opd_var3
-FLA_Trinv_ln_opc_var3
-FLA_Trinv_ln_opz_var3
-FLA_Trinv_ln_opt_var4
-FLA_Trinv_ln_ops_var4
-FLA_Trinv_ln_opd_var4
-FLA_Trinv_ln_opc_var4
-FLA_Trinv_ln_opz_var4
-FLA_Trinv_ln_unb_var1
-FLA_Trinv_ln_unb_var2
-FLA_Trinv_ln_unb_var3
-FLA_Trinv_ln_unb_var4
-FLA_Trinv_lu
-FLA_Trinv_lu_blk_var1
-FLA_Trinv_lu_blk_var2
-FLA_Trinv_lu_blk_var3
-FLA_Trinv_lu_blk_var4
-FLA_Trinv_lu_opt_var1
-FLA_Trinv_lu_ops_var1
-FLA_Trinv_lu_opd_var1
-FLA_Trinv_lu_opc_var1
-FLA_Trinv_lu_opz_var1
-FLA_Trinv_lu_opt_var2
-FLA_Trinv_lu_ops_var2
-FLA_Trinv_lu_opd_var2
-FLA_Trinv_lu_opc_var2
-FLA_Trinv_lu_opz_var2
-FLA_Trinv_lu_opt_var3
-FLA_Trinv_lu_ops_var3
-FLA_Trinv_lu_opd_var3
-FLA_Trinv_lu_opc_var3
-FLA_Trinv_lu_opz_var3
-FLA_Trinv_lu_opt_var4
-FLA_Trinv_lu_ops_var4
-FLA_Trinv_lu_opd_var4
-FLA_Trinv_lu_opc_var4
-FLA_Trinv_lu_opz_var4
-FLA_Trinv_lu_unb_var1
-FLA_Trinv_lu_unb_var2
-FLA_Trinv_lu_unb_var3
-FLA_Trinv_lu_unb_var4
-FLA_Trinv_task
-FLA_Trinv_ln_task
-FLA_Trinv_lu_task
-FLA_Trinv_un_task
-FLA_Trinv_uu_task
-FLA_Trinv_un
-FLA_Trinv_unb_external
-FLA_Trinv_ln_unb_ext
-FLA_Trinv_lu_unb_ext
-FLA_Trinv_un_unb_ext
-FLA_Trinv_uu_unb_ext
-FLA_Trinv_un_blk_var1
-FLA_Trinv_un_blk_var2
-FLA_Trinv_un_blk_var3
-FLA_Trinv_un_blk_var4
-FLA_Trinv_un_opt_var1
-FLA_Trinv_un_ops_var1
-FLA_Trinv_un_opd_var1
-FLA_Trinv_un_opc_var1
-FLA_Trinv_un_opz_var1
-FLA_Trinv_un_opt_var2
-FLA_Trinv_un_ops_var2
-FLA_Trinv_un_opd_var2
-FLA_Trinv_un_opc_var2
-FLA_Trinv_un_opz_var2
-FLA_Trinv_un_opt_var3
-FLA_Trinv_un_ops_var3
-FLA_Trinv_un_opd_var3
-FLA_Trinv_un_opc_var3
-FLA_Trinv_un_opz_var3
-FLA_Trinv_un_opt_var4
-FLA_Trinv_un_ops_var4
-FLA_Trinv_un_opd_var4
-FLA_Trinv_un_opc_var4
-FLA_Trinv_un_opz_var4
-FLA_Trinv_un_unb_var1
-FLA_Trinv_un_unb_var2
-FLA_Trinv_un_unb_var3
-FLA_Trinv_un_unb_var4
-FLA_Trinv_uu
-FLA_Trinv_uu_blk_var1
-FLA_Trinv_uu_blk_var2
-FLA_Trinv_uu_blk_var3
-FLA_Trinv_uu_blk_var4
-FLA_Trinv_uu_opt_var1
-FLA_Trinv_uu_ops_var1
-FLA_Trinv_uu_opd_var1
-FLA_Trinv_uu_opc_var1
-FLA_Trinv_uu_opz_var1
-FLA_Trinv_uu_opt_var2
-FLA_Trinv_uu_ops_var2
-FLA_Trinv_uu_opd_var2
-FLA_Trinv_uu_opc_var2
-FLA_Trinv_uu_opz_var2
-FLA_Trinv_uu_opt_var3
-FLA_Trinv_uu_ops_var3
-FLA_Trinv_uu_opd_var3
-FLA_Trinv_uu_opc_var3
-FLA_Trinv_uu_opz_var3
-FLA_Trinv_uu_opt_var4
-FLA_Trinv_uu_ops_var4
-FLA_Trinv_uu_opd_var4
-FLA_Trinv_uu_opc_var4
-FLA_Trinv_uu_opz_var4
-FLA_Trinv_uu_unb_var1
-FLA_Trinv_uu_unb_var2
-FLA_Trinv_uu_unb_var3
-FLA_Trinv_uu_unb_var4
-FLA_Trmm
-FLA_Trmmsx_external
-FLA_Trmm_cntl_init
-FLA_Trmm_cntl_finalize
-FLA_Trmm_external
-FLA_Trmm_internal
-FLA_Trmm_llh
-FLA_Trmm_llh_blk_var1
-FLA_Trmm_llh_blk_var2
-FLA_Trmm_llh_blk_var3
-FLA_Trmm_llh_blk_var4
-FLA_Trmm_llh_unb_var1
-FLA_Trmm_llh_unb_var2
-FLA_Trmm_llh_unb_var3
-FLA_Trmm_llh_unb_var4
-FLA_Trmm_lln
-FLA_Trmm_lln_blk_var1
-FLA_Trmm_lln_blk_var2
-FLA_Trmm_lln_blk_var3
-FLA_Trmm_lln_blk_var4
-FLA_Trmm_lln_unb_var1
-FLA_Trmm_lln_unb_var2
-FLA_Trmm_lln_unb_var3
-FLA_Trmm_lln_unb_var4
-FLA_Trmm_llt
-FLA_Trmm_llt_blk_var1
-FLA_Trmm_llt_blk_var2
-FLA_Trmm_llt_blk_var3
-FLA_Trmm_llt_blk_var4
-FLA_Trmm_llt_unb_var1
-FLA_Trmm_llt_unb_var2
-FLA_Trmm_llt_unb_var3
-FLA_Trmm_llt_unb_var4
-FLA_Trmm_luh
-FLA_Trmm_luh_blk_var1
-FLA_Trmm_luh_blk_var2
-FLA_Trmm_luh_blk_var3
-FLA_Trmm_luh_blk_var4
-FLA_Trmm_luh_unb_var1
-FLA_Trmm_luh_unb_var2
-FLA_Trmm_luh_unb_var3
-FLA_Trmm_luh_unb_var4
-FLA_Trmm_lun
-FLA_Trmm_lun_blk_var1
-FLA_Trmm_lun_blk_var2
-FLA_Trmm_lun_blk_var3
-FLA_Trmm_lun_blk_var4
-FLA_Trmm_lun_unb_var1
-FLA_Trmm_lun_unb_var2
-FLA_Trmm_lun_unb_var3
-FLA_Trmm_lun_unb_var4
-FLA_Trmm_lut
-FLA_Trmm_lut_blk_var1
-FLA_Trmm_lut_blk_var2
-FLA_Trmm_lut_blk_var3
-FLA_Trmm_lut_blk_var4
-FLA_Trmm_lut_unb_var1
-FLA_Trmm_lut_unb_var2
-FLA_Trmm_lut_unb_var3
-FLA_Trmm_lut_unb_var4
-FLA_Trmm_rlh
-FLA_Trmm_rlh_blk_var1
-FLA_Trmm_rlh_blk_var2
-FLA_Trmm_rlh_blk_var3
-FLA_Trmm_rlh_blk_var4
-FLA_Trmm_rlh_unb_var1
-FLA_Trmm_rlh_unb_var2
-FLA_Trmm_rlh_unb_var3
-FLA_Trmm_rlh_unb_var4
-FLA_Trmm_rln
-FLA_Trmm_rln_blk_var1
-FLA_Trmm_rln_blk_var2
-FLA_Trmm_rln_blk_var3
-FLA_Trmm_rln_blk_var4
-FLA_Trmm_rln_unb_var1
-FLA_Trmm_rln_unb_var2
-FLA_Trmm_rln_unb_var3
-FLA_Trmm_rln_unb_var4
-FLA_Trmm_rlt
-FLA_Trmm_rlt_blk_var1
-FLA_Trmm_rlt_blk_var2
-FLA_Trmm_rlt_blk_var3
-FLA_Trmm_rlt_blk_var4
-FLA_Trmm_rlt_unb_var1
-FLA_Trmm_rlt_unb_var2
-FLA_Trmm_rlt_unb_var3
-FLA_Trmm_rlt_unb_var4
-FLA_Trmm_ruh
-FLA_Trmm_ruh_blk_var1
-FLA_Trmm_ruh_blk_var2
-FLA_Trmm_ruh_blk_var3
-FLA_Trmm_ruh_blk_var4
-FLA_Trmm_ruh_unb_var1
-FLA_Trmm_ruh_unb_var2
-FLA_Trmm_ruh_unb_var3
-FLA_Trmm_ruh_unb_var4
-FLA_Trmm_run
-FLA_Trmm_run_blk_var1
-FLA_Trmm_run_blk_var2
-FLA_Trmm_run_blk_var3
-FLA_Trmm_run_blk_var4
-FLA_Trmm_run_unb_var1
-FLA_Trmm_run_unb_var2
-FLA_Trmm_run_unb_var3
-FLA_Trmm_run_unb_var4
-FLA_Trmm_rut
-FLA_Trmm_rut_blk_var1
-FLA_Trmm_rut_blk_var2
-FLA_Trmm_rut_blk_var3
-FLA_Trmm_rut_blk_var4
-FLA_Trmm_rut_unb_var1
-FLA_Trmm_rut_unb_var2
-FLA_Trmm_rut_unb_var3
-FLA_Trmm_rut_unb_var4
-FLA_Trmm_task
-FLA_Trmm_llh_task
-FLA_Trmm_lln_task
-FLA_Trmm_llt_task
-FLA_Trmm_luh_task
-FLA_Trmm_lun_task
-FLA_Trmm_lut_task
-FLA_Trmm_rlh_task
-FLA_Trmm_rln_task
-FLA_Trmm_rlt_task
-FLA_Trmm_ruh_task
-FLA_Trmm_run_task
-FLA_Trmm_rut_task
-FLA_Trmv
-FLA_Trmvsx
-FLA_Trmvsx_external
-FLA_Trmv_external
-FLA_Trsm
-FLA_Trsmsx_external
-FLA_Trsm_cntl_init
-FLA_Trsm_cntl_finalize
-FLA_Trsm_external
-FLA_Trsm_internal
-FLA_Trsm_llh
-FLA_Trsm_llh_blk_var1
-FLA_Trsm_llh_blk_var2
-FLA_Trsm_llh_blk_var3
-FLA_Trsm_llh_blk_var4
-FLA_Trsm_llh_unb_var1
-FLA_Trsm_llh_unb_var2
-FLA_Trsm_llh_unb_var3
-FLA_Trsm_llh_unb_var4
-FLA_Trsm_lln
-FLA_Trsm_lln_blk_var1
-FLA_Trsm_lln_blk_var2
-FLA_Trsm_lln_blk_var3
-FLA_Trsm_lln_blk_var4
-FLA_Trsm_lln_unb_var1
-FLA_Trsm_lln_unb_var2
-FLA_Trsm_lln_unb_var3
-FLA_Trsm_lln_unb_var4
-FLA_Trsm_llt
-FLA_Trsm_llt_blk_var1
-FLA_Trsm_llt_blk_var2
-FLA_Trsm_llt_blk_var3
-FLA_Trsm_llt_blk_var4
-FLA_Trsm_llt_unb_var1
-FLA_Trsm_llt_unb_var2
-FLA_Trsm_llt_unb_var3
-FLA_Trsm_llt_unb_var4
-FLA_Trsm_luh
-FLA_Trsm_luh_blk_var1
-FLA_Trsm_luh_blk_var2
-FLA_Trsm_luh_blk_var3
-FLA_Trsm_luh_blk_var4
-FLA_Trsm_luh_unb_var1
-FLA_Trsm_luh_unb_var2
-FLA_Trsm_luh_unb_var3
-FLA_Trsm_luh_unb_var4
-FLA_Trsm_lun
-FLA_Trsm_lun_blk_var1
-FLA_Trsm_lun_blk_var2
-FLA_Trsm_lun_blk_var3
-FLA_Trsm_lun_blk_var4
-FLA_Trsm_lun_unb_var1
-FLA_Trsm_lun_unb_var2
-FLA_Trsm_lun_unb_var3
-FLA_Trsm_lun_unb_var4
-FLA_Trsm_lut
-FLA_Trsm_lut_blk_var1
-FLA_Trsm_lut_blk_var2
-FLA_Trsm_lut_blk_var3
-FLA_Trsm_lut_blk_var4
-FLA_Trsm_lut_unb_var1
-FLA_Trsm_lut_unb_var2
-FLA_Trsm_lut_unb_var3
-FLA_Trsm_lut_unb_var4
-FLA_Trsm_piv_task
-FLA_Trsm_rlh
-FLA_Trsm_rlh_blk_var1
-FLA_Trsm_rlh_blk_var2
-FLA_Trsm_rlh_blk_var3
-FLA_Trsm_rlh_blk_var4
-FLA_Trsm_rlh_unb_var1
-FLA_Trsm_rlh_unb_var2
-FLA_Trsm_rlh_unb_var3
-FLA_Trsm_rlh_unb_var4
-FLA_Trsm_rln
-FLA_Trsm_rln_blk_var1
-FLA_Trsm_rln_blk_var2
-FLA_Trsm_rln_blk_var3
-FLA_Trsm_rln_blk_var4
-FLA_Trsm_rln_unb_var1
-FLA_Trsm_rln_unb_var2
-FLA_Trsm_rln_unb_var3
-FLA_Trsm_rln_unb_var4
-FLA_Trsm_rlt
-FLA_Trsm_rlt_blk_var1
-FLA_Trsm_rlt_blk_var2
-FLA_Trsm_rlt_blk_var3
-FLA_Trsm_rlt_blk_var4
-FLA_Trsm_rlt_unb_var1
-FLA_Trsm_rlt_unb_var2
-FLA_Trsm_rlt_unb_var3
-FLA_Trsm_rlt_unb_var4
-FLA_Trsm_ruh
-FLA_Trsm_ruh_blk_var1
-FLA_Trsm_ruh_blk_var2
-FLA_Trsm_ruh_blk_var3
-FLA_Trsm_ruh_blk_var4
-FLA_Trsm_ruh_unb_var1
-FLA_Trsm_ruh_unb_var2
-FLA_Trsm_ruh_unb_var3
-FLA_Trsm_ruh_unb_var4
-FLA_Trsm_run
-FLA_Trsm_run_blk_var1
-FLA_Trsm_run_blk_var2
-FLA_Trsm_run_blk_var3
-FLA_Trsm_run_blk_var4
-FLA_Trsm_run_unb_var1
-FLA_Trsm_run_unb_var2
-FLA_Trsm_run_unb_var3
-FLA_Trsm_run_unb_var4
-FLA_Trsm_rut
-FLA_Trsm_rut_blk_var1
-FLA_Trsm_rut_blk_var2
-FLA_Trsm_rut_blk_var3
-FLA_Trsm_rut_blk_var4
-FLA_Trsm_rut_unb_var1
-FLA_Trsm_rut_unb_var2
-FLA_Trsm_rut_unb_var3
-FLA_Trsm_rut_unb_var4
-FLA_Trsm_task
-FLA_Trsm_llh_task
-FLA_Trsm_lln_task
-FLA_Trsm_llt_task
-FLA_Trsm_luh_task
-FLA_Trsm_lun_task
-FLA_Trsm_lut_task
-FLA_Trsm_rlh_task
-FLA_Trsm_rln_task
-FLA_Trsm_rlt_task
-FLA_Trsm_ruh_task
-FLA_Trsm_run_task
-FLA_Trsm_rut_task
-FLA_Trsv
-FLA_Trsvsx
-FLA_Trsvsx_external
-FLA_Trsv_cntl_init
-FLA_Trsv_cntl_finalize
-FLA_Trsv_external
-FLA_Trsv_internal
-FLA_Trsv_lc
-FLA_Trsv_lc_blk_var1
-FLA_Trsv_lc_blk_var2
-FLA_Trsv_ln
-FLA_Trsv_ln_blk_var1
-FLA_Trsv_ln_blk_var2
-FLA_Trsv_lt
-FLA_Trsv_lt_blk_var1
-FLA_Trsv_lt_blk_var2
-FLA_Trsv_task
-FLA_Trsv_lc_task
-FLA_Trsv_ln_task
-FLA_Trsv_lt_task
-FLA_Trsv_uc_task
-FLA_Trsv_un_task
-FLA_Trsv_ut_task
-FLA_Trsv_uc
-FLA_Trsv_uc_blk_var1
-FLA_Trsv_uc_blk_var2
-FLA_Trsv_un
-FLA_Trsv_un_blk_var1
-FLA_Trsv_un_blk_var2
-FLA_Trsv_ut
-FLA_Trsv_ut_blk_var1
-FLA_Trsv_ut_blk_var2
-FLA_Ttmm
-FLA_Ttmm_blk_external
-FLA_Ttmm_cntl_init
-FLA_Ttmm_cntl_finalize
-FLA_Ttmm_internal
-FLA_Ttmm_l
-FLA_Ttmm_l_blk_var1
-FLA_Ttmm_l_blk_var2
-FLA_Ttmm_l_blk_var3
-FLA_Ttmm_l_opt_var1
-FLA_Ttmm_l_ops_var1
-FLA_Ttmm_l_opd_var1
-FLA_Ttmm_l_opc_var1
-FLA_Ttmm_l_opz_var1
-FLA_Ttmm_l_opt_var2
-FLA_Ttmm_l_ops_var2
-FLA_Ttmm_l_opd_var2
-FLA_Ttmm_l_opc_var2
-FLA_Ttmm_l_opz_var2
-FLA_Ttmm_l_opt_var3
-FLA_Ttmm_l_ops_var3
-FLA_Ttmm_l_opd_var3
-FLA_Ttmm_l_opc_var3
-FLA_Ttmm_l_opz_var3
-FLA_Ttmm_l_unb_var1
-FLA_Ttmm_l_unb_var2
-FLA_Ttmm_l_unb_var3
-FLA_Ttmm_task
-FLA_Ttmm_l_task
-FLA_Ttmm_u_task
-FLA_Ttmm_u
-FLA_Ttmm_unb_external
-FLA_Ttmm_l_unb_ext
-FLA_Ttmm_u_unb_ext
-FLA_Ttmm_u_blk_var1
-FLA_Ttmm_u_blk_var2
-FLA_Ttmm_u_blk_var3
-FLA_Ttmm_u_opt_var1
-FLA_Ttmm_u_ops_var1
-FLA_Ttmm_u_opd_var1
-FLA_Ttmm_u_opc_var1
-FLA_Ttmm_u_opz_var1
-FLA_Ttmm_u_opt_var2
-FLA_Ttmm_u_ops_var2
-FLA_Ttmm_u_opd_var2
-FLA_Ttmm_u_opc_var2
-FLA_Ttmm_u_opz_var2
-FLA_Ttmm_u_opt_var3
-FLA_Ttmm_u_ops_var3
-FLA_Ttmm_u_opd_var3
-FLA_Ttmm_u_opc_var3
-FLA_Ttmm_u_opz_var3
-FLA_Ttmm_u_unb_var1
-FLA_Ttmm_u_unb_var2
-FLA_Ttmm_u_unb_var3
-FLA_Part_2x2
-FLA_Part_2x1
-FLA_Part_1x2
-FLA_Repart_2x2_to_3x3
-FLA_Repart_2x1_to_3x1
-FLA_Repart_1x2_to_1x3
-FLA_Cont_with_3x3_to_2x2
-FLA_Cont_with_3x1_to_2x1
-FLA_Cont_with_1x3_to_1x2
-FLA_Merge_2x2
-FLA_Merge_2x1
-FLA_Merge_1x2
+EXPORTS \r
+FLA_TWO\r
+FLA_ONE\r
+FLA_ONE_HALF\r
+FLA_ZERO\r
+FLA_MINUS_ONE_HALF\r
+FLA_MINUS_ONE\r
+FLA_MINUS_TWO\r
+fla_axpyt_cntl_blas\r
+fla_copyt_cntl_blas\r
+fla_gemm_cntl_blas\r
+fla_hemm_cntl_blas\r
+fla_herk_cntl_blas\r
+fla_her2k_cntl_blas\r
+fla_symm_cntl_blas\r
+fla_syrk_cntl_blas\r
+fla_syr2k_cntl_blas\r
+fla_trmm_cntl_blas\r
+fla_trsm_cntl_blas\r
+fla_appiv_cntl_unb\r
+bli_samax \r
+bli_damax \r
+bli_camax \r
+bli_zamax \r
+bli_sasum \r
+bli_dasum \r
+bli_casum \r
+bli_zasum \r
+bli_saxpy \r
+bli_daxpy \r
+bli_caxpy \r
+bli_zaxpy \r
+bli_saxpymt \r
+bli_daxpymt \r
+bli_caxpymt \r
+bli_zaxpymt \r
+bli_saxpysmt \r
+bli_daxpysmt \r
+bli_caxpysmt \r
+bli_zaxpysmt \r
+bli_saxpysv \r
+bli_daxpysv \r
+bli_caxpysv \r
+bli_zaxpysv \r
+bli_saxpyv \r
+bli_daxpyv \r
+bli_caxpyv \r
+bli_zaxpyv \r
+bli_cconjm \r
+bli_zconjm \r
+bli_cconjmr \r
+bli_zconjmr \r
+bli_cconjv \r
+bli_zconjv \r
+bli_scopy \r
+bli_dcopy \r
+bli_ccopy \r
+bli_zcopy \r
+bli_scopymr \r
+bli_dcopymr \r
+bli_ccopymr \r
+bli_zcopymr \r
+bli_scopymt \r
+bli_dcopymt \r
+bli_ccopymt \r
+bli_zcopymt \r
+bli_scopyv \r
+bli_dcopyv \r
+bli_ccopyv \r
+bli_zcopyv \r
+bli_sdot \r
+bli_ddot \r
+bli_cdot \r
+bli_zdot \r
+bli_sdot2s \r
+bli_ddot2s \r
+bli_cdot2s \r
+bli_zdot2s \r
+bli_sdots \r
+bli_ddots \r
+bli_cdots \r
+bli_zdots \r
+bli_sinverts \r
+bli_dinverts \r
+bli_cinverts \r
+bli_zinverts \r
+bli_sinvscalm \r
+bli_dinvscalm \r
+bli_csinvscalm \r
+bli_cinvscalm \r
+bli_zdinvscalm \r
+bli_zinvscalm \r
+bli_sinvscalv \r
+bli_dinvscalv \r
+bli_csinvscalv \r
+bli_cinvscalv \r
+bli_zdinvscalv \r
+bli_zinvscalv \r
+bli_snrm2 \r
+bli_dnrm2 \r
+bli_cnrm2 \r
+bli_znrm2 \r
+bli_sscal \r
+bli_dscal \r
+bli_csscal \r
+bli_cscal \r
+bli_zdscal \r
+bli_zscal \r
+bli_sscalm \r
+bli_dscalm \r
+bli_csscalm \r
+bli_cscalm \r
+bli_zdscalm \r
+bli_zscalm \r
+bli_sscalmr \r
+bli_dscalmr \r
+bli_csscalmr \r
+bli_cscalmr \r
+bli_zdscalmr \r
+bli_zscalmr \r
+bli_sscalv \r
+bli_dscalv \r
+bli_csscalv \r
+bli_cscalv \r
+bli_zdscalv \r
+bli_zscalv \r
+bli_sswap \r
+bli_dswap \r
+bli_cswap \r
+bli_zswap \r
+bli_sswapmt \r
+bli_dswapmt \r
+bli_cswapmt \r
+bli_zswapmt \r
+bli_sgemv \r
+bli_dgemv \r
+bli_cgemv \r
+bli_zgemv \r
+bli_sger \r
+bli_dger \r
+bli_cger \r
+bli_zger \r
+bli_chemv \r
+bli_zhemv \r
+bli_cher \r
+bli_zher \r
+bli_cher2 \r
+bli_zher2 \r
+bli_ssymv \r
+bli_dsymv \r
+bli_csymv \r
+bli_zsymv \r
+bli_ssyr \r
+bli_dsyr \r
+bli_csyr \r
+bli_zsyr \r
+bli_ssyr2 \r
+bli_dsyr2 \r
+bli_csyr2 \r
+bli_zsyr2 \r
+bli_strmv \r
+bli_dtrmv \r
+bli_ctrmv \r
+bli_ztrmv \r
+bli_strsv \r
+bli_dtrsv \r
+bli_ctrsv \r
+bli_ztrsv \r
+bli_sgemm \r
+bli_dgemm \r
+bli_cgemm \r
+bli_zgemm \r
+bli_chemm \r
+bli_zhemm \r
+bli_cherk \r
+bli_zherk \r
+bli_cher2k \r
+bli_zher2k \r
+bli_ssymm \r
+bli_dsymm \r
+bli_csymm \r
+bli_zsymm \r
+bli_ssyrk \r
+bli_dsyrk \r
+bli_csyrk \r
+bli_zsyrk \r
+bli_ssyr2k \r
+bli_dsyr2k \r
+bli_csyr2k \r
+bli_zsyr2k \r
+bli_strmm \r
+bli_dtrmm \r
+bli_ctrmm \r
+bli_ztrmm \r
+bli_strsm \r
+bli_dtrsm \r
+bli_ctrsm \r
+bli_ztrsm \r
+FLASH_Apply_pivots \r
+FLASH_Apply_pivots_cntl_init \r
+FLASH_Apply_pivots_cntl_finalize \r
+FLASH_Apply_Q_UT \r
+FLASH_Apply_Q_UT_cntl_init \r
+FLASH_Apply_Q_UT_cntl_finalize \r
+FLASH_Apply_Q_UT_inc \r
+FLASH_Apply_Q_UT_inc_cntl_init \r
+FLASH_Apply_Q_UT_inc_cntl_finalize \r
+FLASH_Apply_Q_UT_inc_create_workspace \r
+FLASH_Apply_Q2_UT \r
+FLASH_Apply_Q2_UT_cntl_init \r
+FLASH_Apply_Q2_UT_cntl_finalize \r
+FLASH_Axpy \r
+FLASH_Axpyt \r
+FLASH_Axpyt_cntl_init \r
+FLASH_Axpyt_cntl_finalize \r
+FLASH_Axpy_cntl_init \r
+FLASH_Axpy_cntl_finalize \r
+FLASH_Axpy_buffer_to_hier \r
+FLASH_Axpy_hier_to_buffer \r
+FLASH_Axpy_flat_to_hier \r
+FLASH_Axpy_hier_to_flat \r
+FLASH_Axpy_hierarchy \r
+FLASH_Axpy_hierarchy_r \r
+FLASH_Chol \r
+FLASH_Chol_cntl_init \r
+FLASH_Chol_cntl_finalize \r
+FLASH_Chol_solve \r
+FLASH_Copy \r
+FLASH_Copyt \r
+FLASH_Copyt_cntl_init \r
+FLASH_Copyt_cntl_finalize \r
+FLASH_Copy_cntl_init \r
+FLASH_Copy_cntl_finalize \r
+FLASH_Copy_buffer_to_hier \r
+FLASH_Copy_hier_to_buffer \r
+FLASH_Copy_flat_to_hier \r
+FLASH_Copy_hier_to_flat \r
+FLASH_Copy_hierarchy \r
+FLASH_Copy_hierarchy_r \r
+FLASH_FS_incpiv \r
+FLASH_FS_incpiv_aux1 \r
+FLASH_FS_incpiv_aux2 \r
+FLASH_Gemm \r
+FLASH_Gemm_cntl_init \r
+FLASH_Gemm_cntl_finalize \r
+FLASH_Gemv \r
+FLASH_Gemv_cntl_init \r
+FLASH_Gemv_cntl_finalize \r
+FLASH_Hemm \r
+FLASH_Hemm_cntl_init \r
+FLASH_Hemm_cntl_finalize \r
+FLASH_Her2k \r
+FLASH_Her2k_cntl_init \r
+FLASH_Her2k_cntl_finalize \r
+FLASH_Herk \r
+FLASH_Herk_cntl_init \r
+FLASH_Herk_cntl_finalize \r
+FLASH_LU_find_zero_on_diagonal \r
+FLASH_LU_incpiv \r
+FLASH_LU_incpiv_cntl_init \r
+FLASH_LU_incpiv_cntl_finalize \r
+FLASH_LU_incpiv_create_hier_matrices \r
+FLASH_LU_incpiv_determine_alg_blocksize \r
+FLASH_LU_incpiv_noopt \r
+FLASH_LU_incpiv_opt1 \r
+FLASH_LU_incpiv_solve \r
+FLASH_LU_incpiv_var1 \r
+FLASH_LU_incpiv_var2 \r
+FLASH_LU_nopiv \r
+FLASH_LU_nopiv_cntl_init \r
+FLASH_LU_nopiv_cntl_finalize \r
+FLASH_LU_nopiv_solve \r
+FLASH_LU_piv \r
+FLASH_LU_piv_cntl_init \r
+FLASH_LU_piv_cntl_finalize \r
+FLASH_LU_piv_solve \r
+FLASH_Max_elemwise_diff \r
+FLASH_Norm1 \r
+FLASH_Obj_datatype \r
+FLASH_Obj_depth \r
+FLASH_Obj_blocksizes \r
+FLASH_Obj_scalar_length \r
+FLASH_Obj_scalar_width \r
+FLASH_Obj_create \r
+FLASH_Obj_create_ext \r
+FLASH_Obj_create_without_buffer \r
+FLASH_Obj_create_without_buffer_ext \r
+FLASH_Obj_create_helper \r
+FLASH_Obj_create_hierarchy \r
+FLASH_Obj_create_conf_to \r
+FLASH_Obj_create_hier_conf_to_flat \r
+FLASH_Obj_create_hier_conf_to_flat_ext \r
+FLASH_Obj_create_flat_conf_to_hier \r
+FLASH_Obj_create_hier_copy_of_flat \r
+FLASH_Obj_create_hier_copy_of_flat_ext \r
+FLASH_Obj_create_flat_copy_of_hier \r
+FLASH_Obj_free \r
+FLASH_Obj_free_without_buffer \r
+FLASH_Obj_free_hierarchy \r
+FLASH_Obj_extract_buffer \r
+FLASH_Obj_flatten \r
+FLASH_Obj_hierarchify \r
+FLASH_Obj_show \r
+FLASH_Obj_attach_buffer \r
+FLASH_Obj_attach_buffer_hierarchy \r
+FLASH_print_struct \r
+FLASH_print_struct_helper \r
+FLASH_Obj_create_diag_panel \r
+FLASH_Obj_exec \r
+FLASH_Obj_exec_parallel \r
+FLASH_Obj_push \r
+FLASH_Set \r
+FLASH_Shift_diag \r
+FLASH_QR_UT_cntl_init \r
+FLASH_QR_UT_cntl_finalize \r
+FLASH_QR_UT_inc \r
+FLASH_QR_UT_inc_cntl_init \r
+FLASH_QR_UT_inc_cntl_finalize \r
+FLASH_QR_UT_inc_create_hier_matrices \r
+FLASH_QR_UT_inc_determine_alg_blocksize \r
+FLASH_QR_UT_inc_noopt \r
+FLASH_QR_UT_inc_opt1 \r
+FLASH_QR_UT_inc_solve \r
+FLASH_QR2_UT \r
+FLASH_QR2_UT_cntl_init \r
+FLASH_QR2_UT_cntl_finalize \r
+FLASH_Queue_begin \r
+FLASH_Queue_end \r
+FLASH_Queue_stack_depth \r
+FLASH_Queue_enable \r
+FLASH_Queue_disable \r
+FLASH_Queue_get_enabled \r
+FLASH_Queue_set_num_threads \r
+FLASH_Queue_get_num_threads \r
+FLASH_Queue_init \r
+FLASH_Queue_finalize \r
+FLASH_Queue_get_num_tasks \r
+FLASH_Queue_set_verbose_output \r
+FLASH_Queue_get_verbose_output \r
+FLASH_Queue_set_sorting \r
+FLASH_Queue_get_sorting \r
+FLASH_Queue_set_caching \r
+FLASH_Queue_get_caching \r
+FLASH_Queue_set_work_stealing \r
+FLASH_Queue_get_work_stealing \r
+FLASH_Queue_set_data_affinity \r
+FLASH_Queue_get_data_affinity \r
+FLASH_Queue_get_total_time \r
+FLASH_Queue_get_parallel_time \r
+FLASH_Queue_set_parallel_time \r
+FLASH_Queue_get_num_blocks \r
+FLASH_Queue_set_block_size \r
+FLASH_Queue_get_block_size \r
+FLASH_Queue_set_cache_size \r
+FLASH_Queue_get_cache_size \r
+FLASH_Queue_set_cache_line_size \r
+FLASH_Queue_get_cache_line_size \r
+FLASH_Queue_set_cores_per_cache \r
+FLASH_Queue_get_cores_per_cache \r
+FLASH_Queue_set_cores_per_queue \r
+FLASH_Queue_get_cores_per_queue \r
+FLASH_Queue_reset \r
+FLASH_Queue_get_head_task \r
+FLASH_Queue_get_tail_task \r
+FLASH_Queue_push \r
+FLASH_Queue_push_input \r
+FLASH_Queue_push_output \r
+FLASH_Task_alloc \r
+FLASH_Task_free \r
+FLASH_Queue_exec_task \r
+FLASH_Queue_verbose_output \r
+FLASH_Queue_exec \r
+FLASH_Queue_init_tasks \r
+FLASH_Queue_wait_enqueue \r
+FLASH_Queue_wait_dequeue \r
+FLASH_Queue_wait_dequeue_block \r
+FLASH_Queue_reside_in_cache \r
+FLASH_Queue_update_cache \r
+FLASH_Queue_update_cache_block \r
+FLASH_Queue_prefetch \r
+FLASH_Queue_prefetch_block \r
+FLASH_Queue_work_stealing \r
+FLASH_Queue_exec_parallel \r
+FLASH_Queue_exec_parallel_function \r
+FLASH_Task_update_dependencies \r
+FLASH_Task_update_binding \r
+FLASH_Task_free_parallel \r
+FLASH_Random_matrix \r
+FLASH_Random_spd_matrix \r
+FLASH_SA_FS \r
+FLASH_SA_LU \r
+FLASH_SPDinv \r
+FLASH_SPDinv_cntl_init \r
+FLASH_SPDinv_cntl_finalize \r
+FLASH_Sylv \r
+FLASH_Sylv_cntl_init \r
+FLASH_Sylv_cntl_finalize \r
+FLASH_Symm \r
+FLASH_Symm_cntl_init \r
+FLASH_Symm_cntl_finalize \r
+FLASH_Syr2k \r
+FLASH_Syr2k_cntl_init \r
+FLASH_Syr2k_cntl_finalize \r
+FLASH_Syrk \r
+FLASH_Syrk_cntl_init \r
+FLASH_Syrk_cntl_finalize \r
+FLASH_Triangularize \r
+FLASH_Trinv \r
+FLASH_Trinv_cntl_init \r
+FLASH_Trinv_cntl_finalize \r
+FLASH_Trmm \r
+FLASH_Trmm_cntl_init \r
+FLASH_Trmm_cntl_finalize \r
+FLASH_Trsm \r
+FLASH_Trsm_cntl_init \r
+FLASH_Trsm_cntl_finalize \r
+FLASH_Trsm_piv \r
+FLASH_Trsv \r
+FLASH_Trsv_cntl_init \r
+FLASH_Trsv_cntl_finalize \r
+FLASH_Ttmm \r
+FLASH_Ttmm_cntl_init \r
+FLASH_Ttmm_cntl_finalize \r
+FLA_Absolute_square \r
+FLA_Accum_T_UT \r
+FLA_Accum_T_UT_fc_blk_var2 \r
+FLA_Accum_T_UT_fc_opt_var1 \r
+FLA_Accum_T_UT_fc_ops_var1 \r
+FLA_Accum_T_UT_fc_opd_var1 \r
+FLA_Accum_T_UT_fc_opc_var1 \r
+FLA_Accum_T_UT_fc_opz_var1 \r
+FLA_Accum_T_UT_fc_unb_var1 \r
+FLA_Accum_T_UT_fr_blk_var2 \r
+FLA_Accum_T_UT_fr_opt_var1 \r
+FLA_Accum_T_UT_fr_ops_var1 \r
+FLA_Accum_T_UT_fr_opd_var1 \r
+FLA_Accum_T_UT_fr_opc_var1 \r
+FLA_Accum_T_UT_fr_opz_var1 \r
+FLA_Accum_T_UT_fr_unb_var1 \r
+FLA_Accum_T_UT_internal \r
+FLA_Amax \r
+FLA_Amax_external \r
+FLA_Apply_H2_UT \r
+FLA_Apply_H2_UT_internal \r
+FLA_Apply_H2_UT_lh_opt_var1 \r
+FLA_Apply_H2_UT_lh_ops_var1 \r
+FLA_Apply_H2_UT_lh_opd_var1 \r
+FLA_Apply_H2_UT_lh_opc_var1 \r
+FLA_Apply_H2_UT_lh_opz_var1 \r
+FLA_Apply_H2_UT_lh_unb_var1 \r
+FLA_Apply_H2_UT_rh_opt_var1 \r
+FLA_Apply_H2_UT_rh_ops_var1 \r
+FLA_Apply_H2_UT_rh_opd_var1 \r
+FLA_Apply_H2_UT_rh_opc_var1 \r
+FLA_Apply_H2_UT_rh_opz_var1 \r
+FLA_Apply_H2_UT_rh_unb_var1 \r
+FLA_Apply_H2_UT_rn_opt_var1 \r
+FLA_Apply_H2_UT_rn_ops_var1 \r
+FLA_Apply_H2_UT_rn_opd_var1 \r
+FLA_Apply_H2_UT_rn_opc_var1 \r
+FLA_Apply_H2_UT_rn_opz_var1 \r
+FLA_Apply_H2_UT_rn_unb_var1 \r
+FLA_Apply_pivots \r
+FLA_Apply_pivots_cntl_init \r
+FLA_Apply_pivots_cntl_finalize \r
+FLA_Apply_pivots_internal \r
+FLA_Apply_pivots_ln \r
+FLA_Apply_pivots_ln_blk_var1 \r
+FLA_Apply_pivots_ln_blk_var2 \r
+FLA_Apply_pivots_ln_opt_var1 \r
+FLA_Apply_pivots_ln_ops_var1 \r
+FLA_Apply_pivots_ln_opd_var1 \r
+FLA_Apply_pivots_ln_opc_var1 \r
+FLA_Apply_pivots_ln_opz_var1 \r
+FLA_Apply_pivots_macro_external \r
+FLA_Apply_pivots_macro_task \r
+FLA_Apply_pivots_task \r
+FLA_Apply_pivots_ln_task \r
+FLA_Apply_pivots_unb_external \r
+FLA_Apply_pivots_ln_unb_ext \r
+FLA_Apply_Q_blk_external \r
+FLA_Apply_Q_UT \r
+FLA_Apply_Q_UT_cntl_init \r
+FLA_Apply_Q_UT_cntl_finalize \r
+FLA_Apply_Q_UT_create_workspace \r
+FLA_Apply_Q_UT_inc_internal \r
+FLA_Apply_Q_UT_inc_lhfc \r
+FLA_Apply_Q_UT_inc_lhfc_blk_var1 \r
+FLA_Apply_Q_UT_internal \r
+FLA_Apply_Q_UT_lhfc \r
+FLA_Apply_Q_UT_lhfc_blk_var1 \r
+FLA_Apply_Q_UT_lhfc_blk_var2 \r
+FLA_Apply_Q_UT_lnfr \r
+FLA_Apply_Q_UT_lnfr_blk_var1 \r
+FLA_Apply_Q_UT_lnfr_blk_var2 \r
+FLA_Apply_Q_UT_rnfr \r
+FLA_Apply_Q_UT_rnfr_blk_var1 \r
+FLA_Apply_Q_UT_rnfr_blk_var2 \r
+FLA_Apply_Q_UT_task \r
+FLA_Apply_Q_UT_lhfc_task \r
+FLA_Apply_Q_UT_lnfr_task \r
+FLA_Apply_Q_UT_rnfr_task \r
+FLA_Apply_Q2_UT_cntl_init \r
+FLA_Apply_Q2_UT_cntl_finalize \r
+FLA_Apply_Q2_UT_internal \r
+FLA_Apply_Q2_UT_lhfc \r
+FLA_Apply_Q2_UT_lhfc_blk_var1 \r
+FLA_Apply_Q2_UT_lhfc_blk_var2 \r
+FLA_Apply_Q2_UT_lhfc_blk_var3 \r
+FLA_Apply_Q2_UT_task \r
+FLA_Apply_Q2_UT_lhfc_task \r
+FLA_Asum \r
+FLA_Asum_external \r
+FLA_Axpy \r
+FLA_Axpys \r
+FLA_Axpys_external \r
+FLA_Axpyt \r
+FLA_Axpyt_c \r
+FLA_Axpyt_cntl_init \r
+FLA_Axpyt_cntl_finalize \r
+FLA_Axpyt_c_blk_var1 \r
+FLA_Axpyt_c_blk_var2 \r
+FLA_Axpyt_c_blk_var3 \r
+FLA_Axpyt_c_blk_var4 \r
+FLA_Axpyt_external \r
+FLA_Axpyt_h \r
+FLA_Axpyt_h_blk_var1 \r
+FLA_Axpyt_h_blk_var2 \r
+FLA_Axpyt_h_blk_var3 \r
+FLA_Axpyt_h_blk_var4 \r
+FLA_Axpyt_internal \r
+FLA_Axpyt_n \r
+FLA_Axpyt_n_blk_var1 \r
+FLA_Axpyt_n_blk_var2 \r
+FLA_Axpyt_n_blk_var3 \r
+FLA_Axpyt_n_blk_var4 \r
+FLA_Axpyt_t \r
+FLA_Axpyt_task \r
+FLA_Axpyt_n_task \r
+FLA_Axpyt_t_task \r
+FLA_Axpyt_c_task \r
+FLA_Axpyt_h_task \r
+FLA_Axpyt_t_blk_var1 \r
+FLA_Axpyt_t_blk_var2 \r
+FLA_Axpyt_t_blk_var3 \r
+FLA_Axpyt_t_blk_var4 \r
+FLA_Axpy_blk_var1 \r
+FLA_Axpy_blk_var2 \r
+FLA_Axpy_blk_var3 \r
+FLA_Axpy_blk_var4 \r
+FLA_Axpy_cntl_init \r
+FLA_Axpy_cntl_finalize \r
+FLA_Axpy_external \r
+FLA_Axpy_internal \r
+FLA_Axpy_task \r
+FLA_Axpy_buffer_to_object \r
+FLA_Axpy_object_to_buffer \r
+FLA_Blocksize_create \r
+FLA_Blocksize_set \r
+FLA_Blocksize_scale \r
+FLA_Blocksize_create_copy \r
+FLA_Blocksize_free \r
+FLA_Blocksize_extract \r
+FLA_Query_blocksizes \r
+FLA_Query_blocksize \r
+FLA_Determine_blocksize \r
+FLA_determine_matrix_size \r
+FLA_Check_error_level \r
+FLA_Check_error_level_set \r
+FLA_Check_error_code_helper \r
+FLA_Check_valid_side \r
+FLA_Check_valid_uplo \r
+FLA_Check_valid_trans \r
+FLA_Check_valid_diag \r
+FLA_Check_valid_conj \r
+FLA_Check_valid_direct \r
+FLA_Check_valid_storev \r
+FLA_Check_valid_datatype \r
+FLA_Check_valid_object_datatype \r
+FLA_Check_floating_datatype \r
+FLA_Check_int_datatype \r
+FLA_Check_real_datatype \r
+FLA_Check_complex_datatype \r
+FLA_Check_floating_object \r
+FLA_Check_int_object \r
+FLA_Check_real_object \r
+FLA_Check_complex_object \r
+FLA_Check_identical_object_precision \r
+FLA_Check_consistent_object_datatype \r
+FLA_Check_consistent_datatype \r
+FLA_Check_square \r
+FLA_Check_if_scalar \r
+FLA_Check_if_vector \r
+FLA_Check_conformal_dims \r
+FLA_Check_matrix_matrix_dims \r
+FLA_Check_matrix_vector_dims \r
+FLA_Check_equal_vector_lengths \r
+FLA_Check_conj_trans_and_datatype \r
+FLA_Check_vector_length \r
+FLA_Check_null_pointer \r
+FLA_Check_object_dims \r
+FLA_Check_valid_pivot_type \r
+FLA_Check_malloc_pointer \r
+FLA_Check_base_buffer_mismatch \r
+FLA_Check_adjacent_objects_2x2 \r
+FLA_Check_adjacent_objects_2x1 \r
+FLA_Check_adjacent_objects_1x2 \r
+FLA_Check_blocksize_value \r
+FLA_Check_blocksize_object \r
+FLA_Check_file_descriptor \r
+FLA_Check_lseek_result \r
+FLA_Check_close_result \r
+FLA_Check_unlink_result \r
+FLA_Check_read_result \r
+FLA_Check_write_result \r
+FLA_Check_valid_quadrant \r
+FLA_Check_vector_length_min \r
+FLA_Check_pthread_create_result \r
+FLA_Check_pthread_join_result \r
+FLA_Check_valid_isgn_value \r
+FLA_Check_sylv_matrix_dims \r
+FLA_Check_chol_failure \r
+FLA_Check_valid_elemtype \r
+FLA_Check_posix_memalign_failure \r
+FLA_Check_submatrix_dims_and_offset \r
+FLA_Check_object_scalar_elemtype \r
+FLA_Check_object_matrix_elemtype \r
+FLA_Check_num_threads \r
+FLA_Check_conj_and_datatype \r
+FLA_Check_valid_complex_trans \r
+FLA_Check_valid_real_trans \r
+FLA_Check_valid_blas_trans \r
+FLA_Check_nonconstant_datatype \r
+FLA_Check_nonconstant_object \r
+FLA_Check_identical_object_datatype \r
+FLA_Check_divide_by_zero \r
+FLA_Check_identical_object_elemtype \r
+FLA_Check_pivot_index_range \r
+FLA_Check_householder_panel_dims \r
+FLA_Check_object_length_equals \r
+FLA_Check_object_width_equals \r
+FLA_Check_object_length_min \r
+FLA_Check_object_width_min \r
+FLA_Check_valid_error_level \r
+FLA_Check_attempted_repart_2x2 \r
+FLA_Check_attempted_repart_2x1 \r
+FLA_Check_attempted_repart_1x2 \r
+FLA_Check_valid_leftright_side \r
+FLA_Check_valid_topbottom_side \r
+FLA_Check_matrix_strides \r
+FLA_Chol \r
+FLA_Chol_blk_external \r
+FLA_Chol_cntl_init \r
+FLA_Chol_cntl_finalize \r
+FLA_Chol_internal \r
+FLA_Chol_l \r
+FLA_Chol_l_blk_var1 \r
+FLA_Chol_l_blk_var2 \r
+FLA_Chol_l_blk_var3 \r
+FLA_Chol_l_opt_var1 \r
+FLA_Chol_l_ops_var1 \r
+FLA_Chol_l_opd_var1 \r
+FLA_Chol_l_opc_var1 \r
+FLA_Chol_l_opz_var1 \r
+FLA_Chol_l_opt_var2 \r
+FLA_Chol_l_ops_var2 \r
+FLA_Chol_l_opd_var2 \r
+FLA_Chol_l_opc_var2 \r
+FLA_Chol_l_opz_var2 \r
+FLA_Chol_l_opt_var3 \r
+FLA_Chol_l_ops_var3 \r
+FLA_Chol_l_opd_var3 \r
+FLA_Chol_l_opc_var3 \r
+FLA_Chol_l_opz_var3 \r
+FLA_Chol_l_unb_var1 \r
+FLA_Chol_l_unb_var2 \r
+FLA_Chol_l_unb_var3 \r
+FLA_Chol_solve \r
+FLA_Chol_task \r
+FLA_Chol_l_task \r
+FLA_Chol_u_task \r
+FLA_Chol_u \r
+FLA_Chol_unb_external \r
+FLA_Chol_l_unb_ext \r
+FLA_Chol_u_unb_ext \r
+FLA_Chol_u_blk_var1 \r
+FLA_Chol_u_blk_var2 \r
+FLA_Chol_u_blk_var3 \r
+FLA_Chol_u_opt_var1 \r
+FLA_Chol_u_ops_var1 \r
+FLA_Chol_u_opd_var1 \r
+FLA_Chol_u_opc_var1 \r
+FLA_Chol_u_opz_var1 \r
+FLA_Chol_u_opt_var2 \r
+FLA_Chol_u_ops_var2 \r
+FLA_Chol_u_opd_var2 \r
+FLA_Chol_u_opc_var2 \r
+FLA_Chol_u_opz_var2 \r
+FLA_Chol_u_opt_var3 \r
+FLA_Chol_u_ops_var3 \r
+FLA_Chol_u_opd_var3 \r
+FLA_Chol_u_opc_var3 \r
+FLA_Chol_u_opz_var3 \r
+FLA_Chol_u_unb_var1 \r
+FLA_Chol_u_unb_var2 \r
+FLA_Chol_u_unb_var3 \r
+FLA_Clock \r
+FLA_Clock_helper \r
+FLA_Cntl_obj_free \r
+FLA_Cntl_axpy_obj_create \r
+FLA_Cntl_axpyt_obj_create \r
+FLA_Cntl_copy_obj_create \r
+FLA_Cntl_copyt_obj_create \r
+FLA_Cntl_swap_obj_create \r
+FLA_Cntl_tpose_obj_create \r
+FLA_Cntl_gemv_obj_create \r
+FLA_Cntl_trsv_obj_create \r
+FLA_Cntl_gemm_obj_create \r
+FLA_Cntl_hemm_obj_create \r
+FLA_Cntl_herk_obj_create \r
+FLA_Cntl_her2k_obj_create \r
+FLA_Cntl_symm_obj_create \r
+FLA_Cntl_syrk_obj_create \r
+FLA_Cntl_syr2k_obj_create \r
+FLA_Cntl_trmm_obj_create \r
+FLA_Cntl_trsm_obj_create \r
+FLA_Cntl_init \r
+FLA_Cntl_finalize \r
+FLA_Cntl_init_flamec \r
+FLA_Cntl_finalize_flamec \r
+FLA_Cntl_init_flash \r
+FLA_Cntl_finalize_flash \r
+FLA_Cntl_chol_obj_create \r
+FLA_Cntl_lu_obj_create \r
+FLA_Cntl_appiv_obj_create \r
+FLA_Cntl_qrut_obj_create \r
+FLA_Cntl_qrutud_obj_create \r
+FLA_Cntl_qrutinc_obj_create \r
+FLA_Cntl_lqut_obj_create \r
+FLA_Cntl_trinv_obj_create \r
+FLA_Cntl_ttmm_obj_create \r
+FLA_Cntl_sylv_obj_create \r
+FLA_Cntl_spdinv_obj_create \r
+FLA_Cntl_apqut_obj_create \r
+FLA_Cntl_apqutud_obj_create \r
+FLA_Cntl_apqutinc_obj_create \r
+FLA_Conjugate \r
+FLA_Conjugate_r \r
+FLA_Copy \r
+FLA_Copyr \r
+FLA_Copyr_external \r
+FLA_Copyt \r
+FLA_Copyt_c \r
+FLA_Copyt_cntl_init \r
+FLA_Copyt_cntl_finalize \r
+FLA_Copyt_c_blk_var1 \r
+FLA_Copyt_c_blk_var2 \r
+FLA_Copyt_c_blk_var3 \r
+FLA_Copyt_c_blk_var4 \r
+FLA_Copyt_external \r
+FLA_Copyt_h \r
+FLA_Copyt_h_blk_var1 \r
+FLA_Copyt_h_blk_var2 \r
+FLA_Copyt_h_blk_var3 \r
+FLA_Copyt_h_blk_var4 \r
+FLA_Copyt_internal \r
+FLA_Copyt_n \r
+FLA_Copyt_n_blk_var1 \r
+FLA_Copyt_n_blk_var2 \r
+FLA_Copyt_n_blk_var3 \r
+FLA_Copyt_n_blk_var4 \r
+FLA_Copyt_t \r
+FLA_Copyt_task \r
+FLA_Copyt_n_task \r
+FLA_Copyt_t_task \r
+FLA_Copyt_c_task \r
+FLA_Copyt_h_task \r
+FLA_Copyt_t_blk_var1 \r
+FLA_Copyt_t_blk_var2 \r
+FLA_Copyt_t_blk_var3 \r
+FLA_Copyt_t_blk_var4 \r
+FLA_Copy_blk_var1 \r
+FLA_Copy_blk_var2 \r
+FLA_Copy_blk_var3 \r
+FLA_Copy_blk_var4 \r
+FLA_Copy_cntl_init \r
+FLA_Copy_cntl_finalize \r
+FLA_Copy_external \r
+FLA_Copy_internal \r
+FLA_Copy_task \r
+FLA_Copy_buffer_to_object \r
+FLA_Copy_object_to_buffer \r
+FLA_Dot \r
+FLA_Dot2cs \r
+FLA_Dot2cs_external \r
+FLA_Dot2s \r
+FLA_Dot2s_external \r
+FLA_Dotc \r
+FLA_Dotcs \r
+FLA_Dotcs_external \r
+FLA_Dotc_external \r
+FLA_Dots \r
+FLA_Dots_external \r
+FLA_Dot_external \r
+FLA_Error_string_for_code \r
+FLA_Error_messages_init \r
+FLA_Print_message \r
+FLA_Abort \r
+FLA_Form_perm_matrix \r
+FLA_Gemm \r
+FLA_Gemm_cntl_init \r
+FLA_Gemm_cntl_finalize \r
+FLA_Gemm_external \r
+FLA_Gemm_hh \r
+FLA_Gemm_hh_blk_var1 \r
+FLA_Gemm_hh_blk_var2 \r
+FLA_Gemm_hh_blk_var3 \r
+FLA_Gemm_hh_blk_var4 \r
+FLA_Gemm_hh_blk_var5 \r
+FLA_Gemm_hh_blk_var6 \r
+FLA_Gemm_hh_unb_var1 \r
+FLA_Gemm_hh_unb_var2 \r
+FLA_Gemm_hh_unb_var3 \r
+FLA_Gemm_hh_unb_var4 \r
+FLA_Gemm_hh_unb_var5 \r
+FLA_Gemm_hh_unb_var6 \r
+FLA_Gemm_hn \r
+FLA_Gemm_hn_blk_var1 \r
+FLA_Gemm_hn_blk_var2 \r
+FLA_Gemm_hn_blk_var3 \r
+FLA_Gemm_hn_blk_var4 \r
+FLA_Gemm_hn_blk_var5 \r
+FLA_Gemm_hn_blk_var6 \r
+FLA_Gemm_hn_unb_var1 \r
+FLA_Gemm_hn_unb_var2 \r
+FLA_Gemm_hn_unb_var3 \r
+FLA_Gemm_hn_unb_var4 \r
+FLA_Gemm_hn_unb_var5 \r
+FLA_Gemm_hn_unb_var6 \r
+FLA_Gemm_ht \r
+FLA_Gemm_ht_blk_var1 \r
+FLA_Gemm_ht_blk_var2 \r
+FLA_Gemm_ht_blk_var3 \r
+FLA_Gemm_ht_blk_var4 \r
+FLA_Gemm_ht_blk_var5 \r
+FLA_Gemm_ht_blk_var6 \r
+FLA_Gemm_ht_unb_var1 \r
+FLA_Gemm_ht_unb_var2 \r
+FLA_Gemm_ht_unb_var3 \r
+FLA_Gemm_ht_unb_var4 \r
+FLA_Gemm_ht_unb_var5 \r
+FLA_Gemm_ht_unb_var6 \r
+FLA_Gemm_internal \r
+FLA_Gemm_nh \r
+FLA_Gemm_nh_blk_var1 \r
+FLA_Gemm_nh_blk_var2 \r
+FLA_Gemm_nh_blk_var3 \r
+FLA_Gemm_nh_blk_var4 \r
+FLA_Gemm_nh_blk_var5 \r
+FLA_Gemm_nh_blk_var6 \r
+FLA_Gemm_nh_unb_var1 \r
+FLA_Gemm_nh_unb_var2 \r
+FLA_Gemm_nh_unb_var3 \r
+FLA_Gemm_nh_unb_var4 \r
+FLA_Gemm_nh_unb_var5 \r
+FLA_Gemm_nh_unb_var6 \r
+FLA_Gemm_nn \r
+FLA_Gemm_nn_blk_var1 \r
+FLA_Gemm_nn_blk_var2 \r
+FLA_Gemm_nn_blk_var3 \r
+FLA_Gemm_nn_blk_var4 \r
+FLA_Gemm_nn_blk_var5 \r
+FLA_Gemm_nn_blk_var6 \r
+FLA_Gemm_nn_unb_var1 \r
+FLA_Gemm_nn_unb_var2 \r
+FLA_Gemm_nn_unb_var3 \r
+FLA_Gemm_nn_unb_var4 \r
+FLA_Gemm_nn_unb_var5 \r
+FLA_Gemm_nn_unb_var6 \r
+FLA_Gemm_nt \r
+FLA_Gemm_nt_blk_var1 \r
+FLA_Gemm_nt_blk_var2 \r
+FLA_Gemm_nt_blk_var3 \r
+FLA_Gemm_nt_blk_var4 \r
+FLA_Gemm_nt_blk_var5 \r
+FLA_Gemm_nt_blk_var6 \r
+FLA_Gemm_nt_unb_var1 \r
+FLA_Gemm_nt_unb_var2 \r
+FLA_Gemm_nt_unb_var3 \r
+FLA_Gemm_nt_unb_var4 \r
+FLA_Gemm_nt_unb_var5 \r
+FLA_Gemm_nt_unb_var6 \r
+FLA_Gemm_task \r
+FLA_Gemm_hh_task \r
+FLA_Gemm_hn_task \r
+FLA_Gemm_ht_task \r
+FLA_Gemm_nh_task \r
+FLA_Gemm_nn_task \r
+FLA_Gemm_nt_task \r
+FLA_Gemm_th_task \r
+FLA_Gemm_tn_task \r
+FLA_Gemm_tt_task \r
+FLA_Gemm_th \r
+FLA_Gemm_th_blk_var1 \r
+FLA_Gemm_th_blk_var2 \r
+FLA_Gemm_th_blk_var3 \r
+FLA_Gemm_th_blk_var4 \r
+FLA_Gemm_th_blk_var5 \r
+FLA_Gemm_th_blk_var6 \r
+FLA_Gemm_th_unb_var1 \r
+FLA_Gemm_th_unb_var2 \r
+FLA_Gemm_th_unb_var3 \r
+FLA_Gemm_th_unb_var4 \r
+FLA_Gemm_th_unb_var5 \r
+FLA_Gemm_th_unb_var6 \r
+FLA_Gemm_tn \r
+FLA_Gemm_tn_blk_var1 \r
+FLA_Gemm_tn_blk_var2 \r
+FLA_Gemm_tn_blk_var3 \r
+FLA_Gemm_tn_blk_var4 \r
+FLA_Gemm_tn_blk_var5 \r
+FLA_Gemm_tn_blk_var6 \r
+FLA_Gemm_tn_unb_var1 \r
+FLA_Gemm_tn_unb_var2 \r
+FLA_Gemm_tn_unb_var3 \r
+FLA_Gemm_tn_unb_var4 \r
+FLA_Gemm_tn_unb_var5 \r
+FLA_Gemm_tn_unb_var6 \r
+FLA_Gemm_tt \r
+FLA_Gemm_tt_blk_var1 \r
+FLA_Gemm_tt_blk_var2 \r
+FLA_Gemm_tt_blk_var3 \r
+FLA_Gemm_tt_blk_var4 \r
+FLA_Gemm_tt_blk_var5 \r
+FLA_Gemm_tt_blk_var6 \r
+FLA_Gemm_tt_unb_var1 \r
+FLA_Gemm_tt_unb_var2 \r
+FLA_Gemm_tt_unb_var3 \r
+FLA_Gemm_tt_unb_var4 \r
+FLA_Gemm_tt_unb_var5 \r
+FLA_Gemm_tt_unb_var6 \r
+FLA_Gemp \r
+FLA_Gemv \r
+FLA_Gemvc \r
+FLA_Gemvc_external \r
+FLA_Gemv_c \r
+FLA_Gemv_cntl_init \r
+FLA_Gemv_cntl_finalize \r
+FLA_Gemv_c_blk_var1 \r
+FLA_Gemv_c_blk_var2 \r
+FLA_Gemv_c_blk_var5 \r
+FLA_Gemv_c_blk_var6 \r
+FLA_Gemv_external \r
+FLA_Gemv_internal \r
+FLA_Gemv_n \r
+FLA_Gemv_n_blk_var1 \r
+FLA_Gemv_n_blk_var2 \r
+FLA_Gemv_n_blk_var5 \r
+FLA_Gemv_n_blk_var6 \r
+FLA_Gemv_t \r
+FLA_Gemv_task \r
+FLA_Gemv_c_task \r
+FLA_Gemv_n_task \r
+FLA_Gemv_t_task \r
+FLA_Gemv_t_blk_var1 \r
+FLA_Gemv_t_blk_var2 \r
+FLA_Gemv_t_blk_var5 \r
+FLA_Gemv_t_blk_var6 \r
+FLA_Gepm \r
+FLA_Gepp \r
+FLA_Ger \r
+FLA_Gerc \r
+FLA_Gerc_external \r
+FLA_Ger_external \r
+FLA_Hemm \r
+FLA_Hemm_cntl_init \r
+FLA_Hemm_cntl_finalize \r
+FLA_Hemm_external \r
+FLA_Hemm_internal \r
+FLA_Hemm_ll \r
+FLA_Hemm_ll_blk_var1 \r
+FLA_Hemm_ll_blk_var10 \r
+FLA_Hemm_ll_blk_var2 \r
+FLA_Hemm_ll_blk_var3 \r
+FLA_Hemm_ll_blk_var4 \r
+FLA_Hemm_ll_blk_var5 \r
+FLA_Hemm_ll_blk_var6 \r
+FLA_Hemm_ll_blk_var7 \r
+FLA_Hemm_ll_blk_var8 \r
+FLA_Hemm_ll_blk_var9 \r
+FLA_Hemm_ll_unb_var1 \r
+FLA_Hemm_ll_unb_var10 \r
+FLA_Hemm_ll_unb_var2 \r
+FLA_Hemm_ll_unb_var3 \r
+FLA_Hemm_ll_unb_var4 \r
+FLA_Hemm_ll_unb_var5 \r
+FLA_Hemm_ll_unb_var6 \r
+FLA_Hemm_ll_unb_var7 \r
+FLA_Hemm_ll_unb_var8 \r
+FLA_Hemm_ll_unb_var9 \r
+FLA_Hemm_lu \r
+FLA_Hemm_lu_blk_var1 \r
+FLA_Hemm_lu_blk_var10 \r
+FLA_Hemm_lu_blk_var2 \r
+FLA_Hemm_lu_blk_var3 \r
+FLA_Hemm_lu_blk_var4 \r
+FLA_Hemm_lu_blk_var5 \r
+FLA_Hemm_lu_blk_var6 \r
+FLA_Hemm_lu_blk_var7 \r
+FLA_Hemm_lu_blk_var8 \r
+FLA_Hemm_lu_blk_var9 \r
+FLA_Hemm_lu_unb_var1 \r
+FLA_Hemm_lu_unb_var10 \r
+FLA_Hemm_lu_unb_var2 \r
+FLA_Hemm_lu_unb_var3 \r
+FLA_Hemm_lu_unb_var4 \r
+FLA_Hemm_lu_unb_var5 \r
+FLA_Hemm_lu_unb_var6 \r
+FLA_Hemm_lu_unb_var7 \r
+FLA_Hemm_lu_unb_var8 \r
+FLA_Hemm_lu_unb_var9 \r
+FLA_Hemm_rl \r
+FLA_Hemm_rl_blk_var1 \r
+FLA_Hemm_rl_blk_var10 \r
+FLA_Hemm_rl_blk_var2 \r
+FLA_Hemm_rl_blk_var3 \r
+FLA_Hemm_rl_blk_var4 \r
+FLA_Hemm_rl_blk_var5 \r
+FLA_Hemm_rl_blk_var6 \r
+FLA_Hemm_rl_blk_var7 \r
+FLA_Hemm_rl_blk_var8 \r
+FLA_Hemm_rl_blk_var9 \r
+FLA_Hemm_rl_unb_var1 \r
+FLA_Hemm_rl_unb_var10 \r
+FLA_Hemm_rl_unb_var2 \r
+FLA_Hemm_rl_unb_var3 \r
+FLA_Hemm_rl_unb_var4 \r
+FLA_Hemm_rl_unb_var5 \r
+FLA_Hemm_rl_unb_var6 \r
+FLA_Hemm_rl_unb_var7 \r
+FLA_Hemm_rl_unb_var8 \r
+FLA_Hemm_rl_unb_var9 \r
+FLA_Hemm_ru \r
+FLA_Hemm_ru_blk_var1 \r
+FLA_Hemm_ru_blk_var10 \r
+FLA_Hemm_ru_blk_var2 \r
+FLA_Hemm_ru_blk_var3 \r
+FLA_Hemm_ru_blk_var4 \r
+FLA_Hemm_ru_blk_var5 \r
+FLA_Hemm_ru_blk_var6 \r
+FLA_Hemm_ru_blk_var7 \r
+FLA_Hemm_ru_blk_var8 \r
+FLA_Hemm_ru_blk_var9 \r
+FLA_Hemm_ru_unb_var1 \r
+FLA_Hemm_ru_unb_var10 \r
+FLA_Hemm_ru_unb_var2 \r
+FLA_Hemm_ru_unb_var3 \r
+FLA_Hemm_ru_unb_var4 \r
+FLA_Hemm_ru_unb_var5 \r
+FLA_Hemm_ru_unb_var6 \r
+FLA_Hemm_ru_unb_var7 \r
+FLA_Hemm_ru_unb_var8 \r
+FLA_Hemm_ru_unb_var9 \r
+FLA_Hemm_task \r
+FLA_Hemm_ll_task \r
+FLA_Hemm_lu_task \r
+FLA_Hemm_rl_task \r
+FLA_Hemm_ru_task \r
+FLA_Hemv \r
+FLA_Hemvc \r
+FLA_Hemvc_external \r
+FLA_Hemv_external \r
+FLA_Her \r
+FLA_Her2 \r
+FLA_Her2c \r
+FLA_Her2c_external \r
+FLA_Her2k \r
+FLA_Her2k_cntl_init \r
+FLA_Her2k_cntl_finalize \r
+FLA_Her2k_external \r
+FLA_Her2k_internal \r
+FLA_Her2k_lh \r
+FLA_Her2k_lh_blk_var1 \r
+FLA_Her2k_lh_blk_var10 \r
+FLA_Her2k_lh_blk_var2 \r
+FLA_Her2k_lh_blk_var3 \r
+FLA_Her2k_lh_blk_var4 \r
+FLA_Her2k_lh_blk_var5 \r
+FLA_Her2k_lh_blk_var6 \r
+FLA_Her2k_lh_blk_var7 \r
+FLA_Her2k_lh_blk_var8 \r
+FLA_Her2k_lh_blk_var9 \r
+FLA_Her2k_lh_unb_var1 \r
+FLA_Her2k_lh_unb_var10 \r
+FLA_Her2k_lh_unb_var2 \r
+FLA_Her2k_lh_unb_var3 \r
+FLA_Her2k_lh_unb_var4 \r
+FLA_Her2k_lh_unb_var5 \r
+FLA_Her2k_lh_unb_var6 \r
+FLA_Her2k_lh_unb_var7 \r
+FLA_Her2k_lh_unb_var8 \r
+FLA_Her2k_lh_unb_var9 \r
+FLA_Her2k_ln \r
+FLA_Her2k_ln_blk_var1 \r
+FLA_Her2k_ln_blk_var10 \r
+FLA_Her2k_ln_blk_var2 \r
+FLA_Her2k_ln_blk_var3 \r
+FLA_Her2k_ln_blk_var4 \r
+FLA_Her2k_ln_blk_var5 \r
+FLA_Her2k_ln_blk_var6 \r
+FLA_Her2k_ln_blk_var7 \r
+FLA_Her2k_ln_blk_var8 \r
+FLA_Her2k_ln_blk_var9 \r
+FLA_Her2k_ln_unb_var1 \r
+FLA_Her2k_ln_unb_var10 \r
+FLA_Her2k_ln_unb_var2 \r
+FLA_Her2k_ln_unb_var3 \r
+FLA_Her2k_ln_unb_var4 \r
+FLA_Her2k_ln_unb_var5 \r
+FLA_Her2k_ln_unb_var6 \r
+FLA_Her2k_ln_unb_var7 \r
+FLA_Her2k_ln_unb_var8 \r
+FLA_Her2k_ln_unb_var9 \r
+FLA_Her2k_task \r
+FLA_Her2k_ln_task \r
+FLA_Her2k_lh_task \r
+FLA_Her2k_un_task \r
+FLA_Her2k_uh_task \r
+FLA_Her2k_uh \r
+FLA_Her2k_uh_blk_var1 \r
+FLA_Her2k_uh_blk_var10 \r
+FLA_Her2k_uh_blk_var2 \r
+FLA_Her2k_uh_blk_var3 \r
+FLA_Her2k_uh_blk_var4 \r
+FLA_Her2k_uh_blk_var5 \r
+FLA_Her2k_uh_blk_var6 \r
+FLA_Her2k_uh_blk_var7 \r
+FLA_Her2k_uh_blk_var8 \r
+FLA_Her2k_uh_blk_var9 \r
+FLA_Her2k_uh_unb_var1 \r
+FLA_Her2k_uh_unb_var10 \r
+FLA_Her2k_uh_unb_var2 \r
+FLA_Her2k_uh_unb_var3 \r
+FLA_Her2k_uh_unb_var4 \r
+FLA_Her2k_uh_unb_var5 \r
+FLA_Her2k_uh_unb_var6 \r
+FLA_Her2k_uh_unb_var7 \r
+FLA_Her2k_uh_unb_var8 \r
+FLA_Her2k_uh_unb_var9 \r
+FLA_Her2k_un \r
+FLA_Her2k_un_blk_var1 \r
+FLA_Her2k_un_blk_var10 \r
+FLA_Her2k_un_blk_var2 \r
+FLA_Her2k_un_blk_var3 \r
+FLA_Her2k_un_blk_var4 \r
+FLA_Her2k_un_blk_var5 \r
+FLA_Her2k_un_blk_var6 \r
+FLA_Her2k_un_blk_var7 \r
+FLA_Her2k_un_blk_var8 \r
+FLA_Her2k_un_blk_var9 \r
+FLA_Her2k_un_unb_var1 \r
+FLA_Her2k_un_unb_var10 \r
+FLA_Her2k_un_unb_var2 \r
+FLA_Her2k_un_unb_var3 \r
+FLA_Her2k_un_unb_var4 \r
+FLA_Her2k_un_unb_var5 \r
+FLA_Her2k_un_unb_var6 \r
+FLA_Her2k_un_unb_var7 \r
+FLA_Her2k_un_unb_var8 \r
+FLA_Her2k_un_unb_var9 \r
+FLA_Her2_external \r
+FLA_Herc \r
+FLA_Herc_external \r
+FLA_Herk \r
+FLA_Herk_cntl_init \r
+FLA_Herk_cntl_finalize \r
+FLA_Herk_external \r
+FLA_Herk_internal \r
+FLA_Herk_lh \r
+FLA_Herk_lh_blk_var1 \r
+FLA_Herk_lh_blk_var2 \r
+FLA_Herk_lh_blk_var3 \r
+FLA_Herk_lh_blk_var4 \r
+FLA_Herk_lh_blk_var5 \r
+FLA_Herk_lh_blk_var6 \r
+FLA_Herk_lh_unb_var1 \r
+FLA_Herk_lh_unb_var2 \r
+FLA_Herk_lh_unb_var3 \r
+FLA_Herk_lh_unb_var4 \r
+FLA_Herk_lh_unb_var5 \r
+FLA_Herk_lh_unb_var6 \r
+FLA_Herk_ln \r
+FLA_Herk_ln_blk_var1 \r
+FLA_Herk_ln_blk_var2 \r
+FLA_Herk_ln_blk_var3 \r
+FLA_Herk_ln_blk_var4 \r
+FLA_Herk_ln_blk_var5 \r
+FLA_Herk_ln_blk_var6 \r
+FLA_Herk_ln_unb_var1 \r
+FLA_Herk_ln_unb_var2 \r
+FLA_Herk_ln_unb_var3 \r
+FLA_Herk_ln_unb_var4 \r
+FLA_Herk_ln_unb_var5 \r
+FLA_Herk_ln_unb_var6 \r
+FLA_Herk_task \r
+FLA_Herk_ln_task \r
+FLA_Herk_lh_task \r
+FLA_Herk_un_task \r
+FLA_Herk_uh_task \r
+FLA_Herk_uh \r
+FLA_Herk_uh_blk_var1 \r
+FLA_Herk_uh_blk_var2 \r
+FLA_Herk_uh_blk_var3 \r
+FLA_Herk_uh_blk_var4 \r
+FLA_Herk_uh_blk_var5 \r
+FLA_Herk_uh_blk_var6 \r
+FLA_Herk_uh_unb_var1 \r
+FLA_Herk_uh_unb_var2 \r
+FLA_Herk_uh_unb_var3 \r
+FLA_Herk_uh_unb_var4 \r
+FLA_Herk_uh_unb_var5 \r
+FLA_Herk_uh_unb_var6 \r
+FLA_Herk_un \r
+FLA_Herk_un_blk_var1 \r
+FLA_Herk_un_blk_var2 \r
+FLA_Herk_un_blk_var3 \r
+FLA_Herk_un_blk_var4 \r
+FLA_Herk_un_blk_var5 \r
+FLA_Herk_un_blk_var6 \r
+FLA_Herk_un_unb_var1 \r
+FLA_Herk_un_unb_var2 \r
+FLA_Herk_un_unb_var3 \r
+FLA_Herk_un_unb_var4 \r
+FLA_Herk_un_unb_var5 \r
+FLA_Herk_un_unb_var6 \r
+FLA_Hermitianize \r
+FLA_Her_external \r
+FLA_Househ2_UT \r
+FLA_Househ2_UT_ops \r
+FLA_Househ2_UT_opd \r
+FLA_Househ2_UT_opc \r
+FLA_Househ2_UT_opz \r
+FLA_Init \r
+FLA_Finalize \r
+FLA_Init_safe \r
+FLA_Finalize_safe \r
+FLA_Initialized \r
+FLA_Init_constants \r
+FLA_Finalize_constants \r
+FLA_Invert \r
+FLA_Inv_scal \r
+FLA_Inv_scalc \r
+FLA_Inv_scalc_external \r
+FLA_Inv_scal_external \r
+FLA_Lock_init \r
+FLA_Lock_acquire \r
+FLA_Lock_release \r
+FLA_Lock_destroy \r
+FLA_LQ_blk_external \r
+FLA_LQ_unb_external \r
+FLA_LQ_UT \r
+FLA_LQ_UT_Accum_T_blk_var1 \r
+FLA_LQ_UT_Accum_T_opt_var1 \r
+FLA_LQ_UT_Accum_T_ops_var1 \r
+FLA_LQ_UT_Accum_T_opd_var1 \r
+FLA_LQ_UT_Accum_T_opc_var1 \r
+FLA_LQ_UT_Accum_T_opz_var1 \r
+FLA_LQ_UT_Accum_T_unb_var1 \r
+FLA_LQ_UT_blk_var2 \r
+FLA_LQ_UT_cntl_init \r
+FLA_LQ_UT_cntl_finalize \r
+FLA_LQ_UT_create_T \r
+FLA_LQ_UT_internal \r
+FLA_LQ_UT_opt_var2 \r
+FLA_LQ_UT_ops_var2 \r
+FLA_LQ_UT_opd_var2 \r
+FLA_LQ_UT_opc_var2 \r
+FLA_LQ_UT_opz_var2 \r
+FLA_LQ_UT_recover_tau \r
+FLA_LQ_UT_recover_tau_submatrix \r
+FLA_LQ_UT_solve \r
+FLA_LQ_UT_task \r
+FLA_LQ_UT_unb_var2 \r
+FLA_LU_find_zero_on_diagonal \r
+FLA_LU_nopiv \r
+FLA_LU_nopiv_blk_var1 \r
+FLA_LU_nopiv_blk_var2 \r
+FLA_LU_nopiv_blk_var3 \r
+FLA_LU_nopiv_blk_var4 \r
+FLA_LU_nopiv_blk_var5 \r
+FLA_LU_nopiv_cntl_init \r
+FLA_LU_nopiv_cntl_finalize \r
+FLA_LU_nopiv_internal \r
+FLA_LU_nopiv_opt_var1 \r
+FLA_LU_nopiv_ops_var1 \r
+FLA_LU_nopiv_opd_var1 \r
+FLA_LU_nopiv_opc_var1 \r
+FLA_LU_nopiv_opz_var1 \r
+FLA_LU_nopiv_opt_var2 \r
+FLA_LU_nopiv_ops_var2 \r
+FLA_LU_nopiv_opd_var2 \r
+FLA_LU_nopiv_opc_var2 \r
+FLA_LU_nopiv_opz_var2 \r
+FLA_LU_nopiv_opt_var3 \r
+FLA_LU_nopiv_ops_var3 \r
+FLA_LU_nopiv_opd_var3 \r
+FLA_LU_nopiv_opc_var3 \r
+FLA_LU_nopiv_opz_var3 \r
+FLA_LU_nopiv_opt_var4 \r
+FLA_LU_nopiv_ops_var4 \r
+FLA_LU_nopiv_opd_var4 \r
+FLA_LU_nopiv_opc_var4 \r
+FLA_LU_nopiv_opz_var4 \r
+FLA_LU_nopiv_opt_var5 \r
+FLA_LU_nopiv_ops_var5 \r
+FLA_LU_nopiv_opd_var5 \r
+FLA_LU_nopiv_opc_var5 \r
+FLA_LU_nopiv_opz_var5 \r
+FLA_LU_nopiv_solve \r
+FLA_LU_nopiv_task \r
+FLA_LU_nopiv_unb_var1 \r
+FLA_LU_nopiv_unb_var2 \r
+FLA_LU_nopiv_unb_var3 \r
+FLA_LU_nopiv_unb_var4 \r
+FLA_LU_nopiv_unb_var5 \r
+FLA_LU_piv \r
+FLA_LU_piv_blk_external \r
+FLA_LU_piv_blk_var3 \r
+FLA_LU_piv_blk_var4 \r
+FLA_LU_piv_blk_var5 \r
+FLA_LU_piv_cntl_init \r
+FLA_LU_piv_cntl_finalize \r
+FLA_LU_piv_copy_task \r
+FLA_LU_piv_internal \r
+FLA_LU_piv_macro_task \r
+FLA_LU_piv_opt_var3 \r
+FLA_LU_piv_ops_var3 \r
+FLA_LU_piv_opd_var3 \r
+FLA_LU_piv_opc_var3 \r
+FLA_LU_piv_opz_var3 \r
+FLA_LU_piv_opt_var4 \r
+FLA_LU_piv_ops_var4 \r
+FLA_LU_piv_opd_var4 \r
+FLA_LU_piv_opc_var4 \r
+FLA_LU_piv_opz_var4 \r
+FLA_LU_piv_opt_var5 \r
+FLA_LU_piv_ops_var5 \r
+FLA_LU_piv_opd_var5 \r
+FLA_LU_piv_opc_var5 \r
+FLA_LU_piv_opz_var5 \r
+FLA_LU_piv_solve \r
+FLA_LU_piv_task \r
+FLA_LU_piv_unb_external \r
+FLA_LU_piv_unb_ext \r
+FLA_LU_piv_unb_var3 \r
+FLA_LU_piv_unb_var3b \r
+FLA_LU_piv_unb_var4 \r
+FLA_LU_piv_unb_var5 \r
+FLA_Max_abs_value \r
+FLA_Max_elemwise_diff \r
+FLA_Memory_leak_counter_init \r
+FLA_Memory_leak_counter_finalize \r
+FLA_Memory_leak_counter_status \r
+FLA_Memory_leak_counter_set \r
+FLA_malloc \r
+FLA_realloc \r
+FLA_free \r
+FLA_Set \r
+FLA_Obj_extract_real_scalar \r
+FLA_Set_diag \r
+FLA_Set_to_identity \r
+FLA_Add_to_diag \r
+FLA_Shift_diag \r
+FLA_Scale_diag\r
+FLA_Obj_fshow \r
+FLA_Obj_show \r
+FLA_Mult_add \r
+FLA_Negate \r
+FLA_Norm1 \r
+FLA_Norm_inf \r
+FLA_Nrm2 \r
+FLA_Nrm2_external \r
+FLA_Obj_create \r
+FLA_Obj_create_ext \r
+FLA_align_ldim \r
+FLA_Obj_create_conf_to \r
+FLA_Obj_create_copy_of \r
+FLA_Obj_create_without_buffer \r
+FLA_Obj_create_constant \r
+FLA_Obj_create_complex_constant\r
+FLA_Obj_attach_buffer \r
+FLA_Obj_free \r
+FLA_Obj_free_without_buffer \r
+FLA_Param_map_flame_to_netlib_trans \r
+FLA_Param_map_flame_to_netlib_uplo \r
+FLA_Param_map_flame_to_netlib_side \r
+FLA_Param_map_flame_to_netlib_diag \r
+FLA_Param_map_flame_to_netlib_direct \r
+FLA_Param_map_flame_to_netlib_storev \r
+FLA_Param_map_flame_to_blis_trans \r
+FLA_Param_map_flame_to_blis_conj \r
+FLA_Param_map_flame_to_blis_uplo \r
+FLA_Param_map_flame_to_blis_side \r
+FLA_Param_map_flame_to_blis_diag \r
+FLA_Param_map_blis_to_netlib_trans \r
+FLA_Param_map_blis_to_netlib_uplo \r
+FLA_Param_map_blis_to_netlib_side \r
+FLA_Param_map_blis_to_netlib_diag \r
+FLA_Param_map_netlib_to_flame_trans \r
+FLA_Param_map_netlib_to_flame_uplo \r
+FLA_Param_map_netlib_to_flame_side \r
+FLA_Param_map_netlib_to_flame_diag \r
+FLA_Param_map_blislapack_to_flame_trans \r
+FLA_Param_map_blislapack_to_flame_uplo \r
+FLA_Param_map_blislapack_to_flame_side \r
+FLA_Param_map_blislapack_to_flame_diag \r
+FLA_QR_blk_external \r
+FLA_QR_unb_external \r
+FLA_QR_UT \r
+FLA_QR_UT_Accum_T_blk_var1 \r
+FLA_QR_UT_Accum_T_opt_var1 \r
+FLA_QR_UT_Accum_T_ops_var1 \r
+FLA_QR_UT_Accum_T_opd_var1 \r
+FLA_QR_UT_Accum_T_opc_var1 \r
+FLA_QR_UT_Accum_T_opz_var1 \r
+FLA_QR_UT_Accum_T_unb_var1 \r
+FLA_QR_UT_blk_var2 \r
+FLA_QR_UT_cntl_init \r
+FLA_QR_UT_cntl_finalize \r
+FLA_QR_UT_copy_internal \r
+FLA_QR_UT_copy_task \r
+FLA_QR_UT_create_T \r
+FLA_QR_UT_inc_blk_var1 \r
+FLA_QR_UT_inc_blk_var2 \r
+FLA_QR_UT_internal \r
+FLA_QR_UT_opt_var2 \r
+FLA_QR_UT_ops_var2 \r
+FLA_QR_UT_opd_var2 \r
+FLA_QR_UT_opc_var2 \r
+FLA_QR_UT_opz_var2 \r
+FLA_QR_UT_recover_tau \r
+FLA_QR_UT_recover_tau_submatrix \r
+FLA_QR_UT_solve \r
+FLA_QR_UT_task \r
+FLA_QR2_UT_Accum_T_opt_var1 \r
+FLA_QR2_UT_Accum_T_ops_var1 \r
+FLA_QR2_UT_Accum_T_opd_var1 \r
+FLA_QR2_UT_Accum_T_opc_var1 \r
+FLA_QR2_UT_Accum_T_opz_var1 \r
+FLA_QR2_UT_Accum_T_unb_var1 \r
+FLA_QR2_UT_blk_var1 \r
+FLA_QR2_UT_blk_var2 \r
+FLA_QR2_UT_cntl_init \r
+FLA_QR2_UT_cntl_finalize \r
+FLA_QR2_UT_internal \r
+FLA_QR2_UT_task \r
+FLA_QR2_UT_unb_var2 \r
+FLA_Obj_datatype \r
+FLA_Obj_datatype_proj_to_real \r
+FLA_Obj_elemtype \r
+FLA_Obj_datatype_size \r
+FLA_Obj_elem_size \r
+FLA_Obj_length \r
+FLA_Obj_width \r
+FLA_Obj_vector_dim \r
+FLA_Obj_vector_inc \r
+FLA_Obj_min_dim \r
+FLA_Obj_max_dim \r
+FLA_Obj_row_stride \r
+FLA_Obj_col_stride \r
+FLA_Obj_buffer \r
+FLA_Obj_is_int \r
+FLA_Obj_is_floating_point \r
+FLA_Obj_is_constant \r
+FLA_Obj_is_real \r
+FLA_Obj_is_complex \r
+FLA_Obj_is_single_precision \r
+FLA_Obj_is_double_precision \r
+FLA_Obj_is_scalar \r
+FLA_Obj_is_vector \r
+FLA_Obj_has_zero_dim \r
+FLA_Obj_is_col_major \r
+FLA_Obj_is_row_major \r
+FLA_Obj_is_conformal_to \r
+FLA_Obj_is \r
+FLA_Obj_equals \r
+FLA_Random_herm_matrix \r
+FLA_Random_matrix \r
+FLA_random_float \r
+FLA_random_double \r
+FLA_random_scomplex \r
+FLA_random_dcomplex \r
+FLA_Random_spd_matrix \r
+FLA_Random_tri_matrix \r
+FLA_SA_Apply_pivots \r
+FLA_SA_FS_blk \r
+FLA_SA_FS_task \r
+FLA_SA_LU_blk \r
+FLA_SA_LU_task \r
+FLA_SA_LU_unb \r
+FLA_Scal \r
+FLA_Scalc \r
+FLA_Scalc_external \r
+FLA_Scalr \r
+FLA_Scalr_external \r
+FLA_Scal_external \r
+FLA_Shift_pivots_to \r
+FLA_SPDinv \r
+FLA_SPDinv_blk_external \r
+FLA_SPDinv_cntl_init \r
+FLA_SPDinv_cntl_finalize \r
+FLA_SPDinv_internal \r
+FLA_Sqrt \r
+FLA_Swap \r
+FLA_Swapt \r
+FLA_Swapt_external \r
+FLA_Swap_external \r
+FLA_Swap_t_blk_var1 \r
+FLA_Swap_t_blk_var2 \r
+FLA_Sylv \r
+FLA_Sylv_blk_external \r
+FLA_Sylv_cntl_init \r
+FLA_Sylv_cntl_finalize \r
+FLA_Sylv_hh \r
+FLA_Sylv_hh_blk_var1 \r
+FLA_Sylv_hh_blk_var10 \r
+FLA_Sylv_hh_blk_var11 \r
+FLA_Sylv_hh_blk_var12 \r
+FLA_Sylv_hh_blk_var13 \r
+FLA_Sylv_hh_blk_var14 \r
+FLA_Sylv_hh_blk_var15 \r
+FLA_Sylv_hh_blk_var16 \r
+FLA_Sylv_hh_blk_var17 \r
+FLA_Sylv_hh_blk_var18 \r
+FLA_Sylv_hh_blk_var2 \r
+FLA_Sylv_hh_blk_var3 \r
+FLA_Sylv_hh_blk_var4 \r
+FLA_Sylv_hh_blk_var5 \r
+FLA_Sylv_hh_blk_var6 \r
+FLA_Sylv_hh_blk_var7 \r
+FLA_Sylv_hh_blk_var8 \r
+FLA_Sylv_hh_blk_var9 \r
+FLA_Sylv_hh_opt_var1 \r
+FLA_Sylv_hh_ops_var1 \r
+FLA_Sylv_hh_opd_var1 \r
+FLA_Sylv_hh_opc_var1 \r
+FLA_Sylv_hh_opz_var1 \r
+FLA_Sylv_hh_opt_var10 \r
+FLA_Sylv_hh_opt_var11 \r
+FLA_Sylv_hh_opt_var12 \r
+FLA_Sylv_hh_opt_var13 \r
+FLA_Sylv_hh_opt_var14 \r
+FLA_Sylv_hh_opt_var15 \r
+FLA_Sylv_hh_opt_var16 \r
+FLA_Sylv_hh_opt_var17 \r
+FLA_Sylv_hh_opt_var18 \r
+FLA_Sylv_hh_opt_var2 \r
+FLA_Sylv_hh_opt_var3 \r
+FLA_Sylv_hh_opt_var4 \r
+FLA_Sylv_hh_opt_var5 \r
+FLA_Sylv_hh_opt_var6 \r
+FLA_Sylv_hh_opt_var7 \r
+FLA_Sylv_hh_opt_var8 \r
+FLA_Sylv_hh_opt_var9 \r
+FLA_Sylv_hn \r
+FLA_Sylv_hn_blk_var1 \r
+FLA_Sylv_hn_blk_var10 \r
+FLA_Sylv_hn_blk_var11 \r
+FLA_Sylv_hn_blk_var12 \r
+FLA_Sylv_hn_blk_var13 \r
+FLA_Sylv_hn_blk_var14 \r
+FLA_Sylv_hn_blk_var15 \r
+FLA_Sylv_hn_blk_var16 \r
+FLA_Sylv_hn_blk_var17 \r
+FLA_Sylv_hn_blk_var18 \r
+FLA_Sylv_hn_blk_var2 \r
+FLA_Sylv_hn_blk_var3 \r
+FLA_Sylv_hn_blk_var4 \r
+FLA_Sylv_hn_blk_var5 \r
+FLA_Sylv_hn_blk_var6 \r
+FLA_Sylv_hn_blk_var7 \r
+FLA_Sylv_hn_blk_var8 \r
+FLA_Sylv_hn_blk_var9 \r
+FLA_Sylv_hn_opt_var1 \r
+FLA_Sylv_hn_ops_var1 \r
+FLA_Sylv_hn_opd_var1 \r
+FLA_Sylv_hn_opc_var1 \r
+FLA_Sylv_hn_opz_var1 \r
+FLA_Sylv_hn_opt_var10 \r
+FLA_Sylv_hn_opt_var11 \r
+FLA_Sylv_hn_opt_var12 \r
+FLA_Sylv_hn_opt_var13 \r
+FLA_Sylv_hn_opt_var14 \r
+FLA_Sylv_hn_opt_var15 \r
+FLA_Sylv_hn_opt_var16 \r
+FLA_Sylv_hn_opt_var17 \r
+FLA_Sylv_hn_opt_var18 \r
+FLA_Sylv_hn_opt_var2 \r
+FLA_Sylv_hn_opt_var3 \r
+FLA_Sylv_hn_opt_var4 \r
+FLA_Sylv_hn_opt_var5 \r
+FLA_Sylv_hn_opt_var6 \r
+FLA_Sylv_hn_opt_var7 \r
+FLA_Sylv_hn_opt_var8 \r
+FLA_Sylv_hn_opt_var9 \r
+FLA_Sylv_internal \r
+FLA_Sylv_nh \r
+FLA_Sylv_nh_blk_var1 \r
+FLA_Sylv_nh_blk_var10 \r
+FLA_Sylv_nh_blk_var11 \r
+FLA_Sylv_nh_blk_var12 \r
+FLA_Sylv_nh_blk_var13 \r
+FLA_Sylv_nh_blk_var14 \r
+FLA_Sylv_nh_blk_var15 \r
+FLA_Sylv_nh_blk_var16 \r
+FLA_Sylv_nh_blk_var17 \r
+FLA_Sylv_nh_blk_var18 \r
+FLA_Sylv_nh_blk_var2 \r
+FLA_Sylv_nh_blk_var3 \r
+FLA_Sylv_nh_blk_var4 \r
+FLA_Sylv_nh_blk_var5 \r
+FLA_Sylv_nh_blk_var6 \r
+FLA_Sylv_nh_blk_var7 \r
+FLA_Sylv_nh_blk_var8 \r
+FLA_Sylv_nh_blk_var9 \r
+FLA_Sylv_nh_opt_var1 \r
+FLA_Sylv_nh_ops_var1 \r
+FLA_Sylv_nh_opd_var1 \r
+FLA_Sylv_nh_opc_var1 \r
+FLA_Sylv_nh_opz_var1 \r
+FLA_Sylv_nh_opt_var10 \r
+FLA_Sylv_nh_opt_var11 \r
+FLA_Sylv_nh_opt_var12 \r
+FLA_Sylv_nh_opt_var13 \r
+FLA_Sylv_nh_opt_var14 \r
+FLA_Sylv_nh_opt_var15 \r
+FLA_Sylv_nh_opt_var16 \r
+FLA_Sylv_nh_opt_var17 \r
+FLA_Sylv_nh_opt_var18 \r
+FLA_Sylv_nh_opt_var2 \r
+FLA_Sylv_nh_opt_var3 \r
+FLA_Sylv_nh_opt_var4 \r
+FLA_Sylv_nh_opt_var5 \r
+FLA_Sylv_nh_opt_var6 \r
+FLA_Sylv_nh_opt_var7 \r
+FLA_Sylv_nh_opt_var8 \r
+FLA_Sylv_nh_opt_var9 \r
+FLA_Sylv_nn \r
+FLA_Sylv_nn_blk_var1 \r
+FLA_Sylv_nn_blk_var10 \r
+FLA_Sylv_nn_blk_var11 \r
+FLA_Sylv_nn_blk_var12 \r
+FLA_Sylv_nn_blk_var13 \r
+FLA_Sylv_nn_blk_var14 \r
+FLA_Sylv_nn_blk_var15 \r
+FLA_Sylv_nn_blk_var16 \r
+FLA_Sylv_nn_blk_var17 \r
+FLA_Sylv_nn_blk_var18 \r
+FLA_Sylv_nn_blk_var2 \r
+FLA_Sylv_nn_blk_var3 \r
+FLA_Sylv_nn_blk_var4 \r
+FLA_Sylv_nn_blk_var5 \r
+FLA_Sylv_nn_blk_var6 \r
+FLA_Sylv_nn_blk_var7 \r
+FLA_Sylv_nn_blk_var8 \r
+FLA_Sylv_nn_blk_var9 \r
+FLA_Sylv_nn_opt_var1 \r
+FLA_Sylv_nn_ops_var1 \r
+FLA_Sylv_nn_opd_var1 \r
+FLA_Sylv_nn_opc_var1 \r
+FLA_Sylv_nn_opz_var1 \r
+FLA_Sylv_nn_opt_var10 \r
+FLA_Sylv_nn_opt_var11 \r
+FLA_Sylv_nn_opt_var12 \r
+FLA_Sylv_nn_opt_var13 \r
+FLA_Sylv_nn_opt_var14 \r
+FLA_Sylv_nn_opt_var15 \r
+FLA_Sylv_nn_opt_var16 \r
+FLA_Sylv_nn_opt_var17 \r
+FLA_Sylv_nn_opt_var18 \r
+FLA_Sylv_nn_opt_var2 \r
+FLA_Sylv_nn_opt_var3 \r
+FLA_Sylv_nn_opt_var4 \r
+FLA_Sylv_nn_opt_var5 \r
+FLA_Sylv_nn_opt_var6 \r
+FLA_Sylv_nn_opt_var7 \r
+FLA_Sylv_nn_opt_var8 \r
+FLA_Sylv_nn_opt_var9 \r
+FLA_Sylv_task \r
+FLA_Sylv_nn_task \r
+FLA_Sylv_nh_task \r
+FLA_Sylv_hn_task \r
+FLA_Sylv_hh_task \r
+FLA_Sylv_unb_external \r
+FLA_Sylv_nn_unb_ext \r
+FLA_Sylv_nh_unb_ext \r
+FLA_Sylv_hn_unb_ext \r
+FLA_Sylv_hh_unb_ext \r
+FLA_Symm \r
+FLA_Symmetrize \r
+FLA_Symm_cntl_init \r
+FLA_Symm_cntl_finalize \r
+FLA_Symm_external \r
+FLA_Symm_internal \r
+FLA_Symm_ll \r
+FLA_Symm_ll_blk_var1 \r
+FLA_Symm_ll_blk_var10 \r
+FLA_Symm_ll_blk_var2 \r
+FLA_Symm_ll_blk_var3 \r
+FLA_Symm_ll_blk_var4 \r
+FLA_Symm_ll_blk_var5 \r
+FLA_Symm_ll_blk_var6 \r
+FLA_Symm_ll_blk_var7 \r
+FLA_Symm_ll_blk_var8 \r
+FLA_Symm_ll_blk_var9 \r
+FLA_Symm_ll_unb_var1 \r
+FLA_Symm_ll_unb_var10 \r
+FLA_Symm_ll_unb_var2 \r
+FLA_Symm_ll_unb_var3 \r
+FLA_Symm_ll_unb_var4 \r
+FLA_Symm_ll_unb_var5 \r
+FLA_Symm_ll_unb_var6 \r
+FLA_Symm_ll_unb_var7 \r
+FLA_Symm_ll_unb_var8 \r
+FLA_Symm_ll_unb_var9 \r
+FLA_Symm_lu \r
+FLA_Symm_lu_blk_var1 \r
+FLA_Symm_lu_blk_var10 \r
+FLA_Symm_lu_blk_var2 \r
+FLA_Symm_lu_blk_var3 \r
+FLA_Symm_lu_blk_var4 \r
+FLA_Symm_lu_blk_var5 \r
+FLA_Symm_lu_blk_var6 \r
+FLA_Symm_lu_blk_var7 \r
+FLA_Symm_lu_blk_var8 \r
+FLA_Symm_lu_blk_var9 \r
+FLA_Symm_lu_unb_var1 \r
+FLA_Symm_lu_unb_var10 \r
+FLA_Symm_lu_unb_var2 \r
+FLA_Symm_lu_unb_var3 \r
+FLA_Symm_lu_unb_var4 \r
+FLA_Symm_lu_unb_var5 \r
+FLA_Symm_lu_unb_var6 \r
+FLA_Symm_lu_unb_var7 \r
+FLA_Symm_lu_unb_var8 \r
+FLA_Symm_lu_unb_var9 \r
+FLA_Symm_rl \r
+FLA_Symm_rl_blk_var1 \r
+FLA_Symm_rl_blk_var10 \r
+FLA_Symm_rl_blk_var2 \r
+FLA_Symm_rl_blk_var3 \r
+FLA_Symm_rl_blk_var4 \r
+FLA_Symm_rl_blk_var5 \r
+FLA_Symm_rl_blk_var6 \r
+FLA_Symm_rl_blk_var7 \r
+FLA_Symm_rl_blk_var8 \r
+FLA_Symm_rl_blk_var9 \r
+FLA_Symm_rl_unb_var1 \r
+FLA_Symm_rl_unb_var10 \r
+FLA_Symm_rl_unb_var2 \r
+FLA_Symm_rl_unb_var3 \r
+FLA_Symm_rl_unb_var4 \r
+FLA_Symm_rl_unb_var5 \r
+FLA_Symm_rl_unb_var6 \r
+FLA_Symm_rl_unb_var7 \r
+FLA_Symm_rl_unb_var8 \r
+FLA_Symm_rl_unb_var9 \r
+FLA_Symm_ru \r
+FLA_Symm_ru_blk_var1 \r
+FLA_Symm_ru_blk_var10 \r
+FLA_Symm_ru_blk_var2 \r
+FLA_Symm_ru_blk_var3 \r
+FLA_Symm_ru_blk_var4 \r
+FLA_Symm_ru_blk_var5 \r
+FLA_Symm_ru_blk_var6 \r
+FLA_Symm_ru_blk_var7 \r
+FLA_Symm_ru_blk_var8 \r
+FLA_Symm_ru_blk_var9 \r
+FLA_Symm_ru_unb_var1 \r
+FLA_Symm_ru_unb_var10 \r
+FLA_Symm_ru_unb_var2 \r
+FLA_Symm_ru_unb_var3 \r
+FLA_Symm_ru_unb_var4 \r
+FLA_Symm_ru_unb_var5 \r
+FLA_Symm_ru_unb_var6 \r
+FLA_Symm_ru_unb_var7 \r
+FLA_Symm_ru_unb_var8 \r
+FLA_Symm_ru_unb_var9 \r
+FLA_Symm_task \r
+FLA_Symm_ll_task \r
+FLA_Symm_lu_task \r
+FLA_Symm_rl_task \r
+FLA_Symm_ru_task \r
+FLA_Symv \r
+FLA_Symv_external \r
+FLA_Syr \r
+FLA_Syr2 \r
+FLA_Syr2k \r
+FLA_Syr2k_cntl_init \r
+FLA_Syr2k_cntl_finalize \r
+FLA_Syr2k_external \r
+FLA_Syr2k_internal \r
+FLA_Syr2k_ln \r
+FLA_Syr2k_ln_blk_var1 \r
+FLA_Syr2k_ln_blk_var10 \r
+FLA_Syr2k_ln_blk_var2 \r
+FLA_Syr2k_ln_blk_var3 \r
+FLA_Syr2k_ln_blk_var4 \r
+FLA_Syr2k_ln_blk_var5 \r
+FLA_Syr2k_ln_blk_var6 \r
+FLA_Syr2k_ln_blk_var7 \r
+FLA_Syr2k_ln_blk_var8 \r
+FLA_Syr2k_ln_blk_var9 \r
+FLA_Syr2k_ln_unb_var1 \r
+FLA_Syr2k_ln_unb_var10 \r
+FLA_Syr2k_ln_unb_var2 \r
+FLA_Syr2k_ln_unb_var3 \r
+FLA_Syr2k_ln_unb_var4 \r
+FLA_Syr2k_ln_unb_var5 \r
+FLA_Syr2k_ln_unb_var6 \r
+FLA_Syr2k_ln_unb_var7 \r
+FLA_Syr2k_ln_unb_var8 \r
+FLA_Syr2k_ln_unb_var9 \r
+FLA_Syr2k_lt \r
+FLA_Syr2k_lt_blk_var1 \r
+FLA_Syr2k_lt_blk_var10 \r
+FLA_Syr2k_lt_blk_var2 \r
+FLA_Syr2k_lt_blk_var3 \r
+FLA_Syr2k_lt_blk_var4 \r
+FLA_Syr2k_lt_blk_var5 \r
+FLA_Syr2k_lt_blk_var6 \r
+FLA_Syr2k_lt_blk_var7 \r
+FLA_Syr2k_lt_blk_var8 \r
+FLA_Syr2k_lt_blk_var9 \r
+FLA_Syr2k_lt_unb_var1 \r
+FLA_Syr2k_lt_unb_var10 \r
+FLA_Syr2k_lt_unb_var2 \r
+FLA_Syr2k_lt_unb_var3 \r
+FLA_Syr2k_lt_unb_var4 \r
+FLA_Syr2k_lt_unb_var5 \r
+FLA_Syr2k_lt_unb_var6 \r
+FLA_Syr2k_lt_unb_var7 \r
+FLA_Syr2k_lt_unb_var8 \r
+FLA_Syr2k_lt_unb_var9 \r
+FLA_Syr2k_task \r
+FLA_Syr2k_ln_task \r
+FLA_Syr2k_lt_task \r
+FLA_Syr2k_un_task \r
+FLA_Syr2k_ut_task \r
+FLA_Syr2k_un \r
+FLA_Syr2k_un_blk_var1 \r
+FLA_Syr2k_un_blk_var10 \r
+FLA_Syr2k_un_blk_var2 \r
+FLA_Syr2k_un_blk_var3 \r
+FLA_Syr2k_un_blk_var4 \r
+FLA_Syr2k_un_blk_var5 \r
+FLA_Syr2k_un_blk_var6 \r
+FLA_Syr2k_un_blk_var7 \r
+FLA_Syr2k_un_blk_var8 \r
+FLA_Syr2k_un_blk_var9 \r
+FLA_Syr2k_un_unb_var1 \r
+FLA_Syr2k_un_unb_var10 \r
+FLA_Syr2k_un_unb_var2 \r
+FLA_Syr2k_un_unb_var3 \r
+FLA_Syr2k_un_unb_var4 \r
+FLA_Syr2k_un_unb_var5 \r
+FLA_Syr2k_un_unb_var6 \r
+FLA_Syr2k_un_unb_var7 \r
+FLA_Syr2k_un_unb_var8 \r
+FLA_Syr2k_un_unb_var9 \r
+FLA_Syr2k_ut \r
+FLA_Syr2k_ut_blk_var1 \r
+FLA_Syr2k_ut_blk_var10 \r
+FLA_Syr2k_ut_blk_var2 \r
+FLA_Syr2k_ut_blk_var3 \r
+FLA_Syr2k_ut_blk_var4 \r
+FLA_Syr2k_ut_blk_var5 \r
+FLA_Syr2k_ut_blk_var6 \r
+FLA_Syr2k_ut_blk_var7 \r
+FLA_Syr2k_ut_blk_var8 \r
+FLA_Syr2k_ut_blk_var9 \r
+FLA_Syr2k_ut_unb_var1 \r
+FLA_Syr2k_ut_unb_var10 \r
+FLA_Syr2k_ut_unb_var2 \r
+FLA_Syr2k_ut_unb_var3 \r
+FLA_Syr2k_ut_unb_var4 \r
+FLA_Syr2k_ut_unb_var5 \r
+FLA_Syr2k_ut_unb_var6 \r
+FLA_Syr2k_ut_unb_var7 \r
+FLA_Syr2k_ut_unb_var8 \r
+FLA_Syr2k_ut_unb_var9 \r
+FLA_Syr2_external \r
+FLA_Syrk \r
+FLA_Syrk_cntl_init \r
+FLA_Syrk_cntl_finalize \r
+FLA_Syrk_external \r
+FLA_Syrk_internal \r
+FLA_Syrk_ln \r
+FLA_Syrk_ln_blk_var1 \r
+FLA_Syrk_ln_blk_var2 \r
+FLA_Syrk_ln_blk_var3 \r
+FLA_Syrk_ln_blk_var4 \r
+FLA_Syrk_ln_blk_var5 \r
+FLA_Syrk_ln_blk_var6 \r
+FLA_Syrk_ln_unb_var1 \r
+FLA_Syrk_ln_unb_var2 \r
+FLA_Syrk_ln_unb_var3 \r
+FLA_Syrk_ln_unb_var4 \r
+FLA_Syrk_ln_unb_var5 \r
+FLA_Syrk_ln_unb_var6 \r
+FLA_Syrk_lt \r
+FLA_Syrk_lt_blk_var1 \r
+FLA_Syrk_lt_blk_var2 \r
+FLA_Syrk_lt_blk_var3 \r
+FLA_Syrk_lt_blk_var4 \r
+FLA_Syrk_lt_blk_var5 \r
+FLA_Syrk_lt_blk_var6 \r
+FLA_Syrk_lt_unb_var1 \r
+FLA_Syrk_lt_unb_var2 \r
+FLA_Syrk_lt_unb_var3 \r
+FLA_Syrk_lt_unb_var4 \r
+FLA_Syrk_lt_unb_var5 \r
+FLA_Syrk_lt_unb_var6 \r
+FLA_Syrk_task \r
+FLA_Syrk_ln_task \r
+FLA_Syrk_lt_task \r
+FLA_Syrk_un_task \r
+FLA_Syrk_ut_task \r
+FLA_Syrk_un \r
+FLA_Syrk_un_blk_var1 \r
+FLA_Syrk_un_blk_var2 \r
+FLA_Syrk_un_blk_var3 \r
+FLA_Syrk_un_blk_var4 \r
+FLA_Syrk_un_blk_var5 \r
+FLA_Syrk_un_blk_var6 \r
+FLA_Syrk_un_unb_var1 \r
+FLA_Syrk_un_unb_var2 \r
+FLA_Syrk_un_unb_var3 \r
+FLA_Syrk_un_unb_var4 \r
+FLA_Syrk_un_unb_var5 \r
+FLA_Syrk_un_unb_var6 \r
+FLA_Syrk_ut \r
+FLA_Syrk_ut_blk_var1 \r
+FLA_Syrk_ut_blk_var2 \r
+FLA_Syrk_ut_blk_var3 \r
+FLA_Syrk_ut_blk_var4 \r
+FLA_Syrk_ut_blk_var5 \r
+FLA_Syrk_ut_blk_var6 \r
+FLA_Syrk_ut_unb_var1 \r
+FLA_Syrk_ut_unb_var2 \r
+FLA_Syrk_ut_unb_var3 \r
+FLA_Syrk_ut_unb_var4 \r
+FLA_Syrk_ut_unb_var5 \r
+FLA_Syrk_ut_unb_var6 \r
+FLA_Syr_external \r
+FLA_Transpose \r
+FLA_Transpose_blk_var1 \r
+FLA_Transpose_blk_var2 \r
+FLA_Transpose_cntl_init \r
+FLA_Transpose_cntl_finalize \r
+FLA_Transpose_unb_var1 \r
+FLA_Transpose_unb_var2 \r
+FLA_Triangularize \r
+FLA_Trinv \r
+FLA_Trinv_blk_external \r
+FLA_Trinv_cntl_init \r
+FLA_Trinv_cntl_finalize \r
+FLA_Trinv_internal \r
+FLA_Trinv_ln \r
+FLA_Trinv_ln_blk_var1 \r
+FLA_Trinv_ln_blk_var2 \r
+FLA_Trinv_ln_blk_var3 \r
+FLA_Trinv_ln_blk_var4 \r
+FLA_Trinv_ln_opt_var1 \r
+FLA_Trinv_ln_ops_var1 \r
+FLA_Trinv_ln_opd_var1 \r
+FLA_Trinv_ln_opc_var1 \r
+FLA_Trinv_ln_opz_var1 \r
+FLA_Trinv_ln_opt_var2 \r
+FLA_Trinv_ln_ops_var2 \r
+FLA_Trinv_ln_opd_var2 \r
+FLA_Trinv_ln_opc_var2 \r
+FLA_Trinv_ln_opz_var2 \r
+FLA_Trinv_ln_opt_var3 \r
+FLA_Trinv_ln_ops_var3 \r
+FLA_Trinv_ln_opd_var3 \r
+FLA_Trinv_ln_opc_var3 \r
+FLA_Trinv_ln_opz_var3 \r
+FLA_Trinv_ln_opt_var4 \r
+FLA_Trinv_ln_ops_var4 \r
+FLA_Trinv_ln_opd_var4 \r
+FLA_Trinv_ln_opc_var4 \r
+FLA_Trinv_ln_opz_var4 \r
+FLA_Trinv_ln_unb_var1 \r
+FLA_Trinv_ln_unb_var2 \r
+FLA_Trinv_ln_unb_var3 \r
+FLA_Trinv_ln_unb_var4 \r
+FLA_Trinv_lu \r
+FLA_Trinv_lu_blk_var1 \r
+FLA_Trinv_lu_blk_var2 \r
+FLA_Trinv_lu_blk_var3 \r
+FLA_Trinv_lu_blk_var4 \r
+FLA_Trinv_lu_opt_var1 \r
+FLA_Trinv_lu_ops_var1 \r
+FLA_Trinv_lu_opd_var1 \r
+FLA_Trinv_lu_opc_var1 \r
+FLA_Trinv_lu_opz_var1 \r
+FLA_Trinv_lu_opt_var2 \r
+FLA_Trinv_lu_ops_var2 \r
+FLA_Trinv_lu_opd_var2 \r
+FLA_Trinv_lu_opc_var2 \r
+FLA_Trinv_lu_opz_var2 \r
+FLA_Trinv_lu_opt_var3 \r
+FLA_Trinv_lu_ops_var3 \r
+FLA_Trinv_lu_opd_var3 \r
+FLA_Trinv_lu_opc_var3 \r
+FLA_Trinv_lu_opz_var3 \r
+FLA_Trinv_lu_opt_var4 \r
+FLA_Trinv_lu_ops_var4 \r
+FLA_Trinv_lu_opd_var4 \r
+FLA_Trinv_lu_opc_var4 \r
+FLA_Trinv_lu_opz_var4 \r
+FLA_Trinv_lu_unb_var1 \r
+FLA_Trinv_lu_unb_var2 \r
+FLA_Trinv_lu_unb_var3 \r
+FLA_Trinv_lu_unb_var4 \r
+FLA_Trinv_task \r
+FLA_Trinv_ln_task \r
+FLA_Trinv_lu_task \r
+FLA_Trinv_un_task \r
+FLA_Trinv_uu_task \r
+FLA_Trinv_un \r
+FLA_Trinv_unb_external \r
+FLA_Trinv_ln_unb_ext \r
+FLA_Trinv_lu_unb_ext \r
+FLA_Trinv_un_unb_ext \r
+FLA_Trinv_uu_unb_ext \r
+FLA_Trinv_un_blk_var1 \r
+FLA_Trinv_un_blk_var2 \r
+FLA_Trinv_un_blk_var3 \r
+FLA_Trinv_un_blk_var4 \r
+FLA_Trinv_un_opt_var1 \r
+FLA_Trinv_un_ops_var1 \r
+FLA_Trinv_un_opd_var1 \r
+FLA_Trinv_un_opc_var1 \r
+FLA_Trinv_un_opz_var1 \r
+FLA_Trinv_un_opt_var2 \r
+FLA_Trinv_un_ops_var2 \r
+FLA_Trinv_un_opd_var2 \r
+FLA_Trinv_un_opc_var2 \r
+FLA_Trinv_un_opz_var2 \r
+FLA_Trinv_un_opt_var3 \r
+FLA_Trinv_un_ops_var3 \r
+FLA_Trinv_un_opd_var3 \r
+FLA_Trinv_un_opc_var3 \r
+FLA_Trinv_un_opz_var3 \r
+FLA_Trinv_un_opt_var4 \r
+FLA_Trinv_un_ops_var4 \r
+FLA_Trinv_un_opd_var4 \r
+FLA_Trinv_un_opc_var4 \r
+FLA_Trinv_un_opz_var4 \r
+FLA_Trinv_un_unb_var1 \r
+FLA_Trinv_un_unb_var2 \r
+FLA_Trinv_un_unb_var3 \r
+FLA_Trinv_un_unb_var4 \r
+FLA_Trinv_uu \r
+FLA_Trinv_uu_blk_var1 \r
+FLA_Trinv_uu_blk_var2 \r
+FLA_Trinv_uu_blk_var3 \r
+FLA_Trinv_uu_blk_var4 \r
+FLA_Trinv_uu_opt_var1 \r
+FLA_Trinv_uu_ops_var1 \r
+FLA_Trinv_uu_opd_var1 \r
+FLA_Trinv_uu_opc_var1 \r
+FLA_Trinv_uu_opz_var1 \r
+FLA_Trinv_uu_opt_var2 \r
+FLA_Trinv_uu_ops_var2 \r
+FLA_Trinv_uu_opd_var2 \r
+FLA_Trinv_uu_opc_var2 \r
+FLA_Trinv_uu_opz_var2 \r
+FLA_Trinv_uu_opt_var3 \r
+FLA_Trinv_uu_ops_var3 \r
+FLA_Trinv_uu_opd_var3 \r
+FLA_Trinv_uu_opc_var3 \r
+FLA_Trinv_uu_opz_var3 \r
+FLA_Trinv_uu_opt_var4 \r
+FLA_Trinv_uu_ops_var4 \r
+FLA_Trinv_uu_opd_var4 \r
+FLA_Trinv_uu_opc_var4 \r
+FLA_Trinv_uu_opz_var4 \r
+FLA_Trinv_uu_unb_var1 \r
+FLA_Trinv_uu_unb_var2 \r
+FLA_Trinv_uu_unb_var3 \r
+FLA_Trinv_uu_unb_var4 \r
+FLA_Trmm \r
+FLA_Trmmsx_external \r
+FLA_Trmm_cntl_init \r
+FLA_Trmm_cntl_finalize \r
+FLA_Trmm_external \r
+FLA_Trmm_internal \r
+FLA_Trmm_llh \r
+FLA_Trmm_llh_blk_var1 \r
+FLA_Trmm_llh_blk_var2 \r
+FLA_Trmm_llh_blk_var3 \r
+FLA_Trmm_llh_blk_var4 \r
+FLA_Trmm_llh_unb_var1 \r
+FLA_Trmm_llh_unb_var2 \r
+FLA_Trmm_llh_unb_var3 \r
+FLA_Trmm_llh_unb_var4 \r
+FLA_Trmm_lln \r
+FLA_Trmm_lln_blk_var1 \r
+FLA_Trmm_lln_blk_var2 \r
+FLA_Trmm_lln_blk_var3 \r
+FLA_Trmm_lln_blk_var4 \r
+FLA_Trmm_lln_unb_var1 \r
+FLA_Trmm_lln_unb_var2 \r
+FLA_Trmm_lln_unb_var3 \r
+FLA_Trmm_lln_unb_var4 \r
+FLA_Trmm_llt \r
+FLA_Trmm_llt_blk_var1 \r
+FLA_Trmm_llt_blk_var2 \r
+FLA_Trmm_llt_blk_var3 \r
+FLA_Trmm_llt_blk_var4 \r
+FLA_Trmm_llt_unb_var1 \r
+FLA_Trmm_llt_unb_var2 \r
+FLA_Trmm_llt_unb_var3 \r
+FLA_Trmm_llt_unb_var4 \r
+FLA_Trmm_luh \r
+FLA_Trmm_luh_blk_var1 \r
+FLA_Trmm_luh_blk_var2 \r
+FLA_Trmm_luh_blk_var3 \r
+FLA_Trmm_luh_blk_var4 \r
+FLA_Trmm_luh_unb_var1 \r
+FLA_Trmm_luh_unb_var2 \r
+FLA_Trmm_luh_unb_var3 \r
+FLA_Trmm_luh_unb_var4 \r
+FLA_Trmm_lun \r
+FLA_Trmm_lun_blk_var1 \r
+FLA_Trmm_lun_blk_var2 \r
+FLA_Trmm_lun_blk_var3 \r
+FLA_Trmm_lun_blk_var4 \r
+FLA_Trmm_lun_unb_var1 \r
+FLA_Trmm_lun_unb_var2 \r
+FLA_Trmm_lun_unb_var3 \r
+FLA_Trmm_lun_unb_var4 \r
+FLA_Trmm_lut \r
+FLA_Trmm_lut_blk_var1 \r
+FLA_Trmm_lut_blk_var2 \r
+FLA_Trmm_lut_blk_var3 \r
+FLA_Trmm_lut_blk_var4 \r
+FLA_Trmm_lut_unb_var1 \r
+FLA_Trmm_lut_unb_var2 \r
+FLA_Trmm_lut_unb_var3 \r
+FLA_Trmm_lut_unb_var4 \r
+FLA_Trmm_rlh \r
+FLA_Trmm_rlh_blk_var1 \r
+FLA_Trmm_rlh_blk_var2 \r
+FLA_Trmm_rlh_blk_var3 \r
+FLA_Trmm_rlh_blk_var4 \r
+FLA_Trmm_rlh_unb_var1 \r
+FLA_Trmm_rlh_unb_var2 \r
+FLA_Trmm_rlh_unb_var3 \r
+FLA_Trmm_rlh_unb_var4 \r
+FLA_Trmm_rln \r
+FLA_Trmm_rln_blk_var1 \r
+FLA_Trmm_rln_blk_var2 \r
+FLA_Trmm_rln_blk_var3 \r
+FLA_Trmm_rln_blk_var4 \r
+FLA_Trmm_rln_unb_var1 \r
+FLA_Trmm_rln_unb_var2 \r
+FLA_Trmm_rln_unb_var3 \r
+FLA_Trmm_rln_unb_var4 \r
+FLA_Trmm_rlt \r
+FLA_Trmm_rlt_blk_var1 \r
+FLA_Trmm_rlt_blk_var2 \r
+FLA_Trmm_rlt_blk_var3 \r
+FLA_Trmm_rlt_blk_var4 \r
+FLA_Trmm_rlt_unb_var1 \r
+FLA_Trmm_rlt_unb_var2 \r
+FLA_Trmm_rlt_unb_var3 \r
+FLA_Trmm_rlt_unb_var4 \r
+FLA_Trmm_ruh \r
+FLA_Trmm_ruh_blk_var1 \r
+FLA_Trmm_ruh_blk_var2 \r
+FLA_Trmm_ruh_blk_var3 \r
+FLA_Trmm_ruh_blk_var4 \r
+FLA_Trmm_ruh_unb_var1 \r
+FLA_Trmm_ruh_unb_var2 \r
+FLA_Trmm_ruh_unb_var3 \r
+FLA_Trmm_ruh_unb_var4 \r
+FLA_Trmm_run \r
+FLA_Trmm_run_blk_var1 \r
+FLA_Trmm_run_blk_var2 \r
+FLA_Trmm_run_blk_var3 \r
+FLA_Trmm_run_blk_var4 \r
+FLA_Trmm_run_unb_var1 \r
+FLA_Trmm_run_unb_var2 \r
+FLA_Trmm_run_unb_var3 \r
+FLA_Trmm_run_unb_var4 \r
+FLA_Trmm_rut \r
+FLA_Trmm_rut_blk_var1 \r
+FLA_Trmm_rut_blk_var2 \r
+FLA_Trmm_rut_blk_var3 \r
+FLA_Trmm_rut_blk_var4 \r
+FLA_Trmm_rut_unb_var1 \r
+FLA_Trmm_rut_unb_var2 \r
+FLA_Trmm_rut_unb_var3 \r
+FLA_Trmm_rut_unb_var4 \r
+FLA_Trmm_task \r
+FLA_Trmm_llh_task \r
+FLA_Trmm_lln_task \r
+FLA_Trmm_llt_task \r
+FLA_Trmm_luh_task \r
+FLA_Trmm_lun_task \r
+FLA_Trmm_lut_task \r
+FLA_Trmm_rlh_task \r
+FLA_Trmm_rln_task \r
+FLA_Trmm_rlt_task \r
+FLA_Trmm_ruh_task \r
+FLA_Trmm_run_task \r
+FLA_Trmm_rut_task \r
+FLA_Trmv \r
+FLA_Trmvsx \r
+FLA_Trmvsx_external \r
+FLA_Trmv_external \r
+FLA_Trsm \r
+FLA_Trsmsx_external \r
+FLA_Trsm_cntl_init \r
+FLA_Trsm_cntl_finalize \r
+FLA_Trsm_external \r
+FLA_Trsm_internal \r
+FLA_Trsm_llh \r
+FLA_Trsm_llh_blk_var1 \r
+FLA_Trsm_llh_blk_var2 \r
+FLA_Trsm_llh_blk_var3 \r
+FLA_Trsm_llh_blk_var4 \r
+FLA_Trsm_llh_unb_var1 \r
+FLA_Trsm_llh_unb_var2 \r
+FLA_Trsm_llh_unb_var3 \r
+FLA_Trsm_llh_unb_var4 \r
+FLA_Trsm_lln \r
+FLA_Trsm_lln_blk_var1 \r
+FLA_Trsm_lln_blk_var2 \r
+FLA_Trsm_lln_blk_var3 \r
+FLA_Trsm_lln_blk_var4 \r
+FLA_Trsm_lln_unb_var1 \r
+FLA_Trsm_lln_unb_var2 \r
+FLA_Trsm_lln_unb_var3 \r
+FLA_Trsm_lln_unb_var4 \r
+FLA_Trsm_llt \r
+FLA_Trsm_llt_blk_var1 \r
+FLA_Trsm_llt_blk_var2 \r
+FLA_Trsm_llt_blk_var3 \r
+FLA_Trsm_llt_blk_var4 \r
+FLA_Trsm_llt_unb_var1 \r
+FLA_Trsm_llt_unb_var2 \r
+FLA_Trsm_llt_unb_var3 \r
+FLA_Trsm_llt_unb_var4 \r
+FLA_Trsm_luh \r
+FLA_Trsm_luh_blk_var1 \r
+FLA_Trsm_luh_blk_var2 \r
+FLA_Trsm_luh_blk_var3 \r
+FLA_Trsm_luh_blk_var4 \r
+FLA_Trsm_luh_unb_var1 \r
+FLA_Trsm_luh_unb_var2 \r
+FLA_Trsm_luh_unb_var3 \r
+FLA_Trsm_luh_unb_var4 \r
+FLA_Trsm_lun \r
+FLA_Trsm_lun_blk_var1 \r
+FLA_Trsm_lun_blk_var2 \r
+FLA_Trsm_lun_blk_var3 \r
+FLA_Trsm_lun_blk_var4 \r
+FLA_Trsm_lun_unb_var1 \r
+FLA_Trsm_lun_unb_var2 \r
+FLA_Trsm_lun_unb_var3 \r
+FLA_Trsm_lun_unb_var4 \r
+FLA_Trsm_lut \r
+FLA_Trsm_lut_blk_var1 \r
+FLA_Trsm_lut_blk_var2 \r
+FLA_Trsm_lut_blk_var3 \r
+FLA_Trsm_lut_blk_var4 \r
+FLA_Trsm_lut_unb_var1 \r
+FLA_Trsm_lut_unb_var2 \r
+FLA_Trsm_lut_unb_var3 \r
+FLA_Trsm_lut_unb_var4 \r
+FLA_Trsm_piv_task \r
+FLA_Trsm_rlh \r
+FLA_Trsm_rlh_blk_var1 \r
+FLA_Trsm_rlh_blk_var2 \r
+FLA_Trsm_rlh_blk_var3 \r
+FLA_Trsm_rlh_blk_var4 \r
+FLA_Trsm_rlh_unb_var1 \r
+FLA_Trsm_rlh_unb_var2 \r
+FLA_Trsm_rlh_unb_var3 \r
+FLA_Trsm_rlh_unb_var4 \r
+FLA_Trsm_rln \r
+FLA_Trsm_rln_blk_var1 \r
+FLA_Trsm_rln_blk_var2 \r
+FLA_Trsm_rln_blk_var3 \r
+FLA_Trsm_rln_blk_var4 \r
+FLA_Trsm_rln_unb_var1 \r
+FLA_Trsm_rln_unb_var2 \r
+FLA_Trsm_rln_unb_var3 \r
+FLA_Trsm_rln_unb_var4 \r
+FLA_Trsm_rlt \r
+FLA_Trsm_rlt_blk_var1 \r
+FLA_Trsm_rlt_blk_var2 \r
+FLA_Trsm_rlt_blk_var3 \r
+FLA_Trsm_rlt_blk_var4 \r
+FLA_Trsm_rlt_unb_var1 \r
+FLA_Trsm_rlt_unb_var2 \r
+FLA_Trsm_rlt_unb_var3 \r
+FLA_Trsm_rlt_unb_var4 \r
+FLA_Trsm_ruh \r
+FLA_Trsm_ruh_blk_var1 \r
+FLA_Trsm_ruh_blk_var2 \r
+FLA_Trsm_ruh_blk_var3 \r
+FLA_Trsm_ruh_blk_var4 \r
+FLA_Trsm_ruh_unb_var1 \r
+FLA_Trsm_ruh_unb_var2 \r
+FLA_Trsm_ruh_unb_var3 \r
+FLA_Trsm_ruh_unb_var4 \r
+FLA_Trsm_run \r
+FLA_Trsm_run_blk_var1 \r
+FLA_Trsm_run_blk_var2 \r
+FLA_Trsm_run_blk_var3 \r
+FLA_Trsm_run_blk_var4 \r
+FLA_Trsm_run_unb_var1 \r
+FLA_Trsm_run_unb_var2 \r
+FLA_Trsm_run_unb_var3 \r
+FLA_Trsm_run_unb_var4 \r
+FLA_Trsm_rut \r
+FLA_Trsm_rut_blk_var1 \r
+FLA_Trsm_rut_blk_var2 \r
+FLA_Trsm_rut_blk_var3 \r
+FLA_Trsm_rut_blk_var4 \r
+FLA_Trsm_rut_unb_var1 \r
+FLA_Trsm_rut_unb_var2 \r
+FLA_Trsm_rut_unb_var3 \r
+FLA_Trsm_rut_unb_var4 \r
+FLA_Trsm_task \r
+FLA_Trsm_llh_task \r
+FLA_Trsm_lln_task \r
+FLA_Trsm_llt_task \r
+FLA_Trsm_luh_task \r
+FLA_Trsm_lun_task \r
+FLA_Trsm_lut_task \r
+FLA_Trsm_rlh_task \r
+FLA_Trsm_rln_task \r
+FLA_Trsm_rlt_task \r
+FLA_Trsm_ruh_task \r
+FLA_Trsm_run_task \r
+FLA_Trsm_rut_task \r
+FLA_Trsv \r
+FLA_Trsvsx \r
+FLA_Trsvsx_external \r
+FLA_Trsv_cntl_init \r
+FLA_Trsv_cntl_finalize \r
+FLA_Trsv_external \r
+FLA_Trsv_internal \r
+FLA_Trsv_lc \r
+FLA_Trsv_lc_blk_var1 \r
+FLA_Trsv_lc_blk_var2 \r
+FLA_Trsv_ln \r
+FLA_Trsv_ln_blk_var1 \r
+FLA_Trsv_ln_blk_var2 \r
+FLA_Trsv_lt \r
+FLA_Trsv_lt_blk_var1 \r
+FLA_Trsv_lt_blk_var2 \r
+FLA_Trsv_task \r
+FLA_Trsv_lc_task \r
+FLA_Trsv_ln_task \r
+FLA_Trsv_lt_task \r
+FLA_Trsv_uc_task \r
+FLA_Trsv_un_task \r
+FLA_Trsv_ut_task \r
+FLA_Trsv_uc \r
+FLA_Trsv_uc_blk_var1 \r
+FLA_Trsv_uc_blk_var2 \r
+FLA_Trsv_un \r
+FLA_Trsv_un_blk_var1 \r
+FLA_Trsv_un_blk_var2 \r
+FLA_Trsv_ut \r
+FLA_Trsv_ut_blk_var1 \r
+FLA_Trsv_ut_blk_var2 \r
+FLA_Ttmm \r
+FLA_Ttmm_blk_external \r
+FLA_Ttmm_cntl_init \r
+FLA_Ttmm_cntl_finalize \r
+FLA_Ttmm_internal \r
+FLA_Ttmm_l \r
+FLA_Ttmm_l_blk_var1 \r
+FLA_Ttmm_l_blk_var2 \r
+FLA_Ttmm_l_blk_var3 \r
+FLA_Ttmm_l_opt_var1 \r
+FLA_Ttmm_l_ops_var1 \r
+FLA_Ttmm_l_opd_var1 \r
+FLA_Ttmm_l_opc_var1 \r
+FLA_Ttmm_l_opz_var1 \r
+FLA_Ttmm_l_opt_var2 \r
+FLA_Ttmm_l_ops_var2 \r
+FLA_Ttmm_l_opd_var2 \r
+FLA_Ttmm_l_opc_var2 \r
+FLA_Ttmm_l_opz_var2 \r
+FLA_Ttmm_l_opt_var3 \r
+FLA_Ttmm_l_ops_var3 \r
+FLA_Ttmm_l_opd_var3 \r
+FLA_Ttmm_l_opc_var3 \r
+FLA_Ttmm_l_opz_var3 \r
+FLA_Ttmm_l_unb_var1 \r
+FLA_Ttmm_l_unb_var2 \r
+FLA_Ttmm_l_unb_var3 \r
+FLA_Ttmm_task \r
+FLA_Ttmm_l_task \r
+FLA_Ttmm_u_task \r
+FLA_Ttmm_u \r
+FLA_Ttmm_unb_external \r
+FLA_Ttmm_l_unb_ext \r
+FLA_Ttmm_u_unb_ext \r
+FLA_Ttmm_u_blk_var1 \r
+FLA_Ttmm_u_blk_var2 \r
+FLA_Ttmm_u_blk_var3 \r
+FLA_Ttmm_u_opt_var1 \r
+FLA_Ttmm_u_ops_var1 \r
+FLA_Ttmm_u_opd_var1 \r
+FLA_Ttmm_u_opc_var1 \r
+FLA_Ttmm_u_opz_var1 \r
+FLA_Ttmm_u_opt_var2 \r
+FLA_Ttmm_u_ops_var2 \r
+FLA_Ttmm_u_opd_var2 \r
+FLA_Ttmm_u_opc_var2 \r
+FLA_Ttmm_u_opz_var2 \r
+FLA_Ttmm_u_opt_var3 \r
+FLA_Ttmm_u_ops_var3 \r
+FLA_Ttmm_u_opd_var3 \r
+FLA_Ttmm_u_opc_var3 \r
+FLA_Ttmm_u_opz_var3 \r
+FLA_Ttmm_u_unb_var1 \r
+FLA_Ttmm_u_unb_var2 \r
+FLA_Ttmm_u_unb_var3 \r
+FLA_Part_2x2 \r
+FLA_Part_2x1 \r
+FLA_Part_1x2 \r
+FLA_Repart_2x2_to_3x3 \r
+FLA_Repart_2x1_to_3x1 \r
+FLA_Repart_1x2_to_1x3 \r
+FLA_Cont_with_3x3_to_2x2 \r
+FLA_Cont_with_3x1_to_2x1 \r
+FLA_Cont_with_1x3_to_1x2 \r
+FLA_Merge_2x2 \r
+FLA_Merge_2x1 \r
+FLA_Merge_1x2 \r
index 2fa601b01f4723030d2940a8c06ff0222821b4d5..7ea6f83ab8cb1eff07f18e4259c07883f678a1a4 100644 (file)
-::
-::
-:: BLIS
-:: An object-based framework for developing high-performance BLAS-like
-:: libraries.
-::
-:: Copyright (C) 2014, The University of Texas at Austin
-::
-:: Redistribution and use in source and binary forms, with or without
-:: modification, are permitted provided that the following conditions are
-:: met:
-:: - Redistributions of source code must retain the above copyright
-:: notice, this list of conditions and the following disclaimer.
-:: - Redistributions in binary form must reproduce the above copyright
-:: notice, this list of conditions and the following disclaimer in the
-:: documentation and/or other materials provided with the distribution.
-:: - Neither the name of The University of Texas at Austin nor the names
-:: of its contributors may be used to endorse or promote products
-:: derived from this software without specific prior written permission.
-::
-:: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-:: "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-:: LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-:: A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-:: HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-:: SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-:: LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-:: DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-:: THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-:: (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-:: OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-::
-::
-
-@echo off
-
-echo.
-echo Makefile
-echo.
-echo Field G. Van Zee
-echo.
-echo nmake Makefile for building BLIS for Microsoft Windows. nmake targets
-echo may be invoked after running the configure.cmd script. Valid targets are:
-echo.
-echo all - Invoke the lib and dll targets.
-echo lib - Build BLIS as a static library.
-echo dll - Build BLIS as a dynamically-linked library.
-echo help - Output help and usage information.
-echo clean - Invoke clean-log and clean-build targets.
-echo clean-log - Remove any log files present.
-echo clean-config - Remove all products of configure.cmd. Namely, remove the
-echo config, include, and src directories.
-echo clean-build - Remove all products of the compilation portion of the build
-echo process. Namely, remove the obj, lib, and dll directories.
-echo distclean - Invoke clean-log, clean-config, and clean-build targets.
-echo.
-echo The Makefile also recognizes configuration options corresponding to the
-echo following Makefile variables:
-echo.
-echo VERBOSE - When defined, nmake outputs the actual commands
-echo executed instead of more concise one-line progress
-echo indicators. (Undefined by default.)
-echo.
-echo Typically, these options are specified by commenting or uncommenting the
-echo corresponding lines in the Makefile. However, if the Makefile currently does
-echo not define one of the options, and you wish to enable the corresponding
-echo feature without editing the Makefile, you may define the variable at the
-echo command line when nmake is invoked. For example, you may enable verboseness
-echo while invoking the lib target as follows:
-echo.
-echo nmake lib VERBOSE=1
-echo.
+::\r
+::\r
+:: BLIS \r
+:: An object-based framework for developing high-performance BLAS-like\r
+:: libraries.\r
+::\r
+:: Copyright (C) 2014, The University of Texas at Austin\r
+::\r
+:: Redistribution and use in source and binary forms, with or without\r
+:: modification, are permitted provided that the following conditions are\r
+:: met:\r
+:: - Redistributions of source code must retain the above copyright\r
+:: notice, this list of conditions and the following disclaimer.\r
+:: - Redistributions in binary form must reproduce the above copyright\r
+:: notice, this list of conditions and the following disclaimer in the\r
+:: documentation and/or other materials provided with the distribution.\r
+:: - Neither the name of The University of Texas at Austin nor the names\r
+:: of its contributors may be used to endorse or promote products\r
+:: derived from this software without specific prior written permission.\r
+::\r
+:: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\r
+:: "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\r
+:: LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\r
+:: A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT\r
+:: HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\r
+:: SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\r
+:: LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\r
+:: DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\r
+:: THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\r
+:: (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\r
+:: OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
+::\r
+::\r
+\r
+:: Help script: prints the nmake usage text below and does nothing else.\r
+:: Command echoing is switched off first so only the help text is shown.\r
+@echo off\r
+\r
+:: Banner: file name and author.\r
+echo. \r
+echo Makefile\r
+echo. \r
+echo Field G. Van Zee\r
+echo. \r
+echo nmake Makefile for building BLIS for Microsoft Windows. nmake targets\r
+echo may be invoked after running the configure.cmd script. Valid targets are:\r
+echo. \r
+:: One line (or two, when wrapped) per supported nmake target.\r
+echo all - Invoke the lib and dll targets.\r
+echo lib - Build BLIS as a static library.\r
+echo dll - Build BLIS as a dynamically-linked library.\r
+echo help - Output help and usage information.\r
+echo clean - Invoke clean-log and clean-build targets.\r
+echo clean-log - Remove any log files present.\r
+echo clean-config - Remove all products of configure.cmd. Namely, remove the\r
+echo config, include, and src directories.\r
+echo clean-build - Remove all products of the compilation portion of the build\r
+echo process. Namely, remove the obj, lib, and dll directories.\r
+echo distclean - Invoke clean-log, clean-config, and clean-build targets.\r
+echo.\r
+:: Makefile variables that act as configuration options.\r
+echo The Makefile also recognizes configuration options corresponding to the\r
+echo following Makefile variables:\r
+echo.\r
+echo VERBOSE - When defined, nmake outputs the actual commands\r
+echo executed instead of more concise one-line progress\r
+echo indicators. (Undefined by default.)\r
+echo.\r
+:: How to set an option from the nmake command line instead of editing the Makefile.\r
+echo Typically, these options are specified by commenting or uncommenting the\r
+echo corresponding lines in the Makefile. However, if the Makefile currently does\r
+echo not define one of the options, and you wish to enable the corresponding\r
+echo feature without editing the Makefile, you may define the variable at the\r
+echo command line when nmake is invoked. For example, you may enable verboseness\r
+echo while invoking the lib target as follows:\r
+echo.\r
+echo nmake lib VERBOSE=1\r
+echo.\r
index 37b965cb7bddb330c86eb25bf3d95693d0423f09..98115790e3fc839b38697a3a9c886b6ac75403be 100644 (file)
-::
-::
-:: BLIS
-:: An object-based framework for developing high-performance BLAS-like
-:: libraries.
-::
-:: Copyright (C) 2014, The University of Texas at Austin
-::
-:: Redistribution and use in source and binary forms, with or without
-:: modification, are permitted provided that the following conditions are
-:: met:
-:: - Redistributions of source code must retain the above copyright
-:: notice, this list of conditions and the following disclaimer.
-:: - Redistributions in binary form must reproduce the above copyright
-:: notice, this list of conditions and the following disclaimer in the
-:: documentation and/or other materials provided with the distribution.
-:: - Neither the name of The University of Texas at Austin nor the names
-:: of its contributors may be used to endorse or promote products
-:: derived from this software without specific prior written permission.
-::
-:: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-:: "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-:: LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-:: A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-:: HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-:: SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-:: LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-:: DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-:: THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-:: (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-:: OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-::
-::
-
-@echo off
-
-:ENVIRONMENT
- set GEN_CHECK_REV_FILE=.\build\gen-check-rev-file.py
- set GATHER_SRC=.\build\gather-src-for-windows.py
- set GEN_CONFIG_FILE=.\build\gen-config-file.py
- set CONFIG_DEFS_TEMPL=.\build\config.mk.in
- set SRC_TREE_DIR=..\frame
- set TOP_BUILD_DIR=.
-
-:PARAMS
- if "%1"=="" (goto USAGE)
- if "%2"=="" (goto USAGE)
- if "%3"=="" (goto USAGE)
-
- set ARCH=%1
- set BUILD=%2
- set CCOMPILER=%3
-
-:TASK_UNIT
- echo %0: Checking/updating revision file.
- %GEN_CHECK_REV_FILE% -v
- echo %0: Gathering source files into local flat directories.
- %GATHER_SRC% %SRC_TREE_DIR% %TOP_BUILD_DIR%
- echo %0: Creating configure definitions file.
- %GEN_CONFIG_FILE% %TOP_BUILD_DIR% %ARCH% %BUILD% %CCOMPILER% %CONFIG_DEFS_TEMPL%
- echo %0: Configuration and setup complete. You may now run nmake.
-
- goto END
-
-:USAGE
- echo.
- echo configure.cmd
- echo.
- echo A wrapper script for various configuration and setup scripts that need
- echo. to be run before nmake when building BLIS for Microsoft Windows.
- echo.
- echo USAGE:
- echo %0 [arch] [build] [cc]
- echo.
- echo arch -- The architecture string to build.
- echo Supported values: {x86,x64}
- echo build -- The kind of build.
- echo Supported values: {debug,release}
- echo cc -- The C compiler to use.
- echo Supported values: {icl,cl}
- echo.
- echo examples:
- echo %0 x86 debug icl
- echo %0 x64 release cl
- echo.
-
-:END
+::\r
+::\r
+:: BLIS \r
+:: An object-based framework for developing high-performance BLAS-like\r
+:: libraries.\r
+::\r
+:: Copyright (C) 2014, The University of Texas at Austin\r
+::\r
+:: Redistribution and use in source and binary forms, with or without\r
+:: modification, are permitted provided that the following conditions are\r
+:: met:\r
+:: - Redistributions of source code must retain the above copyright\r
+:: notice, this list of conditions and the following disclaimer.\r
+:: - Redistributions in binary form must reproduce the above copyright\r
+:: notice, this list of conditions and the following disclaimer in the\r
+:: documentation and/or other materials provided with the distribution.\r
+:: - Neither the name of The University of Texas at Austin nor the names\r
+:: of its contributors may be used to endorse or promote products\r
+:: derived from this software without specific prior written permission.\r
+::\r
+:: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\r
+:: "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\r
+:: LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\r
+:: A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT\r
+:: HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\r
+:: SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\r
+:: LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\r
+:: DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\r
+:: THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\r
+:: (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\r
+:: OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
+::\r
+::\r
+\r
+:: configure.cmd: takes three positional arguments (arch, build, cc), runs three\r
+:: helper scripts in sequence, then tells the user that nmake may be run.\r
+@echo off\r
+\r
+:: Locations of the helper scripts, the config template and the source/build\r
+:: directories; all paths are relative to the current directory.\r
+:ENVIRONMENT\r
+ set GEN_CHECK_REV_FILE=.\build\gen-check-rev-file.py\r
+ set GATHER_SRC=.\build\gather-src-for-windows.py\r
+ set GEN_CONFIG_FILE=.\build\gen-config-file.py\r
+ set CONFIG_DEFS_TEMPL=.\build\config.mk.in\r
+ set SRC_TREE_DIR=..\frame\r
+ set TOP_BUILD_DIR=.\r
+\r
+:: All three arguments are mandatory; if any is missing, print usage and stop.\r
+:PARAMS\r
+ if "%1"=="" (goto USAGE)\r
+ if "%2"=="" (goto USAGE)\r
+ if "%3"=="" (goto USAGE)\r
+\r
+ set ARCH=%1\r
+ set BUILD=%2\r
+ set CCOMPILER=%3\r
+ \r
+:: Run the helpers in order: revision file check, source gathering, config file\r
+:: generation. The .py files are invoked directly, so this presumably relies on\r
+:: a .py file association -- TODO confirm.\r
+:: NOTE(review): no helper's exit status is checked, so the "complete" message\r
+:: is printed even if a step fails.\r
+:TASK_UNIT\r
+ echo %0: Checking/updating revision file.\r
+ %GEN_CHECK_REV_FILE% -v\r
+ echo %0: Gathering source files into local flat directories.\r
+ %GATHER_SRC% %SRC_TREE_DIR% %TOP_BUILD_DIR%\r
+ echo %0: Creating configure definitions file.\r
+ %GEN_CONFIG_FILE% %TOP_BUILD_DIR% %ARCH% %BUILD% %CCOMPILER% %CONFIG_DEFS_TEMPL%\r
+ echo %0: Configuration and setup complete. You may now run nmake. \r
+\r
+ goto END\r
+\r
+:: Usage text; execution falls through to END afterwards.\r
+:USAGE\r
+ echo. \r
+ echo configure.cmd\r
+ echo. \r
+ echo A wrapper script for various configuration and setup scripts that need\r
+ echo. to be run before nmake when building BLIS for Microsoft Windows.\r
+ echo. \r
+ echo USAGE:\r
+ echo %0 [arch] [build] [cc]\r
+ echo.\r
+ echo arch -- The architecture string to build.\r
+ echo Supported values: {x86,x64}\r
+ echo build -- The kind of build.\r
+ echo Supported values: {debug,release}\r
+ echo cc -- The C compiler to use.\r
+ echo Supported values: {icl,cl}\r
+ echo. \r
+ echo examples:\r
+ echo %0 x86 debug icl\r
+ echo %0 x64 release cl\r
+ echo.\r
+\r
+:END\r
index 0d73eb01e11df5879e4f62679b80eed5f1282f45..db0cdc1d26bdd664c89e973e97ae762582a1633a 100644 (file)
--- a/blis/windows/gendll.cmd
+++ b/blis/windows/gendll.cmd
-@echo off
-@setlocal enabledelayedexpansion
-
-rem --------------------------------------------------------------------
-rem Build a dll out of a set of object files specified by the
-rem argument /objlist.
-rem
-rem The .lib file thus created is an "import" library, which one links
-rem with, but the bulk of the code ends up in the associated .dll file.
-rem ---------------------------------------------------------------------
-
-set THIS_SCRIPT=%~dp0%~nx0
-
-if "%1"=="" goto USAGE
-if "%2"=="" goto USAGE
-if "%3"=="" goto USAGE
-if "%4"=="" goto USAGE
-if "%5"=="" goto USAGE
-
-set gd_lib_name=%1
-set gd_link=%gd_lib_name%-static.link
-set LINKER=%3
-set LINKARGSFILE=%4
-set gd_def=%5
-
-:PARSE_ARGS
-set IMPORT=
-set OBJLIST=
-:ARGLOOP
-if "%6"=="" goto ENDARGLOOP
-if /i not "%6"=="/import" goto OBJARG
-set IMPORT=!IMPORT! %7
-goto SHIFT
-:OBJARG
-if /i not "%6"=="/objlist" goto ENDARGLOOP
-set OBJLIST=%7
-:SHIFT
-shift /4
-shift /4
-goto ARGLOOP
-:ENDARGLOOP
-
-if defined OBJLIST goto COMPILER_SETUP
-echo Error: must supply /objlist <file with list of object names>
-goto USAGE
-
-:COMPILER_SETUP
-set gd_path=%2
-set gd_dll_path=%gd_path%.dll
-set gd_main_c=dll_main__%gd_lib_name%.c
-set gd_main_obj=dll_main__%gd_lib_name%.obj
-
-rem create C file for dll_main
-for /F "tokens=*" %%i in ("#include <windows.h>") do echo %%i >%gd_main_c%
-echo. >>%gd_main_c%
-echo BOOLEAN WINAPI DllMain( >>%gd_main_c%
-echo HINSTANCE hDllHandle, >>%gd_main_c%
-echo DWORD nReason, >>%gd_main_c%
-echo LPVOID Reserved){ >>%gd_main_c%
-echo. >>%gd_main_c%
-echo BOOLEAN bSuccess = TRUE;>>%gd_main_c%
-echo. >>%gd_main_c%
-echo switch (nReason){ >>%gd_main_c%
-echo case DLL_PROCESS_ATTACH: >>%gd_main_c%
-echo DisableThreadLibraryCalls( hDllHandle ); >>%gd_main_c%
-echo break; >>%gd_main_c%
-echo case DLL_PROCESS_DETACH: >>%gd_main_c%
-echo break; >>%gd_main_c%
-echo. >>%gd_main_c%
-echo }; >>%gd_main_c%
-echo. >>%gd_main_c%
-echo return bSuccess; >>%gd_main_c%
-echo }; >>%gd_main_c%
-echo.>>%gd_main_c%
-
-rem set up link file by specifying dll filepath and main object
-echo /Fe%gd_dll_path% > %gd_link%
-echo %gd_main_obj% >> %gd_link%
-
-rem add contents of linkargs file; most of the link argument action is
-rem in this file
-type %LINKARGSFILE% >> %gd_link%
-
-rem add command-line import libraries, if any
-if defined IMPORT echo !IMPORT! >> %gd_link%
-
-rem add export specification
-echo %gd_def% >> %gd_link%
-
-rem add contents of OBJLIST file
-type %OBJLIST% >> %gd_link%
-
-rem create dll, import lib, and export file
-%LINKER% /nologo /c /O2 /Fo%gd_main_obj% %gd_main_c% >> gendll-cl.log
-%LINKER% @%gd_link%
-
-:CLEANUP
-del /F /Q %gd_link% %gd_main_c% %gd_main_obj% gendll-cl.log
-goto END
-
-
-:USAGE
-echo.
-echo. gendll.cmd
-echo.
-echo. Generate a dynamically-linked library from a set of object files
-echo. specified in objlist_file.
-echo.
-echo. Usage:
-echo. %0 dllname dllpath linker linkargs_file symbols_file {/import importlib} /objlist objlist_file
-echo.
-echo. dllname -- the name of the DLL being created, with no extension.
-echo. dllpath -- the path to the DLL being created, with no extension.
-echo. linker -- the compiler to use to link the DLL.
-echo. linkargs_file -- the path to a file containing a list of all linker
-echo. arguments--link options, libraries, and library paths--
-echo. that that may be needed to successfully link the DLL
-echo. being created.
-echo. symbols_file -- the path to a file containing a list of symbols to
-echo. export in the DLL.
-echo. importlib -- the path to a .lib library that you wish to import into
-echo. the DLL being created. Optional.
-echo. objlist_file -- the path to a file containing the list of object files
-echo. that make up the bulk of the DLL being created.
-echo.
-
-:END
-endlocal
+@echo off\r
+@setlocal enabledelayedexpansion\r
+\r
+rem --------------------------------------------------------------------\r
+rem Build a dll out of a set of object files specified by the \r
+rem argument /objlist.\r
+rem\r
+rem The .lib file thus created is an "import" library, which one links\r
+rem with, but the bulk of the code ends up in the associated .dll file.\r
+rem\r
+rem Positional arguments: %1 dll name, %2 dll path, %3 linker,\r
+rem %4 linkargs file, %5 symbols file; then optional "/import <lib>"\r
+rem pairs and a mandatory "/objlist <file>" pair.\r
+rem ---------------------------------------------------------------------\r
+\r
+rem Full path of this script. Not referenced anywhere else in this script.\r
+set THIS_SCRIPT=%~dp0%~nx0\r
+\r
+rem The first five positional arguments are mandatory.\r
+if "%1"=="" goto USAGE\r
+if "%2"=="" goto USAGE\r
+if "%3"=="" goto USAGE\r
+if "%4"=="" goto USAGE\r
+if "%5"=="" goto USAGE\r
+\r
+rem Capture %4 and %5 now: the "shift /4" calls below overwrite them.\r
+set gd_lib_name=%1\r
+set gd_link=%gd_lib_name%-static.link\r
+set LINKER=%3\r
+set LINKARGSFILE=%4\r
+set gd_def=%5\r
+\r
+rem Walk the option/value pairs that follow the five positional arguments.\r
+rem Each pass looks at %6 (option) and %7 (value). /import values are\r
+rem accumulated; /objlist stores its value. Any other option ends the loop.\r
+:PARSE_ARGS\r
+set IMPORT=\r
+set OBJLIST=\r
+:ARGLOOP\r
+if "%6"=="" goto ENDARGLOOP\r
+if /i not "%6"=="/import" goto OBJARG\r
+set IMPORT=!IMPORT! %7\r
+goto SHIFT\r
+:OBJARG\r
+if /i not "%6"=="/objlist" goto ENDARGLOOP\r
+set OBJLIST=%7\r
+:SHIFT\r
+rem Two shifts from position 4 leave %1-%3 intact and move the next\r
+rem option/value pair into %6/%7.\r
+shift /4\r
+shift /4\r
+goto ARGLOOP\r
+:ENDARGLOOP\r
+\r
+rem /objlist is required.\r
+if defined OBJLIST goto COMPILER_SETUP\r
+echo Error: must supply /objlist <file with list of object names>\r
+goto USAGE\r
+\r
+:COMPILER_SETUP\r
+set gd_path=%2\r
+set gd_dll_path=%gd_path%.dll\r
+set gd_main_c=dll_main__%gd_lib_name%.c\r
+set gd_main_obj=dll_main__%gd_lib_name%.obj\r
+\r
+rem create C file for dll_main\r
+rem (the for /F wrapper lets the "<" and ">" of the #include line be echoed\r
+rem without being treated as redirections)\r
+for /F "tokens=*" %%i in ("#include <windows.h>") do echo %%i >%gd_main_c%\r
+echo. >>%gd_main_c%\r
+echo BOOLEAN WINAPI DllMain( >>%gd_main_c%\r
+echo HINSTANCE hDllHandle, >>%gd_main_c%\r
+echo DWORD nReason, >>%gd_main_c%\r
+echo LPVOID Reserved){ >>%gd_main_c%\r
+echo. >>%gd_main_c%\r
+echo BOOLEAN bSuccess = TRUE;>>%gd_main_c%\r
+echo. >>%gd_main_c%\r
+echo switch (nReason){ >>%gd_main_c%\r
+echo case DLL_PROCESS_ATTACH: >>%gd_main_c%\r
+echo DisableThreadLibraryCalls( hDllHandle ); >>%gd_main_c%\r
+echo break; >>%gd_main_c%\r
+echo case DLL_PROCESS_DETACH: >>%gd_main_c%\r
+echo break; >>%gd_main_c%\r
+echo. >>%gd_main_c%\r
+echo }; >>%gd_main_c%\r
+echo. >>%gd_main_c%\r
+echo return bSuccess; >>%gd_main_c%\r
+echo }; >>%gd_main_c%\r
+echo.>>%gd_main_c%\r
+\r
+rem set up link file by specifying dll filepath and main object\r
+echo /Fe%gd_dll_path% > %gd_link%\r
+echo %gd_main_obj% >> %gd_link%\r
+\r
+rem add contents of linkargs file; most of the link argument action is\r
+rem in this file\r
+type %LINKARGSFILE% >> %gd_link%\r
+\r
+rem add command-line import libraries, if any\r
+if defined IMPORT echo !IMPORT! >> %gd_link%\r
+\r
+rem add export specification\r
+echo %gd_def% >> %gd_link%\r
+\r
+rem add contents of OBJLIST file\r
+type %OBJLIST% >> %gd_link%\r
+\r
+rem create dll, import lib, and export file\r
+rem NOTE(review): the exit status of the compile step is not checked before\r
+rem linking, and its log is deleted in CLEANUP below.\r
+%LINKER% /nologo /c /O2 /Fo%gd_main_obj% %gd_main_c% >> gendll-cl.log\r
+%LINKER% @%gd_link%\r
+\r
+rem Remove the temporary files generated above.\r
+:CLEANUP\r
+del /F /Q %gd_link% %gd_main_c% %gd_main_obj% gendll-cl.log\r
+goto END\r
+\r
+\r
+:USAGE\r
+echo. \r
+echo. gendll.cmd\r
+echo. \r
+echo. Generate a dynamically-linked library from a set of object files\r
+echo. specified in objlist_file.\r
+echo. \r
+echo. Usage:\r
+echo. %0 dllname dllpath linker linkargs_file symbols_file {/import importlib} /objlist objlist_file\r
+echo.\r
+echo. dllname -- the name of the DLL being created, with no extension.\r
+echo. dllpath -- the path to the DLL being created, with no extension.\r
+echo. linker -- the compiler to use to link the DLL.\r
+echo. linkargs_file -- the path to a file containing a list of all linker\r
+echo. arguments--link options, libraries, and library paths--\r
+echo. that may be needed to successfully link the DLL\r
+echo. being created.\r
+echo. symbols_file -- the path to a file containing a list of symbols to\r
+echo. export in the DLL.\r
+echo. importlib -- the path to a .lib library that you wish to import into\r
+echo. the DLL being created. Optional.\r
+echo. objlist_file -- the path to a file containing the list of object files\r
+echo. that make up the bulk of the DLL being created.\r
+echo.\r
+\r
+:END\r
+endlocal\r
diff --git a/ticblas/src/Makefile b/ticblas/src/Makefile
index 79e555400b26046e37388b4f39284ba6c23e831b..671bcf60b446d80419ffcde7dec428d7a49ec44a 100644 (file)
--- a/ticblas/src/Makefile
+++ b/ticblas/src/Makefile
# INCLUDE Directory
$(eval $(call FIND_DSP_PKG,OMP_DIR,openmp_dsp*,packages))
ifneq (,$(findstring 86, $(UNAME_M)))
-$(eval $(call FIND_DSP_PKG,C6636_PDK_DIR,pdk_keystone2*,packages))
+$(eval $(call FIND_DSP_PKG,PDK_DIR,pdk_keystone2*,packages))
+PDK_INC=$(PDK_DIR)/packages
+else
+PDK_INC=$(TARGET_ROOTDIR)/usr/include
endif
$(eval $(call FIND_DSP_PKG,FC_DIR,framework_components*,packages))
$(eval $(call FIND_DSP_PKG,XDAIS_DIR,xdais*,packages))
$(eval $(call FIND_DSP_PKG,XDC_DIR,xdc*,packages))
-INCDIR := $(TI_OCL_CGT_INSTALL)/include;$(TARGET_ROOTDIR)/usr/include;../../blis/install/c66x/include/blis/;$(OMP_DIR)/packages/ti/runtime/openmp;$(PDK_DIR)/packages
+
+# Pick the BLIS install tree whose headers go with the requested MEM_MODEL.
+# BLIS_INC stays undefined for any other value (including an unset MEM_MODEL).
+ifeq ($(MEM_MODEL),Large)
+BLIS_INC = ../../blis/install/c66x/include/blis/
+else ifeq ($(MEM_MODEL),Medium)
+BLIS_INC = ../../blis/install/shannon/include/blis/
+else ifeq ($(MEM_MODEL),Small)
+BLIS_INC = ../../blis/install/am57x/include/blis/
+#else ifeq ($(MEM_MODEL),Tiny)
+endif
+
+# INCDIR mixes ';'-separated paths with '-I'-prefixed ones; INCS below rewrites
+# each ';' into ' -I' to produce the final compiler flags.
+INCDIR := $(TI_OCL_CGT_INSTALL)/include;$(OMP_DIR)/packages/ti/runtime/openmp;$(PDK_INC)
INCDIR += -I$(FC_DIR)/packages
INCDIR += -I$(XDC_DIR)/packages
INCDIR += -I$(XDAIS_DIR)/packages
INCDIR += -I$(LIBARCH_DIR)/packages
+# Add the BLIS headers only when MEM_MODEL selected a tree: an empty BLIS_INC
+# would leave a bare "-I" that consumes the next compiler argument.
+ifneq ($(strip $(BLIS_INC)),)
+INCDIR += -I$(BLIS_INC)
+endif
INCS = -I. -I$(strip $(subst ;, -I,$(subst $(space),$(space),$(INCDIR))))
diff --git a/ticblas/src/ticblas.c b/ticblas/src/ticblas.c
index da4da28173f18c65b324ccd0c508978a84519719..888268b3430c6c07e2bcdc4815d3233235e85e70 100644 (file)
--- a/ticblas/src/ticblas.c
+++ b/ticblas/src/ticblas.c
}\r
} /* tiCblasInit */\r
\r
+/* Calls bli_init() and reports success.\r
+ *\r
+ * Takes no arguments. Always returns TICBLAS_SUCCESS; no failure of\r
+ * bli_init() is detected here. Presumably bli_init() sets up the BLIS\r
+ * library -- TODO confirm against the BLIS headers.\r
+ */\r
+int tiCblasNew(void)\r
+{\r
+ bli_init();\r
+\r
+ return(TICBLAS_SUCCESS);\r
+}\r
+\r
+/* Calls bli_finalize() and reports success.\r
+ *\r
+ * Takes no arguments. Always returns TICBLAS_SUCCESS; no failure of\r
+ * bli_finalize() is detected here. Presumably this is the counterpart of\r
+ * tiCblasNew() -- TODO confirm against the callers.\r
+ */\r
+int tiCblasDelete(void)\r
+{\r
+ bli_finalize();\r
+\r
+ return(TICBLAS_SUCCESS);\r
+}\r
+\r
/* Nothing after this line */\r
diff --git a/ticblas/ticblas.h b/ticblas/ticblas.h
index 25cd40ee9b98aa6c255b3f8393ba7ebe79b7dca8..2dff96d273a679555451282cbdd2e7ad2c921dae 100644 (file)
--- a/ticblas/ticblas.h
+++ b/ticblas/ticblas.h
*****************************************************************************/\r
#ifndef TICBLAS_H\r
#define TICBLAS_H\r
- \r
+\r
+#include <stddef.h> \r
+\r
#define TICBLAS_SUCCESS (0)\r
#define TICBLAS_INIT_ERROR (-1)\r
\r