summary | shortlog | log | commit | commitdiff | tree
raw | patch | inline | side by side (parent: f2eba74)
raw | patch | inline | side by side (parent: f2eba74)
author | Jianzhong Xu <xuj@ti.com> | |
Wed, 18 May 2016 13:53:09 +0000 (13:53 +0000) | ||
committer | Jianzhong Xu <xuj@ti.com> | |
Wed, 18 May 2016 13:53:09 +0000 (13:53 +0000) |
index 3d9ee061c56c025215e7967a469e99741a67370a..f9d03d4fb3d8f570ae6b4bc16b21c1828c74dcad 100644 (file)
endif
CPP_DEBUG = -g
-CPP_FLAGS = -D_LITTLE_ENDIAN -D__ARMv7 -DMEM_MODEL_${MEM_MODEL} -I../../cblas/include -I../../blis/install/arm/include/blis/ -I$(TI_OCL_INSTALL_DIR)/include -fopenmp
+CPP_FLAGS = -D_LITTLE_ENDIAN -D__ARMv7 -DMEM_MODEL_${MEM_MODEL} -I../../cblas/include -I../../blis/install/arm/include/blis/ -fopenmp
CL6X_FLAGS = $(INCS) --openmp --use_g2 -D$(TARGET) -DLIB_OPENCL
CLOCL_FLAGS =
OBJCOPY_ARGS=
diff --git a/src/ti/linalg/blasblisacc/src/ti_cblas_initfini.c b/src/ti/linalg/blasblisacc/src/ti_cblas_initfini.c
index 7ae2b566754e725a45c57421cdf70d1068e19c09..65963d314a9c41065980c76bc60d14e7df366dd7 100644 (file)
*****************************************************************************/
#include "ti_cblas_acc.h"
+#include "../../ticblas/ticblas.h"
#include <pthread.h>
#ifdef TI_CBLAS_FAT_BINARY
#include "ti_cblas_kernel.dsp_h"
#endif
-/* Global variables */
-#ifdef __cplusplus
+#define TI_CBLAS_INITFINI_SUCCESS 0
+#define TI_CBLAS_INITFINI_OCL_ERR 1
+#define TI_CBLAS_INITFINI_BLI_ERR 2
-#if 0
-Context ti_cblas_ocl_context;
-std::vector<Device> ti_cblas_ocl_devices;
-CommandQueue ti_cblas_ocl_Q;
-Program::Binaries ti_cblas_ocl_binary;
-Program ti_cblas_ocl_program;
-Kernel* ti_cblas_ocl_kernels[TI_CBLAS_NUM_KERNELS];
-#else
+/* Global variables */
Context* ti_cblas_ocl_context = NULL;
std::vector<Device>* ti_cblas_ocl_devices = NULL;
CommandQueue* ti_cblas_ocl_Q = NULL;
Program::Binaries* ti_cblas_ocl_binary = NULL;
Program* ti_cblas_ocl_program = NULL;
-#endif
-#else
-cl_context ti_cblas_ocl_context;
-cl_command_queue ti_cblas_ocl_Q;
-cl_program ti_cblas_ocl_program;
-cl_kernel ti_cblas_ocl_kernels[TI_CBLAS_NUM_KERNELS];
-#endif
int ti_cblas_init_done = 0; /* flag to check if init is complete */
int ti_cblas_disable_debug = 0; /* runtime toggle to disable debug */
int ti_cblas_offload = TI_CBLAS_OFFLOAD_SIZE;
fprintf(stderr, "ERROR: (%s,%d)\n", msg, code);
}
+/*
+ * Initializes BLIS on the ARM host (bli_init) and then on the DSP by running
+ * the "ocl_bli_init" kernel, which reports its status through a one-int
+ * buffer.
+ *
+ * Returns TI_CBLAS_INITFINI_SUCCESS when both steps succeed,
+ * TI_CBLAS_INITFINI_BLI_ERR when the DSP-side status differs from
+ * TICBLAS_SUCCESS, and TI_CBLAS_INITFINI_OCL_ERR when an OpenCL Error is
+ * caught. The kernel object is released on every path.
+ */
-#ifdef __cplusplus
extern "C"
-#endif
int ti_blis_init(void)
{
- int r_val = 1;
- TI_CBLAS_DEBUG_PRINT("Initializing BLIS ARM\n");
- bli_init();
- TI_CBLAS_DEBUG_PRINT("BLIS ARM initialized\n");
+ int r_val = TI_CBLAS_INITFINI_SUCCESS;
+
+ TI_CBLAS_DEBUG_PRINT("Initializing BLIS on ARM...\n");
+ /* Initialize BLIS on ARM */
+ bli_init();
+ TI_CBLAS_DEBUG_PRINT("BLIS initialized on ARM.\n");
-#ifdef __cplusplus
+ /* Initialize BLIS on DSP */
+ TI_CBLAS_DEBUG_PRINT("Initializing BLIS on DSP...\n");
Event e;
Kernel* __K;
-#else
- cl_kernel __K;
-#endif
+
__K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CGEMM_IDX, "ocl_bli_init");
-#ifdef __cplusplus
try
-#else
- cl_int err = CL_SUCCESS;
-#endif
{
- TI_CBLAS_DEBUG_PRINT("Initializing BLIS DSP\n");
+ /* Start from a value that can never equal TICBLAS_SUCCESS, so a status
+  * the kernel did not write back is reported as a failure instead of
+  * being read from an uninitialized int. */
+ int err_code = ~TICBLAS_SUCCESS;
+ Buffer buf_err(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code);
+ __K->setArg(0, buf_err);
-#ifdef __cplusplus
ti_cblas_ocl_Q->enqueueTask(*__K, 0, &e);
e.wait();
-#else
- cl_event e;
- err |= clEnqueueTask(ti_cblas_ocl_Q, __K, 0, 0, &e);
- TI_CBLAS_OCL_CHKERROR("clEnqueueTask",err);
- err |= clWaitForEvents(1, &e);
- TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
- err |= clReleaseEvent(e);
- TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-#endif
+ if(err_code != TICBLAS_SUCCESS) {
+ TI_CBLAS_DEBUG_PRINT("Error in offloaded ocl_bli_init with error code %d!\n", err_code);
+ r_val = TI_CBLAS_INITFINI_BLI_ERR;
+ }
+
ti_cblas_delete_kernel(__K);
- TI_CBLAS_DEBUG_PRINT("BLIS DSP initialized\n");
-
+ TI_CBLAS_DEBUG_PRINT("BLIS DSP initialization finished.\n");
}
-#ifdef __cplusplus
+
catch (Error err)
{
+ /* Release the kernel on the failure path as well. */
+ ti_cblas_delete_kernel(__K);
ti_cblas_error(err.what(),err.err());
- r_val = 1;
- return r_val;
+ r_val = TI_CBLAS_INITFINI_OCL_ERR;
}
-#endif
+
+ return r_val;
}
+/*
+ * Finalizes BLIS on the ARM host (bli_finalize) and then on the DSP by
+ * running the "ocl_bli_finalize" kernel, which reports its status through a
+ * one-int buffer.
+ *
+ * Returns TI_CBLAS_INITFINI_SUCCESS when both steps succeed,
+ * TI_CBLAS_INITFINI_BLI_ERR when the DSP-side status differs from
+ * TICBLAS_SUCCESS, and TI_CBLAS_INITFINI_OCL_ERR when an OpenCL Error is
+ * caught. The kernel object is released on every path.
+ */
-#ifdef __cplusplus
extern "C"
-#endif
int ti_blis_finalize(void)
{
- int r_val = 1;
+ int r_val = TI_CBLAS_INITFINI_SUCCESS;
+
+ TI_CBLAS_DEBUG_PRINT("Finalizing BLIS on ARM...\n");
+ /* Finalize BLIS on ARM */
bli_finalize();
+ TI_CBLAS_DEBUG_PRINT("BLIS finalized on ARM.\n");
-#ifdef __cplusplus
Event e;
Kernel* __K;
-#else
- cl_kernel __K;
-#endif
+
__K = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CGEMM_IDX, "ocl_bli_finalize");
-#ifdef __cplusplus
+
+ /* Finalize BLIS on DSP */
+ TI_CBLAS_DEBUG_PRINT("Finalizing BLIS on DSP...\n");
try
-#else
- cl_int err = CL_SUCCESS;
-#endif
{
+ /* Buffer creation and setArg live inside the try block (as in
+  * ti_blis_init) so that an OpenCL Error raised by either of them is
+  * handled here instead of escaping this extern "C" function.
+  * err_code starts from a value that can never equal TICBLAS_SUCCESS,
+  * so a status the kernel did not write back counts as a failure. */
+ int err_code = ~TICBLAS_SUCCESS;
+ Buffer buf_err(*ti_cblas_ocl_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(int), &err_code);
+ __K->setArg(0, buf_err);
+
-#ifdef __cplusplus
ti_cblas_ocl_Q->enqueueTask(*__K, 0, &e);
e.wait();
-#else
- cl_event e;
- err |= clEnqueueTask(ti_cblas_ocl_Q, __K, 0, 0, &e);
- TI_CBLAS_OCL_CHKERROR("clEnqueueTask",err);
- err |= clWaitForEvents(1, &e);
- TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
- err |= clReleaseEvent(e);
- TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
-#endif
+ if(err_code != TICBLAS_SUCCESS) {
+ TI_CBLAS_DEBUG_PRINT("Error in offloaded ocl_bli_finalize with error code %d!\n", err_code);
+ r_val = TI_CBLAS_INITFINI_BLI_ERR;
+ }
+
ti_cblas_delete_kernel(__K);
}
-#ifdef __cplusplus
+
catch (Error err)
{
+ /* Release the kernel on the failure path too; without this the
+  * kernel object leaks whenever the offload throws. */
+ ti_cblas_delete_kernel(__K);
ti_cblas_error(err.what(),err.err());
- r_val = 1;
- return r_val;
+ r_val = TI_CBLAS_INITFINI_OCL_ERR;
}
-#endif
+
+ return r_val;
}
#ifdef __cplusplus
#endif
+/*
+ * Tears down the library: finalizes BLIS (ARM and DSP) through
+ * ti_blis_finalize(), then destroys the pthread mutex/condition variable and
+ * deletes the OpenCL queue, program, binaries, device list and context.
+ *
+ * Returns TI_CBLAS_INITFINI_SUCCESS immediately when ti_cblas_init_done is 0;
+ * otherwise returns the status produced by ti_blis_finalize().
+ */
int ti_cblas_finalize(void)
{
-
- int r_val = 1;
- //printf("ti_cblas_finalize\n");
-
/* If ti_cblas_init_done is equal to 0,
* then we know that ti_cblas_init was not called,
* and so we can return early.
*/
- if(ti_cblas_init_done == 0)
- return 0;
+ if(ti_cblas_init_done == 0) {
+ return TI_CBLAS_INITFINI_SUCCESS;
+ }
- r_val = ti_blis_finalize();
+ int r_val = ti_blis_finalize();
/*Using same name as ti_cblas_init critical region. See notes in bli_init*/
#pragma omp critical (ti_cblas_init_critical)
{
- if (ti_cblas_init_done == 1)
+ // Destroy Pthread
+ pthread_mutex_destroy(&MUTEX);
+ pthread_cond_destroy (&CV);
+
+ //destroy Command queue, program, devices and context.
+ if(ti_cblas_ocl_Q != NULL)
+ {
+ delete(ti_cblas_ocl_Q);
+ ti_cblas_ocl_Q = NULL;
+ }
+ if(ti_cblas_ocl_program != NULL)
+ {
+ delete(ti_cblas_ocl_program);
+ ti_cblas_ocl_program = NULL;
+ }
+ if(ti_cblas_ocl_binary != NULL)
+ {
+ delete(ti_cblas_ocl_binary);
+ ti_cblas_ocl_binary = NULL;
+ }
+ if(ti_cblas_ocl_devices != NULL)
{
- // Destroy Pthread
- pthread_mutex_destroy(&MUTEX);
- pthread_cond_destroy (&CV);
-
- //destroy Command queue, program, devices and context.
- if(ti_cblas_ocl_Q != NULL)
- {
- delete(ti_cblas_ocl_Q);
- ti_cblas_ocl_Q = NULL;
- }
- if(ti_cblas_ocl_program != NULL)
- {
- delete(ti_cblas_ocl_program);
- ti_cblas_ocl_program = NULL;
- }
- if(ti_cblas_ocl_binary != NULL)
- {
- delete(ti_cblas_ocl_binary);
- ti_cblas_ocl_binary = NULL;
- }
- if(ti_cblas_ocl_devices != NULL)
- {
- delete(ti_cblas_ocl_devices);
- ti_cblas_ocl_devices = NULL;
- }
- if(ti_cblas_ocl_context != NULL)
- {
- delete(ti_cblas_ocl_context);
- ti_cblas_ocl_context = NULL;
- }
- ti_cblas_init_done = 0;
- r_val = 0;
+ delete(ti_cblas_ocl_devices);
+ ti_cblas_ocl_devices = NULL;
+ }
+ if(ti_cblas_ocl_context != NULL)
+ {
+ delete(ti_cblas_ocl_context);
+ ti_cblas_ocl_context = NULL;
}
+ /* Mark the library as uninitialized again. Without this reset a second
+  * call (for example the atexit-registered ti_cblas_auto_finalize() after
+  * an explicit ti_cblas_finalize()) would skip the early return above,
+  * run ti_blis_finalize() against the OpenCL objects deleted here and
+  * destroy the mutex/condition variable twice. */
+ ti_cblas_init_done = 0;
}
+
return r_val;
}
/* Argument-less, void wrapper around ti_cblas_finalize() (the form that
 * atexit() requires; this function is registered with atexit() during
 * initialization). A non-success status from ti_cblas_finalize() cannot be
 * returned to anyone here, so it is reported on stderr instead. */
void ti_cblas_auto_finalize(void)
{
- int i;
+ int r_val;
- i = ti_cblas_finalize();
- if (i != 0)
+ r_val = ti_cblas_finalize();
+ if (r_val != TI_CBLAS_INITFINI_SUCCESS)
{
- fprintf(stderr, "Error: ti_cblas_finalize failed\n");
+ fprintf(stderr, "Error: ti_cblas_finalize failed with error code %d!\n", r_val);
}
}
}
}
#endif
+
TI_CBLAS_DEBUG_PRINT("ti_cblas_init: Initializing OpenCL on first use..\n");
TI_CBLAS_PROFILE_START();
}
}
- /* 3-digit value: 012
- * Left-most digit => L1 (0)
- * Middle-digit => L2 (1)
- * Right-most => L3 (2)
- */
- TI_CBLAS_L1_OFFLOAD = ti_cblas_offload / 100;
- int tmp_offload = ti_cblas_offload % 100;
- TI_CBLAS_L2_OFFLOAD = tmp_offload / 10;
- TI_CBLAS_L3_OFFLOAD = tmp_offload % 10;
- TI_CBLAS_DEBUG_PRINT("BLAS Offload values: L1=%d, L2=%d, L3=%d\n",
+ /* 3-digit value: 012
+ * Left-most digit => L1 (0)
+ * Middle-digit => L2 (1)
+ * Right-most => L3 (2)
+ */
+ TI_CBLAS_L1_OFFLOAD = ti_cblas_offload / 100;
+ int tmp_offload = ti_cblas_offload % 100;
+ TI_CBLAS_L2_OFFLOAD = tmp_offload / 10;
+ TI_CBLAS_L3_OFFLOAD = tmp_offload % 10;
+ TI_CBLAS_DEBUG_PRINT("BLAS Offload values: L1=%d, L2=%d, L3=%d\n",
TI_CBLAS_L1_OFFLOAD, TI_CBLAS_L2_OFFLOAD, TI_CBLAS_L3_OFFLOAD);
- if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE)) {
- TI_CBLAS_ERROR_EXIT("Size-based offload NOT supported for BLAS Level 1 yet.\n");
- }
- if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE)) {
- TI_CBLAS_ERROR_EXIT("Size-based offload NOT supported for BLAS Level 2 yet.\n");
- }
-
-
+ if ((TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE)) {
+ TI_CBLAS_ERROR_EXIT("Size-based offload NOT supported for BLAS Level 1 yet.\n");
+ }
+ if ((TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE)) {
+ TI_CBLAS_ERROR_EXIT("Size-based offload NOT supported for BLAS Level 2 yet.\n");
+ }
/*------------------------------------------------------------------------
* Read the offline compiled kernel module
*-----------------------------------------------------------------------*/
- TI_CBLAS_DEBUG_PRINT("Reading Kernels\n");
- const unsigned char* bin;
+ TI_CBLAS_DEBUG_PRINT("Reading Kernels\n");
+ const unsigned char* bin;
#ifdef TI_CBLAS_FAT_BINARY
bin = (unsigned char *)ti_cblas_kernel_dsp_bin;
const size_t bin_length = ti_cblas_kernel_dsp_bin_len;
/* OpenCL init */
TI_CBLAS_DEBUG_PRINT("Initializing OpenCL\n");
#ifdef __cplusplus
- ti_cblas_ocl_context = new Context(CL_DEVICE_TYPE_ACCELERATOR);
- ti_cblas_ocl_devices = new std::vector<Device> (ti_cblas_ocl_context->getInfo<CL_CONTEXT_DEVICES>());
- ti_cblas_ocl_binary = new Program::Binaries(1, std::make_pair(bin, bin_length));
- ti_cblas_ocl_program = new Program(*ti_cblas_ocl_context, *ti_cblas_ocl_devices, *ti_cblas_ocl_binary);
- ti_cblas_ocl_program->build(*ti_cblas_ocl_devices);
- ti_cblas_ocl_Q = new CommandQueue(*ti_cblas_ocl_context, ti_cblas_ocl_devices[0][0], CL_QUEUE_PROFILING_ENABLE);
+ ti_cblas_ocl_context = new Context(CL_DEVICE_TYPE_ACCELERATOR);
+ ti_cblas_ocl_devices = new std::vector<Device> (ti_cblas_ocl_context->getInfo<CL_CONTEXT_DEVICES>());
+ ti_cblas_ocl_binary = new Program::Binaries(1, std::make_pair(bin, bin_length));
+ ti_cblas_ocl_program = new Program(*ti_cblas_ocl_context, *ti_cblas_ocl_devices, *ti_cblas_ocl_binary);
+ ti_cblas_ocl_program->build(*ti_cblas_ocl_devices);
+ ti_cblas_ocl_Q = new CommandQueue(*ti_cblas_ocl_context, ti_cblas_ocl_devices[0][0], CL_QUEUE_PROFILING_ENABLE);
#else
cl_int err;
cl_device_id device;
TI_CBLAS_DEBUG_PRINT("OpenCL initialized\n");
TI_CBLAS_DEBUG_PRINT("Initializing Pthreads\n");
- /* Initializing pthreads */
- pthread_cond_init (&CV, 0);
- pthread_mutex_init(&MUTEX, 0);
- TI_CBLAS_DEBUG_PRINT("Pthreads initialized\n");
-
- TI_CBLAS_DEBUG_PRINT("Initializing BLIS\n");
- ti_blis_init();
- TI_CBLAS_DEBUG_PRINT("BLIS initialized\n");
-
- atexit(ti_cblas_auto_finalize);
-
- TI_CBLAS_PROFILE_REPORT(" Initialization took %8.2f us\n", (float) clock_diff);
- ti_cblas_init_done = 1;
- TI_CBLAS_DEBUG_PRINT("ti_cblas_init: Finished OpenCL initialization\n");
- } //end of !ti_cblas_init_done
- } // End of critical section
- return;
+ /* Initializing pthreads */
+ pthread_cond_init (&CV, 0);
+ pthread_mutex_init(&MUTEX, 0);
+ TI_CBLAS_DEBUG_PRINT("Pthreads initialized\n");
+ TI_CBLAS_DEBUG_PRINT("Initializing BLIS\n");
+ if(ti_blis_init() == TI_CBLAS_INITFINI_SUCCESS) {
+ TI_CBLAS_DEBUG_PRINT("BLIS initialized\n");\
+ }
+ else {
+ TI_CBLAS_DEBUG_PRINT("BLIS NOT initialized!\n");\
+ }
+
+ atexit(ti_cblas_auto_finalize);
+
+ TI_CBLAS_PROFILE_REPORT(" Initialization took %8.2f us\n", (float) clock_diff);
+ ti_cblas_init_done = 1;
+ TI_CBLAS_DEBUG_PRINT("ti_cblas_init: Finished OpenCL initialization\n");
+ } //end of !ti_cblas_init_done
+
+ } // End of critical section
+
+ return;
}
diff --git a/src/ti/linalg/blasblisacc/src/ti_cblas_kernel.cl b/src/ti/linalg/blasblisacc/src/ti_cblas_kernel.cl
index 0a3736214862c210befc6c228a76f23c6a2cb283..8fa16d0e0071be262aa33f76ce8f46f18bdba3f0 100644 (file)
enum CBLAS_SIDE {CblasLeft=141, CblasRight=142};
-void ti_bli_init_dsp(void);
-kernel void ocl_bli_init(void)
-{ ti_bli_init_dsp(); }
-void ti_bli_finalize_dsp(void);
-kernel void ocl_bli_finalize(void)
-{ ti_bli_finalize_dsp(); }
/* OpenCL entry point for DSP-side BLIS initialization: calls
 * ti_bli_init_dsp() and stores its return value in *err_code so the
 * enqueuing side can inspect the status. */
+int ti_bli_init_dsp(void);
+kernel void ocl_bli_init(global int *err_code)
+{
+ *err_code = ti_bli_init_dsp();
+}
/* OpenCL entry point for DSP-side BLIS finalization: calls
 * ti_bli_finalize_dsp() and stores its return value in *err_code. */
+int ti_bli_finalize_dsp(void);
+kernel void ocl_bli_finalize(global int *err_code)
+{
+ *err_code = ti_bli_finalize_dsp();
+}
void cblas_caxpy_facade(const int N, global const void *alpha, global const void *X, const int incX, global void *Y, const int incY);
kernel void ocl_cblas_caxpy(const int N, global const void *alpha, global const void *X, const int incX, global void *Y, const int incY)
diff --git a/src/ti/linalg/blasblisacc/src/ti_cblas_mem_config.c b/src/ti/linalg/blasblisacc/src/ti_cblas_mem_config.c
index 47ff0c029321f8c37d0a21c4f62065f16e1feeb8..a506eceaf31bc1771474957c54fc41296f0624be 100644 (file)
/*==============================================================================
* This function initializes BLIS before the first CBLAS call is made.
* It forwards to tiCblasNew() and returns that call's status code unchanged.
*============================================================================*/
-void ti_bli_init_dsp(void)
+int ti_bli_init_dsp(void)
{
- tiCblasNew();
+ return tiCblasNew();
}
/*==============================================================================
* This function frees all memory allocated by ti_bli_init_dsp.
* It forwards to tiCblasDelete() and returns that call's status code unchanged.
*============================================================================*/
-void ti_bli_finalize_dsp(void)
+int ti_bli_finalize_dsp(void)
{
- tiCblasDelete();
+ return tiCblasDelete();
}
/* Nothing after this line */
index be509d9b4fd8fed7d9a38cc38e5e2e0733d6ad9b..f78191c5a36091ae82dbbbcae294d91ef0bf4356 100755 (executable)
* must transfer rows/columns that are outside the partition so that the packing
* routine can "densify" or symmetrize the panel.
*/
- if(!bli_obj_root_is_general( *p ) && bli_obj_intersects_diag( *p ) && !(bli_is_triangular( struc_p )))
+ if(!bli_obj_root_is_general( *p ) && bli_obj_intersects_diag( *p ) && !(bli_is_triangular( struc_p )) && bli_is_herm_or_symm(struc_p))
{
//printf("not general and diag intersects\n");
- if(bli_is_herm_or_symm( struc_p ))
- {
if(bli_is_lower(uplo_p))
{
//printf("lower diagoff %d m_p %d, n_p %d offm_a %d, offn_a %d\n", diagoff_p, m_p, n_p, offm_a, offn_a);
cs_p = 1;
}
}
- else if (bli_is_upper(uplo_p))
+ else
{
//printf("upper diagoff %d\n", diagoff_p);
m_transfer = bli_max(m_p, m_p + diagoff_p);
rs_p = 1;
cs_p = m_transfer;
}
- else if (bli_is_row_stored( rs_a , cs_a ))
+ else
{
rs_p = n_transfer;
cs_p = 1;
}
}
- }
}
else
{
rs_p = 1;
cs_p = m_p;
}
- else if (bli_is_row_stored( rs_a , cs_a ))
+ else
{
rs_p = n_p;
cs_p = 1;
{
bli_dma_channel_acquire(&(p->emt_handle), lib_get_coreID());
if(p->emt_handle == NULL)
- printf("DMAM_INIT Failed to alloc edma handle CoreID %d %x\n", lib_get_coreID(), p->emt_handle);
+ printf("DMAM_INIT Failed to alloc edma handle CoreID %d.\n", lib_get_coreID());
}
}