/******************************************************************************
 * Copyright (c) 2013-2015, Texas Instruments Incorporated - http://www.ti.com/
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions are met:
 *       * Redistributions of source code must retain the above copyright
 *         notice, this list of conditions and the following disclaimer.
 *       * Redistributions in binary form must reproduce the above copyright
 *         notice, this list of conditions and the following disclaimer in the
 *         documentation and/or other materials provided with the distribution.
 *       * Neither the name of Texas Instruments Incorporated nor the
 *         names of its contributors may be used to endorse or promote products
 *         derived from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 *   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 *   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 *   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 *   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 *   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 *   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 *   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 *   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 *   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 *   THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
#include "ti_cblas_acc.h"
/* NOTE(review): the header name of this directive was missing in the reviewed
 * copy; <pthread.h> is assumed because this file uses pthread_mutex_* and
 * pthread_cond_* -- confirm against the repository. */
#include <pthread.h>

#ifdef TI_CBLAS_FAT_BINARY
#include "ti_cblas_kernel.dsp_h"
#endif

/* Global variables */
/* NOTE(review): template arguments (std::vector<Device>, and
 * getInfo<CL_CONTEXT_DEVICES> further down) were missing in the reviewed copy
 * and are reconstructed from how the objects are used -- confirm. */
#ifdef __cplusplus
#if 0
Context              ti_cblas_ocl_context;
std::vector<Device>  ti_cblas_ocl_devices;
CommandQueue         ti_cblas_ocl_Q;
Program::Binaries    ti_cblas_ocl_binary;
Program              ti_cblas_ocl_program;
Kernel*              ti_cblas_ocl_kernels[TI_CBLAS_NUM_KERNELS];
#else
/* Heap-allocated in ti_cblas_init() and released in ti_cblas_finalize(). */
Context*             ti_cblas_ocl_context = NULL;
std::vector<Device>* ti_cblas_ocl_devices = NULL;
CommandQueue*        ti_cblas_ocl_Q       = NULL;
Program::Binaries*   ti_cblas_ocl_binary  = NULL;
Program*             ti_cblas_ocl_program = NULL;
#endif
#else
cl_context       ti_cblas_ocl_context;
cl_command_queue ti_cblas_ocl_Q;
cl_program       ti_cblas_ocl_program;
cl_kernel        ti_cblas_ocl_kernels[TI_CBLAS_NUM_KERNELS];
#endif

int ti_cblas_init_done = 0;     /* flag to check if init is complete */
int ti_cblas_disable_debug = 0; /* runtime toggle to disable debug */
int ti_cblas_offload = TI_CBLAS_OFFLOAD_SIZE;
int ti_cblas_kernel_valid[TI_CBLAS_NUM_KERNELS];
int TI_CBLAS_L1_OFFLOAD = TI_CBLAS_OFFLOAD_NONE;
int TI_CBLAS_L2_OFFLOAD = TI_CBLAS_OFFLOAD_NONE;
int TI_CBLAS_L3_OFFLOAD = TI_CBLAS_OFFLOAD_NONE;

/* Condition variable / mutex pair used by ti_cblas_mem_alloc() and
 * ti_cblas_mem_free(); initialized in ti_cblas_init(). */
pthread_cond_t  CV;
pthread_mutex_t MUTEX;

/* Print an error message together with its numeric code on stderr. */
void ti_cblas_error(const char* msg, int code)
{
    fprintf(stderr, "ERROR: (%s,%d)\n", msg, code);
}

/* Create the kernel named 'kernel_name', enqueue it as a single task on the
 * global command queue, wait for it to complete and release the kernel.
 *
 * Returns 0 on success. In the C++ build a cl::Error is reported through
 * ti_cblas_error() and 1 is returned; the kernel is released on that path too,
 * and no exception leaves this function (its callers are extern "C").
 */
static int ti_cblas_run_dsp_task(const char *kernel_name)
{
#ifdef __cplusplus
    Kernel *kernel = NULL;

    try
    {
        Event e;

        /* Created inside the try block so that a failure to build the kernel
         * is reported like any other OpenCL error. */
        kernel = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CGEMM_IDX, kernel_name);
        ti_cblas_ocl_Q->enqueueTask(*kernel, 0, &e);
        e.wait();
        ti_cblas_delete_kernel(kernel);
        kernel = NULL;
    }
    catch (const Error &err)
    {
        ti_cblas_error(err.what(), err.err());
        ti_cblas_delete_kernel(kernel); /* accepts NULL */
        return 1;
    }
    return 0;
#else
    cl_int    err = CL_SUCCESS;
    cl_event  e;
    cl_kernel kernel = ti_cblas_get_kernel(TI_CBLAS_CBLAS_CGEMM_IDX, kernel_name);

    /* TI_CBLAS_OCL_CHKERROR is defined outside this file; presumably it
     * reports and/or aborts on a non-success code -- confirm. */
    err |= clEnqueueTask(ti_cblas_ocl_Q, kernel, 0, 0, &e);
    TI_CBLAS_OCL_CHKERROR("clEnqueueTask",err);
    err |= clWaitForEvents(1, &e);
    TI_CBLAS_OCL_CHKERROR("clWaitForEvents",err);
    err |= clReleaseEvent(e);
    TI_CBLAS_OCL_CHKERROR("clReleaseEvent",err);
    ti_cblas_delete_kernel(kernel);
    return 0;
#endif
}

/* Initialize BLIS on the host (bli_init) and then on the DSP by running the
 * "ocl_bli_init" kernel.
 *
 * Returns 0 on success, 1 if the DSP-side task failed.
 */
#ifdef __cplusplus
extern "C"
#endif
int ti_blis_init(void)
{
    int r_val;

    TI_CBLAS_DEBUG_PRINT("Initializing BLIS ARM\n");
    bli_init();
    TI_CBLAS_DEBUG_PRINT("BLIS ARM initialized\n");

    TI_CBLAS_DEBUG_PRINT("Initializing BLIS DSP\n");
    r_val = ti_cblas_run_dsp_task("ocl_bli_init");
    if (r_val == 0)
    {
        TI_CBLAS_DEBUG_PRINT("BLIS DSP initialized\n");
    }

    return r_val;
}

/* Finalize BLIS on the host (bli_finalize) and then on the DSP by running the
 * "ocl_bli_finalize" kernel.
 *
 * Returns 0 on success, 1 if the DSP-side task failed.
 */
#ifdef __cplusplus
extern "C"
#endif
int ti_blis_finalize(void)
{
    bli_finalize();
    return ti_cblas_run_dsp_task("ocl_bli_finalize");
}

/* Tear down everything ti_cblas_init() set up: BLIS, the pthread primitives
 * and the OpenCL queue/program/binary/devices/context.
 *
 * Returns 0 if nothing was initialized or if teardown succeeded; returns the
 * non-zero status of ti_blis_finalize() if the BLIS shutdown failed (the
 * OpenCL and pthread resources are still released in that case).
 */
#ifdef __cplusplus
extern "C"
#endif
int ti_cblas_finalize(void)
{
    int r_val;

    /* If ti_cblas_init_done is equal to 0,
     * then we know that ti_cblas_init was not called,
     * and so we can return early.
     */
    if (ti_cblas_init_done == 0)
        return 0;

    r_val = ti_blis_finalize();

    /* Using same name as ti_cblas_init critical region. See notes in bli_init */
#pragma omp critical (ti_cblas_init_critical)
    {
        if (ti_cblas_init_done == 1)
        {
            /* Destroy Pthread */
            pthread_mutex_destroy(&MUTEX);
            pthread_cond_destroy (&CV);

            /* Destroy command queue, program, devices and context. */
#ifdef __cplusplus
            /* delete on a null pointer is a no-op, so no guards are needed. */
            delete ti_cblas_ocl_Q;
            ti_cblas_ocl_Q = NULL;

            delete ti_cblas_ocl_program;
            ti_cblas_ocl_program = NULL;

            delete ti_cblas_ocl_binary;
            ti_cblas_ocl_binary = NULL;

            delete ti_cblas_ocl_devices;
            ti_cblas_ocl_devices = NULL;

            delete ti_cblas_ocl_context;
            ti_cblas_ocl_context = NULL;
#else
            if (ti_cblas_ocl_Q != NULL)
            {
                clReleaseCommandQueue(ti_cblas_ocl_Q);
                ti_cblas_ocl_Q = NULL;
            }
            if (ti_cblas_ocl_program != NULL)
            {
                clReleaseProgram(ti_cblas_ocl_program);
                ti_cblas_ocl_program = NULL;
            }
            if (ti_cblas_ocl_context != NULL)
            {
                clReleaseContext(ti_cblas_ocl_context);
                ti_cblas_ocl_context = NULL;
            }
#endif
            ti_cblas_init_done = 0;
        }
    }

    return r_val;
}

/* atexit() hook registered by ti_cblas_init(): finalizes the library and
 * reports a failure on stderr. */
void ti_cblas_auto_finalize(void)
{
    int status = ti_cblas_finalize();

    if (status != 0)
    {
        fprintf(stderr, "Error: ti_cblas_finalize failed\n");
    }
}

/* One-time library initialization; calls after the first successful one are
 * no-ops until ti_cblas_finalize() runs.
 *
 * Parses the TI_CBLAS_NO_DEBUG (debug builds only) and TI_CBLAS_OFFLOAD
 * environment variables, loads the pre-compiled kernel binary, creates the
 * OpenCL context/program/queue, initializes the pthread primitives and BLIS,
 * and registers ti_cblas_auto_finalize() with atexit().
 */
void ti_cblas_init(void)
{
#pragma omp critical (ti_cblas_init_critical)
    {
        /* Add code for interception */
        if (!ti_cblas_init_done)
        {
#ifdef TI_CBLAS_DEBUG
            char *no_debug_env = getenv("TI_CBLAS_NO_DEBUG");
            if (no_debug_env)
            {
                if (atoi(no_debug_env) > 0)
                {
                    ti_cblas_disable_debug = 1;
                }
            }
#endif
            TI_CBLAS_DEBUG_PRINT("ti_cblas_init: Initializing OpenCL on first use..\n");
            TI_CBLAS_PROFILE_START();

            /* check environment variables */
            const char *offload_env = getenv("TI_CBLAS_OFFLOAD");
            if (!offload_env)
            {
                TI_CBLAS_DEBUG_PRINT("Using build time default for offload: TI_CBLAS_OFFLOAD=%s\n", TI_CBLAS_OFFLOAD);
                offload_env = TI_CBLAS_OFFLOAD;
            }
            else
            {
                TI_CBLAS_DEBUG_PRINT("Using runtime override for offloads: TI_CBLAS_OFFLOAD=%s\n", offload_env);
            }

            if (offload_env)
            {
                ti_cblas_offload = atoi(offload_env);
                if (ti_cblas_offload == TI_CBLAS_OFFLOAD_NONE)
                {
                    TI_CBLAS_DEBUG_PRINT("Disabling all offloads\n");
                }
            }

            /* 3-digit value: 012
             * Left-most digit => L1 (0)
             * Middle-digit    => L2 (1)
             * Right-most      => L3 (2)
             */
            TI_CBLAS_L1_OFFLOAD = ti_cblas_offload / 100;
            int tmp_offload = ti_cblas_offload % 100;
            TI_CBLAS_L2_OFFLOAD = tmp_offload / 10;
            TI_CBLAS_L3_OFFLOAD = tmp_offload % 10;
            TI_CBLAS_DEBUG_PRINT("BLAS Offload values: L1=%d, L2=%d, L3=%d\n", TI_CBLAS_L1_OFFLOAD, TI_CBLAS_L2_OFFLOAD, TI_CBLAS_L3_OFFLOAD);

            if (TI_CBLAS_L1_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE)
            {
                TI_CBLAS_ERROR_EXIT("Size-based offload NOT supported for BLAS Level 1 yet.\n");
            }
            if (TI_CBLAS_L2_OFFLOAD == TI_CBLAS_OFFLOAD_SIZE)
            {
                TI_CBLAS_ERROR_EXIT("Size-based offload NOT supported for BLAS Level 2 yet.\n");
            }

            /*------------------------------------------------------------------------
             * Read the offline compiled kernel module
             *-----------------------------------------------------------------------*/
            TI_CBLAS_DEBUG_PRINT("Reading Kernels\n");
            const unsigned char* bin;
#ifdef TI_CBLAS_FAT_BINARY
            /* Kernel image is linked into the library. */
            bin = (unsigned char *)ti_cblas_kernel_dsp_bin;
            const size_t bin_length = ti_cblas_kernel_dsp_bin_len;
#else
            /* Kernel image is read from the current working directory. */
            const char binary[] = "./ti_cblas_kernel.out";
            unsigned int bin_length;
#ifdef __cplusplus
            bin_length = ocl_read_binary(binary, (char*&)bin);
#else
            FILE *fp = fopen(binary, "rb");
            if (!fp)
            {
                TI_CBLAS_ERROR_EXIT("Could not open OpenCL pre-compiled binary %s for reading\n", binary);
            }
            struct stat fileinfo;
            /* Without this check st_size would be read uninitialized when
             * stat() fails. */
            if (stat(binary, &fileinfo) != 0)
            {
                TI_CBLAS_ERROR_EXIT("Could not stat OpenCL pre-compiled binary %s\n", binary);
            }
            bin_length = fileinfo.st_size;
            bin = (const unsigned char *)malloc(bin_length);
            if (!bin)
            {
                TI_CBLAS_ERROR_EXIT("Could not malloc of size %d for reading OpenCL binary\n", bin_length);
            }
            if (fread((char *)bin, bin_length, 1, fp) != 1)
            {
                TI_CBLAS_ERROR_EXIT("Could not read %d bytes of OpenCL binary\n", bin_length);
            }
            fclose(fp);
#endif /* cplusplus */
#endif /* FAT_BINARY */

            /* OpenCL init */
            TI_CBLAS_DEBUG_PRINT("Initializing OpenCL\n");
#ifdef __cplusplus
            ti_cblas_ocl_context = new Context(CL_DEVICE_TYPE_ACCELERATOR);
            ti_cblas_ocl_devices = new std::vector<Device> (ti_cblas_ocl_context->getInfo<CL_CONTEXT_DEVICES>());
            ti_cblas_ocl_binary  = new Program::Binaries(1, std::make_pair(bin, bin_length));
            ti_cblas_ocl_program = new Program(*ti_cblas_ocl_context, *ti_cblas_ocl_devices, *ti_cblas_ocl_binary);
            ti_cblas_ocl_program->build(*ti_cblas_ocl_devices);
            /* The queue is created on the first device of the context. */
            ti_cblas_ocl_Q       = new CommandQueue(*ti_cblas_ocl_context, (*ti_cblas_ocl_devices)[0], CL_QUEUE_PROFILING_ENABLE);
#else
            cl_int err;
            cl_device_id device;
            /* Create an in-order command queue by default */
            int queue_flags = 0;
#ifdef TI_CBLAS_PROFILE
            queue_flags |= CL_QUEUE_PROFILING_ENABLE;
#endif
            ti_cblas_ocl_context = clCreateContextFromType(0,CL_DEVICE_TYPE_ACCELERATOR,0,0,&err);
            TI_CBLAS_OCL_CHKERROR("clCreateContextFromType",err);
            err = clGetDeviceIDs(0,CL_DEVICE_TYPE_ACCELERATOR,1,&device,0);
            TI_CBLAS_OCL_CHKERROR("clGetDeviceIDs",err);
            ti_cblas_ocl_Q = clCreateCommandQueue(ti_cblas_ocl_context, device, queue_flags, &err);
            TI_CBLAS_OCL_CHKERROR("clCreateCommandQueue",err);
            ti_cblas_ocl_program = clCreateProgramWithBinary(ti_cblas_ocl_context, 1, &device, &bin_length, &bin, NULL, &err);
            TI_CBLAS_OCL_CHKERROR("clCreateProgramWithBinary",err);
            const char *compile_options = "";
            err = clBuildProgram(ti_cblas_ocl_program, 1, &device, compile_options, 0, 0);
            TI_CBLAS_OCL_CHKERROR("clBuildProgram",err);
#endif

            /* The file buffer is released once the program has been built. */
#ifndef TI_CBLAS_FAT_BINARY
#ifdef __cplusplus
            delete [] bin;
#else
            free((char*)bin);
#endif
#endif /* FAT_BINARY */
            TI_CBLAS_DEBUG_PRINT("OpenCL initialized\n");

            TI_CBLAS_DEBUG_PRINT("Initializing Pthreads\n");
            /* Initializing pthreads */
            pthread_cond_init (&CV, 0);
            pthread_mutex_init(&MUTEX, 0);
            TI_CBLAS_DEBUG_PRINT("Pthreads initialized\n");

            TI_CBLAS_DEBUG_PRINT("Initializing BLIS\n");
            /* A BLIS failure is already reported by ti_blis_init(); init
             * still completes, only the debug trace differs. */
            if (ti_blis_init() == 0)
            {
                TI_CBLAS_DEBUG_PRINT("BLIS initialized\n");
            }
            else
            {
                TI_CBLAS_DEBUG_PRINT("BLIS initialization failed\n");
            }

            atexit(ti_cblas_auto_finalize);
            TI_CBLAS_PROFILE_REPORT("  Initialization took %8.2f us\n", (float) clock_diff);
            ti_cblas_init_done = 1;
            TI_CBLAS_DEBUG_PRINT("ti_cblas_init: Finished OpenCL initialization\n");
        } /* end of !ti_cblas_init_done */
    } /* End of critical section */
    return;
}

/* Release a buffer obtained from ti_cblas_mem_alloc() and wake every thread
 * currently blocked in ti_cblas_mem_alloc() so it can retry. */
void ti_cblas_mem_free(void *ptr)
{
    pthread_mutex_lock(&MUTEX);
    __free_msmc(ptr);
    pthread_cond_broadcast(&CV);
    pthread_mutex_unlock(&MUTEX);
}

/* Allocate 'size' bytes with __malloc_msmc(). When the allocation fails the
 * caller blocks on CV until ti_cblas_mem_free() broadcasts, then retries;
 * the function therefore only returns with a non-null pointer. */
void *ti_cblas_mem_alloc(size_t size)
{
    void *ptr;

    pthread_mutex_lock(&MUTEX);
    /*-------------------------------------------------------------------------
     * Loop in case of false signal after broadcast.
     *------------------------------------------------------------------------*/
    while ((ptr = __malloc_msmc(size)) == 0)
    {
        pthread_cond_wait(&CV, &MUTEX);
    }
    pthread_mutex_unlock(&MUTEX);

    return ptr;
}

/* Create and return a new kernel handle for the function named 'fname' in the
 * global OpenCL program. A fresh handle is created on every call (nothing is
 * cached), so the caller must release it with ti_cblas_delete_kernel().
 * 'idx' is currently unused.
 */
#ifdef __cplusplus
Kernel*
#else
cl_kernel
#endif
ti_cblas_get_kernel(int idx, const char *fname)
{
    (void)idx; /* kept for interface compatibility */

#ifdef __cplusplus
    return new Kernel(*ti_cblas_ocl_program, fname);
#else
    cl_int err;
    cl_kernel kernel = clCreateKernel(ti_cblas_ocl_program,fname,&err);
    TI_CBLAS_OCL_CHKERROR("clCreateKernel",err);
    return kernel;
#endif
}

/* Release a kernel handle obtained from ti_cblas_get_kernel().
 * Always returns 0. In the C++ build a NULL handle is accepted. */
#ifdef __cplusplus
int ti_cblas_delete_kernel(Kernel* K)
#else
int ti_cblas_delete_kernel(cl_kernel K)
#endif
{
#ifdef __cplusplus
    delete K; /* no-op for NULL */
#else
    clReleaseKernel(K);
#endif
    return 0;
}