Enqueue multiple frames at device side
[tidl/tidl-api.git] / tidl_api / src / ocl_device.cpp
1 /******************************************************************************
2  * Copyright (c) 2017-2018  Texas Instruments Incorporated - http://www.ti.com/
3  *   All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *     * Redistributions of source code must retain the above copyright
8  *       notice, this list of conditions and the following disclaimer.
9  *     * Redistributions in binary form must reproduce the above copyright
10  *       notice, this list of conditions and the following disclaimer in the
11  *       documentation and/or other materials provided with the distribution.
12  *     * Neither the name of Texas Instruments Incorporated nor the
13  *       names of its contributors may be used to endorse or promote products
14  *       derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
26  * THE POSSIBILITY OF SUCH DAMAGE.
27  *****************************************************************************/
30 #include <cstdlib>
31 #include <cassert>
32 using std::size_t;
34 #include <iostream>
36 #include "ocl_device.h"
37 #include "ocl_util.h"
38 #include "trace.h"
39 #include "../dsp/ocl_wrapper.dsp_h"
41 using namespace tidl;
43 static const char* error2string(cl_int err);
44 static void        errorCheck(cl_int ret, int line);
46 Device::Device(cl_device_type t, const DeviceIds& ids):
47                 device_type_m(t), device_ids_m(ids)
48 {
49     TRACE::print("\tOCL Device: %s created\n",
50               device_type_m == CL_DEVICE_TYPE_ACCELERATOR ? "DSP" :
51               device_type_m == CL_DEVICE_TYPE_CUSTOM ? "EVE" : "Unknown");
53     for (int i = 0; i < MAX_DEVICES; i++)
54         queue_m[i] = nullptr;
56 }
58 DspDevice::DspDevice(const DeviceIds& ids, const std::string &binary_filename):
59               Device(CL_DEVICE_TYPE_ACCELERATOR, ids)
60 {
61     cl_uint num_devices_found;
62     cl_device_id device_ids[MAX_DEVICES];
64     cl_int errcode = clGetDeviceIDs(0,               // platform
65                              device_type_m,          // device_type
66                              MAX_DEVICES,            // num_entries
67                              device_ids,             // devices
68                              &num_devices_found);    // num_devices
69     errorCheck(errcode, __LINE__);
71     if (num_devices_found != 1)
72         throw Exception("OpenCL DSP device not found",
73                         __FILE__, __FUNCTION__, __LINE__);
75     cl_int num_compute_units;
76     errcode = clGetDeviceInfo(device_ids[0],
77                               CL_DEVICE_MAX_COMPUTE_UNITS,
78                               sizeof(num_compute_units),
79                               &num_compute_units,
80                               nullptr);
82     if (num_compute_units == 1)
83     {
84         context_m = clCreateContextFromType(0,              // properties
85                                             device_type_m,  // device_type
86                                             0,              // pfn_notify
87                                             0,              // user_data
88                                             &errcode);
89         errorCheck(errcode, __LINE__);
91         // Queue 0 on device 0
92         queue_m[0] = clCreateCommandQueue(context_m,
93                                           device_ids[0],
94                                           CL_QUEUE_PROFILING_ENABLE,
95                                           &errcode);
96         errorCheck(errcode, __LINE__);
97         BuildProgramFromBinary(binary_filename, device_ids, 1);
98     }
99     else
100     {
101         const cl_uint NUM_SUB_DEVICES = 2;
103         // Create 2 sub-device's, each consisting of a C66x DSP
104         cl_device_partition_property properties[3] =
105                                         { CL_DEVICE_PARTITION_EQUALLY, 1, 0 };
107         // Query the number of sub-devices that can be created
108         cl_uint n_sub_devices = 0;
109         errcode = clCreateSubDevices(device_ids[0],      // in_device
110                                      properties,         // properties
111                                      0,                  // num_devices
112                                      NULL,               // out_devices
113                                      &n_sub_devices);    // num_devices_ret
114         errorCheck(errcode, __LINE__);
116         assert(n_sub_devices == NUM_SUB_DEVICES);
118         // Create the sub-devices
119         cl_device_id sub_devices[NUM_SUB_DEVICES] = {0, 0};
120         errcode = clCreateSubDevices(device_ids[0],        // in_device
121                                      properties,           // properties
122                                      n_sub_devices,        // num_devices
123                                      sub_devices,          // out_devices
124                                      nullptr);             // num_devices_ret
125         errorCheck(errcode, __LINE__);
127         // Create a context containing the sub-devices
128         context_m = clCreateContext(NULL,               // properties
129                                     NUM_SUB_DEVICES,    // num_devices
130                                     sub_devices,        // devices
131                                     NULL,               // pfn_notify
132                                     NULL,               // user_data
133                                     &errcode);          // errcode_ret
134         errorCheck(errcode, __LINE__);
136         // Create queues to each sub-device
137         for (auto id : device_ids_m)
138         {
139             int index = static_cast<int>(id);
140             queue_m[index] = clCreateCommandQueue(context_m,
141                                           sub_devices[index],
142                                           CL_QUEUE_PROFILING_ENABLE,
143                                           &errcode);
144             errorCheck(errcode, __LINE__);
145         }
147         BuildProgramFromBinary(binary_filename, sub_devices, NUM_SUB_DEVICES);
148     }
150     errcode = clGetDeviceInfo(device_ids[0],
151                                 CL_DEVICE_MAX_CLOCK_FREQUENCY,
152                                 sizeof(freq_in_mhz_m),
153                                 &freq_in_mhz_m,
154                                 nullptr);
155     errorCheck(errcode, __LINE__);
159 EveDevice::EveDevice(const DeviceIds& ids, const std::string &kernel_names):
160             Device(CL_DEVICE_TYPE_CUSTOM, ids)
162     cl_uint num_devices_found;
163     cl_device_id all_device_ids[MAX_DEVICES];
165     // Find all the OpenCL devices available of the given type
166     cl_int errcode = clGetDeviceIDs(0,              // platform
167                              device_type_m,         // device_type
168                              MAX_DEVICES,           // num_entries
169                              all_device_ids,        // devices
170                              &num_devices_found);   // num_devices
171     errorCheck(errcode, __LINE__);
173     assert (num_devices_found >= device_ids_m.size());
175     context_m = clCreateContextFromType(0,              // properties
176                                         device_type_m,  // device_type
177                                         0,              // pfn_notify
178                                         0,              // user_data
179                                         &errcode);
180     errorCheck(errcode, __LINE__);
183     // Create command queues to OpenCL devices specified by the
184     // device_ids_m set.
185     for (auto id : device_ids_m)
186     {
187         int index = static_cast<int>(id);
188         queue_m[index] = clCreateCommandQueue(context_m,
189                                       all_device_ids[index],
190                                       CL_QUEUE_PROFILING_ENABLE,
191                                       &errcode);
192         errorCheck(errcode, __LINE__);
193     }
195     BuildProgramFromBinary(kernel_names, all_device_ids, device_ids_m.size());
197     errcode = clGetDeviceInfo(all_device_ids[0],
198                                 CL_DEVICE_MAX_CLOCK_FREQUENCY,
199                                 sizeof(freq_in_mhz_m),
200                                 &freq_in_mhz_m,
201                                 nullptr);
202     errorCheck(errcode, __LINE__);
206 bool DspDevice::BuildProgramFromBinary(const std::string &BFN,
207                                        cl_device_id device_ids[],
208                                        int num_devices)
210     size_t bin_len = ocl_wrapper_dsp_bin_len;
212     assert (bin_len != 0);
214     // Casting to make ocl_read_binary work with clCreateProgramWithBinary
215     const unsigned char *bin_arrc = reinterpret_cast <const unsigned char *>
216                                     (ocl_wrapper_dsp_bin);
218     size_t lengths[num_devices];
219     for (int i=0; i < num_devices; i++) lengths[i] = bin_len;
221     const unsigned char* binaries[num_devices];
222     for (int i=0; i < num_devices; i++) binaries[i] = bin_arrc;
224     cl_int err;
225     program_m = clCreateProgramWithBinary(context_m,
226                                           num_devices,
227                                           device_ids,          // device_list
228                                           lengths,
229                                           binaries,
230                                           0,                   // binary_status
231                                           &err);
232     errorCheck(err, __LINE__);
234     const char *options = "";
235     err = clBuildProgram(program_m, num_devices, device_ids, options, 0, 0);
236     errorCheck(err, __LINE__);
238     return true;
241 bool EveDevice::BuildProgramFromBinary(const std::string& kernel_names,
242                                        cl_device_id device_ids[],
243                                        int num_devices)
245     cl_int err;
246     cl_device_id executor_device_ids[MAX_DEVICES];
248     int i = 0;
249     for (auto id : device_ids_m)
250         executor_device_ids[i++] = device_ids[static_cast<int>(id)];
252     program_m = clCreateProgramWithBuiltInKernels(context_m,
253                                           num_devices,
254                                           executor_device_ids,  // device_list
255                                           kernel_names.c_str(),
256                                           &err);
257     errorCheck(err, __LINE__);
259     return true;
262 Kernel::Kernel(Device* device, const std::string& name,
263                const KernelArgs& args, uint8_t device_index):
264            name_m(name), device_m(device), device_index_m(device_index),
265            num_running_contexts_m(0)
267     TRACE::print("Creating kernel %s\n", name.c_str());
268     cl_int err;
269     kernel_m = clCreateKernel(device_m->program_m, name_m.c_str(), &err);
270     errorCheck(err, __LINE__);
272     int arg_index = 0;
273     for (const auto& arg : args)
274     {
275         if (!arg.isLocal())
276         {
277             if (arg.kind() == DeviceArgInfo::Kind::BUFFER)
278             {
279                 cl_mem buffer = device_m->CreateBuffer(arg);
281                 clSetKernelArg(kernel_m, arg_index, sizeof(cl_mem), &buffer);
282                 TRACE::print("  Arg[%d]: %p\n", arg_index, buffer);
284                 if (buffer)
285                     buffers_m.push_back(buffer);
286             }
287             else if (arg.kind() == DeviceArgInfo::Kind::SCALAR)
288             {
289                 clSetKernelArg(kernel_m, arg_index, arg.size(), arg.ptr());
290                 TRACE::print("  Arg[%d]: %p\n", arg_index, arg.ptr());
291             }
292             else
293             {
294                 assert ("DeviceArgInfo kind not supported");
295             }
296         }
297         else
298         {
299             clSetKernelArg(kernel_m, arg_index, arg.size(), NULL);
300             TRACE::print("  Arg[%d]: local, %d\n", arg_index, arg.size());
301         }
302         arg_index++;
304     }
307 bool Kernel::UpdateScalarArg(uint32_t index, size_t size, const void *value)
309     cl_int ret = clSetKernelArg(kernel_m, index, size, value);
310     return ret == CL_SUCCESS;
313 Kernel& Kernel::RunAsync(uint32_t context_idx)
315     // Execute kernel
316     TRACE::print("\tKernel: device %d executing %s, context %d\n",
317                  device_index_m, name_m.c_str(), context_idx);
318     cl_int ret = clEnqueueTask(device_m->queue_m[device_index_m],
319                                kernel_m, 0, 0, &event_m[context_idx]);
320     errorCheck(ret, __LINE__);
321     __sync_fetch_and_add(&num_running_contexts_m, 1);
323     return *this;
327 bool Kernel::Wait(float *host_elapsed_ms, uint32_t context_idx)
329     // Wait called without a corresponding RunAsync
330     if (num_running_contexts_m == 0)
331         return false;
333     TRACE::print("\tKernel: waiting context %d...\n", context_idx);
334     cl_int ret = clWaitForEvents(1, &event_m[context_idx]);
335     errorCheck(ret, __LINE__);
337     if (host_elapsed_ms != nullptr)
338     {
339         cl_ulong t_que, t_end;
340         clGetEventProfilingInfo(event_m[context_idx],
341                                 CL_PROFILING_COMMAND_QUEUED,
342                                 sizeof(cl_ulong), &t_que, nullptr);
343         clGetEventProfilingInfo(event_m[context_idx], CL_PROFILING_COMMAND_END,
344                                 sizeof(cl_ulong), &t_end, nullptr);
345         *host_elapsed_ms = (t_end - t_que) / 1.0e6;  // nano to milli seconds
346     }
348     ret = clReleaseEvent(event_m[context_idx]);
349     errorCheck(ret, __LINE__);
350     TRACE::print("\tKernel: finished execution\n");
352     __sync_fetch_and_sub(&num_running_contexts_m, 1);
353     return true;
356 extern void CallbackWrapper(void *user_data) __attribute__((weak));
358 static
359 void EventCallback(cl_event event, cl_int exec_status, void *user_data)
361     if (exec_status != CL_SUCCESS || user_data == nullptr)  return;
362     if (CallbackWrapper)  CallbackWrapper(user_data);
365 bool Kernel::AddCallback(void *user_data, uint32_t context_idx)
367     if (num_running_contexts_m == 0)  return false;
368     return clSetEventCallback(event_m[context_idx], CL_COMPLETE, EventCallback,
369                               user_data) == CL_SUCCESS;
372 Kernel::~Kernel()
374     for (auto b : buffers_m)
375         device_m->ReleaseBuffer(b);
377     clReleaseKernel(kernel_m);
380 cl_mem Device::CreateBuffer(const DeviceArgInfo &Arg)
382     size_t  size     = Arg.size();
383     void   *host_ptr = Arg.ptr();
385     if (host_ptr == nullptr)
386     {
387         TRACE::print("\tOCL Create B:%p\n", nullptr);
388         return nullptr;
389     }
391     bool hostPtrInCMEM = __is_in_malloced_region(host_ptr);
393     // Conservative till we have sufficient information.
394     cl_mem_flags flag = CL_MEM_READ_WRITE;
396     if (hostPtrInCMEM) flag |= (cl_mem_flags)CL_MEM_USE_HOST_PTR;
397     else               flag |= (cl_mem_flags)CL_MEM_COPY_HOST_PTR;
399     cl_int       errcode;
400     cl_mem buffer = clCreateBuffer(context_m,
401                                    flag,
402                                    size,
403                                    host_ptr,
404                                    &errcode);
405     errorCheck(errcode, __LINE__);
407     TRACE::print("\tOCL Create B:%p\n", buffer);
409     return buffer;
412 void Device::ReleaseBuffer(cl_mem M)
414     TRACE::print("\tOCL Release B:%p\n", M);
415     clReleaseMemObject(M);
418 /// Release resources associated with an OpenCL device
419 Device::~Device()
421     TRACE::print("\tOCL Device: deleted\n");
422     for (unsigned int i = 0; i < device_ids_m.size(); i++)
423     {
424         clFinish(queue_m[i]);
425         clReleaseCommandQueue (queue_m[i]);
426     }
428     clReleaseProgram      (program_m);
429     clReleaseContext      (context_m);
432 void errorCheck(cl_int ret, int line)
434     if (ret != CL_SUCCESS)
435     {
436         std::cerr << "ERROR: [ Line: " << line << "] " << error2string(ret) << std::endl;
437         exit(ret);
438     }
441 /// Convert OpenCL error codes to a string
442 const char* error2string(cl_int err)
444     switch(err)
445     {
446          case   0: return "CL_SUCCESS";
447          case  -1: return "CL_DEVICE_NOT_FOUND";
448          case  -2: return "CL_DEVICE_NOT_AVAILABLE";
449          case  -3: return "CL_COMPILER_NOT_AVAILABLE";
450          case  -4: return "CL_MEM_OBJECT_ALLOCATION_FAILURE";
451          case  -5: return "CL_OUT_OF_RESOURCES";
452          case  -6: return "CL_OUT_OF_HOST_MEMORY";
453          case  -7: return "CL_PROFILING_INFO_NOT_AVAILABLE";
454          case  -8: return "CL_MEM_COPY_OVERLAP";
455          case  -9: return "CL_IMAGE_FORMAT_MISMATCH";
456          case -10: return "CL_IMAGE_FORMAT_NOT_SUPPORTED";
457          case -11: return "CL_BUILD_PROGRAM_FAILURE";
458          case -12: return "CL_MAP_FAILURE";
460          case -30: return "CL_INVALID_VALUE";
461          case -31: return "CL_INVALID_DEVICE_TYPE";
462          case -32: return "CL_INVALID_PLATFORM";
463          case -33: return "CL_INVALID_DEVICE";
464          case -34: return "CL_INVALID_CONTEXT";
465          case -35: return "CL_INVALID_QUEUE_PROPERTIES";
466          case -36: return "CL_INVALID_COMMAND_QUEUE";
467          case -37: return "CL_INVALID_HOST_PTR";
468          case -38: return "CL_INVALID_MEM_OBJECT";
469          case -39: return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR";
470          case -40: return "CL_INVALID_IMAGE_SIZE";
471          case -41: return "CL_INVALID_SAMPLER";
472          case -42: return "CL_INVALID_BINARY";
473          case -43: return "CL_INVALID_BUILD_OPTIONS";
474          case -44: return "CL_INVALID_PROGRAM";
475          case -45: return "CL_INVALID_PROGRAM_EXECUTABLE";
476          case -46: return "CL_INVALID_KERNEL_NAME";
477          case -47: return "CL_INVALID_KERNEL_DEFINITION";
478          case -48: return "CL_INVALID_KERNEL";
479          case -49: return "CL_INVALID_ARG_INDEX";
480          case -50: return "CL_INVALID_ARG_VALUE";
481          case -51: return "CL_INVALID_ARG_SIZE";
482          case -52: return "CL_INVALID_KERNEL_ARGS";
483          case -53: return "CL_INVALID_WORK_DIMENSION";
484          case -54: return "CL_INVALID_WORK_GROUP_SIZE";
485          case -55: return "CL_INVALID_WORK_ITEM_SIZE";
486          case -56: return "CL_INVALID_GLOBAL_OFFSET";
487          case -57: return "CL_INVALID_EVENT_WAIT_LIST";
488          case -58: return "CL_INVALID_EVENT";
489          case -59: return "CL_INVALID_OPERATION";
490          case -60: return "CL_INVALID_GL_OBJECT";
491          case -61: return "CL_INVALID_BUFFER_SIZE";
492          case -62: return "CL_INVALID_MIP_LEVEL";
493          case -63: return "CL_INVALID_GLOBAL_WORK_SIZE";
494          default: return "Unknown OpenCL error";
495     }
498 Device::Ptr Device::Create(DeviceType core_type, const DeviceIds& ids,
499                            const std::string& name)
501     Device::Ptr p(nullptr);
502     if (core_type == DeviceType::DSP)
503         p.reset(new DspDevice(ids, name));
504     else if (core_type == DeviceType::EVE)
505         p.reset(new EveDevice(ids, name));
507     return p;
510 // Minimum version of OpenCL required for this version of TIDL API
511 #define MIN_OCL_VERSION "01.01.16.00"
512 static bool CheckOpenCLVersion(cl_platform_id id)
514     cl_int err;
515     size_t length;
516     err = clGetPlatformInfo(id, CL_PLATFORM_VERSION, 0, nullptr, &length);
517     if (err != CL_SUCCESS) return false;
519     std::unique_ptr<char> version(new char[length]);
520     err = clGetPlatformInfo(id, CL_PLATFORM_VERSION, length, version.get(),
521                             nullptr);
522     if (err != CL_SUCCESS) return false;
524     std::string v(version.get());
526     if (v.substr(v.find("01."), sizeof(MIN_OCL_VERSION)) >= MIN_OCL_VERSION)
527         return true;
529     std::cerr << "TIDL API Error: OpenCL " << MIN_OCL_VERSION
530               << " or higher required." << std::endl;
532     return false;
535 static bool PlatformIsAM57()
537     cl_platform_id id;
538     cl_int err;
540     err = clGetPlatformIDs(1, &id, nullptr);
541     if (err != CL_SUCCESS) return false;
543     if (!CheckOpenCLVersion(id))
544        return false;
546     // Check if the device name is AM57
547     size_t length;
548     err = clGetPlatformInfo(id, CL_PLATFORM_NAME, 0, nullptr, &length);
549     if (err != CL_SUCCESS) return false;
551     std::unique_ptr<char> name(new char[length]);
553     err = clGetPlatformInfo(id, CL_PLATFORM_NAME, length, name.get(), nullptr);
554     if (err != CL_SUCCESS) return false;
556     std::string platform_name(name.get());
558     if (platform_name.find("AM57") == std::string::npos)
559         return false;
561     return true;
564 // TI DL is supported on AM57x - EVE or C66x devices
565 uint32_t Device::GetNumDevices(DeviceType device_type)
567     if (!PlatformIsAM57()) return 0;
569     // Convert DeviceType to OpenCL device type
570     cl_device_type t = (device_type == DeviceType::EVE) ?
571                                     CL_DEVICE_TYPE_CUSTOM :
572                                     CL_DEVICE_TYPE_ACCELERATOR;
574     // Find all the OpenCL devices available
575     cl_uint num_devices_found;
576     cl_device_id all_device_ids[MAX_DEVICES];
578     cl_int errcode = clGetDeviceIDs(0,                   // platform
579                                     t,                   // device_type
580                                     MAX_DEVICES,         // num_entries
581                                     all_device_ids,      // devices
582                                     &num_devices_found); // num_devices
585     if (errcode != CL_SUCCESS)            return 0;
586     if (num_devices_found == 0)           return 0;
588     // DSP, return the number of compute units since we maintain a
589     // queue to each compute unit (i.e. C66x DSP)
590     if (t == CL_DEVICE_TYPE_ACCELERATOR)
591     {
592         cl_int num_compute_units;
593         errcode = clGetDeviceInfo(all_device_ids[0],
594                                 CL_DEVICE_MAX_COMPUTE_UNITS,
595                                 sizeof(num_compute_units),
596                                 &num_compute_units,
597                                 nullptr);
598         if (errcode != CL_SUCCESS)
599             return 0;
601         return num_compute_units;
602     }
604     // EVE, return the number of devices since each EVE is a device
605     return num_devices_found;