1 /******************************************************************************
2 * Copyright (c) 2017-2018 Texas Instruments Incorporated - http://www.ti.com/
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Texas Instruments Incorporated nor the
13 * names of its contributors may be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
26 * THE POSSIBILITY OF SUCH DAMAGE.
27 *****************************************************************************/
#include <cstdlib>
#include <cassert>
using std::size_t;

#include <iostream>
#include <vector>

#include "ocl_device.h"
#include "ocl_util.h"
#include "trace.h"
#include "../dsp/ocl_wrapper.dsp_h"
41 using namespace tidl;
43 static const char* error2string(cl_int err);
44 static void errorCheck(cl_int ret, int line);
46 Device::Device(cl_device_type t, const DeviceIds& ids):
47 device_type_m(t), device_ids_m(ids)
48 {
49 TRACE::print("\tOCL Device: %s created\n",
50 device_type_m == CL_DEVICE_TYPE_ACCELERATOR ? "DSP" :
51 device_type_m == CL_DEVICE_TYPE_CUSTOM ? "EVE" : "Unknown");
53 for (int i = 0; i < MAX_DEVICES; i++)
54 queue_m[i] = nullptr;
56 }
58 DspDevice::DspDevice(const DeviceIds& ids, const std::string &binary_filename):
59 Device(CL_DEVICE_TYPE_ACCELERATOR, ids)
60 {
61 cl_uint num_devices_found;
62 cl_device_id device_ids[MAX_DEVICES];
64 cl_int errcode = clGetDeviceIDs(0, // platform
65 device_type_m, // device_type
66 MAX_DEVICES, // num_entries
67 device_ids, // devices
68 &num_devices_found); // num_devices
69 errorCheck(errcode, __LINE__);
71 if (num_devices_found != 1)
72 throw Exception("OpenCL DSP device not found",
73 __FILE__, __FUNCTION__, __LINE__);
75 cl_int num_compute_units;
76 errcode = clGetDeviceInfo(device_ids[0],
77 CL_DEVICE_MAX_COMPUTE_UNITS,
78 sizeof(num_compute_units),
79 &num_compute_units,
80 nullptr);
82 if (num_compute_units == 1)
83 {
84 context_m = clCreateContextFromType(0, // properties
85 device_type_m, // device_type
86 0, // pfn_notify
87 0, // user_data
88 &errcode);
89 errorCheck(errcode, __LINE__);
91 // Queue 0 on device 0
92 queue_m[0] = clCreateCommandQueue(context_m,
93 device_ids[0],
94 CL_QUEUE_PROFILING_ENABLE,
95 &errcode);
96 errorCheck(errcode, __LINE__);
97 BuildProgramFromBinary(binary_filename, device_ids, 1);
98 }
99 else
100 {
101 const cl_uint NUM_SUB_DEVICES = 2;
103 // Create 2 sub-device's, each consisting of a C66x DSP
104 cl_device_partition_property properties[3] =
105 { CL_DEVICE_PARTITION_EQUALLY, 1, 0 };
107 // Query the number of sub-devices that can be created
108 cl_uint n_sub_devices = 0;
109 errcode = clCreateSubDevices(device_ids[0], // in_device
110 properties, // properties
111 0, // num_devices
112 NULL, // out_devices
113 &n_sub_devices); // num_devices_ret
114 errorCheck(errcode, __LINE__);
116 assert(n_sub_devices == NUM_SUB_DEVICES);
118 // Create the sub-devices
119 cl_device_id sub_devices[NUM_SUB_DEVICES] = {0, 0};
120 errcode = clCreateSubDevices(device_ids[0], // in_device
121 properties, // properties
122 n_sub_devices, // num_devices
123 sub_devices, // out_devices
124 nullptr); // num_devices_ret
125 errorCheck(errcode, __LINE__);
127 // Create a context containing the sub-devices
128 context_m = clCreateContext(NULL, // properties
129 NUM_SUB_DEVICES, // num_devices
130 sub_devices, // devices
131 NULL, // pfn_notify
132 NULL, // user_data
133 &errcode); // errcode_ret
134 errorCheck(errcode, __LINE__);
136 // Create queues to each sub-device
137 for (auto id : device_ids_m)
138 {
139 int index = static_cast<int>(id);
140 queue_m[index] = clCreateCommandQueue(context_m,
141 sub_devices[index],
142 CL_QUEUE_PROFILING_ENABLE,
143 &errcode);
144 errorCheck(errcode, __LINE__);
145 }
147 BuildProgramFromBinary(binary_filename, sub_devices, NUM_SUB_DEVICES);
148 }
150 errcode = clGetDeviceInfo(device_ids[0],
151 CL_DEVICE_MAX_CLOCK_FREQUENCY,
152 sizeof(freq_in_mhz_m),
153 &freq_in_mhz_m,
154 nullptr);
155 errorCheck(errcode, __LINE__);
156 }
159 EveDevice::EveDevice(const DeviceIds& ids, const std::string &kernel_names):
160 Device(CL_DEVICE_TYPE_CUSTOM, ids)
161 {
162 cl_uint num_devices_found;
163 cl_device_id all_device_ids[MAX_DEVICES];
165 // Find all the OpenCL devices available of the given type
166 cl_int errcode = clGetDeviceIDs(0, // platform
167 device_type_m, // device_type
168 MAX_DEVICES, // num_entries
169 all_device_ids, // devices
170 &num_devices_found); // num_devices
171 errorCheck(errcode, __LINE__);
173 assert (num_devices_found >= device_ids_m.size());
175 context_m = clCreateContextFromType(0, // properties
176 device_type_m, // device_type
177 0, // pfn_notify
178 0, // user_data
179 &errcode);
180 errorCheck(errcode, __LINE__);
183 // Create command queues to OpenCL devices specified by the
184 // device_ids_m set.
185 for (auto id : device_ids_m)
186 {
187 int index = static_cast<int>(id);
188 queue_m[index] = clCreateCommandQueue(context_m,
189 all_device_ids[index],
190 CL_QUEUE_PROFILING_ENABLE,
191 &errcode);
192 errorCheck(errcode, __LINE__);
193 }
195 BuildProgramFromBinary(kernel_names, all_device_ids, device_ids_m.size());
197 errcode = clGetDeviceInfo(all_device_ids[0],
198 CL_DEVICE_MAX_CLOCK_FREQUENCY,
199 sizeof(freq_in_mhz_m),
200 &freq_in_mhz_m,
201 nullptr);
202 errorCheck(errcode, __LINE__);
203 }
206 bool DspDevice::BuildProgramFromBinary(const std::string &BFN,
207 cl_device_id device_ids[],
208 int num_devices)
209 {
210 size_t bin_len = ocl_wrapper_dsp_bin_len;
212 assert (bin_len != 0);
214 // Casting to make ocl_read_binary work with clCreateProgramWithBinary
215 const unsigned char *bin_arrc = reinterpret_cast <const unsigned char *>
216 (ocl_wrapper_dsp_bin);
218 size_t lengths[num_devices];
219 for (int i=0; i < num_devices; i++) lengths[i] = bin_len;
221 const unsigned char* binaries[num_devices];
222 for (int i=0; i < num_devices; i++) binaries[i] = bin_arrc;
224 cl_int err;
225 program_m = clCreateProgramWithBinary(context_m,
226 num_devices,
227 device_ids, // device_list
228 lengths,
229 binaries,
230 0, // binary_status
231 &err);
232 errorCheck(err, __LINE__);
234 const char *options = "";
235 err = clBuildProgram(program_m, num_devices, device_ids, options, 0, 0);
236 errorCheck(err, __LINE__);
238 return true;
239 }
241 bool EveDevice::BuildProgramFromBinary(const std::string& kernel_names,
242 cl_device_id device_ids[],
243 int num_devices)
244 {
245 cl_int err;
246 cl_device_id executor_device_ids[MAX_DEVICES];
248 int i = 0;
249 for (auto id : device_ids_m)
250 executor_device_ids[i++] = device_ids[static_cast<int>(id)];
252 program_m = clCreateProgramWithBuiltInKernels(context_m,
253 num_devices,
254 executor_device_ids, // device_list
255 kernel_names.c_str(),
256 &err);
257 errorCheck(err, __LINE__);
259 return true;
260 }
262 Kernel::Kernel(Device* device, const std::string& name,
263 const KernelArgs& args, uint8_t device_index):
264 name_m(name), device_m(device), device_index_m(device_index),
265 num_running_contexts_m(0)
266 {
267 TRACE::print("Creating kernel %s\n", name.c_str());
268 cl_int err;
269 kernel_m = clCreateKernel(device_m->program_m, name_m.c_str(), &err);
270 errorCheck(err, __LINE__);
272 int arg_index = 0;
273 for (const auto& arg : args)
274 {
275 if (!arg.isLocal())
276 {
277 if (arg.kind() == DeviceArgInfo::Kind::BUFFER)
278 {
279 cl_mem buffer = device_m->CreateBuffer(arg);
281 clSetKernelArg(kernel_m, arg_index, sizeof(cl_mem), &buffer);
282 TRACE::print(" Arg[%d]: %p\n", arg_index, buffer);
284 if (buffer)
285 buffers_m.push_back(buffer);
286 }
287 else if (arg.kind() == DeviceArgInfo::Kind::SCALAR)
288 {
289 clSetKernelArg(kernel_m, arg_index, arg.size(), arg.ptr());
290 TRACE::print(" Arg[%d]: %p\n", arg_index, arg.ptr());
291 }
292 else
293 {
294 assert ("DeviceArgInfo kind not supported");
295 }
296 }
297 else
298 {
299 clSetKernelArg(kernel_m, arg_index, arg.size(), NULL);
300 TRACE::print(" Arg[%d]: local, %d\n", arg_index, arg.size());
301 }
302 arg_index++;
304 }
305 }
307 bool Kernel::UpdateScalarArg(uint32_t index, size_t size, const void *value)
308 {
309 cl_int ret = clSetKernelArg(kernel_m, index, size, value);
310 return ret == CL_SUCCESS;
311 }
313 Kernel& Kernel::RunAsync(uint32_t context_idx)
314 {
315 // Execute kernel
316 TRACE::print("\tKernel: device %d executing %s, context %d\n",
317 device_index_m, name_m.c_str(), context_idx);
318 cl_int ret = clEnqueueTask(device_m->queue_m[device_index_m],
319 kernel_m, 0, 0, &event_m[context_idx]);
320 errorCheck(ret, __LINE__);
321 __sync_fetch_and_add(&num_running_contexts_m, 1);
323 return *this;
324 }
327 bool Kernel::Wait(float *host_elapsed_ms, uint32_t context_idx)
328 {
329 // Wait called without a corresponding RunAsync
330 if (num_running_contexts_m == 0)
331 return false;
333 TRACE::print("\tKernel: waiting context %d...\n", context_idx);
334 cl_int ret = clWaitForEvents(1, &event_m[context_idx]);
335 errorCheck(ret, __LINE__);
337 if (host_elapsed_ms != nullptr)
338 {
339 cl_ulong t_que, t_end;
340 clGetEventProfilingInfo(event_m[context_idx],
341 CL_PROFILING_COMMAND_QUEUED,
342 sizeof(cl_ulong), &t_que, nullptr);
343 clGetEventProfilingInfo(event_m[context_idx], CL_PROFILING_COMMAND_END,
344 sizeof(cl_ulong), &t_end, nullptr);
345 *host_elapsed_ms = (t_end - t_que) / 1.0e6; // nano to milli seconds
346 }
348 ret = clReleaseEvent(event_m[context_idx]);
349 errorCheck(ret, __LINE__);
350 TRACE::print("\tKernel: finished execution\n");
352 __sync_fetch_and_sub(&num_running_contexts_m, 1);
353 return true;
354 }
356 extern void CallbackWrapper(void *user_data) __attribute__((weak));
358 static
359 void EventCallback(cl_event event, cl_int exec_status, void *user_data)
360 {
361 if (exec_status != CL_SUCCESS || user_data == nullptr) return;
362 if (CallbackWrapper) CallbackWrapper(user_data);
363 }
365 bool Kernel::AddCallback(void *user_data, uint32_t context_idx)
366 {
367 if (num_running_contexts_m == 0) return false;
368 return clSetEventCallback(event_m[context_idx], CL_COMPLETE, EventCallback,
369 user_data) == CL_SUCCESS;
370 }
372 Kernel::~Kernel()
373 {
374 for (auto b : buffers_m)
375 device_m->ReleaseBuffer(b);
377 clReleaseKernel(kernel_m);
378 }
380 cl_mem Device::CreateBuffer(const DeviceArgInfo &Arg)
381 {
382 size_t size = Arg.size();
383 void *host_ptr = Arg.ptr();
385 if (host_ptr == nullptr)
386 {
387 TRACE::print("\tOCL Create B:%p\n", nullptr);
388 return nullptr;
389 }
391 bool hostPtrInCMEM = __is_in_malloced_region(host_ptr);
393 // Conservative till we have sufficient information.
394 cl_mem_flags flag = CL_MEM_READ_WRITE;
396 if (hostPtrInCMEM) flag |= (cl_mem_flags)CL_MEM_USE_HOST_PTR;
397 else flag |= (cl_mem_flags)CL_MEM_COPY_HOST_PTR;
399 cl_int errcode;
400 cl_mem buffer = clCreateBuffer(context_m,
401 flag,
402 size,
403 host_ptr,
404 &errcode);
405 errorCheck(errcode, __LINE__);
407 TRACE::print("\tOCL Create B:%p\n", buffer);
409 return buffer;
410 }
412 void Device::ReleaseBuffer(cl_mem M)
413 {
414 TRACE::print("\tOCL Release B:%p\n", M);
415 clReleaseMemObject(M);
416 }
418 /// Release resources associated with an OpenCL device
419 Device::~Device()
420 {
421 TRACE::print("\tOCL Device: deleted\n");
422 for (unsigned int i = 0; i < device_ids_m.size(); i++)
423 {
424 clFinish(queue_m[i]);
425 clReleaseCommandQueue (queue_m[i]);
426 }
428 clReleaseProgram (program_m);
429 clReleaseContext (context_m);
430 }
432 void errorCheck(cl_int ret, int line)
433 {
434 if (ret != CL_SUCCESS)
435 {
436 std::cerr << "ERROR: [ Line: " << line << "] " << error2string(ret) << std::endl;
437 exit(ret);
438 }
439 }
441 /// Convert OpenCL error codes to a string
442 const char* error2string(cl_int err)
443 {
444 switch(err)
445 {
446 case 0: return "CL_SUCCESS";
447 case -1: return "CL_DEVICE_NOT_FOUND";
448 case -2: return "CL_DEVICE_NOT_AVAILABLE";
449 case -3: return "CL_COMPILER_NOT_AVAILABLE";
450 case -4: return "CL_MEM_OBJECT_ALLOCATION_FAILURE";
451 case -5: return "CL_OUT_OF_RESOURCES";
452 case -6: return "CL_OUT_OF_HOST_MEMORY";
453 case -7: return "CL_PROFILING_INFO_NOT_AVAILABLE";
454 case -8: return "CL_MEM_COPY_OVERLAP";
455 case -9: return "CL_IMAGE_FORMAT_MISMATCH";
456 case -10: return "CL_IMAGE_FORMAT_NOT_SUPPORTED";
457 case -11: return "CL_BUILD_PROGRAM_FAILURE";
458 case -12: return "CL_MAP_FAILURE";
460 case -30: return "CL_INVALID_VALUE";
461 case -31: return "CL_INVALID_DEVICE_TYPE";
462 case -32: return "CL_INVALID_PLATFORM";
463 case -33: return "CL_INVALID_DEVICE";
464 case -34: return "CL_INVALID_CONTEXT";
465 case -35: return "CL_INVALID_QUEUE_PROPERTIES";
466 case -36: return "CL_INVALID_COMMAND_QUEUE";
467 case -37: return "CL_INVALID_HOST_PTR";
468 case -38: return "CL_INVALID_MEM_OBJECT";
469 case -39: return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR";
470 case -40: return "CL_INVALID_IMAGE_SIZE";
471 case -41: return "CL_INVALID_SAMPLER";
472 case -42: return "CL_INVALID_BINARY";
473 case -43: return "CL_INVALID_BUILD_OPTIONS";
474 case -44: return "CL_INVALID_PROGRAM";
475 case -45: return "CL_INVALID_PROGRAM_EXECUTABLE";
476 case -46: return "CL_INVALID_KERNEL_NAME";
477 case -47: return "CL_INVALID_KERNEL_DEFINITION";
478 case -48: return "CL_INVALID_KERNEL";
479 case -49: return "CL_INVALID_ARG_INDEX";
480 case -50: return "CL_INVALID_ARG_VALUE";
481 case -51: return "CL_INVALID_ARG_SIZE";
482 case -52: return "CL_INVALID_KERNEL_ARGS";
483 case -53: return "CL_INVALID_WORK_DIMENSION";
484 case -54: return "CL_INVALID_WORK_GROUP_SIZE";
485 case -55: return "CL_INVALID_WORK_ITEM_SIZE";
486 case -56: return "CL_INVALID_GLOBAL_OFFSET";
487 case -57: return "CL_INVALID_EVENT_WAIT_LIST";
488 case -58: return "CL_INVALID_EVENT";
489 case -59: return "CL_INVALID_OPERATION";
490 case -60: return "CL_INVALID_GL_OBJECT";
491 case -61: return "CL_INVALID_BUFFER_SIZE";
492 case -62: return "CL_INVALID_MIP_LEVEL";
493 case -63: return "CL_INVALID_GLOBAL_WORK_SIZE";
494 default: return "Unknown OpenCL error";
495 }
496 }
498 Device::Ptr Device::Create(DeviceType core_type, const DeviceIds& ids,
499 const std::string& name)
500 {
501 Device::Ptr p(nullptr);
502 if (core_type == DeviceType::DSP)
503 p.reset(new DspDevice(ids, name));
504 else if (core_type == DeviceType::EVE)
505 p.reset(new EveDevice(ids, name));
507 return p;
508 }
510 // Minimum version of OpenCL required for this version of TIDL API
511 #define MIN_OCL_VERSION "01.01.16.00"
512 static bool CheckOpenCLVersion(cl_platform_id id)
513 {
514 cl_int err;
515 size_t length;
516 err = clGetPlatformInfo(id, CL_PLATFORM_VERSION, 0, nullptr, &length);
517 if (err != CL_SUCCESS) return false;
519 std::unique_ptr<char> version(new char[length]);
520 err = clGetPlatformInfo(id, CL_PLATFORM_VERSION, length, version.get(),
521 nullptr);
522 if (err != CL_SUCCESS) return false;
524 std::string v(version.get());
526 if (v.substr(v.find("01."), sizeof(MIN_OCL_VERSION)) >= MIN_OCL_VERSION)
527 return true;
529 std::cerr << "TIDL API Error: OpenCL " << MIN_OCL_VERSION
530 << " or higher required." << std::endl;
532 return false;
533 }
535 static bool PlatformIsAM57()
536 {
537 cl_platform_id id;
538 cl_int err;
540 err = clGetPlatformIDs(1, &id, nullptr);
541 if (err != CL_SUCCESS) return false;
543 if (!CheckOpenCLVersion(id))
544 return false;
546 // Check if the device name is AM57
547 size_t length;
548 err = clGetPlatformInfo(id, CL_PLATFORM_NAME, 0, nullptr, &length);
549 if (err != CL_SUCCESS) return false;
551 std::unique_ptr<char> name(new char[length]);
553 err = clGetPlatformInfo(id, CL_PLATFORM_NAME, length, name.get(), nullptr);
554 if (err != CL_SUCCESS) return false;
556 std::string platform_name(name.get());
558 if (platform_name.find("AM57") == std::string::npos)
559 return false;
561 return true;
562 }
564 // TI DL is supported on AM57x - EVE or C66x devices
565 uint32_t Device::GetNumDevices(DeviceType device_type)
566 {
567 if (!PlatformIsAM57()) return 0;
569 // Convert DeviceType to OpenCL device type
570 cl_device_type t = (device_type == DeviceType::EVE) ?
571 CL_DEVICE_TYPE_CUSTOM :
572 CL_DEVICE_TYPE_ACCELERATOR;
574 // Find all the OpenCL devices available
575 cl_uint num_devices_found;
576 cl_device_id all_device_ids[MAX_DEVICES];
578 cl_int errcode = clGetDeviceIDs(0, // platform
579 t, // device_type
580 MAX_DEVICES, // num_entries
581 all_device_ids, // devices
582 &num_devices_found); // num_devices
585 if (errcode != CL_SUCCESS) return 0;
586 if (num_devices_found == 0) return 0;
588 // DSP, return the number of compute units since we maintain a
589 // queue to each compute unit (i.e. C66x DSP)
590 if (t == CL_DEVICE_TYPE_ACCELERATOR)
591 {
592 cl_int num_compute_units;
593 errcode = clGetDeviceInfo(all_device_ids[0],
594 CL_DEVICE_MAX_COMPUTE_UNITS,
595 sizeof(num_compute_units),
596 &num_compute_units,
597 nullptr);
598 if (errcode != CL_SUCCESS)
599 return 0;
601 return num_compute_units;
602 }
604 // EVE, return the number of devices since each EVE is a device
605 return num_devices_found;
606 }