1 /******************************************************************************
2 * Copyright (c) 2017-2018 Texas Instruments Incorporated - http://www.ti.com/
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Texas Instruments Incorporated nor the
13 * names of its contributors may be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
26 * THE POSSIBILITY OF SUCH DAMAGE.
27 *****************************************************************************/
30 #include <cstdlib>
31 #include <cassert>
32 using std::size_t;
34 #include <iostream>
36 #include "ocl_device.h"
37 #include "ocl_util.h"
38 #include "trace.h"
40 using namespace tidl;
42 static const char* error2string(cl_int err);
43 static void errorCheck(cl_int ret, int line);
45 Device::Device(cl_device_type t, const DeviceIds& ids, const char* name):
46 device_type_m(t), device_ids_m(ids)
47 {
48 TRACE::print("\tOCL Device: %s created\n",
49 device_type_m == CL_DEVICE_TYPE_CUSTOM ? name : "Unknown");
51 for (int i = 0; i < MAX_DEVICES; i++)
52 queue_m[i] = nullptr;
54 }
56 DspDevice::DspDevice(const DeviceIds& ids, const std::string &kernel_names):
57 Device(CL_DEVICE_TYPE_CUSTOM, ids, "DSP")
58 {
59 cl_int errcode;
60 cl_device_id device_ids[MAX_DEVICES];
61 cl_device_id out_device_ids[MAX_DEVICES];
62 cl_uint num_compute_units;
63 cl_uint num_out_devices;
65 if (! GetDevices(DeviceType::DSP, device_ids, nullptr, &num_compute_units))
66 throw Exception("OpenCL DSP device not found",
67 __FILE__, __FUNCTION__, __LINE__);
69 if (num_compute_units == 1)
70 {
71 num_out_devices = 1;
72 out_device_ids[0] = device_ids[0];
73 }
74 else
75 {
76 // Create 2 sub-device's, each consisting of a C66x DSP
77 cl_device_partition_property properties[3] =
78 { CL_DEVICE_PARTITION_EQUALLY, 1, 0 };
80 // Query the number of sub-devices that can be created
81 const cl_uint NUM_SUB_DEVICES = 2;
82 errcode = clCreateSubDevices(device_ids[0], // in_device
83 properties, // properties
84 0, // num_devices
85 NULL, // out_devices
86 &num_out_devices); // num_devices_ret
87 errorCheck(errcode, __LINE__);
89 assert(num_out_devices == NUM_SUB_DEVICES);
91 // Create the sub-devices
92 errcode = clCreateSubDevices(device_ids[0], // in_device
93 properties, // properties
94 num_out_devices, // num_devices
95 out_device_ids, // out_devices
96 nullptr); // num_devices_ret
97 errorCheck(errcode, __LINE__);
98 }
100 // Create a context containing the out-devices
101 context_m = clCreateContext(NULL, // properties
102 num_out_devices, // num_devices
103 out_device_ids, // devices
104 NULL, // pfn_notify
105 NULL, // user_data
106 &errcode); // errcode_ret
107 errorCheck(errcode, __LINE__);
109 // Create queues to each out device
110 for (auto id : device_ids_m)
111 {
112 cl_uint index = static_cast<cl_uint>(id);
113 assert(index < num_out_devices);
114 queue_m[index] = clCreateCommandQueue(context_m,
115 out_device_ids[index],
116 CL_QUEUE_PROFILING_ENABLE|
117 CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
118 &errcode);
119 errorCheck(errcode, __LINE__);
120 }
122 // Build kernel program
123 BuildBuiltInProgram(kernel_names, out_device_ids, num_out_devices);
125 // Query device frequency
126 errcode = clGetDeviceInfo(device_ids[0],
127 CL_DEVICE_MAX_CLOCK_FREQUENCY,
128 sizeof(freq_in_mhz_m),
129 &freq_in_mhz_m,
130 nullptr);
131 errorCheck(errcode, __LINE__);
132 }
135 EveDevice::EveDevice(const DeviceIds& ids, const std::string &kernel_names):
136 Device(CL_DEVICE_TYPE_CUSTOM, ids, "EVE")
137 {
138 cl_int errcode;
139 cl_device_id all_device_ids[MAX_DEVICES];
140 cl_uint num_devices;
141 if (! GetDevices(DeviceType::EVE, all_device_ids, &num_devices, nullptr))
142 throw Exception("OpenCL EVE device not found",
143 __FILE__, __FUNCTION__, __LINE__);
145 assert (num_devices >= device_ids_m.size());
147 context_m = clCreateContextFromType(0, // properties
148 device_type_m, // device_type
149 0, // pfn_notify
150 0, // user_data
151 &errcode);
152 errorCheck(errcode, __LINE__);
154 // Create command queues to OpenCL devices specified by the
155 // device_ids_m set.
156 for (auto id : device_ids_m)
157 {
158 int index = static_cast<int>(id);
159 queue_m[index] = clCreateCommandQueue(context_m,
160 all_device_ids[index],
161 CL_QUEUE_PROFILING_ENABLE|
162 CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
163 &errcode);
164 errorCheck(errcode, __LINE__);
165 }
167 BuildBuiltInProgram(kernel_names, all_device_ids, device_ids_m.size());
169 errcode = clGetDeviceInfo(all_device_ids[0],
170 CL_DEVICE_MAX_CLOCK_FREQUENCY,
171 sizeof(freq_in_mhz_m),
172 &freq_in_mhz_m,
173 nullptr);
174 errorCheck(errcode, __LINE__);
175 }
177 bool DspDevice::BuildBuiltInProgram(const std::string& kernel_names,
178 cl_device_id device_ids[],
179 int num_devices)
180 {
181 cl_int err;
182 program_m = clCreateProgramWithBuiltInKernels(context_m,
183 num_devices,
184 device_ids, // device_list
185 kernel_names.c_str(),
186 &err);
187 errorCheck(err, __LINE__);
189 return true;
190 }
192 bool EveDevice::BuildBuiltInProgram(const std::string& kernel_names,
193 cl_device_id device_ids[],
194 int num_devices)
195 {
196 cl_int err;
197 cl_device_id executor_device_ids[MAX_DEVICES];
199 int i = 0;
200 for (auto id : device_ids_m)
201 executor_device_ids[i++] = device_ids[static_cast<int>(id)];
203 program_m = clCreateProgramWithBuiltInKernels(context_m,
204 num_devices,
205 executor_device_ids, // device_list
206 kernel_names.c_str(),
207 &err);
208 errorCheck(err, __LINE__);
210 return true;
211 }
213 Kernel::Kernel(Device* device, const std::string& name,
214 const KernelArgs& args, uint8_t device_index):
215 name_m(name), device_m(device), device_index_m(device_index)
216 {
217 TRACE::print("Creating kernel %s\n", name.c_str());
218 cl_int err;
219 kernel_m = clCreateKernel(device_m->program_m, name_m.c_str(), &err);
220 errorCheck(err, __LINE__);
222 for (int i=0; i < tidl::internal::NUM_CONTEXTS; i++)
223 event_m[i] = nullptr;
225 int arg_index = 0;
226 for (const auto& arg : args)
227 {
228 if (!arg.isLocal())
229 {
230 if (arg.kind() == DeviceArgInfo::Kind::BUFFER)
231 {
232 cl_mem buffer = device_m->CreateBuffer(arg);
234 clSetKernelArg(kernel_m, arg_index, sizeof(cl_mem), &buffer);
235 TRACE::print(" Arg[%d]: %p\n", arg_index, buffer);
237 if (buffer)
238 buffers_m.push_back(buffer);
239 }
240 else if (arg.kind() == DeviceArgInfo::Kind::SCALAR)
241 {
242 clSetKernelArg(kernel_m, arg_index, arg.size(), arg.ptr());
243 TRACE::print(" Arg[%d]: %p\n", arg_index, arg.ptr());
244 }
245 else
246 {
247 assert ("DeviceArgInfo kind not supported");
248 }
249 }
250 else
251 {
252 clSetKernelArg(kernel_m, arg_index, arg.size(), NULL);
253 TRACE::print(" Arg[%d]: local, %d\n", arg_index, arg.size());
254 }
255 arg_index++;
257 }
258 }
260 bool Kernel::UpdateScalarArg(uint32_t index, size_t size, const void *value)
261 {
262 cl_int ret = clSetKernelArg(kernel_m, index, size, value);
263 return ret == CL_SUCCESS;
264 }
266 Kernel& Kernel::RunAsync(uint32_t context_idx)
267 {
268 // Execute kernel
269 TRACE::print("\tKernel: %s device %d executing %s, context %d\n",
270 device_m->GetDeviceName().c_str(),
271 device_index_m, name_m.c_str(), context_idx);
272 cl_int ret = clEnqueueTask(device_m->queue_m[device_index_m],
273 kernel_m, 0, 0, &event_m[context_idx]);
274 errorCheck(ret, __LINE__);
276 return *this;
277 }
279 bool Kernel::Wait(uint32_t context_idx)
280 {
281 // Wait called without a corresponding RunAsync
282 if (event_m[context_idx] == nullptr)
283 return false;
285 TRACE::print("\tKernel: waiting context %d...\n", context_idx);
286 cl_int ret = clWaitForEvents(1, &event_m[context_idx]);
287 errorCheck(ret, __LINE__);
289 ret = clReleaseEvent(event_m[context_idx]);
290 errorCheck(ret, __LINE__);
291 event_m[context_idx] = nullptr;
293 TRACE::print("\tKernel: finished execution\n");
295 return true;
296 }
298 extern void CallbackWrapper(void *user_data) __attribute__((weak));
300 static
301 void EventCallback(cl_event event, cl_int exec_status, void *user_data)
302 {
303 if (exec_status != CL_SUCCESS || user_data == nullptr) return;
304 if (CallbackWrapper) CallbackWrapper(user_data);
305 }
307 bool Kernel::AddCallback(void *user_data, uint32_t context_idx)
308 {
309 if (event_m[context_idx] == nullptr)
310 return false;
312 return clSetEventCallback(event_m[context_idx], CL_COMPLETE, EventCallback,
313 user_data) == CL_SUCCESS;
314 }
316 Kernel::~Kernel()
317 {
318 for (auto b : buffers_m)
319 device_m->ReleaseBuffer(b);
321 clReleaseKernel(kernel_m);
322 }
324 cl_mem Device::CreateBuffer(const DeviceArgInfo &Arg)
325 {
326 size_t size = Arg.size();
327 void *host_ptr = Arg.ptr();
329 if (host_ptr == nullptr)
330 {
331 TRACE::print("\tOCL Create B:%p\n", nullptr);
332 return nullptr;
333 }
335 bool hostPtrInCMEM = __is_in_malloced_region(host_ptr);
337 // Conservative till we have sufficient information.
338 cl_mem_flags flag = CL_MEM_READ_WRITE;
340 if (hostPtrInCMEM) flag |= (cl_mem_flags)CL_MEM_USE_HOST_PTR;
341 else flag |= (cl_mem_flags)CL_MEM_COPY_HOST_PTR;
343 cl_int errcode;
344 cl_mem buffer = clCreateBuffer(context_m,
345 flag,
346 size,
347 host_ptr,
348 &errcode);
349 errorCheck(errcode, __LINE__);
351 TRACE::print("\tOCL Create B:%p\n", buffer);
353 return buffer;
354 }
356 void Device::ReleaseBuffer(cl_mem M)
357 {
358 TRACE::print("\tOCL Release B:%p\n", M);
359 clReleaseMemObject(M);
360 }
362 /// Release resources associated with an OpenCL device
363 Device::~Device()
364 {
365 TRACE::print("\tOCL Device: deleted\n");
366 for (unsigned int i = 0; i < device_ids_m.size(); i++)
367 {
368 clFinish(queue_m[i]);
369 clReleaseCommandQueue (queue_m[i]);
370 }
372 clReleaseProgram (program_m);
373 clReleaseContext (context_m);
374 }
376 void errorCheck(cl_int ret, int line)
377 {
378 if (ret != CL_SUCCESS)
379 {
380 std::cerr << "ERROR: [ Line: " << line << "] " << error2string(ret) << std::endl;
381 exit(ret);
382 }
383 }
385 /// Convert OpenCL error codes to a string
386 const char* error2string(cl_int err)
387 {
388 switch(err)
389 {
390 case 0: return "CL_SUCCESS";
391 case -1: return "CL_DEVICE_NOT_FOUND";
392 case -2: return "CL_DEVICE_NOT_AVAILABLE";
393 case -3: return "CL_COMPILER_NOT_AVAILABLE";
394 case -4: return "CL_MEM_OBJECT_ALLOCATION_FAILURE";
395 case -5: return "CL_OUT_OF_RESOURCES";
396 case -6: return "CL_OUT_OF_HOST_MEMORY";
397 case -7: return "CL_PROFILING_INFO_NOT_AVAILABLE";
398 case -8: return "CL_MEM_COPY_OVERLAP";
399 case -9: return "CL_IMAGE_FORMAT_MISMATCH";
400 case -10: return "CL_IMAGE_FORMAT_NOT_SUPPORTED";
401 case -11: return "CL_BUILD_PROGRAM_FAILURE";
402 case -12: return "CL_MAP_FAILURE";
404 case -30: return "CL_INVALID_VALUE";
405 case -31: return "CL_INVALID_DEVICE_TYPE";
406 case -32: return "CL_INVALID_PLATFORM";
407 case -33: return "CL_INVALID_DEVICE";
408 case -34: return "CL_INVALID_CONTEXT";
409 case -35: return "CL_INVALID_QUEUE_PROPERTIES";
410 case -36: return "CL_INVALID_COMMAND_QUEUE";
411 case -37: return "CL_INVALID_HOST_PTR";
412 case -38: return "CL_INVALID_MEM_OBJECT";
413 case -39: return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR";
414 case -40: return "CL_INVALID_IMAGE_SIZE";
415 case -41: return "CL_INVALID_SAMPLER";
416 case -42: return "CL_INVALID_BINARY";
417 case -43: return "CL_INVALID_BUILD_OPTIONS";
418 case -44: return "CL_INVALID_PROGRAM";
419 case -45: return "CL_INVALID_PROGRAM_EXECUTABLE";
420 case -46: return "CL_INVALID_KERNEL_NAME";
421 case -47: return "CL_INVALID_KERNEL_DEFINITION";
422 case -48: return "CL_INVALID_KERNEL";
423 case -49: return "CL_INVALID_ARG_INDEX";
424 case -50: return "CL_INVALID_ARG_VALUE";
425 case -51: return "CL_INVALID_ARG_SIZE";
426 case -52: return "CL_INVALID_KERNEL_ARGS";
427 case -53: return "CL_INVALID_WORK_DIMENSION";
428 case -54: return "CL_INVALID_WORK_GROUP_SIZE";
429 case -55: return "CL_INVALID_WORK_ITEM_SIZE";
430 case -56: return "CL_INVALID_GLOBAL_OFFSET";
431 case -57: return "CL_INVALID_EVENT_WAIT_LIST";
432 case -58: return "CL_INVALID_EVENT";
433 case -59: return "CL_INVALID_OPERATION";
434 case -60: return "CL_INVALID_GL_OBJECT";
435 case -61: return "CL_INVALID_BUFFER_SIZE";
436 case -62: return "CL_INVALID_MIP_LEVEL";
437 case -63: return "CL_INVALID_GLOBAL_WORK_SIZE";
438 default: return "Unknown OpenCL error";
439 }
440 }
442 Device::Ptr Device::Create(DeviceType core_type, const DeviceIds& ids,
443 const std::string& name)
444 {
445 Device::Ptr p(nullptr);
446 if (core_type == DeviceType::DSP)
447 p.reset(new DspDevice(ids, name));
448 else if (core_type == DeviceType::EVE)
449 p.reset(new EveDevice(ids, name));
451 return p;
452 }
454 // Minimum version of OpenCL required for this version of TIDL API
455 #define MIN_OCL_VERSION "01.01.19.00"
456 static bool CheckOpenCLVersion(cl_platform_id id)
457 {
458 cl_int err;
459 size_t length;
460 err = clGetPlatformInfo(id, CL_PLATFORM_VERSION, 0, nullptr, &length);
461 if (err != CL_SUCCESS) return false;
463 std::unique_ptr<char[]> version(new char[length]);
464 err = clGetPlatformInfo(id, CL_PLATFORM_VERSION, length, version.get(),
465 nullptr);
466 if (err != CL_SUCCESS) return false;
468 std::string v(version.get());
470 if (v.substr(v.find("01."), sizeof(MIN_OCL_VERSION)) >= MIN_OCL_VERSION)
471 return true;
473 std::cerr << "TIDL API Error: OpenCL " << MIN_OCL_VERSION
474 << " or higher required." << std::endl;
476 return false;
477 }
479 static bool PlatformIsAM57()
480 {
481 cl_platform_id id;
482 cl_int err;
484 err = clGetPlatformIDs(1, &id, nullptr);
485 if (err != CL_SUCCESS) return false;
487 if (!CheckOpenCLVersion(id))
488 return false;
490 // Check if the device name is AM57
491 size_t length;
492 err = clGetPlatformInfo(id, CL_PLATFORM_NAME, 0, nullptr, &length);
493 if (err != CL_SUCCESS) return false;
495 std::unique_ptr<char[]> name(new char[length]);
497 err = clGetPlatformInfo(id, CL_PLATFORM_NAME, length, name.get(), nullptr);
498 if (err != CL_SUCCESS) return false;
500 std::string platform_name(name.get());
502 if (platform_name.find("AM57") == std::string::npos)
503 return false;
505 return true;
506 }
508 // TI DL is supported on AM57x - EVE or C66x devices
509 bool Device::GetDevices(DeviceType device_type,
510 cl_device_id cl_d_ids[],
511 cl_uint *p_num_devices,
512 cl_uint *p_num_compute_units)
513 {
514 if (!PlatformIsAM57()) return false;
516 // Convert DeviceType to OpenCL device type
517 cl_device_type t = CL_DEVICE_TYPE_CUSTOM;
519 // Find all the OpenCL custom devices available
520 cl_uint num_devices_found;
521 cl_device_id all_device_ids[MAX_DEVICES];
523 cl_int errcode = clGetDeviceIDs(0, // platform
524 t, // device_type
525 MAX_DEVICES, // num_entries
526 all_device_ids, // devices
527 &num_devices_found); // num_devices
530 if (errcode != CL_SUCCESS) return false;
531 if (num_devices_found == 0) return false;
533 // Find devices according to device_type
534 // DSP: ACCELERATOR | CUSTOM
535 // EVE: CUSTOM
536 cl_uint num_devices = 0;
537 for (cl_uint i = 0; i < num_devices_found; i++)
538 {
539 cl_device_type cl_d_type;
540 errcode = clGetDeviceInfo(all_device_ids[i], CL_DEVICE_TYPE,
541 sizeof(cl_device_type), &cl_d_type, nullptr);
542 if (errcode != CL_SUCCESS) return false;
544 if ((device_type == DeviceType::DSP &&
545 ((cl_d_type & CL_DEVICE_TYPE_ACCELERATOR) != 0)) ||
546 (device_type == DeviceType::EVE &&
547 ((cl_d_type & CL_DEVICE_TYPE_ACCELERATOR) == 0)))
548 cl_d_ids[num_devices++] = all_device_ids[i];
549 }
550 if (p_num_devices != nullptr) *p_num_devices = num_devices;
552 // DSP, return the number of compute units
553 if (device_type == DeviceType::DSP &&
554 num_devices > 0 && p_num_compute_units != nullptr)
555 {
556 errcode = clGetDeviceInfo(cl_d_ids[0],
557 CL_DEVICE_MAX_COMPUTE_UNITS,
558 sizeof(cl_int),
559 p_num_compute_units,
560 nullptr);
561 if (errcode != CL_SUCCESS) return false;
562 }
564 return true;
565 }
567 uint32_t Device::GetNumDevices(DeviceType device_type)
568 {
569 cl_device_id cl_d_ids[MAX_DEVICES];
570 cl_uint num_devices = 0;
571 cl_uint num_cus = 0;
573 if (! GetDevices(device_type, cl_d_ids, &num_devices, &num_cus)) return 0;
575 // EVE, return the number of devices since each EVE is a device
576 // DSP, return the number of compute units since we maintain a
577 // queue to each compute unit (i.e. C66x DSP)
578 return device_type == DeviceType::EVE ? num_devices : num_cus;
579 }