1 /******************************************************************************
2 * Copyright (c) 2017-2018 Texas Instruments Incorporated - http://www.ti.com/
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Texas Instruments Incorporated nor the
13 * names of its contributors may be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
26 * THE POSSIBILITY OF SUCH DAMAGE.
27 *****************************************************************************/
29 /*! \file execution_object.cpp */
31 #include <string.h>
32 #include <fstream>
33 #include <climits>
34 #include <mutex>
35 #include <condition_variable>
36 #include <chrono>
37 #include "executor.h"
38 #include "execution_object.h"
39 #include "trace.h"
40 #include "ocl_device.h"
41 #include "parameters.h"
42 #include "common_defines.h"
43 #include "tidl_create_params.h"
44 #include "device_arginfo.h"
45 #include "util.h"
47 using namespace tidl;
49 class ExecutionObject::Impl
50 {
51 public:
52 Impl(Device* d, DeviceType t, uint8_t device_index,
53 const DeviceArgInfo& create_arg,
54 const DeviceArgInfo& param_heap_arg,
55 const Configuration& configuration,
56 int layers_group_id);
57 ~Impl() {}
59 bool RunAsync(CallType ct, uint32_t context_idx);
60 bool Wait (CallType ct, uint32_t context_idx);
61 bool AddCallback(CallType ct, void *user_data, uint32_t context_idx);
63 uint64_t GetProcessCycles(uint32_t context_idx) const;
64 int GetLayersGroupId() const;
65 void AcquireContext(uint32_t& context_idx);
66 void ReleaseContext(uint32_t context_idx);
68 // Trace related
69 void WriteLayerOutputsToFile (const std::string& filename_prefix) const;
70 const LayerOutput* GetOutputFromLayer (uint32_t layer_index,
71 uint32_t output_index) const;
72 const LayerOutputs* GetOutputsFromAllLayers() const;
75 Device* device_m;
76 DeviceType device_type_m;
78 // Index of the OpenCL device/queue used by this EO
79 uint8_t device_index_m;
80 std::string device_name_m;
82 up_malloc_ddr<char> tidl_extmem_heap_m;
83 up_malloc_ddr<OCL_TIDL_InitializeParams> shared_initialize_params_m;
84 up_malloc_ddr<OCL_TIDL_ProcessParams> shared_process_params_m;
86 size_t in_size_m;
87 size_t out_size_m;
88 IODeviceArgInfo in_m[tidl::internal::NUM_CONTEXTS];
89 IODeviceArgInfo out_m[tidl::internal::NUM_CONTEXTS];
91 // Frame being processed by the EO
92 int current_frame_idx_m[tidl::internal::NUM_CONTEXTS];
94 // LayersGroupId being processed by the EO
95 int layers_group_id_m;
97 uint32_t num_network_layers_m;
98 up_malloc_ddr<OCL_TIDL_BufParams> trace_buf_params_m;
99 size_t trace_buf_params_sz_m;
101 private:
102 void SetupInitializeKernel(const DeviceArgInfo& create_arg,
103 const DeviceArgInfo& param_heap_arg);
104 void EnableOutputBufferTrace();
105 void SetupProcessKernel();
107 void HostWriteNetInput(uint32_t context_idx);
108 void HostReadNetOutput(uint32_t context_idx);
109 void ComputeInputOutputSizes();
111 std::unique_ptr<Kernel> k_initialize_m;
112 std::unique_ptr<Kernel> k_process_m;
113 std::unique_ptr<Kernel> k_cleanup_m;
115 // Guarding sole access to input/output for one frame during execution
116 // Encoding: context at bit index, bit value: 0 for idle, 1 for busy
117 uint32_t idle_encoding_m;
118 std::mutex mutex_access_m;
119 std::condition_variable cv_access_m;
121 const Configuration configuration_m;
122 };
125 ExecutionObject::ExecutionObject(Device* d,
126 DeviceType t,
127 uint8_t device_index,
128 const ArgInfo& create_arg,
129 const ArgInfo& param_heap_arg,
130 const Configuration& configuration,
131 int layers_group_id)
132 {
133 TRACE::print("-> ExecutionObject::ExecutionObject()\n");
135 DeviceArgInfo create_arg_d(create_arg, DeviceArgInfo::Kind::BUFFER);
136 DeviceArgInfo param_heap_arg_d(param_heap_arg, DeviceArgInfo::Kind::BUFFER);
138 pimpl_m = std::unique_ptr<ExecutionObject::Impl>
139 { new ExecutionObject::Impl(d, t, device_index,
140 create_arg_d,
141 param_heap_arg_d,
142 configuration,
143 layers_group_id) };
144 TRACE::print("<- ExecutionObject::ExecutionObject()\n");
145 }
148 ExecutionObject::Impl::Impl(Device* d, DeviceType t, uint8_t device_index,
149 const DeviceArgInfo& create_arg,
150 const DeviceArgInfo& param_heap_arg,
151 const Configuration& configuration,
152 int layers_group_id):
153 device_m(d),
154 device_type_m(t),
155 device_index_m(device_index),
156 tidl_extmem_heap_m (nullptr, &__free_ddr),
157 shared_initialize_params_m(nullptr, &__free_ddr),
158 shared_process_params_m(nullptr, &__free_ddr),
159 in_size_m(0),
160 out_size_m(0),
161 layers_group_id_m(layers_group_id),
162 num_network_layers_m(0),
163 trace_buf_params_m(nullptr, &__free_ddr),
164 trace_buf_params_sz_m(0),
165 k_initialize_m(nullptr),
166 k_process_m(nullptr),
167 k_cleanup_m(nullptr),
168 idle_encoding_m(0), // all contexts are idle
169 configuration_m(configuration)
170 {
171 device_name_m = device_m->GetDeviceName() + std::to_string(device_index_m);
172 // Save number of layers in the network
173 const TIDL_CreateParams* cp =
174 static_cast<const TIDL_CreateParams *>(create_arg.ptr());
175 num_network_layers_m = cp->net.numLayers;
177 SetupInitializeKernel(create_arg, param_heap_arg);
179 if (configuration_m.enableOutputTrace)
180 EnableOutputBufferTrace();
182 SetupProcessKernel();
184 for (int i = 0; i < tidl::internal::NUM_CONTEXTS; i++)
185 current_frame_idx_m[i] = 0;
186 }
188 // Pointer to implementation idiom: https://herbsutter.com/gotw/_100/:
189 // Both unique_ptr and shared_ptr can be instantiated with an incomplete type
190 // unique_ptr's destructor requires a complete type in order to invoke delete
191 ExecutionObject::~ExecutionObject() = default;
193 char* ExecutionObject::GetInputBufferPtr() const
194 {
195 return static_cast<char *>(pimpl_m->in_m[0].GetArg().ptr());
196 }
198 size_t ExecutionObject::GetInputBufferSizeInBytes() const
199 {
200 return pimpl_m->in_size_m;
201 }
203 char* ExecutionObject::GetOutputBufferPtr() const
204 {
205 return static_cast<char *>(pimpl_m->out_m[0].GetArg().ptr());
206 }
208 size_t ExecutionObject::GetOutputBufferSizeInBytes() const
209 {
210 return pimpl_m->out_size_m;
211 }
213 void ExecutionObject::SetFrameIndex(int idx)
214 {
215 pimpl_m->current_frame_idx_m[0] = idx;
216 }
218 int ExecutionObject::GetFrameIndex() const
219 {
220 return pimpl_m->current_frame_idx_m[0];
221 }
223 void ExecutionObject::SetInputOutputBuffer(const ArgInfo& in,
224 const ArgInfo& out)
225 {
226 pimpl_m->in_m[0] = IODeviceArgInfo(in);
227 pimpl_m->out_m[0] = IODeviceArgInfo(out);
228 }
230 bool ExecutionObject::ProcessFrameStartAsync()
231 {
232 return pimpl_m->RunAsync(ExecutionObject::CallType::PROCESS, 0);
233 }
235 bool ExecutionObject::ProcessFrameWait()
236 {
237 return pimpl_m->Wait(ExecutionObject::CallType::PROCESS, 0);
238 }
240 bool ExecutionObject::RunAsync (CallType ct)
241 {
242 return pimpl_m->RunAsync(ct, 0);
243 }
245 bool ExecutionObject::Wait (CallType ct)
246 {
247 return pimpl_m->Wait(ct, 0);
248 }
251 bool ExecutionObject::AcquireAndRunContext(uint32_t& context_idx,
252 int frame_idx,
253 const IODeviceArgInfo& in,
254 const IODeviceArgInfo& out)
255 {
256 pimpl_m->AcquireContext(context_idx);
258 pimpl_m->current_frame_idx_m[context_idx] = frame_idx;
259 pimpl_m->in_m[context_idx] = in;
260 pimpl_m->out_m[context_idx] = out;
262 return pimpl_m->RunAsync(ExecutionObject::CallType::PROCESS,
263 context_idx);
264 }
266 bool ExecutionObject::WaitAndReleaseContext(uint32_t context_idx)
267 {
268 TRACE::print("-> ExecutionObject::WaitAndReleaseContext(%d)\n",
269 context_idx);
271 bool status = pimpl_m->Wait(ExecutionObject::CallType::PROCESS,
272 context_idx);
273 pimpl_m->ReleaseContext(context_idx);
275 return status;
276 }
278 bool ExecutionObject::AddCallback(CallType ct, void *user_data,
279 uint32_t context_idx)
280 {
281 return pimpl_m->AddCallback(ct, user_data, context_idx);
282 }
284 float ExecutionObject::GetProcessTimeInMilliSeconds() const
285 {
286 float frequency = pimpl_m->device_m->GetFrequencyInMhz() * 1000000;
287 return ((float)pimpl_m->GetProcessCycles(0)) / frequency * 1000;
288 }
290 void
291 ExecutionObject::WriteLayerOutputsToFile(const std::string& filename_prefix) const
292 {
293 pimpl_m->WriteLayerOutputsToFile(filename_prefix);
294 }
296 const LayerOutput* ExecutionObject::GetOutputFromLayer(
297 uint32_t layer_index, uint32_t output_index) const
298 {
299 return pimpl_m->GetOutputFromLayer(layer_index, output_index);
300 }
302 const LayerOutputs* ExecutionObject::GetOutputsFromAllLayers() const
303 {
304 return pimpl_m->GetOutputsFromAllLayers();
305 }
307 int ExecutionObject::GetLayersGroupId() const
308 {
309 return pimpl_m->layers_group_id_m;
310 }
312 const std::string& ExecutionObject::GetDeviceName() const
313 {
314 return pimpl_m->device_name_m;
315 }
318 //
319 // Create a kernel to call the "initialize" function
320 //
321 void
322 ExecutionObject::Impl::SetupInitializeKernel(const DeviceArgInfo& create_arg,
323 const DeviceArgInfo& param_heap_arg)
324 {
325 // Allocate a heap for TI DL to use on the device
326 tidl_extmem_heap_m.reset(
327 malloc_ddr<char>(configuration_m.NETWORK_HEAP_SIZE));
329 // Create a kernel for cleanup
330 KernelArgs cleanup_args;
331 k_cleanup_m.reset(new Kernel(device_m,
332 STRING(CLEANUP_KERNEL),
333 cleanup_args, device_index_m));
335 // Set up parameter struct for the initialize kernel
336 shared_initialize_params_m.reset(malloc_ddr<OCL_TIDL_InitializeParams>());
337 memset(shared_initialize_params_m.get(), 0,
338 sizeof(OCL_TIDL_InitializeParams));
340 shared_initialize_params_m->tidlHeapSize =configuration_m.NETWORK_HEAP_SIZE;
341 shared_initialize_params_m->l2HeapSize = tidl::internal::DMEM1_SIZE;
342 shared_initialize_params_m->l1HeapSize = tidl::internal::DMEM0_SIZE;
343 shared_initialize_params_m->numContexts = tidl::internal::NUM_CONTEXTS;
345 // Set up execution trace specified in the configuration
346 EnableExecutionTrace(configuration_m,
347 &shared_initialize_params_m->enableTrace);
349 // Setup kernel arguments for initialize
350 KernelArgs args = { create_arg,
351 param_heap_arg,
352 DeviceArgInfo(tidl_extmem_heap_m.get(),
353 configuration_m.NETWORK_HEAP_SIZE,
354 DeviceArgInfo::Kind::BUFFER),
355 DeviceArgInfo(shared_initialize_params_m.get(),
356 sizeof(OCL_TIDL_InitializeParams),
357 DeviceArgInfo::Kind::BUFFER),
358 device_m->type() == CL_DEVICE_TYPE_ACCELERATOR ?
359 DeviceArgInfo(nullptr, tidl::internal::DMEM1_SIZE,
360 DeviceArgInfo::Kind::LOCAL):
361 DeviceArgInfo(nullptr, 4,
362 DeviceArgInfo::Kind::LOCAL) };
364 k_initialize_m.reset(new Kernel(device_m,
365 STRING(INIT_KERNEL), args,
366 device_index_m));
367 }
369 //
370 // Allocate an OpenCL buffer for TIDL layer output buffer metadata.
371 // The device will populate metadata for every buffer that is used as an
372 // output buffer by a layer. This needs to be done before setting up
373 // process kernel.
374 //
375 void ExecutionObject::Impl::EnableOutputBufferTrace()
376 {
377 trace_buf_params_sz_m = (sizeof(OCL_TIDL_BufParams)*
378 num_network_layers_m*
379 TIDL_NUM_OUT_BUFS);
381 trace_buf_params_m.reset(malloc_ddr<OCL_TIDL_BufParams>
382 (trace_buf_params_sz_m));
384 // Device will update bufferId if there is valid data for the entry
385 OCL_TIDL_BufParams* bufferParams = trace_buf_params_m.get();
386 for (uint32_t i = 0; i < num_network_layers_m; i++)
387 for (int j = 0; j < TIDL_NUM_OUT_BUFS; j++)
388 {
389 OCL_TIDL_BufParams *bufP =
390 &bufferParams[i*TIDL_NUM_OUT_BUFS+j];
391 bufP->bufferId = UINT_MAX;
392 }
393 }
395 //
396 // Create a kernel to call the "process" function
397 //
398 void
399 ExecutionObject::Impl::SetupProcessKernel()
400 {
401 shared_process_params_m.reset(malloc_ddr<OCL_TIDL_ProcessParams>(
402 tidl::internal::NUM_CONTEXTS * sizeof(OCL_TIDL_ProcessParams)));
404 // Set up execution trace specified in the configuration
405 for (int i = 0; i < tidl::internal::NUM_CONTEXTS; i++)
406 {
407 OCL_TIDL_ProcessParams *p_params = shared_process_params_m.get() + i;
408 EnableExecutionTrace(configuration_m, &p_params->enableTrace);
409 }
411 uint32_t context_idx = 0;
412 KernelArgs args = { DeviceArgInfo(shared_process_params_m.get(),
413 tidl::internal::NUM_CONTEXTS *
414 sizeof(OCL_TIDL_ProcessParams),
415 DeviceArgInfo::Kind::BUFFER),
416 DeviceArgInfo(tidl_extmem_heap_m.get(),
417 shared_initialize_params_m->tidlHeapSize,
418 DeviceArgInfo::Kind::BUFFER),
419 DeviceArgInfo(trace_buf_params_m.get(),
420 trace_buf_params_sz_m,
421 DeviceArgInfo::Kind::BUFFER),
422 DeviceArgInfo(&context_idx,
423 sizeof(uint32_t),
424 DeviceArgInfo::Kind::SCALAR)
425 };
427 k_process_m.reset(new Kernel(device_m,
428 STRING(PROCESS_KERNEL), args,
429 device_index_m));
430 }
//
// Copy densely packed 8-bit planes from readPtr into a strided destination.
//   readPtr  : packed source, roi * n planes of width * height bytes each
//   ptr      : destination; plane p starts at p * chOffset, and row r of a
//              plane starts r * pitch bytes after the plane start
// Returns the number of source bytes consumed (width * height * n * roi),
// or 0 without copying when readPtr is null.
//
static size_t readDataS8(const char *readPtr, char *ptr, int roi, int n,
                         int width, int height, int pitch,
                         int chOffset)
{
    if (readPtr == nullptr)
        return 0;

    for (int r = 0; r < roi; r++)
    {
        for (int c = 0; c < n; c++)
        {
            const int   plane     = r * n + c;
            const char *src_plane = readPtr + plane * width * height;
            char       *dst_plane = ptr + plane * chOffset;

            for (int row = 0; row < height; row++)
                memcpy(dst_plane + row * pitch,
                       src_plane + row * width,
                       width);
        }
    }

    return width*height*n*roi;
}
//
// Copy 8-bit planes from a strided source into a densely packed buffer.
//   writePtr : packed destination, n planes of width * height bytes each
//   ptr      : source; plane p starts at p * chOffset, and row r of a
//              plane starts r * pitch bytes after the plane start
// Returns the number of bytes written (width * height * n), or 0 without
// copying when writePtr is null.
//
static size_t writeDataS8(char *writePtr, const char *ptr, int n, int width,
                          int height, int pitch, int chOffset)
{
    if (writePtr == nullptr)
        return 0;

    for (int c = 0; c < n; c++)
    {
        char       *dst_plane = writePtr + c * width * height;
        const char *src_plane = ptr + c * chOffset;

        for (int row = 0; row < height; row++)
            memcpy(dst_plane + row * width,
                   src_plane + row * pitch,
                   width);
    }

    return width*height*n;
}
463 //
464 // Copy from host buffer to TIDL device buffer
465 //
466 void ExecutionObject::Impl::HostWriteNetInput(uint32_t context_idx)
467 {
468 const char* readPtr = (const char *) in_m[context_idx].GetArg().ptr();
469 const PipeInfo& pipe = in_m[context_idx].GetPipe();
470 OCL_TIDL_ProcessParams *p_params = shared_process_params_m.get()
471 + context_idx;
473 for (unsigned int i = 0; i < shared_initialize_params_m->numInBufs; i++)
474 {
475 OCL_TIDL_BufParams *inBuf = &shared_initialize_params_m->inBufs[i];
476 char *inBufAddr = tidl_extmem_heap_m.get() + inBuf->bufPlaneBufOffset
477 + context_idx * inBuf->contextSize;
479 readPtr += readDataS8(
480 readPtr,
481 (char *) inBufAddr
482 + inBuf->bufPlaneWidth * OCL_TIDL_MAX_PAD_SIZE
483 + OCL_TIDL_MAX_PAD_SIZE,
484 inBuf->numROIs,
485 inBuf->numChannels,
486 inBuf->ROIWidth,
487 inBuf->ROIHeight,
488 inBuf->bufPlaneWidth,
489 ((inBuf->bufPlaneWidth * inBuf->bufPlaneHeight) /
490 inBuf->numChannels));
492 p_params->dataQ[i] = pipe.dataQ_m[i];
493 }
494 }
496 //
497 // Copy from TIDL device buffer into host buffer
498 //
499 void ExecutionObject::Impl::HostReadNetOutput(uint32_t context_idx)
500 {
501 char* writePtr = (char *) out_m[context_idx].GetArg().ptr();
502 PipeInfo& pipe = out_m[context_idx].GetPipe();
503 OCL_TIDL_ProcessParams *p_params = shared_process_params_m.get()
504 + context_idx;
506 for (unsigned int i = 0; i < shared_initialize_params_m->numOutBufs; i++)
507 {
508 OCL_TIDL_BufParams *outBuf = &shared_initialize_params_m->outBufs[i];
509 char *outBufAddr = tidl_extmem_heap_m.get() + outBuf->bufPlaneBufOffset
510 + context_idx * outBuf->contextSize;
511 if (writePtr != nullptr)
512 {
513 writePtr += writeDataS8(
514 writePtr,
515 (char *) outBufAddr
516 + outBuf->bufPlaneWidth * OCL_TIDL_MAX_PAD_SIZE
517 + OCL_TIDL_MAX_PAD_SIZE,
518 outBuf->numChannels,
519 outBuf->ROIWidth,
520 outBuf->ROIHeight,
521 outBuf->bufPlaneWidth,
522 ((outBuf->bufPlaneWidth * outBuf->bufPlaneHeight)/
523 outBuf->numChannels));
524 }
526 pipe.dataQ_m[i] = p_params->dataQ[i];
527 }
528 }
530 void ExecutionObject::Impl::ComputeInputOutputSizes()
531 {
532 if (shared_initialize_params_m->errorCode != OCL_TIDL_SUCCESS) return;
534 if (shared_initialize_params_m->numInBufs > OCL_TIDL_MAX_IN_BUFS ||
535 shared_initialize_params_m->numOutBufs > OCL_TIDL_MAX_OUT_BUFS)
536 {
537 std::cout << "Num input/output bufs ("
538 << shared_initialize_params_m->numInBufs << ", "
539 << shared_initialize_params_m->numOutBufs
540 << ") exceeded limit!" << std::endl;
541 shared_initialize_params_m->errorCode = OCL_TIDL_INIT_FAIL;
542 return;
543 }
545 in_size_m = 0;
546 out_size_m = 0;
547 for (unsigned int i = 0; i < shared_initialize_params_m->numInBufs; i++)
548 {
549 OCL_TIDL_BufParams *inBuf = &shared_initialize_params_m->inBufs[i];
550 in_size_m += inBuf->numROIs * inBuf->numChannels * inBuf->ROIWidth *
551 inBuf->ROIHeight;
552 }
553 for (unsigned int i = 0; i < shared_initialize_params_m->numOutBufs; i++)
554 {
555 OCL_TIDL_BufParams *outBuf = &shared_initialize_params_m->outBufs[i];
556 out_size_m += outBuf->numChannels * outBuf->ROIWidth *outBuf->ROIHeight;
557 }
558 }
560 bool ExecutionObject::Impl::RunAsync(CallType ct, uint32_t context_idx)
561 {
562 switch (ct)
563 {
564 case CallType::INIT:
565 {
566 k_initialize_m->RunAsync();
567 break;
568 }
569 case CallType::PROCESS:
570 {
571 RecordEvent(current_frame_idx_m[context_idx],
572 (layers_group_id_m == 1) ? TimeStamp::EO1_PFSA_START:
573 TimeStamp::EO2_PFSA_START,
574 static_cast<int>(device_type_m),
575 device_index_m);
577 OCL_TIDL_ProcessParams *p_params = shared_process_params_m.get()
578 + context_idx;
579 p_params->frameIdx = current_frame_idx_m[context_idx];
580 HostWriteNetInput(context_idx);
581 {
582 std::unique_lock<std::mutex> lock(mutex_access_m);
583 k_process_m->UpdateScalarArg(3, sizeof(uint32_t), &context_idx);
584 k_process_m->RunAsync(context_idx);
585 }
587 RecordEvent(current_frame_idx_m[context_idx],
588 (layers_group_id_m == 1) ? TimeStamp::EO1_PFSA_END:
589 TimeStamp::EO2_PFSA_END);
590 break;
591 }
592 case CallType::CLEANUP:
593 {
594 k_cleanup_m->RunAsync();
595 break;
596 }
597 default:
598 return false;
599 }
601 return true;
602 }
604 bool ExecutionObject::Impl::Wait(CallType ct, uint32_t context_idx)
605 {
606 switch (ct)
607 {
608 case CallType::INIT:
609 {
610 bool has_work = k_initialize_m->Wait();
612 if (has_work)
613 {
614 ComputeInputOutputSizes();
615 if (shared_initialize_params_m->errorCode != OCL_TIDL_SUCCESS)
616 throw Exception(shared_initialize_params_m->errorCode,
617 __FILE__, __FUNCTION__, __LINE__);
618 }
619 return has_work;
620 }
621 case CallType::PROCESS:
622 {
623 RecordEvent(current_frame_idx_m[context_idx],
624 (layers_group_id_m == 1) ? TimeStamp::EO1_PFW_START:
625 TimeStamp::EO2_PFW_START);
627 bool has_work = k_process_m->Wait(context_idx);
628 if (has_work)
629 {
630 OCL_TIDL_ProcessParams *p_params = shared_process_params_m.get()
631 + context_idx;
632 if (p_params->errorCode != OCL_TIDL_SUCCESS)
633 throw Exception(p_params->errorCode,
634 __FILE__, __FUNCTION__, __LINE__);
636 HostReadNetOutput(context_idx);
638 RecordEvent(current_frame_idx_m[context_idx],
639 (layers_group_id_m == 1) ? TimeStamp::EO1_PFW_END:
640 TimeStamp::EO2_PFW_END);
641 }
642 else
643 {
644 // If there is no work, reset start event time
645 ResetEvent(current_frame_idx_m[context_idx],
646 (layers_group_id_m == 1) ? TimeStamp::EO1_PFW_START:
647 TimeStamp::EO2_PFW_START);
648 }
650 return has_work;
651 }
652 case CallType::CLEANUP:
653 {
654 return k_cleanup_m->Wait();
655 break;
656 }
657 default:
658 return false;
659 }
661 return false;
662 }
664 bool ExecutionObject::Impl::AddCallback(CallType ct, void *user_data,
665 uint32_t context_idx)
666 {
667 switch (ct)
668 {
669 case CallType::PROCESS:
670 {
671 return k_process_m->AddCallback(user_data, context_idx);
672 break;
673 }
674 default:
675 return false;
676 }
678 return false;
679 }
681 uint64_t ExecutionObject::Impl::GetProcessCycles(uint32_t context_idx) const
682 {
683 uint8_t factor = 1;
685 // ARP32 running at half frequency of VCOP, multiply by 2 for VCOP cycles
686 if (device_type_m == DeviceType::EVE)
687 factor = 2;
689 OCL_TIDL_ProcessParams *p_params = shared_process_params_m.get() +
690 context_idx;
691 return p_params->cycles * factor;
692 }
694 //
695 // Write the trace data to output files
696 //
697 void
698 ExecutionObject::Impl::WriteLayerOutputsToFile(const std::string& filename_prefix) const
699 {
700 if (trace_buf_params_sz_m == 0)
701 return;
703 OCL_TIDL_BufParams* bufferParams = trace_buf_params_m.get();
705 for (uint32_t i = 0; i < num_network_layers_m; i++)
706 for (int j = 0; j < TIDL_NUM_OUT_BUFS; j++)
707 {
708 OCL_TIDL_BufParams* buf = &bufferParams[i*TIDL_NUM_OUT_BUFS+j];
710 if (buf->bufferId == UINT_MAX)
711 continue;
713 size_t buffer_size = buf->numChannels * buf->ROIHeight *
714 buf->ROIWidth;
716 char *tmp = new char[buffer_size];
718 if (tmp == nullptr)
719 throw Exception("Out of memory, new failed",
720 __FILE__, __FUNCTION__, __LINE__);
722 writeDataS8(
723 tmp,
724 (char *) tidl_extmem_heap_m.get() + buf->bufPlaneBufOffset
725 + buf->bufPlaneWidth * OCL_TIDL_MAX_PAD_SIZE
726 + OCL_TIDL_MAX_PAD_SIZE,
727 buf->numChannels,
728 buf->ROIWidth,
729 buf->ROIHeight,
730 buf->bufPlaneWidth,
731 ((buf->bufPlaneWidth * buf->bufPlaneHeight)/
732 buf->numChannels));
734 std::string filename(filename_prefix);
735 filename += std::to_string(buf->bufferId) + "_";
736 filename += std::to_string(buf->ROIWidth) + "x";
737 filename += std::to_string(buf->ROIHeight) + ".bin";
739 std::ofstream ofs;
740 ofs.open(filename, std::ofstream::out);
741 ofs.write(tmp, buffer_size);
742 ofs.close();
744 delete[] tmp;
745 }
746 }
749 const LayerOutput* ExecutionObject::Impl::GetOutputFromLayer(
750 uint32_t layer_index, uint32_t output_index) const
751 {
752 if (trace_buf_params_sz_m == 0)
753 return nullptr;
755 if (layer_index > num_network_layers_m || output_index > TIDL_NUM_OUT_BUFS)
756 return nullptr;
758 OCL_TIDL_BufParams* bufferParams = trace_buf_params_m.get();
759 OCL_TIDL_BufParams* buf = &bufferParams[layer_index*TIDL_NUM_OUT_BUFS+
760 output_index];
762 if (buf->bufferId == UINT_MAX)
763 return nullptr;
765 size_t buffer_size = buf->numChannels * buf->ROIHeight *
766 buf->ROIWidth;
768 char *data = new char[buffer_size];
770 if (data == nullptr)
771 throw Exception("Out of memory, new failed",
772 __FILE__, __FUNCTION__, __LINE__);
774 writeDataS8(data,
775 (char *) tidl_extmem_heap_m.get() + buf->bufPlaneBufOffset
776 + buf->bufPlaneWidth * OCL_TIDL_MAX_PAD_SIZE
777 + OCL_TIDL_MAX_PAD_SIZE,
778 buf->numChannels,
779 buf->ROIWidth,
780 buf->ROIHeight,
781 buf->bufPlaneWidth,
782 ((buf->bufPlaneWidth * buf->bufPlaneHeight)/
783 buf->numChannels));
785 return new LayerOutput(layer_index, output_index, buf->bufferId,
786 buf->numROIs, buf->numChannels, buf->ROIHeight,
787 buf->ROIWidth, data);
788 }
790 const LayerOutputs* ExecutionObject::Impl::GetOutputsFromAllLayers() const
791 {
792 LayerOutputs* result = new LayerOutputs;
794 for (uint32_t i=0; i < num_network_layers_m; i++)
795 for (int j=0; j < TIDL_NUM_OUT_BUFS; j++)
796 {
797 const LayerOutput* lo = GetOutputFromLayer(i, j);
798 if (lo)
799 result->push_back(std::unique_ptr<const LayerOutput>{ lo });
800 }
802 return result;
803 }
805 LayerOutput::LayerOutput(int layer_index, int output_index, int buffer_id,
806 int num_roi, int num_channels, size_t height,
807 size_t width, const char* data):
808 layer_index_m(layer_index), buffer_id_m(buffer_id),
809 num_roi_m(num_roi), num_channels_m(num_channels),
810 height_m(height), width_m(width), data_m(data)
811 { }
813 LayerOutput::~LayerOutput()
814 {
815 delete[] data_m;
816 }
818 void ExecutionObject::Impl::AcquireContext(uint32_t& context_idx)
819 {
820 std::unique_lock<std::mutex> lock(mutex_access_m);
821 cv_access_m.wait(lock, [this]{ return this->idle_encoding_m <
822 (1 << tidl::internal::NUM_CONTEXTS) - 1; });
824 for (uint32_t i = 0; i < tidl::internal::NUM_CONTEXTS; i++)
825 if (((1 << i) & idle_encoding_m) == 0)
826 {
827 context_idx = i;
828 break;
829 }
830 idle_encoding_m |= (1 << context_idx); // mark the bit as busy
831 }
833 void ExecutionObject::Impl::ReleaseContext(uint32_t context_idx)
834 {
835 {
836 std::unique_lock<std::mutex> lock(mutex_access_m);
837 idle_encoding_m &= (~(1 << context_idx)); // mark the bit as free
838 }
839 cv_access_m.notify_all();
840 }