1 /******************************************************************************
2 * Copyright (c) 2017-2018 Texas Instruments Incorporated - http://www.ti.com/
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Texas Instruments Incorporated nor the
13 * names of its contributors may be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
26 * THE POSSIBILITY OF SUCH DAMAGE.
27 *****************************************************************************/
29 /*! \file execution_object.cpp */
#include <string.h>

#include <cassert>
#include <chrono>
#include <climits>
#include <condition_variable>
#include <fstream>
#include <memory>
#include <mutex>
#include <vector>

#include "executor.h"
#include "execution_object.h"
#include "trace.h"
#include "ocl_device.h"
#include "parameters.h"
#include "common_defines.h"
#include "tidl_create_params.h"
#include "device_arginfo.h"
46 using namespace tidl;
/*! Private implementation (pimpl) for ExecutionObject. Owns the device
 *  kernels (initialize/process/cleanup), the parameter structures shared
 *  with the device, and the TIDL heap for one execution object. */
class ExecutionObject::Impl
{
    public:
        Impl(Device* d, uint8_t device_index,
             const DeviceArgInfo& create_arg,
             const DeviceArgInfo& param_heap_arg,
             const Configuration& configuration,
             int layers_group_id);
        ~Impl() {}

        // Enqueue / wait-for / attach-callback-to the kernel selected by ct
        bool RunAsync(CallType ct);
        bool Wait    (CallType ct);
        bool AddCallback(CallType ct, void *user_data);

        // Device cycle count reported for the last process call
        uint64_t GetProcessCycles() const;
        int GetLayersGroupId() const;
        // Exclusive-use lock: AcquireLock blocks until the EO is idle
        void AcquireLock();
        void ReleaseLock();

        Device*                         device_m;
        // Index of the OpenCL device/queue used by this EO
        uint8_t                         device_index_m;
        std::string                     device_name_m;

        // DDR allocations shared with the TIDL device kernels
        up_malloc_ddr<char>                      tidl_extmem_heap_m;
        up_malloc_ddr<OCL_TIDL_InitializeParams> shared_initialize_params_m;
        up_malloc_ddr<OCL_TIDL_ProcessParams>    shared_process_params_m;

        // Total input/output byte counts, filled in by
        // ComputeInputOutputSizes() after device initialization
        size_t                          in_size_m;
        size_t                          out_size_m;
        IODeviceArgInfo                 in_m;
        IODeviceArgInfo                 out_m;

        // Frame being processed by the EO
        int                             current_frame_idx_m;

        // LayersGroupId being processed by the EO
        int                             layers_group_id_m;

        // Trace related
        void WriteLayerOutputsToFile (const std::string& filename_prefix) const;

        const LayerOutput* GetOutputFromLayer (uint32_t layer_index,
                                               uint32_t output_index) const;
        const LayerOutputs* GetOutputsFromAllLayers() const;

        uint32_t                          num_network_layers_m;
        // Per-layer output buffer metadata (only when output trace enabled)
        up_malloc_ddr<OCL_TIDL_BufParams> trace_buf_params_m;
        size_t                            trace_buf_params_sz_m;

        // host time tracking: eo start to finish
        float host_time_m;

    private:
        void SetupInitializeKernel(const DeviceArgInfo& create_arg,
                                   const DeviceArgInfo& param_heap_arg);
        void EnableOutputBufferTrace();
        void SetupProcessKernel();

        // Host <-> device copies of the network input/output planes
        void HostWriteNetInput();
        void HostReadNetOutput();
        void ComputeInputOutputSizes();

        std::unique_ptr<Kernel>         k_initialize_m;
        std::unique_ptr<Kernel>         k_process_m;
        std::unique_ptr<Kernel>         k_cleanup_m;

        // Guarding sole access to input/output for one frame during execution
        bool                            is_idle_m;
        std::mutex                      mutex_access_m;
        std::condition_variable         cv_access_m;

        const Configuration             configuration_m;
};
124 ExecutionObject::ExecutionObject(Device* d,
125 uint8_t device_index,
126 const ArgInfo& create_arg,
127 const ArgInfo& param_heap_arg,
128 const Configuration& configuration,
129 int layers_group_id)
130 {
131 TRACE::print("-> ExecutionObject::ExecutionObject()\n");
133 DeviceArgInfo create_arg_d(create_arg, DeviceArgInfo::Kind::BUFFER);
134 DeviceArgInfo param_heap_arg_d(param_heap_arg, DeviceArgInfo::Kind::BUFFER);
136 pimpl_m = std::unique_ptr<ExecutionObject::Impl>
137 { new ExecutionObject::Impl(d, device_index,
138 create_arg_d,
139 param_heap_arg_d,
140 configuration,
141 layers_group_id) };
142 TRACE::print("<- ExecutionObject::ExecutionObject()\n");
143 }
// Impl constructor: record configuration, create the three device kernels
// and (optionally) the layer-output trace buffer.
ExecutionObject::Impl::Impl(Device* d, uint8_t device_index,
                            const DeviceArgInfo& create_arg,
                            const DeviceArgInfo& param_heap_arg,
                            const Configuration& configuration,
                            int layers_group_id):
    device_m(d),
    device_index_m(device_index),
    // Device-visible allocations are released with __free_ddr
    tidl_extmem_heap_m (nullptr, &__free_ddr),
    shared_initialize_params_m(nullptr, &__free_ddr),
    shared_process_params_m(nullptr, &__free_ddr),
    in_size_m(0),
    out_size_m(0),
    in_m(),
    out_m(),
    current_frame_idx_m(0),
    layers_group_id_m(layers_group_id),
    num_network_layers_m(0),
    trace_buf_params_m(nullptr, &__free_ddr),
    trace_buf_params_sz_m(0),
    k_initialize_m(nullptr),
    k_process_m(nullptr),
    k_cleanup_m(nullptr),
    is_idle_m(true),
    configuration_m(configuration)
{
    // Unique name: device name suffixed with the queue index
    device_name_m = device_m->GetDeviceName() + std::to_string(device_index_m);

    // Save number of layers in the network
    const TIDL_CreateParams* cp =
        static_cast<const TIDL_CreateParams *>(create_arg.ptr());
    num_network_layers_m = cp->net.numLayers;

    SetupInitializeKernel(create_arg, param_heap_arg);

    // Trace buffer must be allocated before the process kernel captures
    // it as an argument in SetupProcessKernel()
    if (configuration_m.enableOutputTrace)
        EnableOutputBufferTrace();

    SetupProcessKernel();
}
// Pointer to implementation idiom: https://herbsutter.com/gotw/_100/:
// Both unique_ptr and shared_ptr can be instantiated with an incomplete type
// unique_ptr's destructor requires a complete type in order to invoke delete
// Defined here (not in the header) so Impl is complete at the point of delete.
ExecutionObject::~ExecutionObject() = default;
190 char* ExecutionObject::GetInputBufferPtr() const
191 {
192 return static_cast<char *>(pimpl_m->in_m.GetArg().ptr());
193 }
195 size_t ExecutionObject::GetInputBufferSizeInBytes() const
196 {
197 return pimpl_m->in_size_m;
198 }
200 char* ExecutionObject::GetOutputBufferPtr() const
201 {
202 return static_cast<char *>(pimpl_m->out_m.GetArg().ptr());
203 }
205 size_t ExecutionObject::GetOutputBufferSizeInBytes() const
206 {
207 return pimpl_m->out_size_m;
208 }
210 void ExecutionObject::SetFrameIndex(int idx)
211 {
212 pimpl_m->current_frame_idx_m = idx;
213 }
215 int ExecutionObject::GetFrameIndex() const
216 {
217 return pimpl_m->current_frame_idx_m;
218 }
220 void ExecutionObject::SetInputOutputBuffer(const ArgInfo& in, const ArgInfo& out)
221 {
222 assert(in.ptr() != nullptr && in.size() >= pimpl_m->in_size_m);
223 assert(out.ptr() != nullptr && out.size() >= pimpl_m->out_size_m);
225 pimpl_m->in_m = IODeviceArgInfo(in);
226 pimpl_m->out_m = IODeviceArgInfo(out);
227 }
229 void ExecutionObject::SetInputOutputBuffer(const IODeviceArgInfo* in,
230 const IODeviceArgInfo* out)
231 {
232 pimpl_m->in_m = *in;
233 pimpl_m->out_m = *out;
234 }
236 bool ExecutionObject::ProcessFrameStartAsync()
237 {
238 TRACE::print("-> ExecutionObject::ProcessFrameStartAsync()\n");
239 assert(GetInputBufferPtr() != nullptr && GetOutputBufferPtr() != nullptr);
240 return pimpl_m->RunAsync(ExecutionObject::CallType::PROCESS);
241 }
243 bool ExecutionObject::ProcessFrameWait()
244 {
245 TRACE::print("-> ExecutionObject::ProcessFrameWait()\n");
246 return pimpl_m->Wait(ExecutionObject::CallType::PROCESS);
247 }
249 bool ExecutionObject::RunAsync (CallType ct)
250 {
251 return pimpl_m->RunAsync(ct);
252 }
254 bool ExecutionObject::Wait (CallType ct)
255 {
256 return pimpl_m->Wait(ct);
257 }
259 bool ExecutionObject::AddCallback(CallType ct, void *user_data)
260 {
261 return pimpl_m->AddCallback(ct, user_data);
262 }
264 float ExecutionObject::GetProcessTimeInMilliSeconds() const
265 {
266 float frequency = pimpl_m->device_m->GetFrequencyInMhz() * 1000000;
267 return ((float)pimpl_m->GetProcessCycles()) / frequency * 1000;
268 }
270 float ExecutionObject::GetHostProcessTimeInMilliSeconds() const
271 {
272 return pimpl_m->host_time_m;
273 }
275 void
276 ExecutionObject::WriteLayerOutputsToFile(const std::string& filename_prefix) const
277 {
278 pimpl_m->WriteLayerOutputsToFile(filename_prefix);
279 }
281 const LayerOutput* ExecutionObject::GetOutputFromLayer(
282 uint32_t layer_index, uint32_t output_index) const
283 {
284 return pimpl_m->GetOutputFromLayer(layer_index, output_index);
285 }
287 const LayerOutputs* ExecutionObject::GetOutputsFromAllLayers() const
288 {
289 return pimpl_m->GetOutputsFromAllLayers();
290 }
292 int ExecutionObject::GetLayersGroupId() const
293 {
294 return pimpl_m->layers_group_id_m;
295 }
297 const std::string& ExecutionObject::GetDeviceName() const
298 {
299 return pimpl_m->device_name_m;
300 }
302 void ExecutionObject::AcquireLock()
303 {
304 pimpl_m->AcquireLock();
305 }
307 void ExecutionObject::ReleaseLock()
308 {
309 pimpl_m->ReleaseLock();
310 }
//
// Create a kernel to call the "initialize" function
//
void
ExecutionObject::Impl::SetupInitializeKernel(const DeviceArgInfo& create_arg,
                                             const DeviceArgInfo& param_heap_arg)
{
    // Allocate a heap for TI DL to use on the device
    tidl_extmem_heap_m.reset(
        malloc_ddr<char>(configuration_m.NETWORK_HEAP_SIZE));

    // Create a kernel for cleanup
    KernelArgs cleanup_args;
    k_cleanup_m.reset(new Kernel(device_m,
                                 STRING(CLEANUP_KERNEL),
                                 cleanup_args, device_index_m));

    // Set up parameter struct for the initialize kernel
    shared_initialize_params_m.reset(malloc_ddr<OCL_TIDL_InitializeParams>());
    memset(shared_initialize_params_m.get(), 0,
           sizeof(OCL_TIDL_InitializeParams));

    // Heap sizes for device-side allocations: external DDR heap plus the
    // two on-chip scratch memories (DMEM0/DMEM1)
    shared_initialize_params_m->tidlHeapSize =configuration_m.NETWORK_HEAP_SIZE;
    shared_initialize_params_m->l2HeapSize = tidl::internal::DMEM1_SIZE;
    shared_initialize_params_m->l1HeapSize = tidl::internal::DMEM0_SIZE;
    shared_initialize_params_m->enableInternalInput =
        configuration_m.enableInternalInput ? 1 : 0;

    // Set up execution trace specified in the configuration
    EnableExecutionTrace(configuration_m,
                         &shared_initialize_params_m->enableTrace);

    // Setup kernel arguments for initialize. Argument order must match the
    // device-side kernel signature.
    KernelArgs args = { create_arg,
                        param_heap_arg,
                        DeviceArgInfo(tidl_extmem_heap_m.get(),
                                      configuration_m.NETWORK_HEAP_SIZE,
                                      DeviceArgInfo::Kind::BUFFER),
                        DeviceArgInfo(shared_initialize_params_m.get(),
                                      sizeof(OCL_TIDL_InitializeParams),
                                      DeviceArgInfo::Kind::BUFFER),
                        // Local (on-chip) scratch: full DMEM1 on accelerator
                        // devices, a token 4-byte allocation otherwise
                        device_m->type() == CL_DEVICE_TYPE_ACCELERATOR ?
                            DeviceArgInfo(nullptr, tidl::internal::DMEM1_SIZE,
                                          DeviceArgInfo::Kind::LOCAL):
                            DeviceArgInfo(nullptr, 4,
                                          DeviceArgInfo::Kind::LOCAL) };

    k_initialize_m.reset(new Kernel(device_m,
                                    STRING(INIT_KERNEL), args,
                                    device_index_m));
}
364 //
365 // Allocate an OpenCL buffer for TIDL layer output buffer metadata.
366 // The device will populate metadata for every buffer that is used as an
367 // output buffer by a layer. This needs to be done before setting up
368 // process kernel.
369 //
370 void ExecutionObject::Impl::EnableOutputBufferTrace()
371 {
372 trace_buf_params_sz_m = (sizeof(OCL_TIDL_BufParams)*
373 num_network_layers_m*
374 TIDL_NUM_OUT_BUFS);
376 trace_buf_params_m.reset(malloc_ddr<OCL_TIDL_BufParams>
377 (trace_buf_params_sz_m));
379 // Device will update bufferId if there is valid data for the entry
380 OCL_TIDL_BufParams* bufferParams = trace_buf_params_m.get();
381 for (uint32_t i = 0; i < num_network_layers_m; i++)
382 for (int j = 0; j < TIDL_NUM_OUT_BUFS; j++)
383 {
384 OCL_TIDL_BufParams *bufP =
385 &bufferParams[i*TIDL_NUM_OUT_BUFS+j];
386 bufP->bufferId = UINT_MAX;
387 }
388 }
//
// Create a kernel to call the "process" function
//
void
ExecutionObject::Impl::SetupProcessKernel()
{
    // Process parameters shared with the device; inherit the internal-input
    // setting chosen during initialization setup
    shared_process_params_m.reset(malloc_ddr<OCL_TIDL_ProcessParams>());
    shared_process_params_m->enableInternalInput =
        shared_initialize_params_m->enableInternalInput;
    shared_process_params_m->cycles = 0;

    // Set up execution trace specified in the configuration
    EnableExecutionTrace(configuration_m,
                         &shared_process_params_m->enableTrace);

    // Argument order must match the device-side kernel signature
    KernelArgs args = { DeviceArgInfo(shared_process_params_m.get(),
                                      sizeof(OCL_TIDL_ProcessParams),
                                      DeviceArgInfo::Kind::BUFFER),
                        DeviceArgInfo(tidl_extmem_heap_m.get(),
                                      shared_initialize_params_m->tidlHeapSize,
                                      DeviceArgInfo::Kind::BUFFER),
                        // Layer-output trace buffer (null/0 when trace is
                        // disabled; see EnableOutputBufferTrace)
                        DeviceArgInfo(trace_buf_params_m.get(),
                                      trace_buf_params_sz_m,
                                      DeviceArgInfo::Kind::BUFFER)
                      };

    k_process_m.reset(new Kernel(device_m,
                                 STRING(PROCESS_KERNEL), args,
                                 device_index_m));
}
//
// De-linearize packed host data into the padded TIDL plane layout.
// Copies roi x n planes of width x height bytes from the contiguous
// source readPtr into ptr, whose rows are 'pitch' bytes apart and whose
// channels are 'chOffset' bytes apart.
// Returns the number of source bytes consumed, 0 if readPtr is null.
//
static size_t readDataS8(const char *readPtr, char *ptr, int roi, int n,
                         int width, int height, int pitch,
                         int chOffset)
{
    if (!readPtr)  return 0;

    for (int i2 = 0; i2 < roi; i2++)
        for (int i0 = 0; i0 < n; i0++)
            for (int i1 = 0; i1 < height; i1++)
                memcpy(&ptr[i2*n*chOffset + i0*chOffset + i1*pitch],
                       &readPtr[i2*n*width*height + i0*width*height+ i1*width],
                       width);

    // Fix: promote to size_t before multiplying — the product of four ints
    // can overflow 32-bit int for large networks even though each factor
    // fits, yielding a bogus (or negative-wrapped) byte count.
    return static_cast<size_t>(width) * height * n * roi;
}
//
// Linearize a padded TIDL plane into a packed host buffer.
// Copies n planes of width x height bytes from ptr (rows 'pitch' bytes
// apart, channels 'chOffset' bytes apart) into the contiguous writePtr.
// Returns the number of destination bytes written, 0 if writePtr is null.
//
static size_t writeDataS8(char *writePtr, const char *ptr, int n, int width,
                          int height, int pitch, int chOffset)
{
    if (!writePtr)  return 0;

    for (int i0 = 0; i0 < n; i0++)
        for (int i1 = 0; i1 < height; i1++)
            memcpy(&writePtr[i0*width*height + i1*width],
                   &ptr[i0*chOffset + i1*pitch],
                   width);

    // Fix: promote to size_t before multiplying — int multiplication can
    // overflow for large buffers even though each factor fits in int.
    return static_cast<size_t>(width) * height * n;
}
//
// Copy from host buffer to TIDL device buffer
//
void ExecutionObject::Impl::HostWriteNetInput()
{
    const char* readPtr = (const char *) in_m.GetArg().ptr();
    const PipeInfo& pipe = in_m.GetPipe();

    for (unsigned int i = 0; i < shared_initialize_params_m->numInBufs; i++)
    {
        OCL_TIDL_BufParams *inBuf = &shared_initialize_params_m->inBufs[i];
        if (shared_process_params_m->enableInternalInput == 0)
        {
            // De-linearize the packed host frame into this input's device
            // plane, skipping OCL_TIDL_MAX_PAD_SIZE rows and columns of
            // padding at the start of the plane. readPtr advances by the
            // bytes consumed so multiple input buffers read consecutively.
            readPtr += readDataS8(
                readPtr,
                (char *) tidl_extmem_heap_m.get() + inBuf->bufPlaneBufOffset
                    + inBuf->bufPlaneWidth * OCL_TIDL_MAX_PAD_SIZE
                    + OCL_TIDL_MAX_PAD_SIZE,
                inBuf->numROIs,
                inBuf->numChannels,
                inBuf->ROIWidth,
                inBuf->ROIHeight,
                inBuf->bufPlaneWidth,
                // chOffset: per-channel stride within the plane
                ((inBuf->bufPlaneWidth * inBuf->bufPlaneHeight) /
                 inBuf->numChannels));
        }
        else
        {
            // Internal input: device reads directly from the pipe address
            // (no host-side copy)
            shared_process_params_m->inBufAddr[i] = pipe.bufAddr_m[i];
        }

        shared_process_params_m->inDataQ[i] = pipe.dataQ_m[i];
    }
}
//
// Copy from TIDL device buffer into host buffer
//
void ExecutionObject::Impl::HostReadNetOutput()
{
    char* writePtr = (char *) out_m.GetArg().ptr();
    PipeInfo& pipe = out_m.GetPipe();

    for (unsigned int i = 0; i < shared_initialize_params_m->numOutBufs; i++)
    {
        OCL_TIDL_BufParams *outBuf = &shared_initialize_params_m->outBufs[i];
        if (writePtr != nullptr)
        {
            // Linearize the padded device plane into the packed host
            // buffer; writePtr advances so multiple output buffers are
            // written consecutively
            writePtr += writeDataS8(
                writePtr,
                (char *) tidl_extmem_heap_m.get() + outBuf->bufPlaneBufOffset
                    + outBuf->bufPlaneWidth * OCL_TIDL_MAX_PAD_SIZE
                    + OCL_TIDL_MAX_PAD_SIZE,
                outBuf->numChannels,
                outBuf->ROIWidth,
                outBuf->ROIHeight,
                outBuf->bufPlaneWidth,
                // chOffset: per-channel stride within the plane
                ((outBuf->bufPlaneWidth * outBuf->bufPlaneHeight)/
                 outBuf->numChannels));
        }

        // Propagate pipe bookkeeping so a chained EO can consume this
        // output as its (internal) input
        pipe.dataQ_m[i]   = shared_process_params_m->outDataQ[i];
        pipe.bufAddr_m[i] = shared_initialize_params_m->bufAddrBase
                            + outBuf->bufPlaneBufOffset;
    }

    // Total bytes copied to the host for this frame
    shared_process_params_m->bytesWritten = writePtr -
        (char *) out_m.GetArg().ptr();
}
523 void ExecutionObject::Impl::ComputeInputOutputSizes()
524 {
525 if (shared_initialize_params_m->errorCode != OCL_TIDL_SUCCESS) return;
527 if (shared_initialize_params_m->numInBufs > OCL_TIDL_MAX_IN_BUFS ||
528 shared_initialize_params_m->numOutBufs > OCL_TIDL_MAX_OUT_BUFS)
529 {
530 std::cout << "Num input/output bufs ("
531 << shared_initialize_params_m->numInBufs << ", "
532 << shared_initialize_params_m->numOutBufs
533 << ") exceeded limit!" << std::endl;
534 shared_initialize_params_m->errorCode = OCL_TIDL_INIT_FAIL;
535 return;
536 }
538 in_size_m = 0;
539 out_size_m = 0;
540 for (unsigned int i = 0; i < shared_initialize_params_m->numInBufs; i++)
541 {
542 OCL_TIDL_BufParams *inBuf = &shared_initialize_params_m->inBufs[i];
543 in_size_m += inBuf->numROIs * inBuf->numChannels * inBuf->ROIWidth *
544 inBuf->ROIHeight;
545 }
546 for (unsigned int i = 0; i < shared_initialize_params_m->numOutBufs; i++)
547 {
548 OCL_TIDL_BufParams *outBuf = &shared_initialize_params_m->outBufs[i];
549 out_size_m += outBuf->numChannels * outBuf->ROIWidth *outBuf->ROIHeight;
550 }
551 }
554 bool ExecutionObject::Impl::RunAsync(CallType ct)
555 {
556 switch (ct)
557 {
558 case CallType::INIT:
559 {
560 k_initialize_m->RunAsync();
561 break;
562 }
563 case CallType::PROCESS:
564 {
565 std::chrono::time_point<std::chrono::steady_clock> t1, t2;
566 t1 = std::chrono::steady_clock::now();
568 shared_process_params_m->frameIdx = current_frame_idx_m;
569 shared_process_params_m->bytesWritten = 0;
570 HostWriteNetInput();
571 k_process_m->RunAsync();
573 t2 = std::chrono::steady_clock::now();
574 std::chrono::duration<float> elapsed = t2 - t1;
575 host_time_m = elapsed.count() * 1000;
576 break;
577 }
578 case CallType::CLEANUP:
579 {
580 k_cleanup_m->RunAsync();
581 break;
582 }
583 default:
584 return false;
585 }
587 return true;
588 }
// Wait for a previously enqueued kernel of the given call type.
// Returns false if there was no outstanding work for that call type.
// Throws Exception if the device reported an error code.
bool ExecutionObject::Impl::Wait(CallType ct)
{
    switch (ct)
    {
        case CallType::INIT:
        {
            bool has_work = k_initialize_m->Wait();

            if (has_work)
            {
                // Buffer parameters are only valid after the device has
                // completed initialization
                ComputeInputOutputSizes();
                if (shared_initialize_params_m->errorCode != OCL_TIDL_SUCCESS)
                    throw Exception(shared_initialize_params_m->errorCode,
                                    __FILE__, __FUNCTION__, __LINE__);
            }
            return has_work;
        }
        case CallType::PROCESS:
        {
            float host_elapsed_ms = 0.0f;
            bool has_work = k_process_m->Wait(&host_elapsed_ms);
            if (has_work)
            {
                // Check the device error code before touching the outputs
                if (shared_process_params_m->errorCode != OCL_TIDL_SUCCESS)
                    throw Exception(shared_process_params_m->errorCode,
                                    __FILE__, __FUNCTION__, __LINE__);

                // Time the host-side output readback and accumulate it,
                // together with the host time reported by the kernel wait,
                // into host_time_m (RunAsync recorded the enqueue time)
                std::chrono::time_point<std::chrono::steady_clock> t1, t2;
                t1 = std::chrono::steady_clock::now();
                HostReadNetOutput();
                t2 = std::chrono::steady_clock::now();
                std::chrono::duration<float> elapsed = t2 - t1;
                host_time_m += elapsed.count() * 1000 + host_elapsed_ms;
            }

            return has_work;
        }
        case CallType::CLEANUP:
        {
            return k_cleanup_m->Wait();
            break;
        }
        default:
            return false;
    }

    return false;
}
639 bool ExecutionObject::Impl::AddCallback(CallType ct, void *user_data)
640 {
641 switch (ct)
642 {
643 case CallType::PROCESS:
644 {
645 return k_process_m->AddCallback(user_data);
646 break;
647 }
648 default:
649 return false;
650 }
652 return false;
653 }
655 uint64_t ExecutionObject::Impl::GetProcessCycles() const
656 {
657 uint8_t factor = 1;
659 // ARP32 running at half frequency of VCOP, multiply by 2 for VCOP cycles
660 if (device_m->type() == CL_DEVICE_TYPE_CUSTOM)
661 factor = 2;
663 return shared_process_params_m.get()->cycles * factor;
664 }
666 //
667 // Write the trace data to output files
668 //
669 void
670 ExecutionObject::Impl::WriteLayerOutputsToFile(const std::string& filename_prefix) const
671 {
672 if (trace_buf_params_sz_m == 0)
673 return;
675 OCL_TIDL_BufParams* bufferParams = trace_buf_params_m.get();
677 for (uint32_t i = 0; i < num_network_layers_m; i++)
678 for (int j = 0; j < TIDL_NUM_OUT_BUFS; j++)
679 {
680 OCL_TIDL_BufParams* buf = &bufferParams[i*TIDL_NUM_OUT_BUFS+j];
682 if (buf->bufferId == UINT_MAX)
683 continue;
685 size_t buffer_size = buf->numChannels * buf->ROIHeight *
686 buf->ROIWidth;
688 char *tmp = new char[buffer_size];
690 if (tmp == nullptr)
691 throw Exception("Out of memory, new failed",
692 __FILE__, __FUNCTION__, __LINE__);
694 writeDataS8(
695 tmp,
696 (char *) tidl_extmem_heap_m.get() + buf->bufPlaneBufOffset
697 + buf->bufPlaneWidth * OCL_TIDL_MAX_PAD_SIZE
698 + OCL_TIDL_MAX_PAD_SIZE,
699 buf->numChannels,
700 buf->ROIWidth,
701 buf->ROIHeight,
702 buf->bufPlaneWidth,
703 ((buf->bufPlaneWidth * buf->bufPlaneHeight)/
704 buf->numChannels));
706 std::string filename(filename_prefix);
707 filename += std::to_string(buf->bufferId) + "_";
708 filename += std::to_string(buf->ROIWidth) + "x";
709 filename += std::to_string(buf->ROIHeight) + ".bin";
711 std::ofstream ofs;
712 ofs.open(filename, std::ofstream::out);
713 ofs.write(tmp, buffer_size);
714 ofs.close();
716 delete[] tmp;
717 }
718 }
721 const LayerOutput* ExecutionObject::Impl::GetOutputFromLayer(
722 uint32_t layer_index, uint32_t output_index) const
723 {
724 if (trace_buf_params_sz_m == 0)
725 return nullptr;
727 if (layer_index > num_network_layers_m || output_index > TIDL_NUM_OUT_BUFS)
728 return nullptr;
730 OCL_TIDL_BufParams* bufferParams = trace_buf_params_m.get();
731 OCL_TIDL_BufParams* buf = &bufferParams[layer_index*TIDL_NUM_OUT_BUFS+
732 output_index];
734 if (buf->bufferId == UINT_MAX)
735 return nullptr;
737 size_t buffer_size = buf->numChannels * buf->ROIHeight *
738 buf->ROIWidth;
740 char *data = new char[buffer_size];
742 if (data == nullptr)
743 throw Exception("Out of memory, new failed",
744 __FILE__, __FUNCTION__, __LINE__);
746 writeDataS8(data,
747 (char *) tidl_extmem_heap_m.get() + buf->bufPlaneBufOffset
748 + buf->bufPlaneWidth * OCL_TIDL_MAX_PAD_SIZE
749 + OCL_TIDL_MAX_PAD_SIZE,
750 buf->numChannels,
751 buf->ROIWidth,
752 buf->ROIHeight,
753 buf->bufPlaneWidth,
754 ((buf->bufPlaneWidth * buf->bufPlaneHeight)/
755 buf->numChannels));
757 return new LayerOutput(layer_index, output_index, buf->bufferId,
758 buf->numROIs, buf->numChannels, buf->ROIHeight,
759 buf->ROIWidth, data);
760 }
762 const LayerOutputs* ExecutionObject::Impl::GetOutputsFromAllLayers() const
763 {
764 LayerOutputs* result = new LayerOutputs;
766 for (uint32_t i=0; i < num_network_layers_m; i++)
767 for (int j=0; j < TIDL_NUM_OUT_BUFS; j++)
768 {
769 const LayerOutput* lo = GetOutputFromLayer(i, j);
770 if (lo)
771 result->push_back(std::unique_ptr<const LayerOutput>{ lo });
772 }
774 return result;
775 }
// Snapshot of one layer's output buffer. Takes ownership of 'data',
// which is released with delete[] in the destructor.
// NOTE(review): the output_index parameter is not stored by any
// initializer here — confirm against the class declaration whether an
// output-index member exists and should be set.
LayerOutput::LayerOutput(int layer_index, int output_index, int buffer_id,
                         int num_roi, int num_channels, size_t height,
                         size_t width, const char* data):
                         layer_index_m(layer_index), buffer_id_m(buffer_id),
                         num_roi_m(num_roi), num_channels_m(num_channels),
                         height_m(height), width_m(width), data_m(data)
{ }
// Releases the copied layer output data (allocated with new[] in
// ExecutionObject::Impl::GetOutputFromLayer)
LayerOutput::~LayerOutput()
{
    delete[] data_m;
}
790 void ExecutionObject::Impl::AcquireLock()
791 {
792 std::unique_lock<std::mutex> lock(mutex_access_m);
793 cv_access_m.wait(lock, [this]{ return this->is_idle_m; });
794 is_idle_m = false;
795 }
797 void ExecutionObject::Impl::ReleaseLock()
798 {
799 is_idle_m = true;
800 cv_access_m.notify_all();
801 }