1 /******************************************************************************
2 * Copyright (c) 2017-2018 Texas Instruments Incorporated - http://www.ti.com/
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Texas Instruments Incorporated nor the
13 * names of its contributors may be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
26 * THE POSSIBILITY OF SUCH DAMAGE.
27 *****************************************************************************/
29 /*! \file execution_object.cpp */
31 #include <string.h>
32 #include <fstream>
33 #include <climits>
34 #include <mutex>
35 #include <condition_variable>
36 #include <chrono>
37 #include "executor.h"
38 #include "execution_object.h"
39 #include "trace.h"
40 #include "ocl_device.h"
41 #include "parameters.h"
42 #include "configuration.h"
43 #include "common_defines.h"
44 #include "tidl_create_params.h"
45 #include "device_arginfo.h"
47 using namespace tidl;
49 class ExecutionObject::Impl
50 {
51 public:
52 Impl(Device* d, uint8_t device_index,
53 const DeviceArgInfo& create_arg,
54 const DeviceArgInfo& param_heap_arg,
55 size_t extmem_heap_size,
56 int layers_group_id,
57 bool output_trace,
58 bool internal_input);
59 ~Impl() {}
61 bool RunAsync(CallType ct);
62 bool Wait (CallType ct);
63 bool AddCallback(CallType ct, void *user_data);
65 uint64_t GetProcessCycles() const;
66 int GetLayersGroupId() const;
67 void AcquireLock();
68 void ReleaseLock();
70 Device* device_m;
71 // Index of the OpenCL device/queue used by this EO
72 uint8_t device_index_m;
73 std::string device_name_m;
75 up_malloc_ddr<char> tidl_extmem_heap_m;
76 up_malloc_ddr<OCL_TIDL_InitializeParams> shared_initialize_params_m;
77 up_malloc_ddr<OCL_TIDL_ProcessParams> shared_process_params_m;
79 size_t in_size_m;
80 size_t out_size_m;
81 IODeviceArgInfo in_m;
82 IODeviceArgInfo out_m;
84 // Frame being processed by the EO
85 int current_frame_idx_m;
87 // LayersGroupId being processed by the EO
88 int layers_group_id_m;
90 // Trace related
91 void WriteLayerOutputsToFile (const std::string& filename_prefix) const;
93 const LayerOutput* GetOutputFromLayer (uint32_t layer_index,
94 uint32_t output_index) const;
95 const LayerOutputs* GetOutputsFromAllLayers() const;
97 uint32_t num_network_layers_m;
98 up_malloc_ddr<OCL_TIDL_BufParams> trace_buf_params_m;
99 size_t trace_buf_params_sz_m;
101 // host time tracking: eo start to finish
102 float host_time_m;
104 private:
105 void SetupInitializeKernel(const DeviceArgInfo& create_arg,
106 const DeviceArgInfo& param_heap_arg,
107 size_t extmem_heap_size,
108 bool internal_input);
109 void EnableOutputBufferTrace();
110 void SetupProcessKernel();
112 void HostWriteNetInput();
113 void HostReadNetOutput();
114 void ComputeInputOutputSizes();
116 std::unique_ptr<Kernel> k_initialize_m;
117 std::unique_ptr<Kernel> k_process_m;
118 std::unique_ptr<Kernel> k_cleanup_m;
120 // Guarding sole access to input/output for one frame during execution
121 bool is_idle_m;
122 std::mutex mutex_access_m;
123 std::condition_variable cv_access_m;
124 };
127 ExecutionObject::ExecutionObject(Device* d,
128 uint8_t device_index,
129 const ArgInfo& create_arg,
130 const ArgInfo& param_heap_arg,
131 size_t extmem_heap_size,
132 int layers_group_id,
133 bool output_trace,
134 bool internal_input)
135 {
136 DeviceArgInfo create_arg_d(create_arg, DeviceArgInfo::Kind::BUFFER);
137 DeviceArgInfo param_heap_arg_d(param_heap_arg, DeviceArgInfo::Kind::BUFFER);
139 pimpl_m = std::unique_ptr<ExecutionObject::Impl>
140 { new ExecutionObject::Impl(d, device_index,
141 create_arg_d,
142 param_heap_arg_d,
143 extmem_heap_size,
144 layers_group_id,
145 output_trace,
146 internal_input) };
147 }
150 ExecutionObject::Impl::Impl(Device* d,
151 uint8_t device_index,
152 const DeviceArgInfo& create_arg,
153 const DeviceArgInfo& param_heap_arg,
154 size_t extmem_heap_size,
155 int layers_group_id,
156 bool output_trace,
157 bool internal_input):
158 device_m(d),
159 device_index_m(device_index),
160 tidl_extmem_heap_m (nullptr, &__free_ddr),
161 shared_initialize_params_m(nullptr, &__free_ddr),
162 shared_process_params_m(nullptr, &__free_ddr),
163 in_size_m(0),
164 out_size_m(0),
165 in_m(),
166 out_m(),
167 current_frame_idx_m(0),
168 layers_group_id_m(layers_group_id),
169 num_network_layers_m(0),
170 trace_buf_params_m(nullptr, &__free_ddr),
171 trace_buf_params_sz_m(0),
172 k_initialize_m(nullptr),
173 k_process_m(nullptr),
174 k_cleanup_m(nullptr),
175 is_idle_m(true)
176 {
177 device_name_m = device_m->GetDeviceName() + std::to_string(device_index_m);
178 // Save number of layers in the network
179 const TIDL_CreateParams* cp =
180 static_cast<const TIDL_CreateParams *>(create_arg.ptr());
181 num_network_layers_m = cp->net.numLayers;
183 SetupInitializeKernel(create_arg, param_heap_arg, extmem_heap_size,
184 internal_input);
186 if (output_trace) EnableOutputBufferTrace();
187 SetupProcessKernel();
188 }
190 // Pointer to implementation idiom: https://herbsutter.com/gotw/_100/:
191 // Both unique_ptr and shared_ptr can be instantiated with an incomplete type
192 // unique_ptr's destructor requires a complete type in order to invoke delete
193 ExecutionObject::~ExecutionObject() = default;
195 char* ExecutionObject::GetInputBufferPtr() const
196 {
197 return static_cast<char *>(pimpl_m->in_m.GetArg().ptr());
198 }
200 size_t ExecutionObject::GetInputBufferSizeInBytes() const
201 {
202 return pimpl_m->in_size_m;
203 }
205 char* ExecutionObject::GetOutputBufferPtr() const
206 {
207 return static_cast<char *>(pimpl_m->out_m.GetArg().ptr());
208 }
210 size_t ExecutionObject::GetOutputBufferSizeInBytes() const
211 {
212 return pimpl_m->out_size_m;
213 }
215 void ExecutionObject::SetFrameIndex(int idx)
216 {
217 pimpl_m->current_frame_idx_m = idx;
218 }
220 int ExecutionObject::GetFrameIndex() const
221 {
222 return pimpl_m->current_frame_idx_m;
223 }
225 void ExecutionObject::SetInputOutputBuffer(const ArgInfo& in, const ArgInfo& out)
226 {
227 assert(in.ptr() != nullptr && in.size() >= pimpl_m->in_size_m);
228 assert(out.ptr() != nullptr && out.size() >= pimpl_m->out_size_m);
230 pimpl_m->in_m = IODeviceArgInfo(in);
231 pimpl_m->out_m = IODeviceArgInfo(out);
232 }
234 void ExecutionObject::SetInputOutputBuffer(const IODeviceArgInfo* in,
235 const IODeviceArgInfo* out)
236 {
237 pimpl_m->in_m = *in;
238 pimpl_m->out_m = *out;
239 }
241 bool ExecutionObject::ProcessFrameStartAsync()
242 {
243 assert(GetInputBufferPtr() != nullptr && GetOutputBufferPtr() != nullptr);
244 return pimpl_m->RunAsync(ExecutionObject::CallType::PROCESS);
245 }
247 bool ExecutionObject::ProcessFrameWait()
248 {
249 return pimpl_m->Wait(ExecutionObject::CallType::PROCESS);
250 }
252 bool ExecutionObject::RunAsync (CallType ct)
253 {
254 return pimpl_m->RunAsync(ct);
255 }
257 bool ExecutionObject::Wait (CallType ct)
258 {
259 return pimpl_m->Wait(ct);
260 }
262 bool ExecutionObject::AddCallback(CallType ct, void *user_data)
263 {
264 return pimpl_m->AddCallback(ct, user_data);
265 }
267 float ExecutionObject::GetProcessTimeInMilliSeconds() const
268 {
269 float frequency = pimpl_m->device_m->GetFrequencyInMhz() * 1000000;
270 return ((float)pimpl_m->GetProcessCycles()) / frequency * 1000;
271 }
273 float ExecutionObject::GetHostProcessTimeInMilliSeconds() const
274 {
275 return pimpl_m->host_time_m;
276 }
278 void
279 ExecutionObject::WriteLayerOutputsToFile(const std::string& filename_prefix) const
280 {
281 pimpl_m->WriteLayerOutputsToFile(filename_prefix);
282 }
284 const LayerOutput* ExecutionObject::GetOutputFromLayer(
285 uint32_t layer_index, uint32_t output_index) const
286 {
287 return pimpl_m->GetOutputFromLayer(layer_index, output_index);
288 }
290 const LayerOutputs* ExecutionObject::GetOutputsFromAllLayers() const
291 {
292 return pimpl_m->GetOutputsFromAllLayers();
293 }
295 int ExecutionObject::GetLayersGroupId() const
296 {
297 return pimpl_m->layers_group_id_m;
298 }
300 const std::string& ExecutionObject::GetDeviceName() const
301 {
302 return pimpl_m->device_name_m;
303 }
305 void ExecutionObject::AcquireLock()
306 {
307 pimpl_m->AcquireLock();
308 }
310 void ExecutionObject::ReleaseLock()
311 {
312 pimpl_m->ReleaseLock();
313 }
315 //
316 // Create a kernel to call the "initialize" function
317 //
318 void
319 ExecutionObject::Impl::SetupInitializeKernel(const DeviceArgInfo& create_arg,
320 const DeviceArgInfo& param_heap_arg,
321 size_t extmem_heap_size,
322 bool internal_input)
323 {
324 // Allocate a heap for TI DL to use on the device
325 tidl_extmem_heap_m.reset(malloc_ddr<char>(extmem_heap_size));
327 // Create a kernel for cleanup
328 KernelArgs cleanup_args;
329 k_cleanup_m.reset(new Kernel(device_m,
330 STRING(CLEANUP_KERNEL),
331 cleanup_args, device_index_m));
333 // Set up parameter struct for the initialize kernel
334 shared_initialize_params_m.reset(malloc_ddr<OCL_TIDL_InitializeParams>());
335 memset(shared_initialize_params_m.get(), 0,
336 sizeof(OCL_TIDL_InitializeParams));
338 shared_initialize_params_m->tidlHeapSize = extmem_heap_size;
339 shared_initialize_params_m->l2HeapSize = tidl::internal::DMEM1_SIZE;
340 shared_initialize_params_m->l1HeapSize = tidl::internal::DMEM0_SIZE;
341 shared_initialize_params_m->enableTrace = OCL_TIDL_TRACE_OFF;
342 shared_initialize_params_m->enableInternalInput = internal_input ? 1 : 0;
344 // Setup kernel arguments for initialize
345 KernelArgs args = { create_arg,
346 param_heap_arg,
347 DeviceArgInfo(tidl_extmem_heap_m.get(),
348 extmem_heap_size,
349 DeviceArgInfo::Kind::BUFFER),
350 DeviceArgInfo(shared_initialize_params_m.get(),
351 sizeof(OCL_TIDL_InitializeParams),
352 DeviceArgInfo::Kind::BUFFER),
353 device_m->type() == CL_DEVICE_TYPE_ACCELERATOR ?
354 DeviceArgInfo(nullptr, tidl::internal::DMEM1_SIZE,
355 DeviceArgInfo::Kind::LOCAL):
356 DeviceArgInfo(nullptr, 4,
357 DeviceArgInfo::Kind::LOCAL) };
359 k_initialize_m.reset(new Kernel(device_m,
360 STRING(INIT_KERNEL), args,
361 device_index_m));
362 }
364 //
365 // Allocate an OpenCL buffer for TIDL layer output buffer metadata.
366 // The device will populate metadata for every buffer that is used as an
367 // output buffer by a layer. This needs to be done before setting up
368 // process kernel.
369 //
370 void ExecutionObject::Impl::EnableOutputBufferTrace()
371 {
372 trace_buf_params_sz_m = (sizeof(OCL_TIDL_BufParams)*
373 num_network_layers_m*
374 TIDL_NUM_OUT_BUFS);
376 trace_buf_params_m.reset(malloc_ddr<OCL_TIDL_BufParams>
377 (trace_buf_params_sz_m));
379 // Device will update bufferId if there is valid data for the entry
380 OCL_TIDL_BufParams* bufferParams = trace_buf_params_m.get();
381 for (uint32_t i = 0; i < num_network_layers_m; i++)
382 for (int j = 0; j < TIDL_NUM_OUT_BUFS; j++)
383 {
384 OCL_TIDL_BufParams *bufP =
385 &bufferParams[i*TIDL_NUM_OUT_BUFS+j];
386 bufP->bufferId = UINT_MAX;
387 }
388 }
390 //
391 // Create a kernel to call the "process" function
392 //
393 void
394 ExecutionObject::Impl::SetupProcessKernel()
395 {
396 shared_process_params_m.reset(malloc_ddr<OCL_TIDL_ProcessParams>());
397 shared_process_params_m->enableTrace = OCL_TIDL_TRACE_OFF;
398 shared_process_params_m->enableInternalInput =
399 shared_initialize_params_m->enableInternalInput;
400 shared_process_params_m->cycles = 0;
402 KernelArgs args = { DeviceArgInfo(shared_process_params_m.get(),
403 sizeof(OCL_TIDL_ProcessParams),
404 DeviceArgInfo::Kind::BUFFER),
405 DeviceArgInfo(tidl_extmem_heap_m.get(),
406 shared_initialize_params_m->tidlHeapSize,
407 DeviceArgInfo::Kind::BUFFER),
408 DeviceArgInfo(trace_buf_params_m.get(),
409 trace_buf_params_sz_m,
410 DeviceArgInfo::Kind::BUFFER)
412 };
414 k_process_m.reset(new Kernel(device_m,
415 STRING(PROCESS_KERNEL), args,
416 device_index_m));
417 }
// Copies roi * n tightly packed planes (width x height bytes each) from
// readPtr into the strided destination ptr.
//   pitch    - distance in bytes between consecutive rows in ptr
//   chOffset - distance in bytes between consecutive planes in ptr
// Returns the number of bytes consumed from readPtr, or 0 when readPtr is
// null (nothing is copied in that case).
static size_t readDataS8(const char *readPtr, char *ptr, int roi, int n,
                         int width, int height, int pitch,
                         int chOffset)
{
    if (readPtr == nullptr)
        return 0;

    const int planeSize = width * height;

    for (int r = 0; r < roi; r++)
    {
        for (int ch = 0; ch < n; ch++)
        {
            // Planes are numbered consecutively across ROIs
            const int plane   = r * n + ch;
            const int srcBase = plane * planeSize;
            const int dstBase = plane * chOffset;

            for (int row = 0; row < height; row++)
            {
                memcpy(&ptr[dstBase + row * pitch],
                       &readPtr[srcBase + row * width],
                       width);
            }
        }
    }

    return planeSize * n * roi;
}
// Copies n planes (width x height bytes each) from the strided source ptr
// into the tightly packed destination writePtr.
//   pitch    - distance in bytes between consecutive rows in ptr
//   chOffset - distance in bytes between consecutive planes in ptr
// Returns the number of bytes stored in writePtr, or 0 when writePtr is null
// (nothing is copied in that case).
static size_t writeDataS8(char *writePtr, const char *ptr, int n, int width,
                          int height, int pitch, int chOffset)
{
    if (writePtr == nullptr)
        return 0;

    const int planeSize = width * height;

    for (int ch = 0; ch < n; ch++)
    {
        const int dstBase = ch * planeSize;
        const int srcBase = ch * chOffset;

        for (int row = 0; row < height; row++)
        {
            memcpy(&writePtr[dstBase + row * width],
                   &ptr[srcBase + row * pitch],
                   width);
        }
    }

    return planeSize * n;
}
450 //
451 // Copy from host buffer to TIDL device buffer
452 //
453 void ExecutionObject::Impl::HostWriteNetInput()
454 {
455 const char* readPtr = (const char *) in_m.GetArg().ptr();
456 const PipeInfo& pipe = in_m.GetPipe();
458 for (unsigned int i = 0; i < shared_initialize_params_m->numInBufs; i++)
459 {
460 OCL_TIDL_BufParams *inBuf = &shared_initialize_params_m->inBufs[i];
462 if (shared_process_params_m->enableInternalInput == 0)
463 {
464 readPtr += readDataS8(
465 readPtr,
466 (char *) tidl_extmem_heap_m.get() + inBuf->bufPlaneBufOffset
467 + inBuf->bufPlaneWidth * OCL_TIDL_MAX_PAD_SIZE
468 + OCL_TIDL_MAX_PAD_SIZE,
469 inBuf->numROIs,
470 inBuf->numChannels,
471 inBuf->ROIWidth,
472 inBuf->ROIHeight,
473 inBuf->bufPlaneWidth,
474 ((inBuf->bufPlaneWidth * inBuf->bufPlaneHeight) /
475 inBuf->numChannels));
476 }
477 else
478 {
479 shared_process_params_m->inBufAddr[i] = pipe.bufAddr_m[i];
480 }
482 shared_process_params_m->inDataQ[i] = pipe.dataQ_m[i];
483 }
484 }
486 //
487 // Copy from TIDL device buffer into host buffer
488 //
489 void ExecutionObject::Impl::HostReadNetOutput()
490 {
491 char* writePtr = (char *) out_m.GetArg().ptr();
492 PipeInfo& pipe = out_m.GetPipe();
494 for (unsigned int i = 0; i < shared_initialize_params_m->numOutBufs; i++)
495 {
496 OCL_TIDL_BufParams *outBuf = &shared_initialize_params_m->outBufs[i];
497 if (writePtr != nullptr)
498 {
499 writePtr += writeDataS8(
500 writePtr,
501 (char *) tidl_extmem_heap_m.get() + outBuf->bufPlaneBufOffset
502 + outBuf->bufPlaneWidth * OCL_TIDL_MAX_PAD_SIZE
503 + OCL_TIDL_MAX_PAD_SIZE,
504 outBuf->numChannels,
505 outBuf->ROIWidth,
506 outBuf->ROIHeight,
507 outBuf->bufPlaneWidth,
508 ((outBuf->bufPlaneWidth * outBuf->bufPlaneHeight)/
509 outBuf->numChannels));
510 }
512 pipe.dataQ_m[i] = shared_process_params_m->outDataQ[i];
513 pipe.bufAddr_m[i] = shared_initialize_params_m->bufAddrBase
514 + outBuf->bufPlaneBufOffset;
515 }
516 shared_process_params_m->bytesWritten = writePtr -
517 (char *) out_m.GetArg().ptr();
518 }
520 void ExecutionObject::Impl::ComputeInputOutputSizes()
521 {
522 if (shared_initialize_params_m->errorCode != OCL_TIDL_SUCCESS) return;
524 if (shared_initialize_params_m->numInBufs > OCL_TIDL_MAX_IN_BUFS ||
525 shared_initialize_params_m->numOutBufs > OCL_TIDL_MAX_OUT_BUFS)
526 {
527 std::cout << "Num input/output bufs ("
528 << shared_initialize_params_m->numInBufs << ", "
529 << shared_initialize_params_m->numOutBufs
530 << ") exceeded limit!" << std::endl;
531 shared_initialize_params_m->errorCode = OCL_TIDL_INIT_FAIL;
532 return;
533 }
535 in_size_m = 0;
536 out_size_m = 0;
537 for (unsigned int i = 0; i < shared_initialize_params_m->numInBufs; i++)
538 {
539 OCL_TIDL_BufParams *inBuf = &shared_initialize_params_m->inBufs[i];
540 in_size_m += inBuf->numROIs * inBuf->numChannels * inBuf->ROIWidth *
541 inBuf->ROIHeight;
542 }
543 for (unsigned int i = 0; i < shared_initialize_params_m->numOutBufs; i++)
544 {
545 OCL_TIDL_BufParams *outBuf = &shared_initialize_params_m->outBufs[i];
546 out_size_m += outBuf->numChannels * outBuf->ROIWidth *outBuf->ROIHeight;
547 }
548 }
551 bool ExecutionObject::Impl::RunAsync(CallType ct)
552 {
553 switch (ct)
554 {
555 case CallType::INIT:
556 {
557 k_initialize_m->RunAsync();
558 break;
559 }
560 case CallType::PROCESS:
561 {
562 std::chrono::time_point<std::chrono::steady_clock> t1, t2;
563 t1 = std::chrono::steady_clock::now();
565 shared_process_params_m->frameIdx = current_frame_idx_m;
566 shared_process_params_m->bytesWritten = 0;
567 HostWriteNetInput();
568 k_process_m->RunAsync();
570 t2 = std::chrono::steady_clock::now();
571 std::chrono::duration<float> elapsed = t2 - t1;
572 host_time_m = elapsed.count() * 1000;
573 break;
574 }
575 case CallType::CLEANUP:
576 {
577 k_cleanup_m->RunAsync();
578 break;
579 }
580 default:
581 return false;
582 }
584 return true;
585 }
587 bool ExecutionObject::Impl::Wait(CallType ct)
588 {
589 switch (ct)
590 {
591 case CallType::INIT:
592 {
593 bool has_work = k_initialize_m->Wait();
595 if (has_work)
596 {
597 ComputeInputOutputSizes();
598 if (shared_initialize_params_m->errorCode != OCL_TIDL_SUCCESS)
599 throw Exception(shared_initialize_params_m->errorCode,
600 __FILE__, __FUNCTION__, __LINE__);
601 }
602 return has_work;
603 }
604 case CallType::PROCESS:
605 {
606 float host_elapsed_ms = 0.0f;
607 bool has_work = k_process_m->Wait(&host_elapsed_ms);
608 if (has_work)
609 {
610 if (shared_process_params_m->errorCode != OCL_TIDL_SUCCESS)
611 throw Exception(shared_process_params_m->errorCode,
612 __FILE__, __FUNCTION__, __LINE__);
614 std::chrono::time_point<std::chrono::steady_clock> t1, t2;
615 t1 = std::chrono::steady_clock::now();
616 HostReadNetOutput();
617 t2 = std::chrono::steady_clock::now();
618 std::chrono::duration<float> elapsed = t2 - t1;
619 host_time_m += elapsed.count() * 1000 + host_elapsed_ms;
620 }
622 return has_work;
623 }
624 case CallType::CLEANUP:
625 {
626 return k_cleanup_m->Wait();
627 break;
628 }
629 default:
630 return false;
631 }
633 return false;
634 }
636 bool ExecutionObject::Impl::AddCallback(CallType ct, void *user_data)
637 {
638 switch (ct)
639 {
640 case CallType::PROCESS:
641 {
642 return k_process_m->AddCallback(user_data);
643 break;
644 }
645 default:
646 return false;
647 }
649 return false;
650 }
652 uint64_t ExecutionObject::Impl::GetProcessCycles() const
653 {
654 uint8_t factor = 1;
656 // ARP32 running at half frequency of VCOP, multiply by 2 for VCOP cycles
657 if (device_m->type() == CL_DEVICE_TYPE_CUSTOM)
658 factor = 2;
660 return shared_process_params_m.get()->cycles * factor;
661 }
663 //
664 // Write the trace data to output files
665 //
666 void
667 ExecutionObject::Impl::WriteLayerOutputsToFile(const std::string& filename_prefix) const
668 {
669 if (trace_buf_params_sz_m == 0)
670 return;
672 OCL_TIDL_BufParams* bufferParams = trace_buf_params_m.get();
674 for (uint32_t i = 0; i < num_network_layers_m; i++)
675 for (int j = 0; j < TIDL_NUM_OUT_BUFS; j++)
676 {
677 OCL_TIDL_BufParams* buf = &bufferParams[i*TIDL_NUM_OUT_BUFS+j];
679 if (buf->bufferId == UINT_MAX)
680 continue;
682 size_t buffer_size = buf->numChannels * buf->ROIHeight *
683 buf->ROIWidth;
685 char *tmp = new char[buffer_size];
687 if (tmp == nullptr)
688 throw Exception("Out of memory, new failed",
689 __FILE__, __FUNCTION__, __LINE__);
691 writeDataS8(
692 tmp,
693 (char *) tidl_extmem_heap_m.get() + buf->bufPlaneBufOffset
694 + buf->bufPlaneWidth * OCL_TIDL_MAX_PAD_SIZE
695 + OCL_TIDL_MAX_PAD_SIZE,
696 buf->numChannels,
697 buf->ROIWidth,
698 buf->ROIHeight,
699 buf->bufPlaneWidth,
700 ((buf->bufPlaneWidth * buf->bufPlaneHeight)/
701 buf->numChannels));
703 std::string filename(filename_prefix);
704 filename += std::to_string(buf->bufferId) + "_";
705 filename += std::to_string(buf->ROIWidth) + "x";
706 filename += std::to_string(buf->ROIHeight) + ".bin";
708 std::ofstream ofs;
709 ofs.open(filename, std::ofstream::out);
710 ofs.write(tmp, buffer_size);
711 ofs.close();
713 delete[] tmp;
714 }
715 }
718 const LayerOutput* ExecutionObject::Impl::GetOutputFromLayer(
719 uint32_t layer_index, uint32_t output_index) const
720 {
721 if (trace_buf_params_sz_m == 0)
722 return nullptr;
724 if (layer_index > num_network_layers_m || output_index > TIDL_NUM_OUT_BUFS)
725 return nullptr;
727 OCL_TIDL_BufParams* bufferParams = trace_buf_params_m.get();
728 OCL_TIDL_BufParams* buf = &bufferParams[layer_index*TIDL_NUM_OUT_BUFS+
729 output_index];
731 if (buf->bufferId == UINT_MAX)
732 return nullptr;
734 size_t buffer_size = buf->numChannels * buf->ROIHeight *
735 buf->ROIWidth;
737 char *data = new char[buffer_size];
739 if (data == nullptr)
740 throw Exception("Out of memory, new failed",
741 __FILE__, __FUNCTION__, __LINE__);
743 writeDataS8(data,
744 (char *) tidl_extmem_heap_m.get() + buf->bufPlaneBufOffset
745 + buf->bufPlaneWidth * OCL_TIDL_MAX_PAD_SIZE
746 + OCL_TIDL_MAX_PAD_SIZE,
747 buf->numChannels,
748 buf->ROIWidth,
749 buf->ROIHeight,
750 buf->bufPlaneWidth,
751 ((buf->bufPlaneWidth * buf->bufPlaneHeight)/
752 buf->numChannels));
754 return new LayerOutput(layer_index, output_index, buf->bufferId,
755 buf->numROIs, buf->numChannels, buf->ROIHeight,
756 buf->ROIWidth, data);
757 }
759 const LayerOutputs* ExecutionObject::Impl::GetOutputsFromAllLayers() const
760 {
761 LayerOutputs* result = new LayerOutputs;
763 for (uint32_t i=0; i < num_network_layers_m; i++)
764 for (int j=0; j < TIDL_NUM_OUT_BUFS; j++)
765 {
766 const LayerOutput* lo = GetOutputFromLayer(i, j);
767 if (lo)
768 result->push_back(std::unique_ptr<const LayerOutput>{ lo });
769 }
771 return result;
772 }
774 LayerOutput::LayerOutput(int layer_index, int output_index, int buffer_id,
775 int num_roi, int num_channels, size_t height,
776 size_t width, const char* data):
777 layer_index_m(layer_index), buffer_id_m(buffer_id),
778 num_roi_m(num_roi), num_channels_m(num_channels),
779 height_m(height), width_m(width), data_m(data)
780 { }
782 LayerOutput::~LayerOutput()
783 {
784 delete[] data_m;
785 }
787 void ExecutionObject::Impl::AcquireLock()
788 {
789 std::unique_lock<std::mutex> lock(mutex_access_m);
790 cv_access_m.wait(lock, [this]{ return this->is_idle_m; });
791 is_idle_m = false;
792 }
794 void ExecutionObject::Impl::ReleaseLock()
795 {
796 is_idle_m = true;
797 cv_access_m.notify_all();
798 }