ExecutionObjectPipeline for executing layersGroups
[tidl/tidl-api.git] / tidl_api / src / execution_object.cpp
1 /******************************************************************************
2  * Copyright (c) 2017-2018 Texas Instruments Incorporated - http://www.ti.com/
3  *  All rights reserved.
4  *
5  *  Redistribution and use in source and binary forms, with or without
6  *  modification, are permitted provided that the following conditions are met:
7  *      * Redistributions of source code must retain the above copyright
8  *        notice, this list of conditions and the following disclaimer.
9  *      * Redistributions in binary form must reproduce the above copyright
10  *        notice, this list of conditions and the following disclaimer in the
11  *        documentation and/or other materials provided with the distribution.
12  *      * Neither the name of Texas Instruments Incorporated nor the
13  *        names of its contributors may be used to endorse or promote products
14  *        derived from this software without specific prior written permission.
15  *
16  *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17  *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  *  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20  *  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21  *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22  *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23  *  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24  *  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25  *  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
26  *  THE POSSIBILITY OF SUCH DAMAGE.
27  *****************************************************************************/
29 /*! \file execution_object.cpp */
31 #include <string.h>
32 #include <fstream>
33 #include <climits>
34 #include <mutex>
35 #include <condition_variable>
36 #include <chrono>
37 #include "executor.h"
38 #include "execution_object.h"
39 #include "trace.h"
40 #include "ocl_device.h"
41 #include "parameters.h"
42 #include "configuration.h"
43 #include "common_defines.h"
44 #include "tidl_create_params.h"
45 #include "device_arginfo.h"
47 using namespace tidl;
49 class ExecutionObject::Impl
50 {
51     public:
52         Impl(Device* d, uint8_t device_index,
53              const DeviceArgInfo& create_arg,
54              const DeviceArgInfo& param_heap_arg,
55              size_t extmem_heap_size,
56              int    layers_group_id,
57              bool   output_trace,
58              bool   internal_input);
59         ~Impl() {}
61         bool RunAsync(CallType ct);
62         bool Wait    (CallType ct);
63         bool AddCallback(CallType ct, void *user_data);
65         uint64_t GetProcessCycles() const;
66         int  GetLayersGroupId() const;
67         void AcquireLock();
68         void ReleaseLock();
70         Device*                         device_m;
71         // Index of the OpenCL device/queue used by this EO
72         uint8_t                         device_index_m;
73         std::string                     device_name_m;
75         up_malloc_ddr<char>             tidl_extmem_heap_m;
76         up_malloc_ddr<OCL_TIDL_InitializeParams> shared_initialize_params_m;
77         up_malloc_ddr<OCL_TIDL_ProcessParams>    shared_process_params_m;
79         size_t                          in_size_m;
80         size_t                          out_size_m;
81         IODeviceArgInfo                 in_m;
82         IODeviceArgInfo                 out_m;
84         // Frame being processed by the EO
85         int                             current_frame_idx_m;
87         // LayersGroupId being processed by the EO
88         int                             layers_group_id_m;
90         // Trace related
91         void WriteLayerOutputsToFile (const std::string& filename_prefix) const;
93         const LayerOutput* GetOutputFromLayer (uint32_t layer_index,
94                                                uint32_t output_index) const;
95         const LayerOutputs* GetOutputsFromAllLayers() const;
97         uint32_t                          num_network_layers_m;
98         up_malloc_ddr<OCL_TIDL_BufParams> trace_buf_params_m;
99         size_t                            trace_buf_params_sz_m;
101         // host time tracking: eo start to finish
102         float host_time_m;
104     private:
105         void SetupInitializeKernel(const DeviceArgInfo& create_arg,
106                                    const DeviceArgInfo& param_heap_arg,
107                                    size_t extmem_heap_size,
108                                    bool   internal_input);
109         void EnableOutputBufferTrace();
110         void SetupProcessKernel();
112         void HostWriteNetInput();
113         void HostReadNetOutput();
114         void ComputeInputOutputSizes();
116         std::unique_ptr<Kernel>         k_initialize_m;
117         std::unique_ptr<Kernel>         k_process_m;
118         std::unique_ptr<Kernel>         k_cleanup_m;
120         // Guarding sole access to input/output for one frame during execution
121         bool                            is_idle_m;
122         std::mutex                      mutex_access_m;
123         std::condition_variable         cv_access_m;
124 };
127 ExecutionObject::ExecutionObject(Device* d,
128                                  uint8_t device_index,
129                                  const ArgInfo& create_arg,
130                                  const ArgInfo& param_heap_arg,
131                                  size_t extmem_heap_size,
132                                  int    layers_group_id,
133                                  bool   output_trace,
134                                  bool   internal_input)
136     DeviceArgInfo create_arg_d(create_arg, DeviceArgInfo::Kind::BUFFER);
137     DeviceArgInfo param_heap_arg_d(param_heap_arg, DeviceArgInfo::Kind::BUFFER);
139     pimpl_m = std::unique_ptr<ExecutionObject::Impl>
140               { new ExecutionObject::Impl(d, device_index,
141                                           create_arg_d,
142                                           param_heap_arg_d,
143                                           extmem_heap_size,
144                                           layers_group_id,
145                                           output_trace,
146                                           internal_input) };
150 ExecutionObject::Impl::Impl(Device* d,
151                                  uint8_t device_index,
152                                  const DeviceArgInfo& create_arg,
153                                  const DeviceArgInfo& param_heap_arg,
154                                  size_t extmem_heap_size,
155                                  int    layers_group_id,
156                                  bool   output_trace,
157                                  bool   internal_input):
158     device_m(d),
159     device_index_m(device_index),
160     tidl_extmem_heap_m (nullptr, &__free_ddr),
161     shared_initialize_params_m(nullptr, &__free_ddr),
162     shared_process_params_m(nullptr, &__free_ddr),
163     in_size_m(0),
164     out_size_m(0),
165     in_m(),
166     out_m(),
167     current_frame_idx_m(0),
168     layers_group_id_m(layers_group_id),
169     num_network_layers_m(0),
170     trace_buf_params_m(nullptr, &__free_ddr),
171     trace_buf_params_sz_m(0),
172     k_initialize_m(nullptr),
173     k_process_m(nullptr),
174     k_cleanup_m(nullptr),
175     is_idle_m(true)
177     device_name_m = device_m->GetDeviceName() + std::to_string(device_index_m);
178     // Save number of layers in the network
179     const TIDL_CreateParams* cp =
180                 static_cast<const TIDL_CreateParams *>(create_arg.ptr());
181     num_network_layers_m = cp->net.numLayers;
183     SetupInitializeKernel(create_arg, param_heap_arg, extmem_heap_size,
184                           internal_input);
186     if (output_trace)  EnableOutputBufferTrace();
187     SetupProcessKernel();
190 // Pointer to implementation idiom: https://herbsutter.com/gotw/_100/:
191 // Both unique_ptr and shared_ptr can be instantiated with an incomplete type
192 // unique_ptr's destructor requires a complete type in order to invoke delete
193 ExecutionObject::~ExecutionObject() = default;
195 char* ExecutionObject::GetInputBufferPtr() const
197     return static_cast<char *>(pimpl_m->in_m.GetArg().ptr());
200 size_t ExecutionObject::GetInputBufferSizeInBytes() const
202     return pimpl_m->in_size_m;
205 char* ExecutionObject::GetOutputBufferPtr() const
207     return static_cast<char *>(pimpl_m->out_m.GetArg().ptr());
210 size_t ExecutionObject::GetOutputBufferSizeInBytes() const
212     return pimpl_m->out_size_m;
215 void  ExecutionObject::SetFrameIndex(int idx)
217     pimpl_m->current_frame_idx_m = idx;
220 int ExecutionObject::GetFrameIndex() const
222     return pimpl_m->current_frame_idx_m;
225 void ExecutionObject::SetInputOutputBuffer(const ArgInfo& in, const ArgInfo& out)
227     assert(in.ptr()  != nullptr && in.size()  >= pimpl_m->in_size_m);
228     assert(out.ptr() != nullptr && out.size() >= pimpl_m->out_size_m);
230     pimpl_m->in_m  = IODeviceArgInfo(in);
231     pimpl_m->out_m = IODeviceArgInfo(out);
234 void ExecutionObject::SetInputOutputBuffer(const IODeviceArgInfo* in,
235                                            const IODeviceArgInfo* out)
237     pimpl_m->in_m  = *in;
238     pimpl_m->out_m = *out;
241 bool ExecutionObject::ProcessFrameStartAsync()
243     assert(GetInputBufferPtr() != nullptr && GetOutputBufferPtr() != nullptr);
244     return pimpl_m->RunAsync(ExecutionObject::CallType::PROCESS);
247 bool ExecutionObject::ProcessFrameWait()
249     return pimpl_m->Wait(ExecutionObject::CallType::PROCESS);
252 bool ExecutionObject::RunAsync (CallType ct)
254     return pimpl_m->RunAsync(ct);
257 bool ExecutionObject::Wait (CallType ct)
259     return pimpl_m->Wait(ct);
262 bool ExecutionObject::AddCallback(CallType ct, void *user_data)
264     return pimpl_m->AddCallback(ct, user_data);
267 float ExecutionObject::GetProcessTimeInMilliSeconds() const
269     float frequency = pimpl_m->device_m->GetFrequencyInMhz() * 1000000;
270     return ((float)pimpl_m->GetProcessCycles()) / frequency * 1000;
273 float ExecutionObject::GetHostProcessTimeInMilliSeconds() const
275     return pimpl_m->host_time_m;
278 void
279 ExecutionObject::WriteLayerOutputsToFile(const std::string& filename_prefix) const
281     pimpl_m->WriteLayerOutputsToFile(filename_prefix);
284 const LayerOutput* ExecutionObject::GetOutputFromLayer(
285                          uint32_t layer_index, uint32_t output_index) const
287     return pimpl_m->GetOutputFromLayer(layer_index, output_index);
290 const LayerOutputs* ExecutionObject::GetOutputsFromAllLayers() const
292     return pimpl_m->GetOutputsFromAllLayers();
295 int ExecutionObject::GetLayersGroupId() const
297     return pimpl_m->layers_group_id_m;
300 const std::string& ExecutionObject::GetDeviceName() const
302     return pimpl_m->device_name_m;
305 void ExecutionObject::AcquireLock()
307     pimpl_m->AcquireLock();
310 void ExecutionObject::ReleaseLock()
312     pimpl_m->ReleaseLock();
315 //
316 // Create a kernel to call the "initialize" function
317 //
318 void
319 ExecutionObject::Impl::SetupInitializeKernel(const DeviceArgInfo& create_arg,
320                                              const DeviceArgInfo& param_heap_arg,
321                                              size_t extmem_heap_size,
322                                              bool   internal_input)
324     // Allocate a heap for TI DL to use on the device
325     tidl_extmem_heap_m.reset(malloc_ddr<char>(extmem_heap_size));
327     // Create a kernel for cleanup
328     KernelArgs cleanup_args;
329     k_cleanup_m.reset(new Kernel(device_m,
330                                  STRING(CLEANUP_KERNEL),
331                                  cleanup_args, device_index_m));
333     // Set up parameter struct for the initialize kernel
334     shared_initialize_params_m.reset(malloc_ddr<OCL_TIDL_InitializeParams>());
335     memset(shared_initialize_params_m.get(), 0,
336            sizeof(OCL_TIDL_InitializeParams));
338     shared_initialize_params_m->tidlHeapSize = extmem_heap_size;
339     shared_initialize_params_m->l2HeapSize   = tidl::internal::DMEM1_SIZE;
340     shared_initialize_params_m->l1HeapSize   = tidl::internal::DMEM0_SIZE;
341     shared_initialize_params_m->enableTrace  = OCL_TIDL_TRACE_OFF;
342     shared_initialize_params_m->enableInternalInput = internal_input ? 1 : 0;
344     // Setup kernel arguments for initialize
345     KernelArgs args = { create_arg,
346                         param_heap_arg,
347                         DeviceArgInfo(tidl_extmem_heap_m.get(),
348                                       extmem_heap_size,
349                                       DeviceArgInfo::Kind::BUFFER),
350                         DeviceArgInfo(shared_initialize_params_m.get(),
351                                       sizeof(OCL_TIDL_InitializeParams),
352                                       DeviceArgInfo::Kind::BUFFER),
353                         device_m->type() == CL_DEVICE_TYPE_ACCELERATOR ?
354                             DeviceArgInfo(nullptr, tidl::internal::DMEM1_SIZE,
355                                           DeviceArgInfo::Kind::LOCAL):
356                             DeviceArgInfo(nullptr, 4,
357                                           DeviceArgInfo::Kind::LOCAL) };
359     k_initialize_m.reset(new Kernel(device_m,
360                                     STRING(INIT_KERNEL), args,
361                                     device_index_m));
364 //
365 // Allocate an OpenCL buffer for TIDL layer output buffer metadata.
366 // The device will populate metadata for every buffer that is used as an
367 // output buffer by a layer.  This needs to be done before setting up
368 // process kernel.
369 //
370 void ExecutionObject::Impl::EnableOutputBufferTrace()
372     trace_buf_params_sz_m = (sizeof(OCL_TIDL_BufParams)*
373                              num_network_layers_m*
374                              TIDL_NUM_OUT_BUFS);
376     trace_buf_params_m.reset(malloc_ddr<OCL_TIDL_BufParams>
377                              (trace_buf_params_sz_m));
379     // Device will update bufferId if there is valid data for the entry
380     OCL_TIDL_BufParams* bufferParams = trace_buf_params_m.get();
381     for (uint32_t i = 0; i < num_network_layers_m; i++)
382         for (int j = 0; j < TIDL_NUM_OUT_BUFS; j++)
383         {
384             OCL_TIDL_BufParams *bufP =
385                                 &bufferParams[i*TIDL_NUM_OUT_BUFS+j];
386             bufP->bufferId = UINT_MAX;
387         }
390 //
391 // Create a kernel to call the "process" function
392 //
393 void
394 ExecutionObject::Impl::SetupProcessKernel()
396     shared_process_params_m.reset(malloc_ddr<OCL_TIDL_ProcessParams>());
397     shared_process_params_m->enableTrace = OCL_TIDL_TRACE_OFF;
398     shared_process_params_m->enableInternalInput =
399                                shared_initialize_params_m->enableInternalInput;
400     shared_process_params_m->cycles = 0;
402     KernelArgs args = { DeviceArgInfo(shared_process_params_m.get(),
403                                       sizeof(OCL_TIDL_ProcessParams),
404                                       DeviceArgInfo::Kind::BUFFER),
405                         DeviceArgInfo(tidl_extmem_heap_m.get(),
406                                       shared_initialize_params_m->tidlHeapSize,
407                                       DeviceArgInfo::Kind::BUFFER),
408                         DeviceArgInfo(trace_buf_params_m.get(),
409                                       trace_buf_params_sz_m,
410                                       DeviceArgInfo::Kind::BUFFER)
412                       };
414     k_process_m.reset(new Kernel(device_m,
415                                  STRING(PROCESS_KERNEL), args,
416                                  device_index_m));
//
// Copy densely packed 8-bit data (roi x n x height x width) from readPtr
// into a strided destination.  In the destination, consecutive rows are
// pitch bytes apart and consecutive channels are chOffset bytes apart.
// Returns the number of bytes consumed from readPtr (width*height*n*roi),
// or 0 without copying when readPtr is null.
//
static size_t readDataS8(const char *readPtr, char *ptr, int roi, int n,
                         int width, int height, int pitch,
                         int chOffset)
{
    if (readPtr == nullptr)
        return 0;

    for (int r = 0; r < roi; r++)
    {
        for (int ch = 0; ch < n; ch++)
        {
            const int srcPlane = r*n*width*height + ch*width*height;
            const int dstPlane = r*n*chOffset + ch*chOffset;

            for (int row = 0; row < height; row++)
            {
                char       *dst = &ptr[dstPlane + row*pitch];
                const char *src = &readPtr[srcPlane + row*width];
                memcpy(dst, src, width);
            }
        }
    }

    return width*height*n*roi;
}
//
// Copy 8-bit data (n x height x width) from a strided source into the
// densely packed buffer writePtr.  In the source, consecutive rows are
// pitch bytes apart and consecutive channels are chOffset bytes apart.
// Returns the number of bytes written (width*height*n), or 0 without
// copying when writePtr is null.
//
static size_t writeDataS8(char *writePtr, const char *ptr, int n, int width,
                          int height, int pitch, int chOffset)
{
    if (writePtr == nullptr)
        return 0;

    for (int ch = 0; ch < n; ch++)
    {
        const int dstPlane = ch*width*height;
        const int srcPlane = ch*chOffset;

        for (int row = 0; row < height; row++)
        {
            char       *dst = &writePtr[dstPlane + row*width];
            const char *src = &ptr[srcPlane + row*pitch];
            memcpy(dst, src, width);
        }
    }

    return width*height*n;
}
450 //
451 // Copy from host buffer to TIDL device buffer
452 //
453 void ExecutionObject::Impl::HostWriteNetInput()
455     const char*     readPtr  = (const char *) in_m.GetArg().ptr();
456     const PipeInfo& pipe     = in_m.GetPipe();
458     for (unsigned int i = 0; i < shared_initialize_params_m->numInBufs; i++)
459     {
460         OCL_TIDL_BufParams *inBuf = &shared_initialize_params_m->inBufs[i];
462         if (shared_process_params_m->enableInternalInput == 0)
463         {
464             readPtr += readDataS8(
465                 readPtr,
466                 (char *) tidl_extmem_heap_m.get() + inBuf->bufPlaneBufOffset
467                     + inBuf->bufPlaneWidth * OCL_TIDL_MAX_PAD_SIZE
468                     + OCL_TIDL_MAX_PAD_SIZE,
469                 inBuf->numROIs,
470                 inBuf->numChannels,
471                 inBuf->ROIWidth,
472                 inBuf->ROIHeight,
473                 inBuf->bufPlaneWidth,
474                 ((inBuf->bufPlaneWidth * inBuf->bufPlaneHeight) /
475                  inBuf->numChannels));
476         }
477         else
478         {
479             shared_process_params_m->inBufAddr[i] = pipe.bufAddr_m[i];
480         }
482         shared_process_params_m->inDataQ[i]   = pipe.dataQ_m[i];
483     }
486 //
487 // Copy from TIDL device buffer into host buffer
488 //
489 void ExecutionObject::Impl::HostReadNetOutput()
491     char* writePtr = (char *) out_m.GetArg().ptr();
492     PipeInfo& pipe = out_m.GetPipe();
494     for (unsigned int i = 0; i < shared_initialize_params_m->numOutBufs; i++)
495     {
496         OCL_TIDL_BufParams *outBuf = &shared_initialize_params_m->outBufs[i];
497         if (writePtr != nullptr)
498         {
499             writePtr += writeDataS8(
500                 writePtr,
501                 (char *) tidl_extmem_heap_m.get() + outBuf->bufPlaneBufOffset
502                     + outBuf->bufPlaneWidth * OCL_TIDL_MAX_PAD_SIZE
503                     + OCL_TIDL_MAX_PAD_SIZE,
504                 outBuf->numChannels,
505                 outBuf->ROIWidth,
506                 outBuf->ROIHeight,
507                 outBuf->bufPlaneWidth,
508                 ((outBuf->bufPlaneWidth * outBuf->bufPlaneHeight)/
509                  outBuf->numChannels));
510         }
512         pipe.dataQ_m[i]   = shared_process_params_m->outDataQ[i];
513         pipe.bufAddr_m[i] = shared_initialize_params_m->bufAddrBase
514                            + outBuf->bufPlaneBufOffset;
515     }
516     shared_process_params_m->bytesWritten = writePtr -
517                                             (char *) out_m.GetArg().ptr();
520 void ExecutionObject::Impl::ComputeInputOutputSizes()
522     if (shared_initialize_params_m->errorCode != OCL_TIDL_SUCCESS)  return;
524     if (shared_initialize_params_m->numInBufs > OCL_TIDL_MAX_IN_BUFS ||
525         shared_initialize_params_m->numOutBufs > OCL_TIDL_MAX_OUT_BUFS)
526     {
527         std::cout << "Num input/output bufs ("
528                   << shared_initialize_params_m->numInBufs << ", "
529                   << shared_initialize_params_m->numOutBufs
530                   << ") exceeded limit!" << std::endl;
531         shared_initialize_params_m->errorCode = OCL_TIDL_INIT_FAIL;
532         return;
533     }
535     in_size_m  = 0;
536     out_size_m = 0;
537     for (unsigned int i = 0; i < shared_initialize_params_m->numInBufs; i++)
538     {
539         OCL_TIDL_BufParams *inBuf = &shared_initialize_params_m->inBufs[i];
540         in_size_m += inBuf->numROIs * inBuf->numChannels * inBuf->ROIWidth *
541                      inBuf->ROIHeight;
542     }
543     for (unsigned int i = 0; i < shared_initialize_params_m->numOutBufs; i++)
544     {
545         OCL_TIDL_BufParams *outBuf = &shared_initialize_params_m->outBufs[i];
546         out_size_m += outBuf->numChannels * outBuf->ROIWidth *outBuf->ROIHeight;
547     }
551 bool ExecutionObject::Impl::RunAsync(CallType ct)
553     switch (ct)
554     {
555         case CallType::INIT:
556         {
557             k_initialize_m->RunAsync();
558             break;
559         }
560         case CallType::PROCESS:
561         {
562             std::chrono::time_point<std::chrono::steady_clock> t1, t2;
563             t1 = std::chrono::steady_clock::now();
565             shared_process_params_m->frameIdx = current_frame_idx_m;
566             shared_process_params_m->bytesWritten = 0;
567             HostWriteNetInput();
568             k_process_m->RunAsync();
570             t2 = std::chrono::steady_clock::now();
571             std::chrono::duration<float> elapsed = t2 - t1;
572             host_time_m = elapsed.count() * 1000;
573             break;
574         }
575         case CallType::CLEANUP:
576         {
577             k_cleanup_m->RunAsync();
578             break;
579         }
580         default:
581             return false;
582     }
584     return true;
587 bool ExecutionObject::Impl::Wait(CallType ct)
589     switch (ct)
590     {
591         case CallType::INIT:
592         {
593             bool has_work = k_initialize_m->Wait();
595             if (has_work)
596             {
597                 ComputeInputOutputSizes();
598                 if (shared_initialize_params_m->errorCode != OCL_TIDL_SUCCESS)
599                     throw Exception(shared_initialize_params_m->errorCode,
600                                     __FILE__, __FUNCTION__, __LINE__);
601             }
602             return has_work;
603         }
604         case CallType::PROCESS:
605         {
606             float host_elapsed_ms = 0.0f;
607             bool has_work = k_process_m->Wait(&host_elapsed_ms);
608             if (has_work)
609             {
610                 if (shared_process_params_m->errorCode != OCL_TIDL_SUCCESS)
611                     throw Exception(shared_process_params_m->errorCode,
612                                     __FILE__, __FUNCTION__, __LINE__);
614                 std::chrono::time_point<std::chrono::steady_clock> t1, t2;
615                 t1 = std::chrono::steady_clock::now();
616                 HostReadNetOutput();
617                 t2 = std::chrono::steady_clock::now();
618                 std::chrono::duration<float> elapsed = t2 - t1;
619                 host_time_m += elapsed.count() * 1000 + host_elapsed_ms;
620             }
622             return has_work;
623         }
624         case CallType::CLEANUP:
625         {
626             return k_cleanup_m->Wait();
627             break;
628         }
629         default:
630             return false;
631     }
633     return false;
636 bool ExecutionObject::Impl::AddCallback(CallType ct, void *user_data)
638     switch (ct)
639     {
640         case CallType::PROCESS:
641         {
642             return k_process_m->AddCallback(user_data);
643             break;
644         }
645         default:
646             return false;
647     }
649     return false;
652 uint64_t ExecutionObject::Impl::GetProcessCycles() const
654     uint8_t factor = 1;
656     // ARP32 running at half frequency of VCOP, multiply by 2 for VCOP cycles
657     if (device_m->type() == CL_DEVICE_TYPE_CUSTOM)
658         factor = 2;
660     return shared_process_params_m.get()->cycles * factor;
663 //
664 // Write the trace data to output files
665 //
666 void
667 ExecutionObject::Impl::WriteLayerOutputsToFile(const std::string& filename_prefix) const
669     if (trace_buf_params_sz_m == 0)
670         return;
672     OCL_TIDL_BufParams* bufferParams = trace_buf_params_m.get();
674     for (uint32_t i = 0; i < num_network_layers_m; i++)
675         for (int j = 0; j < TIDL_NUM_OUT_BUFS; j++)
676         {
677             OCL_TIDL_BufParams* buf = &bufferParams[i*TIDL_NUM_OUT_BUFS+j];
679             if (buf->bufferId == UINT_MAX)
680                 continue;
682             size_t buffer_size = buf->numChannels * buf->ROIHeight *
683                                  buf->ROIWidth;
685             char *tmp = new char[buffer_size];
687             if (tmp == nullptr)
688                 throw Exception("Out of memory, new failed",
689                         __FILE__, __FUNCTION__, __LINE__);
691             writeDataS8(
692                 tmp,
693                 (char *) tidl_extmem_heap_m.get() + buf->bufPlaneBufOffset
694                 + buf->bufPlaneWidth * OCL_TIDL_MAX_PAD_SIZE
695                 + OCL_TIDL_MAX_PAD_SIZE,
696                 buf->numChannels,
697                 buf->ROIWidth,
698                 buf->ROIHeight,
699                 buf->bufPlaneWidth,
700                 ((buf->bufPlaneWidth * buf->bufPlaneHeight)/
701                  buf->numChannels));
703             std::string filename(filename_prefix);
704             filename += std::to_string(buf->bufferId) + "_";
705             filename += std::to_string(buf->ROIWidth) + "x";
706             filename += std::to_string(buf->ROIHeight) + ".bin";
708             std::ofstream ofs;
709             ofs.open(filename, std::ofstream::out);
710             ofs.write(tmp, buffer_size);
711             ofs.close();
713             delete[] tmp;
714         }
718 const LayerOutput* ExecutionObject::Impl::GetOutputFromLayer(
719                             uint32_t layer_index, uint32_t output_index) const
721     if (trace_buf_params_sz_m == 0)
722         return nullptr;
724     if (layer_index > num_network_layers_m || output_index > TIDL_NUM_OUT_BUFS)
725         return nullptr;
727     OCL_TIDL_BufParams* bufferParams = trace_buf_params_m.get();
728     OCL_TIDL_BufParams* buf = &bufferParams[layer_index*TIDL_NUM_OUT_BUFS+
729                                             output_index];
731     if (buf->bufferId == UINT_MAX)
732         return nullptr;
734     size_t buffer_size = buf->numChannels * buf->ROIHeight *
735                          buf->ROIWidth;
737     char *data = new char[buffer_size];
739     if (data == nullptr)
740         throw Exception("Out of memory, new failed",
741                 __FILE__, __FUNCTION__, __LINE__);
743     writeDataS8(data,
744                 (char *) tidl_extmem_heap_m.get() + buf->bufPlaneBufOffset
745                 + buf->bufPlaneWidth * OCL_TIDL_MAX_PAD_SIZE
746                 + OCL_TIDL_MAX_PAD_SIZE,
747                 buf->numChannels,
748                 buf->ROIWidth,
749                 buf->ROIHeight,
750                 buf->bufPlaneWidth,
751                 ((buf->bufPlaneWidth * buf->bufPlaneHeight)/
752                  buf->numChannels));
754     return new LayerOutput(layer_index, output_index, buf->bufferId,
755                            buf->numROIs, buf->numChannels, buf->ROIHeight,
756                            buf->ROIWidth, data);
759 const LayerOutputs* ExecutionObject::Impl::GetOutputsFromAllLayers() const
761     LayerOutputs* result = new LayerOutputs;
763     for (uint32_t i=0; i < num_network_layers_m; i++)
764         for (int j=0; j < TIDL_NUM_OUT_BUFS; j++)
765         {
766             const LayerOutput* lo = GetOutputFromLayer(i, j);
767             if (lo)
768                 result->push_back(std::unique_ptr<const LayerOutput>{ lo });
769         }
771     return result;
774 LayerOutput::LayerOutput(int layer_index, int output_index, int buffer_id,
775                          int num_roi, int num_channels, size_t height,
776                          size_t width, const char* data):
777                         layer_index_m(layer_index), buffer_id_m(buffer_id),
778                         num_roi_m(num_roi), num_channels_m(num_channels),
779                         height_m(height), width_m(width), data_m(data)
780 { }
782 LayerOutput::~LayerOutput()
784     delete[] data_m;
787 void ExecutionObject::Impl::AcquireLock()
789     std::unique_lock<std::mutex> lock(mutex_access_m);
790     cv_access_m.wait(lock, [this]{ return this->is_idle_m; });
791     is_idle_m = false;
794 void ExecutionObject::Impl::ReleaseLock()
796     is_idle_m = true;
797     cv_access_m.notify_all();