Enqueue multiple frames at device side
[tidl/tidl-api.git] / tidl_api / src / execution_object.cpp
1 /******************************************************************************
2  * Copyright (c) 2017-2018 Texas Instruments Incorporated - http://www.ti.com/
3  *  All rights reserved.
4  *
5  *  Redistribution and use in source and binary forms, with or without
6  *  modification, are permitted provided that the following conditions are met:
7  *      * Redistributions of source code must retain the above copyright
8  *        notice, this list of conditions and the following disclaimer.
9  *      * Redistributions in binary form must reproduce the above copyright
10  *        notice, this list of conditions and the following disclaimer in the
11  *        documentation and/or other materials provided with the distribution.
12  *      * Neither the name of Texas Instruments Incorporated nor the
13  *        names of its contributors may be used to endorse or promote products
14  *        derived from this software without specific prior written permission.
15  *
16  *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17  *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  *  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20  *  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21  *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22  *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23  *  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24  *  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25  *  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
26  *  THE POSSIBILITY OF SUCH DAMAGE.
27  *****************************************************************************/
29 /*! \file execution_object.cpp */
31 #include <string.h>
32 #include <fstream>
33 #include <climits>
34 #include <mutex>
35 #include <condition_variable>
36 #include <chrono>
37 #include "executor.h"
38 #include "execution_object.h"
39 #include "trace.h"
40 #include "ocl_device.h"
41 #include "parameters.h"
42 #include "common_defines.h"
43 #include "tidl_create_params.h"
44 #include "device_arginfo.h"
46 using namespace tidl;
48 class ExecutionObject::Impl
49 {
50     public:
51         Impl(Device* d, uint8_t device_index,
52              const DeviceArgInfo& create_arg,
53              const DeviceArgInfo& param_heap_arg,
54              const Configuration& configuration,
55              int    layers_group_id);
56         ~Impl() {}
58         bool RunAsync(CallType ct, uint32_t context_idx);
59         bool Wait    (CallType ct, uint32_t context_idx);
60         bool AddCallback(CallType ct, void *user_data, uint32_t context_idx);
62         uint64_t GetProcessCycles(uint32_t context_idx) const;
63         int  GetLayersGroupId() const;
64         void AcquireContext(uint32_t& context_idx);
65         void ReleaseContext(uint32_t  context_idx);
67         Device*                         device_m;
68         // Index of the OpenCL device/queue used by this EO
69         uint8_t                         device_index_m;
70         std::string                     device_name_m;
72         up_malloc_ddr<char>             tidl_extmem_heap_m;
73         up_malloc_ddr<OCL_TIDL_InitializeParams> shared_initialize_params_m;
74         up_malloc_ddr<OCL_TIDL_ProcessParams>    shared_process_params_m;
76         size_t                          in_size_m;
77         size_t                          out_size_m;
78         IODeviceArgInfo                 in_m[tidl::internal::NUM_CONTEXTS];
79         IODeviceArgInfo                 out_m[tidl::internal::NUM_CONTEXTS];
81         // Frame being processed by the EO
82         int                             current_frame_idx_m;
84         // LayersGroupId being processed by the EO
85         int                             layers_group_id_m;
87         // Trace related
88         void WriteLayerOutputsToFile (const std::string& filename_prefix) const;
90         const LayerOutput* GetOutputFromLayer (uint32_t layer_index,
91                                                uint32_t output_index) const;
92         const LayerOutputs* GetOutputsFromAllLayers() const;
94         uint32_t                          num_network_layers_m;
95         up_malloc_ddr<OCL_TIDL_BufParams> trace_buf_params_m;
96         size_t                            trace_buf_params_sz_m;
98         // host time tracking: eo start to finish
99         float host_time_m[tidl::internal::NUM_CONTEXTS];
101     private:
102         void SetupInitializeKernel(const DeviceArgInfo& create_arg,
103                                    const DeviceArgInfo& param_heap_arg);
104         void EnableOutputBufferTrace();
105         void SetupProcessKernel();
107         void HostWriteNetInput(uint32_t context_idx);
108         void HostReadNetOutput(uint32_t context_idx);
109         void ComputeInputOutputSizes();
111         std::unique_ptr<Kernel>         k_initialize_m;
112         std::unique_ptr<Kernel>         k_process_m;
113         std::unique_ptr<Kernel>         k_cleanup_m;
115         // Guarding sole access to input/output for one frame during execution
116         // Encoding: context at bit index, bit value: 0 for idle, 1 for busy
117         uint32_t                        idle_encoding_m;
118         std::mutex                      mutex_access_m;
119         std::condition_variable         cv_access_m;
121         const Configuration             configuration_m;
122 };
125 ExecutionObject::ExecutionObject(Device* d,
126                                  uint8_t device_index,
127                                  const   ArgInfo& create_arg,
128                                  const   ArgInfo& param_heap_arg,
129                                  const   Configuration& configuration,
130                                  int     layers_group_id)
132     TRACE::print("-> ExecutionObject::ExecutionObject()\n");
134     DeviceArgInfo create_arg_d(create_arg, DeviceArgInfo::Kind::BUFFER);
135     DeviceArgInfo param_heap_arg_d(param_heap_arg, DeviceArgInfo::Kind::BUFFER);
137     pimpl_m = std::unique_ptr<ExecutionObject::Impl>
138               { new ExecutionObject::Impl(d, device_index,
139                                           create_arg_d,
140                                           param_heap_arg_d,
141                                           configuration,
142                                           layers_group_id) };
143     TRACE::print("<- ExecutionObject::ExecutionObject()\n");
147 ExecutionObject::Impl::Impl(Device* d, uint8_t device_index,
148                             const DeviceArgInfo& create_arg,
149                             const DeviceArgInfo& param_heap_arg,
150                             const Configuration& configuration,
151                             int    layers_group_id):
152     device_m(d),
153     device_index_m(device_index),
154     tidl_extmem_heap_m (nullptr, &__free_ddr),
155     shared_initialize_params_m(nullptr, &__free_ddr),
156     shared_process_params_m(nullptr, &__free_ddr),
157     in_size_m(0),
158     out_size_m(0),
159     current_frame_idx_m(0),
160     layers_group_id_m(layers_group_id),
161     num_network_layers_m(0),
162     trace_buf_params_m(nullptr, &__free_ddr),
163     trace_buf_params_sz_m(0),
164     k_initialize_m(nullptr),
165     k_process_m(nullptr),
166     k_cleanup_m(nullptr),
167     idle_encoding_m(0),  // all contexts are idle
168     configuration_m(configuration)
170     device_name_m = device_m->GetDeviceName() + std::to_string(device_index_m);
171     // Save number of layers in the network
172     const TIDL_CreateParams* cp =
173                 static_cast<const TIDL_CreateParams *>(create_arg.ptr());
174     num_network_layers_m = cp->net.numLayers;
176     SetupInitializeKernel(create_arg, param_heap_arg);
178     if (configuration_m.enableOutputTrace)
179         EnableOutputBufferTrace();
181     SetupProcessKernel();
184 // Pointer to implementation idiom: https://herbsutter.com/gotw/_100/:
185 // Both unique_ptr and shared_ptr can be instantiated with an incomplete type
186 // unique_ptr's destructor requires a complete type in order to invoke delete
187 ExecutionObject::~ExecutionObject() = default;
189 char* ExecutionObject::GetInputBufferPtr() const
191     return static_cast<char *>(pimpl_m->in_m[0].GetArg().ptr());
194 size_t ExecutionObject::GetInputBufferSizeInBytes() const
196     return pimpl_m->in_size_m;
199 char* ExecutionObject::GetOutputBufferPtr() const
201     return static_cast<char *>(pimpl_m->out_m[0].GetArg().ptr());
204 size_t ExecutionObject::GetOutputBufferSizeInBytes() const
206     return pimpl_m->out_size_m;
209 void  ExecutionObject::SetFrameIndex(int idx)
211     pimpl_m->current_frame_idx_m = idx;
214 int ExecutionObject::GetFrameIndex() const
216     return pimpl_m->current_frame_idx_m;
219 void ExecutionObject::SetInputOutputBuffer(const ArgInfo& in,
220                                            const ArgInfo& out)
222     SetInputOutputBuffer(in, out, 0);
225 void ExecutionObject::SetInputOutputBuffer(const ArgInfo& in,
226                                       const ArgInfo& out, uint32_t context_idx)
228     assert(in.ptr()  != nullptr && in.size()  >= pimpl_m->in_size_m);
229     assert(out.ptr() != nullptr && out.size() >= pimpl_m->out_size_m);
231     pimpl_m->in_m[context_idx]  = IODeviceArgInfo(in);
232     pimpl_m->out_m[context_idx] = IODeviceArgInfo(out);
235 void ExecutionObject::SetInputOutputBuffer(const IODeviceArgInfo* in,
236                                            const IODeviceArgInfo* out,
237                                            uint32_t context_idx)
239     pimpl_m->in_m[context_idx]  = *in;
240     pimpl_m->out_m[context_idx] = *out;
243 bool ExecutionObject::ProcessFrameStartAsync()
245     return ProcessFrameStartAsync(0);
248 bool ExecutionObject::ProcessFrameStartAsync(uint32_t context_idx)
250     TRACE::print("-> ExecutionObject::ProcessFrameStartAsync(%d)\n",
251                  context_idx);
252     assert(GetInputBufferPtr() != nullptr && GetOutputBufferPtr() != nullptr);
253     return pimpl_m->RunAsync(ExecutionObject::CallType::PROCESS, context_idx);
256 bool ExecutionObject::ProcessFrameWait()
258     return ProcessFrameWait(0);
261 bool ExecutionObject::ProcessFrameWait(uint32_t context_idx)
263     TRACE::print("-> ExecutionObject::ProcessFrameWait(%d)\n", context_idx);
264     return pimpl_m->Wait(ExecutionObject::CallType::PROCESS, context_idx);
267 bool ExecutionObject::RunAsync (CallType ct)
269     return pimpl_m->RunAsync(ct, 0);
272 bool ExecutionObject::Wait (CallType ct)
274     return pimpl_m->Wait(ct, 0);
277 bool ExecutionObject::AddCallback(CallType ct, void *user_data,
278                                   uint32_t context_idx)
280     return pimpl_m->AddCallback(ct, user_data, context_idx);
283 float ExecutionObject::GetProcessTimeInMilliSeconds() const
285     return GetProcessTimeInMilliSeconds(0);
288 float ExecutionObject::GetProcessTimeInMilliSeconds(uint32_t context_idx) const
290     float frequency = pimpl_m->device_m->GetFrequencyInMhz() * 1000000;
291     return ((float)pimpl_m->GetProcessCycles(context_idx)) / frequency * 1000;
294 float ExecutionObject::GetHostProcessTimeInMilliSeconds() const
296     return GetHostProcessTimeInMilliSeconds(0);
299 float ExecutionObject::GetHostProcessTimeInMilliSeconds(uint32_t context_idx) const
301     return pimpl_m->host_time_m[context_idx];
304 void
305 ExecutionObject::WriteLayerOutputsToFile(const std::string& filename_prefix) const
307     pimpl_m->WriteLayerOutputsToFile(filename_prefix);
310 const LayerOutput* ExecutionObject::GetOutputFromLayer(
311                          uint32_t layer_index, uint32_t output_index) const
313     return pimpl_m->GetOutputFromLayer(layer_index, output_index);
316 const LayerOutputs* ExecutionObject::GetOutputsFromAllLayers() const
318     return pimpl_m->GetOutputsFromAllLayers();
321 int ExecutionObject::GetLayersGroupId() const
323     return pimpl_m->layers_group_id_m;
326 const std::string& ExecutionObject::GetDeviceName() const
328     return pimpl_m->device_name_m;
331 void ExecutionObject::AcquireContext(uint32_t& context_idx)
333     pimpl_m->AcquireContext(context_idx);
336 void ExecutionObject::ReleaseContext(uint32_t context_idx)
338     pimpl_m->ReleaseContext(context_idx);
341 //
342 // Create a kernel to call the "initialize" function
343 //
344 void
345 ExecutionObject::Impl::SetupInitializeKernel(const DeviceArgInfo& create_arg,
346                                              const DeviceArgInfo& param_heap_arg)
348     // Allocate a heap for TI DL to use on the device
349     tidl_extmem_heap_m.reset(
350                          malloc_ddr<char>(configuration_m.NETWORK_HEAP_SIZE));
352     // Create a kernel for cleanup
353     KernelArgs cleanup_args;
354     k_cleanup_m.reset(new Kernel(device_m,
355                                  STRING(CLEANUP_KERNEL),
356                                  cleanup_args, device_index_m));
358     // Set up parameter struct for the initialize kernel
359     shared_initialize_params_m.reset(malloc_ddr<OCL_TIDL_InitializeParams>());
360     memset(shared_initialize_params_m.get(), 0,
361            sizeof(OCL_TIDL_InitializeParams));
363     shared_initialize_params_m->tidlHeapSize =configuration_m.NETWORK_HEAP_SIZE;
364     shared_initialize_params_m->l2HeapSize   = tidl::internal::DMEM1_SIZE;
365     shared_initialize_params_m->l1HeapSize   = tidl::internal::DMEM0_SIZE;
366     shared_initialize_params_m->numContexts  = tidl::internal::NUM_CONTEXTS;
368     // Set up execution trace specified in the configuration
369     EnableExecutionTrace(configuration_m,
370                          &shared_initialize_params_m->enableTrace);
372     // Setup kernel arguments for initialize
373     KernelArgs args = { create_arg,
374                         param_heap_arg,
375                         DeviceArgInfo(tidl_extmem_heap_m.get(),
376                                       configuration_m.NETWORK_HEAP_SIZE,
377                                       DeviceArgInfo::Kind::BUFFER),
378                         DeviceArgInfo(shared_initialize_params_m.get(),
379                                       sizeof(OCL_TIDL_InitializeParams),
380                                       DeviceArgInfo::Kind::BUFFER),
381                         device_m->type() == CL_DEVICE_TYPE_ACCELERATOR ?
382                             DeviceArgInfo(nullptr, tidl::internal::DMEM1_SIZE,
383                                           DeviceArgInfo::Kind::LOCAL):
384                             DeviceArgInfo(nullptr, 4,
385                                           DeviceArgInfo::Kind::LOCAL) };
387     k_initialize_m.reset(new Kernel(device_m,
388                                     STRING(INIT_KERNEL), args,
389                                     device_index_m));
392 //
393 // Allocate an OpenCL buffer for TIDL layer output buffer metadata.
394 // The device will populate metadata for every buffer that is used as an
395 // output buffer by a layer.  This needs to be done before setting up
396 // process kernel.
397 //
398 void ExecutionObject::Impl::EnableOutputBufferTrace()
400     trace_buf_params_sz_m = (sizeof(OCL_TIDL_BufParams)*
401                              num_network_layers_m*
402                              TIDL_NUM_OUT_BUFS);
404     trace_buf_params_m.reset(malloc_ddr<OCL_TIDL_BufParams>
405                              (trace_buf_params_sz_m));
407     // Device will update bufferId if there is valid data for the entry
408     OCL_TIDL_BufParams* bufferParams = trace_buf_params_m.get();
409     for (uint32_t i = 0; i < num_network_layers_m; i++)
410         for (int j = 0; j < TIDL_NUM_OUT_BUFS; j++)
411         {
412             OCL_TIDL_BufParams *bufP =
413                                 &bufferParams[i*TIDL_NUM_OUT_BUFS+j];
414             bufP->bufferId = UINT_MAX;
415         }
418 //
419 // Create a kernel to call the "process" function
420 //
421 void
422 ExecutionObject::Impl::SetupProcessKernel()
424     shared_process_params_m.reset(malloc_ddr<OCL_TIDL_ProcessParams>(
425                tidl::internal::NUM_CONTEXTS * sizeof(OCL_TIDL_ProcessParams)));
427     // Set up execution trace specified in the configuration
428     for (int i = 0; i < tidl::internal::NUM_CONTEXTS; i++)
429     {
430         OCL_TIDL_ProcessParams *p_params = shared_process_params_m.get() + i;
431         EnableExecutionTrace(configuration_m, &p_params->enableTrace);
432     }
434     uint32_t context_idx = 0;
435     KernelArgs args = { DeviceArgInfo(shared_process_params_m.get(),
436                                       tidl::internal::NUM_CONTEXTS *
437                                       sizeof(OCL_TIDL_ProcessParams),
438                                       DeviceArgInfo::Kind::BUFFER),
439                         DeviceArgInfo(tidl_extmem_heap_m.get(),
440                                       shared_initialize_params_m->tidlHeapSize,
441                                       DeviceArgInfo::Kind::BUFFER),
442                         DeviceArgInfo(trace_buf_params_m.get(),
443                                       trace_buf_params_sz_m,
444                                       DeviceArgInfo::Kind::BUFFER),
445                         DeviceArgInfo(&context_idx,
446                                       sizeof(uint32_t),
447                                       DeviceArgInfo::Kind::SCALAR)
448                       };
450     k_process_m.reset(new Kernel(device_m,
451                                  STRING(PROCESS_KERNEL), args,
452                                  device_index_m));
// Copy densely packed 8-bit planes from readPtr into a padded layout at ptr.
//   roi, n        : number of ROIs and channels per ROI
//   width, height : size of each source plane (rows are width bytes apart)
//   pitch         : distance in bytes between destination rows
//   chOffset      : distance in bytes between destination channels
// Returns the number of source bytes consumed, or 0 when readPtr is null.
static size_t readDataS8(const char *readPtr, char *ptr, int roi, int n,
                         int width, int height, int pitch,
                         int chOffset)
{
    if (readPtr == nullptr)
        return 0;

    for (int r = 0; r < roi; ++r)
    {
        for (int ch = 0; ch < n; ++ch)
        {
            const char *src_plane = readPtr + r*n*width*height
                                            + ch*width*height;
            char       *dst_plane = ptr + r*n*chOffset + ch*chOffset;

            for (int row = 0; row < height; ++row)
                memcpy(dst_plane + row*pitch, src_plane + row*width, width);
        }
    }

    return width*height*n*roi;
}
// Copy 8-bit planes from a padded layout at ptr into densely packed rows at
// writePtr.
//   n             : number of channels
//   width, height : size of each plane in the packed destination
//   pitch         : distance in bytes between source rows
//   chOffset      : distance in bytes between source channels
// Returns the number of bytes written, or 0 when writePtr is null.
static size_t writeDataS8(char *writePtr, const char *ptr, int n, int width,
                          int height, int pitch, int chOffset)
{
    if (writePtr == nullptr)
        return 0;

    for (int ch = 0; ch < n; ++ch)
    {
        char       *dst_plane = writePtr + ch*width*height;
        const char *src_plane = ptr + ch*chOffset;

        for (int row = 0; row < height; ++row)
            memcpy(dst_plane + row*width, src_plane + row*pitch, width);
    }

    return width*height*n;
}
486 //
487 // Copy from host buffer to TIDL device buffer
488 //
489 void ExecutionObject::Impl::HostWriteNetInput(uint32_t context_idx)
491     const char*     readPtr  = (const char *) in_m[context_idx].GetArg().ptr();
492     const PipeInfo& pipe     = in_m[context_idx].GetPipe();
493     OCL_TIDL_ProcessParams *p_params = shared_process_params_m.get()
494                                        + context_idx;
496     for (unsigned int i = 0; i < shared_initialize_params_m->numInBufs; i++)
497     {
498         OCL_TIDL_BufParams *inBuf = &shared_initialize_params_m->inBufs[i];
499         uint32_t context_size = inBuf->bufPlaneWidth * inBuf->bufPlaneHeight;
500                  context_size = (context_size + OCL_TIDL_CACHE_ALIGN - 1) &
501                                 (~(OCL_TIDL_CACHE_ALIGN - 1));
502         char *inBufAddr = tidl_extmem_heap_m.get() + inBuf->bufPlaneBufOffset
503                           + context_idx * context_size;
505             readPtr += readDataS8(
506                 readPtr,
507                 (char *) inBufAddr
508                     + inBuf->bufPlaneWidth * OCL_TIDL_MAX_PAD_SIZE
509                     + OCL_TIDL_MAX_PAD_SIZE,
510                 inBuf->numROIs,
511                 inBuf->numChannels,
512                 inBuf->ROIWidth,
513                 inBuf->ROIHeight,
514                 inBuf->bufPlaneWidth,
515                 ((inBuf->bufPlaneWidth * inBuf->bufPlaneHeight) /
516                  inBuf->numChannels));
518         p_params->dataQ[i] = pipe.dataQ_m[i];
519     }
522 //
523 // Copy from TIDL device buffer into host buffer
524 //
525 void ExecutionObject::Impl::HostReadNetOutput(uint32_t context_idx)
527     char* writePtr = (char *) out_m[context_idx].GetArg().ptr();
528     PipeInfo& pipe = out_m[context_idx].GetPipe();
529     OCL_TIDL_ProcessParams *p_params = shared_process_params_m.get()
530                                        + context_idx;
532     for (unsigned int i = 0; i < shared_initialize_params_m->numOutBufs; i++)
533     {
534         OCL_TIDL_BufParams *outBuf = &shared_initialize_params_m->outBufs[i];
535         uint32_t context_size = outBuf->bufPlaneWidth * outBuf->bufPlaneHeight;
536                  context_size = (context_size + OCL_TIDL_CACHE_ALIGN - 1) &
537                                 (~(OCL_TIDL_CACHE_ALIGN - 1));
538         char *outBufAddr = tidl_extmem_heap_m.get() + outBuf->bufPlaneBufOffset
539                            + context_idx * context_size;
540         if (writePtr != nullptr)
541         {
542             writePtr += writeDataS8(
543                 writePtr,
544                 (char *) outBufAddr
545                     + outBuf->bufPlaneWidth * OCL_TIDL_MAX_PAD_SIZE
546                     + OCL_TIDL_MAX_PAD_SIZE,
547                 outBuf->numChannels,
548                 outBuf->ROIWidth,
549                 outBuf->ROIHeight,
550                 outBuf->bufPlaneWidth,
551                 ((outBuf->bufPlaneWidth * outBuf->bufPlaneHeight)/
552                  outBuf->numChannels));
553         }
555         pipe.dataQ_m[i]   = p_params->dataQ[i];
556     }
559 void ExecutionObject::Impl::ComputeInputOutputSizes()
561     if (shared_initialize_params_m->errorCode != OCL_TIDL_SUCCESS)  return;
563     if (shared_initialize_params_m->numInBufs > OCL_TIDL_MAX_IN_BUFS ||
564         shared_initialize_params_m->numOutBufs > OCL_TIDL_MAX_OUT_BUFS)
565     {
566         std::cout << "Num input/output bufs ("
567                   << shared_initialize_params_m->numInBufs << ", "
568                   << shared_initialize_params_m->numOutBufs
569                   << ") exceeded limit!" << std::endl;
570         shared_initialize_params_m->errorCode = OCL_TIDL_INIT_FAIL;
571         return;
572     }
574     in_size_m  = 0;
575     out_size_m = 0;
576     for (unsigned int i = 0; i < shared_initialize_params_m->numInBufs; i++)
577     {
578         OCL_TIDL_BufParams *inBuf = &shared_initialize_params_m->inBufs[i];
579         in_size_m += inBuf->numROIs * inBuf->numChannels * inBuf->ROIWidth *
580                      inBuf->ROIHeight;
581     }
582     for (unsigned int i = 0; i < shared_initialize_params_m->numOutBufs; i++)
583     {
584         OCL_TIDL_BufParams *outBuf = &shared_initialize_params_m->outBufs[i];
585         out_size_m += outBuf->numChannels * outBuf->ROIWidth *outBuf->ROIHeight;
586     }
590 bool ExecutionObject::Impl::RunAsync(CallType ct, uint32_t context_idx)
592     switch (ct)
593     {
594         case CallType::INIT:
595         {
596             k_initialize_m->RunAsync();
597             break;
598         }
599         case CallType::PROCESS:
600         {
601             std::chrono::time_point<std::chrono::steady_clock> t1, t2;
602             t1 = std::chrono::steady_clock::now();
604             OCL_TIDL_ProcessParams *p_params = shared_process_params_m.get()
605                                                + context_idx;
606             p_params->frameIdx = current_frame_idx_m;
607             HostWriteNetInput(context_idx);
608             {
609                 std::unique_lock<std::mutex> lock(mutex_access_m);
610                 k_process_m->UpdateScalarArg(3, sizeof(uint32_t), &context_idx);
611                 k_process_m->RunAsync(context_idx);
612             }
614             t2 = std::chrono::steady_clock::now();
615             std::chrono::duration<float> elapsed = t2 - t1;
616             host_time_m[context_idx] = elapsed.count() * 1000;
617             break;
618         }
619         case CallType::CLEANUP:
620         {
621             k_cleanup_m->RunAsync();
622             break;
623         }
624         default:
625             return false;
626     }
628     return true;
631 bool ExecutionObject::Impl::Wait(CallType ct, uint32_t context_idx)
633     switch (ct)
634     {
635         case CallType::INIT:
636         {
637             bool has_work = k_initialize_m->Wait();
639             if (has_work)
640             {
641                 ComputeInputOutputSizes();
642                 if (shared_initialize_params_m->errorCode != OCL_TIDL_SUCCESS)
643                     throw Exception(shared_initialize_params_m->errorCode,
644                                     __FILE__, __FUNCTION__, __LINE__);
645             }
646             return has_work;
647         }
648         case CallType::PROCESS:
649         {
650             float host_elapsed_ms = 0.0f;
651             bool has_work = k_process_m->Wait(&host_elapsed_ms);
652             if (has_work)
653             {
654                 OCL_TIDL_ProcessParams *p_params = shared_process_params_m.get()
655                                                    + context_idx;
656                 if (p_params->errorCode != OCL_TIDL_SUCCESS)
657                     throw Exception(p_params->errorCode,
658                                     __FILE__, __FUNCTION__, __LINE__);
660                 std::chrono::time_point<std::chrono::steady_clock> t1, t2;
661                 t1 = std::chrono::steady_clock::now();
662                 HostReadNetOutput(context_idx);
663                 t2 = std::chrono::steady_clock::now();
664                 std::chrono::duration<float> elapsed = t2 - t1;
665                 host_time_m += elapsed.count() * 1000 + host_elapsed_ms;
666             }
668             return has_work;
669         }
670         case CallType::CLEANUP:
671         {
672             return k_cleanup_m->Wait();
673             break;
674         }
675         default:
676             return false;
677     }
679     return false;
682 bool ExecutionObject::Impl::AddCallback(CallType ct, void *user_data,
683                                         uint32_t context_idx)
685     switch (ct)
686     {
687         case CallType::PROCESS:
688         {
689             return k_process_m->AddCallback(user_data, context_idx);
690             break;
691         }
692         default:
693             return false;
694     }
696     return false;
699 uint64_t ExecutionObject::Impl::GetProcessCycles(uint32_t context_idx) const
701     uint8_t factor = 1;
703     // ARP32 running at half frequency of VCOP, multiply by 2 for VCOP cycles
704     if (device_m->type() == CL_DEVICE_TYPE_CUSTOM)
705         factor = 2;
707     OCL_TIDL_ProcessParams *p_params = shared_process_params_m.get() +
708                                        context_idx;
709     return p_params->cycles * factor;
712 //
713 // Write the trace data to output files
714 //
715 void
716 ExecutionObject::Impl::WriteLayerOutputsToFile(const std::string& filename_prefix) const
718     if (trace_buf_params_sz_m == 0)
719         return;
721     OCL_TIDL_BufParams* bufferParams = trace_buf_params_m.get();
723     for (uint32_t i = 0; i < num_network_layers_m; i++)
724         for (int j = 0; j < TIDL_NUM_OUT_BUFS; j++)
725         {
726             OCL_TIDL_BufParams* buf = &bufferParams[i*TIDL_NUM_OUT_BUFS+j];
728             if (buf->bufferId == UINT_MAX)
729                 continue;
731             size_t buffer_size = buf->numChannels * buf->ROIHeight *
732                                  buf->ROIWidth;
734             char *tmp = new char[buffer_size];
736             if (tmp == nullptr)
737                 throw Exception("Out of memory, new failed",
738                         __FILE__, __FUNCTION__, __LINE__);
740             writeDataS8(
741                 tmp,
742                 (char *) tidl_extmem_heap_m.get() + buf->bufPlaneBufOffset
743                 + buf->bufPlaneWidth * OCL_TIDL_MAX_PAD_SIZE
744                 + OCL_TIDL_MAX_PAD_SIZE,
745                 buf->numChannels,
746                 buf->ROIWidth,
747                 buf->ROIHeight,
748                 buf->bufPlaneWidth,
749                 ((buf->bufPlaneWidth * buf->bufPlaneHeight)/
750                  buf->numChannels));
752             std::string filename(filename_prefix);
753             filename += std::to_string(buf->bufferId) + "_";
754             filename += std::to_string(buf->ROIWidth) + "x";
755             filename += std::to_string(buf->ROIHeight) + ".bin";
757             std::ofstream ofs;
758             ofs.open(filename, std::ofstream::out);
759             ofs.write(tmp, buffer_size);
760             ofs.close();
762             delete[] tmp;
763         }
767 const LayerOutput* ExecutionObject::Impl::GetOutputFromLayer(
768                             uint32_t layer_index, uint32_t output_index) const
770     if (trace_buf_params_sz_m == 0)
771         return nullptr;
773     if (layer_index > num_network_layers_m || output_index > TIDL_NUM_OUT_BUFS)
774         return nullptr;
776     OCL_TIDL_BufParams* bufferParams = trace_buf_params_m.get();
777     OCL_TIDL_BufParams* buf = &bufferParams[layer_index*TIDL_NUM_OUT_BUFS+
778                                             output_index];
780     if (buf->bufferId == UINT_MAX)
781         return nullptr;
783     size_t buffer_size = buf->numChannels * buf->ROIHeight *
784                          buf->ROIWidth;
786     char *data = new char[buffer_size];
788     if (data == nullptr)
789         throw Exception("Out of memory, new failed",
790                 __FILE__, __FUNCTION__, __LINE__);
792     writeDataS8(data,
793                 (char *) tidl_extmem_heap_m.get() + buf->bufPlaneBufOffset
794                 + buf->bufPlaneWidth * OCL_TIDL_MAX_PAD_SIZE
795                 + OCL_TIDL_MAX_PAD_SIZE,
796                 buf->numChannels,
797                 buf->ROIWidth,
798                 buf->ROIHeight,
799                 buf->bufPlaneWidth,
800                 ((buf->bufPlaneWidth * buf->bufPlaneHeight)/
801                  buf->numChannels));
803     return new LayerOutput(layer_index, output_index, buf->bufferId,
804                            buf->numROIs, buf->numChannels, buf->ROIHeight,
805                            buf->ROIWidth, data);
808 const LayerOutputs* ExecutionObject::Impl::GetOutputsFromAllLayers() const
810     LayerOutputs* result = new LayerOutputs;
812     for (uint32_t i=0; i < num_network_layers_m; i++)
813         for (int j=0; j < TIDL_NUM_OUT_BUFS; j++)
814         {
815             const LayerOutput* lo = GetOutputFromLayer(i, j);
816             if (lo)
817                 result->push_back(std::unique_ptr<const LayerOutput>{ lo });
818         }
820     return result;
823 LayerOutput::LayerOutput(int layer_index, int output_index, int buffer_id,
824                          int num_roi, int num_channels, size_t height,
825                          size_t width, const char* data):
826                         layer_index_m(layer_index), buffer_id_m(buffer_id),
827                         num_roi_m(num_roi), num_channels_m(num_channels),
828                         height_m(height), width_m(width), data_m(data)
829 { }
831 LayerOutput::~LayerOutput()
833     delete[] data_m;
836 void ExecutionObject::Impl::AcquireContext(uint32_t& context_idx)
838     std::unique_lock<std::mutex> lock(mutex_access_m);
839     cv_access_m.wait(lock, [this]{ return this->idle_encoding_m <
840                                    (1 << tidl::internal::NUM_CONTEXTS) - 1; });
842     for (uint32_t i = 0; i < tidl::internal::NUM_CONTEXTS; i++)
843         if (((1 << i) & idle_encoding_m) == 0)
844         {
845             context_idx = i;
846             break;
847         }
848     idle_encoding_m |= (1 << context_idx);  // mark the bit as busy
851 void ExecutionObject::Impl::ReleaseContext(uint32_t context_idx)
853     {
854         std::unique_lock<std::mutex> lock(mutex_access_m);
855         idle_encoding_m &= (~(1 << context_idx));  // mark the bit as free
856     }
857     cv_access_m.notify_all();