eb72cbd6dcbf146812d5dd7bc190f5dc21d7d29e
[tidl/tidl-api.git] / tidl_api / src / execution_object.cpp
1 /******************************************************************************
2  * Copyright (c) 2017-2018 Texas Instruments Incorporated - http://www.ti.com/
3  *  All rights reserved.
4  *
5  *  Redistribution and use in source and binary forms, with or without
6  *  modification, are permitted provided that the following conditions are met:
7  *      * Redistributions of source code must retain the above copyright
8  *        notice, this list of conditions and the following disclaimer.
9  *      * Redistributions in binary form must reproduce the above copyright
10  *        notice, this list of conditions and the following disclaimer in the
11  *        documentation and/or other materials provided with the distribution.
12  *      * Neither the name of Texas Instruments Incorporated nor the
13  *        names of its contributors may be used to endorse or promote products
14  *        derived from this software without specific prior written permission.
15  *
16  *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17  *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  *  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20  *  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21  *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22  *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23  *  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24  *  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25  *  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
26  *  THE POSSIBILITY OF SUCH DAMAGE.
27  *****************************************************************************/
29 /*! \file execution_object.cpp */
31 #include "executor.h"
32 #include "execution_object.h"
33 #include "trace.h"
34 #include "ocl_device.h"
35 #include "parameters.h"
36 #include "configuration.h"
37 #include "common_defines.h"
38 #include <string.h>
39 #include "tidl_create_params.h"
40 #include <fstream>
41 #include <climits>
43 using namespace tidl;
45 class ExecutionObject::Impl
46 {
47     public:
48         Impl(Device* d, uint8_t device_index,
49              const ArgInfo& create_arg,
50              const ArgInfo& param_heap_arg,
51              size_t extmem_heap_size,
52              bool   internal_input);
53         ~Impl() {}
55         bool RunAsync(CallType ct);
56         bool Wait    (CallType ct);
58         bool SetupProcessKernel(const ArgInfo& in, const ArgInfo& out);
59         void HostWriteNetInput();
60         void HostReadNetOutput();
61         void ComputeInputOutputSizes();
63         Device*                         device_m;
64         std::unique_ptr<Kernel>         k_initialize_m;
65         std::unique_ptr<Kernel>         k_process_m;
66         std::unique_ptr<Kernel>         k_cleanup_m;
68         up_malloc_ddr<char>             tidl_extmem_heap_m;
69         up_malloc_ddr<OCL_TIDL_InitializeParams> shared_initialize_params_m;
70         up_malloc_ddr<OCL_TIDL_ProcessParams>    shared_process_params_m;
72         size_t                          in_size_m;
73         size_t                          out_size_m;
74         ArgInfo                         in_m;
75         ArgInfo                         out_m;
77         // Index of the OpenCL device/queue used by this EO
78         uint8_t                         device_index_m;
80         // Frame being processed by the EO
81         int                             current_frame_idx_m;
83         // Trace related
84         uint32_t                          num_network_layers_m;
85         up_malloc_ddr<OCL_TIDL_BufParams> trace_buf_params_m;
86         size_t                            trace_buf_params_sz_m;
87         void WriteLayerOutputsToFile (const std::string& filename_prefix) const;
89         const LayerOutput* GetOutputFromLayer (uint32_t layer_index,
90                                                uint32_t output_index) const;
91         const LayerOutputs* GetOutputsFromAllLayers() const;
92 };
95 ExecutionObject::ExecutionObject(Device* d,
96                                  uint8_t device_index,
97                                  const ArgInfo& create_arg,
98                                  const ArgInfo& param_heap_arg,
99                                  size_t extmem_heap_size,
100                                  bool   internal_input)
102     pimpl_m = std::unique_ptr<ExecutionObject::Impl>
103               { new ExecutionObject::Impl(d, device_index,
104                                           create_arg,
105                                           param_heap_arg,
106                                           extmem_heap_size,
107                                           internal_input) };
111 ExecutionObject::Impl::Impl(Device* d,
112                                  uint8_t device_index,
113                                  const ArgInfo& create_arg,
114                                  const ArgInfo& param_heap_arg,
115                                  size_t extmem_heap_size,
116                                  bool   internal_input):
117     device_m(d),
118     k_initialize_m(nullptr),
119     k_process_m(nullptr),
120     k_cleanup_m(nullptr),
121     tidl_extmem_heap_m (nullptr, &__free_ddr),
122     shared_initialize_params_m(nullptr, &__free_ddr),
123     shared_process_params_m(nullptr, &__free_ddr),
124     in_size_m(0),
125     out_size_m(0),
126     in_m(nullptr, 0),
127     out_m(nullptr, 0),
128     device_index_m(device_index),
129     current_frame_idx_m(0),
130     num_network_layers_m(0),
131     trace_buf_params_m(nullptr, &__free_ddr),
132     trace_buf_params_sz_m(0)
134     // Allocate a heap for TI DL to use on the device
135     tidl_extmem_heap_m.reset(malloc_ddr<char>(extmem_heap_size));
137     // Create a kernel for cleanup
138     KernelArgs cleanup_args;
139     k_cleanup_m.reset(new Kernel(device_m,
140                                  STRING(CLEANUP_KERNEL),
141                                  cleanup_args, device_index_m));
143     // Set up parameter struct for the initialize kernel
144     shared_initialize_params_m.reset(malloc_ddr<OCL_TIDL_InitializeParams>());
145     memset(shared_initialize_params_m.get(), 0,
146            sizeof(OCL_TIDL_InitializeParams));
148     shared_initialize_params_m->tidlHeapSize = extmem_heap_size;
149     shared_initialize_params_m->l2HeapSize   = tidl::internal::DMEM1_SIZE;
150     shared_initialize_params_m->l1HeapSize   = tidl::internal::DMEM0_SIZE;
151     shared_initialize_params_m->enableTrace  = OCL_TIDL_TRACE_OFF;
152     shared_initialize_params_m->enableInternalInput = internal_input ? 1 : 0;
154     // Setup kernel arguments for initialize
155     KernelArgs args = { create_arg,
156                         param_heap_arg,
157                         ArgInfo(tidl_extmem_heap_m.get(),
158                                 extmem_heap_size),
159                         ArgInfo(shared_initialize_params_m.get(),
160                                 sizeof(OCL_TIDL_InitializeParams)),
161                         device_m->type() == CL_DEVICE_TYPE_ACCELERATOR ?
162                             ArgInfo(nullptr, tidl::internal::DMEM1_SIZE):
163                             ArgInfo(nullptr, 4)                       };
165     k_initialize_m.reset(new Kernel(device_m,
166                                     STRING(INIT_KERNEL), args, device_index_m));
168     // Save number of layers in the network
169     const TIDL_CreateParams* cp =
170                 static_cast<const TIDL_CreateParams *>(create_arg.ptr());
171     num_network_layers_m = cp->net.numLayers;
174 // Pointer to implementation idiom: https://herbsutter.com/gotw/_100/:
175 // Both unique_ptr and shared_ptr can be instantiated with an incomplete type
176 // unique_ptr's destructor requires a complete type in order to invoke delete
177 ExecutionObject::~ExecutionObject() = default;
179 char* ExecutionObject::GetInputBufferPtr() const
181     return static_cast<char *>(pimpl_m->in_m.ptr());
184 size_t ExecutionObject::GetInputBufferSizeInBytes() const
186     if (pimpl_m->in_m.ptr() == nullptr)  return pimpl_m->in_size_m;
187     else                                 return pimpl_m->in_m.size();
190 char* ExecutionObject::GetOutputBufferPtr() const
192     return static_cast<char *>(pimpl_m->out_m.ptr());
195 size_t ExecutionObject::GetOutputBufferSizeInBytes() const
197     if (pimpl_m->out_m.ptr() == nullptr)  return pimpl_m->out_size_m;
198     else           return pimpl_m->shared_process_params_m.get()->bytesWritten;
201 void  ExecutionObject::SetFrameIndex(int idx)
203     pimpl_m->current_frame_idx_m = idx;
206 int ExecutionObject::GetFrameIndex() const
208     return pimpl_m->current_frame_idx_m;
211 void ExecutionObject::SetInputOutputBuffer(const ArgInfo& in, const ArgInfo& out)
213     pimpl_m->SetupProcessKernel(in, out);
216 bool ExecutionObject::ProcessFrameStartAsync()
218     return pimpl_m->RunAsync(ExecutionObject::CallType::PROCESS);
221 bool ExecutionObject::ProcessFrameWait()
223     return pimpl_m->Wait(ExecutionObject::CallType::PROCESS);
226 bool ExecutionObject::RunAsync (CallType ct)
228     return pimpl_m->RunAsync(ct);
231 bool ExecutionObject::Wait (CallType ct)
233     return pimpl_m->Wait(ct);
236 uint64_t ExecutionObject::GetProcessCycles() const
238     uint8_t factor = 1;
240     // ARP32 running at half frequency of VCOP, multiply by 2 for VCOP cycles
241     if (pimpl_m->device_m->type() == CL_DEVICE_TYPE_CUSTOM)
242         factor = 2;
244     return pimpl_m->shared_process_params_m.get()->cycles * factor;
247 float ExecutionObject::GetProcessTimeInMilliSeconds() const
249     float frequency = pimpl_m->device_m->GetFrequencyInMhz() * 1000000;
250     return ((float)GetProcessCycles())/frequency * 1000;
253 const LayerOutput* ExecutionObject::GetOutputFromLayer(
254                          uint32_t layer_index, uint32_t output_index) const
256     return pimpl_m->GetOutputFromLayer(layer_index, output_index);
259 const LayerOutputs* ExecutionObject::GetOutputsFromAllLayers() const
261     return pimpl_m->GetOutputsFromAllLayers();
264 //
265 // Allocate an OpenCL buffer for TIDL layer output buffer metadata.
266 // The device will populate metadata for every buffer that is used as an
267 // output buffer by a layer.
268 //
269 void ExecutionObject::EnableOutputBufferTrace()
271     pimpl_m->trace_buf_params_sz_m = (sizeof(OCL_TIDL_BufParams)*
272                                        pimpl_m->num_network_layers_m*
273                                        TIDL_NUM_OUT_BUFS);
275     pimpl_m->trace_buf_params_m.reset(malloc_ddr<OCL_TIDL_BufParams>
276                                       (pimpl_m->trace_buf_params_sz_m));
278     // Device will update bufferId if there is valid data for the entry
279     OCL_TIDL_BufParams* bufferParams = pimpl_m->trace_buf_params_m.get();
280     for (uint32_t i = 0; i < pimpl_m->num_network_layers_m; i++)
281         for (int j = 0; j < TIDL_NUM_OUT_BUFS; j++)
282         {
283             OCL_TIDL_BufParams *bufP =
284                                 &bufferParams[i*TIDL_NUM_OUT_BUFS+j];
285             bufP->bufferId = UINT_MAX;
286         }
289 void
290 ExecutionObject::WriteLayerOutputsToFile(const std::string& filename_prefix) const
292     pimpl_m->WriteLayerOutputsToFile(filename_prefix);
295 //
296 // Create a kernel to call the "process" function
297 //
298 bool
299 ExecutionObject::Impl::SetupProcessKernel(const ArgInfo& in, const ArgInfo& out)
301     in_m = in;
302     out_m = out;
304     shared_process_params_m.reset(malloc_ddr<OCL_TIDL_ProcessParams>());
305     shared_process_params_m->enableTrace = OCL_TIDL_TRACE_OFF;
306     shared_process_params_m->enableInternalInput =
307                                shared_initialize_params_m->enableInternalInput;
308     shared_process_params_m->cycles = 0;
310     if (shared_process_params_m->enableInternalInput == 0)
311         assert(in.ptr() != nullptr && in.size() > 0);
313     KernelArgs args = { ArgInfo(shared_process_params_m.get(),
314                                 sizeof(OCL_TIDL_ProcessParams)),
315                         in,
316                         out,
317                         ArgInfo(tidl_extmem_heap_m.get(),
318                                 shared_initialize_params_m->tidlHeapSize),
319                         ArgInfo(trace_buf_params_m.get(),
320                                 trace_buf_params_sz_m)
322                       };
324     k_process_m.reset(new Kernel(device_m,
325                                  STRING(PROCESS_KERNEL), args, device_index_m));
327     return true;
//
// Copy densely packed 8-bit data into a pitched destination buffer.
//
// readPtr  : packed source, laid out as roi x n x height x width bytes
// ptr      : destination; row i1 of channel i0 in ROI i2 starts at
//            ptr[(i2*n + i0)*chOffset + i1*pitch]
// Returns the number of source bytes consumed (width*height*n*roi), or 0
// when readPtr is null or any dimension is not positive.
//
static size_t readDataS8(const char *readPtr, char *ptr, int roi, int n,
                         int width, int height, int pitch,
                         int chOffset)
{
    if (!readPtr)  return 0;

    // Nothing to copy; also keeps negative values away from memcpy
    if (roi <= 0 || n <= 0 || width <= 0 || height <= 0)  return 0;

    // Do the size arithmetic in size_t / long long: the products can exceed
    // the range of int for large buffers
    const size_t row_bytes   = static_cast<size_t>(width);
    const size_t plane_bytes = row_bytes * static_cast<size_t>(height);

    for (int i2 = 0; i2 < roi; i2++)
        for (int i0 = 0; i0 < n; i0++)
        {
            const long long plane = static_cast<long long>(i2) * n + i0;
            const char* src = readPtr + static_cast<size_t>(plane) * plane_bytes;
            char*       dst = ptr + plane * chOffset;

            for (int i1 = 0; i1 < height; i1++)
                memcpy(dst + static_cast<long long>(i1) * pitch,
                       src + static_cast<size_t>(i1) * row_bytes,
                       row_bytes);
        }

    return plane_bytes * static_cast<size_t>(n) * static_cast<size_t>(roi);
}
//
// Copy 8-bit data from a pitched source buffer into a densely packed
// destination.
//
// writePtr : packed destination, laid out as n x height x width bytes
// ptr      : source; row i1 of channel i0 starts at ptr[i0*chOffset + i1*pitch]
// Returns the number of bytes written (width*height*n), or 0 when writePtr
// is null or any dimension is not positive.
//
static size_t writeDataS8(char *writePtr, const char *ptr, int n, int width,
                          int height, int pitch, int chOffset)
{
    if (!writePtr)  return 0;

    // Nothing to copy; also keeps negative values away from memcpy
    if (n <= 0 || width <= 0 || height <= 0)  return 0;

    // Do the size arithmetic in size_t / long long: the products can exceed
    // the range of int for large buffers
    const size_t row_bytes   = static_cast<size_t>(width);
    const size_t plane_bytes = row_bytes * static_cast<size_t>(height);

    for (int i0 = 0; i0 < n; i0++)
    {
        char*       dst = writePtr + static_cast<size_t>(i0) * plane_bytes;
        const char* src = ptr + static_cast<long long>(i0) * chOffset;

        for (int i1 = 0; i1 < height; i1++)
            memcpy(dst + static_cast<size_t>(i1) * row_bytes,
                   src + static_cast<long long>(i1) * pitch,
                   row_bytes);
    }

    return plane_bytes * static_cast<size_t>(n);
}
361 //
362 // Copy from host buffer to TIDL device buffer
363 //
364 void ExecutionObject::Impl::HostWriteNetInput()
366     const char*     readPtr  = (const char *) in_m.ptr();
367     const PipeInfo* pipe     = in_m.GetPipe();
369     for (unsigned int i = 0; i < shared_initialize_params_m->numInBufs; i++)
370     {
371         OCL_TIDL_BufParams *inBuf = &shared_initialize_params_m->inBufs[i];
373         if (shared_process_params_m->enableInternalInput == 0)
374         {
375             readPtr += readDataS8(
376                 readPtr,
377                 (char *) tidl_extmem_heap_m.get() + inBuf->bufPlaneBufOffset
378                     + inBuf->bufPlaneWidth * OCL_TIDL_MAX_PAD_SIZE
379                     + OCL_TIDL_MAX_PAD_SIZE,
380                 inBuf->numROIs,
381                 inBuf->numChannels,
382                 inBuf->ROIWidth,
383                 inBuf->ROIHeight,
384                 inBuf->bufPlaneWidth,
385                 ((inBuf->bufPlaneWidth * inBuf->bufPlaneHeight) /
386                  inBuf->numChannels));
387         }
388         else
389         {
390             shared_process_params_m->inBufAddr[i] = pipe->bufAddr_m[i];
391         }
393         shared_process_params_m->inDataQ[i]   = pipe->dataQ_m[i];
394     }
397 //
398 // Copy from TIDL device buffer into host buffer
399 //
400 void ExecutionObject::Impl::HostReadNetOutput()
402     char* writePtr = (char *) out_m.ptr();
403     PipeInfo* pipe = out_m.GetPipe();
405     for (unsigned int i = 0; i < shared_initialize_params_m->numOutBufs; i++)
406     {
407         OCL_TIDL_BufParams *outBuf = &shared_initialize_params_m->outBufs[i];
408         if (writePtr != nullptr)
409         {
410             writePtr += writeDataS8(
411                 writePtr,
412                 (char *) tidl_extmem_heap_m.get() + outBuf->bufPlaneBufOffset
413                     + outBuf->bufPlaneWidth * OCL_TIDL_MAX_PAD_SIZE
414                     + OCL_TIDL_MAX_PAD_SIZE,
415                 outBuf->numChannels,
416                 outBuf->ROIWidth,
417                 outBuf->ROIHeight,
418                 outBuf->bufPlaneWidth,
419                 ((outBuf->bufPlaneWidth * outBuf->bufPlaneHeight)/
420                  outBuf->numChannels));
421         }
423         pipe->dataQ_m[i]   = shared_process_params_m->outDataQ[i];
424         pipe->bufAddr_m[i] = shared_initialize_params_m->bufAddrBase
425                            + outBuf->bufPlaneBufOffset;
426     }
427     shared_process_params_m->bytesWritten = writePtr - (char *) out_m.ptr();
430 void ExecutionObject::Impl::ComputeInputOutputSizes()
432     if (shared_initialize_params_m->errorCode != OCL_TIDL_SUCCESS)  return;
434     if (shared_initialize_params_m->numInBufs > OCL_TIDL_MAX_IN_BUFS ||
435         shared_initialize_params_m->numOutBufs > OCL_TIDL_MAX_OUT_BUFS)
436     {
437         std::cout << "Num input/output bufs ("
438                   << shared_initialize_params_m->numInBufs << ", "
439                   << shared_initialize_params_m->numOutBufs
440                   << ") exceeded limit!" << std::endl;
441         shared_initialize_params_m->errorCode = OCL_TIDL_INIT_FAIL;
442         return;
443     }
445     in_size_m  = 0;
446     out_size_m = 0;
447     for (unsigned int i = 0; i < shared_initialize_params_m->numInBufs; i++)
448     {
449         OCL_TIDL_BufParams *inBuf = &shared_initialize_params_m->inBufs[i];
450         in_size_m += inBuf->numROIs * inBuf->numChannels * inBuf->ROIWidth *
451                      inBuf->ROIHeight;
452     }
453     for (unsigned int i = 0; i < shared_initialize_params_m->numOutBufs; i++)
454     {
455         OCL_TIDL_BufParams *outBuf = &shared_initialize_params_m->outBufs[i];
456         out_size_m += outBuf->numChannels * outBuf->ROIWidth *outBuf->ROIHeight;
457     }
461 bool ExecutionObject::Impl::RunAsync(CallType ct)
463     switch (ct)
464     {
465         case CallType::INIT:
466         {
467             k_initialize_m->RunAsync();
468             break;
469         }
470         case CallType::PROCESS:
471         {
472             shared_process_params_m->frameIdx = current_frame_idx_m;
473             shared_process_params_m->bytesWritten = 0;
474             HostWriteNetInput();
475             k_process_m->RunAsync();
476             break;
477         }
478         case CallType::CLEANUP:
479         {
480             k_cleanup_m->RunAsync();
481             break;
482         }
483         default:
484             return false;
485     }
487     return true;
490 bool ExecutionObject::Impl::Wait(CallType ct)
492     switch (ct)
493     {
494         case CallType::INIT:
495         {
496             bool has_work = k_initialize_m->Wait();
498             if (has_work)
499             {
500                 ComputeInputOutputSizes();
501                 if (shared_initialize_params_m->errorCode != OCL_TIDL_SUCCESS)
502                     throw Exception(shared_initialize_params_m->errorCode,
503                                     __FILE__, __FUNCTION__, __LINE__);
504             }
505             return has_work;
506         }
507         case CallType::PROCESS:
508         {
509             bool has_work = k_process_m->Wait();
510             if (has_work)
511             {
512                 if (shared_process_params_m->errorCode != OCL_TIDL_SUCCESS)
513                     throw Exception(shared_process_params_m->errorCode,
514                                     __FILE__, __FUNCTION__, __LINE__);
515                 HostReadNetOutput();
516             }
518             return has_work;
519         }
520         case CallType::CLEANUP:
521         {
522             return k_cleanup_m->Wait();
523             break;
524         }
525         default:
526             return false;
527     }
529     return false;
532 //
533 // Write the trace data to output files
534 //
535 void
536 ExecutionObject::Impl::WriteLayerOutputsToFile(const std::string& filename_prefix) const
538     if (trace_buf_params_sz_m == 0)
539         return;
541     OCL_TIDL_BufParams* bufferParams = trace_buf_params_m.get();
543     for (uint32_t i = 0; i < num_network_layers_m; i++)
544         for (int j = 0; j < TIDL_NUM_OUT_BUFS; j++)
545         {
546             OCL_TIDL_BufParams* buf = &bufferParams[i*TIDL_NUM_OUT_BUFS+j];
548             if (buf->bufferId == UINT_MAX)
549                 continue;
551             size_t buffer_size = buf->numChannels * buf->ROIHeight *
552                                  buf->ROIWidth;
554             char *tmp = new char[buffer_size];
556             if (tmp == nullptr)
557                 throw Exception("Out of memory, new failed",
558                         __FILE__, __FUNCTION__, __LINE__);
560             writeDataS8(
561                 tmp,
562                 (char *) tidl_extmem_heap_m.get() + buf->bufPlaneBufOffset
563                 + buf->bufPlaneWidth * OCL_TIDL_MAX_PAD_SIZE
564                 + OCL_TIDL_MAX_PAD_SIZE,
565                 buf->numChannels,
566                 buf->ROIWidth,
567                 buf->ROIHeight,
568                 buf->bufPlaneWidth,
569                 ((buf->bufPlaneWidth * buf->bufPlaneHeight)/
570                  buf->numChannels));
572             std::string filename(filename_prefix);
573             filename += std::to_string(buf->bufferId) + "_";
574             filename += std::to_string(buf->ROIWidth) + "x";
575             filename += std::to_string(buf->ROIHeight) + ".bin";
577             std::ofstream ofs;
578             ofs.open(filename, std::ofstream::out);
579             ofs.write(tmp, buffer_size);
580             ofs.close();
582             delete[] tmp;
583         }
587 const LayerOutput* ExecutionObject::Impl::GetOutputFromLayer(
588                             uint32_t layer_index, uint32_t output_index) const
590     if (trace_buf_params_sz_m == 0)
591         return nullptr;
593     if (layer_index > num_network_layers_m || output_index > TIDL_NUM_OUT_BUFS)
594         return nullptr;
596     OCL_TIDL_BufParams* bufferParams = trace_buf_params_m.get();
597     OCL_TIDL_BufParams* buf = &bufferParams[layer_index*TIDL_NUM_OUT_BUFS+
598                                             output_index];
600     if (buf->bufferId == UINT_MAX)
601         return nullptr;
603     size_t buffer_size = buf->numChannels * buf->ROIHeight *
604                          buf->ROIWidth;
606     char *data = new char[buffer_size];
608     if (data == nullptr)
609         throw Exception("Out of memory, new failed",
610                 __FILE__, __FUNCTION__, __LINE__);
612     writeDataS8(data,
613                 (char *) tidl_extmem_heap_m.get() + buf->bufPlaneBufOffset
614                 + buf->bufPlaneWidth * OCL_TIDL_MAX_PAD_SIZE
615                 + OCL_TIDL_MAX_PAD_SIZE,
616                 buf->numChannels,
617                 buf->ROIWidth,
618                 buf->ROIHeight,
619                 buf->bufPlaneWidth,
620                 ((buf->bufPlaneWidth * buf->bufPlaneHeight)/
621                  buf->numChannels));
623     return new LayerOutput(layer_index, output_index, buf->bufferId,
624                            buf->numROIs, buf->numChannels, buf->ROIHeight,
625                            buf->ROIWidth, data);
628 const LayerOutputs* ExecutionObject::Impl::GetOutputsFromAllLayers() const
630     LayerOutputs* result = new LayerOutputs;
632     for (uint32_t i=0; i < num_network_layers_m; i++)
633         for (int j=0; j < TIDL_NUM_OUT_BUFS; j++)
634         {
635             const LayerOutput* lo = GetOutputFromLayer(i, j);
636             if (lo)
637                 result->push_back(std::unique_ptr<const LayerOutput>{ lo });
638         }
640     return result;
643 LayerOutput::LayerOutput(int layer_index, int output_index, int buffer_id,
644                          int num_roi, int num_channels, size_t height,
645                          size_t width, const char* data):
646                         layer_index_m(layer_index), buffer_id_m(buffer_id),
647                         num_roi_m(num_roi), num_channels_m(num_channels),
648                         height_m(height), width_m(width), data_m(data)
649 { }
651 LayerOutput::~LayerOutput()
653     delete[] data_m;