c74389ff8cd24dfe7ae874aec994041abbf97ac3
[tidl/tidl-api.git] / tinn_api / src / execution_object.cpp
1 /******************************************************************************
2  * Copyright (c) 2017 Texas Instruments Incorporated - http://www.ti.com/
3  *  All rights reserved.
4  *
5  *  Redistribution and use in source and binary forms, with or without
6  *  modification, are permitted provided that the following conditions are met:
7  *      * Redistributions of source code must retain the above copyright
8  *        notice, this list of conditions and the following disclaimer.
9  *      * Redistributions in binary form must reproduce the above copyright
10  *        notice, this list of conditions and the following disclaimer in the
11  *        documentation and/or other materials provided with the distribution.
12  *      * Neither the name of Texas Instruments Incorporated nor the
13  *        names of its contributors may be used to endorse or promote products
14  *        derived from this software without specific prior written permission.
15  *
16  *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17  *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  *  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20  *  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21  *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22  *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23  *  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24  *  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25  *  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
26  *  THE POSSIBILITY OF SUCH DAMAGE.
27  *****************************************************************************/
29 /*! \file execution_object.cpp */
31 #include "executor.h"
32 #include "execution_object.h"
33 #include "trace.h"
34 #include "ocl_device.h"
35 #include "parameters.h"
36 #include "configuration.h"
37 #include "common_defines.h"
38 #include <string.h>
40 using namespace tinn;
42 class ExecutionObject::Impl
43 {
44     public:
45         Impl(Device* d, uint8_t device_index,
46              const ArgInfo& create_arg,
47              const ArgInfo& param_heap_arg,
48              size_t extmem_heap_size,
49              uint32_t internal_input);
50         ~Impl() {}
52         bool RunAsync(CallType ct);
53         bool Wait    (CallType ct);
55         bool SetupProcessKernel(const ArgInfo& in, const ArgInfo& out);
56         void HostWriteNetInput();
57         void HostReadNetOutput();
58         void ComputeInputOutputSizes();
60         Device*                         device_m;
61         std::unique_ptr<Kernel>         k_initialize_m;
62         std::unique_ptr<Kernel>         k_process_m;
63         std::unique_ptr<Kernel>         k_cleanup_m;
65         up_malloc_ddr<char>             tidl_extmem_heap_m;
66         up_malloc_ddr<OCL_TIDL_InitializeParams> shared_initialize_params_m;
67         up_malloc_ddr<OCL_TIDL_ProcessParams>    shared_process_params_m;
69         size_t                          in_size_m;
70         size_t                          out_size_m;
71         ArgInfo                         in_m;
72         ArgInfo                         out_m;
74         // Index of the OpenCL device/queue used by this EO
75         uint8_t                         device_index_m;
77         // Frame being processed by the EO
78         int                             current_frame_idx_m;
79 };
82 ExecutionObject::ExecutionObject(Device* d,
83                                  uint8_t device_index,
84                                  const ArgInfo& create_arg,
85                                  const ArgInfo& param_heap_arg,
86                                  size_t extmem_heap_size,
87                                  uint32_t internal_input)
88 {
89     pimpl_m = std::unique_ptr<ExecutionObject::Impl>
90               { new ExecutionObject::Impl(d, device_index,
91                                           create_arg,
92                                           param_heap_arg,
93                                           extmem_heap_size,
94                                           internal_input) };
95 }
98 ExecutionObject::Impl::Impl(Device* d,
99                                  uint8_t device_index,
100                                  const ArgInfo& create_arg,
101                                  const ArgInfo& param_heap_arg,
102                                  size_t extmem_heap_size,
103                                  uint32_t internal_input):
104     device_m(d),
105     k_initialize_m(nullptr),
106     k_process_m(nullptr),
107     k_cleanup_m(nullptr),
108     tidl_extmem_heap_m (nullptr, &__free_ddr),
109     shared_initialize_params_m(nullptr, &__free_ddr),
110     shared_process_params_m(nullptr, &__free_ddr),
111     in_size_m(0),
112     out_size_m(0),
113     in_m(nullptr, 0),
114     out_m(nullptr, 0),
115     device_index_m(device_index),
116     current_frame_idx_m(0)
118     // Allocate a heap for TI DL to use on the device
119     tidl_extmem_heap_m.reset(malloc_ddr<char>(extmem_heap_size));
121     // Create a kernel for cleanup
122     KernelArgs cleanup_args;
123     k_cleanup_m.reset(new Kernel(device_m,
124                                  STRING(CLEANUP_KERNEL),
125                                  cleanup_args, device_index_m));
127     // Set up parameter struct for the initialize kernel
128     shared_initialize_params_m.reset(malloc_ddr<OCL_TIDL_InitializeParams>());
129     memset(shared_initialize_params_m.get(), 0,
130            sizeof(OCL_TIDL_InitializeParams));
132     shared_initialize_params_m->tidlHeapSize = extmem_heap_size;
133     shared_initialize_params_m->l2HeapSize   = tinn::internal::DMEM1_SIZE;
134     shared_initialize_params_m->l1HeapSize   = tinn::internal::DMEM0_SIZE;
135     shared_initialize_params_m->enableTrace  = OCL_TIDL_TRACE_OFF;
136     shared_initialize_params_m->enableInternalInput = internal_input;
138     // Setup kernel arguments for initialize
139     KernelArgs args = { create_arg,
140                         param_heap_arg,
141                         ArgInfo(tidl_extmem_heap_m.get(),
142                                 extmem_heap_size),
143                         ArgInfo(shared_initialize_params_m.get(),
144                                 sizeof(OCL_TIDL_InitializeParams)),
145                         device_m->type() == CL_DEVICE_TYPE_ACCELERATOR ?
146                             ArgInfo(nullptr, tinn::internal::DMEM1_SIZE):
147                             ArgInfo(nullptr, 4)                       };
149     k_initialize_m.reset(new Kernel(device_m,
150                                     STRING(INIT_KERNEL), args, device_index_m));
153 // Pointer to implementation idiom: https://herbsutter.com/gotw/_100/:
154 // Both unique_ptr and shared_ptr can be instantiated with an incomplete type
155 // unique_ptr's destructor requires a complete type in order to invoke delete
156 ExecutionObject::~ExecutionObject() = default;
158 char* ExecutionObject::GetInputBufferPtr() const
160     return static_cast<char *>(pimpl_m->in_m.ptr());
163 size_t ExecutionObject::GetInputBufferSizeInBytes() const
165     if (pimpl_m->in_m.ptr() == nullptr)  return pimpl_m->in_size_m;
166     else                                 return pimpl_m->in_m.size();
169 char* ExecutionObject::GetOutputBufferPtr() const
171     return static_cast<char *>(pimpl_m->out_m.ptr());
174 size_t ExecutionObject::GetOutputBufferSizeInBytes() const
176     if (pimpl_m->out_m.ptr() == nullptr)  return pimpl_m->out_size_m;
177     else           return pimpl_m->shared_process_params_m.get()->bytesWritten;
180 void  ExecutionObject::SetFrameIndex(int idx)
182     pimpl_m->current_frame_idx_m = idx;
185 int ExecutionObject::GetFrameIndex() const
187     return pimpl_m->current_frame_idx_m;
190 void ExecutionObject::SetInputOutputBuffer(const ArgInfo& in, const ArgInfo& out)
192     pimpl_m->SetupProcessKernel(in, out);
195 bool ExecutionObject::ProcessFrameStartAsync()
197     return pimpl_m->RunAsync(ExecutionObject::CallType::PROCESS);
200 bool ExecutionObject::ProcessFrameWait()
202     return pimpl_m->Wait(ExecutionObject::CallType::PROCESS);
205 bool ExecutionObject::RunAsync (CallType ct)
207     return pimpl_m->RunAsync(ct);
210 bool ExecutionObject::Wait (CallType ct)
212     return pimpl_m->Wait(ct);
215 uint64_t ExecutionObject::GetProcessCycles() const
217     uint8_t factor = 1;
219     // ARP32 running at half frequency of VCOP, multiply by 2 for VCOP cycles
220     if (pimpl_m->device_m->type() == CL_DEVICE_TYPE_CUSTOM)
221         factor = 2;
223     return pimpl_m->shared_process_params_m.get()->cycles * factor;
226 float ExecutionObject::GetProcessTimeInMilliSeconds() const
228     float frequency = pimpl_m->device_m->GetFrequencyInMhz() * 1000000;
229     return ((float)GetProcessCycles())/frequency * 1000;
232 //
233 // Create a kernel to call the "process" function
234 //
235 bool
236 ExecutionObject::Impl::SetupProcessKernel(const ArgInfo& in, const ArgInfo& out)
238     in_m = in;
239     out_m = out;
241     shared_process_params_m.reset(malloc_ddr<OCL_TIDL_ProcessParams>());
242     shared_process_params_m->enableTrace = OCL_TIDL_TRACE_OFF;
243     shared_process_params_m->enableInternalInput = 
244                                shared_initialize_params_m->enableInternalInput;
245     shared_process_params_m->cycles = 0;
247     if (shared_process_params_m->enableInternalInput == 0)
248         assert(in.ptr() != nullptr && in.size() > 0);
250     KernelArgs args = { ArgInfo(shared_process_params_m.get(),
251                                 sizeof(OCL_TIDL_ProcessParams)),
252                         in,
253                         out,
254                         ArgInfo(tidl_extmem_heap_m.get(),
255                                 shared_initialize_params_m->tidlHeapSize)
256                       };
258     k_process_m.reset(new Kernel(device_m,
259                                  STRING(PROCESS_KERNEL), args, device_index_m));
261     return true;
//
// Copy a densely packed 8-bit tensor into a pitched destination buffer
//
//   readPtr  : source, laid out as [roi][n][height][width] without padding
//   ptr      : destination address of channel 0, row 0 of the first ROI
//   roi, n   : number of ROIs and number of channels per ROI
//   width    : bytes copied per row
//   height   : rows per channel
//   pitch    : distance in bytes between consecutive destination rows
//   chOffset : distance in bytes between consecutive destination channels
//
// Returns the number of bytes consumed from readPtr; 0 if readPtr is null or
// any dimension is not positive. Offsets and the byte count are computed in
// wide types so that large tensors do not overflow int arithmetic.
//
static size_t readDataS8(const char *readPtr, char *ptr, int roi, int n,
                         int width, int height, int pitch,
                         int chOffset)
{
    if (!readPtr)  return 0;
    if (roi <= 0 || n <= 0 || width <= 0 || height <= 0)  return 0;

    const size_t    row_bytes  = static_cast<size_t>(width);
    const long long row_stride = pitch;
    const long long ch_stride  = chOffset;

    // The source is contiguous in loop order, so it is simply walked forward
    const char *src = readPtr;

    for (long long r = 0; r < roi; r++)
        for (long long c = 0; c < n; c++)
        {
            char *dst_channel = ptr + (r * n + c) * ch_stride;

            for (long long y = 0; y < height; y++)
            {
                memcpy(dst_channel + y * row_stride, src, row_bytes);
                src += row_bytes;
            }
        }

    return static_cast<size_t>(src - readPtr);
}
//
// Copy a pitched 8-bit tensor into a densely packed destination buffer
//
//   writePtr : destination, filled as [n][height][width] without padding
//   ptr      : source address of channel 0, row 0
//   n        : number of channels
//   width    : bytes copied per row
//   height   : rows per channel
//   pitch    : distance in bytes between consecutive source rows
//   chOffset : distance in bytes between consecutive source channels
//
// Returns the number of bytes stored through writePtr; 0 if writePtr is null
// or any dimension is not positive. Offsets and the byte count are computed
// in wide types so that large tensors do not overflow int arithmetic.
//
static size_t writeDataS8(char *writePtr, const char *ptr, int n, int width,
                          int height, int pitch, int chOffset)
{
    if (!writePtr)  return 0;
    if (n <= 0 || width <= 0 || height <= 0)  return 0;

    const size_t    row_bytes  = static_cast<size_t>(width);
    const long long row_stride = pitch;
    const long long ch_stride  = chOffset;

    // The destination is contiguous in loop order, so it is walked forward
    char *dst = writePtr;

    for (long long c = 0; c < n; c++)
    {
        const char *src_channel = ptr + c * ch_stride;

        for (long long y = 0; y < height; y++)
        {
            memcpy(dst, src_channel + y * row_stride, row_bytes);
            dst += row_bytes;
        }
    }

    return static_cast<size_t>(dst - writePtr);
}
295 void ExecutionObject::Impl::HostWriteNetInput()
297     char* readPtr  = (char *) in_m.ptr();
298     PipeInfo *pipe = in_m.GetPipe();
300     for (unsigned int i = 0; i < shared_initialize_params_m->numInBufs; i++)
301     {
302         OCL_TIDL_BufParams *inBuf = &shared_initialize_params_m->inBufs[i];
304         if (shared_process_params_m->enableInternalInput == 0)
305         {
306             readPtr += readDataS8(
307                 readPtr,
308                 (char *) tidl_extmem_heap_m.get() + inBuf->bufPlaneBufOffset
309                     + inBuf->bufPlaneWidth * OCL_TIDL_MAX_PAD_SIZE
310                     + OCL_TIDL_MAX_PAD_SIZE,
311                 inBuf->numROIs,
312                 inBuf->numChannels,
313                 inBuf->ROIWidth,
314                 inBuf->ROIHeight,
315                 inBuf->bufPlaneWidth,
316                 ((inBuf->bufPlaneWidth * inBuf->bufPlaneHeight) /
317                  inBuf->numChannels));
318         }
319         else
320         {
321             shared_process_params_m->inBufAddr[i] = pipe->bufAddr_m[i];
322         }
324         shared_process_params_m->inDataQ[i]   = pipe->dataQ_m[i];
325     }
328 void ExecutionObject::Impl::HostReadNetOutput()
330     char* writePtr = (char *) out_m.ptr();
331     PipeInfo *pipe = out_m.GetPipe();
333     for (unsigned int i = 0; i < shared_initialize_params_m->numOutBufs; i++)
334     {
335         OCL_TIDL_BufParams *outBuf = &shared_initialize_params_m->outBufs[i];
336         if (writePtr != nullptr)
337         {
338             writePtr += writeDataS8(
339                 writePtr,
340                 (char *) tidl_extmem_heap_m.get() + outBuf->bufPlaneBufOffset
341                     + outBuf->bufPlaneWidth * OCL_TIDL_MAX_PAD_SIZE
342                     + OCL_TIDL_MAX_PAD_SIZE,
343                 outBuf->numChannels,
344                 outBuf->ROIWidth,
345                 outBuf->ROIHeight,
346                 outBuf->bufPlaneWidth,
347                 ((outBuf->bufPlaneWidth * outBuf->bufPlaneHeight)/
348                  outBuf->numChannels));
349         }
351         pipe->dataQ_m[i]   = shared_process_params_m->outDataQ[i];
352         pipe->bufAddr_m[i] = shared_initialize_params_m->bufAddrBase
353                            + outBuf->bufPlaneBufOffset;
354     }
355     shared_process_params_m->bytesWritten = writePtr - (char *) out_m.ptr();
358 void ExecutionObject::Impl::ComputeInputOutputSizes()
360     if (shared_initialize_params_m->errorCode != OCL_TIDL_SUCCESS)  return;
362     if (shared_initialize_params_m->numInBufs > OCL_TIDL_MAX_IN_BUFS ||
363         shared_initialize_params_m->numOutBufs > OCL_TIDL_MAX_OUT_BUFS)
364     {
365         std::cout << "Num input/output bufs ("
366                   << shared_initialize_params_m->numInBufs << ", "
367                   << shared_initialize_params_m->numOutBufs
368                   << ") exceeded limit!" << std::endl;
369         shared_initialize_params_m->errorCode = OCL_TIDL_INIT_FAIL;
370         return;
371     }
373     in_size_m  = 0;
374     out_size_m = 0;
375     for (unsigned int i = 0; i < shared_initialize_params_m->numInBufs; i++)
376     {
377         OCL_TIDL_BufParams *inBuf = &shared_initialize_params_m->inBufs[i];
378         in_size_m += inBuf->numROIs * inBuf->numChannels * inBuf->ROIWidth *
379                      inBuf->ROIHeight;
380     }
381     for (unsigned int i = 0; i < shared_initialize_params_m->numOutBufs; i++)
382     {
383         OCL_TIDL_BufParams *outBuf = &shared_initialize_params_m->outBufs[i];
384         out_size_m += outBuf->numChannels * outBuf->ROIWidth *outBuf->ROIHeight;
385     }
389 bool ExecutionObject::Impl::RunAsync(CallType ct)
391     switch (ct)
392     {
393         case CallType::INIT:
394         {
395             k_initialize_m->RunAsync();
396             break;
397         }
398         case CallType::PROCESS:
399         {
400             shared_process_params_m->frameIdx = current_frame_idx_m;
401             shared_process_params_m->bytesWritten = 0;
402             HostWriteNetInput();
403             k_process_m->RunAsync();
404             break;
405         }
406         case CallType::CLEANUP:
407         {
408             k_cleanup_m->RunAsync();
409             break;
410         }
411         default:
412             return false;
413     }
415     return true;
418 bool ExecutionObject::Impl::Wait(CallType ct)
420     switch (ct)
421     {
422         case CallType::INIT:
423         {
424             bool has_work = k_initialize_m->Wait();
426             if (has_work)
427             {
428                 ComputeInputOutputSizes();
429                 if (shared_initialize_params_m->errorCode != OCL_TIDL_SUCCESS)
430                     throw Exception(shared_initialize_params_m->errorCode,
431                                     __FILE__, __FUNCTION__, __LINE__);
432             }
433             return has_work;
434         }
435         case CallType::PROCESS:
436         {
437             bool has_work = k_process_m->Wait();
438             if (has_work)
439             {
440                 if (shared_process_params_m->errorCode != OCL_TIDL_SUCCESS)
441                     throw Exception(shared_process_params_m->errorCode,
442                                     __FILE__, __FUNCTION__, __LINE__);
443                 HostReadNetOutput();
444             }
446             return has_work;
447         }
448         case CallType::CLEANUP:
449         {
450             return k_cleanup_m->Wait();
451             break;
452         }
453         default:
454             return false;
455     }
457     return false;