Compute input/output size based on network
[tidl/tidl-api.git] / tinn_api / src / execution_object.cpp
1 /******************************************************************************
2  * Copyright (c) 2017 Texas Instruments Incorporated - http://www.ti.com/
3  *  All rights reserved.
4  *
5  *  Redistribution and use in source and binary forms, with or without
6  *  modification, are permitted provided that the following conditions are met:
7  *      * Redistributions of source code must retain the above copyright
8  *        notice, this list of conditions and the following disclaimer.
9  *      * Redistributions in binary form must reproduce the above copyright
10  *        notice, this list of conditions and the following disclaimer in the
11  *        documentation and/or other materials provided with the distribution.
12  *      * Neither the name of Texas Instruments Incorporated nor the
13  *        names of its contributors may be used to endorse or promote products
14  *        derived from this software without specific prior written permission.
15  *
16  *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17  *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  *  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20  *  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21  *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22  *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23  *  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24  *  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25  *  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
26  *  THE POSSIBILITY OF SUCH DAMAGE.
27  *****************************************************************************/
29 /*! \file execution_object.cpp */
31 #include "executor.h"
32 #include "execution_object.h"
33 #include "trace.h"
34 #include "ocl_device.h"
35 #include "parameters.h"
36 #include "configuration.h"
37 #include "common_defines.h"
38 #include <string.h>
40 using namespace tinn;
42 class ExecutionObject::Impl
43 {
44     public:
45         Impl(Device* d, uint8_t device_index,
46              const ArgInfo& create_arg,
47              const ArgInfo& param_heap_arg,
48              size_t extmem_heap_size);
49         ~Impl() {}
51         bool RunAsync(CallType ct);
52         bool Wait    (CallType ct);
54         bool SetupProcessKernel(const ArgInfo& in, const ArgInfo& out);
55         void HostWriteNetInput();
56         void HostReadNetOutput();
57         void ComputeInputOutputSizes();
59         Device*                         device_m;
60         std::unique_ptr<Kernel>         k_initialize_m;
61         std::unique_ptr<Kernel>         k_process_m;
62         std::unique_ptr<Kernel>         k_cleanup_m;
64         up_malloc_ddr<char>             tidl_extmem_heap_m;
65         up_malloc_ddr<OCL_TIDL_InitializeParams> shared_initialize_params_m;
66         up_malloc_ddr<OCL_TIDL_ProcessParams>    shared_process_params_m;
68         size_t                          in_size;
69         size_t                          out_size;
70         ArgInfo                         in_m;
71         ArgInfo                         out_m;
73         // Index of the OpenCL device/queue used by this EO
74         uint8_t                         device_index_m;
76         // Frame being processed by the EO
77         int                             current_frame_idx_m;
78 };
81 ExecutionObject::ExecutionObject(Device* d,
82                                  uint8_t device_index,
83                                  const ArgInfo& create_arg,
84                                  const ArgInfo& param_heap_arg,
85                                  size_t extmem_heap_size)
86 {
87     pimpl_m = std::unique_ptr<ExecutionObject::Impl>
88               { new ExecutionObject::Impl(d, device_index,
89                                           create_arg,
90                                           param_heap_arg,
91                                           extmem_heap_size) };
92 }
95 ExecutionObject::Impl::Impl(Device* d,
96                                  uint8_t device_index,
97                                  const ArgInfo& create_arg,
98                                  const ArgInfo& param_heap_arg,
99                                  size_t extmem_heap_size):
100     device_m(d),
101     k_initialize_m(nullptr),
102     k_process_m(nullptr),
103     k_cleanup_m(nullptr),
104     tidl_extmem_heap_m (nullptr, &__free_ddr),
105     shared_initialize_params_m(nullptr, &__free_ddr),
106     shared_process_params_m(nullptr, &__free_ddr),
107     in_size(0),
108     out_size(0),
109     in_m(nullptr, 0),
110     out_m(nullptr, 0),
111     device_index_m(device_index),
112     current_frame_idx_m(0)
114     // Allocate a heap for TI DL to use on the device
115     tidl_extmem_heap_m.reset(malloc_ddr<char>(extmem_heap_size));
117     // Create a kernel for cleanup
118     KernelArgs cleanup_args;
119     k_cleanup_m.reset(new Kernel(device_m,
120                                  STRING(CLEANUP_KERNEL),
121                                  cleanup_args, device_index_m));
123     // Set up parameter struct for the initialize kernel
124     shared_initialize_params_m.reset(malloc_ddr<OCL_TIDL_InitializeParams>());
125     memset(shared_initialize_params_m.get(), 0,
126            sizeof(OCL_TIDL_InitializeParams));
128     shared_initialize_params_m->tidlHeapSize = extmem_heap_size;
129     shared_initialize_params_m->l2HeapSize   = tinn::internal::DMEM1_SIZE;
130     shared_initialize_params_m->l1HeapSize   = tinn::internal::DMEM0_SIZE;
131     shared_initialize_params_m->enableTrace  = OCL_TIDL_TRACE_OFF;
133     // Setup kernel arguments for initialize
134     KernelArgs args = { create_arg,
135                         param_heap_arg,
136                         ArgInfo(tidl_extmem_heap_m.get(),
137                                 extmem_heap_size),
138                         ArgInfo(shared_initialize_params_m.get(),
139                                 sizeof(OCL_TIDL_InitializeParams)),
140                         device_m->type() == CL_DEVICE_TYPE_ACCELERATOR ?
141                             ArgInfo(nullptr, tinn::internal::DMEM1_SIZE):
142                             ArgInfo(nullptr, 4)                       };
144     k_initialize_m.reset(new Kernel(device_m,
145                                     STRING(INIT_KERNEL), args, device_index_m));
148 // Pointer to implementation idiom: https://herbsutter.com/gotw/_100/:
149 // Both unique_ptr and shared_ptr can be instantiated with an incomplete type
150 // unique_ptr's destructor requires a complete type in order to invoke delete
151 ExecutionObject::~ExecutionObject() = default;
153 char* ExecutionObject::GetInputBufferPtr() const
155     return static_cast<char *>(pimpl_m->in_m.ptr());
158 size_t ExecutionObject::GetInputBufferSizeInBytes() const
160     if (pimpl_m->in_m.ptr() == nullptr)  return pimpl_m->in_size;
161     else                                 return pimpl_m->in_m.size();
164 char* ExecutionObject::GetOutputBufferPtr() const
166     return static_cast<char *>(pimpl_m->out_m.ptr());
169 size_t ExecutionObject::GetOutputBufferSizeInBytes() const
171     if (pimpl_m->out_m.ptr() == nullptr)  return pimpl_m->out_size;
172     else           return pimpl_m->shared_process_params_m.get()->bytesWritten;
175 void  ExecutionObject::SetFrameIndex(int idx)
177     pimpl_m->current_frame_idx_m = idx;
180 int ExecutionObject::GetFrameIndex() const
182     return pimpl_m->current_frame_idx_m;
185 void ExecutionObject::SetInputOutputBuffer(const ArgInfo& in, const ArgInfo& out)
187     assert (in.ptr() != nullptr && in.size() > 0);
188     assert (out.ptr() != nullptr && out.size() > 0);
190     pimpl_m->SetupProcessKernel(in, out);
193 bool ExecutionObject::ProcessFrameStartAsync()
195     return pimpl_m->RunAsync(ExecutionObject::CallType::PROCESS);
198 bool ExecutionObject::ProcessFrameWait()
200     return pimpl_m->Wait(ExecutionObject::CallType::PROCESS);
203 bool ExecutionObject::RunAsync (CallType ct)
205     return pimpl_m->RunAsync(ct);
208 bool ExecutionObject::Wait (CallType ct)
210     return pimpl_m->Wait(ct);
213 uint64_t ExecutionObject::GetProcessCycles() const
215     uint8_t factor = 1;
217     // ARP32 running at half frequency of VCOP, multiply by 2 for VCOP cycles
218     if (pimpl_m->device_m->type() == CL_DEVICE_TYPE_CUSTOM)
219         factor = 2;
221     return pimpl_m->shared_process_params_m.get()->cycles * factor;
224 float ExecutionObject::GetProcessTimeInMilliSeconds() const
226     float frequency = pimpl_m->device_m->GetFrequencyInMhz() * 1000000;
227     return ((float)GetProcessCycles())/frequency * 1000;
230 //
231 // Create a kernel to call the "process" function
232 //
233 bool
234 ExecutionObject::Impl::SetupProcessKernel(const ArgInfo& in, const ArgInfo& out)
236     in_m = in;
237     out_m = out;
239     shared_process_params_m.reset(malloc_ddr<OCL_TIDL_ProcessParams>());
240     shared_process_params_m->enableTrace = OCL_TIDL_TRACE_OFF;
241     shared_process_params_m->cycles = 0;
243     KernelArgs args = { ArgInfo(shared_process_params_m.get(),
244                                 sizeof(OCL_TIDL_ProcessParams)),
245                         in,
246                         out,
247                         ArgInfo(tidl_extmem_heap_m.get(),
248                                 shared_initialize_params_m->tidlHeapSize)
249                       };
251     k_process_m.reset(new Kernel(device_m,
252                                  STRING(PROCESS_KERNEL), args, device_index_m));
254     return true;
// Copies densely packed 8-bit planes from readPtr into a strided destination.
//
// The source holds roi * n planes of height rows, each row width bytes, with
// no gaps. In the destination, consecutive planes are chOffset bytes apart
// and consecutive rows of a plane are pitch bytes apart.
//
// readPtr   packed source data; when null nothing is copied
// ptr       destination address of the first row of the first plane
// roi       number of regions of interest
// n         number of planes (channels) per region
// width     bytes per row
// height    rows per plane
// pitch     distance in bytes between destination rows
// chOffset  distance in bytes between destination planes
//
// Returns the number of source bytes consumed (width*height*n*roi), or 0
// when readPtr is null.
static size_t readDataS8(const char *readPtr, char *ptr, int roi, int n,
                         int width, int height, int pitch,
                         int chOffset)
{
    if (readPtr == nullptr)
        return 0;

    const int plane_bytes = width * height;

    for (int region = 0; region < roi; ++region)
    {
        for (int channel = 0; channel < n; ++channel)
        {
            // Index of this plane counted over all regions
            const int plane = region * n + channel;

            for (int row = 0; row < height; ++row)
            {
                const int dst_index = plane * chOffset    + row * pitch;
                const int src_index = plane * plane_bytes + row * width;

                memcpy(&ptr[dst_index], &readPtr[src_index], width);
            }
        }
    }

    return plane_bytes * n * roi;
}
// Copies 8-bit planes from a strided source into a densely packed
// destination (the inverse layout change of readDataS8, for a single region).
//
// writePtr  packed destination; when null nothing is copied
// ptr       source address of the first row of the first plane
// n         number of planes (channels)
// width     bytes per row
// height    rows per plane
// pitch     distance in bytes between source rows
// chOffset  distance in bytes between source planes
//
// Returns the number of bytes written (width*height*n), or 0 when writePtr
// is null.
static size_t writeDataS8(char *writePtr, const char *ptr, int n, int width,
                          int height, int pitch, int chOffset)
{
    if (writePtr == nullptr)
        return 0;

    const int plane_bytes = width * height;

    for (int channel = 0; channel < n; ++channel)
    {
        for (int row = 0; row < height; ++row)
        {
            const int dst_index = channel * plane_bytes + row * width;
            const int src_index = channel * chOffset    + row * pitch;

            memcpy(&writePtr[dst_index], &ptr[src_index], width);
        }
    }

    return plane_bytes * n;
}
288 void ExecutionObject::Impl::HostWriteNetInput()
290     char* readPtr = (char *) in_m.ptr();
291     for (unsigned int i = 0; i < shared_initialize_params_m->numInBufs; i++)
292     {
293         OCL_TIDL_BufParams *inBuf = &shared_initialize_params_m->inBufs[i];
294         readPtr += readDataS8(
295             readPtr,
296             (char *) tidl_extmem_heap_m.get() + inBuf->bufPlaneBufOffset
297                 + inBuf->bufPlaneWidth * OCL_TIDL_MAX_PAD_SIZE
298                 + OCL_TIDL_MAX_PAD_SIZE,
299             inBuf->numROIs,
300             inBuf->numChannels,
301             inBuf->ROIWidth,
302             inBuf->ROIHeight,
303             inBuf->bufPlaneWidth,
304             inBuf->bufPlaneWidth
305                 * (inBuf->ROIHeight + 2 * OCL_TIDL_MAX_PAD_SIZE) );
306     }
309 void ExecutionObject::Impl::HostReadNetOutput()
311     char* writePtr = (char *) out_m.ptr();
312     for (unsigned int i = 0; i < shared_initialize_params_m->numOutBufs; i++)
313     {
314         OCL_TIDL_BufParams *outBuf = &shared_initialize_params_m->outBufs[i];
315         writePtr += writeDataS8(
316             writePtr,
317             (char *) tidl_extmem_heap_m.get() + outBuf->bufPlaneBufOffset
318                 + outBuf->bufPlaneWidth * OCL_TIDL_MAX_PAD_SIZE
319                 + OCL_TIDL_MAX_PAD_SIZE,
320             outBuf->numChannels,
321             outBuf->ROIWidth,
322             outBuf->ROIHeight,
323             outBuf->bufPlaneWidth,
324             ((outBuf->bufPlaneWidth * outBuf->bufPlaneHeight)/
325              outBuf->numChannels));
326     }
327     shared_process_params_m->bytesWritten = writePtr - (char *) out_m.ptr();
330 void ExecutionObject::Impl::ComputeInputOutputSizes()
332     in_size  = 0;
333     out_size = 0;
334     for (unsigned int i = 0; i < shared_initialize_params_m->numInBufs; i++)
335     {
336         OCL_TIDL_BufParams *inBuf = &shared_initialize_params_m->inBufs[i];
337         in_size += inBuf->numROIs * inBuf->numChannels * inBuf->ROIWidth *
338                    inBuf->ROIHeight;
339     }
340     for (unsigned int i = 0; i < shared_initialize_params_m->numOutBufs; i++)
341     {
342         OCL_TIDL_BufParams *outBuf = &shared_initialize_params_m->outBufs[i];
343         out_size += outBuf->numChannels * outBuf->ROIWidth * outBuf->ROIHeight;
344     }
348 bool ExecutionObject::Impl::RunAsync(CallType ct)
350     switch (ct)
351     {
352         case CallType::INIT:
353         {
354             k_initialize_m->RunAsync();
355             break;
356         }
357         case CallType::PROCESS:
358         {
359             shared_process_params_m->frameIdx = current_frame_idx_m;
360             shared_process_params_m->bytesWritten = 0;
361             HostWriteNetInput();
362             k_process_m->RunAsync();
363             break;
364         }
365         case CallType::CLEANUP:
366         {
367             k_cleanup_m->RunAsync();
368             break;
369         }
370         default:
371             return false;
372     }
374     return true;
377 bool ExecutionObject::Impl::Wait(CallType ct)
379     switch (ct)
380     {
381         case CallType::INIT:
382         {
383             bool has_work = k_initialize_m->Wait();
385             if (has_work)
386             {
387                 if (shared_initialize_params_m->errorCode != OCL_TIDL_SUCCESS)
388                     throw Exception(shared_initialize_params_m->errorCode,
389                                     __FILE__, __FUNCTION__, __LINE__);
390                 ComputeInputOutputSizes();
391             }
392             return has_work;
393         }
394         case CallType::PROCESS:
395         {
396             bool has_work = k_process_m->Wait();
397             if (has_work)
398             {
399                 if (shared_process_params_m->errorCode != OCL_TIDL_SUCCESS)
400                     throw Exception(shared_process_params_m->errorCode,
401                                     __FILE__, __FUNCTION__, __LINE__);
402                 HostReadNetOutput();
403             }
405             return has_work;
406         }
407         case CallType::CLEANUP:
408         {
409             return k_cleanup_m->Wait();
410             break;
411         }
412         default:
413             return false;
414     }
416     return false;