Initial commit
[tidl/tidl-api.git] / tinn_api / src / execution_object.cpp
1 /******************************************************************************
2  * Copyright (c) 2017 Texas Instruments Incorporated - http://www.ti.com/
3  *  All rights reserved.
4  *
5  *  Redistribution and use in source and binary forms, with or without
6  *  modification, are permitted provided that the following conditions are met:
7  *      * Redistributions of source code must retain the above copyright
8  *        notice, this list of conditions and the following disclaimer.
9  *      * Redistributions in binary form must reproduce the above copyright
10  *        notice, this list of conditions and the following disclaimer in the
11  *        documentation and/or other materials provided with the distribution.
12  *      * Neither the name of Texas Instruments Incorporated nor the
13  *        names of its contributors may be used to endorse or promote products
14  *        derived from this software without specific prior written permission.
15  *
16  *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17  *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  *  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20  *  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21  *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22  *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23  *  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24  *  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25  *  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
26  *  THE POSSIBILITY OF SUCH DAMAGE.
27  *****************************************************************************/
29 /*! \file execution_object.cpp */
31 #include "executor.h"
32 #include "execution_object.h"
33 #include "trace.h"
34 #include "ocl_device.h"
35 #include "parameters.h"
36 #include "configuration.h"
37 #include "common_defines.h"
38 #include <string.h>
40 using namespace tidl;
42 class ExecutionObject::Impl
43 {
44     public:
45         Impl(Device* d, uint8_t device_index,
46              const ArgInfo& create_arg,
47              const ArgInfo& param_heap_arg,
48              size_t extmem_heap_size);
49         ~Impl() {}
51         bool RunAsync(CallType ct);
52         bool Wait    (CallType ct);
54         bool SetupProcessKernel(const ArgInfo& in, const ArgInfo& out);
55         void HostWriteNetInput();
56         void HostReadNetOutput();
58         Device*                         device_m;
59         std::unique_ptr<Kernel>         k_initialize_m;
60         std::unique_ptr<Kernel>         k_process_m;
61         std::unique_ptr<Kernel>         k_cleanup_m;
63         up_malloc_ddr<char>             tidl_extmem_heap_m;
64         up_malloc_ddr<OCL_TIDL_InitializeParams> shared_initialize_params_m;
65         up_malloc_ddr<OCL_TIDL_ProcessParams>    shared_process_params_m;
67         ArgInfo                         in_m;
68         ArgInfo                         out_m;
70         // Index of the OpenCL device/queue used by this EO
71         uint8_t                         device_index_m;
73         // Frame being processed by the EO
74         int                             current_frame_idx_m;
75 };
78 ExecutionObject::ExecutionObject(Device* d,
79                                  uint8_t device_index,
80                                  const ArgInfo& create_arg,
81                                  const ArgInfo& param_heap_arg,
82                                  size_t extmem_heap_size)
83 {
84     pimpl_m = std::unique_ptr<ExecutionObject::Impl>
85               { new ExecutionObject::Impl(d, device_index,
86                                           create_arg,
87                                           param_heap_arg,
88                                           extmem_heap_size) };
89 }
92 ExecutionObject::Impl::Impl(Device* d,
93                                  uint8_t device_index,
94                                  const ArgInfo& create_arg,
95                                  const ArgInfo& param_heap_arg,
96                                  size_t extmem_heap_size):
97     device_m(d),
98     k_initialize_m(nullptr),
99     k_process_m(nullptr),
100     k_cleanup_m(nullptr),
101     tidl_extmem_heap_m (nullptr, &__free_ddr),
102     shared_initialize_params_m(nullptr, &__free_ddr),
103     shared_process_params_m(nullptr, &__free_ddr),
104     in_m(nullptr, 0),
105     out_m(nullptr, 0),
106     device_index_m(device_index),
107     current_frame_idx_m(0)
109     // Allocate a heap for TI DL to use on the device
110     tidl_extmem_heap_m.reset(malloc_ddr<char>(extmem_heap_size));
112     // Create a kernel for cleanup
113     KernelArgs cleanup_args;
114     k_cleanup_m.reset(new Kernel(device_m,
115                                  STRING(CLEANUP_KERNEL),
116                                  cleanup_args, device_index_m));
118     // Set up parameter struct for the initialize kernel
119     shared_initialize_params_m.reset(malloc_ddr<OCL_TIDL_InitializeParams>());
120     memset(shared_initialize_params_m.get(), 0,
121            sizeof(OCL_TIDL_InitializeParams));
123     shared_initialize_params_m->tidlHeapSize = extmem_heap_size;
124     shared_initialize_params_m->l2HeapSize   = tidl::internal::DMEM1_SIZE;
125     shared_initialize_params_m->l1HeapSize   = tidl::internal::DMEM0_SIZE;
126     shared_initialize_params_m->enableTrace  = OCL_TIDL_TRACE_OFF;
128     // Setup kernel arguments for initialize
129     KernelArgs args = { create_arg,
130                         param_heap_arg,
131                         ArgInfo(tidl_extmem_heap_m.get(),
132                                 extmem_heap_size),
133                         ArgInfo(shared_initialize_params_m.get(),
134                                 sizeof(OCL_TIDL_InitializeParams)),
135                         device_m->type() == CL_DEVICE_TYPE_ACCELERATOR ?
136                             ArgInfo(nullptr, tidl::internal::DMEM1_SIZE):
137                             ArgInfo(nullptr, 4)                       };
139     k_initialize_m.reset(new Kernel(device_m,
140                                     STRING(INIT_KERNEL), args, device_index_m));
143 // Pointer to implementation idiom: https://herbsutter.com/gotw/_100/:
144 // Both unique_ptr and shared_ptr can be instantiated with an incomplete type
145 // unique_ptr's destructor requires a complete type in order to invoke delete
146 ExecutionObject::~ExecutionObject() = default;
148 char* ExecutionObject::GetInputBufferPtr() const
150     return static_cast<char *>(pimpl_m->in_m.ptr());
153 size_t ExecutionObject::GetInputBufferSizeInBytes() const
155     return pimpl_m->in_m.size();
158 char* ExecutionObject::GetOutputBufferPtr() const
160     return static_cast<char *>(pimpl_m->out_m.ptr());
163 size_t ExecutionObject::GetOutputBufferSizeInBytes() const
165     return pimpl_m->shared_process_params_m.get()->bytesWritten;
168 void  ExecutionObject::SetFrameIndex(int idx)
170     pimpl_m->current_frame_idx_m = idx;
173 int ExecutionObject::GetFrameIndex() const
175     return pimpl_m->current_frame_idx_m;
178 void ExecutionObject::SetInputOutputBuffer(const ArgInfo& in, const ArgInfo& out)
180     assert (in.ptr() != nullptr && in.size() > 0);
181     assert (out.ptr() != nullptr && out.size() > 0);
183     pimpl_m->SetupProcessKernel(in, out);
186 bool ExecutionObject::ProcessFrameStartAsync()
188     return pimpl_m->RunAsync(ExecutionObject::CallType::PROCESS);
191 bool ExecutionObject::ProcessFrameWait()
193     return pimpl_m->Wait(ExecutionObject::CallType::PROCESS);
196 bool ExecutionObject::RunAsync (CallType ct)
198     return pimpl_m->RunAsync(ct);
201 bool ExecutionObject::Wait (CallType ct)
203     return pimpl_m->Wait(ct);
206 uint64_t ExecutionObject::GetProcessCycles() const
208     uint8_t factor = 1;
210     // ARP32 running at half frequency of VCOP, multiply by 2 for VCOP cycles
211     if (pimpl_m->device_m->type() == CL_DEVICE_TYPE_CUSTOM)
212         factor = 2;
214     return pimpl_m->shared_process_params_m.get()->cycles * factor;
217 float ExecutionObject::GetProcessTimeInMilliSeconds() const
219     float frequency = pimpl_m->device_m->GetFrequencyInMhz() * 1000000;
220     return ((float)GetProcessCycles())/frequency * 1000;
223 //
224 // Create a kernel to call the "process" function
225 //
226 bool
227 ExecutionObject::Impl::SetupProcessKernel(const ArgInfo& in, const ArgInfo& out)
229     in_m = in;
230     out_m = out;
232     shared_process_params_m.reset(malloc_ddr<OCL_TIDL_ProcessParams>());
233     shared_process_params_m->enableTrace = OCL_TIDL_TRACE_OFF;
234     shared_process_params_m->cycles = 0;
236     KernelArgs args = { ArgInfo(shared_process_params_m.get(),
237                                 sizeof(OCL_TIDL_ProcessParams)),
238                         in,
239                         out,
240                         ArgInfo(tidl_extmem_heap_m.get(),
241                                 shared_initialize_params_m->tidlHeapSize)
242                       };
244     k_process_m.reset(new Kernel(device_m,
245                                  STRING(PROCESS_KERNEL), args, device_index_m));
247     return true;
// Copy densely packed 8-bit planes from readPtr into a padded destination.
//
//   readPtr   packed source: roi * n planes, each height rows of width bytes,
//             stored back to back
//   ptr       destination of the first row of the first plane
//   roi       number of regions of interest
//   n         number of planes (channels) per region of interest
//   width     bytes per row
//   height    rows per plane
//   pitch     distance in bytes between consecutive rows in the destination
//   chOffset  distance in bytes between consecutive planes in the destination
//
// Returns the number of bytes consumed from readPtr (width*height*n*roi).
// Returns 0 without copying anything when either pointer is null, when any
// of roi/n/width/height is not positive, or when pitch/chOffset is negative.
// All offsets are computed in size_t so large buffers do not overflow int,
// and a negative width can no longer reach memcpy as a huge length.
static size_t readDataS8(const char *readPtr, char *ptr, int roi, int n,
                         int width, int height, int pitch,
                         int chOffset)
{
    if (!readPtr || !ptr)  return 0;
    if (roi <= 0 || n <= 0 || width <= 0 || height <= 0)  return 0;
    if (pitch < 0 || chOffset < 0)  return 0;

    const size_t row_bytes    = static_cast<size_t>(width);
    const size_t num_rows     = static_cast<size_t>(height);
    const size_t row_stride   = static_cast<size_t>(pitch);
    const size_t plane_stride = static_cast<size_t>(chOffset);
    const size_t num_planes   = static_cast<size_t>(roi) *
                                static_cast<size_t>(n);

    // The source is read strictly sequentially: plane after plane, row after
    // row. Only the destination has padding between rows and planes.
    const char *src = readPtr;
    for (size_t plane = 0; plane < num_planes; plane++)
    {
        char *dst_plane = ptr + plane * plane_stride;
        for (size_t row = 0; row < num_rows; row++)
        {
            memcpy(dst_plane + row * row_stride, src, row_bytes);
            src += row_bytes;
        }
    }

    return num_planes * num_rows * row_bytes;
}
// Copy 8-bit planes from a padded source into a densely packed destination.
//
//   writePtr  packed destination: n planes, each height rows of width bytes,
//             stored back to back
//   ptr       source of the first row of the first plane
//   n         number of planes (channels)
//   width     bytes per row
//   height    rows per plane
//   pitch     distance in bytes between consecutive rows in the source
//   chOffset  distance in bytes between consecutive planes in the source
//
// Returns the number of bytes written to writePtr (width*height*n).
// Returns 0 without copying anything when either pointer is null, when any
// of n/width/height is not positive, or when pitch/chOffset is negative.
// All offsets are computed in size_t so large buffers do not overflow int,
// and a negative width can no longer reach memcpy as a huge length.
static size_t writeDataS8(char *writePtr, const char *ptr, int n, int width,
                          int height, int pitch, int chOffset)
{
    if (!writePtr || !ptr)  return 0;
    if (n <= 0 || width <= 0 || height <= 0)  return 0;
    if (pitch < 0 || chOffset < 0)  return 0;

    const size_t row_bytes    = static_cast<size_t>(width);
    const size_t num_rows     = static_cast<size_t>(height);
    const size_t num_planes   = static_cast<size_t>(n);
    const size_t row_stride   = static_cast<size_t>(pitch);
    const size_t plane_stride = static_cast<size_t>(chOffset);

    // The destination is written strictly sequentially: plane after plane,
    // row after row. Only the source has padding between rows and planes.
    char *dst = writePtr;
    for (size_t plane = 0; plane < num_planes; plane++)
    {
        const char *src_plane = ptr + plane * plane_stride;
        for (size_t row = 0; row < num_rows; row++)
        {
            memcpy(dst, src_plane + row * row_stride, row_bytes);
            dst += row_bytes;
        }
    }

    return num_planes * num_rows * row_bytes;
}
281 void ExecutionObject::Impl::HostWriteNetInput()
283     char* readPtr = (char *) in_m.ptr();
284     for (unsigned int i = 0; i < shared_initialize_params_m->numInBufs; i++)
285     {
286         OCL_TIDL_BufParams *inBuf = &shared_initialize_params_m->inBufs[i];
287         readPtr += readDataS8(
288             readPtr,
289             (char *) tidl_extmem_heap_m.get() + inBuf->bufPlaneBufOffset
290                 + inBuf->bufPlaneWidth * OCL_TIDL_MAX_PAD_SIZE
291                 + OCL_TIDL_MAX_PAD_SIZE,
292             inBuf->numROIs,
293             inBuf->numChannels,
294             inBuf->ROIWidth,
295             inBuf->ROIHeight,
296             inBuf->bufPlaneWidth,
297             inBuf->bufPlaneWidth
298                 * (inBuf->ROIHeight + 2 * OCL_TIDL_MAX_PAD_SIZE) );
299     }
302 void ExecutionObject::Impl::HostReadNetOutput()
304     char* writePtr = (char *) out_m.ptr();
305     for (unsigned int i = 0; i < shared_initialize_params_m->numOutBufs; i++)
306     {
307         OCL_TIDL_BufParams *outBuf = &shared_initialize_params_m->outBufs[i];
308         writePtr += writeDataS8(
309             writePtr,
310             (char *) tidl_extmem_heap_m.get() + outBuf->bufPlaneBufOffset
311                 + outBuf->bufPlaneWidth * OCL_TIDL_MAX_PAD_SIZE
312                 + OCL_TIDL_MAX_PAD_SIZE,
313             outBuf->numChannels,
314             outBuf->ROIWidth,
315             outBuf->ROIHeight,
316             outBuf->bufPlaneWidth,
317             ((outBuf->bufPlaneWidth * outBuf->bufPlaneHeight)/
318              outBuf->numChannels));
319     }
320     shared_process_params_m->bytesWritten = writePtr - (char *) out_m.ptr();
324 bool ExecutionObject::Impl::RunAsync(CallType ct)
326     switch (ct)
327     {
328         case CallType::INIT:
329         {
330             k_initialize_m->RunAsync();
331             break;
332         }
333         case CallType::PROCESS:
334         {
335             shared_process_params_m->frameIdx = current_frame_idx_m;
336             shared_process_params_m->bytesWritten = 0;
337             HostWriteNetInput();
338             k_process_m->RunAsync();
339             break;
340         }
341         case CallType::CLEANUP:
342         {
343             k_cleanup_m->RunAsync();
344             break;
345         }
346         default:
347             return false;
348     }
350     return true;
353 bool ExecutionObject::Impl::Wait(CallType ct)
355     switch (ct)
356     {
357         case CallType::INIT:
358         {
359             bool has_work = k_initialize_m->Wait();
361             if (has_work)
362             {
363                 if (shared_initialize_params_m->errorCode != OCL_TIDL_SUCCESS)
364                     throw Exception(shared_initialize_params_m->errorCode,
365                                     __FILE__, __FUNCTION__, __LINE__);
366             }
367             return has_work;
368         }
369         case CallType::PROCESS:
370         {
371             bool has_work = k_process_m->Wait();
372             if (has_work)
373             {
374                 HostReadNetOutput();
376                 if (shared_process_params_m->errorCode != OCL_TIDL_SUCCESS)
377                     throw Exception(shared_process_params_m->errorCode,
378                                     __FILE__, __FUNCTION__, __LINE__);
379             }
381             return has_work;
382         }
383         case CallType::CLEANUP:
384         {
385             return k_cleanup_m->Wait();
386             break;
387         }
388         default:
389             return false;
390     }
392     return false;