Enqueue multiple frames at device side
[tidl/tidl-api.git] / tidl_api / src / execution_object_pipeline.cpp
1 /******************************************************************************
2  * Copyright (c) 2018 Texas Instruments Incorporated - http://www.ti.com/
3  *  All rights reserved.
4  *
5  *  Redistribution and use in source and binary forms, with or without
6  *  modification, are permitted provided that the following conditions are met:
7  *      * Redistributions of source code must retain the above copyright
8  *        notice, this list of conditions and the following disclaimer.
9  *      * Redistributions in binary form must reproduce the above copyright
10  *        notice, this list of conditions and the following disclaimer in the
11  *        documentation and/or other materials provided with the distribution.
12  *      * Neither the name of Texas Instruments Incorporated nor the
13  *        names of its contributors may be used to endorse or promote products
14  *        derived from this software without specific prior written permission.
15  *
16  *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17  *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  *  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20  *  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21  *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22  *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23  *  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24  *  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25  *  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
26  *  THE POSSIBILITY OF SUCH DAMAGE.
27  *****************************************************************************/
29 #include <assert.h>
30 #include <mutex>
31 #include <condition_variable>
32 #include <chrono>
33 #include "device_arginfo.h"
34 #include "execution_object_pipeline.h"
35 #include "parameters.h"
37 using namespace tidl;
39 class ExecutionObjectPipeline::Impl
40 {
41     public:
42         Impl(std::vector<ExecutionObject*> &eos);
43         ~Impl();
45         void SetInputOutputBuffer(const ArgInfo &in, const ArgInfo &out);
46         bool RunAsyncStart();
47         bool RunAsyncNext();
48         bool Wait();
50         // Trace related
51         void WriteLayerOutputsToFile(const std::string& filename_prefix) const;
52         const LayerOutput* GetOutputFromLayer(uint32_t layer_index,
53                                               uint32_t output_index) const;
54         const LayerOutputs* GetOutputsFromAllLayers() const;
56         //! for pipelined execution
57         std::vector<ExecutionObject*> eos_m;
58         std::vector<IODeviceArgInfo*> iobufs_m;
59         std::vector<float> eo_device_time_m;
60         std::vector<float> eo_host_time_m;
62         std::string device_name_m;
64         //! current frame index
65         int frame_idx_m;
67         //! current execution object index, and it context index
68         uint32_t curr_eo_idx_m;
69         uint32_t curr_eo_context_idx_m;
71         // device and host time tracking: pipeline start to finish
72         float device_time_m;
73         float host_time_m;
75     private:
76         //! @brief Initialize ExecutionObjectPipeline with given
77         //! ExecutionObjects: check consecutive layersGroup, allocate memory
78         void Initialize();
80         // flag, mutex and cond var for signaling completion and waiting
81         bool has_work_m, is_processed_m;
82         std::mutex mutex_m;
83         std::condition_variable cv_m;
85         // host time tracking: pipeline start to finish
86         std::chrono::time_point<std::chrono::steady_clock> start_m;
87 };
89 ExecutionObjectPipeline::ExecutionObjectPipeline(
90     std::vector<ExecutionObject*> eos)
91 {
92     pimpl_m = std::unique_ptr<Impl> { new Impl(eos) };
93 }
95 ExecutionObjectPipeline::Impl::Impl(std::vector<ExecutionObject *> &eos) :
96     eos_m(eos), has_work_m(false), is_processed_m(false)
97 {
98     Initialize();
99 }
// Pointer to implementation idiom: https://herbsutter.com/gotw/_100/:
// Both unique_ptr and shared_ptr can be instantiated with an incomplete type,
// but unique_ptr's destructor requires a complete type in order to invoke
// delete. The destructor is therefore defaulted here, in the source file,
// after the definition of Impl above.
ExecutionObjectPipeline::~ExecutionObjectPipeline() = default;
106 char* ExecutionObjectPipeline::GetInputBufferPtr() const
108     return static_cast<char *>(pimpl_m->iobufs_m.front()->GetArg().ptr());
111 uint32_t ExecutionObjectPipeline::GetNumExecutionObjects() const
113     return pimpl_m->eos_m.size();
116 size_t ExecutionObjectPipeline::GetInputBufferSizeInBytes() const
118     return pimpl_m->eos_m.front()->GetInputBufferSizeInBytes();
121 char* ExecutionObjectPipeline::GetOutputBufferPtr() const
123     return static_cast<char *>(pimpl_m->iobufs_m.back()->GetArg().ptr());
126 size_t ExecutionObjectPipeline::GetOutputBufferSizeInBytes() const
128     return pimpl_m->eos_m.back()->GetOutputBufferSizeInBytes();
131 void ExecutionObjectPipeline::SetInputOutputBuffer(const ArgInfo& in,
132                                                    const ArgInfo& out)
134     assert(in.ptr() != nullptr  && in.size() >= GetInputBufferSizeInBytes());
135     assert(out.ptr() != nullptr && out.size() >= GetOutputBufferSizeInBytes());
136     pimpl_m->SetInputOutputBuffer(in, out);
139 void ExecutionObjectPipeline::SetFrameIndex(int idx)
141     pimpl_m->frame_idx_m = idx;
144 int ExecutionObjectPipeline::GetFrameIndex() const
146     return pimpl_m->frame_idx_m;
149 bool ExecutionObjectPipeline::ProcessFrameStartAsync()
151     assert(GetInputBufferPtr() != nullptr && GetOutputBufferPtr() != nullptr);
152     bool st = pimpl_m->RunAsyncStart();
153     if (st)
154         st = pimpl_m->eos_m[0]->AddCallback(ExecutionObject::CallType::PROCESS,
155                                          this, pimpl_m->curr_eo_context_idx_m);
156     return st;
159 bool ExecutionObjectPipeline::ProcessFrameWait()
161     return pimpl_m->Wait();
164 void CallbackWrapper(void *user_data)
166     ((ExecutionObjectPipeline *) user_data)->RunAsyncNext();
169 void ExecutionObjectPipeline::RunAsyncNext()
171     bool has_next = pimpl_m->RunAsyncNext();
172     if (has_next)
173         pimpl_m->eos_m[pimpl_m->curr_eo_idx_m]->AddCallback(
174                                      ExecutionObject::CallType::PROCESS, this,
175                                      pimpl_m->curr_eo_context_idx_m);
178 float ExecutionObjectPipeline::GetProcessTimeInMilliSeconds() const
180     return pimpl_m->device_time_m;
183 float ExecutionObjectPipeline::GetHostProcessTimeInMilliSeconds() const
185     return pimpl_m->host_time_m;
188 float ExecutionObjectPipeline::GetProcessTimeInMilliSeconds(
189         uint32_t eo_index) const
191     assert(eo_index < pimpl_m->eos_m.size());
192     return pimpl_m->eo_device_time_m[eo_index];
195 float ExecutionObjectPipeline::GetHostProcessTimeInMilliSeconds(
196         uint32_t eo_index) const
198     assert(eo_index < pimpl_m->eos_m.size());
199     return pimpl_m->eo_host_time_m[eo_index];
202 const std::string& ExecutionObjectPipeline::GetDeviceName() const
204     return pimpl_m->device_name_m;
207 void
208 ExecutionObjectPipeline::WriteLayerOutputsToFile(
209     const std::string& filename_prefix) const
211     pimpl_m->WriteLayerOutputsToFile(filename_prefix);
214 const LayerOutput*
215 ExecutionObjectPipeline::GetOutputFromLayer(uint32_t layer_index,
216     uint32_t output_index) const
218     return pimpl_m->GetOutputFromLayer(layer_index, output_index);
221 const LayerOutputs*
222 ExecutionObjectPipeline::GetOutputsFromAllLayers() const
224     return pimpl_m->GetOutputsFromAllLayers();
228 /// Impl methods start here
231 static
232 void* AllocateMem(size_t size)
234     if (size == 0)  return nullptr;
235     void *ptr = malloc(size);
236     if (ptr == nullptr)
237         throw Exception("Out of memory, ExecutionObjectPipeline malloc failed",
238                         __FILE__, __FUNCTION__, __LINE__);
239     return ptr;
242 void ExecutionObjectPipeline::Impl::Initialize()
244     // Check consecutive layersGroups to form a pipeline
245     int prev_group = 0;
246     for (auto eo : eos_m)
247     {
248         int group = eo->GetLayersGroupId();
249         if (prev_group != 0 && group != prev_group + 1)
250             throw Exception(
251                 "Non-consecutive layersGroupIds in ExecutionObjectPipeline",
252                 __FILE__, __FUNCTION__, __LINE__);
253         prev_group = group;
254     }
256     for (auto eo : eos_m)
257         device_name_m += eo->GetDeviceName() + "+";
258     device_name_m.resize(device_name_m.size() - 1);
260     // Allocate input and output memory for EOs/layersGroups
261     // Note that i-th EO's output buffer is the same as (i+1)-th EO's input
262     // So, if n EOs, then (n+1) buffers: b EO b EO b EO b ... EO b
263     // User must set the first input buffer and the last output buffer
264     size_t size;
265     ArgInfo in(nullptr, 0);
266     iobufs_m.push_back(new IODeviceArgInfo(in));
267     for (auto eo : eos_m)
268     {
269         if (eo != eos_m.back())
270             size = eo->GetOutputBufferSizeInBytes();
271         else
272             size = 0;
274         void *ptr = AllocateMem(size);
275         ArgInfo out(ptr, size);
276         iobufs_m.push_back(new IODeviceArgInfo(out));
277     }
279     // Record keeping for each EO's device time and host time
280     // because EO could be shared by another EOP
281     eo_device_time_m.resize(eos_m.size());
282     eo_host_time_m.resize(eos_m.size());
285 ExecutionObjectPipeline::Impl::~Impl()
287     int num_iobufs = iobufs_m.size();
288     for (int i = 0; i < num_iobufs; i++)
289     {
290         if (! (i == 0 || i == num_iobufs-1))
291             free(iobufs_m[i]->GetArg().ptr());
292         delete iobufs_m[i];
293     }
296 void ExecutionObjectPipeline::Impl::SetInputOutputBuffer(const ArgInfo &in,
297                                                          const ArgInfo &out)
299     delete iobufs_m.front();
300     delete iobufs_m.back();
301     iobufs_m.front() = new IODeviceArgInfo(in);
302     iobufs_m.back()  = new IODeviceArgInfo(out);
305 bool ExecutionObjectPipeline::Impl::RunAsyncStart()
307     has_work_m = true;
308     is_processed_m = false;
309     device_time_m = 0.0f;
310     host_time_m = 0.0f;
311     curr_eo_idx_m = 0;
312     eos_m[0]->AcquireContext(curr_eo_context_idx_m);
313     if (tidl::internal::NUM_CONTEXTS == 1)
314         start_m = std::chrono::steady_clock::now();
315     eos_m[0]->SetFrameIndex(frame_idx_m);
316     eos_m[0]->SetInputOutputBuffer(iobufs_m[0], iobufs_m[1],
317                                    curr_eo_context_idx_m);
318     return eos_m[0]->ProcessFrameStartAsync(curr_eo_context_idx_m);
321 // returns true if we have more EOs to execute
322 bool ExecutionObjectPipeline::Impl::RunAsyncNext()
324     eos_m[curr_eo_idx_m]->ProcessFrameWait(curr_eo_context_idx_m);
325     // need to capture EO's device/host time before we release its lock
326     eo_device_time_m[curr_eo_idx_m] = eos_m[curr_eo_idx_m]->
327                            GetProcessTimeInMilliSeconds(curr_eo_context_idx_m);
328     eo_host_time_m[curr_eo_idx_m]   = eos_m[curr_eo_idx_m]->
329                        GetHostProcessTimeInMilliSeconds(curr_eo_context_idx_m);
330     device_time_m += eo_device_time_m[curr_eo_idx_m];
331     if (tidl::internal::NUM_CONTEXTS > 1)
332         host_time_m += eo_host_time_m[curr_eo_idx_m];
333     eos_m[curr_eo_idx_m]->ReleaseContext(curr_eo_context_idx_m);
334     curr_eo_idx_m += 1;
335     if (curr_eo_idx_m < eos_m.size())
336     {
337         eos_m[curr_eo_idx_m]->AcquireContext(curr_eo_context_idx_m);
338         eos_m[curr_eo_idx_m]->SetFrameIndex(frame_idx_m);
339         eos_m[curr_eo_idx_m]->SetInputOutputBuffer(iobufs_m[curr_eo_idx_m],
340                           iobufs_m[curr_eo_idx_m+1], curr_eo_context_idx_m);
341         eos_m[curr_eo_idx_m]->ProcessFrameStartAsync(curr_eo_context_idx_m);
342         return true;
343     }
344     else
345     {
346         if (tidl::internal::NUM_CONTEXTS == 1)
347         {
348             std::chrono::duration<float> elapsed =
349                                     std::chrono::steady_clock::now() - start_m;
350             host_time_m = elapsed.count() * 1000;  // seconds to milliseconds
351         }
352         is_processed_m = true;
353         cv_m.notify_all();
354         return false;
355     }
358 bool ExecutionObjectPipeline::Impl::Wait()
360     if (! has_work_m)  return false;
362     std::unique_lock<std::mutex> lock(mutex_m);
363     cv_m.wait(lock, [this]{ return this->is_processed_m; });
364     has_work_m = false;
365     return true;
368 void
369 ExecutionObjectPipeline::Impl::WriteLayerOutputsToFile(
370     const std::string& filename_prefix) const
372     for (auto eo : eos_m)
373         eo->WriteLayerOutputsToFile(filename_prefix);
376 const LayerOutput*
377 ExecutionObjectPipeline::Impl::GetOutputFromLayer(uint32_t layer_index,
378     uint32_t output_index) const
380     const LayerOutput* lo = nullptr;
381     for (auto eo : eos_m)
382     {
383         lo = eo->GetOutputFromLayer(layer_index, output_index);
384         if (lo != nullptr)  break;
385     }
386     return lo;
389 const LayerOutputs*
390 ExecutionObjectPipeline::Impl::GetOutputsFromAllLayers() const
392     LayerOutputs *all = new LayerOutputs;
393     for (auto eo : eos_m)
394     {
395         LayerOutputs *los = const_cast<LayerOutputs *>(
396                                                 eo->GetOutputsFromAllLayers());
397         for (auto& lo : *los)
398             all->push_back(std::unique_ptr<const LayerOutput>{ lo.release() });
399         delete los;
400     }
401     return all;