Enqueue multiple frames at device side
authorYuan Zhao <yuanzhao@ti.com>
Thu, 4 Oct 2018 16:39:08 +0000 (11:39 -0500)
committerAjay Jayaraj <ajayj@ti.com>
Fri, 19 Oct 2018 15:57:57 +0000 (10:57 -0500)
- Previous implementation won't send/enqueue next frame to device
  until the host has received completion message for current frame.
  The improvement is to create multiple sets/contexts of internal
  TIDL input/output buffers at device side, and to send/enqueue next
  frame using a different set/context of internal TIDL input/output
  buffers to device while device is still processing the current frame.
  When device finishes current frame, it can immediately read
  its messageQ and start processing the next frame, without waiting
  for the completion message to reach the host and the host to send
  the next frame.
- In pipelined processing of multiple frames, this optimization can
  effectively hide the round-trip communication between host and device.
- Removed deprecated enableInternalInput feature
- MCT-1059

tidl_api/dsp/ocl_wrapper.cl
tidl_api/inc/execution_object.h
tidl_api/src/device_arginfo.h
tidl_api/src/execution_object.cpp
tidl_api/src/execution_object_pipeline.cpp
tidl_api/src/ocl_device.cpp
tidl_api/src/ocl_device.h
tidl_api/src/parameters.h

index 698d074c14ecbd482f3af8974223561e28bfd612..e75ed1d507c141cdda9d51f8c540dc0fef8913c6 100644 (file)
@@ -55,10 +55,11 @@ void ocl_tidl_initialize(global unsigned char*            createParams,
 kernel
 void ocl_tidl_process(global OCL_TIDL_ProcessParams* processParams,
                       global unsigned char*          externalMemoryHeapBase,
-                      global unsigned char*          traceBufferParams)
+                      global unsigned char*          traceBufferParams,
+                      uint32_t                       contextIndex)
 {
     ocl_dsp_tidl_process(processParams, externalMemoryHeapBase,
-                         traceBufferParams);
+                         traceBufferParams, contextIndex);
 }
 
 
index dad586678cac660dc4a682abd68355d10792e5d1..d875d7b5e9244748c66db8e382ec31334d4f6a49 100644 (file)
@@ -66,6 +66,13 @@ class ExecutionObject : public ExecutionObjectInternalInterface
         void SetInputOutputBuffer(const ArgInfo& in,
                                   const ArgInfo& out) override;
 
+        //! Specify the input and output buffers used by the EO in a context
+        //! @param in buffer used for input.
+        //! @param out buffer used for output.
+        //! @param context_idx the index of the context
+        void SetInputOutputBuffer(const ArgInfo& in,
+                                  const ArgInfo& out, uint32_t context_idx);
+
         //! Returns a pointer to the input buffer set via SetInputOutputBuffer
         char* GetInputBufferPtr() const override;
 
@@ -90,22 +97,46 @@ class ExecutionObject : public ExecutionObjectInternalInterface
         //! returns immediately. Use ExecutionObject::ProcessFrameWait to wait
         bool ProcessFrameStartAsync() override;
 
+        //! @brief Start processing with a context. The call is asynchronous and
+        //! returns immediately. Use ExecutionObject::ProcessFrameWait to wait
+        //! @param context_idx the index of the context
+        bool ProcessFrameStartAsync(uint32_t context_idx);
+
         //! Wait for the execution object to complete processing a frame
         //! @return false if ExecutionObject::ProcessFrameWait was called
         //! without a corresponding call to
         //! ExecutionObject::ProcessFrameStartAsync.
         bool ProcessFrameWait() override;
 
+        //! Wait for the execution object to complete processing with a context
+        //! @param context_idx the index of the context
+        //! @return false if ExecutionObject::ProcessFrameWait was called
+        //! without a corresponding call to
+        //! ExecutionObject::ProcessFrameStartAsync.
+        bool ProcessFrameWait(uint32_t context_idx);
+
         //! @brief return the number of milliseconds taken *on the device* to
         //! execute the process call
         //! @return Number of milliseconds to process a frame on the device.
         float GetProcessTimeInMilliSeconds() const override;
 
+        //! @brief return the number of milliseconds taken *on the device* to
+        //! execute the process call with a context
+        //! @param context_idx the index of the context
+        //! @return Number of milliseconds to process a frame on the device.
+        float GetProcessTimeInMilliSeconds(uint32_t context_idx) const;
+
         //! @brief return the number of milliseconds taken *on the host* to
         //! execute the process call
         //! @return Number of milliseconds to process a frame on the host.
         float GetHostProcessTimeInMilliSeconds() const override;
 
+        //! @brief return the number of milliseconds taken *on the host* to
+        //! execute the process call with a context
+        //! @param context_idx the index of the context
+        //! @return Number of milliseconds to process a frame on the host.
+        float GetHostProcessTimeInMilliSeconds(uint32_t context_idx) const;
+
         //! Returns the device name that the ExecutionObject runs on
         const std::string& GetDeviceName() const override;
 
@@ -137,9 +168,9 @@ class ExecutionObject : public ExecutionObjectInternalInterface
 
         //! @private
         // Used by the ExecutionObjectPipeline
-        bool AddCallback(CallType ct, void *user_data);
-        void AcquireLock();
-        void ReleaseLock();
+        bool AddCallback(CallType ct, void *user_data, uint32_t context_idx);
+        void AcquireContext(uint32_t& context_idx);
+        void ReleaseContext(uint32_t  context_idx);
 
         ExecutionObject()                                  = delete;
         ExecutionObject(const ExecutionObject&)            = delete;
@@ -147,7 +178,7 @@ class ExecutionObject : public ExecutionObjectInternalInterface
 
         //! @private
         void SetInputOutputBuffer(const IODeviceArgInfo* in,
-                                  const IODeviceArgInfo* out);
+                             const IODeviceArgInfo* out, uint32_t context_idx);
 
     private:
         class Impl;
index 841e0c14ad537beda063d0480a0d722f254a10b5..f2052df9a55cfc2b5a9cfd0d1efe562ce6876dc7 100644 (file)
@@ -69,7 +69,6 @@ class PipeInfo
 {
     public:
         uint32_t dataQ_m[OCL_TIDL_MAX_IN_BUFS];
-        uint32_t bufAddr_m[OCL_TIDL_MAX_IN_BUFS];
 };
 
 /*! @class IODeviceArgInfo
index 00d6804d03ad2f481e034e78d1ba589c09eaf1e1..9bc9f05d61f1cd06574f986fc175ef95c1347b7c 100644 (file)
@@ -55,14 +55,14 @@ class ExecutionObject::Impl
              int    layers_group_id);
         ~Impl() {}
 
-        bool RunAsync(CallType ct);
-        bool Wait    (CallType ct);
-        bool AddCallback(CallType ct, void *user_data);
+        bool RunAsync(CallType ct, uint32_t context_idx);
+        bool Wait    (CallType ct, uint32_t context_idx);
+        bool AddCallback(CallType ct, void *user_data, uint32_t context_idx);
 
-        uint64_t GetProcessCycles() const;
+        uint64_t GetProcessCycles(uint32_t context_idx) const;
         int  GetLayersGroupId() const;
-        void AcquireLock();
-        void ReleaseLock();
+        void AcquireContext(uint32_t& context_idx);
+        void ReleaseContext(uint32_t  context_idx);
 
         Device*                         device_m;
         // Index of the OpenCL device/queue used by this EO
@@ -75,8 +75,8 @@ class ExecutionObject::Impl
 
         size_t                          in_size_m;
         size_t                          out_size_m;
-        IODeviceArgInfo                 in_m;
-        IODeviceArgInfo                 out_m;
+        IODeviceArgInfo                 in_m[tidl::internal::NUM_CONTEXTS];
+        IODeviceArgInfo                 out_m[tidl::internal::NUM_CONTEXTS];
 
         // Frame being processed by the EO
         int                             current_frame_idx_m;
@@ -96,7 +96,7 @@ class ExecutionObject::Impl
         size_t                            trace_buf_params_sz_m;
 
         // host time tracking: eo start to finish
-        float host_time_m;
+        float host_time_m[tidl::internal::NUM_CONTEXTS];
 
     private:
         void SetupInitializeKernel(const DeviceArgInfo& create_arg,
@@ -104,8 +104,8 @@ class ExecutionObject::Impl
         void EnableOutputBufferTrace();
         void SetupProcessKernel();
 
-        void HostWriteNetInput();
-        void HostReadNetOutput();
+        void HostWriteNetInput(uint32_t context_idx);
+        void HostReadNetOutput(uint32_t context_idx);
         void ComputeInputOutputSizes();
 
         std::unique_ptr<Kernel>         k_initialize_m;
@@ -113,7 +113,8 @@ class ExecutionObject::Impl
         std::unique_ptr<Kernel>         k_cleanup_m;
 
         // Guarding sole access to input/output for one frame during execution
-        bool                            is_idle_m;
+        // Encoding: context at bit index, bit value: 0 for idle, 1 for busy
+        uint32_t                        idle_encoding_m;
         std::mutex                      mutex_access_m;
         std::condition_variable         cv_access_m;
 
@@ -155,8 +156,6 @@ ExecutionObject::Impl::Impl(Device* d, uint8_t device_index,
     shared_process_params_m(nullptr, &__free_ddr),
     in_size_m(0),
     out_size_m(0),
-    in_m(),
-    out_m(),
     current_frame_idx_m(0),
     layers_group_id_m(layers_group_id),
     num_network_layers_m(0),
@@ -165,7 +164,7 @@ ExecutionObject::Impl::Impl(Device* d, uint8_t device_index,
     k_initialize_m(nullptr),
     k_process_m(nullptr),
     k_cleanup_m(nullptr),
-    is_idle_m(true),
+    idle_encoding_m(0),  // all contexts are idle
     configuration_m(configuration)
 {
     device_name_m = device_m->GetDeviceName() + std::to_string(device_index_m);
@@ -189,7 +188,7 @@ ExecutionObject::~ExecutionObject() = default;
 
 char* ExecutionObject::GetInputBufferPtr() const
 {
-    return static_cast<char *>(pimpl_m->in_m.GetArg().ptr());
+    return static_cast<char *>(pimpl_m->in_m[0].GetArg().ptr());
 }
 
 size_t ExecutionObject::GetInputBufferSizeInBytes() const
@@ -199,7 +198,7 @@ size_t ExecutionObject::GetInputBufferSizeInBytes() const
 
 char* ExecutionObject::GetOutputBufferPtr() const
 {
-    return static_cast<char *>(pimpl_m->out_m.GetArg().ptr());
+    return static_cast<char *>(pimpl_m->out_m[0].GetArg().ptr());
 }
 
 size_t ExecutionObject::GetOutputBufferSizeInBytes() const
@@ -217,59 +216,89 @@ int ExecutionObject::GetFrameIndex() const
     return pimpl_m->current_frame_idx_m;
 }
 
-void ExecutionObject::SetInputOutputBuffer(const ArgInfo& in, const ArgInfo& out)
+void ExecutionObject::SetInputOutputBuffer(const ArgInfo& in,
+                                           const ArgInfo& out)
+{
+    SetInputOutputBuffer(in, out, 0);
+}
+
+void ExecutionObject::SetInputOutputBuffer(const ArgInfo& in,
+                                      const ArgInfo& out, uint32_t context_idx)
 {
     assert(in.ptr()  != nullptr && in.size()  >= pimpl_m->in_size_m);
     assert(out.ptr() != nullptr && out.size() >= pimpl_m->out_size_m);
 
-    pimpl_m->in_m  = IODeviceArgInfo(in);
-    pimpl_m->out_m = IODeviceArgInfo(out);
+    pimpl_m->in_m[context_idx]  = IODeviceArgInfo(in);
+    pimpl_m->out_m[context_idx] = IODeviceArgInfo(out);
 }
 
 void ExecutionObject::SetInputOutputBuffer(const IODeviceArgInfo* in,
-                                           const IODeviceArgInfo* out)
+                                           const IODeviceArgInfo* out,
+                                           uint32_t context_idx)
 {
-    pimpl_m->in_m  = *in;
-    pimpl_m->out_m = *out;
+    pimpl_m->in_m[context_idx]  = *in;
+    pimpl_m->out_m[context_idx] = *out;
 }
 
 bool ExecutionObject::ProcessFrameStartAsync()
 {
-    TRACE::print("-> ExecutionObject::ProcessFrameStartAsync()\n");
+    return ProcessFrameStartAsync(0);
+}
+
+bool ExecutionObject::ProcessFrameStartAsync(uint32_t context_idx)
+{
+    TRACE::print("-> ExecutionObject::ProcessFrameStartAsync(%d)\n",
+                 context_idx);
     assert(GetInputBufferPtr() != nullptr && GetOutputBufferPtr() != nullptr);
-    return pimpl_m->RunAsync(ExecutionObject::CallType::PROCESS);
+    return pimpl_m->RunAsync(ExecutionObject::CallType::PROCESS, context_idx);
 }
 
 bool ExecutionObject::ProcessFrameWait()
 {
-    TRACE::print("-> ExecutionObject::ProcessFrameWait()\n");
-    return pimpl_m->Wait(ExecutionObject::CallType::PROCESS);
+    return ProcessFrameWait(0);
+}
+
+bool ExecutionObject::ProcessFrameWait(uint32_t context_idx)
+{
+    TRACE::print("-> ExecutionObject::ProcessFrameWait(%d)\n", context_idx);
+    return pimpl_m->Wait(ExecutionObject::CallType::PROCESS, context_idx);
 }
 
 bool ExecutionObject::RunAsync (CallType ct)
 {
-    return pimpl_m->RunAsync(ct);
+    return pimpl_m->RunAsync(ct, 0);
 }
 
 bool ExecutionObject::Wait (CallType ct)
 {
-    return pimpl_m->Wait(ct);
+    return pimpl_m->Wait(ct, 0);
 }
 
-bool ExecutionObject::AddCallback(CallType ct, void *user_data)
+bool ExecutionObject::AddCallback(CallType ct, void *user_data,
+                                  uint32_t context_idx)
 {
-    return pimpl_m->AddCallback(ct, user_data);
+    return pimpl_m->AddCallback(ct, user_data, context_idx);
 }
 
 float ExecutionObject::GetProcessTimeInMilliSeconds() const
+{
+    return GetProcessTimeInMilliSeconds(0);
+}
+
+float ExecutionObject::GetProcessTimeInMilliSeconds(uint32_t context_idx) const
 {
     float frequency = pimpl_m->device_m->GetFrequencyInMhz() * 1000000;
-    return ((float)pimpl_m->GetProcessCycles()) / frequency * 1000;
+    return ((float)pimpl_m->GetProcessCycles(context_idx)) / frequency * 1000;
 }
 
 float ExecutionObject::GetHostProcessTimeInMilliSeconds() const
 {
-    return pimpl_m->host_time_m;
+    return GetHostProcessTimeInMilliSeconds(0);
+}
+
+float ExecutionObject::GetHostProcessTimeInMilliSeconds(uint32_t context_idx) const
+{
+    return pimpl_m->host_time_m[context_idx];
 }
 
 void
@@ -299,14 +328,14 @@ const std::string& ExecutionObject::GetDeviceName() const
     return pimpl_m->device_name_m;
 }
 
-void ExecutionObject::AcquireLock()
+void ExecutionObject::AcquireContext(uint32_t& context_idx)
 {
-    pimpl_m->AcquireLock();
+    pimpl_m->AcquireContext(context_idx);
 }
 
-void ExecutionObject::ReleaseLock()
+void ExecutionObject::ReleaseContext(uint32_t context_idx)
 {
-    pimpl_m->ReleaseLock();
+    pimpl_m->ReleaseContext(context_idx);
 }
 
 //
@@ -334,7 +363,7 @@ ExecutionObject::Impl::SetupInitializeKernel(const DeviceArgInfo& create_arg,
     shared_initialize_params_m->tidlHeapSize =configuration_m.NETWORK_HEAP_SIZE;
     shared_initialize_params_m->l2HeapSize   = tidl::internal::DMEM1_SIZE;
     shared_initialize_params_m->l1HeapSize   = tidl::internal::DMEM0_SIZE;
-    shared_initialize_params_m->enableInternalInput = 0;
+    shared_initialize_params_m->numContexts  = tidl::internal::NUM_CONTEXTS;
 
     // Set up execution trace specified in the configuration
     EnableExecutionTrace(configuration_m,
@@ -392,16 +421,19 @@ void ExecutionObject::Impl::EnableOutputBufferTrace()
 void
 ExecutionObject::Impl::SetupProcessKernel()
 {
-    shared_process_params_m.reset(malloc_ddr<OCL_TIDL_ProcessParams>());
-    shared_process_params_m->enableInternalInput =
-                               shared_initialize_params_m->enableInternalInput;
-    shared_process_params_m->cycles = 0;
+    shared_process_params_m.reset(malloc_ddr<OCL_TIDL_ProcessParams>(
+               tidl::internal::NUM_CONTEXTS * sizeof(OCL_TIDL_ProcessParams)));
 
     // Set up execution trace specified in the configuration
-    EnableExecutionTrace(configuration_m,
-                         &shared_process_params_m->enableTrace);
+    for (int i = 0; i < tidl::internal::NUM_CONTEXTS; i++)
+    {
+        OCL_TIDL_ProcessParams *p_params = shared_process_params_m.get() + i;
+        EnableExecutionTrace(configuration_m, &p_params->enableTrace);
+    }
 
+    uint32_t context_idx = 0;
     KernelArgs args = { DeviceArgInfo(shared_process_params_m.get(),
+                                      tidl::internal::NUM_CONTEXTS *
                                       sizeof(OCL_TIDL_ProcessParams),
                                       DeviceArgInfo::Kind::BUFFER),
                         DeviceArgInfo(tidl_extmem_heap_m.get(),
@@ -409,8 +441,10 @@ ExecutionObject::Impl::SetupProcessKernel()
                                       DeviceArgInfo::Kind::BUFFER),
                         DeviceArgInfo(trace_buf_params_m.get(),
                                       trace_buf_params_sz_m,
-                                      DeviceArgInfo::Kind::BUFFER)
-
+                                      DeviceArgInfo::Kind::BUFFER),
+                        DeviceArgInfo(&context_idx,
+                                      sizeof(uint32_t),
+                                      DeviceArgInfo::Kind::SCALAR)
                       };
 
     k_process_m.reset(new Kernel(device_m,
@@ -452,20 +486,25 @@ static size_t writeDataS8(char *writePtr, const char *ptr, int n, int width,
 //
 // Copy from host buffer to TIDL device buffer
 //
-void ExecutionObject::Impl::HostWriteNetInput()
+void ExecutionObject::Impl::HostWriteNetInput(uint32_t context_idx)
 {
-    const char*     readPtr  = (const char *) in_m.GetArg().ptr();
-    const PipeInfo& pipe     = in_m.GetPipe();
+    const char*     readPtr  = (const char *) in_m[context_idx].GetArg().ptr();
+    const PipeInfo& pipe     = in_m[context_idx].GetPipe();
+    OCL_TIDL_ProcessParams *p_params = shared_process_params_m.get()
+                                       + context_idx;
 
     for (unsigned int i = 0; i < shared_initialize_params_m->numInBufs; i++)
     {
         OCL_TIDL_BufParams *inBuf = &shared_initialize_params_m->inBufs[i];
+        uint32_t context_size = inBuf->bufPlaneWidth * inBuf->bufPlaneHeight;
+                 context_size = (context_size + OCL_TIDL_CACHE_ALIGN - 1) &
+                                (~(OCL_TIDL_CACHE_ALIGN - 1));
+        char *inBufAddr = tidl_extmem_heap_m.get() + inBuf->bufPlaneBufOffset
+                          + context_idx * context_size;
 
-        if (shared_process_params_m->enableInternalInput == 0)
-        {
             readPtr += readDataS8(
                 readPtr,
-                (char *) tidl_extmem_heap_m.get() + inBuf->bufPlaneBufOffset
+                (char *) inBufAddr
                     + inBuf->bufPlaneWidth * OCL_TIDL_MAX_PAD_SIZE
                     + OCL_TIDL_MAX_PAD_SIZE,
                 inBuf->numROIs,
@@ -475,32 +514,34 @@ void ExecutionObject::Impl::HostWriteNetInput()
                 inBuf->bufPlaneWidth,
                 ((inBuf->bufPlaneWidth * inBuf->bufPlaneHeight) /
                  inBuf->numChannels));
-        }
-        else
-        {
-            shared_process_params_m->inBufAddr[i] = pipe.bufAddr_m[i];
-        }
 
-        shared_process_params_m->inDataQ[i]   = pipe.dataQ_m[i];
+        p_params->dataQ[i] = pipe.dataQ_m[i];
     }
 }
 
 //
 // Copy from TIDL device buffer into host buffer
 //
-void ExecutionObject::Impl::HostReadNetOutput()
+void ExecutionObject::Impl::HostReadNetOutput(uint32_t context_idx)
 {
-    char* writePtr = (char *) out_m.GetArg().ptr();
-    PipeInfo& pipe = out_m.GetPipe();
+    char* writePtr = (char *) out_m[context_idx].GetArg().ptr();
+    PipeInfo& pipe = out_m[context_idx].GetPipe();
+    OCL_TIDL_ProcessParams *p_params = shared_process_params_m.get()
+                                       + context_idx;
 
     for (unsigned int i = 0; i < shared_initialize_params_m->numOutBufs; i++)
     {
         OCL_TIDL_BufParams *outBuf = &shared_initialize_params_m->outBufs[i];
+        uint32_t context_size = outBuf->bufPlaneWidth * outBuf->bufPlaneHeight;
+                 context_size = (context_size + OCL_TIDL_CACHE_ALIGN - 1) &
+                                (~(OCL_TIDL_CACHE_ALIGN - 1));
+        char *outBufAddr = tidl_extmem_heap_m.get() + outBuf->bufPlaneBufOffset
+                           + context_idx * context_size;
         if (writePtr != nullptr)
         {
             writePtr += writeDataS8(
                 writePtr,
-                (char *) tidl_extmem_heap_m.get() + outBuf->bufPlaneBufOffset
+                (char *) outBufAddr
                     + outBuf->bufPlaneWidth * OCL_TIDL_MAX_PAD_SIZE
                     + OCL_TIDL_MAX_PAD_SIZE,
                 outBuf->numChannels,
@@ -511,12 +552,8 @@ void ExecutionObject::Impl::HostReadNetOutput()
                  outBuf->numChannels));
         }
 
-        pipe.dataQ_m[i]   = shared_process_params_m->outDataQ[i];
-        pipe.bufAddr_m[i] = shared_initialize_params_m->bufAddrBase
-                           + outBuf->bufPlaneBufOffset;
+        pipe.dataQ_m[i]   = p_params->dataQ[i];
     }
-    shared_process_params_m->bytesWritten = writePtr -
-                                            (char *) out_m.GetArg().ptr();
 }
 
 void ExecutionObject::Impl::ComputeInputOutputSizes()
@@ -550,7 +587,7 @@ void ExecutionObject::Impl::ComputeInputOutputSizes()
 }
 
 
-bool ExecutionObject::Impl::RunAsync(CallType ct)
+bool ExecutionObject::Impl::RunAsync(CallType ct, uint32_t context_idx)
 {
     switch (ct)
     {
@@ -564,14 +601,19 @@ bool ExecutionObject::Impl::RunAsync(CallType ct)
             std::chrono::time_point<std::chrono::steady_clock> t1, t2;
             t1 = std::chrono::steady_clock::now();
 
-            shared_process_params_m->frameIdx = current_frame_idx_m;
-            shared_process_params_m->bytesWritten = 0;
-            HostWriteNetInput();
-            k_process_m->RunAsync();
+            OCL_TIDL_ProcessParams *p_params = shared_process_params_m.get()
+                                               + context_idx;
+            p_params->frameIdx = current_frame_idx_m;
+            HostWriteNetInput(context_idx);
+            {
+                std::unique_lock<std::mutex> lock(mutex_access_m);
+                k_process_m->UpdateScalarArg(3, sizeof(uint32_t), &context_idx);
+                k_process_m->RunAsync(context_idx);
+            }
 
             t2 = std::chrono::steady_clock::now();
             std::chrono::duration<float> elapsed = t2 - t1;
-            host_time_m = elapsed.count() * 1000;
+            host_time_m[context_idx] = elapsed.count() * 1000;
             break;
         }
         case CallType::CLEANUP:
@@ -586,7 +628,7 @@ bool ExecutionObject::Impl::RunAsync(CallType ct)
     return true;
 }
 
-bool ExecutionObject::Impl::Wait(CallType ct)
+bool ExecutionObject::Impl::Wait(CallType ct, uint32_t context_idx)
 {
     switch (ct)
     {
@@ -609,13 +651,15 @@ bool ExecutionObject::Impl::Wait(CallType ct)
             bool has_work = k_process_m->Wait(&host_elapsed_ms);
             if (has_work)
             {
-                if (shared_process_params_m->errorCode != OCL_TIDL_SUCCESS)
-                    throw Exception(shared_process_params_m->errorCode,
+                OCL_TIDL_ProcessParams *p_params = shared_process_params_m.get()
+                                                   + context_idx;
+                if (p_params->errorCode != OCL_TIDL_SUCCESS)
+                    throw Exception(p_params->errorCode,
                                     __FILE__, __FUNCTION__, __LINE__);
 
                 std::chrono::time_point<std::chrono::steady_clock> t1, t2;
                 t1 = std::chrono::steady_clock::now();
-                HostReadNetOutput();
+                HostReadNetOutput(context_idx);
                 t2 = std::chrono::steady_clock::now();
                 std::chrono::duration<float> elapsed = t2 - t1;
                 host_time_m += elapsed.count() * 1000 + host_elapsed_ms;
@@ -635,13 +679,14 @@ bool ExecutionObject::Impl::Wait(CallType ct)
     return false;
 }
 
-bool ExecutionObject::Impl::AddCallback(CallType ct, void *user_data)
+bool ExecutionObject::Impl::AddCallback(CallType ct, void *user_data,
+                                        uint32_t context_idx)
 {
     switch (ct)
     {
         case CallType::PROCESS:
         {
-            return k_process_m->AddCallback(user_data);
+            return k_process_m->AddCallback(user_data, context_idx);
             break;
         }
         default:
@@ -651,7 +696,7 @@ bool ExecutionObject::Impl::AddCallback(CallType ct, void *user_data)
     return false;
 }
 
-uint64_t ExecutionObject::Impl::GetProcessCycles() const
+uint64_t ExecutionObject::Impl::GetProcessCycles(uint32_t context_idx) const
 {
     uint8_t factor = 1;
 
@@ -659,7 +704,9 @@ uint64_t ExecutionObject::Impl::GetProcessCycles() const
     if (device_m->type() == CL_DEVICE_TYPE_CUSTOM)
         factor = 2;
 
-    return shared_process_params_m.get()->cycles * factor;
+    OCL_TIDL_ProcessParams *p_params = shared_process_params_m.get() +
+                                       context_idx;
+    return p_params->cycles * factor;
 }
 
 //
@@ -786,15 +833,26 @@ LayerOutput::~LayerOutput()
     delete[] data_m;
 }
 
-void ExecutionObject::Impl::AcquireLock()
+void ExecutionObject::Impl::AcquireContext(uint32_t& context_idx)
 {
     std::unique_lock<std::mutex> lock(mutex_access_m);
-    cv_access_m.wait(lock, [this]{ return this->is_idle_m; });
-    is_idle_m = false;
+    cv_access_m.wait(lock, [this]{ return this->idle_encoding_m <
+                                   (1 << tidl::internal::NUM_CONTEXTS) - 1; });
+
+    for (uint32_t i = 0; i < tidl::internal::NUM_CONTEXTS; i++)
+        if (((1 << i) & idle_encoding_m) == 0)
+        {
+            context_idx = i;
+            break;
+        }
+    idle_encoding_m |= (1 << context_idx);  // mark the bit as busy
 }
 
-void ExecutionObject::Impl::ReleaseLock()
+void ExecutionObject::Impl::ReleaseContext(uint32_t context_idx)
 {
-    is_idle_m = true;
+    {
+        std::unique_lock<std::mutex> lock(mutex_access_m);
+        idle_encoding_m &= (~(1 << context_idx));  // mark the bit as free
+    }
     cv_access_m.notify_all();
 }
index 1998da3a4daa34dd7a76bd98f05418ee7ff45801..279d1ed7e9c0262807110aa81533ce5283b26208 100644 (file)
@@ -32,6 +32,7 @@
 #include <chrono>
 #include "device_arginfo.h"
 #include "execution_object_pipeline.h"
+#include "parameters.h"
 
 using namespace tidl;
 
@@ -63,8 +64,9 @@ class ExecutionObjectPipeline::Impl
         //! current frame index
         int frame_idx_m;
 
-        //! current execution object index
+        //! current execution object index, and its context index
         uint32_t curr_eo_idx_m;
+        uint32_t curr_eo_context_idx_m;
 
         // device and host time tracking: pipeline start to finish
         float device_time_m;
@@ -150,7 +152,7 @@ bool ExecutionObjectPipeline::ProcessFrameStartAsync()
     bool st = pimpl_m->RunAsyncStart();
     if (st)
         st = pimpl_m->eos_m[0]->AddCallback(ExecutionObject::CallType::PROCESS,
-                                            this);
+                                         this, pimpl_m->curr_eo_context_idx_m);
     return st;
 }
 
@@ -169,7 +171,8 @@ void ExecutionObjectPipeline::RunAsyncNext()
     bool has_next = pimpl_m->RunAsyncNext();
     if (has_next)
         pimpl_m->eos_m[pimpl_m->curr_eo_idx_m]->AddCallback(
-                                     ExecutionObject::CallType::PROCESS, this);
+                                     ExecutionObject::CallType::PROCESS, this,
+                                     pimpl_m->curr_eo_context_idx_m);
 }
 
 float ExecutionObjectPipeline::GetProcessTimeInMilliSeconds() const
@@ -306,37 +309,46 @@ bool ExecutionObjectPipeline::Impl::RunAsyncStart()
     device_time_m = 0.0f;
     host_time_m = 0.0f;
     curr_eo_idx_m = 0;
-    eos_m[0]->AcquireLock();
-    start_m = std::chrono::steady_clock::now();
-    eos_m[0]->SetInputOutputBuffer(iobufs_m[0], iobufs_m[1]);
-    return eos_m[0]->ProcessFrameStartAsync();
+    eos_m[0]->AcquireContext(curr_eo_context_idx_m);
+    if (tidl::internal::NUM_CONTEXTS == 1)
+        start_m = std::chrono::steady_clock::now();
+    eos_m[0]->SetFrameIndex(frame_idx_m);
+    eos_m[0]->SetInputOutputBuffer(iobufs_m[0], iobufs_m[1],
+                                   curr_eo_context_idx_m);
+    return eos_m[0]->ProcessFrameStartAsync(curr_eo_context_idx_m);
 }
 
 // returns true if we have more EOs to execute
 bool ExecutionObjectPipeline::Impl::RunAsyncNext()
 {
-    eos_m[curr_eo_idx_m]->ProcessFrameWait();
+    eos_m[curr_eo_idx_m]->ProcessFrameWait(curr_eo_context_idx_m);
     // need to capture EO's device/host time before we release its lock
     eo_device_time_m[curr_eo_idx_m] = eos_m[curr_eo_idx_m]->
-                                                GetProcessTimeInMilliSeconds();
+                           GetProcessTimeInMilliSeconds(curr_eo_context_idx_m);
     eo_host_time_m[curr_eo_idx_m]   = eos_m[curr_eo_idx_m]->
-                                            GetHostProcessTimeInMilliSeconds();
+                       GetHostProcessTimeInMilliSeconds(curr_eo_context_idx_m);
     device_time_m += eo_device_time_m[curr_eo_idx_m];
-    eos_m[curr_eo_idx_m]->ReleaseLock();
+    if (tidl::internal::NUM_CONTEXTS > 1)
+        host_time_m += eo_host_time_m[curr_eo_idx_m];
+    eos_m[curr_eo_idx_m]->ReleaseContext(curr_eo_context_idx_m);
     curr_eo_idx_m += 1;
     if (curr_eo_idx_m < eos_m.size())
     {
-        eos_m[curr_eo_idx_m]->AcquireLock();
+        eos_m[curr_eo_idx_m]->AcquireContext(curr_eo_context_idx_m);
+        eos_m[curr_eo_idx_m]->SetFrameIndex(frame_idx_m);
         eos_m[curr_eo_idx_m]->SetInputOutputBuffer(iobufs_m[curr_eo_idx_m],
-                                                   iobufs_m[curr_eo_idx_m+1]);
-        eos_m[curr_eo_idx_m]->ProcessFrameStartAsync();
+                          iobufs_m[curr_eo_idx_m+1], curr_eo_context_idx_m);
+        eos_m[curr_eo_idx_m]->ProcessFrameStartAsync(curr_eo_context_idx_m);
         return true;
     }
     else
     {
-        std::chrono::duration<float> elapsed = std::chrono::steady_clock::now()
-                                               - start_m;
-        host_time_m = elapsed.count() * 1000;  // seconds to milliseconds
+        if (tidl::internal::NUM_CONTEXTS == 1)
+        {
+            std::chrono::duration<float> elapsed =
+                                    std::chrono::steady_clock::now() - start_m;
+            host_time_m = elapsed.count() * 1000;  // seconds to milliseconds
+        }
         is_processed_m = true;
         cv_m.notify_all();
         return false;
index 5768627521a8f33ed46e508b13eb4cba6d9363ab..508c5498a352d43ab99c24390ffbd9010d3d1d88 100644 (file)
@@ -262,7 +262,7 @@ bool EveDevice::BuildProgramFromBinary(const std::string& kernel_names,
 Kernel::Kernel(Device* device, const std::string& name,
                const KernelArgs& args, uint8_t device_index):
            name_m(name), device_m(device), device_index_m(device_index),
-           is_running_m(false)
+           num_running_contexts_m(0)
 {
     TRACE::print("Creating kernel %s\n", name.c_str());
     cl_int err;
@@ -304,45 +304,52 @@ Kernel::Kernel(Device* device, const std::string& name,
     }
 }
 
-Kernel& Kernel::RunAsync()
+bool Kernel::UpdateScalarArg(uint32_t index, size_t size, const void *value)
+{
+    cl_int ret = clSetKernelArg(kernel_m, index, size, value);
+    return ret == CL_SUCCESS;
+}
+
+Kernel& Kernel::RunAsync(uint32_t context_idx)
 {
     // Execute kernel
-    TRACE::print("\tKernel: device %d executing %s\n", device_index_m,
-                                                       name_m.c_str());
+    TRACE::print("\tKernel: device %d executing %s, context %d\n",
+                 device_index_m, name_m.c_str(), context_idx);
     cl_int ret = clEnqueueTask(device_m->queue_m[device_index_m],
-                               kernel_m, 0, 0, &event_m);
+                               kernel_m, 0, 0, &event_m[context_idx]);
     errorCheck(ret, __LINE__);
-    is_running_m = true;
+    __sync_fetch_and_add(&num_running_contexts_m, 1);
 
     return *this;
 }
 
 
-bool Kernel::Wait(float *host_elapsed_ms)
+bool Kernel::Wait(float *host_elapsed_ms, uint32_t context_idx)
 {
     // Wait called without a corresponding RunAsync
-    if (!is_running_m)
+    if (num_running_contexts_m == 0)
         return false;
 
-    TRACE::print("\tKernel: waiting...\n");
-    cl_int ret = clWaitForEvents(1, &event_m);
+    TRACE::print("\tKernel: waiting context %d...\n", context_idx);
+    cl_int ret = clWaitForEvents(1, &event_m[context_idx]);
     errorCheck(ret, __LINE__);
 
     if (host_elapsed_ms != nullptr)
     {
         cl_ulong t_que, t_end;
-        clGetEventProfilingInfo(event_m, CL_PROFILING_COMMAND_QUEUED,
+        clGetEventProfilingInfo(event_m[context_idx],
+                                CL_PROFILING_COMMAND_QUEUED,
                                 sizeof(cl_ulong), &t_que, nullptr);
-        clGetEventProfilingInfo(event_m, CL_PROFILING_COMMAND_END,
+        clGetEventProfilingInfo(event_m[context_idx], CL_PROFILING_COMMAND_END,
                                 sizeof(cl_ulong), &t_end, nullptr);
         *host_elapsed_ms = (t_end - t_que) / 1.0e6;  // nano to milli seconds
     }
 
-    ret = clReleaseEvent(event_m);
+    ret = clReleaseEvent(event_m[context_idx]);
     errorCheck(ret, __LINE__);
     TRACE::print("\tKernel: finished execution\n");
 
-    is_running_m = false;
+    __sync_fetch_and_sub(&num_running_contexts_m, 1);
     return true;
 }
 
@@ -355,11 +362,11 @@ void EventCallback(cl_event event, cl_int exec_status, void *user_data)
     if (CallbackWrapper)  CallbackWrapper(user_data);
 }
 
-bool Kernel::AddCallback(void *user_data)
+bool Kernel::AddCallback(void *user_data, uint32_t context_idx)
 {
-    if (! is_running_m)  return false;
-    return clSetEventCallback(event_m, CL_COMPLETE, EventCallback, user_data)
-           == CL_SUCCESS;
+    if (num_running_contexts_m == 0)  return false;
+    return clSetEventCallback(event_m[context_idx], CL_COMPLETE, EventCallback,
+                              user_data) == CL_SUCCESS;
 }
 
 Kernel::~Kernel()
index 04c5db6af4fef1f0ab7984f139a98a495ef0a2ba..7b627e619ce5efce189798fd11e612a9b5b7584c 100644 (file)
@@ -39,6 +39,7 @@
 #include <memory>
 #include "executor.h"
 #include "device_arginfo.h"
+#include "parameters.h"
 
 namespace tidl
 {
@@ -142,19 +143,20 @@ class Kernel
                const KernelArgs &args, uint8_t device_index);
         ~Kernel();
 
-        Kernel& RunAsync();
-        bool Wait(float *host_elapsed_ms = nullptr);
-        bool AddCallback(void *user_data);
+        bool UpdateScalarArg(uint32_t index, size_t size, const void *value);
+        Kernel& RunAsync(uint32_t context_idx = 0);
+        bool Wait(float *host_elapsed_ms = nullptr, uint32_t context_idx = 0);
+        bool AddCallback(void *user_data, uint32_t context_idx = 0);
 
     private:
         cl_kernel           kernel_m;
-        cl_event            event_m;
+        cl_event            event_m[tidl::internal::NUM_CONTEXTS];
         std::vector<cl_mem> buffers_m;
         const std::string   name_m;
 
         Device*             device_m;
         uint8_t             device_index_m;
-        bool                is_running_m;
+        uint32_t            num_running_contexts_m;
 };
 
 
index 3ea047eeb8bf36a9d15e5304705104afaa9fb65f..6c6635830e4489f5ef338cc442318a387c3a5e3e 100644 (file)
@@ -36,6 +36,7 @@ const size_t DMEM1_SIZE             = 128*1024;
 const size_t OCMC_SIZE              = 320*1024;
 const int    CURR_LAYERS_GROUP_ID   = 1;
 const int    CURR_CORE_ID           = 1;
+const int    NUM_CONTEXTS           = 2;
 
 }
 }