From: Yuan Zhao Date: Thu, 4 Oct 2018 16:39:08 +0000 (-0500) Subject: Enqueue multiple frames at device side X-Git-Tag: v01.02.00^2~23 X-Git-Url: https://git.ti.com/gitweb?p=tidl%2Ftidl-api.git;a=commitdiff_plain;h=f4aea3acecbc3fda639eb725bd16ec7f1cb73a99 Enqueue multiple frames at device side - Previous implementation would not send/enqueue next frame to device until the host has received completion message for current frame. The improvement is to create multiple sets/contexts of internal TIDL input/output buffers at device side, and to send/enqueue next frame using a different set/context of internal TIDL input/output buffers to device while device is still processing the current frame. When device finishes current frame, it can immediately read its messageQ and start processing the next frame, without waiting for the completion message reaching the host and the host sending the next frame. - In pipelined processing of multiple frames, this optimization can effectively hide the round-trip communication between host and device. 
- Removed deprecated enableInternalInput feature - MCT-1059 --- diff --git a/tidl_api/dsp/ocl_wrapper.cl b/tidl_api/dsp/ocl_wrapper.cl index 698d074..e75ed1d 100644 --- a/tidl_api/dsp/ocl_wrapper.cl +++ b/tidl_api/dsp/ocl_wrapper.cl @@ -55,10 +55,11 @@ void ocl_tidl_initialize(global unsigned char* createParams, kernel void ocl_tidl_process(global OCL_TIDL_ProcessParams* processParams, global unsigned char* externalMemoryHeapBase, - global unsigned char* traceBufferParams) + global unsigned char* traceBufferParams, + uint32_t contextIndex) { ocl_dsp_tidl_process(processParams, externalMemoryHeapBase, - traceBufferParams); + traceBufferParams, contextIndex); } diff --git a/tidl_api/inc/execution_object.h b/tidl_api/inc/execution_object.h index dad5866..d875d7b 100644 --- a/tidl_api/inc/execution_object.h +++ b/tidl_api/inc/execution_object.h @@ -66,6 +66,13 @@ class ExecutionObject : public ExecutionObjectInternalInterface void SetInputOutputBuffer(const ArgInfo& in, const ArgInfo& out) override; + //! Specify the input and output buffers used by the EO in a context + //! @param in buffer used for input. + //! @param out buffer used for output. + //! @param context_idx the index of the context + void SetInputOutputBuffer(const ArgInfo& in, + const ArgInfo& out, uint32_t context_idx); + //! Returns a pointer to the input buffer set via SetInputOutputBuffer char* GetInputBufferPtr() const override; @@ -90,22 +97,46 @@ class ExecutionObject : public ExecutionObjectInternalInterface //! returns immediately. Use ExecutionObject::ProcessFrameWait to wait bool ProcessFrameStartAsync() override; + //! @brief Start processing with a context. The call is asynchronous and + //! returns immediately. Use ExecutionObject::ProcessFrameWait to wait + //! @param context_idx the index of the context + bool ProcessFrameStartAsync(uint32_t context_idx); + //! Wait for the execution object to complete processing a frame //! 
@return false if ExecutionObject::ProcessFrameWait was called //! without a corresponding call to //! ExecutionObject::ProcessFrameStartAsync. bool ProcessFrameWait() override; + //! Wait for the execution object to complete processing with a context + //! @param context_idx the index of the context + //! @return false if ExecutionObject::ProcessFrameWait was called + //! without a corresponding call to + //! ExecutionObject::ProcessFrameStartAsync. + bool ProcessFrameWait(uint32_t context_idx); + //! @brief return the number of milliseconds taken *on the device* to //! execute the process call //! @return Number of milliseconds to process a frame on the device. float GetProcessTimeInMilliSeconds() const override; + //! @brief return the number of milliseconds taken *on the device* to + //! execute the process call with a contex + //! @param context_idx the index of the context + //! @return Number of milliseconds to process a frame on the device. + float GetProcessTimeInMilliSeconds(uint32_t context_idx) const; + //! @brief return the number of milliseconds taken *on the host* to //! execute the process call //! @return Number of milliseconds to process a frame on the host. float GetHostProcessTimeInMilliSeconds() const override; + //! @brief return the number of milliseconds taken *on the host* to + //! execute the process call with a contex + //! @param context_idx the index of the context + //! @return Number of milliseconds to process a frame on the host. + float GetHostProcessTimeInMilliSeconds(uint32_t context_idx) const; + //! Returns the device name that the ExecutionObject runs on const std::string& GetDeviceName() const override; @@ -137,9 +168,9 @@ class ExecutionObject : public ExecutionObjectInternalInterface //! 
@private // Used by the ExecutionObjectPipeline - bool AddCallback(CallType ct, void *user_data); - void AcquireLock(); - void ReleaseLock(); + bool AddCallback(CallType ct, void *user_data, uint32_t context_idx); + void AcquireContext(uint32_t& context_idx); + void ReleaseContext(uint32_t context_idx); ExecutionObject() = delete; ExecutionObject(const ExecutionObject&) = delete; @@ -147,7 +178,7 @@ class ExecutionObject : public ExecutionObjectInternalInterface //! @private void SetInputOutputBuffer(const IODeviceArgInfo* in, - const IODeviceArgInfo* out); + const IODeviceArgInfo* out, uint32_t context_idx); private: class Impl; diff --git a/tidl_api/src/device_arginfo.h b/tidl_api/src/device_arginfo.h index 841e0c1..f2052df 100644 --- a/tidl_api/src/device_arginfo.h +++ b/tidl_api/src/device_arginfo.h @@ -69,7 +69,6 @@ class PipeInfo { public: uint32_t dataQ_m[OCL_TIDL_MAX_IN_BUFS]; - uint32_t bufAddr_m[OCL_TIDL_MAX_IN_BUFS]; }; /*! @class IODeviceArgInfo diff --git a/tidl_api/src/execution_object.cpp b/tidl_api/src/execution_object.cpp index 00d6804..9bc9f05 100644 --- a/tidl_api/src/execution_object.cpp +++ b/tidl_api/src/execution_object.cpp @@ -55,14 +55,14 @@ class ExecutionObject::Impl int layers_group_id); ~Impl() {} - bool RunAsync(CallType ct); - bool Wait (CallType ct); - bool AddCallback(CallType ct, void *user_data); + bool RunAsync(CallType ct, uint32_t context_idx); + bool Wait (CallType ct, uint32_t context_idx); + bool AddCallback(CallType ct, void *user_data, uint32_t context_idx); - uint64_t GetProcessCycles() const; + uint64_t GetProcessCycles(uint32_t context_idx) const; int GetLayersGroupId() const; - void AcquireLock(); - void ReleaseLock(); + void AcquireContext(uint32_t& context_idx); + void ReleaseContext(uint32_t context_idx); Device* device_m; // Index of the OpenCL device/queue used by this EO @@ -75,8 +75,8 @@ class ExecutionObject::Impl size_t in_size_m; size_t out_size_m; - IODeviceArgInfo in_m; - IODeviceArgInfo out_m; + 
IODeviceArgInfo in_m[tidl::internal::NUM_CONTEXTS]; + IODeviceArgInfo out_m[tidl::internal::NUM_CONTEXTS]; // Frame being processed by the EO int current_frame_idx_m; @@ -96,7 +96,7 @@ class ExecutionObject::Impl size_t trace_buf_params_sz_m; // host time tracking: eo start to finish - float host_time_m; + float host_time_m[tidl::internal::NUM_CONTEXTS]; private: void SetupInitializeKernel(const DeviceArgInfo& create_arg, @@ -104,8 +104,8 @@ class ExecutionObject::Impl void EnableOutputBufferTrace(); void SetupProcessKernel(); - void HostWriteNetInput(); - void HostReadNetOutput(); + void HostWriteNetInput(uint32_t context_idx); + void HostReadNetOutput(uint32_t context_idx); void ComputeInputOutputSizes(); std::unique_ptr k_initialize_m; @@ -113,7 +113,8 @@ class ExecutionObject::Impl std::unique_ptr k_cleanup_m; // Guarding sole access to input/output for one frame during execution - bool is_idle_m; + // Encoding: context at bit index, bit value: 0 for idle, 1 for busy + uint32_t idle_encoding_m; std::mutex mutex_access_m; std::condition_variable cv_access_m; @@ -155,8 +156,6 @@ ExecutionObject::Impl::Impl(Device* d, uint8_t device_index, shared_process_params_m(nullptr, &__free_ddr), in_size_m(0), out_size_m(0), - in_m(), - out_m(), current_frame_idx_m(0), layers_group_id_m(layers_group_id), num_network_layers_m(0), @@ -165,7 +164,7 @@ ExecutionObject::Impl::Impl(Device* d, uint8_t device_index, k_initialize_m(nullptr), k_process_m(nullptr), k_cleanup_m(nullptr), - is_idle_m(true), + idle_encoding_m(0), // all contexts are idle configuration_m(configuration) { device_name_m = device_m->GetDeviceName() + std::to_string(device_index_m); @@ -189,7 +188,7 @@ ExecutionObject::~ExecutionObject() = default; char* ExecutionObject::GetInputBufferPtr() const { - return static_cast(pimpl_m->in_m.GetArg().ptr()); + return static_cast(pimpl_m->in_m[0].GetArg().ptr()); } size_t ExecutionObject::GetInputBufferSizeInBytes() const @@ -199,7 +198,7 @@ size_t 
ExecutionObject::GetInputBufferSizeInBytes() const char* ExecutionObject::GetOutputBufferPtr() const { - return static_cast(pimpl_m->out_m.GetArg().ptr()); + return static_cast(pimpl_m->out_m[0].GetArg().ptr()); } size_t ExecutionObject::GetOutputBufferSizeInBytes() const @@ -217,59 +216,89 @@ int ExecutionObject::GetFrameIndex() const return pimpl_m->current_frame_idx_m; } -void ExecutionObject::SetInputOutputBuffer(const ArgInfo& in, const ArgInfo& out) +void ExecutionObject::SetInputOutputBuffer(const ArgInfo& in, + const ArgInfo& out) +{ + SetInputOutputBuffer(in, out, 0); +} + +void ExecutionObject::SetInputOutputBuffer(const ArgInfo& in, + const ArgInfo& out, uint32_t context_idx) { assert(in.ptr() != nullptr && in.size() >= pimpl_m->in_size_m); assert(out.ptr() != nullptr && out.size() >= pimpl_m->out_size_m); - pimpl_m->in_m = IODeviceArgInfo(in); - pimpl_m->out_m = IODeviceArgInfo(out); + pimpl_m->in_m[context_idx] = IODeviceArgInfo(in); + pimpl_m->out_m[context_idx] = IODeviceArgInfo(out); } void ExecutionObject::SetInputOutputBuffer(const IODeviceArgInfo* in, - const IODeviceArgInfo* out) + const IODeviceArgInfo* out, + uint32_t context_idx) { - pimpl_m->in_m = *in; - pimpl_m->out_m = *out; + pimpl_m->in_m[context_idx] = *in; + pimpl_m->out_m[context_idx] = *out; } bool ExecutionObject::ProcessFrameStartAsync() { - TRACE::print("-> ExecutionObject::ProcessFrameStartAsync()\n"); + return ProcessFrameStartAsync(0); +} + +bool ExecutionObject::ProcessFrameStartAsync(uint32_t context_idx) +{ + TRACE::print("-> ExecutionObject::ProcessFrameStartAsync(%d)\n", + context_idx); assert(GetInputBufferPtr() != nullptr && GetOutputBufferPtr() != nullptr); - return pimpl_m->RunAsync(ExecutionObject::CallType::PROCESS); + return pimpl_m->RunAsync(ExecutionObject::CallType::PROCESS, context_idx); } bool ExecutionObject::ProcessFrameWait() { - TRACE::print("-> ExecutionObject::ProcessFrameWait()\n"); - return pimpl_m->Wait(ExecutionObject::CallType::PROCESS); + return 
ProcessFrameWait(0); +} + +bool ExecutionObject::ProcessFrameWait(uint32_t context_idx) +{ + TRACE::print("-> ExecutionObject::ProcessFrameWait(%d)\n", context_idx); + return pimpl_m->Wait(ExecutionObject::CallType::PROCESS, context_idx); } bool ExecutionObject::RunAsync (CallType ct) { - return pimpl_m->RunAsync(ct); + return pimpl_m->RunAsync(ct, 0); } bool ExecutionObject::Wait (CallType ct) { - return pimpl_m->Wait(ct); + return pimpl_m->Wait(ct, 0); } -bool ExecutionObject::AddCallback(CallType ct, void *user_data) +bool ExecutionObject::AddCallback(CallType ct, void *user_data, + uint32_t context_idx) { - return pimpl_m->AddCallback(ct, user_data); + return pimpl_m->AddCallback(ct, user_data, context_idx); } float ExecutionObject::GetProcessTimeInMilliSeconds() const +{ + return GetProcessTimeInMilliSeconds(0); +} + +float ExecutionObject::GetProcessTimeInMilliSeconds(uint32_t context_idx) const { float frequency = pimpl_m->device_m->GetFrequencyInMhz() * 1000000; - return ((float)pimpl_m->GetProcessCycles()) / frequency * 1000; + return ((float)pimpl_m->GetProcessCycles(context_idx)) / frequency * 1000; } float ExecutionObject::GetHostProcessTimeInMilliSeconds() const { - return pimpl_m->host_time_m; + return GetHostProcessTimeInMilliSeconds(0); +} + +float ExecutionObject::GetHostProcessTimeInMilliSeconds(uint32_t context_idx) const +{ + return pimpl_m->host_time_m[context_idx]; } void @@ -299,14 +328,14 @@ const std::string& ExecutionObject::GetDeviceName() const return pimpl_m->device_name_m; } -void ExecutionObject::AcquireLock() +void ExecutionObject::AcquireContext(uint32_t& context_idx) { - pimpl_m->AcquireLock(); + pimpl_m->AcquireContext(context_idx); } -void ExecutionObject::ReleaseLock() +void ExecutionObject::ReleaseContext(uint32_t context_idx) { - pimpl_m->ReleaseLock(); + pimpl_m->ReleaseContext(context_idx); } // @@ -334,7 +363,7 @@ ExecutionObject::Impl::SetupInitializeKernel(const DeviceArgInfo& create_arg, 
shared_initialize_params_m->tidlHeapSize =configuration_m.NETWORK_HEAP_SIZE; shared_initialize_params_m->l2HeapSize = tidl::internal::DMEM1_SIZE; shared_initialize_params_m->l1HeapSize = tidl::internal::DMEM0_SIZE; - shared_initialize_params_m->enableInternalInput = 0; + shared_initialize_params_m->numContexts = tidl::internal::NUM_CONTEXTS; // Set up execution trace specified in the configuration EnableExecutionTrace(configuration_m, @@ -392,16 +421,19 @@ void ExecutionObject::Impl::EnableOutputBufferTrace() void ExecutionObject::Impl::SetupProcessKernel() { - shared_process_params_m.reset(malloc_ddr()); - shared_process_params_m->enableInternalInput = - shared_initialize_params_m->enableInternalInput; - shared_process_params_m->cycles = 0; + shared_process_params_m.reset(malloc_ddr( + tidl::internal::NUM_CONTEXTS * sizeof(OCL_TIDL_ProcessParams))); // Set up execution trace specified in the configuration - EnableExecutionTrace(configuration_m, - &shared_process_params_m->enableTrace); + for (int i = 0; i < tidl::internal::NUM_CONTEXTS; i++) + { + OCL_TIDL_ProcessParams *p_params = shared_process_params_m.get() + i; + EnableExecutionTrace(configuration_m, &p_params->enableTrace); + } + uint32_t context_idx = 0; KernelArgs args = { DeviceArgInfo(shared_process_params_m.get(), + tidl::internal::NUM_CONTEXTS * sizeof(OCL_TIDL_ProcessParams), DeviceArgInfo::Kind::BUFFER), DeviceArgInfo(tidl_extmem_heap_m.get(), @@ -409,8 +441,10 @@ ExecutionObject::Impl::SetupProcessKernel() DeviceArgInfo::Kind::BUFFER), DeviceArgInfo(trace_buf_params_m.get(), trace_buf_params_sz_m, - DeviceArgInfo::Kind::BUFFER) - + DeviceArgInfo::Kind::BUFFER), + DeviceArgInfo(&context_idx, + sizeof(uint32_t), + DeviceArgInfo::Kind::SCALAR) }; k_process_m.reset(new Kernel(device_m, @@ -452,20 +486,25 @@ static size_t writeDataS8(char *writePtr, const char *ptr, int n, int width, // // Copy from host buffer to TIDL device buffer // -void ExecutionObject::Impl::HostWriteNetInput() +void 
ExecutionObject::Impl::HostWriteNetInput(uint32_t context_idx) { - const char* readPtr = (const char *) in_m.GetArg().ptr(); - const PipeInfo& pipe = in_m.GetPipe(); + const char* readPtr = (const char *) in_m[context_idx].GetArg().ptr(); + const PipeInfo& pipe = in_m[context_idx].GetPipe(); + OCL_TIDL_ProcessParams *p_params = shared_process_params_m.get() + + context_idx; for (unsigned int i = 0; i < shared_initialize_params_m->numInBufs; i++) { OCL_TIDL_BufParams *inBuf = &shared_initialize_params_m->inBufs[i]; + uint32_t context_size = inBuf->bufPlaneWidth * inBuf->bufPlaneHeight; + context_size = (context_size + OCL_TIDL_CACHE_ALIGN - 1) & + (~(OCL_TIDL_CACHE_ALIGN - 1)); + char *inBufAddr = tidl_extmem_heap_m.get() + inBuf->bufPlaneBufOffset + + context_idx * context_size; - if (shared_process_params_m->enableInternalInput == 0) - { readPtr += readDataS8( readPtr, - (char *) tidl_extmem_heap_m.get() + inBuf->bufPlaneBufOffset + (char *) inBufAddr + inBuf->bufPlaneWidth * OCL_TIDL_MAX_PAD_SIZE + OCL_TIDL_MAX_PAD_SIZE, inBuf->numROIs, @@ -475,32 +514,34 @@ void ExecutionObject::Impl::HostWriteNetInput() inBuf->bufPlaneWidth, ((inBuf->bufPlaneWidth * inBuf->bufPlaneHeight) / inBuf->numChannels)); - } - else - { - shared_process_params_m->inBufAddr[i] = pipe.bufAddr_m[i]; - } - shared_process_params_m->inDataQ[i] = pipe.dataQ_m[i]; + p_params->dataQ[i] = pipe.dataQ_m[i]; } } // // Copy from TIDL device buffer into host buffer // -void ExecutionObject::Impl::HostReadNetOutput() +void ExecutionObject::Impl::HostReadNetOutput(uint32_t context_idx) { - char* writePtr = (char *) out_m.GetArg().ptr(); - PipeInfo& pipe = out_m.GetPipe(); + char* writePtr = (char *) out_m[context_idx].GetArg().ptr(); + PipeInfo& pipe = out_m[context_idx].GetPipe(); + OCL_TIDL_ProcessParams *p_params = shared_process_params_m.get() + + context_idx; for (unsigned int i = 0; i < shared_initialize_params_m->numOutBufs; i++) { OCL_TIDL_BufParams *outBuf = 
&shared_initialize_params_m->outBufs[i]; + uint32_t context_size = outBuf->bufPlaneWidth * outBuf->bufPlaneHeight; + context_size = (context_size + OCL_TIDL_CACHE_ALIGN - 1) & + (~(OCL_TIDL_CACHE_ALIGN - 1)); + char *outBufAddr = tidl_extmem_heap_m.get() + outBuf->bufPlaneBufOffset + + context_idx * context_size; if (writePtr != nullptr) { writePtr += writeDataS8( writePtr, - (char *) tidl_extmem_heap_m.get() + outBuf->bufPlaneBufOffset + (char *) outBufAddr + outBuf->bufPlaneWidth * OCL_TIDL_MAX_PAD_SIZE + OCL_TIDL_MAX_PAD_SIZE, outBuf->numChannels, @@ -511,12 +552,8 @@ void ExecutionObject::Impl::HostReadNetOutput() outBuf->numChannels)); } - pipe.dataQ_m[i] = shared_process_params_m->outDataQ[i]; - pipe.bufAddr_m[i] = shared_initialize_params_m->bufAddrBase - + outBuf->bufPlaneBufOffset; + pipe.dataQ_m[i] = p_params->dataQ[i]; } - shared_process_params_m->bytesWritten = writePtr - - (char *) out_m.GetArg().ptr(); } void ExecutionObject::Impl::ComputeInputOutputSizes() @@ -550,7 +587,7 @@ void ExecutionObject::Impl::ComputeInputOutputSizes() } -bool ExecutionObject::Impl::RunAsync(CallType ct) +bool ExecutionObject::Impl::RunAsync(CallType ct, uint32_t context_idx) { switch (ct) { @@ -564,14 +601,19 @@ bool ExecutionObject::Impl::RunAsync(CallType ct) std::chrono::time_point t1, t2; t1 = std::chrono::steady_clock::now(); - shared_process_params_m->frameIdx = current_frame_idx_m; - shared_process_params_m->bytesWritten = 0; - HostWriteNetInput(); - k_process_m->RunAsync(); + OCL_TIDL_ProcessParams *p_params = shared_process_params_m.get() + + context_idx; + p_params->frameIdx = current_frame_idx_m; + HostWriteNetInput(context_idx); + { + std::unique_lock lock(mutex_access_m); + k_process_m->UpdateScalarArg(3, sizeof(uint32_t), &context_idx); + k_process_m->RunAsync(context_idx); + } t2 = std::chrono::steady_clock::now(); std::chrono::duration elapsed = t2 - t1; - host_time_m = elapsed.count() * 1000; + host_time_m[context_idx] = elapsed.count() * 1000; break; } 
case CallType::CLEANUP: @@ -586,7 +628,7 @@ bool ExecutionObject::Impl::RunAsync(CallType ct) return true; } -bool ExecutionObject::Impl::Wait(CallType ct) +bool ExecutionObject::Impl::Wait(CallType ct, uint32_t context_idx) { switch (ct) { @@ -609,13 +651,15 @@ bool ExecutionObject::Impl::Wait(CallType ct) bool has_work = k_process_m->Wait(&host_elapsed_ms); if (has_work) { - if (shared_process_params_m->errorCode != OCL_TIDL_SUCCESS) - throw Exception(shared_process_params_m->errorCode, + OCL_TIDL_ProcessParams *p_params = shared_process_params_m.get() + + context_idx; + if (p_params->errorCode != OCL_TIDL_SUCCESS) + throw Exception(p_params->errorCode, __FILE__, __FUNCTION__, __LINE__); std::chrono::time_point t1, t2; t1 = std::chrono::steady_clock::now(); - HostReadNetOutput(); + HostReadNetOutput(context_idx); t2 = std::chrono::steady_clock::now(); std::chrono::duration elapsed = t2 - t1; host_time_m += elapsed.count() * 1000 + host_elapsed_ms; @@ -635,13 +679,14 @@ bool ExecutionObject::Impl::Wait(CallType ct) return false; } -bool ExecutionObject::Impl::AddCallback(CallType ct, void *user_data) +bool ExecutionObject::Impl::AddCallback(CallType ct, void *user_data, + uint32_t context_idx) { switch (ct) { case CallType::PROCESS: { - return k_process_m->AddCallback(user_data); + return k_process_m->AddCallback(user_data, context_idx); break; } default: @@ -651,7 +696,7 @@ bool ExecutionObject::Impl::AddCallback(CallType ct, void *user_data) return false; } -uint64_t ExecutionObject::Impl::GetProcessCycles() const +uint64_t ExecutionObject::Impl::GetProcessCycles(uint32_t context_idx) const { uint8_t factor = 1; @@ -659,7 +704,9 @@ uint64_t ExecutionObject::Impl::GetProcessCycles() const if (device_m->type() == CL_DEVICE_TYPE_CUSTOM) factor = 2; - return shared_process_params_m.get()->cycles * factor; + OCL_TIDL_ProcessParams *p_params = shared_process_params_m.get() + + context_idx; + return p_params->cycles * factor; } // @@ -786,15 +833,26 @@ 
LayerOutput::~LayerOutput() delete[] data_m; } -void ExecutionObject::Impl::AcquireLock() +void ExecutionObject::Impl::AcquireContext(uint32_t& context_idx) { std::unique_lock lock(mutex_access_m); - cv_access_m.wait(lock, [this]{ return this->is_idle_m; }); - is_idle_m = false; + cv_access_m.wait(lock, [this]{ return this->idle_encoding_m < + (1 << tidl::internal::NUM_CONTEXTS) - 1; }); + + for (uint32_t i = 0; i < tidl::internal::NUM_CONTEXTS; i++) + if (((1 << i) & idle_encoding_m) == 0) + { + context_idx = i; + break; + } + idle_encoding_m |= (1 << context_idx); // mark the bit as busy } -void ExecutionObject::Impl::ReleaseLock() +void ExecutionObject::Impl::ReleaseContext(uint32_t context_idx) { - is_idle_m = true; + { + std::unique_lock lock(mutex_access_m); + idle_encoding_m &= (~(1 << context_idx)); // mark the bit as free + } cv_access_m.notify_all(); } diff --git a/tidl_api/src/execution_object_pipeline.cpp b/tidl_api/src/execution_object_pipeline.cpp index 1998da3..279d1ed 100644 --- a/tidl_api/src/execution_object_pipeline.cpp +++ b/tidl_api/src/execution_object_pipeline.cpp @@ -32,6 +32,7 @@ #include #include "device_arginfo.h" #include "execution_object_pipeline.h" +#include "parameters.h" using namespace tidl; @@ -63,8 +64,9 @@ class ExecutionObjectPipeline::Impl //! current frame index int frame_idx_m; - //! current execution object index + //! 
current execution object index, and it context index uint32_t curr_eo_idx_m; + uint32_t curr_eo_context_idx_m; // device and host time tracking: pipeline start to finish float device_time_m; @@ -150,7 +152,7 @@ bool ExecutionObjectPipeline::ProcessFrameStartAsync() bool st = pimpl_m->RunAsyncStart(); if (st) st = pimpl_m->eos_m[0]->AddCallback(ExecutionObject::CallType::PROCESS, - this); + this, pimpl_m->curr_eo_context_idx_m); return st; } @@ -169,7 +171,8 @@ void ExecutionObjectPipeline::RunAsyncNext() bool has_next = pimpl_m->RunAsyncNext(); if (has_next) pimpl_m->eos_m[pimpl_m->curr_eo_idx_m]->AddCallback( - ExecutionObject::CallType::PROCESS, this); + ExecutionObject::CallType::PROCESS, this, + pimpl_m->curr_eo_context_idx_m); } float ExecutionObjectPipeline::GetProcessTimeInMilliSeconds() const @@ -306,37 +309,46 @@ bool ExecutionObjectPipeline::Impl::RunAsyncStart() device_time_m = 0.0f; host_time_m = 0.0f; curr_eo_idx_m = 0; - eos_m[0]->AcquireLock(); - start_m = std::chrono::steady_clock::now(); - eos_m[0]->SetInputOutputBuffer(iobufs_m[0], iobufs_m[1]); - return eos_m[0]->ProcessFrameStartAsync(); + eos_m[0]->AcquireContext(curr_eo_context_idx_m); + if (tidl::internal::NUM_CONTEXTS == 1) + start_m = std::chrono::steady_clock::now(); + eos_m[0]->SetFrameIndex(frame_idx_m); + eos_m[0]->SetInputOutputBuffer(iobufs_m[0], iobufs_m[1], + curr_eo_context_idx_m); + return eos_m[0]->ProcessFrameStartAsync(curr_eo_context_idx_m); } // returns true if we have more EOs to execute bool ExecutionObjectPipeline::Impl::RunAsyncNext() { - eos_m[curr_eo_idx_m]->ProcessFrameWait(); + eos_m[curr_eo_idx_m]->ProcessFrameWait(curr_eo_context_idx_m); // need to capture EO's device/host time before we release its lock eo_device_time_m[curr_eo_idx_m] = eos_m[curr_eo_idx_m]-> - GetProcessTimeInMilliSeconds(); + GetProcessTimeInMilliSeconds(curr_eo_context_idx_m); eo_host_time_m[curr_eo_idx_m] = eos_m[curr_eo_idx_m]-> - GetHostProcessTimeInMilliSeconds(); + 
GetHostProcessTimeInMilliSeconds(curr_eo_context_idx_m); device_time_m += eo_device_time_m[curr_eo_idx_m]; - eos_m[curr_eo_idx_m]->ReleaseLock(); + if (tidl::internal::NUM_CONTEXTS > 1) + host_time_m += eo_host_time_m[curr_eo_idx_m]; + eos_m[curr_eo_idx_m]->ReleaseContext(curr_eo_context_idx_m); curr_eo_idx_m += 1; if (curr_eo_idx_m < eos_m.size()) { - eos_m[curr_eo_idx_m]->AcquireLock(); + eos_m[curr_eo_idx_m]->AcquireContext(curr_eo_context_idx_m); + eos_m[curr_eo_idx_m]->SetFrameIndex(frame_idx_m); eos_m[curr_eo_idx_m]->SetInputOutputBuffer(iobufs_m[curr_eo_idx_m], - iobufs_m[curr_eo_idx_m+1]); - eos_m[curr_eo_idx_m]->ProcessFrameStartAsync(); + iobufs_m[curr_eo_idx_m+1], curr_eo_context_idx_m); + eos_m[curr_eo_idx_m]->ProcessFrameStartAsync(curr_eo_context_idx_m); return true; } else { - std::chrono::duration elapsed = std::chrono::steady_clock::now() - - start_m; - host_time_m = elapsed.count() * 1000; // seconds to milliseconds + if (tidl::internal::NUM_CONTEXTS == 1) + { + std::chrono::duration elapsed = + std::chrono::steady_clock::now() - start_m; + host_time_m = elapsed.count() * 1000; // seconds to milliseconds + } is_processed_m = true; cv_m.notify_all(); return false; diff --git a/tidl_api/src/ocl_device.cpp b/tidl_api/src/ocl_device.cpp index 5768627..508c549 100644 --- a/tidl_api/src/ocl_device.cpp +++ b/tidl_api/src/ocl_device.cpp @@ -262,7 +262,7 @@ bool EveDevice::BuildProgramFromBinary(const std::string& kernel_names, Kernel::Kernel(Device* device, const std::string& name, const KernelArgs& args, uint8_t device_index): name_m(name), device_m(device), device_index_m(device_index), - is_running_m(false) + num_running_contexts_m(0) { TRACE::print("Creating kernel %s\n", name.c_str()); cl_int err; @@ -304,45 +304,52 @@ Kernel::Kernel(Device* device, const std::string& name, } } -Kernel& Kernel::RunAsync() +bool Kernel::UpdateScalarArg(uint32_t index, size_t size, const void *value) +{ + cl_int ret = clSetKernelArg(kernel_m, index, size, value); + 
return ret == CL_SUCCESS; +} + +Kernel& Kernel::RunAsync(uint32_t context_idx) { // Execute kernel - TRACE::print("\tKernel: device %d executing %s\n", device_index_m, - name_m.c_str()); + TRACE::print("\tKernel: device %d executing %s, context %d\n", + device_index_m, name_m.c_str(), context_idx); cl_int ret = clEnqueueTask(device_m->queue_m[device_index_m], - kernel_m, 0, 0, &event_m); + kernel_m, 0, 0, &event_m[context_idx]); errorCheck(ret, __LINE__); - is_running_m = true; + __sync_fetch_and_add(&num_running_contexts_m, 1); return *this; } -bool Kernel::Wait(float *host_elapsed_ms) +bool Kernel::Wait(float *host_elapsed_ms, uint32_t context_idx) { // Wait called without a corresponding RunAsync - if (!is_running_m) + if (num_running_contexts_m == 0) return false; - TRACE::print("\tKernel: waiting...\n"); - cl_int ret = clWaitForEvents(1, &event_m); + TRACE::print("\tKernel: waiting context %d...\n", context_idx); + cl_int ret = clWaitForEvents(1, &event_m[context_idx]); errorCheck(ret, __LINE__); if (host_elapsed_ms != nullptr) { cl_ulong t_que, t_end; - clGetEventProfilingInfo(event_m, CL_PROFILING_COMMAND_QUEUED, + clGetEventProfilingInfo(event_m[context_idx], + CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &t_que, nullptr); - clGetEventProfilingInfo(event_m, CL_PROFILING_COMMAND_END, + clGetEventProfilingInfo(event_m[context_idx], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &t_end, nullptr); *host_elapsed_ms = (t_end - t_que) / 1.0e6; // nano to milli seconds } - ret = clReleaseEvent(event_m); + ret = clReleaseEvent(event_m[context_idx]); errorCheck(ret, __LINE__); TRACE::print("\tKernel: finished execution\n"); - is_running_m = false; + __sync_fetch_and_sub(&num_running_contexts_m, 1); return true; } @@ -355,11 +362,11 @@ void EventCallback(cl_event event, cl_int exec_status, void *user_data) if (CallbackWrapper) CallbackWrapper(user_data); } -bool Kernel::AddCallback(void *user_data) +bool Kernel::AddCallback(void *user_data, uint32_t context_idx) { - if 
(! is_running_m) return false; - return clSetEventCallback(event_m, CL_COMPLETE, EventCallback, user_data) - == CL_SUCCESS; + if (num_running_contexts_m == 0) return false; + return clSetEventCallback(event_m[context_idx], CL_COMPLETE, EventCallback, + user_data) == CL_SUCCESS; } Kernel::~Kernel() diff --git a/tidl_api/src/ocl_device.h b/tidl_api/src/ocl_device.h index 04c5db6..7b627e6 100644 --- a/tidl_api/src/ocl_device.h +++ b/tidl_api/src/ocl_device.h @@ -39,6 +39,7 @@ #include #include "executor.h" #include "device_arginfo.h" +#include "parameters.h" namespace tidl { @@ -142,19 +143,20 @@ class Kernel const KernelArgs &args, uint8_t device_index); ~Kernel(); - Kernel& RunAsync(); - bool Wait(float *host_elapsed_ms = nullptr); - bool AddCallback(void *user_data); + bool UpdateScalarArg(uint32_t index, size_t size, const void *value); + Kernel& RunAsync(uint32_t context_idx = 0); + bool Wait(float *host_elapsed_ms = nullptr, uint32_t context_idx = 0); + bool AddCallback(void *user_data, uint32_t context_idx = 0); private: cl_kernel kernel_m; - cl_event event_m; + cl_event event_m[tidl::internal::NUM_CONTEXTS]; std::vector buffers_m; const std::string name_m; Device* device_m; uint8_t device_index_m; - bool is_running_m; + uint32_t num_running_contexts_m; }; diff --git a/tidl_api/src/parameters.h b/tidl_api/src/parameters.h index 3ea047e..6c66358 100644 --- a/tidl_api/src/parameters.h +++ b/tidl_api/src/parameters.h @@ -36,6 +36,7 @@ const size_t DMEM1_SIZE = 128*1024; const size_t OCMC_SIZE = 320*1024; const int CURR_LAYERS_GROUP_ID = 1; const int CURR_CORE_ID = 1; +const int NUM_CONTEXTS = 2; } }