Modified IODeviceArgInfo to enable pipelining EOs
[tidl/tidl-api.git] / tidl_api / src / execution_object.cpp
index 2d49bf14347998a00229ce088f39a265ad4c2a58..d722ebb196669019fbc3071338397d62a9a79ba9 100644 (file)
@@ -28,6 +28,9 @@
 
 /*! \file execution_object.cpp */
 
+#include <string.h>
+#include <fstream>
+#include <climits>
 #include "executor.h"
 #include "execution_object.h"
 #include "trace.h"
@@ -35,7 +38,8 @@
 #include "parameters.h"
 #include "configuration.h"
 #include "common_defines.h"
-#include <string.h>
+#include "tidl_create_params.h"
+#include "device_arginfo.h"
 
 using namespace tidl;
 
@@ -43,8 +47,8 @@ class ExecutionObject::Impl
 {
     public:
         Impl(Device* d, uint8_t device_index,
-             const ArgInfo& create_arg,
-             const ArgInfo& param_heap_arg,
+             const DeviceArgInfo& create_arg,
+             const DeviceArgInfo& param_heap_arg,
              size_t extmem_heap_size,
              bool   internal_input);
         ~Impl() {}
@@ -52,15 +56,7 @@ class ExecutionObject::Impl
         bool RunAsync(CallType ct);
         bool Wait    (CallType ct);
 
-        bool SetupProcessKernel(const ArgInfo& in, const ArgInfo& out);
-        void HostWriteNetInput();
-        void HostReadNetOutput();
-        void ComputeInputOutputSizes();
-
         Device*                         device_m;
-        std::unique_ptr<Kernel>         k_initialize_m;
-        std::unique_ptr<Kernel>         k_process_m;
-        std::unique_ptr<Kernel>         k_cleanup_m;
 
         up_malloc_ddr<char>             tidl_extmem_heap_m;
         up_malloc_ddr<OCL_TIDL_InitializeParams> shared_initialize_params_m;
@@ -68,14 +64,42 @@ class ExecutionObject::Impl
 
         size_t                          in_size_m;
         size_t                          out_size_m;
-        ArgInfo                         in_m;
-        ArgInfo                         out_m;
+        IODeviceArgInfo                 in_m;
+        IODeviceArgInfo                 out_m;
+
+        // Frame being processed by the EO
+        int                             current_frame_idx_m;
+
+        // Trace related
+        void WriteLayerOutputsToFile (const std::string& filename_prefix) const;
+
+        const LayerOutput* GetOutputFromLayer (uint32_t layer_index,
+                                               uint32_t output_index) const;
+        const LayerOutputs* GetOutputsFromAllLayers() const;
+
+        uint32_t                          num_network_layers_m;
+        up_malloc_ddr<OCL_TIDL_BufParams> trace_buf_params_m;
+        size_t                            trace_buf_params_sz_m;
+
+    private:
+        void SetupInitializeKernel(const DeviceArgInfo& create_arg,
+                                   const DeviceArgInfo& param_heap_arg,
+                                   size_t extmem_heap_size,
+                                   bool   internal_input);
+        void SetupProcessKernel();
+
+        void HostWriteNetInput();
+        void HostReadNetOutput();
+        void ComputeInputOutputSizes();
 
         // Index of the OpenCL device/queue used by this EO
         uint8_t                         device_index_m;
 
-        // Frame being processed by the EO
-        int                             current_frame_idx_m;
+        std::unique_ptr<Kernel>         k_initialize_m;
+        std::unique_ptr<Kernel>         k_process_m;
+        std::unique_ptr<Kernel>         k_cleanup_m;
+
+
 };
 
 
@@ -86,10 +110,13 @@ ExecutionObject::ExecutionObject(Device* d,
                                  size_t extmem_heap_size,
                                  bool   internal_input)
 {
+    DeviceArgInfo create_arg_d(create_arg, DeviceArgInfo::Kind::BUFFER);
+    DeviceArgInfo param_heap_arg_d(param_heap_arg, DeviceArgInfo::Kind::BUFFER);
+
     pimpl_m = std::unique_ptr<ExecutionObject::Impl>
               { new ExecutionObject::Impl(d, device_index,
-                                          create_arg,
-                                          param_heap_arg,
+                                          create_arg_d,
+                                          param_heap_arg_d,
                                           extmem_heap_size,
                                           internal_input) };
 }
@@ -97,57 +124,36 @@ ExecutionObject::ExecutionObject(Device* d,
 
 ExecutionObject::Impl::Impl(Device* d,
                                  uint8_t device_index,
-                                 const ArgInfo& create_arg,
-                                 const ArgInfo& param_heap_arg,
+                                 const DeviceArgInfo& create_arg,
+                                 const DeviceArgInfo& param_heap_arg,
                                  size_t extmem_heap_size,
                                  bool   internal_input):
     device_m(d),
-    k_initialize_m(nullptr),
-    k_process_m(nullptr),
-    k_cleanup_m(nullptr),
     tidl_extmem_heap_m (nullptr, &__free_ddr),
     shared_initialize_params_m(nullptr, &__free_ddr),
     shared_process_params_m(nullptr, &__free_ddr),
     in_size_m(0),
     out_size_m(0),
-    in_m(nullptr, 0),
-    out_m(nullptr, 0),
+    in_m(),
+    out_m(),
+    current_frame_idx_m(0),
+    num_network_layers_m(0),
+    trace_buf_params_m(nullptr, &__free_ddr),
+    trace_buf_params_sz_m(0),
     device_index_m(device_index),
-    current_frame_idx_m(0)
+    k_initialize_m(nullptr),
+    k_process_m(nullptr),
+    k_cleanup_m(nullptr)
 {
-    // Allocate a heap for TI DL to use on the device
-    tidl_extmem_heap_m.reset(malloc_ddr<char>(extmem_heap_size));
+    // Allocates the device heap and creates the cleanup and initialize
+    // kernels.
+    SetupInitializeKernel(create_arg, param_heap_arg, extmem_heap_size,
+                          internal_input);
 
-    // Create a kernel for cleanup
-    KernelArgs cleanup_args;
-    k_cleanup_m.reset(new Kernel(device_m,
-                                 STRING(CLEANUP_KERNEL),
-                                 cleanup_args, device_index_m));
-
-    // Set up parameter struct for the initialize kernel
-    shared_initialize_params_m.reset(malloc_ddr<OCL_TIDL_InitializeParams>());
-    memset(shared_initialize_params_m.get(), 0,
-           sizeof(OCL_TIDL_InitializeParams));
-
-    shared_initialize_params_m->tidlHeapSize = extmem_heap_size;
-    shared_initialize_params_m->l2HeapSize   = tidl::internal::DMEM1_SIZE;
-    shared_initialize_params_m->l1HeapSize   = tidl::internal::DMEM0_SIZE;
-    shared_initialize_params_m->enableTrace  = OCL_TIDL_TRACE_OFF;
-    shared_initialize_params_m->enableInternalInput = internal_input ? 1 : 0;
-
-    // Setup kernel arguments for initialize
-    KernelArgs args = { create_arg,
-                        param_heap_arg,
-                        ArgInfo(tidl_extmem_heap_m.get(),
-                                extmem_heap_size),
-                        ArgInfo(shared_initialize_params_m.get(),
-                                sizeof(OCL_TIDL_InitializeParams)),
-                        device_m->type() == CL_DEVICE_TYPE_ACCELERATOR ?
-                            ArgInfo(nullptr, tidl::internal::DMEM1_SIZE):
-                            ArgInfo(nullptr, 4)                       };
+    // The process kernel is created here, so its argument list is built
+    // while trace_buf_params_m is still nullptr and trace_buf_params_sz_m
+    // is still 0.
+    SetupProcessKernel();
 
-    k_initialize_m.reset(new Kernel(device_m,
-                                    STRING(INIT_KERNEL), args, device_index_m));
+    // Save number of layers in the network
+    // NOTE(review): create_arg.ptr() is dereferenced without a null check;
+    // confirm callers always pass a populated TIDL_CreateParams buffer.
+    const TIDL_CreateParams* cp =
+                static_cast<const TIDL_CreateParams *>(create_arg.ptr());
+    num_network_layers_m = cp->net.numLayers;
 }
 
 // Pointer to implementation idiom: https://herbsutter.com/gotw/_100/:
@@ -157,24 +163,28 @@ ExecutionObject::~ExecutionObject() = default;
 
 char* ExecutionObject::GetInputBufferPtr() const
 {
-    return static_cast<char *>(pimpl_m->in_m.ptr());
+    // Pointer recorded in the input argument currently stored in in_m
+    // (may be nullptr if none has been set).
+    return static_cast<char *>(pimpl_m->in_m.GetArg().ptr());
 }
 
 size_t ExecutionObject::GetInputBufferSizeInBytes() const
 {
-    if (pimpl_m->in_m.ptr() == nullptr)  return pimpl_m->in_size_m;
-    else                                 return pimpl_m->in_m.size();
+    // With no input pointer set, report in_size_m; otherwise report the
+    // size recorded in the input argument itself.
+    const DeviceArgInfo& arg = pimpl_m->in_m.GetArg();
+    if    (arg.ptr() == nullptr)  return pimpl_m->in_size_m;
+    else                          return arg.size();
 }
 
 char* ExecutionObject::GetOutputBufferPtr() const
 {
-    return static_cast<char *>(pimpl_m->out_m.ptr());
+    // Pointer recorded in the output argument currently stored in out_m
+    // (may be nullptr if none has been set).
+    return static_cast<char *>(pimpl_m->out_m.GetArg().ptr());
 }
 
 size_t ExecutionObject::GetOutputBufferSizeInBytes() const
 {
-    if (pimpl_m->out_m.ptr() == nullptr)  return pimpl_m->out_size_m;
-    else           return pimpl_m->shared_process_params_m.get()->bytesWritten;
+    // With no output pointer set, report out_size_m; otherwise report the
+    // bytesWritten field of the shared process parameters, which
+    // HostReadNetOutput() updates.
+    const DeviceArgInfo& arg = pimpl_m->out_m.GetArg();
+    if   (arg.ptr() == nullptr)
+        return pimpl_m->out_size_m;
+    else
+        return pimpl_m->shared_process_params_m.get()->bytesWritten;
 }
 
 void  ExecutionObject::SetFrameIndex(int idx)
@@ -189,7 +199,18 @@ int ExecutionObject::GetFrameIndex() const
 
 void ExecutionObject::SetInputOutputBuffer(const ArgInfo& in, const ArgInfo& out)
 {
-    pimpl_m->SetupProcessKernel(in, out);
+    // Both buffers must be non-null and non-empty (checked by assert only).
+    assert(in.ptr() != nullptr && in.size() > 0);
+    assert(out.ptr() != nullptr && out.size() > 0);
+
+    // Wrap each ArgInfo in an IODeviceArgInfo and store it as the current
+    // input/output of this execution object.
+    pimpl_m->in_m  = IODeviceArgInfo(in);
+    pimpl_m->out_m = IODeviceArgInfo(out);
+}
+
+//
+// Overload taking already-constructed IODeviceArgInfo objects. Copies *in
+// and *out into the execution object's current input/output slots.
+// Neither pointer may be null.
+//
+void ExecutionObject::SetInputOutputBuffer(const IODeviceArgInfo* in,
+                                           const IODeviceArgInfo* out)
+{
+    // Both pointers are dereferenced below; fail loudly in debug builds
+    // instead of dereferencing a null pointer.
+    assert(in != nullptr && out != nullptr);
+
+    pimpl_m->in_m  = *in;
+    pimpl_m->out_m = *out;
 }
 
 bool ExecutionObject::ProcessFrameStartAsync()
@@ -229,36 +250,125 @@ float ExecutionObject::GetProcessTimeInMilliSeconds() const
     return ((float)GetProcessCycles())/frequency * 1000;
 }
 
+// Public entry point: delegates to Impl::GetOutputFromLayer with the same
+// layer and output indices and returns its result unchanged.
+const LayerOutput*
+ExecutionObject::GetOutputFromLayer(uint32_t layer_index,
+                                    uint32_t output_index) const
+{
+    const Impl& impl = *pimpl_m;
+    return impl.GetOutputFromLayer(layer_index, output_index);
+}
+
+// Public entry point: delegates to Impl::GetOutputsFromAllLayers and
+// returns its result unchanged.
+const LayerOutputs*
+ExecutionObject::GetOutputsFromAllLayers() const
+{
+    const Impl& impl = *pimpl_m;
+    return impl.GetOutputsFromAllLayers();
+}
+
 //
-// Create a kernel to call the "process" function
+// Allocate an OpenCL buffer for TIDL layer output buffer metadata.
+// The device will populate metadata for every buffer that is used as an
+// output buffer by a layer.
+//
+void ExecutionObject::EnableOutputBufferTrace()
+{
+    Impl& impl = *pimpl_m;
+
+    // One OCL_TIDL_BufParams entry per (layer, output buffer) pair
+    const size_t num_entries =
+            static_cast<size_t>(impl.num_network_layers_m) * TIDL_NUM_OUT_BUFS;
+
+    impl.trace_buf_params_sz_m = sizeof(OCL_TIDL_BufParams) * num_entries;
+    impl.trace_buf_params_m.reset(
+            malloc_ddr<OCL_TIDL_BufParams>(impl.trace_buf_params_sz_m));
+
+    // NOTE(review): the process kernel's argument list is built in the Impl
+    // constructor from trace_buf_params_m.get() / trace_buf_params_sz_m,
+    // before this allocation happens, and is not rebuilt here. Confirm the
+    // kernel actually receives the buffer allocated above.
+
+    // Mark every entry as empty; the device overwrites bufferId for the
+    // entries it fills with valid data.
+    OCL_TIDL_BufParams* const entries = impl.trace_buf_params_m.get();
+    for (size_t k = 0; k < num_entries; ++k)
+        entries[k].bufferId = UINT_MAX;
+}
+
+// Public entry point: delegates to Impl::WriteLayerOutputsToFile with the
+// same filename prefix.
+void ExecutionObject::WriteLayerOutputsToFile(
+                                const std::string& filename_prefix) const
+{
+    const Impl& impl = *pimpl_m;
+    impl.WriteLayerOutputsToFile(filename_prefix);
+}
+
+
 //
-bool
-ExecutionObject::Impl::SetupProcessKernel(const ArgInfo& in, const ArgInfo& out)
+// Create a kernel to call the "initialize" function
+//
+// Also allocates the TIDL external-memory heap (extmem_heap_size bytes),
+// creates the cleanup kernel, and fills in the shared initialize
+// parameters (heap sizes, trace off, internal-input flag).
+//
+void
+ExecutionObject::Impl::SetupInitializeKernel(const DeviceArgInfo& create_arg,
+                                             const DeviceArgInfo& param_heap_arg,
+                                             size_t extmem_heap_size,
+                                             bool   internal_input)
 {
-    in_m = in;
-    out_m = out;
+    // Allocate a heap for TI DL to use on the device
+    tidl_extmem_heap_m.reset(malloc_ddr<char>(extmem_heap_size));
+
+    // Create a kernel for cleanup
+    KernelArgs cleanup_args;
+    k_cleanup_m.reset(new Kernel(device_m,
+                                 STRING(CLEANUP_KERNEL),
+                                 cleanup_args, device_index_m));
+
+    // Set up parameter struct for the initialize kernel
+    // (zero-filled first, so fields not assigned below start at 0)
+    shared_initialize_params_m.reset(malloc_ddr<OCL_TIDL_InitializeParams>());
+    memset(shared_initialize_params_m.get(), 0,
+           sizeof(OCL_TIDL_InitializeParams));
+
+    shared_initialize_params_m->tidlHeapSize = extmem_heap_size;
+    shared_initialize_params_m->l2HeapSize   = tidl::internal::DMEM1_SIZE;
+    shared_initialize_params_m->l1HeapSize   = tidl::internal::DMEM0_SIZE;
+    shared_initialize_params_m->enableTrace  = OCL_TIDL_TRACE_OFF;
+    shared_initialize_params_m->enableInternalInput = internal_input ? 1 : 0;
+
+    // Setup kernel arguments for initialize
+    // The last argument is a LOCAL-kind argument with no host pointer: its
+    // size is DMEM1_SIZE on accelerator devices and 4 bytes otherwise.
+    KernelArgs args = { create_arg,
+                        param_heap_arg,
+                        DeviceArgInfo(tidl_extmem_heap_m.get(),
+                                      extmem_heap_size,
+                                      DeviceArgInfo::Kind::BUFFER),
+                        DeviceArgInfo(shared_initialize_params_m.get(),
+                                      sizeof(OCL_TIDL_InitializeParams),
+                                      DeviceArgInfo::Kind::BUFFER),
+                        device_m->type() == CL_DEVICE_TYPE_ACCELERATOR ?
+                            DeviceArgInfo(nullptr, tidl::internal::DMEM1_SIZE,
+                                          DeviceArgInfo::Kind::LOCAL):
+                            DeviceArgInfo(nullptr, 4,
+                                          DeviceArgInfo::Kind::LOCAL) };
+
+    k_initialize_m.reset(new Kernel(device_m,
+                                    STRING(INIT_KERNEL), args,
+                                    device_index_m));
+}
 
+//
+// Create a kernel to call the "process" function
+//
+// Reads shared_initialize_params_m (enableInternalInput, tidlHeapSize), so
+// SetupInitializeKernel() must have run first.
+//
+void
+ExecutionObject::Impl::SetupProcessKernel()
+{
+    // NOTE(review): unlike the initialize parameters, this struct is not
+    // zero-filled; only the three fields assigned below are set here.
     shared_process_params_m.reset(malloc_ddr<OCL_TIDL_ProcessParams>());
     shared_process_params_m->enableTrace = OCL_TIDL_TRACE_OFF;
     shared_process_params_m->enableInternalInput =
                                shared_initialize_params_m->enableInternalInput;
     shared_process_params_m->cycles = 0;
 
-    if (shared_process_params_m->enableInternalInput == 0)
-        assert(in.ptr() != nullptr && in.size() > 0);
+    // Kernel arguments: process parameters, the TIDL heap, and the trace
+    // metadata buffer. The trace pointer and size are read here, at the
+    // time this function runs.
+    KernelArgs args = { DeviceArgInfo(shared_process_params_m.get(),
+                                      sizeof(OCL_TIDL_ProcessParams),
+                                      DeviceArgInfo::Kind::BUFFER),
+                        DeviceArgInfo(tidl_extmem_heap_m.get(),
+                                      shared_initialize_params_m->tidlHeapSize,
+                                      DeviceArgInfo::Kind::BUFFER),
+                        DeviceArgInfo(trace_buf_params_m.get(),
+                                      trace_buf_params_sz_m,
+                                      DeviceArgInfo::Kind::BUFFER)
 
-    KernelArgs args = { ArgInfo(shared_process_params_m.get(),
-                                sizeof(OCL_TIDL_ProcessParams)),
-                        in,
-                        out,
-                        ArgInfo(tidl_extmem_heap_m.get(),
-                                shared_initialize_params_m->tidlHeapSize)
                       };
 
     k_process_m.reset(new Kernel(device_m,
-                                 STRING(PROCESS_KERNEL), args, device_index_m));
-
-    return true;
+                                 STRING(PROCESS_KERNEL), args,
+                                 device_index_m));
 }
 
 
@@ -292,10 +402,13 @@ static size_t writeDataS8(char *writePtr, const char *ptr, int n, int width,
     return width*height*n;
 }
 
+//
+// Copy from host buffer to TIDL device buffer
+//
 void ExecutionObject::Impl::HostWriteNetInput()
 {
-    char* readPtr  = (char *) in_m.ptr();
-    PipeInfo *pipe = in_m.GetPipe();
+    const char*     readPtr  = (const char *) in_m.GetArg().ptr();
+    const PipeInfo& pipe     = in_m.GetPipe();
 
     for (unsigned int i = 0; i < shared_initialize_params_m->numInBufs; i++)
     {
@@ -318,17 +431,20 @@ void ExecutionObject::Impl::HostWriteNetInput()
         }
         else
         {
-            shared_process_params_m->inBufAddr[i] = pipe->bufAddr_m[i];
+            shared_process_params_m->inBufAddr[i] = pipe.bufAddr_m[i];
         }
 
-        shared_process_params_m->inDataQ[i]   = pipe->dataQ_m[i];
+        shared_process_params_m->inDataQ[i]   = pipe.dataQ_m[i];
     }
 }
 
+//
+// Copy from TIDL device buffer into host buffer
+//
 void ExecutionObject::Impl::HostReadNetOutput()
 {
-    char* writePtr = (char *) out_m.ptr();
-    PipeInfo *pipe = out_m.GetPipe();
+    char* writePtr = (char *) out_m.GetArg().ptr();
+    PipeInfo& pipe = out_m.GetPipe();
 
     for (unsigned int i = 0; i < shared_initialize_params_m->numOutBufs; i++)
     {
@@ -348,11 +464,12 @@ void ExecutionObject::Impl::HostReadNetOutput()
                  outBuf->numChannels));
         }
 
-        pipe->dataQ_m[i]   = shared_process_params_m->outDataQ[i];
-        pipe->bufAddr_m[i] = shared_initialize_params_m->bufAddrBase
+        pipe.dataQ_m[i]   = shared_process_params_m->outDataQ[i];
+        pipe.bufAddr_m[i] = shared_initialize_params_m->bufAddrBase
                            + outBuf->bufPlaneBufOffset;
     }
-    shared_process_params_m->bytesWritten = writePtr - (char *) out_m.ptr();
+    shared_process_params_m->bytesWritten = writePtr -
+                                            (char *) out_m.GetArg().ptr();
 }
 
 void ExecutionObject::Impl::ComputeInputOutputSizes()
@@ -456,3 +573,127 @@ bool ExecutionObject::Impl::Wait(CallType ct)
 
     return false;
 }
+
+//
+// Write the trace data to output files
+//
+// For every (layer, output buffer) entry whose bufferId the device has
+// filled in, the ROI is copied out of the TIDL heap with writeDataS8() and
+// written to "<filename_prefix><bufferId>_<ROIWidth>x<ROIHeight>.bin".
+// Does nothing when no trace buffer has been allocated.
+//
+void
+ExecutionObject::Impl::WriteLayerOutputsToFile(const std::string& filename_prefix) const
+{
+    if (trace_buf_params_sz_m == 0)
+        return;
+
+    const OCL_TIDL_BufParams* bufferParams = trace_buf_params_m.get();
+
+    for (uint32_t i = 0; i < num_network_layers_m; i++)
+        for (int j = 0; j < TIDL_NUM_OUT_BUFS; j++)
+        {
+            const OCL_TIDL_BufParams* buf =
+                                    &bufferParams[i*TIDL_NUM_OUT_BUFS+j];
+
+            // UINT_MAX is the "no valid data" marker for an entry
+            if (buf->bufferId == UINT_MAX)
+                continue;
+
+            const size_t buffer_size = buf->numChannels * buf->ROIHeight *
+                                       buf->ROIWidth;
+
+            // Owning buffer: freed on every exit path, including exceptions.
+            // new[] reports failure by throwing std::bad_alloc, so a null
+            // check on the result would never fire.
+            std::unique_ptr<char[]> tmp(new char[buffer_size]);
+
+            writeDataS8(
+                tmp.get(),
+                (char *) tidl_extmem_heap_m.get() + buf->bufPlaneBufOffset
+                + buf->bufPlaneWidth * OCL_TIDL_MAX_PAD_SIZE
+                + OCL_TIDL_MAX_PAD_SIZE,
+                buf->numChannels,
+                buf->ROIWidth,
+                buf->ROIHeight,
+                buf->bufPlaneWidth,
+                ((buf->bufPlaneWidth * buf->bufPlaneHeight)/
+                 buf->numChannels));
+
+            std::string filename(filename_prefix);
+            filename += std::to_string(buf->bufferId) + "_";
+            filename += std::to_string(buf->ROIWidth) + "x";
+            filename += std::to_string(buf->ROIHeight) + ".bin";
+
+            // Raw bytes: open in binary mode so the stream performs no
+            // newline translation on platforms that distinguish text mode.
+            std::ofstream ofs(filename,
+                              std::ofstream::out | std::ofstream::binary);
+            ofs.write(tmp.get(), buffer_size);
+            ofs.close();
+        }
+}
+
+
+//
+// Build a LayerOutput holding a copy of one traced layer output.
+//
+// Returns nullptr when no trace buffer has been allocated, when either
+// index is out of range, or when the device did not fill in the requested
+// entry. Otherwise returns a heap-allocated LayerOutput; the caller is
+// responsible for deleting it.
+//
+const LayerOutput* ExecutionObject::Impl::GetOutputFromLayer(
+                            uint32_t layer_index, uint32_t output_index) const
+{
+    if (trace_buf_params_sz_m == 0)
+        return nullptr;
+
+    // The table holds num_network_layers_m * TIDL_NUM_OUT_BUFS entries, so
+    // an index equal to its bound is already past the end.
+    if (layer_index >= num_network_layers_m ||
+        output_index >= TIDL_NUM_OUT_BUFS)
+        return nullptr;
+
+    const OCL_TIDL_BufParams* bufferParams = trace_buf_params_m.get();
+    const OCL_TIDL_BufParams* buf =
+                    &bufferParams[layer_index*TIDL_NUM_OUT_BUFS+output_index];
+
+    // UINT_MAX is the "no valid data" marker for an entry
+    if (buf->bufferId == UINT_MAX)
+        return nullptr;
+
+    const size_t buffer_size = buf->numChannels * buf->ROIHeight *
+                               buf->ROIWidth;
+
+    // Owned here until the LayerOutput exists, so the buffer is not leaked
+    // if constructing the LayerOutput throws. new[] itself reports failure
+    // by throwing std::bad_alloc, so no null check is needed.
+    std::unique_ptr<char[]> data(new char[buffer_size]);
+
+    writeDataS8(data.get(),
+                (char *) tidl_extmem_heap_m.get() + buf->bufPlaneBufOffset
+                + buf->bufPlaneWidth * OCL_TIDL_MAX_PAD_SIZE
+                + OCL_TIDL_MAX_PAD_SIZE,
+                buf->numChannels,
+                buf->ROIWidth,
+                buf->ROIHeight,
+                buf->bufPlaneWidth,
+                ((buf->bufPlaneWidth * buf->bufPlaneHeight)/
+                 buf->numChannels));
+
+    const LayerOutput* result =
+            new LayerOutput(layer_index, output_index, buf->bufferId,
+                            buf->numROIs, buf->numChannels, buf->ROIHeight,
+                            buf->ROIWidth, data.get());
+
+    // Ownership of the buffer now belongs to the LayerOutput, whose
+    // destructor delete[]s it.
+    static_cast<void>(data.release());
+
+    return result;
+}
+
+//
+// Collect every available layer output into a newly allocated LayerOutputs.
+//
+// Entries for which GetOutputFromLayer() returns nullptr are skipped, so
+// the result may be empty. The caller is responsible for deleting the
+// returned container.
+//
+const LayerOutputs* ExecutionObject::Impl::GetOutputsFromAllLayers() const
+{
+    // Held by unique_ptr while it is being filled, so the container and the
+    // outputs it already owns are freed if a later step throws.
+    std::unique_ptr<LayerOutputs> result(new LayerOutputs);
+
+    for (uint32_t i=0; i < num_network_layers_m; i++)
+        for (int j=0; j < TIDL_NUM_OUT_BUFS; j++)
+        {
+            const LayerOutput* lo = GetOutputFromLayer(i, j);
+            if (lo)
+                result->push_back(std::unique_ptr<const LayerOutput>{ lo });
+        }
+
+    return result.release();
+}
+
+// Stores the descriptive fields and takes over the data pointer, which the
+// destructor later releases with delete[].
+// Note: output_index is accepted but not stored by this constructor.
+LayerOutput::LayerOutput(int layer_index, int output_index, int buffer_id,
+                         int num_roi, int num_channels, size_t height,
+                         size_t width, const char* data)
+    : layer_index_m(layer_index),
+      buffer_id_m(buffer_id),
+      num_roi_m(num_roi),
+      num_channels_m(num_channels),
+      height_m(height),
+      width_m(width),
+      data_m(data)
+{
+}
+
+// Frees the data buffer handed to the constructor. It is released with
+// delete[], so it must have been allocated with new[] (as
+// Impl::GetOutputFromLayer does).
+LayerOutput::~LayerOutput()
+{
+    delete[] data_m;
+}