Remove input, output buffers from process kernel
authorAjay Jayaraj <ajayj@ti.com>
Fri, 10 Aug 2018 22:39:22 +0000 (17:39 -0500)
committerAjay Jayaraj <ajayj@ti.com>
Fri, 10 Aug 2018 22:39:22 +0000 (17:39 -0500)
Input and output OpenCL buffers do not have to be passed into the process
kernel. The host will directly update input and output in the buffers
allocated by the TIDL library via the HostWriteNetInput and
HostReadNetOutput methods.

(MCT-1030)

tidl_api/dsp/ocl_wrapper.cl
tidl_api/src/execution_object.cpp
tidl_api/src/ocl_device.cpp

index 2ee6bbf88f1d854d5748352882fb2486b36a2ccb..698d074c14ecbd482f3af8974223561e28bfd612 100644 (file)
@@ -40,7 +40,7 @@ void ocl_tidl_setup(global unsigned char*        createParams,
 kernel 
 void ocl_tidl_initialize(global unsigned char*            createParams,
                          global unsigned char*            netParamsBuffer,
-                        global unsigned char*            externalMemoryHeapBase,
+                         global unsigned char*            externalMemoryHeapBase,
                          global OCL_TIDL_InitializeParams* initializeParams,
                          local  unsigned char*            l2HeapBase)
 {
@@ -52,13 +52,12 @@ void ocl_tidl_initialize(global unsigned char*            createParams,
                             l2HeapBase);
 }
 
-kernel void ocl_tidl_process(global OCL_TIDL_ProcessParams* processParams,
-                             global unsigned char*          inputFrame,
-                             global unsigned char*          outputData,
-                             global unsigned char*          externalMemory,
-                             global unsigned char*          traceBufferParams)
+kernel
+void ocl_tidl_process(global OCL_TIDL_ProcessParams* processParams,
+                      global unsigned char*          externalMemoryHeapBase,
+                      global unsigned char*          traceBufferParams)
 {
-    ocl_dsp_tidl_process(processParams, inputFrame, outputData,
+    ocl_dsp_tidl_process(processParams, externalMemoryHeapBase,
                          traceBufferParams);
 }
 
index eb72cbd6dcbf146812d5dd7bc190f5dc21d7d29e..6ebf653d46710e9e9828986da257ee90f3053ebf 100644 (file)
@@ -55,15 +55,7 @@ class ExecutionObject::Impl
         bool RunAsync(CallType ct);
         bool Wait    (CallType ct);
 
-        bool SetupProcessKernel(const ArgInfo& in, const ArgInfo& out);
-        void HostWriteNetInput();
-        void HostReadNetOutput();
-        void ComputeInputOutputSizes();
-
         Device*                         device_m;
-        std::unique_ptr<Kernel>         k_initialize_m;
-        std::unique_ptr<Kernel>         k_process_m;
-        std::unique_ptr<Kernel>         k_cleanup_m;
 
         up_malloc_ddr<char>             tidl_extmem_heap_m;
         up_malloc_ddr<OCL_TIDL_InitializeParams> shared_initialize_params_m;
@@ -74,21 +66,39 @@ class ExecutionObject::Impl
         ArgInfo                         in_m;
         ArgInfo                         out_m;
 
-        // Index of the OpenCL device/queue used by this EO
-        uint8_t                         device_index_m;
-
         // Frame being processed by the EO
         int                             current_frame_idx_m;
 
         // Trace related
-        uint32_t                          num_network_layers_m;
-        up_malloc_ddr<OCL_TIDL_BufParams> trace_buf_params_m;
-        size_t                            trace_buf_params_sz_m;
         void WriteLayerOutputsToFile (const std::string& filename_prefix) const;
 
         const LayerOutput* GetOutputFromLayer (uint32_t layer_index,
                                                uint32_t output_index) const;
         const LayerOutputs* GetOutputsFromAllLayers() const;
+
+        uint32_t                          num_network_layers_m;
+        up_malloc_ddr<OCL_TIDL_BufParams> trace_buf_params_m;
+        size_t                            trace_buf_params_sz_m;
+
+    private:
+        void SetupInitializeKernel(const ArgInfo& create_arg,
+                                   const ArgInfo& param_heap_arg,
+                                   size_t extmem_heap_size,
+                                   bool   internal_input);
+        void SetupProcessKernel();
+
+        void HostWriteNetInput();
+        void HostReadNetOutput();
+        void ComputeInputOutputSizes();
+
+        // Index of the OpenCL device/queue used by this EO
+        uint8_t                         device_index_m;
+
+        std::unique_ptr<Kernel>         k_initialize_m;
+        std::unique_ptr<Kernel>         k_process_m;
+        std::unique_ptr<Kernel>         k_cleanup_m;
+
+
 };
 
 
@@ -115,9 +125,6 @@ ExecutionObject::Impl::Impl(Device* d,
                                  size_t extmem_heap_size,
                                  bool   internal_input):
     device_m(d),
-    k_initialize_m(nullptr),
-    k_process_m(nullptr),
-    k_cleanup_m(nullptr),
     tidl_extmem_heap_m (nullptr, &__free_ddr),
     shared_initialize_params_m(nullptr, &__free_ddr),
     shared_process_params_m(nullptr, &__free_ddr),
@@ -125,45 +132,19 @@ ExecutionObject::Impl::Impl(Device* d,
     out_size_m(0),
     in_m(nullptr, 0),
     out_m(nullptr, 0),
-    device_index_m(device_index),
     current_frame_idx_m(0),
     num_network_layers_m(0),
     trace_buf_params_m(nullptr, &__free_ddr),
-    trace_buf_params_sz_m(0)
+    trace_buf_params_sz_m(0),
+    device_index_m(device_index),
+    k_initialize_m(nullptr),
+    k_process_m(nullptr),
+    k_cleanup_m(nullptr)
 {
-    // Allocate a heap for TI DL to use on the device
-    tidl_extmem_heap_m.reset(malloc_ddr<char>(extmem_heap_size));
-
-    // Create a kernel for cleanup
-    KernelArgs cleanup_args;
-    k_cleanup_m.reset(new Kernel(device_m,
-                                 STRING(CLEANUP_KERNEL),
-                                 cleanup_args, device_index_m));
-
-    // Set up parameter struct for the initialize kernel
-    shared_initialize_params_m.reset(malloc_ddr<OCL_TIDL_InitializeParams>());
-    memset(shared_initialize_params_m.get(), 0,
-           sizeof(OCL_TIDL_InitializeParams));
+    SetupInitializeKernel(create_arg, param_heap_arg, extmem_heap_size,
+                          internal_input);
 
-    shared_initialize_params_m->tidlHeapSize = extmem_heap_size;
-    shared_initialize_params_m->l2HeapSize   = tidl::internal::DMEM1_SIZE;
-    shared_initialize_params_m->l1HeapSize   = tidl::internal::DMEM0_SIZE;
-    shared_initialize_params_m->enableTrace  = OCL_TIDL_TRACE_OFF;
-    shared_initialize_params_m->enableInternalInput = internal_input ? 1 : 0;
-
-    // Setup kernel arguments for initialize
-    KernelArgs args = { create_arg,
-                        param_heap_arg,
-                        ArgInfo(tidl_extmem_heap_m.get(),
-                                extmem_heap_size),
-                        ArgInfo(shared_initialize_params_m.get(),
-                                sizeof(OCL_TIDL_InitializeParams)),
-                        device_m->type() == CL_DEVICE_TYPE_ACCELERATOR ?
-                            ArgInfo(nullptr, tidl::internal::DMEM1_SIZE):
-                            ArgInfo(nullptr, 4)                       };
-
-    k_initialize_m.reset(new Kernel(device_m,
-                                    STRING(INIT_KERNEL), args, device_index_m));
+    SetupProcessKernel();
 
     // Save number of layers in the network
     const TIDL_CreateParams* cp =
@@ -210,7 +191,11 @@ int ExecutionObject::GetFrameIndex() const
 
 void ExecutionObject::SetInputOutputBuffer(const ArgInfo& in, const ArgInfo& out)
 {
-    pimpl_m->SetupProcessKernel(in, out);
+    assert(in.ptr() != nullptr && in.size() > 0);
+    assert(out.ptr() != nullptr && out.size() > 0);
+
+    pimpl_m->in_m  = in;
+    pimpl_m->out_m = out;
 }
 
 bool ExecutionObject::ProcessFrameStartAsync()
@@ -292,28 +277,66 @@ ExecutionObject::WriteLayerOutputsToFile(const std::string& filename_prefix) con
     pimpl_m->WriteLayerOutputsToFile(filename_prefix);
 }
 
+
 //
-// Create a kernel to call the "process" function
+// Create a kernel to call the "initialize" function
 //
-bool
-ExecutionObject::Impl::SetupProcessKernel(const ArgInfo& in, const ArgInfo& out)
+void
+ExecutionObject::Impl::SetupInitializeKernel(const ArgInfo& create_arg,
+                                             const ArgInfo& param_heap_arg,
+                                             size_t extmem_heap_size,
+                                             bool   internal_input)
 {
-    in_m = in;
-    out_m = out;
+    // Allocate a heap for TI DL to use on the device
+    tidl_extmem_heap_m.reset(malloc_ddr<char>(extmem_heap_size));
+
+    // Create a kernel for cleanup
+    KernelArgs cleanup_args;
+    k_cleanup_m.reset(new Kernel(device_m,
+                                 STRING(CLEANUP_KERNEL),
+                                 cleanup_args, device_index_m));
+
+    // Set up parameter struct for the initialize kernel
+    shared_initialize_params_m.reset(malloc_ddr<OCL_TIDL_InitializeParams>());
+    memset(shared_initialize_params_m.get(), 0,
+           sizeof(OCL_TIDL_InitializeParams));
 
+    shared_initialize_params_m->tidlHeapSize = extmem_heap_size;
+    shared_initialize_params_m->l2HeapSize   = tidl::internal::DMEM1_SIZE;
+    shared_initialize_params_m->l1HeapSize   = tidl::internal::DMEM0_SIZE;
+    shared_initialize_params_m->enableTrace  = OCL_TIDL_TRACE_OFF;
+    shared_initialize_params_m->enableInternalInput = internal_input ? 1 : 0;
+
+    // Setup kernel arguments for initialize
+    KernelArgs args = { create_arg,
+                        param_heap_arg,
+                        ArgInfo(tidl_extmem_heap_m.get(),
+                                extmem_heap_size),
+                        ArgInfo(shared_initialize_params_m.get(),
+                                sizeof(OCL_TIDL_InitializeParams)),
+                        device_m->type() == CL_DEVICE_TYPE_ACCELERATOR ?
+                            ArgInfo(nullptr, tidl::internal::DMEM1_SIZE):
+                            ArgInfo(nullptr, 4)                       };
+
+    k_initialize_m.reset(new Kernel(device_m,
+                                    STRING(INIT_KERNEL), args,
+                                    device_index_m));
+}
+
+//
+// Create a kernel to call the "process" function
+//
+void
+ExecutionObject::Impl::SetupProcessKernel()
+{
     shared_process_params_m.reset(malloc_ddr<OCL_TIDL_ProcessParams>());
     shared_process_params_m->enableTrace = OCL_TIDL_TRACE_OFF;
     shared_process_params_m->enableInternalInput =
                                shared_initialize_params_m->enableInternalInput;
     shared_process_params_m->cycles = 0;
 
-    if (shared_process_params_m->enableInternalInput == 0)
-        assert(in.ptr() != nullptr && in.size() > 0);
-
     KernelArgs args = { ArgInfo(shared_process_params_m.get(),
                                 sizeof(OCL_TIDL_ProcessParams)),
-                        in,
-                        out,
                         ArgInfo(tidl_extmem_heap_m.get(),
                                 shared_initialize_params_m->tidlHeapSize),
                         ArgInfo(trace_buf_params_m.get(),
@@ -322,9 +345,8 @@ ExecutionObject::Impl::SetupProcessKernel(const ArgInfo& in, const ArgInfo& out)
                       };
 
     k_process_m.reset(new Kernel(device_m,
-                                 STRING(PROCESS_KERNEL), args, device_index_m));
-
-    return true;
+                                 STRING(PROCESS_KERNEL), args,
+                                 device_index_m));
 }
 
 
index a3853a59a5e6bf6677e64094713b2237c035333b..f20305d1cba21547f84066455207968401f959e0 100644 (file)
@@ -278,15 +278,15 @@ Kernel::Kernel(Device* device, const std::string& name,
             {
                 cl_mem buffer = device_m->CreateBuffer(arg);
 
-                clSetKernelArg(kernel_m, arg_index++, sizeof(cl_mem), &buffer);
-                TRACE::print("  Arg[%d]: %p\n", arg_index-1, buffer);
+                clSetKernelArg(kernel_m, arg_index, sizeof(cl_mem), &buffer);
+                TRACE::print("  Arg[%d]: %p\n", arg_index, buffer);
 
-                if (buffer != nullptr)  buffers_m.push_back(buffer);
+                buffers_m.push_back(buffer);
             }
             else if (arg.kind() == ArgInfo::Kind::SCALAR)
             {
-                clSetKernelArg(kernel_m, arg_index++, arg.size(), arg.ptr());
-                TRACE::print("  Arg[%d]: %p\n", arg_index-1, arg.ptr());
+                clSetKernelArg(kernel_m, arg_index, arg.size(), arg.ptr());
+                TRACE::print("  Arg[%d]: %p\n", arg_index, arg.ptr());
             }
             else
             {
@@ -295,9 +295,10 @@ Kernel::Kernel(Device* device, const std::string& name,
         }
         else
         {
-            clSetKernelArg(kernel_m, arg_index++, arg.size(), NULL);
-            TRACE::print("  Arg[%d]: local, %d\n", arg_index-1, arg.size());
+            clSetKernelArg(kernel_m, arg_index, arg.size(), NULL);
+            TRACE::print("  Arg[%d]: local, %d\n", arg_index, arg.size());
         }
+        arg_index++;
 
     }
 }