Partitioned execution of TI DL network
author Yuan Zhao <yuanzhao@ti.com>
Wed, 9 May 2018 22:31:04 +0000 (17:31 -0500)
committer Yuan Zhao <yuanzhao@ti.com>
Thu, 10 May 2018 20:55:52 +0000 (15:55 -0500)
- Enable pipelined and partitioned execution of TIDL network
  by setting the same ArgInfo:
  - as the output of ExecutionObject for first  layersGroup
  - as the input  of ExecutionObject for second layersGroup
  See ssd_multibox for example.
- Enable internal input mode, in which intermediate results between
  different layersGroups no longer need to be saved in the user
  application.
- Fixes for correct partitioned execution of TIDL network,
  e.g. dataQ needs to be passed between layersGroups
- MCT-969, MCT-974

examples/make.common
examples/ssd_multibox/main.cpp
tinn_api/inc/configuration.h
tinn_api/inc/execution_object.h
tinn_api/inc/executor.h
tinn_api/src/configuration.cpp
tinn_api/src/execution_object.cpp
tinn_api/src/executor.cpp
tinn_api/src/ocl_device.cpp

index 891b86e5d9ada7005288371ba012c293c4105ea3..88aec56792c5ec8751ac92193e0e9be8b51c6bdb 100644 (file)
@@ -17,6 +17,8 @@ else
 endif
 
 CXXFLAGS += -I. -I$(TINN_DIR)/inc -std=c++11
+CXXFLAGS += -I$(TI_OCL_INSTALL)/usr/share/ti/opencl
+CXXFLAGS += -I$(TARGET_ROOTDIR)/usr/share/ti/opencl
 
 HEADERS = 
 LIBS    = -lOpenCL -locl_util -lpthread
index 5780fe8d693ccb38dbe9c50f95942be0bf05e112..ad26ba7820ef4196270818fa9c8391a7d44f8467 100644 (file)
@@ -59,6 +59,7 @@ bool __TI_show_debug_ = false;
 bool is_default_input = false;
 bool is_preprocessed_input = false;
 bool is_camera_input       = false;
+bool is_partitioned        = true;
 int  orig_width;
 int  orig_height;
 object_class_table_t *object_class_table;
@@ -67,19 +68,21 @@ using namespace tinn;
 using namespace cv;
 
 
-bool RunConfiguration(const std::string& config_file, int num_devices,
+bool RunConfiguration(const std::string& config_file, uint32_t num_devices,
                       DeviceType device_type, std::string& input_file);
-bool RunAllConfigurations(int32_t num_devices, DeviceType device_type);
-
 bool ReadFrame(ExecutionObject& eo, int frame_idx,
                const Configuration& configuration, int num_frames,
                std::string& image_file, VideoCapture &cap);
-bool WriteFrameOutput(const ExecutionObject &eo,
+bool WriteFrameOutput(const ExecutionObject &eo_in,
+                      const ExecutionObject &eo_out,
                       const Configuration& configuration);
 
+void ReportTime(int frame_index, std::string device_name, double elapsed_host,
+                double elapsed_device);
+
 static void ProcessArgs(int argc, char *argv[],
                         std::string& config,
-                        int& num_devices,
+                        uint32_t& num_devices,
                         DeviceType& device_type,
                         std::string& input_file);
 
@@ -107,10 +110,18 @@ int main(int argc, char *argv[])
     // Process arguments
     std::string config      = DEFAULT_CONFIG;
     std::string input_file  = DEFAULT_INPUT;
-    int         num_devices = 1;
+    uint32_t num_devices    = 1;
     DeviceType  device_type = DeviceType::DLA;
     ProcessArgs(argc, argv, config, num_devices, device_type, input_file);
 
+    if (is_partitioned)
+        num_devices = std::min(num_devices, std::min(num_dla, num_dsp));
+    if (num_devices == 0)
+    {
+        std::cout << "Partitioned execution requires at least 1 DLA and 1 DSP."
+                  << std::endl;
+        return EXIT_FAILURE;
+    }
     if ((object_class_table = GetObjectClassTable(config)) == nullptr)
     {
         std::cout << "No object classes defined for this config." << std::endl;
@@ -138,7 +149,7 @@ int main(int argc, char *argv[])
     return EXIT_SUCCESS;
 }
 
-bool RunConfiguration(const std::string& config_file, int num_devices,
+bool RunConfiguration(const std::string& config_file, uint32_t num_devices,
                       DeviceType device_type, std::string& input_file)
 {
     DeviceIds ids;
@@ -154,8 +165,7 @@ bool RunConfiguration(const std::string& config_file, int num_devices,
                   << std::endl;
         return false;
     }
-    if (device_type == DeviceType::DLA || device_type == DeviceType::DSP)
-        configuration.runFullNet = 1;
+    configuration.runFullNet = is_partitioned ? 0 : 1;
 
     // setup input
     int num_frames = is_default_input ? 3 : 1;
@@ -181,72 +191,111 @@ bool RunConfiguration(const std::string& config_file, int num_devices,
     {
         // Create a executor with the approriate core type, number of cores
         // and configuration specified
-        Executor executor(device_type, ids, configuration);
+        configuration.layersGroupId = 1;
+        Executor *executor_1 = new Executor(device_type, ids, configuration);
+        Executor *executor_2 = nullptr;
+        if (is_partitioned)
+        {
+            configuration.layersGroupId       = 2;
+            configuration.enableInternalInput = 1;  // 0 is also valid
+            executor_2 = new Executor(DeviceType::DSP, ids, configuration);
+        }
 
         // Query Executor for set of ExecutionObjects created
-        const ExecutionObjects& execution_objects =
-                                                executor.GetExecutionObjects();
-        int num_eos = execution_objects.size();
+        const ExecutionObjects *execution_objects_1, *execution_objects_2;
+        execution_objects_1 = & executor_1->GetExecutionObjects();
+        int num_eos = execution_objects_1->size();
+        if (is_partitioned)
+            execution_objects_2 = & executor_2->GetExecutionObjects();
 
         // Allocate input and output buffers for each execution object
         std::vector<void *> buffers;
-        for (auto &eo : execution_objects)
+        for (int i = 0; i < num_eos; i++)
         {
-            size_t in_size  = eo->GetInputBufferSizeInBytes();
-            size_t out_size = eo->GetOutputBufferSizeInBytes();
+            ExecutionObject *eo1 = execution_objects_1->at(i).get();
+            size_t in_size  = eo1->GetInputBufferSizeInBytes();
+            size_t out_size = eo1->GetOutputBufferSizeInBytes();
             ArgInfo in  = { ArgInfo(malloc(in_size),  in_size)};
-            ArgInfo out = { ArgInfo(malloc(out_size), out_size)};
-            eo->SetInputOutputBuffer(in, out);
+            ArgInfo out = { ArgInfo(nullptr, 0) };
+            if (configuration.enableInternalInput == 0)
+                out = ArgInfo(malloc(out_size), out_size);
+            eo1->SetInputOutputBuffer(in, out);
 
             buffers.push_back(in.ptr());
             buffers.push_back(out.ptr());
+
+            if (is_partitioned)
+            {
+                ExecutionObject *eo2 = execution_objects_2->at(i).get();
+                size_t out2_size = eo2->GetOutputBufferSizeInBytes();
+                ArgInfo out2 = { ArgInfo(malloc(out2_size), out2_size) };
+                eo2->SetInputOutputBuffer(out, out2);
+                buffers.push_back(out2.ptr());
+            }
         }
 
         #define MAX_NUM_EOS  4
-        struct timespec t0[MAX_NUM_EOS], t1;
+        struct timespec t0[MAX_NUM_EOS], t1, tloop0, tloop1;
+        clock_gettime(CLOCK_MONOTONIC, &tloop0);
 
         // Process frames with available execution objects in a pipelined manner
         // additional num_eos iterations to flush the pipeline (epilogue)
+        ExecutionObject *eo1, *eo2, *eo_wait, *eo_input;
         for (int frame_idx = 0;
              frame_idx < num_frames + num_eos; frame_idx++)
         {
-            ExecutionObject* eo = execution_objects[frame_idx % num_eos].get();
+            eo1 = execution_objects_1->at(frame_idx % num_eos).get();
+            eo_wait = eo1;
+            if (is_partitioned)
+            {
+                eo2 = execution_objects_2->at(frame_idx % num_eos).get();
+                eo_wait = eo2;
+            }
 
             // Wait for previous frame on the same eo to finish processing
-            if (eo->ProcessFrameWait())
+            if (eo_wait->ProcessFrameWait())
             {
+                int finished_idx = eo_wait->GetFrameIndex();
                 clock_gettime(CLOCK_MONOTONIC, &t1);
-                double elapsed_host =
-                                ms_diff(t0[eo->GetFrameIndex() % num_eos], t1);
-                double elapsed_device = eo->GetProcessTimeInMilliSeconds();
-                double overhead = 100 - (elapsed_device/elapsed_host*100);
-
-                std::cout << "frame[" << eo->GetFrameIndex() << "]: "
-                          << "Time on device: "
-                          << std::setw(6) << std::setprecision(4)
-                          << elapsed_device << "ms, "
-                          << "host: "
-                          << std::setw(6) << std::setprecision(4)
-                          << elapsed_host << "ms ";
-                std::cout << "API overhead: "
-                          << std::setw(6) << std::setprecision(3)
-                          << overhead << " %" << std::endl;
-
-                WriteFrameOutput(*eo, configuration);
+                ReportTime(finished_idx,
+                           (is_partitioned || device_type == DeviceType::DSP) ?
+                           "DSP" : "DLA",
+                           ms_diff(t0[finished_idx % num_eos], t1),
+                           eo_wait->GetProcessTimeInMilliSeconds());
+
+                eo_input = execution_objects_1->at(finished_idx %num_eos).get();
+                WriteFrameOutput(*eo_input, *eo_wait, configuration);
             }
 
             // Read a frame and start processing it with current eo
-            if (ReadFrame(*eo, frame_idx, configuration, num_frames,
+            if (ReadFrame(*eo1, frame_idx, configuration, num_frames,
                           image_file, cap))
             {
                 clock_gettime(CLOCK_MONOTONIC, &t0[frame_idx % num_eos]);
-                eo->ProcessFrameStartAsync();
+                eo1->ProcessFrameStartAsync();
+
+                if (is_partitioned && eo1->ProcessFrameWait())
+                {
+                    clock_gettime(CLOCK_MONOTONIC, &t1);
+                    ReportTime(frame_idx, "DLA",
+                           ms_diff(t0[frame_idx % num_eos], t1),
+                           eo1->GetProcessTimeInMilliSeconds());
+
+                    clock_gettime(CLOCK_MONOTONIC, &t0[frame_idx % num_eos]);
+                    eo2->ProcessFrameStartAsync();
+                }
             }
         }
 
+        clock_gettime(CLOCK_MONOTONIC, &tloop1);
+        std::cout << "Loop total time (including read/write/print/etc): "
+                  << std::setw(6) << std::setprecision(4)
+                  << ms_diff(tloop0, tloop1) << "ms" << std::endl;
+
+        delete executor_1;
+        delete executor_2;
         for (auto b : buffers)
             free(b);
-
     }
     catch (tinn::Exception &e)
     {
@@ -257,6 +306,22 @@ bool RunConfiguration(const std::string& config_file, int num_devices,
     return status;
 }
 
+void ReportTime(int frame_index, std::string device_name, double elapsed_host,
+                double elapsed_device)
+{
+    double overhead = 100 - (elapsed_device/elapsed_host*100);
+    std::cout << "frame[" << frame_index << "]: "
+              << "Time on " << device_name << ": "
+              << std::setw(6) << std::setprecision(4)
+              << elapsed_device << "ms, "
+              << "host: "
+              << std::setw(6) << std::setprecision(4)
+              << elapsed_host << "ms ";
+    std::cout << "API overhead: "
+              << std::setw(6) << std::setprecision(3)
+              << overhead << " %" << std::endl;
+}
+
 
 bool ReadFrame(ExecutionObject &eo, int frame_idx,
                const Configuration& configuration, int num_frames,
@@ -321,7 +386,8 @@ bool ReadFrame(ExecutionObject &eo, int frame_idx,
 }
 
 // Create frame with boxes drawn around classified objects
-bool WriteFrameOutput(const ExecutionObject &eo,
+bool WriteFrameOutput(const ExecutionObject &eo_in,
+                      const ExecutionObject &eo_out,
                       const Configuration& configuration)
 {
     // Asseembly original frame
@@ -330,13 +396,13 @@ bool WriteFrameOutput(const ExecutionObject &eo,
     int channel_size = width * height;
     Mat frame, r_frame, bgr[3];
 
-    unsigned char *in = (unsigned char *) eo.GetInputBufferPtr();
+    unsigned char *in = (unsigned char *) eo_in.GetInputBufferPtr();
     bgr[0] = Mat(height, width, CV_8UC(1), in);
     bgr[1] = Mat(height, width, CV_8UC(1), in + channel_size);
     bgr[2] = Mat(height, width, CV_8UC(1), in + channel_size*2);
     cv::merge(bgr, 3, frame);
 
-    int frame_index = eo.GetFrameIndex();
+    int frame_index = eo_in.GetFrameIndex();
     char outfile_name[64];
     if (! is_camera_input && is_preprocessed_input)
     {
@@ -346,8 +412,8 @@ bool WriteFrameOutput(const ExecutionObject &eo,
     }
 
     // Draw boxes around classified objects
-    float *out = (float *) eo.GetOutputBufferPtr();
-    int num_floats = eo.GetOutputBufferSizeInBytes() / sizeof(float);
+    float *out = (float *) eo_out.GetOutputBufferPtr();
+    int num_floats = eo_out.GetOutputBufferSizeInBytes() / sizeof(float);
     for (int i = 0; i < num_floats / 7; i++)
     {
         int index = (int)    out[i * 7 + 0];
@@ -395,7 +461,7 @@ bool WriteFrameOutput(const ExecutionObject &eo,
 
 
 void ProcessArgs(int argc, char *argv[], std::string& config,
-                 int& num_devices, DeviceType& device_type,
+                 uint32_t& num_devices, DeviceType& device_type,
                  std::string& input_file)
 {
     const struct option long_options[] =
@@ -428,10 +494,16 @@ void ProcessArgs(int argc, char *argv[], std::string& config,
                       break;
 
             case 't': if (*optarg == 'e')
+                      {
                           device_type = DeviceType::DLA;
+                          is_partitioned = false;
+                      }
 #if 0
                       else if (*optarg == 'd')
+                      {
                           device_type = DeviceType::DSP;
+                          is_partitioned = false;
+                      }
 #endif
                       else
                       {
@@ -473,7 +545,9 @@ void DisplayHelp()
                  " -c <config>          Valid configs: jdetnet, jdetnet_512x256\n"
                  " -n <number of cores> Number of cores to use (1 - 4)\n"
                  " -t <d|e>             Type of core. d -> DSP, e -> DLA\n"
-                 "                      Only support DLA for now\n"
+                 "                      DSP not supported at this time\n"
+                 "                      Default to partitioned execution: \n"
+                 "                          part 1 on DLA, part 2 on DSP\n"
                  " -i <image>           Path to the image file\n"
                  "                      Default is 1 frame in testvecs\n"
                  " -i camera            Use camera as input\n"
index a707a8553704686e84322bf553d72862692ee386..11d19c6e01dc89ac4375d947de4e948c0505c582 100644 (file)
@@ -65,6 +65,10 @@ class Configuration
     //! Force to run all layers, regardless of layersGroupId partitioning
     int     runFullNet;
 
+    //! When set, inputs are taken from TIDL internal buffers that contain
+    //! outputs of previous layersGroupId, instead of from user application
+    int     enableInternalInput;
+
     //! Size of the TI DL per Execution Object heap
     size_t EXTMEM_HEAP_SIZE;
 
index b02a0bb85db7753c9f853be7cee141f43d5b25e7..8a4eb62a4786fad84a9e6eb6ae14e3aca00a57d2 100644 (file)
@@ -50,7 +50,8 @@ class ExecutionObject
         ExecutionObject(Device* d, uint8_t device_index,
                         const  ArgInfo& create_arg,
                         const  ArgInfo& param_heap_arg,
-                        size_t extmem_heap_size);
+                        size_t extmem_heap_size,
+                        uint32_t internal_input);
         //! @private
         ~ExecutionObject();
 
index e2ce077280f1c26ea1e260972923bebd9edf6a2c..f87c6d51b10005862ca0aae51cf659b9d4431d43 100644 (file)
@@ -38,6 +38,7 @@
 #include <exception>
 
 #include "configuration.h"
+#include "custom.h"
 
 namespace tinn {
 
@@ -114,6 +115,16 @@ class Executor
         std::unique_ptr<ExecutorImpl> pimpl_m;
 };
 
+/*! @class PipeInfo
+ *  @brief Describe input and output required by piping output and input
+ *         between Execution Objects
+ */
+class PipeInfo
+{
+    public:
+        uint32_t dataQ_m[OCL_TIDL_MAX_IN_BUFS];
+        uint32_t bufAddr_m[OCL_TIDL_MAX_IN_BUFS];
+};
 
 /*! @class ArgInfo
  *  @brief Describe input and output buffers required by ExecutionObjects
@@ -130,7 +141,8 @@ class ArgInfo
         //! and its size.
         ArgInfo(void *p, size_t size) :
             ptr_m(p), size_m(size),
-            access_m(DeviceAccess::RW), kind_m(Kind::BUFFER) {}
+            access_m(DeviceAccess::RW), kind_m(Kind::BUFFER)
+        { pipe_m = std::make_shared<PipeInfo>(); }
 
         //! Construct an ArgInfo object from a pointer to a chunk of memory
         //! its size and kind
@@ -145,13 +157,17 @@ class ArgInfo
 
         // Only used by tinn::Device
         Kind   kind() const { return kind_m; }
-        bool   isLocal() const { return (ptr_m == nullptr); }
+        bool   isLocal() const { return (ptr_m == nullptr) && (size_m > 0); }
+
+        // Only used by tinn::ExecutionObject::Impl
+        PipeInfo *GetPipe() const { return pipe_m.get(); }
 
     private:
         void*        ptr_m;
         size_t       size_m;
         DeviceAccess access_m;
         Kind         kind_m;
+        std::shared_ptr<PipeInfo> pipe_m;
 };
 
 
index 55dd0fa709f8176756fee2408794f5914a5d791a..b993b1a0aa597449ac826365f46dc86851354fad 100644 (file)
@@ -40,6 +40,7 @@ Configuration::Configuration(): numFrames(0), inHeight(0), inWidth(0),
                      preProcType(0),
                      layersGroupId(tinn::internal::CURR_LAYERS_GROUP_ID),
                      runFullNet(0),
+                     enableInternalInput(0),
                      EXTMEM_HEAP_SIZE(64 << 20),  // 64MB for inceptionNetv1
                      PARAM_HEAP_SIZE(9 << 20)     // 9MB for mobileNet1
 {
@@ -53,6 +54,7 @@ void Configuration::Print(std::ostream &os) const
        << "\nPreProcType              " << preProcType
        << "\nLayersGroupId            " << layersGroupId
        << "\nRunFullNet               " << runFullNet
+       << "\nEnableInternalInput      " << enableInternalInput
        << "\nInputFile                " << inData
        << "\nOutputFile               " << outData
        << "\nNetwork                  " << netBinFile
index 6a71d87b47550c9708709974f161bec95593c163..c74389ff8cd24dfe7ae874aec994041abbf97ac3 100644 (file)
@@ -45,7 +45,8 @@ class ExecutionObject::Impl
         Impl(Device* d, uint8_t device_index,
              const ArgInfo& create_arg,
              const ArgInfo& param_heap_arg,
-             size_t extmem_heap_size);
+             size_t extmem_heap_size,
+             uint32_t internal_input);
         ~Impl() {}
 
         bool RunAsync(CallType ct);
@@ -65,8 +66,8 @@ class ExecutionObject::Impl
         up_malloc_ddr<OCL_TIDL_InitializeParams> shared_initialize_params_m;
         up_malloc_ddr<OCL_TIDL_ProcessParams>    shared_process_params_m;
 
-        size_t                          in_size;
-        size_t                          out_size;
+        size_t                          in_size_m;
+        size_t                          out_size_m;
         ArgInfo                         in_m;
         ArgInfo                         out_m;
 
@@ -82,13 +83,15 @@ ExecutionObject::ExecutionObject(Device* d,
                                  uint8_t device_index,
                                  const ArgInfo& create_arg,
                                  const ArgInfo& param_heap_arg,
-                                 size_t extmem_heap_size)
+                                 size_t extmem_heap_size,
+                                 uint32_t internal_input)
 {
     pimpl_m = std::unique_ptr<ExecutionObject::Impl>
               { new ExecutionObject::Impl(d, device_index,
                                           create_arg,
                                           param_heap_arg,
-                                          extmem_heap_size) };
+                                          extmem_heap_size,
+                                          internal_input) };
 }
 
 
@@ -96,7 +99,8 @@ ExecutionObject::Impl::Impl(Device* d,
                                  uint8_t device_index,
                                  const ArgInfo& create_arg,
                                  const ArgInfo& param_heap_arg,
-                                 size_t extmem_heap_size):
+                                 size_t extmem_heap_size,
+                                 uint32_t internal_input):
     device_m(d),
     k_initialize_m(nullptr),
     k_process_m(nullptr),
@@ -104,8 +108,8 @@ ExecutionObject::Impl::Impl(Device* d,
     tidl_extmem_heap_m (nullptr, &__free_ddr),
     shared_initialize_params_m(nullptr, &__free_ddr),
     shared_process_params_m(nullptr, &__free_ddr),
-    in_size(0),
-    out_size(0),
+    in_size_m(0),
+    out_size_m(0),
     in_m(nullptr, 0),
     out_m(nullptr, 0),
     device_index_m(device_index),
@@ -129,6 +133,7 @@ ExecutionObject::Impl::Impl(Device* d,
     shared_initialize_params_m->l2HeapSize   = tinn::internal::DMEM1_SIZE;
     shared_initialize_params_m->l1HeapSize   = tinn::internal::DMEM0_SIZE;
     shared_initialize_params_m->enableTrace  = OCL_TIDL_TRACE_OFF;
+    shared_initialize_params_m->enableInternalInput = internal_input;
 
     // Setup kernel arguments for initialize
     KernelArgs args = { create_arg,
@@ -157,7 +162,7 @@ char* ExecutionObject::GetInputBufferPtr() const
 
 size_t ExecutionObject::GetInputBufferSizeInBytes() const
 {
-    if (pimpl_m->in_m.ptr() == nullptr)  return pimpl_m->in_size;
+    if (pimpl_m->in_m.ptr() == nullptr)  return pimpl_m->in_size_m;
     else                                 return pimpl_m->in_m.size();
 }
 
@@ -168,7 +173,7 @@ char* ExecutionObject::GetOutputBufferPtr() const
 
 size_t ExecutionObject::GetOutputBufferSizeInBytes() const
 {
-    if (pimpl_m->out_m.ptr() == nullptr)  return pimpl_m->out_size;
+    if (pimpl_m->out_m.ptr() == nullptr)  return pimpl_m->out_size_m;
     else           return pimpl_m->shared_process_params_m.get()->bytesWritten;
 }
 
@@ -184,9 +189,6 @@ int ExecutionObject::GetFrameIndex() const
 
 void ExecutionObject::SetInputOutputBuffer(const ArgInfo& in, const ArgInfo& out)
 {
-    assert (in.ptr() != nullptr && in.size() > 0);
-    assert (out.ptr() != nullptr && out.size() > 0);
-
     pimpl_m->SetupProcessKernel(in, out);
 }
 
@@ -238,8 +240,13 @@ ExecutionObject::Impl::SetupProcessKernel(const ArgInfo& in, const ArgInfo& out)
 
     shared_process_params_m.reset(malloc_ddr<OCL_TIDL_ProcessParams>());
     shared_process_params_m->enableTrace = OCL_TIDL_TRACE_OFF;
+    shared_process_params_m->enableInternalInput = 
+                               shared_initialize_params_m->enableInternalInput;
     shared_process_params_m->cycles = 0;
 
+    if (shared_process_params_m->enableInternalInput == 0)
+        assert(in.ptr() != nullptr && in.size() > 0);
+
     KernelArgs args = { ArgInfo(shared_process_params_m.get(),
                                 sizeof(OCL_TIDL_ProcessParams)),
                         in,
@@ -287,60 +294,94 @@ static size_t writeDataS8(char *writePtr, const char *ptr, int n, int width,
 
 void ExecutionObject::Impl::HostWriteNetInput()
 {
-    char* readPtr = (char *) in_m.ptr();
+    char* readPtr  = (char *) in_m.ptr();
+    PipeInfo *pipe = in_m.GetPipe();
+
     for (unsigned int i = 0; i < shared_initialize_params_m->numInBufs; i++)
     {
         OCL_TIDL_BufParams *inBuf = &shared_initialize_params_m->inBufs[i];
-        readPtr += readDataS8(
-            readPtr,
-            (char *) tidl_extmem_heap_m.get() + inBuf->bufPlaneBufOffset
-                + inBuf->bufPlaneWidth * OCL_TIDL_MAX_PAD_SIZE
-                + OCL_TIDL_MAX_PAD_SIZE,
-            inBuf->numROIs,
-            inBuf->numChannels,
-            inBuf->ROIWidth,
-            inBuf->ROIHeight,
-            inBuf->bufPlaneWidth,
-            inBuf->bufPlaneWidth
-                * (inBuf->ROIHeight + 2 * OCL_TIDL_MAX_PAD_SIZE) );
+
+        if (shared_process_params_m->enableInternalInput == 0)
+        {
+            readPtr += readDataS8(
+                readPtr,
+                (char *) tidl_extmem_heap_m.get() + inBuf->bufPlaneBufOffset
+                    + inBuf->bufPlaneWidth * OCL_TIDL_MAX_PAD_SIZE
+                    + OCL_TIDL_MAX_PAD_SIZE,
+                inBuf->numROIs,
+                inBuf->numChannels,
+                inBuf->ROIWidth,
+                inBuf->ROIHeight,
+                inBuf->bufPlaneWidth,
+                ((inBuf->bufPlaneWidth * inBuf->bufPlaneHeight) /
+                 inBuf->numChannels));
+        }
+        else
+        {
+            shared_process_params_m->inBufAddr[i] = pipe->bufAddr_m[i];
+        }
+
+        shared_process_params_m->inDataQ[i]   = pipe->dataQ_m[i];
     }
 }
 
 void ExecutionObject::Impl::HostReadNetOutput()
 {
     char* writePtr = (char *) out_m.ptr();
+    PipeInfo *pipe = out_m.GetPipe();
+
     for (unsigned int i = 0; i < shared_initialize_params_m->numOutBufs; i++)
     {
         OCL_TIDL_BufParams *outBuf = &shared_initialize_params_m->outBufs[i];
-        writePtr += writeDataS8(
-            writePtr,
-            (char *) tidl_extmem_heap_m.get() + outBuf->bufPlaneBufOffset
-                + outBuf->bufPlaneWidth * OCL_TIDL_MAX_PAD_SIZE
-                + OCL_TIDL_MAX_PAD_SIZE,
-            outBuf->numChannels,
-            outBuf->ROIWidth,
-            outBuf->ROIHeight,
-            outBuf->bufPlaneWidth,
-            ((outBuf->bufPlaneWidth * outBuf->bufPlaneHeight)/
-             outBuf->numChannels));
+        if (writePtr != nullptr)
+        {
+            writePtr += writeDataS8(
+                writePtr,
+                (char *) tidl_extmem_heap_m.get() + outBuf->bufPlaneBufOffset
+                    + outBuf->bufPlaneWidth * OCL_TIDL_MAX_PAD_SIZE
+                    + OCL_TIDL_MAX_PAD_SIZE,
+                outBuf->numChannels,
+                outBuf->ROIWidth,
+                outBuf->ROIHeight,
+                outBuf->bufPlaneWidth,
+                ((outBuf->bufPlaneWidth * outBuf->bufPlaneHeight)/
+                 outBuf->numChannels));
+        }
+
+        pipe->dataQ_m[i]   = shared_process_params_m->outDataQ[i];
+        pipe->bufAddr_m[i] = shared_initialize_params_m->bufAddrBase
+                           + outBuf->bufPlaneBufOffset;
     }
     shared_process_params_m->bytesWritten = writePtr - (char *) out_m.ptr();
 }
 
 void ExecutionObject::Impl::ComputeInputOutputSizes()
 {
-    in_size  = 0;
-    out_size = 0;
+    if (shared_initialize_params_m->errorCode != OCL_TIDL_SUCCESS)  return;
+
+    if (shared_initialize_params_m->numInBufs > OCL_TIDL_MAX_IN_BUFS ||
+        shared_initialize_params_m->numOutBufs > OCL_TIDL_MAX_OUT_BUFS)
+    {
+        std::cout << "Num input/output bufs ("
+                  << shared_initialize_params_m->numInBufs << ", "
+                  << shared_initialize_params_m->numOutBufs
+                  << ") exceeded limit!" << std::endl;
+        shared_initialize_params_m->errorCode = OCL_TIDL_INIT_FAIL;
+        return;
+    }
+
+    in_size_m  = 0;
+    out_size_m = 0;
     for (unsigned int i = 0; i < shared_initialize_params_m->numInBufs; i++)
     {
         OCL_TIDL_BufParams *inBuf = &shared_initialize_params_m->inBufs[i];
-        in_size += inBuf->numROIs * inBuf->numChannels * inBuf->ROIWidth *
-                   inBuf->ROIHeight;
+        in_size_m += inBuf->numROIs * inBuf->numChannels * inBuf->ROIWidth *
+                     inBuf->ROIHeight;
     }
     for (unsigned int i = 0; i < shared_initialize_params_m->numOutBufs; i++)
     {
         OCL_TIDL_BufParams *outBuf = &shared_initialize_params_m->outBufs[i];
-        out_size += outBuf->numChannels * outBuf->ROIWidth * outBuf->ROIHeight;
+        out_size_m += outBuf->numChannels * outBuf->ROIWidth *outBuf->ROIHeight;
     }
 }
 
@@ -384,10 +425,10 @@ bool ExecutionObject::Impl::Wait(CallType ct)
 
             if (has_work)
             {
+                ComputeInputOutputSizes();
                 if (shared_initialize_params_m->errorCode != OCL_TIDL_SUCCESS)
                     throw Exception(shared_initialize_params_m->errorCode,
                                     __FILE__, __FUNCTION__, __LINE__);
-                ComputeInputOutputSizes();
             }
             return has_work;
         }
index 62bd3a30bced339ce919bf4c8cddc9bd6042f291..09907315aca4f544ddb665e5483f5251ada255c3 100644 (file)
@@ -109,7 +109,8 @@ bool ExecutorImpl::Initialize(const Configuration& configuration)
              unique_ptr<ExecutionObject>
              {new ExecutionObject(device_m.get(), index,
                                   create_arg, param_heap_arg,
-                                  configuration_m.EXTMEM_HEAP_SIZE)} );
+                                  configuration_m.EXTMEM_HEAP_SIZE,
+                                  configuration_m.enableInternalInput)} );
     }
 
     for (auto &eo : execution_objects_m)
index d8aff36299cea067f3151d922e6c5820436a50c9..df4b35467a1ee46c2ee79ba060b8f288c6520e34 100644 (file)
@@ -281,7 +281,7 @@ Kernel::Kernel(Device* device, const std::string& name,
                 clSetKernelArg(kernel_m, arg_index++, sizeof(cl_mem), &buffer);
                 TRACE::print("  Arg[%d]: %p\n", arg_index-1, buffer);
 
-                buffers_m.push_back(buffer);
+                if (buffer != nullptr)  buffers_m.push_back(buffer);
             }
             else if (arg.kind() == ArgInfo::Kind::SCALAR)
             {
@@ -346,6 +346,12 @@ cl_mem Device::CreateBuffer(const ArgInfo &Arg)
     size_t  size     = Arg.size();
     void   *host_ptr = Arg.ptr();
 
+    if (host_ptr == nullptr)
+    {
+        TRACE::print("\tOCL Create B:%p\n", nullptr);
+        return nullptr;
+    }
+
     bool hostPtrInCMEM = __is_in_malloced_region(host_ptr);
 
     // Conservative till we have sufficient information.