Reduce complexity of ssd_multibox example
author: Yuan Zhao <yuanzhao@ti.com>
Fri, 11 May 2018 13:32:23 +0000 (08:32 -0500)
committer: Yuan Zhao <yuanzhao@ti.com>
Fri, 11 May 2018 16:46:13 +0000 (11:46 -0500)
- Only support partitioned mode, remove single device mode
- Remove enableInternalInput mode due to no observable performance gain
- Make layersGroupId assignment part of Executor's construction
- MCT-974

examples/ssd_multibox/main.cpp
tinn_api/inc/configuration.h
tinn_api/inc/execution_object.h
tinn_api/inc/executor.h
tinn_api/src/configuration.cpp
tinn_api/src/execution_object.cpp
tinn_api/src/executor.cpp
tinn_api/src/executor_impl.h

index ad26ba7820ef4196270818fa9c8391a7d44f8467..a44f8cd1e8d7aedd1a605fd4cbfb6cba7ff27476 100644 (file)
@@ -59,7 +59,6 @@ bool __TI_show_debug_ = false;
 bool is_default_input = false;
 bool is_preprocessed_input = false;
 bool is_camera_input       = false;
-bool is_partitioned        = true;
 int  orig_width;
 int  orig_height;
 object_class_table_t *object_class_table;
@@ -114,8 +113,8 @@ int main(int argc, char *argv[])
     DeviceType  device_type = DeviceType::DLA;
     ProcessArgs(argc, argv, config, num_devices, device_type, input_file);
 
-    if (is_partitioned)
-        num_devices = std::min(num_devices, std::min(num_dla, num_dsp));
+    // Use same number of DLAs and DSPs
+    num_devices = std::min(num_devices, std::min(num_dla, num_dsp));
     if (num_devices == 0)
     {
         std::cout << "Partitioned execution requires at least 1 DLA and 1 DSP."
@@ -165,7 +164,6 @@ bool RunConfiguration(const std::string& config_file, uint32_t num_devices,
                   << std::endl;
         return false;
     }
-    configuration.runFullNet = is_partitioned ? 0 : 1;
 
     // setup input
     int num_frames = is_default_input ? 3 : 1;
@@ -191,47 +189,39 @@ bool RunConfiguration(const std::string& config_file, uint32_t num_devices,
     {
         // Create a executor with the approriate core type, number of cores
         // and configuration specified
-        configuration.layersGroupId = 1;
-        Executor *executor_1 = new Executor(device_type, ids, configuration);
-        Executor *executor_2 = nullptr;
-        if (is_partitioned)
-        {
-            configuration.layersGroupId       = 2;
-            configuration.enableInternalInput = 1;  // 0 is also valid
-            executor_2 = new Executor(DeviceType::DSP, ids, configuration);
-        }
+        // DLA will run layersGroupId 1 in the network, while
+        // DSP will run layersGroupId 2 in the network
+        Executor executor_dla(DeviceType::DLA, ids, configuration, 1);
+        Executor executor_dsp(DeviceType::DSP, ids, configuration, 2);
 
         // Query Executor for set of ExecutionObjects created
-        const ExecutionObjects *execution_objects_1, *execution_objects_2;
-        execution_objects_1 = & executor_1->GetExecutionObjects();
-        int num_eos = execution_objects_1->size();
-        if (is_partitioned)
-            execution_objects_2 = & executor_2->GetExecutionObjects();
+        const ExecutionObjects& execution_objects_dla =
+                                            executor_dla.GetExecutionObjects();
+        const ExecutionObjects& execution_objects_dsp =
+                                            executor_dsp.GetExecutionObjects();
+        int num_eos = execution_objects_dla.size();
 
         // Allocate input and output buffers for each execution object
+        // Note that "out" is both the output of eo_dla and the input of eo_dsp
+        // This is how two layersGroupIds, 1 and 2, are tied together
         std::vector<void *> buffers;
         for (int i = 0; i < num_eos; i++)
         {
-            ExecutionObject *eo1 = execution_objects_1->at(i).get();
-            size_t in_size  = eo1->GetInputBufferSizeInBytes();
-            size_t out_size = eo1->GetOutputBufferSizeInBytes();
-            ArgInfo in  = { ArgInfo(malloc(in_size),  in_size)};
-            ArgInfo out = { ArgInfo(nullptr, 0) };
-            if (configuration.enableInternalInput == 0)
-                out = ArgInfo(malloc(out_size), out_size);
-            eo1->SetInputOutputBuffer(in, out);
+            ExecutionObject *eo_dla = execution_objects_dla[i].get();
+            size_t in_size  = eo_dla->GetInputBufferSizeInBytes();
+            size_t out_size = eo_dla->GetOutputBufferSizeInBytes();
+            ArgInfo in  = { ArgInfo(malloc(in_size),  in_size)  };
+            ArgInfo out = { ArgInfo(malloc(out_size), out_size) };
+            eo_dla->SetInputOutputBuffer(in, out);
+
+            ExecutionObject *eo_dsp = execution_objects_dsp[i].get();
+            size_t out2_size = eo_dsp->GetOutputBufferSizeInBytes();
+            ArgInfo out2 = { ArgInfo(malloc(out2_size), out2_size) };
+            eo_dsp->SetInputOutputBuffer(out, out2);
 
             buffers.push_back(in.ptr());
             buffers.push_back(out.ptr());
-
-            if (is_partitioned)
-            {
-                ExecutionObject *eo2 = execution_objects_2->at(i).get();
-                size_t out2_size = eo2->GetOutputBufferSizeInBytes();
-                ArgInfo out2 = { ArgInfo(malloc(out2_size), out2_size) };
-                eo2->SetInputOutputBuffer(out, out2);
-                buffers.push_back(out2.ptr());
-            }
+            buffers.push_back(out2.ptr());
         }
 
         #define MAX_NUM_EOS  4
@@ -240,49 +230,42 @@ bool RunConfiguration(const std::string& config_file, uint32_t num_devices,
 
         // Process frames with available execution objects in a pipelined manner
         // additional num_eos iterations to flush the pipeline (epilogue)
-        ExecutionObject *eo1, *eo2, *eo_wait, *eo_input;
+        ExecutionObject *eo_dla, *eo_dsp, *eo_input;
         for (int frame_idx = 0;
              frame_idx < num_frames + num_eos; frame_idx++)
         {
-            eo1 = execution_objects_1->at(frame_idx % num_eos).get();
-            eo_wait = eo1;
-            if (is_partitioned)
-            {
-                eo2 = execution_objects_2->at(frame_idx % num_eos).get();
-                eo_wait = eo2;
-            }
+            eo_dla = execution_objects_dla[frame_idx % num_eos].get();
+            eo_dsp = execution_objects_dsp[frame_idx % num_eos].get();
 
             // Wait for previous frame on the same eo to finish processing
-            if (eo_wait->ProcessFrameWait())
+            if (eo_dsp->ProcessFrameWait())
             {
-                int finished_idx = eo_wait->GetFrameIndex();
+                int finished_idx = eo_dsp->GetFrameIndex();
                 clock_gettime(CLOCK_MONOTONIC, &t1);
-                ReportTime(finished_idx,
-                           (is_partitioned || device_type == DeviceType::DSP) ?
-                           "DSP" : "DLA",
+                ReportTime(finished_idx, "DSP",
                            ms_diff(t0[finished_idx % num_eos], t1),
-                           eo_wait->GetProcessTimeInMilliSeconds());
+                           eo_dsp->GetProcessTimeInMilliSeconds());
 
-                eo_input = execution_objects_1->at(finished_idx %num_eos).get();
-                WriteFrameOutput(*eo_input, *eo_wait, configuration);
+                eo_input = execution_objects_dla[finished_idx % num_eos].get();
+                WriteFrameOutput(*eo_input, *eo_dsp, configuration);
             }
 
             // Read a frame and start processing it with current eo
-            if (ReadFrame(*eo1, frame_idx, configuration, num_frames,
+            if (ReadFrame(*eo_dla, frame_idx, configuration, num_frames,
                           image_file, cap))
             {
                 clock_gettime(CLOCK_MONOTONIC, &t0[frame_idx % num_eos]);
-                eo1->ProcessFrameStartAsync();
+                eo_dla->ProcessFrameStartAsync();
 
-                if (is_partitioned && eo1->ProcessFrameWait())
+                if (eo_dla->ProcessFrameWait())
                 {
                     clock_gettime(CLOCK_MONOTONIC, &t1);
                     ReportTime(frame_idx, "DLA",
-                           ms_diff(t0[frame_idx % num_eos], t1),
-                           eo1->GetProcessTimeInMilliSeconds());
+                               ms_diff(t0[frame_idx % num_eos], t1),
+                               eo_dla->GetProcessTimeInMilliSeconds());
 
                     clock_gettime(CLOCK_MONOTONIC, &t0[frame_idx % num_eos]);
-                    eo2->ProcessFrameStartAsync();
+                    eo_dsp->ProcessFrameStartAsync();
                 }
             }
         }
@@ -292,8 +275,6 @@ bool RunConfiguration(const std::string& config_file, uint32_t num_devices,
                   << std::setw(6) << std::setprecision(4)
                   << ms_diff(tloop0, tloop1) << "ms" << std::endl;
 
-        delete executor_1;
-        delete executor_2;
         for (auto b : buffers)
             free(b);
     }
@@ -468,7 +449,6 @@ void ProcessArgs(int argc, char *argv[], std::string& config,
     {
         {"config",      required_argument, 0, 'c'},
         {"num_devices", required_argument, 0, 'n'},
-        {"device_type", required_argument, 0, 't'},
         {"image_file",  required_argument, 0, 'i'},
         {"help",        no_argument,       0, 'h'},
         {"verbose",     no_argument,       0, 'v'},
@@ -479,7 +459,7 @@ void ProcessArgs(int argc, char *argv[], std::string& config,
 
     while (true)
     {
-        int c = getopt_long(argc, argv, "c:n:t:i:hv", long_options, &option_index);
+        int c = getopt_long(argc, argv, "c:n:i:hv", long_options, &option_index);
 
         if (c == -1)
             break;
@@ -493,27 +473,6 @@ void ProcessArgs(int argc, char *argv[], std::string& config,
                       assert (num_devices > 0 && num_devices <= 4);
                       break;
 
-            case 't': if (*optarg == 'e')
-                      {
-                          device_type = DeviceType::DLA;
-                          is_partitioned = false;
-                      }
-#if 0
-                      else if (*optarg == 'd')
-                      {
-                          device_type = DeviceType::DSP;
-                          is_partitioned = false;
-                      }
-#endif
-                      else
-                      {
-                          //std::cerr << "Invalid argument to -t, only e or d"
-                          std::cerr << "Invalid argument to -t, only e"
-                                       " allowed" << std::endl;
-                          exit(EXIT_FAILURE);
-                      }
-                      break;
-
             case 'i': input_file = optarg;
                       break;
 
@@ -538,16 +497,16 @@ void ProcessArgs(int argc, char *argv[], std::string& config,
 void DisplayHelp()
 {
     std::cout << "Usage: ssd_multibox\n"
-                 "  Will run ssd_multibox network to perform multi-objects"
-                 " classification.\n  Use -c to run a different"
-                 "  segmentation network. Default is jdetnet.\n"
+                 "  Will run partitioned ssd_multibox network to perform "
+                 "multi-objects detection\n"
+                 "  and classification.  First part of network "
+                 "(layersGroupId 1) runs on DLA,\n"
+                 "  second part (layersGroupId 2) runs on DSP.\n"
+                 "  Use -c to run a different segmentation network. "
+                 "Default is jdetnet.\n"
                  "Optional arguments:\n"
-                 " -c <config>          Valid configs: jdetnet, jdetnet_512x256\n"
+                 " -c <config>          Valid configs: jdetnet \n"
                  " -n <number of cores> Number of cores to use (1 - 4)\n"
-                 " -t <d|e>             Type of core. d -> DSP, e -> DLA\n"
-                 "                      DSP not supported at this time\n"
-                 "                      Default to partitioned execution: \n"
-                 "                          part 1 on DLA, part 2 on DSP\n"
                  " -i <image>           Path to the image file\n"
                  "                      Default is 1 frame in testvecs\n"
                  " -i camera            Use camera as input\n"
index 11d19c6e01dc89ac4375d947de4e948c0505c582..088a6291a413af379087acc12d7005826b293128 100644 (file)
@@ -59,9 +59,6 @@ class Configuration
     //! Specific to each network, can take values from 0 to 4, default is 0
     int     preProcType;
 
-    //! layersGroupId in the network that the executor should work on
-    int     layersGroupId;
-
     //! Force to run all layers, regardless of layersGroupId partitioning
     int     runFullNet;
 
index 8a4eb62a4786fad84a9e6eb6ae14e3aca00a57d2..c334ac98c98c8caade37a53e65d6b4ac055b2276 100644 (file)
@@ -51,7 +51,7 @@ class ExecutionObject
                         const  ArgInfo& create_arg,
                         const  ArgInfo& param_heap_arg,
                         size_t extmem_heap_size,
-                        uint32_t internal_input);
+                        bool   internal_input);
         //! @private
         ~ExecutionObject();
 
index f87c6d51b10005862ca0aae51cf659b9d4431d43..05e9cc04923a7d749dc201944de856c884baa241 100644 (file)
@@ -85,8 +85,10 @@ class Executor
         //! @param device_type DSP or EVE/DLA device
         //! @param ids Set of devices uses by this instance of the Executor
         //! @param configuration Configuration used to initialize the Executor
+        //! @param layers_group_id Layers group that this Executor should run
         Executor(DeviceType device_type, const DeviceIds& ids,
-                 const Configuration& configuration);
+                 const Configuration& configuration,
+                 int layers_group_id = OCL_TIDL_DEFAULT_LAYERS_GROUP_ID);
 
         //! @brief Tear down an Executor and free resources used by the
         //! Executor object
index b993b1a0aa597449ac826365f46dc86851354fad..70fd136b65b3bd43d2aa3f83b37eef712c46ed2f 100644 (file)
@@ -38,7 +38,6 @@ Configuration::Configuration(): numFrames(0), inHeight(0), inWidth(0),
                      inNumChannels(0),
                      noZeroCoeffsPercentage(100),
                      preProcType(0),
-                     layersGroupId(tinn::internal::CURR_LAYERS_GROUP_ID),
                      runFullNet(0),
                      enableInternalInput(0),
                      EXTMEM_HEAP_SIZE(64 << 20),  // 64MB for inceptionNetv1
@@ -52,7 +51,6 @@ void Configuration::Print(std::ostream &os) const
        << "\nFrame=      " << numFrames << " " << inWidth << "x"
                            << inHeight << "x" << inNumChannels
        << "\nPreProcType              " << preProcType
-       << "\nLayersGroupId            " << layersGroupId
        << "\nRunFullNet               " << runFullNet
        << "\nEnableInternalInput      " << enableInternalInput
        << "\nInputFile                " << inData
index c74389ff8cd24dfe7ae874aec994041abbf97ac3..dbdb90293206cf804b7fc9458d1cae596c4d444a 100644 (file)
@@ -46,7 +46,7 @@ class ExecutionObject::Impl
              const ArgInfo& create_arg,
              const ArgInfo& param_heap_arg,
              size_t extmem_heap_size,
-             uint32_t internal_input);
+             bool   internal_input);
         ~Impl() {}
 
         bool RunAsync(CallType ct);
@@ -84,7 +84,7 @@ ExecutionObject::ExecutionObject(Device* d,
                                  const ArgInfo& create_arg,
                                  const ArgInfo& param_heap_arg,
                                  size_t extmem_heap_size,
-                                 uint32_t internal_input)
+                                 bool   internal_input)
 {
     pimpl_m = std::unique_ptr<ExecutionObject::Impl>
               { new ExecutionObject::Impl(d, device_index,
@@ -100,7 +100,7 @@ ExecutionObject::Impl::Impl(Device* d,
                                  const ArgInfo& create_arg,
                                  const ArgInfo& param_heap_arg,
                                  size_t extmem_heap_size,
-                                 uint32_t internal_input):
+                                 bool   internal_input):
     device_m(d),
     k_initialize_m(nullptr),
     k_process_m(nullptr),
@@ -133,7 +133,7 @@ ExecutionObject::Impl::Impl(Device* d,
     shared_initialize_params_m->l2HeapSize   = tinn::internal::DMEM1_SIZE;
     shared_initialize_params_m->l1HeapSize   = tinn::internal::DMEM0_SIZE;
     shared_initialize_params_m->enableTrace  = OCL_TIDL_TRACE_OFF;
-    shared_initialize_params_m->enableInternalInput = internal_input;
+    shared_initialize_params_m->enableInternalInput = internal_input ? 1 : 0;
 
     // Setup kernel arguments for initialize
     KernelArgs args = { create_arg,
index 09907315aca4f544ddb665e5483f5251ada255c3..e16f6d0eca7132a9828b13d9c2deeabef825ca23 100644 (file)
@@ -11,10 +11,10 @@ using namespace tinn;
 using std::unique_ptr;
 
 Executor::Executor(DeviceType core_type, const DeviceIds& ids,
-                   const Configuration& configuration)
+                   const Configuration& configuration, int layers_group_id)
 {
     pimpl_m = unique_ptr<ExecutorImpl>
-              { new ExecutorImpl(core_type, ids) };
+              { new ExecutorImpl(core_type, ids, layers_group_id) };
     pimpl_m->Initialize(configuration);
 }
 
@@ -45,11 +45,13 @@ std::string Executor::GetAPIVersion()
 }
 
 
-ExecutorImpl::ExecutorImpl(DeviceType core_type, const DeviceIds& ids):
+ExecutorImpl::ExecutorImpl(DeviceType core_type, const DeviceIds& ids,
+                           int layers_group_id):
     configuration_m(),
     shared_networkparam_heap_m(nullptr, &__free_ddr),
     device_ids_m(ids),
-    core_type_m(core_type)
+    core_type_m(core_type),
+    layers_group_id_m(layers_group_id)
 {
     std::string name;
     if (core_type_m == DeviceType::DSP)
@@ -92,7 +94,7 @@ bool ExecutorImpl::Initialize(const Configuration& configuration)
     {
         for (int i = 0; i < net->numLayers; i++)
             if (net->TIDLLayers[i].layerType != TIDL_DataLayer)
-                net->TIDLLayers[i].layersGroupId = configuration.layersGroupId;
+                net->TIDLLayers[i].layersGroupId = layers_group_id_m;
     }
 
     // Call a setup kernel to allocate and fill network parameters
@@ -188,8 +190,8 @@ void ExecutorImpl::Cleanup()
 void ExecutorImpl::InitializeNetworkCreateParam(TIDL_CreateParams *CP,
                                           const Configuration& configuration)
 {
-    CP->currCoreId           = configuration.layersGroupId;
-    CP->currLayersGroupId    = configuration.layersGroupId;
+    CP->currCoreId           = layers_group_id_m;
+    CP->currLayersGroupId    = layers_group_id_m;
     CP->l1MemSize            = tinn::internal::DMEM0_SIZE;
     CP->l2MemSize            = tinn::internal::DMEM1_SIZE;
     CP->l3MemSize            = tinn::internal::OCMC_SIZE;
index b04c156b53fce24d762b96d664465dd85cc899a9..704b1f7162bbd6416e4d8b04b6b983d322981fc7 100644 (file)
@@ -50,7 +50,8 @@ namespace tinn {
 class ExecutorImpl
 {
     public:
-        ExecutorImpl(DeviceType core_type, const DeviceIds& ids);
+        ExecutorImpl(DeviceType core_type, const DeviceIds& ids,
+                     int layersGroupId);
         ~ExecutorImpl() { Cleanup(); }
 
         bool Initialize(const Configuration& configuration);
@@ -74,6 +75,7 @@ class ExecutorImpl
         up_malloc_ddr<char>  shared_networkparam_heap_m;
         DeviceIds            device_ids_m;
         DeviceType           core_type_m;
+        int                  layers_group_id_m;
 };
 
 } // namespace tinn