Reduce complexity of ssd_multibox example
[tidl/tidl-api.git] / examples / ssd_multibox / main.cpp
index 8f9b6e068473781b09409f835a3234bb889ccd48..a44f8cd1e8d7aedd1a605fd4cbfb6cba7ff27476 100644 (file)
@@ -67,19 +67,21 @@ using namespace tinn;
 using namespace cv;
 
 
-bool RunConfiguration(const std::string& config_file, int num_devices,
+bool RunConfiguration(const std::string& config_file, uint32_t num_devices,
                       DeviceType device_type, std::string& input_file);
-bool RunAllConfigurations(int32_t num_devices, DeviceType device_type);
-
 bool ReadFrame(ExecutionObject& eo, int frame_idx,
                const Configuration& configuration, int num_frames,
                std::string& image_file, VideoCapture &cap);
-bool WriteFrameOutput(const ExecutionObject &eo,
+bool WriteFrameOutput(const ExecutionObject &eo_in,
+                      const ExecutionObject &eo_out,
                       const Configuration& configuration);
 
+void ReportTime(int frame_index, std::string device_name, double elapsed_host,
+                double elapsed_device);
+
 static void ProcessArgs(int argc, char *argv[],
                         std::string& config,
-                        int& num_devices,
+                        uint32_t& num_devices,
                         DeviceType& device_type,
                         std::string& input_file);
 
@@ -107,10 +109,18 @@ int main(int argc, char *argv[])
     // Process arguments
     std::string config      = DEFAULT_CONFIG;
     std::string input_file  = DEFAULT_INPUT;
-    int         num_devices = 1;
+    uint32_t num_devices    = 1;
     DeviceType  device_type = DeviceType::DLA;
     ProcessArgs(argc, argv, config, num_devices, device_type, input_file);
 
+    // Use same number of DLAs and DSPs
+    num_devices = std::min(num_devices, std::min(num_dla, num_dsp));
+    if (num_devices == 0)
+    {
+        std::cout << "Partitioned execution requires at least 1 DLA and 1 DSP."
+                  << std::endl;
+        return EXIT_FAILURE;
+    }
     if ((object_class_table = GetObjectClassTable(config)) == nullptr)
     {
         std::cout << "No object classes defined for this config." << std::endl;
@@ -138,7 +148,7 @@ int main(int argc, char *argv[])
     return EXIT_SUCCESS;
 }
 
-bool RunConfiguration(const std::string& config_file, int num_devices,
+bool RunConfiguration(const std::string& config_file, uint32_t num_devices,
                       DeviceType device_type, std::string& input_file)
 {
     DeviceIds ids;
@@ -154,8 +164,6 @@ bool RunConfiguration(const std::string& config_file, int num_devices,
                   << std::endl;
         return false;
     }
-    if (device_type == DeviceType::DLA || device_type == DeviceType::DSP)
-        configuration.runFullNet = 1;
 
     // setup input
     int num_frames = is_default_input ? 3 : 1;
@@ -177,78 +185,98 @@ bool RunConfiguration(const std::string& config_file, int num_devices,
         image_file = input_file;
     }
 
-    // Determine input frame size from configuration
-    size_t frame_sz = configuration.inWidth * configuration.inHeight *
-                      configuration.inNumChannels;
-
     try
     {
         // Create a executor with the approriate core type, number of cores
         // and configuration specified
-        Executor executor(device_type, ids, configuration);
+        // DLA will run layersGroupId 1 in the network, while
+        // DSP will run layersGroupId 2 in the network
+        Executor executor_dla(DeviceType::DLA, ids, configuration, 1);
+        Executor executor_dsp(DeviceType::DSP, ids, configuration, 2);
 
         // Query Executor for set of ExecutionObjects created
-        const ExecutionObjects& execution_objects =
-                                                executor.GetExecutionObjects();
-        int num_eos = execution_objects.size();
+        const ExecutionObjects& execution_objects_dla =
+                                            executor_dla.GetExecutionObjects();
+        const ExecutionObjects& execution_objects_dsp =
+                                            executor_dsp.GetExecutionObjects();
+        int num_eos = execution_objects_dla.size();
 
         // Allocate input and output buffers for each execution object
+        // Note that "out" is both the output of eo_dla and the input of eo_dsp
+        // This is how two layersGroupIds, 1 and 2, are tied together
         std::vector<void *> buffers;
-        for (auto &eo : execution_objects)
+        for (int i = 0; i < num_eos; i++)
         {
-            ArgInfo in  = { ArgInfo(malloc(frame_sz), frame_sz)};
-            ArgInfo out = { ArgInfo(malloc(frame_sz), frame_sz)};
-            eo->SetInputOutputBuffer(in, out);
+            ExecutionObject *eo_dla = execution_objects_dla[i].get();
+            size_t in_size  = eo_dla->GetInputBufferSizeInBytes();
+            size_t out_size = eo_dla->GetOutputBufferSizeInBytes();
+            ArgInfo in  = { ArgInfo(malloc(in_size),  in_size)  };
+            ArgInfo out = { ArgInfo(malloc(out_size), out_size) };
+            eo_dla->SetInputOutputBuffer(in, out);
+
+            ExecutionObject *eo_dsp = execution_objects_dsp[i].get();
+            size_t out2_size = eo_dsp->GetOutputBufferSizeInBytes();
+            ArgInfo out2 = { ArgInfo(malloc(out2_size), out2_size) };
+            eo_dsp->SetInputOutputBuffer(out, out2);
 
             buffers.push_back(in.ptr());
             buffers.push_back(out.ptr());
+            buffers.push_back(out2.ptr());
         }
 
         #define MAX_NUM_EOS  4
-        struct timespec t0[MAX_NUM_EOS], t1;
+        struct timespec t0[MAX_NUM_EOS], t1, tloop0, tloop1;
+        clock_gettime(CLOCK_MONOTONIC, &tloop0);
 
         // Process frames with available execution objects in a pipelined manner
         // additional num_eos iterations to flush the pipeline (epilogue)
+        ExecutionObject *eo_dla, *eo_dsp, *eo_input;
         for (int frame_idx = 0;
              frame_idx < num_frames + num_eos; frame_idx++)
         {
-            ExecutionObject* eo = execution_objects[frame_idx % num_eos].get();
+            eo_dla = execution_objects_dla[frame_idx % num_eos].get();
+            eo_dsp = execution_objects_dsp[frame_idx % num_eos].get();
 
             // Wait for previous frame on the same eo to finish processing
-            if (eo->ProcessFrameWait())
+            if (eo_dsp->ProcessFrameWait())
             {
+                int finished_idx = eo_dsp->GetFrameIndex();
                 clock_gettime(CLOCK_MONOTONIC, &t1);
-                double elapsed_host =
-                                ms_diff(t0[eo->GetFrameIndex() % num_eos], t1);
-                double elapsed_device = eo->GetProcessTimeInMilliSeconds();
-                double overhead = 100 - (elapsed_device/elapsed_host*100);
-
-                std::cout << "frame[" << eo->GetFrameIndex() << "]: "
-                          << "Time on device: "
-                          << std::setw(6) << std::setprecision(4)
-                          << elapsed_device << "ms, "
-                          << "host: "
-                          << std::setw(6) << std::setprecision(4)
-                          << elapsed_host << "ms ";
-                std::cout << "API overhead: "
-                          << std::setw(6) << std::setprecision(3)
-                          << overhead << " %" << std::endl;
-
-                WriteFrameOutput(*eo, configuration);
+                ReportTime(finished_idx, "DSP",
+                           ms_diff(t0[finished_idx % num_eos], t1),
+                           eo_dsp->GetProcessTimeInMilliSeconds());
+
+                eo_input = execution_objects_dla[finished_idx % num_eos].get();
+                WriteFrameOutput(*eo_input, *eo_dsp, configuration);
             }
 
             // Read a frame and start processing it with current eo
-            if (ReadFrame(*eo, frame_idx, configuration, num_frames,
+            if (ReadFrame(*eo_dla, frame_idx, configuration, num_frames,
                           image_file, cap))
             {
                 clock_gettime(CLOCK_MONOTONIC, &t0[frame_idx % num_eos]);
-                eo->ProcessFrameStartAsync();
+                eo_dla->ProcessFrameStartAsync();
+
+                if (eo_dla->ProcessFrameWait())
+                {
+                    clock_gettime(CLOCK_MONOTONIC, &t1);
+                    ReportTime(frame_idx, "DLA",
+                               ms_diff(t0[frame_idx % num_eos], t1),
+                               eo_dla->GetProcessTimeInMilliSeconds());
+
+                    clock_gettime(CLOCK_MONOTONIC, &t0[frame_idx % num_eos]);
+                    eo_dsp->ProcessFrameStartAsync();
+                }
             }
         }
 
+        clock_gettime(CLOCK_MONOTONIC, &tloop1);
+        std::cout << "Loop total time (including read/write/print/etc): "
+                  << std::setw(6) << std::setprecision(4)
+                  << ms_diff(tloop0, tloop1) << "ms" << std::endl;
+
         for (auto b : buffers)
             free(b);
-
     }
     catch (tinn::Exception &e)
     {
@@ -259,6 +287,22 @@ bool RunConfiguration(const std::string& config_file, int num_devices,
     return status;
 }
 
+void ReportTime(int frame_index, std::string device_name, double elapsed_host,
+                double elapsed_device)  // Prints one timing line for a frame to std::cout.
+{
+    double overhead = 100 - (elapsed_device/elapsed_host*100);  // % of host time not spent on device; NOTE(review): elapsed_host == 0 is not guarded
+    std::cout << "frame[" << frame_index << "]: "
+              << "Time on " << device_name << ": "  // device_name is printed verbatim (callers in this patch pass "DLA" / "DSP")
+              << std::setw(6) << std::setprecision(4)  // setw affects only the next value; setprecision stays set on std::cout
+              << elapsed_device << "ms, "  // both times are labelled "ms" in the output
+              << "host: "
+              << std::setw(6) << std::setprecision(4)
+              << elapsed_host << "ms ";
+    std::cout << "API overhead: "
+              << std::setw(6) << std::setprecision(3)  // overhead is printed with 3 significant digits
+              << overhead << " %" << std::endl;  // std::endl also flushes std::cout
+}
+
 
 bool ReadFrame(ExecutionObject &eo, int frame_idx,
                const Configuration& configuration, int num_frames,
@@ -323,7 +367,8 @@ bool ReadFrame(ExecutionObject &eo, int frame_idx,
 }
 
 // Create frame with boxes drawn around classified objects
-bool WriteFrameOutput(const ExecutionObject &eo,
+bool WriteFrameOutput(const ExecutionObject &eo_in,
+                      const ExecutionObject &eo_out,
                       const Configuration& configuration)
 {
     // Asseembly original frame
@@ -332,13 +377,13 @@ bool WriteFrameOutput(const ExecutionObject &eo,
     int channel_size = width * height;
     Mat frame, r_frame, bgr[3];
 
-    unsigned char *in = (unsigned char *) eo.GetInputBufferPtr();
+    unsigned char *in = (unsigned char *) eo_in.GetInputBufferPtr();
     bgr[0] = Mat(height, width, CV_8UC(1), in);
     bgr[1] = Mat(height, width, CV_8UC(1), in + channel_size);
     bgr[2] = Mat(height, width, CV_8UC(1), in + channel_size*2);
     cv::merge(bgr, 3, frame);
 
-    int frame_index = eo.GetFrameIndex();
+    int frame_index = eo_in.GetFrameIndex();
     char outfile_name[64];
     if (! is_camera_input && is_preprocessed_input)
     {
@@ -348,8 +393,8 @@ bool WriteFrameOutput(const ExecutionObject &eo,
     }
 
     // Draw boxes around classified objects
-    float *out = (float *) eo.GetOutputBufferPtr();
-    int num_floats = eo.GetOutputBufferSizeInBytes() / sizeof(float);
+    float *out = (float *) eo_out.GetOutputBufferPtr();
+    int num_floats = eo_out.GetOutputBufferSizeInBytes() / sizeof(float);
     for (int i = 0; i < num_floats / 7; i++)
     {
         int index = (int)    out[i * 7 + 0];
@@ -372,8 +417,9 @@ bool WriteFrameOutput(const ExecutionObject &eo,
 #endif
 
         cv::rectangle(frame, Point(xmin, ymin), Point(xmax, ymax),
-                      Scalar(object_class->color[0], object_class->color[1],
-                             object_class->color[2]), 2);
+                      Scalar(object_class->color.blue,
+                             object_class->color.green,
+                             object_class->color.red), 2);
     }
 
     // output
@@ -396,14 +442,13 @@ bool WriteFrameOutput(const ExecutionObject &eo,
 
 
 void ProcessArgs(int argc, char *argv[], std::string& config,
-                 int& num_devices, DeviceType& device_type,
+                 uint32_t& num_devices, DeviceType& device_type,
                  std::string& input_file)
 {
     const struct option long_options[] =
     {
         {"config",      required_argument, 0, 'c'},
         {"num_devices", required_argument, 0, 'n'},
-        {"device_type", required_argument, 0, 't'},
         {"image_file",  required_argument, 0, 'i'},
         {"help",        no_argument,       0, 'h'},
         {"verbose",     no_argument,       0, 'v'},
@@ -414,7 +459,7 @@ void ProcessArgs(int argc, char *argv[], std::string& config,
 
     while (true)
     {
-        int c = getopt_long(argc, argv, "c:n:t:i:hv", long_options, &option_index);
+        int c = getopt_long(argc, argv, "c:n:i:hv", long_options, &option_index);
 
         if (c == -1)
             break;
@@ -428,21 +473,6 @@ void ProcessArgs(int argc, char *argv[], std::string& config,
                       assert (num_devices > 0 && num_devices <= 4);
                       break;
 
-            case 't': if (*optarg == 'e')
-                          device_type = DeviceType::DLA;
-#if 0
-                      else if (*optarg == 'd')
-                          device_type = DeviceType::DSP;
-#endif
-                      else
-                      {
-                          //std::cerr << "Invalid argument to -t, only e or d"
-                          std::cerr << "Invalid argument to -t, only e"
-                                       " allowed" << std::endl;
-                          exit(EXIT_FAILURE);
-                      }
-                      break;
-
             case 'i': input_file = optarg;
                       break;
 
@@ -467,14 +497,16 @@ void ProcessArgs(int argc, char *argv[], std::string& config,
 void DisplayHelp()
 {
     std::cout << "Usage: ssd_multibox\n"
-                 "  Will run ssd_multibox network to perform multi-objects"
-                 " classification.\n  Use -c to run a different"
-                 "  segmentation network. Default is jdetnet.\n"
+                 "  Will run partitioned ssd_multibox network to perform "
+                 "multi-objects detection\n"
+                 "  and classification.  First part of network "
+                 "(layersGroupId 1) runs on DLA,\n"
+                 "  second part (layersGroupId 2) runs on DSP.\n"
+                 "  Use -c to run a different segmentation network. "
+                 "Default is jdetnet.\n"
                  "Optional arguments:\n"
-                 " -c <config>          Valid configs: jdetnet, jdetnet_512x256\n"
+                 " -c <config>          Valid configs: jdetnet \n"
                  " -n <number of cores> Number of cores to use (1 - 4)\n"
-                 " -t <d|e>             Type of core. d -> DSP, e -> DLA\n"
-                 "                      Only support DLA for now\n"
                  " -i <image>           Path to the image file\n"
                  "                      Default is 1 frame in testvecs\n"
                  " -i camera            Use camera as input\n"