index 8f9b6e068473781b09409f835a3234bb889ccd48..c3d9c8e7036de5bc41935d3101509cc0422174b7 100644 (file)
int orig_height;
object_class_table_t *object_class_table;
-using namespace tinn;
+using namespace tidl;
using namespace cv;
-bool RunConfiguration(const std::string& config_file, int num_devices,
+bool RunConfiguration(const std::string& config_file, uint32_t num_devices,
DeviceType device_type, std::string& input_file);
-bool RunAllConfigurations(int32_t num_devices, DeviceType device_type);
-
bool ReadFrame(ExecutionObject& eo, int frame_idx,
const Configuration& configuration, int num_frames,
std::string& image_file, VideoCapture &cap);
-bool WriteFrameOutput(const ExecutionObject &eo,
+bool WriteFrameOutput(const ExecutionObject &eo_in,
+ const ExecutionObject &eo_out,
const Configuration& configuration);
+void ReportTime(int frame_index, std::string device_name, double elapsed_host,
+ double elapsed_device);
+
static void ProcessArgs(int argc, char *argv[],
std::string& config,
- int& num_devices,
+ uint32_t& num_devices,
DeviceType& device_type,
std::string& input_file);
// If there are no devices capable of offloading TIDL on the SoC, exit
uint32_t num_dla = Executor::GetNumDevices(DeviceType::DLA);
uint32_t num_dsp = Executor::GetNumDevices(DeviceType::DSP);
- if (num_dla == 0 && num_dsp == 0)
+ if (num_dla == 0 || num_dsp == 0)
{
- std::cout << "TI DL not supported on this SoC." << std::endl;
+ std::cout << "ssd_multibox requires both DLA and DSP for execution."
+ << std::endl;
return EXIT_SUCCESS;
}
// Process arguments
std::string config = DEFAULT_CONFIG;
std::string input_file = DEFAULT_INPUT;
- int num_devices = 1;
+ uint32_t num_devices = 1;
DeviceType device_type = DeviceType::DLA;
ProcessArgs(argc, argv, config, num_devices, device_type, input_file);
+ // Use same number of DLAs and DSPs
+ num_devices = std::min(num_devices, std::min(num_dla, num_dsp));
+ if (num_devices == 0)
+ {
+ std::cout << "Partitioned execution requires at least 1 DLA and 1 DSP."
+ << std::endl;
+ return EXIT_FAILURE;
+ }
if ((object_class_table = GetObjectClassTable(config)) == nullptr)
{
std::cout << "No object classes defined for this config." << std::endl;
return EXIT_SUCCESS;
}
-bool RunConfiguration(const std::string& config_file, int num_devices,
+bool RunConfiguration(const std::string& config_file, uint32_t num_devices,
DeviceType device_type, std::string& input_file)
{
DeviceIds ids;
<< std::endl;
return false;
}
- if (device_type == DeviceType::DLA || device_type == DeviceType::DSP)
- configuration.runFullNet = 1;
// setup input
int num_frames = is_default_input ? 3 : 1;
image_file = input_file;
}
- // Determine input frame size from configuration
- size_t frame_sz = configuration.inWidth * configuration.inHeight *
- configuration.inNumChannels;
-
try
{
        // Create an executor with the appropriate core type, number of cores
// and configuration specified
- Executor executor(device_type, ids, configuration);
+ // DLA will run layersGroupId 1 in the network, while
+ // DSP will run layersGroupId 2 in the network
+ Executor executor_dla(DeviceType::DLA, ids, configuration, 1);
+ Executor executor_dsp(DeviceType::DSP, ids, configuration, 2);
// Query Executor for set of ExecutionObjects created
- const ExecutionObjects& execution_objects =
- executor.GetExecutionObjects();
- int num_eos = execution_objects.size();
+ const ExecutionObjects& execution_objects_dla =
+ executor_dla.GetExecutionObjects();
+ const ExecutionObjects& execution_objects_dsp =
+ executor_dsp.GetExecutionObjects();
+ int num_eos = execution_objects_dla.size();
// Allocate input and output buffers for each execution object
+ // Note that "out" is both the output of eo_dla and the input of eo_dsp
+ // This is how two layersGroupIds, 1 and 2, are tied together
std::vector<void *> buffers;
- for (auto &eo : execution_objects)
+ for (int i = 0; i < num_eos; i++)
{
- ArgInfo in = { ArgInfo(malloc(frame_sz), frame_sz)};
- ArgInfo out = { ArgInfo(malloc(frame_sz), frame_sz)};
- eo->SetInputOutputBuffer(in, out);
+ ExecutionObject *eo_dla = execution_objects_dla[i].get();
+ size_t in_size = eo_dla->GetInputBufferSizeInBytes();
+ size_t out_size = eo_dla->GetOutputBufferSizeInBytes();
+ ArgInfo in = { ArgInfo(malloc(in_size), in_size) };
+ ArgInfo out = { ArgInfo(malloc(out_size), out_size) };
+ eo_dla->SetInputOutputBuffer(in, out);
+
+ ExecutionObject *eo_dsp = execution_objects_dsp[i].get();
+ size_t out2_size = eo_dsp->GetOutputBufferSizeInBytes();
+ ArgInfo out2 = { ArgInfo(malloc(out2_size), out2_size) };
+ eo_dsp->SetInputOutputBuffer(out, out2);
buffers.push_back(in.ptr());
buffers.push_back(out.ptr());
+ buffers.push_back(out2.ptr());
}
#define MAX_NUM_EOS 4
- struct timespec t0[MAX_NUM_EOS], t1;
+ struct timespec t0[MAX_NUM_EOS], t1, tloop0, tloop1;
+ clock_gettime(CLOCK_MONOTONIC, &tloop0);
// Process frames with available execution objects in a pipelined manner
// additional num_eos iterations to flush the pipeline (epilogue)
+ ExecutionObject *eo_dla, *eo_dsp, *eo_input;
for (int frame_idx = 0;
frame_idx < num_frames + num_eos; frame_idx++)
{
- ExecutionObject* eo = execution_objects[frame_idx % num_eos].get();
+ eo_dla = execution_objects_dla[frame_idx % num_eos].get();
+ eo_dsp = execution_objects_dsp[frame_idx % num_eos].get();
// Wait for previous frame on the same eo to finish processing
- if (eo->ProcessFrameWait())
+ if (eo_dsp->ProcessFrameWait())
{
+ int finished_idx = eo_dsp->GetFrameIndex();
clock_gettime(CLOCK_MONOTONIC, &t1);
- double elapsed_host =
- ms_diff(t0[eo->GetFrameIndex() % num_eos], t1);
- double elapsed_device = eo->GetProcessTimeInMilliSeconds();
- double overhead = 100 - (elapsed_device/elapsed_host*100);
-
- std::cout << "frame[" << eo->GetFrameIndex() << "]: "
- << "Time on device: "
- << std::setw(6) << std::setprecision(4)
- << elapsed_device << "ms, "
- << "host: "
- << std::setw(6) << std::setprecision(4)
- << elapsed_host << "ms ";
- std::cout << "API overhead: "
- << std::setw(6) << std::setprecision(3)
- << overhead << " %" << std::endl;
-
- WriteFrameOutput(*eo, configuration);
+ ReportTime(finished_idx, "DSP",
+ ms_diff(t0[finished_idx % num_eos], t1),
+ eo_dsp->GetProcessTimeInMilliSeconds());
+
+ eo_input = execution_objects_dla[finished_idx % num_eos].get();
+ WriteFrameOutput(*eo_input, *eo_dsp, configuration);
}
// Read a frame and start processing it with current eo
- if (ReadFrame(*eo, frame_idx, configuration, num_frames,
+ if (ReadFrame(*eo_dla, frame_idx, configuration, num_frames,
image_file, cap))
{
clock_gettime(CLOCK_MONOTONIC, &t0[frame_idx % num_eos]);
- eo->ProcessFrameStartAsync();
+ eo_dla->ProcessFrameStartAsync();
+
+ if (eo_dla->ProcessFrameWait())
+ {
+ clock_gettime(CLOCK_MONOTONIC, &t1);
+ ReportTime(frame_idx, "DLA",
+ ms_diff(t0[frame_idx % num_eos], t1),
+ eo_dla->GetProcessTimeInMilliSeconds());
+
+ clock_gettime(CLOCK_MONOTONIC, &t0[frame_idx % num_eos]);
+ eo_dsp->ProcessFrameStartAsync();
+ }
}
}
+ clock_gettime(CLOCK_MONOTONIC, &tloop1);
+ std::cout << "Loop total time (including read/write/print/etc): "
+ << std::setw(6) << std::setprecision(4)
+ << ms_diff(tloop0, tloop1) << "ms" << std::endl;
+
for (auto b : buffers)
free(b);
-
}
- catch (tinn::Exception &e)
+ catch (tidl::Exception &e)
{
std::cerr << e.what() << std::endl;
status = false;
return status;
}
+void ReportTime(int frame_index, std::string device_name, double elapsed_host,
+ double elapsed_device)
+{
+ double overhead = 100 - (elapsed_device/elapsed_host*100);
+ std::cout << "frame[" << frame_index << "]: "
+ << "Time on " << device_name << ": "
+ << std::setw(6) << std::setprecision(4)
+ << elapsed_device << "ms, "
+ << "host: "
+ << std::setw(6) << std::setprecision(4)
+ << elapsed_host << "ms ";
+ std::cout << "API overhead: "
+ << std::setw(6) << std::setprecision(3)
+ << overhead << " %" << std::endl;
+}
+
bool ReadFrame(ExecutionObject &eo, int frame_idx,
const Configuration& configuration, int num_frames,
}
// Create frame with boxes drawn around classified objects
-bool WriteFrameOutput(const ExecutionObject &eo,
+bool WriteFrameOutput(const ExecutionObject &eo_in,
+ const ExecutionObject &eo_out,
const Configuration& configuration)
{
    // Assemble original frame
int channel_size = width * height;
Mat frame, r_frame, bgr[3];
- unsigned char *in = (unsigned char *) eo.GetInputBufferPtr();
+ unsigned char *in = (unsigned char *) eo_in.GetInputBufferPtr();
bgr[0] = Mat(height, width, CV_8UC(1), in);
bgr[1] = Mat(height, width, CV_8UC(1), in + channel_size);
bgr[2] = Mat(height, width, CV_8UC(1), in + channel_size*2);
cv::merge(bgr, 3, frame);
- int frame_index = eo.GetFrameIndex();
+ int frame_index = eo_in.GetFrameIndex();
char outfile_name[64];
if (! is_camera_input && is_preprocessed_input)
{
}
// Draw boxes around classified objects
- float *out = (float *) eo.GetOutputBufferPtr();
- int num_floats = eo.GetOutputBufferSizeInBytes() / sizeof(float);
+ float *out = (float *) eo_out.GetOutputBufferPtr();
+ int num_floats = eo_out.GetOutputBufferSizeInBytes() / sizeof(float);
for (int i = 0; i < num_floats / 7; i++)
{
int index = (int) out[i * 7 + 0];
#endif
cv::rectangle(frame, Point(xmin, ymin), Point(xmax, ymax),
- Scalar(object_class->color[0], object_class->color[1],
- object_class->color[2]), 2);
+ Scalar(object_class->color.blue,
+ object_class->color.green,
+ object_class->color.red), 2);
}
// output
void ProcessArgs(int argc, char *argv[], std::string& config,
- int& num_devices, DeviceType& device_type,
+ uint32_t& num_devices, DeviceType& device_type,
std::string& input_file)
{
const struct option long_options[] =
{
{"config", required_argument, 0, 'c'},
{"num_devices", required_argument, 0, 'n'},
- {"device_type", required_argument, 0, 't'},
{"image_file", required_argument, 0, 'i'},
{"help", no_argument, 0, 'h'},
{"verbose", no_argument, 0, 'v'},
while (true)
{
- int c = getopt_long(argc, argv, "c:n:t:i:hv", long_options, &option_index);
+ int c = getopt_long(argc, argv, "c:n:i:hv", long_options, &option_index);
if (c == -1)
break;
assert (num_devices > 0 && num_devices <= 4);
break;
- case 't': if (*optarg == 'e')
- device_type = DeviceType::DLA;
-#if 0
- else if (*optarg == 'd')
- device_type = DeviceType::DSP;
-#endif
- else
- {
- //std::cerr << "Invalid argument to -t, only e or d"
- std::cerr << "Invalid argument to -t, only e"
- " allowed" << std::endl;
- exit(EXIT_FAILURE);
- }
- break;
-
case 'i': input_file = optarg;
break;
void DisplayHelp()
{
std::cout << "Usage: ssd_multibox\n"
- " Will run ssd_multibox network to perform multi-objects"
- " classification.\n Use -c to run a different"
- " segmentation network. Default is jdetnet.\n"
+ " Will run partitioned ssd_multibox network to perform "
+ "multi-objects detection\n"
+ " and classification. First part of network "
+ "(layersGroupId 1) runs on DLA,\n"
+ " second part (layersGroupId 2) runs on DSP.\n"
+ " Use -c to run a different segmentation network. "
+ "Default is jdetnet.\n"
"Optional arguments:\n"
- " -c <config> Valid configs: jdetnet, jdetnet_512x256\n"
+ " -c <config> Valid configs: jdetnet \n"
" -n <number of cores> Number of cores to use (1 - 4)\n"
- " -t <d|e> Type of core. d -> DSP, e -> DLA\n"
- " Only support DLA for now\n"
" -i <image> Path to the image file\n"
" Default is 1 frame in testvecs\n"
" -i camera Use camera as input\n"