summary | shortlog | log | commit | commitdiff | tree
raw | patch | inline | side by side (from parent 1: 27db5c3)
author    Yuan Zhao <yuanzhao@ti.com>
          Fri, 11 May 2018 13:32:23 +0000 (08:32 -0500)
committer Yuan Zhao <yuanzhao@ti.com>
          Fri, 11 May 2018 16:46:13 +0000 (11:46 -0500)
- Only support partitioned mode, remove single device mode
- Remove enableInternalInput mode due to no observable performance gain
- Make layersGroupId assignment part of Executor's construction
- MCT-974
index ad26ba7820ef4196270818fa9c8391a7d44f8467..a44f8cd1e8d7aedd1a605fd4cbfb6cba7ff27476 100644 (file)
bool is_default_input = false;
bool is_preprocessed_input = false;
bool is_camera_input = false;
-bool is_partitioned = true;
int orig_width;
int orig_height;
object_class_table_t *object_class_table;
DeviceType device_type = DeviceType::DLA;
ProcessArgs(argc, argv, config, num_devices, device_type, input_file);
- if (is_partitioned)
- num_devices = std::min(num_devices, std::min(num_dla, num_dsp));
+ // Use same number of DLAs and DSPs
+ num_devices = std::min(num_devices, std::min(num_dla, num_dsp));
if (num_devices == 0)
{
std::cout << "Partitioned execution requires at least 1 DLA and 1 DSP."
<< std::endl;
return false;
}
- configuration.runFullNet = is_partitioned ? 0 : 1;
// setup input
int num_frames = is_default_input ? 3 : 1;
{
// Create an executor with the appropriate core type, number of cores
// and configuration specified
- configuration.layersGroupId = 1;
- Executor *executor_1 = new Executor(device_type, ids, configuration);
- Executor *executor_2 = nullptr;
- if (is_partitioned)
- {
- configuration.layersGroupId = 2;
- configuration.enableInternalInput = 1; // 0 is also valid
- executor_2 = new Executor(DeviceType::DSP, ids, configuration);
- }
+ // DLA will run layersGroupId 1 in the network, while
+ // DSP will run layersGroupId 2 in the network
+ Executor executor_dla(DeviceType::DLA, ids, configuration, 1);
+ Executor executor_dsp(DeviceType::DSP, ids, configuration, 2);
// Query Executor for set of ExecutionObjects created
- const ExecutionObjects *execution_objects_1, *execution_objects_2;
- execution_objects_1 = & executor_1->GetExecutionObjects();
- int num_eos = execution_objects_1->size();
- if (is_partitioned)
- execution_objects_2 = & executor_2->GetExecutionObjects();
+ const ExecutionObjects& execution_objects_dla =
+ executor_dla.GetExecutionObjects();
+ const ExecutionObjects& execution_objects_dsp =
+ executor_dsp.GetExecutionObjects();
+ int num_eos = execution_objects_dla.size();
// Allocate input and output buffers for each execution object
+ // Note that "out" is both the output of eo_dla and the input of eo_dsp
+ // This is how two layersGroupIds, 1 and 2, are tied together
std::vector<void *> buffers;
for (int i = 0; i < num_eos; i++)
{
- ExecutionObject *eo1 = execution_objects_1->at(i).get();
- size_t in_size = eo1->GetInputBufferSizeInBytes();
- size_t out_size = eo1->GetOutputBufferSizeInBytes();
- ArgInfo in = { ArgInfo(malloc(in_size), in_size)};
- ArgInfo out = { ArgInfo(nullptr, 0) };
- if (configuration.enableInternalInput == 0)
- out = ArgInfo(malloc(out_size), out_size);
- eo1->SetInputOutputBuffer(in, out);
+ ExecutionObject *eo_dla = execution_objects_dla[i].get();
+ size_t in_size = eo_dla->GetInputBufferSizeInBytes();
+ size_t out_size = eo_dla->GetOutputBufferSizeInBytes();
+ ArgInfo in = { ArgInfo(malloc(in_size), in_size) };
+ ArgInfo out = { ArgInfo(malloc(out_size), out_size) };
+ eo_dla->SetInputOutputBuffer(in, out);
+
+ ExecutionObject *eo_dsp = execution_objects_dsp[i].get();
+ size_t out2_size = eo_dsp->GetOutputBufferSizeInBytes();
+ ArgInfo out2 = { ArgInfo(malloc(out2_size), out2_size) };
+ eo_dsp->SetInputOutputBuffer(out, out2);
buffers.push_back(in.ptr());
buffers.push_back(out.ptr());
-
- if (is_partitioned)
- {
- ExecutionObject *eo2 = execution_objects_2->at(i).get();
- size_t out2_size = eo2->GetOutputBufferSizeInBytes();
- ArgInfo out2 = { ArgInfo(malloc(out2_size), out2_size) };
- eo2->SetInputOutputBuffer(out, out2);
- buffers.push_back(out2.ptr());
- }
+ buffers.push_back(out2.ptr());
}
#define MAX_NUM_EOS 4
// Process frames with available execution objects in a pipelined manner
// additional num_eos iterations to flush the pipeline (epilogue)
- ExecutionObject *eo1, *eo2, *eo_wait, *eo_input;
+ ExecutionObject *eo_dla, *eo_dsp, *eo_input;
for (int frame_idx = 0;
frame_idx < num_frames + num_eos; frame_idx++)
{
- eo1 = execution_objects_1->at(frame_idx % num_eos).get();
- eo_wait = eo1;
- if (is_partitioned)
- {
- eo2 = execution_objects_2->at(frame_idx % num_eos).get();
- eo_wait = eo2;
- }
+ eo_dla = execution_objects_dla[frame_idx % num_eos].get();
+ eo_dsp = execution_objects_dsp[frame_idx % num_eos].get();
// Wait for previous frame on the same eo to finish processing
- if (eo_wait->ProcessFrameWait())
+ if (eo_dsp->ProcessFrameWait())
{
- int finished_idx = eo_wait->GetFrameIndex();
+ int finished_idx = eo_dsp->GetFrameIndex();
clock_gettime(CLOCK_MONOTONIC, &t1);
- ReportTime(finished_idx,
- (is_partitioned || device_type == DeviceType::DSP) ?
- "DSP" : "DLA",
+ ReportTime(finished_idx, "DSP",
ms_diff(t0[finished_idx % num_eos], t1),
- eo_wait->GetProcessTimeInMilliSeconds());
+ eo_dsp->GetProcessTimeInMilliSeconds());
- eo_input = execution_objects_1->at(finished_idx %num_eos).get();
- WriteFrameOutput(*eo_input, *eo_wait, configuration);
+ eo_input = execution_objects_dla[finished_idx % num_eos].get();
+ WriteFrameOutput(*eo_input, *eo_dsp, configuration);
}
// Read a frame and start processing it with current eo
- if (ReadFrame(*eo1, frame_idx, configuration, num_frames,
+ if (ReadFrame(*eo_dla, frame_idx, configuration, num_frames,
image_file, cap))
{
clock_gettime(CLOCK_MONOTONIC, &t0[frame_idx % num_eos]);
- eo1->ProcessFrameStartAsync();
+ eo_dla->ProcessFrameStartAsync();
- if (is_partitioned && eo1->ProcessFrameWait())
+ if (eo_dla->ProcessFrameWait())
{
clock_gettime(CLOCK_MONOTONIC, &t1);
ReportTime(frame_idx, "DLA",
- ms_diff(t0[frame_idx % num_eos], t1),
- eo1->GetProcessTimeInMilliSeconds());
+ ms_diff(t0[frame_idx % num_eos], t1),
+ eo_dla->GetProcessTimeInMilliSeconds());
clock_gettime(CLOCK_MONOTONIC, &t0[frame_idx % num_eos]);
- eo2->ProcessFrameStartAsync();
+ eo_dsp->ProcessFrameStartAsync();
}
}
}
<< std::setw(6) << std::setprecision(4)
<< ms_diff(tloop0, tloop1) << "ms" << std::endl;
- delete executor_1;
- delete executor_2;
for (auto b : buffers)
free(b);
}
{
{"config", required_argument, 0, 'c'},
{"num_devices", required_argument, 0, 'n'},
- {"device_type", required_argument, 0, 't'},
{"image_file", required_argument, 0, 'i'},
{"help", no_argument, 0, 'h'},
{"verbose", no_argument, 0, 'v'},
while (true)
{
- int c = getopt_long(argc, argv, "c:n:t:i:hv", long_options, &option_index);
+ int c = getopt_long(argc, argv, "c:n:i:hv", long_options, &option_index);
if (c == -1)
break;
assert (num_devices > 0 && num_devices <= 4);
break;
- case 't': if (*optarg == 'e')
- {
- device_type = DeviceType::DLA;
- is_partitioned = false;
- }
-#if 0
- else if (*optarg == 'd')
- {
- device_type = DeviceType::DSP;
- is_partitioned = false;
- }
-#endif
- else
- {
- //std::cerr << "Invalid argument to -t, only e or d"
- std::cerr << "Invalid argument to -t, only e"
- " allowed" << std::endl;
- exit(EXIT_FAILURE);
- }
- break;
-
case 'i': input_file = optarg;
break;
void DisplayHelp()
{
std::cout << "Usage: ssd_multibox\n"
- " Will run ssd_multibox network to perform multi-objects"
- " classification.\n Use -c to run a different"
- " segmentation network. Default is jdetnet.\n"
+ " Will run partitioned ssd_multibox network to perform "
+ "multi-objects detection\n"
+ " and classification. First part of network "
+ "(layersGroupId 1) runs on DLA,\n"
+ " second part (layersGroupId 2) runs on DSP.\n"
+ " Use -c to run a different segmentation network. "
+ "Default is jdetnet.\n"
"Optional arguments:\n"
- " -c <config> Valid configs: jdetnet, jdetnet_512x256\n"
+ " -c <config> Valid configs: jdetnet \n"
" -n <number of cores> Number of cores to use (1 - 4)\n"
- " -t <d|e> Type of core. d -> DSP, e -> DLA\n"
- " DSP not supported at this time\n"
- " Default to partitioned execution: \n"
- " part 1 on DLA, part 2 on DSP\n"
" -i <image> Path to the image file\n"
" Default is 1 frame in testvecs\n"
" -i camera Use camera as input\n"
index 11d19c6e01dc89ac4375d947de4e948c0505c582..088a6291a413af379087acc12d7005826b293128 100644 (file)
//! Specific to each network, can take values from 0 to 4, default is 0
int preProcType;
- //! layersGroupId in the network that the executor should work on
- int layersGroupId;
-
//! Force to run all layers, regardless of layersGroupId partitioning
int runFullNet;
index 8a4eb62a4786fad84a9e6eb6ae14e3aca00a57d2..c334ac98c98c8caade37a53e65d6b4ac055b2276 100644 (file)
const ArgInfo& create_arg,
const ArgInfo& param_heap_arg,
size_t extmem_heap_size,
- uint32_t internal_input);
+ bool internal_input);
//! @private
~ExecutionObject();
index f87c6d51b10005862ca0aae51cf659b9d4431d43..05e9cc04923a7d749dc201944de856c884baa241 100644 (file)
--- a/tinn_api/inc/executor.h
+++ b/tinn_api/inc/executor.h
//! @param device_type DSP or EVE/DLA device
//! @param ids Set of devices uses by this instance of the Executor
//! @param configuration Configuration used to initialize the Executor
+ //! @param layers_group_id Layers group that this Executor should run
Executor(DeviceType device_type, const DeviceIds& ids,
- const Configuration& configuration);
+ const Configuration& configuration,
+ int layers_group_id = OCL_TIDL_DEFAULT_LAYERS_GROUP_ID);
//! @brief Tear down an Executor and free resources used by the
//! Executor object
index b993b1a0aa597449ac826365f46dc86851354fad..70fd136b65b3bd43d2aa3f83b37eef712c46ed2f 100644 (file)
inNumChannels(0),
noZeroCoeffsPercentage(100),
preProcType(0),
- layersGroupId(tinn::internal::CURR_LAYERS_GROUP_ID),
runFullNet(0),
enableInternalInput(0),
EXTMEM_HEAP_SIZE(64 << 20), // 64MB for inceptionNetv1
<< "\nFrame= " << numFrames << " " << inWidth << "x"
<< inHeight << "x" << inNumChannels
<< "\nPreProcType " << preProcType
- << "\nLayersGroupId " << layersGroupId
<< "\nRunFullNet " << runFullNet
<< "\nEnableInternalInput " << enableInternalInput
<< "\nInputFile " << inData
index c74389ff8cd24dfe7ae874aec994041abbf97ac3..dbdb90293206cf804b7fc9458d1cae596c4d444a 100644 (file)
const ArgInfo& create_arg,
const ArgInfo& param_heap_arg,
size_t extmem_heap_size,
- uint32_t internal_input);
+ bool internal_input);
~Impl() {}
bool RunAsync(CallType ct);
const ArgInfo& create_arg,
const ArgInfo& param_heap_arg,
size_t extmem_heap_size,
- uint32_t internal_input)
+ bool internal_input)
{
pimpl_m = std::unique_ptr<ExecutionObject::Impl>
{ new ExecutionObject::Impl(d, device_index,
const ArgInfo& create_arg,
const ArgInfo& param_heap_arg,
size_t extmem_heap_size,
- uint32_t internal_input):
+ bool internal_input):
device_m(d),
k_initialize_m(nullptr),
k_process_m(nullptr),
shared_initialize_params_m->l2HeapSize = tinn::internal::DMEM1_SIZE;
shared_initialize_params_m->l1HeapSize = tinn::internal::DMEM0_SIZE;
shared_initialize_params_m->enableTrace = OCL_TIDL_TRACE_OFF;
- shared_initialize_params_m->enableInternalInput = internal_input;
+ shared_initialize_params_m->enableInternalInput = internal_input ? 1 : 0;
// Setup kernel arguments for initialize
KernelArgs args = { create_arg,
index 09907315aca4f544ddb665e5483f5251ada255c3..e16f6d0eca7132a9828b13d9c2deeabef825ca23 100644 (file)
using std::unique_ptr;
Executor::Executor(DeviceType core_type, const DeviceIds& ids,
- const Configuration& configuration)
+ const Configuration& configuration, int layers_group_id)
{
pimpl_m = unique_ptr<ExecutorImpl>
- { new ExecutorImpl(core_type, ids) };
+ { new ExecutorImpl(core_type, ids, layers_group_id) };
pimpl_m->Initialize(configuration);
}
}
-ExecutorImpl::ExecutorImpl(DeviceType core_type, const DeviceIds& ids):
+ExecutorImpl::ExecutorImpl(DeviceType core_type, const DeviceIds& ids,
+ int layers_group_id):
configuration_m(),
shared_networkparam_heap_m(nullptr, &__free_ddr),
device_ids_m(ids),
- core_type_m(core_type)
+ core_type_m(core_type),
+ layers_group_id_m(layers_group_id)
{
std::string name;
if (core_type_m == DeviceType::DSP)
{
for (int i = 0; i < net->numLayers; i++)
if (net->TIDLLayers[i].layerType != TIDL_DataLayer)
- net->TIDLLayers[i].layersGroupId = configuration.layersGroupId;
+ net->TIDLLayers[i].layersGroupId = layers_group_id_m;
}
// Call a setup kernel to allocate and fill network parameters
void ExecutorImpl::InitializeNetworkCreateParam(TIDL_CreateParams *CP,
const Configuration& configuration)
{
- CP->currCoreId = configuration.layersGroupId;
- CP->currLayersGroupId = configuration.layersGroupId;
+ CP->currCoreId = layers_group_id_m;
+ CP->currLayersGroupId = layers_group_id_m;
CP->l1MemSize = tinn::internal::DMEM0_SIZE;
CP->l2MemSize = tinn::internal::DMEM1_SIZE;
CP->l3MemSize = tinn::internal::OCMC_SIZE;
index b04c156b53fce24d762b96d664465dd85cc899a9..704b1f7162bbd6416e4d8b04b6b983d322981fc7 100644 (file)
class ExecutorImpl
{
public:
- ExecutorImpl(DeviceType core_type, const DeviceIds& ids);
+ ExecutorImpl(DeviceType core_type, const DeviceIds& ids,
+ int layersGroupId);
~ExecutorImpl() { Cleanup(); }
bool Initialize(const Configuration& configuration);
up_malloc_ddr<char> shared_networkparam_heap_m;
DeviceIds device_ids_m;
DeviceType core_type_m;
+ int layers_group_id_m;
};
} // namespace tinn