summary | shortlog | log | commit | commitdiff | tree
raw | patch | inline | side by side (parent: 9835b57)
raw | patch | inline | side by side (parent: 9835b57)
author | Yuan Zhao <yuanzhao@ti.com> | |
Wed, 26 Jun 2019 14:36:54 +0000 (09:36 -0500) | ||
committer | Yuan Zhao <yuanzhao@ti.com> | |
Fri, 28 Jun 2019 21:20:51 +0000 (16:20 -0500) |
- To demonstrate running jdenet/jdetnet_voc on a single core,
without partitioning the network. This is useful for situations
where SoC only has C66x cores but not EVE cores.
- MCT-1202
without partitioning the network. This is useful for situations
where SoC only has C66x cores but not EVE cores.
- MCT-1202
index e69b6e44dbf1508cd5d1f76a9bc73280f16c5b4e..532a436c6dc7d23156d351f128d68e0b970476f9 100644 (file)
Enable a two phase approach to generating execution graphs. Use the
following API function to enable timestamp generation:
- .. code::
+ .. code:: cpp
bool EnableTimeStamps(const std::string& file = "timestamp.log", size_t num_frames=32);
#. Added Python 3 bindings for TIDL API. See the ``examples/pybind`` directory for examples of using the Python bindings. Set PYTHONPATH to the location of ``tidl.so``.
- .. code::
+ .. code:: bash
root@am57xx-evm:~# export PYTHONPATH=/home/root/tidl-api/tidl_api
root@am57xx-evm:~# python3
index ff70804267e5179aab6316088b66020e00a8a4a5..9c696ad8b6cc0f3bf301cc754c7f0747ec640db9 100644 (file)
--- a/docs/source/example.rst
+++ b/docs/source/example.rst
- OpenCV used to read input image from file or capture from camera.
* - ssd_multibox
- Object detection
- - EVE and C66x (network is split across both EVE and C66x)
+ - EVE and C66x (network is split across both EVE and C66x), EVE or C66x (full network on each core)
- OpenCV used to read input image from file or capture from camera.
* - mnist
- handwritten digits recognition (MNIST). This example illustrates
low TIDL API overhead (~1.8%) for small networks with low compute
requirements (<5ms).
- - EVE
+ - EVE or C66x
- Pre-processed white-on-black images read from file, with or without
MNIST database file headers.
* - classification
- 1630 ms
- 1408 ms
+When there is a requirement to run the SSD networks non-partitioned,
+for example, when the SoC only has C66x cores but not EVE cores,
+use ``-e 0`` to run the full network only on C66x cores, without partitioning.
+
.. _mnist-example:
MNIST
index ee97e58eb00d5b66cc7c3afda189aafee94934a6..d7c8e1f89c2a00da0f7fdee2cbf5bf833e9582e3 100644 (file)
Application execution fails with the following error message:
-.. code:: shell
+.. code:: console
tidl: device_alloc.h:31: T* tidl::malloc_ddr(size_t) [with T = char; size_t = unsigned int]: Assertion `val != nullptr' failed
+++++++++++++++++++++++++++++++++
Another possible reason is that total memory requirement specified in the ``Configuration`` using NETWORK_HEAP_SIZE and PARAM_HEAP_SIZE exceeds default memory available for OpenCL. Follow the instructions below to increase the amount of CMEM (contiguous memory available for OpenCL) from 192MB (0xc000000) to 384MB (0x18000000):
-.. code:: bash
+.. code:: diff
$ sudo apt-get install device-tree-compiler # In case dtc is not already installed
$ scp root@am57:/boot/am57xx-evm-reva3.dtb .
index 315834501227ccf166ce54cb29b96136d98fa6ea..93f4c89ab5b3aa94d4e36def357d3a7f4dbe1123 100644 (file)
// If there are no devices capable of offloading TIDL on the SoC, exit
uint32_t num_eves = Executor::GetNumDevices(DeviceType::EVE);
uint32_t num_dsps = Executor::GetNumDevices(DeviceType::DSP);
- if (num_eves == 0 || num_dsps == 0)
+ if (num_eves == 0 && num_dsps == 0)
{
- cout << "ssd_multibox requires both EVE and DSP for execution." << endl;
+ cout << "ssd_multibox requires EVE or DSP for execution." << endl;
return EXIT_SUCCESS;
}
cmdline_opts_t opts;
opts.config = DEFAULT_CONFIG;
opts.object_classes_list_file = DEFAULT_OBJECT_CLASSES_LIST_FILE;
- opts.num_eves = 1;
- opts.num_dsps = 1;
+ opts.num_eves = num_eves > 0 ? 1 : 0;
+ opts.num_dsps = num_dsps > 0 ? 1 : 0;
opts.input_file = DEFAULT_INPUT;
opts.output_prob_threshold = DEFAULT_OUTPUT_PROB_THRESHOLD;
if (! ProcessArgs(argc, argv, opts))
DisplayHelp();
exit(EXIT_SUCCESS);
}
- assert(opts.num_dsps != 0 && opts.num_eves != 0);
+ assert(opts.num_dsps != 0 || opts.num_eves != 0);
if (opts.num_frames == 0)
opts.num_frames = (opts.is_camera_input || opts.is_video_input) ?
NUM_VIDEO_FRAMES :
return false;
}
c.enableApiTrace = opts.verbose;
+ if (opts.num_eves == 0 || opts.num_dsps == 0)
+ c.runFullNet = true;
+
// setup camera/video input
VideoCapture cap;
if (! SetVideoInputOutput(cap, opts, "SSD_Multibox")) return false;
// DSP will run layersGroupId 2 in the network
Executor* e_eve = CreateExecutor(DeviceType::EVE, opts.num_eves, c, 1);
Executor* e_dsp = CreateExecutor(DeviceType::DSP, opts.num_dsps, c, 2);
-
- // Construct ExecutionObjectPipeline that utilizes multiple
- // ExecutionObjects to process a single frame, each ExecutionObject
- // processes one layerGroup of the network
- //
- // Pipeline depth can enable more optimized pipeline execution:
- // Given one EVE and one DSP as an example, with different
- // pipeline_depth, we have different execution behavior:
- // If pipeline_depth is set to 1,
- // we create one EOP: eop0 (eve0, dsp0)
- // pipeline execution of multiple frames over time is as follows:
- // --------------------- time ------------------->
- // eop0: [eve0...][dsp0]
- // eop0: [eve0...][dsp0]
- // eop0: [eve0...][dsp0]
- // eop0: [eve0...][dsp0]
- // If pipeline_depth is set to 2,
- // we create two EOPs: eop0 (eve0, dsp0), eop1(eve0, dsp0)
- // pipeline execution of multiple frames over time is as follows:
- // --------------------- time ------------------->
- // eop0: [eve0...][dsp0]
- // eop1: [eve0...][dsp0]
- // eop0: [eve0...][dsp0]
- // eop1: [eve0...][dsp0]
- // Additional benefit of setting pipeline_depth to 2 is that
- // it can also overlap host ReadFrame() with device processing:
- // --------------------- time ------------------->
- // eop0: [RF][eve0...][dsp0]
- // eop1: [RF] [eve0...][dsp0]
- // eop0: [RF][eve0...][dsp0]
- // eop1: [RF][eve0...][dsp0]
vector<ExecutionObjectPipeline *> eops;
- uint32_t pipeline_depth = 2; // 2 EOs in EOP -> depth 2
- for (uint32_t j = 0; j < pipeline_depth; j++)
- for (uint32_t i = 0; i < max(opts.num_eves, opts.num_dsps); i++)
- eops.push_back(new ExecutionObjectPipeline(
+
+ if (e_eve != nullptr && e_dsp != nullptr)
+ {
+ // Construct ExecutionObjectPipeline that utilizes multiple
+ // ExecutionObjects to process a single frame, each ExecutionObject
+ // processes one layerGroup of the network
+ //
+ // Pipeline depth can enable more optimized pipeline execution:
+ // Given one EVE and one DSP as an example, with different
+ // pipeline_depth, we have different execution behavior:
+ // If pipeline_depth is set to 1,
+ // we create one EOP: eop0 (eve0, dsp0)
+ // pipeline execution of multiple frames over time is as follows:
+ // --------------------- time ------------------->
+ // eop0: [eve0...][dsp0]
+ // eop0: [eve0...][dsp0]
+ // eop0: [eve0...][dsp0]
+ // eop0: [eve0...][dsp0]
+ // If pipeline_depth is set to 2,
+ // we create two EOPs: eop0 (eve0, dsp0), eop1(eve0, dsp0)
+ // pipeline execution of multiple frames over time is as follows:
+ // --------------------- time ------------------->
+ // eop0: [eve0...][dsp0]
+ // eop1: [eve0...][dsp0]
+ // eop0: [eve0...][dsp0]
+ // eop1: [eve0...][dsp0]
+ // Additional benefit of setting pipeline_depth to 2 is that
+ // it can also overlap host ReadFrame() with device processing:
+ // --------------------- time ------------------->
+ // eop0: [RF][eve0...][dsp0]
+ // eop1: [RF] [eve0...][dsp0]
+ // eop0: [RF][eve0...][dsp0]
+ // eop1: [RF][eve0...][dsp0]
+ uint32_t pipeline_depth = 2; // 2 EOs in EOP -> depth 2
+ for (uint32_t j = 0; j < pipeline_depth; j++)
+ for (uint32_t i = 0; i < max(opts.num_eves, opts.num_dsps); i++)
+ eops.push_back(new ExecutionObjectPipeline(
{(*e_eve)[i%opts.num_eves], (*e_dsp)[i%opts.num_dsps]}));
+ }
+ else
+ {
+ // Construct ExecutionObjectPipeline that utilizes a
+ // ExecutionObject to process a single frame, each ExecutionObject
+ // processes the full network
+ //
+ // Use duplicate EOPs to do double buffering on frame input/output
+ // because each EOP has its own set of input/output buffers,
+ // so that host ReadFrame() can overlap device processing
+ // Use one EO as an example, with different buffer_factor,
+ // we have different execution behavior:
+ // If buffer_factor is set to 1 -> single buffering
+ // we create one EOP: eop0 (eo0)
+ // pipeline execution of multiple frames over time is as follows:
+ // --------------------- time ------------------->
+ // eop0: [RF][eo0.....][WF]
+ // eop0: [RF][eo0.....][WF]
+ // eop0: [RF][eo0.....][WF]
+ // If buffer_factor is set to 2 -> double buffering
+ // we create two EOPs: eop0 (eo0), eop1(eo0)
+ // pipeline execution of multiple frames over time is as follows:
+ // --------------------- time ------------------->
+ // eop0: [RF][eo0.....][WF]
+ // eop1: [RF] [eo0.....][WF]
+ // eop0: [RF] [eo0.....][WF]
+ // eop1: [RF] [eo0.....][WF]
+ uint32_t buffer_factor = 2; // set to 1 for single buffering
+ for (uint32_t j = 0; j < buffer_factor; j++)
+ {
+ for (uint32_t i = 0; i < opts.num_eves; i++)
+ eops.push_back(new ExecutionObjectPipeline({(*e_eve)[i]}));
+ for (uint32_t i = 0; i < opts.num_dsps; i++)
+ eops.push_back(new ExecutionObjectPipeline({(*e_dsp)[i]}));
+ }
+ }
uint32_t num_eops = eops.size();
// Allocate input/output memory for each EOP
// Wait for previous frame on the same eop to finish processing
if (eop->ProcessFrameWait())
- {
WriteFrameOutput(*eop, c, opts, (float)prob_slider);
- }
// Read a frame and start processing it with current eo
if (ReadFrame(*eop, frame_idx, c, opts, cap, ifs))
for (uint32_t i = 0; i < num; i++)
ids.insert(static_cast<DeviceId>(i));
- return new Executor(dt, ids, c, layers_group_id);
+ Executor* e = new Executor(dt, ids, c, layers_group_id);
+ assert(e != nullptr);
+ return e;
}
bool ReadFrame(ExecutionObjectPipeline& eop, uint32_t frame_idx,