aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorYuan Zhao2019-06-26 09:36:54 -0500
committerYuan Zhao2019-06-28 16:20:51 -0500
commit885ef17911d8f5d9f7585f55c83ae98852df0926 (patch)
tree0b3b1645215e26def08b01f0bf671d7c78f7b131
parent9835b57fe6d0e10e237b7eb4a99b5e0481ae1d01 (diff)
downloadtidl-api-885ef17911d8f5d9f7585f55c83ae98852df0926.tar.gz
tidl-api-885ef17911d8f5d9f7585f55c83ae98852df0926.tar.xz
tidl-api-885ef17911d8f5d9f7585f55c83ae98852df0926.zip
Add ssd_multibox_fullnet example
- To demonstrate running jdenet/jdetnet_voc on a single core, without partitioning the network. This is useful for situations where SoC only has C66x cores but not EVE cores. - MCT-1202
-rw-r--r--docs/source/changelog.rst4
-rw-r--r--docs/source/example.rst8
-rw-r--r--docs/source/faq/out_of_memory.rst4
-rw-r--r--examples/ssd_multibox/main.cpp127
4 files changed, 94 insertions, 49 deletions
diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
index e69b6e4..532a436 100644
--- a/docs/source/changelog.rst
+++ b/docs/source/changelog.rst
@@ -30,7 +30,7 @@ Changelog
30 Enable a two phase approach to generating execution graphs. Use the 30 Enable a two phase approach to generating execution graphs. Use the
31 following API function to enable timestamp generation: 31 following API function to enable timestamp generation:
32 32
33 .. code:: 33 .. code:: cpp
34 34
35 bool EnableTimeStamps(const std::string& file = "timestamp.log", size_t num_frames=32); 35 bool EnableTimeStamps(const std::string& file = "timestamp.log", size_t num_frames=32);
36 36
@@ -38,7 +38,7 @@ Changelog
38 38
39#. Added Python 3 bindings for TIDL API. See the ``examples/pybind`` directory for examples of using the Python bindings. Set PYTHONPATH to the location of ``tidl.so``. 39#. Added Python 3 bindings for TIDL API. See the ``examples/pybind`` directory for examples of using the Python bindings. Set PYTHONPATH to the location of ``tidl.so``.
40 40
41 .. code:: 41 .. code:: bash
42 42
43 root@am57xx-evm:~# export PYTHONPATH=/home/root/tidl-api/tidl_api 43 root@am57xx-evm:~# export PYTHONPATH=/home/root/tidl-api/tidl_api
44 root@am57xx-evm:~# python3 44 root@am57xx-evm:~# python3
diff --git a/docs/source/example.rst b/docs/source/example.rst
index ff70804..9c696ad 100644
--- a/docs/source/example.rst
+++ b/docs/source/example.rst
@@ -35,13 +35,13 @@ Examples
35 - OpenCV used to read input image from file or capture from camera. 35 - OpenCV used to read input image from file or capture from camera.
36 * - ssd_multibox 36 * - ssd_multibox
37 - Object detection 37 - Object detection
38 - EVE and C66x (network is split across both EVE and C66x) 38 - EVE and C66x (network is split across both EVE and C66x), EVE or C66x (full network on each core)
39 - OpenCV used to read input image from file or capture from camera. 39 - OpenCV used to read input image from file or capture from camera.
40 * - mnist 40 * - mnist
41 - handwritten digits recognition (MNIST). This example illustrates 41 - handwritten digits recognition (MNIST). This example illustrates
42 low TIDL API overhead (~1.8%) for small networks with low compute 42 low TIDL API overhead (~1.8%) for small networks with low compute
43 requirements (<5ms). 43 requirements (<5ms).
44 - EVE 44 - EVE or C66x
45 - Pre-processed white-on-black images read from file, with or without 45 - Pre-processed white-on-black images read from file, with or without
46 MNIST database file headers. 46 MNIST database file headers.
47 * - classification 47 * - classification
@@ -273,6 +273,10 @@ versus ExecutionObject level.
273 - 1630 ms 273 - 1630 ms
274 - 1408 ms 274 - 1408 ms
275 275
276When there is a requirement to run the SSD networks non-partitioned,
277for example, the SoC only has C66x cores but not EVE cores,
278use ``-e 0`` to run the full network only on C66x cores, without partitioning.
279
276.. _mnist-example: 280.. _mnist-example:
277 281
278MNIST 282MNIST
diff --git a/docs/source/faq/out_of_memory.rst b/docs/source/faq/out_of_memory.rst
index ee97e58..d7c8e1f 100644
--- a/docs/source/faq/out_of_memory.rst
+++ b/docs/source/faq/out_of_memory.rst
@@ -4,7 +4,7 @@ Why do I get an assertion failure from malloc_ddr?
4 4
5Application execution fails with the following error message: 5Application execution fails with the following error message:
6 6
7.. code:: shell 7.. code:: console
8 8
9 tidl: device_alloc.h:31: T* tidl::malloc_ddr(size_t) [with T = char; size_t = unsigned int]: Assertion `val != nullptr' failed 9 tidl: device_alloc.h:31: T* tidl::malloc_ddr(size_t) [with T = char; size_t = unsigned int]: Assertion `val != nullptr' failed
10 10
@@ -28,7 +28,7 @@ Insufficient OpenCL global memory
28+++++++++++++++++++++++++++++++++ 28+++++++++++++++++++++++++++++++++
29Another possible reason is that total memory requirement specified in the ``Configuration`` using NETWORK_HEAP_SIZE and PARAM_HEAP_SIZE exceeds default memory available for OpenCL. Follow the instructions below to increase the amount of CMEM (contiguous memory available for OpenCL) from 192MB (0xc000000) to 384MB (0x18000000): 29Another possible reason is that total memory requirement specified in the ``Configuration`` using NETWORK_HEAP_SIZE and PARAM_HEAP_SIZE exceeds default memory available for OpenCL. Follow the instructions below to increase the amount of CMEM (contiguous memory available for OpenCL) from 192MB (0xc000000) to 384MB (0x18000000):
30 30
31.. code:: bash 31.. code:: diff
32 32
33 $ sudo apt-get install device-tree-compiler # In case dtc is not already installed 33 $ sudo apt-get install device-tree-compiler # In case dtc is not already installed
34 $ scp root@am57:/boot/am57xx-evm-reva3.dtb . 34 $ scp root@am57:/boot/am57xx-evm-reva3.dtb .
diff --git a/examples/ssd_multibox/main.cpp b/examples/ssd_multibox/main.cpp
index 3158345..93f4c89 100644
--- a/examples/ssd_multibox/main.cpp
+++ b/examples/ssd_multibox/main.cpp
@@ -102,9 +102,9 @@ int main(int argc, char *argv[])
102 // If there are no devices capable of offloading TIDL on the SoC, exit 102 // If there are no devices capable of offloading TIDL on the SoC, exit
103 uint32_t num_eves = Executor::GetNumDevices(DeviceType::EVE); 103 uint32_t num_eves = Executor::GetNumDevices(DeviceType::EVE);
104 uint32_t num_dsps = Executor::GetNumDevices(DeviceType::DSP); 104 uint32_t num_dsps = Executor::GetNumDevices(DeviceType::DSP);
105 if (num_eves == 0 || num_dsps == 0) 105 if (num_eves == 0 && num_dsps == 0)
106 { 106 {
107 cout << "ssd_multibox requires both EVE and DSP for execution." << endl; 107 cout << "ssd_multibox requires EVE or DSP for execution." << endl;
108 return EXIT_SUCCESS; 108 return EXIT_SUCCESS;
109 } 109 }
110 110
@@ -112,8 +112,8 @@ int main(int argc, char *argv[])
112 cmdline_opts_t opts; 112 cmdline_opts_t opts;
113 opts.config = DEFAULT_CONFIG; 113 opts.config = DEFAULT_CONFIG;
114 opts.object_classes_list_file = DEFAULT_OBJECT_CLASSES_LIST_FILE; 114 opts.object_classes_list_file = DEFAULT_OBJECT_CLASSES_LIST_FILE;
115 opts.num_eves = 1; 115 opts.num_eves = num_eves > 0 ? 1 : 0;
116 opts.num_dsps = 1; 116 opts.num_dsps = num_dsps > 0 ? 1 : 0;
117 opts.input_file = DEFAULT_INPUT; 117 opts.input_file = DEFAULT_INPUT;
118 opts.output_prob_threshold = DEFAULT_OUTPUT_PROB_THRESHOLD; 118 opts.output_prob_threshold = DEFAULT_OUTPUT_PROB_THRESHOLD;
119 if (! ProcessArgs(argc, argv, opts)) 119 if (! ProcessArgs(argc, argv, opts))
@@ -121,7 +121,7 @@ int main(int argc, char *argv[])
121 DisplayHelp(); 121 DisplayHelp();
122 exit(EXIT_SUCCESS); 122 exit(EXIT_SUCCESS);
123 } 123 }
124 assert(opts.num_dsps != 0 && opts.num_eves != 0); 124 assert(opts.num_dsps != 0 || opts.num_eves != 0);
125 if (opts.num_frames == 0) 125 if (opts.num_frames == 0)
126 opts.num_frames = (opts.is_camera_input || opts.is_video_input) ? 126 opts.num_frames = (opts.is_camera_input || opts.is_video_input) ?
127 NUM_VIDEO_FRAMES : 127 NUM_VIDEO_FRAMES :
@@ -164,6 +164,9 @@ bool RunConfiguration(const cmdline_opts_t& opts)
164 return false; 164 return false;
165 } 165 }
166 c.enableApiTrace = opts.verbose; 166 c.enableApiTrace = opts.verbose;
167 if (opts.num_eves == 0 || opts.num_dsps == 0)
168 c.runFullNet = true;
169
167 // setup camera/video input 170 // setup camera/video input
168 VideoCapture cap; 171 VideoCapture cap;
169 if (! SetVideoInputOutput(cap, opts, "SSD_Multibox")) return false; 172 if (! SetVideoInputOutput(cap, opts, "SSD_Multibox")) return false;
@@ -198,43 +201,81 @@ bool RunConfiguration(const cmdline_opts_t& opts)
198 // DSP will run layersGroupId 2 in the network 201 // DSP will run layersGroupId 2 in the network
199 Executor* e_eve = CreateExecutor(DeviceType::EVE, opts.num_eves, c, 1); 202 Executor* e_eve = CreateExecutor(DeviceType::EVE, opts.num_eves, c, 1);
200 Executor* e_dsp = CreateExecutor(DeviceType::DSP, opts.num_dsps, c, 2); 203 Executor* e_dsp = CreateExecutor(DeviceType::DSP, opts.num_dsps, c, 2);
201
202 // Construct ExecutionObjectPipeline that utilizes multiple
203 // ExecutionObjects to process a single frame, each ExecutionObject
204 // processes one layerGroup of the network
205 //
206 // Pipeline depth can enable more optimized pipeline execution:
207 // Given one EVE and one DSP as an example, with different
208 // pipeline_depth, we have different execution behavior:
209 // If pipeline_depth is set to 1,
210 // we create one EOP: eop0 (eve0, dsp0)
211 // pipeline execution of multiple frames over time is as follows:
212 // --------------------- time ------------------->
213 // eop0: [eve0...][dsp0]
214 // eop0: [eve0...][dsp0]
215 // eop0: [eve0...][dsp0]
216 // eop0: [eve0...][dsp0]
217 // If pipeline_depth is set to 2,
218 // we create two EOPs: eop0 (eve0, dsp0), eop1(eve0, dsp0)
219 // pipeline execution of multiple frames over time is as follows:
220 // --------------------- time ------------------->
221 // eop0: [eve0...][dsp0]
222 // eop1: [eve0...][dsp0]
223 // eop0: [eve0...][dsp0]
224 // eop1: [eve0...][dsp0]
225 // Additional benefit of setting pipeline_depth to 2 is that
226 // it can also overlap host ReadFrame() with device processing:
227 // --------------------- time ------------------->
228 // eop0: [RF][eve0...][dsp0]
229 // eop1: [RF] [eve0...][dsp0]
230 // eop0: [RF][eve0...][dsp0]
231 // eop1: [RF][eve0...][dsp0]
232 vector<ExecutionObjectPipeline *> eops; 204 vector<ExecutionObjectPipeline *> eops;
233 uint32_t pipeline_depth = 2; // 2 EOs in EOP -> depth 2 205
234 for (uint32_t j = 0; j < pipeline_depth; j++) 206 if (e_eve != nullptr && e_dsp != nullptr)
235 for (uint32_t i = 0; i < max(opts.num_eves, opts.num_dsps); i++) 207 {
236 eops.push_back(new ExecutionObjectPipeline( 208 // Construct ExecutionObjectPipeline that utilizes multiple
209 // ExecutionObjects to process a single frame, each ExecutionObject
210 // processes one layerGroup of the network
211 //
212 // Pipeline depth can enable more optimized pipeline execution:
213 // Given one EVE and one DSP as an example, with different
214 // pipeline_depth, we have different execution behavior:
215 // If pipeline_depth is set to 1,
216 // we create one EOP: eop0 (eve0, dsp0)
217 // pipeline execution of multiple frames over time is as follows:
218 // --------------------- time ------------------->
219 // eop0: [eve0...][dsp0]
220 // eop0: [eve0...][dsp0]
221 // eop0: [eve0...][dsp0]
222 // eop0: [eve0...][dsp0]
223 // If pipeline_depth is set to 2,
224 // we create two EOPs: eop0 (eve0, dsp0), eop1(eve0, dsp0)
225 // pipeline execution of multiple frames over time is as follows:
226 // --------------------- time ------------------->
227 // eop0: [eve0...][dsp0]
228 // eop1: [eve0...][dsp0]
229 // eop0: [eve0...][dsp0]
230 // eop1: [eve0...][dsp0]
231 // Additional benefit of setting pipeline_depth to 2 is that
232 // it can also overlap host ReadFrame() with device processing:
233 // --------------------- time ------------------->
234 // eop0: [RF][eve0...][dsp0]
235 // eop1: [RF] [eve0...][dsp0]
236 // eop0: [RF][eve0...][dsp0]
237 // eop1: [RF][eve0...][dsp0]
238 uint32_t pipeline_depth = 2; // 2 EOs in EOP -> depth 2
239 for (uint32_t j = 0; j < pipeline_depth; j++)
240 for (uint32_t i = 0; i < max(opts.num_eves, opts.num_dsps); i++)
241 eops.push_back(new ExecutionObjectPipeline(
237 {(*e_eve)[i%opts.num_eves], (*e_dsp)[i%opts.num_dsps]})); 242 {(*e_eve)[i%opts.num_eves], (*e_dsp)[i%opts.num_dsps]}));
243 }
244 else
245 {
246 // Construct ExecutionObjectPipeline that utilizes a
247 // ExecutionObject to process a single frame, each ExecutionObject
248 // processes the full network
249 //
250 // Use duplicate EOPs to do double buffering on frame input/output
251 // because each EOP has its own set of input/output buffers,
252 // so that host ReadFrame() can overlap device processing
253 // Use one EO as an example, with different buffer_factor,
254 // we have different execution behavior:
255 // If buffer_factor is set to 1 -> single buffering
256 // we create one EOP: eop0 (eo0)
257 // pipeline execution of multiple frames over time is as follows:
258 // --------------------- time ------------------->
259 // eop0: [RF][eo0.....][WF]
260 // eop0: [RF][eo0.....][WF]
261 // eop0: [RF][eo0.....][WF]
262 // If buffer_factor is set to 2 -> double buffering
263 // we create two EOPs: eop0 (eo0), eop1(eo0)
264 // pipeline execution of multiple frames over time is as follows:
265 // --------------------- time ------------------->
266 // eop0: [RF][eo0.....][WF]
267 // eop1: [RF] [eo0.....][WF]
268 // eop0: [RF] [eo0.....][WF]
269 // eop1: [RF] [eo0.....][WF]
270 uint32_t buffer_factor = 2; // set to 1 for single buffering
271 for (uint32_t j = 0; j < buffer_factor; j++)
272 {
273 for (uint32_t i = 0; i < opts.num_eves; i++)
274 eops.push_back(new ExecutionObjectPipeline({(*e_eve)[i]}));
275 for (uint32_t i = 0; i < opts.num_dsps; i++)
276 eops.push_back(new ExecutionObjectPipeline({(*e_dsp)[i]}));
277 }
278 }
238 uint32_t num_eops = eops.size(); 279 uint32_t num_eops = eops.size();
239 280
240 // Allocate input/output memory for each EOP 281 // Allocate input/output memory for each EOP
@@ -252,9 +293,7 @@ bool RunConfiguration(const cmdline_opts_t& opts)
252 293
253 // Wait for previous frame on the same eop to finish processing 294 // Wait for previous frame on the same eop to finish processing
254 if (eop->ProcessFrameWait()) 295 if (eop->ProcessFrameWait())
255 {
256 WriteFrameOutput(*eop, c, opts, (float)prob_slider); 296 WriteFrameOutput(*eop, c, opts, (float)prob_slider);
257 }
258 297
259 // Read a frame and start processing it with current eo 298 // Read a frame and start processing it with current eo
260 if (ReadFrame(*eop, frame_idx, c, opts, cap, ifs)) 299 if (ReadFrame(*eop, frame_idx, c, opts, cap, ifs))
@@ -291,7 +330,9 @@ Executor* CreateExecutor(DeviceType dt, uint32_t num, const Configuration& c,
291 for (uint32_t i = 0; i < num; i++) 330 for (uint32_t i = 0; i < num; i++)
292 ids.insert(static_cast<DeviceId>(i)); 331 ids.insert(static_cast<DeviceId>(i));
293 332
294 return new Executor(dt, ids, c, layers_group_id); 333 Executor* e = new Executor(dt, ids, c, layers_group_id);
334 assert(e != nullptr);
335 return e;
295} 336}
296 337
297bool ReadFrame(ExecutionObjectPipeline& eop, uint32_t frame_idx, 338bool ReadFrame(ExecutionObjectPipeline& eop, uint32_t frame_idx,