aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorYuan Zhao2019-06-26 09:36:54 -0500
committerYuan Zhao2019-06-28 16:20:51 -0500
commit885ef17911d8f5d9f7585f55c83ae98852df0926 (patch)
tree0b3b1645215e26def08b01f0bf671d7c78f7b131
parent9835b57fe6d0e10e237b7eb4a99b5e0481ae1d01 (diff)
downloadtidl-api-885ef17911d8f5d9f7585f55c83ae98852df0926.tar.gz
tidl-api-885ef17911d8f5d9f7585f55c83ae98852df0926.tar.xz
tidl-api-885ef17911d8f5d9f7585f55c83ae98852df0926.zip
Add ssd_multibox_fullnet example
- To demonstrate running jdenet/jdetnet_voc on a single core, without partitioning the network. This is useful for situations where SoC only has C66x cores but not EVE cores. - MCT-1202
-rw-r--r--docs/source/changelog.rst4
-rw-r--r--docs/source/example.rst8
-rw-r--r--docs/source/faq/out_of_memory.rst4
-rw-r--r--examples/ssd_multibox/main.cpp127
4 files changed, 94 insertions, 49 deletions
diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
index e69b6e4..532a436 100644
--- a/docs/source/changelog.rst
+++ b/docs/source/changelog.rst
@@ -30,7 +30,7 @@ Changelog
30 Enable a two phase approach to generating execution graphs. Use the 30 Enable a two phase approach to generating execution graphs. Use the
31 following API function to enable timestamp generation: 31 following API function to enable timestamp generation:
32 32
33 .. code:: 33 .. code:: cpp
34 34
35 bool EnableTimeStamps(const std::string& file = "timestamp.log", size_t num_frames=32); 35 bool EnableTimeStamps(const std::string& file = "timestamp.log", size_t num_frames=32);
36 36
@@ -38,7 +38,7 @@ Changelog
38 38
39#. Added Python 3 bindings for TIDL API. See the ``examples/pybind`` directory for examples of using the Python bindings. Set PYTHONPATH to the location of ``tidl.so``. 39#. Added Python 3 bindings for TIDL API. See the ``examples/pybind`` directory for examples of using the Python bindings. Set PYTHONPATH to the location of ``tidl.so``.
40 40
41 .. code:: 41 .. code:: bash
42 42
43 root@am57xx-evm:~# export PYTHONPATH=/home/root/tidl-api/tidl_api 43 root@am57xx-evm:~# export PYTHONPATH=/home/root/tidl-api/tidl_api
44 root@am57xx-evm:~# python3 44 root@am57xx-evm:~# python3
diff --git a/docs/source/example.rst b/docs/source/example.rst
index ff70804..9c696ad 100644
--- a/docs/source/example.rst
+++ b/docs/source/example.rst
@@ -35,13 +35,13 @@ Examples
35 - OpenCV used to read input image from file or capture from camera. 35 - OpenCV used to read input image from file or capture from camera.
36 * - ssd_multibox 36 * - ssd_multibox
37 - Object detection 37 - Object detection
38 - EVE and C66x (network is split across both EVE and C66x) 38 - EVE and C66x (network is split across both EVE and C66x), EVE or C66x (full network on each core)
39 - OpenCV used to read input image from file or capture from camera. 39 - OpenCV used to read input image from file or capture from camera.
40 * - mnist 40 * - mnist
41 - handwritten digits recognition (MNIST). This example illustrates 41 - handwritten digits recognition (MNIST). This example illustrates
42 low TIDL API overhead (~1.8%) for small networks with low compute 42 low TIDL API overhead (~1.8%) for small networks with low compute
43 requirements (<5ms). 43 requirements (<5ms).
44 - EVE 44 - EVE or C66x
45 - Pre-processed white-on-black images read from file, with or without 45 - Pre-processed white-on-black images read from file, with or without
46 MNIST database file headers. 46 MNIST database file headers.
47 * - classification 47 * - classification
@@ -273,6 +273,10 @@ versus ExecutionObject level.
273 - 1630 ms 273 - 1630 ms
274 - 1408 ms 274 - 1408 ms
275 275
276When there is a requirement to run the SSD networks non-partitioned,
277for example, the SoC only has C66x cores but not EVE cores,
278use ``-e 0`` to run the full network only on C66x cores, without partitioning.
279
276.. _mnist-example: 280.. _mnist-example:
277 281
278MNIST 282MNIST
diff --git a/docs/source/faq/out_of_memory.rst b/docs/source/faq/out_of_memory.rst
index ee97e58..d7c8e1f 100644
--- a/docs/source/faq/out_of_memory.rst
+++ b/docs/source/faq/out_of_memory.rst
@@ -4,7 +4,7 @@ Why do I get an assertion failure from malloc_ddr?
4 4
5Application execution fails with the following error message: 5Application execution fails with the following error message:
6 6
7.. code:: shell 7.. code:: console
8 8
9 tidl: device_alloc.h:31: T* tidl::malloc_ddr(size_t) [with T = char; size_t = unsigned int]: Assertion `val != nullptr' failed 9 tidl: device_alloc.h:31: T* tidl::malloc_ddr(size_t) [with T = char; size_t = unsigned int]: Assertion `val != nullptr' failed
10 10
@@ -28,7 +28,7 @@ Insufficient OpenCL global memory
28+++++++++++++++++++++++++++++++++ 28+++++++++++++++++++++++++++++++++
29Another possible reason is that total memory requirement specified in the ``Configuration`` using NETWORK_HEAP_SIZE and PARAM_HEAP_SIZE exceeds default memory available for OpenCL. Follow the instructions below to increase the amount of CMEM (contiguous memory available for OpenCL) from 192MB (0xc000000) to 384MB (0x18000000): 29Another possible reason is that total memory requirement specified in the ``Configuration`` using NETWORK_HEAP_SIZE and PARAM_HEAP_SIZE exceeds default memory available for OpenCL. Follow the instructions below to increase the amount of CMEM (contiguous memory available for OpenCL) from 192MB (0xc000000) to 384MB (0x18000000):
30 30
31.. code:: bash 31.. code:: diff
32 32
33 $ sudo apt-get install device-tree-compiler # In case dtc is not already installed 33 $ sudo apt-get install device-tree-compiler # In case dtc is not already installed
34 $ scp root@am57:/boot/am57xx-evm-reva3.dtb . 34 $ scp root@am57:/boot/am57xx-evm-reva3.dtb .
diff --git a/examples/ssd_multibox/main.cpp b/examples/ssd_multibox/main.cpp
index 3158345..93f4c89 100644
--- a/examples/ssd_multibox/main.cpp
+++ b/examples/ssd_multibox/main.cpp
@@ -102,9 +102,9 @@ int main(int argc, char *argv[])
102 // If there are no devices capable of offloading TIDL on the SoC, exit 102 // If there are no devices capable of offloading TIDL on the SoC, exit
103 uint32_t num_eves = Executor::GetNumDevices(DeviceType::EVE); 103 uint32_t num_eves = Executor::GetNumDevices(DeviceType::EVE);
104 uint32_t num_dsps = Executor::GetNumDevices(DeviceType::DSP); 104 uint32_t num_dsps = Executor::GetNumDevices(DeviceType::DSP);
105 if (num_eves == 0 || num_dsps == 0) 105 if (num_eves == 0 && num_dsps == 0)
106 { 106 {
107 cout << "ssd_multibox requires both EVE and DSP for execution." << endl; 107 cout << "ssd_multibox requires EVE or DSP for execution." << endl;
108 return EXIT_SUCCESS; 108 return EXIT_SUCCESS;
109 } 109 }
110 110
@@ -112,8 +112,8 @@ int main(int argc, char *argv[])
112 cmdline_opts_t opts; 112 cmdline_opts_t opts;
113 opts.config = DEFAULT_CONFIG; 113 opts.config = DEFAULT_CONFIG;
114 opts.object_classes_list_file = DEFAULT_OBJECT_CLASSES_LIST_FILE; 114 opts.object_classes_list_file = DEFAULT_OBJECT_CLASSES_LIST_FILE;
115 opts.num_eves = 1; 115 opts.num_eves = num_eves > 0 ? 1 : 0;
116 opts.num_dsps = 1; 116 opts.num_dsps = num_dsps > 0 ? 1 : 0;
117 opts.input_file = DEFAULT_INPUT; 117 opts.input_file = DEFAULT_INPUT;
118 opts.output_prob_threshold = DEFAULT_OUTPUT_PROB_THRESHOLD; 118 opts.output_prob_threshold = DEFAULT_OUTPUT_PROB_THRESHOLD;
119 if (! ProcessArgs(argc, argv, opts)) 119 if (! ProcessArgs(argc, argv, opts))
@@ -121,7 +121,7 @@ int main(int argc, char *argv[])
121 DisplayHelp(); 121 DisplayHelp();
122 exit(EXIT_SUCCESS); 122 exit(EXIT_SUCCESS);
123 } 123 }
124 assert(opts.num_dsps != 0 && opts.num_eves != 0); 124 assert(opts.num_dsps != 0 || opts.num_eves != 0);
125 if (opts.num_frames == 0) 125 if (opts.num_frames == 0)
126 opts.num_frames = (opts.is_camera_input || opts.is_video_input) ? 126 opts.num_frames = (opts.is_camera_input || opts.is_video_input) ?
127 NUM_VIDEO_FRAMES : 127 NUM_VIDEO_FRAMES :
@@ -164,6 +164,9 @@ bool RunConfiguration(const cmdline_opts_t& opts)
164 return false; 164 return false;
165 } 165 }
166 c.enableApiTrace = opts.verbose; 166 c.enableApiTrace = opts.verbose;
167 if (opts.num_eves == 0 || opts.num_dsps == 0)
168 c.runFullNet = true;
169
167 // setup camera/video input 170 // setup camera/video input
168 VideoCapture cap; 171 VideoCapture cap;
169 if (! SetVideoInputOutput(cap, opts, "SSD_Multibox")) return false; 172 if (! SetVideoInputOutput(cap, opts, "SSD_Multibox")) return false;
@@ -198,43 +201,81 @@ bool RunConfiguration(const cmdline_opts_t& opts)
198 // DSP will run layersGroupId 2 in the network 201 // DSP will run layersGroupId 2 in the network
199 Executor* e_eve = CreateExecutor(DeviceType::EVE, opts.num_eves, c, 1); 202 Executor* e_eve = CreateExecutor(DeviceType::EVE, opts.num_eves, c, 1);
200 Executor* e_dsp = CreateExecutor(DeviceType::DSP, opts.num_dsps, c, 2); 203 Executor* e_dsp = CreateExecutor(DeviceType::DSP, opts.num_dsps, c, 2);
201
202 // Construct ExecutionObjectPipeline that utilizes multiple
203 // ExecutionObjects to process a single frame, each ExecutionObject
204 // processes one layerGroup of the network
205 //
206 // Pipeline depth can enable more optimized pipeline execution:
207 // Given one EVE and one DSP as an example, with different
208 // pipeline_depth, we have different execution behavior:
209 // If pipeline_depth is set to 1,
210 // we create one EOP: eop0 (eve0, dsp0)
211 // pipeline execution of multiple frames over time is as follows:
212 // --------------------- time ------------------->
213 // eop0: [eve0...][dsp0]
214 // eop0: [eve0...][dsp0]
215 // eop0: [eve0...][dsp0]
216 // eop0: [eve0...][dsp0]
217 // If pipeline_depth is set to 2,
218 // we create two EOPs: eop0 (eve0, dsp0), eop1(eve0, dsp0)
219 // pipeline execution of multiple frames over time is as follows:
220 // --------------------- time ------------------->
221 // eop0: [eve0...][dsp0]
222 // eop1: [eve0...][dsp0]
223 // eop0: [eve0...][dsp0]
224 // eop1: [eve0...][dsp0]
225 // Additional benefit of setting pipeline_depth to 2 is that
226 // it can also overlap host ReadFrame() with device processing:
227 // --------------------- time ------------------->
228 // eop0: [RF][eve0...][dsp0]
229 // eop1: [RF] [eve0...][dsp0]
230 // eop0: [RF][eve0...][dsp0]
231 // eop1: [RF][eve0...][dsp0]
232 vector<ExecutionObjectPipeline *> eops; 204 vector<ExecutionObjectPipeline *> eops;
233 uint32_t pipeline_depth = 2; // 2 EOs in EOP -> depth 2 205
234 for (uint32_t j = 0; j < pipeline_depth; j++) 206 if (e_eve != nullptr && e_dsp != nullptr)
235 for (uint32_t i = 0; i < max(opts.num_eves, opts.num_dsps); i++) 207 {
236 eops.push_back(new ExecutionObjectPipeline( 208 // Construct ExecutionObjectPipeline that utilizes multiple
209 // ExecutionObjects to process a single frame, each ExecutionObject
210 // processes one layerGroup of the network
211 //
212 // Pipeline depth can enable more optimized pipeline execution:
213 // Given one EVE and one DSP as an example, with different
214 // pipeline_depth, we have different execution behavior:
215 // If pipeline_depth is set to 1,
216 // we create one EOP: eop0 (eve0, dsp0)
217 // pipeline execution of multiple frames over time is as follows:
218 // --------------------- time ------------------->
219 // eop0: [eve0...][dsp0]
220 // eop0: [eve0...][dsp0]
221 // eop0: [eve0...][dsp0]
222 // eop0: [eve0...][dsp0]
223 // If pipeline_depth is set to 2,
224 // we create two EOPs: eop0 (eve0, dsp0), eop1(eve0, dsp0)
225 // pipeline execution of multiple frames over time is as follows:
226 // --------------------- time ------------------->
227 // eop0: [eve0...][dsp0]
228 // eop1: [eve0...][dsp0]
229 // eop0: [eve0...][dsp0]
230 // eop1: [eve0...][dsp0]
231 // Additional benefit of setting pipeline_depth to 2 is that
232 // it can also overlap host ReadFrame() with device processing:
233 // --------------------- time ------------------->
234 // eop0: [RF][eve0...][dsp0]
235 // eop1: [RF] [eve0...][dsp0]
236 // eop0: [RF][eve0...][dsp0]
237 // eop1: [RF][eve0...][dsp0]
238 uint32_t pipeline_depth = 2; // 2 EOs in EOP -> depth 2
239 for (uint32_t j = 0; j < pipeline_depth; j++)
240 for (uint32_t i = 0; i < max(opts.num_eves, opts.num_dsps); i++)
241 eops.push_back(new ExecutionObjectPipeline(
237 {(*e_eve)[i%opts.num_eves], (*e_dsp)[i%opts.num_dsps]})); 242 {(*e_eve)[i%opts.num_eves], (*e_dsp)[i%opts.num_dsps]}));
243 }
244 else
245 {
246 // Construct ExecutionObjectPipeline that utilizes a
247 // ExecutionObject to process a single frame, each ExecutionObject
248 // processes the full network
249 //
250 // Use duplicate EOPs to do double buffering on frame input/output
251 // because each EOP has its own set of input/output buffers,
252 // so that host ReadFrame() can overlap device processing
253 // Use one EO as an example, with different buffer_factor,
254 // we have different execution behavior:
255 // If buffer_factor is set to 1 -> single buffering
256 // we create one EOP: eop0 (eo0)
257 // pipeline execution of multiple frames over time is as follows:
258 // --------------------- time ------------------->
259 // eop0: [RF][eo0.....][WF]
260 // eop0: [RF][eo0.....][WF]
261 // eop0: [RF][eo0.....][WF]
262 // If buffer_factor is set to 2 -> double buffering
263 // we create two EOPs: eop0 (eo0), eop1(eo0)
264 // pipeline execution of multiple frames over time is as follows:
265 // --------------------- time ------------------->
266 // eop0: [RF][eo0.....][WF]
267 // eop1: [RF] [eo0.....][WF]
268 // eop0: [RF] [eo0.....][WF]
269 // eop1: [RF] [eo0.....][WF]
270 uint32_t buffer_factor = 2; // set to 1 for single buffering
271 for (uint32_t j = 0; j < buffer_factor; j++)
272 {
273 for (uint32_t i = 0; i < opts.num_eves; i++)
274 eops.push_back(new ExecutionObjectPipeline({(*e_eve)[i]}));
275 for (uint32_t i = 0; i < opts.num_dsps; i++)
276 eops.push_back(new ExecutionObjectPipeline({(*e_dsp)[i]}));
277 }
278 }
238 uint32_t num_eops = eops.size(); 279 uint32_t num_eops = eops.size();
239 280
240 // Allocate input/output memory for each EOP 281 // Allocate input/output memory for each EOP
@@ -252,9 +293,7 @@ bool RunConfiguration(const cmdline_opts_t& opts)
252 293
253 // Wait for previous frame on the same eop to finish processing 294 // Wait for previous frame on the same eop to finish processing
254 if (eop->ProcessFrameWait()) 295 if (eop->ProcessFrameWait())
255 {
256 WriteFrameOutput(*eop, c, opts, (float)prob_slider); 296 WriteFrameOutput(*eop, c, opts, (float)prob_slider);
257 }
258 297
259 // Read a frame and start processing it with current eo 298 // Read a frame and start processing it with current eo
260 if (ReadFrame(*eop, frame_idx, c, opts, cap, ifs)) 299 if (ReadFrame(*eop, frame_idx, c, opts, cap, ifs))
@@ -291,7 +330,9 @@ Executor* CreateExecutor(DeviceType dt, uint32_t num, const Configuration& c,
291 for (uint32_t i = 0; i < num; i++) 330 for (uint32_t i = 0; i < num; i++)
292 ids.insert(static_cast<DeviceId>(i)); 331 ids.insert(static_cast<DeviceId>(i));
293 332
294 return new Executor(dt, ids, c, layers_group_id); 333 Executor* e = new Executor(dt, ids, c, layers_group_id);
334 assert(e != nullptr);
335 return e;
295} 336}
296 337
297bool ReadFrame(ExecutionObjectPipeline& eop, uint32_t frame_idx, 338bool ReadFrame(ExecutionObjectPipeline& eop, uint32_t frame_idx,