f569c0d148a9bf5ac345f5c86758170c2093e1a5
1 /******************************************************************************
2 * Copyright (c) 2018, Texas Instruments Incorporated - http://www.ti.com/
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Texas Instruments Incorporated nor the
13 * names of its contributors may be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
26 * THE POSSIBILITY OF SUCH DAMAGE.
27 *****************************************************************************/
#include <signal.h>
#include <time.h>
#include <unistd.h>

#include <algorithm>
#include <cassert>
#include <chrono>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <functional>
#include <iomanip>
#include <iostream>
#include <queue>
#include <string>
#include <vector>

#include "executor.h"
#include "execution_object.h"
#include "execution_object_pipeline.h"
#include "configuration.h"
#include "../common/utils.h"
#include "../common/video_utils.h"
51 using namespace std;
52 using namespace tidl;
53 using namespace cv;
56 #define NUM_VIDEO_FRAMES 100
57 #define DEFAULT_CONFIG "../test/testvecs/config/infer/tidl_config_j11_v2.txt"
59 bool RunConfiguration(const cmdline_opts_t& opts);
60 Executor* CreateExecutor(DeviceType dt, uint32_t num, const Configuration& c,
61 int layers_group_id);
62 bool CreateExecutionObjectPipelines(uint32_t num_eves, uint32_t num_dsps,
63 Configuration& configuration,
64 uint32_t num_layers_groups,
65 Executor*& e_eve, Executor*& e_dsp,
66 std::vector<ExecutionObjectPipeline*>& eops);
68 void AllocateMemory(const std::vector<ExecutionObjectPipeline*>& eops);
70 bool ReadFrame(ExecutionObjectPipeline& eop, uint32_t frame_idx,
71 const Configuration& c, const cmdline_opts_t& opts, char *input_frames_buffer);
72 static void DisplayHelp();
75 int main(int argc, char *argv[])
76 {
77 // Catch ctrl-c to ensure a clean exit
78 signal(SIGABRT, exit);
79 signal(SIGTERM, exit);
81 // If there are no devices capable of offloading TIDL on the SoC, exit
82 uint32_t num_eves = Executor::GetNumDevices(DeviceType::EVE);
83 uint32_t num_dsps = Executor::GetNumDevices(DeviceType::DSP);
84 if ((num_eves == 0) || (num_dsps == 0))
85 {
86 cout << "mcbench requires EVE and/or DSP for execution." << endl;
87 return EXIT_SUCCESS;
88 }
90 cout << "CMDLINE: ";
91 for(int i = 0; i < argc; ++i) cout << argv[i] << " ";
92 cout << endl;
94 // Process arguments
95 cmdline_opts_t opts;
96 opts.config = DEFAULT_CONFIG;
97 opts.num_eves = 0;
98 opts.num_dsps = 2;
99 if (! ProcessArgs(argc, argv, opts))
100 {
101 DisplayHelp();
102 exit(EXIT_SUCCESS);
103 }
104 assert((opts.num_dsps + opts.num_eves) != 0);
106 if (opts.num_frames == 0)
107 opts.num_frames = NUM_VIDEO_FRAMES;
109 // Run network
110 bool status = RunConfiguration(opts);
112 if (!status)
113 {
114 cout << "mcbench FAILED" << endl;
115 return EXIT_FAILURE;
116 }
118 cout << "mcbench PASSED" << endl;
119 return EXIT_SUCCESS;
120 }
122 bool RunConfiguration(const cmdline_opts_t& opts)
123 {
124 // Read the TI DL configuration file
125 Configuration c;
126 if (!c.ReadFromFile(opts.config))
127 return false;
129 c.enableApiTrace = opts.verbose;
130 if(opts.num_layers_groups == 1)
131 c.runFullNet = true; //Force all layers to be in the same group
133 std::string inputFile;
134 if (opts.input_file.empty())
135 inputFile = c.inData;
136 else
137 inputFile = opts.input_file;
139 int frame_size = c.inNumChannels * c.inWidth * c.inHeight;
141 c.numFrames = GetBinaryFileSize (inputFile) / frame_size;
143 cout << "Input: " << inputFile << " frames:" << c.numFrames << endl;
145 // Read input file into memory buffer
146 char *input_frame_buffer = new char[c.numFrames * frame_size]();
147 ifstream ifs(inputFile, ios::binary);
148 ifs.read(input_frame_buffer, c.numFrames * frame_size);
149 if(!ifs.good()) {
150 std::cout << "Invalid File input:" << inputFile << std::endl;
151 return false;
152 }
154 bool status = true;
155 try
156 {
157 Executor *e_eve = NULL;
158 Executor *e_dsp = NULL;
159 std::vector<ExecutionObjectPipeline *> eops;
160 if (! CreateExecutionObjectPipelines(opts.num_eves, opts.num_dsps, c,
161 opts.num_layers_groups,
162 e_eve, e_dsp, eops))
163 return false;
165 // Allocate input/output memory for each EOP
166 AllocateMemory(eops);
168 chrono::time_point<chrono::steady_clock> tloop0, tloop1;
169 tloop0 = chrono::steady_clock::now();
171 // Process frames with available eops in a pipelined manner
172 // additional num_eops iterations to flush pipeline (epilogue)
173 uint32_t num_eops = eops.size();
174 for (uint32_t frame_idx = 0;
175 frame_idx < opts.num_frames + num_eops; frame_idx++)
176 {
177 ExecutionObjectPipeline* eop = eops[frame_idx % num_eops];
179 // Wait for previous frame on the same eop to finish processing
180 if (eop->ProcessFrameWait())
181 ;
183 // Read a frame and start processing it with current eo
184 if (ReadFrame(*eop, frame_idx, c, opts, input_frame_buffer))
185 eop->ProcessFrameStartAsync();
186 }
188 tloop1 = chrono::steady_clock::now();
189 chrono::duration<float> elapsed = tloop1 - tloop0;
190 cout << "Loop total time: "
191 << setw(6) << setprecision(4)
192 << (elapsed.count() * 1000) << "ms" << endl;
193 cout << "FPS:" << opts.num_frames / elapsed.count() << endl;
195 FreeMemory(eops);
197 for (auto eop : eops)
198 delete eop;
200 delete e_eve;
201 delete e_dsp;
202 }
203 catch (tidl::Exception &e)
204 {
205 cerr << e.what() << endl;
206 status = false;
207 }
209 delete [] input_frame_buffer;
210 return status;
211 }
213 // Create an Executor with the specified type and number of EOs
214 Executor* CreateExecutor(DeviceType dt, uint32_t num, const Configuration& c,
215 int layers_group_id)
216 {
217 if (num == 0) return nullptr;
219 DeviceIds ids;
220 for (uint32_t i = 0; i < num; i++)
221 ids.insert(static_cast<DeviceId>(i));
223 return new Executor(dt, ids, c, layers_group_id);
224 }
226 bool CreateExecutionObjectPipelines(uint32_t num_eves, uint32_t num_dsps,
227 Configuration& configuration,
228 uint32_t num_layers_groups,
229 Executor*& e_eve, Executor*& e_dsp,
230 std::vector<ExecutionObjectPipeline*>& eops)
231 {
232 DeviceIds ids_eve, ids_dsp;
233 for (uint32_t i = 0; i < num_eves; i++)
234 ids_eve.insert(static_cast<DeviceId>(i));
235 for (uint32_t i = 0; i < num_dsps; i++)
236 ids_dsp.insert(static_cast<DeviceId>(i));
238 // Construct ExecutionObjectPipeline that utilizes multiple
239 // ExecutionObjects to process a single frame, each ExecutionObject
240 // processes one layerGroup of the network
241 //
242 // Pipeline depth can enable more optimized pipeline execution:
243 // Given one EVE and one DSP as an example, with different
244 // buffer_factor, we have different execution behavior:
245 // If buffer_factor is set to 1,
246 // we create one EOP: eop0 (eve0, dsp0)
247 // pipeline execution of multiple frames over time is as follows:
248 // --------------------- time ------------------->
249 // eop0: [eve0...][dsp0]
250 // eop0: [eve0...][dsp0]
251 // eop0: [eve0...][dsp0]
252 // eop0: [eve0...][dsp0]
253 // If buffer_factor is set to 2,
254 // we create two EOPs: eop0 (eve0, dsp0), eop1(eve0, dsp0)
255 // pipeline execution of multiple frames over time is as follows:
256 // --------------------- time ------------------->
257 // eop0: [eve0...][dsp0]
258 // eop1: [eve0...][dsp0]
259 // eop0: [eve0...][dsp0]
260 // eop1: [eve0...][dsp0]
261 // Additional benefit of setting buffer_factor to 2 is that
262 // it can also overlap host ReadFrame() with device processing:
263 // --------------------- time ------------------->
264 // eop0: [RF][eve0...][dsp0]
265 // eop1: [RF] [eve0...][dsp0]
266 // eop0: [RF][eve0...][dsp0]
267 // eop1: [RF][eve0...][dsp0]
268 const uint32_t buffer_factor = 2;
270 switch(num_layers_groups)
271 {
272 case 1: // Single layers group
273 e_eve = num_eves == 0 ? nullptr :
274 new Executor(DeviceType::EVE, ids_eve, configuration);
275 e_dsp = num_dsps == 0 ? nullptr :
276 new Executor(DeviceType::DSP, ids_dsp, configuration);
278 // Construct ExecutionObjectPipeline with single Execution Object to
279 // process each frame. This is parallel processing of frames with
280 // as many DSP and EVE cores that we have on hand.
281 // If buffer_factor == 2, duplicating EOPs for double buffering
282 // and overlapping host pre/post-processing with device processing
283 for (uint32_t j = 0; j < buffer_factor; j++)
284 {
285 for (uint32_t i = 0; i < num_eves; i++)
286 eops.push_back(new ExecutionObjectPipeline({(*e_eve)[i]}));
287 for (uint32_t i = 0; i < num_dsps; i++)
288 eops.push_back(new ExecutionObjectPipeline({(*e_dsp)[i]}));
289 }
290 break;
292 case 2: // Two layers group
293 // Create Executors with the approriate core type, number of cores
294 // and configuration specified
295 // EVE will run layersGroupId 1 in the network, while
296 // DSP will run layersGroupId 2 in the network
297 e_eve = num_eves == 0 ? nullptr :
298 new Executor(DeviceType::EVE, ids_eve, configuration, 1);
299 e_dsp = num_dsps == 0 ? nullptr :
300 new Executor(DeviceType::DSP, ids_dsp, configuration, 2);
302 // Construct ExecutionObjectPipeline that utilizes multiple
303 // ExecutionObjects to process a single frame, each ExecutionObject
304 // processes one layerGroup of the network
305 // If buffer_factor == 2, duplicating EOPs for pipelining at
306 // EO level rather than at EOP level, in addition to double buffering
307 // and overlapping host pre/post-processing with device processing
308 for (uint32_t j = 0; j < buffer_factor; j++)
309 for (uint32_t i = 0; i < std::max(num_eves, num_dsps); i++)
310 eops.push_back(new ExecutionObjectPipeline(
311 {(*e_eve)[i%num_eves], (*e_dsp)[i%num_dsps]}));
312 break;
314 default:
315 std::cout << "Layers groups must be either 1 or 2!" << std::endl;
316 return false;
317 break;
318 }
320 return true;
321 }
// Subtract a per-channel mean from every pixel of one image channel,
// in place, saturating the result to the signed 8-bit range [-128, 127]
// before storing it back into the byte buffer.
static void subtractMeanValue(unsigned char *frame_buffer, int channel_size,
                              int32_t mean_value)
{
    for (int idx = 0; idx < channel_size; idx++)
    {
        int32_t shifted = static_cast<int32_t>(frame_buffer[idx]) - mean_value;
        frame_buffer[idx] =
            static_cast<unsigned char>(std::clamp(shifted, -128, 127));
    }
}
338 bool ReadFrame(ExecutionObjectPipeline& eop, uint32_t frame_idx,
339 const Configuration& c, const cmdline_opts_t& opts,
340 char *input_frames_buffer)
341 {
342 if (frame_idx >= opts.num_frames)
343 return false;
345 eop.SetFrameIndex(frame_idx);
347 unsigned char* frame_buffer = (unsigned char *)eop.GetInputBufferPtr();
348 assert (frame_buffer != nullptr);
349 //Current implementation of this function assumes 3 channels on input
350 assert (c.inNumChannels == 3);
352 int channel_size = c.inWidth * c.inHeight;
353 char *bgr_frames_input = input_frames_buffer + (frame_idx % c.numFrames) *
354 channel_size * c.inNumChannels;
356 memcpy(frame_buffer, bgr_frames_input + 0, channel_size);
357 if(c.preProcType == 1)
358 subtractMeanValue(frame_buffer, channel_size, 104);
359 else if(c.preProcType == 2)
360 subtractMeanValue(frame_buffer, channel_size, 128);
361 frame_buffer += channel_size;
363 memcpy(frame_buffer, bgr_frames_input + 1 * channel_size, channel_size);
364 if(c.preProcType == 1)
365 subtractMeanValue(frame_buffer, channel_size, 117);
366 else if(c.preProcType == 2)
367 subtractMeanValue(frame_buffer, channel_size, 128);
368 frame_buffer += channel_size;
370 memcpy(frame_buffer, bgr_frames_input + 2 * channel_size, channel_size);
371 if(c.preProcType == 1)
372 subtractMeanValue(frame_buffer, channel_size, 123);
373 else if(c.preProcType == 2)
374 subtractMeanValue(frame_buffer, channel_size, 128);
376 return true;
377 }
// Print command-line usage for mcbench to stdout.
void DisplayHelp()
{
    static const char usage[] =
        "Usage: mcbench\n"
        "  Runs partitioned network to perform multi-object detection\n"
        "  and classification. First part of network (layersGroupId 1) runs on\n"
        "  EVE, second part (layersGroupId 2) runs on DSP.\n"
        "  Use -c to run a different segmentation network. Default is jdetnet.\n"
        "Optional arguments:\n"
        " -c <config>          Valid configs: ../test/testvecs/config/infer/... \n"
        " -d <number>          Number of DSP cores to use\n"
        " -e <number>          Number of EVE cores to use\n"
        " -g <1|2>             Number of layer groups\n"
        " -f <number>          Number of frames to process\n"
        " -i <image>           Path to the input image file\n"
        " -v                   Verbose output during execution\n"
        " -h                   Help\n";

    std::cout << usage;
}