128bc57695504963548a4451992258ab63080bd0
1 /******************************************************************************
2 * Copyright (c) 2018, Texas Instruments Incorporated - http://www.ti.com/
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Texas Instruments Incorporated nor the
13 * names of its contributors may be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
26 * THE POSSIBILITY OF SUCH DAMAGE.
27 *****************************************************************************/
#include <signal.h>
#include <time.h>
#include <unistd.h>

#include <algorithm>
#include <cassert>
#include <chrono>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <functional>
#include <iomanip>
#include <iostream>
#include <queue>
#include <string>
#include <vector>

#include "executor.h"
#include "execution_object.h"
#include "execution_object_pipeline.h"
#include "configuration.h"
#include "../segmentation/object_classes.h"
#include "../common/utils.h"
#include "../common/video_utils.h"
52 using namespace std;
53 using namespace tidl;
54 using namespace cv;
57 #define NUM_VIDEO_FRAMES 100
58 #define DEFAULT_CONFIG "../test/testvecs/config/infer/tidl_config_j11_v2.txt"
60 bool RunConfiguration(const cmdline_opts_t& opts);
61 Executor* CreateExecutor(DeviceType dt, uint32_t num, const Configuration& c,
62 int layers_group_id);
63 bool CreateExecutionObjectPipelines(uint32_t num_eves, uint32_t num_dsps,
64 Configuration& configuration,
65 uint32_t num_layers_groups,
66 Executor*& e_eve, Executor*& e_dsp,
67 std::vector<ExecutionObjectPipeline*>& eops);
69 void AllocateMemory(const std::vector<ExecutionObjectPipeline*>& eops);
71 bool ReadFrame(ExecutionObjectPipeline& eop, uint32_t frame_idx,
72 const Configuration& c, const cmdline_opts_t& opts, char *input_frames_buffer);
73 static void DisplayHelp();
76 int main(int argc, char *argv[])
77 {
78 // Catch ctrl-c to ensure a clean exit
79 signal(SIGABRT, exit);
80 signal(SIGTERM, exit);
82 // If there are no devices capable of offloading TIDL on the SoC, exit
83 uint32_t num_eves = Executor::GetNumDevices(DeviceType::EVE);
84 uint32_t num_dsps = Executor::GetNumDevices(DeviceType::DSP);
85 if ((num_eves == 0) || (num_dsps == 0))
86 {
87 cout << "mcbench requires EVE and/or DSP for execution." << endl;
88 return EXIT_SUCCESS;
89 }
91 cout << "CMDLINE: ";
92 for(int i = 0; i < argc; ++i) cout << argv[i] << " ";
93 cout << endl;
95 // Process arguments
96 cmdline_opts_t opts;
97 opts.config = DEFAULT_CONFIG;
98 opts.num_eves = 0;
99 opts.num_dsps = 2;
100 if (! ProcessArgs(argc, argv, opts))
101 {
102 DisplayHelp();
103 exit(EXIT_SUCCESS);
104 }
105 assert((opts.num_dsps + opts.num_eves) != 0);
107 if (opts.num_frames == 0)
108 opts.num_frames = NUM_VIDEO_FRAMES;
110 // Run network
111 bool status = RunConfiguration(opts);
113 if (!status)
114 {
115 cout << "mcbench FAILED" << endl;
116 return EXIT_FAILURE;
117 }
119 cout << "mcbench PASSED" << endl;
120 return EXIT_SUCCESS;
121 }
123 bool RunConfiguration(const cmdline_opts_t& opts)
124 {
125 // Read the TI DL configuration file
126 Configuration c;
127 if (!c.ReadFromFile(opts.config))
128 return false;
130 c.enableApiTrace = opts.verbose;
131 if(opts.num_layers_groups == 1)
132 c.runFullNet = true; //Force all layers to be in the same group
134 std::string inputFile;
135 if (opts.input_file.empty())
136 inputFile = c.inData;
137 else
138 inputFile = opts.input_file;
140 int frame_size = c.inNumChannels * c.inWidth * c.inHeight;
142 c.numFrames = GetBinaryFileSize (inputFile) / frame_size;
144 cout << "Input: " << inputFile << " frames:" << c.numFrames << endl;
146 // Read input file into memory buffer
147 char *input_frame_buffer = new char[c.numFrames * frame_size]();
148 ifstream ifs(inputFile, ios::binary);
149 ifs.read(input_frame_buffer, c.numFrames * frame_size);
150 if(!ifs.good()) {
151 std::cout << "Invalid File input:" << inputFile << std::endl;
152 return false;
153 }
155 bool status = true;
156 try
157 {
158 Executor *e_eve = NULL;
159 Executor *e_dsp = NULL;
160 std::vector<ExecutionObjectPipeline *> eops;
161 if (! CreateExecutionObjectPipelines(opts.num_eves, opts.num_dsps, c,
162 opts.num_layers_groups,
163 e_eve, e_dsp, eops))
164 return false;
166 // Allocate input/output memory for each EOP
167 AllocateMemory(eops);
169 chrono::time_point<chrono::steady_clock> tloop0, tloop1;
170 tloop0 = chrono::steady_clock::now();
172 // Process frames with available eops in a pipelined manner
173 // additional num_eops iterations to flush pipeline (epilogue)
174 uint32_t num_eops = eops.size();
175 for (uint32_t frame_idx = 0;
176 frame_idx < opts.num_frames + num_eops; frame_idx++)
177 {
178 ExecutionObjectPipeline* eop = eops[frame_idx % num_eops];
180 // Wait for previous frame on the same eop to finish processing
181 if (eop->ProcessFrameWait())
182 ;
184 // Read a frame and start processing it with current eo
185 if (ReadFrame(*eop, frame_idx, c, opts, input_frame_buffer))
186 eop->ProcessFrameStartAsync();
187 }
189 tloop1 = chrono::steady_clock::now();
190 chrono::duration<float> elapsed = tloop1 - tloop0;
191 cout << "Loop total time: "
192 << setw(6) << setprecision(4)
193 << (elapsed.count() * 1000) << "ms" << endl;
194 cout << "FPS:" << opts.num_frames / elapsed.count() << endl;
196 FreeMemory(eops);
198 for (auto eop : eops)
199 delete eop;
201 delete e_eve;
202 delete e_dsp;
203 }
204 catch (tidl::Exception &e)
205 {
206 cerr << e.what() << endl;
207 status = false;
208 }
210 delete [] input_frame_buffer;
211 return status;
212 }
214 // Create an Executor with the specified type and number of EOs
215 Executor* CreateExecutor(DeviceType dt, uint32_t num, const Configuration& c,
216 int layers_group_id)
217 {
218 if (num == 0) return nullptr;
220 DeviceIds ids;
221 for (uint32_t i = 0; i < num; i++)
222 ids.insert(static_cast<DeviceId>(i));
224 return new Executor(dt, ids, c, layers_group_id);
225 }
227 bool CreateExecutionObjectPipelines(uint32_t num_eves, uint32_t num_dsps,
228 Configuration& configuration,
229 uint32_t num_layers_groups,
230 Executor*& e_eve, Executor*& e_dsp,
231 std::vector<ExecutionObjectPipeline*>& eops)
232 {
233 DeviceIds ids_eve, ids_dsp;
234 for (uint32_t i = 0; i < num_eves; i++)
235 ids_eve.insert(static_cast<DeviceId>(i));
236 for (uint32_t i = 0; i < num_dsps; i++)
237 ids_dsp.insert(static_cast<DeviceId>(i));
239 // Construct ExecutionObjectPipeline that utilizes multiple
240 // ExecutionObjects to process a single frame, each ExecutionObject
241 // processes one layerGroup of the network
242 //
243 // Pipeline depth can enable more optimized pipeline execution:
244 // Given one EVE and one DSP as an example, with different
245 // buffer_factor, we have different execution behavior:
246 // If buffer_factor is set to 1,
247 // we create one EOP: eop0 (eve0, dsp0)
248 // pipeline execution of multiple frames over time is as follows:
249 // --------------------- time ------------------->
250 // eop0: [eve0...][dsp0]
251 // eop0: [eve0...][dsp0]
252 // eop0: [eve0...][dsp0]
253 // eop0: [eve0...][dsp0]
254 // If buffer_factor is set to 2,
255 // we create two EOPs: eop0 (eve0, dsp0), eop1(eve0, dsp0)
256 // pipeline execution of multiple frames over time is as follows:
257 // --------------------- time ------------------->
258 // eop0: [eve0...][dsp0]
259 // eop1: [eve0...][dsp0]
260 // eop0: [eve0...][dsp0]
261 // eop1: [eve0...][dsp0]
262 // Additional benefit of setting buffer_factor to 2 is that
263 // it can also overlap host ReadFrame() with device processing:
264 // --------------------- time ------------------->
265 // eop0: [RF][eve0...][dsp0]
266 // eop1: [RF] [eve0...][dsp0]
267 // eop0: [RF][eve0...][dsp0]
268 // eop1: [RF][eve0...][dsp0]
269 const uint32_t buffer_factor = 2;
271 switch(num_layers_groups)
272 {
273 case 1: // Single layers group
274 e_eve = num_eves == 0 ? nullptr :
275 new Executor(DeviceType::EVE, ids_eve, configuration);
276 e_dsp = num_dsps == 0 ? nullptr :
277 new Executor(DeviceType::DSP, ids_dsp, configuration);
279 // Construct ExecutionObjectPipeline with single Execution Object to
280 // process each frame. This is parallel processing of frames with
281 // as many DSP and EVE cores that we have on hand.
282 // If buffer_factor == 2, duplicating EOPs for double buffering
283 // and overlapping host pre/post-processing with device processing
284 for (uint32_t j = 0; j < buffer_factor; j++)
285 {
286 for (uint32_t i = 0; i < num_eves; i++)
287 eops.push_back(new ExecutionObjectPipeline({(*e_eve)[i]}));
288 for (uint32_t i = 0; i < num_dsps; i++)
289 eops.push_back(new ExecutionObjectPipeline({(*e_dsp)[i]}));
290 }
291 break;
293 case 2: // Two layers group
294 // Create Executors with the approriate core type, number of cores
295 // and configuration specified
296 // EVE will run layersGroupId 1 in the network, while
297 // DSP will run layersGroupId 2 in the network
298 e_eve = num_eves == 0 ? nullptr :
299 new Executor(DeviceType::EVE, ids_eve, configuration, 1);
300 e_dsp = num_dsps == 0 ? nullptr :
301 new Executor(DeviceType::DSP, ids_dsp, configuration, 2);
303 // Construct ExecutionObjectPipeline that utilizes multiple
304 // ExecutionObjects to process a single frame, each ExecutionObject
305 // processes one layerGroup of the network
306 // If buffer_factor == 2, duplicating EOPs for pipelining at
307 // EO level rather than at EOP level, in addition to double buffering
308 // and overlapping host pre/post-processing with device processing
309 for (uint32_t j = 0; j < buffer_factor; j++)
310 for (uint32_t i = 0; i < std::max(num_eves, num_dsps); i++)
311 eops.push_back(new ExecutionObjectPipeline(
312 {(*e_eve)[i%num_eves], (*e_dsp)[i%num_dsps]}));
313 break;
315 default:
316 std::cout << "Layers groups must be either 1 or 2!" << std::endl;
317 return false;
318 break;
319 }
321 return true;
322 }
// Subtract a per-channel mean from every pixel of one image plane,
// saturating the result to the signed 8-bit range [-128, 127] before
// storing it back into the same byte buffer.
static void subtractMeanValue(unsigned char *frame_buffer, int channel_size,
                              int32_t mean_value)
{
    for (int idx = 0; idx < channel_size; ++idx)
    {
        const int32_t shifted =
            static_cast<int32_t>(frame_buffer[idx]) - mean_value;
        frame_buffer[idx] = static_cast<unsigned char>(
            std::clamp<int32_t>(shifted, -128, 127));
    }
}
339 bool ReadFrame(ExecutionObjectPipeline& eop, uint32_t frame_idx,
340 const Configuration& c, const cmdline_opts_t& opts,
341 char *input_frames_buffer)
342 {
343 if (frame_idx >= opts.num_frames)
344 return false;
346 eop.SetFrameIndex(frame_idx);
348 unsigned char* frame_buffer = (unsigned char *)eop.GetInputBufferPtr();
349 assert (frame_buffer != nullptr);
350 //Current implementation of this function assumes 3 channels on input
351 assert (c.inNumChannels == 3);
353 int channel_size = c.inWidth * c.inHeight;
354 char *bgr_frames_input = input_frames_buffer + (frame_idx % c.numFrames) *
355 channel_size * c.inNumChannels;
357 memcpy(frame_buffer, bgr_frames_input + 0, channel_size);
358 if(c.preProcType == 1)
359 subtractMeanValue(frame_buffer, channel_size, 104);
360 else if(c.preProcType == 2)
361 subtractMeanValue(frame_buffer, channel_size, 128);
362 frame_buffer += channel_size;
364 memcpy(frame_buffer, bgr_frames_input + 1 * channel_size, channel_size);
365 if(c.preProcType == 1)
366 subtractMeanValue(frame_buffer, channel_size, 117);
367 else if(c.preProcType == 2)
368 subtractMeanValue(frame_buffer, channel_size, 128);
369 frame_buffer += channel_size;
371 memcpy(frame_buffer, bgr_frames_input + 2 * channel_size, channel_size);
372 if(c.preProcType == 1)
373 subtractMeanValue(frame_buffer, channel_size, 123);
374 else if(c.preProcType == 2)
375 subtractMeanValue(frame_buffer, channel_size, 128);
377 return true;
378 }
// Print the command-line usage summary to stdout.
void DisplayHelp()
{
    // Keep the full usage text in one literal so it is emitted atomically.
    static const char* const kUsage =
        "Usage: mcbench\n"
        " Runs partitioned network to perform multi-object detection\n"
        " and classification. First part of network (layersGroupId 1) runs on\n"
        " EVE, second part (layersGroupId 2) runs on DSP.\n"
        " Use -c to run a different segmentation network. Default is jdetnet.\n"
        "Optional arguments:\n"
        " -c <config> Valid configs: ../test/testvecs/config/infer/... \n"
        " -d <number> Number of DSP cores to use\n"
        " -e <number> Number of EVE cores to use\n"
        " -g <1|2> Number of layer groups\n"
        " -f <number> Number of frames to process\n"
        " -i <image> Path to the input image file\n"
        " -v Verbose output during execution\n"
        " -h Help\n";
    std::cout << kUsage;
}