8a77f6576eed068ecfc3b20d506c476b26ac3aa7
1 /******************************************************************************
2 * Copyright (c) 2019, Texas Instruments Incorporated - http://www.ti.com/
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Texas Instruments Incorporated nor the
13 * names of its contributors may be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
26 * THE POSSIBILITY OF SUCH DAMAGE.
27 *****************************************************************************/
29 #include <signal.h>
30 #include <iostream>
31 #include <iomanip>
32 #include <fstream>
33 #include <cassert>
34 #include <string>
35 #include <functional>
36 #include <algorithm>
37 #include <time.h>
38 #include <unistd.h>
40 #include <queue>
41 #include <vector>
42 #include <chrono>
43 #include <future>
45 #include "executor.h"
46 #include "execution_object.h"
47 #include "execution_object_pipeline.h"
48 #include "subgraph_runtime.h"
49 #include "subgraph_data_conv.h"
50 #include "configuration.h"
51 #include "../common/object_classes.h"
52 #include "imgutil.h"
53 #include "../common/video_utils.h"
54 #include "thread_pool.h"
56 #include "opencv2/core.hpp"
57 #include "opencv2/imgproc.hpp"
58 #include "opencv2/highgui.hpp"
59 #include "opencv2/videoio.hpp"
61 using namespace std;
62 using namespace tidl;
63 using namespace cv;
65 #define NUM_VIDEO_FRAMES 300
66 #define DEFAULT_CONFIG "j11_v2"
67 #define NUM_DEFAULT_INPUTS 1
68 #define DEFAULT_OBJECT_CLASSES_LIST_FILE "imagenet_objects.json"
69 #define DEFAULT_OUTPUT_PROB_THRESHOLD 5
70 const char *default_inputs[NUM_DEFAULT_INPUTS] =
71 {
72 "../test/testvecs/input/objects/cat-pet-animal-domestic-104827.jpeg"
73 };
74 std::unique_ptr<ObjectClasses> object_classes;
75 typedef struct {
76 float **inputs;
77 float **outputs;
78 } UserData;
80 bool RunConfiguration(cmdline_opts_t& opts);
81 bool ReadFrame(const cmdline_opts_t& opts, VideoCapture &cap, float** inputs,
82 int batch_size);
83 bool WriteFrameOutput(float *out, const cmdline_opts_t& opts);
84 void DisplayHelp();
85 void SubgraphUserFunc(void *user_data);
// Number of results printed (beyond `offset`) before output is suppressed.
const int num_printed_outputs = 4;

// Decides whether the result with running index `i` should be suppressed.
//
// Results are shown while i < num_printed_outputs + offset.  The first call
// that reaches that limit prints a one-time notice and latches
// `skip_outputs` to true; from then on every call returns true regardless
// of `i`.
//
// Returns the (possibly updated) value of `skip_outputs`.
bool SkipOutputs(int i, int offset, bool &skip_outputs)
{
    if (! skip_outputs && i >= num_printed_outputs + offset)
    {
        std::cout << " ... skippping outputs ..." << std::endl;
        skip_outputs = true;
    }
    return skip_outputs;
}
102 int main(int argc, char *argv[])
103 {
104 // Catch ctrl-c to ensure a clean exit
105 signal(SIGABRT, exit);
106 signal(SIGTERM, exit);
108 // If there are no devices capable of offloading TIDL on the SoC, exit
109 uint32_t num_eves = Executor::GetNumDevices(DeviceType::EVE);
110 uint32_t num_dsps = Executor::GetNumDevices(DeviceType::DSP);
111 if (num_eves == 0 && num_dsps == 0)
112 {
113 cout << "TI DL not supported on this SoC." << endl;
114 return EXIT_SUCCESS;
115 }
117 // Process arguments
118 cmdline_opts_t opts;
119 opts.config = DEFAULT_CONFIG;
120 opts.object_classes_list_file = DEFAULT_OBJECT_CLASSES_LIST_FILE;
121 opts.output_prob_threshold = DEFAULT_OUTPUT_PROB_THRESHOLD;
122 if (num_eves != 0) { opts.num_eves = 1; opts.num_dsps = 0; }
123 else { opts.num_eves = 0; opts.num_dsps = 1; }
124 if (! ProcessArgs(argc, argv, opts))
125 {
126 DisplayHelp();
127 exit(EXIT_SUCCESS);
128 }
129 assert(opts.num_dsps != 0 || opts.num_eves != 0);
130 if (opts.num_frames == 0)
131 opts.num_frames = (opts.is_camera_input || opts.is_video_input) ?
132 NUM_VIDEO_FRAMES : 1;
133 if (opts.input_file.empty())
134 cout << "Input: " << default_inputs[0] << endl;
135 else
136 cout << "Input: " << opts.input_file << endl;
138 // Get object classes list
139 object_classes = std::unique_ptr<ObjectClasses>(
140 new ObjectClasses(opts.object_classes_list_file));
141 if (object_classes->GetNumClasses() == 0)
142 {
143 cout << "No object classes defined for this config." << endl;
144 return EXIT_FAILURE;
145 }
147 // Run network
148 bool status = RunConfiguration(opts);
149 if (!status)
150 {
151 cout << "imagenet FAILED" << endl;
152 return EXIT_FAILURE;
153 }
155 cout << "imagenet PASSED" << endl;
156 return EXIT_SUCCESS;
157 }
159 bool RunConfiguration(cmdline_opts_t& opts)
160 {
161 bool status = true;
163 // setup camera/video input/output
164 VideoCapture cap;
165 if (! SetVideoInputOutput(cap, opts, "ImageNet")) return false;
167 cout << "\n##### Batch size 1 testing ######\n" << endl;
168 try
169 {
170 TidlInitSubgraph(1, 0);
171 float **inputs = new float *[1];
172 inputs[0] = new float[1*3*224*224];
173 float **outputs = new float *[1];
174 outputs[0] = new float[1001];
176 for (int i = 0; i < 5; i ++)
177 {
178 chrono::time_point<chrono::steady_clock> tloop0, tloop1;
179 tloop0 = chrono::steady_clock::now();
181 ReadFrame(opts, cap, inputs, 1);
182 TidlRunSubgraph(1, 0, 1, 1, 1, inputs, outputs);
183 WriteFrameOutput(outputs[0], opts);
185 tloop1 = chrono::steady_clock::now();
186 chrono::duration<float> elapsed = tloop1 - tloop0;
187 cout << "Frame " << i
188 << " time (including read/write/opencv/print/etc): "
189 << setw(6) << setprecision(4)
190 << (elapsed.count() * 1000) << "ms" << endl;
191 }
193 delete [] inputs[0];
194 delete [] inputs;
195 delete [] outputs[0];
196 delete [] outputs;
197 }
198 catch (tidl::Exception &e)
199 {
200 cerr << e.what() << endl;
201 status = false;
202 }
204 // If not doing multi-threaded processing, multiply by 2 or more
205 // for a larger batch to amortize batch initilization/tear down cost
206 int preferred_batch_size = TidlGetPreferredBatchSize(1);
207 for (int multiple = 1; multiple <= 16; multiple *= 2)
208 {
209 int batch_size = preferred_batch_size * multiple;
210 cout << "\n##### Batch size " << batch_size << " testing ######\n"
211 << endl;
212 bool skip_outputs = false;
213 try
214 {
215 float **inputs = new float *[batch_size];
216 float **outputs = new float *[batch_size];
217 for (int i = 0; i < batch_size; i++)
218 {
219 inputs[i] = new float[1*3*224*224];
220 outputs[i] = new float[1001];
221 }
223 chrono::time_point<chrono::steady_clock> tloop0, tloop1;
224 tloop0 = chrono::steady_clock::now();
226 ReadFrame(opts, cap, inputs, batch_size);
227 TidlRunSubgraph(1, 0, batch_size, 1, 1, inputs, outputs);
228 for (int i = 0; i < batch_size; i++)
229 {
230 if (! SkipOutputs(i, 0, skip_outputs))
231 {
232 cout << "Frame " << i << " of " << batch_size
233 << " output:" << endl;
234 WriteFrameOutput(outputs[i], opts);
235 }
236 }
238 tloop1 = chrono::steady_clock::now();
239 chrono::duration<float> elapsed = tloop1 - tloop0;
240 cout << "Batch size " << batch_size
241 << " time: "
242 << setw(6) << setprecision(4)
243 << (elapsed.count() * 1000) << "ms, fps = "
244 << setw(6) << setprecision(4)
245 << (batch_size / elapsed.count())
246 << endl;
248 for (int i = 0; i < batch_size; i++)
249 {
250 delete [] inputs[i];
251 delete [] outputs[i];
252 }
253 delete [] inputs;
254 delete [] outputs;
255 }
256 catch (tidl::Exception &e)
257 {
258 cerr << e.what() << endl;
259 status = false;
260 }
261 }
263 // This is to test the multithreaded inference with async/future
264 // async/future has slightly worse threading performance than
265 // thread pool, however, it is much easier to program
266 cout << "\n##### Multithreaded inference testing (async/future) #####\n"
267 << endl;
268 int num_threads = TidlGetPreferredBatchSize(1) * 2;
269 int num_iters = 100;
270 try
271 {
272 float **inputs = new float *[num_threads];
273 float **outputs = new float *[num_threads];
274 for (int i = 0; i < num_threads; i++)
275 {
276 inputs[i] = new float[1*3*224*224];
277 outputs[i] = new float[1001];
278 }
279 vector<future<bool>> futures(num_threads);
280 bool skip_outputs = false;
282 chrono::time_point<chrono::steady_clock> tloop0, tloop1;
283 tloop0 = chrono::steady_clock::now();
285 for (int i = 0; i < num_iters + num_threads; i++)
286 {
287 int index = i % num_threads;
288 if (i >= num_threads)
289 {
290 if (futures[index].get())
291 {
292 if (! SkipOutputs(i, num_threads, skip_outputs))
293 WriteFrameOutput(outputs[index], opts);
294 }
295 }
297 if (i < num_iters)
298 {
299 ReadFrame(opts, cap, &inputs[index], 1);
300 futures[index] = std::async(std::launch::async,
301 [inputs, outputs](int index) {
302 TidlRunSubgraph(1, 0, 1, 1, 1,
303 &inputs[index], &outputs[index]);
304 return true;
305 },
306 index);
307 }
308 }
310 tloop1 = chrono::steady_clock::now();
311 chrono::duration<float> elapsed = tloop1 - tloop0;
312 cout << "Multithreaded (num_threads=" << num_threads
313 << ", batch_size=1) loop time (" << num_iters << " frames): "
314 << setw(6) << setprecision(4)
315 << (elapsed.count() * 1000) << "ms, fps = "
316 << setw(6) << setprecision(4)
317 << (num_iters / elapsed.count())
318 << endl;
320 for (int i = 0; i < num_threads; i++)
321 {
322 delete [] inputs[i];
323 delete [] outputs[i];
324 }
325 delete [] inputs;
326 delete [] outputs;
327 }
328 catch (tidl::Exception &e)
329 {
330 cerr << e.what() << endl;
331 status = false;
332 }
334 // This is to test the multithreaded inference with a thread pool
335 cout << "\n##### Multithreaded inference testing (thread pool) #####\n"
336 << endl;
337 try
338 {
339 float **inputs = new float *[num_threads];
340 float **outputs = new float *[num_threads];
341 vector<UserData> v_data(num_threads);
342 for (int i = 0; i < num_threads; i++)
343 {
344 inputs[i] = new float[1*3*224*224];
345 outputs[i] = new float[1001];
346 v_data[i].inputs = &inputs[i];
347 v_data[i].outputs = &outputs[i];
348 }
349 ThPool pool(num_threads, SubgraphUserFunc);
350 vector<int> th_ids(num_threads);
351 bool skip_outputs = false;
353 chrono::time_point<chrono::steady_clock> tloop0, tloop1;
354 tloop0 = chrono::steady_clock::now();
356 for (int i = 0; i < num_iters + num_threads; i++)
357 {
358 int index = i % num_threads;
359 if (i >= num_threads)
360 {
361 UserData *data = (UserData *) pool.Wait(th_ids[index]);
362 if (! SkipOutputs(i, num_threads, skip_outputs))
363 WriteFrameOutput(data->outputs[0], opts);
364 }
366 if (i < num_iters)
367 {
368 ReadFrame(opts, cap, &inputs[index], 1);
369 th_ids[index] = pool.RunAsync(&v_data[index]);
370 }
371 }
373 tloop1 = chrono::steady_clock::now();
374 chrono::duration<float> elapsed = tloop1 - tloop0;
375 cout << "Multithreaded (num_threads=" << num_threads
376 << ", batch_size=1) loop time (" << num_iters << " frames): "
377 << setw(6) << setprecision(4)
378 << (elapsed.count() * 1000) << "ms, fps = "
379 << setw(6) << setprecision(4)
380 << (num_iters / elapsed.count())
381 << endl;
383 for (int i = 0; i < num_threads; i++)
384 {
385 delete [] inputs[i];
386 delete [] outputs[i];
387 }
388 delete [] inputs;
389 delete [] outputs;
390 }
391 catch (tidl::Exception &e)
392 {
393 cerr << e.what() << endl;
394 status = false;
395 }
397 num_threads = 2;
398 int batch_size = preferred_batch_size;
399 // This is to test the multithreaded batch inference with async/future
400 // Ideally, batch_size * num_threads <= number of threads
401 cout << "\n##### Multithreaded batch inference testing (async/future)"
402 << " #####\n" << endl;
403 try
404 {
405 float **inputs = new float *[num_threads * batch_size];
406 float **outputs = new float *[num_threads * batch_size];
407 for (int i = 0; i < num_threads * batch_size; i++)
408 {
409 inputs[i] = new float[1*3*224*224];
410 outputs[i] = new float[1001];
411 }
412 vector<future<bool>> futures(num_threads);
413 bool skip_outputs = false;
415 chrono::time_point<chrono::steady_clock> tloop0, tloop1;
416 tloop0 = chrono::steady_clock::now();
418 for (int i = 0; i < num_iters/batch_size + num_threads; i++)
419 {
420 int index = i % num_threads;
421 if (i >= num_threads)
422 {
423 if (futures[index].get())
424 if (! SkipOutputs(i*batch_size, num_threads*batch_size,
425 skip_outputs))
426 for (int b = 0; b < batch_size; b++)
427 WriteFrameOutput(outputs[index*batch_size+b], opts);
428 }
430 if (i < num_iters/batch_size)
431 {
432 ReadFrame(opts, cap, &inputs[index*batch_size], batch_size);
433 futures[index] = std::async(std::launch::async,
434 [inputs, outputs, batch_size](int index) {
435 TidlRunSubgraph(1, 0, batch_size, 1, 1,
436 &inputs[index*batch_size],
437 &outputs[index*batch_size]);
438 return true;
439 },
440 index);
441 }
442 }
444 tloop1 = chrono::steady_clock::now();
445 chrono::duration<float> elapsed = tloop1 - tloop0;
446 cout << "Multithreaded batch (num_threads=" << num_threads
447 << ", batch_size=" << batch_size
448 << ") loop time (" << num_iters << " frames): "
449 << setw(6) << setprecision(4)
450 << (elapsed.count() * 1000) << "ms, fps = "
451 << setw(6) << setprecision(4)
452 << (num_iters / elapsed.count())
453 << endl;
455 for (int i = 0; i < num_threads * batch_size; i++)
456 {
457 delete [] inputs[i];
458 delete [] outputs[i];
459 }
460 delete [] inputs;
461 delete [] outputs;
462 }
463 catch (tidl::Exception &e)
464 {
465 cerr << e.what() << endl;
466 status = false;
467 }
470 return status;
471 }
473 void SubgraphUserFunc(void *user_data)
474 {
475 UserData *data = (UserData *) user_data;
476 //printf("data inputs = %p, outputs = %p\n", data->inputs, data->outputs);
477 TidlRunSubgraph(1, 0, 1, 1, 1, data->inputs, data->outputs);
478 //printf("TidlRunSubgraph finished\n");
479 }
481 bool ReadFrame(const cmdline_opts_t& opts, VideoCapture &cap, float** inputs,
482 int batch_size)
483 {
484 Configuration c;
485 c.inNumChannels = 3;;
486 c.inWidth = 224;
487 c.inHeight = 224;
488 c.preProcType = 2;
489 SubgraphDataConv in_conv{{0}, {true}, {128.0f}, {false}, {1,3,224,224}};
491 char* frame_buffer = new char[3*224*224];
492 assert (frame_buffer != nullptr);
494 Mat image;
495 if (! opts.is_camera_input && ! opts.is_video_input)
496 {
497 if (opts.input_file.empty())
498 image = cv::imread(default_inputs[0],
499 CV_LOAD_IMAGE_COLOR);
500 else
501 image = cv::imread(opts.input_file, CV_LOAD_IMAGE_COLOR);
502 if (image.empty())
503 {
504 cerr << "Unable to read input image" << endl;
505 return false;
506 }
507 }
508 else
509 {
510 Mat v_image;
511 if (! cap.grab()) return false;
512 if (! cap.retrieve(v_image)) return false;
513 int orig_width = v_image.cols;
514 int orig_height = v_image.rows;
515 // Crop camera/video input to center 256x256 input
516 if (orig_width > 256 && orig_height > 256)
517 {
518 image = Mat(v_image, Rect((orig_width-256)/2, (orig_height-256)/2,
519 256, 256));
520 }
521 else
522 image = v_image;
523 cv::imshow("ImageNet", image);
524 waitKey(2);
525 }
527 // TI DL image preprocessing, into frame_buffer
528 bool status = imgutil::PreprocessImage(image, frame_buffer, c);
529 for (int i = 0; i < batch_size; i++)
530 {
531 std::vector<float *> in_data_v{inputs[i]};
532 in_conv.ScaleDequant((const uint8_t *)frame_buffer, in_data_v);
533 }
534 delete [] frame_buffer;
535 return status;
536 }
538 // Display top 5 classified imagenet classes with probabilities 5% or higher
539 bool WriteFrameOutput(float *out, const cmdline_opts_t& opts)
540 {
541 const int k = 5;
542 int out_size = 1001;
543 // Tensorflow trained network outputs 1001 probabilities,
544 // with 0-index being background, thus we need to subtract 1 when
545 // reporting classified object from 1000 categories
546 int background_offset = out_size == 1001 ? 1 : 0;
548 // sort and get k largest values and corresponding indices
549 typedef pair<float, int> val_index;
550 auto cmp = [](val_index &left, val_index &right)
551 { return left.first > right.first; };
552 priority_queue<val_index, vector<val_index>, decltype(cmp)> queue(cmp);
554 // initialize priority queue with smallest value on top
555 for (int i = 0; i < k; i++)
556 queue.push(val_index(out[i], i));
558 // for rest output, if larger than current min, pop min, push new val
559 for (int i = k; i < out_size; i++)
560 {
561 if (out[i] > queue.top().first)
562 {
563 queue.pop();
564 queue.push(val_index(out[i], i));
565 }
566 }
568 // output top k values in reverse order: largest val first
569 vector<val_index> sorted;
570 while (! queue.empty())
571 {
572 sorted.push_back(queue.top());
573 queue.pop();
574 }
576 for (int i = k - 1; i >= 0; i--)
577 {
578 if (sorted[i].first * 100 < opts.output_prob_threshold) break;
579 int imagenet_index = sorted[i].second - background_offset;
580 cout << k-i << ": [" << imagenet_index << "] "
581 << object_classes->At(imagenet_index).label
582 << ", prob = " << setprecision(4)
583 << (sorted[i].first * 100) << "%" << endl;
584 }
586 return true;
587 }
// Prints the command line usage text to standard output.
void DisplayHelp()
{
    static const char *const usage_text =
        "Usage: imagenet\n"
        " Will run imagenet network to predict top 5 object classes for the input.\n"
        " Use -c to run a different imagenet network. Default is j11_v2.\n"
        "Optional arguments:\n"
        " -c <config> Valid configs: j11_bn, j11_prelu, j11_v2\n"
        " -d <number> Number of dsp cores to use\n"
        " -e <number> Number of eve cores to use\n"
        " -i <image> Path to the image file as input\n"
        " -i camera<number> Use camera as input\n"
        " video input port: /dev/video<number>\n"
        " -i <name>.{mp4,mov,avi} Use video file as input\n"
        " -l <objects_list> Path to the object classes list file\n"
        " -f <number> Number of frames to process\n"
        " -p <number> Output probablity threshold in percentage\n"
        " Default is 5 percent or higher.\n"
        " -v Verbose output during execution\n"
        " -h Help\n";

    std::cout << usage_text;
}