1 /******************************************************************************
2 * Copyright (c) 2019, Texas Instruments Incorporated - http://www.ti.com/
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Texas Instruments Incorporated nor the
13 * names of its contributors may be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
26 * THE POSSIBILITY OF SUCH DAMAGE.
27 *****************************************************************************/
29 #include <signal.h>
30 #include <iostream>
31 #include <iomanip>
32 #include <fstream>
33 #include <cassert>
34 #include <string>
35 #include <functional>
36 #include <algorithm>
37 #include <time.h>
38 #include <unistd.h>
40 #include <queue>
41 #include <vector>
42 #include <chrono>
43 #include <future>
45 #include "executor.h"
46 #include "execution_object.h"
47 #include "execution_object_pipeline.h"
48 #include "subgraph_runtime.h"
49 #include "subgraph_data_conv.h"
50 #include "configuration.h"
51 #include "../common/object_classes.h"
52 #include "imgutil.h"
53 #include "../common/video_utils.h"
54 #include "thread_pool.h"
56 #include "opencv2/core.hpp"
57 #include "opencv2/imgproc.hpp"
58 #include "opencv2/highgui.hpp"
59 #include "opencv2/videoio.hpp"
61 using namespace std;
62 using namespace tidl;
63 using namespace cv;
// Default number of frames when the input is a camera or a video file
// (main() uses 1 for still-image input instead).
#define NUM_VIDEO_FRAMES 300
// Configuration name stored in opts.config before argument parsing.
#define DEFAULT_CONFIG "j11_v2"
// Number of entries in default_inputs[].
#define NUM_DEFAULT_INPUTS 1
// Object classes list file used unless overridden on the command line.
#define DEFAULT_OBJECT_CLASSES_LIST_FILE "imagenet_objects.json"
// Default reporting threshold, in percent (WriteFrameOutput compares it
// against probability * 100).
#define DEFAULT_OUTPUT_PROB_THRESHOLD 5
// Network input dimensions: channels, height, width.
#define MOBILENET_IN_C (3)
#define MOBILENET_IN_H (224)
#define MOBILENET_IN_W (224)
// Number of elements in one input tensor (batch of 1).
#define MOBILENET_INPUT_SIZE (1*MOBILENET_IN_C*MOBILENET_IN_H*MOBILENET_IN_W)
// Number of elements in one output tensor.
#define MOBILENET_OUTPUT_SIZE (1001)
// Image(s) used when no input file is given on the command line.
const char *default_inputs[NUM_DEFAULT_INPUTS] =
{
    "../test/testvecs/input/objects/cat-pet-animal-domestic-104827.jpeg"
};
// Class label table; created in main() and read by WriteFrameOutput().
std::unique_ptr<ObjectClasses> object_classes;
// Per-task argument handed to the thread pool callback: the input and
// output pointer tables that the callback forwards to TidlRunSubgraph.
struct UserData
{
    float **inputs;
    float **outputs;
};
// Runs all inference tests; returns false if video setup fails or any test
// section raises a tidl::Exception.
bool RunConfiguration(cmdline_opts_t& opts);
// Reads one image/frame, preprocesses it and writes the converted result
// into each of the first batch_size buffers in inputs.
bool ReadFrame(const cmdline_opts_t& opts, VideoCapture &cap, float** inputs,
int batch_size);
// Prints the top classes found in one output tensor.
bool WriteFrameOutput(float *out, const cmdline_opts_t& opts);
// Prints the command line usage text.
void DisplayHelp();
// Thread pool callback; user_data points to a UserData.
void SubgraphUserFunc(void *user_data);
// Number of results printed per test before the remaining ones are skipped.
const int num_printed_outputs = 4;

// Decides whether the result with index i should be suppressed.
//   i            - index of the result about to be printed
//   offset       - index of the first result of the current test
//   skip_outputs - sticky flag; set once the print limit has been reached
// Prints a one-time notice when the limit is first reached.
// Returns the (possibly updated) value of skip_outputs.
bool SkipOutputs(int i, int offset, bool &skip_outputs)
{
    // Once skipping has started the limit is not evaluated again.
    if (!skip_outputs && i >= num_printed_outputs + offset)
    {
        std::cout << " ... skippping outputs ..." << std::endl;
        skip_outputs = true;
    }
    return skip_outputs;
}
107 int main(int argc, char *argv[])
108 {
109 // Catch ctrl-c to ensure a clean exit
110 signal(SIGABRT, exit);
111 signal(SIGTERM, exit);
113 // If there are no devices capable of offloading TIDL on the SoC, exit
114 uint32_t num_eves = Executor::GetNumDevices(DeviceType::EVE);
115 uint32_t num_dsps = Executor::GetNumDevices(DeviceType::DSP);
116 if (num_eves == 0 && num_dsps == 0)
117 {
118 cout << "TI DL not supported on this SoC." << endl;
119 return EXIT_SUCCESS;
120 }
122 // Process arguments
123 cmdline_opts_t opts;
124 opts.config = DEFAULT_CONFIG;
125 opts.object_classes_list_file = DEFAULT_OBJECT_CLASSES_LIST_FILE;
126 opts.output_prob_threshold = DEFAULT_OUTPUT_PROB_THRESHOLD;
127 if (num_eves != 0) { opts.num_eves = 1; opts.num_dsps = 0; }
128 else { opts.num_eves = 0; opts.num_dsps = 1; }
129 if (! ProcessArgs(argc, argv, opts))
130 {
131 DisplayHelp();
132 exit(EXIT_SUCCESS);
133 }
134 assert(opts.num_dsps != 0 || opts.num_eves != 0);
135 if (opts.num_frames == 0)
136 opts.num_frames = (opts.is_camera_input || opts.is_video_input) ?
137 NUM_VIDEO_FRAMES : 1;
138 if (opts.input_file.empty())
139 cout << "Input: " << default_inputs[0] << endl;
140 else
141 cout << "Input: " << opts.input_file << endl;
143 // Get object classes list
144 object_classes = std::unique_ptr<ObjectClasses>(
145 new ObjectClasses(opts.object_classes_list_file));
146 if (object_classes->GetNumClasses() == 0)
147 {
148 cout << "No object classes defined for this config." << endl;
149 return EXIT_FAILURE;
150 }
152 // Run network
153 bool status = RunConfiguration(opts);
154 if (!status)
155 {
156 cout << "imagenet FAILED" << endl;
157 return EXIT_FAILURE;
158 }
160 cout << "imagenet PASSED" << endl;
161 return EXIT_SUCCESS;
162 }
164 bool RunConfiguration(cmdline_opts_t& opts)
165 {
166 bool status = true;
168 // setup camera/video input/output
169 VideoCapture cap;
170 if (! SetVideoInputOutput(cap, opts, "ImageNet")) return false;
172 cout << "\n##### Batch size 1 testing ######\n" << endl;
173 try
174 {
175 TidlInitSubgraph(1, 0);
176 float **inputs = new float *[1];
177 inputs[0] = new float[MOBILENET_INPUT_SIZE];
178 float **outputs = new float *[1];
179 outputs[0] = new float[MOBILENET_OUTPUT_SIZE];
181 for (int i = 0; i < 5; i ++)
182 {
183 chrono::time_point<chrono::steady_clock> tloop0, tloop1;
184 tloop0 = chrono::steady_clock::now();
186 ReadFrame(opts, cap, inputs, 1);
187 TidlRunSubgraph(1, 0, 1, 1, 1, inputs, outputs);
188 WriteFrameOutput(outputs[0], opts);
190 tloop1 = chrono::steady_clock::now();
191 chrono::duration<float> elapsed = tloop1 - tloop0;
192 cout << "Frame " << i
193 << " time (including read/write/opencv/print/etc): "
194 << setw(6) << setprecision(4)
195 << (elapsed.count() * 1000) << "ms" << endl;
196 }
198 delete [] inputs[0];
199 delete [] inputs;
200 delete [] outputs[0];
201 delete [] outputs;
202 }
203 catch (tidl::Exception &e)
204 {
205 cerr << e.what() << endl;
206 status = false;
207 }
209 // If not doing multi-threaded processing, multiply by 2 or more
210 // for a larger batch to amortize batch initilization/tear down cost
211 int preferred_batch_size = TidlGetPreferredBatchSize(1);
212 for (int multiple = 1; multiple <= 16; multiple *= 2)
213 {
214 int batch_size = preferred_batch_size * multiple;
215 cout << "\n##### Batch size " << batch_size << " testing ######\n"
216 << endl;
217 bool skip_outputs = false;
218 try
219 {
220 float **inputs = new float *[batch_size];
221 float **outputs = new float *[batch_size];
222 for (int i = 0; i < batch_size; i++)
223 {
224 inputs[i] = new float[MOBILENET_INPUT_SIZE];
225 outputs[i] = new float[MOBILENET_OUTPUT_SIZE];
226 }
228 chrono::time_point<chrono::steady_clock> tloop0, tloop1;
229 tloop0 = chrono::steady_clock::now();
231 ReadFrame(opts, cap, inputs, batch_size);
232 TidlRunSubgraph(1, 0, batch_size, 1, 1, inputs, outputs);
233 for (int i = 0; i < batch_size; i++)
234 {
235 if (! SkipOutputs(i, 0, skip_outputs))
236 {
237 cout << "Frame " << i << " of " << batch_size
238 << " output:" << endl;
239 WriteFrameOutput(outputs[i], opts);
240 }
241 }
243 tloop1 = chrono::steady_clock::now();
244 chrono::duration<float> elapsed = tloop1 - tloop0;
245 cout << "Batch size " << batch_size
246 << " time: "
247 << setw(6) << setprecision(4)
248 << (elapsed.count() * 1000) << "ms, fps = "
249 << setw(6) << setprecision(4)
250 << (batch_size / elapsed.count())
251 << endl;
253 for (int i = 0; i < batch_size; i++)
254 {
255 delete [] inputs[i];
256 delete [] outputs[i];
257 }
258 delete [] inputs;
259 delete [] outputs;
260 }
261 catch (tidl::Exception &e)
262 {
263 cerr << e.what() << endl;
264 status = false;
265 }
266 }
268 // This is to test the multithreaded inference with async/future
269 // async/future has slightly worse threading performance than
270 // thread pool, however, it is much easier to program
271 cout << "\n##### Multithreaded inference testing (async/future) #####\n"
272 << endl;
273 int num_threads = TidlGetPreferredBatchSize(1) * 2;
274 int num_iters = 100;
275 try
276 {
277 float **inputs = new float *[num_threads];
278 float **outputs = new float *[num_threads];
279 for (int i = 0; i < num_threads; i++)
280 {
281 inputs[i] = new float[MOBILENET_INPUT_SIZE];
282 outputs[i] = new float[MOBILENET_OUTPUT_SIZE];
283 }
284 vector<future<bool>> futures(num_threads);
285 bool skip_outputs = false;
287 chrono::time_point<chrono::steady_clock> tloop0, tloop1;
288 tloop0 = chrono::steady_clock::now();
290 for (int i = 0; i < num_iters + num_threads; i++)
291 {
292 int index = i % num_threads;
293 if (i >= num_threads)
294 {
295 if (futures[index].get())
296 {
297 if (! SkipOutputs(i, num_threads, skip_outputs))
298 WriteFrameOutput(outputs[index], opts);
299 }
300 }
302 if (i < num_iters)
303 {
304 ReadFrame(opts, cap, &inputs[index], 1);
305 futures[index] = std::async(std::launch::async,
306 [inputs, outputs](int index) {
307 TidlRunSubgraph(1, 0, 1, 1, 1,
308 &inputs[index], &outputs[index]);
309 return true;
310 },
311 index);
312 }
313 }
315 tloop1 = chrono::steady_clock::now();
316 chrono::duration<float> elapsed = tloop1 - tloop0;
317 cout << "Multithreaded (num_threads=" << num_threads
318 << ", batch_size=1) loop time (" << num_iters << " frames): "
319 << setw(6) << setprecision(4)
320 << (elapsed.count() * 1000) << "ms, fps = "
321 << setw(6) << setprecision(4)
322 << (num_iters / elapsed.count())
323 << endl;
325 for (int i = 0; i < num_threads; i++)
326 {
327 delete [] inputs[i];
328 delete [] outputs[i];
329 }
330 delete [] inputs;
331 delete [] outputs;
332 }
333 catch (tidl::Exception &e)
334 {
335 cerr << e.what() << endl;
336 status = false;
337 }
339 // This is to test the multithreaded inference with a thread pool
340 cout << "\n##### Multithreaded inference testing (thread pool) #####\n"
341 << endl;
342 try
343 {
344 float **inputs = new float *[num_threads];
345 float **outputs = new float *[num_threads];
346 vector<UserData> v_data(num_threads);
347 for (int i = 0; i < num_threads; i++)
348 {
349 inputs[i] = new float[MOBILENET_INPUT_SIZE];
350 outputs[i] = new float[MOBILENET_OUTPUT_SIZE];
351 v_data[i].inputs = &inputs[i];
352 v_data[i].outputs = &outputs[i];
353 }
354 ThPool pool(num_threads, SubgraphUserFunc);
355 vector<int> th_ids(num_threads);
356 bool skip_outputs = false;
358 chrono::time_point<chrono::steady_clock> tloop0, tloop1;
359 tloop0 = chrono::steady_clock::now();
361 for (int i = 0; i < num_iters + num_threads; i++)
362 {
363 int index = i % num_threads;
364 if (i >= num_threads)
365 {
366 UserData *data = (UserData *) pool.Wait(th_ids[index]);
367 if (! SkipOutputs(i, num_threads, skip_outputs))
368 WriteFrameOutput(data->outputs[0], opts);
369 }
371 if (i < num_iters)
372 {
373 ReadFrame(opts, cap, &inputs[index], 1);
374 th_ids[index] = pool.RunAsync(&v_data[index]);
375 }
376 }
378 tloop1 = chrono::steady_clock::now();
379 chrono::duration<float> elapsed = tloop1 - tloop0;
380 cout << "Multithreaded (num_threads=" << num_threads
381 << ", batch_size=1) loop time (" << num_iters << " frames): "
382 << setw(6) << setprecision(4)
383 << (elapsed.count() * 1000) << "ms, fps = "
384 << setw(6) << setprecision(4)
385 << (num_iters / elapsed.count())
386 << endl;
388 for (int i = 0; i < num_threads; i++)
389 {
390 delete [] inputs[i];
391 delete [] outputs[i];
392 }
393 delete [] inputs;
394 delete [] outputs;
395 }
396 catch (tidl::Exception &e)
397 {
398 cerr << e.what() << endl;
399 status = false;
400 }
402 num_threads = 2;
403 int batch_size = preferred_batch_size;
404 // This is to test the multithreaded batch inference with async/future
405 // Ideally, batch_size * num_threads <= number of threads
406 cout << "\n##### Multithreaded batch inference testing (async/future)"
407 << " #####\n" << endl;
408 try
409 {
410 float **inputs = new float *[num_threads * batch_size];
411 float **outputs = new float *[num_threads * batch_size];
412 for (int i = 0; i < num_threads * batch_size; i++)
413 {
414 inputs[i] = new float[MOBILENET_INPUT_SIZE];
415 outputs[i] = new float[MOBILENET_OUTPUT_SIZE];
416 }
417 vector<future<bool>> futures(num_threads);
418 bool skip_outputs = false;
420 chrono::time_point<chrono::steady_clock> tloop0, tloop1;
421 tloop0 = chrono::steady_clock::now();
423 for (int i = 0; i < num_iters/batch_size + num_threads; i++)
424 {
425 int index = i % num_threads;
426 if (i >= num_threads)
427 {
428 if (futures[index].get())
429 if (! SkipOutputs(i*batch_size, num_threads*batch_size,
430 skip_outputs))
431 for (int b = 0; b < batch_size; b++)
432 WriteFrameOutput(outputs[index*batch_size+b], opts);
433 }
435 if (i < num_iters/batch_size)
436 {
437 ReadFrame(opts, cap, &inputs[index*batch_size], batch_size);
438 futures[index] = std::async(std::launch::async,
439 [inputs, outputs, batch_size](int index) {
440 TidlRunSubgraph(1, 0, batch_size, 1, 1,
441 &inputs[index*batch_size],
442 &outputs[index*batch_size]);
443 return true;
444 },
445 index);
446 }
447 }
449 tloop1 = chrono::steady_clock::now();
450 chrono::duration<float> elapsed = tloop1 - tloop0;
451 cout << "Multithreaded batch (num_threads=" << num_threads
452 << ", batch_size=" << batch_size
453 << ") loop time (" << num_iters << " frames): "
454 << setw(6) << setprecision(4)
455 << (elapsed.count() * 1000) << "ms, fps = "
456 << setw(6) << setprecision(4)
457 << (num_iters / elapsed.count())
458 << endl;
460 for (int i = 0; i < num_threads * batch_size; i++)
461 {
462 delete [] inputs[i];
463 delete [] outputs[i];
464 }
465 delete [] inputs;
466 delete [] outputs;
467 }
468 catch (tidl::Exception &e)
469 {
470 cerr << e.what() << endl;
471 status = false;
472 }
475 return status;
476 }
478 void SubgraphUserFunc(void *user_data)
479 {
480 UserData *data = (UserData *) user_data;
481 //printf("data inputs = %p, outputs = %p\n", data->inputs, data->outputs);
482 TidlRunSubgraph(1, 0, 1, 1, 1, data->inputs, data->outputs);
483 //printf("TidlRunSubgraph finished\n");
484 }
486 bool ReadFrame(const cmdline_opts_t& opts, VideoCapture &cap, float** inputs,
487 int batch_size)
488 {
489 Configuration c;
490 c.inNumChannels = MOBILENET_IN_C;
491 c.inWidth = MOBILENET_IN_W;
492 c.inHeight = MOBILENET_IN_H;
493 c.preProcType = 2;
494 SubgraphDataConv in_conv{{0}, {true}, {128.0f}, {false},
495 {1,MOBILENET_IN_C,MOBILENET_IN_H,MOBILENET_IN_W}};
497 char* frame_buffer = new char[MOBILENET_INPUT_SIZE];
498 assert (frame_buffer != nullptr);
500 Mat image;
501 if (! opts.is_camera_input && ! opts.is_video_input)
502 {
503 if (opts.input_file.empty())
504 image = cv::imread(default_inputs[0],
505 CV_LOAD_IMAGE_COLOR);
506 else
507 image = cv::imread(opts.input_file, CV_LOAD_IMAGE_COLOR);
508 if (image.empty())
509 {
510 cerr << "Unable to read input image" << endl;
511 return false;
512 }
513 }
514 else
515 {
516 Mat v_image;
517 if (! cap.grab()) return false;
518 if (! cap.retrieve(v_image)) return false;
519 int orig_width = v_image.cols;
520 int orig_height = v_image.rows;
521 // Crop camera/video input to center 256x256 input
522 if (orig_width > 256 && orig_height > 256)
523 {
524 image = Mat(v_image, Rect((orig_width-256)/2, (orig_height-256)/2,
525 256, 256));
526 }
527 else
528 image = v_image;
529 cv::imshow("ImageNet", image);
530 waitKey(2);
531 }
533 // TI DL image preprocessing, into frame_buffer
534 bool status = imgutil::PreprocessImage(image, frame_buffer, c);
535 for (int i = 0; i < batch_size; i++)
536 {
537 std::vector<float *> in_data_v{inputs[i]};
538 in_conv.ScaleDequant((const uint8_t *)frame_buffer, in_data_v);
539 }
540 delete [] frame_buffer;
541 return status;
542 }
544 // Display top 5 classified imagenet classes with probabilities 5% or higher
545 bool WriteFrameOutput(float *out, const cmdline_opts_t& opts)
546 {
547 const int k = 5;
548 int out_size = 1001;
549 // Tensorflow trained network outputs 1001 probabilities,
550 // with 0-index being background, thus we need to subtract 1 when
551 // reporting classified object from 1000 categories
552 int background_offset = out_size == 1001 ? 1 : 0;
554 // sort and get k largest values and corresponding indices
555 typedef pair<float, int> val_index;
556 auto cmp = [](val_index &left, val_index &right)
557 { return left.first > right.first; };
558 priority_queue<val_index, vector<val_index>, decltype(cmp)> queue(cmp);
560 // initialize priority queue with smallest value on top
561 for (int i = 0; i < k; i++)
562 queue.push(val_index(out[i], i));
564 // for rest output, if larger than current min, pop min, push new val
565 for (int i = k; i < out_size; i++)
566 {
567 if (out[i] > queue.top().first)
568 {
569 queue.pop();
570 queue.push(val_index(out[i], i));
571 }
572 }
574 // output top k values in reverse order: largest val first
575 vector<val_index> sorted;
576 while (! queue.empty())
577 {
578 sorted.push_back(queue.top());
579 queue.pop();
580 }
582 for (int i = k - 1; i >= 0; i--)
583 {
584 if (sorted[i].first * 100 < opts.output_prob_threshold) break;
585 int imagenet_index = sorted[i].second - background_offset;
586 cout << k-i << ": [" << imagenet_index << "] "
587 << object_classes->At(imagenet_index).label
588 << ", prob = " << setprecision(4)
589 << (sorted[i].first * 100) << "%" << endl;
590 }
592 return true;
593 }
// Prints the command line usage text to standard output.
void DisplayHelp()
{
    static const char usage_text[] =
        "Usage: imagenet\n"
        " Will run imagenet network to predict top 5 object"
        " classes for the input.\n Use -c to run a"
        " different imagenet network. Default is j11_v2.\n"
        "Optional arguments:\n"
        " -c <config> Valid configs: j11_bn, j11_prelu, j11_v2\n"
        " -d <number> Number of dsp cores to use\n"
        " -e <number> Number of eve cores to use\n"
        " -i <image> Path to the image file as input\n"
        " -i camera<number> Use camera as input\n"
        " video input port: /dev/video<number>\n"
        " -i <name>.{mp4,mov,avi} Use video file as input\n"
        " -l <objects_list> Path to the object classes list file\n"
        " -f <number> Number of frames to process\n"
        " -p <number> Output probablity threshold in percentage\n"
        " Default is 5 percent or higher.\n"
        " -v Verbose output during execution\n"
        " -h Help\n";

    std::cout << usage_text;
}