128bc57695504963548a4451992258ab63080bd0
1 /******************************************************************************
2 * Copyright (c) 2018, Texas Instruments Incorporated - http://www.ti.com/
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Texas Instruments Incorporated nor the
13 * names of its contributors may be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
26 * THE POSSIBILITY OF SUCH DAMAGE.
27 *****************************************************************************/
#include <signal.h>
#include <time.h>
#include <unistd.h>

#include <algorithm>
#include <cassert>
#include <chrono>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <functional>
#include <iomanip>
#include <iostream>
#include <queue>
#include <string>
#include <vector>

#include "executor.h"
#include "execution_object.h"
#include "execution_object_pipeline.h"
#include "configuration.h"
#include "../segmentation/object_classes.h"
#include "../common/utils.h"
#include "../common/video_utils.h"
52 using namespace std;
53 using namespace tidl;
54 using namespace cv;
57 #define NUM_VIDEO_FRAMES 100
58 #define DEFAULT_CONFIG "../test/testvecs/config/infer/tidl_config_j11_v2.txt"
60 bool RunConfiguration(const cmdline_opts_t& opts);
61 Executor* CreateExecutor(DeviceType dt, uint32_t num, const Configuration& c,
62 int layers_group_id);
63 bool CreateExecutionObjectPipelines(uint32_t num_eves, uint32_t num_dsps,
64 Configuration& configuration,
65 uint32_t num_layers_groups,
66 Executor*& e_eve, Executor*& e_dsp,
67 std::vector<ExecutionObjectPipeline*>& eops);
69 void AllocateMemory(const std::vector<ExecutionObjectPipeline*>& eops);
71 bool ReadFrame(ExecutionObjectPipeline& eop, uint32_t frame_idx,
72 const Configuration& c, const cmdline_opts_t& opts, char *input_frames_buffer);
73 static void DisplayHelp();
76 int main(int argc, char *argv[])
77 {
78 // Catch ctrl-c to ensure a clean exit
79 signal(SIGABRT, exit);
80 signal(SIGTERM, exit);
82 // If there are no devices capable of offloading TIDL on the SoC, exit
83 uint32_t num_eves = Executor::GetNumDevices(DeviceType::EVE);
84 uint32_t num_dsps = Executor::GetNumDevices(DeviceType::DSP);
85 if ((num_eves == 0) || (num_dsps == 0))
86 {
87 cout << "mcbench requires EVE and/or DSP for execution." << endl;
88 return EXIT_SUCCESS;
89 }
91 cout << "CMDLINE: ";
92 for(int i = 0; i < argc; ++i) cout << argv[i] << " ";
93 cout << endl;
95 // Process arguments
96 cmdline_opts_t opts;
97 opts.config = DEFAULT_CONFIG;
98 opts.num_eves = 0;
99 opts.num_dsps = 2;
100 if (! ProcessArgs(argc, argv, opts))
101 {
102 DisplayHelp();
103 exit(EXIT_SUCCESS);
104 }
105 assert((opts.num_dsps + opts.num_eves) != 0);
107 if (opts.num_frames == 0)
108 opts.num_frames = NUM_VIDEO_FRAMES;
110 // Run network
111 bool status = RunConfiguration(opts);
113 if (!status)
114 {
115 cout << "mcbench FAILED" << endl;
116 return EXIT_FAILURE;
117 }
119 cout << "mcbench PASSED" << endl;
120 return EXIT_SUCCESS;
121 }
123 bool RunConfiguration(const cmdline_opts_t& opts)
124 {
125 // Read the TI DL configuration file
126 Configuration c;
127 if (!c.ReadFromFile(opts.config))
128 return false;
130 c.enableApiTrace = opts.verbose;
131 if(opts.num_layers_groups == 1)
132 c.runFullNet = true; //Force all layers to be in the same group
134 std::string inputFile;
135 if (opts.input_file.empty())
136 inputFile = c.inData;
137 else
138 inputFile = opts.input_file;
140 int frame_size = c.inNumChannels * c.inWidth * c.inHeight;
142 c.numFrames = GetBinaryFileSize (inputFile) / frame_size;
144 cout << "Input: " << inputFile << " frames:" << c.numFrames << endl;
146 // Read input file into memory buffer
147 char *input_frame_buffer = new char[c.numFrames * frame_size]();
148 ifstream ifs(inputFile, ios::binary);
149 ifs.read(input_frame_buffer, c.numFrames * frame_size);
150 if(!ifs.good()) {
151 std::cout << "Invalid File input:" << inputFile << std::endl;
152 return false;
153 }
155 bool status = true;
156 try
157 {
158 Executor *e_eve = NULL;
159 Executor *e_dsp = NULL;
160 std::vector<ExecutionObjectPipeline *> eops;
161 if (! CreateExecutionObjectPipelines(opts.num_eves, opts.num_dsps, c,
162 opts.num_layers_groups,
163 e_eve, e_dsp, eops))
164 return false;
166 // Allocate input/output memory for each EOP
167 AllocateMemory(eops);
169 chrono::time_point<chrono::steady_clock> tloop0, tloop1;
170 tloop0 = chrono::steady_clock::now();
172 // Process frames with available eops in a pipelined manner
173 // additional num_eops iterations to flush pipeline (epilogue)
174 uint32_t num_eops = eops.size();
175 for (uint32_t frame_idx = 0;
176 frame_idx < opts.num_frames + num_eops; frame_idx++)
177 {
178 ExecutionObjectPipeline* eop = eops[frame_idx % num_eops];
180 // Wait for previous frame on the same eop to finish processing
181 if (eop->ProcessFrameWait())
182 ;
184 // Read a frame and start processing it with current eo
185 if (ReadFrame(*eop, frame_idx, c, opts, input_frame_buffer))
186 eop->ProcessFrameStartAsync();
187 }
189 tloop1 = chrono::steady_clock::now();
190 chrono::duration<float> elapsed = tloop1 - tloop0;
191 cout << "Loop total time: "
192 << setw(6) << setprecision(4)
193 << (elapsed.count() * 1000) << "ms" << endl;
194 cout << "FPS:" << opts.num_frames / elapsed.count() << endl;
196 FreeMemory(eops);
198 for (auto eop : eops)
199 delete eop;
201 delete e_eve;
202 delete e_dsp;
203 }
204 catch (tidl::Exception &e)
205 {
206 cerr << e.what() << endl;
207 status = false;
208 }
210 delete [] input_frame_buffer;
211 return status;
212 }
214 // Create an Executor with the specified type and number of EOs
215 Executor* CreateExecutor(DeviceType dt, uint32_t num, const Configuration& c,
216 int layers_group_id)
217 {
218 if (num == 0) return nullptr;
220 DeviceIds ids;
221 for (uint32_t i = 0; i < num; i++)
222 ids.insert(static_cast<DeviceId>(i));
224 return new Executor(dt, ids, c, layers_group_id);
225 }
227 bool CreateExecutionObjectPipelines(uint32_t num_eves, uint32_t num_dsps,
228 Configuration& configuration,
229 uint32_t num_layers_groups,
230 Executor*& e_eve, Executor*& e_dsp,
231 std::vector<ExecutionObjectPipeline*>& eops)
232 {
233 DeviceIds ids_eve, ids_dsp;
234 for (uint32_t i = 0; i < num_eves; i++)
235 ids_eve.insert(static_cast<DeviceId>(i));
236 for (uint32_t i = 0; i < num_dsps; i++)
237 ids_dsp.insert(static_cast<DeviceId>(i));
239 // Construct ExecutionObjectPipeline that utilizes multiple
240 // ExecutionObjects to process a single frame, each ExecutionObject
241 // processes one layerGroup of the network
242 //
243 // Pipeline depth can enable more optimized pipeline execution:
244 // Given one EVE and one DSP as an example, with different
245 // buffer_factor, we have different execution behavior:
246 // If buffer_factor is set to 1,
247 // we create one EOP: eop0 (eve0, dsp0)
248 // pipeline execution of multiple frames over time is as follows:
249 // --------------------- time ------------------->
250 // eop0: [eve0...][dsp0]
251 // eop0: [eve0...][dsp0]
252 // eop0: [eve0...][dsp0]
253 // eop0: [eve0...][dsp0]
254 // If buffer_factor is set to 2,
255 // we create two EOPs: eop0 (eve0, dsp0), eop1(eve0, dsp0)
256 // pipeline execution of multiple frames over time is as follows:
257 // --------------------- time ------------------->
258 // eop0: [eve0...][dsp0]
259 // eop1: [eve0...][dsp0]
260 // eop0: [eve0...][dsp0]
261 // eop1: [eve0...][dsp0]
262 // Additional benefit of setting buffer_factor to 2 is that
263 // it can also overlap host ReadFrame() with device processing:
264 // --------------------- time ------------------->
265 // eop0: [RF][eve0...][dsp0]
266 // eop1: [RF] [eve0...][dsp0]
267 // eop0: [RF][eve0...][dsp0]
268 // eop1: [RF][eve0...][dsp0]
269 const uint32_t buffer_factor = 2;
271 switch(num_layers_groups)
272 {
273 case 1: // Single layers group
274 e_eve = num_eves == 0 ? nullptr :
275 new Executor(DeviceType::EVE, ids_eve, configuration);
276 e_dsp = num_dsps == 0 ? nullptr :
277 new Executor(DeviceType::DSP, ids_dsp, configuration);
279 // Construct ExecutionObjectPipeline with single Execution Object to
280 // process each frame. This is parallel processing of frames with
281 // as many DSP and EVE cores that we have on hand.
282 // If buffer_factor == 2, duplicating EOPs for double buffering
283 // and overlapping host pre/post-processing with device processing
284 for (uint32_t j = 0; j < buffer_factor; j++)
285 {
286 for (uint32_t i = 0; i < num_eves; i++)
287 eops.push_back(new ExecutionObjectPipeline({(*e_eve)[i]}));
288 for (uint32_t i = 0; i < num_dsps; i++)
289 eops.push_back(new ExecutionObjectPipeline({(*e_dsp)[i]}));
290 }
291 break;
293 case 2: // Two layers group
294 // Create Executors with the approriate core type, number of cores
295 // and configuration specified
296 // EVE will run layersGroupId 1 in the network, while
297 // DSP will run layersGroupId 2 in the network
298 e_eve = num_eves == 0 ? nullptr :
299 new Executor(DeviceType::EVE, ids_eve, configuration, 1);
300 e_dsp = num_dsps == 0 ? nullptr :
301 new Executor(DeviceType::DSP, ids_dsp, configuration, 2);
303 // Construct ExecutionObjectPipeline that utilizes multiple
304 // ExecutionObjects to process a single frame, each ExecutionObject
305 // processes one layerGroup of the network
306 // If buffer_factor == 2, duplicating EOPs for pipelining at
307 // EO level rather than at EOP level, in addition to double buffering
308 // and overlapping host pre/post-processing with device processing
309 for (uint32_t j = 0; j < buffer_factor; j++)
310 for (uint32_t i = 0; i < std::max(num_eves, num_dsps); i++)
311 eops.push_back(new ExecutionObjectPipeline(
312 {(*e_eve)[i%num_eves], (*e_dsp)[i%num_dsps]}));
313 break;
315 default:
316 std::cout << "Layers groups must be either 1 or 2!" << std::endl;
317 return false;
318 break;
319 }
321 return true;
322 }
// Subtract a per-channel mean from every pixel of one image plane,
// saturating the result to the signed 8-bit range [-128, 127] before
// storing it back into the same byte buffer.
static void subtractMeanValue(unsigned char *frame_buffer, int channel_size,
                              int32_t mean_value)
{
    for (int idx = 0; idx < channel_size; ++idx)
    {
        const int32_t shifted =
            static_cast<int32_t>(frame_buffer[idx]) - mean_value;
        frame_buffer[idx] = static_cast<unsigned char>(
            std::clamp<int32_t>(shifted, -128, 127));
    }
}
339 bool ReadFrame(ExecutionObjectPipeline& eop, uint32_t frame_idx,
340 const Configuration& c, const cmdline_opts_t& opts,
341 char *input_frames_buffer)
342 {
343 if (frame_idx >= opts.num_frames)
344 return false;
346 eop.SetFrameIndex(frame_idx);
348 unsigned char* frame_buffer = (unsigned char *)eop.GetInputBufferPtr();
349 assert (frame_buffer != nullptr);
350 //Current implementation of this function assumes 3 channels on input
351 assert (c.inNumChannels == 3);
353 int channel_size = c.inWidth * c.inHeight;
354 char *bgr_frames_input = input_frames_buffer + (frame_idx % c.numFrames) *
355 channel_size * c.inNumChannels;
357 memcpy(frame_buffer, bgr_frames_input + 0, channel_size);
358 if(c.preProcType == 1)
359 subtractMeanValue(frame_buffer, channel_size, 104);
360 else if(c.preProcType == 2)
361 subtractMeanValue(frame_buffer, channel_size, 128);
362 frame_buffer += channel_size;
364 memcpy(frame_buffer, bgr_frames_input + 1 * channel_size, channel_size);
365 if(c.preProcType == 1)
366 subtractMeanValue(frame_buffer, channel_size, 117);
367 else if(c.preProcType == 2)
368 subtractMeanValue(frame_buffer, channel_size, 128);
369 frame_buffer += channel_size;
371 memcpy(frame_buffer, bgr_frames_input + 2 * channel_size, channel_size);
372 if(c.preProcType == 1)
373 subtractMeanValue(frame_buffer, channel_size, 123);
374 else if(c.preProcType == 2)
375 subtractMeanValue(frame_buffer, channel_size, 128);
377 return true;
378 }
// Print the command-line usage summary to stdout.
void DisplayHelp()
{
    // Keep the full usage text in one literal so it is emitted atomically.
    static const char* const kUsage =
        "Usage: mcbench\n"
        " Runs partitioned network to perform multi-object detection\n"
        " and classification. First part of network (layersGroupId 1) runs on\n"
        " EVE, second part (layersGroupId 2) runs on DSP.\n"
        " Use -c to run a different segmentation network. Default is jdetnet.\n"
        "Optional arguments:\n"
        " -c <config> Valid configs: ../test/testvecs/config/infer/... \n"
        " -d <number> Number of DSP cores to use\n"
        " -e <number> Number of EVE cores to use\n"
        " -g <1|2> Number of layer groups\n"
        " -f <number> Number of frames to process\n"
        " -i <image> Path to the input image file\n"
        " -v Verbose output during execution\n"
        " -h Help\n";
    std::cout << kUsage;
}