examples/mcbench/main.cpp

   1 /******************************************************************************
   2  * Copyright (c) 2018, Texas Instruments Incorporated - http://www.ti.com/
   3  *   All rights reserved.
   4  *
   5  *   Redistribution and use in source and binary forms, with or without
   6  *   modification, are permitted provided that the following conditions are met:
   7  *       * Redistributions of source code must retain the above copyright
   8  *         notice, this list of conditions and the following disclaimer.
   9  *       * Redistributions in binary form must reproduce the above copyright
  10  *         notice, this list of conditions and the following disclaimer in the
  11  *         documentation and/or other materials provided with the distribution.
  12  *       * Neither the name of Texas Instruments Incorporated nor the
  13  *         names of its contributors may be used to endorse or promote products
  14  *         derived from this software without specific prior written permission.
  15  *
  16  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17  *   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18  *   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19  *   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  20  *   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  21  *   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  22  *   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  23  *   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  24  *   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  25  *   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  26  *   THE POSSIBILITY OF SUCH DAMAGE.
  27  *****************************************************************************/
  28 #include <signal.h>
  29 #include <iostream>
  30 #include <iomanip>
  31 #include <fstream>
  32 #include <cassert>
  33 #include <string>
  34 #include <functional>
  35 #include <algorithm>
  36 #include <time.h>
  37 #include <unistd.h>
  38
  39 #include <queue>
  40 #include <vector>
  41 #include <cstdio>
  42 #include <chrono>
  43
  44 #include "executor.h"
  45 #include "execution_object.h"
  46 #include "execution_object_pipeline.h"
  47 #include "configuration.h"
  48 #include "../common/utils.h"
  49 #include "../common/video_utils.h"
  50
  51 using namespace std;
  52 using namespace tidl;
  53 using namespace cv;
  54
  55
  56 #define NUM_VIDEO_FRAMES  100
  57 #define DEFAULT_CONFIG    "../test/testvecs/config/infer/tidl_config_j11_v2.txt"
  58
  59 bool RunConfiguration(const cmdline_opts_t& opts);
  60 Executor* CreateExecutor(DeviceType dt, uint32_t num, const Configuration& c,
  61                          int layers_group_id);
  62 bool CreateExecutionObjectPipelines(uint32_t num_eves, uint32_t num_dsps,
  63                                     Configuration& configuration,
  64                                     uint32_t num_layers_groups,
  65                                     Executor*& e_eve, Executor*& e_dsp,
  66                                   std::vector<ExecutionObjectPipeline*>& eops);
  67
  68 void AllocateMemory(const std::vector<ExecutionObjectPipeline*>& eops);
  69
  70 bool ReadFrame(ExecutionObjectPipeline& eop, uint32_t frame_idx,
  71                const Configuration& c, const cmdline_opts_t& opts, char *input_frames_buffer);
  72 static void DisplayHelp();
  73
  74
  75 int main(int argc, char *argv[])
  76 {
  77     // Catch ctrl-c to ensure a clean exit
  78     signal(SIGABRT, exit);
  79     signal(SIGTERM, exit);
  80
  81     // If there are no devices capable of offloading TIDL on the SoC, exit
  82     uint32_t num_eves = Executor::GetNumDevices(DeviceType::EVE);
  83     uint32_t num_dsps = Executor::GetNumDevices(DeviceType::DSP);
  84     if ((num_eves == 0) || (num_dsps == 0))
  85     {
  86         cout << "mcbench requires EVE and/or DSP for execution." << endl;
  87         return EXIT_SUCCESS;
  88     }
  89
  90     cout << "CMDLINE: ";
  91     for(int i = 0; i < argc; ++i) cout << argv[i] << " ";
  92     cout << endl;
  93
  94     // Process arguments
  95     cmdline_opts_t opts;
  96     opts.config = DEFAULT_CONFIG;
  97     opts.num_eves = 0;
  98     opts.num_dsps = 2;
  99     if (! ProcessArgs(argc, argv, opts))
 100     {
 101         DisplayHelp();
 102         exit(EXIT_SUCCESS);
 103     }
 104     assert((opts.num_dsps + opts.num_eves) != 0);
 105
 106     if (opts.num_frames == 0)
 107         opts.num_frames = NUM_VIDEO_FRAMES;
 108
 109     // Run network
 110     bool status = RunConfiguration(opts);
 111
 112     if (!status)
 113     {
 114         cout << "mcbench FAILED" << endl;
 115         return EXIT_FAILURE;
 116     }
 117
 118     cout << "mcbench PASSED" << endl;
 119     return EXIT_SUCCESS;
 120 }
 121
 122 bool RunConfiguration(const cmdline_opts_t& opts)
 123 {
 124     // Read the TI DL configuration file
 125     Configuration c;
 126     if (!c.ReadFromFile(opts.config))
 127         return false;
 128
 129     c.enableApiTrace = opts.verbose;
 130     if(opts.num_layers_groups == 1)
 131        c.runFullNet = true; //Force all layers to be in the same group
 132
 133     std::string inputFile;
 134     if (opts.input_file.empty())
 135         inputFile   = c.inData;
 136     else
 137         inputFile = opts.input_file;
 138
 139     int frame_size = c.inNumChannels * c.inWidth * c.inHeight;
 140
 141     c.numFrames = GetBinaryFileSize (inputFile) / frame_size;
 142
 143     cout << "Input: " << inputFile << " frames:" << c.numFrames << endl;
 144
 145     // Read input file into memory buffer
 146     char *input_frame_buffer = new char[c.numFrames * frame_size]();
 147     ifstream ifs(inputFile, ios::binary);
 148     ifs.read(input_frame_buffer, c.numFrames * frame_size);
 149     if(!ifs.good()) {
 150        std::cout << "Invalid File input:" << inputFile << std::endl;
 151        return false;
 152     }
 153
 154     bool status = true;
 155     try
 156     {
 157         Executor *e_eve = NULL;
 158         Executor *e_dsp = NULL;
 159         std::vector<ExecutionObjectPipeline *> eops;
 160         if (! CreateExecutionObjectPipelines(opts.num_eves, opts.num_dsps, c,
 161                                              opts.num_layers_groups,
 162                                              e_eve, e_dsp, eops))
 163             return false;
 164
 165         // Allocate input/output memory for each EOP
 166         AllocateMemory(eops);
 167
 168         chrono::time_point<chrono::steady_clock> tloop0, tloop1;
 169         tloop0 = chrono::steady_clock::now();
 170
 171         // Process frames with available eops in a pipelined manner
 172         // additional num_eops iterations to flush pipeline (epilogue)
 173         uint32_t num_eops = eops.size();
 174         for (uint32_t frame_idx = 0;
 175              frame_idx < opts.num_frames + num_eops; frame_idx++)
 176         {
 177             ExecutionObjectPipeline* eop = eops[frame_idx % num_eops];
 178
 179             // Wait for previous frame on the same eop to finish processing
 180             if (eop->ProcessFrameWait())
 181                 ;
 182
 183             // Read a frame and start processing it with current eo
 184             if (ReadFrame(*eop, frame_idx, c, opts, input_frame_buffer))
 185                 eop->ProcessFrameStartAsync();
 186         }
 187
 188         tloop1 = chrono::steady_clock::now();
 189         chrono::duration<float> elapsed = tloop1 - tloop0;
 190         cout << "Loop total time: "
 191                   << setw(6) << setprecision(4)
 192                   << (elapsed.count() * 1000) << "ms" << endl;
 193         cout << "FPS:" << opts.num_frames / elapsed.count() << endl;
 194
 195         FreeMemory(eops);
 196
 197         for (auto eop : eops)
 198             delete eop;
 199
 200         delete e_eve;
 201         delete e_dsp;
 202     }
 203     catch (tidl::Exception &e)
 204     {
 205         cerr << e.what() << endl;
 206         status = false;
 207     }
 208
 209     delete [] input_frame_buffer;
 210     return status;
 211 }
 212
 213 // Create an Executor with the specified type and number of EOs
 214 Executor* CreateExecutor(DeviceType dt, uint32_t num, const Configuration& c,
 215                          int layers_group_id)
 216 {
 217     if (num == 0) return nullptr;
 218
 219     DeviceIds ids;
 220     for (uint32_t i = 0; i < num; i++)
 221         ids.insert(static_cast<DeviceId>(i));
 222
 223     return new Executor(dt, ids, c, layers_group_id);
 224 }
 225
 226 bool CreateExecutionObjectPipelines(uint32_t num_eves, uint32_t num_dsps,
 227                                     Configuration& configuration,
 228                                     uint32_t num_layers_groups,
 229                                     Executor*& e_eve, Executor*& e_dsp,
 230                                     std::vector<ExecutionObjectPipeline*>& eops)
 231 {
 232     DeviceIds ids_eve, ids_dsp;
 233     for (uint32_t i = 0; i < num_eves; i++)
 234         ids_eve.insert(static_cast<DeviceId>(i));
 235     for (uint32_t i = 0; i < num_dsps; i++)
 236         ids_dsp.insert(static_cast<DeviceId>(i));
 237
 238     // Construct ExecutionObjectPipeline that utilizes multiple
 239     // ExecutionObjects to process a single frame, each ExecutionObject
 240     // processes one layerGroup of the network
 241     //
 242     // Pipeline depth can enable more optimized pipeline execution:
 243     // Given one EVE and one DSP as an example, with different
 244     //     buffer_factor, we have different execution behavior:
 245     // If buffer_factor is set to 1,
 246     //    we create one EOP: eop0 (eve0, dsp0)
 247     //    pipeline execution of multiple frames over time is as follows:
 248     //    --------------------- time ------------------->
 249     //    eop0: [eve0...][dsp0]
 250     //    eop0:                [eve0...][dsp0]
 251     //    eop0:                               [eve0...][dsp0]
 252     //    eop0:                                              [eve0...][dsp0]
 253     // If buffer_factor is set to 2,
 254     //    we create two EOPs: eop0 (eve0, dsp0), eop1(eve0, dsp0)
 255     //    pipeline execution of multiple frames over time is as follows:
 256     //    --------------------- time ------------------->
 257     //    eop0: [eve0...][dsp0]
 258     //    eop1:          [eve0...][dsp0]
 259     //    eop0:                   [eve0...][dsp0]
 260     //    eop1:                            [eve0...][dsp0]
 261     // Additional benefit of setting buffer_factor to 2 is that
 262     //    it can also overlap host ReadFrame() with device processing:
 263     //    --------------------- time ------------------->
 264     //    eop0: [RF][eve0...][dsp0]
 265     //    eop1:     [RF]     [eve0...][dsp0]
 266     //    eop0:                    [RF][eve0...][dsp0]
 267     //    eop1:                             [RF][eve0...][dsp0]
 268     const uint32_t buffer_factor = 2;
 269
 270     switch(num_layers_groups)
 271     {
 272     case 1: // Single layers group
 273         e_eve = num_eves == 0 ? nullptr :
 274                 new Executor(DeviceType::EVE, ids_eve, configuration);
 275         e_dsp = num_dsps == 0 ? nullptr :
 276                 new Executor(DeviceType::DSP, ids_dsp, configuration);
 277
 278         // Construct ExecutionObjectPipeline with single Execution Object to
 279         // process each frame. This is parallel processing of frames with
 280         // as many DSP and EVE cores that we have on hand.
 281         // If buffer_factor == 2, duplicating EOPs for double buffering
 282         // and overlapping host pre/post-processing with device processing
 283         for (uint32_t j = 0; j < buffer_factor; j++)
 284         {
 285             for (uint32_t i = 0; i < num_eves; i++)
 286                 eops.push_back(new ExecutionObjectPipeline({(*e_eve)[i]}));
 287             for (uint32_t i = 0; i < num_dsps; i++)
 288                 eops.push_back(new ExecutionObjectPipeline({(*e_dsp)[i]}));
 289         }
 290         break;
 291
 292     case 2: // Two layers group
 293         // Create Executors with the approriate core type, number of cores
 294         // and configuration specified
 295         // EVE will run layersGroupId 1 in the network, while
 296         // DSP will run layersGroupId 2 in the network
 297         e_eve = num_eves == 0 ? nullptr :
 298                 new Executor(DeviceType::EVE, ids_eve, configuration, 1);
 299         e_dsp = num_dsps == 0 ? nullptr :
 300                 new Executor(DeviceType::DSP, ids_dsp, configuration, 2);
 301
 302         // Construct ExecutionObjectPipeline that utilizes multiple
 303         // ExecutionObjects to process a single frame, each ExecutionObject
 304         // processes one layerGroup of the network
 305         // If buffer_factor == 2, duplicating EOPs for pipelining at
 306         // EO level rather than at EOP level, in addition to double buffering
 307         // and overlapping host pre/post-processing with device processing
 308         for (uint32_t j = 0; j < buffer_factor; j++)
 309             for (uint32_t i = 0; i < std::max(num_eves, num_dsps); i++)
 310                 eops.push_back(new ExecutionObjectPipeline(
 311                                 {(*e_eve)[i%num_eves], (*e_dsp)[i%num_dsps]}));
 312         break;
 313
 314     default:
 315         std::cout << "Layers groups must be either 1 or 2!" << std::endl;
 316         return false;
 317         break;
 318     }
 319
 320     return true;
 321 }
 322
 323 static void subtractMeanValue(unsigned char *frame_buffer, int channel_size,
 324                               int32_t mean_value)
 325 {
 326     int32_t one_pixel;
 327
 328     for (int i = 0; i < channel_size; i ++)
 329     {
 330         one_pixel  = (int32_t)frame_buffer[i];
 331         one_pixel -= mean_value;
 332         if(one_pixel > 127)  one_pixel = 127;
 333         if(one_pixel < -128) one_pixel = -128;
 334         frame_buffer[i] = (unsigned char)one_pixel;
 335     }
 336 }
 337
 338 bool ReadFrame(ExecutionObjectPipeline& eop, uint32_t frame_idx,
 339                const Configuration& c, const cmdline_opts_t& opts,
 340                char *input_frames_buffer)
 341 {
 342     if (frame_idx >= opts.num_frames)
 343         return false;
 344
 345     eop.SetFrameIndex(frame_idx);
 346
 347     unsigned char* frame_buffer = (unsigned char *)eop.GetInputBufferPtr();
 348     assert (frame_buffer != nullptr);
 349     //Current implementation of this function assumes 3 channels on input
 350     assert (c.inNumChannels == 3);
 351
 352     int channel_size = c.inWidth * c.inHeight;
 353     char *bgr_frames_input = input_frames_buffer + (frame_idx % c.numFrames) *
 354                              channel_size * c.inNumChannels;
 355
 356     memcpy(frame_buffer,                bgr_frames_input + 0, channel_size);
 357     if(c.preProcType == 1)
 358        subtractMeanValue(frame_buffer, channel_size, 104);
 359     else if(c.preProcType == 2)
 360        subtractMeanValue(frame_buffer, channel_size, 128);
 361     frame_buffer += channel_size;
 362
 363     memcpy(frame_buffer, bgr_frames_input + 1 * channel_size, channel_size);
 364     if(c.preProcType == 1)
 365        subtractMeanValue(frame_buffer, channel_size, 117);
 366     else if(c.preProcType == 2)
 367        subtractMeanValue(frame_buffer, channel_size, 128);
 368     frame_buffer += channel_size;
 369
 370     memcpy(frame_buffer, bgr_frames_input + 2 * channel_size, channel_size);
 371     if(c.preProcType == 1)
 372        subtractMeanValue(frame_buffer, channel_size, 123);
 373     else if(c.preProcType == 2)
 374        subtractMeanValue(frame_buffer, channel_size, 128);
 375
 376     return true;
 377 }
 378
 379 void DisplayHelp()
 380 {
 381     std::cout <<
 382     "Usage: mcbench\n"
 383     "  Runs partitioned network to perform multi-object detection\n"
 384     "  and classification. First part of network (layersGroupId 1) runs on\n"
 385     "  EVE, second part (layersGroupId 2) runs on DSP.\n"
 386     "  Use -c to run a different segmentation network.  Default is jdetnet.\n"
 387     "Optional arguments:\n"
 388     " -c <config>          Valid configs: ../test/testvecs/config/infer/... \n"
 389     " -d <number>          Number of DSP cores to use\n"
 390     " -e <number>          Number of EVE cores to use\n"
 391     " -g <1|2>             Number of layer groups\n"
 392     " -f <number>          Number of frames to process\n"
 393     " -i <image>           Path to the input image file\n"
 394     " -v                   Verbose output during execution\n"
 395     " -h                   Help\n";
 396 }
 397