Optimize classification perf, report loop avg_fps
author Yuan Zhao <yuanzhao@ti.com>
Sat, 8 Sep 2018 05:06:48 +0000 (00:06 -0500)
committer Yuan Zhao <yuanzhao@ti.com>
Mon, 10 Sep 2018 16:58:37 +0000 (11:58 -0500)
- Double buffer EOPs to overlap host pre/post-processing
  and device processing.  When EOP contains more than one EO,
  pipeline at EO level rather than at EOP level.
- Compute average FPS across a sliding window of frames
  using host loop iteration/frame time.
- MCT-1049

examples/classification/avg_fps_window.h [new file with mode: 0644]
examples/classification/main.cpp

diff --git a/examples/classification/avg_fps_window.h b/examples/classification/avg_fps_window.h
new file mode 100644 (file)
index 0000000..b8b5644
--- /dev/null
@@ -0,0 +1,89 @@
+/******************************************************************************
+ * Copyright (c) 2018, Texas Instruments Incorporated - http://www.ti.com/
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions are met:
+ *       * Redistributions of source code must retain the above copyright
+ *         notice, this list of conditions and the following disclaimer.
+ *       * Redistributions in binary form must reproduce the above copyright
+ *         notice, this list of conditions and the following disclaimer in the
+ *         documentation and/or other materials provided with the distribution.
+ *       * Neither the name of Texas Instruments Incorporated nor the
+ *         names of its contributors may be used to endorse or promote products
+ *         derived from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ *   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ *   THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+#pragma once
+
+#include <vector>
+#include <chrono>
+
+#define MAX_WINDOW_SIZE 64
+#define INIT_FRAME_TIME 0.001
+
+// Sliding-window average FPS: keeps the last window_size_m frame times in a circular buffer plus their running sum
+class AvgFPSWindow
+{
+  public:
+    AvgFPSWindow(uint32_t window_size) :  // window_size: number of frames to average over
+        window_size_m(window_size), circ_idx_m(0), total_time_m(0.0)
+    {
+        if (window_size_m == 0 || window_size_m > MAX_WINDOW_SIZE)  // out-of-range sizes fall back to MAX_WINDOW_SIZE
+            window_size_m = MAX_WINDOW_SIZE;
+        history_times_m.assign(window_size_m, INIT_FRAME_TIME);  // pre-fill every slot with the placeholder frame time
+        frame_time_m = INIT_FRAME_TIME;
+        total_time_m = window_size_m * INIT_FRAME_TIME;  // must equal the sum of the pre-filled slots
+        t0_m = std::chrono::steady_clock::now();  // so the first Tick() measures from construction time
+    }
+
+    // Call once per loop iteration: stores the time elapsed since the previous Tick() (or since construction) in frame_time_m
+    void Tick()
+    {
+        t1_m = std::chrono::steady_clock::now();
+        std::chrono::duration<double> elapsed = t1_m - t0_m;
+        frame_time_m = elapsed.count();  // in seconds
+        t0_m = t1_m;  // the next interval starts where this one ended
+    }
+
+    // Record the current frame_time_m in the circular history, overwriting the oldest stored sample
+    // Until window_size_m samples have been recorded the average still includes INIT_FRAME_TIME placeholders
+    // Returns the updated average FPS (window_size_m / sum of stored frame times); does not call Tick() itself
+    double UpdateAvgFPS()
+    {
+        total_time_m += frame_time_m - history_times_m[circ_idx_m];  // running sum: add the new sample, drop the overwritten one
+        history_times_m[circ_idx_m] = frame_time_m;
+        circ_idx_m = (circ_idx_m + 1) % window_size_m;  // advance the write index, wrapping at the window size
+        return (1.0 * window_size_m) / total_time_m;
+    }
+
+    // Return the current average FPS without recording a new sample
+    double GetAvgFPS()
+    {
+        return (1.0 * window_size_m) / total_time_m;
+    }
+
+    AvgFPSWindow() =delete;  // a window size must always be supplied
+    AvgFPSWindow(const AvgFPSWindow&) =delete;  // copy construction is disabled
+    AvgFPSWindow& operator=(const AvgFPSWindow&) =delete;  // copy assignment is disabled
+
+  private:
+    uint32_t window_size_m;  // number of frames in the window, 1..MAX_WINDOW_SIZE after construction
+    uint32_t circ_idx_m;  // index of the next history_times_m slot to overwrite
+    double total_time_m;  // running sum of history_times_m, in seconds
+    std::vector<double> history_times_m;  // circular buffer of per-frame times, in seconds
+    std::chrono::time_point<std::chrono::steady_clock> t0_m, t1_m;  // previous and latest Tick() timestamps
+    double frame_time_m;  // most recent frame time captured by Tick(), in seconds
+};
index 21c05a74f4ac39d0a81703edc179b580a728433c..749b713f8dc43010322bba9e964e0ac2ca8bbfac 100644 (file)
 #include "execution_object.h"
 #include "execution_object_pipeline.h"
 #include "configuration.h"
+#include "avg_fps_window.h"
 
 #include "opencv2/core.hpp"
 #include "opencv2/imgproc.hpp"
 #include "opencv2/highgui.hpp"
 #include "opencv2/videoio.hpp"
 
+
 //#define TWO_ROIs
 #define LIVE_DISPLAY
 #define PERF_VERBOSE
@@ -105,7 +107,8 @@ void imagenetCallBackFunc(int event, int x, int y, int flags, void* userdata)
 Mat in_image, image, r_image, cnn_image, show_image, bgr_frames[3];
 Mat to_stream;
 Rect rectCrop[NUM_ROI];
-double avg_fps;
+// Report average FPS across a sliding window of 16 frames
+AvgFPSWindow fps_window(16);
 
 static int tf_postprocess(uchar *in, int size, int roi_idx, int frame_idx, int f_id);
 static void tf_preprocess(uchar *out, uchar *in, int size);
@@ -229,7 +232,6 @@ bool RunConfiguration(const std::string& config_file, int num_layers_groups, uin
         for (int k = 0; k < NUM_ROI; k++)
             for(int i = 0; i < 3; i ++)
                 selclass_history[k][i] = -1;
-        avg_fps = 0.0;
         int num_frames = configuration.numFrames;
         std::cout << "About to start ProcessFrame loop!!" << std::endl;
  
@@ -249,6 +251,7 @@ bool RunConfiguration(const std::string& config_file, int num_layers_groups, uin
                  DisplayFrame(eop, writer, frame_idx, num_eops,
                               num_eves, num_dsps);
             }
+            fps_window.Tick();
 
             if (ReadFrame(eop, frame_idx, num_frames, cap, writer))
                 eop->ProcessFrameStartAsync();
@@ -289,6 +292,7 @@ bool CreateExecutionObjectPipelines(uint32_t num_eves, uint32_t num_dsps,
         ids_eve.insert(static_cast<DeviceId>(i));
     for (uint32_t i = 0; i < num_dsps; i++)
         ids_dsp.insert(static_cast<DeviceId>(i));
+    const uint32_t buffer_factor = 2;
 
     switch(num_layers_groups)
     {
@@ -301,10 +305,15 @@ bool CreateExecutionObjectPipelines(uint32_t num_eves, uint32_t num_dsps,
         // Construct ExecutionObjectPipeline with single Execution Object to
         // process each frame. This is parallel processing of frames with
         // as many DSP and EVE cores that we have on hand.
-        for (uint32_t i = 0; i < num_eves; i++)
-            eops.push_back(new ExecutionObjectPipeline({(*e_eve)[i]}));
-        for (uint32_t i = 0; i < num_dsps; i++)
-            eops.push_back(new ExecutionObjectPipeline({(*e_dsp)[i]}));
+        // If buffer_factor == 2, duplicating EOPs for double buffering
+        // and overlapping host pre/post-processing with device processing
+        for (uint32_t j = 0; j < buffer_factor; j++)
+        {
+            for (uint32_t i = 0; i < num_eves; i++)
+                eops.push_back(new ExecutionObjectPipeline({(*e_eve)[i]}));
+            for (uint32_t i = 0; i < num_dsps; i++)
+                eops.push_back(new ExecutionObjectPipeline({(*e_dsp)[i]}));
+        }
         break;
 
     case 2: // Two layers group
@@ -324,9 +333,15 @@ bool CreateExecutionObjectPipelines(uint32_t num_eves, uint32_t num_dsps,
         // Construct ExecutionObjectPipeline that utilizes multiple
         // ExecutionObjects to process a single frame, each ExecutionObject
         // processes one layerGroup of the network
-        for (uint32_t i = 0; i < std::max(num_eves, num_dsps); i++)
-            eops.push_back(new ExecutionObjectPipeline({(*e_eve)[i%num_eves],
-                                                        (*e_dsp)[i%num_dsps]}));
+        // If buffer_factor == 2, duplicating EOPs for pipelining at
+        // EO level rather than at EOP level, in addition to double buffering
+        // and overlapping host pre/post-processing with device processing
+        for (uint32_t j = 0; j < buffer_factor; j++)
+        {
+            for (uint32_t i = 0; i < std::max(num_eves, num_dsps); i++)
+                eops.push_back(new ExecutionObjectPipeline(
+                                {(*e_eve)[i%num_eves], (*e_dsp)[i%num_dsps]}));
+        }
         break;
 
     default:
@@ -585,10 +600,8 @@ void DisplayFrame(const ExecutionObjectPipeline* eop, VideoWriter& writer,
                            selected_items[k] == rpt_id ? cv::Scalar(0,0,255) :
                                                 cv::Scalar(255,255,255), 1, 8);
             }
-            double elapsed_host = eop->GetHostProcessTimeInMilliSeconds();
-            /* Exponential averaging */
-            avg_fps = 0.1 * ((double)num_eops * 1000.0 /
-                             ((double)NUM_ROI * elapsed_host)) + 0.9 * avg_fps;
+
+            double avg_fps = fps_window.UpdateAvgFPS();
             sprintf(tmp_classwindow_string, "FPS:%5.2lf", avg_fps );
 
 #ifdef PERF_VERBOSE