ExecutionObjectPipeline for executing layersGroups

author Yuan Zhao <yuanzhao@ti.com>

Fri, 10 Aug 2018 04:42:42 +0000 (23:42 -0500)

committer Yuan Zhao <yuanzhao@ti.com>

Mon, 20 Aug 2018 15:57:44 +0000 (10:57 -0500)
author Yuan Zhao <yuanzhao@ti.com>
Fri, 10 Aug 2018 04:42:42 +0000 (23:42 -0500)
committer Yuan Zhao <yuanzhao@ti.com>
Mon, 20 Aug 2018 15:57:44 +0000 (10:57 -0500)
diff --git a/examples/ssd_multibox/main.cpp b/examples/ssd_multibox/main.cpp

index 6d39dda1561658b70a54964fc7efdf8a4505a4cb..b302cfa7529128f69580152adcc17169af6db815 100644 (file)
--- a/examples/ssd_multibox/main.cpp
+++ b/examples/ssd_multibox/main.cpp
@@ -43,6 +43,7 @@
  
  #include "executor.h"
  #include "execution_object.h"
+#include "execution_object_pipeline.h"
  #include "configuration.h"
  #include "../segmentation/object_classes.h"
  
@@ -67,13 +68,13 @@ using namespace tidl;
  using namespace cv;
  
  
-bool RunConfiguration(const std::string& config_file, uint32_t num_devices,
+bool RunConfiguration(const std::string& config_file,
+                      uint32_t num_dsps, uint32_t num_eves,
                        DeviceType device_type, std::string& input_file);
-bool ReadFrame(ExecutionObject& eo, int frame_idx,
+bool ReadFrame(ExecutionObjectPipeline& eop, int frame_idx,
                 const Configuration& configuration, int num_frames,
                 std::string& image_file, VideoCapture &cap);
-bool WriteFrameOutput(const ExecutionObject &eo_in,
-                      const ExecutionObject &eo_out,
+bool WriteFrameOutput(const ExecutionObjectPipeline& eop,
                        const Configuration& configuration);
  
  void ReportTime(int frame_index, std::string device_name, double elapsed_host,
@@ -81,7 +82,8 @@ void ReportTime(int frame_index, std::string device_name, double elapsed_host,
  
  static void ProcessArgs(int argc, char *argv[],
                          std::string& config,
-                        uint32_t& num_devices,
+                        uint32_t& num_dsps,
+                        uint32_t& num_eves,
                          DeviceType& device_type,
                          std::string& input_file);
  
@@ -110,18 +112,12 @@ int main(int argc, char *argv[])
      // Process arguments
      std::string config      = DEFAULT_CONFIG;
      std::string input_file  = DEFAULT_INPUT;
-    uint32_t num_devices    = 1;
+    uint32_t num_dsps    = 1;
+    uint32_t num_eves    = 1;
      DeviceType  device_type = DeviceType::EVE;
-    ProcessArgs(argc, argv, config, num_devices, device_type, input_file);
+    ProcessArgs(argc, argv, config, num_dsps, num_eves,
+                device_type, input_file);
  
-    // Use same number of EVEs and DSPs
-    num_devices = std::min(num_devices, std::min(num_eve, num_dsp));
-    if (num_devices == 0)
-    {
-        std::cout << "Partitioned execution requires at least 1 EVE and 1 DSP."
-                  << std::endl;
-        return EXIT_FAILURE;
-    }
      if ((object_class_table = GetObjectClassTable(config)) == nullptr)
      {
          std::cout << "No object classes defined for this config." << std::endl;
@@ -136,8 +132,8 @@ int main(int argc, char *argv[])
      std::cout << "Input: " << input_file << std::endl;
      std::string config_file = "../test/testvecs/config/infer/tidl_config_"
                                + config + ".txt";
-    bool status = RunConfiguration(config_file, num_devices, device_type,
-                                   input_file);
+    bool status = RunConfiguration(config_file, num_dsps, num_eves,
+                                   device_type, input_file);
  
      if (!status)
      {
@@ -149,12 +145,15 @@ int main(int argc, char *argv[])
      return EXIT_SUCCESS;
  }
  
-bool RunConfiguration(const std::string& config_file, uint32_t num_devices,
+bool RunConfiguration(const std::string& config_file,
+                      uint32_t num_dsps, uint32_t num_eves,
                        DeviceType device_type, std::string& input_file)
  {
-    DeviceIds ids;
-    for (int i = 0; i < num_devices; i++)
-        ids.insert(static_cast<DeviceId>(i));
+    DeviceIds ids_eve, ids_dsp;
+    for (int i = 0; i < num_eves; i++)
+        ids_eve.insert(static_cast<DeviceId>(i));
+    for (int i = 0; i < num_dsps; i++)
+        ids_dsp.insert(static_cast<DeviceId>(i));
  
      // Read the TI DL configuration file
      Configuration configuration;
@@ -167,7 +166,7 @@ bool RunConfiguration(const std::string& config_file, uint32_t num_devices,
      }
  
      // setup input
-    int num_frames = is_default_input ? 3 : 1;
+    int num_frames = is_default_input ? 9 : 9;
      VideoCapture cap;
      std::string image_file;
      if (is_camera_input)
@@ -192,82 +191,58 @@ bool RunConfiguration(const std::string& config_file, uint32_t num_devices,
          // and configuration specified
          // EVE will run layersGroupId 1 in the network, while
          // DSP will run layersGroupId 2 in the network
-        Executor executor_eve(DeviceType::EVE, ids, configuration, 1);
-        Executor executor_dsp(DeviceType::DSP, ids, configuration, 2);
-
-        // Query Executor for set of ExecutionObjects created
-        const ExecutionObjects& execution_objects_eve =
-                                            executor_eve.GetExecutionObjects();
-        const ExecutionObjects& execution_objects_dsp =
-                                            executor_dsp.GetExecutionObjects();
-        int num_eos = execution_objects_eve.size();
-
-        // Allocate input and output buffers for each execution object
-        // Note that "out" is both the output of eo_eve and the input of eo_dsp
-        // This is how two layersGroupIds, 1 and 2, are tied together
+        Executor exe_eve(DeviceType::EVE, ids_eve, configuration, 1);
+        Executor exe_dsp(DeviceType::DSP, ids_dsp, configuration, 2);
+
+        // Construct ExecutionObjectPipeline that utilizes multiple
+        // ExecutionObjects to process a single frame, each ExecutionObject
+        // processes one layerGroup of the network
+        int num_eops = std::max(num_eves, num_dsps);
+        std::vector<ExecutionObjectPipeline *> eops;
+        for (int i = 0; i < num_eops; i++)
+            eops.push_back(new ExecutionObjectPipeline({exe_eve[i%num_eves],
+                                                        exe_dsp[i%num_dsps]}));
+
+        // Allocate input/output memory for each EOP
          std::vector<void *> buffers;
-        for (int i = 0; i < num_eos; i++)
+        for (auto eop : eops)
          {
-            ExecutionObject *eo_eve = execution_objects_eve[i].get();
-            size_t in_size  = eo_eve->GetInputBufferSizeInBytes();
-            size_t out_size = eo_eve->GetOutputBufferSizeInBytes();
-            ArgInfo in  = { ArgInfo(malloc(in_size),  in_size)  };
-            ArgInfo out = { ArgInfo(malloc(out_size), out_size) };
-            eo_eve->SetInputOutputBuffer(in, out);
-
-            ExecutionObject *eo_dsp = execution_objects_dsp[i].get();
-            size_t out2_size = eo_dsp->GetOutputBufferSizeInBytes();
-            ArgInfo out2 = { ArgInfo(malloc(out2_size), out2_size) };
-            eo_dsp->SetInputOutputBuffer(out, out2);
-
-            buffers.push_back(in.ptr());
-            buffers.push_back(out.ptr());
-            buffers.push_back(out2.ptr());
+            size_t in_size  = eop->GetInputBufferSizeInBytes();
+            size_t out_size = eop->GetOutputBufferSizeInBytes();
+            void*  in_ptr   = malloc(in_size);
+            void*  out_ptr  = malloc(out_size);
+            assert(in_ptr != nullptr && out_ptr != nullptr);
+            buffers.push_back(in_ptr);
+            buffers.push_back(out_ptr);
+
+            ArgInfo in(in_ptr,   in_size);
+            ArgInfo out(out_ptr, out_size);
+            eop->SetInputOutputBuffer(in, out);
          }
  
-        #define MAX_NUM_EOS  4
-        struct timespec t0[MAX_NUM_EOS], t1, tloop0, tloop1;
+        struct timespec tloop0, tloop1;
          clock_gettime(CLOCK_MONOTONIC, &tloop0);
  
-        // Process frames with available execution objects in a pipelined manner
-        // additional num_eos iterations to flush the pipeline (epilogue)
-        ExecutionObject *eo_eve, *eo_dsp, *eo_input;
-        for (int frame_idx = 0;
-             frame_idx < num_frames + num_eos; frame_idx++)
+        // Process frames with ExecutionObjectPipelines in a pipelined manner
+        // additional num_eops iterations to flush pipeline (epilogue)
+        for (int frame_idx = 0; frame_idx < num_frames + num_eops; frame_idx++)
          {
-            eo_eve = execution_objects_eve[frame_idx % num_eos].get();
-            eo_dsp = execution_objects_dsp[frame_idx % num_eos].get();
+            ExecutionObjectPipeline* eop = eops[frame_idx % num_eops];
  
-            // Wait for previous frame on the same eo to finish processing
-            if (eo_dsp->ProcessFrameWait())
+            // Wait for previous frame on the same eop to finish processing
+            if (eop->ProcessFrameWait())
              {
-                int finished_idx = eo_dsp->GetFrameIndex();
-                clock_gettime(CLOCK_MONOTONIC, &t1);
-                ReportTime(finished_idx, "DSP",
-                           ms_diff(t0[finished_idx % num_eos], t1),
-                           eo_dsp->GetProcessTimeInMilliSeconds());
-
-                eo_input = execution_objects_eve[finished_idx % num_eos].get();
-                WriteFrameOutput(*eo_input, *eo_dsp, configuration);
+                ReportTime(eop->GetFrameIndex(), eop->GetDeviceName(),
+                           eop->GetHostProcessTimeInMilliSeconds(),
+                           eop->GetProcessTimeInMilliSeconds());
+                WriteFrameOutput(*eop, configuration);
              }
  
              // Read a frame and start processing it with current eop
-            if (ReadFrame(*eo_eve, frame_idx, configuration, num_frames,
+            if (ReadFrame(*eop, frame_idx, configuration, num_frames,
                            image_file, cap))
              {
-                clock_gettime(CLOCK_MONOTONIC, &t0[frame_idx % num_eos]);
-                eo_eve->ProcessFrameStartAsync();
-
-                if (eo_eve->ProcessFrameWait())
-                {
-                    clock_gettime(CLOCK_MONOTONIC, &t1);
-                    ReportTime(frame_idx, "EVE",
-                               ms_diff(t0[frame_idx % num_eos], t1),
-                               eo_eve->GetProcessTimeInMilliSeconds());
-
-                    clock_gettime(CLOCK_MONOTONIC, &t0[frame_idx % num_eos]);
-                    eo_dsp->ProcessFrameStartAsync();
-                }
+                eop->ProcessFrameStartAsync();
              }
          }
  
@@ -276,6 +251,8 @@ bool RunConfiguration(const std::string& config_file, uint32_t num_devices,
                    << std::setw(6) << std::setprecision(4)
                    << ms_diff(tloop0, tloop1) << "ms" << std::endl;
  
+        for (auto eop : eops)
+            delete eop;
          for (auto b : buffers)
              free(b);
      }
@@ -305,15 +282,15 @@ void ReportTime(int frame_index, std::string device_name, double elapsed_host,
  }
  
  
-bool ReadFrame(ExecutionObject &eo, int frame_idx,
+bool ReadFrame(ExecutionObjectPipeline& eop, int frame_idx,
                 const Configuration& configuration, int num_frames,
                 std::string& image_file, VideoCapture &cap)
  {
      if (frame_idx >= num_frames)
          return false;
-    eo.SetFrameIndex(frame_idx);
+    eop.SetFrameIndex(frame_idx);
  
-    char*  frame_buffer = eo.GetInputBufferPtr();
+    char*  frame_buffer = eop.GetInputBufferPtr();
      assert (frame_buffer != nullptr);
      int channel_size = configuration.inWidth * configuration.inHeight;
  
@@ -323,7 +300,7 @@ bool ReadFrame(ExecutionObject &eo, int frame_idx,
          if (is_preprocessed_input)
          {
              std::ifstream ifs(image_file, std::ios::binary);
-            ifs.seekg(frame_idx * channel_size * 3);
+            //ifs.seekg(frame_idx * channel_size * 3);
              ifs.read(frame_buffer, channel_size * 3);
              bool ifs_status = ifs.good();
              ifs.close();
@@ -368,8 +345,7 @@ bool ReadFrame(ExecutionObject &eo, int frame_idx,
  }
  
  // Create frame with boxes drawn around classified objects
-bool WriteFrameOutput(const ExecutionObject &eo_in,
-                      const ExecutionObject &eo_out,
+bool WriteFrameOutput(const ExecutionObjectPipeline& eop,
                        const Configuration& configuration)
  {
      // Assemble original frame
@@ -378,13 +354,13 @@ bool WriteFrameOutput(const ExecutionObject &eo_in,
      int channel_size = width * height;
      Mat frame, r_frame, bgr[3];
  
-    unsigned char *in = (unsigned char *) eo_in.GetInputBufferPtr();
+    unsigned char *in = (unsigned char *) eop.GetInputBufferPtr();
      bgr[0] = Mat(height, width, CV_8UC(1), in);
      bgr[1] = Mat(height, width, CV_8UC(1), in + channel_size);
      bgr[2] = Mat(height, width, CV_8UC(1), in + channel_size*2);
      cv::merge(bgr, 3, frame);
  
-    int frame_index = eo_in.GetFrameIndex();
+    int frame_index = eop.GetFrameIndex();
      char outfile_name[64];
      if (! is_camera_input && is_preprocessed_input)
      {
@@ -394,8 +370,8 @@ bool WriteFrameOutput(const ExecutionObject &eo_in,
      }
  
      // Draw boxes around classified objects
-    float *out = (float *) eo_out.GetOutputBufferPtr();
-    int num_floats = eo_out.GetOutputBufferSizeInBytes() / sizeof(float);
+    float *out = (float *) eop.GetOutputBufferPtr();
+    int num_floats = eop.GetOutputBufferSizeInBytes() / sizeof(float);
      for (int i = 0; i < num_floats / 7; i++)
      {
          int index = (int)    out[i * 7 + 0];
@@ -443,13 +419,14 @@ bool WriteFrameOutput(const ExecutionObject &eo_in,
  
  
  void ProcessArgs(int argc, char *argv[], std::string& config,
-                 uint32_t& num_devices, DeviceType& device_type,
-                 std::string& input_file)
+                 uint32_t& num_dsps, uint32_t& num_eves,
+                 DeviceType& device_type, std::string& input_file)
  {
      const struct option long_options[] =
      {
          {"config",      required_argument, 0, 'c'},
-        {"num_devices", required_argument, 0, 'n'},
+        {"num_dsps",    required_argument, 0, 'd'},
+        {"num_eves",    required_argument, 0, 'e'},
          {"image_file",  required_argument, 0, 'i'},
          {"help",        no_argument,       0, 'h'},
          {"verbose",     no_argument,       0, 'v'},
@@ -460,7 +437,8 @@ void ProcessArgs(int argc, char *argv[], std::string& config,
  
      while (true)
      {
-        int c = getopt_long(argc, argv, "c:n:i:hv", long_options, &option_index);
+        int c = getopt_long(argc, argv, "c:d:e:i:hv", long_options,
+                            &option_index);
  
          if (c == -1)
              break;
@@ -470,8 +448,14 @@ void ProcessArgs(int argc, char *argv[], std::string& config,
              case 'c': config = optarg;
                        break;
  
-            case 'n': num_devices = atoi(optarg);
-                      assert (num_devices > 0 && num_devices <= 4);
+            case 'd': num_dsps = atoi(optarg);
+                      assert (num_dsps > 0 && num_dsps <= 
+                                     Executor::GetNumDevices(DeviceType::DSP));
+                      break;
+
+            case 'e': num_eves = atoi(optarg);
+                      assert (num_eves > 0 && num_eves <=
+                                     Executor::GetNumDevices(DeviceType::EVE));
                        break;
  
              case 'i': input_file = optarg;
@@ -507,7 +491,8 @@ void DisplayHelp()
                   "Default is jdetnet.\n"
                   "Optional arguments:\n"
                   " -c <config>          Valid configs: jdetnet \n"
-                 " -n <number of cores> Number of cores to use (1 - 4)\n"
+                 " -d <number>          Number of dsp cores to use\n"
+                 " -e <number>          Number of eve cores to use\n"
                   " -i <image>           Path to the image file\n"
                   "                      Default is 1 frame in testvecs\n"
                   " -i camera            Use camera as input\n"
diff --git a/tidl_api/Makefile b/tidl_api/Makefile

index 05a3704af885d2f055b38b0f5f74c92748a485eb..3fc6a2c1922f86dc09aa5e64d83cf7e0c5873f21 100644 (file)
--- a/tidl_api/Makefile
+++ b/tidl_api/Makefile
@@ -39,7 +39,8 @@ AR = ar
  
  
  SRCS = ocl_device.cpp configuration_parser.cpp configuration.cpp\
-          executor.cpp execution_object.cpp trace.cpp util.cpp
+          executor.cpp execution_object.cpp trace.cpp util.cpp \
+           execution_object_pipeline.cpp
  SRCS_IMGUTIL = imgutil.cpp
  
  OBJS = $(SRCS:.cpp=.o)
@@ -53,8 +54,7 @@ HOST_OBJ_IMGUTIL_FILES = $(addprefix obj/,$(OBJS_IMGUTIL))
  HEADERS  = src/common_defines.h src/executor_impl.h src/ocl_device.h
  HEADERS += src/parameters.h src/tidl_create_params.h src/trace.h src/util.h
  HEADERS += inc/configuration.h inc/execution_object.h inc/executor.h
-HEADERS += inc/imgutil.h src/device_arginfo.h
-
+HEADERS += inc/imgutil.h src/device_arginfo.h inc/execution_object_pipeline.h
  
  ifeq ($(BUILD), debug)
         CXXFLAGS += -Og -g -ggdb
diff --git a/tidl_api/inc/execution_object.h b/tidl_api/inc/execution_object.h

index e78ad2e98f0ec8ffd4305536bca46c368dbb4f3b..c1d86fc126bb8e243a67df04d30bfb5c3aca63d8 100644 (file)
--- a/tidl_api/inc/execution_object.h
+++ b/tidl_api/inc/execution_object.h
@@ -31,6 +31,7 @@
  #pragma once
  
  #include <memory>
+#include "execution_object_internal.h"
  
  namespace tidl {
  
@@ -39,13 +40,12 @@ class Device;
  class LayerOutput;
  class IODeviceArgInfo;
  
-typedef std::vector<std::unique_ptr<const LayerOutput>> LayerOutputs;
  
  /*! @class ExecutionObject
      @brief Runs the TIDL network on an OpenCL device
  */
  
-class ExecutionObject
+class ExecutionObject : public ExecutionObjectInternalInterface
  {
      public:
  
@@ -55,6 +55,8 @@ class ExecutionObject
                          const  ArgInfo& create_arg,
                          const  ArgInfo& param_heap_arg,
                          size_t extmem_heap_size,
+                        int    layersGroupId,
+                        bool   output_trace,
                          bool   internal_input);
          //! @private
          ~ExecutionObject();
@@ -62,52 +64,56 @@ class ExecutionObject
          //! Specify the input and output buffers used by the EO
          //! @param in buffer used for input.
          //! @param out buffer used for output.
-        void SetInputOutputBuffer (const ArgInfo& in, const ArgInfo& out);
+        void SetInputOutputBuffer(const ArgInfo& in,
+                                  const ArgInfo& out) override;
  
          //! Returns a pointer to the input buffer set via SetInputOutputBuffer
-        char* GetInputBufferPtr() const;
+        char* GetInputBufferPtr() const override;
  
          //! Returns size of the input buffer
-        size_t GetInputBufferSizeInBytes() const;
+        size_t GetInputBufferSizeInBytes() const override;
+
+        //! Returns a pointer to the output buffer
+        char* GetOutputBufferPtr() const override;
+
+        //! Returns size of the output buffer
+        size_t GetOutputBufferSizeInBytes() const override;
  
          //! @brief Set the frame index of the frame currently processed by the
          //! ExecutionObject. Used for trace/debug messages
          //! @param idx index of the frame
-        void  SetFrameIndex(int idx);
+        void  SetFrameIndex(int idx) override;
  
          //! Returns the index of a frame being processed (set by SetFrameIndex)
-        int   GetFrameIndex() const;
-
-        //! Returns a pointer to the output buffer
-        char* GetOutputBufferPtr() const;
-
-        //! Returns the number of bytes written to the output buffer
-        size_t GetOutputBufferSizeInBytes() const;
+        int   GetFrameIndex() const override;
  
-        //! @brief Start processing a frame. The call is asynchronous and returns
-        //! immediately. Use ExecutionObject::ProcessFrameWait to wait
-        bool ProcessFrameStartAsync();
+        //! @brief Start processing a frame. The call is asynchronous and
+        //! returns immediately. Use ExecutionObject::ProcessFrameWait to wait
+        bool ProcessFrameStartAsync() override;
  
          //! Wait for the execution object to complete processing a frame
          //! @return false if ExecutionObject::ProcessFrameWait was called
          //! without a corresponding call to
          //! ExecutionObject::ProcessFrameStartAsync.
-        bool ProcessFrameWait();
-
-        //! @brief return the number of cycles taken *on the device* to
-        //! execute the process call
-        //! @return Number of cycles to process a frame on the device.
-        uint64_t GetProcessCycles() const;
+        bool ProcessFrameWait() override;
  
          //! @brief return the number of milliseconds taken *on the device* to
          //! execute the process call
          //! @return Number of milliseconds to process a frame on the device.
-        float    GetProcessTimeInMilliSeconds() const;
+        float GetProcessTimeInMilliSeconds() const override;
+
+        //! @brief return the number of milliseconds taken *on the host* to
+        //! execute the process call
+        //! @return Number of milliseconds to process a frame on the host.
+        float GetHostProcessTimeInMilliSeconds() const override;
+
+        //! Returns the device name that the ExecutionObject runs on
+        const std::string& GetDeviceName() const override;
  
          //! Write the output buffer for each layer to a file
-        //! <filename_prefix>_<ID>_HxW.bin
+        //! \<filename_prefix>_<ID>_HxW.bin
          void WriteLayerOutputsToFile(const std::string& filename_prefix=
-                                     "trace_dump_") const;
+                                     "trace_dump_") const override;
  
          //! Returns a LayerOutput object corresponding to a layer.
          //! Caller is responsible for deleting the LayerOutput object.
@@ -116,10 +122,13 @@ class ExecutionObject
          //! @param output_index The output index of the buffer for a given
          //!                     layer. Defaults to 0.
          const LayerOutput* GetOutputFromLayer(uint32_t layer_index,
-                                              uint32_t output_index=0) const;
+                                       uint32_t output_index=0) const override;
  
          //! Get output buffers from all layers
-        const LayerOutputs* GetOutputsFromAllLayers() const;
+        const LayerOutputs* GetOutputsFromAllLayers() const override;
+
+        //! Returns the layersGroupId that the ExecutionObject is processing
+        int   GetLayersGroupId() const;
  
          //! @private
          // Used by the Executor
@@ -127,12 +136,16 @@ class ExecutionObject
          bool RunAsync(CallType ct);
          bool Wait    (CallType ct);
  
+        //! @private
+        // Used by the ExecutionObjectPipeline
+        bool AddCallback(CallType ct, void *user_data);
+        void AcquireLock();
+        void ReleaseLock();
+
          ExecutionObject()                                  = delete;
          ExecutionObject(const ExecutionObject&)            = delete;
          ExecutionObject& operator=(const ExecutionObject&) = delete;
  
-        void EnableOutputBufferTrace();
-
          //! @private
          void SetInputOutputBuffer(const IODeviceArgInfo* in,
                                    const IODeviceArgInfo* out);
diff --git a/tidl_api/inc/execution_object_internal.h b/tidl_api/inc/execution_object_internal.h

new file mode 100644 (file)

index 0000000..816da94
--- /dev/null
+++ b/tidl_api/inc/execution_object_internal.h
@@ -0,0 +1,119 @@
+/******************************************************************************
+ * Copyright (c) 2018 Texas Instruments Incorporated - http://www.ti.com/
+ *  All rights reserved.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *      * Redistributions of source code must retain the above copyright
+ *        notice, this list of conditions and the following disclaimer.
+ *      * Redistributions in binary form must reproduce the above copyright
+ *        notice, this list of conditions and the following disclaimer in the
+ *        documentation and/or other materials provided with the distribution.
+ *      * Neither the name of Texas Instruments Incorporated nor the
+ *        names of its contributors may be used to endorse or promote products
+ *        derived from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ *  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ *  THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+/*! @file execution_object_internal.h */
+
+#pragma once
+
+namespace tidl {
+
+class LayerOutput;
+
+typedef std::vector<std::unique_ptr<const LayerOutput>> LayerOutputs;
+
+/*! @cond HIDDEN_SYMBOLS
+    @class ExecutionObjectInternalInterface
+    @brief Internal interface for running the TIDL network on OpenCL devices
+           Do not use this internal class directly.
+           Please use ExecutionObject or ExecutionObjectPipeline instead.
+*/
+class ExecutionObjectInternalInterface
+{
+    public:
+        virtual ~ExecutionObjectInternalInterface() {};
+
+        //! Specify the input and output buffers used by the EO
+        //! @param in buffer used for input.
+        //! @param out buffer used for output.
+        virtual void SetInputOutputBuffer(const ArgInfo& in,
+                                          const ArgInfo& out) =0;
+
+        //! Returns a pointer to the input buffer set via SetInputOutputBuffer
+        virtual char* GetInputBufferPtr() const =0;
+
+        //! Returns size of the input buffer
+        virtual size_t GetInputBufferSizeInBytes() const =0;
+
+        //! Returns a pointer to the output buffer
+        virtual char* GetOutputBufferPtr() const =0;
+
+        //! Returns size of the output buffer
+        virtual size_t GetOutputBufferSizeInBytes() const =0;
+
+        //! @brief Set the frame index of the frame currently processed by the
+        //! ExecutionObject. Used for trace/debug messages
+        //! @param idx index of the frame
+        virtual void  SetFrameIndex(int idx) =0;
+
+        //! Returns the index of a frame being processed (set by SetFrameIndex)
+        virtual int   GetFrameIndex() const =0;
+
+        //! @brief Start processing a frame. The call is asynchronous and returns
+        //! immediately. Use ExecutionObject::ProcessFrameWait to wait
+        virtual bool ProcessFrameStartAsync() =0;
+
+        //! Wait for the execution object to complete processing a frame
+        //! @return false if ExecutionObject::ProcessFrameWait was called
+        //! without a corresponding call to
+        //! ExecutionObject::ProcessFrameStartAsync.
+        virtual bool ProcessFrameWait() =0;
+
+        //! @brief return the number of milliseconds taken *on the device* to
+        //! execute the process call
+        //! @return Number of milliseconds to process a frame on the device.
+        virtual float GetProcessTimeInMilliSeconds() const =0;
+
+        //! @brief return the number of milliseconds taken *on the host* to
+        //! execute the process call
+        //! @return Number of milliseconds to process a frame on the host.
+        virtual float GetHostProcessTimeInMilliSeconds() const =0;
+
+        //! Returns the device name that the ExecutionObject runs on
+        virtual const std::string& GetDeviceName() const =0;
+
+        //! Write the output buffer for each layer to a file
+        //! \<filename_prefix>_<ID>_HxW.bin
+        virtual void WriteLayerOutputsToFile(const std::string& filename_prefix=
+                                             "trace_dump_") const =0;
+
+        //! Returns a LayerOutput object corresponding to a layer.
+        //! Caller is responsible for deleting the LayerOutput object.
+        //! @see LayerOutput
+        //! @param layer_index The layer index of the layer
+        //! @param output_index The output index of the buffer for a given
+        //!                     layer. Defaults to 0.
+        virtual const LayerOutput* GetOutputFromLayer(uint32_t layer_index,
+                                             uint32_t output_index=0) const =0;
+
+        //! Get output buffers from all layers
+        virtual const LayerOutputs* GetOutputsFromAllLayers() const =0;
+};
+/*!  @endcond
+*/
+
+} // namespace tidl
diff --git a/tidl_api/inc/execution_object_pipeline.h b/tidl_api/inc/execution_object_pipeline.h

new file mode 100644 (file)

index 0000000..aaa6cf0
--- /dev/null
+++ b/tidl_api/inc/execution_object_pipeline.h
@@ -0,0 +1,151 @@
+/******************************************************************************
+ * Copyright (c) 2018 Texas Instruments Incorporated - http://www.ti.com/
+ *  All rights reserved.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *      * Redistributions of source code must retain the above copyright
+ *        notice, this list of conditions and the following disclaimer.
+ *      * Redistributions in binary form must reproduce the above copyright
+ *        notice, this list of conditions and the following disclaimer in the
+ *        documentation and/or other materials provided with the distribution.
+ *      * Neither the name of Texas Instruments Incorporated nor the
+ *        names of its contributors may be used to endorse or promote products
+ *        derived from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ *  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ *  THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+//! @file execution_object_pipeline.h
+
+#pragma once
+#include <string>
+#include <vector>
+#include <cstdint>
+#include <cassert>
+
+#include "executor.h"
+#include "execution_object_internal.h"
+#include "execution_object.h"
+
+namespace tidl {
+
+/*! @class ExecutionObjectPipeline
+    @brief Manages the pipelined execution using multiple ExecutionObjects.
+    Each executor runs one layersGroup of the network.  ExecutionObjects
+    must run consecutive layersGroups to form a pipelined execution.
+*/
+class ExecutionObjectPipeline : public ExecutionObjectInternalInterface
+{
+    public:
+        //! @brief Create an ExecutionObjectPipeline object.
+        //!
+        //! The ExecutionObjectPipeline will take the provided ExecutionObjects
+        //! to create an execution pipeline.  E.g.
+        //! @code
+        //!   Configuration config("path to configuration file");
+        //!   DeviceIds ids = {DeviceId::ID0, DeviceId::ID1};
+        //!   Executor exe_eve(DeviceType::EVE, ids, config, 1);
+        //!   Executor exe_dsp(DeviceType::DSP, ids, config, 2);
+        //!   ExecutionObjectPipeline ep0({exe_eve[0], exe_dsp[0]});
+        //!   ExecutionObjectPipeline ep1({exe_eve[1], exe_dsp[1]});
+        //! @endcode
+        //!
+        //! The pipeline keeps the ExecutionObject pointers but never deletes
+        //! them, so the ExecutionObjects must outlive the pipeline.  An
+        //! Exception is thrown when the layersGroup ids of the given
+        //! ExecutionObjects are not consecutive.
+        //!
+        //! @param eos DSP or EVE ExecutionObjects forming a pipeline
+        ExecutionObjectPipeline(std::vector<ExecutionObject*> eos);
+
+        //! @brief Tear down an ExecutionObjectPipeline and free used resources
+        ~ExecutionObjectPipeline();
+
+        //! Specify the input and output buffers used by the EOP
+        //! @param in buffer used for input; must be non-null and at least
+        //!           GetInputBufferSizeInBytes() bytes (checked by assert).
+        //! @param out buffer used for output; must be non-null and at least
+        //!            GetOutputBufferSizeInBytes() bytes (checked by assert).
+        void SetInputOutputBuffer (const ArgInfo& in,
+                                   const ArgInfo& out) override;
+
+        //! Returns a pointer to the input buffer
+        char* GetInputBufferPtr() const override;
+
+        //! Returns size of the input buffer (that of the first
+        //! ExecutionObject in the pipeline)
+        size_t GetInputBufferSizeInBytes() const override;
+
+        //! Returns a pointer to the output buffer
+        char* GetOutputBufferPtr() const override;
+
+        //! Returns size of the output buffer (that of the last
+        //! ExecutionObject in the pipeline)
+        size_t GetOutputBufferSizeInBytes() const override;
+
+        //! @brief Set the frame index of the frame currently processed by the
+        //! ExecutionObjectPipeline. Used for trace/debug messages
+        //! @param idx index of the frame
+        void SetFrameIndex(int idx) override;
+
+        //! Returns the index of a frame being processed (set by SetFrameIndex)
+        int  GetFrameIndex() const override;
+
+        //! @brief Start processing a frame. The call is asynchronous and
+        //! returns immediately. Use ProcessFrameWait() to wait
+        //! @return false if the first ExecutionObject could not be started
+        //! or the completion callback could not be registered.
+        bool ProcessFrameStartAsync() override;
+
+        //! Wait for the executor pipeline to complete processing a frame
+        //! @return false if ProcessFrameWait() was called
+        //! without a corresponding call to
+        //! ExecutionObjectPipeline::ProcessFrameStartAsync().
+        bool ProcessFrameWait() override;
+
+        //! @brief return the number of milliseconds taken *on the device* to
+        //! execute the process call, summed over all ExecutionObjects in
+        //! the pipeline
+        //! @return Number of milliseconds to process a frame on the device.
+        float GetProcessTimeInMilliSeconds() const override;
+
+        //! @brief return the number of milliseconds taken *on the host* to
+        //! execute the process call, measured from the start of the first
+        //! ExecutionObject to the completion of the last one
+        //! @return Number of milliseconds to process a frame on the host.
+        float GetHostProcessTimeInMilliSeconds() const override;
+
+        //! Return the combined device names that this pipeline runs on
+        //! (the names of the ExecutionObjects joined by '+')
+        const std::string& GetDeviceName() const override;
+
+        //! Write the output buffer for each layer to a file
+        //! \<filename_prefix>_<ID>_HxW.bin
+        void WriteLayerOutputsToFile(const std::string& filename_prefix=
+                                     "trace_dump_") const override;
+
+        //! Returns a LayerOutput object corresponding to a layer.
+        //! Caller is responsible for deleting the LayerOutput object.
+        //! @see LayerOutput
+        //! @param layer_index The layer index of the layer
+        //! @param output_index The output index of the buffer for a given
+        //!                     layer. Defaults to 0.
+        const LayerOutput* GetOutputFromLayer(uint32_t layer_index,
+                                       uint32_t output_index=0) const override;
+
+        //! Get output buffers from all layers of all ExecutionObjects.
+        //! Caller is responsible for deleting the returned LayerOutputs.
+        const LayerOutputs* GetOutputsFromAllLayers() const override;
+
+        //! @private Used by runtime
+        //! @brief callback function at the completion of each ExecutionObject,
+        //! to chain the next ExecutionObject for execution
+        void RunAsyncNext();
+
+        ExecutionObjectPipeline()                                     = delete;
+        ExecutionObjectPipeline(const ExecutionObjectPipeline&)       = delete;
+        ExecutionObjectPipeline& operator=(const ExecutionObjectPipeline&)
+                                                                      = delete;
+
+    private:
+        // Pointer-to-implementation; Impl is defined in the .cpp file
+        class Impl;
+        std::unique_ptr<Impl> pimpl_m;
+};
+
+} // namespace tidl
diff --git a/tidl_api/inc/executor.h b/tidl_api/inc/executor.h

index 23d92ffef1b2ff815ca3ae23a267e24472dd8b02..1febfeadc2c687a6ce047c16a5f53fef69e00d75 100644 (file)
--- a/tidl_api/inc/executor.h
+++ b/tidl_api/inc/executor.h
@@ -64,7 +64,7 @@ class ExecutionObject;
  typedef std::vector<std::unique_ptr<ExecutionObject>> ExecutionObjects;
  
  /*! @class Executor
-    @brief Manages the overall execution of a network using the
+    @brief Manages the overall execution of a layersGroup in a network using the
      specified configuration and the set of devices available to the
      executor.
  */
@@ -78,7 +78,7 @@ class Executor
          //! @code
          //!   Configuration configuration;
          //!   configuration.ReadFromFile("path to configuration file");
-        //!   DeviceIds ids1 = {DeviceId::ID2, DeviceId::ID3};
+        //!   DeviceIds ids = {DeviceId::ID2, DeviceId::ID3};
          //!   Executor executor(DeviceType::EVE, ids, configuration);
          //! @endcode
          //!
@@ -98,6 +98,9 @@ class Executor
          //! available on this instance of the Executor
          const ExecutionObjects& GetExecutionObjects() const;
  
+        //! Returns a single execution object at index
+        ExecutionObject* operator[](uint32_t index) const;
+
          //! @brief Returns the number of devices of the specified type
          //! available for TI DL.
          //! @param  device_type DSP or EVE/EVE device
@@ -106,7 +109,7 @@ class Executor
  
          //! @brief Returns a string corresponding to the API version
          //!
-        //! @return <major_ver>.<minor_ver>.<patch_ver>.<git_sha>
+        //! @return \<major_ver>.\<minor_ver>.\<patch_ver>.\<git_sha>
          static std::string GetAPIVersion();
  
          Executor(const Executor&) = delete;
diff --git a/tidl_api/src/execution_object.cpp b/tidl_api/src/execution_object.cpp

index d722ebb196669019fbc3071338397d62a9a79ba9..178bbcaeb9f256c47c38df355171622c8413638f 100644 (file)
--- a/tidl_api/src/execution_object.cpp
+++ b/tidl_api/src/execution_object.cpp
@@ -31,6 +31,9 @@
  #include <string.h>
  #include <fstream>
  #include <climits>
+#include <mutex>
+#include <condition_variable>
+#include <chrono>
  #include "executor.h"
  #include "execution_object.h"
  #include "trace.h"
@@ -50,13 +53,24 @@ class ExecutionObject::Impl
               const DeviceArgInfo& create_arg,
               const DeviceArgInfo& param_heap_arg,
               size_t extmem_heap_size,
+             int    layers_group_id,
+             bool   output_trace,
               bool   internal_input);
          ~Impl() {}
  
          bool RunAsync(CallType ct);
          bool Wait    (CallType ct);
+        bool AddCallback(CallType ct, void *user_data);
+
+        uint64_t GetProcessCycles() const;
+        int  GetLayersGroupId() const;
+        void AcquireLock();
+        void ReleaseLock();
  
          Device*                         device_m;
+        // Index of the OpenCL device/queue used by this EO
+        uint8_t                         device_index_m;
+        std::string                     device_name_m;
  
          up_malloc_ddr<char>             tidl_extmem_heap_m;
          up_malloc_ddr<OCL_TIDL_InitializeParams> shared_initialize_params_m;
@@ -70,6 +84,9 @@ class ExecutionObject::Impl
          // Frame being processed by the EO
          int                             current_frame_idx_m;
  
+        // LayersGroupId being processed by the EO
+        int                             layers_group_id_m;
+
          // Trace related
          void WriteLayerOutputsToFile (const std::string& filename_prefix) const;
  
@@ -81,25 +98,29 @@ class ExecutionObject::Impl
          up_malloc_ddr<OCL_TIDL_BufParams> trace_buf_params_m;
          size_t                            trace_buf_params_sz_m;
  
+        // host time tracking: eo start to finish
+        float host_time_m;
+
      private:
          void SetupInitializeKernel(const DeviceArgInfo& create_arg,
                                     const DeviceArgInfo& param_heap_arg,
                                     size_t extmem_heap_size,
                                     bool   internal_input);
+        void EnableOutputBufferTrace();
          void SetupProcessKernel();
  
          void HostWriteNetInput();
          void HostReadNetOutput();
          void ComputeInputOutputSizes();
  
-        // Index of the OpenCL device/queue used by this EO
-        uint8_t                         device_index_m;
-
          std::unique_ptr<Kernel>         k_initialize_m;
          std::unique_ptr<Kernel>         k_process_m;
          std::unique_ptr<Kernel>         k_cleanup_m;
  
-
+        // Guarding sole access to input/output for one frame during execution
+        bool                            is_idle_m;
+        std::mutex                      mutex_access_m;
+        std::condition_variable         cv_access_m;
  };
  
  
@@ -108,6 +129,8 @@ ExecutionObject::ExecutionObject(Device* d,
                                   const ArgInfo& create_arg,
                                   const ArgInfo& param_heap_arg,
                                   size_t extmem_heap_size,
+                                 int    layers_group_id,
+                                 bool   output_trace,
                                   bool   internal_input)
  {
      DeviceArgInfo create_arg_d(create_arg, DeviceArgInfo::Kind::BUFFER);
@@ -118,6 +141,8 @@ ExecutionObject::ExecutionObject(Device* d,
                                            create_arg_d,
                                            param_heap_arg_d,
                                            extmem_heap_size,
+                                          layers_group_id,
+                                          output_trace,
                                            internal_input) };
  }
  
@@ -127,8 +152,11 @@ ExecutionObject::Impl::Impl(Device* d,
                                   const DeviceArgInfo& create_arg,
                                   const DeviceArgInfo& param_heap_arg,
                                   size_t extmem_heap_size,
+                                 int    layers_group_id,
+                                 bool   output_trace,
                                   bool   internal_input):
      device_m(d),
+    device_index_m(device_index),
      tidl_extmem_heap_m (nullptr, &__free_ddr),
      shared_initialize_params_m(nullptr, &__free_ddr),
      shared_process_params_m(nullptr, &__free_ddr),
@@ -137,23 +165,26 @@ ExecutionObject::Impl::Impl(Device* d,
      in_m(),
      out_m(),
      current_frame_idx_m(0),
+    layers_group_id_m(layers_group_id),
      num_network_layers_m(0),
      trace_buf_params_m(nullptr, &__free_ddr),
      trace_buf_params_sz_m(0),
-    device_index_m(device_index),
      k_initialize_m(nullptr),
      k_process_m(nullptr),
-    k_cleanup_m(nullptr)
+    k_cleanup_m(nullptr),
+    is_idle_m(true)
  {
-    SetupInitializeKernel(create_arg, param_heap_arg, extmem_heap_size,
-                          internal_input);
-
-    SetupProcessKernel();
-
+    device_name_m = device_m->GetDeviceName() + std::to_string(device_index_m);
      // Save number of layers in the network
      const TIDL_CreateParams* cp =
                  static_cast<const TIDL_CreateParams *>(create_arg.ptr());
      num_network_layers_m = cp->net.numLayers;
+
+    SetupInitializeKernel(create_arg, param_heap_arg, extmem_heap_size,
+                          internal_input);
+
+    if (output_trace)  EnableOutputBufferTrace();
+    SetupProcessKernel();
  }
  
  // Pointer to implementation idiom: https://herbsutter.com/gotw/_100/:
@@ -168,9 +199,7 @@ char* ExecutionObject::GetInputBufferPtr() const
  
  size_t ExecutionObject::GetInputBufferSizeInBytes() const
  {
-    const DeviceArgInfo& arg = pimpl_m->in_m.GetArg();
-    if    (arg.ptr() == nullptr)  return pimpl_m->in_size_m;
-    else                          return arg.size();
+    return pimpl_m->in_size_m;
  }
  
  char* ExecutionObject::GetOutputBufferPtr() const
@@ -180,11 +209,7 @@ char* ExecutionObject::GetOutputBufferPtr() const
  
  size_t ExecutionObject::GetOutputBufferSizeInBytes() const
  {
-    const DeviceArgInfo& arg = pimpl_m->out_m.GetArg();
-    if   (arg.ptr() == nullptr)
-        return pimpl_m->out_size_m;
-    else
-        return pimpl_m->shared_process_params_m.get()->bytesWritten;
+    return pimpl_m->out_size_m;
  }
  
  void  ExecutionObject::SetFrameIndex(int idx)
@@ -199,8 +224,8 @@ int ExecutionObject::GetFrameIndex() const
  
  void ExecutionObject::SetInputOutputBuffer(const ArgInfo& in, const ArgInfo& out)
  {
-    assert(in.ptr() != nullptr && in.size() > 0);
-    assert(out.ptr() != nullptr && out.size() > 0);
+    assert(in.ptr()  != nullptr && in.size()  >= pimpl_m->in_size_m);
+    assert(out.ptr() != nullptr && out.size() >= pimpl_m->out_size_m);
  
      pimpl_m->in_m  = IODeviceArgInfo(in);
      pimpl_m->out_m = IODeviceArgInfo(out);
@@ -215,6 +240,7 @@ void ExecutionObject::SetInputOutputBuffer(const IODeviceArgInfo* in,
  
  bool ExecutionObject::ProcessFrameStartAsync()
  {
+    assert(GetInputBufferPtr() != nullptr && GetOutputBufferPtr() != nullptr);
      return pimpl_m->RunAsync(ExecutionObject::CallType::PROCESS);
  }
  
@@ -233,21 +259,26 @@ bool ExecutionObject::Wait (CallType ct)
      return pimpl_m->Wait(ct);
  }
  
-uint64_t ExecutionObject::GetProcessCycles() const
+bool ExecutionObject::AddCallback(CallType ct, void *user_data)
  {
-    uint8_t factor = 1;
-
-    // ARP32 running at half frequency of VCOP, multiply by 2 for VCOP cycles
-    if (pimpl_m->device_m->type() == CL_DEVICE_TYPE_CUSTOM)
-        factor = 2;
-
-    return pimpl_m->shared_process_params_m.get()->cycles * factor;
+    return pimpl_m->AddCallback(ct, user_data);
  }
  
  float ExecutionObject::GetProcessTimeInMilliSeconds() const
  {
      float frequency = pimpl_m->device_m->GetFrequencyInMhz() * 1000000;
-    return ((float)GetProcessCycles())/frequency * 1000;
+    return ((float)pimpl_m->GetProcessCycles()) / frequency * 1000;
+}
+
+float ExecutionObject::GetHostProcessTimeInMilliSeconds() const
+{
+    return pimpl_m->host_time_m;
+}
+
+void
+ExecutionObject::WriteLayerOutputsToFile(const std::string& filename_prefix) const
+{
+    pimpl_m->WriteLayerOutputsToFile(filename_prefix);
  }
  
  const LayerOutput* ExecutionObject::GetOutputFromLayer(
@@ -261,37 +292,25 @@ const LayerOutputs* ExecutionObject::GetOutputsFromAllLayers() const
      return pimpl_m->GetOutputsFromAllLayers();
  }
  
-//
-// Allocate an OpenCL buffer for TIDL layer output buffer metadata.
-// The device will populate metadata for every buffer that is used as an
-// output buffer by a layer.
-//
-void ExecutionObject::EnableOutputBufferTrace()
+int ExecutionObject::GetLayersGroupId() const
  {
-    pimpl_m->trace_buf_params_sz_m = (sizeof(OCL_TIDL_BufParams)*
-                                       pimpl_m->num_network_layers_m*
-                                       TIDL_NUM_OUT_BUFS);
-
-    pimpl_m->trace_buf_params_m.reset(malloc_ddr<OCL_TIDL_BufParams>
-                                      (pimpl_m->trace_buf_params_sz_m));
+    return pimpl_m->layers_group_id_m;
+}
  
-    // Device will update bufferId if there is valid data for the entry
-    OCL_TIDL_BufParams* bufferParams = pimpl_m->trace_buf_params_m.get();
-    for (uint32_t i = 0; i < pimpl_m->num_network_layers_m; i++)
-        for (int j = 0; j < TIDL_NUM_OUT_BUFS; j++)
-        {
-            OCL_TIDL_BufParams *bufP =
-                                &bufferParams[i*TIDL_NUM_OUT_BUFS+j];
-            bufP->bufferId = UINT_MAX;
-        }
+const std::string& ExecutionObject::GetDeviceName() const
+{
+    return pimpl_m->device_name_m;
  }
  
-void
-ExecutionObject::WriteLayerOutputsToFile(const std::string& filename_prefix) const
+void ExecutionObject::AcquireLock()
  {
-    pimpl_m->WriteLayerOutputsToFile(filename_prefix);
+    pimpl_m->AcquireLock();
  }
  
+void ExecutionObject::ReleaseLock()
+{
+    pimpl_m->ReleaseLock();
+}
  
  //
  // Create a kernel to call the "initialize" function
@@ -342,6 +361,32 @@ ExecutionObject::Impl::SetupInitializeKernel(const DeviceArgInfo& create_arg,
                                      device_index_m));
  }
  
+//
+// Set up the metadata table the device fills in for layer output buffers:
+// one OCL_TIDL_BufParams entry per (layer, output buffer) pair, held in an
+// OpenCL buffer.  Must run before the process kernel is set up.
+//
+void ExecutionObject::Impl::EnableOutputBufferTrace()
+{
+    trace_buf_params_sz_m = (sizeof(OCL_TIDL_BufParams)*
+                             num_network_layers_m*
+                             TIDL_NUM_OUT_BUFS);
+
+    trace_buf_params_m.reset(malloc_ddr<OCL_TIDL_BufParams>
+                             (trace_buf_params_sz_m));
+
+    // Mark every entry as "no data"; the device overwrites bufferId for
+    // the entries that hold valid data
+    const size_t num_entries = static_cast<size_t>(num_network_layers_m) *
+                               TIDL_NUM_OUT_BUFS;
+    OCL_TIDL_BufParams* entries = trace_buf_params_m.get();
+    for (size_t k = 0; k < num_entries; k++)
+        entries[k].bufferId = UINT_MAX;
+}
+
+
  //
  // Create a kernel to call the "process" function
  //
@@ -514,10 +559,17 @@ bool ExecutionObject::Impl::RunAsync(CallType ct)
          }
          case CallType::PROCESS:
          {
+            std::chrono::time_point<std::chrono::steady_clock> t1, t2;
+            t1 = std::chrono::steady_clock::now();
+
              shared_process_params_m->frameIdx = current_frame_idx_m;
              shared_process_params_m->bytesWritten = 0;
              HostWriteNetInput();
              k_process_m->RunAsync();
+
+            t2 = std::chrono::steady_clock::now();
+            std::chrono::duration<float> elapsed = t2 - t1;
+            host_time_m = elapsed.count() * 1000;
              break;
          }
          case CallType::CLEANUP:
@@ -551,13 +603,20 @@ bool ExecutionObject::Impl::Wait(CallType ct)
          }
          case CallType::PROCESS:
          {
-            bool has_work = k_process_m->Wait();
+            float host_elapsed_ms = 0.0f;
+            bool has_work = k_process_m->Wait(&host_elapsed_ms);
              if (has_work)
              {
                  if (shared_process_params_m->errorCode != OCL_TIDL_SUCCESS)
                      throw Exception(shared_process_params_m->errorCode,
                                      __FILE__, __FUNCTION__, __LINE__);
+
+                std::chrono::time_point<std::chrono::steady_clock> t1, t2;
+                t1 = std::chrono::steady_clock::now();
                  HostReadNetOutput();
+                t2 = std::chrono::steady_clock::now();
+                std::chrono::duration<float> elapsed = t2 - t1;
+                host_time_m += elapsed.count() * 1000 + host_elapsed_ms;
              }
  
              return has_work;
@@ -574,6 +633,33 @@ bool ExecutionObject::Impl::Wait(CallType ct)
      return false;
  }
  
+// Register a completion callback on the kernel of the given call type.
+// Only CallType::PROCESS is supported; any other type returns false.
+bool ExecutionObject::Impl::AddCallback(CallType ct, void *user_data)
+{
+    if (ct != CallType::PROCESS)
+        return false;
+
+    return k_process_m->AddCallback(user_data);
+}
+
+// Cycle count reported by the device for the last process call
+uint64_t ExecutionObject::Impl::GetProcessCycles() const
+{
+    // ARP32 running at half frequency of VCOP, multiply by 2 for VCOP cycles
+    const bool is_custom_device = (device_m->type() == CL_DEVICE_TYPE_CUSTOM);
+    const uint8_t factor = is_custom_device ? 2 : 1;
+
+    return shared_process_params_m->cycles * factor;
+}
+
  //
  // Write the trace data to output files
  //
@@ -697,3 +783,16 @@ LayerOutput::~LayerOutput()
  {
      delete[] data_m;
  }
+
+// Block until the EO is idle, then mark it busy for the caller
+void ExecutionObject::Impl::AcquireLock()
+{
+    std::unique_lock<std::mutex> guard(mutex_access_m);
+    while (! is_idle_m)
+        cv_access_m.wait(guard);
+    is_idle_m = false;
+}
+
+// Mark the EO idle again and wake up any thread blocked in AcquireLock().
+// is_idle_m is the condition variable's predicate, so it must be modified
+// while holding mutex_access_m: otherwise the write races with the
+// predicate check in AcquireLock() and the wakeup can be lost.
+void ExecutionObject::Impl::ReleaseLock()
+{
+    std::lock_guard<std::mutex> guard(mutex_access_m);
+    is_idle_m = true;
+    cv_access_m.notify_all();
+}
diff --git a/tidl_api/src/execution_object_pipeline.cpp b/tidl_api/src/execution_object_pipeline.cpp

new file mode 100644 (file)

index 0000000..ff84255
--- /dev/null
+++ b/tidl_api/src/execution_object_pipeline.cpp
@@ -0,0 +1,360 @@
+/******************************************************************************
+ * Copyright (c) 2018 Texas Instruments Incorporated - http://www.ti.com/
+ *  All rights reserved.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *      * Redistributions of source code must retain the above copyright
+ *        notice, this list of conditions and the following disclaimer.
+ *      * Redistributions in binary form must reproduce the above copyright
+ *        notice, this list of conditions and the following disclaimer in the
+ *        documentation and/or other materials provided with the distribution.
+ *      * Neither the name of Texas Instruments Incorporated nor the
+ *        names of its contributors may be used to endorse or promote products
+ *        derived from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ *  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ *  THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+#include <assert.h>
+#include <mutex>
+#include <condition_variable>
+#include <chrono>
+#include "device_arginfo.h"
+#include "execution_object_pipeline.h"
+
+using namespace tidl;
+
+// Private implementation of ExecutionObjectPipeline: holds the chain of
+// ExecutionObjects, the buffers passed between them, and the state used to
+// signal completion of a frame to the waiting host thread.
+class ExecutionObjectPipeline::Impl
+{
+    public:
+        Impl(std::vector<ExecutionObject*> &eos);
+        ~Impl();
+
+        // Replace the first (input) and last (output) buffer descriptors
+        void SetInputOutputBuffer(const ArgInfo &in, const ArgInfo &out);
+        // Start the first ExecutionObject on the current frame
+        bool RunAsyncStart();
+        // Finish the current ExecutionObject and start the next one;
+        // returns true if there was a next one to start
+        bool RunAsyncNext();
+        // Block until the last ExecutionObject has finished; returns false
+        // if no frame was started
+        bool Wait();
+
+        // Trace related
+        void WriteLayerOutputsToFile(const std::string& filename_prefix) const;
+        const LayerOutput* GetOutputFromLayer(uint32_t layer_index,
+                                              uint32_t output_index) const;
+        const LayerOutputs* GetOutputsFromAllLayers() const;
+
+        //! for pipelined execution
+        //! eos_m: ExecutionObjects in execution order (not owned)
+        //! iobufs_m: eos_m.size()+1 buffer descriptors; iobufs_m[i] is the
+        //! input and iobufs_m[i+1] the output of eos_m[i]
+        std::vector<ExecutionObject*> eos_m;
+        std::vector<IODeviceArgInfo*> iobufs_m;
+
+        //! device names of all ExecutionObjects joined by '+'
+        std::string device_name_m;
+
+        //! current frame index
+        int frame_idx_m;
+
+        //! current execution object index
+        uint32_t curr_eo_idx_m;
+
+        // host time tracking: pipeline start to finish
+        // (milliseconds, computed when the last ExecutionObject completes)
+        float host_time_m;
+
+    private:
+        //! @brief Initialize ExecutionObjectPipeline with given
+        //! ExecutionObjects: check consecutive layersGroup, allocate memory
+        void Initialize();
+
+        // flag, mutex and cond var for signaling completion and waiting
+        // has_work_m: a frame was started and has not been waited for yet
+        // is_processed_m: the last ExecutionObject finished the frame
+        bool has_work_m, is_processed_m;
+        std::mutex mutex_m;
+        std::condition_variable cv_m;
+
+        // host time tracking: pipeline start to finish
+        // (time point taken in RunAsyncStart)
+        std::chrono::time_point<std::chrono::steady_clock> start_m;
+};
+
+// Public constructor: all state lives in the private implementation object
+ExecutionObjectPipeline::ExecutionObjectPipeline(
+    std::vector<ExecutionObject*> eos)
+{
+    pimpl_m.reset(new Impl(eos));
+}
+
+// Build the pipeline state from the given ExecutionObjects.
+// frame_idx_m, curr_eo_idx_m and host_time_m are given defined initial
+// values so that GetFrameIndex() and GetHostProcessTimeInMilliSeconds() do
+// not read indeterminate values when called before the first frame is run.
+// Initialize() validates the ExecutionObjects and allocates the buffers
+// passed between them; it may throw an Exception.
+ExecutionObjectPipeline::Impl::Impl(std::vector<ExecutionObject *> &eos) :
+    eos_m(eos), frame_idx_m(0), curr_eo_idx_m(0), host_time_m(0.0f),
+    has_work_m(false), is_processed_m(false)
+{
+    Initialize();
+}
+
+// Pointer to implementation idiom: https://herbsutter.com/gotw/_100/:
+// Both unique_ptr and shared_ptr can be instantiated with an incomplete type
+// unique_ptr's destructor requires a complete type in order to invoke delete,
+// so the destructor is defaulted here, after Impl has been fully defined,
+// rather than in the header.
+ExecutionObjectPipeline::~ExecutionObjectPipeline() = default;
+
+// The pipeline input is the first buffer in the chain
+char* ExecutionObjectPipeline::GetInputBufferPtr() const
+{
+    IODeviceArgInfo* first_buf = pimpl_m->iobufs_m.front();
+    return static_cast<char *>(first_buf->GetArg().ptr());
+}
+
+// The pipeline input size is the input size of the first ExecutionObject
+size_t ExecutionObjectPipeline::GetInputBufferSizeInBytes() const
+{
+    ExecutionObject* first_eo = pimpl_m->eos_m.front();
+    return first_eo->GetInputBufferSizeInBytes();
+}
+
+// The pipeline output is the last buffer in the chain
+char* ExecutionObjectPipeline::GetOutputBufferPtr() const
+{
+    IODeviceArgInfo* last_buf = pimpl_m->iobufs_m.back();
+    return static_cast<char *>(last_buf->GetArg().ptr());
+}
+
+// The pipeline output size is the output size of the last ExecutionObject
+size_t ExecutionObjectPipeline::GetOutputBufferSizeInBytes() const
+{
+    ExecutionObject* last_eo = pimpl_m->eos_m.back();
+    return last_eo->GetOutputBufferSizeInBytes();
+}
+
+// Set the user-provided input and output buffers of the pipeline.
+// Both buffers must be non-null and large enough (checked by assert).
+void ExecutionObjectPipeline::SetInputOutputBuffer(const ArgInfo& in,
+                                                   const ArgInfo& out)
+{
+    assert(in.ptr() != nullptr);
+    assert(in.size() >= GetInputBufferSizeInBytes());
+    assert(out.ptr() != nullptr);
+    assert(out.size() >= GetOutputBufferSizeInBytes());
+
+    pimpl_m->SetInputOutputBuffer(in, out);
+}
+
+// Record the index of the frame about to be processed
+void ExecutionObjectPipeline::SetFrameIndex(int idx)
+{
+    Impl& impl = *pimpl_m;
+    impl.frame_idx_m = idx;
+}
+
+// Index last stored by SetFrameIndex()
+int ExecutionObjectPipeline::GetFrameIndex() const
+{
+    const int current_idx = pimpl_m->frame_idx_m;
+    return current_idx;
+}
+
+// Start the first ExecutionObject and register this pipeline as the
+// callback payload, so that the remaining ExecutionObjects are chained by
+// RunAsyncNext().  Returns false if the start or the registration failed.
+bool ExecutionObjectPipeline::ProcessFrameStartAsync()
+{
+    assert(GetInputBufferPtr() != nullptr && GetOutputBufferPtr() != nullptr);
+
+    if (! pimpl_m->RunAsyncStart())
+        return false;
+
+    ExecutionObject* first_eo = pimpl_m->eos_m[0];
+    return first_eo->AddCallback(ExecutionObject::CallType::PROCESS, this);
+}
+
+// Block until the whole pipeline has finished the frame
+bool ExecutionObjectPipeline::ProcessFrameWait()
+{
+    const bool had_work = pimpl_m->Wait();
+    return had_work;
+}
+
+// Completion callback entry point: user_data is the ExecutionObjectPipeline
+// registered through ExecutionObject::AddCallback()
+void CallbackWrapper(void *user_data)
+{
+    ExecutionObjectPipeline *eop =
+                            static_cast<ExecutionObjectPipeline *>(user_data);
+    eop->RunAsyncNext();
+}
+
+// Called when an ExecutionObject completes: advance the pipeline and, if
+// another ExecutionObject was started, register the callback on it as well
+void ExecutionObjectPipeline::RunAsyncNext()
+{
+    if (! pimpl_m->RunAsyncNext())
+        return;
+
+    ExecutionObject* next_eo = pimpl_m->eos_m[pimpl_m->curr_eo_idx_m];
+    next_eo->AddCallback(ExecutionObject::CallType::PROCESS, this);
+}
+
+// Device time of the pipeline: sum of the device times of its
+// ExecutionObjects, in pipeline order
+float ExecutionObjectPipeline::GetProcessTimeInMilliSeconds() const
+{
+    const std::vector<ExecutionObject*>& stages = pimpl_m->eos_m;
+    float device_ms = 0.0f;
+    for (size_t i = 0; i < stages.size(); i++)
+        device_ms += stages[i]->GetProcessTimeInMilliSeconds();
+    return device_ms;
+}
+
+// Host time recorded by the implementation for the last frame
+float ExecutionObjectPipeline::GetHostProcessTimeInMilliSeconds() const
+{
+    const float host_ms = pimpl_m->host_time_m;
+    return host_ms;
+}
+
+// Combined name built once when the pipeline is initialized
+const std::string& ExecutionObjectPipeline::GetDeviceName() const
+{
+    const std::string& combined_name = pimpl_m->device_name_m;
+    return combined_name;
+}
+
+// Forward to the implementation, which dumps every ExecutionObject's layers
+void
+ExecutionObjectPipeline::WriteLayerOutputsToFile(
+    const std::string& filename_prefix) const
+{
+    const Impl& impl = *pimpl_m;
+    impl.WriteLayerOutputsToFile(filename_prefix);
+}
+
+// Forward to the implementation, which searches the ExecutionObjects
+const LayerOutput*
+ExecutionObjectPipeline::GetOutputFromLayer(uint32_t layer_index,
+    uint32_t output_index) const
+{
+    const Impl& impl = *pimpl_m;
+    return impl.GetOutputFromLayer(layer_index, output_index);
+}
+
+// Forward to the implementation, which merges all ExecutionObjects' outputs
+const LayerOutputs*
+ExecutionObjectPipeline::GetOutputsFromAllLayers() const
+{
+    const Impl& impl = *pimpl_m;
+    return impl.GetOutputsFromAllLayers();
+}
+
+
+/// Impl methods start here
+
+
+// malloc wrapper: returns nullptr for a zero-size request and throws an
+// Exception when the allocation fails
+static
+void* AllocateMem(size_t size)
+{
+    void *mem = nullptr;
+    if (size != 0)
+    {
+        mem = malloc(size);
+        if (mem == nullptr)
+            throw Exception(
+                "Out of memory, ExecutionObjectPipeline malloc failed",
+                __FILE__, __FUNCTION__, __LINE__);
+    }
+    return mem;
+}
+
+// Validate the ExecutionObjects, build the combined device name and
+// allocate the buffers passed between consecutive ExecutionObjects.
+// Throws an Exception if the pipeline is empty, contains a null
+// ExecutionObject, has non-consecutive layersGroup ids, or runs out of
+// memory.
+void ExecutionObjectPipeline::Impl::Initialize()
+{
+    // An empty pipeline has nothing to execute; without this check the
+    // device name trimming below would underflow and front()/back() of
+    // eos_m would be invalid
+    if (eos_m.empty())
+        throw Exception(
+            "No ExecutionObjects in ExecutionObjectPipeline",
+            __FILE__, __FUNCTION__, __LINE__);
+
+    for (auto eo : eos_m)
+        if (eo == nullptr)
+            throw Exception(
+                "Null ExecutionObject in ExecutionObjectPipeline",
+                __FILE__, __FUNCTION__, __LINE__);
+
+    // Check consecutive layersGroups to form a pipeline
+    int prev_group = 0;
+    for (auto eo : eos_m)
+    {
+        int group = eo->GetLayersGroupId();
+        if (prev_group != 0 && group != prev_group + 1)
+            throw Exception(
+                "Non-consecutive layersGroupIds in ExecutionObjectPipeline",
+                __FILE__, __FUNCTION__, __LINE__);
+        prev_group = group;
+    }
+
+    // Combined device name: names joined by '+', trailing '+' removed
+    for (auto eo : eos_m)
+        device_name_m += eo->GetDeviceName() + "+";
+    device_name_m.resize(device_name_m.size() - 1);
+
+    // Allocate input and output memory for EOs/layersGroups
+    // Note that i-th EO's output buffer is the same as (i+1)-th EO's input
+    // So, if n EOs, then (n+1) buffers: b EO b EO b EO b ... EO b
+    // User must set the first input buffer and the last output buffer
+    ArgInfo in(nullptr, 0);
+    iobufs_m.push_back(new IODeviceArgInfo(in));
+    const size_t num_eos = eos_m.size();
+    for (size_t i = 0; i < num_eos; i++)
+    {
+        // The last output buffer is provided by the user, so only a
+        // placeholder (nullptr, 0) is created for it
+        const bool is_last = (i + 1 == num_eos);
+        size_t size = is_last ? 0 : eos_m[i]->GetOutputBufferSizeInBytes();
+
+        void *ptr = AllocateMem(size);
+        ArgInfo out(ptr, size);
+        iobufs_m.push_back(new IODeviceArgInfo(out));
+    }
+}
+
+// Release the buffer descriptors.  The first and last buffers belong to
+// the user; only the memory of the buffers in between is freed here.
+ExecutionObjectPipeline::Impl::~Impl()
+{
+    const int last = static_cast<int>(iobufs_m.size()) - 1;
+    int idx = 0;
+    for (IODeviceArgInfo* buf : iobufs_m)
+    {
+        const bool user_provided = (idx == 0 || idx == last);
+        if (! user_provided)
+            free(buf->GetArg().ptr());
+        delete buf;
+        idx++;
+    }
+}
+
+// Replace the descriptors of the pipeline's input (first) and output (last)
+// buffers with descriptors for the user-provided buffers.
+// The new descriptors are allocated before the old ones are deleted: if an
+// allocation throws, iobufs_m still holds valid pointers and the destructor
+// does not delete a descriptor twice.
+void ExecutionObjectPipeline::Impl::SetInputOutputBuffer(const ArgInfo &in,
+                                                         const ArgInfo &out)
+{
+    std::unique_ptr<IODeviceArgInfo> new_in(new IODeviceArgInfo(in));
+    std::unique_ptr<IODeviceArgInfo> new_out(new IODeviceArgInfo(out));
+
+    delete iobufs_m.front();
+    delete iobufs_m.back();
+    iobufs_m.front() = new_in.release();
+    iobufs_m.back()  = new_out.release();
+}
+
+// Start processing a frame on the first ExecutionObject.
+// Returns true if the ExecutionObject was started.  On failure the
+// ExecutionObject is released again and the pipeline is marked as having no
+// work, so that a later Wait() returns false instead of blocking forever on
+// a completion that will never be signaled.
+bool ExecutionObjectPipeline::Impl::RunAsyncStart()
+{
+    start_m = std::chrono::steady_clock::now();
+    {
+        // flags are shared with the completion callback and Wait()
+        std::lock_guard<std::mutex> lock(mutex_m);
+        has_work_m = true;
+        is_processed_m = false;
+    }
+    host_time_m = 0.0f;
+    curr_eo_idx_m = 0;
+
+    // Gain sole access to the first EO before changing its buffers
+    eos_m[0]->AcquireLock();
+    eos_m[0]->SetInputOutputBuffer(iobufs_m[0], iobufs_m[1]);
+    if (eos_m[0]->ProcessFrameStartAsync())
+        return true;
+
+    // Start failed: undo the lock and the pending-work state
+    eos_m[0]->ReleaseLock();
+    {
+        std::lock_guard<std::mutex> lock(mutex_m);
+        has_work_m = false;
+    }
+    return false;
+}
+
+// Finish the current ExecutionObject and hand the frame to the next one.
+// returns true if we have more EOs to execute; returns false when the
+// last EO has completed, after recording the host time and waking up the
+// thread blocked in Wait().
+bool ExecutionObjectPipeline::Impl::RunAsyncNext()
+{
+    eos_m[curr_eo_idx_m]->ProcessFrameWait();
+    eos_m[curr_eo_idx_m]->ReleaseLock();
+    curr_eo_idx_m += 1;
+    if (curr_eo_idx_m < eos_m.size())
+    {
+        // Output buffer of the previous EO is the input of this one
+        eos_m[curr_eo_idx_m]->AcquireLock();
+        eos_m[curr_eo_idx_m]->SetInputOutputBuffer(iobufs_m[curr_eo_idx_m],
+                                                   iobufs_m[curr_eo_idx_m+1]);
+        eos_m[curr_eo_idx_m]->ProcessFrameStartAsync();
+        return true;
+    }
+    else
+    {
+        std::chrono::duration<float> elapsed = std::chrono::steady_clock::now()
+                                               - start_m;
+        host_time_m = elapsed.count() * 1000;  // seconds to milliseconds
+
+        // is_processed_m is the predicate Wait() checks under mutex_m, so
+        // it must be changed while holding the same mutex; otherwise the
+        // notification can slip in between Wait()'s check and its sleep
+        // and the wakeup is lost.  Notifying under the lock also keeps
+        // cv_m from being used after the waiter has moved on.
+        std::lock_guard<std::mutex> lock(mutex_m);
+        is_processed_m = true;
+        cv_m.notify_all();
+        return false;
+    }
+}
+
+// Block until the last ExecutionObject has processed the frame.
+// Returns false right away when no frame has been started.
+bool ExecutionObjectPipeline::Impl::Wait()
+{
+    if (has_work_m)
+    {
+        std::unique_lock<std::mutex> guard(mutex_m);
+        while (! is_processed_m)
+            cv_m.wait(guard);
+        has_work_m = false;
+        return true;
+    }
+
+    return false;
+}
+
+// Dump the layer outputs of every ExecutionObject, in pipeline order
+void
+ExecutionObjectPipeline::Impl::WriteLayerOutputsToFile(
+    const std::string& filename_prefix) const
+{
+    for (size_t i = 0; i < eos_m.size(); i++)
+    {
+        ExecutionObject* stage = eos_m[i];
+        stage->WriteLayerOutputsToFile(filename_prefix);
+    }
+}
+
+// Ask each ExecutionObject in turn for the requested layer output and
+// return the first non-null answer; nullptr if none of them has it
+const LayerOutput*
+ExecutionObjectPipeline::Impl::GetOutputFromLayer(uint32_t layer_index,
+    uint32_t output_index) const
+{
+    for (ExecutionObject* stage : eos_m)
+    {
+        const LayerOutput* found = stage->GetOutputFromLayer(layer_index,
+                                                             output_index);
+        if (found != nullptr)
+            return found;
+    }
+    return nullptr;
+}
+
+// Collect the layer outputs of all ExecutionObjects into one LayerOutputs
+// container, in pipeline order.  Ownership of every LayerOutput is moved
+// into the returned container, which the caller must delete.
+// The intermediate containers are held by unique_ptr so nothing leaks if
+// an allocation throws part-way, and an ExecutionObject that returns no
+// container is skipped instead of being dereferenced.
+const LayerOutputs*
+ExecutionObjectPipeline::Impl::GetOutputsFromAllLayers() const
+{
+    std::unique_ptr<LayerOutputs> all(new LayerOutputs);
+    for (auto eo : eos_m)
+    {
+        // The per-EO container is owned by the caller of the EO method;
+        // const_cast is needed to move its elements out
+        std::unique_ptr<LayerOutputs> los(const_cast<LayerOutputs *>(
+                                               eo->GetOutputsFromAllLayers()));
+        if (! los)
+            continue;
+
+        for (auto& lo : *los)
+            all->push_back(std::unique_ptr<const LayerOutput>{ lo.release() });
+    }
+    return all.release();
+}
+
+
diff --git a/tidl_api/src/executor.cpp b/tidl_api/src/executor.cpp

index b644728afe8f92a75dc65f3bd200ec8aaa8e26fe..914c78ab58104eeba379db5ae8305e45537d007e 100644 (file)
--- a/tidl_api/src/executor.cpp
+++ b/tidl_api/src/executor.cpp
@@ -96,6 +96,12 @@ const ExecutionObjects& Executor::GetExecutionObjects() const
      return pimpl_m->execution_objects_m;
  }
  
+// Return a non-owning pointer to the execution object at position `index`.
+// The index is range-checked with assert only.
+ExecutionObject* Executor::operator[](uint32_t index) const
+{
+    const auto& eos = pimpl_m->execution_objects_m;
+
+    assert(index < eos.size());
+
+    return eos[index].get();
+}
+
  bool ExecutorImpl::Initialize(const Configuration& configuration)
  {
      configuration_m = configuration;
@@ -145,13 +151,11 @@ bool ExecutorImpl::Initialize(const Configuration& configuration)
               {new ExecutionObject(device_m.get(), index,
                                    create_arg, param_heap_arg,
                                    configuration_m.EXTMEM_HEAP_SIZE,
+                                  layers_group_id_m,
+                                  configuration_m.enableOutputTrace,
                                    configuration_m.enableInternalInput)} );
      }
  
-    if (configuration_m.enableOutputTrace)
-        for (auto &eo : execution_objects_m)
-            eo->EnableOutputBufferTrace();
-
      for (auto &eo : execution_objects_m)
          eo->RunAsync(ExecutionObject::CallType::INIT);
  
@@ -294,4 +298,3 @@ const char* Exception::what() const noexcept
  {
      return message_m.c_str();
  }
-
diff --git a/tidl_api/src/ocl_device.cpp b/tidl_api/src/ocl_device.cpp

index fba4f94305da4659de311f604a320ebec8219159..b3eaf36d4894a8c2f0b15f60d0d24889a9dcc4fa 100644 (file)
--- a/tidl_api/src/ocl_device.cpp
+++ b/tidl_api/src/ocl_device.cpp
@@ -91,7 +91,7 @@ DspDevice::DspDevice(const DeviceIds& ids, const std::string &binary_filename):
          // Queue 0 on device 0
          queue_m[0] = clCreateCommandQueue(context_m,
                                            device_ids[0],
-                                          0,
+                                          CL_QUEUE_PROFILING_ENABLE,
                                            &errcode);
          errorCheck(errcode, __LINE__);
          BuildProgramFromBinary(binary_filename, device_ids, 1);
@@ -139,7 +139,7 @@ DspDevice::DspDevice(const DeviceIds& ids, const std::string &binary_filename):
              int index = static_cast<int>(id);
              queue_m[index] = clCreateCommandQueue(context_m,
                                            sub_devices[index],
-                                          0,
+                                          CL_QUEUE_PROFILING_ENABLE,
                                            &errcode);
              errorCheck(errcode, __LINE__);
          }
@@ -187,7 +187,7 @@ EveDevice::EveDevice(const DeviceIds& ids, const std::string &kernel_names):
          int index = static_cast<int>(id);
          queue_m[index] = clCreateCommandQueue(context_m,
                                        all_device_ids[index],
-                                      0,
+                                      CL_QUEUE_PROFILING_ENABLE,
                                        &errcode);
          errorCheck(errcode, __LINE__);
      }
@@ -317,7 +317,7 @@ Kernel& Kernel::RunAsync()
  }
  
  
-bool Kernel::Wait()
+bool Kernel::Wait(float *host_elapsed_ms)
  {
      // Wait called without a corresponding RunAsync
      if (!is_running_m)
@@ -326,6 +326,17 @@ bool Kernel::Wait()
      TRACE::print("\tKernel: waiting...\n");
      cl_int ret = clWaitForEvents(1, &event_m);
      errorCheck(ret, __LINE__);
+
+    if (host_elapsed_ms != nullptr)
+    {
+        cl_ulong t_que, t_end;
+        clGetEventProfilingInfo(event_m, CL_PROFILING_COMMAND_QUEUED,
+                                sizeof(cl_ulong), &t_que, nullptr);
+        clGetEventProfilingInfo(event_m, CL_PROFILING_COMMAND_END,
+                                sizeof(cl_ulong), &t_end, nullptr);
+        *host_elapsed_ms = (t_end - t_que) / 1.0e6;  // nano to milli seconds
+    }
+
      ret = clReleaseEvent(event_m);
      errorCheck(ret, __LINE__);
      TRACE::print("\tKernel: finished execution\n");
@@ -334,6 +345,22 @@ bool Kernel::Wait()
      return true;
  }
  
+// Declared weak: the symbol may be absent at link time, in which case its
+// address evaluates to null and the callback below does nothing.
+extern void CallbackWrapper(void *user_data) __attribute__((weak));
+
+// Event callback handed to clSetEventCallback. Forwards user_data to
+// CallbackWrapper only when the status is CL_SUCCESS, user_data is non-null
+// and CallbackWrapper is actually defined.
+static
+void EventCallback(cl_event /*event*/, cl_int exec_status, void *user_data)
+{
+    const bool completed_ok = (exec_status == CL_SUCCESS);
+
+    if (completed_ok && user_data != nullptr && CallbackWrapper)
+    {
+        CallbackWrapper(user_data);
+    }
+}
+
+// Register EventCallback on event_m for CL_COMPLETE, passing user_data
+// through. Returns false when the kernel is not running (is_running_m is
+// false) or when clSetEventCallback does not return CL_SUCCESS.
+bool Kernel::AddCallback(void *user_data)
+{
+    if (is_running_m)
+    {
+        const cl_int status = clSetEventCallback(event_m, CL_COMPLETE,
+                                                 EventCallback, user_data);
+        return status == CL_SUCCESS;
+    }
+
+    return false;
+}
+
  Kernel::~Kernel()
  {
      for (auto b : buffers_m)
diff --git a/tidl_api/src/ocl_device.h b/tidl_api/src/ocl_device.h

index 6e80166e5f77b9a046d757c821b7a1ca979a6015..04c5db6af4fef1f0ab7984f139a98a495ef0a2ba 100644 (file)
--- a/tidl_api/src/ocl_device.h
+++ b/tidl_api/src/ocl_device.h
@@ -74,6 +74,8 @@ class Device
  
          static uint32_t GetNumDevices(DeviceType device_type);
  
+        virtual std::string GetDeviceName() = 0;  // type label; subclasses return "DSP" / "EVE"
+
      protected:
  
          static const int MAX_DEVICES = 4;
@@ -101,6 +103,8 @@ class DspDevice: public Device
          DspDevice(const DspDevice&)            = delete;
          DspDevice& operator=(const DspDevice&) = delete;
  
+        // Device-type label for DSP devices. Marked override so that a
+        // signature mismatch with Device::GetDeviceName fails to compile.
+        std::string GetDeviceName() override { return "DSP"; }
+
      protected:
          bool BuildProgramFromBinary(const std::string &binary_filename,
                                      cl_device_id device_ids[],
@@ -117,6 +121,8 @@ class EveDevice : public Device
          EveDevice(const EveDevice&)            = delete;
          EveDevice& operator=(const EveDevice&) = delete;
  
+        // Device-type label for EVE devices. Marked override so that a
+        // signature mismatch with Device::GetDeviceName fails to compile.
+        std::string GetDeviceName() override { return "EVE"; }
+
      protected:
          bool BuildProgramFromBinary(const std::string &kernel_names,
                                      cl_device_id device_ids[],
@@ -137,7 +143,8 @@ class Kernel
          ~Kernel();
  
          Kernel& RunAsync();
-        bool Wait();
+        bool Wait(float *host_elapsed_ms = nullptr);  // if non-null, receives queued-to-end event time in ms
+        bool AddCallback(void *user_data);  // registers a CL_COMPLETE callback; false if not running
  
      private:
          cl_kernel           kernel_m;
author	Yuan Zhao <yuanzhao@ti.com>
	Fri, 10 Aug 2018 04:42:42 +0000 (23:42 -0500)
committer	Yuan Zhao <yuanzhao@ti.com>
	Mon, 20 Aug 2018 15:57:44 +0000 (10:57 -0500)
examples/ssd_multibox/main.cpp		patch \| blob \| history
tidl_api/Makefile		patch \| blob \| history
tidl_api/inc/execution_object.h		patch \| blob \| history
tidl_api/inc/execution_object_internal.h	[new file with mode: 0644]	patch \| blob
tidl_api/inc/execution_object_pipeline.h	[new file with mode: 0644]	patch \| blob
tidl_api/inc/executor.h		patch \| blob \| history
tidl_api/src/execution_object.cpp		patch \| blob \| history
tidl_api/src/execution_object_pipeline.cpp	[new file with mode: 0644]	patch \| blob
tidl_api/src/executor.cpp		patch \| blob \| history
tidl_api/src/ocl_device.cpp		patch \| blob \| history
tidl_api/src/ocl_device.h		patch \| blob \| history