From: Yuan Zhao Date: Fri, 10 Aug 2018 04:42:42 +0000 (-0500) Subject: ExecutionObjectPipeline for executing layersGroups X-Git-Tag: v01.01.00.00^2~16 X-Git-Url: https://git.ti.com/gitweb?p=tidl%2Ftidl-api.git;a=commitdiff_plain;h=1a42784dc57d81735218ec2dc85172a1ed4e8181 ExecutionObjectPipeline for executing layersGroups - Add top level ExecutionObjectPipeline class to execute multiple layersGroups. - An ExecutionObjectPipeline is constructed from multiple ExecutionObjects, each ExecutionObject executes one layersGroup in the network, together they execute consecutive layersGroups. - Same look and feel as ExecutionObject, e.g. ProcessFrameStartAsync, ProcessFrameWait, GetInputBufferPointer, GetOutputBufferPointer - MCT-1017, MCT-1029 --- diff --git a/examples/ssd_multibox/main.cpp b/examples/ssd_multibox/main.cpp index 6d39dda..b302cfa 100644 --- a/examples/ssd_multibox/main.cpp +++ b/examples/ssd_multibox/main.cpp @@ -43,6 +43,7 @@ #include "executor.h" #include "execution_object.h" +#include "execution_object_pipeline.h" #include "configuration.h" #include "../segmentation/object_classes.h" @@ -67,13 +68,13 @@ using namespace tidl; using namespace cv; -bool RunConfiguration(const std::string& config_file, uint32_t num_devices, +bool RunConfiguration(const std::string& config_file, + uint32_t num_dsps, uint32_t num_eves, DeviceType device_type, std::string& input_file); -bool ReadFrame(ExecutionObject& eo, int frame_idx, +bool ReadFrame(ExecutionObjectPipeline& eop, int frame_idx, const Configuration& configuration, int num_frames, std::string& image_file, VideoCapture &cap); -bool WriteFrameOutput(const ExecutionObject &eo_in, - const ExecutionObject &eo_out, +bool WriteFrameOutput(const ExecutionObjectPipeline& eop, const Configuration& configuration); void ReportTime(int frame_index, std::string device_name, double elapsed_host, @@ -81,7 +82,8 @@ void ReportTime(int frame_index, std::string device_name, double elapsed_host, static void ProcessArgs(int argc, char *argv[], std::string& config, - uint32_t& num_devices, + uint32_t& num_dsps, + uint32_t& num_eves, DeviceType& device_type, std::string& input_file); @@ -110,18 +112,12 @@ int main(int argc, char *argv[]) // Process arguments std::string config = DEFAULT_CONFIG; std::string input_file = DEFAULT_INPUT; - uint32_t num_devices = 1; + uint32_t num_dsps = 1; + uint32_t num_eves = 1; DeviceType device_type = DeviceType::EVE; - ProcessArgs(argc, argv, config, num_devices, device_type, input_file); + ProcessArgs(argc, argv, config, num_dsps, num_eves, + device_type, input_file); - // Use same number of EVEs and DSPs - num_devices = std::min(num_devices, std::min(num_eve, num_dsp)); - if (num_devices == 0) - { - std::cout << "Partitioned execution requires at least 1 EVE and 1 DSP." - << std::endl; - return EXIT_FAILURE; - } if ((object_class_table = GetObjectClassTable(config)) == nullptr) { std::cout << "No object classes defined for this config." 
<< std::endl; @@ -136,8 +132,8 @@ int main(int argc, char *argv[]) std::cout << "Input: " << input_file << std::endl; std::string config_file = "../test/testvecs/config/infer/tidl_config_" + config + ".txt"; - bool status = RunConfiguration(config_file, num_devices, device_type, - input_file); + bool status = RunConfiguration(config_file, num_dsps, num_eves, + device_type, input_file); if (!status) { @@ -149,12 +145,15 @@ int main(int argc, char *argv[]) return EXIT_SUCCESS; } -bool RunConfiguration(const std::string& config_file, uint32_t num_devices, +bool RunConfiguration(const std::string& config_file, + uint32_t num_dsps, uint32_t num_eves, DeviceType device_type, std::string& input_file) { - DeviceIds ids; - for (int i = 0; i < num_devices; i++) - ids.insert(static_cast(i)); + DeviceIds ids_eve, ids_dsp; + for (int i = 0; i < num_eves; i++) + ids_eve.insert(static_cast(i)); + for (int i = 0; i < num_dsps; i++) + ids_dsp.insert(static_cast(i)); // Read the TI DL configuration file Configuration configuration; @@ -167,7 +166,7 @@ bool RunConfiguration(const std::string& config_file, uint32_t num_devices, } // setup input - int num_frames = is_default_input ? 3 : 1; + int num_frames = is_default_input ? 9 : 9; VideoCapture cap; std::string image_file; if (is_camera_input) @@ -192,82 +191,58 @@ bool RunConfiguration(const std::string& config_file, uint32_t num_devices, // and configuration specified // EVE will run layersGroupId 1 in the network, while // DSP will run layersGroupId 2 in the network - Executor executor_eve(DeviceType::EVE, ids, configuration, 1); - Executor executor_dsp(DeviceType::DSP, ids, configuration, 2); - - // Query Executor for set of ExecutionObjects created - const ExecutionObjects& execution_objects_eve = - executor_eve.GetExecutionObjects(); - const ExecutionObjects& execution_objects_dsp = - executor_dsp.GetExecutionObjects(); - int num_eos = execution_objects_eve.size(); - - // Allocate input and output buffers for each execution object - // Note that "out" is both the output of eo_eve and the input of eo_dsp - // This is how two layersGroupIds, 1 and 2, are tied together + Executor exe_eve(DeviceType::EVE, ids_eve, configuration, 1); + Executor exe_dsp(DeviceType::DSP, ids_dsp, configuration, 2); + + // Construct ExecutionObjectPipeline that utilizes multiple + // ExecutionObjects to process a single frame, each ExecutionObject + // processes one layerGroup of the network + int num_eops = std::max(num_eves, num_dsps); + std::vector eops; + for (int i = 0; i < num_eops; i++) + eops.push_back(new ExecutionObjectPipeline({exe_eve[i%num_eves], + exe_dsp[i%num_dsps]})); + + // Allocate input/output memory for each EOP std::vector buffers; - for (int i = 0; i < num_eos; i++) + for (auto eop : eops) { - ExecutionObject *eo_eve = execution_objects_eve[i].get(); - size_t in_size = eo_eve->GetInputBufferSizeInBytes(); - size_t out_size = eo_eve->GetOutputBufferSizeInBytes(); - ArgInfo in = { ArgInfo(malloc(in_size), in_size) }; - ArgInfo out = { ArgInfo(malloc(out_size), out_size) }; - eo_eve->SetInputOutputBuffer(in, out); - - ExecutionObject *eo_dsp = execution_objects_dsp[i].get(); - size_t out2_size = eo_dsp->GetOutputBufferSizeInBytes(); - ArgInfo out2 = { ArgInfo(malloc(out2_size), out2_size) }; - eo_dsp->SetInputOutputBuffer(out, out2); - - buffers.push_back(in.ptr()); - buffers.push_back(out.ptr()); - buffers.push_back(out2.ptr()); + size_t in_size = eop->GetInputBufferSizeInBytes(); + size_t out_size = eop->GetOutputBufferSizeInBytes(); + void* in_ptr = 
malloc(in_size); + void* out_ptr = malloc(out_size); + assert(in_ptr != nullptr && out_ptr != nullptr); + buffers.push_back(in_ptr); + buffers.push_back(out_ptr); + + ArgInfo in(in_ptr, in_size); + ArgInfo out(out_ptr, out_size); + eop->SetInputOutputBuffer(in, out); } - #define MAX_NUM_EOS 4 - struct timespec t0[MAX_NUM_EOS], t1, tloop0, tloop1; + struct timespec tloop0, tloop1; clock_gettime(CLOCK_MONOTONIC, &tloop0); - // Process frames with available execution objects in a pipelined manner - // additional num_eos iterations to flush the pipeline (epilogue) - ExecutionObject *eo_eve, *eo_dsp, *eo_input; - for (int frame_idx = 0; - frame_idx < num_frames + num_eos; frame_idx++) + // Process frames with ExecutionObjectPipelines in a pipelined manner + // additional num_eops iterations to flush pipeline (epilogue) + for (int frame_idx = 0; frame_idx < num_frames + num_eops; frame_idx++) { - eo_eve = execution_objects_eve[frame_idx % num_eos].get(); - eo_dsp = execution_objects_dsp[frame_idx % num_eos].get(); + ExecutionObjectPipeline* eop = eops[frame_idx % num_eops]; - // Wait for previous frame on the same eo to finish processing - if (eo_dsp->ProcessFrameWait()) + // Wait for previous frame on the same eop to finish processing + if (eop->ProcessFrameWait()) { - int finished_idx = eo_dsp->GetFrameIndex(); - clock_gettime(CLOCK_MONOTONIC, &t1); - ReportTime(finished_idx, "DSP", - ms_diff(t0[finished_idx % num_eos], t1), - eo_dsp->GetProcessTimeInMilliSeconds()); - - eo_input = execution_objects_eve[finished_idx % num_eos].get(); - WriteFrameOutput(*eo_input, *eo_dsp, configuration); + ReportTime(eop->GetFrameIndex(), eop->GetDeviceName(), + eop->GetHostProcessTimeInMilliSeconds(), + eop->GetProcessTimeInMilliSeconds()); + WriteFrameOutput(*eop, configuration); } // Read a frame and start processing it with current eo - if (ReadFrame(*eo_eve, frame_idx, configuration, num_frames, + if (ReadFrame(*eop, frame_idx, configuration, num_frames, image_file, cap)) { - clock_gettime(CLOCK_MONOTONIC, &t0[frame_idx % num_eos]); - eo_eve->ProcessFrameStartAsync(); - - if (eo_eve->ProcessFrameWait()) - { - clock_gettime(CLOCK_MONOTONIC, &t1); - ReportTime(frame_idx, "EVE", - ms_diff(t0[frame_idx % num_eos], t1), - eo_eve->GetProcessTimeInMilliSeconds()); - - clock_gettime(CLOCK_MONOTONIC, &t0[frame_idx % num_eos]); - eo_dsp->ProcessFrameStartAsync(); - } + eop->ProcessFrameStartAsync(); } } @@ -276,6 +251,8 @@ bool RunConfiguration(const std::string& config_file, uint32_t num_devices, << std::setw(6) << std::setprecision(4) << ms_diff(tloop0, tloop1) << "ms" << std::endl; + for (auto eop : eops) + delete eop; for (auto b : buffers) free(b); } @@ -305,15 +282,15 @@ void ReportTime(int frame_index, std::string device_name, double elapsed_host, } -bool ReadFrame(ExecutionObject &eo, int frame_idx, +bool ReadFrame(ExecutionObjectPipeline& eop, int frame_idx, const Configuration& configuration, int num_frames, std::string& image_file, VideoCapture &cap) { if (frame_idx >= num_frames) return false; - eo.SetFrameIndex(frame_idx); + eop.SetFrameIndex(frame_idx); - char* frame_buffer = eo.GetInputBufferPtr(); + char* frame_buffer = eop.GetInputBufferPtr(); assert (frame_buffer != nullptr); int channel_size = configuration.inWidth * configuration.inHeight; @@ -323,7 +300,7 @@ bool ReadFrame(ExecutionObject &eo, int frame_idx, if (is_preprocessed_input) { std::ifstream ifs(image_file, std::ios::binary); - ifs.seekg(frame_idx * channel_size * 3); + //ifs.seekg(frame_idx * channel_size * 3); ifs.read(frame_buffer, 
channel_size * 3); bool ifs_status = ifs.good(); ifs.close(); @@ -368,8 +345,7 @@ bool ReadFrame(ExecutionObject &eo, int frame_idx, } // Create frame with boxes drawn around classified objects -bool WriteFrameOutput(const ExecutionObject &eo_in, - const ExecutionObject &eo_out, +bool WriteFrameOutput(const ExecutionObjectPipeline& eop, const Configuration& configuration) { // Asseembly original frame @@ -378,13 +354,13 @@ bool WriteFrameOutput(const ExecutionObject &eo_in, int channel_size = width * height; Mat frame, r_frame, bgr[3]; - unsigned char *in = (unsigned char *) eo_in.GetInputBufferPtr(); + unsigned char *in = (unsigned char *) eop.GetInputBufferPtr(); bgr[0] = Mat(height, width, CV_8UC(1), in); bgr[1] = Mat(height, width, CV_8UC(1), in + channel_size); bgr[2] = Mat(height, width, CV_8UC(1), in + channel_size*2); cv::merge(bgr, 3, frame); - int frame_index = eo_in.GetFrameIndex(); + int frame_index = eop.GetFrameIndex(); char outfile_name[64]; if (! is_camera_input && is_preprocessed_input) { @@ -394,8 +370,8 @@ bool WriteFrameOutput(const ExecutionObject &eo_in, } // Draw boxes around classified objects - float *out = (float *) eo_out.GetOutputBufferPtr(); - int num_floats = eo_out.GetOutputBufferSizeInBytes() / sizeof(float); + float *out = (float *) eop.GetOutputBufferPtr(); + int num_floats = eop.GetOutputBufferSizeInBytes() / sizeof(float); for (int i = 0; i < num_floats / 7; i++) { int index = (int) out[i * 7 + 0]; @@ -443,13 +419,14 @@ bool WriteFrameOutput(const ExecutionObject &eo_in, void ProcessArgs(int argc, char *argv[], std::string& config, - uint32_t& num_devices, DeviceType& device_type, - std::string& input_file) + uint32_t& num_dsps, uint32_t& num_eves, + DeviceType& device_type, std::string& input_file) { const struct option long_options[] = { {"config", required_argument, 0, 'c'}, - {"num_devices", required_argument, 0, 'n'}, + {"num_dsps", required_argument, 0, 'd'}, + {"num_eves", required_argument, 0, 'e'}, {"image_file", required_argument, 0, 'i'}, {"help", no_argument, 0, 'h'}, {"verbose", no_argument, 0, 'v'}, @@ -460,7 +437,8 @@ void ProcessArgs(int argc, char *argv[], std::string& config, while (true) { - int c = getopt_long(argc, argv, "c:n:i:hv", long_options, &option_index); + int c = getopt_long(argc, argv, "c:d:e:i:hv", long_options, + &option_index); if (c == -1) break; @@ -470,8 +448,14 @@ void ProcessArgs(int argc, char *argv[], std::string& config, case 'c': config = optarg; break; - case 'n': num_devices = atoi(optarg); - assert (num_devices > 0 && num_devices <= 4); + case 'd': num_dsps = atoi(optarg); + assert (num_dsps > 0 && num_dsps <= + Executor::GetNumDevices(DeviceType::DSP)); + break; + + case 'e': num_eves = atoi(optarg); + assert (num_eves > 0 && num_eves <= + Executor::GetNumDevices(DeviceType::EVE)); break; case 'i': input_file = optarg; @@ -507,7 +491,8 @@ void DisplayHelp() "Default is jdetnet.\n" "Optional arguments:\n" " -c Valid configs: jdetnet \n" - " -n Number of cores to use (1 - 4)\n" + " -d Number of dsp cores to use\n" + " -e Number of eve cores to use\n" " -i Path to the image file\n" " Default is 1 frame in testvecs\n" " -i camera Use camera as input\n" diff --git a/tidl_api/Makefile b/tidl_api/Makefile index 05a3704..3fc6a2c 100644 --- a/tidl_api/Makefile +++ b/tidl_api/Makefile @@ -39,7 +39,8 @@ AR = ar SRCS = ocl_device.cpp configuration_parser.cpp configuration.cpp\ - executor.cpp execution_object.cpp trace.cpp util.cpp + executor.cpp execution_object.cpp trace.cpp util.cpp \ + execution_object_pipeline.cpp 
SRCS_IMGUTIL = imgutil.cpp OBJS = $(SRCS:.cpp=.o) @@ -53,8 +54,7 @@ HOST_OBJ_IMGUTIL_FILES = $(addprefix obj/,$(OBJS_IMGUTIL)) HEADERS = src/common_defines.h src/executor_impl.h src/ocl_device.h HEADERS += src/parameters.h src/tidl_create_params.h src/trace.h src/util.h HEADERS += inc/configuration.h inc/execution_object.h inc/executor.h -HEADERS += inc/imgutil.h src/device_arginfo.h - +HEADERS += inc/imgutil.h src/device_arginfo.h inc/execution_object_pipeline.h ifeq ($(BUILD), debug) CXXFLAGS += -Og -g -ggdb diff --git a/tidl_api/inc/execution_object.h b/tidl_api/inc/execution_object.h index e78ad2e..c1d86fc 100644 --- a/tidl_api/inc/execution_object.h +++ b/tidl_api/inc/execution_object.h @@ -31,6 +31,7 @@ #pragma once #include +#include "execution_object_internal.h" namespace tidl { @@ -39,13 +40,12 @@ class Device; class LayerOutput; class IODeviceArgInfo; -typedef std::vector> LayerOutputs; /*! @class ExecutionObject @brief Runs the TIDL network on an OpenCL device */ -class ExecutionObject +class ExecutionObject : public ExecutionObjectInternalInterface { public: @@ -55,6 +55,8 @@ class ExecutionObject const ArgInfo& create_arg, const ArgInfo& param_heap_arg, size_t extmem_heap_size, + int layersGroupId, + bool output_trace, bool internal_input); //! @private ~ExecutionObject(); @@ -62,52 +64,56 @@ class ExecutionObject //! Specify the input and output buffers used by the EO //! @param in buffer used for input. //! @param out buffer used for output. - void SetInputOutputBuffer (const ArgInfo& in, const ArgInfo& out); + void SetInputOutputBuffer(const ArgInfo& in, + const ArgInfo& out) override; //! Returns a pointer to the input buffer set via SetInputOutputBuffer - char* GetInputBufferPtr() const; + char* GetInputBufferPtr() const override; //! Returns size of the input buffer - size_t GetInputBufferSizeInBytes() const; + size_t GetInputBufferSizeInBytes() const override; + + //! Returns a pointer to the output buffer + char* GetOutputBufferPtr() const override; + + //! Returns size of the output buffer + size_t GetOutputBufferSizeInBytes() const override; //! @brief Set the frame index of the frame currently processed by the //! ExecutionObject. Used for trace/debug messages //! @param idx index of the frame - void SetFrameIndex(int idx); + void SetFrameIndex(int idx) override; //! Returns the index of a frame being processed (set by SetFrameIndex) - int GetFrameIndex() const; - - //! Returns a pointer to the output buffer - char* GetOutputBufferPtr() const; - - //! Returns the number of bytes written to the output buffer - size_t GetOutputBufferSizeInBytes() const; + int GetFrameIndex() const override; - //! @brief Start processing a frame. The call is asynchronous and returns - //! immediately. Use ExecutionObject::ProcessFrameWait to wait - bool ProcessFrameStartAsync(); + //! @brief Start processing a frame. The call is asynchronous and + //! returns immediately. Use ExecutionObject::ProcessFrameWait to wait + bool ProcessFrameStartAsync() override; //! Wait for the execution object to complete processing a frame //! @return false if ExecutionObject::ProcessFrameWait was called //! without a corresponding call to //! ExecutionObject::ProcessFrameStartAsync. - bool ProcessFrameWait(); - - //! @brief return the number of cycles taken *on the device* to - //! execute the process call - //! @return Number of cycles to process a frame on the device. - uint64_t GetProcessCycles() const; + bool ProcessFrameWait() override; //! 
@brief return the number of milliseconds taken *on the device* to //! execute the process call //! @return Number of milliseconds to process a frame on the device. - float GetProcessTimeInMilliSeconds() const; + float GetProcessTimeInMilliSeconds() const override; + + //! @brief return the number of milliseconds taken *on the host* to + //! execute the process call + //! @return Number of milliseconds to process a frame on the host. + float GetHostProcessTimeInMilliSeconds() const override; + + //! Returns the device name that the ExecutionObject runs on + const std::string& GetDeviceName() const override; //! Write the output buffer for each layer to a file - //! __HxW.bin + //! \__HxW.bin void WriteLayerOutputsToFile(const std::string& filename_prefix= - "trace_dump_") const; + "trace_dump_") const override; //! Returns a LayerOutput object corresponding to a layer. //! Caller is responsible for deleting the LayerOutput object. @@ -116,10 +122,13 @@ class ExecutionObject //! @param output_index The output index of the buffer for a given //! layer. Defaults to 0. const LayerOutput* GetOutputFromLayer(uint32_t layer_index, - uint32_t output_index=0) const; + uint32_t output_index=0) const override; //! Get output buffers from all layers - const LayerOutputs* GetOutputsFromAllLayers() const; + const LayerOutputs* GetOutputsFromAllLayers() const override; + + //! Returns the layersGrupId that the ExecutionObject is processing + int GetLayersGroupId() const; //! @private // Used by the Executor @@ -127,12 +136,16 @@ class ExecutionObject bool RunAsync(CallType ct); bool Wait (CallType ct); + //! @private + // Used by the ExecutionObjectPipeline + bool AddCallback(CallType ct, void *user_data); + void AcquireLock(); + void ReleaseLock(); + ExecutionObject() = delete; ExecutionObject(const ExecutionObject&) = delete; ExecutionObject& operator=(const ExecutionObject&) = delete; - void EnableOutputBufferTrace(); - //! @private void SetInputOutputBuffer(const IODeviceArgInfo* in, const IODeviceArgInfo* out); diff --git a/tidl_api/inc/execution_object_internal.h b/tidl_api/inc/execution_object_internal.h new file mode 100644 index 0000000..816da94 --- /dev/null +++ b/tidl_api/inc/execution_object_internal.h @@ -0,0 +1,119 @@ +/****************************************************************************** + * Copyright (c) 2018 Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ + +/*! @file execution_object_internal.h */ + +#pragma once + +namespace tidl { + +class LayerOutput; + +typedef std::vector> LayerOutputs; + +/*! @cond HIDDEN_SYMBOLS + @class ExecutionObjectInternalInterface + @brief Internal interface for running the TIDL network on OpenCL devices + Do not use this internal class directly. + Please use ExecutionObject or ExecutionObejctPipeline instead. +*/ +class ExecutionObjectInternalInterface +{ + public: + virtual ~ExecutionObjectInternalInterface() {}; + + //! Specify the input and output buffers used by the EO + //! @param in buffer used for input. + //! @param out buffer used for output. + virtual void SetInputOutputBuffer(const ArgInfo& in, + const ArgInfo& out) =0; + + //! Returns a pointer to the input buffer set via SetInputOutputBuffer + virtual char* GetInputBufferPtr() const =0; + + //! Returns size of the input buffer + virtual size_t GetInputBufferSizeInBytes() const =0; + + //! Returns a pointer to the output buffer + virtual char* GetOutputBufferPtr() const =0; + + //! Returns size of the output buffer + virtual size_t GetOutputBufferSizeInBytes() const =0; + + //! @brief Set the frame index of the frame currently processed by the + //! ExecutionObject. Used for trace/debug messages + //! @param idx index of the frame + virtual void SetFrameIndex(int idx) =0; + + //! Returns the index of a frame being processed (set by SetFrameIndex) + virtual int GetFrameIndex() const =0; + + //! @brief Start processing a frame. The call is asynchronous and returns + //! immediately. Use ExecutionObject::ProcessFrameWait to wait + virtual bool ProcessFrameStartAsync() =0; + + //! Wait for the execution object to complete processing a frame + //! @return false if ExecutionObject::ProcessFrameWait was called + //! without a corresponding call to + //! ExecutionObject::ProcessFrameStartAsync. + virtual bool ProcessFrameWait() =0; + + //! @brief return the number of milliseconds taken *on the device* to + //! execute the process call + //! @return Number of milliseconds to process a frame on the device. + virtual float GetProcessTimeInMilliSeconds() const =0; + + //! @brief return the number of milliseconds taken *on the host* to + //! execute the process call + //! @return Number of milliseconds to process a frame on the host. + virtual float GetHostProcessTimeInMilliSeconds() const =0; + + //! Returns the device name that the ExecutionObject runs on + virtual const std::string& GetDeviceName() const =0; + + //! Write the output buffer for each layer to a file + //! \__HxW.bin + virtual void WriteLayerOutputsToFile(const std::string& filename_prefix= + "trace_dump_") const =0; + + //! Returns a LayerOutput object corresponding to a layer. + //! Caller is responsible for deleting the LayerOutput object. + //! @see LayerOutput + //! @param layer_index The layer index of the layer + //! 
@param output_index The output index of the buffer for a given + //! layer. Defaults to 0. + virtual const LayerOutput* GetOutputFromLayer(uint32_t layer_index, + uint32_t output_index=0) const =0; + + //! Get output buffers from all layers + virtual const LayerOutputs* GetOutputsFromAllLayers() const =0; +}; +/*! @endcond +*/ + +} // namespace tidl diff --git a/tidl_api/inc/execution_object_pipeline.h b/tidl_api/inc/execution_object_pipeline.h new file mode 100644 index 0000000..aaa6cf0 --- /dev/null +++ b/tidl_api/inc/execution_object_pipeline.h @@ -0,0 +1,151 @@ +/****************************************************************************** + * Copyright (c) 2018 Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ + +//! @file execution_object_pipeline.h + +#pragma once +#include +#include +#include +#include + +#include "executor.h" +#include "execution_object_internal.h" +#include "execution_object.h" + +namespace tidl { + +/*! @class ExecutionObjectPipeline + @brief Manages the pipelined execution using multiple ExecutionObjects. + Each executor runs one layersGroup of the network. ExecutionObjects + must run consecutive layersGroups to form a pipelined execution. +*/ +class ExecutionObjectPipeline : public ExecutionObjectInternalInterface +{ + public: + //! @brief Create an ExecutionObjectPipeline object. + //! + //! The ExecutionObjectPipeline will take the provided ExecutionObjects + //! to create an execution pipeline. E.g. + //! @code + //! Configuration config("path to configuration file"); + //! DeviceIds ids = {DeviceId::ID0, DeviceId::ID1}; + //! Executor exe_eve(DeviceType::EVE, ids, config, 1); + //! Executor exe_dsp(DeviceType::DSP, ids, config, 2); + //! ExecutionObjectPipeline ep0({exe_eve[0], exe_dsp[0]}); + //! ExecutionObjectPipeline ep1({exe_eve[1], exe_dsp[1]}); + //! @endcode + //! + //! 
@param eos DSP or EVE ExecutionObjects forming a pipeline + ExecutionObjectPipeline(std::vector eos); + + //! @brief Tear down an ExecutionObjectPipeline and free used resources + ~ExecutionObjectPipeline(); + + //! Specify the input and output buffers used by the EOP + //! @param in buffer used for input. + //! @param out buffer used for output. + void SetInputOutputBuffer (const ArgInfo& in, + const ArgInfo& out) override; + + //! Returns a pointer to the input buffer + char* GetInputBufferPtr() const override; + + //! Returns size of the input buffer + size_t GetInputBufferSizeInBytes() const override; + + //! Returns a pointer to the output buffer + char* GetOutputBufferPtr() const override; + + //! Returns the number of bytes written to the output buffer + size_t GetOutputBufferSizeInBytes() const override; + + //! @brief Set the frame index of the frame currently processed by the + //! ExecutionObjectPipeline. Used for trace/debug messages + //! @param idx index of the frame + void SetFrameIndex(int idx) override; + + //! Returns the index of a frame being processed (set by SetFrameIndex) + int GetFrameIndex() const override; + + //! @brief Start processing a frame. The call is asynchronous and + //! returns immediately. Use ProcessFrameWait() to wait + bool ProcessFrameStartAsync() override; + + //! Wait for the executor pipeline to complete processing a frame + //! @return false if ProcessFrameWait() was called + //! without a corresponding call to + //! ExecutionObjectPipeline::ProcessFrameStartAsync(). + bool ProcessFrameWait() override; + + //! @brief return the number of milliseconds taken *on the device* to + //! execute the process call + //! @return Number of milliseconds to process a frame on the device. + float GetProcessTimeInMilliSeconds() const override; + + //! @brief return the number of milliseconds taken *on the host* to + //! execute the process call + //! @return Number of milliseconds to process a frame on the host. + float GetHostProcessTimeInMilliSeconds() const override; + + //! Return the combined device names that this pipeline runs on + const std::string& GetDeviceName() const override; + + //! Write the output buffer for each layer to a file + //! \__HxW.bin + void WriteLayerOutputsToFile(const std::string& filename_prefix= + "trace_dump_") const override; + + //! Returns a LayerOutput object corresponding to a layer. + //! Caller is responsible for deleting the LayerOutput object. + //! @see LayerOutput + //! @param layer_index The layer index of the layer + //! @param output_index The output index of the buffer for a given + //! layer. Defaults to 0. + const LayerOutput* GetOutputFromLayer(uint32_t layer_index, + uint32_t output_index=0) const override; + + //! Get output buffers from all layers + const LayerOutputs* GetOutputsFromAllLayers() const override; + + //! @private Used by runtime + //! @brief callback function at the completion of each ExecutionObject, + //! to chain the next ExectionObject for execution + void RunAsyncNext(); + + ExecutionObjectPipeline() = delete; + ExecutionObjectPipeline(const ExecutionObjectPipeline&) = delete; + ExecutionObjectPipeline& operator=(const ExecutionObjectPipeline&) + = delete; + + private: + class Impl; + std::unique_ptr pimpl_m; +}; + +} // namespace tidl diff --git a/tidl_api/inc/executor.h b/tidl_api/inc/executor.h index 23d92ff..1febfea 100644 --- a/tidl_api/inc/executor.h +++ b/tidl_api/inc/executor.h @@ -64,7 +64,7 @@ class ExecutionObject; typedef std::vector> ExecutionObjects; /*! 
@class Executor - @brief Manages the overall execution of a network using the + @brief Manages the overall execution of a layersGroup in a network using the specified configuration and the set of devices available to the executor. */ @@ -78,7 +78,7 @@ class Executor //! @code //! Configuration configuration; //! configuration.ReadFromFile("path to configuration file"); - //! DeviceIds ids1 = {DeviceId::ID2, DeviceId::ID3}; + //! DeviceIds ids = {DeviceId::ID2, DeviceId::ID3}; //! Executor executor(DeviceType::EVE, ids, configuration); //! @endcode //! @@ -98,6 +98,9 @@ class Executor //! available on this instance of the Executor const ExecutionObjects& GetExecutionObjects() const; + //! Returns a single execution object at index + ExecutionObject* operator[](uint32_t index) const; + //! @brief Returns the number of devices of the specified type //! available for TI DL. //! @param device_type DSP or EVE/EVE device @@ -106,7 +109,7 @@ class Executor //! @brief Returns a string corresponding to the API version //! - //! @return ... + //! @return \.\.\.\ static std::string GetAPIVersion(); Executor(const Executor&) = delete; diff --git a/tidl_api/src/execution_object.cpp b/tidl_api/src/execution_object.cpp index d722ebb..178bbca 100644 --- a/tidl_api/src/execution_object.cpp +++ b/tidl_api/src/execution_object.cpp @@ -31,6 +31,9 @@ #include #include #include +#include +#include +#include #include "executor.h" #include "execution_object.h" #include "trace.h" @@ -50,13 +53,24 @@ class ExecutionObject::Impl const DeviceArgInfo& create_arg, const DeviceArgInfo& param_heap_arg, size_t extmem_heap_size, + int layers_group_id, + bool output_trace, bool internal_input); ~Impl() {} bool RunAsync(CallType ct); bool Wait (CallType ct); + bool AddCallback(CallType ct, void *user_data); + + uint64_t GetProcessCycles() const; + int GetLayersGroupId() const; + void AcquireLock(); + void ReleaseLock(); Device* device_m; + // Index of the OpenCL device/queue used by this EO + uint8_t device_index_m; + std::string device_name_m; up_malloc_ddr tidl_extmem_heap_m; up_malloc_ddr shared_initialize_params_m; @@ -70,6 +84,9 @@ class ExecutionObject::Impl // Frame being processed by the EO int current_frame_idx_m; + // LayersGroupId being processed by the EO + int layers_group_id_m; + // Trace related void WriteLayerOutputsToFile (const std::string& filename_prefix) const; @@ -81,25 +98,29 @@ class ExecutionObject::Impl up_malloc_ddr trace_buf_params_m; size_t trace_buf_params_sz_m; + // host time tracking: eo start to finish + float host_time_m; + private: void SetupInitializeKernel(const DeviceArgInfo& create_arg, const DeviceArgInfo& param_heap_arg, size_t extmem_heap_size, bool internal_input); + void EnableOutputBufferTrace(); void SetupProcessKernel(); void HostWriteNetInput(); void HostReadNetOutput(); void ComputeInputOutputSizes(); - // Index of the OpenCL device/queue used by this EO - uint8_t device_index_m; - std::unique_ptr k_initialize_m; std::unique_ptr k_process_m; std::unique_ptr k_cleanup_m; - + // Guarding sole access to input/output for one frame during execution + bool is_idle_m; + std::mutex mutex_access_m; + std::condition_variable cv_access_m; }; @@ -108,6 +129,8 @@ ExecutionObject::ExecutionObject(Device* d, const ArgInfo& create_arg, const ArgInfo& param_heap_arg, size_t extmem_heap_size, + int layers_group_id, + bool output_trace, bool internal_input) { DeviceArgInfo create_arg_d(create_arg, DeviceArgInfo::Kind::BUFFER); @@ -118,6 +141,8 @@ ExecutionObject::ExecutionObject(Device* d, 
create_arg_d, param_heap_arg_d, extmem_heap_size, + layers_group_id, + output_trace, internal_input) }; } @@ -127,8 +152,11 @@ ExecutionObject::Impl::Impl(Device* d, const DeviceArgInfo& create_arg, const DeviceArgInfo& param_heap_arg, size_t extmem_heap_size, + int layers_group_id, + bool output_trace, bool internal_input): device_m(d), + device_index_m(device_index), tidl_extmem_heap_m (nullptr, &__free_ddr), shared_initialize_params_m(nullptr, &__free_ddr), shared_process_params_m(nullptr, &__free_ddr), @@ -137,23 +165,26 @@ ExecutionObject::Impl::Impl(Device* d, in_m(), out_m(), current_frame_idx_m(0), + layers_group_id_m(layers_group_id), num_network_layers_m(0), trace_buf_params_m(nullptr, &__free_ddr), trace_buf_params_sz_m(0), - device_index_m(device_index), k_initialize_m(nullptr), k_process_m(nullptr), - k_cleanup_m(nullptr) + k_cleanup_m(nullptr), + is_idle_m(true) { - SetupInitializeKernel(create_arg, param_heap_arg, extmem_heap_size, - internal_input); - - SetupProcessKernel(); - + device_name_m = device_m->GetDeviceName() + std::to_string(device_index_m); // Save number of layers in the network const TIDL_CreateParams* cp = static_cast(create_arg.ptr()); num_network_layers_m = cp->net.numLayers; + + SetupInitializeKernel(create_arg, param_heap_arg, extmem_heap_size, + internal_input); + + if (output_trace) EnableOutputBufferTrace(); + SetupProcessKernel(); } // Pointer to implementation idiom: https://herbsutter.com/gotw/_100/: @@ -168,9 +199,7 @@ char* ExecutionObject::GetInputBufferPtr() const size_t ExecutionObject::GetInputBufferSizeInBytes() const { - const DeviceArgInfo& arg = pimpl_m->in_m.GetArg(); - if (arg.ptr() == nullptr) return pimpl_m->in_size_m; - else return arg.size(); + return pimpl_m->in_size_m; } char* ExecutionObject::GetOutputBufferPtr() const @@ -180,11 +209,7 @@ char* ExecutionObject::GetOutputBufferPtr() const size_t ExecutionObject::GetOutputBufferSizeInBytes() const { - const DeviceArgInfo& arg = pimpl_m->out_m.GetArg(); - if (arg.ptr() == nullptr) - return pimpl_m->out_size_m; - else - return pimpl_m->shared_process_params_m.get()->bytesWritten; + return pimpl_m->out_size_m; } void ExecutionObject::SetFrameIndex(int idx) @@ -199,8 +224,8 @@ int ExecutionObject::GetFrameIndex() const void ExecutionObject::SetInputOutputBuffer(const ArgInfo& in, const ArgInfo& out) { - assert(in.ptr() != nullptr && in.size() > 0); - assert(out.ptr() != nullptr && out.size() > 0); + assert(in.ptr() != nullptr && in.size() >= pimpl_m->in_size_m); + assert(out.ptr() != nullptr && out.size() >= pimpl_m->out_size_m); pimpl_m->in_m = IODeviceArgInfo(in); pimpl_m->out_m = IODeviceArgInfo(out); @@ -215,6 +240,7 @@ void ExecutionObject::SetInputOutputBuffer(const IODeviceArgInfo* in, bool ExecutionObject::ProcessFrameStartAsync() { + assert(GetInputBufferPtr() != nullptr && GetOutputBufferPtr() != nullptr); return pimpl_m->RunAsync(ExecutionObject::CallType::PROCESS); } @@ -233,21 +259,26 @@ bool ExecutionObject::Wait (CallType ct) return pimpl_m->Wait(ct); } -uint64_t ExecutionObject::GetProcessCycles() const +bool ExecutionObject::AddCallback(CallType ct, void *user_data) { - uint8_t factor = 1; - - // ARP32 running at half frequency of VCOP, multiply by 2 for VCOP cycles - if (pimpl_m->device_m->type() == CL_DEVICE_TYPE_CUSTOM) - factor = 2; - - return pimpl_m->shared_process_params_m.get()->cycles * factor; + return pimpl_m->AddCallback(ct, user_data); } float ExecutionObject::GetProcessTimeInMilliSeconds() const { float frequency = pimpl_m->device_m->GetFrequencyInMhz() 
* 1000000; - return ((float)GetProcessCycles())/frequency * 1000; + return ((float)pimpl_m->GetProcessCycles()) / frequency * 1000; +} + +float ExecutionObject::GetHostProcessTimeInMilliSeconds() const +{ + return pimpl_m->host_time_m; +} + +void +ExecutionObject::WriteLayerOutputsToFile(const std::string& filename_prefix) const +{ + pimpl_m->WriteLayerOutputsToFile(filename_prefix); } const LayerOutput* ExecutionObject::GetOutputFromLayer( @@ -261,37 +292,25 @@ const LayerOutputs* ExecutionObject::GetOutputsFromAllLayers() const return pimpl_m->GetOutputsFromAllLayers(); } -// -// Allocate an OpenCL buffer for TIDL layer output buffer metadata. -// The device will populate metadata for every buffer that is used as an -// output buffer by a layer. -// -void ExecutionObject::EnableOutputBufferTrace() +int ExecutionObject::GetLayersGroupId() const { - pimpl_m->trace_buf_params_sz_m = (sizeof(OCL_TIDL_BufParams)* - pimpl_m->num_network_layers_m* - TIDL_NUM_OUT_BUFS); - - pimpl_m->trace_buf_params_m.reset(malloc_ddr - (pimpl_m->trace_buf_params_sz_m)); + return pimpl_m->layers_group_id_m; +} - // Device will update bufferId if there is valid data for the entry - OCL_TIDL_BufParams* bufferParams = pimpl_m->trace_buf_params_m.get(); - for (uint32_t i = 0; i < pimpl_m->num_network_layers_m; i++) - for (int j = 0; j < TIDL_NUM_OUT_BUFS; j++) - { - OCL_TIDL_BufParams *bufP = - &bufferParams[i*TIDL_NUM_OUT_BUFS+j]; - bufP->bufferId = UINT_MAX; - } +const std::string& ExecutionObject::GetDeviceName() const +{ + return pimpl_m->device_name_m; } -void -ExecutionObject::WriteLayerOutputsToFile(const std::string& filename_prefix) const +void ExecutionObject::AcquireLock() { - pimpl_m->WriteLayerOutputsToFile(filename_prefix); + pimpl_m->AcquireLock(); } +void ExecutionObject::ReleaseLock() +{ + pimpl_m->ReleaseLock(); +} // // Create a kernel to call the "initialize" function @@ -342,6 +361,32 @@ ExecutionObject::Impl::SetupInitializeKernel(const DeviceArgInfo& create_arg, device_index_m)); } +// +// Allocate an OpenCL buffer for TIDL layer output buffer metadata. +// The device will populate metadata for every buffer that is used as an +// output buffer by a layer. This needs to be done before setting up +// process kernel. 
+// +void ExecutionObject::Impl::EnableOutputBufferTrace() +{ + trace_buf_params_sz_m = (sizeof(OCL_TIDL_BufParams)* + num_network_layers_m* + TIDL_NUM_OUT_BUFS); + + trace_buf_params_m.reset(malloc_ddr + (trace_buf_params_sz_m)); + + // Device will update bufferId if there is valid data for the entry + OCL_TIDL_BufParams* bufferParams = trace_buf_params_m.get(); + for (uint32_t i = 0; i < num_network_layers_m; i++) + for (int j = 0; j < TIDL_NUM_OUT_BUFS; j++) + { + OCL_TIDL_BufParams *bufP = + &bufferParams[i*TIDL_NUM_OUT_BUFS+j]; + bufP->bufferId = UINT_MAX; + } +} + // // Create a kernel to call the "process" function // @@ -514,10 +559,17 @@ bool ExecutionObject::Impl::RunAsync(CallType ct) } case CallType::PROCESS: { + std::chrono::time_point t1, t2; + t1 = std::chrono::steady_clock::now(); + shared_process_params_m->frameIdx = current_frame_idx_m; shared_process_params_m->bytesWritten = 0; HostWriteNetInput(); k_process_m->RunAsync(); + + t2 = std::chrono::steady_clock::now(); + std::chrono::duration elapsed = t2 - t1; + host_time_m = elapsed.count() * 1000; break; } case CallType::CLEANUP: @@ -551,13 +603,20 @@ bool ExecutionObject::Impl::Wait(CallType ct) } case CallType::PROCESS: { - bool has_work = k_process_m->Wait(); + float host_elapsed_ms = 0.0f; + bool has_work = k_process_m->Wait(&host_elapsed_ms); if (has_work) { if (shared_process_params_m->errorCode != OCL_TIDL_SUCCESS) throw Exception(shared_process_params_m->errorCode, __FILE__, __FUNCTION__, __LINE__); + + std::chrono::time_point t1, t2; + t1 = std::chrono::steady_clock::now(); HostReadNetOutput(); + t2 = std::chrono::steady_clock::now(); + std::chrono::duration elapsed = t2 - t1; + host_time_m += elapsed.count() * 1000 + host_elapsed_ms; } return has_work; @@ -574,6 +633,33 @@ bool ExecutionObject::Impl::Wait(CallType ct) return false; } +bool ExecutionObject::Impl::AddCallback(CallType ct, void *user_data) +{ + switch (ct) + { + case CallType::PROCESS: + { + return k_process_m->AddCallback(user_data); + break; + } + default: + return false; + } + + return false; +} + +uint64_t ExecutionObject::Impl::GetProcessCycles() const +{ + uint8_t factor = 1; + + // ARP32 running at half frequency of VCOP, multiply by 2 for VCOP cycles + if (device_m->type() == CL_DEVICE_TYPE_CUSTOM) + factor = 2; + + return shared_process_params_m.get()->cycles * factor; +} + // // Write the trace data to output files // @@ -697,3 +783,16 @@ LayerOutput::~LayerOutput() { delete[] data_m; } + +void ExecutionObject::Impl::AcquireLock() +{ + std::unique_lock lock(mutex_access_m); + cv_access_m.wait(lock, [this]{ return this->is_idle_m; }); + is_idle_m = false; +} + +void ExecutionObject::Impl::ReleaseLock() +{ + is_idle_m = true; + cv_access_m.notify_all(); +} diff --git a/tidl_api/src/execution_object_pipeline.cpp b/tidl_api/src/execution_object_pipeline.cpp new file mode 100644 index 0000000..ff84255 --- /dev/null +++ b/tidl_api/src/execution_object_pipeline.cpp @@ -0,0 +1,360 @@ +/****************************************************************************** + * Copyright (c) 2018 Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ + +#include +#include +#include +#include +#include "device_arginfo.h" +#include "execution_object_pipeline.h" + +using namespace tidl; + +class ExecutionObjectPipeline::Impl +{ + public: + Impl(std::vector &eos); + ~Impl(); + + void SetInputOutputBuffer(const ArgInfo &in, const ArgInfo &out); + bool RunAsyncStart(); + bool RunAsyncNext(); + bool Wait(); + + // Trace related + void WriteLayerOutputsToFile(const std::string& filename_prefix) const; + const LayerOutput* GetOutputFromLayer(uint32_t layer_index, + uint32_t output_index) const; + const LayerOutputs* GetOutputsFromAllLayers() const; + + //! for pipelined execution + std::vector eos_m; + std::vector iobufs_m; + + std::string device_name_m; + + //! current frame index + int frame_idx_m; + + //! current execution object index + uint32_t curr_eo_idx_m; + + // host time tracking: pipeline start to finish + float host_time_m; + + private: + //! @brief Initialize ExecutionObjectPipeline with given + //! 
ExecutionObjects: check consecutive layersGroup, allocate memory + void Initialize(); + + // flag, mutex and cond var for signaling completion and waiting + bool has_work_m, is_processed_m; + std::mutex mutex_m; + std::condition_variable cv_m; + + // host time tracking: pipeline start to finish + std::chrono::time_point start_m; +}; + +ExecutionObjectPipeline::ExecutionObjectPipeline( + std::vector eos) +{ + pimpl_m = std::unique_ptr { new Impl(eos) }; +} + +ExecutionObjectPipeline::Impl::Impl(std::vector &eos) : + eos_m(eos), has_work_m(false), is_processed_m(false) +{ + Initialize(); +} + +// Pointer to implementation idiom: https://herbsutter.com/gotw/_100/: +// Both unique_ptr and shared_ptr can be instantiated with an incomplete type +// unique_ptr's destructor requires a complete type in order to invoke delete +ExecutionObjectPipeline::~ExecutionObjectPipeline() = default; + +char* ExecutionObjectPipeline::GetInputBufferPtr() const +{ + return static_cast(pimpl_m->iobufs_m.front()->GetArg().ptr()); +} + +size_t ExecutionObjectPipeline::GetInputBufferSizeInBytes() const +{ + return pimpl_m->eos_m.front()->GetInputBufferSizeInBytes(); +} + +char* ExecutionObjectPipeline::GetOutputBufferPtr() const +{ + return static_cast(pimpl_m->iobufs_m.back()->GetArg().ptr()); +} + +size_t ExecutionObjectPipeline::GetOutputBufferSizeInBytes() const +{ + return pimpl_m->eos_m.back()->GetOutputBufferSizeInBytes(); +} + +void ExecutionObjectPipeline::SetInputOutputBuffer(const ArgInfo& in, + const ArgInfo& out) +{ + assert(in.ptr() != nullptr && in.size() >= GetInputBufferSizeInBytes()); + assert(out.ptr() != nullptr && out.size() >= GetOutputBufferSizeInBytes()); + pimpl_m->SetInputOutputBuffer(in, out); +} + +void ExecutionObjectPipeline::SetFrameIndex(int idx) +{ + pimpl_m->frame_idx_m = idx; +} + +int ExecutionObjectPipeline::GetFrameIndex() const +{ + return pimpl_m->frame_idx_m; +} + +bool ExecutionObjectPipeline::ProcessFrameStartAsync() +{ + assert(GetInputBufferPtr() != nullptr && GetOutputBufferPtr() != nullptr); + bool st = pimpl_m->RunAsyncStart(); + if (st) + st = pimpl_m->eos_m[0]->AddCallback(ExecutionObject::CallType::PROCESS, + this); + return st; +} + +bool ExecutionObjectPipeline::ProcessFrameWait() +{ + return pimpl_m->Wait(); +} + +void CallbackWrapper(void *user_data) +{ + ((ExecutionObjectPipeline *) user_data)->RunAsyncNext(); +} + +void ExecutionObjectPipeline::RunAsyncNext() +{ + bool has_next = pimpl_m->RunAsyncNext(); + if (has_next) + pimpl_m->eos_m[pimpl_m->curr_eo_idx_m]->AddCallback( + ExecutionObject::CallType::PROCESS, this); +} + +float ExecutionObjectPipeline::GetProcessTimeInMilliSeconds() const +{ + float total = 0.0f; + for (auto eo : pimpl_m->eos_m) + total += eo->GetProcessTimeInMilliSeconds(); + return total; +} + +float ExecutionObjectPipeline::GetHostProcessTimeInMilliSeconds() const +{ + return pimpl_m->host_time_m; +} + +const std::string& ExecutionObjectPipeline::GetDeviceName() const +{ + return pimpl_m->device_name_m; +} + +void +ExecutionObjectPipeline::WriteLayerOutputsToFile( + const std::string& filename_prefix) const +{ + pimpl_m->WriteLayerOutputsToFile(filename_prefix); +} + +const LayerOutput* +ExecutionObjectPipeline::GetOutputFromLayer(uint32_t layer_index, + uint32_t output_index) const +{ + return pimpl_m->GetOutputFromLayer(layer_index, output_index); +} + +const LayerOutputs* +ExecutionObjectPipeline::GetOutputsFromAllLayers() const +{ + return pimpl_m->GetOutputsFromAllLayers(); +} + + +/// Impl methods start here + + +static +void* 
AllocateMem(size_t size) +{ + if (size == 0) return nullptr; + void *ptr = malloc(size); + if (ptr == nullptr) + throw Exception("Out of memory, ExecutionObjectPipeline malloc failed", + __FILE__, __FUNCTION__, __LINE__); + return ptr; +} + +void ExecutionObjectPipeline::Impl::Initialize() +{ + // Check consecutive layersGroups to form a pipeline + int prev_group = 0; + for (auto eo : eos_m) + { + int group = eo->GetLayersGroupId(); + if (prev_group != 0 && group != prev_group + 1) + throw Exception( + "Non-consecutive layersGroupIds in ExecutionObjectPipeline", + __FILE__, __FUNCTION__, __LINE__); + prev_group = group; + } + + for (auto eo : eos_m) + device_name_m += eo->GetDeviceName() + "+"; + device_name_m.resize(device_name_m.size() - 1); + + // Allocate input and output memory for EOs/layersGroups + // Note that i-th EO's output buffer is the same as (i+1)-th EO's input + // So, if n EOs, then (n+1) buffers: b EO b EO b EO b ... EO b + // User must set the first input buffer and the last output buffer + size_t size; + ArgInfo in(nullptr, 0); + iobufs_m.push_back(new IODeviceArgInfo(in)); + for (auto eo : eos_m) + { + if (eo != eos_m.back()) + size = eo->GetOutputBufferSizeInBytes(); + else + size = 0; + + void *ptr = AllocateMem(size); + ArgInfo out(ptr, size); + iobufs_m.push_back(new IODeviceArgInfo(out)); + } +} + +ExecutionObjectPipeline::Impl::~Impl() +{ + int num_iobufs = iobufs_m.size(); + for (int i = 0; i < num_iobufs; i++) + { + if (! (i == 0 || i == num_iobufs-1)) + free(iobufs_m[i]->GetArg().ptr()); + delete iobufs_m[i]; + } +} + +void ExecutionObjectPipeline::Impl::SetInputOutputBuffer(const ArgInfo &in, + const ArgInfo &out) +{ + delete iobufs_m.front(); + delete iobufs_m.back(); + iobufs_m.front() = new IODeviceArgInfo(in); + iobufs_m.back() = new IODeviceArgInfo(out); +} + +bool ExecutionObjectPipeline::Impl::RunAsyncStart() +{ + start_m = std::chrono::steady_clock::now(); + has_work_m = true; + is_processed_m = false; + host_time_m = 0.0f; + curr_eo_idx_m = 0; + eos_m[0]->AcquireLock(); + eos_m[0]->SetInputOutputBuffer(iobufs_m[0], iobufs_m[1]); + return eos_m[0]->ProcessFrameStartAsync(); +} + +// returns true if we have more EOs to execute +bool ExecutionObjectPipeline::Impl::RunAsyncNext() +{ + eos_m[curr_eo_idx_m]->ProcessFrameWait(); + eos_m[curr_eo_idx_m]->ReleaseLock(); + curr_eo_idx_m += 1; + if (curr_eo_idx_m < eos_m.size()) + { + eos_m[curr_eo_idx_m]->AcquireLock(); + eos_m[curr_eo_idx_m]->SetInputOutputBuffer(iobufs_m[curr_eo_idx_m], + iobufs_m[curr_eo_idx_m+1]); + eos_m[curr_eo_idx_m]->ProcessFrameStartAsync(); + return true; + } + else + { + std::chrono::duration elapsed = std::chrono::steady_clock::now() + - start_m; + host_time_m = elapsed.count() * 1000; // seconds to milliseconds + is_processed_m = true; + cv_m.notify_all(); + return false; + } +} + +bool ExecutionObjectPipeline::Impl::Wait() +{ + if (! 
has_work_m) return false; + + std::unique_lock lock(mutex_m); + cv_m.wait(lock, [this]{ return this->is_processed_m; }); + has_work_m = false; + return true; +} + +void +ExecutionObjectPipeline::Impl::WriteLayerOutputsToFile( + const std::string& filename_prefix) const +{ + for (auto eo : eos_m) + eo->WriteLayerOutputsToFile(filename_prefix); +} + +const LayerOutput* +ExecutionObjectPipeline::Impl::GetOutputFromLayer(uint32_t layer_index, + uint32_t output_index) const +{ + const LayerOutput* lo = nullptr; + for (auto eo : eos_m) + { + lo = eo->GetOutputFromLayer(layer_index, output_index); + if (lo != nullptr) break; + } + return lo; +} + +const LayerOutputs* +ExecutionObjectPipeline::Impl::GetOutputsFromAllLayers() const +{ + LayerOutputs *all = new LayerOutputs; + for (auto eo : eos_m) + { + LayerOutputs *los = const_cast( + eo->GetOutputsFromAllLayers()); + for (auto& lo : *los) + all->push_back(std::unique_ptr{ lo.release() }); + delete los; + } + return all; +} + diff --git a/tidl_api/src/executor.cpp b/tidl_api/src/executor.cpp index b644728..914c78a 100644 --- a/tidl_api/src/executor.cpp +++ b/tidl_api/src/executor.cpp @@ -96,6 +96,12 @@ const ExecutionObjects& Executor::GetExecutionObjects() const return pimpl_m->execution_objects_m; } +ExecutionObject* Executor::operator[](uint32_t index) const +{ + assert(index < pimpl_m->execution_objects_m.size()); + return pimpl_m->execution_objects_m[index].get(); +} + bool ExecutorImpl::Initialize(const Configuration& configuration) { configuration_m = configuration; @@ -145,13 +151,11 @@ bool ExecutorImpl::Initialize(const Configuration& configuration) {new ExecutionObject(device_m.get(), index, create_arg, param_heap_arg, configuration_m.EXTMEM_HEAP_SIZE, + layers_group_id_m, + configuration_m.enableOutputTrace, configuration_m.enableInternalInput)} ); } - if (configuration_m.enableOutputTrace) - for (auto &eo : execution_objects_m) - eo->EnableOutputBufferTrace(); - for (auto &eo : execution_objects_m) eo->RunAsync(ExecutionObject::CallType::INIT); @@ -294,4 +298,3 @@ const char* Exception::what() const noexcept { return message_m.c_str(); } - diff --git a/tidl_api/src/ocl_device.cpp b/tidl_api/src/ocl_device.cpp index fba4f94..b3eaf36 100644 --- a/tidl_api/src/ocl_device.cpp +++ b/tidl_api/src/ocl_device.cpp @@ -91,7 +91,7 @@ DspDevice::DspDevice(const DeviceIds& ids, const std::string &binary_filename): // Queue 0 on device 0 queue_m[0] = clCreateCommandQueue(context_m, device_ids[0], - 0, + CL_QUEUE_PROFILING_ENABLE, &errcode); errorCheck(errcode, __LINE__); BuildProgramFromBinary(binary_filename, device_ids, 1); @@ -139,7 +139,7 @@ DspDevice::DspDevice(const DeviceIds& ids, const std::string &binary_filename): int index = static_cast(id); queue_m[index] = clCreateCommandQueue(context_m, sub_devices[index], - 0, + CL_QUEUE_PROFILING_ENABLE, &errcode); errorCheck(errcode, __LINE__); } @@ -187,7 +187,7 @@ EveDevice::EveDevice(const DeviceIds& ids, const std::string &kernel_names): int index = static_cast(id); queue_m[index] = clCreateCommandQueue(context_m, all_device_ids[index], - 0, + CL_QUEUE_PROFILING_ENABLE, &errcode); errorCheck(errcode, __LINE__); } @@ -317,7 +317,7 @@ Kernel& Kernel::RunAsync() } -bool Kernel::Wait() +bool Kernel::Wait(float *host_elapsed_ms) { // Wait called without a corresponding RunAsync if (!is_running_m) @@ -326,6 +326,17 @@ bool Kernel::Wait() TRACE::print("\tKernel: waiting...\n"); cl_int ret = clWaitForEvents(1, &event_m); errorCheck(ret, __LINE__); + + if (host_elapsed_ms != nullptr) + { + cl_ulong 
t_que, t_end; + clGetEventProfilingInfo(event_m, CL_PROFILING_COMMAND_QUEUED, + sizeof(cl_ulong), &t_que, nullptr); + clGetEventProfilingInfo(event_m, CL_PROFILING_COMMAND_END, + sizeof(cl_ulong), &t_end, nullptr); + *host_elapsed_ms = (t_end - t_que) / 1.0e6; // nano to milli seconds + } + ret = clReleaseEvent(event_m); errorCheck(ret, __LINE__); TRACE::print("\tKernel: finished execution\n"); @@ -334,6 +345,22 @@ bool Kernel::Wait() return true; } +extern void CallbackWrapper(void *user_data) __attribute__((weak)); + +static +void EventCallback(cl_event event, cl_int exec_status, void *user_data) +{ + if (exec_status != CL_SUCCESS || user_data == nullptr) return; + if (CallbackWrapper) CallbackWrapper(user_data); +} + +bool Kernel::AddCallback(void *user_data) +{ + if (! is_running_m) return false; + return clSetEventCallback(event_m, CL_COMPLETE, EventCallback, user_data) + == CL_SUCCESS; +} + Kernel::~Kernel() { for (auto b : buffers_m) diff --git a/tidl_api/src/ocl_device.h b/tidl_api/src/ocl_device.h index 6e80166..04c5db6 100644 --- a/tidl_api/src/ocl_device.h +++ b/tidl_api/src/ocl_device.h @@ -74,6 +74,8 @@ class Device static uint32_t GetNumDevices(DeviceType device_type); + virtual std::string GetDeviceName() = 0; + protected: static const int MAX_DEVICES = 4; @@ -101,6 +103,8 @@ class DspDevice: public Device DspDevice(const DspDevice&) = delete; DspDevice& operator=(const DspDevice&) = delete; + virtual std::string GetDeviceName() { return "DSP"; } + protected: bool BuildProgramFromBinary(const std::string &binary_filename, cl_device_id device_ids[], @@ -117,6 +121,8 @@ class EveDevice : public Device EveDevice(const EveDevice&) = delete; EveDevice& operator=(const EveDevice&) = delete; + virtual std::string GetDeviceName() { return "EVE"; } + protected: bool BuildProgramFromBinary(const std::string &kernel_names, cl_device_id device_ids[], @@ -137,7 +143,8 @@ class Kernel ~Kernel(); Kernel& RunAsync(); - bool Wait(); + bool Wait(float *host_elapsed_ms = nullptr); + bool AddCallback(void *user_data); private: cl_kernel kernel_m;
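
For reference, a minimal usage sketch of the ExecutionObjectPipeline API introduced by this patch, condensed from the updated examples/ssd_multibox/main.cpp. The helper name RunPipelines, the EVE/DSP layersGroup split (1 and 2), and the frame-I/O placeholders are illustrative assumptions; error handling, timing reports, and camera/file input are omitted.

    #include <algorithm>
    #include <cassert>
    #include <cstdint>
    #include <cstdlib>
    #include <vector>

    #include "configuration.h"
    #include "executor.h"
    #include "execution_object.h"
    #include "execution_object_pipeline.h"

    using namespace tidl;

    // Hypothetical driver: EVE runs layersGroupId 1, DSP runs layersGroupId 2.
    bool RunPipelines(const Configuration& configuration,
                      uint32_t num_eves, uint32_t num_dsps, int num_frames)
    {
        DeviceIds ids_eve, ids_dsp;
        for (uint32_t i = 0; i < num_eves; i++)
            ids_eve.insert(static_cast<DeviceId>(i));
        for (uint32_t i = 0; i < num_dsps; i++)
            ids_dsp.insert(static_cast<DeviceId>(i));

        Executor exe_eve(DeviceType::EVE, ids_eve, configuration, 1);
        Executor exe_dsp(DeviceType::DSP, ids_dsp, configuration, 2);

        // One EOP per EVE/DSP pair; each EOP chains the two layersGroups.
        uint32_t num_eops = std::max(num_eves, num_dsps);
        std::vector<ExecutionObjectPipeline*> eops;
        for (uint32_t i = 0; i < num_eops; i++)
            eops.push_back(new ExecutionObjectPipeline(
                              {exe_eve[i % num_eves], exe_dsp[i % num_dsps]}));

        // The application owns only the first input and last output buffers;
        // intermediate buffers between layersGroups are allocated by the EOP.
        std::vector<void *> buffers;
        for (auto eop : eops)
        {
            size_t in_size  = eop->GetInputBufferSizeInBytes();
            size_t out_size = eop->GetOutputBufferSizeInBytes();
            void *in_ptr  = malloc(in_size);
            void *out_ptr = malloc(out_size);
            assert(in_ptr != nullptr && out_ptr != nullptr);
            eop->SetInputOutputBuffer(ArgInfo(in_ptr,  in_size),
                                      ArgInfo(out_ptr, out_size));
            buffers.push_back(in_ptr);
            buffers.push_back(out_ptr);
        }

        // Pipelined frame loop; num_eops extra iterations flush the epilogue.
        for (int idx = 0; idx < num_frames + (int)num_eops; idx++)
        {
            ExecutionObjectPipeline *eop = eops[idx % num_eops];

            // Wait for the previous frame issued on this EOP, then consume
            // its result from eop->GetOutputBufferPtr().
            if (eop->ProcessFrameWait())
            {
                /* consume output, e.g. WriteFrameOutput(*eop, ...) */
            }

            if (idx < num_frames)
            {
                /* fill eop->GetInputBufferPtr() with the next frame */
                eop->SetFrameIndex(idx);
                eop->ProcessFrameStartAsync();
            }
        }

        for (auto eop : eops) delete eop;
        for (auto b : buffers) free(b);
        return true;
    }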