Subgraph offloading to TIDL: first commit
authorYuan Zhao <yuanzhao@ti.com>
Wed, 23 Oct 2019 19:21:09 +0000 (14:21 -0500)
committerYuan Zhao <yuanzhao@ti.com>
Mon, 28 Oct 2019 20:10:07 +0000 (15:10 -0500)
- ResM class provides top level encapsulation
- All allocation of core resources and buffers, and all creation of
  Executor, ExecutionObject, ExecutionObjectPipeline are encapsulated.
- Auto-partition last few layers to DSP if profitable, also encapsulated.
- MCT-1223, MCT-1224

tidl_api/Makefile
tidl_api/inc/subgraph_runtime.h [new file with mode: 0644]
tidl_api/src/subgraph_data_conv.h [new file with mode: 0644]
tidl_api/src/subgraph_runtime.cpp [new file with mode: 0644]

index 8da13e482cc408f2f852506a2c98bd8c5131cd27..988cdc94f7c6706a1057612a3c1e5edb20c61f2b 100644 (file)
@@ -40,7 +40,8 @@ AR = ar
 
 SRCS = ocl_device.cpp configuration_parser.cpp configuration.cpp\
           executor.cpp execution_object.cpp trace.cpp util.cpp \
-       execution_object_pipeline.cpp
+       execution_object_pipeline.cpp \
+       subgraph_runtime.cpp
 SRCS_IMGUTIL = imgutil.cpp
 SRCS_PYBIND  = pybind_eo.cpp pybind_eop.cpp pybind_executor.cpp \
                           pybind_configuration.cpp pybind_helpers.cpp
diff --git a/tidl_api/inc/subgraph_runtime.h b/tidl_api/inc/subgraph_runtime.h
new file mode 100644 (file)
index 0000000..09cf970
--- /dev/null
@@ -0,0 +1,140 @@
+/******************************************************************************
+ * Copyright (c) 2019 Texas Instruments Incorporated - http://www.ti.com/
+ *  All rights reserved.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *      * Redistributions of source code must retain the above copyright
+ *        notice, this list of conditions and the following disclaimer.
+ *      * Redistributions in binary form must reproduce the above copyright
+ *        notice, this list of conditions and the following disclaimer in the
+ *        documentation and/or other materials provided with the distribution.
+ *      * Neither the name of Texas Instruments Incorporated nor the
+ *        names of its contributors may be used to endorse or promote products
+ *        derived from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ *  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ *  THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+//! @file subgraph_runtime.h
+
+#pragma once
+#include <vector>
+#include <mutex>
+#include <condition_variable>
+#include "execution_object_pipeline.h"
+
+
+namespace tidl {
+
+#if 0
+// Auto-generated code from Relay/TVM compilation step after
+// partitioning and lowering to backend implementation
+
+// TODO: need to figure out exact arguments and format
+extern void tidl::RunSubgraphImpl(int subgraph_id,
+                                  const std::vector<float*>&,
+                                  const std::vector<float*>&);
+
+void tidlRunSubgraph(int subgraph_id,
+                     int num_input_tensors, int num_output_tensors,
+                     PackedArgs args)
+{
+  std::vector<float *> in_data, out_data;
+
+  for (int i = 0; i < num_input_tensors + num_output_tensors; i++)
+    if (i < num_input_tensors)
+      in_data.push_back(args.data[i]);
+    else
+      out_data.push_back(args.data[i]);
+
+  tidl::RunSubgraphImpl(subgraph_id, in_data, out_data);
+}
+#endif
+
+
+#if 0
+// user application code
+// subgraph_id will be used to find TIDL config file
+// e.g. subgraph_1.cfg, subgraph_2.cfg, etc
+void RunSubgraphImpl(int subgraph_id,
+                     int total_num_subgraphs,
+                     const std::vector<float*>& ext_in_data,
+                     const std::vector<float*>& ext_out_data)
+{
+  ResM& res = ResM::Instance(total_num_subgraphs);
+  const ExecutionObjectPipeline& eop = res.GetEOP(subgraph_id);
+  const SubgraphDataConv& in_conv    = res.GetInConv(subgraph_id);
+  const SubgraphDataConv& out_conv   = res.GetOutConv(subgraph_id);
+
+  in_data = eop.GetInputBufferPtr();
+  in_conv.ScaleQuant(ext_in_data, in_data);
+  eop.ProcessFrameStartAsync();
+  eop.ProcessFrameWait();
+  out_data = eop.GetOutputBufferPtr();
+  out_conv.ScaleDeQuant(out_data, ext_out_data);
+  res.FreeEOP(subgraph_id, eop);
+}
+#endif 
+
+
+// Singleton ResM (declaration)
+// Resource manager for available EVE and DSP devices,
+//   - Allocates EVEs and DSPs
+//   - Constructs Executors (tidl_setup) and ExecutionObjects (tidl_init)
+//   - Creates set of ExecutionObjectPipelines (with or without DSP)
+//   - Allocates EOPs on demand (acquire and free semantics)
+//   - Allocates input/output buffers
+class ResM {
+  public:
+    ResM();
+    ~ResM();
+    // Returns the singleton instance and runs Init() on it.  Only the first
+    // call that finds the manager uninitialized uses total_num_subgraphs;
+    // later calls leave the existing setup untouched.
+    static ResM& Instance(uint32_t total_num_subgraphs = 1);
+
+    // Acquire an EOP for the given subgraph.  On first use for a subgraph
+    // this reads "subgraph<id>.cfg" and constructs the Executors, EOPs and
+    // their input/output buffers.  Blocks until the selected EOP is free.
+    ExecutionObjectPipeline* GetEOP(uint32_t subgraph_id);
+    // Release an EOP previously returned by GetEOP() for the same subgraph
+    // and wake up threads waiting in GetEOP().
+    void                     FreeEOP(uint32_t subgraph_id,
+                                     ExecutionObjectPipeline* eop);
+    // Configuration of a subgraph; asserts that GetEOP() has already
+    // constructed the EOPs for this subgraph.
+    Configuration&           GetConfiguration(uint32_t subgraph_id);
+    //const SubgraphDataConv&        GetInConv(uint32_t subgraph_id);
+    //const SubgraphDataConv&        GetOutConv(uint32_t subgraph_id);
+
+
+  private:
+    // One-time setup of core counts and per-subgraph containers
+    void Init(uint32_t num_subgraphs);
+
+    bool     enable_trace_m;         // set when TIDL_SUBGRAPH_TRACE is defined
+    uint32_t num_subgraphs_m;        // 0 means Init() has not run yet
+    uint32_t num_es_per_subgraph_m;  // cores assigned to each subgraph
+    uint32_t num_eves_m;
+    uint32_t num_dsps_m;
+    // number of DSPs already handed out to run layer group 2
+    // (partitioned EVE + DSP execution case)
+    uint32_t num_lg2_dsps_used_m;
+    std::mutex mutex_init_m;         // serializes Init()
+
+    // indexed by subgraph_id for resources
+    struct ResEOP {
+      ResEOP() : free_eop_index(0), is_used(), eops(nullptr) {}
+
+      uint32_t free_eop_index;          // next EOP to hand out (round robin)
+      std::mutex mutex_eops;            // guards all fields of this struct
+      std::condition_variable cv_eops;  // notified by FreeEOP()
+      std::vector<bool> is_used;        // is_used[i] <=> (*eops)[i] acquired
+      // nullptr until the first GetEOP() call for this subgraph
+      std::vector<ExecutionObjectPipeline*>* eops;
+    };
+    std::vector<Configuration> cs_m;    // per-subgraph configuration
+    std::vector<Executor*> es_m;        // per-subgraph Executor, layer group 1
+    std::vector<Executor*> e2s_m;       // per-subgraph DSP Executor for layer
+                                        // group 2, nullptr if not partitioned
+    std::vector<ResEOP> *eops_m;        // per-subgraph EOPs, set by Init()
+};
+
+} // namespace tidl
diff --git a/tidl_api/src/subgraph_data_conv.h b/tidl_api/src/subgraph_data_conv.h
new file mode 100644 (file)
index 0000000..24920fc
--- /dev/null
@@ -0,0 +1,128 @@
+/******************************************************************************
+ * Copyright (c) 2019 Texas Instruments Incorporated - http://www.ti.com/
+ *  All rights reserved.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *      * Redistributions of source code must retain the above copyright
+ *        notice, this list of conditions and the following disclaimer.
+ *      * Redistributions in binary form must reproduce the above copyright
+ *        notice, this list of conditions and the following disclaimer in the
+ *        documentation and/or other materials provided with the distribution.
+ *      * Neither the name of Texas Instruments Incorporated nor the
+ *        names of its contributors may be used to endorse or promote products
+ *        derived from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ *  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ *  THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+namespace tidl {
+
+/* @class SubgraphDataConv
+   @brief Handles data conversion at subgraph boundaries
+          At calibration time, consume either external input or external
+          output tensors, determine sign and scaling factor.
+          At inference time, use sign and scaling factor to perform data
+          conversion between TIDL tensors and external tensors
+
+   Example use for EstScaleQuant:
+     SubgraphDataConv conv({}, {}, {}, {1,3,64,64,1,3,28,28});
+     conv.EstScaleQuant(in);
+     WriteQuantizationParams(conv.GetIsSigned(), conv.GetScaleQ());
+     conv.ScaleQuant(in, out);
+
+   Example use for EstScaleDequant:
+     SubgraphDataConv conv({}, {}, {}, {1,3,64,64,1,3,28,28});
+     conv.EstScaleDequant(out);
+     WriteDeQuantizationParams(conv.GetIsSigned(), conv.GetScaleQ());
+
+   Example use for ScaleQuant:
+     // one time setup
+     ... Parse json file for is_signed, scaleQ, is_NCHW, dims ...
+     SubgraphDataConv conv(is_signed, scaleQ, is_NCHW, dims);
+
+     // per inference
+     out = eop.GetInputBufferPtr();
+     conv.ScaleQuant(in, out);
+     eop.ProcessFrameStartAsync();
+
+   Example use for ScaleDequant:
+     // one time setup
+     ... Parse json file for is_signed, scaleQ, is_NCHW, dims ...
+     SubgraphDataConv conv(is_signed, scaleQ, is_NCHW, dims);
+
+     // per inference
+     eop.ProcessFrameWait();
+     in = eop.GetOutputBufferPtr();
+     conv.ScaleDequant(in, out);
+*/
+class SubgraphDataConv
+{
+    public:
+        //! @brief Creates a SubgraphDataConv with empty parameters.
+        //! @param None
+        SubgraphDataConv() {}
+
+        //! @brief Creates a SubgraphDataConv from known parameters.
+        //! @param is_signed whether each tensor is evaluated as signed char
+        //! @param scaleQ    Q value of each tensor
+        //! @param is_NCHW   whether each external tensor is in NCHW format
+        //! @param dims      flattened 4d dims of the external tensors
+        SubgraphDataConv(const std::vector<bool>& is_signed,
+                         const std::vector<float>& scaleQ,
+                         const std::vector<bool>& is_NCHW,
+                         const std::vector<int>& dims
+                        ) : is_signed_m(is_signed), scaleQ_m(scaleQ),
+                            is_NCHW_m(is_NCHW), dims_m(dims)
+                        {}
+
+        // Read-only accessors, callable on const objects
+        const std::vector<bool>&  GetIsSigned() const { return is_signed_m; }
+        const std::vector<float>& GetScaleQ()   const { return scaleQ_m; }
+        const std::vector<bool>&  GetIsNCHW()   const { return is_NCHW_m; }
+
+        //! @brief Estimate parameters for Quantization
+        //! @param in vector of floating point external tensor data at input
+        void EstScaleQuant(const std::vector<float*>& in);
+
+        //! @brief Estimate parameters for DeQuantization
+        //! @param out vector of floating point external tensor data at output
+        void EstScaleDequant(const std::vector<float*>& out);
+
+        //! @brief Quantizes floating point {in} to 8-bit Quantized {out}
+        //!        and transposes buffer from NHWC to NCHW format (if needed),
+        //!        results are put into out pointer consecutively, as expected
+        //!        by TIDL
+        //! @param in floating point vector input to quantize
+        //! @param out 8-bit Quantized output (quantized from in)
+        void ScaleQuant(const std::vector<float*>& in, uint8_t* out);
+
+        //! @brief De-Quantizes 8-bit Quantized {in} to floating point {out}
+        //!        and transposes buffer from NCHW to NHWC format (if needed),
+        //!        the results are put into out vector, one vector per
+        //!        tensor, as expected by external tensors
+        //! @param in 8-bit Quantized input to De-Quantize
+        //! @param out floating point output (De-Quantized from in)
+        void ScaleDequant(const uint8_t *in, std::vector<float*>& out);
+
+    private:
+        //! if tensor needs to be evaluated as signed char
+        std::vector<bool> is_signed_m;
+
+        //! Q value for Quantization and Dequantization
+        std::vector<float> scaleQ_m;
+
+        //! the format of external tensors, NCHW or NHWC
+        //! if data needs to be transposed between TIDL NCHW tensors and
+        //! external tensors
+        std::vector<bool> is_NCHW_m;
+
+        //! flattened 4d dims of external tensors
+        std::vector<int> dims_m;
+};
+
+}  // namespace tidl
diff --git a/tidl_api/src/subgraph_runtime.cpp b/tidl_api/src/subgraph_runtime.cpp
new file mode 100644 (file)
index 0000000..5445b9b
--- /dev/null
@@ -0,0 +1,321 @@
+/******************************************************************************
+ * Copyright (c) 2019 Texas Instruments Incorporated - http://www.ti.com/
+ *  All rights reserved.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *      * Redistributions of source code must retain the above copyright
+ *        notice, this list of conditions and the following disclaimer.
+ *      * Redistributions in binary form must reproduce the above copyright
+ *        notice, this list of conditions and the following disclaimer in the
+ *        documentation and/or other materials provided with the distribution.
+ *      * Neither the name of Texas Instruments Incorporated nor the
+ *        names of its contributors may be used to endorse or promote products
+ *        derived from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ *  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ *  THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+#include <pthread.h>
+#define LOKI_PTHREAD_H
+#include <loki/Singleton.h>
+
+#include "util.h"
+#include "subgraph_runtime.h"
+
+
+#if 0
+// Auto-generated code from Relay/TVM compilation step after
+// partitioning and lowering to backend implementation
+
+// TODO: need to figure out exact arguments and format
+extern void tidl::RunSubgraphImpl(int subgraph_id,
+                                  const std::vector<float*>&,
+                                  const std::vector<float*>&);
+
+void tidlRunSubgraph(int subgraph_id,
+                     int num_input_tensors, int num_output_tensors,
+                     PackedArgs args)
+{
+  std::vector<float *> in_data, out_data;
+
+  for (int i = 0; i < num_input_tensors + num_output_tensors; i++)
+    if (i < num_input_tensors)
+      in_data.push_back(args.data[i]);
+    else
+      out_data.push_back(args.data[i]);
+
+  tidl::RunSubgraphImpl(subgraph_id, in_data, out_data);
+}
+#endif
+
+
+#if 0
+// user application code
+// subgraph_id will be used to find TIDL config file
+// e.g. subgraph_1.cfg, subgraph_2.cfg, etc
+void RunSubgraphImpl(int subgraph_id,
+                     int total_num_subgraphs,
+                     const std::vector<float*>& ext_in_data,
+                     const std::vector<float*>& ext_out_data)
+{
+  ResM& res = ResM::Instance(total_num_subgraphs);
+  const ExecutionObjectPipeline& eop = res.GetEOP(subgraph_id);
+  const SubgraphDataConv& in_conv    = res.GetInConv(subgraph_id);
+  const SubgraphDataConv& out_conv   = res.GetOutConv(subgraph_id);
+
+  in_data = eop.GetInputBufferPtr();
+  in_conv.ScaleQuant(ext_in_data, in_data);
+  eop.ProcessFrameStartAsync();
+  eop.ProcessFrameWait();
+  out_data = eop.GetOutputBufferPtr();
+  out_conv.ScaleDeQuant(out_data, ext_out_data);
+  res.FreeEOP(subgraph_id, eop);
+}
+#endif
+
+
+
+// Singleton ResM .cpp
+using namespace tidl;
+
+// Loki singleton holder that owns the one ResM instance; the policies
+// select creation with new, default lifetime and class-level locking
+using tidlSingleResM = Loki::SingletonHolder<tidl::ResM,
+                                             Loki::CreateUsingNew,
+                                             Loki::DefaultLifetime,
+                                             Loki::ClassLevelLockable>;
+
+// Constructs an uninitialized resource manager: no subgraphs, no cores and
+// no EOPs.  The real setup happens in Init(), which Instance() calls.
+// Every scalar member is given a defined value (in declaration order) so
+// that nothing is read uninitialized before Init() has run.
+ResM::ResM() : enable_trace_m(false), num_subgraphs_m(0),
+               num_es_per_subgraph_m(0), num_eves_m(0), num_dsps_m(0),
+               num_lg2_dsps_used_m(0), eops_m(nullptr)
+{
+}
+
+// Releases everything acquired by Init() and GetEOP(): the per-EOP
+// input/output buffers, the EOPs, the per-subgraph EOP vectors, the
+// per-subgraph resource table and finally the Executors.
+ResM::~ResM()
+{
+  if (eops_m != nullptr)
+  {
+    for (ResEOP& res_eop : *eops_m)
+    {
+      // Subgraphs that never had GetEOP() called own nothing
+      if (res_eop.eops == nullptr)  continue;
+
+      for (const ExecutionObjectPipeline* eop : *(res_eop.eops))
+      {
+        // Buffers were obtained with malloc() in GetEOP()
+        free(eop->GetInputBufferPtr());
+        free(eop->GetOutputBufferPtr());
+        delete eop;
+      }
+
+      // The vector holding the EOP pointers is itself heap-allocated in
+      // GetEOP(); it was previously leaked
+      delete res_eop.eops;
+      res_eop.eops = nullptr;
+    }
+    delete eops_m;
+    eops_m = nullptr;
+  }
+
+  // Executors are deleted after the EOPs that were built from their
+  // ExecutionObjects (same order as before); deleting nullptr is a no-op
+  for (const Executor* e : es_m)
+    delete e;
+  for (const Executor* e : e2s_m)
+    delete e;
+}
+
+// Fetches the singleton ResM from the Loki holder and runs Init() on it
+// with the requested subgraph count before handing it back to the caller.
+ResM& ResM::Instance(uint32_t total_num_subgraphs)
+{
+  ResM& manager = tidlSingleResM::Instance();
+  manager.Init(total_num_subgraphs);
+  return manager;
+}
+
+// One-time setup of the resource manager, serialized by mutex_init_m.
+// The first call with a non-zero subgraph count queries the available
+// cores, splits them evenly among the subgraphs and sizes the per-subgraph
+// containers; every later call is a no-op.
+// @param num_subgraphs total number of subgraphs that will be run (> 0)
+void ResM::Init(uint32_t num_subgraphs)
+{
+  std::lock_guard<std::mutex> lock(mutex_init_m);
+
+  // Already configured by an earlier call
+  if (num_subgraphs_m != 0)  return;
+
+  // A zero subgraph count would divide by zero below; reject it and leave
+  // the manager uninitialized so that a later valid call can still set it up
+  assert(num_subgraphs > 0);
+  if (num_subgraphs == 0)  return;
+
+  num_subgraphs_m = num_subgraphs;
+
+  if (getenv("TIDL_SUBGRAPH_TRACE") != nullptr)  enable_trace_m = true;
+
+  // Allocating resources
+  num_eves_m = Executor::GetNumDevices(DeviceType::EVE);
+  num_eves_m = 1; // TODO: to remove after debugging
+  num_dsps_m = Executor::GetNumDevices(DeviceType::DSP);
+
+  assert(num_eves_m > 0 || num_dsps_m > 0);
+  assert(num_subgraphs_m <= num_eves_m || num_subgraphs_m <= num_dsps_m);
+  num_es_per_subgraph_m = num_eves_m / num_subgraphs_m;
+  if (num_eves_m == 0)
+    num_es_per_subgraph_m = num_dsps_m / num_subgraphs_m;
+  // GetEOP() creates EOPs per assigned core and uses the EOP count as a
+  // modulus, so every subgraph needs at least one core
+  assert(num_es_per_subgraph_m > 0);
+
+  cs_m.resize(num_subgraphs_m);
+  es_m.resize(num_subgraphs_m, nullptr);
+  e2s_m.resize(num_subgraphs_m, nullptr);
+  eops_m = new std::vector<ResEOP>(num_subgraphs_m);
+}
+
+// Acquires an ExecutionObjectPipeline for a subgraph.
+//
+// On the first call for a given subgraph (under that subgraph's mutex):
+//   - reads the configuration from "subgraph<id>.cfg"
+//   - when EVEs and DSPs are both present and the config is not runFullNet,
+//     moves the trailing SoftMax/InnerProduct/Pooling layers to layer
+//     group 2 if a DSP is still unassigned
+//   - constructs the Executor(s) and two EOPs per assigned core
+//     (double buffering)
+//   - mallocs and attaches an input and an output buffer to every EOP
+// On every call: picks the next EOP round robin, blocks until that EOP has
+// been released with FreeEOP(), marks it used and returns it.
+//
+// @param subgraph_id index of the subgraph, must be < number of subgraphs
+// @return EOP owned by ResM; hand it back with FreeEOP() when done
+ExecutionObjectPipeline* ResM::GetEOP(uint32_t subgraph_id)
+{
+  assert(subgraph_id < num_subgraphs_m);
+  ResEOP& res_eop = (*eops_m)[subgraph_id];
+
+  std::unique_lock<std::mutex> lock(res_eop.mutex_eops);
+
+  if (res_eop.eops == nullptr)
+  {
+    if (enable_trace_m)
+      printf("Subgraph %d: initialing E/EOPs with %d cores\n",
+             subgraph_id, num_es_per_subgraph_m);
+
+    // Constructing EOPs if not already constructed
+    // Each subgraph -> num_eves_per_subgraph_m EOPs
+    // Each EOP -> use_count
+    std::string cfg_file = "subgraph" + std::to_string(subgraph_id) + ".cfg";
+    bool status = cs_m[subgraph_id].ReadFromFile(cfg_file);
+    assert(status);
+
+    // Check if last few layers can be offloaded to DSPs
+    //       and DSPs are available
+    DeviceIds e_ids, e2_ids;
+    for (uint32_t i = 0; i < num_es_per_subgraph_m; i++)
+      e_ids.insert(static_cast<DeviceId>(
+                               subgraph_id * num_es_per_subgraph_m + i));
+    // uint32_t num_dsps_used = 0;
+    if (num_eves_m > 0 && num_dsps_m > 0 && ! cs_m[subgraph_id].runFullNet)
+    {
+      sTIDL_Network_t *net = new sTIDL_Network_t;
+      bool net_status = ReadNetworkBinary(cs_m[subgraph_id].netBinFile,
+                                          reinterpret_cast<char *>(net));
+      assert(net_status);
+      // Skip the data layers at either end of the network, then walk
+      // backwards over the trailing layers that are candidates for the DSP
+      int32_t start_layer = net->numLayers -1;
+      int32_t end_layer = 0;
+      if (net->TIDLLayers[start_layer].layerType == (int32_t) TIDL_DataLayer)
+        start_layer -= 1;
+      if (net->TIDLLayers[end_layer].layerType == (int32_t) TIDL_DataLayer)
+        end_layer += 1;
+      int32_t i = start_layer;
+      for ( ; i > end_layer; i--)
+      {
+        int32_t layer_type = net->TIDLLayers[i].layerType;
+        if (layer_type != (int32_t) TIDL_SoftMaxLayer &&
+            layer_type != (int32_t) TIDL_InnerProductLayer &&
+            layer_type != (int32_t) TIDL_PoolingLayer)
+          break;
+      }
+      i += 1;  // first layer of the trailing run, if any
+      if (i <= start_layer)
+      {
+        // num_lg2_dsps_used_m is shared by all subgraphs whereas mutex_eops
+        // is per subgraph, so the DSP assignment is serialized with
+        // mutex_init_m (Init() never takes mutex_eops: no lock inversion)
+        std::lock_guard<std::mutex> dsp_lock(mutex_init_m);
+        if (num_lg2_dsps_used_m < num_dsps_m)
+        {
+          if (enable_trace_m)
+            printf("Subgraph %d: assign layers %d to %d to group 2 for DSP\n",
+                   subgraph_id, i, start_layer);
+          while (i <= start_layer)
+            cs_m[subgraph_id].layerIndex2LayerGroupId[i++] = 2;
+          e2_ids.insert(static_cast<DeviceId>(num_lg2_dsps_used_m));
+          num_lg2_dsps_used_m += 1;
+        }
+      }
+      delete net;
+    }
+
+    // No DSP assigned: the whole network runs in a single layer group
+    if (e2_ids.empty())
+      cs_m[subgraph_id].runFullNet = true;
+    cs_m[subgraph_id].enableApiTrace = enable_trace_m;
+
+    // Constructing Es and EOPs
+    res_eop.eops = new std::vector<ExecutionObjectPipeline*>;
+    uint32_t buffer_factor = 2;  // double buffering factor
+    if (num_eves_m > 0)
+    {
+      es_m[subgraph_id]  = new Executor(DeviceType::EVE, e_ids,
+                                        cs_m[subgraph_id], 1);
+      if (! e2_ids.empty())
+      {
+        e2s_m[subgraph_id] = new Executor(DeviceType::DSP, e2_ids,
+                                          cs_m[subgraph_id], 2);
+        for (uint32_t j = 0; j < buffer_factor; j++)
+          for (uint32_t i = 0; i < num_es_per_subgraph_m; i++)
+            res_eop.eops->emplace_back(new ExecutionObjectPipeline(
+                                  {(*es_m[subgraph_id])[i],
+                                   (*e2s_m[subgraph_id])[i % e2_ids.size()]}));
+      }
+      else
+      {
+        for (uint32_t j = 0; j < buffer_factor; j++)
+          for (uint32_t i = 0; i < num_es_per_subgraph_m; i++)
+            res_eop.eops->emplace_back(new ExecutionObjectPipeline(
+                                                   {(*es_m[subgraph_id])[i]}));
+      }
+    }
+    else
+    {
+      es_m[subgraph_id]  = new Executor(DeviceType::DSP, e_ids,
+                                        cs_m[subgraph_id], 1);
+      for (uint32_t j = 0; j < buffer_factor; j++)
+        for (uint32_t i = 0; i < num_es_per_subgraph_m; i++)
+          res_eop.eops->emplace_back(new ExecutionObjectPipeline(
+                                                   {(*es_m[subgraph_id])[i]}));
+    }
+
+    // size() is a size_t, which must not be printed with %d
+    if (enable_trace_m)
+      printf("Subgraph %d: Allocating input/output buffers for %zu EOPs\n",
+             subgraph_id, res_eop.eops->size());
+    // Allocate input/output buffers
+    for (auto eop : *(res_eop.eops))
+    {
+      size_t in_size  = eop->GetInputBufferSizeInBytes();
+      size_t out_size = eop->GetOutputBufferSizeInBytes();
+      void*  in_ptr   = malloc(in_size);
+      void*  out_ptr  = malloc(out_size);
+      assert(in_ptr != nullptr && out_ptr != nullptr);
+
+      ArgInfo in(in_ptr, in_size);
+      ArgInfo out(out_ptr, out_size);
+      eop->SetInputOutputBuffer(in, out);
+    }
+
+    res_eop.free_eop_index = 0;
+    res_eop.is_used.resize(res_eop.eops->size(), false);
+  }
+
+  // The EOP count is used as a modulus below
+  assert(! res_eop.eops->empty());
+
+  // Return an available EOP (round robin allocation)
+  uint32_t curr_eop = res_eop.free_eop_index;
+  res_eop.cv_eops.wait(lock, [&res_eop, curr_eop]{
+           return res_eop.is_used[curr_eop] == false; });
+  res_eop.is_used[curr_eop] = true;
+  res_eop.free_eop_index = (curr_eop + 1) % res_eop.eops->size();
+  if (enable_trace_m)
+    printf("Subgraph %d: return EOP %d for GetEOP()\n", subgraph_id, curr_eop);
+  return res_eop.eops->at(curr_eop);
+}
+
+// Releases an EOP previously obtained from GetEOP() for the same subgraph
+// and wakes up all threads waiting in GetEOP() on that subgraph.
+// An eop that does not belong to the subgraph leaves the state unchanged.
+// @param subgraph_id index of the subgraph, must be < number of subgraphs
+// @param eop         pipeline to mark as free
+void ResM::FreeEOP(uint32_t subgraph_id, ExecutionObjectPipeline* eop)
+{
+  // Same bounds check as GetEOP() and GetConfiguration()
+  assert(subgraph_id < num_subgraphs_m);
+  ResEOP& res_eop = (*eops_m)[subgraph_id];
+  {
+    std::lock_guard<std::mutex> lock(res_eop.mutex_eops);
+    // is_used is empty until GetEOP() has built the EOPs, so eops is never
+    // dereferenced while it is still nullptr
+    for (uint32_t i = 0; i < res_eop.is_used.size(); i++)
+      if (res_eop.eops->at(i) == eop)
+      {
+        res_eop.is_used[i] = false;
+        if (enable_trace_m)
+          printf("Subgraph %d: FreeEOP %d\n", subgraph_id, i);
+        break;
+      }
+  }
+  // Notify outside the lock so woken threads can take the mutex right away
+  res_eop.cv_eops.notify_all();
+}
+
+// Gives access to the Configuration stored for a subgraph.  Asserts that
+// the id is in range and that GetEOP() has already constructed the EOPs
+// for this subgraph.
+Configuration& ResM::GetConfiguration(uint32_t subgraph_id)
+{
+  assert(subgraph_id < num_subgraphs_m);
+  assert((*eops_m)[subgraph_id].eops != nullptr);
+
+  Configuration& subgraph_config = cs_m[subgraph_id];
+  return subgraph_config;
+}
+
+