Subgraph: support batch processing
author Yuan Zhao <yuanzhao@ti.com>
Thu, 31 Oct 2019 20:17:38 +0000 (15:17 -0500)
committer Yuan Zhao <yuanzhao@ti.com>
Tue, 5 Nov 2019 18:10:52 +0000 (12:10 -0600)
- MCT-1223

examples/mobilenet_subgraph/Makefile
examples/mobilenet_subgraph/main.cpp
tidl_api/Makefile
tidl_api/inc/subgraph_data_conv.h
tidl_api/inc/subgraph_runtime.h
tidl_api/src/subgraph_runtime.cpp
tidl_api/src/subgraph_runtime_impl.h [new file with mode: 0644]

index aed396ca878ba5beff088e80fe15fd371dcf15b4..ffeb69d4ce3e40b366950e92deac50f167abd12c 100644 (file)
@@ -28,6 +28,7 @@ EXE = imagenet
 
 include ../make.common
 
+# overwrite LIBS, -ltidl_api should be able to pull -lOpenCL
 LIBS     += -lopencv_highgui -lopencv_imgcodecs -lopencv_videoio\
                        -lopencv_imgproc -lopencv_core
 LIBS     += -ljson-c
index c5da64732077b48eb1593402f860eb703b04952c..5534df3660e60e329a0618021caf7d766c075791 100644 (file)
@@ -44,6 +44,7 @@
 #include "execution_object.h"
 #include "execution_object_pipeline.h"
 #include "subgraph_runtime.h"
+#include "subgraph_data_conv.h"
 #include "configuration.h"
 #include "../common/object_classes.h"
 #include "imgutil.h"
@@ -70,7 +71,8 @@ const char *default_inputs[NUM_DEFAULT_INPUTS] =
 std::unique_ptr<ObjectClasses> object_classes;
 
 bool RunConfiguration(cmdline_opts_t& opts);
-bool ReadFrame(const cmdline_opts_t& opts, VideoCapture &cap, float** inputs);
+bool ReadFrame(const cmdline_opts_t& opts, VideoCapture &cap, float** inputs,
+               int batch_size);
 bool WriteFrameOutput(float *out, const cmdline_opts_t& opts);
 void DisplayHelp();
 
@@ -140,6 +142,7 @@ bool RunConfiguration(cmdline_opts_t& opts)
     VideoCapture cap;
     if (! SetVideoInputOutput(cap, opts, "ImageNet"))  return false;
 
+    cout << "\n##### Batch size 1 testing ######\n" << endl;
     try
     {
         float **inputs = new float *[1];
@@ -152,8 +155,8 @@ bool RunConfiguration(cmdline_opts_t& opts)
           chrono::time_point<chrono::steady_clock> tloop0, tloop1;
           tloop0 = chrono::steady_clock::now();
 
-          ReadFrame(opts, cap, inputs);
-          TidlRunSubgraph(1, 0, 1, 1, inputs, outputs);
+          ReadFrame(opts, cap, inputs, 1);
+          TidlRunSubgraph(1, 0, 1, 1, 1, inputs, outputs);
           WriteFrameOutput(outputs[0], opts);
 
           tloop1 = chrono::steady_clock::now();
@@ -175,11 +178,56 @@ bool RunConfiguration(cmdline_opts_t& opts)
         status = false;
     }
 
+    int batch_size = 8;
+    cout << "\n##### Batch size " << batch_size << " testing ######\n" << endl;
+    try
+    {
+        float **inputs  = new float *[batch_size];
+        float **outputs = new float *[batch_size];
+        for (int i = 0; i < batch_size; i++)
+        {
+            inputs[i]  = new float[1*3*224*224];
+            outputs[i] = new float[1001];
+        }
+
+        chrono::time_point<chrono::steady_clock> tloop0, tloop1;
+        tloop0 = chrono::steady_clock::now();
+
+        ReadFrame(opts, cap, inputs, batch_size);
+        TidlRunSubgraph(1, 0, batch_size, 1, 1, inputs, outputs);
+        for (int i = 0; i < batch_size; i++)
+        {
+            cout << "Frame " << i << " of " << batch_size << " output:" << endl;
+            WriteFrameOutput(outputs[i], opts);
+        }
+
+        tloop1 = chrono::steady_clock::now();
+        chrono::duration<float> elapsed = tloop1 - tloop0;
+        cout << "Batch size " << batch_size
+             << " time (including read/write/opencv/print/etc): "
+             << setw(6) << setprecision(4)
+             << (elapsed.count() * 1000) << "ms" << endl;
+
+        for (int i = 0; i < batch_size; i++)
+        {
+            delete [] inputs[i];
+            delete [] outputs[i];
+        }
+        delete [] inputs;
+        delete [] outputs;
+    }
+    catch (tidl::Exception &e)
+    {
+        cerr << e.what() << endl;
+        status = false;
+    }
+
     return status;
 }
 
 
-bool ReadFrame(const cmdline_opts_t& opts, VideoCapture &cap, float** inputs)
+bool ReadFrame(const cmdline_opts_t& opts, VideoCapture &cap, float** inputs,
+               int batch_size)
 {
     Configuration c;
     c.inNumChannels = 3;;
@@ -226,8 +274,11 @@ bool ReadFrame(const cmdline_opts_t& opts, VideoCapture &cap, float** inputs)
 
     // TI DL image preprocessing, into frame_buffer
     bool status = imgutil::PreprocessImage(image, frame_buffer, c);
-    std::vector<float *> in_data_v{inputs[0]};
-    in_conv.ScaleDequant((const uint8_t *)frame_buffer, in_data_v);
+    for (int i = 0; i < batch_size; i++)
+    {
+        std::vector<float *> in_data_v{inputs[i]};
+        in_conv.ScaleDequant((const uint8_t *)frame_buffer, in_data_v);
+    }
     delete [] frame_buffer;
     return status;
 }
@@ -247,6 +298,7 @@ bool WriteFrameOutput(float *out, const cmdline_opts_t& opts)
     auto cmp = [](val_index &left, val_index &right)
                          { return left.first > right.first; };
     priority_queue<val_index, vector<val_index>, decltype(cmp)> queue(cmp);
+
     // initialize priority queue with smallest value on top
     for (int i = 0; i < k; i++)
         queue.push(val_index(out[i], i));
index ca9187859866599c9081b9da64f4effba84d856d..a04e604424aff87666a0588da7d167109a20756f 100644 (file)
@@ -95,7 +95,7 @@ $(LIB_NAME): $(HOST_OBJ_FILES)
        $(AR) cr $@ $(HOST_OBJ_FILES)
 
 $(SHARED_LIB_NAME): $(HOST_OBJ_FILES)
-       $(CXX) -shared $(HOST_OBJ_FILES) -o $@
+       $(CXX) $(CXXFLAGS) -Wl,-Bsymbolic -shared -lOpenCL $(HOST_OBJ_FILES) -o $@
 
 $(PY_LIB_NAME): $(HOST_OBJ_PYBIND_FILES) $(LIB_NAME)
        $(CXX) $(CXXFLAGS) -Wl,-Bsymbolic -shared -lOpenCL -locl_util $^ -o $@
@@ -104,7 +104,7 @@ $(LIB_IMGUTIL_NAME): $(HOST_OBJ_IMGUTIL_FILES)
        $(AR) cr $@ $(HOST_OBJ_IMGUTIL_FILES)
 
 $(SHARED_LIB_IMGUTIL_NAME): $(HOST_OBJ_IMGUTIL_FILES)
-       $(CXX) -shared $(HOST_OBJ_IMGUTIL_FILES) -o $@
+       $(CXX) $(CXXFLAGS) -Wl,-Bsymbolic -shared $(HOST_OBJ_IMGUTIL_FILES) -o $@
 
 clean::
        $(RM) -f $(LIB_NAME) $(PY_LIB_NAME)
index 49b4315ef58a156a9cc75452870dae7009266dd0..6b7c4b1b4599d63781a65df957db2c2934908864 100644 (file)
@@ -26,6 +26,8 @@
  *  THE POSSIBILITY OF SUCH DAMAGE.
  *****************************************************************************/
 
+#pragma once
+
 #include <stdint.h>
 #include <vector>
 
index c153485099acbc59efdc3de9ac332db0c21f8234..37e771defdd2fd657c85351070677d294b069e2f 100644 (file)
 //! @file subgraph_runtime.h
 
 #pragma once
-#include <vector>
-#include <mutex>
-#include <condition_variable>
-#include "execution_object_pipeline.h"
-#include "subgraph_data_conv.h"
 
 extern "C" {
 
-void TidlRunSubgraph(int total_subgraphs,
-                     int subgraph_id,
-                     int num_inputs,
-                     int num_outputs,
-                     float **inputTensors,
-                     float **outputTensors
-                    );
+//! @brief Top level inference to run a TIDL subgraph
+//! @param total_subgraphs  total number of TIDL subgraphs in whole inference
+//! @param subgraph_id  index of current TIDL subgraph
+//! @param batch_size  number of samples/inferences in this batch
+//! @param num_inputs_per_inference  number of inputs to TIDL subgraph
+//!        for every sample/inference
+//! @param num_outputs_per_inference  number of outputs from TIDL subgraph
+//!        for every sample/inference
+//! @param input_tensors  input data to TIDL subgraph, layout as
+//!        batch1_input1, batch1_input2, ..., batch1_inputM,
+//!        ... ... ...
+//!        batchN_input1, batchN_input2, ..., batchN_inputM
+//! @param output_tensors  output data from TIDL subgraph, layout as
+//!        batch1_output1, batch1_output2, ..., batch1_outputK,
+//!        ... ... ...
+//!        batchN_output1, batchN_output2, ..., batchN_outputK
+extern void TidlRunSubgraph(int total_subgraphs,
+                            int subgraph_id,
+                            int batch_size,
+                            int num_inputs_per_inference,
+                            int num_outputs_per_inference,
+                            float **input_tensors,
+                            float **output_tensors
+                           );
 
 }  // extern "C"
 
-namespace tidl {
 
 #if 0
 // Auto-generated code from Relay/TVM compilation step after
@@ -57,20 +68,25 @@ void TVM_TidlFunction(int total_subgraphs, int subgraph_id,
                      int num_input_tensors, int num_output_tensors,
                      PackedArgs args)
 {
-  float** in_data  = new float*[num_input_tensors];
-  float** out_data = new float*[num_output_tensors];
-
-  for (int i = 0; i < num_input_tensors + num_output_tensors; i++)
-    if (i < num_input_tensors)
-      in_data[i] = args.data[i];
-    else
-      out_data[i - num_input_tensors] = args.data[i];
+  float** in_data  = new float*[num_inputs_per_inference * batch_size];
+  float** out_data = new float*[num_outputs_per_inference * batch_size];
+
+  for (int j = 0; j < batch_size; j++)
+  {
+    for (int i = 0; i < num_inputs_per_inference + num_outputs_per_inference;
+         i++)
+      if (i < num_inputs_per_inference)
+        in_data[j * num_inputs_per_inference + i] = args.data[i][j];
+      else
+        out_data[j * num_outputs_per_inference + i - num_inputs_per_inference]
+                                                  = args.data[i][j];
+  }
 
   // call into this function in libtidl.so
-  // dlopen("libtidl.so")
+  // dlopen("libtidl_api.so")
   // TidlFunc = dlsym("TidlRunSubgraph");
-  (*TidlFunc)(total_subgraphs, subgraph_id,
-              num_input_tensors, num_output_tensors,
+  (*TidlFunc)(total_subgraphs, subgraph_id, batch_size,
+              num_inputs_per_inference, num_outputs_per_inference,
               in_data, out_data);
 
   delete [] in_data;
@@ -78,56 +94,3 @@ void TVM_TidlFunction(int total_subgraphs, int subgraph_id,
 }
 #endif
 
-
-// Singleton ResM   .h file
-// Resource manager for available EVE and DSP devices,
-//   - Allocates EVEs and DSPs
-//   - Constructs Executors (tidl_setup) and ExecutionObjects (tid_init)
-//   - Creates set of ExecutionPipelines (with or without DSP)
-//   - Allocating EOP on demand (acquire and free semantics)
-//   - Allocates input/output buffers
-class ResM {
-  public:
-    ResM();
-    ~ResM();
-    static ResM& Instance(uint32_t total_num_subgraphs = 1);
-
-    // how to ge
-    ExecutionObjectPipeline* GetEOP(uint32_t subgraph_id);
-    void                     FreeEOP(uint32_t subgraph_id,
-                                     ExecutionObjectPipeline* eop);
-    Configuration&           GetConfiguration(uint32_t subgraph_id);
-    const SubgraphDataConv&  GetInConv(uint32_t subgraph_id);
-    const SubgraphDataConv&  GetOutConv(uint32_t subgraph_id);
-
-
-  private:
-    void Init(uint32_t num_subgraphs);
-
-    bool     enable_trace_m;
-    uint32_t num_subgraphs_m;
-    uint32_t num_es_per_subgraph_m;
-    uint32_t num_eves_m;
-    uint32_t num_dsps_m;
-    uint32_t num_lg2_dsps_used_m;  // in partitioned execution case
-    std::mutex mutex_init_m;
-
-    // indexed by subgraph_id for resources
-    struct ResEOP {
-      ResEOP() : free_eop_index(0), is_used(), eops(nullptr) {}
-
-      uint32_t free_eop_index;
-      std::mutex mutex_eops;
-      std::condition_variable cv_eops;
-      std::vector<bool> is_used;
-      std::vector<ExecutionObjectPipeline*>* eops;
-    };
-    std::vector<Configuration> cs_m;
-    std::vector<Executor*> es_m;
-    std::vector<Executor*> e2s_m;
-    std::vector<ResEOP> *eops_m;
-    std::vector<SubgraphDataConv*> in_conv_m;
-    std::vector<SubgraphDataConv*> out_conv_m;
-};
-
-} // namespace tidl
index ad5a11abdccfbbf3c6cbfab98f3580857366f263..09905fc72ddc6d8e0d60634a033f2e632f2fcd31 100644 (file)
@@ -32,6 +32,7 @@
 
 #include "util.h"
 #include "subgraph_runtime.h"
+#include "subgraph_runtime_impl.h"
 
 
 #if 0
@@ -42,20 +43,25 @@ void TVM_TidlFunction(int total_subgraphs, int subgraph_id,
                      int num_input_tensors, int num_output_tensors,
                      PackedArgs args)
 {
-  float** in_data  = new float*[num_input_tensors];
-  float** out_data = new float*[num_output_tensors];
+  float** in_data  = new float*[num_inputs_per_inference * batch_size];
+  float** out_data = new float*[num_outputs_per_inference * batch_size];
 
-  for (int i = 0; i < num_input_tensors + num_output_tensors; i++)
-    if (i < num_input_tensors)
-      in_data[i] = args.data[i];
-    else
-      out_data[i - num_input_tensors] = args.data[i];
+  for (int j = 0; j < batch_size; j++)
+  {
+    for (int i = 0; i < num_inputs_per_inference + num_outputs_per_inference;
+         i++)
+      if (i < num_inputs_per_inference)
+        in_data[j * num_inputs_per_inference + i] = args.data[i][j];
+      else
+        out_data[j * num_outputs_per_inference + i - num_inputs_per_inference]
+                                                  = args.data[i][j];
+  }
 
   // call into this function in libtidl.so
-  // dlopen("libtidl.so")
+  // dlopen("libtidl_api.so")
   // TidlFunc = dlsym("TidlRunSubgraph");
-  (*TidlFunc)(total_subgraphs, subgraph_id,
-              num_input_tensors, num_output_tensors,
+  (*TidlFunc)(total_subgraphs, subgraph_id, batch_size,
+              num_inputs_per_inference, num_outputs_per_inference,
               in_data, out_data);
 
   delete [] in_data;
@@ -70,31 +76,57 @@ using namespace tidl;
 
 void TidlRunSubgraph(int total_subgraphs,
                      int subgraph_id,
-                     int num_inputs,
-                     int num_outputs,
-                     float **inputTensors,
-                     float **outputTensors
+                     int batch_size,
+                     int num_inputs_per_inference,
+                     int num_outputs_per_inference,
+                     float **input_tensors,
+                     float **output_tensors
                     )
 {
   ResM& res = ResM::Instance(total_subgraphs);
-  ExecutionObjectPipeline* eop     = res.GetEOP(subgraph_id);
+  res.InitSubgraph(subgraph_id);
+  int num_eops = res.GetNumEOPs(subgraph_id);
+  if (num_eops > batch_size)  num_eops = batch_size;
+  std::vector<ExecutionObjectPipeline*> eops(num_eops);
+  for (int i = 0; i < num_eops; i++)
+    eops[i] = res.GetEOP(subgraph_id);
   const SubgraphDataConv& in_conv  = res.GetInConv(subgraph_id);
   const SubgraphDataConv& out_conv = res.GetOutConv(subgraph_id);
 
-  std::vector<float *> in_data_v, out_data_v;
-  for (int i = 0; i < num_inputs; i++)
-    in_data_v.emplace_back(inputTensors[i]);
-  for (int i = 0; i < num_outputs; i++)
-    out_data_v.emplace_back(outputTensors[i]);
-  char* in_data = eop->GetInputBufferPtr();
-  in_conv.ScaleQuant(in_data_v, (uint8_t *) in_data);
+  std::vector<std::vector<float *>> in_data_v(batch_size),
+                                    out_data_v(batch_size);
+  for (int frame_idx = 0; frame_idx < batch_size; frame_idx++)
+  {
+    for (int i = 0; i < num_inputs_per_inference; i++)
+      in_data_v[frame_idx].emplace_back(input_tensors[
+                                    frame_idx * num_inputs_per_inference + i]);
+    for (int i = 0; i < num_outputs_per_inference; i++)
+      out_data_v[frame_idx].emplace_back(output_tensors[
+                                   frame_idx * num_outputs_per_inference + i]);
+  }
 
-  eop->ProcessFrameStartAsync();
-  eop->ProcessFrameWait();
+  // Process batch_size frames with available eops in pipelined manner
+  // additional num_eops iterations to flush the pipeline (epilogue)
+  for (int frame_idx = 0; frame_idx < batch_size + num_eops; frame_idx++)
+  {
+    ExecutionObjectPipeline *eop = eops[frame_idx % num_eops];
 
-  char* out_data = eop->GetOutputBufferPtr();
-  out_conv.ScaleDequant((const uint8_t *) out_data, out_data_v);
-  res.FreeEOP(subgraph_id, eop);
+    if (eop->ProcessFrameWait())
+    {
+      const uint8_t *out_data = (const uint8_t*) eop->GetOutputBufferPtr();
+      out_conv.ScaleDequant(out_data, out_data_v[frame_idx - num_eops]);
+    }
+
+    if (frame_idx < batch_size)
+    {
+      uint8_t *in_data = (uint8_t *) eop->GetInputBufferPtr();
+      in_conv.ScaleQuant(in_data_v[frame_idx], in_data);
+      eop->ProcessFrameStartAsync();
+    }
+  }
+
+  for (int i = 0; i < num_eops; i++)
+      res.FreeEOP(subgraph_id, eops[i]);
 }
 
 
@@ -155,7 +187,6 @@ void ResM::Init(uint32_t num_subgraphs)
 
     // Allocating resources
     num_eves_m = Executor::GetNumDevices(DeviceType::EVE);
-    num_eves_m = 1; // TODO: to remove after debugging
     num_dsps_m = Executor::GetNumDevices(DeviceType::DSP);
 
     assert(num_eves_m > 0 || num_dsps_m > 0);
@@ -180,7 +211,8 @@ void ResM::Init(uint32_t num_subgraphs)
   }
 }
 
-ExecutionObjectPipeline* ResM::GetEOP(uint32_t subgraph_id)
+
+void ResM::InitSubgraph(uint32_t subgraph_id)
 {
   assert(subgraph_id < num_subgraphs_m);
   ResEOP& res_eop = (*eops_m)[subgraph_id];
@@ -240,6 +272,11 @@ ExecutionObjectPipeline* ResM::GetEOP(uint32_t subgraph_id)
             cs_m[subgraph_id].layerIndex2LayerGroupId[i++] = 2;
           e2_ids.insert(static_cast<DeviceId>(num_lg2_dsps_used_m));
           num_lg2_dsps_used_m += 1;
+          if (num_subgraphs_m == 1)  // Allocate all dsps if only one subgraph
+          {
+            while (num_lg2_dsps_used_m < num_dsps_m)
+              e2_ids.insert(static_cast<DeviceId>(num_lg2_dsps_used_m++));
+          }
         }
       }
       delete net;
@@ -304,6 +341,24 @@ ExecutionObjectPipeline* ResM::GetEOP(uint32_t subgraph_id)
     res_eop.free_eop_index = 0;
     res_eop.is_used.resize(res_eop.eops->size(), false);
   }
+}
+
+uint32_t ResM::GetNumEOPs(uint32_t subgraph_id)
+{
+  assert(subgraph_id < num_subgraphs_m);
+  ResEOP& res_eop = (*eops_m)[subgraph_id];
+  assert (res_eop.eops != nullptr);
+
+  return res_eop.eops->size();
+}
+
+ExecutionObjectPipeline* ResM::GetEOP(uint32_t subgraph_id)
+{
+  assert(subgraph_id < num_subgraphs_m);
+  ResEOP& res_eop = (*eops_m)[subgraph_id];
+  assert(res_eop.eops != nullptr);
+
+  std::unique_lock<std::mutex> lock(res_eop.mutex_eops);
 
   // Return an available EOP (round robin allocation)
   uint32_t curr_eop = res_eop.free_eop_index;
@@ -318,7 +373,10 @@ ExecutionObjectPipeline* ResM::GetEOP(uint32_t subgraph_id)
 
 void ResM::FreeEOP(uint32_t subgraph_id, ExecutionObjectPipeline* eop)
 {
+  assert(subgraph_id < num_subgraphs_m);
   ResEOP& res_eop = (*eops_m)[subgraph_id];
+  assert(res_eop.eops != nullptr);
+
   {
     std::unique_lock<std::mutex> lock(res_eop.mutex_eops);
     for (uint32_t i = 0; i < res_eop.is_used.size(); i++)
@@ -342,12 +400,14 @@ Configuration& ResM::GetConfiguration(uint32_t subgraph_id)
 
 const SubgraphDataConv& ResM::GetInConv(uint32_t subgraph_id)
 {
+  assert(subgraph_id < num_subgraphs_m);
   assert(in_conv_m[subgraph_id] != nullptr);
   return *in_conv_m[subgraph_id];
 }
 
 const SubgraphDataConv& ResM::GetOutConv(uint32_t subgraph_id)
 {
+  assert(subgraph_id < num_subgraphs_m);
   assert(out_conv_m[subgraph_id] != nullptr);
   return *out_conv_m[subgraph_id];
 }
diff --git a/tidl_api/src/subgraph_runtime_impl.h b/tidl_api/src/subgraph_runtime_impl.h
new file mode 100644 (file)
index 0000000..a792757
--- /dev/null
@@ -0,0 +1,94 @@
+/******************************************************************************
+ * Copyright (c) 2019 Texas Instruments Incorporated - http://www.ti.com/
+ *  All rights reserved.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *      * Redistributions of source code must retain the above copyright
+ *        notice, this list of conditions and the following disclaimer.
+ *      * Redistributions in binary form must reproduce the above copyright
+ *        notice, this list of conditions and the following disclaimer in the
+ *        documentation and/or other materials provided with the distribution.
+ *      * Neither the name of Texas Instruments Incorporated nor the
+ *        names of its contributors may be used to endorse or promote products
+ *        derived from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ *  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ *  THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+//! @file subgraph_runtime_impl.h
+
+#pragma once
+#include <vector>
+#include <mutex>
+#include <condition_variable>
+#include "execution_object_pipeline.h"
+#include "subgraph_data_conv.h"
+
+
+namespace tidl {
+
+// Singleton ResM   .h file
+// Resource manager for available EVE and DSP devices,
+//   - Allocates EVEs and DSPs
+//   - Constructs Executors (tidl_setup) and ExecutionObjects (tid_init)
+//   - Creates set of ExecutionPipelines (with or without DSP)
+//   - Allocating EOP on demand (acquire and free semantics)
+//   - Allocates input/output buffers
+class ResM {
+  public:
+    ResM();
+    ~ResM();
+    static ResM& Instance(uint32_t total_num_subgraphs = 1);
+
+    // how to get resources for subgraph_id
+    void                     InitSubgraph(uint32_t subgraph_id);
+    uint32_t                 GetNumEOPs(uint32_t subgraph_id);
+    ExecutionObjectPipeline* GetEOP(uint32_t subgraph_id);
+    void                     FreeEOP(uint32_t subgraph_id,
+                                     ExecutionObjectPipeline* eop);
+    Configuration&           GetConfiguration(uint32_t subgraph_id);
+    const SubgraphDataConv&  GetInConv(uint32_t subgraph_id);
+    const SubgraphDataConv&  GetOutConv(uint32_t subgraph_id);
+
+
+  private:
+    void Init(uint32_t num_subgraphs);
+
+    bool     enable_trace_m;
+    uint32_t num_subgraphs_m;
+    uint32_t num_es_per_subgraph_m;
+    uint32_t num_eves_m;
+    uint32_t num_dsps_m;
+    uint32_t num_lg2_dsps_used_m;  // in partitioned execution case
+    std::mutex mutex_init_m;
+
+    // indexed by subgraph_id for resources
+    struct ResEOP {
+      ResEOP() : free_eop_index(0), is_used(), eops(nullptr) {}
+
+      uint32_t free_eop_index;
+      std::mutex mutex_eops;
+      std::condition_variable cv_eops;
+      std::vector<bool> is_used;
+      std::vector<ExecutionObjectPipeline*>* eops;
+    };
+    std::vector<Configuration> cs_m;
+    std::vector<Executor*> es_m;
+    std::vector<Executor*> e2s_m;
+    std::vector<ResEOP> *eops_m;
+    std::vector<SubgraphDataConv*> in_conv_m;
+    std::vector<SubgraphDataConv*> out_conv_m;
+};
+
+} // namespace tidl