diff options
-rw-r--r-- | examples/mobilenet_subgraph/Makefile | 2 | ||||
-rw-r--r-- | examples/mobilenet_subgraph/main.cpp | 58 | ||||
-rw-r--r-- | examples/mobilenet_subgraph/subgraph0.cfg | 11 | ||||
-rw-r--r-- | tidl_api/inc/configuration.h | 27 | ||||
-rw-r--r-- | tidl_api/inc/subgraph_data_conv.h | 19 | ||||
-rw-r--r-- | tidl_api/inc/subgraph_runtime.h | 9 | ||||
-rw-r--r-- | tidl_api/src/configuration_parser.cpp | 18 | ||||
-rw-r--r-- | tidl_api/src/subgraph_data_conv.cpp | 190 | ||||
-rw-r--r-- | tidl_api/src/subgraph_runtime.cpp | 74 |
9 files changed, 349 insertions, 59 deletions
diff --git a/examples/mobilenet_subgraph/Makefile b/examples/mobilenet_subgraph/Makefile index ffeb69d..68f5d9d 100644 --- a/examples/mobilenet_subgraph/Makefile +++ b/examples/mobilenet_subgraph/Makefile | |||
@@ -24,7 +24,7 @@ | |||
24 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF | 24 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF |
25 | # THE POSSIBILITY OF SUCH DAMAGE. | 25 | # THE POSSIBILITY OF SUCH DAMAGE. |
26 | 26 | ||
27 | EXE = imagenet | 27 | EXE = mobilenet_subgraph |
28 | 28 | ||
29 | include ../make.common | 29 | include ../make.common |
30 | 30 | ||
diff --git a/examples/mobilenet_subgraph/main.cpp b/examples/mobilenet_subgraph/main.cpp index 5534df3..e4e499a 100644 --- a/examples/mobilenet_subgraph/main.cpp +++ b/examples/mobilenet_subgraph/main.cpp | |||
@@ -39,6 +39,7 @@ | |||
39 | #include <queue> | 39 | #include <queue> |
40 | #include <vector> | 40 | #include <vector> |
41 | #include <chrono> | 41 | #include <chrono> |
42 | #include <future> | ||
42 | 43 | ||
43 | #include "executor.h" | 44 | #include "executor.h" |
44 | #include "execution_object.h" | 45 | #include "execution_object.h" |
@@ -145,6 +146,7 @@ bool RunConfiguration(cmdline_opts_t& opts) | |||
145 | cout << "\n##### Batch size 1 testing ######\n" << endl; | 146 | cout << "\n##### Batch size 1 testing ######\n" << endl; |
146 | try | 147 | try |
147 | { | 148 | { |
149 | TidlInitSubgraph(1, 0); | ||
148 | float **inputs = new float *[1]; | 150 | float **inputs = new float *[1]; |
149 | inputs[0] = new float[1*3*224*224]; | 151 | inputs[0] = new float[1*3*224*224]; |
150 | float **outputs = new float *[1]; | 152 | float **outputs = new float *[1]; |
@@ -222,6 +224,60 @@ bool RunConfiguration(cmdline_opts_t& opts) | |||
222 | status = false; | 224 | status = false; |
223 | } | 225 | } |
224 | 226 | ||
227 | // This is only to test the multithreaded inference | ||
228 | // async/future may not be the most efficient multithreading method | ||
229 | // a thread pool might have better performance ||
230 | cout << "\n##### Multithreaded inference testing #####\n" << endl; | ||
231 | int num_threads = 8; | ||
232 | int num_iters = 8; | ||
233 | try | ||
234 | { | ||
235 | float **inputs = new float *[num_threads]; | ||
236 | float **outputs = new float *[num_threads]; | ||
237 | for (int i = 0; i < num_threads; i++) | ||
238 | { | ||
239 | inputs[i] = new float[1*3*224*224]; | ||
240 | outputs[i] = new float[1001]; | ||
241 | } | ||
242 | vector<future<bool>> futures(num_threads); | ||
243 | |||
244 | chrono::time_point<chrono::steady_clock> tloop0, tloop1; | ||
245 | tloop0 = chrono::steady_clock::now(); | ||
246 | |||
247 | for (int i = 0; i < num_iters + num_threads; i++) | ||
248 | { | ||
249 | int index = i % num_threads; | ||
250 | if (i >= num_threads) | ||
251 | { | ||
252 | if (futures[index].get()) | ||
253 | WriteFrameOutput(outputs[index], opts); | ||
254 | } | ||
255 | |||
256 | if (i < num_iters) | ||
257 | { | ||
258 | ReadFrame(opts, cap, &inputs[index], 1); | ||
259 | futures[index] = std::async(std::launch::async, | ||
260 | [inputs, outputs](int index) { | ||
261 | TidlRunSubgraph(1, 0, 1, 1, 1, &inputs[index], &outputs[index]); | ||
262 | return true; | ||
263 | }, | ||
264 | index); | ||
265 | } | ||
266 | } | ||
267 | |||
268 | tloop1 = chrono::steady_clock::now(); | ||
269 | chrono::duration<float> elapsed = tloop1 - tloop0; | ||
270 | cout << "Multithreaded (num_threads=" << num_threads | ||
271 | << ") loop time (including read/write/opencv/print/etc): " | ||
272 | << setw(6) << setprecision(4) | ||
273 | << (elapsed.count() * 1000) << "ms" << endl; | ||
274 | } | ||
275 | catch (tidl::Exception &e) | ||
276 | { | ||
277 | cerr << e.what() << endl; | ||
278 | status = false; | ||
279 | } | ||
280 | |||
225 | return status; | 281 | return status; |
226 | } | 282 | } |
227 | 283 | ||
@@ -234,7 +290,7 @@ bool ReadFrame(const cmdline_opts_t& opts, VideoCapture &cap, float** inputs, | |||
234 | c.inWidth = 224; | 290 | c.inWidth = 224; |
235 | c.inHeight = 224; | 291 | c.inHeight = 224; |
236 | c.preProcType = 2; | 292 | c.preProcType = 2; |
237 | SubgraphDataConv in_conv{{true}, {128.0f}, {false}, {1,3,224,224}}; | 293 | SubgraphDataConv in_conv{{0}, {true}, {128.0f}, {false}, {1,3,224,224}}; |
238 | 294 | ||
239 | char* frame_buffer = new char[3*224*224]; | 295 | char* frame_buffer = new char[3*224*224]; |
240 | assert (frame_buffer != nullptr); | 296 | assert (frame_buffer != nullptr); |
diff --git a/examples/mobilenet_subgraph/subgraph0.cfg b/examples/mobilenet_subgraph/subgraph0.cfg index 17c20bf..404c70d 100644 --- a/examples/mobilenet_subgraph/subgraph0.cfg +++ b/examples/mobilenet_subgraph/subgraph0.cfg | |||
@@ -7,3 +7,14 @@ paramsBinFile = "../test/testvecs/config/tidl_models/tidl_param_mobilenet_1_22 | |||
7 | inWidth = 224 | 7 | inWidth = 224 |
8 | inHeight = 224 | 8 | inHeight = 224 |
9 | inNumChannels = 3 | 9 | inNumChannels = 3 |
10 | # The following information should be a space-separated list, ||
11 | # corresponding to the vector of inputs and the vector of outputs ||
12 | # Quant_value = float_value * scaleF2Q | ||
13 | inConvType = 0 | ||
14 | inIsSigned = 1 | ||
15 | inScaleF2Q = 128.0 | ||
16 | inIsNCHW = 0 | ||
17 | outConvType = 0 | ||
18 | outIsSigned = 0 | ||
19 | outScaleF2Q = 255.0 | ||
20 | outIsNCHW = 1 | ||
diff --git a/tidl_api/inc/configuration.h b/tidl_api/inc/configuration.h index 0a1c77c..c76ba7f 100644 --- a/tidl_api/inc/configuration.h +++ b/tidl_api/inc/configuration.h | |||
@@ -32,6 +32,7 @@ | |||
32 | 32 | ||
33 | #include <string> | 33 | #include <string> |
34 | #include <map> | 34 | #include <map> |
35 | #include <vector> | ||
35 | #include <iostream> | 36 | #include <iostream> |
36 | 37 | ||
37 | namespace tidl { | 38 | namespace tidl { |
@@ -145,6 +146,32 @@ class Configuration | |||
145 | //! Margin added to the average in percentage. | 146 | //! Margin added to the average in percentage. |
146 | int quantMargin; | 147 | int quantMargin; |
147 | 148 | ||
149 | //! subgraph data conversion type at subgraph inputs | ||
150 | //! 0: float <-> Q, 1: float <-> float, 2: Q <-> Q | ||
151 | std::vector<int> inConvType; | ||
152 | |||
153 | //! subgraph is signed data at subgraph inputs | ||
154 | std::vector<int> inIsSigned; | ||
155 | |||
156 | //! subgraph scaleF2Q factor at subgraph inputs | ||
157 | std::vector<float> inScaleF2Q; | ||
158 | |||
159 | //! subgraph is external tensor NCHW layout at subgraph inputs | ||
160 | std::vector<int> inIsNCHW; | ||
161 | |||
162 | //! subgraph data conversion type at subgraph outputs | ||
163 | //! 0: float <-> Q, 1: float <-> float, 2: Q <-> Q | ||
164 | std::vector<int> outConvType; | ||
165 | |||
166 | //! subgraph is signed data at subgraph outputs | ||
167 | std::vector<int> outIsSigned; | ||
168 | |||
169 | //! subgraph scaleF2Q factor at subgraph outputs | ||
170 | std::vector<float> outScaleF2Q; | ||
171 | |||
172 | //! subgraph is external tensor NCHW layout at subgraph outputs | ||
173 | std::vector<int> outIsNCHW; | ||
174 | |||
148 | //! Default constructor. | 175 | //! Default constructor. |
149 | Configuration(); | 176 | Configuration(); |
150 | 177 | ||
diff --git a/tidl_api/inc/subgraph_data_conv.h b/tidl_api/inc/subgraph_data_conv.h index 6b7c4b1..dee53e5 100644 --- a/tidl_api/inc/subgraph_data_conv.h +++ b/tidl_api/inc/subgraph_data_conv.h | |||
@@ -74,15 +74,23 @@ namespace tidl { | |||
74 | class SubgraphDataConv | 74 | class SubgraphDataConv |
75 | { | 75 | { |
76 | public: | 76 | public: |
77 | enum ConvType { | ||
78 | FLOAT_Q = 0, // conversion between float <-> Q | ||
79 | FLOAT_FLOAT = 1, // conversion between float <-> float | ||
80 | Q_Q = 2 // conversion between Q <-> Q | ||
81 | }; | ||
82 | |||
77 | //! @brief Creates a SubgraphDataConv. | 83 | //! @brief Creates a SubgraphDataConv. |
78 | //! @param None | 84 | //! @param None |
79 | SubgraphDataConv() {} | 85 | SubgraphDataConv() {} |
80 | 86 | ||
81 | SubgraphDataConv(const std::vector<bool>& is_signed, | 87 | SubgraphDataConv(const std::vector<int>& conv_type, |
88 | const std::vector<bool>& is_signed, | ||
82 | const std::vector<float>& scaleQ, | 89 | const std::vector<float>& scaleQ, |
83 | const std::vector<bool>& is_NCHW, | 90 | const std::vector<bool>& is_NCHW, |
84 | const std::vector<int>& dims | 91 | const std::vector<int>& dims |
85 | ) : is_signed_m(is_signed), scaleQ_m(scaleQ), | 92 | ) : conv_type_m(conv_type), |
93 | is_signed_m(is_signed), scaleQ_m(scaleQ), | ||
86 | is_NCHW_m(is_NCHW), dims_m(dims) | 94 | is_NCHW_m(is_NCHW), dims_m(dims) |
87 | {} | 95 | {} |
88 | 96 | ||
@@ -115,10 +123,13 @@ class SubgraphDataConv | |||
115 | void ScaleDequant(const uint8_t *in, std::vector<float*>& out) const; | 123 | void ScaleDequant(const uint8_t *in, std::vector<float*>& out) const; |
116 | 124 | ||
117 | private: | 125 | private: |
118 | //! if tensor needs to be evaluated as signed char | 126 | //! data type conversion, 0: float <-> Q, 1: float <-> float, 2: Q <-> Q |
127 | std::vector<int> conv_type_m; | ||
128 | |||
129 | //! if tensor needs to be evaluated as signed char (if float <-> Q) | ||
119 | std::vector<bool> is_signed_m; | 130 | std::vector<bool> is_signed_m; |
120 | 131 | ||
121 | //! Q value for Quantization and Dequantization | 132 | //! Q value for Quantization and Dequantization (if float <-> Q) |
122 | std::vector<float> scaleQ_m; | 133 | std::vector<float> scaleQ_m; |
123 | 134 | ||
124 | //! the format of external tensors, NCHW or NHWC | 135 | //! the format of external tensors, NCHW or NHWC |
diff --git a/tidl_api/inc/subgraph_runtime.h b/tidl_api/inc/subgraph_runtime.h index 37e771d..b4fc2b7 100644 --- a/tidl_api/inc/subgraph_runtime.h +++ b/tidl_api/inc/subgraph_runtime.h | |||
@@ -32,6 +32,15 @@ | |||
32 | 32 | ||
33 | extern "C" { | 33 | extern "C" { |
34 | 34 | ||
35 | //! @brief Top level API to initialize a TIDL subgraph on device | ||
36 | //! If not invoked ahead of time, TidlRunSubgraph() will call this | ||
37 | //! function before any inference | ||
38 | //! @param total_subgraphs total number of TIDL subgraphs in whole inference | ||
39 | //! @param subgraph_id index of current TIDL subgraph | ||
40 | extern void TidlInitSubgraph(int total_subgraphs, | ||
41 | int subgraph_id | ||
42 | ); | ||
43 | |||
35 | //! @brief Top level inference to run a TIDL subgraph | 44 | //! @brief Top level inference to run a TIDL subgraph |
36 | //! @param total_subgraphs total number of TIDL subgraphs in whole inference | 45 | //! @param total_subgraphs total number of TIDL subgraphs in whole inference |
37 | //! @param subgraph_id index of current TIDL subgraph | 46 | //! @param subgraph_id index of current TIDL subgraph |
diff --git a/tidl_api/src/configuration_parser.cpp b/tidl_api/src/configuration_parser.cpp index f457560..3ad0d2c 100644 --- a/tidl_api/src/configuration_parser.cpp +++ b/tidl_api/src/configuration_parser.cpp | |||
@@ -52,6 +52,7 @@ struct ConfigParser : qi::grammar<Iterator, ascii::space_type> | |||
52 | ConfigParser(Configuration &x) : ConfigParser::base_type(entry) | 52 | ConfigParser(Configuration &x) : ConfigParser::base_type(entry) |
53 | { | 53 | { |
54 | using qi::int_; | 54 | using qi::int_; |
55 | using qi::float_; | ||
55 | using qi::bool_; | 56 | using qi::bool_; |
56 | using qi::lit; | 57 | using qi::lit; |
57 | using qi::lexeme; | 58 | using qi::lexeme; |
@@ -66,6 +67,10 @@ struct ConfigParser : qi::grammar<Iterator, ascii::space_type> | |||
66 | path %= lexeme[+(char_ - '"')]; | 67 | path %= lexeme[+(char_ - '"')]; |
67 | q_path = qi::omit[*char_('"')] >> path >> qi::omit[*char_('"')]; | 68 | q_path = qi::omit[*char_('"')] >> path >> qi::omit[*char_('"')]; |
68 | 69 | ||
70 | // Rules for parsing subgraph data conversion information | ||
71 | intvec = int_ >> *int_; | ||
72 | floatvec = float_ >> *float_; | ||
73 | |||
69 | // Grammar for parsing configuration file | 74 | // Grammar for parsing configuration file |
70 | entry %= | 75 | entry %= |
71 | lit("layerIndex2LayerGroupId") >> '=' >> | 76 | lit("layerIndex2LayerGroupId") >> '=' >> |
@@ -85,7 +90,15 @@ struct ConfigParser : qi::grammar<Iterator, ascii::space_type> | |||
85 | int_[ph::ref(x.quantHistoryParam1)= _1] | | 90 | int_[ph::ref(x.quantHistoryParam1)= _1] | |
86 | lit("quantHistoryParam2") >> '=' >> | 91 | lit("quantHistoryParam2") >> '=' >> |
87 | int_[ph::ref(x.quantHistoryParam2)= _1] | | 92 | int_[ph::ref(x.quantHistoryParam2)= _1] | |
88 | lit("quantMargin") >> '=' >> int_[ph::ref(x.quantMargin)= _1] | 93 | lit("quantMargin") >> '=' >> int_[ph::ref(x.quantMargin)= _1] | |
94 | lit("inConvType") >> '=' >> intvec[ph::ref(x.inConvType) = _1] | | ||
95 | lit("inIsSigned") >> '=' >> intvec[ph::ref(x.inIsSigned) = _1] | | ||
96 | lit("inScaleF2Q") >> '=' >> floatvec[ph::ref(x.inScaleF2Q) = _1] | | ||
97 | lit("inIsNCHW") >> '=' >> intvec[ph::ref(x.inIsNCHW) = _1] | | ||
98 | lit("outConvType") >> '=' >> intvec[ph::ref(x.outConvType) = _1] | | ||
99 | lit("outIsSigned") >> '=' >> intvec[ph::ref(x.outIsSigned) = _1] | | ||
100 | lit("outScaleF2Q") >> '=' >> floatvec[ph::ref(x.outScaleF2Q) = _1] | | ||
101 | lit("outIsNCHW") >> '=' >> intvec[ph::ref(x.outIsNCHW) = _1] | ||
89 | ; | 102 | ; |
90 | } | 103 | } |
91 | 104 | ||
@@ -95,6 +108,9 @@ struct ConfigParser : qi::grammar<Iterator, ascii::space_type> | |||
95 | 108 | ||
96 | qi::rule<Iterator, std::pair<int, int>(), ascii::space_type> id2group; | 109 | qi::rule<Iterator, std::pair<int, int>(), ascii::space_type> id2group; |
97 | qi::rule<Iterator, std::map<int, int>(), ascii::space_type> id2groups; | 110 | qi::rule<Iterator, std::map<int, int>(), ascii::space_type> id2groups; |
111 | |||
112 | qi::rule<Iterator, std::vector<int>(), ascii::space_type> intvec; | ||
113 | qi::rule<Iterator, std::vector<float>(), ascii::space_type> floatvec; | ||
98 | }; | 114 | }; |
99 | 115 | ||
100 | bool Configuration::ReadFromFile(const std::string &file_name) | 116 | bool Configuration::ReadFromFile(const std::string &file_name) |
diff --git a/tidl_api/src/subgraph_data_conv.cpp b/tidl_api/src/subgraph_data_conv.cpp index d8cc11f..6366360 100644 --- a/tidl_api/src/subgraph_data_conv.cpp +++ b/tidl_api/src/subgraph_data_conv.cpp | |||
@@ -26,6 +26,8 @@ | |||
26 | * THE POSSIBILITY OF SUCH DAMAGE. | 26 | * THE POSSIBILITY OF SUCH DAMAGE. |
27 | *****************************************************************************/ | 27 | *****************************************************************************/ |
28 | 28 | ||
29 | #include <cassert> | ||
30 | #include <cstring> | ||
29 | #include "subgraph_data_conv.h" | 31 | #include "subgraph_data_conv.h" |
30 | 32 | ||
31 | using namespace tidl; | 33 | using namespace tidl; |
@@ -75,32 +77,91 @@ const | |||
75 | int offset = 0; | 77 | int offset = 0; |
76 | for (uint32_t d = 0; d < is_NCHW_m.size(); d++) | 78 | for (uint32_t d = 0; d < is_NCHW_m.size(); d++) |
77 | { | 79 | { |
78 | float Q = scaleQ_m[d]; | ||
79 | int N = dims_m[4 * d + 0]; | 80 | int N = dims_m[4 * d + 0]; |
80 | int C = dims_m[4 * d + 1]; | 81 | int C = dims_m[4 * d + 1]; |
81 | int H = dims_m[4 * d + 2]; | 82 | int H = dims_m[4 * d + 2]; |
82 | int W = dims_m[4 * d + 3]; | 83 | int W = dims_m[4 * d + 3]; |
83 | int vmin = is_signed_m[d] ? -128 : 0; | 84 | |
84 | int vmax = is_signed_m[d] ? 127 : 255; | 85 | if (conv_type_m[d] == ConvType::FLOAT_Q) |
85 | float *in_d = in[d]; | ||
86 | if (is_NCHW_m[d]) // no need to transpose external tensor | ||
87 | { | 86 | { |
88 | for (int i = 0; i < N * C * H * W; i++) | 87 | float Q = scaleQ_m[d]; |
89 | out[offset + i] = QuantizeValue(in_d[i], Q, vmin, vmax); | 88 | int vmin = is_signed_m[d] ? -128 : 0; |
89 | int vmax = is_signed_m[d] ? 127 : 255; | ||
90 | float *in_d = in[d]; | ||
91 | if (is_NCHW_m[d] || (C == 1) || (H*W == 1)) | ||
92 | { | ||
93 | // no need to transpose external tensor | ||
94 | for (int i = 0; i < N * C * H * W; i++) | ||
95 | out[offset + i] = QuantizeValue(in_d[i], Q, vmin, vmax); | ||
96 | } | ||
97 | else | ||
98 | { | ||
99 | // need to transpose external tensor | ||
100 | for (int n = 0; n < N; n++) | ||
101 | for (int c = 0; c < C; c++) | ||
102 | for (int h = 0; h < H; h++) | ||
103 | for (int w = 0; w < W; w++) | ||
104 | { | ||
105 | int nchw = GetIndex(n, c, h, w, N, C, H, W); | ||
106 | int nhwc = GetIndex(n, h, w, c, N, H, W, C); | ||
107 | out[offset + nchw] = QuantizeValue(in_d[nhwc], Q, vmin, vmax); | ||
108 | } | ||
109 | } | ||
90 | } | 110 | } |
91 | else // need to transpose external tensor | 111 | else if (conv_type_m[d] == ConvType::FLOAT_FLOAT) |
92 | { | 112 | { |
93 | for (int n = 0; n < N; n++) | 113 | assert((W & 0x3) == 0); // last dimension is bytes |
94 | for (int c = 0; c < C; c++) | 114 | int f_W = W / 4; // number of elements |
95 | for (int h = 0; h < H; h++) | 115 | float *in_d = in[d]; |
96 | for (int w = 0; w < W; w++) | 116 | float *out_d = (float *) (out + offset); |
97 | { | 117 | if (is_NCHW_m[d] || (C == 1) || (H*W == 1)) |
98 | int nchw = GetIndex(n, c, h, w, N, C, H, W); | 118 | { |
99 | int nhwc = GetIndex(n, h, w, c, N, H, W, C); | 119 | // no need to transpose external tensor |
100 | out[offset + nchw] = QuantizeValue(in_d[nhwc], Q, vmin, vmax); | 120 | memcpy(out_d, in_d, N * C * H * W); // W is bytes |
101 | } | 121 | } |
122 | else | ||
123 | { | ||
124 | // need to transpose external tensor | ||
125 | for (int n = 0; n < N; n++) | ||
126 | for (int c = 0; c < C; c++) | ||
127 | for (int h = 0; h < H; h++) | ||
128 | for (int w = 0; w < f_W; w++) | ||
129 | { | ||
130 | int nchw = GetIndex(n, c, h, w, N, C, H, f_W); | ||
131 | int nhwc = GetIndex(n, h, w, c, N, H, f_W, C); | ||
132 | out_d[nchw] = in_d[nhwc]; | ||
133 | } | ||
134 | } | ||
102 | } | 135 | } |
103 | offset += N * C * H * W; | 136 | else if (conv_type_m[d] == ConvType::Q_Q) |
137 | { | ||
138 | uint8_t *in_d = (uint8_t *) &in[d]; | ||
139 | uint8_t *out_d = (out + offset); | ||
140 | if (is_NCHW_m[d] || (C == 1) || (H*W == 1)) | ||
141 | { | ||
142 | // no need to transpose external tensor | ||
143 | memcpy(out_d, in_d, N * C * H * W); | ||
144 | } | ||
145 | else | ||
146 | { | ||
147 | // need to transpose external tensor | ||
148 | for (int n = 0; n < N; n++) | ||
149 | for (int c = 0; c < C; c++) | ||
150 | for (int h = 0; h < H; h++) | ||
151 | for (int w = 0; w < W; w++) | ||
152 | { | ||
153 | int nchw = GetIndex(n, c, h, w, N, C, H, W); | ||
154 | int nhwc = GetIndex(n, h, w, c, N, H, W, C); | ||
155 | out_d[nchw] = in_d[nhwc]; | ||
156 | } | ||
157 | } | ||
158 | } | ||
159 | else | ||
160 | { | ||
161 | assert(false); | ||
162 | } | ||
163 | |||
164 | offset += N * C * H * W; // accumulate in bytes | ||
104 | } | 165 | } |
105 | } | 166 | } |
106 | 167 | ||
@@ -111,31 +172,90 @@ const | |||
111 | int offset = 0; | 172 | int offset = 0; |
112 | for (uint32_t d = 0; d < is_NCHW_m.size(); d++) | 173 | for (uint32_t d = 0; d < is_NCHW_m.size(); d++) |
113 | { | 174 | { |
114 | float Q = scaleQ_m[d]; | ||
115 | float Q_inv = 1.0f / Q; | ||
116 | int N = dims_m[4 * d + 0]; | 175 | int N = dims_m[4 * d + 0]; |
117 | int C = dims_m[4 * d + 1]; | 176 | int C = dims_m[4 * d + 1]; |
118 | int H = dims_m[4 * d + 2]; | 177 | int H = dims_m[4 * d + 2]; |
119 | int W = dims_m[4 * d + 3]; | 178 | int W = dims_m[4 * d + 3]; |
120 | bool S = is_signed_m[d]; | 179 | |
121 | float *out_d = out[d]; | 180 | if (conv_type_m[d] == ConvType::FLOAT_Q) |
122 | if (is_NCHW_m[d]) // no need to transpose external tensor | 181 | { |
182 | float Q = scaleQ_m[d]; | ||
183 | float Q_inv = 1.0f / Q; | ||
184 | bool S = is_signed_m[d]; | ||
185 | float *out_d = out[d]; | ||
186 | if (is_NCHW_m[d] || (C == 1) || (H*W == 1)) | ||
187 | { | ||
188 | // no need to transpose external tensor | ||
189 | for (int i = 0; i < N * C * H * W; i++) | ||
190 | out_d[i] = DequantizeValue(in[offset + i], Q_inv, S); | ||
191 | } | ||
192 | else | ||
193 | { | ||
194 | // need to transpose external tensor | ||
195 | for (int n = 0; n < N; n++) | ||
196 | for (int c = 0; c < C; c++) | ||
197 | for (int h = 0; h < H; h++) | ||
198 | for (int w = 0; w < W; w++) | ||
199 | { | ||
200 | int nchw = GetIndex(n, c, h, w, N, C, H, W); | ||
201 | int nhwc = GetIndex(n, h, w, c, N, H, W, C); | ||
202 | out_d[nhwc] = DequantizeValue(in[offset + nchw], Q_inv, S); | ||
203 | } | ||
204 | } | ||
205 | } | ||
206 | else if (conv_type_m[d] == ConvType::FLOAT_FLOAT) | ||
123 | { | 207 | { |
124 | for (int i = 0; i < N * C * H * W; i++) | 208 | assert((W & 0x3) == 0); // last dimension is bytes |
125 | out_d[i] = DequantizeValue(in[offset + i], Q_inv, S); | 209 | int f_W = W / 4; // number of elements |
210 | float *in_d = (float *) (in + offset); | ||
211 | float *out_d = out[d]; | ||
212 | if (is_NCHW_m[d] || (C == 1) || (H*W == 1)) | ||
213 | { | ||
214 | // no need to transpose external tensor | ||
215 | memcpy(out_d, in_d, N * C * H * W); // W is bytes | ||
216 | } | ||
217 | else | ||
218 | { | ||
219 | // need to transpose external tensor | ||
220 | for (int n = 0; n < N; n++) | ||
221 | for (int c = 0; c < C; c++) | ||
222 | for (int h = 0; h < H; h++) | ||
223 | for (int w = 0; w < f_W; w++) | ||
224 | { | ||
225 | int nchw = GetIndex(n, c, h, w, N, C, H, f_W); | ||
226 | int nhwc = GetIndex(n, h, w, c, N, H, f_W, C); | ||
227 | out_d[nhwc] = in_d[nchw]; | ||
228 | } | ||
229 | } | ||
126 | } | 230 | } |
127 | else // need to transpose external tensor | 231 | else if (conv_type_m[d] == ConvType::Q_Q) |
128 | { | 232 | { |
129 | for (int n = 0; n < N; n++) | 233 | uint8_t *in_d = (uint8_t *) (in + offset); |
130 | for (int c = 0; c < C; c++) | 234 | uint8_t *out_d = (uint8_t * ) &out[d]; |
131 | for (int h = 0; h < H; h++) | 235 | if (is_NCHW_m[d] || (C == 1) || (H*W == 1)) |
132 | for (int w = 0; w < W; w++) | 236 | { |
133 | { | 237 | // no need to transpose external tensor |
134 | int nchw = GetIndex(n, c, h, w, N, C, H, W); | 238 | memcpy(out_d, in_d, N * C * H * W); |
135 | int nhwc = GetIndex(n, h, w, c, N, H, W, C); | 239 | } |
136 | out_d[nhwc] = DequantizeValue(in[offset + nchw], Q_inv, S); | 240 | else |
137 | } | 241 | { |
242 | // need to transpose external tensor | ||
243 | for (int n = 0; n < N; n++) | ||
244 | for (int c = 0; c < C; c++) | ||
245 | for (int h = 0; h < H; h++) | ||
246 | for (int w = 0; w < W; w++) | ||
247 | { | ||
248 | int nchw = GetIndex(n, c, h, w, N, C, H, W); | ||
249 | int nhwc = GetIndex(n, h, w, c, N, H, W, C); | ||
250 | out_d[nhwc] = in_d[nchw]; | ||
251 | } | ||
252 | } | ||
138 | } | 253 | } |
254 | else | ||
255 | { | ||
256 | assert(false); | ||
257 | } | ||
258 | |||
139 | offset += N * C * H * W; | 259 | offset += N * C * H * W; |
140 | } | 260 | } |
141 | } | 261 | } |
diff --git a/tidl_api/src/subgraph_runtime.cpp b/tidl_api/src/subgraph_runtime.cpp index 09905fc..342acd8 100644 --- a/tidl_api/src/subgraph_runtime.cpp +++ b/tidl_api/src/subgraph_runtime.cpp | |||
@@ -74,6 +74,13 @@ void TVM_TidlFunction(int total_subgraphs, int subgraph_id, | |||
74 | using namespace tidl; | 74 | using namespace tidl; |
75 | 75 | ||
76 | 76 | ||
77 | void TidlInitSubgraph(int total_subgraphs, int subgraph_id) | ||
78 | { | ||
79 | ResM& res = ResM::Instance(total_subgraphs); | ||
80 | res.InitSubgraph(subgraph_id); | ||
81 | } | ||
82 | |||
83 | |||
77 | void TidlRunSubgraph(int total_subgraphs, | 84 | void TidlRunSubgraph(int total_subgraphs, |
78 | int subgraph_id, | 85 | int subgraph_id, |
79 | int batch_size, | 86 | int batch_size, |
@@ -199,15 +206,8 @@ void ResM::Init(uint32_t num_subgraphs) | |||
199 | es_m.resize(num_subgraphs_m, nullptr); | 206 | es_m.resize(num_subgraphs_m, nullptr); |
200 | e2s_m.resize(num_subgraphs_m, nullptr); | 207 | e2s_m.resize(num_subgraphs_m, nullptr); |
201 | eops_m = new std::vector<ResEOP>(num_subgraphs_m); | 208 | eops_m = new std::vector<ResEOP>(num_subgraphs_m); |
202 | 209 | in_conv_m.resize(num_subgraphs_m, nullptr); | |
203 | // TODO: this should come from parsing config file | 210 | out_conv_m.resize(num_subgraphs_m, nullptr); |
204 | for (uint32_t i = 0; i < num_subgraphs_m; i++) | ||
205 | { | ||
206 | in_conv_m.push_back(new SubgraphDataConv( | ||
207 | {true}, {128.0f}, {false}, {1,3,224,224})); | ||
208 | out_conv_m.push_back(new SubgraphDataConv( | ||
209 | {false}, {255.0f}, {true}, {1,1,1,1001})); | ||
210 | } | ||
211 | } | 211 | } |
212 | } | 212 | } |
213 | 213 | ||
@@ -219,19 +219,63 @@ void ResM::InitSubgraph(uint32_t subgraph_id) | |||
219 | 219 | ||
220 | std::unique_lock<std::mutex> lock(res_eop.mutex_eops); | 220 | std::unique_lock<std::mutex> lock(res_eop.mutex_eops); |
221 | 221 | ||
222 | // Constructing EOPs if not already constructed | ||
222 | if (res_eop.eops == nullptr) | 223 | if (res_eop.eops == nullptr) |
223 | { | 224 | { |
224 | if (enable_trace_m) | 225 | if (enable_trace_m) |
225 | printf("Subgraph %d: initialing E/EOPs with %d cores\n", | 226 | printf("Subgraph %d: initialing E/EOPs with %d cores\n", |
226 | subgraph_id, num_es_per_subgraph_m); | 227 | subgraph_id, num_es_per_subgraph_m); |
227 | 228 | ||
228 | // Constructing EOPs if not already constructed | 229 | // Read config file |
229 | // Each subgraph -> num_eves_per_subgraph_m EOPs | ||
230 | // Each EOP -> use_count | ||
231 | std::string cfg_file = "subgraph" + std::to_string(subgraph_id) + ".cfg"; | 230 | std::string cfg_file = "subgraph" + std::to_string(subgraph_id) + ".cfg"; |
232 | bool status = cs_m[subgraph_id].ReadFromFile(cfg_file); | 231 | bool status = cs_m[subgraph_id].ReadFromFile(cfg_file); |
233 | assert(status); | 232 | assert(status); |
234 | 233 | ||
234 | // Read the network | ||
235 | sTIDL_Network_t *net = new sTIDL_Network_t; | ||
236 | status = ReadNetworkBinary(cs_m[subgraph_id].netBinFile, | ||
237 | reinterpret_cast<char *>(net)); | ||
238 | assert(status); | ||
239 | |||
240 | // Get data conversion info from configuration | ||
241 | // Get input/output tensors dimensions from network | ||
242 | // Construct data converters at the subgraph boundaries | ||
243 | std::vector<int> inDims, outDims; | ||
244 | for (int32_t layer = 0; layer < net->numLayers; layer++) | ||
245 | { | ||
246 | if (net->TIDLLayers[layer].layerType != (int32_t) TIDL_DataLayer) | ||
247 | continue; | ||
248 | if (net->TIDLLayers[layer].numInBufs <= 0) | ||
249 | { | ||
250 | for (int d = 0; d < 4; d++) | ||
251 | inDims.push_back(net->TIDLLayers[layer].outData[0].dimValues[d]); | ||
252 | } | ||
253 | if (net->TIDLLayers[layer].numOutBufs <= 0) | ||
254 | { | ||
255 | for (int d = 0; d < 4; d++) | ||
256 | outDims.push_back(net->TIDLLayers[layer].inData[0].dimValues[d]); | ||
257 | } | ||
258 | } | ||
259 | assert(cs_m[subgraph_id].inIsNCHW.size() * 4 == inDims.size()); | ||
260 | assert(cs_m[subgraph_id].outIsNCHW.size() * 4 == outDims.size()); | ||
261 | std::vector<bool> inIsSigned, outIsSigned, inIsNCHW, outIsNCHW; | ||
262 | for (int v : cs_m[subgraph_id].inIsSigned) inIsSigned.push_back(v != 0); | ||
263 | for (int v : cs_m[subgraph_id].inIsNCHW) inIsNCHW.push_back(v != 0); | ||
264 | for (int v : cs_m[subgraph_id].outIsSigned) outIsSigned.push_back(v != 0); | ||
265 | for (int v : cs_m[subgraph_id].outIsNCHW) outIsNCHW.push_back(v != 0); | ||
266 | in_conv_m[subgraph_id] = new SubgraphDataConv( | ||
267 | cs_m[subgraph_id].inConvType, | ||
268 | inIsSigned, | ||
269 | cs_m[subgraph_id].inScaleF2Q, | ||
270 | inIsNCHW, | ||
271 | inDims); | ||
272 | out_conv_m[subgraph_id] = new SubgraphDataConv( | ||
273 | cs_m[subgraph_id].outConvType, | ||
274 | outIsSigned, | ||
275 | cs_m[subgraph_id].outScaleF2Q, | ||
276 | outIsNCHW, | ||
277 | outDims); | ||
278 | |||
235 | // Check if last few layers can be offloaded to DSPs | 279 | // Check if last few layers can be offloaded to DSPs |
236 | // and DSPs are available | 280 | // and DSPs are available |
237 | DeviceIds e_ids, e2_ids; | 281 | DeviceIds e_ids, e2_ids; |
@@ -241,10 +285,6 @@ void ResM::InitSubgraph(uint32_t subgraph_id) | |||
241 | // uint32_t num_dsps_used = 0; | 285 | // uint32_t num_dsps_used = 0; |
242 | if (num_eves_m > 0 && num_dsps_m > 0 && ! cs_m[subgraph_id].runFullNet) | 286 | if (num_eves_m > 0 && num_dsps_m > 0 && ! cs_m[subgraph_id].runFullNet) |
243 | { | 287 | { |
244 | sTIDL_Network_t *net = new sTIDL_Network_t; | ||
245 | bool status = ReadNetworkBinary(cs_m[subgraph_id].netBinFile, | ||
246 | reinterpret_cast<char *>(net)); | ||
247 | assert(status); | ||
248 | int32_t start_layer = net->numLayers -1; | 288 | int32_t start_layer = net->numLayers -1; |
249 | int32_t end_layer = 0; | 289 | int32_t end_layer = 0; |
250 | if (net->TIDLLayers[start_layer].layerType == (int32_t) TIDL_DataLayer) | 290 | if (net->TIDLLayers[start_layer].layerType == (int32_t) TIDL_DataLayer) |
@@ -286,7 +326,7 @@ void ResM::InitSubgraph(uint32_t subgraph_id) | |||
286 | cs_m[subgraph_id].runFullNet = true; | 326 | cs_m[subgraph_id].runFullNet = true; |
287 | cs_m[subgraph_id].enableApiTrace = enable_trace_m; | 327 | cs_m[subgraph_id].enableApiTrace = enable_trace_m; |
288 | 328 | ||
289 | // Constructing Es and EOPs | 329 | // Constructing Es and EOPs, each subgraph -> num_eves_per_subgraph_m EOPs |
290 | res_eop.eops = new std::vector<ExecutionObjectPipeline*>; | 330 | res_eop.eops = new std::vector<ExecutionObjectPipeline*>; |
291 | uint32_t buffer_factor = 2; // double buffering factor | 331 | uint32_t buffer_factor = 2; // double buffering factor |
292 | if (num_eves_m > 0) | 332 | if (num_eves_m > 0) |