summary | shortlog | log | commit | commitdiff | tree
raw | patch | inline | side by side (parent: d8571df)
author    Yuan Zhao <yuanzhao@ti.com>  Wed, 20 Nov 2019 17:30:35 +0000 (11:30 -0600)
committer Yuan Zhao <yuanzhao@ti.com>  Wed, 20 Nov 2019 17:30:35 +0000 (11:30 -0600)
- MCT-1224
index ffeb69d4ce3e40b366950e92deac50f167abd12c..68f5d9df5a811ab4aaeba2b23190e98e24551379 100644 (file)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
# THE POSSIBILITY OF SUCH DAMAGE.
-EXE = imagenet
+EXE = mobilenet_subgraph
include ../make.common
index 5534df3660e60e329a0618021caf7d766c075791..e4e499af67eb7a377cae2278b93674231e5fe654 100644 (file)
#include <queue>
#include <vector>
#include <chrono>
+#include <future>
#include "executor.h"
#include "execution_object.h"
cout << "\n##### Batch size 1 testing ######\n" << endl;
try
{
+ TidlInitSubgraph(1, 0);
float **inputs = new float *[1];
inputs[0] = new float[1*3*224*224];
float **outputs = new float *[1];
status = false;
}
+ // This is only to test the multithreaded inference.
+ // async/future may not be the most efficient multithreading method;
+ // a thread pool might have better performance.
+ cout << "\n##### Multithreaded inference testing #####\n" << endl;
+ int num_threads = 8;
+ int num_iters = 8;
+ try
+ {
+ float **inputs = new float *[num_threads];
+ float **outputs = new float *[num_threads];
+ for (int i = 0; i < num_threads; i++)
+ {
+ inputs[i] = new float[1*3*224*224];
+ outputs[i] = new float[1001];
+ }
+ vector<future<bool>> futures(num_threads);
+
+ chrono::time_point<chrono::steady_clock> tloop0, tloop1;
+ tloop0 = chrono::steady_clock::now();
+
+ for (int i = 0; i < num_iters + num_threads; i++)
+ {
+ int index = i % num_threads;
+ if (i >= num_threads)
+ {
+ if (futures[index].get())
+ WriteFrameOutput(outputs[index], opts);
+ }
+
+ if (i < num_iters)
+ {
+ ReadFrame(opts, cap, &inputs[index], 1);
+ futures[index] = std::async(std::launch::async,
+ [inputs, outputs](int index) {
+ TidlRunSubgraph(1, 0, 1, 1, 1, &inputs[index], &outputs[index]);
+ return true;
+ },
+ index);
+ }
+ }
+
+ tloop1 = chrono::steady_clock::now();
+ chrono::duration<float> elapsed = tloop1 - tloop0;
+ cout << "Multithreaded (num_threads=" << num_threads
+ << ") loop time (including read/write/opencv/print/etc): "
+ << setw(6) << setprecision(4)
+ << (elapsed.count() * 1000) << "ms" << endl;
+ }
+ catch (tidl::Exception &e)
+ {
+ cerr << e.what() << endl;
+ status = false;
+ }
+
return status;
}
c.inWidth = 224;
c.inHeight = 224;
c.preProcType = 2;
- SubgraphDataConv in_conv{{true}, {128.0f}, {false}, {1,3,224,224}};
+ SubgraphDataConv in_conv{{0}, {true}, {128.0f}, {false}, {1,3,224,224}};
char* frame_buffer = new char[3*224*224];
assert (frame_buffer != nullptr);
index 17c20bf588650684d5eba129103c33bbdba76e20..404c70dbc67f6881b33c6a2a9b3d09c15f0416c2 100644 (file)
inWidth = 224
inHeight = 224
inNumChannels = 3
+# Each of the following entries should be a space-separated list,
+# corresponding to the vector of inputs and the vector of outputs
+# Quant_value = float_value * scaleF2Q
+inConvType = 0
+inIsSigned = 1
+inScaleF2Q = 128.0
+inIsNCHW = 0
+outConvType = 0
+outIsSigned = 0
+outScaleF2Q = 255.0
+outIsNCHW = 1
index 0a1c77cad1aa00d146c37792a5afaa6b84a17920..c76ba7f54561ff95e3ef471abe3ac70152187a5e 100644 (file)
#include <string>
#include <map>
+#include <vector>
#include <iostream>
namespace tidl {
//! Margin added to the average in percentage.
int quantMargin;
+ //! subgraph data conversion type at subgraph inputs
+ //! 0: float <-> Q, 1: float <-> float, 2: Q <-> Q
+ std::vector<int> inConvType;
+
+ //! subgraph is signed data at subgraph inputs
+ std::vector<int> inIsSigned;
+
+ //! subgraph scaleF2Q factor at subgraph inputs
+ std::vector<float> inScaleF2Q;
+
+ //! subgraph is external tensor NCHW layout at subgraph inputs
+ std::vector<int> inIsNCHW;
+
+ //! subgraph data conversion type at subgraph outputs
+ //! 0: float <-> Q, 1: float <-> float, 2: Q <-> Q
+ std::vector<int> outConvType;
+
+ //! subgraph is signed data at subgraph outputs
+ std::vector<int> outIsSigned;
+
+ //! subgraph scaleF2Q factor at subgraph outputs
+ std::vector<float> outScaleF2Q;
+
+ //! subgraph is external tensor NCHW layout at subgraph outputs
+ std::vector<int> outIsNCHW;
+
//! Default constructor.
Configuration();
index 6b7c4b1b4599d63781a65df957db2c2934908864..dee53e5736a3c53daa4288437a90ca505870ddf5 100644 (file)
class SubgraphDataConv
{
public:
+ enum ConvType {
+ FLOAT_Q = 0, // conversion between float <-> Q
+ FLOAT_FLOAT = 1, // conversion between float <-> float
+ Q_Q = 2 // conversion between Q <-> Q
+ };
+
//! @brief Creates a SubgraphDataConv.
//! @param None
SubgraphDataConv() {}
- SubgraphDataConv(const std::vector<bool>& is_signed,
+ SubgraphDataConv(const std::vector<int>& conv_type,
+ const std::vector<bool>& is_signed,
const std::vector<float>& scaleQ,
const std::vector<bool>& is_NCHW,
const std::vector<int>& dims
- ) : is_signed_m(is_signed), scaleQ_m(scaleQ),
+ ) : conv_type_m(conv_type),
+ is_signed_m(is_signed), scaleQ_m(scaleQ),
is_NCHW_m(is_NCHW), dims_m(dims)
{}
void ScaleDequant(const uint8_t *in, std::vector<float*>& out) const;
private:
- //! if tensor needs to be evaluated as signed char
+ //! data type conversion, 0: float <-> Q, 1: float <-> float, 2: Q <-> Q
+ std::vector<int> conv_type_m;
+
+ //! if tensor needs to be evaluated as signed char (if float <-> Q)
std::vector<bool> is_signed_m;
- //! Q value for Quantization and Dequantization
+ //! Q value for Quantization and Dequantization (if float <-> Q)
std::vector<float> scaleQ_m;
//! the format of external tensors, NCHW or NHWC
index 37e771defdd2fd657c85351070677d294b069e2f..b4fc2b70ec1b06278c6b9bb46d80f3c3bc1d5529 100644 (file)
extern "C" {
+//! @brief Top level API to initialize a TIDL subgraph on device
+//! If not invoked ahead of time, TidlRunSubgraph() will call this
+//! function before any inference
+//! @param total_subgraphs total number of TIDL subgraphs in whole inference
+//! @param subgraph_id index of current TIDL subgraph
+extern void TidlInitSubgraph(int total_subgraphs,
+ int subgraph_id
+ );
+
//! @brief Top level inference to run a TIDL subgraph
//! @param total_subgraphs total number of TIDL subgraphs in whole inference
//! @param subgraph_id index of current TIDL subgraph
index f4575605ca7a68cc783706e463aa318f46c0806b..3ad0d2ccaf84672261aab81acd14de6cde1628b0 100644 (file)
ConfigParser(Configuration &x) : ConfigParser::base_type(entry)
{
using qi::int_;
+ using qi::float_;
using qi::bool_;
using qi::lit;
using qi::lexeme;
path %= lexeme[+(char_ - '"')];
q_path = qi::omit[*char_('"')] >> path >> qi::omit[*char_('"')];
+ // Rules for parsing subgraph data conversion information
+ intvec = int_ >> *int_;
+ floatvec = float_ >> *float_;
+
// Grammar for parsing configuration file
entry %=
lit("layerIndex2LayerGroupId") >> '=' >>
int_[ph::ref(x.quantHistoryParam1)= _1] |
lit("quantHistoryParam2") >> '=' >>
int_[ph::ref(x.quantHistoryParam2)= _1] |
- lit("quantMargin") >> '=' >> int_[ph::ref(x.quantMargin)= _1]
+ lit("quantMargin") >> '=' >> int_[ph::ref(x.quantMargin)= _1] |
+ lit("inConvType") >> '=' >> intvec[ph::ref(x.inConvType) = _1] |
+ lit("inIsSigned") >> '=' >> intvec[ph::ref(x.inIsSigned) = _1] |
+ lit("inScaleF2Q") >> '=' >> floatvec[ph::ref(x.inScaleF2Q) = _1] |
+ lit("inIsNCHW") >> '=' >> intvec[ph::ref(x.inIsNCHW) = _1] |
+ lit("outConvType") >> '=' >> intvec[ph::ref(x.outConvType) = _1] |
+ lit("outIsSigned") >> '=' >> intvec[ph::ref(x.outIsSigned) = _1] |
+ lit("outScaleF2Q") >> '=' >> floatvec[ph::ref(x.outScaleF2Q) = _1] |
+ lit("outIsNCHW") >> '=' >> intvec[ph::ref(x.outIsNCHW) = _1]
;
}
qi::rule<Iterator, std::pair<int, int>(), ascii::space_type> id2group;
qi::rule<Iterator, std::map<int, int>(), ascii::space_type> id2groups;
+
+ qi::rule<Iterator, std::vector<int>(), ascii::space_type> intvec;
+ qi::rule<Iterator, std::vector<float>(), ascii::space_type> floatvec;
};
bool Configuration::ReadFromFile(const std::string &file_name)
index d8cc11f7d178921784268775801a77432baeb144..6366360b5fe50bdd1661faf87a65083dd7da1eb6 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
+#include <cassert>
+#include <cstring>
#include "subgraph_data_conv.h"
using namespace tidl;
int offset = 0;
for (uint32_t d = 0; d < is_NCHW_m.size(); d++)
{
- float Q = scaleQ_m[d];
int N = dims_m[4 * d + 0];
int C = dims_m[4 * d + 1];
int H = dims_m[4 * d + 2];
int W = dims_m[4 * d + 3];
- int vmin = is_signed_m[d] ? -128 : 0;
- int vmax = is_signed_m[d] ? 127 : 255;
- float *in_d = in[d];
- if (is_NCHW_m[d]) // no need to transpose external tensor
+
+ if (conv_type_m[d] == ConvType::FLOAT_Q)
{
- for (int i = 0; i < N * C * H * W; i++)
- out[offset + i] = QuantizeValue(in_d[i], Q, vmin, vmax);
+ float Q = scaleQ_m[d];
+ int vmin = is_signed_m[d] ? -128 : 0;
+ int vmax = is_signed_m[d] ? 127 : 255;
+ float *in_d = in[d];
+ if (is_NCHW_m[d] || (C == 1) || (H*W == 1))
+ {
+ // no need to transpose external tensor
+ for (int i = 0; i < N * C * H * W; i++)
+ out[offset + i] = QuantizeValue(in_d[i], Q, vmin, vmax);
+ }
+ else
+ {
+ // need to transpose external tensor
+ for (int n = 0; n < N; n++)
+ for (int c = 0; c < C; c++)
+ for (int h = 0; h < H; h++)
+ for (int w = 0; w < W; w++)
+ {
+ int nchw = GetIndex(n, c, h, w, N, C, H, W);
+ int nhwc = GetIndex(n, h, w, c, N, H, W, C);
+ out[offset + nchw] = QuantizeValue(in_d[nhwc], Q, vmin, vmax);
+ }
+ }
}
- else // need to transpose external tensor
+ else if (conv_type_m[d] == ConvType::FLOAT_FLOAT)
{
- for (int n = 0; n < N; n++)
- for (int c = 0; c < C; c++)
- for (int h = 0; h < H; h++)
- for (int w = 0; w < W; w++)
- {
- int nchw = GetIndex(n, c, h, w, N, C, H, W);
- int nhwc = GetIndex(n, h, w, c, N, H, W, C);
- out[offset + nchw] = QuantizeValue(in_d[nhwc], Q, vmin, vmax);
- }
+ assert((W & 0x3) == 0); // last dimension is bytes
+ int f_W = W / 4; // number of elements
+ float *in_d = in[d];
+ float *out_d = (float *) (out + offset);
+ if (is_NCHW_m[d] || (C == 1) || (H*W == 1))
+ {
+ // no need to transpose external tensor
+ memcpy(out_d, in_d, N * C * H * W); // W is bytes
+ }
+ else
+ {
+ // need to transpose external tensor
+ for (int n = 0; n < N; n++)
+ for (int c = 0; c < C; c++)
+ for (int h = 0; h < H; h++)
+ for (int w = 0; w < f_W; w++)
+ {
+ int nchw = GetIndex(n, c, h, w, N, C, H, f_W);
+ int nhwc = GetIndex(n, h, w, c, N, H, f_W, C);
+ out_d[nchw] = in_d[nhwc];
+ }
+ }
}
- offset += N * C * H * W;
+ else if (conv_type_m[d] == ConvType::Q_Q)
+ {
+ uint8_t *in_d = (uint8_t *) &in[d];
+ uint8_t *out_d = (out + offset);
+ if (is_NCHW_m[d] || (C == 1) || (H*W == 1))
+ {
+ // no need to transpose external tensor
+ memcpy(out_d, in_d, N * C * H * W);
+ }
+ else
+ {
+ // need to transpose external tensor
+ for (int n = 0; n < N; n++)
+ for (int c = 0; c < C; c++)
+ for (int h = 0; h < H; h++)
+ for (int w = 0; w < W; w++)
+ {
+ int nchw = GetIndex(n, c, h, w, N, C, H, W);
+ int nhwc = GetIndex(n, h, w, c, N, H, W, C);
+ out_d[nchw] = in_d[nhwc];
+ }
+ }
+ }
+ else
+ {
+ assert(false);
+ }
+
+ offset += N * C * H * W; // accumulate in bytes
}
}
int offset = 0;
for (uint32_t d = 0; d < is_NCHW_m.size(); d++)
{
- float Q = scaleQ_m[d];
- float Q_inv = 1.0f / Q;
int N = dims_m[4 * d + 0];
int C = dims_m[4 * d + 1];
int H = dims_m[4 * d + 2];
int W = dims_m[4 * d + 3];
- bool S = is_signed_m[d];
- float *out_d = out[d];
- if (is_NCHW_m[d]) // no need to transpose external tensor
+
+ if (conv_type_m[d] == ConvType::FLOAT_Q)
+ {
+ float Q = scaleQ_m[d];
+ float Q_inv = 1.0f / Q;
+ bool S = is_signed_m[d];
+ float *out_d = out[d];
+ if (is_NCHW_m[d] || (C == 1) || (H*W == 1))
+ {
+ // no need to transpose external tensor
+ for (int i = 0; i < N * C * H * W; i++)
+ out_d[i] = DequantizeValue(in[offset + i], Q_inv, S);
+ }
+ else
+ {
+ // need to transpose external tensor
+ for (int n = 0; n < N; n++)
+ for (int c = 0; c < C; c++)
+ for (int h = 0; h < H; h++)
+ for (int w = 0; w < W; w++)
+ {
+ int nchw = GetIndex(n, c, h, w, N, C, H, W);
+ int nhwc = GetIndex(n, h, w, c, N, H, W, C);
+ out_d[nhwc] = DequantizeValue(in[offset + nchw], Q_inv, S);
+ }
+ }
+ }
+ else if (conv_type_m[d] == ConvType::FLOAT_FLOAT)
{
- for (int i = 0; i < N * C * H * W; i++)
- out_d[i] = DequantizeValue(in[offset + i], Q_inv, S);
+ assert((W & 0x3) == 0); // last dimension is bytes
+ int f_W = W / 4; // number of elements
+ float *in_d = (float *) (in + offset);
+ float *out_d = out[d];
+ if (is_NCHW_m[d] || (C == 1) || (H*W == 1))
+ {
+ // no need to transpose external tensor
+ memcpy(out_d, in_d, N * C * H * W); // W is bytes
+ }
+ else
+ {
+ // need to transpose external tensor
+ for (int n = 0; n < N; n++)
+ for (int c = 0; c < C; c++)
+ for (int h = 0; h < H; h++)
+ for (int w = 0; w < f_W; w++)
+ {
+ int nchw = GetIndex(n, c, h, w, N, C, H, f_W);
+ int nhwc = GetIndex(n, h, w, c, N, H, f_W, C);
+ out_d[nhwc] = in_d[nchw];
+ }
+ }
}
- else // need to transpose external tensor
+ else if (conv_type_m[d] == ConvType::Q_Q)
{
- for (int n = 0; n < N; n++)
- for (int c = 0; c < C; c++)
- for (int h = 0; h < H; h++)
- for (int w = 0; w < W; w++)
- {
- int nchw = GetIndex(n, c, h, w, N, C, H, W);
- int nhwc = GetIndex(n, h, w, c, N, H, W, C);
- out_d[nhwc] = DequantizeValue(in[offset + nchw], Q_inv, S);
- }
+ uint8_t *in_d = (uint8_t *) (in + offset);
+ uint8_t *out_d = (uint8_t * ) &out[d];
+ if (is_NCHW_m[d] || (C == 1) || (H*W == 1))
+ {
+ // no need to transpose external tensor
+ memcpy(out_d, in_d, N * C * H * W);
+ }
+ else
+ {
+ // need to transpose external tensor
+ for (int n = 0; n < N; n++)
+ for (int c = 0; c < C; c++)
+ for (int h = 0; h < H; h++)
+ for (int w = 0; w < W; w++)
+ {
+ int nchw = GetIndex(n, c, h, w, N, C, H, W);
+ int nhwc = GetIndex(n, h, w, c, N, H, W, C);
+ out_d[nhwc] = in_d[nchw];
+ }
+ }
}
+ else
+ {
+ assert(false);
+ }
+
offset += N * C * H * W;
}
}
index 09905fc72ddc6d8e0d60634a033f2e632f2fcd31..342acd8fa9f3fe6ff343abd2a9e54bf40f418cda 100644 (file)
using namespace tidl;
+void TidlInitSubgraph(int total_subgraphs, int subgraph_id)
+{
+ ResM& res = ResM::Instance(total_subgraphs);
+ res.InitSubgraph(subgraph_id);
+}
+
+
void TidlRunSubgraph(int total_subgraphs,
int subgraph_id,
int batch_size,
es_m.resize(num_subgraphs_m, nullptr);
e2s_m.resize(num_subgraphs_m, nullptr);
eops_m = new std::vector<ResEOP>(num_subgraphs_m);
-
- // TODO: this should come from parsing config file
- for (uint32_t i = 0; i < num_subgraphs_m; i++)
- {
- in_conv_m.push_back(new SubgraphDataConv(
- {true}, {128.0f}, {false}, {1,3,224,224}));
- out_conv_m.push_back(new SubgraphDataConv(
- {false}, {255.0f}, {true}, {1,1,1,1001}));
- }
+ in_conv_m.resize(num_subgraphs_m, nullptr);
+ out_conv_m.resize(num_subgraphs_m, nullptr);
}
}
std::unique_lock<std::mutex> lock(res_eop.mutex_eops);
+ // Constructing EOPs if not already constructed
if (res_eop.eops == nullptr)
{
if (enable_trace_m)
printf("Subgraph %d: initialing E/EOPs with %d cores\n",
subgraph_id, num_es_per_subgraph_m);
- // Constructing EOPs if not already constructed
- // Each subgraph -> num_eves_per_subgraph_m EOPs
- // Each EOP -> use_count
+ // Read config file
std::string cfg_file = "subgraph" + std::to_string(subgraph_id) + ".cfg";
bool status = cs_m[subgraph_id].ReadFromFile(cfg_file);
assert(status);
+ // Read the network
+ sTIDL_Network_t *net = new sTIDL_Network_t;
+ status = ReadNetworkBinary(cs_m[subgraph_id].netBinFile,
+ reinterpret_cast<char *>(net));
+ assert(status);
+
+ // Get data conversion info from configuration
+ // Get input/output tensors dimensions from network
+ // Construct data converters at the subgraph boundaries
+ std::vector<int> inDims, outDims;
+ for (int32_t layer = 0; layer < net->numLayers; layer++)
+ {
+ if (net->TIDLLayers[layer].layerType != (int32_t) TIDL_DataLayer)
+ continue;
+ if (net->TIDLLayers[layer].numInBufs <= 0)
+ {
+ for (int d = 0; d < 4; d++)
+ inDims.push_back(net->TIDLLayers[layer].outData[0].dimValues[d]);
+ }
+ if (net->TIDLLayers[layer].numOutBufs <= 0)
+ {
+ for (int d = 0; d < 4; d++)
+ outDims.push_back(net->TIDLLayers[layer].inData[0].dimValues[d]);
+ }
+ }
+ assert(cs_m[subgraph_id].inIsNCHW.size() * 4 == inDims.size());
+ assert(cs_m[subgraph_id].outIsNCHW.size() * 4 == outDims.size());
+ std::vector<bool> inIsSigned, outIsSigned, inIsNCHW, outIsNCHW;
+ for (int v : cs_m[subgraph_id].inIsSigned) inIsSigned.push_back(v != 0);
+ for (int v : cs_m[subgraph_id].inIsNCHW) inIsNCHW.push_back(v != 0);
+ for (int v : cs_m[subgraph_id].outIsSigned) outIsSigned.push_back(v != 0);
+ for (int v : cs_m[subgraph_id].outIsNCHW) outIsNCHW.push_back(v != 0);
+ in_conv_m[subgraph_id] = new SubgraphDataConv(
+ cs_m[subgraph_id].inConvType,
+ inIsSigned,
+ cs_m[subgraph_id].inScaleF2Q,
+ inIsNCHW,
+ inDims);
+ out_conv_m[subgraph_id] = new SubgraphDataConv(
+ cs_m[subgraph_id].outConvType,
+ outIsSigned,
+ cs_m[subgraph_id].outScaleF2Q,
+ outIsNCHW,
+ outDims);
+
// Check if last few layers can be offloaded to DSPs
// and DSPs are available
DeviceIds e_ids, e2_ids;
// uint32_t num_dsps_used = 0;
if (num_eves_m > 0 && num_dsps_m > 0 && ! cs_m[subgraph_id].runFullNet)
{
- sTIDL_Network_t *net = new sTIDL_Network_t;
- bool status = ReadNetworkBinary(cs_m[subgraph_id].netBinFile,
- reinterpret_cast<char *>(net));
- assert(status);
int32_t start_layer = net->numLayers -1;
int32_t end_layer = 0;
if (net->TIDLLayers[start_layer].layerType == (int32_t) TIDL_DataLayer)
cs_m[subgraph_id].runFullNet = true;
cs_m[subgraph_id].enableApiTrace = enable_trace_m;
- // Constructing Es and EOPs
+ // Constructing Es and EOPs, each subgraph -> num_eves_per_subgraph_m EOPs
res_eop.eops = new std::vector<ExecutionObjectPipeline*>;
uint32_t buffer_factor = 2; // double buffering factor
if (num_eves_m > 0)