diff options
-rw-r--r-- | examples/mobilenet_subgraph/Makefile | 2 | ||||
-rw-r--r-- | examples/mobilenet_subgraph/main.cpp | 58 | ||||
-rw-r--r-- | examples/mobilenet_subgraph/subgraph0.cfg | 11 | ||||
-rw-r--r-- | tidl_api/inc/configuration.h | 27 | ||||
-rw-r--r-- | tidl_api/inc/subgraph_data_conv.h | 19 | ||||
-rw-r--r-- | tidl_api/inc/subgraph_runtime.h | 9 | ||||
-rw-r--r-- | tidl_api/src/configuration_parser.cpp | 18 | ||||
-rw-r--r-- | tidl_api/src/subgraph_data_conv.cpp | 190 | ||||
-rw-r--r-- | tidl_api/src/subgraph_runtime.cpp | 74 |
9 files changed, 349 insertions, 59 deletions
diff --git a/examples/mobilenet_subgraph/Makefile b/examples/mobilenet_subgraph/Makefile index ffeb69d..68f5d9d 100644 --- a/examples/mobilenet_subgraph/Makefile +++ b/examples/mobilenet_subgraph/Makefile | |||
@@ -24,7 +24,7 @@ | |||
24 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF | 24 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF |
25 | # THE POSSIBILITY OF SUCH DAMAGE. | 25 | # THE POSSIBILITY OF SUCH DAMAGE. |
26 | 26 | ||
27 | EXE = imagenet | 27 | EXE = mobilenet_subgraph |
28 | 28 | ||
29 | include ../make.common | 29 | include ../make.common |
30 | 30 | ||
diff --git a/examples/mobilenet_subgraph/main.cpp b/examples/mobilenet_subgraph/main.cpp index 5534df3..e4e499a 100644 --- a/examples/mobilenet_subgraph/main.cpp +++ b/examples/mobilenet_subgraph/main.cpp | |||
@@ -39,6 +39,7 @@ | |||
39 | #include <queue> | 39 | #include <queue> |
40 | #include <vector> | 40 | #include <vector> |
41 | #include <chrono> | 41 | #include <chrono> |
42 | #include <future> | ||
42 | 43 | ||
43 | #include "executor.h" | 44 | #include "executor.h" |
44 | #include "execution_object.h" | 45 | #include "execution_object.h" |
@@ -145,6 +146,7 @@ bool RunConfiguration(cmdline_opts_t& opts) | |||
145 | cout << "\n##### Batch size 1 testing ######\n" << endl; | 146 | cout << "\n##### Batch size 1 testing ######\n" << endl; |
146 | try | 147 | try |
147 | { | 148 | { |
149 | TidlInitSubgraph(1, 0); | ||
148 | float **inputs = new float *[1]; | 150 | float **inputs = new float *[1]; |
149 | inputs[0] = new float[1*3*224*224]; | 151 | inputs[0] = new float[1*3*224*224]; |
150 | float **outputs = new float *[1]; | 152 | float **outputs = new float *[1]; |
@@ -222,6 +224,60 @@ bool RunConfiguration(cmdline_opts_t& opts) | |||
222 | status = false; | 224 | status = false; |
223 | } | 225 | } |
224 | 226 | ||
227 | // This is only to test the multithreaded inference | ||
228 | // async/future may not be the most efficient multithreading method | ||
229 | // a thread pool might have better performance ||
230 | cout << "\n##### Multithreaded inference testing #####\n" << endl; | ||
231 | int num_threads = 8; | ||
232 | int num_iters = 8; | ||
233 | try | ||
234 | { | ||
235 | float **inputs = new float *[num_threads]; | ||
236 | float **outputs = new float *[num_threads]; | ||
237 | for (int i = 0; i < num_threads; i++) | ||
238 | { | ||
239 | inputs[i] = new float[1*3*224*224]; | ||
240 | outputs[i] = new float[1001]; | ||
241 | } | ||
242 | vector<future<bool>> futures(num_threads); | ||
243 | |||
244 | chrono::time_point<chrono::steady_clock> tloop0, tloop1; | ||
245 | tloop0 = chrono::steady_clock::now(); | ||
246 | |||
247 | for (int i = 0; i < num_iters + num_threads; i++) | ||
248 | { | ||
249 | int index = i % num_threads; | ||
250 | if (i >= num_threads) | ||
251 | { | ||
252 | if (futures[index].get()) | ||
253 | WriteFrameOutput(outputs[index], opts); | ||
254 | } | ||
255 | |||
256 | if (i < num_iters) | ||
257 | { | ||
258 | ReadFrame(opts, cap, &inputs[index], 1); | ||
259 | futures[index] = std::async(std::launch::async, | ||
260 | [inputs, outputs](int index) { | ||
261 | TidlRunSubgraph(1, 0, 1, 1, 1, &inputs[index], &outputs[index]); | ||
262 | return true; | ||
263 | }, | ||
264 | index); | ||
265 | } | ||
266 | } | ||
267 | |||
268 | tloop1 = chrono::steady_clock::now(); | ||
269 | chrono::duration<float> elapsed = tloop1 - tloop0; | ||
270 | cout << "Multithreaded (num_threads=" << num_threads | ||
271 | << ") loop time (including read/write/opencv/print/etc): " | ||
272 | << setw(6) << setprecision(4) | ||
273 | << (elapsed.count() * 1000) << "ms" << endl; | ||
274 | } | ||
275 | catch (tidl::Exception &e) | ||
276 | { | ||
277 | cerr << e.what() << endl; | ||
278 | status = false; | ||
279 | } | ||
280 | |||
225 | return status; | 281 | return status; |
226 | } | 282 | } |
227 | 283 | ||
@@ -234,7 +290,7 @@ bool ReadFrame(const cmdline_opts_t& opts, VideoCapture &cap, float** inputs, | |||
234 | c.inWidth = 224; | 290 | c.inWidth = 224; |
235 | c.inHeight = 224; | 291 | c.inHeight = 224; |
236 | c.preProcType = 2; | 292 | c.preProcType = 2; |
237 | SubgraphDataConv in_conv{{true}, {128.0f}, {false}, {1,3,224,224}}; | 293 | SubgraphDataConv in_conv{{0}, {true}, {128.0f}, {false}, {1,3,224,224}}; |
238 | 294 | ||
239 | char* frame_buffer = new char[3*224*224]; | 295 | char* frame_buffer = new char[3*224*224]; |
240 | assert (frame_buffer != nullptr); | 296 | assert (frame_buffer != nullptr); |
diff --git a/examples/mobilenet_subgraph/subgraph0.cfg b/examples/mobilenet_subgraph/subgraph0.cfg index 17c20bf..404c70d 100644 --- a/examples/mobilenet_subgraph/subgraph0.cfg +++ b/examples/mobilenet_subgraph/subgraph0.cfg | |||
@@ -7,3 +7,14 @@ paramsBinFile = "../test/testvecs/config/tidl_models/tidl_param_mobilenet_1_22 | |||
7 | inWidth = 224 | 7 | inWidth = 224 |
8 | inHeight = 224 | 8 | inHeight = 224 |
9 | inNumChannels = 3 | 9 | inNumChannels = 3 |
10 | # The following information should be a space-separated list, ||
11 | # corresponding to the vector of inputs and the vector of outputs ||
12 | # Quant_value = float_value * scaleF2Q | ||
13 | inConvType = 0 | ||
14 | inIsSigned = 1 | ||
15 | inScaleF2Q = 128.0 | ||
16 | inIsNCHW = 0 | ||
17 | outConvType = 0 | ||
18 | outIsSigned = 0 | ||
19 | outScaleF2Q = 255.0 | ||
20 | outIsNCHW = 1 | ||
diff --git a/tidl_api/inc/configuration.h b/tidl_api/inc/configuration.h index 0a1c77c..c76ba7f 100644 --- a/tidl_api/inc/configuration.h +++ b/tidl_api/inc/configuration.h | |||
@@ -32,6 +32,7 @@ | |||
32 | 32 | ||
33 | #include <string> | 33 | #include <string> |
34 | #include <map> | 34 | #include <map> |
35 | #include <vector> | ||
35 | #include <iostream> | 36 | #include <iostream> |
36 | 37 | ||
37 | namespace tidl { | 38 | namespace tidl { |
@@ -145,6 +146,32 @@ class Configuration | |||
145 | //! Margin added to the average in percentage. | 146 | //! Margin added to the average in percentage. |
146 | int quantMargin; | 147 | int quantMargin; |
147 | 148 | ||
149 | //! subgraph data conversion type at subgraph inputs | ||
150 | //! 0: float <-> Q, 1: float <-> float, 2: Q <-> Q | ||
151 | std::vector<int> inConvType; | ||
152 | |||
153 | //! subgraph is signed data at subgraph inputs | ||
154 | std::vector<int> inIsSigned; | ||
155 | |||
156 | //! subgraph scaleF2Q factor at subgraph inputs | ||
157 | std::vector<float> inScaleF2Q; | ||
158 | |||
159 | //! subgraph is external tensor NCHW layout at subgraph inputs | ||
160 | std::vector<int> inIsNCHW; | ||
161 | |||
162 | //! subgraph data conversion type at subgraph outputs | ||
163 | //! 0: float <-> Q, 1: float <-> float, 2: Q <-> Q | ||
164 | std::vector<int> outConvType; | ||
165 | |||
166 | //! subgraph is signed data at subgraph outputs | ||
167 | std::vector<int> outIsSigned; | ||
168 | |||
169 | //! subgraph scaleF2Q factor at subgraph outputs | ||
170 | std::vector<float> outScaleF2Q; | ||
171 | |||
172 | //! subgraph is external tensor NCHW layout at subgraph outputs | ||
173 | std::vector<int> outIsNCHW; | ||
174 | |||
148 | //! Default constructor. | 175 | //! Default constructor. |
149 | Configuration(); | 176 | Configuration(); |
150 | 177 | ||
diff --git a/tidl_api/inc/subgraph_data_conv.h b/tidl_api/inc/subgraph_data_conv.h index 6b7c4b1..dee53e5 100644 --- a/tidl_api/inc/subgraph_data_conv.h +++ b/tidl_api/inc/subgraph_data_conv.h | |||
@@ -74,15 +74,23 @@ namespace tidl { | |||
74 | class SubgraphDataConv | 74 | class SubgraphDataConv |
75 | { | 75 | { |
76 | public: | 76 | public: |
77 | enum ConvType { | ||
78 | FLOAT_Q = 0, // conversion between float <-> Q | ||
79 | FLOAT_FLOAT = 1, // conversion between float <-> float | ||
80 | Q_Q = 2 // conversion between Q <-> Q | ||
81 | }; | ||
82 | |||
77 | //! @brief Creates a SubgraphDataConv. | 83 | //! @brief Creates a SubgraphDataConv. |
78 | //! @param None | 84 | //! @param None |
79 | SubgraphDataConv() {} | 85 | SubgraphDataConv() {} |
80 | 86 | ||
81 | SubgraphDataConv(const std::vector<bool>& is_signed, | 87 | SubgraphDataConv(const std::vector<int>& conv_type, |
88 | const std::vector<bool>& is_signed, | ||
82 | const std::vector<float>& scaleQ, | 89 | const std::vector<float>& scaleQ, |
83 | const std::vector<bool>& is_NCHW, | 90 | const std::vector<bool>& is_NCHW, |
84 | const std::vector<int>& dims | 91 | const std::vector<int>& dims |
85 | ) : is_signed_m(is_signed), scaleQ_m(scaleQ), | 92 | ) : conv_type_m(conv_type), |
93 | is_signed_m(is_signed), scaleQ_m(scaleQ), | ||
86 | is_NCHW_m(is_NCHW), dims_m(dims) | 94 | is_NCHW_m(is_NCHW), dims_m(dims) |
87 | {} | 95 | {} |
88 | 96 | ||
@@ -115,10 +123,13 @@ class SubgraphDataConv | |||
115 | void ScaleDequant(const uint8_t *in, std::vector<float*>& out) const; | 123 | void ScaleDequant(const uint8_t *in, std::vector<float*>& out) const; |
116 | 124 | ||
117 | private: | 125 | private: |
118 | //! if tensor needs to be evaluated as signed char | 126 | //! data type conversion, 0: float <-> Q, 1: float <-> float, 2: Q <-> Q |
127 | std::vector<int> conv_type_m; | ||
128 | |||
129 | //! if tensor needs to be evaluated as signed char (if float <-> Q) | ||
119 | std::vector<bool> is_signed_m; | 130 | std::vector<bool> is_signed_m; |
120 | 131 | ||
121 | //! Q value for Quantization and Dequantization | 132 | //! Q value for Quantization and Dequantization (if float <-> Q) |
122 | std::vector<float> scaleQ_m; | 133 | std::vector<float> scaleQ_m; |
123 | 134 | ||
124 | //! the format of external tensors, NCHW or NHWC | 135 | //! the format of external tensors, NCHW or NHWC |
diff --git a/tidl_api/inc/subgraph_runtime.h b/tidl_api/inc/subgraph_runtime.h index 37e771d..b4fc2b7 100644 --- a/tidl_api/inc/subgraph_runtime.h +++ b/tidl_api/inc/subgraph_runtime.h | |||
@@ -32,6 +32,15 @@ | |||
32 | 32 | ||
33 | extern "C" { | 33 | extern "C" { |
34 | 34 | ||
35 | //! @brief Top level API to initialize a TIDL subgraph on device | ||
36 | //! If not invoked ahead of time, TidlRunSubgraph() will call this | ||
37 | //! function before any inference | ||
38 | //! @param total_subgraphs total number of TIDL subgraphs in whole inference | ||
39 | //! @param subgraph_id index of current TIDL subgraph | ||
40 | extern void TidlInitSubgraph(int total_subgraphs, | ||
41 | int subgraph_id | ||
42 | ); | ||
43 | |||
35 | //! @brief Top level inference to run a TIDL subgraph | 44 | //! @brief Top level inference to run a TIDL subgraph |
36 | //! @param total_subgraphs total number of TIDL subgraphs in whole inference | 45 | //! @param total_subgraphs total number of TIDL subgraphs in whole inference |
37 | //! @param subgraph_id index of current TIDL subgraph | 46 | //! @param subgraph_id index of current TIDL subgraph |
diff --git a/tidl_api/src/configuration_parser.cpp b/tidl_api/src/configuration_parser.cpp index f457560..3ad0d2c 100644 --- a/tidl_api/src/configuration_parser.cpp +++ b/tidl_api/src/configuration_parser.cpp | |||
@@ -52,6 +52,7 @@ struct ConfigParser : qi::grammar<Iterator, ascii::space_type> | |||
52 | ConfigParser(Configuration &x) : ConfigParser::base_type(entry) | 52 | ConfigParser(Configuration &x) : ConfigParser::base_type(entry) |
53 | { | 53 | { |
54 | using qi::int_; | 54 | using qi::int_; |
55 | using qi::float_; | ||
55 | using qi::bool_; | 56 | using qi::bool_; |
56 | using qi::lit; | 57 | using qi::lit; |
57 | using qi::lexeme; | 58 | using qi::lexeme; |
@@ -66,6 +67,10 @@ struct ConfigParser : qi::grammar<Iterator, ascii::space_type> | |||
66 | path %= lexeme[+(char_ - '"')]; | 67 | path %= lexeme[+(char_ - '"')]; |
67 | q_path = qi::omit[*char_('"')] >> path >> qi::omit[*char_('"')]; | 68 | q_path = qi::omit[*char_('"')] >> path >> qi::omit[*char_('"')]; |
68 | 69 | ||
70 | // Rules for parsing subgraph data conversion information | ||
71 | intvec = int_ >> *int_; | ||
72 | floatvec = float_ >> *float_; | ||
73 | |||
69 | // Grammar for parsing configuration file | 74 | // Grammar for parsing configuration file |
70 | entry %= | 75 | entry %= |
71 | lit("layerIndex2LayerGroupId") >> '=' >> | 76 | lit("layerIndex2LayerGroupId") >> '=' >> |
@@ -85,7 +90,15 @@ struct ConfigParser : qi::grammar<Iterator, ascii::space_type> | |||
85 | int_[ph::ref(x.quantHistoryParam1)= _1] | | 90 | int_[ph::ref(x.quantHistoryParam1)= _1] | |
86 | lit("quantHistoryParam2") >> '=' >> | 91 | lit("quantHistoryParam2") >> '=' >> |
87 | int_[ph::ref(x.quantHistoryParam2)= _1] | | 92 | int_[ph::ref(x.quantHistoryParam2)= _1] | |
88 | lit("quantMargin") >> '=' >> int_[ph::ref(x.quantMargin)= _1] | 93 | lit("quantMargin") >> '=' >> int_[ph::ref(x.quantMargin)= _1] | |
94 | lit("inConvType") >> '=' >> intvec[ph::ref(x.inConvType) = _1] | | ||
95 | lit("inIsSigned") >> '=' >> intvec[ph::ref(x.inIsSigned) = _1] | | ||
96 | lit("inScaleF2Q") >> '=' >> floatvec[ph::ref(x.inScaleF2Q) = _1] | | ||
97 | lit("inIsNCHW") >> '=' >> intvec[ph::ref(x.inIsNCHW) = _1] | | ||
98 | lit("outConvType") >> '=' >> intvec[ph::ref(x.outConvType) = _1] | | ||
99 | lit("outIsSigned") >> '=' >> intvec[ph::ref(x.outIsSigned) = _1] | | ||
100 | lit("outScaleF2Q") >> '=' >> floatvec[ph::ref(x.outScaleF2Q) = _1] | | ||
101 | lit("outIsNCHW") >> '=' >> intvec[ph::ref(x.outIsNCHW) = _1] | ||
89 | ; | 102 | ; |
90 | } | 103 | } |
91 | 104 | ||
@@ -95,6 +108,9 @@ struct ConfigParser : qi::grammar<Iterator, ascii::space_type> | |||
95 | 108 | ||
96 | qi::rule<Iterator, std::pair<int, int>(), ascii::space_type> id2group; | 109 | qi::rule<Iterator, std::pair<int, int>(), ascii::space_type> id2group; |
97 | qi::rule<Iterator, std::map<int, int>(), ascii::space_type> id2groups; | 110 | qi::rule<Iterator, std::map<int, int>(), ascii::space_type> id2groups; |
111 | |||
112 | qi::rule<Iterator, std::vector<int>(), ascii::space_type> intvec; | ||
113 | qi::rule<Iterator, std::vector<float>(), ascii::space_type> floatvec; | ||
98 | }; | 114 | }; |
99 | 115 | ||
100 | bool Configuration::ReadFromFile(const std::string &file_name) | 116 | bool Configuration::ReadFromFile(const std::string &file_name) |
diff --git a/tidl_api/src/subgraph_data_conv.cpp b/tidl_api/src/subgraph_data_conv.cpp index d8cc11f..6366360 100644 --- a/tidl_api/src/subgraph_data_conv.cpp +++ b/tidl_api/src/subgraph_data_conv.cpp | |||
@@ -26,6 +26,8 @@ | |||
26 | * THE POSSIBILITY OF SUCH DAMAGE. | 26 | * THE POSSIBILITY OF SUCH DAMAGE. |
27 | *****************************************************************************/ | 27 | *****************************************************************************/ |
28 | 28 | ||
29 | #include <cassert> | ||
30 | #include <cstring> | ||
29 | #include "subgraph_data_conv.h" | 31 | #include "subgraph_data_conv.h" |
30 | 32 | ||
31 | using namespace tidl; | 33 | using namespace tidl; |
@@ -75,32 +77,91 @@ const | |||
75 | int offset = 0; | 77 | int offset = 0; |
76 | for (uint32_t d = 0; d < is_NCHW_m.size(); d++) | 78 | for (uint32_t d = 0; d < is_NCHW_m.size(); d++) |
77 | { | 79 | { |
78 | float Q = scaleQ_m[d]; | ||
79 | int N = dims_m[4 * d + 0]; | 80 | int N = dims_m[4 * d + 0]; |
80 | int C = dims_m[4 * d + 1]; | 81 | int C = dims_m[4 * d + 1]; |
81 | int H = dims_m[4 * d + 2]; | 82 | int H = dims_m[4 * d + 2]; |
82 | int W = dims_m[4 * d + 3]; | 83 | int W = dims_m[4 * d + 3]; |
83 | int vmin = is_signed_m[d] ? -128 : 0; | 84 | |
84 | int vmax = is_signed_m[d] ? 127 : 255; | 85 | if (conv_type_m[d] == ConvType::FLOAT_Q) |
85 | float *in_d = in[d]; | ||
86 | if (is_NCHW_m[d]) // no need to transpose external tensor | ||
87 | { | 86 | { |
88 | for (int i = 0; i < N * C * H * W; i++) | 87 | float Q = scaleQ_m[d]; |
89 | out[offset + i] = QuantizeValue(in_d[i], Q, vmin, vmax); | 88 | int vmin = is_signed_m[d] ? -128 : 0; |
89 | int vmax = is_signed_m[d] ? 127 : 255; | ||
90 | float *in_d = in[d]; | ||
91 | if (is_NCHW_m[d] || (C == 1) || (H*W == 1)) | ||
92 | { | ||
93 | // no need to transpose external tensor | ||
94 | for (int i = 0; i < N * C * H * W; i++) | ||
95 | out[offset + i] = QuantizeValue(in_d[i], Q, vmin, vmax); | ||
96 | } | ||
97 | else | ||
98 | { | ||
99 | // need to transpose external tensor | ||
100 | for (int n = 0; n < N; n++) | ||
101 | for (int c = 0; c < C; c++) | ||
102 | for (int h = 0; h < H; h++) | ||
103 | for (int w = 0; w < W; w++) | ||
104 | { | ||
105 | int nchw = GetIndex(n, c, h, w, N, C, H, W); | ||
106 | int nhwc = GetIndex(n, h, w, c, N, H, W, C); | ||
107 | out[offset + nchw] = QuantizeValue(in_d[nhwc], Q, vmin, vmax); | ||
108 | } | ||
109 | } | ||
90 | } | 110 | } |
91 | else // need to transpose external tensor | 111 | else if (conv_type_m[d] == ConvType::FLOAT_FLOAT) |
92 | { | 112 | { |
93 | for (int n = 0; n < N; n++) | 113 | assert((W & 0x3) == 0); // last dimension is bytes |
94 | for (int c = 0; c < C; c++) | 114 | int f_W = W / 4; // number of elements |
95 | for (int h = 0; h < H; h++) | 115 | float *in_d = in[d]; |
96 | for (int w = 0; w < W; w++) | 116 | float *out_d = (float *) (out + offset); |
97 | { | 117 | if (is_NCHW_m[d] || (C == 1) || (H*W == 1)) |
98 | int nchw = GetIndex(n, c, h, w, N, C, H, W); | 118 | { |
99 | int nhwc = GetIndex(n, h, w, c, N, H, W, C); | 119 | // no need to transpose external tensor |
100 | out[offset + nchw] = QuantizeValue(in_d[nhwc], Q, vmin, vmax); | 120 | memcpy(out_d, in_d, N * C * H * W); // W is bytes |
101 | } | 121 | } |
122 | else | ||
123 | { | ||
124 | // need to transpose external tensor | ||
125 | for (int n = 0; n < N; n++) | ||
126 | for (int c = 0; c < C; c++) | ||
127 | for (int h = 0; h < H; h++) | ||
128 | for (int w = 0; w < f_W; w++) | ||
129 | { | ||
130 | int nchw = GetIndex(n, c, h, w, N, C, H, f_W); | ||
131 | int nhwc = GetIndex(n, h, w, c, N, H, f_W, C); | ||
132 | out_d[nchw] = in_d[nhwc]; | ||
133 | } | ||
134 | } | ||
102 | } | 135 | } |
103 | offset += N * C * H * W; | 136 | else if (conv_type_m[d] == ConvType::Q_Q) |
137 | { | ||
138 | uint8_t *in_d = (uint8_t *) &in[d]; | ||
139 | uint8_t *out_d = (out + offset); | ||
140 | if (is_NCHW_m[d] || (C == 1) || (H*W == 1)) | ||
141 | { | ||
142 | // no need to transpose external tensor | ||
143 | memcpy(out_d, in_d, N * C * H * W); | ||
144 | } | ||
145 | else | ||
146 | { | ||
147 | // need to transpose external tensor | ||
148 | for (int n = 0; n < N; n++) | ||
149 | for (int c = 0; c < C; c++) | ||
150 | for (int h = 0; h < H; h++) | ||
151 | for (int w = 0; w < W; w++) | ||
152 | { | ||
153 | int nchw = GetIndex(n, c, h, w, N, C, H, W); | ||
154 | int nhwc = GetIndex(n, h, w, c, N, H, W, C); | ||
155 | out_d[nchw] = in_d[nhwc]; | ||
156 | } | ||
157 | } | ||
158 | } | ||
159 | else | ||
160 | { | ||
161 | assert(false); | ||
162 | } | ||
163 | |||
164 | offset += N * C * H * W; // accumulate in bytes | ||
104 | } | 165 | } |
105 | } | 166 | } |
106 | 167 | ||
@@ -111,31 +172,90 @@ const | |||
111 | int offset = 0; | 172 | int offset = 0; |
112 | for (uint32_t d = 0; d < is_NCHW_m.size(); d++) | 173 | for (uint32_t d = 0; d < is_NCHW_m.size(); d++) |
113 | { | 174 | { |
114 | float Q = scaleQ_m[d]; | ||
115 | float Q_inv = 1.0f / Q; | ||
116 | int N = dims_m[4 * d + 0]; | 175 | int N = dims_m[4 * d + 0]; |
117 | int C = dims_m[4 * d + 1]; | 176 | int C = dims_m[4 * d + 1]; |
118 | int H = dims_m[4 * d + 2]; | 177 | int H = dims_m[4 * d + 2]; |
119 | int W = dims_m[4 * d + 3]; | 178 | int W = dims_m[4 * d + 3]; |
120 | bool S = is_signed_m[d]; | 179 | |
121 | float *out_d = out[d]; | 180 | if (conv_type_m[d] == ConvType::FLOAT_Q) |
122 | if (is_NCHW_m[d]) // no need to transpose external tensor | 181 | { |
182 | float Q = scaleQ_m[d]; | ||
183 | float Q_inv = 1.0f / Q; | ||
184 | bool S = is_signed_m[d]; | ||
185 | float *out_d = out[d]; | ||
186 | if (is_NCHW_m[d] || (C == 1) || (H*W == 1)) | ||
187 | { | ||
188 | // no need to transpose external tensor | ||
189 | for (int i = 0; i < N * C * H * W; i++) | ||
190 | out_d[i] = DequantizeValue(in[offset + i], Q_inv, S); | ||
191 | } | ||
192 | else | ||
193 | { | ||
194 | // need to transpose external tensor | ||
195 | for (int n = 0; n < N; n++) | ||
196 | for (int c = 0; c < C; c++) | ||
197 | for (int h = 0; h < H; h++) | ||
198 | for (int w = 0; w < W; w++) | ||
199 | { | ||
200 | int nchw = GetIndex(n, c, h, w, N, C, H, W); | ||
201 | int nhwc = GetIndex(n, h, w, c, N, H, W, C); | ||
202 | out_d[nhwc] = DequantizeValue(in[offset + nchw], Q_inv, S); | ||
203 | } | ||
204 | } | ||
205 | } | ||
206 | else if (conv_type_m[d] == ConvType::FLOAT_FLOAT) | ||
123 | { | 207 | { |
124 | for (int i = 0; i < N * C * H * W; i++) | 208 | assert((W & 0x3) == 0); // last dimension is bytes |
125 | out_d[i] = DequantizeValue(in[offset + i], Q_inv, S); | 209 | int f_W = W / 4; // number of elements |
210 | float *in_d = (float *) (in + offset); | ||
211 | float *out_d = out[d]; | ||
212 | if (is_NCHW_m[d] || (C == 1) || (H*W == 1)) | ||
213 | { | ||
214 | // no need to transpose external tensor | ||
215 | memcpy(out_d, in_d, N * C * H * W); // W is bytes | ||
216 | } | ||
217 | else | ||
218 | { | ||
219 | // need to transpose external tensor | ||
220 | for (int n = 0; n < N; n++) | ||
221 | for (int c = 0; c < C; c++) | ||
222 | for (int h = 0; h < H; h++) | ||
223 | for (int w = 0; w < f_W; w++) | ||
224 | { | ||
225 | int nchw = GetIndex(n, c, h, w, N, C, H, f_W); | ||
226 | int nhwc = GetIndex(n, h, w, c, N, H, f_W, C); | ||
227 | out_d[nhwc] = in_d[nchw]; | ||
228 | } | ||
229 | } | ||
126 | } | 230 | } |
127 | else // need to transpose external tensor | 231 | else if (conv_type_m[d] == ConvType::Q_Q) |
128 | { | 232 | { |
129 | for (int n = 0; n < N; n++) | 233 | uint8_t *in_d = (uint8_t *) (in + offset); |
130 | for (int c = 0; c < C; c++) | 234 | uint8_t *out_d = (uint8_t * ) &out[d]; |
131 | for (int h = 0; h < H; h++) | 235 | if (is_NCHW_m[d] || (C == 1) || (H*W == 1)) |
132 | for (int w = 0; w < W; w++) | 236 | { |
133 | { | 237 | // no need to transpose external tensor |
134 | int nchw = GetIndex(n, c, h, w, N, C, H, W); | 238 | memcpy(out_d, in_d, N * C * H * W); |
135 | int nhwc = GetIndex(n, h, w, c, N, H, W, C); | 239 | } |
136 | out_d[nhwc] = DequantizeValue(in[offset + nchw], Q_inv, S); | 240 | else |
137 | } | 241 | { |
242 | // need to transpose external tensor | ||
243 | for (int n = 0; n < N; n++) | ||
244 | for (int c = 0; c < C; c++) | ||
245 | for (int h = 0; h < H; h++) | ||
246 | for (int w = 0; w < W; w++) | ||
247 | { | ||
248 | int nchw = GetIndex(n, c, h, w, N, C, H, W); | ||
249 | int nhwc = GetIndex(n, h, w, c, N, H, W, C); | ||
250 | out_d[nhwc] = in_d[nchw]; | ||
251 | } | ||
252 | } | ||
138 | } | 253 | } |
254 | else | ||
255 | { | ||
256 | assert(false); | ||
257 | } | ||
258 | |||
139 | offset += N * C * H * W; | 259 | offset += N * C * H * W; |
140 | } | 260 | } |
141 | } | 261 | } |
diff --git a/tidl_api/src/subgraph_runtime.cpp b/tidl_api/src/subgraph_runtime.cpp index 09905fc..342acd8 100644 --- a/tidl_api/src/subgraph_runtime.cpp +++ b/tidl_api/src/subgraph_runtime.cpp | |||
@@ -74,6 +74,13 @@ void TVM_TidlFunction(int total_subgraphs, int subgraph_id, | |||
74 | using namespace tidl; | 74 | using namespace tidl; |
75 | 75 | ||
76 | 76 | ||
77 | void TidlInitSubgraph(int total_subgraphs, int subgraph_id) | ||
78 | { | ||
79 | ResM& res = ResM::Instance(total_subgraphs); | ||
80 | res.InitSubgraph(subgraph_id); | ||
81 | } | ||
82 | |||
83 | |||
77 | void TidlRunSubgraph(int total_subgraphs, | 84 | void TidlRunSubgraph(int total_subgraphs, |
78 | int subgraph_id, | 85 | int subgraph_id, |
79 | int batch_size, | 86 | int batch_size, |
@@ -199,15 +206,8 @@ void ResM::Init(uint32_t num_subgraphs) | |||
199 | es_m.resize(num_subgraphs_m, nullptr); | 206 | es_m.resize(num_subgraphs_m, nullptr); |
200 | e2s_m.resize(num_subgraphs_m, nullptr); | 207 | e2s_m.resize(num_subgraphs_m, nullptr); |
201 | eops_m = new std::vector<ResEOP>(num_subgraphs_m); | 208 | eops_m = new std::vector<ResEOP>(num_subgraphs_m); |
202 | 209 | in_conv_m.resize(num_subgraphs_m, nullptr); | |
203 | // TODO: this should come from parsing config file | 210 | out_conv_m.resize(num_subgraphs_m, nullptr); |
204 | for (uint32_t i = 0; i < num_subgraphs_m; i++) | ||
205 | { | ||
206 | in_conv_m.push_back(new SubgraphDataConv( | ||
207 | {true}, {128.0f}, {false}, {1,3,224,224})); | ||
208 | out_conv_m.push_back(new SubgraphDataConv( | ||
209 | {false}, {255.0f}, {true}, {1,1,1,1001})); | ||
210 | } | ||
211 | } | 211 | } |
212 | } | 212 | } |
213 | 213 | ||
@@ -219,19 +219,63 @@ void ResM::InitSubgraph(uint32_t subgraph_id) | |||
219 | 219 | ||
220 | std::unique_lock<std::mutex> lock(res_eop.mutex_eops); | 220 | std::unique_lock<std::mutex> lock(res_eop.mutex_eops); |
221 | 221 | ||
222 | // Constructing EOPs if not already constructed | ||
222 | if (res_eop.eops == nullptr) | 223 | if (res_eop.eops == nullptr) |
223 | { | 224 | { |
224 | if (enable_trace_m) | 225 | if (enable_trace_m) |
225 | printf("Subgraph %d: initialing E/EOPs with %d cores\n", | 226 | printf("Subgraph %d: initialing E/EOPs with %d cores\n", |
226 | subgraph_id, num_es_per_subgraph_m); | 227 | subgraph_id, num_es_per_subgraph_m); |
227 | 228 | ||
228 | // Constructing EOPs if not already constructed | 229 | // Read config file |
229 | // Each subgraph -> num_eves_per_subgraph_m EOPs | ||
230 | // Each EOP -> use_count | ||
231 | std::string cfg_file = "subgraph" + std::to_string(subgraph_id) + ".cfg"; | 230 | std::string cfg_file = "subgraph" + std::to_string(subgraph_id) + ".cfg"; |
232 | bool status = cs_m[subgraph_id].ReadFromFile(cfg_file); | 231 | bool status = cs_m[subgraph_id].ReadFromFile(cfg_file); |
233 | assert(status); | 232 | assert(status); |
234 | 233 | ||
234 | // Read the network | ||
235 | sTIDL_Network_t *net = new sTIDL_Network_t; | ||
236 | status = ReadNetworkBinary(cs_m[subgraph_id].netBinFile, | ||
237 | reinterpret_cast<char *>(net)); | ||
238 | assert(status); | ||
239 | |||
240 | // Get data conversion info from configuration | ||
241 | // Get input/output tensors dimensions from network | ||
242 | // Construct data converters at the subgraph boundaries | ||
243 | std::vector<int> inDims, outDims; | ||
244 | for (int32_t layer = 0; layer < net->numLayers; layer++) | ||
245 | { | ||
246 | if (net->TIDLLayers[layer].layerType != (int32_t) TIDL_DataLayer) | ||
247 | continue; | ||
248 | if (net->TIDLLayers[layer].numInBufs <= 0) | ||
249 | { | ||
250 | for (int d = 0; d < 4; d++) | ||
251 | inDims.push_back(net->TIDLLayers[layer].outData[0].dimValues[d]); | ||
252 | } | ||
253 | if (net->TIDLLayers[layer].numOutBufs <= 0) | ||
254 | { | ||
255 | for (int d = 0; d < 4; d++) | ||
256 | outDims.push_back(net->TIDLLayers[layer].inData[0].dimValues[d]); | ||
257 | } | ||
258 | } | ||
259 | assert(cs_m[subgraph_id].inIsNCHW.size() * 4 == inDims.size()); | ||
260 | assert(cs_m[subgraph_id].outIsNCHW.size() * 4 == outDims.size()); | ||
261 | std::vector<bool> inIsSigned, outIsSigned, inIsNCHW, outIsNCHW; | ||
262 | for (int v : cs_m[subgraph_id].inIsSigned) inIsSigned.push_back(v != 0); | ||
263 | for (int v : cs_m[subgraph_id].inIsNCHW) inIsNCHW.push_back(v != 0); | ||
264 | for (int v : cs_m[subgraph_id].outIsSigned) outIsSigned.push_back(v != 0); | ||
265 | for (int v : cs_m[subgraph_id].outIsNCHW) outIsNCHW.push_back(v != 0); | ||
266 | in_conv_m[subgraph_id] = new SubgraphDataConv( | ||
267 | cs_m[subgraph_id].inConvType, | ||
268 | inIsSigned, | ||
269 | cs_m[subgraph_id].inScaleF2Q, | ||
270 | inIsNCHW, | ||
271 | inDims); | ||
272 | out_conv_m[subgraph_id] = new SubgraphDataConv( | ||
273 | cs_m[subgraph_id].outConvType, | ||
274 | outIsSigned, | ||
275 | cs_m[subgraph_id].outScaleF2Q, | ||
276 | outIsNCHW, | ||
277 | outDims); | ||
278 | |||
235 | // Check if last few layers can be offloaded to DSPs | 279 | // Check if last few layers can be offloaded to DSPs |
236 | // and DSPs are available | 280 | // and DSPs are available |
237 | DeviceIds e_ids, e2_ids; | 281 | DeviceIds e_ids, e2_ids; |
@@ -241,10 +285,6 @@ void ResM::InitSubgraph(uint32_t subgraph_id) | |||
241 | // uint32_t num_dsps_used = 0; | 285 | // uint32_t num_dsps_used = 0; |
242 | if (num_eves_m > 0 && num_dsps_m > 0 && ! cs_m[subgraph_id].runFullNet) | 286 | if (num_eves_m > 0 && num_dsps_m > 0 && ! cs_m[subgraph_id].runFullNet) |
243 | { | 287 | { |
244 | sTIDL_Network_t *net = new sTIDL_Network_t; | ||
245 | bool status = ReadNetworkBinary(cs_m[subgraph_id].netBinFile, | ||
246 | reinterpret_cast<char *>(net)); | ||
247 | assert(status); | ||
248 | int32_t start_layer = net->numLayers -1; | 288 | int32_t start_layer = net->numLayers -1; |
249 | int32_t end_layer = 0; | 289 | int32_t end_layer = 0; |
250 | if (net->TIDLLayers[start_layer].layerType == (int32_t) TIDL_DataLayer) | 290 | if (net->TIDLLayers[start_layer].layerType == (int32_t) TIDL_DataLayer) |
@@ -286,7 +326,7 @@ void ResM::InitSubgraph(uint32_t subgraph_id) | |||
286 | cs_m[subgraph_id].runFullNet = true; | 326 | cs_m[subgraph_id].runFullNet = true; |
287 | cs_m[subgraph_id].enableApiTrace = enable_trace_m; | 327 | cs_m[subgraph_id].enableApiTrace = enable_trace_m; |
288 | 328 | ||
289 | // Constructing Es and EOPs | 329 | // Constructing Es and EOPs, each subgraph -> num_eves_per_subgraph_m EOPs |
290 | res_eop.eops = new std::vector<ExecutionObjectPipeline*>; | 330 | res_eop.eops = new std::vector<ExecutionObjectPipeline*>; |
291 | uint32_t buffer_factor = 2; // double buffering factor | 331 | uint32_t buffer_factor = 2; // double buffering factor |
292 | if (num_eves_m > 0) | 332 | if (num_eves_m > 0) |