/******************************************************************************
 * Copyright (c) 2019 Texas Instruments Incorporated - http://www.ti.com/
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of Texas Instruments Incorporated nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 * THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/

#include <pthread.h>
#define LOKI_PTHREAD_H
#include <loki/Singleton.h>

#include "util.h"
#include "subgraph_runtime.h"
#include "subgraph_runtime_impl.h"

#if 0
// Auto-generated code from the Relay/TVM compilation step after
// partitioning and lowering to the backend implementation

void TVM_TidlFunction(int total_subgraphs, int subgraph_id,
                      int num_input_tensors, int num_output_tensors,
                      PackedArgs args)
{
    float** in_data  = new float*[num_inputs_per_inference * batch_size];
    float** out_data = new float*[num_outputs_per_inference * batch_size];

    for (int j = 0; j < batch_size; j++)
    {
        for (int i = 0; i < num_inputs_per_inference + num_outputs_per_inference;
             i++)
            if (i < num_inputs_per_inference)
                in_data[j * num_inputs_per_inference + i] = args.data[i][j];
            else
                out_data[j * num_outputs_per_inference + i - num_inputs_per_inference]
                    = args.data[i][j];
    }

    // call into this function in libtidl.so
    // dlopen("libtidl_api.so")
    // TidlFunc = dlsym("TidlRunSubgraph");
    (*TidlFunc)(total_subgraphs, subgraph_id, batch_size,
                num_inputs_per_inference, num_outputs_per_inference,
                in_data, out_data);

    delete [] in_data;
    delete [] out_data;
}
#endif
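
// Illustrative only: a minimal sketch (not part of the runtime) of how an
// application or generated wrapper might drive the exported C API below.
// The variable names (input_tensors, output_tensors, num_inputs_per_inference,
// num_outputs_per_inference) are hypothetical.
//
//   int batch = TidlGetPreferredBatchSize(total_subgraphs);
//   TidlInitSubgraph(total_subgraphs, /* subgraph_id */ 0);
//   // input_tensors:  batch * num_inputs_per_inference  float* entries
//   // output_tensors: batch * num_outputs_per_inference float* entries
//   TidlRunSubgraph(total_subgraphs, 0, batch,
//                   num_inputs_per_inference, num_outputs_per_inference,
//                   input_tensors, output_tensors);
//   TidlFreeSubgraph(total_subgraphs, 0);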

// Singleton ResM .cpp
using namespace tidl;

int TidlGetPreferredBatchSize(int total_subgraphs)
{
    ResM& res = ResM::Instance(total_subgraphs);
    return res.GetNumEs();
}

void TidlInitSubgraph(int total_subgraphs, int subgraph_id)
{
    ResM& res = ResM::Instance(total_subgraphs);
    res.InitSubgraph(subgraph_id);
}

void TidlFreeSubgraph(int total_subgraphs, int subgraph_id)
{
    ResM& res = ResM::Instance(total_subgraphs);
    res.FreeSubgraph(subgraph_id);
}

void TidlRunSubgraph(int total_subgraphs,
                     int subgraph_id,
                     int batch_size,
                     int num_inputs_per_inference,
                     int num_outputs_per_inference,
                     float **input_tensors,
                     float **output_tensors
                    )
{
    ResM& res = ResM::Instance(total_subgraphs);
    res.InitSubgraph(subgraph_id);
    int num_eops = res.GetNumEOPs(subgraph_id);
    if (num_eops > batch_size)  num_eops = batch_size;
    std::vector<ExecutionObjectPipeline*> eops(num_eops);
    for (int i = 0; i < num_eops; i++)
        eops[i] = res.GetEOP(subgraph_id);
    const SubgraphDataConv& in_conv  = res.GetInConv(subgraph_id);
    const SubgraphDataConv& out_conv = res.GetOutConv(subgraph_id);

    std::vector<std::vector<float *>> in_data_v(batch_size),
                                      out_data_v(batch_size);
    for (int frame_idx = 0; frame_idx < batch_size; frame_idx++)
    {
        for (int i = 0; i < num_inputs_per_inference; i++)
            in_data_v[frame_idx].emplace_back(input_tensors[
                                    frame_idx * num_inputs_per_inference + i]);
        for (int i = 0; i < num_outputs_per_inference; i++)
            out_data_v[frame_idx].emplace_back(output_tensors[
                                    frame_idx * num_outputs_per_inference + i]);
    }

    // Process batch_size frames with the available EOPs in a pipelined manner;
    // run num_eops extra iterations to flush the pipeline (epilogue)
    for (int frame_idx = 0; frame_idx < batch_size + num_eops; frame_idx++)
    {
        ExecutionObjectPipeline *eop = eops[frame_idx % num_eops];
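
        // Drain stage: if this EOP has a frame in flight, wait for it and
        // dequantize its output back to the float tensors of the frame
        // submitted num_eops iterations earlier.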
        if (eop->ProcessFrameWait())
        {
            const uint8_t *out_data = (const uint8_t*) eop->GetOutputBufferPtr();
            out_conv.ScaleDequant(out_data, out_data_v[frame_idx - num_eops]);
        }
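
        // Fill stage: while frames remain, quantize the next float input into
        // the EOP's input buffer and start asynchronous processing.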
        if (frame_idx < batch_size)
        {
            uint8_t *in_data = (uint8_t *) eop->GetInputBufferPtr();
            in_conv.ScaleQuant(in_data_v[frame_idx], in_data);
            eop->ProcessFrameStartAsync();
        }
    }

    for (int i = 0; i < num_eops; i++)
        res.FreeEOP(subgraph_id, eops[i]);
}
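
// Shared resource manager held in a Loki singleton; the ClassLevelLockable
// threading policy makes concurrent ResM::Instance() calls safe.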
typedef Loki::SingletonHolder <tidl::ResM, Loki::CreateUsingNew,
                Loki::DefaultLifetime, Loki::ClassLevelLockable> tidlSingleResM;

ResM::ResM() : enable_trace_m(false), num_subgraphs_m(0),
               num_lg2_dsps_used_m(0), eops_m(nullptr)
{
}

ResM::~ResM()
{
    for (uint32_t i = 0; i < num_subgraphs_m; i++)
        FreeSubgraph(i);

    delete eops_m;
    eops_m = nullptr;
}

void ResM::FreeSubgraph(uint32_t subgraph_id)
{
    assert(subgraph_id < num_subgraphs_m);

    if (eops_m != nullptr)
    {
        ResEOP& res_eop = (*eops_m)[subgraph_id];
        if (res_eop.eops != nullptr)
        {
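            // The input/output buffers were malloc'd in InitSubgraph(), so
            // free them here before deleting each pipeline object.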
            for (const ExecutionObjectPipeline* eop : *(res_eop.eops))
            {
                free(eop->GetInputBufferPtr());
                free(eop->GetOutputBufferPtr());
                delete eop;
            }
            delete res_eop.eops;
            res_eop.eops = nullptr;
        }
    }

    delete es_m[subgraph_id];
    es_m[subgraph_id] = nullptr;

    delete e2s_m[subgraph_id];
    e2s_m[subgraph_id] = nullptr;

    delete in_conv_m[subgraph_id];
    in_conv_m[subgraph_id] = nullptr;

    delete out_conv_m[subgraph_id];
    out_conv_m[subgraph_id] = nullptr;
}

ResM& ResM::Instance(uint32_t total_num_subgraphs)
{
    ResM& res = tidlSingleResM::Instance();
    res.Init(total_num_subgraphs);
    return res;
}
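
// One-time initialization: Init() is guarded by mutex_init_m and by the
// num_subgraphs_m == 0 check, so subsequent calls from Instance() are no-ops.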
void ResM::Init(uint32_t num_subgraphs)
{
    std::lock_guard<std::mutex> lock(mutex_init_m);

    if (num_subgraphs_m == 0)
    {
        num_subgraphs_m = num_subgraphs;

        if (getenv("TIDL_SUBGRAPH_TRACE") != nullptr)  enable_trace_m = true;

        // Allocating resources
        num_eves_m = Executor::GetNumDevices(DeviceType::EVE);
        num_dsps_m = Executor::GetNumDevices(DeviceType::DSP);

        assert(num_eves_m > 0 || num_dsps_m > 0);
        assert(num_subgraphs_m <= num_eves_m || num_subgraphs_m <= num_dsps_m);
        num_es_per_subgraph_m = num_eves_m / num_subgraphs_m;
        if (num_eves_m == 0)
            num_es_per_subgraph_m = num_dsps_m / num_subgraphs_m;

        cs_m.resize(num_subgraphs_m);
        es_m.resize(num_subgraphs_m, nullptr);
        e2s_m.resize(num_subgraphs_m, nullptr);
        eops_m = new std::vector<ResEOP>(num_subgraphs_m);
        in_conv_m.resize(num_subgraphs_m, nullptr);
        out_conv_m.resize(num_subgraphs_m, nullptr);
    }
}

void ResM::InitSubgraph(uint32_t subgraph_id)
{
    assert(subgraph_id < num_subgraphs_m);
    ResEOP& res_eop = (*eops_m)[subgraph_id];

    std::unique_lock<std::mutex> lock(res_eop.mutex_eops);

    // Constructing EOPs if not already constructed
    if (res_eop.eops == nullptr)
    {
        if (enable_trace_m)
            printf("Subgraph %d: initializing E/EOPs with %d cores\n",
                   subgraph_id, num_es_per_subgraph_m);

        // Read config file
        std::string cfg_file = "subgraph" + std::to_string(subgraph_id) + ".cfg";
        bool status = cs_m[subgraph_id].ReadFromFile(cfg_file);
        assert(status);

        // Read the network
        sTIDL_Network_t *net = new sTIDL_Network_t;
        status = ReadNetworkBinary(cs_m[subgraph_id].netBinFile,
                                   reinterpret_cast<char *>(net));
        assert(status);

        // Get data conversion info from the configuration
        // Get input/output tensor dimensions from the network
        // Construct data converters at the subgraph boundaries
        std::vector<int> inDims, outDims;
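        // A TIDL_DataLayer with no input buffers is a subgraph input; one with
        // no output buffers is a subgraph output. Each contributes four
        // dimension values to inDims/outDims.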
        for (int32_t layer = 0; layer < net->numLayers; layer++)
        {
            if (net->TIDLLayers[layer].layerType != (int32_t) TIDL_DataLayer)
                continue;
            if (net->TIDLLayers[layer].numInBufs <= 0)
            {
                for (int d = 0; d < 4; d++)
                    inDims.push_back(net->TIDLLayers[layer].outData[0].dimValues[d]);
            }
            if (net->TIDLLayers[layer].numOutBufs <= 0)
            {
                for (int d = 0; d < 4; d++)
                    outDims.push_back(net->TIDLLayers[layer].inData[0].dimValues[d]);
            }
        }
        assert(cs_m[subgraph_id].inIsNCHW.size() * 4 == inDims.size());
        assert(cs_m[subgraph_id].outIsNCHW.size() * 4 == outDims.size());
        std::vector<bool> inIsSigned, outIsSigned, inIsNCHW, outIsNCHW;
        for (int v : cs_m[subgraph_id].inIsSigned)  inIsSigned.push_back(v != 0);
        for (int v : cs_m[subgraph_id].inIsNCHW)    inIsNCHW.push_back(v != 0);
        for (int v : cs_m[subgraph_id].outIsSigned) outIsSigned.push_back(v != 0);
        for (int v : cs_m[subgraph_id].outIsNCHW)   outIsNCHW.push_back(v != 0);
        in_conv_m[subgraph_id] = new SubgraphDataConv(
                                               cs_m[subgraph_id].inConvType,
                                               inIsSigned,
                                               cs_m[subgraph_id].inScaleF2Q,
                                               inIsNCHW,
                                               inDims);
        out_conv_m[subgraph_id] = new SubgraphDataConv(
                                               cs_m[subgraph_id].outConvType,
                                               outIsSigned,
                                               cs_m[subgraph_id].outScaleF2Q,
                                               outIsNCHW,
                                               outDims);

        // Check if last few layers can be offloaded to DSPs
        // and DSPs are available
        DeviceIds e_ids, e2_ids;
        for (uint32_t i = 0; i < num_es_per_subgraph_m; i++)
            e_ids.insert(static_cast<DeviceId>(
                                      subgraph_id * num_es_per_subgraph_m + i));

        // uint32_t num_dsps_used = 0;
        if (num_eves_m > 0 && num_dsps_m > 0 && ! cs_m[subgraph_id].runFullNet)
        {
            if (cs_m[subgraph_id].layerIndex2LayerGroupId.empty())
            {
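                // Walk backwards from the last non-data layer and stop at the
                // first layer that is not SoftMax / InnerProduct / Pooling;
                // everything after that point is assigned to layer group 2
                // so it can run on a DSP.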
                int32_t start_layer = net->numLayers - 1;
                int32_t end_layer = 0;
                if (net->TIDLLayers[start_layer].layerType == (int32_t) TIDL_DataLayer)
                    start_layer -= 1;
                if (net->TIDLLayers[end_layer].layerType == (int32_t) TIDL_DataLayer)
                    end_layer += 1;
                int32_t i = start_layer;
                for ( ; i > end_layer; i--)
                {
                    int32_t layer_type = net->TIDLLayers[i].layerType;
                    if (layer_type != (int32_t) TIDL_SoftMaxLayer &&
                        layer_type != (int32_t) TIDL_InnerProductLayer &&
                        layer_type != (int32_t) TIDL_PoolingLayer)
                        break;
                }
                i += 1;
                if (i <= start_layer)
                {
                    if (num_lg2_dsps_used_m < num_dsps_m)
                    {
                        if (enable_trace_m)
                            printf("Subgraph %d: assign layers %d to %d to group 2 for DSP\n",
                                   subgraph_id, i, start_layer);
                        while (i <= start_layer)
                            cs_m[subgraph_id].layerIndex2LayerGroupId[i++] = 2;
                    }
                }
            }
            else
            {
                if (enable_trace_m)
                    printf("Subgraph %d: using layer2group map in config file for DSP\n",
                           subgraph_id);
            }

            if (! cs_m[subgraph_id].layerIndex2LayerGroupId.empty())
            {
                e2_ids.insert(static_cast<DeviceId>(num_lg2_dsps_used_m));
                num_lg2_dsps_used_m += 1;
                if (num_subgraphs_m == 1)  // Allocate all DSPs if only one subgraph
                {
                    while (num_lg2_dsps_used_m < num_dsps_m)
                        e2_ids.insert(static_cast<DeviceId>(num_lg2_dsps_used_m++));
                }
            }
        }
        delete net;

        if (e2_ids.empty())
            cs_m[subgraph_id].runFullNet = true;
        cs_m[subgraph_id].enableApiTrace = enable_trace_m;

        // Construct Executors and EOPs; each subgraph gets
        // buffer_factor * num_es_per_subgraph_m EOPs
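        // When trailing layers were assigned to layer group 2, each EOP chains
        // an EVE ExecutionObject (group 1) with a DSP ExecutionObject (group 2)
        // so the two devices form a two-stage pipeline; otherwise an EOP wraps
        // a single ExecutionObject.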
        res_eop.eops = new std::vector<ExecutionObjectPipeline*>;
        uint32_t buffer_factor = 2;  // double buffering factor
        if (num_eves_m > 0)
        {
            es_m[subgraph_id] = new Executor(DeviceType::EVE, e_ids,
                                             cs_m[subgraph_id], 1);
            if (! e2_ids.empty())
            {
                e2s_m[subgraph_id] = new Executor(DeviceType::DSP, e2_ids,
                                                  cs_m[subgraph_id], 2);
                for (uint32_t j = 0; j < buffer_factor; j++)
                    for (uint32_t i = 0; i < num_es_per_subgraph_m; i++)
                        res_eop.eops->emplace_back(new ExecutionObjectPipeline(
                                    {(*es_m[subgraph_id])[i],
                                     (*e2s_m[subgraph_id])[i % e2_ids.size()]}));
            }
            else
            {
                for (uint32_t j = 0; j < buffer_factor; j++)
                    for (uint32_t i = 0; i < num_es_per_subgraph_m; i++)
                        res_eop.eops->emplace_back(new ExecutionObjectPipeline(
                                    {(*es_m[subgraph_id])[i]}));
            }
        }
        else
        {
            es_m[subgraph_id] = new Executor(DeviceType::DSP, e_ids,
                                             cs_m[subgraph_id], 1);
            for (uint32_t j = 0; j < buffer_factor; j++)
                for (uint32_t i = 0; i < num_es_per_subgraph_m; i++)
                    res_eop.eops->emplace_back(new ExecutionObjectPipeline(
                                {(*es_m[subgraph_id])[i]}));
        }

        if (enable_trace_m)
            printf("Subgraph %d: Allocating input/output buffers for %d EOPs\n",
                   subgraph_id, (int) res_eop.eops->size());
        // Allocate input/output buffers
        for (auto eop : *(res_eop.eops))
        {
            size_t in_size  = eop->GetInputBufferSizeInBytes();
            size_t out_size = eop->GetOutputBufferSizeInBytes();
            void*  in_ptr   = malloc(in_size);
            void*  out_ptr  = malloc(out_size);
            assert(in_ptr != nullptr && out_ptr != nullptr);

            ArgInfo in(in_ptr, in_size);
            ArgInfo out(out_ptr, out_size);
            eop->SetInputOutputBuffer(in, out);
        }

        res_eop.free_eop_index = 0;
        res_eop.is_used.resize(res_eop.eops->size(), false);
    }
}

uint32_t ResM::GetNumEOPs(uint32_t subgraph_id)
{
    assert(subgraph_id < num_subgraphs_m);
    ResEOP& res_eop = (*eops_m)[subgraph_id];
    assert(res_eop.eops != nullptr);

    return res_eop.eops->size();
}

ExecutionObjectPipeline* ResM::GetEOP(uint32_t subgraph_id)
{
    assert(subgraph_id < num_subgraphs_m);
    ResEOP& res_eop = (*eops_m)[subgraph_id];
    assert(res_eop.eops != nullptr);

    std::unique_lock<std::mutex> lock(res_eop.mutex_eops);

    // Return an available EOP (round robin allocation)
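    // Blocks on cv_eops until FreeEOP() marks the EOP at free_eop_index unused.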
    uint32_t curr_eop = res_eop.free_eop_index;
    res_eop.cv_eops.wait(lock, [this, subgraph_id, curr_eop]{
        return this->eops_m->at(subgraph_id).is_used[curr_eop] == false; });
    res_eop.is_used[curr_eop] = true;
    res_eop.free_eop_index = (curr_eop + 1) % res_eop.eops->size();
    if (enable_trace_m)
        printf("Subgraph %d: return EOP %d for GetEOP()\n", subgraph_id, curr_eop);
    return res_eop.eops->at(curr_eop);
}

void ResM::FreeEOP(uint32_t subgraph_id, ExecutionObjectPipeline* eop)
{
    assert(subgraph_id < num_subgraphs_m);
    ResEOP& res_eop = (*eops_m)[subgraph_id];
    assert(res_eop.eops != nullptr);

    {
        std::unique_lock<std::mutex> lock(res_eop.mutex_eops);
        for (uint32_t i = 0; i < res_eop.is_used.size(); i++)
            if (res_eop.eops->at(i) == eop)
            {
                res_eop.is_used[i] = false;
                if (enable_trace_m)
                    printf("Subgraph %d: FreeEOP %d\n", subgraph_id, i);
                break;
            }
    }
    res_eop.cv_eops.notify_all();
}

Configuration& ResM::GetConfiguration(uint32_t subgraph_id)
{
    assert(subgraph_id < num_subgraphs_m);
    assert((*eops_m)[subgraph_id].eops != nullptr);
    return cs_m[subgraph_id];
}

const SubgraphDataConv& ResM::GetInConv(uint32_t subgraph_id)
{
    assert(subgraph_id < num_subgraphs_m);
    assert(in_conv_m[subgraph_id] != nullptr);
    return *in_conv_m[subgraph_id];
}

const SubgraphDataConv& ResM::GetOutConv(uint32_t subgraph_id)
{
    assert(subgraph_id < num_subgraphs_m);
    assert(out_conv_m[subgraph_id] != nullptr);
    return *out_conv_m[subgraph_id];
}