/******************************************************************************
 * Copyright (c) 2019 Texas Instruments Incorporated - http://www.ti.com/
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *      * Redistributions of source code must retain the above copyright
 *        notice, this list of conditions and the following disclaimer.
 *      * Redistributions in binary form must reproduce the above copyright
 *        notice, this list of conditions and the following disclaimer in the
 *        documentation and/or other materials provided with the distribution.
 *      * Neither the name of Texas Instruments Incorporated nor the
 *        names of its contributors may be used to endorse or promote products
 *        derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 * THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/

#include <pthread.h>
#define LOKI_PTHREAD_H
#include <loki/Singleton.h>

#include "util.h"
#include "subgraph_runtime.h"
#include "subgraph_runtime_impl.h"


#if 0
// Auto-generated code from Relay/TVM compilation step after
// partitioning and lowering to backend implementation
// (Illustrative pseudocode: batch_size and num_{inputs,outputs}_per_inference
// are assumed to be derived from the tensor counts and the packed args.)

void TVM_TidlFunction(int total_subgraphs, int subgraph_id,
                      int num_input_tensors, int num_output_tensors,
                      PackedArgs args)
{
    float** in_data  = new float*[num_inputs_per_inference * batch_size];
    float** out_data = new float*[num_outputs_per_inference * batch_size];

    // De-interleave the packed args into per-frame input and output pointers
    for (int j = 0; j < batch_size; j++)
    {
        for (int i = 0; i < num_inputs_per_inference + num_outputs_per_inference;
             i++)
            if (i < num_inputs_per_inference)
                in_data[j * num_inputs_per_inference + i] = args.data[i][j];
            else
                out_data[j * num_outputs_per_inference + i - num_inputs_per_inference]
                    = args.data[i][j];
    }

    // call into this function in libtidl.so
    // dlopen("libtidl_api.so")
    // TidlFunc = dlsym("TidlRunSubgraph");
    (*TidlFunc)(total_subgraphs, subgraph_id, batch_size,
                num_inputs_per_inference, num_outputs_per_inference,
                in_data, out_data);

    delete [] in_data;
    delete [] out_data;
}
#endif


// Singleton ResM .cpp
using namespace tidl;


void TidlRunSubgraph(int total_subgraphs,
                     int subgraph_id,
                     int batch_size,
                     int num_inputs_per_inference,
                     int num_outputs_per_inference,
                     float **input_tensors,
                     float **output_tensors
                    )
{
    ResM& res = ResM::Instance(total_subgraphs);
    res.InitSubgraph(subgraph_id);
    int num_eops = res.GetNumEOPs(subgraph_id);
    if (num_eops > batch_size)  num_eops = batch_size;  // pipeline depth <= batch
    std::vector<ExecutionObjectPipeline*> eops(num_eops);
    for (int i = 0; i < num_eops; i++)
        eops[i] = res.GetEOP(subgraph_id);
    const SubgraphDataConv& in_conv  = res.GetInConv(subgraph_id);
    const SubgraphDataConv& out_conv = res.GetOutConv(subgraph_id);

    // Gather per-frame input/output tensor pointers
    std::vector<std::vector<float *>> in_data_v(batch_size),
                                      out_data_v(batch_size);
    for (int frame_idx = 0; frame_idx < batch_size; frame_idx++)
    {
        for (int i = 0; i < num_inputs_per_inference; i++)
            in_data_v[frame_idx].emplace_back(input_tensors[
                                     frame_idx * num_inputs_per_inference + i]);
        for (int i = 0; i < num_outputs_per_inference; i++)
            out_data_v[frame_idx].emplace_back(output_tensors[
                                    frame_idx * num_outputs_per_inference + i]);
    }

    // Process batch_size frames with the available EOPs in a pipelined
    // manner; run num_eops additional iterations to flush the pipeline
    // (epilogue)
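    // For example (illustrative), with num_eops == 2 and batch_size == 4:
    //   iter 0: start frame 0 on EOP0
    //   iter 1: start frame 1 on EOP1
    //   iter 2: wait for frame 0, dequantize it, start frame 2 on EOP0
    //   iter 3: wait for frame 1, dequantize it, start frame 3 on EOP1
    //   iter 4: wait for frame 2, dequantize it  (epilogue)
    //   iter 5: wait for frame 3, dequantize it  (epilogue)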
    for (int frame_idx = 0; frame_idx < batch_size + num_eops; frame_idx++)
    {
        ExecutionObjectPipeline *eop = eops[frame_idx % num_eops];

        // Wait for the frame previously started on this EOP, if any,
        // and dequantize its output into the caller's buffers
        if (eop->ProcessFrameWait())
        {
            const uint8_t *out_data = (const uint8_t*) eop->GetOutputBufferPtr();
            out_conv.ScaleDequant(out_data, out_data_v[frame_idx - num_eops]);
        }

        // Quantize the next frame's input and kick off processing
        if (frame_idx < batch_size)
        {
            uint8_t *in_data = (uint8_t *) eop->GetInputBufferPtr();
            in_conv.ScaleQuant(in_data_v[frame_idx], in_data);
            eop->ProcessFrameStartAsync();
        }
    }

    for (int i = 0; i < num_eops; i++)
        res.FreeEOP(subgraph_id, eops[i]);
}
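
// Illustrative usage sketch (not generated code): how a caller might drive
// TidlRunSubgraph for a single-subgraph model with a batch of two frames and
// one input/output tensor per inference. Buffer names and sizes here are
// hypothetical; they match the default 1x3x224x224 -> 1x1x1x1001 conversions
// configured in ResM::Init below.
#if 0
void ExampleCaller()
{
    static float in0[3*224*224], in1[3*224*224];  // NCHW float inputs
    static float out0[1001],     out1[1001];      // dequantized outputs
    float* inputs[]  = { in0, in1 };
    float* outputs[] = { out0, out1 };
    TidlRunSubgraph(/* total_subgraphs */ 1, /* subgraph_id */ 0,
                    /* batch_size */ 2, /* num_inputs_per_inference */ 1,
                    /* num_outputs_per_inference */ 1, inputs, outputs);
}
#endif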


typedef Loki::SingletonHolder <tidl::ResM, Loki::CreateUsingNew,
                Loki::DefaultLifetime, Loki::ClassLevelLockable> tidlSingleResM;
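// Policy choices: CreateUsingNew allocates the singleton with operator new,
// DefaultLifetime destroys it at program exit, and ClassLevelLockable
// serializes concurrent calls to Instance(), so multiple runtime threads can
// safely share one ResM.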

ResM::ResM() : enable_trace_m(false), num_subgraphs_m(0),
               num_lg2_dsps_used_m(0), eops_m(nullptr)
{
}

ResM::~ResM()
{
    if (eops_m != nullptr)
    {
        for (const ResEOP& res_eop : *eops_m)
        {
            if (res_eop.eops != nullptr)
            {
                for (const ExecutionObjectPipeline* eop : *(res_eop.eops))
                {
                    free(eop->GetInputBufferPtr());
                    free(eop->GetOutputBufferPtr());
                    delete eop;
                }
            }
        }
        delete eops_m;
        eops_m = nullptr;
    }

    for (const Executor* e : es_m)
        if (e != nullptr) delete e;
    for (const Executor* e : e2s_m)
        if (e != nullptr) delete e;
    for (SubgraphDataConv *dc : in_conv_m)
        if (dc != nullptr) delete dc;
    for (SubgraphDataConv *dc : out_conv_m)
        if (dc != nullptr) delete dc;
}

ResM& ResM::Instance(uint32_t total_num_subgraphs)
{
    ResM& res = tidlSingleResM::Instance();
    res.Init(total_num_subgraphs);
    return res;
}

void ResM::Init(uint32_t num_subgraphs)
{
    std::lock_guard<std::mutex> lock(mutex_init_m);

    if (num_subgraphs_m == 0)
    {
        num_subgraphs_m = num_subgraphs;

        if (getenv("TIDL_SUBGRAPH_TRACE") != nullptr)  enable_trace_m = true;

        // Allocating resources
        num_eves_m = Executor::GetNumDevices(DeviceType::EVE);
        num_dsps_m = Executor::GetNumDevices(DeviceType::DSP);

        assert(num_eves_m > 0 || num_dsps_m > 0);
        assert(num_subgraphs_m <= num_eves_m || num_subgraphs_m <= num_dsps_m);

        // Partition homogeneous cores evenly across subgraphs
        num_es_per_subgraph_m = num_eves_m / num_subgraphs_m;
        if (num_eves_m == 0)
            num_es_per_subgraph_m = num_dsps_m / num_subgraphs_m;
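        // e.g. (illustrative) 4 EVEs and 2 subgraphs -> 2 EVEs per subgraph;
        // on a DSP-only device with 2 DSPs and 1 subgraph, that subgraph
        // gets both DSPs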

        cs_m.resize(num_subgraphs_m);
        es_m.resize(num_subgraphs_m, nullptr);
        e2s_m.resize(num_subgraphs_m, nullptr);
        eops_m = new std::vector<ResEOP>(num_subgraphs_m);

        // TODO: this should come from parsing config file
        for (uint32_t i = 0; i < num_subgraphs_m; i++)
        {
            in_conv_m.push_back(new SubgraphDataConv(
                                   {true}, {128.0f}, {false}, {1,3,224,224}));
            out_conv_m.push_back(new SubgraphDataConv(
                                   {false}, {255.0f}, {true}, {1,1,1,1001}));
        }
    }
}


void ResM::InitSubgraph(uint32_t subgraph_id)
{
    assert(subgraph_id < num_subgraphs_m);
    ResEOP& res_eop = (*eops_m)[subgraph_id];

    std::unique_lock<std::mutex> lock(res_eop.mutex_eops);

    if (res_eop.eops == nullptr)
    {
        if (enable_trace_m)
            printf("Subgraph %u: initializing E/EOPs with %u cores\n",
                   subgraph_id, num_es_per_subgraph_m);

        // Constructing EOPs if not already constructed
        // Each subgraph -> buffer_factor * num_es_per_subgraph_m EOPs
        // Each EOP -> an is_used flag for round robin allocation
        std::string cfg_file = "subgraph" + std::to_string(subgraph_id) + ".cfg";
        bool status = cs_m[subgraph_id].ReadFromFile(cfg_file);
        assert(status);

        // Check if the last few layers can be offloaded to DSPs
        // and DSPs are available
        DeviceIds e_ids, e2_ids;
        for (uint32_t i = 0; i < num_es_per_subgraph_m; i++)
            e_ids.insert(static_cast<DeviceId>(
                                     subgraph_id * num_es_per_subgraph_m + i));

        if (num_eves_m > 0 && num_dsps_m > 0 && ! cs_m[subgraph_id].runFullNet)
        {
            sTIDL_Network_t *net = new sTIDL_Network_t;
            bool status = ReadNetworkBinary(cs_m[subgraph_id].netBinFile,
                                            reinterpret_cast<char *>(net));
            assert(status);

            // Skip the data layers that bracket the network
            int32_t start_layer = net->numLayers - 1;
            int32_t end_layer = 0;
            if (net->TIDLLayers[start_layer].layerType == (int32_t) TIDL_DataLayer)
                start_layer -= 1;
            if (net->TIDLLayers[end_layer].layerType == (int32_t) TIDL_DataLayer)
                end_layer += 1;

            // Scan backward from the last layer, collecting the trailing run
            // of SoftMax/InnerProduct/Pooling layers that a DSP can execute
            int32_t i = start_layer;
            for ( ; i > end_layer; i--)
            {
                int32_t layer_type = net->TIDLLayers[i].layerType;
                if (layer_type != (int32_t) TIDL_SoftMaxLayer &&
                    layer_type != (int32_t) TIDL_InnerProductLayer &&
                    layer_type != (int32_t) TIDL_PoolingLayer)
                    break;
            }
            i += 1;  // i is now the first layer of the offloadable tail

            // Assign the tail to layer group 2, to be executed on a DSP
            if (i <= start_layer)
            {
                if (num_lg2_dsps_used_m < num_dsps_m)
                {
                    if (enable_trace_m)
                        printf("Subgraph %u: assign layers %d to %d to group 2 for DSP\n",
                               subgraph_id, i, start_layer);
                    while (i <= start_layer)
                        cs_m[subgraph_id].layerIndex2LayerGroupId[i++] = 2;
                    e2_ids.insert(static_cast<DeviceId>(num_lg2_dsps_used_m));
                    num_lg2_dsps_used_m += 1;
                    if (num_subgraphs_m == 1)  // Allocate all DSPs if only one subgraph
                    {
                        while (num_lg2_dsps_used_m < num_dsps_m)
                            e2_ids.insert(static_cast<DeviceId>(num_lg2_dsps_used_m++));
                    }
                }
            }
            delete net;
        }

        // If no DSP tail was carved off, run the full network in layer group 1
        if (e2_ids.empty())
            cs_m[subgraph_id].runFullNet = true;
        cs_m[subgraph_id].enableApiTrace = enable_trace_m;

        // Constructing Es and EOPs
        res_eop.eops = new std::vector<ExecutionObjectPipeline*>;
        uint32_t buffer_factor = 2;  // double buffering factor
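        // (Two EOPs per ExecutionObject let the host quantize the next
        //  frame's input while the device is still processing the current
        //  one, matching the pipelined loop in TidlRunSubgraph above.)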
        if (num_eves_m > 0)
        {
            // Layer group 1 runs on EVEs; layer group 2 (if any) on DSPs
            es_m[subgraph_id] = new Executor(DeviceType::EVE, e_ids,
                                             cs_m[subgraph_id], 1);
            if (! e2_ids.empty())
            {
                e2s_m[subgraph_id] = new Executor(DeviceType::DSP, e2_ids,
                                                  cs_m[subgraph_id], 2);
                for (uint32_t j = 0; j < buffer_factor; j++)
                    for (uint32_t i = 0; i < num_es_per_subgraph_m; i++)
                        res_eop.eops->emplace_back(new ExecutionObjectPipeline(
                                 {(*es_m[subgraph_id])[i],
                                  (*e2s_m[subgraph_id])[i % e2_ids.size()]}));
            }
            else
            {
                for (uint32_t j = 0; j < buffer_factor; j++)
                    for (uint32_t i = 0; i < num_es_per_subgraph_m; i++)
                        res_eop.eops->emplace_back(new ExecutionObjectPipeline(
                                                  {(*es_m[subgraph_id])[i]}));
            }
        }
        else
        {
            es_m[subgraph_id] = new Executor(DeviceType::DSP, e_ids,
                                             cs_m[subgraph_id], 1);
            for (uint32_t j = 0; j < buffer_factor; j++)
                for (uint32_t i = 0; i < num_es_per_subgraph_m; i++)
                    res_eop.eops->emplace_back(new ExecutionObjectPipeline(
                                                  {(*es_m[subgraph_id])[i]}));
        }

        if (enable_trace_m)
            printf("Subgraph %u: Allocating input/output buffers for %zu EOPs\n",
                   subgraph_id, res_eop.eops->size());

        // Allocate input/output buffers
        for (auto eop : *(res_eop.eops))
        {
            size_t in_size  = eop->GetInputBufferSizeInBytes();
            size_t out_size = eop->GetOutputBufferSizeInBytes();
            void*  in_ptr   = malloc(in_size);
            void*  out_ptr  = malloc(out_size);
            assert(in_ptr != nullptr && out_ptr != nullptr);

            ArgInfo in(in_ptr, in_size);
            ArgInfo out(out_ptr, out_size);
            eop->SetInputOutputBuffer(in, out);
        }

        res_eop.free_eop_index = 0;
        res_eop.is_used.resize(res_eop.eops->size(), false);
    }
}

uint32_t ResM::GetNumEOPs(uint32_t subgraph_id)
{
    assert(subgraph_id < num_subgraphs_m);
    ResEOP& res_eop = (*eops_m)[subgraph_id];
    assert(res_eop.eops != nullptr);

    return res_eop.eops->size();
}

ExecutionObjectPipeline* ResM::GetEOP(uint32_t subgraph_id)
{
    assert(subgraph_id < num_subgraphs_m);
    ResEOP& res_eop = (*eops_m)[subgraph_id];
    assert(res_eop.eops != nullptr);

    std::unique_lock<std::mutex> lock(res_eop.mutex_eops);

    // Return an available EOP (round robin allocation); blocks until
    // FreeEOP() marks the EOP at the current index as available
    uint32_t curr_eop = res_eop.free_eop_index;
    res_eop.cv_eops.wait(lock, [this, subgraph_id, curr_eop]{
        return this->eops_m->at(subgraph_id).is_used[curr_eop] == false; });
    res_eop.is_used[curr_eop] = true;
    res_eop.free_eop_index = (curr_eop + 1) % res_eop.eops->size();
    if (enable_trace_m)
        printf("Subgraph %u: return EOP %u for GetEOP()\n",
               subgraph_id, curr_eop);
    return res_eop.eops->at(curr_eop);
}

void ResM::FreeEOP(uint32_t subgraph_id, ExecutionObjectPipeline* eop)
{
    assert(subgraph_id < num_subgraphs_m);
    ResEOP& res_eop = (*eops_m)[subgraph_id];
    assert(res_eop.eops != nullptr);

    {
        std::unique_lock<std::mutex> lock(res_eop.mutex_eops);
        for (uint32_t i = 0; i < res_eop.is_used.size(); i++)
            if (res_eop.eops->at(i) == eop)
            {
                res_eop.is_used[i] = false;
                if (enable_trace_m)
                    printf("Subgraph %u: FreeEOP %u\n", subgraph_id, i);
                break;
            }
    }
    res_eop.cv_eops.notify_all();
}

Configuration& ResM::GetConfiguration(uint32_t subgraph_id)
{
    assert(subgraph_id < num_subgraphs_m);
    assert((*eops_m)[subgraph_id].eops != nullptr);
    return cs_m[subgraph_id];
}

const SubgraphDataConv& ResM::GetInConv(uint32_t subgraph_id)
{
    assert(subgraph_id < num_subgraphs_m);
    assert(in_conv_m[subgraph_id] != nullptr);
    return *in_conv_m[subgraph_id];
}

const SubgraphDataConv& ResM::GetOutConv(uint32_t subgraph_id)
{
    assert(subgraph_id < num_subgraphs_m);
    assert(out_conv_m[subgraph_id] != nullptr);
    return *out_conv_m[subgraph_id];
}