author     Yuan Zhao <yuanzhao@ti.com>
           Thu, 31 Oct 2019 20:17:38 +0000 (15:17 -0500)
committer  Yuan Zhao <yuanzhao@ti.com>
           Tue, 5 Nov 2019 18:10:52 +0000 (12:10 -0600)
- MCT-1223
index aed396ca878ba5beff088e80fe15fd371dcf15b4..ffeb69d4ce3e40b366950e92deac50f167abd12c 100644 (file)
include ../make.common
+# overwrite LIBS, -ltidl_api should be able to pull -lOpenCL
LIBS += -lopencv_highgui -lopencv_imgcodecs -lopencv_videoio\
-lopencv_imgproc -lopencv_core
LIBS += -ljson-c
index c5da64732077b48eb1593402f860eb703b04952c..5534df3660e60e329a0618021caf7d766c075791 100644 (file)
#include "execution_object.h"
#include "execution_object_pipeline.h"
#include "subgraph_runtime.h"
+#include "subgraph_data_conv.h"
#include "configuration.h"
#include "../common/object_classes.h"
#include "imgutil.h"
std::unique_ptr<ObjectClasses> object_classes;
bool RunConfiguration(cmdline_opts_t& opts);
-bool ReadFrame(const cmdline_opts_t& opts, VideoCapture &cap, float** inputs);
+bool ReadFrame(const cmdline_opts_t& opts, VideoCapture &cap, float** inputs,
+ int batch_size);
bool WriteFrameOutput(float *out, const cmdline_opts_t& opts);
void DisplayHelp();
VideoCapture cap;
if (! SetVideoInputOutput(cap, opts, "ImageNet")) return false;
+ cout << "\n##### Batch size 1 testing ######\n" << endl;
try
{
float **inputs = new float *[1];
chrono::time_point<chrono::steady_clock> tloop0, tloop1;
tloop0 = chrono::steady_clock::now();
- ReadFrame(opts, cap, inputs);
- TidlRunSubgraph(1, 0, 1, 1, inputs, outputs);
+ ReadFrame(opts, cap, inputs, 1);
+ TidlRunSubgraph(1, 0, 1, 1, 1, inputs, outputs);
WriteFrameOutput(outputs[0], opts);
tloop1 = chrono::steady_clock::now();
status = false;
}
+ int batch_size = 8;
+ cout << "\n##### Batch size " << batch_size << " testing ######\n" << endl;
+ try
+ {
+ float **inputs = new float *[batch_size];
+ float **outputs = new float *[batch_size];
+ for (int i = 0; i < batch_size; i++)
+ {
+ inputs[i] = new float[1*3*224*224];
+ outputs[i] = new float[1001];
+ }
+
+ chrono::time_point<chrono::steady_clock> tloop0, tloop1;
+ tloop0 = chrono::steady_clock::now();
+
+ ReadFrame(opts, cap, inputs, batch_size);
+ TidlRunSubgraph(1, 0, batch_size, 1, 1, inputs, outputs);
+ for (int i = 0; i < batch_size; i++)
+ {
+ cout << "Frame " << i << " of " << batch_size << " output:" << endl;
+ WriteFrameOutput(outputs[i], opts);
+ }
+
+ tloop1 = chrono::steady_clock::now();
+ chrono::duration<float> elapsed = tloop1 - tloop0;
+ cout << "Batch size " << batch_size
+ << " time (including read/write/opencv/print/etc): "
+ << setw(6) << setprecision(4)
+ << (elapsed.count() * 1000) << "ms" << endl;
+
+ for (int i = 0; i < batch_size; i++)
+ {
+ delete [] inputs[i];
+ delete [] outputs[i];
+ }
+ delete [] inputs;
+ delete [] outputs;
+ }
+ catch (tidl::Exception &e)
+ {
+ cerr << e.what() << endl;
+ status = false;
+ }
+
return status;
}
-bool ReadFrame(const cmdline_opts_t& opts, VideoCapture &cap, float** inputs)
+bool ReadFrame(const cmdline_opts_t& opts, VideoCapture &cap, float** inputs,
+ int batch_size)
{
Configuration c;
c.inNumChannels = 3;
// TI DL image preprocessing, into frame_buffer
bool status = imgutil::PreprocessImage(image, frame_buffer, c);
- std::vector<float *> in_data_v{inputs[0]};
- in_conv.ScaleDequant((const uint8_t *)frame_buffer, in_data_v);
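+ // Replicate the single preprocessed frame into every batch slot; this test
+ // feeds identical inputs, so every batch entry should produce the same output.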
+ for (int i = 0; i < batch_size; i++)
+ {
+ std::vector<float *> in_data_v{inputs[i]};
+ in_conv.ScaleDequant((const uint8_t *)frame_buffer, in_data_v);
+ }
delete [] frame_buffer;
return status;
}
auto cmp = [](val_index &left, val_index &right)
{ return left.first > right.first; };
priority_queue<val_index, vector<val_index>, decltype(cmp)> queue(cmp);
+
// initialize priority queue with smallest value on top
for (int i = 0; i < k; i++)
queue.push(val_index(out[i], i));
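The context above only primes the min-heap with the first k scores. A minimal sketch of the remaining top-k scan (editorial, not part of this commit; it assumes val_index is std::pair<float, int> and size is the number of scores in out):

    // Fold in the remaining scores, keeping the k largest.
    // The heap top is the smallest of the current top-k candidates.
    for (int i = k; i < size; i++)
    {
        if (out[i] > queue.top().first)
        {
            queue.pop();                       // evict the current k-th best
            queue.push(val_index(out[i], i));  // admit the better candidate
        }
    }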
diff --git a/tidl_api/Makefile b/tidl_api/Makefile
index ca9187859866599c9081b9da64f4effba84d856d..a04e604424aff87666a0588da7d167109a20756f 100644 (file)
--- a/tidl_api/Makefile
+++ b/tidl_api/Makefile
$(AR) cr $@ $(HOST_OBJ_FILES)
$(SHARED_LIB_NAME): $(HOST_OBJ_FILES)
- $(CXX) -shared $(HOST_OBJ_FILES) -o $@
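+# -Wl,-Bsymbolic binds this library's references to its own symbol definitions;
+# linking -lOpenCL here lets libtidl_api.so pull in OpenCL for its users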
+ $(CXX) $(CXXFLAGS) -Wl,-Bsymbolic -shared -lOpenCL $(HOST_OBJ_FILES) -o $@
$(PY_LIB_NAME): $(HOST_OBJ_PYBIND_FILES) $(LIB_NAME)
$(CXX) $(CXXFLAGS) -Wl,-Bsymbolic -shared -lOpenCL -locl_util $^ -o $@
$(AR) cr $@ $(HOST_OBJ_IMGUTIL_FILES)
$(SHARED_LIB_IMGUTIL_NAME): $(HOST_OBJ_IMGUTIL_FILES)
- $(CXX) -shared $(HOST_OBJ_IMGUTIL_FILES) -o $@
+ $(CXX) $(CXXFLAGS) -Wl,-Bsymbolic -shared $(HOST_OBJ_IMGUTIL_FILES) -o $@
clean::
$(RM) -f $(LIB_NAME) $(PY_LIB_NAME)
index 49b4315ef58a156a9cc75452870dae7009266dd0..6b7c4b1b4599d63781a65df957db2c2934908864 100644 (file)
* THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
+#pragma once
+
#include <stdint.h>
#include <vector>
index c153485099acbc59efdc3de9ac332db0c21f8234..37e771defdd2fd657c85351070677d294b069e2f 100644 (file)
//! @file subgraph_runtime.h
#pragma once
-#include <vector>
-#include <mutex>
-#include <condition_variable>
-#include "execution_object_pipeline.h"
-#include "subgraph_data_conv.h"
extern "C" {
-void TidlRunSubgraph(int total_subgraphs,
- int subgraph_id,
- int num_inputs,
- int num_outputs,
- float **inputTensors,
- float **outputTensors
- );
+//! @brief Top level inference to run a TIDL subgraph
+//! @param total_subgraphs total number of TIDL subgraphs in whole inference
+//! @param subgraph_id index of current TIDL subgraph
+//! @param batch_size number of samples/inferences in this batch
+//! @param num_inputs_per_inference number of inputs to TIDL subgraph
+//! for every sample/inference
+//! @param num_outputs_per_inference number of outputs from TIDL subgraph
+//! for every sample/inference
+//! @param input_tensors input data to TIDL subgraph, layout as
+//! batch1_input1, batch1_input2, ..., batch1_inputM,
+//! ... ... ...
+//! batchN_input1, batchN_input2, ..., batchN_inputM
+//! @param output_tensors output data from TIDL subgraph, layout as
+//! batch1_output1, batch1_output2, ..., batch1_outputK,
+//! ... ... ...
+//! batchN_output1, batchN_output2, ..., batchN_outputK
+extern void TidlRunSubgraph(int total_subgraphs,
+ int subgraph_id,
+ int batch_size,
+ int num_inputs_per_inference,
+ int num_outputs_per_inference,
+ float **input_tensors,
+ float **output_tensors
+ );
} // extern "C"
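To make the layout concrete, a minimal caller sketch for a batch of 4 with one input and one output tensor per inference (M = K = 1); the 3x224x224 input and 1001-class output sizes are borrowed from the imagenet example in this commit, not mandated by the API:

    const int batch_size = 4;
    float **input_tensors  = new float *[batch_size * 1];  // batch_size * M
    float **output_tensors = new float *[batch_size * 1];  // batch_size * K
    for (int j = 0; j < batch_size; j++)
    {
        input_tensors[j]  = new float[3 * 224 * 224];  // batch(j)_input1
        output_tensors[j] = new float[1001];           // batch(j)_output1
    }
    // With M inputs per inference, sample j's i-th input lives at j * M + i.
    TidlRunSubgraph(1, 0, batch_size, 1, 1, input_tensors, output_tensors);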
-namespace tidl {
#if 0
// Auto-generated code from Relay/TVM compilation step after
int num_input_tensors, int num_output_tensors,
PackedArgs args)
{
- float** in_data = new float*[num_input_tensors];
- float** out_data = new float*[num_output_tensors];
-
- for (int i = 0; i < num_input_tensors + num_output_tensors; i++)
- if (i < num_input_tensors)
- in_data[i] = args.data[i];
- else
- out_data[i - num_input_tensors] = args.data[i];
+ float** in_data = new float*[num_inputs_per_inference * batch_size];
+ float** out_data = new float*[num_outputs_per_inference * batch_size];
+
+ for (int j = 0; j < batch_size; j++)
+ {
+ for (int i = 0; i < num_inputs_per_inference + num_outputs_per_inference;
+ i++)
+ if (i < num_inputs_per_inference)
+ in_data[j * num_inputs_per_inference + i] = args.data[i][j];
+ else
+ out_data[j * num_outputs_per_inference + i - num_inputs_per_inference]
+ = args.data[i][j];
+ }
// call into this function in libtidl.so
- // dlopen("libtidl.so")
+ // dlopen("libtidl_api.so")
// TidlFunc = dlsym("TidlRunSubgraph");
- (*TidlFunc)(total_subgraphs, subgraph_id,
- num_input_tensors, num_output_tensors,
+ (*TidlFunc)(total_subgraphs, subgraph_id, batch_size,
+ num_inputs_per_inference, num_outputs_per_inference,
in_data, out_data);
delete [] in_data;
}
#endif
-
-// Singleton ResM .h file
-// Resource manager for available EVE and DSP devices,
-// - Allocates EVEs and DSPs
-// - Constructs Executors (tidl_setup) and ExecutionObjects (tid_init)
-// - Creates set of ExecutionPipelines (with or without DSP)
-// - Allocating EOP on demand (acquire and free semantics)
-// - Allocates input/output buffers
-class ResM {
- public:
- ResM();
- ~ResM();
- static ResM& Instance(uint32_t total_num_subgraphs = 1);
-
- // how to ge
- ExecutionObjectPipeline* GetEOP(uint32_t subgraph_id);
- void FreeEOP(uint32_t subgraph_id,
- ExecutionObjectPipeline* eop);
- Configuration& GetConfiguration(uint32_t subgraph_id);
- const SubgraphDataConv& GetInConv(uint32_t subgraph_id);
- const SubgraphDataConv& GetOutConv(uint32_t subgraph_id);
-
-
- private:
- void Init(uint32_t num_subgraphs);
-
- bool enable_trace_m;
- uint32_t num_subgraphs_m;
- uint32_t num_es_per_subgraph_m;
- uint32_t num_eves_m;
- uint32_t num_dsps_m;
- uint32_t num_lg2_dsps_used_m; // in partitioned execution case
- std::mutex mutex_init_m;
-
- // indexed by subgraph_id for resources
- struct ResEOP {
- ResEOP() : free_eop_index(0), is_used(), eops(nullptr) {}
-
- uint32_t free_eop_index;
- std::mutex mutex_eops;
- std::condition_variable cv_eops;
- std::vector<bool> is_used;
- std::vector<ExecutionObjectPipeline*>* eops;
- };
- std::vector<Configuration> cs_m;
- std::vector<Executor*> es_m;
- std::vector<Executor*> e2s_m;
- std::vector<ResEOP> *eops_m;
- std::vector<SubgraphDataConv*> in_conv_m;
- std::vector<SubgraphDataConv*> out_conv_m;
-};
-
-} // namespace tidl
index ad5a11abdccfbbf3c6cbfab98f3580857366f263..09905fc72ddc6d8e0d60634a033f2e632f2fcd31 100644 (file)
#include "util.h"
#include "subgraph_runtime.h"
+#include "subgraph_runtime_impl.h"
#if 0
int num_input_tensors, int num_output_tensors,
PackedArgs args)
{
- float** in_data = new float*[num_input_tensors];
- float** out_data = new float*[num_output_tensors];
+ float** in_data = new float*[num_inputs_per_inference * batch_size];
+ float** out_data = new float*[num_outputs_per_inference * batch_size];
- for (int i = 0; i < num_input_tensors + num_output_tensors; i++)
- if (i < num_input_tensors)
- in_data[i] = args.data[i];
- else
- out_data[i - num_input_tensors] = args.data[i];
+ for (int j = 0; j < batch_size; j++)
+ {
+ for (int i = 0; i < num_inputs_per_inference + num_outputs_per_inference;
+ i++)
+ if (i < num_inputs_per_inference)
+ in_data[j * num_inputs_per_inference + i] = args.data[i][j];
+ else
+ out_data[j * num_outputs_per_inference + i - num_inputs_per_inference]
+ = args.data[i][j];
+ }
// call into this function in libtidl.so
- // dlopen("libtidl.so")
+ // dlopen("libtidl_api.so")
// TidlFunc = dlsym("TidlRunSubgraph");
- (*TidlFunc)(total_subgraphs, subgraph_id,
- num_input_tensors, num_output_tensors,
+ (*TidlFunc)(total_subgraphs, subgraph_id, batch_size,
+ num_inputs_per_inference, num_outputs_per_inference,
in_data, out_data);
delete [] in_data;
void TidlRunSubgraph(int total_subgraphs,
int subgraph_id,
- int num_inputs,
- int num_outputs,
- float **inputTensors,
- float **outputTensors
+ int batch_size,
+ int num_inputs_per_inference,
+ int num_outputs_per_inference,
+ float **input_tensors,
+ float **output_tensors
)
{
ResM& res = ResM::Instance(total_subgraphs);
- ExecutionObjectPipeline* eop = res.GetEOP(subgraph_id);
+ res.InitSubgraph(subgraph_id);
+ int num_eops = res.GetNumEOPs(subgraph_id);
+ if (num_eops > batch_size) num_eops = batch_size;
+ std::vector<ExecutionObjectPipeline*> eops(num_eops);
+ for (int i = 0; i < num_eops; i++)
+ eops[i] = res.GetEOP(subgraph_id);
const SubgraphDataConv& in_conv = res.GetInConv(subgraph_id);
const SubgraphDataConv& out_conv = res.GetOutConv(subgraph_id);
- std::vector<float *> in_data_v, out_data_v;
- for (int i = 0; i < num_inputs; i++)
- in_data_v.emplace_back(inputTensors[i]);
- for (int i = 0; i < num_outputs; i++)
- out_data_v.emplace_back(outputTensors[i]);
- char* in_data = eop->GetInputBufferPtr();
- in_conv.ScaleQuant(in_data_v, (uint8_t *) in_data);
+ std::vector<std::vector<float *>> in_data_v(batch_size),
+ out_data_v(batch_size);
+ for (int frame_idx = 0; frame_idx < batch_size; frame_idx++)
+ {
+ for (int i = 0; i < num_inputs_per_inference; i++)
+ in_data_v[frame_idx].emplace_back(input_tensors[
+ frame_idx * num_inputs_per_inference + i]);
+ for (int i = 0; i < num_outputs_per_inference; i++)
+ out_data_v[frame_idx].emplace_back(output_tensors[
+ frame_idx * num_outputs_per_inference + i]);
+ }
- eop->ProcessFrameStartAsync();
- eop->ProcessFrameWait();
+ // Process batch_size frames with available eops in pipelined manner
+ // additional num_eops iterations to flush the pipeline (epilogue)
+ for (int frame_idx = 0; frame_idx < batch_size + num_eops; frame_idx++)
+ {
+ ExecutionObjectPipeline *eop = eops[frame_idx % num_eops];
- char* out_data = eop->GetOutputBufferPtr();
- out_conv.ScaleDequant((const uint8_t *) out_data, out_data_v);
- res.FreeEOP(subgraph_id, eop);
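+ // ProcessFrameWait() is expected to return false during the first
+ // num_eops iterations (prologue), when this EOP has no frame in flight yet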
+ if (eop->ProcessFrameWait())
+ {
+ const uint8_t *out_data = (const uint8_t*) eop->GetOutputBufferPtr();
+ out_conv.ScaleDequant(out_data, out_data_v[frame_idx - num_eops]);
+ }
+
+ if (frame_idx < batch_size)
+ {
+ uint8_t *in_data = (uint8_t *) eop->GetInputBufferPtr();
+ in_conv.ScaleQuant(in_data_v[frame_idx], in_data);
+ eop->ProcessFrameStartAsync();
+ }
+ }
+
+ for (int i = 0; i < num_eops; i++)
+ res.FreeEOP(subgraph_id, eops[i]);
}
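For intuition, a worked trace of the pipelined loop above with batch_size = 3 and num_eops = 2 (each frame is dequantized num_eops iterations after it is issued):

    frame_idx 0: eop0  wait: no frame in flight   issue frame 0
    frame_idx 1: eop1  wait: no frame in flight   issue frame 1
    frame_idx 2: eop0  wait: dequant frame 0      issue frame 2
    frame_idx 3: eop1  wait: dequant frame 1      (epilogue: nothing to issue)
    frame_idx 4: eop0  wait: dequant frame 2      (epilogue)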
// Allocating resources
num_eves_m = Executor::GetNumDevices(DeviceType::EVE);
- num_eves_m = 1; // TODO: to remove after debugging
num_dsps_m = Executor::GetNumDevices(DeviceType::DSP);
assert(num_eves_m > 0 || num_dsps_m > 0);
}
}
-ExecutionObjectPipeline* ResM::GetEOP(uint32_t subgraph_id)
+
+void ResM::InitSubgraph(uint32_t subgraph_id)
{
assert(subgraph_id < num_subgraphs_m);
ResEOP& res_eop = (*eops_m)[subgraph_id];
cs_m[subgraph_id].layerIndex2LayerGroupId[i++] = 2;
e2_ids.insert(static_cast<DeviceId>(num_lg2_dsps_used_m));
num_lg2_dsps_used_m += 1;
+ if (num_subgraphs_m == 1) // Allocate all dsps if only one subgraph
+ {
+ while (num_lg2_dsps_used_m < num_dsps_m)
+ e2_ids.insert(static_cast<DeviceId>(num_lg2_dsps_used_m++));
+ }
}
}
delete net;
res_eop.free_eop_index = 0;
res_eop.is_used.resize(res_eop.eops->size(), false);
}
+}
+
+uint32_t ResM::GetNumEOPs(uint32_t subgraph_id)
+{
+ assert(subgraph_id < num_subgraphs_m);
+ ResEOP& res_eop = (*eops_m)[subgraph_id];
+ assert (res_eop.eops != nullptr);
+
+ return res_eop.eops->size();
+}
+
+ExecutionObjectPipeline* ResM::GetEOP(uint32_t subgraph_id)
+{
+ assert(subgraph_id < num_subgraphs_m);
+ ResEOP& res_eop = (*eops_m)[subgraph_id];
+ assert(res_eop.eops != nullptr);
+
+ std::unique_lock<std::mutex> lock(res_eop.mutex_eops);
// Return an available EOP (round robin allocation)
uint32_t curr_eop = res_eop.free_eop_index;
void ResM::FreeEOP(uint32_t subgraph_id, ExecutionObjectPipeline* eop)
{
+ assert(subgraph_id < num_subgraphs_m);
ResEOP& res_eop = (*eops_m)[subgraph_id];
+ assert(res_eop.eops != nullptr);
+
{
std::unique_lock<std::mutex> lock(res_eop.mutex_eops);
for (uint32_t i = 0; i < res_eop.is_used.size(); i++)
const SubgraphDataConv& ResM::GetInConv(uint32_t subgraph_id)
{
+ assert(subgraph_id < num_subgraphs_m);
assert(in_conv_m[subgraph_id] != nullptr);
return *in_conv_m[subgraph_id];
}
const SubgraphDataConv& ResM::GetOutConv(uint32_t subgraph_id)
{
+ assert(subgraph_id < num_subgraphs_m);
assert(out_conv_m[subgraph_id] != nullptr);
return *out_conv_m[subgraph_id];
}
diff --git a/tidl_api/src/subgraph_runtime_impl.h b/tidl_api/src/subgraph_runtime_impl.h
--- /dev/null
@@ -0,0 +1,94 @@
+/******************************************************************************
+ * Copyright (c) 2019 Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+//! @file subgraph_runtime_impl.h
+
+#pragma once
+#include <vector>
+#include <mutex>
+#include <condition_variable>
+#include "execution_object_pipeline.h"
+#include "subgraph_data_conv.h"
+
+
+namespace tidl {
+
+// Singleton ResM .h file
+// Resource manager for available EVE and DSP devices,
+// - Allocates EVEs and DSPs
+// - Constructs Executors (tidl_setup) and ExecutionObjects (tid_init)
+// - Creates set of ExecutionPipelines (with or without DSP)
+// - Allocating EOP on demand (acquire and free semantics)
+// - Allocates input/output buffers
+class ResM {
+ public:
+ ResM();
+ ~ResM();
+ static ResM& Instance(uint32_t total_num_subgraphs = 1);
+
+ // how to get resources for subgraph_id
+ void InitSubgraph(uint32_t subgraph_id);
+ uint32_t GetNumEOPs(uint32_t subgraph_id);
+ ExecutionObjectPipeline* GetEOP(uint32_t subgraph_id);
+ void FreeEOP(uint32_t subgraph_id,
+ ExecutionObjectPipeline* eop);
+ Configuration& GetConfiguration(uint32_t subgraph_id);
+ const SubgraphDataConv& GetInConv(uint32_t subgraph_id);
+ const SubgraphDataConv& GetOutConv(uint32_t subgraph_id);
+
+
+ private:
+ void Init(uint32_t num_subgraphs);
+
+ bool enable_trace_m;
+ uint32_t num_subgraphs_m;
+ uint32_t num_es_per_subgraph_m;
+ uint32_t num_eves_m;
+ uint32_t num_dsps_m;
+ uint32_t num_lg2_dsps_used_m; // in partitioned execution case
+ std::mutex mutex_init_m;
+
+ // indexed by subgraph_id for resources
+ struct ResEOP {
+ ResEOP() : free_eop_index(0), is_used(), eops(nullptr) {}
+
+ uint32_t free_eop_index;
+ std::mutex mutex_eops;
+ std::condition_variable cv_eops;
+ std::vector<bool> is_used;
+ std::vector<ExecutionObjectPipeline*>* eops;
+ };
+ std::vector<Configuration> cs_m;
+ std::vector<Executor*> es_m;
+ std::vector<Executor*> e2s_m;
+ std::vector<ResEOP> *eops_m;
+ std::vector<SubgraphDataConv*> in_conv_m;
+ std::vector<SubgraphDataConv*> out_conv_m;
+};
+
+} // namespace tidl
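For reference, the acquire/run/free flow a ResM client performs, condensed from TidlRunSubgraph above into a single-frame sketch (error handling omitted; in_data_v and out_data_v stand for the caller's std::vector<float *> of tensor pointers):

    ResM& res = ResM::Instance(1);                 // total of one subgraph
    res.InitSubgraph(0);                           // one-time setup per subgraph
    ExecutionObjectPipeline *eop = res.GetEOP(0);  // acquire (blocks until free)
    const SubgraphDataConv& in_conv  = res.GetInConv(0);
    const SubgraphDataConv& out_conv = res.GetOutConv(0);

    in_conv.ScaleQuant(in_data_v, (uint8_t *) eop->GetInputBufferPtr());
    eop->ProcessFrameStartAsync();
    if (eop->ProcessFrameWait())
        out_conv.ScaleDequant((const uint8_t *) eop->GetOutputBufferPtr(),
                              out_data_v);
    res.FreeEOP(0, eop);                           // release back to the pool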