raw | patch | inline | side by side (parent: 36786d7)
author    Yuan Zhao <yuanzhao@ti.com>    Fri, 10 Aug 2018 04:42:42 +0000 (23:42 -0500)
committer Yuan Zhao <yuanzhao@ti.com>    Mon, 20 Aug 2018 15:57:44 +0000 (10:57 -0500)
- Add top-level ExecutionObjectPipeline class to execute multiple
layersGroups.
- An ExecutionObjectPipeline is constructed from multiple
ExecutionObjects; each ExecutionObject executes one layersGroup
in the network, and together they execute consecutive layersGroups.
- Same look and feel as ExecutionObject, e.g. ProcessFrameStartAsync,
ProcessFrameWait, GetInputBufferPtr, GetOutputBufferPtr
- MCT-1017, MCT-1029
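
[Editorial note] For illustration only (not part of the commit): a minimal sketch of how the new class is meant to be used, pieced together from the API added below. The configuration file path and the frame-filling step are assumptions; the constructor, buffer and frame-processing calls are the ones introduced here.

    #include <cstdlib>
    #include "executor.h"
    #include "execution_object_pipeline.h"
    #include "configuration.h"

    using namespace tidl;

    int main()
    {
        // Assumed configuration file path; any valid TIDL config works here.
        Configuration c;
        c.ReadFromFile("tidl_config.txt");

        DeviceIds ids = {DeviceId::ID0};
        Executor exe_eve(DeviceType::EVE, ids, c, 1);   // runs layersGroup 1
        Executor exe_dsp(DeviceType::DSP, ids, c, 2);   // runs layersGroup 2

        // Pipeline of ExecutionObjects running consecutive layersGroups
        ExecutionObjectPipeline eop({exe_eve[0], exe_dsp[0]});

        // Same look and feel as ExecutionObject
        size_t in_size  = eop.GetInputBufferSizeInBytes();
        size_t out_size = eop.GetOutputBufferSizeInBytes();
        void*  in_ptr   = malloc(in_size);
        void*  out_ptr  = malloc(out_size);
        eop.SetInputOutputBuffer(ArgInfo(in_ptr, in_size),
                                 ArgInfo(out_ptr, out_size));

        eop.SetFrameIndex(0);
        // ... fill eop.GetInputBufferPtr() with one preprocessed frame ...
        eop.ProcessFrameStartAsync();
        if (eop.ProcessFrameWait())
        {
            // ... consume eop.GetOutputBufferPtr() / GetOutputBufferSizeInBytes() ...
        }

        free(in_ptr);
        free(out_ptr);
        return 0;
    }
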
examples/ssd_multibox/main.cpp
tidl_api/Makefile
tidl_api/inc/execution_object.h
tidl_api/inc/execution_object_internal.h     [new file with mode: 0644]
tidl_api/inc/execution_object_pipeline.h     [new file with mode: 0644]
tidl_api/inc/executor.h
tidl_api/src/execution_object.cpp
tidl_api/src/execution_object_pipeline.cpp   [new file with mode: 0644]
tidl_api/src/executor.cpp
tidl_api/src/ocl_device.cpp
tidl_api/src/ocl_device.h
diff --git a/examples/ssd_multibox/main.cpp b/examples/ssd_multibox/main.cpp
index 6d39dda1561658b70a54964fc7efdf8a4505a4cb..b302cfa7529128f69580152adcc17169af6db815 100644 (file)
#include "executor.h"
#include "execution_object.h"
+#include "execution_object_pipeline.h"
#include "configuration.h"
#include "../segmentation/object_classes.h"
using namespace cv;
-bool RunConfiguration(const std::string& config_file, uint32_t num_devices,
+bool RunConfiguration(const std::string& config_file,
+ uint32_t num_dsps, uint32_t num_eves,
DeviceType device_type, std::string& input_file);
-bool ReadFrame(ExecutionObject& eo, int frame_idx,
+bool ReadFrame(ExecutionObjectPipeline& eop, int frame_idx,
const Configuration& configuration, int num_frames,
std::string& image_file, VideoCapture &cap);
-bool WriteFrameOutput(const ExecutionObject &eo_in,
- const ExecutionObject &eo_out,
+bool WriteFrameOutput(const ExecutionObjectPipeline& eop,
const Configuration& configuration);
void ReportTime(int frame_index, std::string device_name, double elapsed_host,
static void ProcessArgs(int argc, char *argv[],
std::string& config,
- uint32_t& num_devices,
+ uint32_t& num_dsps,
+ uint32_t& num_eves,
DeviceType& device_type,
std::string& input_file);
// Process arguments
std::string config = DEFAULT_CONFIG;
std::string input_file = DEFAULT_INPUT;
- uint32_t num_devices = 1;
+ uint32_t num_dsps = 1;
+ uint32_t num_eves = 1;
DeviceType device_type = DeviceType::EVE;
- ProcessArgs(argc, argv, config, num_devices, device_type, input_file);
+ ProcessArgs(argc, argv, config, num_dsps, num_eves,
+ device_type, input_file);
- // Use same number of EVEs and DSPs
- num_devices = std::min(num_devices, std::min(num_eve, num_dsp));
- if (num_devices == 0)
- {
- std::cout << "Partitioned execution requires at least 1 EVE and 1 DSP."
- << std::endl;
- return EXIT_FAILURE;
- }
if ((object_class_table = GetObjectClassTable(config)) == nullptr)
{
std::cout << "No object classes defined for this config." << std::endl;
std::cout << "Input: " << input_file << std::endl;
std::string config_file = "../test/testvecs/config/infer/tidl_config_"
+ config + ".txt";
- bool status = RunConfiguration(config_file, num_devices, device_type,
- input_file);
+ bool status = RunConfiguration(config_file, num_dsps, num_eves,
+ device_type, input_file);
if (!status)
{
return EXIT_SUCCESS;
}
-bool RunConfiguration(const std::string& config_file, uint32_t num_devices,
+bool RunConfiguration(const std::string& config_file,
+ uint32_t num_dsps, uint32_t num_eves,
DeviceType device_type, std::string& input_file)
{
- DeviceIds ids;
- for (int i = 0; i < num_devices; i++)
- ids.insert(static_cast<DeviceId>(i));
+ DeviceIds ids_eve, ids_dsp;
+ for (int i = 0; i < num_eves; i++)
+ ids_eve.insert(static_cast<DeviceId>(i));
+ for (int i = 0; i < num_dsps; i++)
+ ids_dsp.insert(static_cast<DeviceId>(i));
// Read the TI DL configuration file
Configuration configuration;
}
// setup input
- int num_frames = is_default_input ? 3 : 1;
+ int num_frames = is_default_input ? 9 : 9;
VideoCapture cap;
std::string image_file;
if (is_camera_input)
// and configuration specified
// EVE will run layersGroupId 1 in the network, while
// DSP will run layersGroupId 2 in the network
- Executor executor_eve(DeviceType::EVE, ids, configuration, 1);
- Executor executor_dsp(DeviceType::DSP, ids, configuration, 2);
-
- // Query Executor for set of ExecutionObjects created
- const ExecutionObjects& execution_objects_eve =
- executor_eve.GetExecutionObjects();
- const ExecutionObjects& execution_objects_dsp =
- executor_dsp.GetExecutionObjects();
- int num_eos = execution_objects_eve.size();
-
- // Allocate input and output buffers for each execution object
- // Note that "out" is both the output of eo_eve and the input of eo_dsp
- // This is how two layersGroupIds, 1 and 2, are tied together
+ Executor exe_eve(DeviceType::EVE, ids_eve, configuration, 1);
+ Executor exe_dsp(DeviceType::DSP, ids_dsp, configuration, 2);
+
+ // Construct ExecutionObjectPipeline that utilizes multiple
+ // ExecutionObjects to process a single frame, each ExecutionObject
+ // processes one layerGroup of the network
+ int num_eops = std::max(num_eves, num_dsps);
+ std::vector<ExecutionObjectPipeline *> eops;
+ for (int i = 0; i < num_eops; i++)
+ eops.push_back(new ExecutionObjectPipeline({exe_eve[i%num_eves],
+ exe_dsp[i%num_dsps]}));
+
+ // Allocate input/output memory for each EOP
std::vector<void *> buffers;
- for (int i = 0; i < num_eos; i++)
+ for (auto eop : eops)
{
- ExecutionObject *eo_eve = execution_objects_eve[i].get();
- size_t in_size = eo_eve->GetInputBufferSizeInBytes();
- size_t out_size = eo_eve->GetOutputBufferSizeInBytes();
- ArgInfo in = { ArgInfo(malloc(in_size), in_size) };
- ArgInfo out = { ArgInfo(malloc(out_size), out_size) };
- eo_eve->SetInputOutputBuffer(in, out);
-
- ExecutionObject *eo_dsp = execution_objects_dsp[i].get();
- size_t out2_size = eo_dsp->GetOutputBufferSizeInBytes();
- ArgInfo out2 = { ArgInfo(malloc(out2_size), out2_size) };
- eo_dsp->SetInputOutputBuffer(out, out2);
-
- buffers.push_back(in.ptr());
- buffers.push_back(out.ptr());
- buffers.push_back(out2.ptr());
+ size_t in_size = eop->GetInputBufferSizeInBytes();
+ size_t out_size = eop->GetOutputBufferSizeInBytes();
+ void* in_ptr = malloc(in_size);
+ void* out_ptr = malloc(out_size);
+ assert(in_ptr != nullptr && out_ptr != nullptr);
+ buffers.push_back(in_ptr);
+ buffers.push_back(out_ptr);
+
+ ArgInfo in(in_ptr, in_size);
+ ArgInfo out(out_ptr, out_size);
+ eop->SetInputOutputBuffer(in, out);
}
- #define MAX_NUM_EOS 4
- struct timespec t0[MAX_NUM_EOS], t1, tloop0, tloop1;
+ struct timespec tloop0, tloop1;
clock_gettime(CLOCK_MONOTONIC, &tloop0);
- // Process frames with available execution objects in a pipelined manner
- // additional num_eos iterations to flush the pipeline (epilogue)
- ExecutionObject *eo_eve, *eo_dsp, *eo_input;
- for (int frame_idx = 0;
- frame_idx < num_frames + num_eos; frame_idx++)
+ // Process frames with ExecutionObjectPipelines in a pipelined manner
+ // additional num_eops iterations to flush pipeline (epilogue)
+ for (int frame_idx = 0; frame_idx < num_frames + num_eops; frame_idx++)
{
- eo_eve = execution_objects_eve[frame_idx % num_eos].get();
- eo_dsp = execution_objects_dsp[frame_idx % num_eos].get();
+ ExecutionObjectPipeline* eop = eops[frame_idx % num_eops];
- // Wait for previous frame on the same eo to finish processing
- if (eo_dsp->ProcessFrameWait())
+ // Wait for previous frame on the same eop to finish processing
+ if (eop->ProcessFrameWait())
{
- int finished_idx = eo_dsp->GetFrameIndex();
- clock_gettime(CLOCK_MONOTONIC, &t1);
- ReportTime(finished_idx, "DSP",
- ms_diff(t0[finished_idx % num_eos], t1),
- eo_dsp->GetProcessTimeInMilliSeconds());
-
- eo_input = execution_objects_eve[finished_idx % num_eos].get();
- WriteFrameOutput(*eo_input, *eo_dsp, configuration);
+ ReportTime(eop->GetFrameIndex(), eop->GetDeviceName(),
+ eop->GetHostProcessTimeInMilliSeconds(),
+ eop->GetProcessTimeInMilliSeconds());
+ WriteFrameOutput(*eop, configuration);
}
// Read a frame and start processing it with current eo
- if (ReadFrame(*eo_eve, frame_idx, configuration, num_frames,
+ if (ReadFrame(*eop, frame_idx, configuration, num_frames,
image_file, cap))
{
- clock_gettime(CLOCK_MONOTONIC, &t0[frame_idx % num_eos]);
- eo_eve->ProcessFrameStartAsync();
-
- if (eo_eve->ProcessFrameWait())
- {
- clock_gettime(CLOCK_MONOTONIC, &t1);
- ReportTime(frame_idx, "EVE",
- ms_diff(t0[frame_idx % num_eos], t1),
- eo_eve->GetProcessTimeInMilliSeconds());
-
- clock_gettime(CLOCK_MONOTONIC, &t0[frame_idx % num_eos]);
- eo_dsp->ProcessFrameStartAsync();
- }
+ eop->ProcessFrameStartAsync();
}
}
<< std::setw(6) << std::setprecision(4)
<< ms_diff(tloop0, tloop1) << "ms" << std::endl;
+ for (auto eop : eops)
+ delete eop;
for (auto b : buffers)
free(b);
}
@@ -305,15 +282,15 @@ void ReportTime(int frame_index, std::string device_name, double elapsed_host,
}
-bool ReadFrame(ExecutionObject &eo, int frame_idx,
+bool ReadFrame(ExecutionObjectPipeline& eop, int frame_idx,
const Configuration& configuration, int num_frames,
std::string& image_file, VideoCapture &cap)
{
if (frame_idx >= num_frames)
return false;
- eo.SetFrameIndex(frame_idx);
+ eop.SetFrameIndex(frame_idx);
- char* frame_buffer = eo.GetInputBufferPtr();
+ char* frame_buffer = eop.GetInputBufferPtr();
assert (frame_buffer != nullptr);
int channel_size = configuration.inWidth * configuration.inHeight;
if (is_preprocessed_input)
{
std::ifstream ifs(image_file, std::ios::binary);
- ifs.seekg(frame_idx * channel_size * 3);
+ //ifs.seekg(frame_idx * channel_size * 3);
ifs.read(frame_buffer, channel_size * 3);
bool ifs_status = ifs.good();
ifs.close();
}
// Create frame with boxes drawn around classified objects
-bool WriteFrameOutput(const ExecutionObject &eo_in,
- const ExecutionObject &eo_out,
+bool WriteFrameOutput(const ExecutionObjectPipeline& eop,
const Configuration& configuration)
{
// Assemble original frame
int channel_size = width * height;
Mat frame, r_frame, bgr[3];
- unsigned char *in = (unsigned char *) eo_in.GetInputBufferPtr();
+ unsigned char *in = (unsigned char *) eop.GetInputBufferPtr();
bgr[0] = Mat(height, width, CV_8UC(1), in);
bgr[1] = Mat(height, width, CV_8UC(1), in + channel_size);
bgr[2] = Mat(height, width, CV_8UC(1), in + channel_size*2);
cv::merge(bgr, 3, frame);
- int frame_index = eo_in.GetFrameIndex();
+ int frame_index = eop.GetFrameIndex();
char outfile_name[64];
if (! is_camera_input && is_preprocessed_input)
{
}
// Draw boxes around classified objects
- float *out = (float *) eo_out.GetOutputBufferPtr();
- int num_floats = eo_out.GetOutputBufferSizeInBytes() / sizeof(float);
+ float *out = (float *) eop.GetOutputBufferPtr();
+ int num_floats = eop.GetOutputBufferSizeInBytes() / sizeof(float);
for (int i = 0; i < num_floats / 7; i++)
{
int index = (int) out[i * 7 + 0];
void ProcessArgs(int argc, char *argv[], std::string& config,
- uint32_t& num_devices, DeviceType& device_type,
- std::string& input_file)
+ uint32_t& num_dsps, uint32_t& num_eves,
+ DeviceType& device_type, std::string& input_file)
{
const struct option long_options[] =
{
{"config", required_argument, 0, 'c'},
- {"num_devices", required_argument, 0, 'n'},
+ {"num_dsps", required_argument, 0, 'd'},
+ {"num_eves", required_argument, 0, 'e'},
{"image_file", required_argument, 0, 'i'},
{"help", no_argument, 0, 'h'},
{"verbose", no_argument, 0, 'v'},
while (true)
{
- int c = getopt_long(argc, argv, "c:n:i:hv", long_options, &option_index);
+ int c = getopt_long(argc, argv, "c:d:e:i:hv", long_options,
+ &option_index);
if (c == -1)
break;
case 'c': config = optarg;
break;
- case 'n': num_devices = atoi(optarg);
- assert (num_devices > 0 && num_devices <= 4);
+ case 'd': num_dsps = atoi(optarg);
+ assert (num_dsps > 0 && num_dsps <=
+ Executor::GetNumDevices(DeviceType::DSP));
+ break;
+
+ case 'e': num_eves = atoi(optarg);
+ assert (num_eves > 0 && num_eves <=
+ Executor::GetNumDevices(DeviceType::EVE));
break;
case 'i': input_file = optarg;
"Default is jdetnet.\n"
"Optional arguments:\n"
" -c <config> Valid configs: jdetnet \n"
- " -n <number of cores> Number of cores to use (1 - 4)\n"
+ " -d <number> Number of dsp cores to use\n"
+ " -e <number> Number of eve cores to use\n"
" -i <image> Path to the image file\n"
" Default is 1 frame in testvecs\n"
" -i camera Use camera as input\n"
diff --git a/tidl_api/Makefile b/tidl_api/Makefile
index 05a3704af885d2f055b38b0f5f74c92748a485eb..3fc6a2c1922f86dc09aa5e64d83cf7e0c5873f21 100644 (file)
--- a/tidl_api/Makefile
+++ b/tidl_api/Makefile
SRCS = ocl_device.cpp configuration_parser.cpp configuration.cpp\
- executor.cpp execution_object.cpp trace.cpp util.cpp
+ executor.cpp execution_object.cpp trace.cpp util.cpp \
+ execution_object_pipeline.cpp
SRCS_IMGUTIL = imgutil.cpp
OBJS = $(SRCS:.cpp=.o)
HEADERS = src/common_defines.h src/executor_impl.h src/ocl_device.h
HEADERS += src/parameters.h src/tidl_create_params.h src/trace.h src/util.h
HEADERS += inc/configuration.h inc/execution_object.h inc/executor.h
-HEADERS += inc/imgutil.h src/device_arginfo.h
-
+HEADERS += inc/imgutil.h src/device_arginfo.h inc/execution_object_pipeline.h
ifeq ($(BUILD), debug)
CXXFLAGS += -Og -g -ggdb
diff --git a/tidl_api/inc/execution_object.h b/tidl_api/inc/execution_object.h
index e78ad2e98f0ec8ffd4305536bca46c368dbb4f3b..c1d86fc126bb8e243a67df04d30bfb5c3aca63d8 100644 (file)
#pragma once
#include <memory>
+#include "execution_object_internal.h"
namespace tidl {
class LayerOutput;
class IODeviceArgInfo;
-typedef std::vector<std::unique_ptr<const LayerOutput>> LayerOutputs;
/*! @class ExecutionObject
@brief Runs the TIDL network on an OpenCL device
*/
-class ExecutionObject
+class ExecutionObject : public ExecutionObjectInternalInterface
{
public:
const ArgInfo& create_arg,
const ArgInfo& param_heap_arg,
size_t extmem_heap_size,
+ int layersGroupId,
+ bool output_trace,
bool internal_input);
//! @private
~ExecutionObject();
//! Specify the input and output buffers used by the EO
//! @param in buffer used for input.
//! @param out buffer used for output.
- void SetInputOutputBuffer (const ArgInfo& in, const ArgInfo& out);
+ void SetInputOutputBuffer(const ArgInfo& in,
+ const ArgInfo& out) override;
//! Returns a pointer to the input buffer set via SetInputOutputBuffer
- char* GetInputBufferPtr() const;
+ char* GetInputBufferPtr() const override;
//! Returns size of the input buffer
- size_t GetInputBufferSizeInBytes() const;
+ size_t GetInputBufferSizeInBytes() const override;
+
+ //! Returns a pointer to the output buffer
+ char* GetOutputBufferPtr() const override;
+
+ //! Returns size of the output buffer
+ size_t GetOutputBufferSizeInBytes() const override;
//! @brief Set the frame index of the frame currently processed by the
//! ExecutionObject. Used for trace/debug messages
//! @param idx index of the frame
- void SetFrameIndex(int idx);
+ void SetFrameIndex(int idx) override;
//! Returns the index of a frame being processed (set by SetFrameIndex)
- int GetFrameIndex() const;
-
- //! Returns a pointer to the output buffer
- char* GetOutputBufferPtr() const;
-
- //! Returns the number of bytes written to the output buffer
- size_t GetOutputBufferSizeInBytes() const;
+ int GetFrameIndex() const override;
- //! @brief Start processing a frame. The call is asynchronous and returns
- //! immediately. Use ExecutionObject::ProcessFrameWait to wait
- bool ProcessFrameStartAsync();
+ //! @brief Start processing a frame. The call is asynchronous and
+ //! returns immediately. Use ExecutionObject::ProcessFrameWait to wait
+ bool ProcessFrameStartAsync() override;
//! Wait for the execution object to complete processing a frame
//! @return false if ExecutionObject::ProcessFrameWait was called
//! without a corresponding call to
//! ExecutionObject::ProcessFrameStartAsync.
- bool ProcessFrameWait();
-
- //! @brief return the number of cycles taken *on the device* to
- //! execute the process call
- //! @return Number of cycles to process a frame on the device.
- uint64_t GetProcessCycles() const;
+ bool ProcessFrameWait() override;
//! @brief return the number of milliseconds taken *on the device* to
//! execute the process call
//! @return Number of milliseconds to process a frame on the device.
- float GetProcessTimeInMilliSeconds() const;
+ float GetProcessTimeInMilliSeconds() const override;
+
+ //! @brief return the number of milliseconds taken *on the host* to
+ //! execute the process call
+ //! @return Number of milliseconds to process a frame on the host.
+ float GetHostProcessTimeInMilliSeconds() const override;
+
+ //! Returns the device name that the ExecutionObject runs on
+ const std::string& GetDeviceName() const override;
//! Write the output buffer for each layer to a file
- //! <filename_prefix>_<ID>_HxW.bin
+ //! \<filename_prefix>_<ID>_HxW.bin
void WriteLayerOutputsToFile(const std::string& filename_prefix=
- "trace_dump_") const;
+ "trace_dump_") const override;
//! Returns a LayerOutput object corresponding to a layer.
//! Caller is responsible for deleting the LayerOutput object.
//! @param output_index The output index of the buffer for a given
//! layer. Defaults to 0.
const LayerOutput* GetOutputFromLayer(uint32_t layer_index,
- uint32_t output_index=0) const;
+ uint32_t output_index=0) const override;
//! Get output buffers from all layers
- const LayerOutputs* GetOutputsFromAllLayers() const;
+ const LayerOutputs* GetOutputsFromAllLayers() const override;
+
+    //! Returns the layersGroupId that the ExecutionObject is processing
+ int GetLayersGroupId() const;
//! @private
// Used by the Executor
bool RunAsync(CallType ct);
bool Wait (CallType ct);
+ //! @private
+ // Used by the ExecutionObjectPipeline
+ bool AddCallback(CallType ct, void *user_data);
+ void AcquireLock();
+ void ReleaseLock();
+
ExecutionObject() = delete;
ExecutionObject(const ExecutionObject&) = delete;
ExecutionObject& operator=(const ExecutionObject&) = delete;
- void EnableOutputBufferTrace();
-
//! @private
void SetInputOutputBuffer(const IODeviceArgInfo* in,
const IODeviceArgInfo* out);
diff --git a/tidl_api/inc/execution_object_internal.h b/tidl_api/inc/execution_object_internal.h
--- /dev/null
@@ -0,0 +1,119 @@
+/******************************************************************************
+ * Copyright (c) 2018 Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+/*! @file execution_object_internal.h */
+
+#pragma once
+
+namespace tidl {
+
+class LayerOutput;
+
+typedef std::vector<std::unique_ptr<const LayerOutput>> LayerOutputs;
+
+/*! @cond HIDDEN_SYMBOLS
+ @class ExecutionObjectInternalInterface
+ @brief Internal interface for running the TIDL network on OpenCL devices
+ Do not use this internal class directly.
+  Please use ExecutionObject or ExecutionObjectPipeline instead.
+*/
+class ExecutionObjectInternalInterface
+{
+ public:
+ virtual ~ExecutionObjectInternalInterface() {};
+
+ //! Specify the input and output buffers used by the EO
+ //! @param in buffer used for input.
+ //! @param out buffer used for output.
+ virtual void SetInputOutputBuffer(const ArgInfo& in,
+ const ArgInfo& out) =0;
+
+ //! Returns a pointer to the input buffer set via SetInputOutputBuffer
+ virtual char* GetInputBufferPtr() const =0;
+
+ //! Returns size of the input buffer
+ virtual size_t GetInputBufferSizeInBytes() const =0;
+
+ //! Returns a pointer to the output buffer
+ virtual char* GetOutputBufferPtr() const =0;
+
+ //! Returns size of the output buffer
+ virtual size_t GetOutputBufferSizeInBytes() const =0;
+
+ //! @brief Set the frame index of the frame currently processed by the
+ //! ExecutionObject. Used for trace/debug messages
+ //! @param idx index of the frame
+ virtual void SetFrameIndex(int idx) =0;
+
+ //! Returns the index of a frame being processed (set by SetFrameIndex)
+ virtual int GetFrameIndex() const =0;
+
+ //! @brief Start processing a frame. The call is asynchronous and returns
+ //! immediately. Use ExecutionObject::ProcessFrameWait to wait
+ virtual bool ProcessFrameStartAsync() =0;
+
+ //! Wait for the execution object to complete processing a frame
+ //! @return false if ExecutionObject::ProcessFrameWait was called
+ //! without a corresponding call to
+ //! ExecutionObject::ProcessFrameStartAsync.
+ virtual bool ProcessFrameWait() =0;
+
+ //! @brief return the number of milliseconds taken *on the device* to
+ //! execute the process call
+ //! @return Number of milliseconds to process a frame on the device.
+ virtual float GetProcessTimeInMilliSeconds() const =0;
+
+ //! @brief return the number of milliseconds taken *on the host* to
+ //! execute the process call
+ //! @return Number of milliseconds to process a frame on the host.
+ virtual float GetHostProcessTimeInMilliSeconds() const =0;
+
+ //! Returns the device name that the ExecutionObject runs on
+ virtual const std::string& GetDeviceName() const =0;
+
+ //! Write the output buffer for each layer to a file
+ //! \<filename_prefix>_<ID>_HxW.bin
+ virtual void WriteLayerOutputsToFile(const std::string& filename_prefix=
+ "trace_dump_") const =0;
+
+ //! Returns a LayerOutput object corresponding to a layer.
+ //! Caller is responsible for deleting the LayerOutput object.
+ //! @see LayerOutput
+ //! @param layer_index The layer index of the layer
+ //! @param output_index The output index of the buffer for a given
+ //! layer. Defaults to 0.
+ virtual const LayerOutput* GetOutputFromLayer(uint32_t layer_index,
+ uint32_t output_index=0) const =0;
+
+ //! Get output buffers from all layers
+ virtual const LayerOutputs* GetOutputsFromAllLayers() const =0;
+};
+/*! @endcond
+*/
+
+} // namespace tidl
diff --git a/tidl_api/inc/execution_object_pipeline.h b/tidl_api/inc/execution_object_pipeline.h
--- /dev/null
@@ -0,0 +1,151 @@
+/******************************************************************************
+ * Copyright (c) 2018 Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+//! @file execution_object_pipeline.h
+
+#pragma once
+#include <string>
+#include <vector>
+#include <cstdint>
+#include <cassert>
+
+#include "executor.h"
+#include "execution_object_internal.h"
+#include "execution_object.h"
+
+namespace tidl {
+
+/*! @class ExecutionObjectPipeline
+ @brief Manages the pipelined execution using multiple ExecutionObjects.
+ Each executor runs one layersGroup of the network. ExecutionObjects
+ must run consecutive layersGroups to form a pipelined execution.
+*/
+class ExecutionObjectPipeline : public ExecutionObjectInternalInterface
+{
+ public:
+ //! @brief Create an ExecutionObjectPipeline object.
+ //!
+ //! The ExecutionObjectPipeline will take the provided ExecutionObjects
+ //! to create an execution pipeline. E.g.
+ //! @code
+ //! Configuration config("path to configuration file");
+ //! DeviceIds ids = {DeviceId::ID0, DeviceId::ID1};
+ //! Executor exe_eve(DeviceType::EVE, ids, config, 1);
+ //! Executor exe_dsp(DeviceType::DSP, ids, config, 2);
+ //! ExecutionObjectPipeline ep0({exe_eve[0], exe_dsp[0]});
+ //! ExecutionObjectPipeline ep1({exe_eve[1], exe_dsp[1]});
+ //! @endcode
+ //!
+ //! @param eos DSP or EVE ExecutionObjects forming a pipeline
+ ExecutionObjectPipeline(std::vector<ExecutionObject*> eos);
+
+ //! @brief Tear down an ExecutionObjectPipeline and free used resources
+ ~ExecutionObjectPipeline();
+
+ //! Specify the input and output buffers used by the EOP
+ //! @param in buffer used for input.
+ //! @param out buffer used for output.
+ void SetInputOutputBuffer (const ArgInfo& in,
+ const ArgInfo& out) override;
+
+ //! Returns a pointer to the input buffer
+ char* GetInputBufferPtr() const override;
+
+ //! Returns size of the input buffer
+ size_t GetInputBufferSizeInBytes() const override;
+
+ //! Returns a pointer to the output buffer
+ char* GetOutputBufferPtr() const override;
+
+ //! Returns the number of bytes written to the output buffer
+ size_t GetOutputBufferSizeInBytes() const override;
+
+ //! @brief Set the frame index of the frame currently processed by the
+ //! ExecutionObjectPipeline. Used for trace/debug messages
+ //! @param idx index of the frame
+ void SetFrameIndex(int idx) override;
+
+ //! Returns the index of a frame being processed (set by SetFrameIndex)
+ int GetFrameIndex() const override;
+
+ //! @brief Start processing a frame. The call is asynchronous and
+ //! returns immediately. Use ProcessFrameWait() to wait
+ bool ProcessFrameStartAsync() override;
+
+ //! Wait for the executor pipeline to complete processing a frame
+ //! @return false if ProcessFrameWait() was called
+ //! without a corresponding call to
+ //! ExecutionObjectPipeline::ProcessFrameStartAsync().
+ bool ProcessFrameWait() override;
+
+ //! @brief return the number of milliseconds taken *on the device* to
+ //! execute the process call
+ //! @return Number of milliseconds to process a frame on the device.
+ float GetProcessTimeInMilliSeconds() const override;
+
+ //! @brief return the number of milliseconds taken *on the host* to
+ //! execute the process call
+ //! @return Number of milliseconds to process a frame on the host.
+ float GetHostProcessTimeInMilliSeconds() const override;
+
+ //! Return the combined device names that this pipeline runs on
+ const std::string& GetDeviceName() const override;
+
+ //! Write the output buffer for each layer to a file
+ //! \<filename_prefix>_<ID>_HxW.bin
+ void WriteLayerOutputsToFile(const std::string& filename_prefix=
+ "trace_dump_") const override;
+
+ //! Returns a LayerOutput object corresponding to a layer.
+ //! Caller is responsible for deleting the LayerOutput object.
+ //! @see LayerOutput
+ //! @param layer_index The layer index of the layer
+ //! @param output_index The output index of the buffer for a given
+ //! layer. Defaults to 0.
+ const LayerOutput* GetOutputFromLayer(uint32_t layer_index,
+ uint32_t output_index=0) const override;
+
+ //! Get output buffers from all layers
+ const LayerOutputs* GetOutputsFromAllLayers() const override;
+
+ //! @private Used by runtime
+ //! @brief callback function at the completion of each ExecutionObject,
+    //! to chain the next ExecutionObject for execution
+ void RunAsyncNext();
+
+ ExecutionObjectPipeline() = delete;
+ ExecutionObjectPipeline(const ExecutionObjectPipeline&) = delete;
+ ExecutionObjectPipeline& operator=(const ExecutionObjectPipeline&)
+ = delete;
+
+ private:
+ class Impl;
+ std::unique_ptr<Impl> pimpl_m;
+};
+
+} // namespace tidl
index 23d92ffef1b2ff815ca3ae23a267e24472dd8b02..1febfeadc2c687a6ce047c16a5f53fef69e00d75 100644 (file)
--- a/tidl_api/inc/executor.h
+++ b/tidl_api/inc/executor.h
typedef std::vector<std::unique_ptr<ExecutionObject>> ExecutionObjects;
/*! @class Executor
- @brief Manages the overall execution of a network using the
+ @brief Manages the overall execution of a layersGroup in a network using the
specified configuration and the set of devices available to the
executor.
*/
//! @code
//! Configuration configuration;
//! configuration.ReadFromFile("path to configuration file");
- //! DeviceIds ids1 = {DeviceId::ID2, DeviceId::ID3};
+ //! DeviceIds ids = {DeviceId::ID2, DeviceId::ID3};
//! Executor executor(DeviceType::EVE, ids, configuration);
//! @endcode
//!
//! available on this instance of the Executor
const ExecutionObjects& GetExecutionObjects() const;
+ //! Returns a single execution object at index
+ ExecutionObject* operator[](uint32_t index) const;
+
//! @brief Returns the number of devices of the specified type
//! available for TI DL.
//! @param device_type DSP or EVE device
//! @brief Returns a string corresponding to the API version
//!
- //! @return <major_ver>.<minor_ver>.<patch_ver>.<git_sha>
+ //! @return \<major_ver>.\<minor_ver>.\<patch_ver>.\<git_sha>
static std::string GetAPIVersion();
Executor(const Executor&) = delete;
diff --git a/tidl_api/src/execution_object.cpp b/tidl_api/src/execution_object.cpp
index d722ebb196669019fbc3071338397d62a9a79ba9..178bbcaeb9f256c47c38df355171622c8413638f 100644 (file)
#include <string.h>
#include <fstream>
#include <climits>
+#include <mutex>
+#include <condition_variable>
+#include <chrono>
#include "executor.h"
#include "execution_object.h"
#include "trace.h"
const DeviceArgInfo& create_arg,
const DeviceArgInfo& param_heap_arg,
size_t extmem_heap_size,
+ int layers_group_id,
+ bool output_trace,
bool internal_input);
~Impl() {}
bool RunAsync(CallType ct);
bool Wait (CallType ct);
+ bool AddCallback(CallType ct, void *user_data);
+
+ uint64_t GetProcessCycles() const;
+ int GetLayersGroupId() const;
+ void AcquireLock();
+ void ReleaseLock();
Device* device_m;
+ // Index of the OpenCL device/queue used by this EO
+ uint8_t device_index_m;
+ std::string device_name_m;
up_malloc_ddr<char> tidl_extmem_heap_m;
up_malloc_ddr<OCL_TIDL_InitializeParams> shared_initialize_params_m;
// Frame being processed by the EO
int current_frame_idx_m;
+ // LayersGroupId being processed by the EO
+ int layers_group_id_m;
+
// Trace related
void WriteLayerOutputsToFile (const std::string& filename_prefix) const;
up_malloc_ddr<OCL_TIDL_BufParams> trace_buf_params_m;
size_t trace_buf_params_sz_m;
+ // host time tracking: eo start to finish
+ float host_time_m;
+
private:
void SetupInitializeKernel(const DeviceArgInfo& create_arg,
const DeviceArgInfo& param_heap_arg,
size_t extmem_heap_size,
bool internal_input);
+ void EnableOutputBufferTrace();
void SetupProcessKernel();
void HostWriteNetInput();
void HostReadNetOutput();
void ComputeInputOutputSizes();
- // Index of the OpenCL device/queue used by this EO
- uint8_t device_index_m;
-
std::unique_ptr<Kernel> k_initialize_m;
std::unique_ptr<Kernel> k_process_m;
std::unique_ptr<Kernel> k_cleanup_m;
-
+ // Guarding sole access to input/output for one frame during execution
+ bool is_idle_m;
+ std::mutex mutex_access_m;
+ std::condition_variable cv_access_m;
};
const ArgInfo& create_arg,
const ArgInfo& param_heap_arg,
size_t extmem_heap_size,
+ int layers_group_id,
+ bool output_trace,
bool internal_input)
{
DeviceArgInfo create_arg_d(create_arg, DeviceArgInfo::Kind::BUFFER);
create_arg_d,
param_heap_arg_d,
extmem_heap_size,
+ layers_group_id,
+ output_trace,
internal_input) };
}
const DeviceArgInfo& create_arg,
const DeviceArgInfo& param_heap_arg,
size_t extmem_heap_size,
+ int layers_group_id,
+ bool output_trace,
bool internal_input):
device_m(d),
+ device_index_m(device_index),
tidl_extmem_heap_m (nullptr, &__free_ddr),
shared_initialize_params_m(nullptr, &__free_ddr),
shared_process_params_m(nullptr, &__free_ddr),
in_m(),
out_m(),
current_frame_idx_m(0),
+ layers_group_id_m(layers_group_id),
num_network_layers_m(0),
trace_buf_params_m(nullptr, &__free_ddr),
trace_buf_params_sz_m(0),
- device_index_m(device_index),
k_initialize_m(nullptr),
k_process_m(nullptr),
- k_cleanup_m(nullptr)
+ k_cleanup_m(nullptr),
+ is_idle_m(true)
{
- SetupInitializeKernel(create_arg, param_heap_arg, extmem_heap_size,
- internal_input);
-
- SetupProcessKernel();
-
+ device_name_m = device_m->GetDeviceName() + std::to_string(device_index_m);
// Save number of layers in the network
const TIDL_CreateParams* cp =
static_cast<const TIDL_CreateParams *>(create_arg.ptr());
num_network_layers_m = cp->net.numLayers;
+
+ SetupInitializeKernel(create_arg, param_heap_arg, extmem_heap_size,
+ internal_input);
+
+ if (output_trace) EnableOutputBufferTrace();
+ SetupProcessKernel();
}
// Pointer to implementation idiom: https://herbsutter.com/gotw/_100/:
size_t ExecutionObject::GetInputBufferSizeInBytes() const
{
- const DeviceArgInfo& arg = pimpl_m->in_m.GetArg();
- if (arg.ptr() == nullptr) return pimpl_m->in_size_m;
- else return arg.size();
+ return pimpl_m->in_size_m;
}
char* ExecutionObject::GetOutputBufferPtr() const
size_t ExecutionObject::GetOutputBufferSizeInBytes() const
{
- const DeviceArgInfo& arg = pimpl_m->out_m.GetArg();
- if (arg.ptr() == nullptr)
- return pimpl_m->out_size_m;
- else
- return pimpl_m->shared_process_params_m.get()->bytesWritten;
+ return pimpl_m->out_size_m;
}
void ExecutionObject::SetFrameIndex(int idx)
void ExecutionObject::SetInputOutputBuffer(const ArgInfo& in, const ArgInfo& out)
{
- assert(in.ptr() != nullptr && in.size() > 0);
- assert(out.ptr() != nullptr && out.size() > 0);
+ assert(in.ptr() != nullptr && in.size() >= pimpl_m->in_size_m);
+ assert(out.ptr() != nullptr && out.size() >= pimpl_m->out_size_m);
pimpl_m->in_m = IODeviceArgInfo(in);
pimpl_m->out_m = IODeviceArgInfo(out);
bool ExecutionObject::ProcessFrameStartAsync()
{
+ assert(GetInputBufferPtr() != nullptr && GetOutputBufferPtr() != nullptr);
return pimpl_m->RunAsync(ExecutionObject::CallType::PROCESS);
}
return pimpl_m->Wait(ct);
}
-uint64_t ExecutionObject::GetProcessCycles() const
+bool ExecutionObject::AddCallback(CallType ct, void *user_data)
{
- uint8_t factor = 1;
-
- // ARP32 running at half frequency of VCOP, multiply by 2 for VCOP cycles
- if (pimpl_m->device_m->type() == CL_DEVICE_TYPE_CUSTOM)
- factor = 2;
-
- return pimpl_m->shared_process_params_m.get()->cycles * factor;
+ return pimpl_m->AddCallback(ct, user_data);
}
float ExecutionObject::GetProcessTimeInMilliSeconds() const
{
float frequency = pimpl_m->device_m->GetFrequencyInMhz() * 1000000;
- return ((float)GetProcessCycles())/frequency * 1000;
+ return ((float)pimpl_m->GetProcessCycles()) / frequency * 1000;
+}
+
+float ExecutionObject::GetHostProcessTimeInMilliSeconds() const
+{
+ return pimpl_m->host_time_m;
+}
+
+void
+ExecutionObject::WriteLayerOutputsToFile(const std::string& filename_prefix) const
+{
+ pimpl_m->WriteLayerOutputsToFile(filename_prefix);
}
const LayerOutput* ExecutionObject::GetOutputFromLayer(
return pimpl_m->GetOutputsFromAllLayers();
}
-//
-// Allocate an OpenCL buffer for TIDL layer output buffer metadata.
-// The device will populate metadata for every buffer that is used as an
-// output buffer by a layer.
-//
-void ExecutionObject::EnableOutputBufferTrace()
+int ExecutionObject::GetLayersGroupId() const
{
- pimpl_m->trace_buf_params_sz_m = (sizeof(OCL_TIDL_BufParams)*
- pimpl_m->num_network_layers_m*
- TIDL_NUM_OUT_BUFS);
-
- pimpl_m->trace_buf_params_m.reset(malloc_ddr<OCL_TIDL_BufParams>
- (pimpl_m->trace_buf_params_sz_m));
+ return pimpl_m->layers_group_id_m;
+}
- // Device will update bufferId if there is valid data for the entry
- OCL_TIDL_BufParams* bufferParams = pimpl_m->trace_buf_params_m.get();
- for (uint32_t i = 0; i < pimpl_m->num_network_layers_m; i++)
- for (int j = 0; j < TIDL_NUM_OUT_BUFS; j++)
- {
- OCL_TIDL_BufParams *bufP =
- &bufferParams[i*TIDL_NUM_OUT_BUFS+j];
- bufP->bufferId = UINT_MAX;
- }
+const std::string& ExecutionObject::GetDeviceName() const
+{
+ return pimpl_m->device_name_m;
}
-void
-ExecutionObject::WriteLayerOutputsToFile(const std::string& filename_prefix) const
+void ExecutionObject::AcquireLock()
{
- pimpl_m->WriteLayerOutputsToFile(filename_prefix);
+ pimpl_m->AcquireLock();
}
+void ExecutionObject::ReleaseLock()
+{
+ pimpl_m->ReleaseLock();
+}
//
// Create a kernel to call the "initialize" function
device_index_m));
}
+//
+// Allocate an OpenCL buffer for TIDL layer output buffer metadata.
+// The device will populate metadata for every buffer that is used as an
+// output buffer by a layer. This needs to be done before setting up
+// process kernel.
+//
+void ExecutionObject::Impl::EnableOutputBufferTrace()
+{
+ trace_buf_params_sz_m = (sizeof(OCL_TIDL_BufParams)*
+ num_network_layers_m*
+ TIDL_NUM_OUT_BUFS);
+
+ trace_buf_params_m.reset(malloc_ddr<OCL_TIDL_BufParams>
+ (trace_buf_params_sz_m));
+
+ // Device will update bufferId if there is valid data for the entry
+ OCL_TIDL_BufParams* bufferParams = trace_buf_params_m.get();
+ for (uint32_t i = 0; i < num_network_layers_m; i++)
+ for (int j = 0; j < TIDL_NUM_OUT_BUFS; j++)
+ {
+ OCL_TIDL_BufParams *bufP =
+ &bufferParams[i*TIDL_NUM_OUT_BUFS+j];
+ bufP->bufferId = UINT_MAX;
+ }
+}
+
//
// Create a kernel to call the "process" function
//
}
case CallType::PROCESS:
{
+ std::chrono::time_point<std::chrono::steady_clock> t1, t2;
+ t1 = std::chrono::steady_clock::now();
+
shared_process_params_m->frameIdx = current_frame_idx_m;
shared_process_params_m->bytesWritten = 0;
HostWriteNetInput();
k_process_m->RunAsync();
+
+ t2 = std::chrono::steady_clock::now();
+ std::chrono::duration<float> elapsed = t2 - t1;
+ host_time_m = elapsed.count() * 1000;
break;
}
case CallType::CLEANUP:
}
case CallType::PROCESS:
{
- bool has_work = k_process_m->Wait();
+ float host_elapsed_ms = 0.0f;
+ bool has_work = k_process_m->Wait(&host_elapsed_ms);
if (has_work)
{
if (shared_process_params_m->errorCode != OCL_TIDL_SUCCESS)
throw Exception(shared_process_params_m->errorCode,
__FILE__, __FUNCTION__, __LINE__);
+
+ std::chrono::time_point<std::chrono::steady_clock> t1, t2;
+ t1 = std::chrono::steady_clock::now();
HostReadNetOutput();
+ t2 = std::chrono::steady_clock::now();
+ std::chrono::duration<float> elapsed = t2 - t1;
+ host_time_m += elapsed.count() * 1000 + host_elapsed_ms;
}
return has_work;
return false;
}
+bool ExecutionObject::Impl::AddCallback(CallType ct, void *user_data)
+{
+ switch (ct)
+ {
+ case CallType::PROCESS:
+ {
+ return k_process_m->AddCallback(user_data);
+ break;
+ }
+ default:
+ return false;
+ }
+
+ return false;
+}
+
+uint64_t ExecutionObject::Impl::GetProcessCycles() const
+{
+ uint8_t factor = 1;
+
+ // ARP32 running at half frequency of VCOP, multiply by 2 for VCOP cycles
+ if (device_m->type() == CL_DEVICE_TYPE_CUSTOM)
+ factor = 2;
+
+ return shared_process_params_m.get()->cycles * factor;
+}
+
//
// Write the trace data to output files
//
{
delete[] data_m;
}
+
+void ExecutionObject::Impl::AcquireLock()
+{
+ std::unique_lock<std::mutex> lock(mutex_access_m);
+ cv_access_m.wait(lock, [this]{ return this->is_idle_m; });
+ is_idle_m = false;
+}
+
+void ExecutionObject::Impl::ReleaseLock()
+{
+ is_idle_m = true;
+ cv_access_m.notify_all();
+}
diff --git a/tidl_api/src/execution_object_pipeline.cpp b/tidl_api/src/execution_object_pipeline.cpp
--- /dev/null
@@ -0,0 +1,360 @@
+/******************************************************************************
+ * Copyright (c) 2018 Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+#include <assert.h>
+#include <mutex>
+#include <condition_variable>
+#include <chrono>
+#include "device_arginfo.h"
+#include "execution_object_pipeline.h"
+
+using namespace tidl;
+
+class ExecutionObjectPipeline::Impl
+{
+ public:
+ Impl(std::vector<ExecutionObject*> &eos);
+ ~Impl();
+
+ void SetInputOutputBuffer(const ArgInfo &in, const ArgInfo &out);
+ bool RunAsyncStart();
+ bool RunAsyncNext();
+ bool Wait();
+
+ // Trace related
+ void WriteLayerOutputsToFile(const std::string& filename_prefix) const;
+ const LayerOutput* GetOutputFromLayer(uint32_t layer_index,
+ uint32_t output_index) const;
+ const LayerOutputs* GetOutputsFromAllLayers() const;
+
+ //! for pipelined execution
+ std::vector<ExecutionObject*> eos_m;
+ std::vector<IODeviceArgInfo*> iobufs_m;
+
+ std::string device_name_m;
+
+ //! current frame index
+ int frame_idx_m;
+
+ //! current execution object index
+ uint32_t curr_eo_idx_m;
+
+ // host time tracking: pipeline start to finish
+ float host_time_m;
+
+ private:
+ //! @brief Initialize ExecutionObjectPipeline with given
+ //! ExecutionObjects: check consecutive layersGroup, allocate memory
+ void Initialize();
+
+ // flag, mutex and cond var for signaling completion and waiting
+ bool has_work_m, is_processed_m;
+ std::mutex mutex_m;
+ std::condition_variable cv_m;
+
+ // host time tracking: pipeline start to finish
+ std::chrono::time_point<std::chrono::steady_clock> start_m;
+};
+
+ExecutionObjectPipeline::ExecutionObjectPipeline(
+ std::vector<ExecutionObject*> eos)
+{
+ pimpl_m = std::unique_ptr<Impl> { new Impl(eos) };
+}
+
+ExecutionObjectPipeline::Impl::Impl(std::vector<ExecutionObject *> &eos) :
+ eos_m(eos), has_work_m(false), is_processed_m(false)
+{
+ Initialize();
+}
+
+// Pointer to implementation idiom: https://herbsutter.com/gotw/_100/:
+// Both unique_ptr and shared_ptr can be instantiated with an incomplete type
+// unique_ptr's destructor requires a complete type in order to invoke delete
+ExecutionObjectPipeline::~ExecutionObjectPipeline() = default;
+
+char* ExecutionObjectPipeline::GetInputBufferPtr() const
+{
+ return static_cast<char *>(pimpl_m->iobufs_m.front()->GetArg().ptr());
+}
+
+size_t ExecutionObjectPipeline::GetInputBufferSizeInBytes() const
+{
+ return pimpl_m->eos_m.front()->GetInputBufferSizeInBytes();
+}
+
+char* ExecutionObjectPipeline::GetOutputBufferPtr() const
+{
+ return static_cast<char *>(pimpl_m->iobufs_m.back()->GetArg().ptr());
+}
+
+size_t ExecutionObjectPipeline::GetOutputBufferSizeInBytes() const
+{
+ return pimpl_m->eos_m.back()->GetOutputBufferSizeInBytes();
+}
+
+void ExecutionObjectPipeline::SetInputOutputBuffer(const ArgInfo& in,
+ const ArgInfo& out)
+{
+ assert(in.ptr() != nullptr && in.size() >= GetInputBufferSizeInBytes());
+ assert(out.ptr() != nullptr && out.size() >= GetOutputBufferSizeInBytes());
+ pimpl_m->SetInputOutputBuffer(in, out);
+}
+
+void ExecutionObjectPipeline::SetFrameIndex(int idx)
+{
+ pimpl_m->frame_idx_m = idx;
+}
+
+int ExecutionObjectPipeline::GetFrameIndex() const
+{
+ return pimpl_m->frame_idx_m;
+}
+
+bool ExecutionObjectPipeline::ProcessFrameStartAsync()
+{
+ assert(GetInputBufferPtr() != nullptr && GetOutputBufferPtr() != nullptr);
+ bool st = pimpl_m->RunAsyncStart();
+ if (st)
+ st = pimpl_m->eos_m[0]->AddCallback(ExecutionObject::CallType::PROCESS,
+ this);
+ return st;
+}
+
+bool ExecutionObjectPipeline::ProcessFrameWait()
+{
+ return pimpl_m->Wait();
+}
+
+void CallbackWrapper(void *user_data)
+{
+ ((ExecutionObjectPipeline *) user_data)->RunAsyncNext();
+}
+
+void ExecutionObjectPipeline::RunAsyncNext()
+{
+ bool has_next = pimpl_m->RunAsyncNext();
+ if (has_next)
+ pimpl_m->eos_m[pimpl_m->curr_eo_idx_m]->AddCallback(
+ ExecutionObject::CallType::PROCESS, this);
+}
+
+float ExecutionObjectPipeline::GetProcessTimeInMilliSeconds() const
+{
+ float total = 0.0f;
+ for (auto eo : pimpl_m->eos_m)
+ total += eo->GetProcessTimeInMilliSeconds();
+ return total;
+}
+
+float ExecutionObjectPipeline::GetHostProcessTimeInMilliSeconds() const
+{
+ return pimpl_m->host_time_m;
+}
+
+const std::string& ExecutionObjectPipeline::GetDeviceName() const
+{
+ return pimpl_m->device_name_m;
+}
+
+void
+ExecutionObjectPipeline::WriteLayerOutputsToFile(
+ const std::string& filename_prefix) const
+{
+ pimpl_m->WriteLayerOutputsToFile(filename_prefix);
+}
+
+const LayerOutput*
+ExecutionObjectPipeline::GetOutputFromLayer(uint32_t layer_index,
+ uint32_t output_index) const
+{
+ return pimpl_m->GetOutputFromLayer(layer_index, output_index);
+}
+
+const LayerOutputs*
+ExecutionObjectPipeline::GetOutputsFromAllLayers() const
+{
+ return pimpl_m->GetOutputsFromAllLayers();
+}
+
+
+/// Impl methods start here
+
+
+static
+void* AllocateMem(size_t size)
+{
+ if (size == 0) return nullptr;
+ void *ptr = malloc(size);
+ if (ptr == nullptr)
+ throw Exception("Out of memory, ExecutionObjectPipeline malloc failed",
+ __FILE__, __FUNCTION__, __LINE__);
+ return ptr;
+}
+
+void ExecutionObjectPipeline::Impl::Initialize()
+{
+ // Check consecutive layersGroups to form a pipeline
+ int prev_group = 0;
+ for (auto eo : eos_m)
+ {
+ int group = eo->GetLayersGroupId();
+ if (prev_group != 0 && group != prev_group + 1)
+ throw Exception(
+ "Non-consecutive layersGroupIds in ExecutionObjectPipeline",
+ __FILE__, __FUNCTION__, __LINE__);
+ prev_group = group;
+ }
+
+ for (auto eo : eos_m)
+ device_name_m += eo->GetDeviceName() + "+";
+ device_name_m.resize(device_name_m.size() - 1);
+
+ // Allocate input and output memory for EOs/layersGroups
+ // Note that i-th EO's output buffer is the same as (i+1)-th EO's input
+ // So, if n EOs, then (n+1) buffers: b EO b EO b EO b ... EO b
+ // User must set the first input buffer and the last output buffer
+ size_t size;
+ ArgInfo in(nullptr, 0);
+ iobufs_m.push_back(new IODeviceArgInfo(in));
+ for (auto eo : eos_m)
+ {
+ if (eo != eos_m.back())
+ size = eo->GetOutputBufferSizeInBytes();
+ else
+ size = 0;
+
+ void *ptr = AllocateMem(size);
+ ArgInfo out(ptr, size);
+ iobufs_m.push_back(new IODeviceArgInfo(out));
+ }
+}
+
+ExecutionObjectPipeline::Impl::~Impl()
+{
+ int num_iobufs = iobufs_m.size();
+ for (int i = 0; i < num_iobufs; i++)
+ {
+ if (! (i == 0 || i == num_iobufs-1))
+ free(iobufs_m[i]->GetArg().ptr());
+ delete iobufs_m[i];
+ }
+}
+
+void ExecutionObjectPipeline::Impl::SetInputOutputBuffer(const ArgInfo &in,
+ const ArgInfo &out)
+{
+ delete iobufs_m.front();
+ delete iobufs_m.back();
+ iobufs_m.front() = new IODeviceArgInfo(in);
+ iobufs_m.back() = new IODeviceArgInfo(out);
+}
+
+bool ExecutionObjectPipeline::Impl::RunAsyncStart()
+{
+ start_m = std::chrono::steady_clock::now();
+ has_work_m = true;
+ is_processed_m = false;
+ host_time_m = 0.0f;
+ curr_eo_idx_m = 0;
+ eos_m[0]->AcquireLock();
+ eos_m[0]->SetInputOutputBuffer(iobufs_m[0], iobufs_m[1]);
+ return eos_m[0]->ProcessFrameStartAsync();
+}
+
+// returns true if we have more EOs to execute
+bool ExecutionObjectPipeline::Impl::RunAsyncNext()
+{
+ eos_m[curr_eo_idx_m]->ProcessFrameWait();
+ eos_m[curr_eo_idx_m]->ReleaseLock();
+ curr_eo_idx_m += 1;
+ if (curr_eo_idx_m < eos_m.size())
+ {
+ eos_m[curr_eo_idx_m]->AcquireLock();
+ eos_m[curr_eo_idx_m]->SetInputOutputBuffer(iobufs_m[curr_eo_idx_m],
+ iobufs_m[curr_eo_idx_m+1]);
+ eos_m[curr_eo_idx_m]->ProcessFrameStartAsync();
+ return true;
+ }
+ else
+ {
+ std::chrono::duration<float> elapsed = std::chrono::steady_clock::now()
+ - start_m;
+ host_time_m = elapsed.count() * 1000; // seconds to milliseconds
+ is_processed_m = true;
+ cv_m.notify_all();
+ return false;
+ }
+}
+
+bool ExecutionObjectPipeline::Impl::Wait()
+{
+ if (! has_work_m) return false;
+
+ std::unique_lock<std::mutex> lock(mutex_m);
+ cv_m.wait(lock, [this]{ return this->is_processed_m; });
+ has_work_m = false;
+ return true;
+}
+
+void
+ExecutionObjectPipeline::Impl::WriteLayerOutputsToFile(
+ const std::string& filename_prefix) const
+{
+ for (auto eo : eos_m)
+ eo->WriteLayerOutputsToFile(filename_prefix);
+}
+
+const LayerOutput*
+ExecutionObjectPipeline::Impl::GetOutputFromLayer(uint32_t layer_index,
+ uint32_t output_index) const
+{
+ const LayerOutput* lo = nullptr;
+ for (auto eo : eos_m)
+ {
+ lo = eo->GetOutputFromLayer(layer_index, output_index);
+ if (lo != nullptr) break;
+ }
+ return lo;
+}
+
+const LayerOutputs*
+ExecutionObjectPipeline::Impl::GetOutputsFromAllLayers() const
+{
+ LayerOutputs *all = new LayerOutputs;
+ for (auto eo : eos_m)
+ {
+ LayerOutputs *los = const_cast<LayerOutputs *>(
+ eo->GetOutputsFromAllLayers());
+ for (auto& lo : *los)
+ all->push_back(std::unique_ptr<const LayerOutput>{ lo.release() });
+ delete los;
+ }
+ return all;
+}
+
diff --git a/tidl_api/src/executor.cpp b/tidl_api/src/executor.cpp
index b644728afe8f92a75dc65f3bd200ec8aaa8e26fe..914c78ab58104eeba379db5ae8305e45537d007e 100644 (file)
return pimpl_m->execution_objects_m;
}
+ExecutionObject* Executor::operator[](uint32_t index) const
+{
+ assert(index < pimpl_m->execution_objects_m.size());
+ return pimpl_m->execution_objects_m[index].get();
+}
+
bool ExecutorImpl::Initialize(const Configuration& configuration)
{
configuration_m = configuration;
{new ExecutionObject(device_m.get(), index,
create_arg, param_heap_arg,
configuration_m.EXTMEM_HEAP_SIZE,
+ layers_group_id_m,
+ configuration_m.enableOutputTrace,
configuration_m.enableInternalInput)} );
}
- if (configuration_m.enableOutputTrace)
- for (auto &eo : execution_objects_m)
- eo->EnableOutputBufferTrace();
-
for (auto &eo : execution_objects_m)
eo->RunAsync(ExecutionObject::CallType::INIT);
{
return message_m.c_str();
}
-
diff --git a/tidl_api/src/ocl_device.cpp b/tidl_api/src/ocl_device.cpp
index fba4f94305da4659de311f604a320ebec8219159..b3eaf36d4894a8c2f0b15f60d0d24889a9dcc4fa 100644 (file)
// Queue 0 on device 0
queue_m[0] = clCreateCommandQueue(context_m,
device_ids[0],
- 0,
+ CL_QUEUE_PROFILING_ENABLE,
&errcode);
errorCheck(errcode, __LINE__);
BuildProgramFromBinary(binary_filename, device_ids, 1);
int index = static_cast<int>(id);
queue_m[index] = clCreateCommandQueue(context_m,
sub_devices[index],
- 0,
+ CL_QUEUE_PROFILING_ENABLE,
&errcode);
errorCheck(errcode, __LINE__);
}
int index = static_cast<int>(id);
queue_m[index] = clCreateCommandQueue(context_m,
all_device_ids[index],
- 0,
+ CL_QUEUE_PROFILING_ENABLE,
&errcode);
errorCheck(errcode, __LINE__);
}
}
-bool Kernel::Wait()
+bool Kernel::Wait(float *host_elapsed_ms)
{
// Wait called without a corresponding RunAsync
if (!is_running_m)
TRACE::print("\tKernel: waiting...\n");
cl_int ret = clWaitForEvents(1, &event_m);
errorCheck(ret, __LINE__);
+
+ if (host_elapsed_ms != nullptr)
+ {
+ cl_ulong t_que, t_end;
+ clGetEventProfilingInfo(event_m, CL_PROFILING_COMMAND_QUEUED,
+ sizeof(cl_ulong), &t_que, nullptr);
+ clGetEventProfilingInfo(event_m, CL_PROFILING_COMMAND_END,
+ sizeof(cl_ulong), &t_end, nullptr);
+        *host_elapsed_ms = (t_end - t_que) / 1.0e6; // nanoseconds to milliseconds
+ }
+
ret = clReleaseEvent(event_m);
errorCheck(ret, __LINE__);
TRACE::print("\tKernel: finished execution\n");
return true;
}
+extern void CallbackWrapper(void *user_data) __attribute__((weak));
+
+static
+void EventCallback(cl_event event, cl_int exec_status, void *user_data)
+{
+ if (exec_status != CL_SUCCESS || user_data == nullptr) return;
+ if (CallbackWrapper) CallbackWrapper(user_data);
+}
+
+bool Kernel::AddCallback(void *user_data)
+{
+ if (! is_running_m) return false;
+ return clSetEventCallback(event_m, CL_COMPLETE, EventCallback, user_data)
+ == CL_SUCCESS;
+}
+
Kernel::~Kernel()
{
for (auto b : buffers_m)
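
[Editorial note] A side remark on the callback plumbing above: CallbackWrapper is declared as a weak external so that ocl_device.cpp links whether or not the translation unit defining it (execution_object_pipeline.cpp) is present, and the OpenCL event callback simply skips the call when the symbol resolves to null. A minimal standalone sketch of that linkage pattern, assuming GCC/Clang on an ELF target; the names here are illustrative and not part of the TIDL API:

    // provider.cpp -- optional translation unit that supplies the callback
    #include <cstdio>
    void OnKernelComplete(void* user_data)
    {
        std::printf("kernel finished, cookie=%p\n", user_data);
    }

    // consumer.cpp -- always linked; tolerates a missing definition
    extern void OnKernelComplete(void* user_data) __attribute__((weak));

    void NotifyCompletion(void* user_data)
    {
        // Weak reference: the address is null when provider.cpp is not linked in.
        if (OnKernelComplete) OnKernelComplete(user_data);
    }
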
diff --git a/tidl_api/src/ocl_device.h b/tidl_api/src/ocl_device.h
index 6e80166e5f77b9a046d757c821b7a1ca979a6015..04c5db6af4fef1f0ab7984f139a98a495ef0a2ba 100644 (file)
static uint32_t GetNumDevices(DeviceType device_type);
+ virtual std::string GetDeviceName() = 0;
+
protected:
static const int MAX_DEVICES = 4;
DspDevice(const DspDevice&) = delete;
DspDevice& operator=(const DspDevice&) = delete;
+ virtual std::string GetDeviceName() { return "DSP"; }
+
protected:
bool BuildProgramFromBinary(const std::string &binary_filename,
cl_device_id device_ids[],
EveDevice(const EveDevice&) = delete;
EveDevice& operator=(const EveDevice&) = delete;
+ virtual std::string GetDeviceName() { return "EVE"; }
+
protected:
bool BuildProgramFromBinary(const std::string &kernel_names,
cl_device_id device_ids[],
~Kernel();
Kernel& RunAsync();
- bool Wait();
+ bool Wait(float *host_elapsed_ms = nullptr);
+ bool AddCallback(void *user_data);
private:
cl_kernel kernel_m;