From: Yuan Zhao Date: Fri, 10 Aug 2018 04:42:42 +0000 (-0500) Subject: ExecutionObjectPipeline for executing layersGroups X-Git-Tag: v01.01.00.00^2~16 X-Git-Url: https://git.ti.com/gitweb?p=tidl%2Ftidl-api.git;a=commitdiff_plain;h=1a42784dc57d81735218ec2dc85172a1ed4e8181 ExecutionObjectPipeline for executing layersGroups - Add top level ExecutionObjectPipeline class to execute multiple layersGroups. - An ExecutionObjectPipeline is constructed from multiple ExecutionObjects, each ExecutionObject executes one layersGroup in the network, together they execute consecutive layersGroups. - Same look and feel as ExecutionObject, e.g. ProcessFrameStartAsync, ProcessFrameWait, GetInputBufferPointer, GetOutputBufferPointer - MCT-1017, MCT-1029 --- diff --git a/examples/ssd_multibox/main.cpp b/examples/ssd_multibox/main.cpp index 6d39dda..b302cfa 100644 --- a/examples/ssd_multibox/main.cpp +++ b/examples/ssd_multibox/main.cpp @@ -43,6 +43,7 @@ #include "executor.h" #include "execution_object.h" +#include "execution_object_pipeline.h" #include "configuration.h" #include "../segmentation/object_classes.h" @@ -67,13 +68,13 @@ using namespace tidl; using namespace cv; -bool RunConfiguration(const std::string& config_file, uint32_t num_devices, +bool RunConfiguration(const std::string& config_file, + uint32_t num_dsps, uint32_t num_eves, DeviceType device_type, std::string& input_file); -bool ReadFrame(ExecutionObject& eo, int frame_idx, +bool ReadFrame(ExecutionObjectPipeline& eop, int frame_idx, const Configuration& configuration, int num_frames, std::string& image_file, VideoCapture &cap); -bool WriteFrameOutput(const ExecutionObject &eo_in, - const ExecutionObject &eo_out, +bool WriteFrameOutput(const ExecutionObjectPipeline& eop, const Configuration& configuration); void ReportTime(int frame_index, std::string device_name, double elapsed_host, @@ -81,7 +82,8 @@ void ReportTime(int frame_index, std::string device_name, double elapsed_host, static void ProcessArgs(int argc, char *argv[], std::string& config, - uint32_t& num_devices, + uint32_t& num_dsps, + uint32_t& num_eves, DeviceType& device_type, std::string& input_file); @@ -110,18 +112,12 @@ int main(int argc, char *argv[]) // Process arguments std::string config = DEFAULT_CONFIG; std::string input_file = DEFAULT_INPUT; - uint32_t num_devices = 1; + uint32_t num_dsps = 1; + uint32_t num_eves = 1; DeviceType device_type = DeviceType::EVE; - ProcessArgs(argc, argv, config, num_devices, device_type, input_file); + ProcessArgs(argc, argv, config, num_dsps, num_eves, + device_type, input_file); - // Use same number of EVEs and DSPs - num_devices = std::min(num_devices, std::min(num_eve, num_dsp)); - if (num_devices == 0) - { - std::cout << "Partitioned execution requires at least 1 EVE and 1 DSP." - << std::endl; - return EXIT_FAILURE; - } if ((object_class_table = GetObjectClassTable(config)) == nullptr) { std::cout << "No object classes defined for this config." 
<< std::endl; @@ -136,8 +132,8 @@ int main(int argc, char *argv[]) std::cout << "Input: " << input_file << std::endl; std::string config_file = "../test/testvecs/config/infer/tidl_config_" + config + ".txt"; - bool status = RunConfiguration(config_file, num_devices, device_type, - input_file); + bool status = RunConfiguration(config_file, num_dsps, num_eves, + device_type, input_file); if (!status) { @@ -149,12 +145,15 @@ int main(int argc, char *argv[]) return EXIT_SUCCESS; } -bool RunConfiguration(const std::string& config_file, uint32_t num_devices, +bool RunConfiguration(const std::string& config_file, + uint32_t num_dsps, uint32_t num_eves, DeviceType device_type, std::string& input_file) { - DeviceIds ids; - for (int i = 0; i < num_devices; i++) - ids.insert(static_cast(i)); + DeviceIds ids_eve, ids_dsp; + for (int i = 0; i < num_eves; i++) + ids_eve.insert(static_cast(i)); + for (int i = 0; i < num_dsps; i++) + ids_dsp.insert(static_cast(i)); // Read the TI DL configuration file Configuration configuration; @@ -167,7 +166,7 @@ bool RunConfiguration(const std::string& config_file, uint32_t num_devices, } // setup input - int num_frames = is_default_input ? 3 : 1; + int num_frames = is_default_input ? 9 : 9; VideoCapture cap; std::string image_file; if (is_camera_input) @@ -192,82 +191,58 @@ bool RunConfiguration(const std::string& config_file, uint32_t num_devices, // and configuration specified // EVE will run layersGroupId 1 in the network, while // DSP will run layersGroupId 2 in the network - Executor executor_eve(DeviceType::EVE, ids, configuration, 1); - Executor executor_dsp(DeviceType::DSP, ids, configuration, 2); - - // Query Executor for set of ExecutionObjects created - const ExecutionObjects& execution_objects_eve = - executor_eve.GetExecutionObjects(); - const ExecutionObjects& execution_objects_dsp = - executor_dsp.GetExecutionObjects(); - int num_eos = execution_objects_eve.size(); - - // Allocate input and output buffers for each execution object - // Note that "out" is both the output of eo_eve and the input of eo_dsp - // This is how two layersGroupIds, 1 and 2, are tied together + Executor exe_eve(DeviceType::EVE, ids_eve, configuration, 1); + Executor exe_dsp(DeviceType::DSP, ids_dsp, configuration, 2); + + // Construct ExecutionObjectPipeline that utilizes multiple + // ExecutionObjects to process a single frame, each ExecutionObject + // processes one layerGroup of the network + int num_eops = std::max(num_eves, num_dsps); + std::vector eops; + for (int i = 0; i < num_eops; i++) + eops.push_back(new ExecutionObjectPipeline({exe_eve[i%num_eves], + exe_dsp[i%num_dsps]})); + + // Allocate input/output memory for each EOP std::vector buffers; - for (int i = 0; i < num_eos; i++) + for (auto eop : eops) { - ExecutionObject *eo_eve = execution_objects_eve[i].get(); - size_t in_size = eo_eve->GetInputBufferSizeInBytes(); - size_t out_size = eo_eve->GetOutputBufferSizeInBytes(); - ArgInfo in = { ArgInfo(malloc(in_size), in_size) }; - ArgInfo out = { ArgInfo(malloc(out_size), out_size) }; - eo_eve->SetInputOutputBuffer(in, out); - - ExecutionObject *eo_dsp = execution_objects_dsp[i].get(); - size_t out2_size = eo_dsp->GetOutputBufferSizeInBytes(); - ArgInfo out2 = { ArgInfo(malloc(out2_size), out2_size) }; - eo_dsp->SetInputOutputBuffer(out, out2); - - buffers.push_back(in.ptr()); - buffers.push_back(out.ptr()); - buffers.push_back(out2.ptr()); + size_t in_size = eop->GetInputBufferSizeInBytes(); + size_t out_size = eop->GetOutputBufferSizeInBytes(); + void* in_ptr = 
malloc(in_size); + void* out_ptr = malloc(out_size); + assert(in_ptr != nullptr && out_ptr != nullptr); + buffers.push_back(in_ptr); + buffers.push_back(out_ptr); + + ArgInfo in(in_ptr, in_size); + ArgInfo out(out_ptr, out_size); + eop->SetInputOutputBuffer(in, out); } - #define MAX_NUM_EOS 4 - struct timespec t0[MAX_NUM_EOS], t1, tloop0, tloop1; + struct timespec tloop0, tloop1; clock_gettime(CLOCK_MONOTONIC, &tloop0); - // Process frames with available execution objects in a pipelined manner - // additional num_eos iterations to flush the pipeline (epilogue) - ExecutionObject *eo_eve, *eo_dsp, *eo_input; - for (int frame_idx = 0; - frame_idx < num_frames + num_eos; frame_idx++) + // Process frames with ExecutionObjectPipelines in a pipelined manner + // additional num_eops iterations to flush pipeline (epilogue) + for (int frame_idx = 0; frame_idx < num_frames + num_eops; frame_idx++) { - eo_eve = execution_objects_eve[frame_idx % num_eos].get(); - eo_dsp = execution_objects_dsp[frame_idx % num_eos].get(); + ExecutionObjectPipeline* eop = eops[frame_idx % num_eops]; - // Wait for previous frame on the same eo to finish processing - if (eo_dsp->ProcessFrameWait()) + // Wait for previous frame on the same eop to finish processing + if (eop->ProcessFrameWait()) { - int finished_idx = eo_dsp->GetFrameIndex(); - clock_gettime(CLOCK_MONOTONIC, &t1); - ReportTime(finished_idx, "DSP", - ms_diff(t0[finished_idx % num_eos], t1), - eo_dsp->GetProcessTimeInMilliSeconds()); - - eo_input = execution_objects_eve[finished_idx % num_eos].get(); - WriteFrameOutput(*eo_input, *eo_dsp, configuration); + ReportTime(eop->GetFrameIndex(), eop->GetDeviceName(), + eop->GetHostProcessTimeInMilliSeconds(), + eop->GetProcessTimeInMilliSeconds()); + WriteFrameOutput(*eop, configuration); } // Read a frame and start processing it with current eo - if (ReadFrame(*eo_eve, frame_idx, configuration, num_frames, + if (ReadFrame(*eop, frame_idx, configuration, num_frames, image_file, cap)) { - clock_gettime(CLOCK_MONOTONIC, &t0[frame_idx % num_eos]); - eo_eve->ProcessFrameStartAsync(); - - if (eo_eve->ProcessFrameWait()) - { - clock_gettime(CLOCK_MONOTONIC, &t1); - ReportTime(frame_idx, "EVE", - ms_diff(t0[frame_idx % num_eos], t1), - eo_eve->GetProcessTimeInMilliSeconds()); - - clock_gettime(CLOCK_MONOTONIC, &t0[frame_idx % num_eos]); - eo_dsp->ProcessFrameStartAsync(); - } + eop->ProcessFrameStartAsync(); } } @@ -276,6 +251,8 @@ bool RunConfiguration(const std::string& config_file, uint32_t num_devices, << std::setw(6) << std::setprecision(4) << ms_diff(tloop0, tloop1) << "ms" << std::endl; + for (auto eop : eops) + delete eop; for (auto b : buffers) free(b); } @@ -305,15 +282,15 @@ void ReportTime(int frame_index, std::string device_name, double elapsed_host, } -bool ReadFrame(ExecutionObject &eo, int frame_idx, +bool ReadFrame(ExecutionObjectPipeline& eop, int frame_idx, const Configuration& configuration, int num_frames, std::string& image_file, VideoCapture &cap) { if (frame_idx >= num_frames) return false; - eo.SetFrameIndex(frame_idx); + eop.SetFrameIndex(frame_idx); - char* frame_buffer = eo.GetInputBufferPtr(); + char* frame_buffer = eop.GetInputBufferPtr(); assert (frame_buffer != nullptr); int channel_size = configuration.inWidth * configuration.inHeight; @@ -323,7 +300,7 @@ bool ReadFrame(ExecutionObject &eo, int frame_idx, if (is_preprocessed_input) { std::ifstream ifs(image_file, std::ios::binary); - ifs.seekg(frame_idx * channel_size * 3); + //ifs.seekg(frame_idx * channel_size * 3); ifs.read(frame_buffer, 
channel_size * 3); bool ifs_status = ifs.good(); ifs.close(); @@ -368,8 +345,7 @@ bool ReadFrame(ExecutionObject &eo, int frame_idx, } // Create frame with boxes drawn around classified objects -bool WriteFrameOutput(const ExecutionObject &eo_in, - const ExecutionObject &eo_out, +bool WriteFrameOutput(const ExecutionObjectPipeline& eop, const Configuration& configuration) { // Asseembly original frame @@ -378,13 +354,13 @@ bool WriteFrameOutput(const ExecutionObject &eo_in, int channel_size = width * height; Mat frame, r_frame, bgr[3]; - unsigned char *in = (unsigned char *) eo_in.GetInputBufferPtr(); + unsigned char *in = (unsigned char *) eop.GetInputBufferPtr(); bgr[0] = Mat(height, width, CV_8UC(1), in); bgr[1] = Mat(height, width, CV_8UC(1), in + channel_size); bgr[2] = Mat(height, width, CV_8UC(1), in + channel_size*2); cv::merge(bgr, 3, frame); - int frame_index = eo_in.GetFrameIndex(); + int frame_index = eop.GetFrameIndex(); char outfile_name[64]; if (! is_camera_input && is_preprocessed_input) { @@ -394,8 +370,8 @@ bool WriteFrameOutput(const ExecutionObject &eo_in, } // Draw boxes around classified objects - float *out = (float *) eo_out.GetOutputBufferPtr(); - int num_floats = eo_out.GetOutputBufferSizeInBytes() / sizeof(float); + float *out = (float *) eop.GetOutputBufferPtr(); + int num_floats = eop.GetOutputBufferSizeInBytes() / sizeof(float); for (int i = 0; i < num_floats / 7; i++) { int index = (int) out[i * 7 + 0]; @@ -443,13 +419,14 @@ bool WriteFrameOutput(const ExecutionObject &eo_in, void ProcessArgs(int argc, char *argv[], std::string& config, - uint32_t& num_devices, DeviceType& device_type, - std::string& input_file) + uint32_t& num_dsps, uint32_t& num_eves, + DeviceType& device_type, std::string& input_file) { const struct option long_options[] = { {"config", required_argument, 0, 'c'}, - {"num_devices", required_argument, 0, 'n'}, + {"num_dsps", required_argument, 0, 'd'}, + {"num_eves", required_argument, 0, 'e'}, {"image_file", required_argument, 0, 'i'}, {"help", no_argument, 0, 'h'}, {"verbose", no_argument, 0, 'v'}, @@ -460,7 +437,8 @@ void ProcessArgs(int argc, char *argv[], std::string& config, while (true) { - int c = getopt_long(argc, argv, "c:n:i:hv", long_options, &option_index); + int c = getopt_long(argc, argv, "c:d:e:i:hv", long_options, + &option_index); if (c == -1) break; @@ -470,8 +448,14 @@ void ProcessArgs(int argc, char *argv[], std::string& config, case 'c': config = optarg; break; - case 'n': num_devices = atoi(optarg); - assert (num_devices > 0 && num_devices <= 4); + case 'd': num_dsps = atoi(optarg); + assert (num_dsps > 0 && num_dsps <= + Executor::GetNumDevices(DeviceType::DSP)); + break; + + case 'e': num_eves = atoi(optarg); + assert (num_eves > 0 && num_eves <= + Executor::GetNumDevices(DeviceType::EVE)); break; case 'i': input_file = optarg; @@ -507,7 +491,8 @@ void DisplayHelp() "Default is jdetnet.\n" "Optional arguments:\n" " -c Valid configs: jdetnet \n" - " -n Number of cores to use (1 - 4)\n" + " -d Number of dsp cores to use\n" + " -e Number of eve cores to use\n" " -i Path to the image file\n" " Default is 1 frame in testvecs\n" " -i camera Use camera as input\n" diff --git a/tidl_api/Makefile b/tidl_api/Makefile index 05a3704..3fc6a2c 100644 --- a/tidl_api/Makefile +++ b/tidl_api/Makefile @@ -39,7 +39,8 @@ AR = ar SRCS = ocl_device.cpp configuration_parser.cpp configuration.cpp\ - executor.cpp execution_object.cpp trace.cpp util.cpp + executor.cpp execution_object.cpp trace.cpp util.cpp \ + execution_object_pipeline.cpp 
SRCS_IMGUTIL = imgutil.cpp OBJS = $(SRCS:.cpp=.o) @@ -53,8 +54,7 @@ HOST_OBJ_IMGUTIL_FILES = $(addprefix obj/,$(OBJS_IMGUTIL)) HEADERS = src/common_defines.h src/executor_impl.h src/ocl_device.h HEADERS += src/parameters.h src/tidl_create_params.h src/trace.h src/util.h HEADERS += inc/configuration.h inc/execution_object.h inc/executor.h -HEADERS += inc/imgutil.h src/device_arginfo.h - +HEADERS += inc/imgutil.h src/device_arginfo.h inc/execution_object_pipeline.h ifeq ($(BUILD), debug) CXXFLAGS += -Og -g -ggdb diff --git a/tidl_api/inc/execution_object.h b/tidl_api/inc/execution_object.h index e78ad2e..c1d86fc 100644 --- a/tidl_api/inc/execution_object.h +++ b/tidl_api/inc/execution_object.h @@ -31,6 +31,7 @@ #pragma once #include +#include "execution_object_internal.h" namespace tidl { @@ -39,13 +40,12 @@ class Device; class LayerOutput; class IODeviceArgInfo; -typedef std::vector> LayerOutputs; /*! @class ExecutionObject @brief Runs the TIDL network on an OpenCL device */ -class ExecutionObject +class ExecutionObject : public ExecutionObjectInternalInterface { public: @@ -55,6 +55,8 @@ class ExecutionObject const ArgInfo& create_arg, const ArgInfo& param_heap_arg, size_t extmem_heap_size, + int layersGroupId, + bool output_trace, bool internal_input); //! @private ~ExecutionObject(); @@ -62,52 +64,56 @@ class ExecutionObject //! Specify the input and output buffers used by the EO //! @param in buffer used for input. //! @param out buffer used for output. - void SetInputOutputBuffer (const ArgInfo& in, const ArgInfo& out); + void SetInputOutputBuffer(const ArgInfo& in, + const ArgInfo& out) override; //! Returns a pointer to the input buffer set via SetInputOutputBuffer - char* GetInputBufferPtr() const; + char* GetInputBufferPtr() const override; //! Returns size of the input buffer - size_t GetInputBufferSizeInBytes() const; + size_t GetInputBufferSizeInBytes() const override; + + //! Returns a pointer to the output buffer + char* GetOutputBufferPtr() const override; + + //! Returns size of the output buffer + size_t GetOutputBufferSizeInBytes() const override; //! @brief Set the frame index of the frame currently processed by the //! ExecutionObject. Used for trace/debug messages //! @param idx index of the frame - void SetFrameIndex(int idx); + void SetFrameIndex(int idx) override; //! Returns the index of a frame being processed (set by SetFrameIndex) - int GetFrameIndex() const; - - //! Returns a pointer to the output buffer - char* GetOutputBufferPtr() const; - - //! Returns the number of bytes written to the output buffer - size_t GetOutputBufferSizeInBytes() const; + int GetFrameIndex() const override; - //! @brief Start processing a frame. The call is asynchronous and returns - //! immediately. Use ExecutionObject::ProcessFrameWait to wait - bool ProcessFrameStartAsync(); + //! @brief Start processing a frame. The call is asynchronous and + //! returns immediately. Use ExecutionObject::ProcessFrameWait to wait + bool ProcessFrameStartAsync() override; //! Wait for the execution object to complete processing a frame //! @return false if ExecutionObject::ProcessFrameWait was called //! without a corresponding call to //! ExecutionObject::ProcessFrameStartAsync. - bool ProcessFrameWait(); - - //! @brief return the number of cycles taken *on the device* to - //! execute the process call - //! @return Number of cycles to process a frame on the device. - uint64_t GetProcessCycles() const; + bool ProcessFrameWait() override; //! 
@brief return the number of milliseconds taken *on the device* to //! execute the process call //! @return Number of milliseconds to process a frame on the device. - float GetProcessTimeInMilliSeconds() const; + float GetProcessTimeInMilliSeconds() const override; + + //! @brief return the number of milliseconds taken *on the host* to + //! execute the process call + //! @return Number of milliseconds to process a frame on the host. + float GetHostProcessTimeInMilliSeconds() const override; + + //! Returns the device name that the ExecutionObject runs on + const std::string& GetDeviceName() const override; //! Write the output buffer for each layer to a file - //! __HxW.bin + //! \__HxW.bin void WriteLayerOutputsToFile(const std::string& filename_prefix= - "trace_dump_") const; + "trace_dump_") const override; //! Returns a LayerOutput object corresponding to a layer. //! Caller is responsible for deleting the LayerOutput object. @@ -116,10 +122,13 @@ class ExecutionObject //! @param output_index The output index of the buffer for a given //! layer. Defaults to 0. const LayerOutput* GetOutputFromLayer(uint32_t layer_index, - uint32_t output_index=0) const; + uint32_t output_index=0) const override; //! Get output buffers from all layers - const LayerOutputs* GetOutputsFromAllLayers() const; + const LayerOutputs* GetOutputsFromAllLayers() const override; + + //! Returns the layersGrupId that the ExecutionObject is processing + int GetLayersGroupId() const; //! @private // Used by the Executor @@ -127,12 +136,16 @@ class ExecutionObject bool RunAsync(CallType ct); bool Wait (CallType ct); + //! @private + // Used by the ExecutionObjectPipeline + bool AddCallback(CallType ct, void *user_data); + void AcquireLock(); + void ReleaseLock(); + ExecutionObject() = delete; ExecutionObject(const ExecutionObject&) = delete; ExecutionObject& operator=(const ExecutionObject&) = delete; - void EnableOutputBufferTrace(); - //! @private void SetInputOutputBuffer(const IODeviceArgInfo* in, const IODeviceArgInfo* out); diff --git a/tidl_api/inc/execution_object_internal.h b/tidl_api/inc/execution_object_internal.h new file mode 100644 index 0000000..816da94 --- /dev/null +++ b/tidl_api/inc/execution_object_internal.h @@ -0,0 +1,119 @@ +/****************************************************************************** + * Copyright (c) 2018 Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ + +/*! @file execution_object_internal.h */ + +#pragma once + +namespace tidl { + +class LayerOutput; + +typedef std::vector> LayerOutputs; + +/*! @cond HIDDEN_SYMBOLS + @class ExecutionObjectInternalInterface + @brief Internal interface for running the TIDL network on OpenCL devices + Do not use this internal class directly. + Please use ExecutionObject or ExecutionObejctPipeline instead. +*/ +class ExecutionObjectInternalInterface +{ + public: + virtual ~ExecutionObjectInternalInterface() {}; + + //! Specify the input and output buffers used by the EO + //! @param in buffer used for input. + //! @param out buffer used for output. + virtual void SetInputOutputBuffer(const ArgInfo& in, + const ArgInfo& out) =0; + + //! Returns a pointer to the input buffer set via SetInputOutputBuffer + virtual char* GetInputBufferPtr() const =0; + + //! Returns size of the input buffer + virtual size_t GetInputBufferSizeInBytes() const =0; + + //! Returns a pointer to the output buffer + virtual char* GetOutputBufferPtr() const =0; + + //! Returns size of the output buffer + virtual size_t GetOutputBufferSizeInBytes() const =0; + + //! @brief Set the frame index of the frame currently processed by the + //! ExecutionObject. Used for trace/debug messages + //! @param idx index of the frame + virtual void SetFrameIndex(int idx) =0; + + //! Returns the index of a frame being processed (set by SetFrameIndex) + virtual int GetFrameIndex() const =0; + + //! @brief Start processing a frame. The call is asynchronous and returns + //! immediately. Use ExecutionObject::ProcessFrameWait to wait + virtual bool ProcessFrameStartAsync() =0; + + //! Wait for the execution object to complete processing a frame + //! @return false if ExecutionObject::ProcessFrameWait was called + //! without a corresponding call to + //! ExecutionObject::ProcessFrameStartAsync. + virtual bool ProcessFrameWait() =0; + + //! @brief return the number of milliseconds taken *on the device* to + //! execute the process call + //! @return Number of milliseconds to process a frame on the device. + virtual float GetProcessTimeInMilliSeconds() const =0; + + //! @brief return the number of milliseconds taken *on the host* to + //! execute the process call + //! @return Number of milliseconds to process a frame on the host. + virtual float GetHostProcessTimeInMilliSeconds() const =0; + + //! Returns the device name that the ExecutionObject runs on + virtual const std::string& GetDeviceName() const =0; + + //! Write the output buffer for each layer to a file + //! \__HxW.bin + virtual void WriteLayerOutputsToFile(const std::string& filename_prefix= + "trace_dump_") const =0; + + //! Returns a LayerOutput object corresponding to a layer. + //! Caller is responsible for deleting the LayerOutput object. + //! @see LayerOutput + //! @param layer_index The layer index of the layer + //! 
@param output_index The output index of the buffer for a given + //! layer. Defaults to 0. + virtual const LayerOutput* GetOutputFromLayer(uint32_t layer_index, + uint32_t output_index=0) const =0; + + //! Get output buffers from all layers + virtual const LayerOutputs* GetOutputsFromAllLayers() const =0; +}; +/*! @endcond +*/ + +} // namespace tidl diff --git a/tidl_api/inc/execution_object_pipeline.h b/tidl_api/inc/execution_object_pipeline.h new file mode 100644 index 0000000..aaa6cf0 --- /dev/null +++ b/tidl_api/inc/execution_object_pipeline.h @@ -0,0 +1,151 @@ +/****************************************************************************** + * Copyright (c) 2018 Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ + +//! @file execution_object_pipeline.h + +#pragma once +#include +#include +#include +#include + +#include "executor.h" +#include "execution_object_internal.h" +#include "execution_object.h" + +namespace tidl { + +/*! @class ExecutionObjectPipeline + @brief Manages the pipelined execution using multiple ExecutionObjects. + Each executor runs one layersGroup of the network. ExecutionObjects + must run consecutive layersGroups to form a pipelined execution. +*/ +class ExecutionObjectPipeline : public ExecutionObjectInternalInterface +{ + public: + //! @brief Create an ExecutionObjectPipeline object. + //! + //! The ExecutionObjectPipeline will take the provided ExecutionObjects + //! to create an execution pipeline. E.g. + //! @code + //! Configuration config("path to configuration file"); + //! DeviceIds ids = {DeviceId::ID0, DeviceId::ID1}; + //! Executor exe_eve(DeviceType::EVE, ids, config, 1); + //! Executor exe_dsp(DeviceType::DSP, ids, config, 2); + //! ExecutionObjectPipeline ep0({exe_eve[0], exe_dsp[0]}); + //! ExecutionObjectPipeline ep1({exe_eve[1], exe_dsp[1]}); + //! @endcode + //! + //! 
@param eos DSP or EVE ExecutionObjects forming a pipeline + ExecutionObjectPipeline(std::vector eos); + + //! @brief Tear down an ExecutionObjectPipeline and free used resources + ~ExecutionObjectPipeline(); + + //! Specify the input and output buffers used by the EOP + //! @param in buffer used for input. + //! @param out buffer used for output. + void SetInputOutputBuffer (const ArgInfo& in, + const ArgInfo& out) override; + + //! Returns a pointer to the input buffer + char* GetInputBufferPtr() const override; + + //! Returns size of the input buffer + size_t GetInputBufferSizeInBytes() const override; + + //! Returns a pointer to the output buffer + char* GetOutputBufferPtr() const override; + + //! Returns the number of bytes written to the output buffer + size_t GetOutputBufferSizeInBytes() const override; + + //! @brief Set the frame index of the frame currently processed by the + //! ExecutionObjectPipeline. Used for trace/debug messages + //! @param idx index of the frame + void SetFrameIndex(int idx) override; + + //! Returns the index of a frame being processed (set by SetFrameIndex) + int GetFrameIndex() const override; + + //! @brief Start processing a frame. The call is asynchronous and + //! returns immediately. Use ProcessFrameWait() to wait + bool ProcessFrameStartAsync() override; + + //! Wait for the executor pipeline to complete processing a frame + //! @return false if ProcessFrameWait() was called + //! without a corresponding call to + //! ExecutionObjectPipeline::ProcessFrameStartAsync(). + bool ProcessFrameWait() override; + + //! @brief return the number of milliseconds taken *on the device* to + //! execute the process call + //! @return Number of milliseconds to process a frame on the device. + float GetProcessTimeInMilliSeconds() const override; + + //! @brief return the number of milliseconds taken *on the host* to + //! execute the process call + //! @return Number of milliseconds to process a frame on the host. + float GetHostProcessTimeInMilliSeconds() const override; + + //! Return the combined device names that this pipeline runs on + const std::string& GetDeviceName() const override; + + //! Write the output buffer for each layer to a file + //! \__HxW.bin + void WriteLayerOutputsToFile(const std::string& filename_prefix= + "trace_dump_") const override; + + //! Returns a LayerOutput object corresponding to a layer. + //! Caller is responsible for deleting the LayerOutput object. + //! @see LayerOutput + //! @param layer_index The layer index of the layer + //! @param output_index The output index of the buffer for a given + //! layer. Defaults to 0. + const LayerOutput* GetOutputFromLayer(uint32_t layer_index, + uint32_t output_index=0) const override; + + //! Get output buffers from all layers + const LayerOutputs* GetOutputsFromAllLayers() const override; + + //! @private Used by runtime + //! @brief callback function at the completion of each ExecutionObject, + //! to chain the next ExectionObject for execution + void RunAsyncNext(); + + ExecutionObjectPipeline() = delete; + ExecutionObjectPipeline(const ExecutionObjectPipeline&) = delete; + ExecutionObjectPipeline& operator=(const ExecutionObjectPipeline&) + = delete; + + private: + class Impl; + std::unique_ptr pimpl_m; +}; + +} // namespace tidl diff --git a/tidl_api/inc/executor.h b/tidl_api/inc/executor.h index 23d92ff..1febfea 100644 --- a/tidl_api/inc/executor.h +++ b/tidl_api/inc/executor.h @@ -64,7 +64,7 @@ class ExecutionObject; typedef std::vector> ExecutionObjects; /*! 
@class Executor - @brief Manages the overall execution of a network using the + @brief Manages the overall execution of a layersGroup in a network using the specified configuration and the set of devices available to the executor. */ @@ -78,7 +78,7 @@ class Executor //! @code //! Configuration configuration; //! configuration.ReadFromFile("path to configuration file"); - //! DeviceIds ids1 = {DeviceId::ID2, DeviceId::ID3}; + //! DeviceIds ids = {DeviceId::ID2, DeviceId::ID3}; //! Executor executor(DeviceType::EVE, ids, configuration); //! @endcode //! @@ -98,6 +98,9 @@ class Executor //! available on this instance of the Executor const ExecutionObjects& GetExecutionObjects() const; + //! Returns a single execution object at index + ExecutionObject* operator[](uint32_t index) const; + //! @brief Returns the number of devices of the specified type //! available for TI DL. //! @param device_type DSP or EVE/EVE device @@ -106,7 +109,7 @@ class Executor //! @brief Returns a string corresponding to the API version //! - //! @return ... + //! @return \.\.\.\ static std::string GetAPIVersion(); Executor(const Executor&) = delete; diff --git a/tidl_api/src/execution_object.cpp b/tidl_api/src/execution_object.cpp index d722ebb..178bbca 100644 --- a/tidl_api/src/execution_object.cpp +++ b/tidl_api/src/execution_object.cpp @@ -31,6 +31,9 @@ #include #include #include +#include +#include +#include #include "executor.h" #include "execution_object.h" #include "trace.h" @@ -50,13 +53,24 @@ class ExecutionObject::Impl const DeviceArgInfo& create_arg, const DeviceArgInfo& param_heap_arg, size_t extmem_heap_size, + int layers_group_id, + bool output_trace, bool internal_input); ~Impl() {} bool RunAsync(CallType ct); bool Wait (CallType ct); + bool AddCallback(CallType ct, void *user_data); + + uint64_t GetProcessCycles() const; + int GetLayersGroupId() const; + void AcquireLock(); + void ReleaseLock(); Device* device_m; + // Index of the OpenCL device/queue used by this EO + uint8_t device_index_m; + std::string device_name_m; up_malloc_ddr tidl_extmem_heap_m; up_malloc_ddr shared_initialize_params_m; @@ -70,6 +84,9 @@ class ExecutionObject::Impl // Frame being processed by the EO int current_frame_idx_m; + // LayersGroupId being processed by the EO + int layers_group_id_m; + // Trace related void WriteLayerOutputsToFile (const std::string& filename_prefix) const; @@ -81,25 +98,29 @@ class ExecutionObject::Impl up_malloc_ddr trace_buf_params_m; size_t trace_buf_params_sz_m; + // host time tracking: eo start to finish + float host_time_m; + private: void SetupInitializeKernel(const DeviceArgInfo& create_arg, const DeviceArgInfo& param_heap_arg, size_t extmem_heap_size, bool internal_input); + void EnableOutputBufferTrace(); void SetupProcessKernel(); void HostWriteNetInput(); void HostReadNetOutput(); void ComputeInputOutputSizes(); - // Index of the OpenCL device/queue used by this EO - uint8_t device_index_m; - std::unique_ptr k_initialize_m; std::unique_ptr k_process_m; std::unique_ptr k_cleanup_m; - + // Guarding sole access to input/output for one frame during execution + bool is_idle_m; + std::mutex mutex_access_m; + std::condition_variable cv_access_m; }; @@ -108,6 +129,8 @@ ExecutionObject::ExecutionObject(Device* d, const ArgInfo& create_arg, const ArgInfo& param_heap_arg, size_t extmem_heap_size, + int layers_group_id, + bool output_trace, bool internal_input) { DeviceArgInfo create_arg_d(create_arg, DeviceArgInfo::Kind::BUFFER); @@ -118,6 +141,8 @@ ExecutionObject::ExecutionObject(Device* d, 
create_arg_d, param_heap_arg_d, extmem_heap_size, + layers_group_id, + output_trace, internal_input) }; } @@ -127,8 +152,11 @@ ExecutionObject::Impl::Impl(Device* d, const DeviceArgInfo& create_arg, const DeviceArgInfo& param_heap_arg, size_t extmem_heap_size, + int layers_group_id, + bool output_trace, bool internal_input): device_m(d), + device_index_m(device_index), tidl_extmem_heap_m (nullptr, &__free_ddr), shared_initialize_params_m(nullptr, &__free_ddr), shared_process_params_m(nullptr, &__free_ddr), @@ -137,23 +165,26 @@ ExecutionObject::Impl::Impl(Device* d, in_m(), out_m(), current_frame_idx_m(0), + layers_group_id_m(layers_group_id), num_network_layers_m(0), trace_buf_params_m(nullptr, &__free_ddr), trace_buf_params_sz_m(0), - device_index_m(device_index), k_initialize_m(nullptr), k_process_m(nullptr), - k_cleanup_m(nullptr) + k_cleanup_m(nullptr), + is_idle_m(true) { - SetupInitializeKernel(create_arg, param_heap_arg, extmem_heap_size, - internal_input); - - SetupProcessKernel(); - + device_name_m = device_m->GetDeviceName() + std::to_string(device_index_m); // Save number of layers in the network const TIDL_CreateParams* cp = static_cast(create_arg.ptr()); num_network_layers_m = cp->net.numLayers; + + SetupInitializeKernel(create_arg, param_heap_arg, extmem_heap_size, + internal_input); + + if (output_trace) EnableOutputBufferTrace(); + SetupProcessKernel(); } // Pointer to implementation idiom: https://herbsutter.com/gotw/_100/: @@ -168,9 +199,7 @@ char* ExecutionObject::GetInputBufferPtr() const size_t ExecutionObject::GetInputBufferSizeInBytes() const { - const DeviceArgInfo& arg = pimpl_m->in_m.GetArg(); - if (arg.ptr() == nullptr) return pimpl_m->in_size_m; - else return arg.size(); + return pimpl_m->in_size_m; } char* ExecutionObject::GetOutputBufferPtr() const @@ -180,11 +209,7 @@ char* ExecutionObject::GetOutputBufferPtr() const size_t ExecutionObject::GetOutputBufferSizeInBytes() const { - const DeviceArgInfo& arg = pimpl_m->out_m.GetArg(); - if (arg.ptr() == nullptr) - return pimpl_m->out_size_m; - else - return pimpl_m->shared_process_params_m.get()->bytesWritten; + return pimpl_m->out_size_m; } void ExecutionObject::SetFrameIndex(int idx) @@ -199,8 +224,8 @@ int ExecutionObject::GetFrameIndex() const void ExecutionObject::SetInputOutputBuffer(const ArgInfo& in, const ArgInfo& out) { - assert(in.ptr() != nullptr && in.size() > 0); - assert(out.ptr() != nullptr && out.size() > 0); + assert(in.ptr() != nullptr && in.size() >= pimpl_m->in_size_m); + assert(out.ptr() != nullptr && out.size() >= pimpl_m->out_size_m); pimpl_m->in_m = IODeviceArgInfo(in); pimpl_m->out_m = IODeviceArgInfo(out); @@ -215,6 +240,7 @@ void ExecutionObject::SetInputOutputBuffer(const IODeviceArgInfo* in, bool ExecutionObject::ProcessFrameStartAsync() { + assert(GetInputBufferPtr() != nullptr && GetOutputBufferPtr() != nullptr); return pimpl_m->RunAsync(ExecutionObject::CallType::PROCESS); } @@ -233,21 +259,26 @@ bool ExecutionObject::Wait (CallType ct) return pimpl_m->Wait(ct); } -uint64_t ExecutionObject::GetProcessCycles() const +bool ExecutionObject::AddCallback(CallType ct, void *user_data) { - uint8_t factor = 1; - - // ARP32 running at half frequency of VCOP, multiply by 2 for VCOP cycles - if (pimpl_m->device_m->type() == CL_DEVICE_TYPE_CUSTOM) - factor = 2; - - return pimpl_m->shared_process_params_m.get()->cycles * factor; + return pimpl_m->AddCallback(ct, user_data); } float ExecutionObject::GetProcessTimeInMilliSeconds() const { float frequency = pimpl_m->device_m->GetFrequencyInMhz() 
* 1000000; - return ((float)GetProcessCycles())/frequency * 1000; + return ((float)pimpl_m->GetProcessCycles()) / frequency * 1000; +} + +float ExecutionObject::GetHostProcessTimeInMilliSeconds() const +{ + return pimpl_m->host_time_m; +} + +void +ExecutionObject::WriteLayerOutputsToFile(const std::string& filename_prefix) const +{ + pimpl_m->WriteLayerOutputsToFile(filename_prefix); } const LayerOutput* ExecutionObject::GetOutputFromLayer( @@ -261,37 +292,25 @@ const LayerOutputs* ExecutionObject::GetOutputsFromAllLayers() const return pimpl_m->GetOutputsFromAllLayers(); } -// -// Allocate an OpenCL buffer for TIDL layer output buffer metadata. -// The device will populate metadata for every buffer that is used as an -// output buffer by a layer. -// -void ExecutionObject::EnableOutputBufferTrace() +int ExecutionObject::GetLayersGroupId() const { - pimpl_m->trace_buf_params_sz_m = (sizeof(OCL_TIDL_BufParams)* - pimpl_m->num_network_layers_m* - TIDL_NUM_OUT_BUFS); - - pimpl_m->trace_buf_params_m.reset(malloc_ddr - (pimpl_m->trace_buf_params_sz_m)); + return pimpl_m->layers_group_id_m; +} - // Device will update bufferId if there is valid data for the entry - OCL_TIDL_BufParams* bufferParams = pimpl_m->trace_buf_params_m.get(); - for (uint32_t i = 0; i < pimpl_m->num_network_layers_m; i++) - for (int j = 0; j < TIDL_NUM_OUT_BUFS; j++) - { - OCL_TIDL_BufParams *bufP = - &bufferParams[i*TIDL_NUM_OUT_BUFS+j]; - bufP->bufferId = UINT_MAX; - } +const std::string& ExecutionObject::GetDeviceName() const +{ + return pimpl_m->device_name_m; } -void -ExecutionObject::WriteLayerOutputsToFile(const std::string& filename_prefix) const +void ExecutionObject::AcquireLock() { - pimpl_m->WriteLayerOutputsToFile(filename_prefix); + pimpl_m->AcquireLock(); } +void ExecutionObject::ReleaseLock() +{ + pimpl_m->ReleaseLock(); +} // // Create a kernel to call the "initialize" function @@ -342,6 +361,32 @@ ExecutionObject::Impl::SetupInitializeKernel(const DeviceArgInfo& create_arg, device_index_m)); } +// +// Allocate an OpenCL buffer for TIDL layer output buffer metadata. +// The device will populate metadata for every buffer that is used as an +// output buffer by a layer. This needs to be done before setting up +// process kernel. 
+// +void ExecutionObject::Impl::EnableOutputBufferTrace() +{ + trace_buf_params_sz_m = (sizeof(OCL_TIDL_BufParams)* + num_network_layers_m* + TIDL_NUM_OUT_BUFS); + + trace_buf_params_m.reset(malloc_ddr + (trace_buf_params_sz_m)); + + // Device will update bufferId if there is valid data for the entry + OCL_TIDL_BufParams* bufferParams = trace_buf_params_m.get(); + for (uint32_t i = 0; i < num_network_layers_m; i++) + for (int j = 0; j < TIDL_NUM_OUT_BUFS; j++) + { + OCL_TIDL_BufParams *bufP = + &bufferParams[i*TIDL_NUM_OUT_BUFS+j]; + bufP->bufferId = UINT_MAX; + } +} + // // Create a kernel to call the "process" function // @@ -514,10 +559,17 @@ bool ExecutionObject::Impl::RunAsync(CallType ct) } case CallType::PROCESS: { + std::chrono::time_point t1, t2; + t1 = std::chrono::steady_clock::now(); + shared_process_params_m->frameIdx = current_frame_idx_m; shared_process_params_m->bytesWritten = 0; HostWriteNetInput(); k_process_m->RunAsync(); + + t2 = std::chrono::steady_clock::now(); + std::chrono::duration elapsed = t2 - t1; + host_time_m = elapsed.count() * 1000; break; } case CallType::CLEANUP: @@ -551,13 +603,20 @@ bool ExecutionObject::Impl::Wait(CallType ct) } case CallType::PROCESS: { - bool has_work = k_process_m->Wait(); + float host_elapsed_ms = 0.0f; + bool has_work = k_process_m->Wait(&host_elapsed_ms); if (has_work) { if (shared_process_params_m->errorCode != OCL_TIDL_SUCCESS) throw Exception(shared_process_params_m->errorCode, __FILE__, __FUNCTION__, __LINE__); + + std::chrono::time_point t1, t2; + t1 = std::chrono::steady_clock::now(); HostReadNetOutput(); + t2 = std::chrono::steady_clock::now(); + std::chrono::duration elapsed = t2 - t1; + host_time_m += elapsed.count() * 1000 + host_elapsed_ms; } return has_work; @@ -574,6 +633,33 @@ bool ExecutionObject::Impl::Wait(CallType ct) return false; } +bool ExecutionObject::Impl::AddCallback(CallType ct, void *user_data) +{ + switch (ct) + { + case CallType::PROCESS: + { + return k_process_m->AddCallback(user_data); + break; + } + default: + return false; + } + + return false; +} + +uint64_t ExecutionObject::Impl::GetProcessCycles() const +{ + uint8_t factor = 1; + + // ARP32 running at half frequency of VCOP, multiply by 2 for VCOP cycles + if (device_m->type() == CL_DEVICE_TYPE_CUSTOM) + factor = 2; + + return shared_process_params_m.get()->cycles * factor; +} + // // Write the trace data to output files // @@ -697,3 +783,16 @@ LayerOutput::~LayerOutput() { delete[] data_m; } + +void ExecutionObject::Impl::AcquireLock() +{ + std::unique_lock lock(mutex_access_m); + cv_access_m.wait(lock, [this]{ return this->is_idle_m; }); + is_idle_m = false; +} + +void ExecutionObject::Impl::ReleaseLock() +{ + is_idle_m = true; + cv_access_m.notify_all(); +} diff --git a/tidl_api/src/execution_object_pipeline.cpp b/tidl_api/src/execution_object_pipeline.cpp new file mode 100644 index 0000000..ff84255 --- /dev/null +++ b/tidl_api/src/execution_object_pipeline.cpp @@ -0,0 +1,360 @@ +/****************************************************************************** + * Copyright (c) 2018 Texas Instruments Incorporated - http://www.ti.com/ + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Texas Instruments Incorporated nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ + +#include +#include +#include +#include +#include "device_arginfo.h" +#include "execution_object_pipeline.h" + +using namespace tidl; + +class ExecutionObjectPipeline::Impl +{ + public: + Impl(std::vector &eos); + ~Impl(); + + void SetInputOutputBuffer(const ArgInfo &in, const ArgInfo &out); + bool RunAsyncStart(); + bool RunAsyncNext(); + bool Wait(); + + // Trace related + void WriteLayerOutputsToFile(const std::string& filename_prefix) const; + const LayerOutput* GetOutputFromLayer(uint32_t layer_index, + uint32_t output_index) const; + const LayerOutputs* GetOutputsFromAllLayers() const; + + //! for pipelined execution + std::vector eos_m; + std::vector iobufs_m; + + std::string device_name_m; + + //! current frame index + int frame_idx_m; + + //! current execution object index + uint32_t curr_eo_idx_m; + + // host time tracking: pipeline start to finish + float host_time_m; + + private: + //! @brief Initialize ExecutionObjectPipeline with given + //! 
ExecutionObjects: check consecutive layersGroup, allocate memory + void Initialize(); + + // flag, mutex and cond var for signaling completion and waiting + bool has_work_m, is_processed_m; + std::mutex mutex_m; + std::condition_variable cv_m; + + // host time tracking: pipeline start to finish + std::chrono::time_point start_m; +}; + +ExecutionObjectPipeline::ExecutionObjectPipeline( + std::vector eos) +{ + pimpl_m = std::unique_ptr { new Impl(eos) }; +} + +ExecutionObjectPipeline::Impl::Impl(std::vector &eos) : + eos_m(eos), has_work_m(false), is_processed_m(false) +{ + Initialize(); +} + +// Pointer to implementation idiom: https://herbsutter.com/gotw/_100/: +// Both unique_ptr and shared_ptr can be instantiated with an incomplete type +// unique_ptr's destructor requires a complete type in order to invoke delete +ExecutionObjectPipeline::~ExecutionObjectPipeline() = default; + +char* ExecutionObjectPipeline::GetInputBufferPtr() const +{ + return static_cast(pimpl_m->iobufs_m.front()->GetArg().ptr()); +} + +size_t ExecutionObjectPipeline::GetInputBufferSizeInBytes() const +{ + return pimpl_m->eos_m.front()->GetInputBufferSizeInBytes(); +} + +char* ExecutionObjectPipeline::GetOutputBufferPtr() const +{ + return static_cast(pimpl_m->iobufs_m.back()->GetArg().ptr()); +} + +size_t ExecutionObjectPipeline::GetOutputBufferSizeInBytes() const +{ + return pimpl_m->eos_m.back()->GetOutputBufferSizeInBytes(); +} + +void ExecutionObjectPipeline::SetInputOutputBuffer(const ArgInfo& in, + const ArgInfo& out) +{ + assert(in.ptr() != nullptr && in.size() >= GetInputBufferSizeInBytes()); + assert(out.ptr() != nullptr && out.size() >= GetOutputBufferSizeInBytes()); + pimpl_m->SetInputOutputBuffer(in, out); +} + +void ExecutionObjectPipeline::SetFrameIndex(int idx) +{ + pimpl_m->frame_idx_m = idx; +} + +int ExecutionObjectPipeline::GetFrameIndex() const +{ + return pimpl_m->frame_idx_m; +} + +bool ExecutionObjectPipeline::ProcessFrameStartAsync() +{ + assert(GetInputBufferPtr() != nullptr && GetOutputBufferPtr() != nullptr); + bool st = pimpl_m->RunAsyncStart(); + if (st) + st = pimpl_m->eos_m[0]->AddCallback(ExecutionObject::CallType::PROCESS, + this); + return st; +} + +bool ExecutionObjectPipeline::ProcessFrameWait() +{ + return pimpl_m->Wait(); +} + +void CallbackWrapper(void *user_data) +{ + ((ExecutionObjectPipeline *) user_data)->RunAsyncNext(); +} + +void ExecutionObjectPipeline::RunAsyncNext() +{ + bool has_next = pimpl_m->RunAsyncNext(); + if (has_next) + pimpl_m->eos_m[pimpl_m->curr_eo_idx_m]->AddCallback( + ExecutionObject::CallType::PROCESS, this); +} + +float ExecutionObjectPipeline::GetProcessTimeInMilliSeconds() const +{ + float total = 0.0f; + for (auto eo : pimpl_m->eos_m) + total += eo->GetProcessTimeInMilliSeconds(); + return total; +} + +float ExecutionObjectPipeline::GetHostProcessTimeInMilliSeconds() const +{ + return pimpl_m->host_time_m; +} + +const std::string& ExecutionObjectPipeline::GetDeviceName() const +{ + return pimpl_m->device_name_m; +} + +void +ExecutionObjectPipeline::WriteLayerOutputsToFile( + const std::string& filename_prefix) const +{ + pimpl_m->WriteLayerOutputsToFile(filename_prefix); +} + +const LayerOutput* +ExecutionObjectPipeline::GetOutputFromLayer(uint32_t layer_index, + uint32_t output_index) const +{ + return pimpl_m->GetOutputFromLayer(layer_index, output_index); +} + +const LayerOutputs* +ExecutionObjectPipeline::GetOutputsFromAllLayers() const +{ + return pimpl_m->GetOutputsFromAllLayers(); +} + + +/// Impl methods start here + + +static +void* 
AllocateMem(size_t size) +{ + if (size == 0) return nullptr; + void *ptr = malloc(size); + if (ptr == nullptr) + throw Exception("Out of memory, ExecutionObjectPipeline malloc failed", + __FILE__, __FUNCTION__, __LINE__); + return ptr; +} + +void ExecutionObjectPipeline::Impl::Initialize() +{ + // Check consecutive layersGroups to form a pipeline + int prev_group = 0; + for (auto eo : eos_m) + { + int group = eo->GetLayersGroupId(); + if (prev_group != 0 && group != prev_group + 1) + throw Exception( + "Non-consecutive layersGroupIds in ExecutionObjectPipeline", + __FILE__, __FUNCTION__, __LINE__); + prev_group = group; + } + + for (auto eo : eos_m) + device_name_m += eo->GetDeviceName() + "+"; + device_name_m.resize(device_name_m.size() - 1); + + // Allocate input and output memory for EOs/layersGroups + // Note that i-th EO's output buffer is the same as (i+1)-th EO's input + // So, if n EOs, then (n+1) buffers: b EO b EO b EO b ... EO b + // User must set the first input buffer and the last output buffer + size_t size; + ArgInfo in(nullptr, 0); + iobufs_m.push_back(new IODeviceArgInfo(in)); + for (auto eo : eos_m) + { + if (eo != eos_m.back()) + size = eo->GetOutputBufferSizeInBytes(); + else + size = 0; + + void *ptr = AllocateMem(size); + ArgInfo out(ptr, size); + iobufs_m.push_back(new IODeviceArgInfo(out)); + } +} + +ExecutionObjectPipeline::Impl::~Impl() +{ + int num_iobufs = iobufs_m.size(); + for (int i = 0; i < num_iobufs; i++) + { + if (! (i == 0 || i == num_iobufs-1)) + free(iobufs_m[i]->GetArg().ptr()); + delete iobufs_m[i]; + } +} + +void ExecutionObjectPipeline::Impl::SetInputOutputBuffer(const ArgInfo &in, + const ArgInfo &out) +{ + delete iobufs_m.front(); + delete iobufs_m.back(); + iobufs_m.front() = new IODeviceArgInfo(in); + iobufs_m.back() = new IODeviceArgInfo(out); +} + +bool ExecutionObjectPipeline::Impl::RunAsyncStart() +{ + start_m = std::chrono::steady_clock::now(); + has_work_m = true; + is_processed_m = false; + host_time_m = 0.0f; + curr_eo_idx_m = 0; + eos_m[0]->AcquireLock(); + eos_m[0]->SetInputOutputBuffer(iobufs_m[0], iobufs_m[1]); + return eos_m[0]->ProcessFrameStartAsync(); +} + +// returns true if we have more EOs to execute +bool ExecutionObjectPipeline::Impl::RunAsyncNext() +{ + eos_m[curr_eo_idx_m]->ProcessFrameWait(); + eos_m[curr_eo_idx_m]->ReleaseLock(); + curr_eo_idx_m += 1; + if (curr_eo_idx_m < eos_m.size()) + { + eos_m[curr_eo_idx_m]->AcquireLock(); + eos_m[curr_eo_idx_m]->SetInputOutputBuffer(iobufs_m[curr_eo_idx_m], + iobufs_m[curr_eo_idx_m+1]); + eos_m[curr_eo_idx_m]->ProcessFrameStartAsync(); + return true; + } + else + { + std::chrono::duration elapsed = std::chrono::steady_clock::now() + - start_m; + host_time_m = elapsed.count() * 1000; // seconds to milliseconds + is_processed_m = true; + cv_m.notify_all(); + return false; + } +} + +bool ExecutionObjectPipeline::Impl::Wait() +{ + if (! 
has_work_m) return false; + + std::unique_lock lock(mutex_m); + cv_m.wait(lock, [this]{ return this->is_processed_m; }); + has_work_m = false; + return true; +} + +void +ExecutionObjectPipeline::Impl::WriteLayerOutputsToFile( + const std::string& filename_prefix) const +{ + for (auto eo : eos_m) + eo->WriteLayerOutputsToFile(filename_prefix); +} + +const LayerOutput* +ExecutionObjectPipeline::Impl::GetOutputFromLayer(uint32_t layer_index, + uint32_t output_index) const +{ + const LayerOutput* lo = nullptr; + for (auto eo : eos_m) + { + lo = eo->GetOutputFromLayer(layer_index, output_index); + if (lo != nullptr) break; + } + return lo; +} + +const LayerOutputs* +ExecutionObjectPipeline::Impl::GetOutputsFromAllLayers() const +{ + LayerOutputs *all = new LayerOutputs; + for (auto eo : eos_m) + { + LayerOutputs *los = const_cast( + eo->GetOutputsFromAllLayers()); + for (auto& lo : *los) + all->push_back(std::unique_ptr{ lo.release() }); + delete los; + } + return all; +} + diff --git a/tidl_api/src/executor.cpp b/tidl_api/src/executor.cpp index b644728..914c78a 100644 --- a/tidl_api/src/executor.cpp +++ b/tidl_api/src/executor.cpp @@ -96,6 +96,12 @@ const ExecutionObjects& Executor::GetExecutionObjects() const return pimpl_m->execution_objects_m; } +ExecutionObject* Executor::operator[](uint32_t index) const +{ + assert(index < pimpl_m->execution_objects_m.size()); + return pimpl_m->execution_objects_m[index].get(); +} + bool ExecutorImpl::Initialize(const Configuration& configuration) { configuration_m = configuration; @@ -145,13 +151,11 @@ bool ExecutorImpl::Initialize(const Configuration& configuration) {new ExecutionObject(device_m.get(), index, create_arg, param_heap_arg, configuration_m.EXTMEM_HEAP_SIZE, + layers_group_id_m, + configuration_m.enableOutputTrace, configuration_m.enableInternalInput)} ); } - if (configuration_m.enableOutputTrace) - for (auto &eo : execution_objects_m) - eo->EnableOutputBufferTrace(); - for (auto &eo : execution_objects_m) eo->RunAsync(ExecutionObject::CallType::INIT); @@ -294,4 +298,3 @@ const char* Exception::what() const noexcept { return message_m.c_str(); } - diff --git a/tidl_api/src/ocl_device.cpp b/tidl_api/src/ocl_device.cpp index fba4f94..b3eaf36 100644 --- a/tidl_api/src/ocl_device.cpp +++ b/tidl_api/src/ocl_device.cpp @@ -91,7 +91,7 @@ DspDevice::DspDevice(const DeviceIds& ids, const std::string &binary_filename): // Queue 0 on device 0 queue_m[0] = clCreateCommandQueue(context_m, device_ids[0], - 0, + CL_QUEUE_PROFILING_ENABLE, &errcode); errorCheck(errcode, __LINE__); BuildProgramFromBinary(binary_filename, device_ids, 1); @@ -139,7 +139,7 @@ DspDevice::DspDevice(const DeviceIds& ids, const std::string &binary_filename): int index = static_cast(id); queue_m[index] = clCreateCommandQueue(context_m, sub_devices[index], - 0, + CL_QUEUE_PROFILING_ENABLE, &errcode); errorCheck(errcode, __LINE__); } @@ -187,7 +187,7 @@ EveDevice::EveDevice(const DeviceIds& ids, const std::string &kernel_names): int index = static_cast(id); queue_m[index] = clCreateCommandQueue(context_m, all_device_ids[index], - 0, + CL_QUEUE_PROFILING_ENABLE, &errcode); errorCheck(errcode, __LINE__); } @@ -317,7 +317,7 @@ Kernel& Kernel::RunAsync() } -bool Kernel::Wait() +bool Kernel::Wait(float *host_elapsed_ms) { // Wait called without a corresponding RunAsync if (!is_running_m) @@ -326,6 +326,17 @@ bool Kernel::Wait() TRACE::print("\tKernel: waiting...\n"); cl_int ret = clWaitForEvents(1, &event_m); errorCheck(ret, __LINE__); + + if (host_elapsed_ms != nullptr) + { + cl_ulong 
t_que, t_end; + clGetEventProfilingInfo(event_m, CL_PROFILING_COMMAND_QUEUED, + sizeof(cl_ulong), &t_que, nullptr); + clGetEventProfilingInfo(event_m, CL_PROFILING_COMMAND_END, + sizeof(cl_ulong), &t_end, nullptr); + *host_elapsed_ms = (t_end - t_que) / 1.0e6; // nano to milli seconds + } + ret = clReleaseEvent(event_m); errorCheck(ret, __LINE__); TRACE::print("\tKernel: finished execution\n"); @@ -334,6 +345,22 @@ bool Kernel::Wait() return true; } +extern void CallbackWrapper(void *user_data) __attribute__((weak)); + +static +void EventCallback(cl_event event, cl_int exec_status, void *user_data) +{ + if (exec_status != CL_SUCCESS || user_data == nullptr) return; + if (CallbackWrapper) CallbackWrapper(user_data); +} + +bool Kernel::AddCallback(void *user_data) +{ + if (! is_running_m) return false; + return clSetEventCallback(event_m, CL_COMPLETE, EventCallback, user_data) + == CL_SUCCESS; +} + Kernel::~Kernel() { for (auto b : buffers_m) diff --git a/tidl_api/src/ocl_device.h b/tidl_api/src/ocl_device.h index 6e80166..04c5db6 100644 --- a/tidl_api/src/ocl_device.h +++ b/tidl_api/src/ocl_device.h @@ -74,6 +74,8 @@ class Device static uint32_t GetNumDevices(DeviceType device_type); + virtual std::string GetDeviceName() = 0; + protected: static const int MAX_DEVICES = 4; @@ -101,6 +103,8 @@ class DspDevice: public Device DspDevice(const DspDevice&) = delete; DspDevice& operator=(const DspDevice&) = delete; + virtual std::string GetDeviceName() { return "DSP"; } + protected: bool BuildProgramFromBinary(const std::string &binary_filename, cl_device_id device_ids[], @@ -117,6 +121,8 @@ class EveDevice : public Device EveDevice(const EveDevice&) = delete; EveDevice& operator=(const EveDevice&) = delete; + virtual std::string GetDeviceName() { return "EVE"; } + protected: bool BuildProgramFromBinary(const std::string &kernel_names, cl_device_id device_ids[], @@ -137,7 +143,8 @@ class Kernel ~Kernel(); Kernel& RunAsync(); - bool Wait(); + bool Wait(float *host_elapsed_ms = nullptr); + bool AddCallback(void *user_data); private: cl_kernel kernel_m;
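
For reference, a minimal usage sketch of the ExecutionObjectPipeline API introduced by this patch, condensed from the updated examples/ssd_multibox/main.cpp. The helper name RunPipelines, the EVE/DSP layersGroup split (1 and 2), and the frame-I/O placeholders are illustrative assumptions; error handling, timing reports, and camera/file input are omitted.

    #include <algorithm>
    #include <cassert>
    #include <cstdint>
    #include <cstdlib>
    #include <vector>

    #include "configuration.h"
    #include "executor.h"
    #include "execution_object.h"
    #include "execution_object_pipeline.h"

    using namespace tidl;

    // Hypothetical driver: EVE runs layersGroupId 1, DSP runs layersGroupId 2.
    bool RunPipelines(const Configuration& configuration,
                      uint32_t num_eves, uint32_t num_dsps, int num_frames)
    {
        DeviceIds ids_eve, ids_dsp;
        for (uint32_t i = 0; i < num_eves; i++)
            ids_eve.insert(static_cast<DeviceId>(i));
        for (uint32_t i = 0; i < num_dsps; i++)
            ids_dsp.insert(static_cast<DeviceId>(i));

        Executor exe_eve(DeviceType::EVE, ids_eve, configuration, 1);
        Executor exe_dsp(DeviceType::DSP, ids_dsp, configuration, 2);

        // One EOP per EVE/DSP pair; each EOP chains the two layersGroups.
        uint32_t num_eops = std::max(num_eves, num_dsps);
        std::vector<ExecutionObjectPipeline*> eops;
        for (uint32_t i = 0; i < num_eops; i++)
            eops.push_back(new ExecutionObjectPipeline(
                              {exe_eve[i % num_eves], exe_dsp[i % num_dsps]}));

        // The application owns only the first input and last output buffers;
        // intermediate buffers between layersGroups are allocated by the EOP.
        std::vector<void *> buffers;
        for (auto eop : eops)
        {
            size_t in_size  = eop->GetInputBufferSizeInBytes();
            size_t out_size = eop->GetOutputBufferSizeInBytes();
            void *in_ptr  = malloc(in_size);
            void *out_ptr = malloc(out_size);
            assert(in_ptr != nullptr && out_ptr != nullptr);
            eop->SetInputOutputBuffer(ArgInfo(in_ptr,  in_size),
                                      ArgInfo(out_ptr, out_size));
            buffers.push_back(in_ptr);
            buffers.push_back(out_ptr);
        }

        // Pipelined frame loop; num_eops extra iterations flush the epilogue.
        for (int idx = 0; idx < num_frames + (int)num_eops; idx++)
        {
            ExecutionObjectPipeline *eop = eops[idx % num_eops];

            // Wait for the previous frame issued on this EOP, then consume
            // its result from eop->GetOutputBufferPtr().
            if (eop->ProcessFrameWait())
            {
                /* consume output, e.g. WriteFrameOutput(*eop, ...) */
            }

            if (idx < num_frames)
            {
                /* fill eop->GetInputBufferPtr() with the next frame */
                eop->SetFrameIndex(idx);
                eop->ProcessFrameStartAsync();
            }
        }

        for (auto eop : eops) delete eop;
        for (auto b : buffers) free(b);
        return true;
    }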