summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorYuan Zhao2018-08-09 23:42:42 -0500
committerYuan Zhao2018-08-20 10:57:44 -0500
commit1a42784dc57d81735218ec2dc85172a1ed4e8181 (patch)
tree4d7dea04882465dbb6c95f50582102505623be6f /tidl_api
parent36786d7afca8c1906293854d1e6243bb961c712f (diff)
downloadtidl-api-1a42784dc57d81735218ec2dc85172a1ed4e8181.tar.gz
tidl-api-1a42784dc57d81735218ec2dc85172a1ed4e8181.tar.xz
tidl-api-1a42784dc57d81735218ec2dc85172a1ed4e8181.zip
ExecutionObjectPipeline for executing layersGroups
- Add top level ExecutionObjectPipeline class to execute multiple layersGroups. - An ExecutionObjectPipeline is constructed from multiple ExecutionObjects, each ExecutionObject executes one layersGroup in the network, together they execute consecutive layersGroups. - Same look and feel as ExecutionObject, e.g. ProcessFrameStartAsync, ProcessFrameWait, GetInputBufferPointer, GetOutputBufferPointer - MCT-1017, MCT-1029
Diffstat (limited to 'tidl_api')
-rw-r--r--tidl_api/Makefile6
-rw-r--r--tidl_api/inc/execution_object.h71
-rw-r--r--tidl_api/inc/execution_object_internal.h119
-rw-r--r--tidl_api/inc/execution_object_pipeline.h151
-rw-r--r--tidl_api/inc/executor.h9
-rw-r--r--tidl_api/src/execution_object.cpp209
-rw-r--r--tidl_api/src/execution_object_pipeline.cpp360
-rw-r--r--tidl_api/src/executor.cpp13
-rw-r--r--tidl_api/src/ocl_device.cpp35
-rw-r--r--tidl_api/src/ocl_device.h9
10 files changed, 882 insertions, 100 deletions
diff --git a/tidl_api/Makefile b/tidl_api/Makefile
index 05a3704..3fc6a2c 100644
--- a/tidl_api/Makefile
+++ b/tidl_api/Makefile
@@ -39,7 +39,8 @@ AR = ar
39 39
40 40
41SRCS = ocl_device.cpp configuration_parser.cpp configuration.cpp\ 41SRCS = ocl_device.cpp configuration_parser.cpp configuration.cpp\
42 executor.cpp execution_object.cpp trace.cpp util.cpp 42 executor.cpp execution_object.cpp trace.cpp util.cpp \
43 execution_object_pipeline.cpp
43SRCS_IMGUTIL = imgutil.cpp 44SRCS_IMGUTIL = imgutil.cpp
44 45
45OBJS = $(SRCS:.cpp=.o) 46OBJS = $(SRCS:.cpp=.o)
@@ -53,8 +54,7 @@ HOST_OBJ_IMGUTIL_FILES = $(addprefix obj/,$(OBJS_IMGUTIL))
53HEADERS = src/common_defines.h src/executor_impl.h src/ocl_device.h 54HEADERS = src/common_defines.h src/executor_impl.h src/ocl_device.h
54HEADERS += src/parameters.h src/tidl_create_params.h src/trace.h src/util.h 55HEADERS += src/parameters.h src/tidl_create_params.h src/trace.h src/util.h
55HEADERS += inc/configuration.h inc/execution_object.h inc/executor.h 56HEADERS += inc/configuration.h inc/execution_object.h inc/executor.h
56HEADERS += inc/imgutil.h src/device_arginfo.h 57HEADERS += inc/imgutil.h src/device_arginfo.h inc/execution_object_pipeline.h
57
58 58
59ifeq ($(BUILD), debug) 59ifeq ($(BUILD), debug)
60 CXXFLAGS += -Og -g -ggdb 60 CXXFLAGS += -Og -g -ggdb
diff --git a/tidl_api/inc/execution_object.h b/tidl_api/inc/execution_object.h
index e78ad2e..c1d86fc 100644
--- a/tidl_api/inc/execution_object.h
+++ b/tidl_api/inc/execution_object.h
@@ -31,6 +31,7 @@
31#pragma once 31#pragma once
32 32
33#include <memory> 33#include <memory>
34#include "execution_object_internal.h"
34 35
35namespace tidl { 36namespace tidl {
36 37
@@ -39,13 +40,12 @@ class Device;
39class LayerOutput; 40class LayerOutput;
40class IODeviceArgInfo; 41class IODeviceArgInfo;
41 42
42typedef std::vector<std::unique_ptr<const LayerOutput>> LayerOutputs;
43 43
44/*! @class ExecutionObject 44/*! @class ExecutionObject
45 @brief Runs the TIDL network on an OpenCL device 45 @brief Runs the TIDL network on an OpenCL device
46*/ 46*/
47 47
48class ExecutionObject 48class ExecutionObject : public ExecutionObjectInternalInterface
49{ 49{
50 public: 50 public:
51 51
@@ -55,6 +55,8 @@ class ExecutionObject
55 const ArgInfo& create_arg, 55 const ArgInfo& create_arg,
56 const ArgInfo& param_heap_arg, 56 const ArgInfo& param_heap_arg,
57 size_t extmem_heap_size, 57 size_t extmem_heap_size,
58 int layersGroupId,
59 bool output_trace,
58 bool internal_input); 60 bool internal_input);
59 //! @private 61 //! @private
60 ~ExecutionObject(); 62 ~ExecutionObject();
@@ -62,52 +64,56 @@ class ExecutionObject
62 //! Specify the input and output buffers used by the EO 64 //! Specify the input and output buffers used by the EO
63 //! @param in buffer used for input. 65 //! @param in buffer used for input.
64 //! @param out buffer used for output. 66 //! @param out buffer used for output.
65 void SetInputOutputBuffer (const ArgInfo& in, const ArgInfo& out); 67 void SetInputOutputBuffer(const ArgInfo& in,
68 const ArgInfo& out) override;
66 69
67 //! Returns a pointer to the input buffer set via SetInputOutputBuffer 70 //! Returns a pointer to the input buffer set via SetInputOutputBuffer
68 char* GetInputBufferPtr() const; 71 char* GetInputBufferPtr() const override;
69 72
70 //! Returns size of the input buffer 73 //! Returns size of the input buffer
71 size_t GetInputBufferSizeInBytes() const; 74 size_t GetInputBufferSizeInBytes() const override;
75
76 //! Returns a pointer to the output buffer
77 char* GetOutputBufferPtr() const override;
78
79 //! Returns size of the output buffer
80 size_t GetOutputBufferSizeInBytes() const override;
72 81
73 //! @brief Set the frame index of the frame currently processed by the 82 //! @brief Set the frame index of the frame currently processed by the
74 //! ExecutionObject. Used for trace/debug messages 83 //! ExecutionObject. Used for trace/debug messages
75 //! @param idx index of the frame 84 //! @param idx index of the frame
76 void SetFrameIndex(int idx); 85 void SetFrameIndex(int idx) override;
77 86
78 //! Returns the index of a frame being processed (set by SetFrameIndex) 87 //! Returns the index of a frame being processed (set by SetFrameIndex)
79 int GetFrameIndex() const; 88 int GetFrameIndex() const override;
80
81 //! Returns a pointer to the output buffer
82 char* GetOutputBufferPtr() const;
83
84 //! Returns the number of bytes written to the output buffer
85 size_t GetOutputBufferSizeInBytes() const;
86 89
87 //! @brief Start processing a frame. The call is asynchronous and returns 90 //! @brief Start processing a frame. The call is asynchronous and
88 //! immediately. Use ExecutionObject::ProcessFrameWait to wait 91 //! returns immediately. Use ExecutionObject::ProcessFrameWait to wait
89 bool ProcessFrameStartAsync(); 92 bool ProcessFrameStartAsync() override;
90 93
91 //! Wait for the execution object to complete processing a frame 94 //! Wait for the execution object to complete processing a frame
92 //! @return false if ExecutionObject::ProcessFrameWait was called 95 //! @return false if ExecutionObject::ProcessFrameWait was called
93 //! without a corresponding call to 96 //! without a corresponding call to
94 //! ExecutionObject::ProcessFrameStartAsync. 97 //! ExecutionObject::ProcessFrameStartAsync.
95 bool ProcessFrameWait(); 98 bool ProcessFrameWait() override;
96
97 //! @brief return the number of cycles taken *on the device* to
98 //! execute the process call
99 //! @return Number of cycles to process a frame on the device.
100 uint64_t GetProcessCycles() const;
101 99
102 //! @brief return the number of milliseconds taken *on the device* to 100 //! @brief return the number of milliseconds taken *on the device* to
103 //! execute the process call 101 //! execute the process call
104 //! @return Number of milliseconds to process a frame on the device. 102 //! @return Number of milliseconds to process a frame on the device.
105 float GetProcessTimeInMilliSeconds() const; 103 float GetProcessTimeInMilliSeconds() const override;
104
105 //! @brief return the number of milliseconds taken *on the host* to
106 //! execute the process call
107 //! @return Number of milliseconds to process a frame on the host.
108 float GetHostProcessTimeInMilliSeconds() const override;
109
110 //! Returns the device name that the ExecutionObject runs on
111 const std::string& GetDeviceName() const override;
106 112
107 //! Write the output buffer for each layer to a file 113 //! Write the output buffer for each layer to a file
108 //! <filename_prefix>_<ID>_HxW.bin 114 //! \<filename_prefix>_<ID>_HxW.bin
109 void WriteLayerOutputsToFile(const std::string& filename_prefix= 115 void WriteLayerOutputsToFile(const std::string& filename_prefix=
110 "trace_dump_") const; 116 "trace_dump_") const override;
111 117
112 //! Returns a LayerOutput object corresponding to a layer. 118 //! Returns a LayerOutput object corresponding to a layer.
113 //! Caller is responsible for deleting the LayerOutput object. 119 //! Caller is responsible for deleting the LayerOutput object.
@@ -116,10 +122,13 @@ class ExecutionObject
116 //! @param output_index The output index of the buffer for a given 122 //! @param output_index The output index of the buffer for a given
117 //! layer. Defaults to 0. 123 //! layer. Defaults to 0.
118 const LayerOutput* GetOutputFromLayer(uint32_t layer_index, 124 const LayerOutput* GetOutputFromLayer(uint32_t layer_index,
119 uint32_t output_index=0) const; 125 uint32_t output_index=0) const override;
120 126
121 //! Get output buffers from all layers 127 //! Get output buffers from all layers
122 const LayerOutputs* GetOutputsFromAllLayers() const; 128 const LayerOutputs* GetOutputsFromAllLayers() const override;
129
130 //! Returns the layersGroupId that the ExecutionObject is processing
131 int GetLayersGroupId() const;
123 132
124 //! @private 133 //! @private
125 // Used by the Executor 134 // Used by the Executor
@@ -127,12 +136,16 @@ class ExecutionObject
127 bool RunAsync(CallType ct); 136 bool RunAsync(CallType ct);
128 bool Wait (CallType ct); 137 bool Wait (CallType ct);
129 138
139 //! @private
140 // Used by the ExecutionObjectPipeline
141 bool AddCallback(CallType ct, void *user_data);
142 void AcquireLock();
143 void ReleaseLock();
144
130 ExecutionObject() = delete; 145 ExecutionObject() = delete;
131 ExecutionObject(const ExecutionObject&) = delete; 146 ExecutionObject(const ExecutionObject&) = delete;
132 ExecutionObject& operator=(const ExecutionObject&) = delete; 147 ExecutionObject& operator=(const ExecutionObject&) = delete;
133 148
134 void EnableOutputBufferTrace();
135
136 //! @private 149 //! @private
137 void SetInputOutputBuffer(const IODeviceArgInfo* in, 150 void SetInputOutputBuffer(const IODeviceArgInfo* in,
138 const IODeviceArgInfo* out); 151 const IODeviceArgInfo* out);
diff --git a/tidl_api/inc/execution_object_internal.h b/tidl_api/inc/execution_object_internal.h
new file mode 100644
index 0000000..816da94
--- /dev/null
+++ b/tidl_api/inc/execution_object_internal.h
@@ -0,0 +1,119 @@
1/******************************************************************************
2 * Copyright (c) 2018 Texas Instruments Incorporated - http://www.ti.com/
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Texas Instruments Incorporated nor the
13 * names of its contributors may be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
26 * THE POSSIBILITY OF SUCH DAMAGE.
27 *****************************************************************************/
28
29/*! @file execution_object_internal.h */
30
31#pragma once
32
33namespace tidl {
34
35class LayerOutput;
36
37typedef std::vector<std::unique_ptr<const LayerOutput>> LayerOutputs;
38
39/*! @cond HIDDEN_SYMBOLS
40 @class ExecutionObjectInternalInterface
41 @brief Internal interface for running the TIDL network on OpenCL devices
42 Do not use this internal class directly.
43 Please use ExecutionObject or ExecutionObjectPipeline instead.
44*/
45class ExecutionObjectInternalInterface
46{
47 public:
48 virtual ~ExecutionObjectInternalInterface() {};
49
50 //! Specify the input and output buffers used by the EO
51 //! @param in buffer used for input.
52 //! @param out buffer used for output.
53 virtual void SetInputOutputBuffer(const ArgInfo& in,
54 const ArgInfo& out) =0;
55
56 //! Returns a pointer to the input buffer set via SetInputOutputBuffer
57 virtual char* GetInputBufferPtr() const =0;
58
59 //! Returns size of the input buffer
60 virtual size_t GetInputBufferSizeInBytes() const =0;
61
62 //! Returns a pointer to the output buffer
63 virtual char* GetOutputBufferPtr() const =0;
64
65 //! Returns size of the output buffer
66 virtual size_t GetOutputBufferSizeInBytes() const =0;
67
68 //! @brief Set the frame index of the frame currently processed by the
69 //! ExecutionObject. Used for trace/debug messages
70 //! @param idx index of the frame
71 virtual void SetFrameIndex(int idx) =0;
72
73 //! Returns the index of a frame being processed (set by SetFrameIndex)
74 virtual int GetFrameIndex() const =0;
75
76 //! @brief Start processing a frame. The call is asynchronous and returns
77 //! immediately. Use ExecutionObject::ProcessFrameWait to wait
78 virtual bool ProcessFrameStartAsync() =0;
79
80 //! Wait for the execution object to complete processing a frame
81 //! @return false if ExecutionObject::ProcessFrameWait was called
82 //! without a corresponding call to
83 //! ExecutionObject::ProcessFrameStartAsync.
84 virtual bool ProcessFrameWait() =0;
85
86 //! @brief return the number of milliseconds taken *on the device* to
87 //! execute the process call
88 //! @return Number of milliseconds to process a frame on the device.
89 virtual float GetProcessTimeInMilliSeconds() const =0;
90
91 //! @brief return the number of milliseconds taken *on the host* to
92 //! execute the process call
93 //! @return Number of milliseconds to process a frame on the host.
94 virtual float GetHostProcessTimeInMilliSeconds() const =0;
95
96 //! Returns the device name that the ExecutionObject runs on
97 virtual const std::string& GetDeviceName() const =0;
98
99 //! Write the output buffer for each layer to a file
100 //! \<filename_prefix>_<ID>_HxW.bin
101 virtual void WriteLayerOutputsToFile(const std::string& filename_prefix=
102 "trace_dump_") const =0;
103
104 //! Returns a LayerOutput object corresponding to a layer.
105 //! Caller is responsible for deleting the LayerOutput object.
106 //! @see LayerOutput
107 //! @param layer_index The layer index of the layer
108 //! @param output_index The output index of the buffer for a given
109 //! layer. Defaults to 0.
110 virtual const LayerOutput* GetOutputFromLayer(uint32_t layer_index,
111 uint32_t output_index=0) const =0;
112
113 //! Get output buffers from all layers
114 virtual const LayerOutputs* GetOutputsFromAllLayers() const =0;
115};
116/*! @endcond
117*/
118
119} // namespace tidl
diff --git a/tidl_api/inc/execution_object_pipeline.h b/tidl_api/inc/execution_object_pipeline.h
new file mode 100644
index 0000000..aaa6cf0
--- /dev/null
+++ b/tidl_api/inc/execution_object_pipeline.h
@@ -0,0 +1,151 @@
1/******************************************************************************
2 * Copyright (c) 2018 Texas Instruments Incorporated - http://www.ti.com/
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Texas Instruments Incorporated nor the
13 * names of its contributors may be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
26 * THE POSSIBILITY OF SUCH DAMAGE.
27 *****************************************************************************/
28
29//! @file execution_object_pipeline.h
30
31#pragma once
32#include <string>
33#include <vector>
34#include <cstdint>
35#include <cassert>
36
37#include "executor.h"
38#include "execution_object_internal.h"
39#include "execution_object.h"
40
41namespace tidl {
42
43/*! @class ExecutionObjectPipeline
44 @brief Manages the pipelined execution using multiple ExecutionObjects.
45 Each executor runs one layersGroup of the network. ExecutionObjects
46 must run consecutive layersGroups to form a pipelined execution.
47*/
48class ExecutionObjectPipeline : public ExecutionObjectInternalInterface
49{
50 public:
51 //! @brief Create an ExecutionObjectPipeline object.
52 //!
53 //! The ExecutionObjectPipeline will take the provided ExecutionObjects
54 //! to create an execution pipeline. E.g.
55 //! @code
56 //! Configuration config("path to configuration file");
57 //! DeviceIds ids = {DeviceId::ID0, DeviceId::ID1};
58 //! Executor exe_eve(DeviceType::EVE, ids, config, 1);
59 //! Executor exe_dsp(DeviceType::DSP, ids, config, 2);
60 //! ExecutionObjectPipeline ep0({exe_eve[0], exe_dsp[0]});
61 //! ExecutionObjectPipeline ep1({exe_eve[1], exe_dsp[1]});
62 //! @endcode
63 //!
64 //! @param eos DSP or EVE ExecutionObjects forming a pipeline
65 ExecutionObjectPipeline(std::vector<ExecutionObject*> eos);
66
67 //! @brief Tear down an ExecutionObjectPipeline and free used resources
68 ~ExecutionObjectPipeline();
69
70 //! Specify the input and output buffers used by the EOP
71 //! @param in buffer used for input.
72 //! @param out buffer used for output.
73 void SetInputOutputBuffer (const ArgInfo& in,
74 const ArgInfo& out) override;
75
76 //! Returns a pointer to the input buffer
77 char* GetInputBufferPtr() const override;
78
79 //! Returns size of the input buffer
80 size_t GetInputBufferSizeInBytes() const override;
81
82 //! Returns a pointer to the output buffer
83 char* GetOutputBufferPtr() const override;
84
85 //! Returns the number of bytes written to the output buffer
86 size_t GetOutputBufferSizeInBytes() const override;
87
88 //! @brief Set the frame index of the frame currently processed by the
89 //! ExecutionObjectPipeline. Used for trace/debug messages
90 //! @param idx index of the frame
91 void SetFrameIndex(int idx) override;
92
93 //! Returns the index of a frame being processed (set by SetFrameIndex)
94 int GetFrameIndex() const override;
95
96 //! @brief Start processing a frame. The call is asynchronous and
97 //! returns immediately. Use ProcessFrameWait() to wait
98 bool ProcessFrameStartAsync() override;
99
100 //! Wait for the executor pipeline to complete processing a frame
101 //! @return false if ProcessFrameWait() was called
102 //! without a corresponding call to
103 //! ExecutionObjectPipeline::ProcessFrameStartAsync().
104 bool ProcessFrameWait() override;
105
106 //! @brief return the number of milliseconds taken *on the device* to
107 //! execute the process call
108 //! @return Number of milliseconds to process a frame on the device.
109 float GetProcessTimeInMilliSeconds() const override;
110
111 //! @brief return the number of milliseconds taken *on the host* to
112 //! execute the process call
113 //! @return Number of milliseconds to process a frame on the host.
114 float GetHostProcessTimeInMilliSeconds() const override;
115
116 //! Return the combined device names that this pipeline runs on
117 const std::string& GetDeviceName() const override;
118
119 //! Write the output buffer for each layer to a file
120 //! \<filename_prefix>_<ID>_HxW.bin
121 void WriteLayerOutputsToFile(const std::string& filename_prefix=
122 "trace_dump_") const override;
123
124 //! Returns a LayerOutput object corresponding to a layer.
125 //! Caller is responsible for deleting the LayerOutput object.
126 //! @see LayerOutput
127 //! @param layer_index The layer index of the layer
128 //! @param output_index The output index of the buffer for a given
129 //! layer. Defaults to 0.
130 const LayerOutput* GetOutputFromLayer(uint32_t layer_index,
131 uint32_t output_index=0) const override;
132
133 //! Get output buffers from all layers
134 const LayerOutputs* GetOutputsFromAllLayers() const override;
135
136 //! @private Used by runtime
137 //! @brief callback function at the completion of each ExecutionObject,
138 //! to chain the next ExecutionObject for execution
139 void RunAsyncNext();
140
141 ExecutionObjectPipeline() = delete;
142 ExecutionObjectPipeline(const ExecutionObjectPipeline&) = delete;
143 ExecutionObjectPipeline& operator=(const ExecutionObjectPipeline&)
144 = delete;
145
146 private:
147 class Impl;
148 std::unique_ptr<Impl> pimpl_m;
149};
150
151} // namespace tidl
diff --git a/tidl_api/inc/executor.h b/tidl_api/inc/executor.h
index 23d92ff..1febfea 100644
--- a/tidl_api/inc/executor.h
+++ b/tidl_api/inc/executor.h
@@ -64,7 +64,7 @@ class ExecutionObject;
64typedef std::vector<std::unique_ptr<ExecutionObject>> ExecutionObjects; 64typedef std::vector<std::unique_ptr<ExecutionObject>> ExecutionObjects;
65 65
66/*! @class Executor 66/*! @class Executor
67 @brief Manages the overall execution of a network using the 67 @brief Manages the overall execution of a layersGroup in a network using the
68 specified configuration and the set of devices available to the 68 specified configuration and the set of devices available to the
69 executor. 69 executor.
70*/ 70*/
@@ -78,7 +78,7 @@ class Executor
78 //! @code 78 //! @code
79 //! Configuration configuration; 79 //! Configuration configuration;
80 //! configuration.ReadFromFile("path to configuration file"); 80 //! configuration.ReadFromFile("path to configuration file");
81 //! DeviceIds ids1 = {DeviceId::ID2, DeviceId::ID3}; 81 //! DeviceIds ids = {DeviceId::ID2, DeviceId::ID3};
82 //! Executor executor(DeviceType::EVE, ids, configuration); 82 //! Executor executor(DeviceType::EVE, ids, configuration);
83 //! @endcode 83 //! @endcode
84 //! 84 //!
@@ -98,6 +98,9 @@ class Executor
98 //! available on this instance of the Executor 98 //! available on this instance of the Executor
99 const ExecutionObjects& GetExecutionObjects() const; 99 const ExecutionObjects& GetExecutionObjects() const;
100 100
101 //! Returns a single execution object at index
102 ExecutionObject* operator[](uint32_t index) const;
103
101 //! @brief Returns the number of devices of the specified type 104 //! @brief Returns the number of devices of the specified type
102 //! available for TI DL. 105 //! available for TI DL.
103 //! @param device_type DSP or EVE/EVE device 106 //! @param device_type DSP or EVE/EVE device
@@ -106,7 +109,7 @@ class Executor
106 109
107 //! @brief Returns a string corresponding to the API version 110 //! @brief Returns a string corresponding to the API version
108 //! 111 //!
109 //! @return <major_ver>.<minor_ver>.<patch_ver>.<git_sha> 112 //! @return \<major_ver>.\<minor_ver>.\<patch_ver>.\<git_sha>
110 static std::string GetAPIVersion(); 113 static std::string GetAPIVersion();
111 114
112 Executor(const Executor&) = delete; 115 Executor(const Executor&) = delete;
diff --git a/tidl_api/src/execution_object.cpp b/tidl_api/src/execution_object.cpp
index d722ebb..178bbca 100644
--- a/tidl_api/src/execution_object.cpp
+++ b/tidl_api/src/execution_object.cpp
@@ -31,6 +31,9 @@
31#include <string.h> 31#include <string.h>
32#include <fstream> 32#include <fstream>
33#include <climits> 33#include <climits>
34#include <mutex>
35#include <condition_variable>
36#include <chrono>
34#include "executor.h" 37#include "executor.h"
35#include "execution_object.h" 38#include "execution_object.h"
36#include "trace.h" 39#include "trace.h"
@@ -50,13 +53,24 @@ class ExecutionObject::Impl
50 const DeviceArgInfo& create_arg, 53 const DeviceArgInfo& create_arg,
51 const DeviceArgInfo& param_heap_arg, 54 const DeviceArgInfo& param_heap_arg,
52 size_t extmem_heap_size, 55 size_t extmem_heap_size,
56 int layers_group_id,
57 bool output_trace,
53 bool internal_input); 58 bool internal_input);
54 ~Impl() {} 59 ~Impl() {}
55 60
56 bool RunAsync(CallType ct); 61 bool RunAsync(CallType ct);
57 bool Wait (CallType ct); 62 bool Wait (CallType ct);
63 bool AddCallback(CallType ct, void *user_data);
64
65 uint64_t GetProcessCycles() const;
66 int GetLayersGroupId() const;
67 void AcquireLock();
68 void ReleaseLock();
58 69
59 Device* device_m; 70 Device* device_m;
71 // Index of the OpenCL device/queue used by this EO
72 uint8_t device_index_m;
73 std::string device_name_m;
60 74
61 up_malloc_ddr<char> tidl_extmem_heap_m; 75 up_malloc_ddr<char> tidl_extmem_heap_m;
62 up_malloc_ddr<OCL_TIDL_InitializeParams> shared_initialize_params_m; 76 up_malloc_ddr<OCL_TIDL_InitializeParams> shared_initialize_params_m;
@@ -70,6 +84,9 @@ class ExecutionObject::Impl
70 // Frame being processed by the EO 84 // Frame being processed by the EO
71 int current_frame_idx_m; 85 int current_frame_idx_m;
72 86
87 // LayersGroupId being processed by the EO
88 int layers_group_id_m;
89
73 // Trace related 90 // Trace related
74 void WriteLayerOutputsToFile (const std::string& filename_prefix) const; 91 void WriteLayerOutputsToFile (const std::string& filename_prefix) const;
75 92
@@ -81,25 +98,29 @@ class ExecutionObject::Impl
81 up_malloc_ddr<OCL_TIDL_BufParams> trace_buf_params_m; 98 up_malloc_ddr<OCL_TIDL_BufParams> trace_buf_params_m;
82 size_t trace_buf_params_sz_m; 99 size_t trace_buf_params_sz_m;
83 100
101 // host time tracking: eo start to finish
102 float host_time_m;
103
84 private: 104 private:
85 void SetupInitializeKernel(const DeviceArgInfo& create_arg, 105 void SetupInitializeKernel(const DeviceArgInfo& create_arg,
86 const DeviceArgInfo& param_heap_arg, 106 const DeviceArgInfo& param_heap_arg,
87 size_t extmem_heap_size, 107 size_t extmem_heap_size,
88 bool internal_input); 108 bool internal_input);
109 void EnableOutputBufferTrace();
89 void SetupProcessKernel(); 110 void SetupProcessKernel();
90 111
91 void HostWriteNetInput(); 112 void HostWriteNetInput();
92 void HostReadNetOutput(); 113 void HostReadNetOutput();
93 void ComputeInputOutputSizes(); 114 void ComputeInputOutputSizes();
94 115
95 // Index of the OpenCL device/queue used by this EO
96 uint8_t device_index_m;
97
98 std::unique_ptr<Kernel> k_initialize_m; 116 std::unique_ptr<Kernel> k_initialize_m;
99 std::unique_ptr<Kernel> k_process_m; 117 std::unique_ptr<Kernel> k_process_m;
100 std::unique_ptr<Kernel> k_cleanup_m; 118 std::unique_ptr<Kernel> k_cleanup_m;
101 119
102 120 // Guarding sole access to input/output for one frame during execution
121 bool is_idle_m;
122 std::mutex mutex_access_m;
123 std::condition_variable cv_access_m;
103}; 124};
104 125
105 126
@@ -108,6 +129,8 @@ ExecutionObject::ExecutionObject(Device* d,
108 const ArgInfo& create_arg, 129 const ArgInfo& create_arg,
109 const ArgInfo& param_heap_arg, 130 const ArgInfo& param_heap_arg,
110 size_t extmem_heap_size, 131 size_t extmem_heap_size,
132 int layers_group_id,
133 bool output_trace,
111 bool internal_input) 134 bool internal_input)
112{ 135{
113 DeviceArgInfo create_arg_d(create_arg, DeviceArgInfo::Kind::BUFFER); 136 DeviceArgInfo create_arg_d(create_arg, DeviceArgInfo::Kind::BUFFER);
@@ -118,6 +141,8 @@ ExecutionObject::ExecutionObject(Device* d,
118 create_arg_d, 141 create_arg_d,
119 param_heap_arg_d, 142 param_heap_arg_d,
120 extmem_heap_size, 143 extmem_heap_size,
144 layers_group_id,
145 output_trace,
121 internal_input) }; 146 internal_input) };
122} 147}
123 148
@@ -127,8 +152,11 @@ ExecutionObject::Impl::Impl(Device* d,
127 const DeviceArgInfo& create_arg, 152 const DeviceArgInfo& create_arg,
128 const DeviceArgInfo& param_heap_arg, 153 const DeviceArgInfo& param_heap_arg,
129 size_t extmem_heap_size, 154 size_t extmem_heap_size,
155 int layers_group_id,
156 bool output_trace,
130 bool internal_input): 157 bool internal_input):
131 device_m(d), 158 device_m(d),
159 device_index_m(device_index),
132 tidl_extmem_heap_m (nullptr, &__free_ddr), 160 tidl_extmem_heap_m (nullptr, &__free_ddr),
133 shared_initialize_params_m(nullptr, &__free_ddr), 161 shared_initialize_params_m(nullptr, &__free_ddr),
134 shared_process_params_m(nullptr, &__free_ddr), 162 shared_process_params_m(nullptr, &__free_ddr),
@@ -137,23 +165,26 @@ ExecutionObject::Impl::Impl(Device* d,
137 in_m(), 165 in_m(),
138 out_m(), 166 out_m(),
139 current_frame_idx_m(0), 167 current_frame_idx_m(0),
168 layers_group_id_m(layers_group_id),
140 num_network_layers_m(0), 169 num_network_layers_m(0),
141 trace_buf_params_m(nullptr, &__free_ddr), 170 trace_buf_params_m(nullptr, &__free_ddr),
142 trace_buf_params_sz_m(0), 171 trace_buf_params_sz_m(0),
143 device_index_m(device_index),
144 k_initialize_m(nullptr), 172 k_initialize_m(nullptr),
145 k_process_m(nullptr), 173 k_process_m(nullptr),
146 k_cleanup_m(nullptr) 174 k_cleanup_m(nullptr),
175 is_idle_m(true)
147{ 176{
148 SetupInitializeKernel(create_arg, param_heap_arg, extmem_heap_size, 177 device_name_m = device_m->GetDeviceName() + std::to_string(device_index_m);
149 internal_input);
150
151 SetupProcessKernel();
152
153 // Save number of layers in the network 178 // Save number of layers in the network
154 const TIDL_CreateParams* cp = 179 const TIDL_CreateParams* cp =
155 static_cast<const TIDL_CreateParams *>(create_arg.ptr()); 180 static_cast<const TIDL_CreateParams *>(create_arg.ptr());
156 num_network_layers_m = cp->net.numLayers; 181 num_network_layers_m = cp->net.numLayers;
182
183 SetupInitializeKernel(create_arg, param_heap_arg, extmem_heap_size,
184 internal_input);
185
186 if (output_trace) EnableOutputBufferTrace();
187 SetupProcessKernel();
157} 188}
158 189
159// Pointer to implementation idiom: https://herbsutter.com/gotw/_100/: 190// Pointer to implementation idiom: https://herbsutter.com/gotw/_100/:
@@ -168,9 +199,7 @@ char* ExecutionObject::GetInputBufferPtr() const
168 199
169size_t ExecutionObject::GetInputBufferSizeInBytes() const 200size_t ExecutionObject::GetInputBufferSizeInBytes() const
170{ 201{
171 const DeviceArgInfo& arg = pimpl_m->in_m.GetArg(); 202 return pimpl_m->in_size_m;
172 if (arg.ptr() == nullptr) return pimpl_m->in_size_m;
173 else return arg.size();
174} 203}
175 204
176char* ExecutionObject::GetOutputBufferPtr() const 205char* ExecutionObject::GetOutputBufferPtr() const
@@ -180,11 +209,7 @@ char* ExecutionObject::GetOutputBufferPtr() const
180 209
181size_t ExecutionObject::GetOutputBufferSizeInBytes() const 210size_t ExecutionObject::GetOutputBufferSizeInBytes() const
182{ 211{
183 const DeviceArgInfo& arg = pimpl_m->out_m.GetArg(); 212 return pimpl_m->out_size_m;
184 if (arg.ptr() == nullptr)
185 return pimpl_m->out_size_m;
186 else
187 return pimpl_m->shared_process_params_m.get()->bytesWritten;
188} 213}
189 214
190void ExecutionObject::SetFrameIndex(int idx) 215void ExecutionObject::SetFrameIndex(int idx)
@@ -199,8 +224,8 @@ int ExecutionObject::GetFrameIndex() const
199 224
200void ExecutionObject::SetInputOutputBuffer(const ArgInfo& in, const ArgInfo& out) 225void ExecutionObject::SetInputOutputBuffer(const ArgInfo& in, const ArgInfo& out)
201{ 226{
202 assert(in.ptr() != nullptr && in.size() > 0); 227 assert(in.ptr() != nullptr && in.size() >= pimpl_m->in_size_m);
203 assert(out.ptr() != nullptr && out.size() > 0); 228 assert(out.ptr() != nullptr && out.size() >= pimpl_m->out_size_m);
204 229
205 pimpl_m->in_m = IODeviceArgInfo(in); 230 pimpl_m->in_m = IODeviceArgInfo(in);
206 pimpl_m->out_m = IODeviceArgInfo(out); 231 pimpl_m->out_m = IODeviceArgInfo(out);
@@ -215,6 +240,7 @@ void ExecutionObject::SetInputOutputBuffer(const IODeviceArgInfo* in,
215 240
216bool ExecutionObject::ProcessFrameStartAsync() 241bool ExecutionObject::ProcessFrameStartAsync()
217{ 242{
243 assert(GetInputBufferPtr() != nullptr && GetOutputBufferPtr() != nullptr);
218 return pimpl_m->RunAsync(ExecutionObject::CallType::PROCESS); 244 return pimpl_m->RunAsync(ExecutionObject::CallType::PROCESS);
219} 245}
220 246
@@ -233,21 +259,26 @@ bool ExecutionObject::Wait (CallType ct)
233 return pimpl_m->Wait(ct); 259 return pimpl_m->Wait(ct);
234} 260}
235 261
236uint64_t ExecutionObject::GetProcessCycles() const 262bool ExecutionObject::AddCallback(CallType ct, void *user_data)
237{ 263{
238 uint8_t factor = 1; 264 return pimpl_m->AddCallback(ct, user_data);
239
240 // ARP32 running at half frequency of VCOP, multiply by 2 for VCOP cycles
241 if (pimpl_m->device_m->type() == CL_DEVICE_TYPE_CUSTOM)
242 factor = 2;
243
244 return pimpl_m->shared_process_params_m.get()->cycles * factor;
245} 265}
246 266
247float ExecutionObject::GetProcessTimeInMilliSeconds() const 267float ExecutionObject::GetProcessTimeInMilliSeconds() const
248{ 268{
249 float frequency = pimpl_m->device_m->GetFrequencyInMhz() * 1000000; 269 float frequency = pimpl_m->device_m->GetFrequencyInMhz() * 1000000;
250 return ((float)GetProcessCycles())/frequency * 1000; 270 return ((float)pimpl_m->GetProcessCycles()) / frequency * 1000;
271}
272
273float ExecutionObject::GetHostProcessTimeInMilliSeconds() const
274{
275 return pimpl_m->host_time_m;
276}
277
278void
279ExecutionObject::WriteLayerOutputsToFile(const std::string& filename_prefix) const
280{
281 pimpl_m->WriteLayerOutputsToFile(filename_prefix);
251} 282}
252 283
253const LayerOutput* ExecutionObject::GetOutputFromLayer( 284const LayerOutput* ExecutionObject::GetOutputFromLayer(
@@ -261,37 +292,25 @@ const LayerOutputs* ExecutionObject::GetOutputsFromAllLayers() const
261 return pimpl_m->GetOutputsFromAllLayers(); 292 return pimpl_m->GetOutputsFromAllLayers();
262} 293}
263 294
264// 295int ExecutionObject::GetLayersGroupId() const
265// Allocate an OpenCL buffer for TIDL layer output buffer metadata.
266// The device will populate metadata for every buffer that is used as an
267// output buffer by a layer.
268//
269void ExecutionObject::EnableOutputBufferTrace()
270{ 296{
271 pimpl_m->trace_buf_params_sz_m = (sizeof(OCL_TIDL_BufParams)* 297 return pimpl_m->layers_group_id_m;
272 pimpl_m->num_network_layers_m* 298}
273 TIDL_NUM_OUT_BUFS);
274
275 pimpl_m->trace_buf_params_m.reset(malloc_ddr<OCL_TIDL_BufParams>
276 (pimpl_m->trace_buf_params_sz_m));
277 299
278 // Device will update bufferId if there is valid data for the entry 300const std::string& ExecutionObject::GetDeviceName() const
279 OCL_TIDL_BufParams* bufferParams = pimpl_m->trace_buf_params_m.get(); 301{
280 for (uint32_t i = 0; i < pimpl_m->num_network_layers_m; i++) 302 return pimpl_m->device_name_m;
281 for (int j = 0; j < TIDL_NUM_OUT_BUFS; j++)
282 {
283 OCL_TIDL_BufParams *bufP =
284 &bufferParams[i*TIDL_NUM_OUT_BUFS+j];
285 bufP->bufferId = UINT_MAX;
286 }
287} 303}
288 304
289void 305void ExecutionObject::AcquireLock()
290ExecutionObject::WriteLayerOutputsToFile(const std::string& filename_prefix) const
291{ 306{
292 pimpl_m->WriteLayerOutputsToFile(filename_prefix); 307 pimpl_m->AcquireLock();
293} 308}
294 309
310void ExecutionObject::ReleaseLock()
311{
312 pimpl_m->ReleaseLock();
313}
295 314
296// 315//
297// Create a kernel to call the "initialize" function 316// Create a kernel to call the "initialize" function
@@ -343,6 +362,32 @@ ExecutionObject::Impl::SetupInitializeKernel(const DeviceArgInfo& create_arg,
343} 362}
344 363
345// 364//
365// Allocate an OpenCL buffer for TIDL layer output buffer metadata.
366// The device will populate metadata for every buffer that is used as an
367// output buffer by a layer. This needs to be done before setting up
368// process kernel.
369//
370void ExecutionObject::Impl::EnableOutputBufferTrace()
371{
372 trace_buf_params_sz_m = (sizeof(OCL_TIDL_BufParams)*
373 num_network_layers_m*
374 TIDL_NUM_OUT_BUFS);
375
376 trace_buf_params_m.reset(malloc_ddr<OCL_TIDL_BufParams>
377 (trace_buf_params_sz_m));
378
379 // Device will update bufferId if there is valid data for the entry
380 OCL_TIDL_BufParams* bufferParams = trace_buf_params_m.get();
381 for (uint32_t i = 0; i < num_network_layers_m; i++)
382 for (int j = 0; j < TIDL_NUM_OUT_BUFS; j++)
383 {
384 OCL_TIDL_BufParams *bufP =
385 &bufferParams[i*TIDL_NUM_OUT_BUFS+j];
386 bufP->bufferId = UINT_MAX;
387 }
388}
389
390//
346// Create a kernel to call the "process" function 391// Create a kernel to call the "process" function
347// 392//
348void 393void
@@ -514,10 +559,17 @@ bool ExecutionObject::Impl::RunAsync(CallType ct)
514 } 559 }
515 case CallType::PROCESS: 560 case CallType::PROCESS:
516 { 561 {
562 std::chrono::time_point<std::chrono::steady_clock> t1, t2;
563 t1 = std::chrono::steady_clock::now();
564
517 shared_process_params_m->frameIdx = current_frame_idx_m; 565 shared_process_params_m->frameIdx = current_frame_idx_m;
518 shared_process_params_m->bytesWritten = 0; 566 shared_process_params_m->bytesWritten = 0;
519 HostWriteNetInput(); 567 HostWriteNetInput();
520 k_process_m->RunAsync(); 568 k_process_m->RunAsync();
569
570 t2 = std::chrono::steady_clock::now();
571 std::chrono::duration<float> elapsed = t2 - t1;
572 host_time_m = elapsed.count() * 1000;
521 break; 573 break;
522 } 574 }
523 case CallType::CLEANUP: 575 case CallType::CLEANUP:
@@ -551,13 +603,20 @@ bool ExecutionObject::Impl::Wait(CallType ct)
551 } 603 }
552 case CallType::PROCESS: 604 case CallType::PROCESS:
553 { 605 {
554 bool has_work = k_process_m->Wait(); 606 float host_elapsed_ms = 0.0f;
607 bool has_work = k_process_m->Wait(&host_elapsed_ms);
555 if (has_work) 608 if (has_work)
556 { 609 {
557 if (shared_process_params_m->errorCode != OCL_TIDL_SUCCESS) 610 if (shared_process_params_m->errorCode != OCL_TIDL_SUCCESS)
558 throw Exception(shared_process_params_m->errorCode, 611 throw Exception(shared_process_params_m->errorCode,
559 __FILE__, __FUNCTION__, __LINE__); 612 __FILE__, __FUNCTION__, __LINE__);
613
614 std::chrono::time_point<std::chrono::steady_clock> t1, t2;
615 t1 = std::chrono::steady_clock::now();
560 HostReadNetOutput(); 616 HostReadNetOutput();
617 t2 = std::chrono::steady_clock::now();
618 std::chrono::duration<float> elapsed = t2 - t1;
619 host_time_m += elapsed.count() * 1000 + host_elapsed_ms;
561 } 620 }
562 621
563 return has_work; 622 return has_work;
@@ -574,6 +633,33 @@ bool ExecutionObject::Impl::Wait(CallType ct)
574 return false; 633 return false;
575} 634}
576 635
636bool ExecutionObject::Impl::AddCallback(CallType ct, void *user_data)
637{
638 switch (ct)
639 {
640 case CallType::PROCESS:
641 {
642 return k_process_m->AddCallback(user_data);
643 break;
644 }
645 default:
646 return false;
647 }
648
649 return false;
650}
651
652uint64_t ExecutionObject::Impl::GetProcessCycles() const
653{
654 uint8_t factor = 1;
655
656 // ARP32 running at half frequency of VCOP, multiply by 2 for VCOP cycles
657 if (device_m->type() == CL_DEVICE_TYPE_CUSTOM)
658 factor = 2;
659
660 return shared_process_params_m.get()->cycles * factor;
661}
662
577// 663//
578// Write the trace data to output files 664// Write the trace data to output files
579// 665//
@@ -697,3 +783,16 @@ LayerOutput::~LayerOutput()
697{ 783{
698 delete[] data_m; 784 delete[] data_m;
699} 785}
786
787void ExecutionObject::Impl::AcquireLock()
788{
789 std::unique_lock<std::mutex> lock(mutex_access_m);
790 cv_access_m.wait(lock, [this]{ return this->is_idle_m; });
791 is_idle_m = false;
792}
793
794void ExecutionObject::Impl::ReleaseLock()
795{
796 is_idle_m = true;
797 cv_access_m.notify_all();
798}
diff --git a/tidl_api/src/execution_object_pipeline.cpp b/tidl_api/src/execution_object_pipeline.cpp
new file mode 100644
index 0000000..ff84255
--- /dev/null
+++ b/tidl_api/src/execution_object_pipeline.cpp
@@ -0,0 +1,360 @@
1/******************************************************************************
2 * Copyright (c) 2018 Texas Instruments Incorporated - http://www.ti.com/
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Texas Instruments Incorporated nor the
13 * names of its contributors may be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
26 * THE POSSIBILITY OF SUCH DAMAGE.
27 *****************************************************************************/
28
29#include <assert.h>
30#include <mutex>
31#include <condition_variable>
32#include <chrono>
33#include "device_arginfo.h"
34#include "execution_object_pipeline.h"
35
36using namespace tidl;
37
// Private implementation of ExecutionObjectPipeline (reached through
// pimpl_m). Holds the ExecutionObjects that form the pipeline, the buffers
// handed from one ExecutionObject to the next, and the state used to signal
// pipeline completion to a caller blocked in Wait().
class ExecutionObjectPipeline::Impl
{
    public:
        //! Copies the list of ExecutionObjects and calls Initialize()
        Impl(std::vector<ExecutionObject*> &eos);
        //! Frees the intermediate buffers and all buffer wrappers
        ~Impl();

        //! Replace the first (pipeline input) and last (pipeline output)
        //! buffer wrappers
        void SetInputOutputBuffer(const ArgInfo &in, const ArgInfo &out);
        //! Start the first ExecutionObject on the current input
        bool RunAsyncStart();
        //! Finish the current ExecutionObject and start the next one;
        //! returns true if there are more ExecutionObjects to execute
        bool RunAsyncNext();
        //! Block until the last ExecutionObject has finished; returns false
        //! if no work was started
        bool Wait();

        // Trace related
        void WriteLayerOutputsToFile(const std::string& filename_prefix) const;
        const LayerOutput* GetOutputFromLayer(uint32_t layer_index,
                                              uint32_t output_index) const;
        const LayerOutputs* GetOutputsFromAllLayers() const;

        //! for pipelined execution
        std::vector<ExecutionObject*> eos_m;
        //! one more buffer than ExecutionObjects: iobufs_m[i] is the input
        //! and iobufs_m[i+1] the output of eos_m[i]
        std::vector<IODeviceArgInfo*> iobufs_m;

        //! device names of all ExecutionObjects, joined with "+"
        std::string device_name_m;

        //! current frame index
        int frame_idx_m;

        //! current execution object index
        uint32_t curr_eo_idx_m;

        // host time tracking: pipeline start to finish
        float host_time_m;

    private:
        //! @brief Initialize ExecutionObjectPipeline with given
        //! ExecutionObjects: check consecutive layersGroup, allocate memory
        void Initialize();

        // flag, mutex and cond var for signaling completion and waiting
        bool has_work_m, is_processed_m;
        std::mutex mutex_m;
        std::condition_variable cv_m;

        // host time tracking: pipeline start to finish
        std::chrono::time_point<std::chrono::steady_clock> start_m;
};
83
84ExecutionObjectPipeline::ExecutionObjectPipeline(
85 std::vector<ExecutionObject*> eos)
86{
87 pimpl_m = std::unique_ptr<Impl> { new Impl(eos) };
88}
89
90ExecutionObjectPipeline::Impl::Impl(std::vector<ExecutionObject *> &eos) :
91 eos_m(eos), has_work_m(false), is_processed_m(false)
92{
93 Initialize();
94}
95
// Pointer to implementation idiom: https://herbsutter.com/gotw/_100/
// Both unique_ptr and shared_ptr can be instantiated with an incomplete type,
// but unique_ptr's destructor requires a complete type in order to invoke
// delete. The destructor is therefore defaulted here, after the definition
// of Impl.
ExecutionObjectPipeline::~ExecutionObjectPipeline() = default;
100
101char* ExecutionObjectPipeline::GetInputBufferPtr() const
102{
103 return static_cast<char *>(pimpl_m->iobufs_m.front()->GetArg().ptr());
104}
105
106size_t ExecutionObjectPipeline::GetInputBufferSizeInBytes() const
107{
108 return pimpl_m->eos_m.front()->GetInputBufferSizeInBytes();
109}
110
111char* ExecutionObjectPipeline::GetOutputBufferPtr() const
112{
113 return static_cast<char *>(pimpl_m->iobufs_m.back()->GetArg().ptr());
114}
115
116size_t ExecutionObjectPipeline::GetOutputBufferSizeInBytes() const
117{
118 return pimpl_m->eos_m.back()->GetOutputBufferSizeInBytes();
119}
120
121void ExecutionObjectPipeline::SetInputOutputBuffer(const ArgInfo& in,
122 const ArgInfo& out)
123{
124 assert(in.ptr() != nullptr && in.size() >= GetInputBufferSizeInBytes());
125 assert(out.ptr() != nullptr && out.size() >= GetOutputBufferSizeInBytes());
126 pimpl_m->SetInputOutputBuffer(in, out);
127}
128
129void ExecutionObjectPipeline::SetFrameIndex(int idx)
130{
131 pimpl_m->frame_idx_m = idx;
132}
133
134int ExecutionObjectPipeline::GetFrameIndex() const
135{
136 return pimpl_m->frame_idx_m;
137}
138
139bool ExecutionObjectPipeline::ProcessFrameStartAsync()
140{
141 assert(GetInputBufferPtr() != nullptr && GetOutputBufferPtr() != nullptr);
142 bool st = pimpl_m->RunAsyncStart();
143 if (st)
144 st = pimpl_m->eos_m[0]->AddCallback(ExecutionObject::CallType::PROCESS,
145 this);
146 return st;
147}
148
149bool ExecutionObjectPipeline::ProcessFrameWait()
150{
151 return pimpl_m->Wait();
152}
153
154void CallbackWrapper(void *user_data)
155{
156 ((ExecutionObjectPipeline *) user_data)->RunAsyncNext();
157}
158
159void ExecutionObjectPipeline::RunAsyncNext()
160{
161 bool has_next = pimpl_m->RunAsyncNext();
162 if (has_next)
163 pimpl_m->eos_m[pimpl_m->curr_eo_idx_m]->AddCallback(
164 ExecutionObject::CallType::PROCESS, this);
165}
166
167float ExecutionObjectPipeline::GetProcessTimeInMilliSeconds() const
168{
169 float total = 0.0f;
170 for (auto eo : pimpl_m->eos_m)
171 total += eo->GetProcessTimeInMilliSeconds();
172 return total;
173}
174
175float ExecutionObjectPipeline::GetHostProcessTimeInMilliSeconds() const
176{
177 return pimpl_m->host_time_m;
178}
179
180const std::string& ExecutionObjectPipeline::GetDeviceName() const
181{
182 return pimpl_m->device_name_m;
183}
184
185void
186ExecutionObjectPipeline::WriteLayerOutputsToFile(
187 const std::string& filename_prefix) const
188{
189 pimpl_m->WriteLayerOutputsToFile(filename_prefix);
190}
191
192const LayerOutput*
193ExecutionObjectPipeline::GetOutputFromLayer(uint32_t layer_index,
194 uint32_t output_index) const
195{
196 return pimpl_m->GetOutputFromLayer(layer_index, output_index);
197}
198
199const LayerOutputs*
200ExecutionObjectPipeline::GetOutputsFromAllLayers() const
201{
202 return pimpl_m->GetOutputsFromAllLayers();
203}
204
205
206/// Impl methods start here
207
208
209static
210void* AllocateMem(size_t size)
211{
212 if (size == 0) return nullptr;
213 void *ptr = malloc(size);
214 if (ptr == nullptr)
215 throw Exception("Out of memory, ExecutionObjectPipeline malloc failed",
216 __FILE__, __FUNCTION__, __LINE__);
217 return ptr;
218}
219
220void ExecutionObjectPipeline::Impl::Initialize()
221{
222 // Check consecutive layersGroups to form a pipeline
223 int prev_group = 0;
224 for (auto eo : eos_m)
225 {
226 int group = eo->GetLayersGroupId();
227 if (prev_group != 0 && group != prev_group + 1)
228 throw Exception(
229 "Non-consecutive layersGroupIds in ExecutionObjectPipeline",
230 __FILE__, __FUNCTION__, __LINE__);
231 prev_group = group;
232 }
233
234 for (auto eo : eos_m)
235 device_name_m += eo->GetDeviceName() + "+";
236 device_name_m.resize(device_name_m.size() - 1);
237
238 // Allocate input and output memory for EOs/layersGroups
239 // Note that i-th EO's output buffer is the same as (i+1)-th EO's input
240 // So, if n EOs, then (n+1) buffers: b EO b EO b EO b ... EO b
241 // User must set the first input buffer and the last output buffer
242 size_t size;
243 ArgInfo in(nullptr, 0);
244 iobufs_m.push_back(new IODeviceArgInfo(in));
245 for (auto eo : eos_m)
246 {
247 if (eo != eos_m.back())
248 size = eo->GetOutputBufferSizeInBytes();
249 else
250 size = 0;
251
252 void *ptr = AllocateMem(size);
253 ArgInfo out(ptr, size);
254 iobufs_m.push_back(new IODeviceArgInfo(out));
255 }
256}
257
258ExecutionObjectPipeline::Impl::~Impl()
259{
260 int num_iobufs = iobufs_m.size();
261 for (int i = 0; i < num_iobufs; i++)
262 {
263 if (! (i == 0 || i == num_iobufs-1))
264 free(iobufs_m[i]->GetArg().ptr());
265 delete iobufs_m[i];
266 }
267}
268
269void ExecutionObjectPipeline::Impl::SetInputOutputBuffer(const ArgInfo &in,
270 const ArgInfo &out)
271{
272 delete iobufs_m.front();
273 delete iobufs_m.back();
274 iobufs_m.front() = new IODeviceArgInfo(in);
275 iobufs_m.back() = new IODeviceArgInfo(out);
276}
277
278bool ExecutionObjectPipeline::Impl::RunAsyncStart()
279{
280 start_m = std::chrono::steady_clock::now();
281 has_work_m = true;
282 is_processed_m = false;
283 host_time_m = 0.0f;
284 curr_eo_idx_m = 0;
285 eos_m[0]->AcquireLock();
286 eos_m[0]->SetInputOutputBuffer(iobufs_m[0], iobufs_m[1]);
287 return eos_m[0]->ProcessFrameStartAsync();
288}
289
290// returns true if we have more EOs to execute
291bool ExecutionObjectPipeline::Impl::RunAsyncNext()
292{
293 eos_m[curr_eo_idx_m]->ProcessFrameWait();
294 eos_m[curr_eo_idx_m]->ReleaseLock();
295 curr_eo_idx_m += 1;
296 if (curr_eo_idx_m < eos_m.size())
297 {
298 eos_m[curr_eo_idx_m]->AcquireLock();
299 eos_m[curr_eo_idx_m]->SetInputOutputBuffer(iobufs_m[curr_eo_idx_m],
300 iobufs_m[curr_eo_idx_m+1]);
301 eos_m[curr_eo_idx_m]->ProcessFrameStartAsync();
302 return true;
303 }
304 else
305 {
306 std::chrono::duration<float> elapsed = std::chrono::steady_clock::now()
307 - start_m;
308 host_time_m = elapsed.count() * 1000; // seconds to milliseconds
309 is_processed_m = true;
310 cv_m.notify_all();
311 return false;
312 }
313}
314
315bool ExecutionObjectPipeline::Impl::Wait()
316{
317 if (! has_work_m) return false;
318
319 std::unique_lock<std::mutex> lock(mutex_m);
320 cv_m.wait(lock, [this]{ return this->is_processed_m; });
321 has_work_m = false;
322 return true;
323}
324
325void
326ExecutionObjectPipeline::Impl::WriteLayerOutputsToFile(
327 const std::string& filename_prefix) const
328{
329 for (auto eo : eos_m)
330 eo->WriteLayerOutputsToFile(filename_prefix);
331}
332
333const LayerOutput*
334ExecutionObjectPipeline::Impl::GetOutputFromLayer(uint32_t layer_index,
335 uint32_t output_index) const
336{
337 const LayerOutput* lo = nullptr;
338 for (auto eo : eos_m)
339 {
340 lo = eo->GetOutputFromLayer(layer_index, output_index);
341 if (lo != nullptr) break;
342 }
343 return lo;
344}
345
346const LayerOutputs*
347ExecutionObjectPipeline::Impl::GetOutputsFromAllLayers() const
348{
349 LayerOutputs *all = new LayerOutputs;
350 for (auto eo : eos_m)
351 {
352 LayerOutputs *los = const_cast<LayerOutputs *>(
353 eo->GetOutputsFromAllLayers());
354 for (auto& lo : *los)
355 all->push_back(std::unique_ptr<const LayerOutput>{ lo.release() });
356 delete los;
357 }
358 return all;
359}
360
diff --git a/tidl_api/src/executor.cpp b/tidl_api/src/executor.cpp
index b644728..914c78a 100644
--- a/tidl_api/src/executor.cpp
+++ b/tidl_api/src/executor.cpp
@@ -96,6 +96,12 @@ const ExecutionObjects& Executor::GetExecutionObjects() const
96 return pimpl_m->execution_objects_m; 96 return pimpl_m->execution_objects_m;
97} 97}
98 98
99ExecutionObject* Executor::operator[](uint32_t index) const
100{
101 assert(index < pimpl_m->execution_objects_m.size());
102 return pimpl_m->execution_objects_m[index].get();
103}
104
99bool ExecutorImpl::Initialize(const Configuration& configuration) 105bool ExecutorImpl::Initialize(const Configuration& configuration)
100{ 106{
101 configuration_m = configuration; 107 configuration_m = configuration;
@@ -145,13 +151,11 @@ bool ExecutorImpl::Initialize(const Configuration& configuration)
145 {new ExecutionObject(device_m.get(), index, 151 {new ExecutionObject(device_m.get(), index,
146 create_arg, param_heap_arg, 152 create_arg, param_heap_arg,
147 configuration_m.EXTMEM_HEAP_SIZE, 153 configuration_m.EXTMEM_HEAP_SIZE,
154 layers_group_id_m,
155 configuration_m.enableOutputTrace,
148 configuration_m.enableInternalInput)} ); 156 configuration_m.enableInternalInput)} );
149 } 157 }
150 158
151 if (configuration_m.enableOutputTrace)
152 for (auto &eo : execution_objects_m)
153 eo->EnableOutputBufferTrace();
154
155 for (auto &eo : execution_objects_m) 159 for (auto &eo : execution_objects_m)
156 eo->RunAsync(ExecutionObject::CallType::INIT); 160 eo->RunAsync(ExecutionObject::CallType::INIT);
157 161
@@ -294,4 +298,3 @@ const char* Exception::what() const noexcept
294{ 298{
295 return message_m.c_str(); 299 return message_m.c_str();
296} 300}
297
diff --git a/tidl_api/src/ocl_device.cpp b/tidl_api/src/ocl_device.cpp
index fba4f94..b3eaf36 100644
--- a/tidl_api/src/ocl_device.cpp
+++ b/tidl_api/src/ocl_device.cpp
@@ -91,7 +91,7 @@ DspDevice::DspDevice(const DeviceIds& ids, const std::string &binary_filename):
91 // Queue 0 on device 0 91 // Queue 0 on device 0
92 queue_m[0] = clCreateCommandQueue(context_m, 92 queue_m[0] = clCreateCommandQueue(context_m,
93 device_ids[0], 93 device_ids[0],
94 0, 94 CL_QUEUE_PROFILING_ENABLE,
95 &errcode); 95 &errcode);
96 errorCheck(errcode, __LINE__); 96 errorCheck(errcode, __LINE__);
97 BuildProgramFromBinary(binary_filename, device_ids, 1); 97 BuildProgramFromBinary(binary_filename, device_ids, 1);
@@ -139,7 +139,7 @@ DspDevice::DspDevice(const DeviceIds& ids, const std::string &binary_filename):
139 int index = static_cast<int>(id); 139 int index = static_cast<int>(id);
140 queue_m[index] = clCreateCommandQueue(context_m, 140 queue_m[index] = clCreateCommandQueue(context_m,
141 sub_devices[index], 141 sub_devices[index],
142 0, 142 CL_QUEUE_PROFILING_ENABLE,
143 &errcode); 143 &errcode);
144 errorCheck(errcode, __LINE__); 144 errorCheck(errcode, __LINE__);
145 } 145 }
@@ -187,7 +187,7 @@ EveDevice::EveDevice(const DeviceIds& ids, const std::string &kernel_names):
187 int index = static_cast<int>(id); 187 int index = static_cast<int>(id);
188 queue_m[index] = clCreateCommandQueue(context_m, 188 queue_m[index] = clCreateCommandQueue(context_m,
189 all_device_ids[index], 189 all_device_ids[index],
190 0, 190 CL_QUEUE_PROFILING_ENABLE,
191 &errcode); 191 &errcode);
192 errorCheck(errcode, __LINE__); 192 errorCheck(errcode, __LINE__);
193 } 193 }
@@ -317,7 +317,7 @@ Kernel& Kernel::RunAsync()
317} 317}
318 318
319 319
320bool Kernel::Wait() 320bool Kernel::Wait(float *host_elapsed_ms)
321{ 321{
322 // Wait called without a corresponding RunAsync 322 // Wait called without a corresponding RunAsync
323 if (!is_running_m) 323 if (!is_running_m)
@@ -326,6 +326,17 @@ bool Kernel::Wait()
326 TRACE::print("\tKernel: waiting...\n"); 326 TRACE::print("\tKernel: waiting...\n");
327 cl_int ret = clWaitForEvents(1, &event_m); 327 cl_int ret = clWaitForEvents(1, &event_m);
328 errorCheck(ret, __LINE__); 328 errorCheck(ret, __LINE__);
329
330 if (host_elapsed_ms != nullptr)
331 {
332 cl_ulong t_que, t_end;
333 clGetEventProfilingInfo(event_m, CL_PROFILING_COMMAND_QUEUED,
334 sizeof(cl_ulong), &t_que, nullptr);
335 clGetEventProfilingInfo(event_m, CL_PROFILING_COMMAND_END,
336 sizeof(cl_ulong), &t_end, nullptr);
337 *host_elapsed_ms = (t_end - t_que) / 1.0e6; // nano to milli seconds
338 }
339
329 ret = clReleaseEvent(event_m); 340 ret = clReleaseEvent(event_m);
330 errorCheck(ret, __LINE__); 341 errorCheck(ret, __LINE__);
331 TRACE::print("\tKernel: finished execution\n"); 342 TRACE::print("\tKernel: finished execution\n");
@@ -334,6 +345,22 @@ bool Kernel::Wait()
334 return true; 345 return true;
335} 346}
336 347
348extern void CallbackWrapper(void *user_data) __attribute__((weak));
349
350static
351void EventCallback(cl_event event, cl_int exec_status, void *user_data)
352{
353 if (exec_status != CL_SUCCESS || user_data == nullptr) return;
354 if (CallbackWrapper) CallbackWrapper(user_data);
355}
356
357bool Kernel::AddCallback(void *user_data)
358{
359 if (! is_running_m) return false;
360 return clSetEventCallback(event_m, CL_COMPLETE, EventCallback, user_data)
361 == CL_SUCCESS;
362}
363
337Kernel::~Kernel() 364Kernel::~Kernel()
338{ 365{
339 for (auto b : buffers_m) 366 for (auto b : buffers_m)
diff --git a/tidl_api/src/ocl_device.h b/tidl_api/src/ocl_device.h
index 6e80166..04c5db6 100644
--- a/tidl_api/src/ocl_device.h
+++ b/tidl_api/src/ocl_device.h
@@ -74,6 +74,8 @@ class Device
74 74
75 static uint32_t GetNumDevices(DeviceType device_type); 75 static uint32_t GetNumDevices(DeviceType device_type);
76 76
77 virtual std::string GetDeviceName() = 0;
78
77 protected: 79 protected:
78 80
79 static const int MAX_DEVICES = 4; 81 static const int MAX_DEVICES = 4;
@@ -101,6 +103,8 @@ class DspDevice: public Device
101 DspDevice(const DspDevice&) = delete; 103 DspDevice(const DspDevice&) = delete;
102 DspDevice& operator=(const DspDevice&) = delete; 104 DspDevice& operator=(const DspDevice&) = delete;
103 105
106 virtual std::string GetDeviceName() { return "DSP"; }
107
104 protected: 108 protected:
105 bool BuildProgramFromBinary(const std::string &binary_filename, 109 bool BuildProgramFromBinary(const std::string &binary_filename,
106 cl_device_id device_ids[], 110 cl_device_id device_ids[],
@@ -117,6 +121,8 @@ class EveDevice : public Device
117 EveDevice(const EveDevice&) = delete; 121 EveDevice(const EveDevice&) = delete;
118 EveDevice& operator=(const EveDevice&) = delete; 122 EveDevice& operator=(const EveDevice&) = delete;
119 123
124 virtual std::string GetDeviceName() { return "EVE"; }
125
120 protected: 126 protected:
121 bool BuildProgramFromBinary(const std::string &kernel_names, 127 bool BuildProgramFromBinary(const std::string &kernel_names,
122 cl_device_id device_ids[], 128 cl_device_id device_ids[],
@@ -137,7 +143,8 @@ class Kernel
137 ~Kernel(); 143 ~Kernel();
138 144
139 Kernel& RunAsync(); 145 Kernel& RunAsync();
140 bool Wait(); 146 bool Wait(float *host_elapsed_ms = nullptr);
147 bool AddCallback(void *user_data);
141 148
142 private: 149 private:
143 cl_kernel kernel_m; 150 cl_kernel kernel_m;