diff options
author | Yuan Zhao | 2018-08-09 23:42:42 -0500 |
---|---|---|
committer | Yuan Zhao | 2018-08-20 10:57:44 -0500 |
commit | 1a42784dc57d81735218ec2dc85172a1ed4e8181 (patch) | |
tree | 4d7dea04882465dbb6c95f50582102505623be6f /tidl_api | |
parent | 36786d7afca8c1906293854d1e6243bb961c712f (diff) | |
download | tidl-api-1a42784dc57d81735218ec2dc85172a1ed4e8181.tar.gz tidl-api-1a42784dc57d81735218ec2dc85172a1ed4e8181.tar.xz tidl-api-1a42784dc57d81735218ec2dc85172a1ed4e8181.zip |
ExecutionObjectPipeline for executing layersGroups
- Add top level ExecutionObjectPipeline class to execute multiple
layersGroups.
- An ExecutionObjectPipeline is constructed from multiple
ExecutionObjects, each ExecutionObject executes one layersGroup
in the network, together they execute consecutive layersGroups.
- Same look and feel as ExecutionObject, e.g. ProcessFrameStartAsync,
ProcessFrameWait, GetInputBufferPointer, GetOutputBufferPointer
- MCT-1017, MCT-1029
Diffstat (limited to 'tidl_api')
-rw-r--r-- | tidl_api/Makefile | 6 | ||||
-rw-r--r-- | tidl_api/inc/execution_object.h | 71 | ||||
-rw-r--r-- | tidl_api/inc/execution_object_internal.h | 119 | ||||
-rw-r--r-- | tidl_api/inc/execution_object_pipeline.h | 151 | ||||
-rw-r--r-- | tidl_api/inc/executor.h | 9 | ||||
-rw-r--r-- | tidl_api/src/execution_object.cpp | 209 | ||||
-rw-r--r-- | tidl_api/src/execution_object_pipeline.cpp | 360 | ||||
-rw-r--r-- | tidl_api/src/executor.cpp | 13 | ||||
-rw-r--r-- | tidl_api/src/ocl_device.cpp | 35 | ||||
-rw-r--r-- | tidl_api/src/ocl_device.h | 9 |
10 files changed, 882 insertions, 100 deletions
diff --git a/tidl_api/Makefile b/tidl_api/Makefile index 05a3704..3fc6a2c 100644 --- a/tidl_api/Makefile +++ b/tidl_api/Makefile | |||
@@ -39,7 +39,8 @@ AR = ar | |||
39 | 39 | ||
40 | 40 | ||
41 | SRCS = ocl_device.cpp configuration_parser.cpp configuration.cpp\ | 41 | SRCS = ocl_device.cpp configuration_parser.cpp configuration.cpp\ |
42 | executor.cpp execution_object.cpp trace.cpp util.cpp | 42 | executor.cpp execution_object.cpp trace.cpp util.cpp \ |
43 | execution_object_pipeline.cpp | ||
43 | SRCS_IMGUTIL = imgutil.cpp | 44 | SRCS_IMGUTIL = imgutil.cpp |
44 | 45 | ||
45 | OBJS = $(SRCS:.cpp=.o) | 46 | OBJS = $(SRCS:.cpp=.o) |
@@ -53,8 +54,7 @@ HOST_OBJ_IMGUTIL_FILES = $(addprefix obj/,$(OBJS_IMGUTIL)) | |||
53 | HEADERS = src/common_defines.h src/executor_impl.h src/ocl_device.h | 54 | HEADERS = src/common_defines.h src/executor_impl.h src/ocl_device.h |
54 | HEADERS += src/parameters.h src/tidl_create_params.h src/trace.h src/util.h | 55 | HEADERS += src/parameters.h src/tidl_create_params.h src/trace.h src/util.h |
55 | HEADERS += inc/configuration.h inc/execution_object.h inc/executor.h | 56 | HEADERS += inc/configuration.h inc/execution_object.h inc/executor.h |
56 | HEADERS += inc/imgutil.h src/device_arginfo.h | 57 | HEADERS += inc/imgutil.h src/device_arginfo.h inc/execution_object_pipeline.h |
57 | |||
58 | 58 | ||
59 | ifeq ($(BUILD), debug) | 59 | ifeq ($(BUILD), debug) |
60 | CXXFLAGS += -Og -g -ggdb | 60 | CXXFLAGS += -Og -g -ggdb |
diff --git a/tidl_api/inc/execution_object.h b/tidl_api/inc/execution_object.h index e78ad2e..c1d86fc 100644 --- a/tidl_api/inc/execution_object.h +++ b/tidl_api/inc/execution_object.h | |||
@@ -31,6 +31,7 @@ | |||
31 | #pragma once | 31 | #pragma once |
32 | 32 | ||
33 | #include <memory> | 33 | #include <memory> |
34 | #include "execution_object_internal.h" | ||
34 | 35 | ||
35 | namespace tidl { | 36 | namespace tidl { |
36 | 37 | ||
@@ -39,13 +40,12 @@ class Device; | |||
39 | class LayerOutput; | 40 | class LayerOutput; |
40 | class IODeviceArgInfo; | 41 | class IODeviceArgInfo; |
41 | 42 | ||
42 | typedef std::vector<std::unique_ptr<const LayerOutput>> LayerOutputs; | ||
43 | 43 | ||
44 | /*! @class ExecutionObject | 44 | /*! @class ExecutionObject |
45 | @brief Runs the TIDL network on an OpenCL device | 45 | @brief Runs the TIDL network on an OpenCL device |
46 | */ | 46 | */ |
47 | 47 | ||
48 | class ExecutionObject | 48 | class ExecutionObject : public ExecutionObjectInternalInterface |
49 | { | 49 | { |
50 | public: | 50 | public: |
51 | 51 | ||
@@ -55,6 +55,8 @@ class ExecutionObject | |||
55 | const ArgInfo& create_arg, | 55 | const ArgInfo& create_arg, |
56 | const ArgInfo& param_heap_arg, | 56 | const ArgInfo& param_heap_arg, |
57 | size_t extmem_heap_size, | 57 | size_t extmem_heap_size, |
58 | int layersGroupId, | ||
59 | bool output_trace, | ||
58 | bool internal_input); | 60 | bool internal_input); |
59 | //! @private | 61 | //! @private |
60 | ~ExecutionObject(); | 62 | ~ExecutionObject(); |
@@ -62,52 +64,56 @@ class ExecutionObject | |||
62 | //! Specify the input and output buffers used by the EO | 64 | //! Specify the input and output buffers used by the EO |
63 | //! @param in buffer used for input. | 65 | //! @param in buffer used for input. |
64 | //! @param out buffer used for output. | 66 | //! @param out buffer used for output. |
65 | void SetInputOutputBuffer (const ArgInfo& in, const ArgInfo& out); | 67 | void SetInputOutputBuffer(const ArgInfo& in, |
68 | const ArgInfo& out) override; | ||
66 | 69 | ||
67 | //! Returns a pointer to the input buffer set via SetInputOutputBuffer | 70 | //! Returns a pointer to the input buffer set via SetInputOutputBuffer |
68 | char* GetInputBufferPtr() const; | 71 | char* GetInputBufferPtr() const override; |
69 | 72 | ||
70 | //! Returns size of the input buffer | 73 | //! Returns size of the input buffer |
71 | size_t GetInputBufferSizeInBytes() const; | 74 | size_t GetInputBufferSizeInBytes() const override; |
75 | |||
76 | //! Returns a pointer to the output buffer | ||
77 | char* GetOutputBufferPtr() const override; | ||
78 | |||
79 | //! Returns size of the output buffer | ||
80 | size_t GetOutputBufferSizeInBytes() const override; | ||
72 | 81 | ||
73 | //! @brief Set the frame index of the frame currently processed by the | 82 | //! @brief Set the frame index of the frame currently processed by the |
74 | //! ExecutionObject. Used for trace/debug messages | 83 | //! ExecutionObject. Used for trace/debug messages |
75 | //! @param idx index of the frame | 84 | //! @param idx index of the frame |
76 | void SetFrameIndex(int idx); | 85 | void SetFrameIndex(int idx) override; |
77 | 86 | ||
78 | //! Returns the index of a frame being processed (set by SetFrameIndex) | 87 | //! Returns the index of a frame being processed (set by SetFrameIndex) |
79 | int GetFrameIndex() const; | 88 | int GetFrameIndex() const override; |
80 | |||
81 | //! Returns a pointer to the output buffer | ||
82 | char* GetOutputBufferPtr() const; | ||
83 | |||
84 | //! Returns the number of bytes written to the output buffer | ||
85 | size_t GetOutputBufferSizeInBytes() const; | ||
86 | 89 | ||
87 | //! @brief Start processing a frame. The call is asynchronous and returns | 90 | //! @brief Start processing a frame. The call is asynchronous and |
88 | //! immediately. Use ExecutionObject::ProcessFrameWait to wait | 91 | //! returns immediately. Use ExecutionObject::ProcessFrameWait to wait |
89 | bool ProcessFrameStartAsync(); | 92 | bool ProcessFrameStartAsync() override; |
90 | 93 | ||
91 | //! Wait for the execution object to complete processing a frame | 94 | //! Wait for the execution object to complete processing a frame |
92 | //! @return false if ExecutionObject::ProcessFrameWait was called | 95 | //! @return false if ExecutionObject::ProcessFrameWait was called |
93 | //! without a corresponding call to | 96 | //! without a corresponding call to |
94 | //! ExecutionObject::ProcessFrameStartAsync. | 97 | //! ExecutionObject::ProcessFrameStartAsync. |
95 | bool ProcessFrameWait(); | 98 | bool ProcessFrameWait() override; |
96 | |||
97 | //! @brief return the number of cycles taken *on the device* to | ||
98 | //! execute the process call | ||
99 | //! @return Number of cycles to process a frame on the device. | ||
100 | uint64_t GetProcessCycles() const; | ||
101 | 99 | ||
102 | //! @brief return the number of milliseconds taken *on the device* to | 100 | //! @brief return the number of milliseconds taken *on the device* to |
103 | //! execute the process call | 101 | //! execute the process call |
104 | //! @return Number of milliseconds to process a frame on the device. | 102 | //! @return Number of milliseconds to process a frame on the device. |
105 | float GetProcessTimeInMilliSeconds() const; | 103 | float GetProcessTimeInMilliSeconds() const override; |
104 | |||
105 | //! @brief return the number of milliseconds taken *on the host* to | ||
106 | //! execute the process call | ||
107 | //! @return Number of milliseconds to process a frame on the host. | ||
108 | float GetHostProcessTimeInMilliSeconds() const override; | ||
109 | |||
110 | //! Returns the device name that the ExecutionObject runs on | ||
111 | const std::string& GetDeviceName() const override; | ||
106 | 112 | ||
107 | //! Write the output buffer for each layer to a file | 113 | //! Write the output buffer for each layer to a file |
108 | //! <filename_prefix>_<ID>_HxW.bin | 114 | //! \<filename_prefix>_<ID>_HxW.bin |
109 | void WriteLayerOutputsToFile(const std::string& filename_prefix= | 115 | void WriteLayerOutputsToFile(const std::string& filename_prefix= |
110 | "trace_dump_") const; | 116 | "trace_dump_") const override; |
111 | 117 | ||
112 | //! Returns a LayerOutput object corresponding to a layer. | 118 | //! Returns a LayerOutput object corresponding to a layer. |
113 | //! Caller is responsible for deleting the LayerOutput object. | 119 | //! Caller is responsible for deleting the LayerOutput object. |
@@ -116,10 +122,13 @@ class ExecutionObject | |||
116 | //! @param output_index The output index of the buffer for a given | 122 | //! @param output_index The output index of the buffer for a given |
117 | //! layer. Defaults to 0. | 123 | //! layer. Defaults to 0. |
118 | const LayerOutput* GetOutputFromLayer(uint32_t layer_index, | 124 | const LayerOutput* GetOutputFromLayer(uint32_t layer_index, |
119 | uint32_t output_index=0) const; | 125 | uint32_t output_index=0) const override; |
120 | 126 | ||
121 | //! Get output buffers from all layers | 127 | //! Get output buffers from all layers |
122 | const LayerOutputs* GetOutputsFromAllLayers() const; | 128 | const LayerOutputs* GetOutputsFromAllLayers() const override; |
129 | |||
130 | //! Returns the layersGroupId that the ExecutionObject is processing | ||
131 | int GetLayersGroupId() const; | ||
123 | 132 | ||
124 | //! @private | 133 | //! @private |
125 | // Used by the Executor | 134 | // Used by the Executor |
@@ -127,12 +136,16 @@ class ExecutionObject | |||
127 | bool RunAsync(CallType ct); | 136 | bool RunAsync(CallType ct); |
128 | bool Wait (CallType ct); | 137 | bool Wait (CallType ct); |
129 | 138 | ||
139 | //! @private | ||
140 | // Used by the ExecutionObjectPipeline | ||
141 | bool AddCallback(CallType ct, void *user_data); | ||
142 | void AcquireLock(); | ||
143 | void ReleaseLock(); | ||
144 | |||
130 | ExecutionObject() = delete; | 145 | ExecutionObject() = delete; |
131 | ExecutionObject(const ExecutionObject&) = delete; | 146 | ExecutionObject(const ExecutionObject&) = delete; |
132 | ExecutionObject& operator=(const ExecutionObject&) = delete; | 147 | ExecutionObject& operator=(const ExecutionObject&) = delete; |
133 | 148 | ||
134 | void EnableOutputBufferTrace(); | ||
135 | |||
136 | //! @private | 149 | //! @private |
137 | void SetInputOutputBuffer(const IODeviceArgInfo* in, | 150 | void SetInputOutputBuffer(const IODeviceArgInfo* in, |
138 | const IODeviceArgInfo* out); | 151 | const IODeviceArgInfo* out); |
diff --git a/tidl_api/inc/execution_object_internal.h b/tidl_api/inc/execution_object_internal.h new file mode 100644 index 0000000..816da94 --- /dev/null +++ b/tidl_api/inc/execution_object_internal.h | |||
@@ -0,0 +1,119 @@ | |||
1 | /****************************************************************************** | ||
2 | * Copyright (c) 2018 Texas Instruments Incorporated - http://www.ti.com/ | ||
3 | * All rights reserved. | ||
4 | * | ||
5 | * Redistribution and use in source and binary forms, with or without | ||
6 | * modification, are permitted provided that the following conditions are met: | ||
7 | * * Redistributions of source code must retain the above copyright | ||
8 | * notice, this list of conditions and the following disclaimer. | ||
9 | * * Redistributions in binary form must reproduce the above copyright | ||
10 | * notice, this list of conditions and the following disclaimer in the | ||
11 | * documentation and/or other materials provided with the distribution. | ||
12 | * * Neither the name of Texas Instruments Incorporated nor the | ||
13 | * names of its contributors may be used to endorse or promote products | ||
14 | * derived from this software without specific prior written permission. | ||
15 | * | ||
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | ||
17 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
19 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | ||
20 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | ||
21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | ||
22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | ||
23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | ||
24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF | ||
26 | * THE POSSIBILITY OF SUCH DAMAGE. | ||
27 | *****************************************************************************/ | ||
28 | |||
29 | /*! @file execution_object_internal.h */ | ||
30 | |||
31 | #pragma once | ||
32 | |||
33 | namespace tidl { | ||
34 | |||
35 | class LayerOutput; | ||
36 | |||
37 | typedef std::vector<std::unique_ptr<const LayerOutput>> LayerOutputs; | ||
38 | |||
39 | /*! @cond HIDDEN_SYMBOLS | ||
40 | @class ExecutionObjectInternalInterface | ||
41 | @brief Internal interface for running the TIDL network on OpenCL devices | ||
42 | Do not use this internal class directly. | ||
43 | Please use ExecutionObject or ExecutionObjectPipeline instead. | ||
44 | */ | ||
45 | class ExecutionObjectInternalInterface | ||
46 | { | ||
47 | public: | ||
48 | virtual ~ExecutionObjectInternalInterface() {}; | ||
49 | |||
50 | //! Specify the input and output buffers used by the EO | ||
51 | //! @param in buffer used for input. | ||
52 | //! @param out buffer used for output. | ||
53 | virtual void SetInputOutputBuffer(const ArgInfo& in, | ||
54 | const ArgInfo& out) =0; | ||
55 | |||
56 | //! Returns a pointer to the input buffer set via SetInputOutputBuffer | ||
57 | virtual char* GetInputBufferPtr() const =0; | ||
58 | |||
59 | //! Returns size of the input buffer | ||
60 | virtual size_t GetInputBufferSizeInBytes() const =0; | ||
61 | |||
62 | //! Returns a pointer to the output buffer | ||
63 | virtual char* GetOutputBufferPtr() const =0; | ||
64 | |||
65 | //! Returns size of the output buffer | ||
66 | virtual size_t GetOutputBufferSizeInBytes() const =0; | ||
67 | |||
68 | //! @brief Set the frame index of the frame currently processed by the | ||
69 | //! ExecutionObject. Used for trace/debug messages | ||
70 | //! @param idx index of the frame | ||
71 | virtual void SetFrameIndex(int idx) =0; | ||
72 | |||
73 | //! Returns the index of a frame being processed (set by SetFrameIndex) | ||
74 | virtual int GetFrameIndex() const =0; | ||
75 | |||
76 | //! @brief Start processing a frame. The call is asynchronous and returns | ||
77 | //! immediately. Use ExecutionObject::ProcessFrameWait to wait | ||
78 | virtual bool ProcessFrameStartAsync() =0; | ||
79 | |||
80 | //! Wait for the execution object to complete processing a frame | ||
81 | //! @return false if ExecutionObject::ProcessFrameWait was called | ||
82 | //! without a corresponding call to | ||
83 | //! ExecutionObject::ProcessFrameStartAsync. | ||
84 | virtual bool ProcessFrameWait() =0; | ||
85 | |||
86 | //! @brief return the number of milliseconds taken *on the device* to | ||
87 | //! execute the process call | ||
88 | //! @return Number of milliseconds to process a frame on the device. | ||
89 | virtual float GetProcessTimeInMilliSeconds() const =0; | ||
90 | |||
91 | //! @brief return the number of milliseconds taken *on the host* to | ||
92 | //! execute the process call | ||
93 | //! @return Number of milliseconds to process a frame on the host. | ||
94 | virtual float GetHostProcessTimeInMilliSeconds() const =0; | ||
95 | |||
96 | //! Returns the device name that the ExecutionObject runs on | ||
97 | virtual const std::string& GetDeviceName() const =0; | ||
98 | |||
99 | //! Write the output buffer for each layer to a file | ||
100 | //! \<filename_prefix>_<ID>_HxW.bin | ||
101 | virtual void WriteLayerOutputsToFile(const std::string& filename_prefix= | ||
102 | "trace_dump_") const =0; | ||
103 | |||
104 | //! Returns a LayerOutput object corresponding to a layer. | ||
105 | //! Caller is responsible for deleting the LayerOutput object. | ||
106 | //! @see LayerOutput | ||
107 | //! @param layer_index The layer index of the layer | ||
108 | //! @param output_index The output index of the buffer for a given | ||
109 | //! layer. Defaults to 0. | ||
110 | virtual const LayerOutput* GetOutputFromLayer(uint32_t layer_index, | ||
111 | uint32_t output_index=0) const =0; | ||
112 | |||
113 | //! Get output buffers from all layers | ||
114 | virtual const LayerOutputs* GetOutputsFromAllLayers() const =0; | ||
115 | }; | ||
116 | /*! @endcond | ||
117 | */ | ||
118 | |||
119 | } // namespace tidl | ||
diff --git a/tidl_api/inc/execution_object_pipeline.h b/tidl_api/inc/execution_object_pipeline.h new file mode 100644 index 0000000..aaa6cf0 --- /dev/null +++ b/tidl_api/inc/execution_object_pipeline.h | |||
@@ -0,0 +1,151 @@ | |||
1 | /****************************************************************************** | ||
2 | * Copyright (c) 2018 Texas Instruments Incorporated - http://www.ti.com/ | ||
3 | * All rights reserved. | ||
4 | * | ||
5 | * Redistribution and use in source and binary forms, with or without | ||
6 | * modification, are permitted provided that the following conditions are met: | ||
7 | * * Redistributions of source code must retain the above copyright | ||
8 | * notice, this list of conditions and the following disclaimer. | ||
9 | * * Redistributions in binary form must reproduce the above copyright | ||
10 | * notice, this list of conditions and the following disclaimer in the | ||
11 | * documentation and/or other materials provided with the distribution. | ||
12 | * * Neither the name of Texas Instruments Incorporated nor the | ||
13 | * names of its contributors may be used to endorse or promote products | ||
14 | * derived from this software without specific prior written permission. | ||
15 | * | ||
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | ||
17 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
19 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | ||
20 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | ||
21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | ||
22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | ||
23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | ||
24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF | ||
26 | * THE POSSIBILITY OF SUCH DAMAGE. | ||
27 | *****************************************************************************/ | ||
28 | |||
29 | //! @file execution_object_pipeline.h | ||
30 | |||
31 | #pragma once | ||
32 | #include <string> | ||
33 | #include <vector> | ||
34 | #include <cstdint> | ||
35 | #include <cassert> | ||
36 | |||
37 | #include "executor.h" | ||
38 | #include "execution_object_internal.h" | ||
39 | #include "execution_object.h" | ||
40 | |||
41 | namespace tidl { | ||
42 | |||
43 | /*! @class ExecutionObjectPipeline | ||
44 | @brief Manages the pipelined execution using multiple ExecutionObjects. | ||
45 | Each executor runs one layersGroup of the network. ExecutionObjects | ||
46 | must run consecutive layersGroups to form a pipelined execution. | ||
47 | */ | ||
48 | class ExecutionObjectPipeline : public ExecutionObjectInternalInterface | ||
49 | { | ||
50 | public: | ||
51 | //! @brief Create an ExecutionObjectPipeline object. | ||
52 | //! | ||
53 | //! The ExecutionObjectPipeline will take the provided ExecutionObjects | ||
54 | //! to create an execution pipeline. E.g. | ||
55 | //! @code | ||
56 | //! Configuration config("path to configuration file"); | ||
57 | //! DeviceIds ids = {DeviceId::ID0, DeviceId::ID1}; | ||
58 | //! Executor exe_eve(DeviceType::EVE, ids, config, 1); | ||
59 | //! Executor exe_dsp(DeviceType::DSP, ids, config, 2); | ||
60 | //! ExecutionObjectPipeline ep0({exe_eve[0], exe_dsp[0]}); | ||
61 | //! ExecutionObjectPipeline ep1({exe_eve[1], exe_dsp[1]}); | ||
62 | //! @endcode | ||
63 | //! | ||
64 | //! @param eos DSP or EVE ExecutionObjects forming a pipeline | ||
65 | ExecutionObjectPipeline(std::vector<ExecutionObject*> eos); | ||
66 | |||
67 | //! @brief Tear down an ExecutionObjectPipeline and free used resources | ||
68 | ~ExecutionObjectPipeline(); | ||
69 | |||
70 | //! Specify the input and output buffers used by the EOP | ||
71 | //! @param in buffer used for input. | ||
72 | //! @param out buffer used for output. | ||
73 | void SetInputOutputBuffer (const ArgInfo& in, | ||
74 | const ArgInfo& out) override; | ||
75 | |||
76 | //! Returns a pointer to the input buffer | ||
77 | char* GetInputBufferPtr() const override; | ||
78 | |||
79 | //! Returns size of the input buffer | ||
80 | size_t GetInputBufferSizeInBytes() const override; | ||
81 | |||
82 | //! Returns a pointer to the output buffer | ||
83 | char* GetOutputBufferPtr() const override; | ||
84 | |||
85 | //! Returns the number of bytes written to the output buffer | ||
86 | size_t GetOutputBufferSizeInBytes() const override; | ||
87 | |||
88 | //! @brief Set the frame index of the frame currently processed by the | ||
89 | //! ExecutionObjectPipeline. Used for trace/debug messages | ||
90 | //! @param idx index of the frame | ||
91 | void SetFrameIndex(int idx) override; | ||
92 | |||
93 | //! Returns the index of a frame being processed (set by SetFrameIndex) | ||
94 | int GetFrameIndex() const override; | ||
95 | |||
96 | //! @brief Start processing a frame. The call is asynchronous and | ||
97 | //! returns immediately. Use ProcessFrameWait() to wait | ||
98 | bool ProcessFrameStartAsync() override; | ||
99 | |||
100 | //! Wait for the executor pipeline to complete processing a frame | ||
101 | //! @return false if ProcessFrameWait() was called | ||
102 | //! without a corresponding call to | ||
103 | //! ExecutionObjectPipeline::ProcessFrameStartAsync(). | ||
104 | bool ProcessFrameWait() override; | ||
105 | |||
106 | //! @brief return the number of milliseconds taken *on the device* to | ||
107 | //! execute the process call | ||
108 | //! @return Number of milliseconds to process a frame on the device. | ||
109 | float GetProcessTimeInMilliSeconds() const override; | ||
110 | |||
111 | //! @brief return the number of milliseconds taken *on the host* to | ||
112 | //! execute the process call | ||
113 | //! @return Number of milliseconds to process a frame on the host. | ||
114 | float GetHostProcessTimeInMilliSeconds() const override; | ||
115 | |||
116 | //! Return the combined device names that this pipeline runs on | ||
117 | const std::string& GetDeviceName() const override; | ||
118 | |||
119 | //! Write the output buffer for each layer to a file | ||
120 | //! \<filename_prefix>_<ID>_HxW.bin | ||
121 | void WriteLayerOutputsToFile(const std::string& filename_prefix= | ||
122 | "trace_dump_") const override; | ||
123 | |||
124 | //! Returns a LayerOutput object corresponding to a layer. | ||
125 | //! Caller is responsible for deleting the LayerOutput object. | ||
126 | //! @see LayerOutput | ||
127 | //! @param layer_index The layer index of the layer | ||
128 | //! @param output_index The output index of the buffer for a given | ||
129 | //! layer. Defaults to 0. | ||
130 | const LayerOutput* GetOutputFromLayer(uint32_t layer_index, | ||
131 | uint32_t output_index=0) const override; | ||
132 | |||
133 | //! Get output buffers from all layers | ||
134 | const LayerOutputs* GetOutputsFromAllLayers() const override; | ||
135 | |||
136 | //! @private Used by runtime | ||
137 | //! @brief callback function at the completion of each ExecutionObject, | ||
138 | //! to chain the next ExecutionObject for execution | ||
139 | void RunAsyncNext(); | ||
140 | |||
141 | ExecutionObjectPipeline() = delete; | ||
142 | ExecutionObjectPipeline(const ExecutionObjectPipeline&) = delete; | ||
143 | ExecutionObjectPipeline& operator=(const ExecutionObjectPipeline&) | ||
144 | = delete; | ||
145 | |||
146 | private: | ||
147 | class Impl; | ||
148 | std::unique_ptr<Impl> pimpl_m; | ||
149 | }; | ||
150 | |||
151 | } // namespace tidl | ||
diff --git a/tidl_api/inc/executor.h b/tidl_api/inc/executor.h index 23d92ff..1febfea 100644 --- a/tidl_api/inc/executor.h +++ b/tidl_api/inc/executor.h | |||
@@ -64,7 +64,7 @@ class ExecutionObject; | |||
64 | typedef std::vector<std::unique_ptr<ExecutionObject>> ExecutionObjects; | 64 | typedef std::vector<std::unique_ptr<ExecutionObject>> ExecutionObjects; |
65 | 65 | ||
66 | /*! @class Executor | 66 | /*! @class Executor |
67 | @brief Manages the overall execution of a network using the | 67 | @brief Manages the overall execution of a layersGroup in a network using the |
68 | specified configuration and the set of devices available to the | 68 | specified configuration and the set of devices available to the |
69 | executor. | 69 | executor. |
70 | */ | 70 | */ |
@@ -78,7 +78,7 @@ class Executor | |||
78 | //! @code | 78 | //! @code |
79 | //! Configuration configuration; | 79 | //! Configuration configuration; |
80 | //! configuration.ReadFromFile("path to configuration file"); | 80 | //! configuration.ReadFromFile("path to configuration file"); |
81 | //! DeviceIds ids1 = {DeviceId::ID2, DeviceId::ID3}; | 81 | //! DeviceIds ids = {DeviceId::ID2, DeviceId::ID3}; |
82 | //! Executor executor(DeviceType::EVE, ids, configuration); | 82 | //! Executor executor(DeviceType::EVE, ids, configuration); |
83 | //! @endcode | 83 | //! @endcode |
84 | //! | 84 | //! |
@@ -98,6 +98,9 @@ class Executor | |||
98 | //! available on this instance of the Executor | 98 | //! available on this instance of the Executor |
99 | const ExecutionObjects& GetExecutionObjects() const; | 99 | const ExecutionObjects& GetExecutionObjects() const; |
100 | 100 | ||
101 | //! Returns a single execution object at index | ||
102 | ExecutionObject* operator[](uint32_t index) const; | ||
103 | |||
101 | //! @brief Returns the number of devices of the specified type | 104 | //! @brief Returns the number of devices of the specified type |
102 | //! available for TI DL. | 105 | //! available for TI DL. |
103 | //! @param device_type DSP or EVE/EVE device | 106 | //! @param device_type DSP or EVE/EVE device |
@@ -106,7 +109,7 @@ class Executor | |||
106 | 109 | ||
107 | //! @brief Returns a string corresponding to the API version | 110 | //! @brief Returns a string corresponding to the API version |
108 | //! | 111 | //! |
109 | //! @return <major_ver>.<minor_ver>.<patch_ver>.<git_sha> | 112 | //! @return \<major_ver>.\<minor_ver>.\<patch_ver>.\<git_sha> |
110 | static std::string GetAPIVersion(); | 113 | static std::string GetAPIVersion(); |
111 | 114 | ||
112 | Executor(const Executor&) = delete; | 115 | Executor(const Executor&) = delete; |
diff --git a/tidl_api/src/execution_object.cpp b/tidl_api/src/execution_object.cpp index d722ebb..178bbca 100644 --- a/tidl_api/src/execution_object.cpp +++ b/tidl_api/src/execution_object.cpp | |||
@@ -31,6 +31,9 @@ | |||
31 | #include <string.h> | 31 | #include <string.h> |
32 | #include <fstream> | 32 | #include <fstream> |
33 | #include <climits> | 33 | #include <climits> |
34 | #include <mutex> | ||
35 | #include <condition_variable> | ||
36 | #include <chrono> | ||
34 | #include "executor.h" | 37 | #include "executor.h" |
35 | #include "execution_object.h" | 38 | #include "execution_object.h" |
36 | #include "trace.h" | 39 | #include "trace.h" |
@@ -50,13 +53,24 @@ class ExecutionObject::Impl | |||
50 | const DeviceArgInfo& create_arg, | 53 | const DeviceArgInfo& create_arg, |
51 | const DeviceArgInfo& param_heap_arg, | 54 | const DeviceArgInfo& param_heap_arg, |
52 | size_t extmem_heap_size, | 55 | size_t extmem_heap_size, |
56 | int layers_group_id, | ||
57 | bool output_trace, | ||
53 | bool internal_input); | 58 | bool internal_input); |
54 | ~Impl() {} | 59 | ~Impl() {} |
55 | 60 | ||
56 | bool RunAsync(CallType ct); | 61 | bool RunAsync(CallType ct); |
57 | bool Wait (CallType ct); | 62 | bool Wait (CallType ct); |
63 | bool AddCallback(CallType ct, void *user_data); | ||
64 | |||
65 | uint64_t GetProcessCycles() const; | ||
66 | int GetLayersGroupId() const; | ||
67 | void AcquireLock(); | ||
68 | void ReleaseLock(); | ||
58 | 69 | ||
59 | Device* device_m; | 70 | Device* device_m; |
71 | // Index of the OpenCL device/queue used by this EO | ||
72 | uint8_t device_index_m; | ||
73 | std::string device_name_m; | ||
60 | 74 | ||
61 | up_malloc_ddr<char> tidl_extmem_heap_m; | 75 | up_malloc_ddr<char> tidl_extmem_heap_m; |
62 | up_malloc_ddr<OCL_TIDL_InitializeParams> shared_initialize_params_m; | 76 | up_malloc_ddr<OCL_TIDL_InitializeParams> shared_initialize_params_m; |
@@ -70,6 +84,9 @@ class ExecutionObject::Impl | |||
70 | // Frame being processed by the EO | 84 | // Frame being processed by the EO |
71 | int current_frame_idx_m; | 85 | int current_frame_idx_m; |
72 | 86 | ||
87 | // LayersGroupId being processed by the EO | ||
88 | int layers_group_id_m; | ||
89 | |||
73 | // Trace related | 90 | // Trace related |
74 | void WriteLayerOutputsToFile (const std::string& filename_prefix) const; | 91 | void WriteLayerOutputsToFile (const std::string& filename_prefix) const; |
75 | 92 | ||
@@ -81,25 +98,29 @@ class ExecutionObject::Impl | |||
81 | up_malloc_ddr<OCL_TIDL_BufParams> trace_buf_params_m; | 98 | up_malloc_ddr<OCL_TIDL_BufParams> trace_buf_params_m; |
82 | size_t trace_buf_params_sz_m; | 99 | size_t trace_buf_params_sz_m; |
83 | 100 | ||
101 | // host time tracking: eo start to finish | ||
102 | float host_time_m; | ||
103 | |||
84 | private: | 104 | private: |
85 | void SetupInitializeKernel(const DeviceArgInfo& create_arg, | 105 | void SetupInitializeKernel(const DeviceArgInfo& create_arg, |
86 | const DeviceArgInfo& param_heap_arg, | 106 | const DeviceArgInfo& param_heap_arg, |
87 | size_t extmem_heap_size, | 107 | size_t extmem_heap_size, |
88 | bool internal_input); | 108 | bool internal_input); |
109 | void EnableOutputBufferTrace(); | ||
89 | void SetupProcessKernel(); | 110 | void SetupProcessKernel(); |
90 | 111 | ||
91 | void HostWriteNetInput(); | 112 | void HostWriteNetInput(); |
92 | void HostReadNetOutput(); | 113 | void HostReadNetOutput(); |
93 | void ComputeInputOutputSizes(); | 114 | void ComputeInputOutputSizes(); |
94 | 115 | ||
95 | // Index of the OpenCL device/queue used by this EO | ||
96 | uint8_t device_index_m; | ||
97 | |||
98 | std::unique_ptr<Kernel> k_initialize_m; | 116 | std::unique_ptr<Kernel> k_initialize_m; |
99 | std::unique_ptr<Kernel> k_process_m; | 117 | std::unique_ptr<Kernel> k_process_m; |
100 | std::unique_ptr<Kernel> k_cleanup_m; | 118 | std::unique_ptr<Kernel> k_cleanup_m; |
101 | 119 | ||
102 | 120 | // Guarding sole access to input/output for one frame during execution | |
121 | bool is_idle_m; | ||
122 | std::mutex mutex_access_m; | ||
123 | std::condition_variable cv_access_m; | ||
103 | }; | 124 | }; |
104 | 125 | ||
105 | 126 | ||
@@ -108,6 +129,8 @@ ExecutionObject::ExecutionObject(Device* d, | |||
108 | const ArgInfo& create_arg, | 129 | const ArgInfo& create_arg, |
109 | const ArgInfo& param_heap_arg, | 130 | const ArgInfo& param_heap_arg, |
110 | size_t extmem_heap_size, | 131 | size_t extmem_heap_size, |
132 | int layers_group_id, | ||
133 | bool output_trace, | ||
111 | bool internal_input) | 134 | bool internal_input) |
112 | { | 135 | { |
113 | DeviceArgInfo create_arg_d(create_arg, DeviceArgInfo::Kind::BUFFER); | 136 | DeviceArgInfo create_arg_d(create_arg, DeviceArgInfo::Kind::BUFFER); |
@@ -118,6 +141,8 @@ ExecutionObject::ExecutionObject(Device* d, | |||
118 | create_arg_d, | 141 | create_arg_d, |
119 | param_heap_arg_d, | 142 | param_heap_arg_d, |
120 | extmem_heap_size, | 143 | extmem_heap_size, |
144 | layers_group_id, | ||
145 | output_trace, | ||
121 | internal_input) }; | 146 | internal_input) }; |
122 | } | 147 | } |
123 | 148 | ||
@@ -127,8 +152,11 @@ ExecutionObject::Impl::Impl(Device* d, | |||
127 | const DeviceArgInfo& create_arg, | 152 | const DeviceArgInfo& create_arg, |
128 | const DeviceArgInfo& param_heap_arg, | 153 | const DeviceArgInfo& param_heap_arg, |
129 | size_t extmem_heap_size, | 154 | size_t extmem_heap_size, |
155 | int layers_group_id, | ||
156 | bool output_trace, | ||
130 | bool internal_input): | 157 | bool internal_input): |
131 | device_m(d), | 158 | device_m(d), |
159 | device_index_m(device_index), | ||
132 | tidl_extmem_heap_m (nullptr, &__free_ddr), | 160 | tidl_extmem_heap_m (nullptr, &__free_ddr), |
133 | shared_initialize_params_m(nullptr, &__free_ddr), | 161 | shared_initialize_params_m(nullptr, &__free_ddr), |
134 | shared_process_params_m(nullptr, &__free_ddr), | 162 | shared_process_params_m(nullptr, &__free_ddr), |
@@ -137,23 +165,26 @@ ExecutionObject::Impl::Impl(Device* d, | |||
137 | in_m(), | 165 | in_m(), |
138 | out_m(), | 166 | out_m(), |
139 | current_frame_idx_m(0), | 167 | current_frame_idx_m(0), |
168 | layers_group_id_m(layers_group_id), | ||
140 | num_network_layers_m(0), | 169 | num_network_layers_m(0), |
141 | trace_buf_params_m(nullptr, &__free_ddr), | 170 | trace_buf_params_m(nullptr, &__free_ddr), |
142 | trace_buf_params_sz_m(0), | 171 | trace_buf_params_sz_m(0), |
143 | device_index_m(device_index), | ||
144 | k_initialize_m(nullptr), | 172 | k_initialize_m(nullptr), |
145 | k_process_m(nullptr), | 173 | k_process_m(nullptr), |
146 | k_cleanup_m(nullptr) | 174 | k_cleanup_m(nullptr), |
175 | is_idle_m(true) | ||
147 | { | 176 | { |
148 | SetupInitializeKernel(create_arg, param_heap_arg, extmem_heap_size, | 177 | device_name_m = device_m->GetDeviceName() + std::to_string(device_index_m); |
149 | internal_input); | ||
150 | |||
151 | SetupProcessKernel(); | ||
152 | |||
153 | // Save number of layers in the network | 178 | // Save number of layers in the network |
154 | const TIDL_CreateParams* cp = | 179 | const TIDL_CreateParams* cp = |
155 | static_cast<const TIDL_CreateParams *>(create_arg.ptr()); | 180 | static_cast<const TIDL_CreateParams *>(create_arg.ptr()); |
156 | num_network_layers_m = cp->net.numLayers; | 181 | num_network_layers_m = cp->net.numLayers; |
182 | |||
183 | SetupInitializeKernel(create_arg, param_heap_arg, extmem_heap_size, | ||
184 | internal_input); | ||
185 | |||
186 | if (output_trace) EnableOutputBufferTrace(); | ||
187 | SetupProcessKernel(); | ||
157 | } | 188 | } |
158 | 189 | ||
159 | // Pointer to implementation idiom: https://herbsutter.com/gotw/_100/: | 190 | // Pointer to implementation idiom: https://herbsutter.com/gotw/_100/: |
@@ -168,9 +199,7 @@ char* ExecutionObject::GetInputBufferPtr() const | |||
168 | 199 | ||
169 | size_t ExecutionObject::GetInputBufferSizeInBytes() const | 200 | size_t ExecutionObject::GetInputBufferSizeInBytes() const |
170 | { | 201 | { |
171 | const DeviceArgInfo& arg = pimpl_m->in_m.GetArg(); | 202 | return pimpl_m->in_size_m; |
172 | if (arg.ptr() == nullptr) return pimpl_m->in_size_m; | ||
173 | else return arg.size(); | ||
174 | } | 203 | } |
175 | 204 | ||
176 | char* ExecutionObject::GetOutputBufferPtr() const | 205 | char* ExecutionObject::GetOutputBufferPtr() const |
@@ -180,11 +209,7 @@ char* ExecutionObject::GetOutputBufferPtr() const | |||
180 | 209 | ||
181 | size_t ExecutionObject::GetOutputBufferSizeInBytes() const | 210 | size_t ExecutionObject::GetOutputBufferSizeInBytes() const |
182 | { | 211 | { |
183 | const DeviceArgInfo& arg = pimpl_m->out_m.GetArg(); | 212 | return pimpl_m->out_size_m; |
184 | if (arg.ptr() == nullptr) | ||
185 | return pimpl_m->out_size_m; | ||
186 | else | ||
187 | return pimpl_m->shared_process_params_m.get()->bytesWritten; | ||
188 | } | 213 | } |
189 | 214 | ||
190 | void ExecutionObject::SetFrameIndex(int idx) | 215 | void ExecutionObject::SetFrameIndex(int idx) |
@@ -199,8 +224,8 @@ int ExecutionObject::GetFrameIndex() const | |||
199 | 224 | ||
200 | void ExecutionObject::SetInputOutputBuffer(const ArgInfo& in, const ArgInfo& out) | 225 | void ExecutionObject::SetInputOutputBuffer(const ArgInfo& in, const ArgInfo& out) |
201 | { | 226 | { |
202 | assert(in.ptr() != nullptr && in.size() > 0); | 227 | assert(in.ptr() != nullptr && in.size() >= pimpl_m->in_size_m); |
203 | assert(out.ptr() != nullptr && out.size() > 0); | 228 | assert(out.ptr() != nullptr && out.size() >= pimpl_m->out_size_m); |
204 | 229 | ||
205 | pimpl_m->in_m = IODeviceArgInfo(in); | 230 | pimpl_m->in_m = IODeviceArgInfo(in); |
206 | pimpl_m->out_m = IODeviceArgInfo(out); | 231 | pimpl_m->out_m = IODeviceArgInfo(out); |
@@ -215,6 +240,7 @@ void ExecutionObject::SetInputOutputBuffer(const IODeviceArgInfo* in, | |||
215 | 240 | ||
216 | bool ExecutionObject::ProcessFrameStartAsync() | 241 | bool ExecutionObject::ProcessFrameStartAsync() |
217 | { | 242 | { |
243 | assert(GetInputBufferPtr() != nullptr && GetOutputBufferPtr() != nullptr); | ||
218 | return pimpl_m->RunAsync(ExecutionObject::CallType::PROCESS); | 244 | return pimpl_m->RunAsync(ExecutionObject::CallType::PROCESS); |
219 | } | 245 | } |
220 | 246 | ||
@@ -233,21 +259,26 @@ bool ExecutionObject::Wait (CallType ct) | |||
233 | return pimpl_m->Wait(ct); | 259 | return pimpl_m->Wait(ct); |
234 | } | 260 | } |
235 | 261 | ||
236 | uint64_t ExecutionObject::GetProcessCycles() const | 262 | bool ExecutionObject::AddCallback(CallType ct, void *user_data) |
237 | { | 263 | { |
238 | uint8_t factor = 1; | 264 | return pimpl_m->AddCallback(ct, user_data); |
239 | |||
240 | // ARP32 running at half frequency of VCOP, multiply by 2 for VCOP cycles | ||
241 | if (pimpl_m->device_m->type() == CL_DEVICE_TYPE_CUSTOM) | ||
242 | factor = 2; | ||
243 | |||
244 | return pimpl_m->shared_process_params_m.get()->cycles * factor; | ||
245 | } | 265 | } |
246 | 266 | ||
247 | float ExecutionObject::GetProcessTimeInMilliSeconds() const | 267 | float ExecutionObject::GetProcessTimeInMilliSeconds() const |
248 | { | 268 | { |
249 | float frequency = pimpl_m->device_m->GetFrequencyInMhz() * 1000000; | 269 | float frequency = pimpl_m->device_m->GetFrequencyInMhz() * 1000000; |
250 | return ((float)GetProcessCycles())/frequency * 1000; | 270 | return ((float)pimpl_m->GetProcessCycles()) / frequency * 1000; |
271 | } | ||
272 | |||
273 | float ExecutionObject::GetHostProcessTimeInMilliSeconds() const | ||
274 | { | ||
275 | return pimpl_m->host_time_m; | ||
276 | } | ||
277 | |||
278 | void | ||
279 | ExecutionObject::WriteLayerOutputsToFile(const std::string& filename_prefix) const | ||
280 | { | ||
281 | pimpl_m->WriteLayerOutputsToFile(filename_prefix); | ||
251 | } | 282 | } |
252 | 283 | ||
253 | const LayerOutput* ExecutionObject::GetOutputFromLayer( | 284 | const LayerOutput* ExecutionObject::GetOutputFromLayer( |
@@ -261,37 +292,25 @@ const LayerOutputs* ExecutionObject::GetOutputsFromAllLayers() const | |||
261 | return pimpl_m->GetOutputsFromAllLayers(); | 292 | return pimpl_m->GetOutputsFromAllLayers(); |
262 | } | 293 | } |
263 | 294 | ||
264 | // | 295 | int ExecutionObject::GetLayersGroupId() const |
265 | // Allocate an OpenCL buffer for TIDL layer output buffer metadata. | ||
266 | // The device will populate metadata for every buffer that is used as an | ||
267 | // output buffer by a layer. | ||
268 | // | ||
269 | void ExecutionObject::EnableOutputBufferTrace() | ||
270 | { | 296 | { |
271 | pimpl_m->trace_buf_params_sz_m = (sizeof(OCL_TIDL_BufParams)* | 297 | return pimpl_m->layers_group_id_m; |
272 | pimpl_m->num_network_layers_m* | 298 | } |
273 | TIDL_NUM_OUT_BUFS); | ||
274 | |||
275 | pimpl_m->trace_buf_params_m.reset(malloc_ddr<OCL_TIDL_BufParams> | ||
276 | (pimpl_m->trace_buf_params_sz_m)); | ||
277 | 299 | ||
278 | // Device will update bufferId if there is valid data for the entry | 300 | const std::string& ExecutionObject::GetDeviceName() const |
279 | OCL_TIDL_BufParams* bufferParams = pimpl_m->trace_buf_params_m.get(); | 301 | { |
280 | for (uint32_t i = 0; i < pimpl_m->num_network_layers_m; i++) | 302 | return pimpl_m->device_name_m; |
281 | for (int j = 0; j < TIDL_NUM_OUT_BUFS; j++) | ||
282 | { | ||
283 | OCL_TIDL_BufParams *bufP = | ||
284 | &bufferParams[i*TIDL_NUM_OUT_BUFS+j]; | ||
285 | bufP->bufferId = UINT_MAX; | ||
286 | } | ||
287 | } | 303 | } |
288 | 304 | ||
289 | void | 305 | void ExecutionObject::AcquireLock() |
290 | ExecutionObject::WriteLayerOutputsToFile(const std::string& filename_prefix) const | ||
291 | { | 306 | { |
292 | pimpl_m->WriteLayerOutputsToFile(filename_prefix); | 307 | pimpl_m->AcquireLock(); |
293 | } | 308 | } |
294 | 309 | ||
310 | void ExecutionObject::ReleaseLock() | ||
311 | { | ||
312 | pimpl_m->ReleaseLock(); | ||
313 | } | ||
295 | 314 | ||
296 | // | 315 | // |
297 | // Create a kernel to call the "initialize" function | 316 | // Create a kernel to call the "initialize" function |
@@ -343,6 +362,32 @@ ExecutionObject::Impl::SetupInitializeKernel(const DeviceArgInfo& create_arg, | |||
343 | } | 362 | } |
344 | 363 | ||
345 | // | 364 | // |
365 | // Allocate an OpenCL buffer for TIDL layer output buffer metadata. | ||
366 | // The device will populate metadata for every buffer that is used as an | ||
367 | // output buffer by a layer. This needs to be done before setting up | ||
368 | // process kernel. | ||
369 | // | ||
370 | void ExecutionObject::Impl::EnableOutputBufferTrace() | ||
371 | { | ||
372 | trace_buf_params_sz_m = (sizeof(OCL_TIDL_BufParams)* | ||
373 | num_network_layers_m* | ||
374 | TIDL_NUM_OUT_BUFS); | ||
375 | |||
376 | trace_buf_params_m.reset(malloc_ddr<OCL_TIDL_BufParams> | ||
377 | (trace_buf_params_sz_m)); | ||
378 | |||
379 | // Device will update bufferId if there is valid data for the entry | ||
380 | OCL_TIDL_BufParams* bufferParams = trace_buf_params_m.get(); | ||
381 | for (uint32_t i = 0; i < num_network_layers_m; i++) | ||
382 | for (int j = 0; j < TIDL_NUM_OUT_BUFS; j++) | ||
383 | { | ||
384 | OCL_TIDL_BufParams *bufP = | ||
385 | &bufferParams[i*TIDL_NUM_OUT_BUFS+j]; | ||
386 | bufP->bufferId = UINT_MAX; | ||
387 | } | ||
388 | } | ||
389 | |||
390 | // | ||
346 | // Create a kernel to call the "process" function | 391 | // Create a kernel to call the "process" function |
347 | // | 392 | // |
348 | void | 393 | void |
@@ -514,10 +559,17 @@ bool ExecutionObject::Impl::RunAsync(CallType ct) | |||
514 | } | 559 | } |
515 | case CallType::PROCESS: | 560 | case CallType::PROCESS: |
516 | { | 561 | { |
562 | std::chrono::time_point<std::chrono::steady_clock> t1, t2; | ||
563 | t1 = std::chrono::steady_clock::now(); | ||
564 | |||
517 | shared_process_params_m->frameIdx = current_frame_idx_m; | 565 | shared_process_params_m->frameIdx = current_frame_idx_m; |
518 | shared_process_params_m->bytesWritten = 0; | 566 | shared_process_params_m->bytesWritten = 0; |
519 | HostWriteNetInput(); | 567 | HostWriteNetInput(); |
520 | k_process_m->RunAsync(); | 568 | k_process_m->RunAsync(); |
569 | |||
570 | t2 = std::chrono::steady_clock::now(); | ||
571 | std::chrono::duration<float> elapsed = t2 - t1; | ||
572 | host_time_m = elapsed.count() * 1000; | ||
521 | break; | 573 | break; |
522 | } | 574 | } |
523 | case CallType::CLEANUP: | 575 | case CallType::CLEANUP: |
@@ -551,13 +603,20 @@ bool ExecutionObject::Impl::Wait(CallType ct) | |||
551 | } | 603 | } |
552 | case CallType::PROCESS: | 604 | case CallType::PROCESS: |
553 | { | 605 | { |
554 | bool has_work = k_process_m->Wait(); | 606 | float host_elapsed_ms = 0.0f; |
607 | bool has_work = k_process_m->Wait(&host_elapsed_ms); | ||
555 | if (has_work) | 608 | if (has_work) |
556 | { | 609 | { |
557 | if (shared_process_params_m->errorCode != OCL_TIDL_SUCCESS) | 610 | if (shared_process_params_m->errorCode != OCL_TIDL_SUCCESS) |
558 | throw Exception(shared_process_params_m->errorCode, | 611 | throw Exception(shared_process_params_m->errorCode, |
559 | __FILE__, __FUNCTION__, __LINE__); | 612 | __FILE__, __FUNCTION__, __LINE__); |
613 | |||
614 | std::chrono::time_point<std::chrono::steady_clock> t1, t2; | ||
615 | t1 = std::chrono::steady_clock::now(); | ||
560 | HostReadNetOutput(); | 616 | HostReadNetOutput(); |
617 | t2 = std::chrono::steady_clock::now(); | ||
618 | std::chrono::duration<float> elapsed = t2 - t1; | ||
619 | host_time_m += elapsed.count() * 1000 + host_elapsed_ms; | ||
561 | } | 620 | } |
562 | 621 | ||
563 | return has_work; | 622 | return has_work; |
@@ -574,6 +633,33 @@ bool ExecutionObject::Impl::Wait(CallType ct) | |||
574 | return false; | 633 | return false; |
575 | } | 634 | } |
576 | 635 | ||
636 | bool ExecutionObject::Impl::AddCallback(CallType ct, void *user_data) | ||
637 | { | ||
638 | switch (ct) | ||
639 | { | ||
640 | case CallType::PROCESS: | ||
641 | { | ||
642 | return k_process_m->AddCallback(user_data); | ||
643 | break; | ||
644 | } | ||
645 | default: | ||
646 | return false; | ||
647 | } | ||
648 | |||
649 | return false; | ||
650 | } | ||
651 | |||
652 | uint64_t ExecutionObject::Impl::GetProcessCycles() const | ||
653 | { | ||
654 | uint8_t factor = 1; | ||
655 | |||
656 | // ARP32 running at half frequency of VCOP, multiply by 2 for VCOP cycles | ||
657 | if (device_m->type() == CL_DEVICE_TYPE_CUSTOM) | ||
658 | factor = 2; | ||
659 | |||
660 | return shared_process_params_m.get()->cycles * factor; | ||
661 | } | ||
662 | |||
577 | // | 663 | // |
578 | // Write the trace data to output files | 664 | // Write the trace data to output files |
579 | // | 665 | // |
@@ -697,3 +783,16 @@ LayerOutput::~LayerOutput() | |||
697 | { | 783 | { |
698 | delete[] data_m; | 784 | delete[] data_m; |
699 | } | 785 | } |
786 | |||
787 | void ExecutionObject::Impl::AcquireLock() | ||
788 | { | ||
789 | std::unique_lock<std::mutex> lock(mutex_access_m); | ||
790 | cv_access_m.wait(lock, [this]{ return this->is_idle_m; }); | ||
791 | is_idle_m = false; | ||
792 | } | ||
793 | |||
794 | void ExecutionObject::Impl::ReleaseLock() | ||
795 | { | ||
796 | is_idle_m = true; | ||
797 | cv_access_m.notify_all(); | ||
798 | } | ||
diff --git a/tidl_api/src/execution_object_pipeline.cpp b/tidl_api/src/execution_object_pipeline.cpp new file mode 100644 index 0000000..ff84255 --- /dev/null +++ b/tidl_api/src/execution_object_pipeline.cpp | |||
@@ -0,0 +1,360 @@ | |||
1 | /****************************************************************************** | ||
2 | * Copyright (c) 2018 Texas Instruments Incorporated - http://www.ti.com/ | ||
3 | * All rights reserved. | ||
4 | * | ||
5 | * Redistribution and use in source and binary forms, with or without | ||
6 | * modification, are permitted provided that the following conditions are met: | ||
7 | * * Redistributions of source code must retain the above copyright | ||
8 | * notice, this list of conditions and the following disclaimer. | ||
9 | * * Redistributions in binary form must reproduce the above copyright | ||
10 | * notice, this list of conditions and the following disclaimer in the | ||
11 | * documentation and/or other materials provided with the distribution. | ||
12 | * * Neither the name of Texas Instruments Incorporated nor the | ||
13 | * names of its contributors may be used to endorse or promote products | ||
14 | * derived from this software without specific prior written permission. | ||
15 | * | ||
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | ||
17 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
19 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | ||
20 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | ||
21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | ||
22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | ||
23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | ||
24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF | ||
26 | * THE POSSIBILITY OF SUCH DAMAGE. | ||
27 | *****************************************************************************/ | ||
28 | |||
29 | #include <assert.h> | ||
30 | #include <mutex> | ||
31 | #include <condition_variable> | ||
32 | #include <chrono> | ||
33 | #include "device_arginfo.h" | ||
34 | #include "execution_object_pipeline.h" | ||
35 | |||
36 | using namespace tidl; | ||
37 | |||
38 | class ExecutionObjectPipeline::Impl | ||
39 | { | ||
40 | public: | ||
41 | Impl(std::vector<ExecutionObject*> &eos); | ||
42 | ~Impl(); | ||
43 | |||
44 | void SetInputOutputBuffer(const ArgInfo &in, const ArgInfo &out); | ||
45 | bool RunAsyncStart(); | ||
46 | bool RunAsyncNext(); | ||
47 | bool Wait(); | ||
48 | |||
49 | // Trace related | ||
50 | void WriteLayerOutputsToFile(const std::string& filename_prefix) const; | ||
51 | const LayerOutput* GetOutputFromLayer(uint32_t layer_index, | ||
52 | uint32_t output_index) const; | ||
53 | const LayerOutputs* GetOutputsFromAllLayers() const; | ||
54 | |||
55 | //! for pipelined execution | ||
56 | std::vector<ExecutionObject*> eos_m; | ||
57 | std::vector<IODeviceArgInfo*> iobufs_m; | ||
58 | |||
59 | std::string device_name_m; | ||
60 | |||
61 | //! current frame index | ||
62 | int frame_idx_m; | ||
63 | |||
64 | //! current execution object index | ||
65 | uint32_t curr_eo_idx_m; | ||
66 | |||
67 | // host time tracking: pipeline start to finish | ||
68 | float host_time_m; | ||
69 | |||
70 | private: | ||
71 | //! @brief Initialize ExecutionObjectPipeline with given | ||
72 | //! ExecutionObjects: check consecutive layersGroup, allocate memory | ||
73 | void Initialize(); | ||
74 | |||
75 | // flag, mutex and cond var for signaling completion and waiting | ||
76 | bool has_work_m, is_processed_m; | ||
77 | std::mutex mutex_m; | ||
78 | std::condition_variable cv_m; | ||
79 | |||
80 | // host time tracking: pipeline start to finish | ||
81 | std::chrono::time_point<std::chrono::steady_clock> start_m; | ||
82 | }; | ||
83 | |||
84 | ExecutionObjectPipeline::ExecutionObjectPipeline( | ||
85 | std::vector<ExecutionObject*> eos) | ||
86 | { | ||
87 | pimpl_m = std::unique_ptr<Impl> { new Impl(eos) }; | ||
88 | } | ||
89 | |||
90 | ExecutionObjectPipeline::Impl::Impl(std::vector<ExecutionObject *> &eos) : | ||
91 | eos_m(eos), has_work_m(false), is_processed_m(false) | ||
92 | { | ||
93 | Initialize(); | ||
94 | } | ||
95 | |||
96 | // Pointer to implementation idiom: https://herbsutter.com/gotw/_100/: | ||
97 | // Both unique_ptr and shared_ptr can be instantiated with an incomplete type | ||
98 | // unique_ptr's destructor requires a complete type in order to invoke delete | ||
99 | ExecutionObjectPipeline::~ExecutionObjectPipeline() = default; | ||
100 | |||
101 | char* ExecutionObjectPipeline::GetInputBufferPtr() const | ||
102 | { | ||
103 | return static_cast<char *>(pimpl_m->iobufs_m.front()->GetArg().ptr()); | ||
104 | } | ||
105 | |||
106 | size_t ExecutionObjectPipeline::GetInputBufferSizeInBytes() const | ||
107 | { | ||
108 | return pimpl_m->eos_m.front()->GetInputBufferSizeInBytes(); | ||
109 | } | ||
110 | |||
111 | char* ExecutionObjectPipeline::GetOutputBufferPtr() const | ||
112 | { | ||
113 | return static_cast<char *>(pimpl_m->iobufs_m.back()->GetArg().ptr()); | ||
114 | } | ||
115 | |||
116 | size_t ExecutionObjectPipeline::GetOutputBufferSizeInBytes() const | ||
117 | { | ||
118 | return pimpl_m->eos_m.back()->GetOutputBufferSizeInBytes(); | ||
119 | } | ||
120 | |||
121 | void ExecutionObjectPipeline::SetInputOutputBuffer(const ArgInfo& in, | ||
122 | const ArgInfo& out) | ||
123 | { | ||
124 | assert(in.ptr() != nullptr && in.size() >= GetInputBufferSizeInBytes()); | ||
125 | assert(out.ptr() != nullptr && out.size() >= GetOutputBufferSizeInBytes()); | ||
126 | pimpl_m->SetInputOutputBuffer(in, out); | ||
127 | } | ||
128 | |||
129 | void ExecutionObjectPipeline::SetFrameIndex(int idx) | ||
130 | { | ||
131 | pimpl_m->frame_idx_m = idx; | ||
132 | } | ||
133 | |||
134 | int ExecutionObjectPipeline::GetFrameIndex() const | ||
135 | { | ||
136 | return pimpl_m->frame_idx_m; | ||
137 | } | ||
138 | |||
139 | bool ExecutionObjectPipeline::ProcessFrameStartAsync() | ||
140 | { | ||
141 | assert(GetInputBufferPtr() != nullptr && GetOutputBufferPtr() != nullptr); | ||
142 | bool st = pimpl_m->RunAsyncStart(); | ||
143 | if (st) | ||
144 | st = pimpl_m->eos_m[0]->AddCallback(ExecutionObject::CallType::PROCESS, | ||
145 | this); | ||
146 | return st; | ||
147 | } | ||
148 | |||
149 | bool ExecutionObjectPipeline::ProcessFrameWait() | ||
150 | { | ||
151 | return pimpl_m->Wait(); | ||
152 | } | ||
153 | |||
154 | void CallbackWrapper(void *user_data) | ||
155 | { | ||
156 | ((ExecutionObjectPipeline *) user_data)->RunAsyncNext(); | ||
157 | } | ||
158 | |||
159 | void ExecutionObjectPipeline::RunAsyncNext() | ||
160 | { | ||
161 | bool has_next = pimpl_m->RunAsyncNext(); | ||
162 | if (has_next) | ||
163 | pimpl_m->eos_m[pimpl_m->curr_eo_idx_m]->AddCallback( | ||
164 | ExecutionObject::CallType::PROCESS, this); | ||
165 | } | ||
166 | |||
167 | float ExecutionObjectPipeline::GetProcessTimeInMilliSeconds() const | ||
168 | { | ||
169 | float total = 0.0f; | ||
170 | for (auto eo : pimpl_m->eos_m) | ||
171 | total += eo->GetProcessTimeInMilliSeconds(); | ||
172 | return total; | ||
173 | } | ||
174 | |||
175 | float ExecutionObjectPipeline::GetHostProcessTimeInMilliSeconds() const | ||
176 | { | ||
177 | return pimpl_m->host_time_m; | ||
178 | } | ||
179 | |||
180 | const std::string& ExecutionObjectPipeline::GetDeviceName() const | ||
181 | { | ||
182 | return pimpl_m->device_name_m; | ||
183 | } | ||
184 | |||
185 | void | ||
186 | ExecutionObjectPipeline::WriteLayerOutputsToFile( | ||
187 | const std::string& filename_prefix) const | ||
188 | { | ||
189 | pimpl_m->WriteLayerOutputsToFile(filename_prefix); | ||
190 | } | ||
191 | |||
192 | const LayerOutput* | ||
193 | ExecutionObjectPipeline::GetOutputFromLayer(uint32_t layer_index, | ||
194 | uint32_t output_index) const | ||
195 | { | ||
196 | return pimpl_m->GetOutputFromLayer(layer_index, output_index); | ||
197 | } | ||
198 | |||
199 | const LayerOutputs* | ||
200 | ExecutionObjectPipeline::GetOutputsFromAllLayers() const | ||
201 | { | ||
202 | return pimpl_m->GetOutputsFromAllLayers(); | ||
203 | } | ||
204 | |||
205 | |||
206 | /// Impl methods start here | ||
207 | |||
208 | |||
209 | static | ||
210 | void* AllocateMem(size_t size) | ||
211 | { | ||
212 | if (size == 0) return nullptr; | ||
213 | void *ptr = malloc(size); | ||
214 | if (ptr == nullptr) | ||
215 | throw Exception("Out of memory, ExecutionObjectPipeline malloc failed", | ||
216 | __FILE__, __FUNCTION__, __LINE__); | ||
217 | return ptr; | ||
218 | } | ||
219 | |||
220 | void ExecutionObjectPipeline::Impl::Initialize() | ||
221 | { | ||
222 | // Check consecutive layersGroups to form a pipeline | ||
223 | int prev_group = 0; | ||
224 | for (auto eo : eos_m) | ||
225 | { | ||
226 | int group = eo->GetLayersGroupId(); | ||
227 | if (prev_group != 0 && group != prev_group + 1) | ||
228 | throw Exception( | ||
229 | "Non-consecutive layersGroupIds in ExecutionObjectPipeline", | ||
230 | __FILE__, __FUNCTION__, __LINE__); | ||
231 | prev_group = group; | ||
232 | } | ||
233 | |||
234 | for (auto eo : eos_m) | ||
235 | device_name_m += eo->GetDeviceName() + "+"; | ||
236 | device_name_m.resize(device_name_m.size() - 1); | ||
237 | |||
238 | // Allocate input and output memory for EOs/layersGroups | ||
239 | // Note that i-th EO's output buffer is the same as (i+1)-th EO's input | ||
240 | // So, if n EOs, then (n+1) buffers: b EO b EO b EO b ... EO b | ||
241 | // User must set the first input buffer and the last output buffer | ||
242 | size_t size; | ||
243 | ArgInfo in(nullptr, 0); | ||
244 | iobufs_m.push_back(new IODeviceArgInfo(in)); | ||
245 | for (auto eo : eos_m) | ||
246 | { | ||
247 | if (eo != eos_m.back()) | ||
248 | size = eo->GetOutputBufferSizeInBytes(); | ||
249 | else | ||
250 | size = 0; | ||
251 | |||
252 | void *ptr = AllocateMem(size); | ||
253 | ArgInfo out(ptr, size); | ||
254 | iobufs_m.push_back(new IODeviceArgInfo(out)); | ||
255 | } | ||
256 | } | ||
257 | |||
258 | ExecutionObjectPipeline::Impl::~Impl() | ||
259 | { | ||
260 | int num_iobufs = iobufs_m.size(); | ||
261 | for (int i = 0; i < num_iobufs; i++) | ||
262 | { | ||
263 | if (! (i == 0 || i == num_iobufs-1)) | ||
264 | free(iobufs_m[i]->GetArg().ptr()); | ||
265 | delete iobufs_m[i]; | ||
266 | } | ||
267 | } | ||
268 | |||
269 | void ExecutionObjectPipeline::Impl::SetInputOutputBuffer(const ArgInfo &in, | ||
270 | const ArgInfo &out) | ||
271 | { | ||
272 | delete iobufs_m.front(); | ||
273 | delete iobufs_m.back(); | ||
274 | iobufs_m.front() = new IODeviceArgInfo(in); | ||
275 | iobufs_m.back() = new IODeviceArgInfo(out); | ||
276 | } | ||
277 | |||
278 | bool ExecutionObjectPipeline::Impl::RunAsyncStart() | ||
279 | { | ||
280 | start_m = std::chrono::steady_clock::now(); | ||
281 | has_work_m = true; | ||
282 | is_processed_m = false; | ||
283 | host_time_m = 0.0f; | ||
284 | curr_eo_idx_m = 0; | ||
285 | eos_m[0]->AcquireLock(); | ||
286 | eos_m[0]->SetInputOutputBuffer(iobufs_m[0], iobufs_m[1]); | ||
287 | return eos_m[0]->ProcessFrameStartAsync(); | ||
288 | } | ||
289 | |||
290 | // returns true if we have more EOs to execute | ||
291 | bool ExecutionObjectPipeline::Impl::RunAsyncNext() | ||
292 | { | ||
293 | eos_m[curr_eo_idx_m]->ProcessFrameWait(); | ||
294 | eos_m[curr_eo_idx_m]->ReleaseLock(); | ||
295 | curr_eo_idx_m += 1; | ||
296 | if (curr_eo_idx_m < eos_m.size()) | ||
297 | { | ||
298 | eos_m[curr_eo_idx_m]->AcquireLock(); | ||
299 | eos_m[curr_eo_idx_m]->SetInputOutputBuffer(iobufs_m[curr_eo_idx_m], | ||
300 | iobufs_m[curr_eo_idx_m+1]); | ||
301 | eos_m[curr_eo_idx_m]->ProcessFrameStartAsync(); | ||
302 | return true; | ||
303 | } | ||
304 | else | ||
305 | { | ||
306 | std::chrono::duration<float> elapsed = std::chrono::steady_clock::now() | ||
307 | - start_m; | ||
308 | host_time_m = elapsed.count() * 1000; // seconds to milliseconds | ||
309 | is_processed_m = true; | ||
310 | cv_m.notify_all(); | ||
311 | return false; | ||
312 | } | ||
313 | } | ||
314 | |||
315 | bool ExecutionObjectPipeline::Impl::Wait() | ||
316 | { | ||
317 | if (! has_work_m) return false; | ||
318 | |||
319 | std::unique_lock<std::mutex> lock(mutex_m); | ||
320 | cv_m.wait(lock, [this]{ return this->is_processed_m; }); | ||
321 | has_work_m = false; | ||
322 | return true; | ||
323 | } | ||
324 | |||
325 | void | ||
326 | ExecutionObjectPipeline::Impl::WriteLayerOutputsToFile( | ||
327 | const std::string& filename_prefix) const | ||
328 | { | ||
329 | for (auto eo : eos_m) | ||
330 | eo->WriteLayerOutputsToFile(filename_prefix); | ||
331 | } | ||
332 | |||
333 | const LayerOutput* | ||
334 | ExecutionObjectPipeline::Impl::GetOutputFromLayer(uint32_t layer_index, | ||
335 | uint32_t output_index) const | ||
336 | { | ||
337 | const LayerOutput* lo = nullptr; | ||
338 | for (auto eo : eos_m) | ||
339 | { | ||
340 | lo = eo->GetOutputFromLayer(layer_index, output_index); | ||
341 | if (lo != nullptr) break; | ||
342 | } | ||
343 | return lo; | ||
344 | } | ||
345 | |||
346 | const LayerOutputs* | ||
347 | ExecutionObjectPipeline::Impl::GetOutputsFromAllLayers() const | ||
348 | { | ||
349 | LayerOutputs *all = new LayerOutputs; | ||
350 | for (auto eo : eos_m) | ||
351 | { | ||
352 | LayerOutputs *los = const_cast<LayerOutputs *>( | ||
353 | eo->GetOutputsFromAllLayers()); | ||
354 | for (auto& lo : *los) | ||
355 | all->push_back(std::unique_ptr<const LayerOutput>{ lo.release() }); | ||
356 | delete los; | ||
357 | } | ||
358 | return all; | ||
359 | } | ||
360 | |||
diff --git a/tidl_api/src/executor.cpp b/tidl_api/src/executor.cpp index b644728..914c78a 100644 --- a/tidl_api/src/executor.cpp +++ b/tidl_api/src/executor.cpp | |||
@@ -96,6 +96,12 @@ const ExecutionObjects& Executor::GetExecutionObjects() const | |||
96 | return pimpl_m->execution_objects_m; | 96 | return pimpl_m->execution_objects_m; |
97 | } | 97 | } |
98 | 98 | ||
99 | ExecutionObject* Executor::operator[](uint32_t index) const | ||
100 | { | ||
101 | assert(index < pimpl_m->execution_objects_m.size()); | ||
102 | return pimpl_m->execution_objects_m[index].get(); | ||
103 | } | ||
104 | |||
99 | bool ExecutorImpl::Initialize(const Configuration& configuration) | 105 | bool ExecutorImpl::Initialize(const Configuration& configuration) |
100 | { | 106 | { |
101 | configuration_m = configuration; | 107 | configuration_m = configuration; |
@@ -145,13 +151,11 @@ bool ExecutorImpl::Initialize(const Configuration& configuration) | |||
145 | {new ExecutionObject(device_m.get(), index, | 151 | {new ExecutionObject(device_m.get(), index, |
146 | create_arg, param_heap_arg, | 152 | create_arg, param_heap_arg, |
147 | configuration_m.EXTMEM_HEAP_SIZE, | 153 | configuration_m.EXTMEM_HEAP_SIZE, |
154 | layers_group_id_m, | ||
155 | configuration_m.enableOutputTrace, | ||
148 | configuration_m.enableInternalInput)} ); | 156 | configuration_m.enableInternalInput)} ); |
149 | } | 157 | } |
150 | 158 | ||
151 | if (configuration_m.enableOutputTrace) | ||
152 | for (auto &eo : execution_objects_m) | ||
153 | eo->EnableOutputBufferTrace(); | ||
154 | |||
155 | for (auto &eo : execution_objects_m) | 159 | for (auto &eo : execution_objects_m) |
156 | eo->RunAsync(ExecutionObject::CallType::INIT); | 160 | eo->RunAsync(ExecutionObject::CallType::INIT); |
157 | 161 | ||
@@ -294,4 +298,3 @@ const char* Exception::what() const noexcept | |||
294 | { | 298 | { |
295 | return message_m.c_str(); | 299 | return message_m.c_str(); |
296 | } | 300 | } |
297 | |||
diff --git a/tidl_api/src/ocl_device.cpp b/tidl_api/src/ocl_device.cpp index fba4f94..b3eaf36 100644 --- a/tidl_api/src/ocl_device.cpp +++ b/tidl_api/src/ocl_device.cpp | |||
@@ -91,7 +91,7 @@ DspDevice::DspDevice(const DeviceIds& ids, const std::string &binary_filename): | |||
91 | // Queue 0 on device 0 | 91 | // Queue 0 on device 0 |
92 | queue_m[0] = clCreateCommandQueue(context_m, | 92 | queue_m[0] = clCreateCommandQueue(context_m, |
93 | device_ids[0], | 93 | device_ids[0], |
94 | 0, | 94 | CL_QUEUE_PROFILING_ENABLE, |
95 | &errcode); | 95 | &errcode); |
96 | errorCheck(errcode, __LINE__); | 96 | errorCheck(errcode, __LINE__); |
97 | BuildProgramFromBinary(binary_filename, device_ids, 1); | 97 | BuildProgramFromBinary(binary_filename, device_ids, 1); |
@@ -139,7 +139,7 @@ DspDevice::DspDevice(const DeviceIds& ids, const std::string &binary_filename): | |||
139 | int index = static_cast<int>(id); | 139 | int index = static_cast<int>(id); |
140 | queue_m[index] = clCreateCommandQueue(context_m, | 140 | queue_m[index] = clCreateCommandQueue(context_m, |
141 | sub_devices[index], | 141 | sub_devices[index], |
142 | 0, | 142 | CL_QUEUE_PROFILING_ENABLE, |
143 | &errcode); | 143 | &errcode); |
144 | errorCheck(errcode, __LINE__); | 144 | errorCheck(errcode, __LINE__); |
145 | } | 145 | } |
@@ -187,7 +187,7 @@ EveDevice::EveDevice(const DeviceIds& ids, const std::string &kernel_names): | |||
187 | int index = static_cast<int>(id); | 187 | int index = static_cast<int>(id); |
188 | queue_m[index] = clCreateCommandQueue(context_m, | 188 | queue_m[index] = clCreateCommandQueue(context_m, |
189 | all_device_ids[index], | 189 | all_device_ids[index], |
190 | 0, | 190 | CL_QUEUE_PROFILING_ENABLE, |
191 | &errcode); | 191 | &errcode); |
192 | errorCheck(errcode, __LINE__); | 192 | errorCheck(errcode, __LINE__); |
193 | } | 193 | } |
@@ -317,7 +317,7 @@ Kernel& Kernel::RunAsync() | |||
317 | } | 317 | } |
318 | 318 | ||
319 | 319 | ||
320 | bool Kernel::Wait() | 320 | bool Kernel::Wait(float *host_elapsed_ms) |
321 | { | 321 | { |
322 | // Wait called without a corresponding RunAsync | 322 | // Wait called without a corresponding RunAsync |
323 | if (!is_running_m) | 323 | if (!is_running_m) |
@@ -326,6 +326,17 @@ bool Kernel::Wait() | |||
326 | TRACE::print("\tKernel: waiting...\n"); | 326 | TRACE::print("\tKernel: waiting...\n"); |
327 | cl_int ret = clWaitForEvents(1, &event_m); | 327 | cl_int ret = clWaitForEvents(1, &event_m); |
328 | errorCheck(ret, __LINE__); | 328 | errorCheck(ret, __LINE__); |
329 | |||
330 | if (host_elapsed_ms != nullptr) | ||
331 | { | ||
332 | cl_ulong t_que, t_end; | ||
333 | clGetEventProfilingInfo(event_m, CL_PROFILING_COMMAND_QUEUED, | ||
334 | sizeof(cl_ulong), &t_que, nullptr); | ||
335 | clGetEventProfilingInfo(event_m, CL_PROFILING_COMMAND_END, | ||
336 | sizeof(cl_ulong), &t_end, nullptr); | ||
337 | *host_elapsed_ms = (t_end - t_que) / 1.0e6; // nano to milli seconds | ||
338 | } | ||
339 | |||
329 | ret = clReleaseEvent(event_m); | 340 | ret = clReleaseEvent(event_m); |
330 | errorCheck(ret, __LINE__); | 341 | errorCheck(ret, __LINE__); |
331 | TRACE::print("\tKernel: finished execution\n"); | 342 | TRACE::print("\tKernel: finished execution\n"); |
@@ -334,6 +345,22 @@ bool Kernel::Wait() | |||
334 | return true; | 345 | return true; |
335 | } | 346 | } |
336 | 347 | ||
348 | extern void CallbackWrapper(void *user_data) __attribute__((weak)); | ||
349 | |||
350 | static | ||
351 | void EventCallback(cl_event event, cl_int exec_status, void *user_data) | ||
352 | { | ||
353 | if (exec_status != CL_SUCCESS || user_data == nullptr) return; | ||
354 | if (CallbackWrapper) CallbackWrapper(user_data); | ||
355 | } | ||
356 | |||
357 | bool Kernel::AddCallback(void *user_data) | ||
358 | { | ||
359 | if (! is_running_m) return false; | ||
360 | return clSetEventCallback(event_m, CL_COMPLETE, EventCallback, user_data) | ||
361 | == CL_SUCCESS; | ||
362 | } | ||
363 | |||
337 | Kernel::~Kernel() | 364 | Kernel::~Kernel() |
338 | { | 365 | { |
339 | for (auto b : buffers_m) | 366 | for (auto b : buffers_m) |
diff --git a/tidl_api/src/ocl_device.h b/tidl_api/src/ocl_device.h index 6e80166..04c5db6 100644 --- a/tidl_api/src/ocl_device.h +++ b/tidl_api/src/ocl_device.h | |||
@@ -74,6 +74,8 @@ class Device | |||
74 | 74 | ||
75 | static uint32_t GetNumDevices(DeviceType device_type); | 75 | static uint32_t GetNumDevices(DeviceType device_type); |
76 | 76 | ||
77 | virtual std::string GetDeviceName() = 0; | ||
78 | |||
77 | protected: | 79 | protected: |
78 | 80 | ||
79 | static const int MAX_DEVICES = 4; | 81 | static const int MAX_DEVICES = 4; |
@@ -101,6 +103,8 @@ class DspDevice: public Device | |||
101 | DspDevice(const DspDevice&) = delete; | 103 | DspDevice(const DspDevice&) = delete; |
102 | DspDevice& operator=(const DspDevice&) = delete; | 104 | DspDevice& operator=(const DspDevice&) = delete; |
103 | 105 | ||
106 | virtual std::string GetDeviceName() { return "DSP"; } | ||
107 | |||
104 | protected: | 108 | protected: |
105 | bool BuildProgramFromBinary(const std::string &binary_filename, | 109 | bool BuildProgramFromBinary(const std::string &binary_filename, |
106 | cl_device_id device_ids[], | 110 | cl_device_id device_ids[], |
@@ -117,6 +121,8 @@ class EveDevice : public Device | |||
117 | EveDevice(const EveDevice&) = delete; | 121 | EveDevice(const EveDevice&) = delete; |
118 | EveDevice& operator=(const EveDevice&) = delete; | 122 | EveDevice& operator=(const EveDevice&) = delete; |
119 | 123 | ||
124 | virtual std::string GetDeviceName() { return "EVE"; } | ||
125 | |||
120 | protected: | 126 | protected: |
121 | bool BuildProgramFromBinary(const std::string &kernel_names, | 127 | bool BuildProgramFromBinary(const std::string &kernel_names, |
122 | cl_device_id device_ids[], | 128 | cl_device_id device_ids[], |
@@ -137,7 +143,8 @@ class Kernel | |||
137 | ~Kernel(); | 143 | ~Kernel(); |
138 | 144 | ||
139 | Kernel& RunAsync(); | 145 | Kernel& RunAsync(); |
140 | bool Wait(); | 146 | bool Wait(float *host_elapsed_ms = nullptr); |
147 | bool AddCallback(void *user_data); | ||
141 | 148 | ||
142 | private: | 149 | private: |
143 | cl_kernel kernel_m; | 150 | cl_kernel kernel_m; |