Simplify API for multiple contexts
authorAjay Jayaraj <ajayj@ti.com>
Mon, 15 Oct 2018 16:00:46 +0000 (11:00 -0500)
committerAjay Jayaraj <ajayj@ti.com>
Fri, 19 Oct 2018 21:46:52 +0000 (16:46 -0500)
1.  Simplify context API in ExecutionObject. Replace context_id variants to
multiple existing APIs with these two APIs:

        bool AcquireAndRunContext(uint32_t& context_idx,
                                  int frame_idx,
                                  const IODeviceArgInfo& in,
                                  const IODeviceArgInfo& out);

        bool WaitAndReleaseContext(uint32_t  context_idx);

2. The timing methods for host execution in EOPs and EOs:

* GetProcessTimeInMilliSeconds()
* GetHostProcessTimeInMilliSeconds()

are no longer accurate with multiple contexts and pipelining.

Replace these methods and replace with a generic timestamp
based approach. There is a single API call to enable time stamps in an
application:

//! Enable time stamp generation for TIDL API events
bool EnableTimeStamps(const std::string& file = "timestamp.log", size_t
num_frames=32);
If this method is called before TIDL API frame processing, the API will
generate timestamps for events corresponding to each frame (e.g.
EOP::ProcessFrameStartAsync, EOP::ProcessFrameWait, etc.). These
timestamps are then written to file when the user's application
completes.

A separate script is used for post-processing the time stamps and
generating data for the user.

(MCT-1073, MCT-1074)

17 files changed:
examples/classification/main.cpp
examples/common/utils.cpp
examples/common/utils.h
examples/imagenet/main.cpp
examples/segmentation/main.cpp
examples/ssd_multibox/main.cpp
examples/two_eo_per_frame/main.cpp
examples/two_eo_per_frame_opt/main.cpp
tidl_api/inc/execution_object.h
tidl_api/inc/execution_object_internal.h
tidl_api/inc/execution_object_pipeline.h
tidl_api/inc/executor.h
tidl_api/src/execution_object.cpp
tidl_api/src/execution_object_pipeline.cpp
tidl_api/src/ocl_device.cpp
tidl_api/src/util.cpp
tidl_api/src/util.h

index 9f6da3aba600d9804f20a1bd468f11ecad0606ef..f0473f4f6bb6c89508f653f4e90205404f55106d 100644 (file)
@@ -134,7 +134,6 @@ static void ProcessArgs(int argc, char *argv[],
                         std::string& config_file,
                         uint32_t & num_dsps, uint32_t &num_eves,
                         int & num_layers_groups);
-void ReportTime(const ExecutionObjectPipeline* eop);
 
 static void DisplayHelp();
 extern std::string labels_classes[];
@@ -232,9 +231,6 @@ bool RunConfiguration(const std::string& config_file, int num_layers_groups, uin
             // Wait for previous frame on the same eo to finish processing
             if (eop->ProcessFrameWait())
             {
-                 #ifdef PERF_VERBOSE
-                 ReportTime(eop);
-                 #endif
                  DisplayFrame(eop, writer, frame_idx, num_eops,
                               num_eves, num_dsps);
             }
@@ -525,24 +521,6 @@ bool ReadFrame(ExecutionObjectPipeline* eop, const Configuration& c,
     return false;
 }
 
-void ReportTime(const ExecutionObjectPipeline* eop)
-{
-    uint32_t frame_index    = eop->GetFrameIndex();
-    std::string device_name = eop->GetDeviceName();
-    float elapsed_host      = eop->GetHostProcessTimeInMilliSeconds();
-    float elapsed_device    = eop->GetProcessTimeInMilliSeconds();
-    double overhead         = 100 - (elapsed_device/elapsed_host*100);
-    std::cout << "frame[" << frame_index << "]: "
-              << "Time on " << device_name << ": "
-              << std::setw(6) << std::setprecision(4)
-              << elapsed_device << "ms, "
-              << "host: "
-              << std::setw(6) << std::setprecision(4)
-              << elapsed_host << "ms ";
-    std::cout << "API overhead: "
-              << std::setw(6) << std::setprecision(3)
-              << overhead << " %" << std::endl;
-}
 
 void DisplayFrame(const ExecutionObjectPipeline* eop, VideoWriter& writer,
                   uint32_t frame_idx, uint32_t num_eops,
index 43810a9773809e475188ee93ed2d1fcc6c2c7c72..4476c17a706022328d702ff565f5995a9d94589b 100644 (file)
@@ -122,26 +122,11 @@ bool WriteFrame(const ExecutionObject* eo, ostream& output_file)
 
 void ReportTime(const ExecutionObject* eo)
 {
-    double elapsed_host   = eo->GetHostProcessTimeInMilliSeconds();
     double elapsed_device = eo->GetProcessTimeInMilliSeconds();
-    double overhead = 100 - (elapsed_device/elapsed_host*100);
 
-    std::cout << format("frame[%3d]: Time on %s: %4.2f ms, host: %4.2f ms"
-                        " API overhead: %2.2f %%\n")
+    std::cout << format("frame[%3d]: Time on %s: %4.2f ms\n")
                         % eo->GetFrameIndex() % eo->GetDeviceName()
-                        % elapsed_device % elapsed_host % overhead;
-}
-
-void ReportTime(const ExecutionObjectPipeline* eop)
-{
-    double elapsed_host   = eop->GetHostProcessTimeInMilliSeconds();
-    double elapsed_device = eop->GetProcessTimeInMilliSeconds();
-    double overhead = 100 - (elapsed_device/elapsed_host*100);
-
-    std::cout << format("frame[%3d]: Time on %s: %4.2f ms, host: %4.2f ms"
-                        " API overhead: %2.2f %%\n")
-                        % eop->GetFrameIndex() % eop->GetDeviceName()
-                        % elapsed_device % elapsed_host % overhead;
+                        % elapsed_device;
 }
 
 // Compare output against reference output
index 57c570d31aede1d20d0e5e62081753d552304d13..3774d1eb2337e91f368db43661cf47260edbbf14 100644 (file)
@@ -58,7 +58,6 @@ bool ReadFrame(ExecutionObjectPipeline* eop,
 bool WriteFrame(const ExecutionObject* eo, std::ostream& output_file);
 
 void ReportTime(const ExecutionObject* eo);
-void ReportTime(const ExecutionObjectPipeline* eop);
 
 bool CheckFrame(const ExecutionObject* eo, const char *ref_output);
 bool CheckFrame(const ExecutionObjectPipeline *eop, const char *ref_output);
index 4cce811f52017279c0a7bed0898091acf2aa0e45..149f759ad50befebb136bf501490cb1cc0c9fb51 100644 (file)
@@ -194,7 +194,6 @@ bool RunConfiguration(cmdline_opts_t& opts)
             // Wait for previous frame on the same eop to finish processing
             if (eop->ProcessFrameWait())
             {
-                ReportTime(eop);
                 WriteFrameOutput(*eop);
             }
 
index 35294c4d444a302dbf203369317d7adde1313a1b..4e538b74cab714964810ef98bc8afabb6c292585 100644 (file)
@@ -201,7 +201,6 @@ bool RunConfiguration(const cmdline_opts_t& opts)
             // Wait for previous frame on the same eop to finish processing
             if (eop->ProcessFrameWait())
             {
-                ReportTime(eop);
                 WriteFrameOutput(*eop, c, opts);
             }
 
index 4bcc707984bae927bf2c521e45c2138a89061872..b9c57dbbdb4308929de827c596c808f10f3ccd6d 100644 (file)
@@ -210,7 +210,6 @@ bool RunConfiguration(const cmdline_opts_t& opts)
             // Wait for previous frame on the same eop to finish processing
             if (eop->ProcessFrameWait())
             {
-                ReportTime(eop);
                 WriteFrameOutput(*eop, c, opts);
             }
 
index 5cc618feec4515aeb02783e06ff6c3e1afa88e88..d1aa6dc7a5d403538e5cdbbedb9f5ae135847d48 100644 (file)
@@ -137,8 +137,6 @@ bool Run(int num_eve, int num_dsp, const char* ref_output)
             // Wait for previous frame on the same EOP to finish processing
             if (eop->ProcessFrameWait())
             {
-                ReportTime(eop);
-
                 // The reference output is valid only for the first frame
                 // processed on each EOP
                 if (frame_idx < num_eops && !CheckFrame(eop, ref_output))
index 8c67e556b5ee2b218ae861354fb1c4e72de64ae5..659485e1dbf7e1c053593a2bc4a2bad6d5cb6dd0 100644 (file)
@@ -140,8 +140,6 @@ bool Run(int num_eve, int num_dsp, const char* ref_output)
             // Wait for previous frame on the same EOP to finish processing
             if (eop->ProcessFrameWait())
             {
-                ReportTime(eop);
-
                 // The reference output is valid only for the first frame
                 // processed on each EOP
                 if (frame_idx < num_eops && !CheckFrame(eop, ref_output))
index d875d7b5e9244748c66db8e382ec31334d4f6a49..5abe33d15370315ebc388dbb49ef8626218d70a0 100644 (file)
@@ -66,13 +66,6 @@ class ExecutionObject : public ExecutionObjectInternalInterface
         void SetInputOutputBuffer(const ArgInfo& in,
                                   const ArgInfo& out) override;
 
-        //! Specify the input and output buffers used by the EO in a context
-        //! @param in buffer used for input.
-        //! @param out buffer used for output.
-        //! @param context_idx the index of the context
-        void SetInputOutputBuffer(const ArgInfo& in,
-                                  const ArgInfo& out, uint32_t context_idx);
-
         //! Returns a pointer to the input buffer set via SetInputOutputBuffer
         char* GetInputBufferPtr() const override;
 
@@ -97,45 +90,16 @@ class ExecutionObject : public ExecutionObjectInternalInterface
         //! returns immediately. Use ExecutionObject::ProcessFrameWait to wait
         bool ProcessFrameStartAsync() override;
 
-        //! @brief Start processing with a context. The call is asynchronous and
-        //! returns immediately. Use ExecutionObject::ProcessFrameWait to wait
-        //! @param context_idx the index of the context
-        bool ProcessFrameStartAsync(uint32_t context_idx);
-
         //! Wait for the execution object to complete processing a frame
         //! @return false if ExecutionObject::ProcessFrameWait was called
         //! without a corresponding call to
         //! ExecutionObject::ProcessFrameStartAsync.
         bool ProcessFrameWait() override;
 
-        //! Wait for the execution object to complete processing with a context
-        //! @param context_idx the index of the context
-        //! @return false if ExecutionObject::ProcessFrameWait was called
-        //! without a corresponding call to
-        //! ExecutionObject::ProcessFrameStartAsync.
-        bool ProcessFrameWait(uint32_t context_idx);
-
         //! @brief return the number of milliseconds taken *on the device* to
         //! execute the process call
         //! @return Number of milliseconds to process a frame on the device.
-        float GetProcessTimeInMilliSeconds() const override;
-
-        //! @brief return the number of milliseconds taken *on the device* to
-        //! execute the process call with a contex
-        //! @param context_idx the index of the context
-        //! @return Number of milliseconds to process a frame on the device.
-        float GetProcessTimeInMilliSeconds(uint32_t context_idx) const;
-
-        //! @brief return the number of milliseconds taken *on the host* to
-        //! execute the process call
-        //! @return Number of milliseconds to process a frame on the host.
-        float GetHostProcessTimeInMilliSeconds() const override;
-
-        //! @brief return the number of milliseconds taken *on the host* to
-        //! execute the process call with a contex
-        //! @param context_idx the index of the context
-        //! @return Number of milliseconds to process a frame on the host.
-        float GetHostProcessTimeInMilliSeconds(uint32_t context_idx) const;
+        float GetProcessTimeInMilliSeconds() const;
 
         //! Returns the device name that the ExecutionObject runs on
         const std::string& GetDeviceName() const override;
@@ -169,17 +133,17 @@ class ExecutionObject : public ExecutionObjectInternalInterface
         //! @private
         // Used by the ExecutionObjectPipeline
         bool AddCallback(CallType ct, void *user_data, uint32_t context_idx);
-        void AcquireContext(uint32_t& context_idx);
-        void ReleaseContext(uint32_t  context_idx);
+        bool AcquireAndRunContext(uint32_t& context_idx,
+                                  int frame_idx,
+                                  const IODeviceArgInfo& in,
+                                  const IODeviceArgInfo& out);
+
+        bool WaitAndReleaseContext(uint32_t  context_idx);
 
         ExecutionObject()                                  = delete;
         ExecutionObject(const ExecutionObject&)            = delete;
         ExecutionObject& operator=(const ExecutionObject&) = delete;
 
-        //! @private
-        void SetInputOutputBuffer(const IODeviceArgInfo* in,
-                             const IODeviceArgInfo* out, uint32_t context_idx);
-
     private:
         class Impl;
         std::unique_ptr<Impl> pimpl_m;
index 816da94adeffa9b45810646a8a683af75708a500..e9da88b7249c0c4ce3ad31f5b6754ada73370aee 100644 (file)
@@ -83,16 +83,6 @@ class ExecutionObjectInternalInterface
         //! ExecutionObject::ProcessFrameStartAsync.
         virtual bool ProcessFrameWait() =0;
 
-        //! @brief return the number of milliseconds taken *on the device* to
-        //! execute the process call
-        //! @return Number of milliseconds to process a frame on the device.
-        virtual float GetProcessTimeInMilliSeconds() const =0;
-
-        //! @brief return the number of milliseconds taken *on the host* to
-        //! execute the process call
-        //! @return Number of milliseconds to process a frame on the host.
-        virtual float GetHostProcessTimeInMilliSeconds() const =0;
-
         //! Returns the device name that the ExecutionObject runs on
         virtual const std::string& GetDeviceName() const =0;
 
index b31fec6ab94882222e0cc660912394b715960b00..0787d0e9dbc1a4eb165b09f84d737f4efc778f54 100644 (file)
@@ -107,28 +107,6 @@ class ExecutionObjectPipeline : public ExecutionObjectInternalInterface
         //! ExecutionObjectPipeline::ProcessFrameStartAsync().
         bool ProcessFrameWait() override;
 
-        //! @brief return the number of milliseconds taken *on the device* to
-        //! execute the process call
-        //! @return Number of milliseconds to process a frame on the device.
-        float GetProcessTimeInMilliSeconds() const override;
-
-        //! @brief return the number of milliseconds taken *on the device*
-        //! to process a layersGroup by a componenet ExecutionObject
-        //! @return Number of milliseconds to process a layersGroup on the
-        //! device by a component ExecutionObject.
-        float GetProcessTimeInMilliSeconds(uint32_t eo_index) const;
-
-        //! @brief return the number of milliseconds taken *on the host* to
-        //! execute the process call
-        //! @return Number of milliseconds to process a frame on the host.
-        float GetHostProcessTimeInMilliSeconds() const override;
-
-        //! @brief return the number of milliseconds taken *on the host*
-        //! to process a layersGroup by a componenet ExecutionObject
-        //! @return Number of milliseconds to process a layersGroup on the
-        //! host by a component ExecutionObject.
-        float GetHostProcessTimeInMilliSeconds(uint32_t eo_index) const;
-
         //! Return the combined device names that this pipeline runs on
         const std::string& GetDeviceName() const override;
 
index faa637a767927ffa7a36b6a3e8792a468f913d13..11e6aa3fa32c75ea199e5a522571e521548d485a 100644 (file)
@@ -173,6 +173,10 @@ inline T* malloc_ddr(size_t size)
     return val;
 }
 
+//! Enable time stamp generation for TIDL API events
+bool EnableTimeStamps(const std::string& file = "timestamp.log",
+                      size_t num_frames=32);
+
 /*! @class Exception
  *  @brief Used to error reporting
  */
index 9bc9f05d61f1cd06574f986fc175ef95c1347b7c..b6292da6761a35f962707839f01f7065a39d09cc 100644 (file)
@@ -42,6 +42,7 @@
 #include "common_defines.h"
 #include "tidl_create_params.h"
 #include "device_arginfo.h"
+#include "util.h"
 
 using namespace tidl;
 
@@ -79,10 +80,10 @@ class ExecutionObject::Impl
         IODeviceArgInfo                 out_m[tidl::internal::NUM_CONTEXTS];
 
         // Frame being processed by the EO
-        int                             current_frame_idx_m;
+        int current_frame_idx_m[tidl::internal::NUM_CONTEXTS];
 
         // LayersGroupId being processed by the EO
-        int                             layers_group_id_m;
+        int layers_group_id_m;
 
         // Trace related
         void WriteLayerOutputsToFile (const std::string& filename_prefix) const;
@@ -95,9 +96,6 @@ class ExecutionObject::Impl
         up_malloc_ddr<OCL_TIDL_BufParams> trace_buf_params_m;
         size_t                            trace_buf_params_sz_m;
 
-        // host time tracking: eo start to finish
-        float host_time_m[tidl::internal::NUM_CONTEXTS];
-
     private:
         void SetupInitializeKernel(const DeviceArgInfo& create_arg,
                                    const DeviceArgInfo& param_heap_arg);
@@ -156,7 +154,6 @@ ExecutionObject::Impl::Impl(Device* d, uint8_t device_index,
     shared_process_params_m(nullptr, &__free_ddr),
     in_size_m(0),
     out_size_m(0),
-    current_frame_idx_m(0),
     layers_group_id_m(layers_group_id),
     num_network_layers_m(0),
     trace_buf_params_m(nullptr, &__free_ddr),
@@ -208,60 +205,29 @@ size_t ExecutionObject::GetOutputBufferSizeInBytes() const
 
 void  ExecutionObject::SetFrameIndex(int idx)
 {
-    pimpl_m->current_frame_idx_m = idx;
+    pimpl_m->current_frame_idx_m[0] = idx;
 }
 
 int ExecutionObject::GetFrameIndex() const
 {
-    return pimpl_m->current_frame_idx_m;
+    return pimpl_m->current_frame_idx_m[0];
 }
 
 void ExecutionObject::SetInputOutputBuffer(const ArgInfo& in,
                                            const ArgInfo& out)
 {
-    SetInputOutputBuffer(in, out, 0);
-}
-
-void ExecutionObject::SetInputOutputBuffer(const ArgInfo& in,
-                                      const ArgInfo& out, uint32_t context_idx)
-{
-    assert(in.ptr()  != nullptr && in.size()  >= pimpl_m->in_size_m);
-    assert(out.ptr() != nullptr && out.size() >= pimpl_m->out_size_m);
-
-    pimpl_m->in_m[context_idx]  = IODeviceArgInfo(in);
-    pimpl_m->out_m[context_idx] = IODeviceArgInfo(out);
-}
-
-void ExecutionObject::SetInputOutputBuffer(const IODeviceArgInfo* in,
-                                           const IODeviceArgInfo* out,
-                                           uint32_t context_idx)
-{
-    pimpl_m->in_m[context_idx]  = *in;
-    pimpl_m->out_m[context_idx] = *out;
+    pimpl_m->in_m[0]  = IODeviceArgInfo(in);
+    pimpl_m->out_m[0] = IODeviceArgInfo(out);
 }
 
 bool ExecutionObject::ProcessFrameStartAsync()
 {
-    return ProcessFrameStartAsync(0);
-}
-
-bool ExecutionObject::ProcessFrameStartAsync(uint32_t context_idx)
-{
-    TRACE::print("-> ExecutionObject::ProcessFrameStartAsync(%d)\n",
-                 context_idx);
-    assert(GetInputBufferPtr() != nullptr && GetOutputBufferPtr() != nullptr);
-    return pimpl_m->RunAsync(ExecutionObject::CallType::PROCESS, context_idx);
+    return pimpl_m->RunAsync(ExecutionObject::CallType::PROCESS, 0);
 }
 
 bool ExecutionObject::ProcessFrameWait()
 {
-    return ProcessFrameWait(0);
-}
-
-bool ExecutionObject::ProcessFrameWait(uint32_t context_idx)
-{
-    TRACE::print("-> ExecutionObject::ProcessFrameWait(%d)\n", context_idx);
-    return pimpl_m->Wait(ExecutionObject::CallType::PROCESS, context_idx);
+    return pimpl_m->Wait(ExecutionObject::CallType::PROCESS, 0);
 }
 
 bool ExecutionObject::RunAsync (CallType ct)
@@ -274,31 +240,44 @@ bool ExecutionObject::Wait (CallType ct)
     return pimpl_m->Wait(ct, 0);
 }
 
-bool ExecutionObject::AddCallback(CallType ct, void *user_data,
-                                  uint32_t context_idx)
-{
-    return pimpl_m->AddCallback(ct, user_data, context_idx);
-}
 
-float ExecutionObject::GetProcessTimeInMilliSeconds() const
+bool ExecutionObject::AcquireAndRunContext(uint32_t& context_idx,
+                                          int frame_idx,
+                                          const IODeviceArgInfo& in,
+                                          const IODeviceArgInfo& out)
 {
-    return GetProcessTimeInMilliSeconds(0);
+    pimpl_m->AcquireContext(context_idx);
+
+    pimpl_m->current_frame_idx_m[context_idx] = frame_idx;
+    pimpl_m->in_m[context_idx]  = in;
+    pimpl_m->out_m[context_idx] = out;
+
+    return pimpl_m->RunAsync(ExecutionObject::CallType::PROCESS,
+                                    context_idx);
 }
 
-float ExecutionObject::GetProcessTimeInMilliSeconds(uint32_t context_idx) const
+bool ExecutionObject::WaitAndReleaseContext(uint32_t context_idx)
 {
-    float frequency = pimpl_m->device_m->GetFrequencyInMhz() * 1000000;
-    return ((float)pimpl_m->GetProcessCycles(context_idx)) / frequency * 1000;
+    TRACE::print("-> ExecutionObject::WaitAndReleaseContext(%d)\n",
+                 context_idx);
+
+    bool status = pimpl_m->Wait(ExecutionObject::CallType::PROCESS,
+                                context_idx);
+    pimpl_m->ReleaseContext(context_idx);
+
+    return status;
 }
 
-float ExecutionObject::GetHostProcessTimeInMilliSeconds() const
+bool ExecutionObject::AddCallback(CallType ct, void *user_data,
+                                  uint32_t context_idx)
 {
-    return GetHostProcessTimeInMilliSeconds(0);
+    return pimpl_m->AddCallback(ct, user_data, context_idx);
 }
 
-float ExecutionObject::GetHostProcessTimeInMilliSeconds(uint32_t context_idx) const
+float ExecutionObject::GetProcessTimeInMilliSeconds() const
 {
-    return pimpl_m->host_time_m[context_idx];
+    float frequency = pimpl_m->device_m->GetFrequencyInMhz() * 1000000;
+    return ((float)pimpl_m->GetProcessCycles(0)) / frequency * 1000;
 }
 
 void
@@ -328,15 +307,6 @@ const std::string& ExecutionObject::GetDeviceName() const
     return pimpl_m->device_name_m;
 }
 
-void ExecutionObject::AcquireContext(uint32_t& context_idx)
-{
-    pimpl_m->AcquireContext(context_idx);
-}
-
-void ExecutionObject::ReleaseContext(uint32_t context_idx)
-{
-    pimpl_m->ReleaseContext(context_idx);
-}
 
 //
 // Create a kernel to call the "initialize" function
@@ -586,7 +556,6 @@ void ExecutionObject::Impl::ComputeInputOutputSizes()
     }
 }
 
-
 bool ExecutionObject::Impl::RunAsync(CallType ct, uint32_t context_idx)
 {
     switch (ct)
@@ -598,12 +567,13 @@ bool ExecutionObject::Impl::RunAsync(CallType ct, uint32_t context_idx)
         }
         case CallType::PROCESS:
         {
-            std::chrono::time_point<std::chrono::steady_clock> t1, t2;
-            t1 = std::chrono::steady_clock::now();
+            RecordEvent(current_frame_idx_m[context_idx],
+                        (layers_group_id_m == 1) ? TimeStamp::EO1_PFSA_START:
+                                                   TimeStamp::EO2_PFSA_START);
 
             OCL_TIDL_ProcessParams *p_params = shared_process_params_m.get()
                                                + context_idx;
-            p_params->frameIdx = current_frame_idx_m;
+            p_params->frameIdx = current_frame_idx_m[context_idx];
             HostWriteNetInput(context_idx);
             {
                 std::unique_lock<std::mutex> lock(mutex_access_m);
@@ -611,9 +581,9 @@ bool ExecutionObject::Impl::RunAsync(CallType ct, uint32_t context_idx)
                 k_process_m->RunAsync(context_idx);
             }
 
-            t2 = std::chrono::steady_clock::now();
-            std::chrono::duration<float> elapsed = t2 - t1;
-            host_time_m[context_idx] = elapsed.count() * 1000;
+            RecordEvent(current_frame_idx_m[context_idx],
+                        (layers_group_id_m == 1) ?  TimeStamp::EO1_PFSA_END:
+                                                    TimeStamp::EO2_PFSA_END);
             break;
         }
         case CallType::CLEANUP:
@@ -648,7 +618,14 @@ bool ExecutionObject::Impl::Wait(CallType ct, uint32_t context_idx)
         case CallType::PROCESS:
         {
             float host_elapsed_ms = 0.0f;
-            bool has_work = k_process_m->Wait(&host_elapsed_ms);
+            float *p_host_elapsed_ms = tidl::internal::NUM_CONTEXTS == 1 ?
+                                       &host_elapsed_ms : nullptr;
+
+            RecordEvent(current_frame_idx_m[context_idx],
+                        (layers_group_id_m == 1) ? TimeStamp::EO1_PFW_START:
+                                                   TimeStamp::EO2_PFW_START);
+
+            bool has_work = k_process_m->Wait(p_host_elapsed_ms, context_idx);
             if (has_work)
             {
                 OCL_TIDL_ProcessParams *p_params = shared_process_params_m.get()
@@ -657,14 +634,13 @@ bool ExecutionObject::Impl::Wait(CallType ct, uint32_t context_idx)
                     throw Exception(p_params->errorCode,
                                     __FILE__, __FUNCTION__, __LINE__);
 
-                std::chrono::time_point<std::chrono::steady_clock> t1, t2;
-                t1 = std::chrono::steady_clock::now();
                 HostReadNetOutput(context_idx);
-                t2 = std::chrono::steady_clock::now();
-                std::chrono::duration<float> elapsed = t2 - t1;
-                host_time_m += elapsed.count() * 1000 + host_elapsed_ms;
             }
 
+            RecordEvent(current_frame_idx_m[context_idx],
+                        (layers_group_id_m == 1) ? TimeStamp::EO1_PFW_END:
+                                                   TimeStamp::EO2_PFW_END);
+
             return has_work;
         }
         case CallType::CLEANUP:
index 279d1ed7e9c0262807110aa81533ce5283b26208..30856e290078d7417b6cc0d743c61ea696c9e16a 100644 (file)
 #include <assert.h>
 #include <mutex>
 #include <condition_variable>
-#include <chrono>
 #include "device_arginfo.h"
 #include "execution_object_pipeline.h"
 #include "parameters.h"
+#include "util.h"
 
 using namespace tidl;
 
@@ -56,8 +56,6 @@ class ExecutionObjectPipeline::Impl
         //! for pipelined execution
         std::vector<ExecutionObject*> eos_m;
         std::vector<IODeviceArgInfo*> iobufs_m;
-        std::vector<float> eo_device_time_m;
-        std::vector<float> eo_host_time_m;
 
         std::string device_name_m;
 
@@ -68,10 +66,6 @@ class ExecutionObjectPipeline::Impl
         uint32_t curr_eo_idx_m;
         uint32_t curr_eo_context_idx_m;
 
-        // device and host time tracking: pipeline start to finish
-        float device_time_m;
-        float host_time_m;
-
     private:
         //! @brief Initialize ExecutionObjectPipeline with given
         //! ExecutionObjects: check consecutive layersGroup, allocate memory
@@ -81,9 +75,6 @@ class ExecutionObjectPipeline::Impl
         bool has_work_m, is_processed_m;
         std::mutex mutex_m;
         std::condition_variable cv_m;
-
-        // host time tracking: pipeline start to finish
-        std::chrono::time_point<std::chrono::steady_clock> start_m;
 };
 
 ExecutionObjectPipeline::ExecutionObjectPipeline(
@@ -148,11 +139,15 @@ int ExecutionObjectPipeline::GetFrameIndex() const
 
 bool ExecutionObjectPipeline::ProcessFrameStartAsync()
 {
+    RecordEvent(pimpl_m->frame_idx_m, TimeStamp::EOP_PFSA_START);
+
     assert(GetInputBufferPtr() != nullptr && GetOutputBufferPtr() != nullptr);
     bool st = pimpl_m->RunAsyncStart();
     if (st)
         st = pimpl_m->eos_m[0]->AddCallback(ExecutionObject::CallType::PROCESS,
                                          this, pimpl_m->curr_eo_context_idx_m);
+
+    RecordEvent(pimpl_m->frame_idx_m, TimeStamp::EOP_PFSA_END);
     return st;
 }
 
@@ -163,7 +158,12 @@ bool ExecutionObjectPipeline::ProcessFrameWait()
 
 void CallbackWrapper(void *user_data)
 {
+    int frame_index = ((ExecutionObjectPipeline *) user_data)->GetFrameIndex();
+    RecordEvent(frame_index, TimeStamp::EOP_RAN_START);
+
     ((ExecutionObjectPipeline *) user_data)->RunAsyncNext();
+
+    RecordEvent(frame_index, TimeStamp::EOP_RAN_END);
 }
 
 void ExecutionObjectPipeline::RunAsyncNext()
@@ -175,30 +175,6 @@ void ExecutionObjectPipeline::RunAsyncNext()
                                      pimpl_m->curr_eo_context_idx_m);
 }
 
-float ExecutionObjectPipeline::GetProcessTimeInMilliSeconds() const
-{
-    return pimpl_m->device_time_m;
-}
-
-float ExecutionObjectPipeline::GetHostProcessTimeInMilliSeconds() const
-{
-    return pimpl_m->host_time_m;
-}
-
-float ExecutionObjectPipeline::GetProcessTimeInMilliSeconds(
-        uint32_t eo_index) const
-{
-    assert(eo_index < pimpl_m->eos_m.size());
-    return pimpl_m->eo_device_time_m[eo_index];
-}
-
-float ExecutionObjectPipeline::GetHostProcessTimeInMilliSeconds(
-        uint32_t eo_index) const
-{
-    assert(eo_index < pimpl_m->eos_m.size());
-    return pimpl_m->eo_host_time_m[eo_index];
-}
-
 const std::string& ExecutionObjectPipeline::GetDeviceName() const
 {
     return pimpl_m->device_name_m;
@@ -275,11 +251,6 @@ void ExecutionObjectPipeline::Impl::Initialize()
         ArgInfo out(ptr, size);
         iobufs_m.push_back(new IODeviceArgInfo(out));
     }
-
-    // Record keeping for each EO's device time and host time
-    // because EO could be shared by another EOP
-    eo_device_time_m.resize(eos_m.size());
-    eo_host_time_m.resize(eos_m.size());
 }
 
 ExecutionObjectPipeline::Impl::~Impl()
@@ -302,54 +273,39 @@ void ExecutionObjectPipeline::Impl::SetInputOutputBuffer(const ArgInfo &in,
     iobufs_m.back()  = new IODeviceArgInfo(out);
 }
 
+// Start execution on the first EO in the pipeline. Callbacks are used
+// to trigger execution on subsequent EOs
 bool ExecutionObjectPipeline::Impl::RunAsyncStart()
 {
     has_work_m = true;
     is_processed_m = false;
-    device_time_m = 0.0f;
-    host_time_m = 0.0f;
     curr_eo_idx_m = 0;
-    eos_m[0]->AcquireContext(curr_eo_context_idx_m);
-    if (tidl::internal::NUM_CONTEXTS == 1)
-        start_m = std::chrono::steady_clock::now();
-    eos_m[0]->SetFrameIndex(frame_idx_m);
-    eos_m[0]->SetInputOutputBuffer(iobufs_m[0], iobufs_m[1],
-                                   curr_eo_context_idx_m);
-    return eos_m[0]->ProcessFrameStartAsync(curr_eo_context_idx_m);
+    return eos_m[0]->AcquireAndRunContext(curr_eo_context_idx_m,
+                                          frame_idx_m,
+                                          *iobufs_m[0], *iobufs_m[1]);
 }
 
+// Invoked via the callback function, CallbackWrapper. Used to advance the
+// pipeline.
 // returns true if we have more EOs to execute
 bool ExecutionObjectPipeline::Impl::RunAsyncNext()
 {
-    eos_m[curr_eo_idx_m]->ProcessFrameWait(curr_eo_context_idx_m);
-    // need to capture EO's device/host time before we release its lock
-    eo_device_time_m[curr_eo_idx_m] = eos_m[curr_eo_idx_m]->
-                           GetProcessTimeInMilliSeconds(curr_eo_context_idx_m);
-    eo_host_time_m[curr_eo_idx_m]   = eos_m[curr_eo_idx_m]->
-                       GetHostProcessTimeInMilliSeconds(curr_eo_context_idx_m);
-    device_time_m += eo_device_time_m[curr_eo_idx_m];
-    if (tidl::internal::NUM_CONTEXTS > 1)
-        host_time_m += eo_host_time_m[curr_eo_idx_m];
-    eos_m[curr_eo_idx_m]->ReleaseContext(curr_eo_context_idx_m);
+    eos_m[curr_eo_idx_m]->WaitAndReleaseContext(curr_eo_context_idx_m);
     curr_eo_idx_m += 1;
     if (curr_eo_idx_m < eos_m.size())
     {
-        eos_m[curr_eo_idx_m]->AcquireContext(curr_eo_context_idx_m);
-        eos_m[curr_eo_idx_m]->SetFrameIndex(frame_idx_m);
-        eos_m[curr_eo_idx_m]->SetInputOutputBuffer(iobufs_m[curr_eo_idx_m],
-                          iobufs_m[curr_eo_idx_m+1], curr_eo_context_idx_m);
-        eos_m[curr_eo_idx_m]->ProcessFrameStartAsync(curr_eo_context_idx_m);
+        eos_m[curr_eo_idx_m]->AcquireAndRunContext(curr_eo_context_idx_m,
+                                                   frame_idx_m,
+                                                   *iobufs_m[curr_eo_idx_m],
+                                                   *iobufs_m[curr_eo_idx_m+1]);
         return true;
     }
     else
     {
-        if (tidl::internal::NUM_CONTEXTS == 1)
         {
-            std::chrono::duration<float> elapsed =
-                                    std::chrono::steady_clock::now() - start_m;
-            host_time_m = elapsed.count() * 1000;  // seconds to milliseconds
+            std::lock_guard<std::mutex> lock(mutex_m);
+            is_processed_m = true;
         }
-        is_processed_m = true;
         cv_m.notify_all();
         return false;
     }
@@ -359,9 +315,14 @@ bool ExecutionObjectPipeline::Impl::Wait()
 {
     if (! has_work_m)  return false;
 
+    RecordEvent(frame_idx_m, TimeStamp::EOP_PFW_START);
+
     std::unique_lock<std::mutex> lock(mutex_m);
     cv_m.wait(lock, [this]{ return this->is_processed_m; });
     has_work_m = false;
+
+    RecordEvent(frame_idx_m, TimeStamp::EOP_PFW_END);
+
     return true;
 }
 
index 508c5498a352d43ab99c24390ffbd9010d3d1d88..74e1150263b6d4a0ca1dcd3225e6de16e26aae5d 100644 (file)
@@ -187,7 +187,8 @@ EveDevice::EveDevice(const DeviceIds& ids, const std::string &kernel_names):
         int index = static_cast<int>(id);
         queue_m[index] = clCreateCommandQueue(context_m,
                                       all_device_ids[index],
-                                      CL_QUEUE_PROFILING_ENABLE,
+                                      CL_QUEUE_PROFILING_ENABLE|
+                                      CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
                                       &errcode);
         errorCheck(errcode, __LINE__);
     }
index 239b94399cfcc7719dc5182d207a4ffffd58a619..711b289ec4977a952f8b9979151fe0a8dc4387b9 100644 (file)
 #include <iostream>
 #include <fstream>
 #include <assert.h>
+#include <chrono>
+#include <mutex>
+#include <cstring>
+#include <memory>
+#include "executor.h"
 
 using namespace tidl;
 
+using namespace std::chrono;
+
+std::unique_ptr<TimeStamp> tidl_api_timestamps(nullptr);
+
+bool tidl::EnableTimeStamps(const std::string& file, size_t num_frames)
+{
+    std::mutex m;
+    std::lock_guard<std::mutex> guard(m);
+
+    if (tidl_api_timestamps.get() != nullptr)
+        return true;
+
+    tidl_api_timestamps.reset(new TimeStamp(file, num_frames));
+    if (tidl_api_timestamps.get() == nullptr)
+        return false;
+
+    return true;
+}
+
+TimeStamp::TimeStamp(const std::string& file, int num_entries):
+                        num_entries_m(num_entries), file_m(file)
+{
+    entries_m = new Entry[num_entries_m];
+    std::memset(entries_m, 0, sizeof(Entry)*num_entries_m);
+}
+
+void TimeStamp::Update(int frame_idx, EventKind k)
+{
+    int idx = frame_idx % num_entries_m;
+    entries_m[idx].frame_idx = frame_idx;
+    entries_m[idx].timestamp[k] = duration_cast<microseconds>
+                 (high_resolution_clock::now().time_since_epoch()).count();
+}
+
+TimeStamp::~TimeStamp()
+{
+    std::ofstream ofs;
+    ofs.open(file_m, std::ofstream::out);
+
+    for (int i=0; i < num_entries_m; i++)
+        for (int j=0; j < EventKind::NUM_EVENTS; j++)
+            if (entries_m[i].timestamp[j] != 0)
+            {
+                ofs << entries_m[i].frame_idx << ",";
+                switch (j)
+                {
+                    case EOP_PFSA_START: ofs << "EOP:PFSA:Start"; break;
+                    case EOP_PFSA_END:   ofs << "EOP:PFSA:End"; break;
+                    case EOP_PFW_START:  ofs << "EOP:PFW:Start"; break;
+                    case EOP_PFW_END:    ofs << "EOP:PFW:End"; break;
+                    case EOP_RAN_START:  ofs << "EOP:RAN:Start"; break;
+                    case EOP_RAN_END:    ofs << "EOP:RAN:End"; break;
+
+                    case EO1_PFSA_START: ofs << "EO1:PFSA:Start"; break;
+                    case EO1_PFSA_END:   ofs << "EO1:PFSA:End"; break;
+                    case EO1_PFW_START:  ofs << "EO1:PFW:Start"; break;
+                    case EO1_PFW_END:    ofs << "EO1:PFW:End"; break;
+
+                    case EO2_PFSA_START: ofs << "EO2:PFSA:Start"; break;
+                    case EO2_PFSA_END:   ofs << "EO2:PFSA:End"; break;
+                    case EO2_PFW_START:  ofs << "EO2:PFW:Start"; break;
+                    case EO2_PFW_END:    ofs << "EO2:PFW:End"; break;
+
+                    default:             ofs << "UNKNOWN"; break;
+                }
+                ofs << "," << entries_m[i].timestamp[j] << std::endl;
+            }
+
+    ofs.close();
+
+    delete [] entries_m;
+}
+
+
+void tidl::RecordEvent(int frame_idx, TimeStamp::EventKind k)
+{
+    TimeStamp* t = tidl_api_timestamps.get();
+    if (t)
+        t->Update(frame_idx, k);
+}
+
 std::size_t tidl::GetBinaryFileSize(const std::string &F)
 {
     std::ifstream is;
index 10b9ef82cd7fa1911331a5e6001267fdf88ba93d..d40d4c9be780b676b14287e3934f9c7181c22f1f 100644 (file)
@@ -40,4 +40,35 @@ bool        CompareFiles      (const std::string &F1, const std::string &F2);
 bool        CompareFrames(const std::string &F1, const std::string &F2,
                          int numFrames, int width, int height);
 
+class TimeStamp
+{
+    public:
+        enum EventKind { EOP_PFSA_START=0, EOP_PFSA_END,
+                         EOP_PFW_START,    EOP_PFW_END,
+                         EOP_RAN_START,    EOP_RAN_END,
+                         EO1_PFSA_START,   EO1_PFSA_END,
+                         EO1_PFW_START,    EO1_PFW_END,
+                         EO2_PFSA_START,   EO2_PFSA_END,
+                         EO2_PFW_START,    EO2_PFW_END,
+                         NUM_EVENTS };
+        struct Entry
+        {
+            unsigned long long frame_idx;
+            unsigned long long timestamp[EventKind::NUM_EVENTS];
+        };
+
+
+        TimeStamp(const std::string& file, int num_entries);
+        ~TimeStamp();
+        void Update(int frame_idx, EventKind k);
+
+    private:
+        Entry*            entries_m;
+        const int         num_entries_m;
+        const std::string file_m;
+};
+
+
+void RecordEvent(int frame_idx, TimeStamp::EventKind k);
+
 } // namespace tidl