Add canny kernel
author Djordje Senicic <d-senicic1@ti.com>
Wed, 23 Mar 2016 11:20:49 +0000 (07:20 -0400)
committer Djordje Senicic <d-senicic1@ti.com>
Wed, 23 Mar 2016 11:20:49 +0000 (07:20 -0400)
src/gstdsp66videokernel.c
src/gstdsp66videokernel.h
src/kernels/oclconv/conv.cl
src/kernels/oclconv/oclconv.cpp

index 48c7764870e7f6c96362c4341c2cee5623b09adb..8174710084dabad2c4b53e0bfa2876600eba1c97 100644 (file)
@@ -68,6 +68,7 @@ static const GEnumValue dsp66_video_kerneltype[] = {
   {GST_DSP66_VIDEO_KERNELTYPE_MEDIAN, "Kernel median", "0"},
   {GST_DSP66_VIDEO_KERNELTYPE_SOBEL, "Kernel sobel", "1"},
   {GST_DSP66_VIDEO_KERNELTYPE_CONV,  "Kernel conv",  "2"},
+  {GST_DSP66_VIDEO_KERNELTYPE_CANNY, "Kernel canny", "3"},
   {0, NULL, NULL},
 };
 
index 9d79b1550635c4ed51c5c70e3f8cbd00c66c76c9..b78354bf926072a8d43eee4c264613e81560bffb 100644 (file)
@@ -52,7 +52,8 @@ typedef enum
 typedef enum {
   GST_DSP66_VIDEO_KERNELTYPE_MEDIAN = 0,
   GST_DSP66_VIDEO_KERNELTYPE_SOBEL  = 1,
-  GST_DSP66_VIDEO_KERNELTYPE_CONV   = 2
+  GST_DSP66_VIDEO_KERNELTYPE_CONV   = 2,
+  GST_DSP66_VIDEO_KERNELTYPE_CANNY  = 3  /* same number as the "3" nick in dsp66_video_kerneltype[]; presumably the value oclconv_kernel matches as "case 3" - confirm */
 } GstDsp66VideoKernelType;
 
 struct _GstDsp66VideoKernel {
index 6c86e52298b655debf3ea443d719b014a115d567..dab866e713dd2808d67e354d0888014a90b672f4 100644 (file)
 void IMG_median_3x3_8 (const unsigned char *restrict in_data, int cols, unsigned char * restrict out_data);
 void IMG_sobel_3x3_8  (const unsigned char *restrict in_data, unsigned char *restrict out_data, int rows, int cols);
 void IMG_conv_3x3_i8_c8s (const unsigned char *restrict in_data, unsigned char *restrict out_data, int cols, const char *restrict mask, int shift);
+/* VLIB routine declared here; its definition is outside the visible hunk. */
+void VLIB_Canny_Edge_Detection (ARGS);
+/* OpenCL kernel entry point: forwards its arguments, unchanged and in order, to
+ * VLIB_Canny_Edge_Detection. The host looks this kernel up by the name "canny_tiocl".
+ * NOTE(review): ARGS is not defined in the visible hunk - presumably a macro (or elided
+ * text) standing for the nine-parameter list used in the call below; confirm. */
+kernel void canny_tiocl(ARGS)
+{
+    VLIB_Canny_Edge_Detection(pInput, pBufGradX, pBufGradY, pBufMag, pBufOut, pScratch, numItems, width, height);
+}
 
 kernel void Median3x3(global const uchar* src, global uchar *dest,
                       const int width, const int height,
index c7b722c99f62207992d525497ec3b9d868c8f0e1..00ef898646608cb49cec7c263b2b39a19f0c829f 100644 (file)
@@ -37,6 +37,7 @@
 using namespace cl;
 using namespace std;
 
+/*----------------------------------------------------------------------------------------------------------------------*/
 static int oclconv_imgproc(char *kernelName, unsigned char *data_in, unsigned char *data_out, int width, int height, int sstride, int dstride)
 {
    cl_int err     = CL_SUCCESS;
@@ -47,6 +48,7 @@ static int oclconv_imgproc(char *kernelName, unsigned char *data_in, unsigned ch
    logfile << "Entered oclconv_test, width=" << width << " height=" << height << " dstride=" << dstride << " sstride=" << sstride << '\n';
    logfile.close();
 #endif
+
    try 
    {
      Context context(CL_DEVICE_TYPE_ACCELERATOR);
@@ -84,7 +86,81 @@ static int oclconv_imgproc(char *kernelName, unsigned char *data_in, unsigned ch
 #endif
    return 0;
 }
+/*----------------------------------------------------------------------------------------------------------------------*/
+static bool          canny_first_call = true;               /* cleared by ocl_canny at the start of its first call */
+static Context       canny_ctx(CL_DEVICE_TYPE_ACCELERATOR); /* constructed during static initialization, i.e. outside any try block */
+static CommandQueue *canny_Q;                               /* queue on the first device of canny_ctx; created with new in ocl_canny */
+static Buffer       *canny_gradX, *canny_gradY, *canny_mag, *canny_scratch, *canny_numItems; /* kernel args 1, 2, 3, 5 and 6 */
+static Kernel       *canny_K;                               /* the "canny_tiocl" kernel; the last cached object to be created */
+static Buffer       *canny_input, *canny_output;            /* kernel args 0 and 4: per-frame upload and read-back buffers */
+
+/******************************************************************************
+ * Canny Edge Detection - called on ARM, but algorithm dispatched to 1 DSP
+ *
+ * data_in / data_out: height*width bytes are uploaded from / read back into them.
+ * height, width     : frame size, height first (width -> kernel arg 7, height -> arg 8).
+ * Returns 0 on success, -1 on an OpenCL error or when the first-call setup did
+ *   not complete (later calls then fail cleanly instead of using NULL objects).
+ *
+ * Note: Assumes arguments are invariant from call 1 to call N. If this is
+ *   not the case, then move buffer creation back to the every frame section
+ *   rather than being cached in frame 0.
+ * Note: Also assumes total size is not overly large as it allocates temp
+ *       buffers in MSMC
+ *****************************************************************************/
+static int ocl_canny(unsigned char *data_in, unsigned char *data_out, unsigned short height, unsigned short width)
+{
+    const int numelem = (int)height * (int)width;
+    try
+    {
+        /* Cache as much OpenCL plumbing as possible on the first call, so the cost is not repeated for every frame. */
+        if (canny_first_call)
+        {
+            canny_first_call = false;
 
+            std::vector<Device> devices = canny_ctx.getInfo<CL_CONTEXT_DEVICES>();
+            devices.resize(1); // resize to 1 since we are only running on 1 DSP
+            canny_Q = new CommandQueue(canny_ctx, devices[0]);
+
+            canny_input    = new Buffer(canny_ctx, CL_MEM_READ_ONLY,  numelem);
+            canny_output   = new Buffer(canny_ctx, CL_MEM_WRITE_ONLY, numelem);
+            canny_gradX    = new Buffer(canny_ctx, CL_MEM_WRITE_ONLY, numelem*sizeof(short));
+            canny_gradY    = new Buffer(canny_ctx, CL_MEM_WRITE_ONLY, numelem*sizeof(short));
+            canny_mag      = new Buffer(canny_ctx, CL_MEM_WRITE_ONLY, numelem*sizeof(short));
+            canny_scratch  = new Buffer(canny_ctx, CL_MEM_WRITE_ONLY, numelem);
+            canny_numItems = new Buffer(canny_ctx, CL_MEM_WRITE_ONLY, sizeof(int));
+
+            /* Build the program from the embedded conv_dsp_bin image and fetch the kernel by name. */
+            Program::Binaries binary(1, make_pair(conv_dsp_bin, sizeof(conv_dsp_bin)));
+            Program program(canny_ctx, devices, binary);
+            program.build(devices);
+            canny_K = new Kernel(program, "canny_tiocl");
+
+            canny_K->setArg(0, *canny_input);
+            canny_K->setArg(1, *canny_gradX);
+            canny_K->setArg(2, *canny_gradY);
+            canny_K->setArg(3, *canny_mag);
+            canny_K->setArg(4, *canny_output);
+            canny_K->setArg(5, *canny_scratch);
+            canny_K->setArg(6, *canny_numItems);
+            canny_K->setArg(7, width);
+            canny_K->setArg(8, height);
+        }
+        /* Setup runs once only; if it threw part-way the cached pointers are still NULL, so fail instead of dereferencing them. */
+        if (canny_Q == NULL || canny_K == NULL) return -1;
+        /* Default (in-order) queue: the blocking read at the end also guarantees the non-blocking upload of data_in has finished. */
+        canny_Q->enqueueWriteBuffer(*canny_input, CL_FALSE, 0, numelem, data_in);
+        canny_Q->enqueueTask(*canny_K);
+        canny_Q->enqueueReadBuffer(*canny_output, CL_TRUE, 0, numelem, data_out);
+    }
+    catch (const cl::Error &err)
+    {
+        cerr << "ERROR: " << err.what() << "(" << err.err() << ")" << endl;
+        return -1;
+    }
+    return 0;
+}
+/*----------------------------------------------------------------------------------------------------------------------*/
 
 #ifdef __cplusplus
 extern "C" {
@@ -114,6 +190,10 @@ int oclconv_kernel(int kernel_type, int filter_size,
         return 0;
       }
       break;
+    case 3: /* vlib canny */
+      /* filter size is ignored */
+      retval = ocl_canny (data_in, data_out, height, width); /* ocl_canny takes (height, width) in that order; input and output stride assumed to be == width */
+      break;
     default:
       break;
   }