1 #include <cmath>
2 #include <cstdlib>
3 #include <cstring>
5 #include <device_functions.h>
7 #include "caffeine/common.hpp"
8 #include "caffeine/util/im2col.hpp"
10 namespace caffeine {
template <typename Dtype>
__global__ void im2col_gpu_kernel(const int n, const Dtype* data_im,
    const int height, const int width, const int ksize,
    const int stride, const int height_col, const int width_col, Dtype* data_col) {
  // One thread per output column position: n == channels * height_col *
  // width_col. Each thread copies one ksize x ksize input patch into the
  // column buffer. Expects a 1-D launch; no padding is applied.
  const int index = threadIdx.x + blockIdx.x * blockDim.x;
  if (index >= n) {
    return;  // guard against the partially-filled last block
  }
  // Decompose the flat index into (channel_in, h_out, w_out).
  const int w_out = index % width_col;
  const int h_out = (index / width_col) % height_col;
  const int channel_in = index / width_col / height_col;
  // Each input channel expands into ksize * ksize rows of the column buffer.
  const int channel_out = channel_in * ksize * ksize;
  // Top-left corner of the source patch in the input image.
  const int h_in = h_out * stride;
  const int w_in = w_out * stride;
  const Dtype* im_ptr = data_im + (channel_in * height + h_in) * width + w_in;
  Dtype* col_ptr = data_col +
      (channel_out * height_col + h_out) * width_col + w_out;
  // Successive patch elements are height_col * width_col apart in data_col.
  for (int i = 0; i < ksize; ++i) {
    for (int j = 0; j < ksize; ++j) {
      *col_ptr = im_ptr[i * width + j];
      col_ptr += height_col * width_col;
    }
  }
}
template <typename Dtype>
void im2col_gpu(const Dtype* data_im, const int channels,
    const int height, const int width, const int ksize, const int stride,
    Dtype* data_col) {
  // Launch channels * height_col * width_col threads, each responsible for
  // copying one ksize x ksize patch into the column buffer.
  // Output size with no padding; assumes height >= ksize and width >= ksize.
  const int height_col = (height - ksize) / stride + 1;
  const int width_col = (width - ksize) / stride + 1;
  const int num_kernels = channels * height_col * width_col;
  im2col_gpu_kernel<Dtype>
      <<<CAFFEINE_GET_BLOCKS(num_kernels), CAFFEINE_CUDA_NUM_THREADS>>>(
          num_kernels, data_im, height, width, ksize, stride, height_col,
          width_col, data_col);
  // Surface any launch-configuration error from the asynchronous launch.
  CUDA_POST_KERNEL_CHECK;
}
// Explicit instantiation for the element types the library supports (float
// and double), so the template definitions can live in this .cu file.
template void im2col_gpu<float>(const float* data_im, const int channels,
    const int height, const int width, const int ksize, const int stride,
    float* data_col);
template void im2col_gpu<double>(const double* data_im, const int channels,
    const int height, const int width, const int ksize, const int stride,
    double* data_col);
60 /*
61 // A bunch of stuff dealing with double atomic add
62 template <typename Dtype>
63 __device__ inline Dtype MyAtomicAdd(Dtype* address, Dtype val);
65 template <>
66 __device__ float MyAtomicAdd<float>(float* address, float val) {
67 return atomicAdd(address, val);
68 }
69 template <>
70 __device__ double MyAtomicAdd<double>(double* address, double val)
71 {
72 unsigned long long int* address_as_ull = (unsigned long long int*)address;
73 unsigned long long int old = *address_as_ull, assumed;
74 do {
75 assumed = old;
76 old = atomicCAS(address_as_ull, assumed,__double_as_longlong(val +
77 __longlong_as_double(assumed)));
78 } while (assumed != old);
79 return __longlong_as_double(old);
80 }
82 template <typename Dtype>
83 __global__ void col2im_gpu_kernel(const int n, const Dtype* data_col,
84 const int height, const int width, const int ksize,
85 const int stride, const int height_col, const int width_col, Dtype* data_im) {
86 int index = threadIdx.x + blockIdx.x * blockDim.x;
87 if (index < n) {
88 int w_out = index % width_col;
89 index /= width_col;
90 int h_out = index % height_col;
91 int channel_out = index / height_col;
92 int w_in = w_out * stride + channel_out % ksize;
93 int h_in = h_out * stride + (channel_out / ksize) % ksize;
94 int channel_in = channel_out / ksize / ksize;
95 MyAtomicAdd(data_im + (channel_in * height + h_in) * width + w_in,
96 data_col[(channel_out* height_col + h_out) * width_col + w_out]);
97 }
98 }
100 template <typename Dtype>
101 void col2im_gpu(const Dtype* data_col, const int channels,
102 const int height, const int width, const int ksize, const int stride,
103 Dtype* data_im) {
104 CUDA_CHECK(cudaMemset(data_im, 0, sizeof(Dtype) * height * width * channels));
105 int height_col = (height - ksize) / stride + 1;
106 int width_col = (width - ksize) / stride + 1;
107 int channels_col = channels * ksize * ksize;
108 int num_kernels = channels_col * height_col * width_col;
109 col2im_gpu_kernel<Dtype><<<CAFFEINE_GET_BLOCKS(num_kernels), CAFFEINE_CUDA_NUM_THREADS>>>(
110 num_kernels, data_col, height, width, ksize, stride, height_col, width_col,
111 data_im);
112 CUDA_POST_KERNEL_CHECK;
113 }
116 // Explicit instantiation
117 template void col2im_gpu<float>(const float* data_col, const int channels,
    const int height, const int width, const int ksize, const int stride,
119 float* data_im);
120 template void col2im_gpu<double>(const double* data_col, const int channels,
    const int height, const int width, const int ksize, const int stride,
122 double* data_im);
123 */
125 } // namespace caffeine