summary | shortlog | log | commit | commitdiff | tree
raw | patch | inline | side by side (parent: 9f176c4)
author: Sergei Nikolaev <snikolaev@nvidia.com> — Sat, 3 Mar 2018 00:36:17 +0000 (16:36 -0800)
committer: Sergei Nikolaev <snikolaev@nvidia.com> — Sat, 3 Mar 2018 00:36:17 +0000 (16:36 -0800)
src/caffe/layers/axpy_layer.cu | patch | blob | history | |
src/caffe/util/math_functions.cu | patch | blob | history |
index 04cba231c20bf2d3461f6ac8d2cb4641ebf520df..31bddbe1094edcf7d207a253e7d991b456bcc0a5 100644 (file)
* Author: hujie
*/
+#include <device_launch_parameters.h>
+#include "caffe/util/half.cuh"
+#include "caffe/util/gpu_math_functions.cuh"
#include "caffe/layers/axpy_layer.hpp"
namespace caffe {
const Ftype* y_data = bottom[2]->gpu_data<Ftype>();
Ftype* out_data = top[0]->mutable_gpu_data<Ftype>();
const int count = bottom[1]->count();
- AxpyForward<Ftype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
- count, bottom[1]->count(2), scale_data, x_data, y_data, out_data);
+ cudaStream_t stream = Caffe::thread_stream();
+ AxpyForward<Ftype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS, 0, stream>>>(
+ count, bottom[1]->count(2), scale_data, x_data, y_data, out_data);
+ CUDA_CHECK(cudaStreamSynchronize(stream));
}
template <typename Dtype>
__global__ void AxpyBackwardScale(const int outer_num, const int spatial_dim,
const Dtype* x_data, const Dtype* top_diff, Dtype* scale_diff) {
- __shared__ Dtype buffer[CAFFE_CUDA_NUM_THREADS];
+ __shared__ char axpy_buffer[CAFFE_CUDA_NUM_THREADS * sizeof(Dtype)];
+ Dtype* buffer = reinterpret_cast<Dtype*>(axpy_buffer);
unsigned int tid = threadIdx.x;
buffer[tid] = 0;
__syncthreads();
const int count = top[0]->count();
const Btype* top_diff = top[0]->gpu_diff<Btype>();
if (propagate_down[0]) {
+ cudaStream_t stream = Caffe::thread_stream();
int outer_num = bottom[1]->count(0, 2);
- AxpyBackwardScale<Btype><<<outer_num, CAFFE_CUDA_NUM_THREADS>>>(
+ AxpyBackwardScale<<<outer_num, CAFFE_CUDA_NUM_THREADS, 0, stream>>>(
outer_num, bottom[1]->count(2),
bottom[1]->gpu_data<Btype>(), top_diff,
bottom[0]->mutable_gpu_diff<Btype>());
+ CUDA_POST_KERNEL_CHECK;
+ CUDA_CHECK(cudaStreamSynchronize(stream));
}
if (propagate_down[1]) {
- AxpyBackwardX<Btype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
+ cudaStream_t stream = Caffe::thread_stream();
+ AxpyBackwardX<<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS, 0, stream>>>(
count, top[0]->count(2),
bottom[0]->gpu_data<Btype>(), top_diff,
bottom[1]->mutable_gpu_diff<Btype>());
+ CUDA_POST_KERNEL_CHECK;
+ CUDA_CHECK(cudaStreamSynchronize(stream));
}
if (propagate_down[2]) {
caffe_copy(count, top_diff, bottom[2]->mutable_gpu_diff<Btype>());
}
- CUDA_POST_KERNEL_CHECK;
}
INSTANTIATE_LAYER_GPU_FUNCS_FB(AxpyLayer);
index f46450e59b4a857f8c720a7c97978dfae551a55d..12563b795e7a0773434f3d8f3145eaea3a1fc87d 100644 (file)
#include <device_launch_parameters.h>
#include "caffe/util/half.cuh"
-//#include "caffe/common.hpp"
#include "caffe/util/math_functions.hpp"
#include "caffe/util/gpu_math_functions.cuh"
#include "caffe/type.hpp"
Mtype cache[CAFFE_CUDA_NUM_THREADS];
const int tidx = threadIdx.x;
cache[tidx] = 0.;
+ __syncthreads();
for (int i = tidx; i < N; i += blockDim.x) {
cache[tidx] += static_cast<Mtype>(x[i]) * static_cast<Mtype>(y[i]);
}