author | Boris Fomitchev <bfomitchev@nvidia.com> | |
Tue, 13 Oct 2015 00:58:57 +0000 (17:58 -0700) | ||
committer | Boris Fomitchev <bfomitchev@nvidia.com> | |
Tue, 13 Oct 2015 00:58:57 +0000 (17:58 -0700) |
1 | 2 | |||
---|---|---|---|---|
include/caffe/MemoryHandler.hpp | patch | | | | diff2 | | blob | history |
include/caffe/common.hpp | patch | | diff1 | | diff2 | | blob | history |
include/caffe/vision_layers.hpp | patch | | diff1 | | diff2 | | blob | history |
src/caffe/MemoryHandler.cpp | patch | | | | diff2 | | blob | history |
src/caffe/layers/cudnn_conv_layer.cpp | patch | | diff1 | | diff2 | | blob | history |
src/caffe/layers/cudnn_conv_layer.cu | patch | | diff1 | | diff2 | | blob | history |
src/caffe/layers/cudnn_lcn_layer.cpp | patch | | diff1 | | diff2 | | blob | history |
src/caffe/layers/cudnn_lcn_layer.cu | patch | | diff1 | | diff2 | | blob | history |
src/caffe/parallel.cpp | patch | | diff1 | | diff2 | | blob | history |
tools/caffe.cpp | patch | | diff1 | | diff2 | | blob | history |
diff --cc include/caffe/MemoryHandler.hpp
index 0000000000000000000000000000000000000000,fbb25270f1ee01b69ec6e8cd177b330850ffbd23..74acc9c5fef4372aa3d2a004d5b6542b3d66e94e
mode 000000,100644..100644
mode 000000,100644..100644
--- /dev/null
- static void init(const std::vector<int>& gpus_, bool use_pool=true);
+ #ifndef CAFFE_MEMORYHANDLER_HPP_
+ #define CAFFE_MEMORYHANDLER_HPP_
+
+ #include "common.hpp"
+
+ namespace caffe {
+
+ class MemoryHandler {
+ public:
+ #ifndef CPU_ONLY
+ static void mallocGPU(void **ptr, size_t size,
+ cudaStream_t stream = cudaStreamDefault);
+ static void freeGPU(void *ptr, cudaStream_t = cudaStreamDefault);
+ static void registerStream(cudaStream_t stream);
+ #endif
+
+ static bool usingPool() {
+ return using_pool_;
+ }
+
+ static void getInfo(size_t *free_mem, size_t *used_mem);
+
+ private:
- explicit MemoryHandlerActivator(const std::vector<int>& gpus, bool use_pool = true) {
- MemoryHandler::init(gpus, use_pool && gpus.size() > 0);
++ static void init(const std::vector<int>& gpus, bool use_pool=true);
+ static void destroy();
+
+ friend class MemoryHandlerActivator;
+ static bool using_pool_;
+ static bool initialized_;
+
+
+ };
+
+ class MemoryHandlerActivator {
+ public:
++ MemoryHandlerActivator(const std::vector<int>& gpus,
++ bool use_pool = true) {
++ MemoryHandler::init(gpus, use_pool && gpus.size() > 0);
+ }
+ ~MemoryHandlerActivator() {
+ MemoryHandler::destroy();
+ }
+ };
+
+ } // namespace caffe
+
+ # endif
diff --cc include/caffe/common.hpp
Simple merge
diff --cc include/caffe/vision_layers.hpp
index af2b1028589e4b8de521c93a63090cfd9789e592,6498137c098e0131fc35ae4c652f92253b55c2d8..cbdbe4c7135f7280f733b1af025a90e06788bb4f
cudnnConvolutionBwdFilterAlgo_t *bwd_filter_algo_;
cudnnConvolutionBwdDataAlgo_t *bwd_data_algo_;
++ // algorithms for forward and backwards convolutions
++ cudnnConvolutionFwdAlgo_t *fwd_algo_;
++ cudnnConvolutionBwdFilterAlgo_t *bwd_filter_algo_;
++ cudnnConvolutionBwdDataAlgo_t *bwd_data_algo_;
++
+ vector<cudnnTensorDescriptor_t> bottom_descs_, top_descs_;
+ cudnnTensorDescriptor_t bias_desc_;
+ cudnnFilterDescriptor_t filter_desc_;
+ vector<cudnnConvolutionDescriptor_t> conv_descs_;
++
int bottom_offset_, top_offset_, weight_offset_, bias_offset_;
size_t *workspace_fwd_sizes_;
size_t *workspace_bwd_filter_sizes_;
size_t workspaceSizeInBytes; // size of underlying storage
void *workspaceData; // underlying storage
-- void **workspace; // aliases into workspaceData
-
- vector<cudnnTensorDescriptor_t> bottom_descs_, top_descs_;
- cudnnTensorDescriptor_t bias_desc_;
- cudnnFilterDescriptor_t filter_desc_;
- vector<cudnnConvolutionDescriptor_t> conv_descs_;
-
};
#endif
diff --cc src/caffe/MemoryHandler.cpp
index 0000000000000000000000000000000000000000,aa373c9e0838f3135840d32fccf99b9d075fe916..23048c673b8195964757716215345a4788bfa2da
mode 000000,100644..100644
mode 000000,100644..100644
--- /dev/null
-void MemoryHandler::init(const std::vector<int>& gpus, bool use_pool) {
+ #include "caffe/common.hpp"
+ #include "caffe/MemoryHandler.hpp"
+ #ifdef USE_CNMEM
+ // CNMEM integration
+ #include <cnmem.h>
+ #endif
+
+ #include <boost/thread.hpp>
+
+ namespace caffe {
+
+ bool MemoryHandler::using_pool_ = false;
+ bool MemoryHandler::initialized_ = false;
+
+ using namespace boost;
+
+ #ifndef CNMEM_CHECK
+ # define CNMEM_CHECK(x)
+ #endif
+
+ #ifndef CPU_ONLY // CPU-only Caffe.
+
+ void MemoryHandler::mallocGPU(void **ptr, size_t size, cudaStream_t stream) {
+ CHECK(initialized_);
++ CHECK(ptr!=0);
+ if (using_pool_) {
+ CNMEM_CHECK(cnmemMalloc(ptr, size, stream));
+ } else {
+ CUDA_CHECK(cudaMalloc(ptr, size));
+ }
+ }
+
+
+ void MemoryHandler::freeGPU(void *ptr, cudaStream_t stream) {
+ CHECK(initialized_);
++ // allow for null pointer deallocation
++ if(!ptr)
++ return;
+ if (using_pool_) {
+ CNMEM_CHECK(cnmemFree(ptr, stream));
+ } else {
+ CUDA_CHECK(cudaFree(ptr));
+ }
+ }
+
+ void MemoryHandler::registerStream(cudaStream_t stream) {
+ CHECK(initialized_);
+ if (using_pool_) {
+ CNMEM_CHECK(cnmemRegisterStream(stream));
+ }
+ }
+
+ void MemoryHandler::destroy() {
+ CHECK(initialized_);
+ CNMEM_CHECK(cnmemFinalize());
+ initialized_ = false;
+ using_pool_ = false;
+ }
+
++void MemoryHandler::init(const std::vector<int>& gpus, bool use_pool)
++{
+ CHECK(!initialized_);
++
+ #ifdef USE_CNMEM
+ if (use_pool) {
+ using_pool_ = true;
+ cnmemDevice_t *devs = new cnmemDevice_t[gpus.size()];
+
+ int initial_device;
+ CUDA_CHECK(cudaGetDevice(&initial_device));
+
+ for (int i = 0; i < gpus.size(); i++) {
+ CUDA_CHECK(cudaSetDevice(gpus[i]));
+
+ devs[i].device = gpus[i];
+
+ size_t free_mem, used_mem;
+ CUDA_CHECK(cudaMemGetInfo(&free_mem, &used_mem));
+
+ devs[i].size = size_t(0.95*free_mem);
+ devs[i].numStreams = 0;
+ devs[i].streams = NULL;
+ }
+ CNMEM_CHECK(cnmemInit(gpus.size(), devs, CNMEM_FLAGS_DEFAULT));
+ initialized_ = true;
+
+ CUDA_CHECK(cudaSetDevice(initial_device));
+
+ delete [] devs;
+ }
+ #endif
+ initialized_ = true;
+ std::cout << "MemoryHandler initialized" <<
+ (using_pool_ ? " with CNMEM pool.\n" : " with CUDA allocator.\n");
+ }
+
+ void MemoryHandler::getInfo(size_t *free_mem, size_t *total_mem) {
+ if (using_pool_) {
+ CNMEM_CHECK(cnmemMemGetInfo(free_mem, total_mem, cudaStreamDefault));
+ } else {
+ CUDA_CHECK(cudaMemGetInfo(free_mem, total_mem));
+ }
+ }
+
+ }
+
+ #endif // CPU_ONLY
+
diff --cc src/caffe/layers/cudnn_conv_layer.cpp
index d3fd930fb19f831c1df50215eeccf39c71fea0b2,0ba34b01ec59eac663c0ac10d446df2ae08d7af9..938aa4981336563cdfdbbfe1b1750bb61634a443
namespace caffe {
--// Set to three for the benefit of the backward pass, which
--// can use separate streams for calculating the gradient w.r.t.
--// bias, filter weights, and bottom data for each group independently
- #define CUDNN_STREAMS_PER_GROUP 1
-#define CUDNN_STREAMS_PER_GROUP 3
--
/**
* TODO(dox) explain cuDNN interface
*/
template <typename Dtype>
void CuDNNConvolutionLayer<Dtype>::LayerSetUp(
const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+
ConvolutionLayer<Dtype>::LayerSetUp(bottom, top);
- // Initialize CUDA streams and cuDNN.
- stream_ = new cudaStream_t[this->group_ * CUDNN_STREAMS_PER_GROUP];
- handle_ = new cudnnHandle_t[this->group_ * CUDNN_STREAMS_PER_GROUP];
-
// Initialize algorithm arrays
fwd_algo_ = new cudnnConvolutionFwdAlgo_t[bottom.size()];
bwd_filter_algo_= new cudnnConvolutionBwdFilterAlgo_t[bottom.size()];
workspace_fwd_sizes_ = new size_t[bottom.size()];
workspace_bwd_filter_sizes_ = new size_t[bottom.size()];
workspace_bwd_data_sizes_ = new size_t[bottom.size()];
-
+
// workspace data
- workspaceData = 0;
- workspace = new void*[this->group_ * CUDNN_STREAMS_PER_GROUP];
workspaceSizeInBytes = 0;
-
- for (int g = 0; g < this->group_ * CUDNN_STREAMS_PER_GROUP; g++) {
- CUDA_CHECK(cudaStreamCreate(&stream_[g]));
- CUDNN_CHECK(cudnnCreate(&handle_[g]));
- CUDNN_CHECK(cudnnSetStream(handle_[g], stream_[g]));
- workspace[g] = NULL;
+ workspaceData = NULL;
- workspace = new void*[this->group_ * CUDNN_STREAMS_PER_GROUP];
+
+ for (size_t i = 0; i < bottom.size(); ++i) {
+ // initialize all to default algorithms
+ fwd_algo_[i] = (cudnnConvolutionFwdAlgo_t)0;
+ bwd_filter_algo_[i] = (cudnnConvolutionBwdFilterAlgo_t)0;
+ bwd_data_algo_[i] = (cudnnConvolutionBwdDataAlgo_t)0;
+ // default algorithms don't require workspace
+ workspace_fwd_sizes_[i] = 0;
+ workspace_bwd_data_sizes_[i] = 0;
+ workspace_bwd_filter_sizes_[i] = 0;
}
- for (int g = 0; g < this->group_ * CUDNN_STREAMS_PER_GROUP; g++) {
- workspace[g] = NULL;
- }
-
// Set the indexing parameters.
bias_offset_ = (this->num_output_ / this->group_);
// Specify workspace limit for kernels directly until we have a
// planning strategy and a rewrite of Caffe's GPU memory mangagement
size_t workspace_limit_bytes;
-- if (MemoryHandler::usingPool()) {
-- size_t total_memory;
-- MemoryHandler::getInfo(&workspace_limit_bytes, &total_memory);
-- } else {
-- workspace_limit_bytes = 8*1024*1024;
-- }
++ MemoryHandler::getInfo(&workspace_limit_bytes, &total_memory);
for (int i = 0; i < bottom.size(); i++) {
cudnn::setTensor4dDesc<Dtype>(&bottom_descs_[i],
this->num_output_ / this->group_, height_out, width_out,
this->num_output_ * this->out_spatial_dim_,
this->out_spatial_dim_, width_out, 1);
++
cudnn::setConvolutionDesc<Dtype>(&conv_descs_[i], bottom_descs_[i],
- filter_desc_, pad_h, pad_w,
- stride_h, stride_w);
+ filter_desc_, pad_h, pad_w, stride_h, stride_w);
- // choose forward and backward algorithms + workspace(s)
+ // choose forward and backward algorithms + workspace(s)
CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(Caffe::cudnn_handle(),
bottom_descs_[i],
filter_desc_,
filter_desc_, top_descs_[i], conv_descs_[i], bottom_descs_[i],
bwd_data_algo_[i], &workspace_bwd_data_sizes_[i]) );
}
-
- if (!MemoryHandler::usingPool()) {
- // reduce over all workspace sizes to get a maximum to allocate / reallocate
- size_t total_workspace_fwd = 0;
- size_t total_workspace_bwd_data = 0;
- size_t total_workspace_bwd_filter = 0;
-
- for (size_t i = 0; i < bottom.size(); i++) {
- total_workspace_fwd = std::max(total_workspace_fwd,
- workspace_fwd_sizes_[i]);
- total_workspace_bwd_data = std::max(total_workspace_bwd_data,
- workspace_bwd_data_sizes_[i]);
- total_workspace_bwd_filter = std::max(total_workspace_bwd_filter,
- workspace_bwd_filter_sizes_[i]);
- }
- // get max over all operations
- size_t max_workspace = std::max(total_workspace_fwd,
- total_workspace_bwd_data);
- max_workspace = std::max(max_workspace, total_workspace_bwd_filter);
- // ensure all groups have enough workspace
- size_t total_max_workspace = max_workspace *
- (this->group_ * CUDNN_STREAMS_PER_GROUP);
-
- // this is the total amount of storage needed over all groups + streams
- if (total_max_workspace > workspaceSizeInBytes) {
- LOG(INFO) << "Reallocating workspace storage: " << total_max_workspace;
- workspaceSizeInBytes = total_max_workspace;
-
- // free the existing workspace and allocate a new (larger) one
- if (this->workspaceData)
- MemoryHandler::freeGPU(this->workspaceData);
- this->workspaceData = NULL;
-
- MemoryHandler::mallocGPU(&(this->workspaceData), workspaceSizeInBytes);
-
- // if we succeed in the allocation, set pointer aliases for workspaces
- for (int g = 0; g < (this->group_ * CUDNN_STREAMS_PER_GROUP); g++) {
- workspace[g] = reinterpret_cast<char *>(workspaceData) + g*max_workspace;
- }
- }
- }
- #ifndef USE_CNMEM
- // reduce over all workspace sizes to get a maximum to allocate / reallocate
- size_t total_workspace_fwd = 0;
- size_t total_workspace_bwd_data = 0;
- size_t total_workspace_bwd_filter = 0;
-
- for (size_t i = 0; i < bottom.size(); i++) {
- total_workspace_fwd = std::max(total_workspace_fwd,
- workspace_fwd_sizes_[i]);
- total_workspace_bwd_data = std::max(total_workspace_bwd_data,
- workspace_bwd_data_sizes_[i]);
- total_workspace_bwd_filter = std::max(total_workspace_bwd_filter,
- workspace_bwd_filter_sizes_[i]);
- }
- // get max over all operations
- size_t max_workspace = std::max(total_workspace_fwd,
- total_workspace_bwd_data);
- max_workspace = std::max(max_workspace, total_workspace_bwd_filter);
- // ensure all groups have enough workspace
- size_t total_max_workspace = max_workspace *
- (this->group_ * CUDNN_STREAMS_PER_GROUP);
-
- // this is the total amount of storage needed over all groups + streams
- if (total_max_workspace > workspaceSizeInBytes) {
- LOG(INFO) << "Reallocating workspace storage: " << total_max_workspace;
- workspaceSizeInBytes = total_max_workspace;
-
- // free the existing workspace and allocate a new (larger) one
- MemoryHandler::freeGPU(this->workspaceData);
- this->workspaceData = NULL;
-
- cudaError_t err = cudaMalloc(&(this->workspaceData), workspaceSizeInBytes);
- if (err != cudaSuccess) {
- // force zero memory path
- for (int i = 0; i < bottom.size(); i++) {
- workspace_fwd_sizes_[i] = 0;
- workspace_bwd_filter_sizes_[i] = 0;
- workspace_bwd_data_sizes_[i] = 0;
- fwd_algo_[i] = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
- bwd_filter_algo_[i] = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
- bwd_data_algo_[i] = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
- }
-
- // NULL out all workspace pointers
- for (int g = 0; g < (this->group_ * CUDNN_STREAMS_PER_GROUP); g++) {
- workspace[g] = NULL;
- }
- // NULL out underlying data
- workspaceData = NULL;
- workspaceSizeInBytes = 0;
- }
-
- // if we succeed in the allocation, set pointer aliases for workspaces
- for (int g = 0; g < (this->group_ * CUDNN_STREAMS_PER_GROUP); g++) {
- workspace[g] = reinterpret_cast<char *>(workspaceData) + g*max_workspace;
- }
- }
- #endif
-
// Tensor descriptor for bias.
if (this->bias_term_) {
cudnn::setTensor4dDesc<Dtype>(&bias_desc_,
}
cudnnDestroyFilterDescriptor(filter_desc_);
- cudaFree(workspaceData);
- for (int g = 0; g < this->group_ * CUDNN_STREAMS_PER_GROUP; g++) {
- cudaStreamDestroy(stream_[g]);
- cudnnDestroy(handle_[g]);
- }
-
- delete [] stream_;
- delete [] handle_;
-
delete [] fwd_algo_;
delete [] bwd_filter_algo_;
delete [] bwd_data_algo_;
delete [] workspace_fwd_sizes_;
delete [] workspace_bwd_data_sizes_;
delete [] workspace_bwd_filter_sizes_;
- if (this->workspaceData)
- MemoryHandler::freeGPU(this->workspaceData);
- if (this->workspace)
- delete [] this->workspace;
+
++ MemoryHandler::freeGPU(this->workspaceData);
}
INSTANTIATE_CLASS(CuDNNConvolutionLayer);
diff --cc src/caffe/layers/cudnn_conv_layer.cu
index f00267a4ce73eb322ed57055fc952c76f0399613,1ba369bb1f79bf0723f4867a227ac504fae4c7f6..4ab9a94235378d727efb08355995993613887fb8
// Forward through cuDNN in parallel over groups.
for (int g = 0; g < this->group_; g++) {
- #ifdef USE_CNMEM
- MemoryHandler::mallocGPU(&workspace[0], workspace_fwd_sizes_[i]);
- #endif
- cudnnConvolutionFwdAlgo_t algo;
-
- if (!MemoryHandler::usingPool()) {
- handle = handle_[g];
-
- // pick the convolution algorithm
- // TODO(shelhamer) this should be done during reshape
- // TODO(shelhamer) the choice of automatic or manual algorithm picking
- // should be exposed in proto
-
- CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(handle,
- bottom_descs_[i],
- filter_desc_,
- conv_descs_[i],
- top_descs_[i],
- CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
- workspace_limit_bytes, // memoryLimitInBytes,
- &algo));
-
- CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(handle,
- bottom_descs_[i],
- filter_desc_,
- conv_descs_[i],
- top_descs_[i],
- algo,
- &workspaceSizeInBytes_temp));
-
- }
- else {
- workspaceSizeInBytes_temp = workspace_fwd_sizes_[i];
- algo=fwd_algo_[i];
- }
-
-
- if (workspaceSizeInBytes_temp > workspaceSizeInBytes) {
- workspaceSizeInBytes = workspaceSizeInBytes_temp;
- // free the existing workspace and allocate a new (larger) one
- MemoryHandler::freeGPU(&this->workspaceData);
- MemoryHandler::mallocGPU(&workspaceData, workspaceSizeInBytes);
- if (!workspaceData) {
- // force zero memory path
- algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
- workspaceSizeInBytes = 0;
- }
- }
-
-
- // Filters.
-
- CUDNN_CHECK(cudnnConvolutionForward(handle,
- cudnn::dataType<Dtype>::one,
- bottom_descs_[i], bottom_data + bottom_offset_ * g,
- filter_desc_, weight + this->weight_offset_ * g,
- conv_descs_[i],
- algo, workspace, workspaceSizeInBytes,
- cudnn::dataType<Dtype>::zero,
- top_descs_[i], top_data + top_offset_ * g));
-
- if (MemoryHandler::usingPool()) {
- MemoryHandler::freeGPU(workspace[0]);
- workspace[0] = NULL;
- }
-
++ MemoryHandler::mallocGPU(&workspaceData, workspace_fwd_sizes_[i]);
+ // Filters.
- // CUDNN_CHECK(cudnnConvolutionForward(handle_[g],
+ CUDNN_CHECK(cudnnConvolutionForward(Caffe::cudnn_handle(),
+ cudnn::dataType<Dtype>::one,
+ bottom_descs_[i], bottom_data + bottom_offset_ * g,
+ filter_desc_, weight + this->weight_offset_ * g,
+ conv_descs_[i],
- fwd_algo_[i], workspace[0], workspace_fwd_sizes_[i],
++ fwd_algo_[i], workspaceData, workspace_fwd_sizes_[i],
+ cudnn::dataType<Dtype>::zero,
+ top_descs_[i], top_data + top_offset_ * g));
+
- #ifdef USE_CNMEM
- MemoryHandler::freeGPU(workspace[0]);
- workspace[0] = NULL;
- #endif
++ MemoryHandler::freeGPU(workspaceData);
++ workspaceData = NULL;
// Bias.
if (this->bias_term_) {
const Dtype* bias_data = this->blobs_[1]->gpu_data();
- CUDNN_CHECK(cudnnAddTensor_v3(Caffe::cudnn_handle(), //CUDNN_ADD_SAME_C,
- CUDNN_CHECK(cudnnAddTensor_v3(
- handle,
++ CUDNN_CHECK(cudnnAddTensor_v3(Caffe::cudnn_handle(),
cudnn::dataType<Dtype>::one,
bias_desc_, bias_data + bias_offset_ * g,
cudnn::dataType<Dtype>::one,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
const Dtype* weight = NULL;
Dtype* weight_diff = NULL;
- cudnnHandle_t handle = Caffe::cudnn_handle();
-
+
if (this->param_propagate_down_[0]) {
weight = this->blobs_[0]->gpu_data();
weight_diff = this->blobs_[0]->mutable_gpu_diff();
- if (MemoryHandler::usingPool())
- caffe_gpu_set(this->blobs_[0]->count(), Dtype(0), weight_diff);
+ caffe_gpu_set(this->blobs_[0]->count(), Dtype(0), weight_diff);
}
Dtype* bias_diff = NULL;
+
if (this->bias_term_ && this->param_propagate_down_[1]) {
bias_diff = this->blobs_[1]->mutable_gpu_diff();
- if (!MemoryHandler::usingPool())
- caffe_gpu_set(this->blobs_[1]->count(), Dtype(0), bias_diff);
+ caffe_gpu_set(this->blobs_[1]->count(), Dtype(0), bias_diff);
}
+
for (int i = 0; i < top.size(); ++i) {
const Dtype* top_diff = top[i]->gpu_diff();
+
// Backward through cuDNN in parallel over groups and gradients.
for (int g = 0; g < this->group_; g++) {
- if (!MemoryHandler::usingPool())
- handle = handle_[2*this->group_ + g];
// Gradient w.r.t. bias.
if (this->bias_term_ && this->param_propagate_down_[1]) {
- CUDNN_CHECK(cudnnConvolutionBackwardBias(
- handle,
+ CUDNN_CHECK(cudnnConvolutionBackwardBias(Caffe::cudnn_handle(),
cudnn::dataType<Dtype>::one,
top_descs_[i], top_diff + top_offset_ * g,
cudnn::dataType<Dtype>::one,
// Gradient w.r.t. weights.
if (this->param_propagate_down_[0]) {
- #ifdef USE_CNMEM
- MemoryHandler::mallocGPU(&workspace[0], workspace_bwd_filter_sizes_[i]);
- #endif
-
- if (MemoryHandler::usingPool())
- MemoryHandler::mallocGPU(&workspace[0], workspace_bwd_filter_sizes_[i]);
-
++ MemoryHandler::mallocGPU(&workspaceData, workspace_bwd_filter_sizes_[i]);
const Dtype* bottom_data = bottom[i]->gpu_data();
- CUDNN_CHECK(cudnnConvolutionBackwardFilter_v2(
- handle,
+ CUDNN_CHECK(cudnnConvolutionBackwardFilter_v3(
+ Caffe::cudnn_handle(),
cudnn::dataType<Dtype>::one,
bottom_descs_[i], bottom_data + bottom_offset_ * g,
top_descs_[i], top_diff + top_offset_ * g,
conv_descs_[i],
- bwd_filter_algo_[i], workspace[0], workspace_bwd_filter_sizes_[i],
-// bwd_filter_algo_[i], workspace[0], workspace_bwd_filter_sizes_[i],
++ bwd_filter_algo_[i], workspaceData, workspace_bwd_filter_sizes_[i],
cudnn::dataType<Dtype>::one,
filter_desc_, weight_diff + this->weight_offset_ * g));
- #ifdef USE_CNMEM
- MemoryHandler::freeGPU(workspace[0]);
- workspace[0] = NULL;
- #endif
-
- if (MemoryHandler::usingPool()) {
- MemoryHandler::freeGPU(workspace[0]);
- workspace[0] = NULL;
- }
++ MemoryHandler::freeGPU(workspaceData);
++ workspaceData = NULL;
}
// Gradient w.r.t. bottom data.
weight = this->blobs_[0]->gpu_data();
}
Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
- #ifdef USE_CNMEM
- MemoryHandler::mallocGPU(&workspace[0], workspace_bwd_data_sizes_[i]);
- #endif
- if (MemoryHandler::usingPool()) {
- MemoryHandler::mallocGPU(&workspace[0], workspace_bwd_data_sizes_[i]);
- }
-
- CUDNN_CHECK(cudnnConvolutionBackwardData_v2(handle,
++ MemoryHandler::mallocGPU(&workspaceData, workspace_bwd_data_sizes_[i]);
+ CUDNN_CHECK(cudnnConvolutionBackwardData_v3(
+ Caffe::cudnn_handle(),
cudnn::dataType<Dtype>::one,
filter_desc_, weight + this->weight_offset_ * g,
top_descs_[i], top_diff + top_offset_ * g,
conv_descs_[i],
- bwd_data_algo_[i], workspace[0], workspace_bwd_data_sizes_[i],
-// bwd_data_algo_[i], workspace[0], workspace_bwd_data_sizes_[i],
++ bwd_data_algo_[i], workspaceData, workspace_bwd_data_sizes_[i],
+ workspace_bwd_data_sizes_[i],
cudnn::dataType<Dtype>::zero,
bottom_descs_[i], bottom_diff + bottom_offset_ * g));
- #ifdef USE_CNMEM
- MemoryHandler::freeGPU(workspace[0]);
- workspace[0] = NULL;
- #endif
- if (MemoryHandler::usingPool()) {
- MemoryHandler::freeGPU(workspace[0]);
- workspace[0] = NULL;
- }
++ MemoryHandler::freeGPU(workspaceData);
++ workspaceData = NULL;
}
}
diff --cc src/caffe/layers/cudnn_lcn_layer.cpp
index 0bfa41a48d36069c8bbb1add4efc203d2ef4ab64,bf58867b92bd25c8f28d9af734d9f8e623173dec..d4301dc1a8aed9f30c3253e2c82b58d7088489ec
this->channels_, this->height_, this->width_);
CUDNN_CHECK(cudnnSetLRNDescriptor(norm_desc_, size_, alpha_, beta_, k_));
-- // allocate / reallocate tempData buffers
-- size_t totalSizeInBytes = sizeof(Dtype)*bottom[0]->num()* \
++ // size for tempData buffers
++ tempDataSize = sizeof(Dtype)*bottom[0]->num()* \
this->channels_*this->height_*this->width_;
-
- #ifdef USE_CNMEM
- this->tempDataSize = totalSizeInBytes;
- #else
- if (totalSizeInBytes > tempDataSize) {
- tempDataSize = totalSizeInBytes;
-
- MemoryHandler::freeGPU(tempData1);
- MemoryHandler::freeGPU(tempData2);
- tempData1 = NULL;
- tempData2 = NULL;
--
- // allocate new buffers
- MemoryHandler::mallocGPU(&tempData1, totalSizeInBytes);
- MemoryHandler::mallocGPU(&tempData2, totalSizeInBytes);
- }
- #endif
- if (MemoryHandler::usingPool())
- this->tempDataSize = totalSizeInBytes;
- else
- if (totalSizeInBytes > tempDataSize) {
- tempDataSize = totalSizeInBytes;
-
- MemoryHandler::freeGPU(tempData1);
- MemoryHandler::freeGPU(tempData2);
- tempData1 = NULL;
- tempData2 = NULL;
-
- // allocate new buffers
- MemoryHandler::mallocGPU(&tempData1, totalSizeInBytes);
- MemoryHandler::mallocGPU(&tempData2, totalSizeInBytes);
- }
}
template <typename Dtype>
// destroy LRN handle
CUDNN_CHECK(cudnnDestroyLRNDescriptor(norm_desc_));
--
-- // free temp buffers
-- cudaFree(tempData1);
-- cudaFree(tempData2);
}
INSTANTIATE_CLASS(CuDNNLCNLayer);
diff --cc src/caffe/layers/cudnn_lcn_layer.cu
index 245b967edd4798fd2e01b75e118c7f750d0a37d4,61c4f98d987f8f5517000aa9c3541bae9337d95f..76ed4b36d1a9a4e71cb5bf4c5257c5715bfb266d
const vector<Blob<Dtype>*>& top) {
const Dtype* bottom_data = bottom[0]->gpu_data();
Dtype* top_data = top[0]->mutable_gpu_data();
--
- #ifdef USE_CNMEM
++
MemoryHandler::mallocGPU(&this->tempData1, this->tempDataSize);
MemoryHandler::mallocGPU(&this->tempData2, this->tempDataSize);
- #endif
CUDNN_CHECK(cudnnDivisiveNormalizationForward(
Caffe::cudnn_handle(), norm_desc_, CUDNN_DIVNORM_PRECOMPUTED_MEANS,
cudnn::dataType<Dtype>::zero,
top_desc_, top_data) );
- #ifdef USE_CNMEM
MemoryHandler::freeGPU(this->tempData1);
MemoryHandler::freeGPU(this->tempData2);
-- this->tempData1 = NULL;
-- this->tempData2 = NULL;
- #endif
}
template <typename Dtype>
bottom_desc_, bottom_diff,
NULL) );
- #ifdef USE_CNMEM
MemoryHandler::freeGPU(this->tempData1);
MemoryHandler::freeGPU(this->tempData2);
-- this->tempData1 = NULL;
-- this->tempData2 = NULL;
- #endif
-
}
INSTANTIATE_LAYER_GPU_FUNCS(CuDNNLCNLayer);
diff --cc src/caffe/parallel.cpp
index cf244bcf91672bcee7e63ed7680dfdcd653177cb,9d47f1cd422e8c8502cea80a8c5a9e1d2ae47a03..1ebbc17da52676443aceec0b44d81b8d6140d578
+++ b/src/caffe/parallel.cpp
template<typename Dtype>
P2PSync<Dtype>::~P2PSync() {
#ifndef CPU_ONLY
- int initial_device;
- CUDA_CHECK(cudaGetDevice(&initial_device));
- const int self = solver_->param().device_id();
- CUDA_CHECK(cudaSetDevice(self));
-
if (parent_) {
- int initial_device;
- CUDA_CHECK(cudaGetDevice(&initial_device));
- const int self = solver_->param().device_id();
- const int peer = parent_->solver_->param().device_id();
+
- CUDA_CHECK(cudaSetDevice(peer));
+ const int peer = parent_->solver_->param().device_id();
+ cudaSetDevice(peer);
MemoryHandler::freeGPU(parent_grads_);
parent_grads_ = NULL;
-
- CUDA_CHECK(cudaSetDevice(self));
+ cudaSetDevice(self);
int access;
CUDA_CHECK(cudaDeviceCanAccessPeer(&access, self, peer));
if (access) {
CUDA_CHECK(cudaDeviceDisablePeerAccess(peer));
}
-
- CUDA_CHECK(cudaSetDevice(initial_device));
}
-
- CUDA_CHECK(cudaSetDevice(initial_device));
#endif
}
diff --cc tools/caffe.cpp
index fb44681cf7b972123d774591dae6059724151d90,77c68b64d6a73ac764568799852893370eb52e12..608708822682687e250311cc31e89a01aec7b42c
--- 1/tools/caffe.cpp
--- 2/tools/caffe.cpp
+++ b/tools/caffe.cpp
Caffe::set_mode(Caffe::GPU);
Caffe::set_solver_count(gpus.size());
}
+#ifdef USE_CNMEM
+ MemoryHandlerActivator handler(gpus);
+#endif
+ MemoryHandlerActivator handler(gpus);
+
caffe::SignalHandler signal_handler(
GetRequestedAction(FLAGS_sigint_effect),
GetRequestedAction(FLAGS_sighup_effect));