Reference: https://blog.csdn.net/XCCCCZ/article/details/112909233
Edit cmake/Cuda.cmake and replace every occurrence of "cudnn.h" with "cudnn_version.h". cuDNN 8 moved the version macros (CUDNN_MAJOR, CUDNN_MINOR, CUDNN_PATCHLEVEL) out of cudnn.h and into cudnn_version.h, so the version detection in cmake/Cuda.cmake has to parse the new header.
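The #if CUDNN_VERSION_MIN(8, 0, 0) guards in the patches below come from Caffe's include/caffe/util/cudnn.hpp; as a reference sketch (check your tree, the stock definition is equivalent to this):

#define CUDNN_VERSION_MIN(major, minor, patch) \
    (CUDNN_VERSION >= ((major) * 1000 + (minor) * 100 + (patch)))

CUDNN_VERSION itself is composed from CUDNN_MAJOR / CUDNN_MINOR / CUDNN_PATCHLEVEL, which cuDNN 8 defines in cudnn_version.h rather than cudnn.h; that is exactly why the CMake change above is needed.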
src/caffe/layers/cudnn_conv_layer.cpp:
template <typename Dtype>
void CuDNNConvolutionLayer<Dtype>::Reshape(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  ConvolutionLayer<Dtype>::Reshape(bottom, top);
  CHECK_LE(2, this->num_spatial_axes_)
      << "CuDNNConvolution input must have 2 spatial axes "
      << "(e.g., height and width). "
      << "Use 'engine: CAFFE' for general ND convolution.";
  bottom_offset_ = this->bottom_dim_ / this->group_;
  top_offset_ = this->top_dim_ / this->group_;
  const bool forced_3d = this->forced_3d_;
  const int height = bottom[0]->shape(this->channel_axis_ + 1 + forced_3d);
  const int width = bottom[0]->shape(this->channel_axis_ + 2 + forced_3d);
  const int height_out = top[0]->shape(this->channel_axis_ + 1 + forced_3d);
  const int width_out = top[0]->shape(this->channel_axis_ + 2 + forced_3d);
  const int* pad_data = this->pad_.cpu_data();
  const int pad_h = pad_data[0];
  const int pad_w = pad_data[1];
  const int* stride_data = this->stride_.cpu_data();
  const int stride_h = stride_data[0];
  const int stride_w = stride_data[1];
#if CUDNN_VERSION_MIN(8, 0, 0)
  int RetCnt;
  bool found_conv_algorithm;
  size_t free_memory, total_memory;
  cudnnConvolutionFwdAlgoPerf_t fwd_algo_pref_[4];
  cudnnConvolutionBwdDataAlgoPerf_t bwd_data_algo_pref_[4];

  // query free device memory so that algorithms whose workspace would not
  // fit can be rejected below
  cudaMemGetInfo(&free_memory, &total_memory);
#else
  // Specify workspace limit for kernels directly until we have a
  // planning strategy and a rewrite of Caffe's GPU memory management
  size_t workspace_limit_bytes = 8*1024*1024;
#endif
  for (int i = 0; i < bottom.size(); i++) {
    cudnn::setTensor4dDesc<Dtype>(&bottom_descs_[i],
        this->num_,
        this->channels_ / this->group_, height, width,
        this->channels_ * height * width,
        height * width, width, 1);
    cudnn::setTensor4dDesc<Dtype>(&top_descs_[i],
        this->num_,
        this->num_output_ / this->group_, height_out, width_out,
        this->num_output_ * this->out_spatial_dim_,
        this->out_spatial_dim_, width_out, 1);
    cudnn::setConvolutionDesc<Dtype>(&conv_descs_[i], bottom_descs_[i],
        filter_desc_, pad_h, pad_w,
        stride_h, stride_w);
#if CUDNN_VERSION_MIN(8, 0, 0)
    // choose the forward algorithm from the ranked list cuDNN returns;
    // CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED is not implemented in
    // cuDNN 8, so it is skipped below
    CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm_v7(handle_[0],
        bottom_descs_[i],
        filter_desc_,
        conv_descs_[i],
        top_descs_[i],
        4,
        &RetCnt,
        fwd_algo_pref_));

    found_conv_algorithm = false;
    for (int n = 0; n < RetCnt; n++) {
      if (fwd_algo_pref_[n].status == CUDNN_STATUS_SUCCESS &&
          fwd_algo_pref_[n].algo != CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED &&
          fwd_algo_pref_[n].memory < free_memory) {
        found_conv_algorithm = true;
        fwd_algo_[i] = fwd_algo_pref_[n].algo;
        workspace_fwd_sizes_[i] = fwd_algo_pref_[n].memory;
        break;
      }
    }
    if (!found_conv_algorithm) {
      LOG(ERROR) << "cuDNN did not return a suitable algorithm for convolution.";
    } else {
      // choose backward algorithm for filter:
      // for better or worse just a fixed constant, because
      // cudnnGetConvolutionBackwardFilterAlgorithm is gone in cuDNN 8.0
      bwd_filter_algo_[i] = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
      // twice the workspace of the forward search, to be safe
      workspace_bwd_filter_sizes_[i] = 2*workspace_fwd_sizes_[i];
    }

    // choose backward algo for data
    CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm_v7(handle_[0],
        filter_desc_,
        top_descs_[i],
        conv_descs_[i],
        bottom_descs_[i],
        4,
        &RetCnt,
        bwd_data_algo_pref_));

    found_conv_algorithm = false;
    for (int n = 0; n < RetCnt; n++) {
      if (bwd_data_algo_pref_[n].status == CUDNN_STATUS_SUCCESS &&
          bwd_data_algo_pref_[n].algo != CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD &&
          bwd_data_algo_pref_[n].algo != CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED &&
          bwd_data_algo_pref_[n].memory < free_memory) {
        found_conv_algorithm = true;
        bwd_data_algo_[i] = bwd_data_algo_pref_[n].algo;
        workspace_bwd_data_sizes_[i] = bwd_data_algo_pref_[n].memory;
        break;
      }
    }
    if (!found_conv_algorithm) {
      LOG(ERROR) << "cuDNN did not return a suitable algorithm for convolution.";
    }
#else
    // choose forward and backward algorithms + workspace(s)
    CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(handle_[0],
        bottom_descs_[i],
        filter_desc_,
        conv_descs_[i],
        top_descs_[i],
        CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
        workspace_limit_bytes,
        &fwd_algo_[i]));

    CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(handle_[0],
        bottom_descs_[i],
        filter_desc_,
        conv_descs_[i],
        top_descs_[i],
        fwd_algo_[i],
        &(workspace_fwd_sizes_[i])));

    // choose backward algorithm for filter
    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(handle_[0],
        bottom_descs_[i], top_descs_[i], conv_descs_[i], filter_desc_,
        CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
        workspace_limit_bytes, &bwd_filter_algo_[i]));

    // get workspace for backwards filter algorithm
    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(handle_[0],
        bottom_descs_[i], top_descs_[i], conv_descs_[i], filter_desc_,
        bwd_filter_algo_[i], &workspace_bwd_filter_sizes_[i]));

    // choose backward algo for data
    CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(handle_[0],
        filter_desc_, top_descs_[i], conv_descs_[i], bottom_descs_[i],
        CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
        workspace_limit_bytes, &bwd_data_algo_[i]));

    // get workspace size
    CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(handle_[0],
        filter_desc_, top_descs_[i], conv_descs_[i], bottom_descs_[i],
        bwd_data_algo_[i], &workspace_bwd_data_sizes_[i]));
#endif
  }

  // reduce over all workspace sizes to get a maximum to allocate / reallocate
  size_t total_workspace_fwd = 0;
  size_t total_workspace_bwd_data = 0;
  size_t total_workspace_bwd_filter = 0;

  for (size_t i = 0; i < bottom.size(); i++) {
    total_workspace_fwd = std::max(total_workspace_fwd,
                                   workspace_fwd_sizes_[i]);
    total_workspace_bwd_data = std::max(total_workspace_bwd_data,
                                        workspace_bwd_data_sizes_[i]);
    total_workspace_bwd_filter = std::max(total_workspace_bwd_filter,
                                          workspace_bwd_filter_sizes_[i]);
  }
  // get max over all operations
  size_t max_workspace = std::max(total_workspace_fwd,
                                  total_workspace_bwd_data);
  max_workspace = std::max(max_workspace, total_workspace_bwd_filter);
  // ensure all groups have enough workspace
  size_t total_max_workspace = max_workspace *
                               (this->group_ * CUDNN_STREAMS_PER_GROUP);

  // this is the total amount of storage needed over all groups + streams
  if (total_max_workspace > workspaceSizeInBytes) {
    DLOG(INFO) << "Reallocating workspace storage: " << total_max_workspace;
    workspaceSizeInBytes = total_max_workspace;

    // free the existing workspace and allocate a new (larger) one
    cudaFree(this->workspaceData);

    cudaError_t err = cudaMalloc(&(this->workspaceData), workspaceSizeInBytes);
    if (err != cudaSuccess) {
      // force zero memory path
      for (int i = 0; i < bottom.size(); i++) {
        workspace_fwd_sizes_[i] = 0;
        workspace_bwd_filter_sizes_[i] = 0;
        workspace_bwd_data_sizes_[i] = 0;
        fwd_algo_[i] = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
        bwd_filter_algo_[i] = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
        bwd_data_algo_[i] = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
      }

      // NULL out all workspace pointers
      for (int g = 0; g < (this->group_ * CUDNN_STREAMS_PER_GROUP); g++) {
        workspace[g] = NULL;
      }
      // NULL out underlying data
      workspaceData = NULL;
      workspaceSizeInBytes = 0;
    }

    // if we succeed in the allocation, set pointer aliases for workspaces
    for (int g = 0; g < (this->group_ * CUDNN_STREAMS_PER_GROUP); g++) {
      workspace[g] = reinterpret_cast<char *>(workspaceData) + g*max_workspace;
    }
  }

  // Tensor descriptor for bias.
  if (this->bias_term_) {
    cudnn::setTensor4dDesc<Dtype>(&bias_desc_,
        1, this->num_output_ / this->group_, 1, 1);
  }
}

src/caffe/layers/cudnn_deconv_layer.cpp:
template <typename Dtype>
void CuDNNDeconvolutionLayer<Dtype>::Reshape(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  DeconvolutionLayer<Dtype>::Reshape(bottom, top);
  CHECK_EQ(2, this->num_spatial_axes_)
      << "CuDNNDeconvolutionLayer input must have 2 spatial axes "
      << "(e.g., height and width). "
      << "Use 'engine: CAFFE' for general ND convolution.";
  bottom_offset_ = this->bottom_dim_ / this->group_;
  top_offset_ = this->top_dim_ / this->group_;
  const bool forced_3d = this->forced_3d_;
  const int height = bottom[0]->shape(this->channel_axis_ + 1 + forced_3d);
  const int width = bottom[0]->shape(this->channel_axis_ + 2 + forced_3d);
  const int height_out = top[0]->shape(this->channel_axis_ + 1 + forced_3d);
  const int width_out = top[0]->shape(this->channel_axis_ + 2 + forced_3d);
  const int* pad_data = this->pad_.cpu_data();
  const int pad_h = pad_data[0];
  const int pad_w = pad_data[1];
  const int* stride_data = this->stride_.cpu_data();
  const int stride_h = stride_data[0];
  const int stride_w = stride_data[1];
#if CUDNN_VERSION_MIN(8, 0, 0)
  int RetCnt;
  bool found_conv_algorithm;
  size_t free_memory, total_memory;
  cudnnConvolutionFwdAlgoPerf_t fwd_algo_pref_[4];
  cudnnConvolutionBwdDataAlgoPerf_t bwd_data_algo_pref_[4];

  // query free device memory so that algorithms whose workspace would not
  // fit can be rejected below
  cudaMemGetInfo(&free_memory, &total_memory);
#else
  // Specify workspace limit for kernels directly until we have a
  // planning strategy and a rewrite of Caffe's GPU memory management
  size_t workspace_limit_bytes = 8*1024*1024;
#endif
  for (int i = 0; i < bottom.size(); i++) {
    cudnn::setTensor4dDesc<Dtype>(&bottom_descs_[i],
                                  this->num_,
                                  this->channels_ / this->group_,
                                  height,
                                  width,
                                  this->channels_ * height * width,
                                  height * width,
                                  width,
                                  1);
    cudnn::setTensor4dDesc<Dtype>(&top_descs_[i],
                                  this->num_,
                                  this->num_output_ / this->group_,
                                  height_out,
                                  width_out,
                                  this->num_output_ * height_out * width_out,
                                  height_out * width_out,
                                  width_out,
                                  1);
    // deconvolution runs the convolution "backwards", so the convolution
    // descriptor is built with the top blob on the input side
    cudnn::setConvolutionDesc<Dtype>(&conv_descs_[i],
                                     top_descs_[i],
                                     filter_desc_,
                                     pad_h,
                                     pad_w,
                                     stride_h,
                                     stride_w);
#if CUDNN_VERSION_MIN(8, 0, 0)
    // choose the forward algorithm from the ranked list cuDNN returns;
    // CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED is not implemented in
    // cuDNN 8, so it is skipped below
    CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm_v7(handle_[0],
        top_descs_[i],
        filter_desc_,
        conv_descs_[i],
        bottom_descs_[i],
        4,
        &RetCnt,
        fwd_algo_pref_));

    found_conv_algorithm = false;
    for (int n = 0; n < RetCnt; n++) {
      if (fwd_algo_pref_[n].status == CUDNN_STATUS_SUCCESS &&
          fwd_algo_pref_[n].algo != CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED &&
          fwd_algo_pref_[n].memory < free_memory) {
        found_conv_algorithm = true;
        fwd_algo_[i] = fwd_algo_pref_[n].algo;
        workspace_fwd_sizes_[i] = fwd_algo_pref_[n].memory;
        break;
      }
    }
    if (!found_conv_algorithm) {
      LOG(ERROR) << "cuDNN did not return a suitable algorithm for convolution.";
    } else {
      // choose backward algorithm for filter:
      // for better or worse just a fixed constant, because
      // cudnnGetConvolutionBackwardFilterAlgorithm is gone in cuDNN 8.0
      bwd_filter_algo_[i] = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
      // twice the workspace of the forward search, to be safe
      workspace_bwd_filter_sizes_[i] = 2*workspace_fwd_sizes_[i];
    }

    // choose backward algo for data
    CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm_v7(handle_[0],
        filter_desc_,
        bottom_descs_[i],
        conv_descs_[i],
        top_descs_[i],
        4,
        &RetCnt,
        bwd_data_algo_pref_));

    found_conv_algorithm = false;
    for (int n = 0; n < RetCnt; n++) {
      if (bwd_data_algo_pref_[n].status == CUDNN_STATUS_SUCCESS &&
          bwd_data_algo_pref_[n].algo != CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD &&
          bwd_data_algo_pref_[n].algo != CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED &&
          bwd_data_algo_pref_[n].memory < free_memory) {
        found_conv_algorithm = true;
        bwd_data_algo_[i] = bwd_data_algo_pref_[n].algo;
        workspace_bwd_data_sizes_[i] = bwd_data_algo_pref_[n].memory;
        break;
      }
    }
    if (!found_conv_algorithm) {
      LOG(ERROR) << "cuDNN did not return a suitable algorithm for convolution.";
    }
#else
    // choose forward and backward algorithms + workspace(s)
    CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(
        handle_[0],
        top_descs_[i],
        filter_desc_,
        conv_descs_[i],
        bottom_descs_[i],
        CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
        workspace_limit_bytes,
        &fwd_algo_[i]));

    // We have found that CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM is
    // buggy. Thus, if this algo was chosen, choose winograd instead. If
    // winograd is not supported or its workspace is larger than the
    // threshold, choose implicit_gemm instead.
    if (fwd_algo_[i] == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) {
      size_t winograd_workspace_size;
      cudnnStatus_t status = cudnnGetConvolutionForwardWorkspaceSize(
          handle_[0],
          top_descs_[i],
          filter_desc_,
          conv_descs_[i],
          bottom_descs_[i],
          CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD,
          &winograd_workspace_size);
      if (status != CUDNN_STATUS_SUCCESS ||
          winograd_workspace_size >= workspace_limit_bytes) {
        fwd_algo_[i] = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
      } else {
        fwd_algo_[i] = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD;
      }
    }

    CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(
        handle_[0],
        top_descs_[i],
        filter_desc_,
        conv_descs_[i],
        bottom_descs_[i],
        fwd_algo_[i],
        &(workspace_fwd_sizes_[i])));

    // choose backward algorithm for filter
    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
        handle_[0],
        top_descs_[i],
        bottom_descs_[i],
        conv_descs_[i],
        filter_desc_,
        CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
        workspace_limit_bytes,
        &bwd_filter_algo_[i]));

    // get workspace for backwards filter algorithm
    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
        handle_[0],
        top_descs_[i],
        bottom_descs_[i],
        conv_descs_[i],
        filter_desc_,
        bwd_filter_algo_[i],
        &workspace_bwd_filter_sizes_[i]));

    // choose backward algo for data
    CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(
        handle_[0],
        filter_desc_,
        bottom_descs_[i],
        conv_descs_[i],
        top_descs_[i],
        CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
        workspace_limit_bytes,
        &bwd_data_algo_[i]));

    // get workspace size
    CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(
        handle_[0],
        filter_desc_,
        bottom_descs_[i],
        conv_descs_[i],
        top_descs_[i],
        bwd_data_algo_[i],
        &workspace_bwd_data_sizes_[i]));
#endif
  }

  // reduce over all workspace sizes to get a maximum to allocate / reallocate
  size_t total_workspace_fwd = 0;
  size_t total_workspace_bwd_data = 0;
  size_t total_workspace_bwd_filter = 0;

  for (size_t i = 0; i < bottom.size(); i++) {
    total_workspace_fwd = std::max(total_workspace_fwd,
                                   workspace_fwd_sizes_[i]);
    total_workspace_bwd_data = std::max(total_workspace_bwd_data,
                                        workspace_bwd_data_sizes_[i]);
    total_workspace_bwd_filter = std::max(total_workspace_bwd_filter,
                                          workspace_bwd_filter_sizes_[i]);
  }
  // get max over all operations
  size_t max_workspace = std::max(total_workspace_fwd,
                                  total_workspace_bwd_data);
  max_workspace = std::max(max_workspace, total_workspace_bwd_filter);
  // ensure all groups have enough workspace
  size_t total_max_workspace = max_workspace *
                               (this->group_ * CUDNN_STREAMS_PER_GROUP);

  // this is the total amount of storage needed over all groups + streams
  if (total_max_workspace > workspaceSizeInBytes) {
    DLOG(INFO) << "Reallocating workspace storage: " << total_max_workspace;
    workspaceSizeInBytes = total_max_workspace;

    // free the existing workspace and allocate a new (larger) one
    cudaFree(this->workspaceData);

    cudaError_t err = cudaMalloc(&(this->workspaceData), workspaceSizeInBytes);
    if (err != cudaSuccess) {
      // force zero memory path
      for (int i = 0; i < bottom.size(); i++) {
        workspace_fwd_sizes_[i] = 0;
        workspace_bwd_filter_sizes_[i] = 0;
        workspace_bwd_data_sizes_[i] = 0;
        fwd_algo_[i] = CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING;
        bwd_filter_algo_[i] = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
        bwd_data_algo_[i] = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
      }

      // NULL out all workspace pointers
      for (int g = 0; g < (this->group_ * CUDNN_STREAMS_PER_GROUP); g++) {
        workspace[g] = NULL;
      }
      // NULL out underlying data
      workspaceData = NULL;
      workspaceSizeInBytes = 0;
    }

    // if we succeed in the allocation, set pointer aliases for workspaces
    for (int g = 0; g < (this->group_ * CUDNN_STREAMS_PER_GROUP); g++) {
      workspace[g] = reinterpret_cast<char *>(workspaceData) + g*max_workspace;
    }
  }

  // Tensor descriptor for bias.
  if (this->bias_term_) {
    cudnn::setTensor4dDesc<Dtype>(
        &bias_desc_, 1, this->num_output_ / this->group_, 1, 1);
  }
}
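
To double-check which cuDNN the toolchain actually picks up (and that cudnn_version.h resolves), a minimal standalone probe can help; this is an illustrative sketch, not part of the patch (build with e.g. g++ probe.cpp -lcudnn):

#include <cstdio>
#include <cudnn.h>  // in cuDNN 8 this header includes cudnn_version.h

int main() {
  // Compile-time version from the headers vs. the runtime version of the
  // library that actually got linked; a mismatch points at a stale install.
  std::printf("headers: cuDNN %d.%d.%d\n",
              CUDNN_MAJOR, CUDNN_MINOR, CUDNN_PATCHLEVEL);
  std::printf("library: %zu\n", cudnnGetVersion());
  return 0;
}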

For errors about CV_LOAD_IMAGE_COLOR (the constant was moved out of the default headers in OpenCV 4), add the following include to the affected files:

#include "opencv2/imgcodecs/legacy/constants_c.h"
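
Alternatively, the call sites can be ported to the OpenCV 4 constants instead of including the legacy header. A sketch of the substitution (ReadImage is a hypothetical helper; in Caffe the affected flag usage sits in src/caffe/util/io.cpp):

#include <opencv2/imgcodecs.hpp>
#include <string>

// OpenCV 4 replaces CV_LOAD_IMAGE_COLOR / CV_LOAD_IMAGE_GRAYSCALE with the
// cv::ImreadModes enum, so no legacy header is needed.
cv::Mat ReadImage(const std::string& filename, bool is_color) {
  const int cv_read_flag = is_color ? cv::IMREAD_COLOR : cv::IMREAD_GRAYSCALE;
  return cv::imread(filename, cv_read_flag);
}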

Finally, if the Python wrapper is not needed, switch it off in CMakeLists.txt (equivalently, pass -DBUILD_python=OFF when configuring):

caffe_option(BUILD_python "Build Python wrapper" OFF)