Reference: https://blog.csdn.net/XCCCCZ/article/details/112909233
Edit cmake/Cuda.cmake and replace every occurrence of "cudnn.h" with "cudnn_version.h". cuDNN 8 moved the version macros (CUDNN_MAJOR, CUDNN_MINOR, CUDNN_PATCHLEVEL) out of cudnn.h and into cudnn_version.h, so the version detection in cmake/Cuda.cmake has to parse the new header.
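The #if CUDNN_VERSION_MIN(8, 0, 0) guards in the patches below come from Caffe's include/caffe/util/cudnn.hpp; as a reference sketch (check your tree, the stock definition is equivalent to this):

#define CUDNN_VERSION_MIN(major, minor, patch) \
    (CUDNN_VERSION >= ((major) * 1000 + (minor) * 100 + (patch)))

CUDNN_VERSION itself is composed from CUDNN_MAJOR / CUDNN_MINOR / CUDNN_PATCHLEVEL, which cuDNN 8 defines in cudnn_version.h rather than cudnn.h; that is exactly why the CMake change above is needed.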
src/caffe/layers/cudnn_conv_layer.cpp:
template <typename Dtype>
void CuDNNConvolutionLayer<Dtype>::Reshape(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  ConvolutionLayer<Dtype>::Reshape(bottom, top);
  CHECK_LE(2, this->num_spatial_axes_)
      << "CuDNNConvolution input must have 2 spatial axes "
      << "(e.g., height and width). "
      << "Use 'engine: CAFFE' for general ND convolution.";
  bottom_offset_ = this->bottom_dim_ / this->group_;
  top_offset_ = this->top_dim_ / this->group_;
  const bool forced_3d = this->forced_3d_;
  const int height = bottom[0]->shape(this->channel_axis_ + 1 + forced_3d);
  const int width = bottom[0]->shape(this->channel_axis_ + 2 + forced_3d);
  const int height_out = top[0]->shape(this->channel_axis_ + 1 + forced_3d);
  const int width_out = top[0]->shape(this->channel_axis_ + 2 + forced_3d);
  const int* pad_data = this->pad_.cpu_data();
  const int pad_h = pad_data[0];
  const int pad_w = pad_data[1];
  const int* stride_data = this->stride_.cpu_data();
  const int stride_h = stride_data[0];
  const int stride_w = stride_data[1];
#if CUDNN_VERSION_MIN(8, 0, 0)
  int RetCnt;
  bool found_conv_algorithm;
  size_t free_memory, total_memory;
  cudnnConvolutionFwdAlgoPerf_t fwd_algo_pref_[4];
  cudnnConvolutionBwdDataAlgoPerf_t bwd_data_algo_pref_[4];

  // query free device memory so that algorithms whose workspace would not
  // fit can be rejected below
  cudaMemGetInfo(&free_memory, &total_memory);
#else
  // Specify workspace limit for kernels directly until we have a
  // planning strategy and a rewrite of Caffe's GPU memory management
  size_t workspace_limit_bytes = 8*1024*1024;
#endif
  for (int i = 0; i < bottom.size(); i++) {
    cudnn::setTensor4dDesc<Dtype>(&bottom_descs_[i],
        this->num_,
        this->channels_ / this->group_, height, width,
        this->channels_ * height * width,
        height * width, width, 1);
    cudnn::setTensor4dDesc<Dtype>(&top_descs_[i],
        this->num_,
        this->num_output_ / this->group_, height_out, width_out,
        this->num_output_ * this->out_spatial_dim_,
        this->out_spatial_dim_, width_out, 1);
    cudnn::setConvolutionDesc<Dtype>(&conv_descs_[i], bottom_descs_[i],
        filter_desc_, pad_h, pad_w,
        stride_h, stride_w);
#if CUDNN_VERSION_MIN(8, 0, 0)
    // choose the forward algorithm from the ranked list cuDNN returns;
    // CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED is not implemented in
    // cuDNN 8, so it is skipped below
    CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm_v7(handle_[0],
        bottom_descs_[i],
        filter_desc_,
        conv_descs_[i],
        top_descs_[i],
        4,
        &RetCnt,
        fwd_algo_pref_));

    found_conv_algorithm = false;
    for (int n = 0; n < RetCnt; n++) {
      if (fwd_algo_pref_[n].status == CUDNN_STATUS_SUCCESS &&
          fwd_algo_pref_[n].algo != CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED &&
          fwd_algo_pref_[n].memory < free_memory) {
        found_conv_algorithm = true;
        fwd_algo_[i] = fwd_algo_pref_[n].algo;
        workspace_fwd_sizes_[i] = fwd_algo_pref_[n].memory;
        break;
      }
    }
    if (!found_conv_algorithm) {
      LOG(ERROR) << "cuDNN did not return a suitable algorithm for convolution.";
    } else {
      // choose backward algorithm for filter:
      // for better or worse just a fixed constant, because
      // cudnnGetConvolutionBackwardFilterAlgorithm is gone in cuDNN 8.0
      bwd_filter_algo_[i] = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
      // twice the workspace of the forward search, to be safe
      workspace_bwd_filter_sizes_[i] = 2*workspace_fwd_sizes_[i];
    }

    // choose backward algo for data
    CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm_v7(handle_[0],
        filter_desc_,
        top_descs_[i],
        conv_descs_[i],
        bottom_descs_[i],
        4,
        &RetCnt,
        bwd_data_algo_pref_));

    found_conv_algorithm = false;
    for (int n = 0; n < RetCnt; n++) {
      if (bwd_data_algo_pref_[n].status == CUDNN_STATUS_SUCCESS &&
          bwd_data_algo_pref_[n].algo != CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD &&
          bwd_data_algo_pref_[n].algo != CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED &&
          bwd_data_algo_pref_[n].memory < free_memory) {
        found_conv_algorithm = true;
        bwd_data_algo_[i] = bwd_data_algo_pref_[n].algo;
        workspace_bwd_data_sizes_[i] = bwd_data_algo_pref_[n].memory;
        break;
      }
    }
    if (!found_conv_algorithm) {
      LOG(ERROR) << "cuDNN did not return a suitable algorithm for convolution.";
    }
#else
    // choose forward and backward algorithms + workspace(s)
    CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(handle_[0],
        bottom_descs_[i],
        filter_desc_,
        conv_descs_[i],
        top_descs_[i],
        CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
        workspace_limit_bytes,
        &fwd_algo_[i]));

    CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(handle_[0],
        bottom_descs_[i],
        filter_desc_,
        conv_descs_[i],
        top_descs_[i],
        fwd_algo_[i],
        &(workspace_fwd_sizes_[i])));

    // choose backward algorithm for filter
    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(handle_[0],
        bottom_descs_[i], top_descs_[i], conv_descs_[i], filter_desc_,
        CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
        workspace_limit_bytes, &bwd_filter_algo_[i]));

    // get workspace for backwards filter algorithm
    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(handle_[0],
        bottom_descs_[i], top_descs_[i], conv_descs_[i], filter_desc_,
        bwd_filter_algo_[i], &workspace_bwd_filter_sizes_[i]));

    // choose backward algo for data
    CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(handle_[0],
        filter_desc_, top_descs_[i], conv_descs_[i], bottom_descs_[i],
        CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
        workspace_limit_bytes, &bwd_data_algo_[i]));

    // get workspace size
    CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(handle_[0],
        filter_desc_, top_descs_[i], conv_descs_[i], bottom_descs_[i],
        bwd_data_algo_[i], &workspace_bwd_data_sizes_[i]));
#endif
  }

  // reduce over all workspace sizes to get a maximum to allocate / reallocate
  size_t total_workspace_fwd = 0;
  size_t total_workspace_bwd_data = 0;
  size_t total_workspace_bwd_filter = 0;

  for (size_t i = 0; i < bottom.size(); i++) {
    total_workspace_fwd = std::max(total_workspace_fwd,
                                   workspace_fwd_sizes_[i]);
    total_workspace_bwd_data = std::max(total_workspace_bwd_data,
                                        workspace_bwd_data_sizes_[i]);
    total_workspace_bwd_filter = std::max(total_workspace_bwd_filter,
                                          workspace_bwd_filter_sizes_[i]);
  }
  // get max over all operations
  size_t max_workspace = std::max(total_workspace_fwd,
                                  total_workspace_bwd_data);
  max_workspace = std::max(max_workspace, total_workspace_bwd_filter);
  // ensure all groups have enough workspace
  size_t total_max_workspace = max_workspace *
                               (this->group_ * CUDNN_STREAMS_PER_GROUP);

  // this is the total amount of storage needed over all groups + streams
  if (total_max_workspace > workspaceSizeInBytes) {
    DLOG(INFO) << "Reallocating workspace storage: " << total_max_workspace;
    workspaceSizeInBytes = total_max_workspace;

    // free the existing workspace and allocate a new (larger) one
    cudaFree(this->workspaceData);

    cudaError_t err = cudaMalloc(&(this->workspaceData), workspaceSizeInBytes);
    if (err != cudaSuccess) {
      // force zero memory path
      for (int i = 0; i < bottom.size(); i++) {
        workspace_fwd_sizes_[i] = 0;
        workspace_bwd_filter_sizes_[i] = 0;
        workspace_bwd_data_sizes_[i] = 0;
        fwd_algo_[i] = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
        bwd_filter_algo_[i] = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
        bwd_data_algo_[i] = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
      }

      // NULL out all workspace pointers
      for (int g = 0; g < (this->group_ * CUDNN_STREAMS_PER_GROUP); g++) {
        workspace[g] = NULL;
      }
      // NULL out underlying data
      workspaceData = NULL;
      workspaceSizeInBytes = 0;
    }

    // if we succeed in the allocation, set pointer aliases for workspaces
    for (int g = 0; g < (this->group_ * CUDNN_STREAMS_PER_GROUP); g++) {
      workspace[g] = reinterpret_cast<char *>(workspaceData) + g*max_workspace;
    }
  }

  // Tensor descriptor for bias.
  if (this->bias_term_) {
    cudnn::setTensor4dDesc<Dtype>(&bias_desc_,
        1, this->num_output_ / this->group_, 1, 1);
  }
}

src/caffe/layers/cudnn_deconv_layer.cpp:
template <typename Dtype>
void CuDNNDeconvolutionLayer<Dtype>::Reshape(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  DeconvolutionLayer<Dtype>::Reshape(bottom, top);
  CHECK_EQ(2, this->num_spatial_axes_)
      << "CuDNNDeconvolutionLayer input must have 2 spatial axes "
      << "(e.g., height and width). "
      << "Use 'engine: CAFFE' for general ND convolution.";
  bottom_offset_ = this->bottom_dim_ / this->group_;
  top_offset_ = this->top_dim_ / this->group_;
  const bool forced_3d = this->forced_3d_;
  const int height = bottom[0]->shape(this->channel_axis_ + 1 + forced_3d);
  const int width = bottom[0]->shape(this->channel_axis_ + 2 + forced_3d);
  const int height_out = top[0]->shape(this->channel_axis_ + 1 + forced_3d);
  const int width_out = top[0]->shape(this->channel_axis_ + 2 + forced_3d);
  const int* pad_data = this->pad_.cpu_data();
  const int pad_h = pad_data[0];
  const int pad_w = pad_data[1];
  const int* stride_data = this->stride_.cpu_data();
  const int stride_h = stride_data[0];
  const int stride_w = stride_data[1];
#if CUDNN_VERSION_MIN(8, 0, 0)
  int RetCnt;
  bool found_conv_algorithm;
  size_t free_memory, total_memory;
  cudnnConvolutionFwdAlgoPerf_t fwd_algo_pref_[4];
  cudnnConvolutionBwdDataAlgoPerf_t bwd_data_algo_pref_[4];

  // query free device memory so that algorithms whose workspace would not
  // fit can be rejected below
  cudaMemGetInfo(&free_memory, &total_memory);
#else
  // Specify workspace limit for kernels directly until we have a
  // planning strategy and a rewrite of Caffe's GPU memory management
  size_t workspace_limit_bytes = 8*1024*1024;
#endif
  for (int i = 0; i < bottom.size(); i++) {
    cudnn::setTensor4dDesc<Dtype>(&bottom_descs_[i],
                                  this->num_,
                                  this->channels_ / this->group_,
                                  height,
                                  width,
                                  this->channels_ * height * width,
                                  height * width,
                                  width,
                                  1);
    cudnn::setTensor4dDesc<Dtype>(&top_descs_[i],
                                  this->num_,
                                  this->num_output_ / this->group_,
                                  height_out,
                                  width_out,
                                  this->num_output_ * height_out * width_out,
                                  height_out * width_out,
                                  width_out,
                                  1);
    // deconvolution runs the convolution "backwards", so the convolution
    // descriptor is built with the top blob on the input side
    cudnn::setConvolutionDesc<Dtype>(&conv_descs_[i],
                                     top_descs_[i],
                                     filter_desc_,
                                     pad_h,
                                     pad_w,
                                     stride_h,
                                     stride_w);
#if CUDNN_VERSION_MIN(8, 0, 0)
    // choose the forward algorithm from the ranked list cuDNN returns;
    // CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED is not implemented in
    // cuDNN 8, so it is skipped below
    CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm_v7(handle_[0],
        top_descs_[i],
        filter_desc_,
        conv_descs_[i],
        bottom_descs_[i],
        4,
        &RetCnt,
        fwd_algo_pref_));

    found_conv_algorithm = false;
    for (int n = 0; n < RetCnt; n++) {
      if (fwd_algo_pref_[n].status == CUDNN_STATUS_SUCCESS &&
          fwd_algo_pref_[n].algo != CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED &&
          fwd_algo_pref_[n].memory < free_memory) {
        found_conv_algorithm = true;
        fwd_algo_[i] = fwd_algo_pref_[n].algo;
        workspace_fwd_sizes_[i] = fwd_algo_pref_[n].memory;
        break;
      }
    }
    if (!found_conv_algorithm) {
      LOG(ERROR) << "cuDNN did not return a suitable algorithm for convolution.";
    } else {
      // choose backward algorithm for filter:
      // for better or worse just a fixed constant, because
      // cudnnGetConvolutionBackwardFilterAlgorithm is gone in cuDNN 8.0
      bwd_filter_algo_[i] = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
      // twice the workspace of the forward search, to be safe
      workspace_bwd_filter_sizes_[i] = 2*workspace_fwd_sizes_[i];
    }

    // choose backward algo for data
    CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm_v7(handle_[0],
        filter_desc_,
        bottom_descs_[i],
        conv_descs_[i],
        top_descs_[i],
        4,
        &RetCnt,
        bwd_data_algo_pref_));

    found_conv_algorithm = false;
    for (int n = 0; n < RetCnt; n++) {
      if (bwd_data_algo_pref_[n].status == CUDNN_STATUS_SUCCESS &&
          bwd_data_algo_pref_[n].algo != CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD &&
          bwd_data_algo_pref_[n].algo != CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED &&
          bwd_data_algo_pref_[n].memory < free_memory) {
        found_conv_algorithm = true;
        bwd_data_algo_[i] = bwd_data_algo_pref_[n].algo;
        workspace_bwd_data_sizes_[i] = bwd_data_algo_pref_[n].memory;
        break;
      }
    }
    if (!found_conv_algorithm) {
      LOG(ERROR) << "cuDNN did not return a suitable algorithm for convolution.";
    }
#else
    // choose forward and backward algorithms + workspace(s)
    CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(
        handle_[0],
        top_descs_[i],
        filter_desc_,
        conv_descs_[i],
        bottom_descs_[i],
        CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
        workspace_limit_bytes,
        &fwd_algo_[i]));

    // We have found that CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM is
    // buggy. Thus, if this algo was chosen, choose winograd instead. If
    // winograd is not supported or its workspace is larger than the
    // threshold, choose implicit_gemm instead.
    if (fwd_algo_[i] == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) {
      size_t winograd_workspace_size;
      cudnnStatus_t status = cudnnGetConvolutionForwardWorkspaceSize(
          handle_[0],
          top_descs_[i],
          filter_desc_,
          conv_descs_[i],
          bottom_descs_[i],
          CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD,
          &winograd_workspace_size);
      if (status != CUDNN_STATUS_SUCCESS ||
          winograd_workspace_size >= workspace_limit_bytes) {
        fwd_algo_[i] = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
      } else {
        fwd_algo_[i] = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD;
      }
    }

    CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(
        handle_[0],
        top_descs_[i],
        filter_desc_,
        conv_descs_[i],
        bottom_descs_[i],
        fwd_algo_[i],
        &(workspace_fwd_sizes_[i])));

    // choose backward algorithm for filter
    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
        handle_[0],
        top_descs_[i],
        bottom_descs_[i],
        conv_descs_[i],
        filter_desc_,
        CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
        workspace_limit_bytes,
        &bwd_filter_algo_[i]));

    // get workspace for backwards filter algorithm
    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
        handle_[0],
        top_descs_[i],
        bottom_descs_[i],
        conv_descs_[i],
        filter_desc_,
        bwd_filter_algo_[i],
        &workspace_bwd_filter_sizes_[i]));

    // choose backward algo for data
    CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(
        handle_[0],
        filter_desc_,
        bottom_descs_[i],
        conv_descs_[i],
        top_descs_[i],
        CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
        workspace_limit_bytes,
        &bwd_data_algo_[i]));

    // get workspace size
    CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(
        handle_[0],
        filter_desc_,
        bottom_descs_[i],
        conv_descs_[i],
        top_descs_[i],
        bwd_data_algo_[i],
        &workspace_bwd_data_sizes_[i]));
#endif
  }

  // reduce over all workspace sizes to get a maximum to allocate / reallocate
  size_t total_workspace_fwd = 0;
  size_t total_workspace_bwd_data = 0;
  size_t total_workspace_bwd_filter = 0;

  for (size_t i = 0; i < bottom.size(); i++) {
    total_workspace_fwd = std::max(total_workspace_fwd,
                                   workspace_fwd_sizes_[i]);
    total_workspace_bwd_data = std::max(total_workspace_bwd_data,
                                        workspace_bwd_data_sizes_[i]);
    total_workspace_bwd_filter = std::max(total_workspace_bwd_filter,
                                          workspace_bwd_filter_sizes_[i]);
  }
  // get max over all operations
  size_t max_workspace = std::max(total_workspace_fwd,
                                  total_workspace_bwd_data);
  max_workspace = std::max(max_workspace, total_workspace_bwd_filter);
  // ensure all groups have enough workspace
  size_t total_max_workspace = max_workspace *
                               (this->group_ * CUDNN_STREAMS_PER_GROUP);

  // this is the total amount of storage needed over all groups + streams
  if (total_max_workspace > workspaceSizeInBytes) {
    DLOG(INFO) << "Reallocating workspace storage: " << total_max_workspace;
    workspaceSizeInBytes = total_max_workspace;

    // free the existing workspace and allocate a new (larger) one
    cudaFree(this->workspaceData);

    cudaError_t err = cudaMalloc(&(this->workspaceData), workspaceSizeInBytes);
    if (err != cudaSuccess) {
      // force zero memory path
      for (int i = 0; i < bottom.size(); i++) {
        workspace_fwd_sizes_[i] = 0;
        workspace_bwd_filter_sizes_[i] = 0;
        workspace_bwd_data_sizes_[i] = 0;
        fwd_algo_[i] = CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING;
        bwd_filter_algo_[i] = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
        bwd_data_algo_[i] = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
      }

      // NULL out all workspace pointers
      for (int g = 0; g < (this->group_ * CUDNN_STREAMS_PER_GROUP); g++) {
        workspace[g] = NULL;
      }
      // NULL out underlying data
      workspaceData = NULL;
      workspaceSizeInBytes = 0;
    }

    // if we succeed in the allocation, set pointer aliases for workspaces
    for (int g = 0; g < (this->group_ * CUDNN_STREAMS_PER_GROUP); g++) {
      workspace[g] = reinterpret_cast<char *>(workspaceData) + g*max_workspace;
    }
  }

  // Tensor descriptor for bias.
  if (this->bias_term_) {
    cudnn::setTensor4dDesc<Dtype>(
        &bias_desc_, 1, this->num_output_ / this->group_, 1, 1);
  }
}
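
To double-check which cuDNN the toolchain actually picks up (and that cudnn_version.h resolves), a minimal standalone probe can help; this is an illustrative sketch, not part of the patch (build with e.g. g++ probe.cpp -lcudnn):

#include <cstdio>
#include <cudnn.h>  // in cuDNN 8 this header includes cudnn_version.h

int main() {
  // Compile-time version from the headers vs. the runtime version of the
  // library that actually got linked; a mismatch points at a stale install.
  std::printf("headers: cuDNN %d.%d.%d\n",
              CUDNN_MAJOR, CUDNN_MINOR, CUDNN_PATCHLEVEL);
  std::printf("library: %zu\n", cudnnGetVersion());
  return 0;
}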

For errors about CV_LOAD_IMAGE_COLOR (the constant was moved out of the default headers in OpenCV 4), add the following include to the affected files:

#include "opencv2/imgcodecs/legacy/constants_c.h"
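
Alternatively, the call sites can be ported to the OpenCV 4 constants instead of including the legacy header. A sketch of the substitution (ReadImage is a hypothetical helper; in Caffe the affected flag usage sits in src/caffe/util/io.cpp):

#include <opencv2/imgcodecs.hpp>
#include <string>

// OpenCV 4 replaces CV_LOAD_IMAGE_COLOR / CV_LOAD_IMAGE_GRAYSCALE with the
// cv::ImreadModes enum, so no legacy header is needed.
cv::Mat ReadImage(const std::string& filename, bool is_color) {
  const int cv_read_flag = is_color ? cv::IMREAD_COLOR : cv::IMREAD_GRAYSCALE;
  return cv::imread(filename, cv_read_flag);
}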

Finally, if the Python wrapper is not needed, switch it off in CMakeLists.txt (equivalently, pass -DBUILD_python=OFF when configuring):

caffe_option(BUILD_python "Build Python wrapper" OFF)