preprocess.cu

/******************************************************************************
 * Copyright 2020 The Apollo Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *****************************************************************************/
/*
 * Copyright 2018-2019 Autoware Foundation. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 * @author Kosuke Murakami
 * @date 2019/02/26
 */
/**
 * @author Yan haixu
 * Contact: just github.com/hova88
 * @date 2021/04/30
 */
// headers in STL
#include <stdio.h>

// headers in local files
#include "common.h"
#include "preprocess.h"
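
// Builds a per-pillar point histogram: one thread per input point. Each
// thread computes the (x, y, z) grid cell of its point, atomically increments
// that cell's point count, and, while the pillar still has room
// (count < max_points_per_pillar), copies the point's raw features into the
// (y, x)-indexed staging buffer dev_pillar_point_feature_in_coors.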
__global__ void make_pillar_histo_kernel(
    const float* dev_points, float* dev_pillar_point_feature_in_coors,
    int* pillar_count_histo, const int num_points,
    const int max_points_per_pillar, const int grid_x_size,
    const int grid_y_size, const int grid_z_size, const float min_x_range,
    const float min_y_range, const float min_z_range, const float pillar_x_size,
    const float pillar_y_size, const float pillar_z_size,
    const int num_point_feature) {
  int th_i = blockIdx.x * blockDim.x + threadIdx.x;
  if (th_i >= num_points) {
    return;
  }
  int x_coor = floor((dev_points[th_i * num_point_feature + 0] - min_x_range) / pillar_x_size);
  int y_coor = floor((dev_points[th_i * num_point_feature + 1] - min_y_range) / pillar_y_size);
  int z_coor = floor((dev_points[th_i * num_point_feature + 2] - min_z_range) / pillar_z_size);

  if (x_coor >= 0 && x_coor < grid_x_size && y_coor >= 0 &&
      y_coor < grid_y_size && z_coor >= 0 && z_coor < grid_z_size) {
    int count =
        atomicAdd(&pillar_count_histo[y_coor * grid_x_size + x_coor], 1);
    if (count < max_points_per_pillar) {
      int ind =
          y_coor * grid_x_size * max_points_per_pillar * num_point_feature +
          x_coor * max_points_per_pillar * num_point_feature +
          count * num_point_feature;
      for (int i = 0; i < num_point_feature; ++i) {
        dev_pillar_point_feature_in_coors[ind + i] =
            dev_points[th_i * num_point_feature + i];
      }
    }
  }
}
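
// Selects the non-empty pillars: launched with one block per x column and one
// thread per y row. For every cell with at least one point it atomically
// claims the next pillar slot, records the cell's (x, y) coordinates, clamps
// the stored point count to max_points_per_pillar, and flags the cell in the
// sparse pillar map.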
__global__ void make_pillar_index_kernel(
    int* dev_pillar_count_histo, int* dev_counter, int* dev_pillar_count,
    int* dev_x_coors, int* dev_y_coors, float* dev_num_points_per_pillar,
    int* dev_sparse_pillar_map, const int max_pillars,
    const int max_points_per_pillar, const int grid_x_size,
    const int num_inds_for_scan) {
  int x = blockIdx.x;
  int y = threadIdx.x;
  int num_points_at_this_pillar = dev_pillar_count_histo[y * grid_x_size + x];
  if (num_points_at_this_pillar == 0) {
    return;
  }

  int count = atomicAdd(dev_counter, 1);
  if (count < max_pillars) {
    atomicAdd(dev_pillar_count, 1);
    if (num_points_at_this_pillar >= max_points_per_pillar) {
      dev_num_points_per_pillar[count] = max_points_per_pillar;
    } else {
      dev_num_points_per_pillar[count] = num_points_at_this_pillar;
    }
    dev_x_coors[count] = x;
    dev_y_coors[count] = y;
    dev_sparse_pillar_map[y * num_inds_for_scan + x] = 1;
  }
}
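
// Compacts the selected pillars: one block per pillar, one thread per point.
// Copies the point features from the (y, x)-indexed staging buffer into the
// dense per-pillar buffer and writes the pillar coordinates as
// (batch_idx, z, y, x).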
__global__ void make_pillar_feature_kernel(
    float* dev_pillar_point_feature_in_coors, float* dev_pillar_point_feature,
    float* dev_pillar_coors, int* dev_x_coors, int* dev_y_coors,
    float* dev_num_points_per_pillar, const int max_points,
    const int num_point_feature, const int grid_x_size) {
  int ith_pillar = blockIdx.x;
  int num_points_at_this_pillar = dev_num_points_per_pillar[ith_pillar];
  int ith_point = threadIdx.x;
  if (ith_point >= num_points_at_this_pillar) {
    return;
  }
  int x_ind = dev_x_coors[ith_pillar];
  int y_ind = dev_y_coors[ith_pillar];
  int pillar_ind = ith_pillar * max_points * num_point_feature +
                   ith_point * num_point_feature;
  int coors_ind = y_ind * grid_x_size * max_points * num_point_feature +
                  x_ind * max_points * num_point_feature +
                  ith_point * num_point_feature;
#pragma unroll
  for (int i = 0; i < num_point_feature; ++i) {
    dev_pillar_point_feature[pillar_ind + i] =
        dev_pillar_point_feature_in_coors[coors_ind + i];
  }

  float coor_x = static_cast<float>(x_ind);
  float coor_y = static_cast<float>(y_ind);
  dev_pillar_coors[ith_pillar * 4 + 0] = 0;  // batch idx
  dev_pillar_coors[ith_pillar * 4 + 1] = 0;  // z
  dev_pillar_coors[ith_pillar * 4 + 2] = coor_y;
  dev_pillar_coors[ith_pillar * 4 + 3] = coor_x;
}
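
// Computes the per-pillar mean of the first three point features (x, y, z):
// one block per pillar, threadIdx.x indexes the point and threadIdx.y the
// axis. The values are loaded into shared memory, padded with zeros up to the
// reduction width (32 or 64 slots), summed with a tree reduction, and the
// thread with ith_point == 0 divides by the pillar's point count.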
__global__ void pillar_mean_kernel(
    float* dev_points_mean,
    const int num_point_feature,
    const float* dev_pillar_point_feature,
    const float* dev_num_points_per_pillar,
    int max_pillars,
    int max_points_per_pillar) {
  extern __shared__ float temp[];
  int ith_pillar = blockIdx.x;
  int ith_point = threadIdx.x;
  int axis = threadIdx.y;

  // Reduction width: 32 or 64 slots, whichever first covers max_points_per_pillar.
  int reduce_size = max_points_per_pillar > 32 ? 64 : 32;

  temp[threadIdx.x * 3 + axis] = dev_pillar_point_feature[ith_pillar * max_points_per_pillar * num_point_feature + ith_point * num_point_feature + axis];
  if (threadIdx.x < reduce_size - max_points_per_pillar) {
    temp[(threadIdx.x + max_points_per_pillar) * 3 + axis] = 0.0f;  // pad the unused slots with zeros
  }
  __syncthreads();

  int num_points_at_this_pillar = dev_num_points_per_pillar[ith_pillar];
  if (ith_point >= num_points_at_this_pillar) {
    return;
  }

  // Tree reduction over the points of this pillar, one axis per threadIdx.y.
  for (unsigned int d = reduce_size >> 1; d > 0; d >>= 1) {
    if (ith_point < d) {
      temp[ith_point * 3 + axis] += temp[(ith_point + d) * 3 + axis];
    }
    __syncthreads();
  }

  if (ith_point == 0) {
    dev_points_mean[ith_pillar * 3 + axis] = temp[ith_point * 3 + axis] / num_points_at_this_pillar;
  }
}
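
// Final unrolled reduction steps for a single warp (strides 8, 4, 2, 1).
// The shared-memory pointer is volatile so loads and stores are not cached
// in registers, which lets the warp finish without __syncthreads().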
__device__ void warpReduce(volatile float* sdata, int ith_point, int axis) {
  sdata[ith_point * blockDim.y + axis] += sdata[(ith_point + 8) * blockDim.y + axis];
  sdata[ith_point * blockDim.y + axis] += sdata[(ith_point + 4) * blockDim.y + axis];
  sdata[ith_point * blockDim.y + axis] += sdata[(ith_point + 2) * blockDim.y + axis];
  sdata[ith_point * blockDim.y + axis] += sdata[(ith_point + 1) * blockDim.y + axis];
}
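
// Alternative mean kernel using the "unroll the last warp" pattern: each
// thread first sums one pair of points (ith_point and ith_point + blockDim.x),
// then warpReduce() finishes the reduction. Its launch is currently commented
// out in DoPreprocessPointsCuda in favor of pillar_mean_kernel.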
__global__ void make_pillar_mean_kernel(
    float* dev_points_mean,
    const int num_point_feature,
    const float* dev_pillar_point_feature,
    const float* dev_num_points_per_pillar,
    int max_pillars,
    int max_points_per_pillar) {
  extern __shared__ float temp[];
  unsigned int ith_pillar = blockIdx.x;  // { 0, 1, 2, ..., 10000+ }
  unsigned int ith_point = threadIdx.x;  // { 0, 1, 2, ..., 9 }
  unsigned int axis = threadIdx.y;

  unsigned int idx_pre = ith_pillar * max_points_per_pillar * num_point_feature
                       + ith_point * num_point_feature;
  unsigned int idx_post = ith_pillar * max_points_per_pillar * num_point_feature
                        + (ith_point + blockDim.x) * num_point_feature;

  temp[ith_point * blockDim.y + axis] = 0.0;
  unsigned int num_points_at_this_pillar = dev_num_points_per_pillar[ith_pillar];

  // First reduction step: each thread sums one pair of points.
  // if (ith_point < num_points_at_this_pillar / 2) {
  temp[ith_point * blockDim.y + axis] = dev_pillar_point_feature[idx_pre + axis]
                                      + dev_pillar_point_feature[idx_post + axis];
  // }
  __syncthreads();

  // Do the remaining reduction in shared memory with sequential addressing:
  // threads access shared memory with a stride of one 32-bit word, which
  // avoids bank conflicts.
  warpReduce(temp, ith_point, axis);

  // Write the result for this block to global memory.
  if (ith_point == 0)
    dev_points_mean[ith_pillar * blockDim.y + axis] = temp[ith_point * blockDim.y + axis] / num_points_at_this_pillar;
}
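
// Assembles the 11-channel PFE input for every point: the first five raw
// point features (channels 0-4), the offset from the pillar's mean x/y/z
// (channels 5-7, "f_cluster"), and the offset from the pillar's geometric
// center (channels 8-10, "f_center").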
__global__ void gather_point_feature_kernel(
    const int max_num_pillars_, const int max_num_points_per_pillar, const int num_point_feature,
    const float min_x_range, const float min_y_range, const float min_z_range,
    const float pillar_x_size, const float pillar_y_size, const float pillar_z_size,
    const float* dev_pillar_point_feature, const float* dev_num_points_per_pillar,
    const float* dev_pillar_coors,
    float* dev_points_mean,
    float* dev_pfe_gather_feature_) {
  int ith_pillar = blockIdx.x;
  int ith_point = threadIdx.x;
  // int kNumPointFeature = 5;
  int num_gather_feature = 11;
  int num_points_at_this_pillar = dev_num_points_per_pillar[ith_pillar];

  if (ith_point >= num_points_at_this_pillar) {
    return;
  }

  // Channels 0-4: raw point features.
  dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 0]
      = dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 0];
  dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 1]
      = dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 1];
  dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 2]
      = dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 2];
  dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 3]
      = dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 3];
  dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 4]
      = dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 4];
  // dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 4] = 0.0f;

  // Channels 5-7: f_cluster = voxel_features[:, :, :3] - points_mean
  dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 5]
      = dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 0] - dev_points_mean[ith_pillar * 3 + 0];
  dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 6]
      = dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 1] - dev_points_mean[ith_pillar * 3 + 1];
  dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 7]
      = dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 2] - dev_points_mean[ith_pillar * 3 + 2];

  // Channels 8-10: f_center[:, :, 0] = voxel_features[:, :, 0] - (coords[:, 3].to(voxel_features.dtype).unsqueeze(1) * self.voxel_x + self.x_offset)
  dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 8]
      = dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 0] - (dev_pillar_coors[ith_pillar * 4 + 3] * pillar_x_size + (pillar_x_size / 2 + min_x_range));
  dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 9]
      = dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 1] - (dev_pillar_coors[ith_pillar * 4 + 2] * pillar_y_size + (pillar_y_size / 2 + min_y_range));
  dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 10]
      = dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 2] - (dev_pillar_coors[ith_pillar * 4 + 1] * pillar_z_size + (pillar_z_size / 2 + min_z_range));
}
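
// Allocates the device work buffers used by preprocessing: the (y, x)-indexed
// point-feature staging buffer, the per-cell point histogram, the pillar
// counters, and the per-pillar mean buffer.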
PreprocessPointsCuda::PreprocessPointsCuda(
    const int num_threads, const int max_num_pillars,
    const int max_points_per_pillar, const int num_point_feature,
    const int num_inds_for_scan, const int grid_x_size, const int grid_y_size,
    const int grid_z_size, const float pillar_x_size, const float pillar_y_size,
    const float pillar_z_size, const float min_x_range, const float min_y_range,
    const float min_z_range)
    : num_threads_(num_threads),
      max_num_pillars_(max_num_pillars),
      max_num_points_per_pillar_(max_points_per_pillar),
      num_point_feature_(num_point_feature),
      num_inds_for_scan_(num_inds_for_scan),
      grid_x_size_(grid_x_size),
      grid_y_size_(grid_y_size),
      grid_z_size_(grid_z_size),
      pillar_x_size_(pillar_x_size),
      pillar_y_size_(pillar_y_size),
      pillar_z_size_(pillar_z_size),
      min_x_range_(min_x_range),
      min_y_range_(min_y_range),
      min_z_range_(min_z_range) {
  GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_pillar_point_feature_in_coors_),
      grid_y_size_ * grid_x_size_ * max_num_points_per_pillar_ * num_point_feature_ * sizeof(float)));
  GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_pillar_count_histo_),
      grid_y_size_ * grid_x_size_ * sizeof(int)));
  GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_counter_), sizeof(int)));
  GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_pillar_count_), sizeof(int)));
  GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_points_mean_), max_num_pillars_ * 3 * sizeof(float)));
}
PreprocessPointsCuda::~PreprocessPointsCuda() {
  GPU_CHECK(cudaFree(dev_pillar_point_feature_in_coors_));
  GPU_CHECK(cudaFree(dev_pillar_count_histo_));
  GPU_CHECK(cudaFree(dev_counter_));
  GPU_CHECK(cudaFree(dev_pillar_count_));
  GPU_CHECK(cudaFree(dev_points_mean_));
}
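
// Runs the full preprocessing pipeline on the GPU: zeroes the work buffers,
// bins the points into pillars (make_pillar_histo_kernel), selects the
// non-empty pillars (make_pillar_index_kernel), copies the pillar count back
// to the host to size the remaining launches, compacts the point features
// (make_pillar_feature_kernel), computes the per-pillar means
// (pillar_mean_kernel), and finally builds the 11-channel PFE input
// (gather_point_feature_kernel).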
void PreprocessPointsCuda::DoPreprocessPointsCuda(
    const float* dev_points, const int in_num_points,
    int* dev_x_coors, int* dev_y_coors,
    float* dev_num_points_per_pillar,
    float* dev_pillar_point_feature, float* dev_pillar_coors,
    int* dev_sparse_pillar_map, int* host_pillar_count, float* dev_pfe_gather_feature) {
  // initialize the work buffers
  GPU_CHECK(cudaMemset(dev_pillar_point_feature_in_coors_, 0, grid_y_size_ * grid_x_size_ * max_num_points_per_pillar_ * num_point_feature_ * sizeof(float)));
  GPU_CHECK(cudaMemset(dev_pillar_count_histo_, 0, grid_y_size_ * grid_x_size_ * sizeof(int)));
  GPU_CHECK(cudaMemset(dev_counter_, 0, sizeof(int)));
  GPU_CHECK(cudaMemset(dev_pillar_count_, 0, sizeof(int)));
  GPU_CHECK(cudaMemset(dev_points_mean_, 0, max_num_pillars_ * 3 * sizeof(float)));

  int num_block = DIVUP(in_num_points, num_threads_);
  make_pillar_histo_kernel<<<num_block, num_threads_>>>(
      dev_points, dev_pillar_point_feature_in_coors_, dev_pillar_count_histo_,
      in_num_points, max_num_points_per_pillar_, grid_x_size_, grid_y_size_,
      grid_z_size_, min_x_range_, min_y_range_, min_z_range_, pillar_x_size_,
      pillar_y_size_, pillar_z_size_, num_point_feature_);

  make_pillar_index_kernel<<<grid_x_size_, grid_y_size_>>>(
      dev_pillar_count_histo_, dev_counter_, dev_pillar_count_, dev_x_coors,
      dev_y_coors, dev_num_points_per_pillar, dev_sparse_pillar_map,
      max_num_pillars_, max_num_points_per_pillar_, grid_x_size_,
      num_inds_for_scan_);

  GPU_CHECK(cudaMemcpy(host_pillar_count, dev_pillar_count_, 1 * sizeof(int),
                       cudaMemcpyDeviceToHost));

  make_pillar_feature_kernel<<<host_pillar_count[0], max_num_points_per_pillar_>>>(
      dev_pillar_point_feature_in_coors_, dev_pillar_point_feature,
      dev_pillar_coors, dev_x_coors, dev_y_coors, dev_num_points_per_pillar,
      max_num_points_per_pillar_, num_point_feature_, grid_x_size_);

  dim3 mean_block(max_num_points_per_pillar_, 3);  // (32, 3)
  pillar_mean_kernel<<<host_pillar_count[0], mean_block, 64 * 3 * sizeof(float)>>>(
      dev_points_mean_, num_point_feature_, dev_pillar_point_feature, dev_num_points_per_pillar,
      max_num_pillars_, max_num_points_per_pillar_);

  // dim3 mean_block(10, 3);  // Unrolling the Last Warp
  // make_pillar_mean_kernel<<<host_pillar_count[0], mean_block, 32 * 3 * sizeof(float)>>>(
  //     dev_points_mean_, num_point_feature_, dev_pillar_point_feature, dev_num_points_per_pillar,
  //     max_num_pillars_, max_num_points_per_pillar_);

  gather_point_feature_kernel<<<max_num_pillars_, max_num_points_per_pillar_>>>(
      max_num_pillars_, max_num_points_per_pillar_, num_point_feature_,
      min_x_range_, min_y_range_, min_z_range_,
      pillar_x_size_, pillar_y_size_, pillar_z_size_,
      dev_pillar_point_feature, dev_num_points_per_pillar, dev_pillar_coors,
      dev_points_mean_,
      dev_pfe_gather_feature);

  // DEVICE_SAVE<float>(dev_pillar_point_feature,
  //     max_num_pillars_ * max_num_points_per_pillar_ * num_point_feature_, "dev_pillar_point_feature");
  // DEVICE_SAVE<float>(dev_num_points_per_pillar,
  //     max_num_pillars_, "dev_num_points_per_pillar");
  // DEVICE_SAVE<float>(dev_pfe_gather_feature,
  //     max_num_pillars_ * 11, "dev_pfe_gather_feature");
}