preprocess_kernel.cu 6.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159
  1. // Copyright 2022 TIER IV, Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. /*
  15. * SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  16. * SPDX-License-Identifier: Apache-2.0
  17. *
  18. * Licensed under the Apache License, Version 2.0 (the "License");
  19. * you may not use this file except in compliance with the License.
  20. * You may obtain a copy of the License at
  21. *
  22. * http://www.apache.org/licenses/LICENSE-2.0
  23. *
  24. * Unless required by applicable law or agreed to in writing, software
  25. * distributed under the License is distributed on an "AS IS" BASIS,
  26. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  27. * See the License for the specific language governing permissions and
  28. * limitations under the License.
  29. */
  30. #include "lidar_centerpoint/preprocess/preprocess_kernel.hpp"
  31. #include <lidar_centerpoint/utils.hpp>
  32. namespace
  33. {
  34. const std::size_t MAX_POINT_IN_VOXEL_SIZE = 32; // the same as max_point_in_voxel_size_ in config
  35. const std::size_t WARPS_PER_BLOCK = 4;
  36. const std::size_t ENCODER_IN_FEATURE_SIZE = 9; // the same as encoder_in_feature_size_ in config
  37. } // namespace
  38. namespace centerpoint
  39. {
  40. __global__ void generateFeatures_kernel(
  41. const float * voxel_features, const float * voxel_num_points, const int * coords,
  42. const std::size_t num_voxels, const float voxel_x, const float voxel_y, const float voxel_z,
  43. const float range_min_x, const float range_min_y, const float range_min_z, float * features)
  44. {
  45. // voxel_features (float): (max_voxel_size, max_point_in_voxel_size, point_feature_size)
  46. // voxel_num_points (int): (max_voxel_size)
  47. // coords (int): (max_voxel_size, point_dim_size)
  48. int pillar_idx = blockIdx.x * WARPS_PER_BLOCK + threadIdx.x / MAX_POINT_IN_VOXEL_SIZE;
  49. int point_idx = threadIdx.x % MAX_POINT_IN_VOXEL_SIZE;
  50. int pillar_idx_inBlock = threadIdx.x / MAX_POINT_IN_VOXEL_SIZE; // max_point_in_voxel_size
  51. if (pillar_idx >= num_voxels) return;
  52. // load src
  53. __shared__ float4 pillarSM[WARPS_PER_BLOCK][MAX_POINT_IN_VOXEL_SIZE];
  54. __shared__ float3 pillarSumSM[WARPS_PER_BLOCK];
  55. __shared__ int3 cordsSM[WARPS_PER_BLOCK];
  56. __shared__ int pointsNumSM[WARPS_PER_BLOCK];
  57. __shared__ float pillarOutSM[WARPS_PER_BLOCK][MAX_POINT_IN_VOXEL_SIZE][ENCODER_IN_FEATURE_SIZE];
  58. if (threadIdx.x < WARPS_PER_BLOCK) {
  59. pointsNumSM[threadIdx.x] = voxel_num_points[blockIdx.x * WARPS_PER_BLOCK + threadIdx.x];
  60. cordsSM[threadIdx.x] = ((int3 *)coords)[blockIdx.x * WARPS_PER_BLOCK + threadIdx.x];
  61. pillarSumSM[threadIdx.x] = {0, 0, 0};
  62. }
  63. pillarSM[pillar_idx_inBlock][point_idx] =
  64. ((float4 *)voxel_features)[pillar_idx * MAX_POINT_IN_VOXEL_SIZE + point_idx];
  65. __syncthreads();
  66. // calculate sm in a pillar
  67. if (point_idx < pointsNumSM[pillar_idx_inBlock]) {
  68. atomicAdd(&(pillarSumSM[pillar_idx_inBlock].x), pillarSM[pillar_idx_inBlock][point_idx].x);
  69. atomicAdd(&(pillarSumSM[pillar_idx_inBlock].y), pillarSM[pillar_idx_inBlock][point_idx].y);
  70. atomicAdd(&(pillarSumSM[pillar_idx_inBlock].z), pillarSM[pillar_idx_inBlock][point_idx].z);
  71. }
  72. __syncthreads();
  73. // feature-mean
  74. float3 mean;
  75. float validPoints = pointsNumSM[pillar_idx_inBlock];
  76. mean.x = pillarSumSM[pillar_idx_inBlock].x / validPoints;
  77. mean.y = pillarSumSM[pillar_idx_inBlock].y / validPoints;
  78. mean.z = pillarSumSM[pillar_idx_inBlock].z / validPoints;
  79. mean.x = pillarSM[pillar_idx_inBlock][point_idx].x - mean.x;
  80. mean.y = pillarSM[pillar_idx_inBlock][point_idx].y - mean.y;
  81. mean.z = pillarSM[pillar_idx_inBlock][point_idx].z - mean.z;
  82. // calculate offset
  83. float x_offset = voxel_x / 2 + cordsSM[pillar_idx_inBlock].z * voxel_x + range_min_x;
  84. float y_offset = voxel_y / 2 + cordsSM[pillar_idx_inBlock].y * voxel_y + range_min_y;
  85. float z_offset = voxel_z / 2 + cordsSM[pillar_idx_inBlock].x * voxel_z + range_min_z;
  86. // feature-offset
  87. float3 center;
  88. center.x = pillarSM[pillar_idx_inBlock][point_idx].x - x_offset;
  89. center.y = pillarSM[pillar_idx_inBlock][point_idx].y - y_offset;
  90. center.z = pillarSM[pillar_idx_inBlock][point_idx].z - z_offset;
  91. // store output
  92. if (point_idx < pointsNumSM[pillar_idx_inBlock]) {
  93. pillarOutSM[pillar_idx_inBlock][point_idx][0] = pillarSM[pillar_idx_inBlock][point_idx].x;
  94. pillarOutSM[pillar_idx_inBlock][point_idx][1] = pillarSM[pillar_idx_inBlock][point_idx].y;
  95. pillarOutSM[pillar_idx_inBlock][point_idx][2] = pillarSM[pillar_idx_inBlock][point_idx].z;
  96. pillarOutSM[pillar_idx_inBlock][point_idx][3] = pillarSM[pillar_idx_inBlock][point_idx].w;
  97. pillarOutSM[pillar_idx_inBlock][point_idx][4] = mean.x;
  98. pillarOutSM[pillar_idx_inBlock][point_idx][5] = mean.y;
  99. pillarOutSM[pillar_idx_inBlock][point_idx][6] = mean.z;
  100. pillarOutSM[pillar_idx_inBlock][point_idx][7] = center.x;
  101. pillarOutSM[pillar_idx_inBlock][point_idx][8] = center.y;
  102. } else {
  103. pillarOutSM[pillar_idx_inBlock][point_idx][0] = 0;
  104. pillarOutSM[pillar_idx_inBlock][point_idx][1] = 0;
  105. pillarOutSM[pillar_idx_inBlock][point_idx][2] = 0;
  106. pillarOutSM[pillar_idx_inBlock][point_idx][3] = 0;
  107. pillarOutSM[pillar_idx_inBlock][point_idx][4] = 0;
  108. pillarOutSM[pillar_idx_inBlock][point_idx][5] = 0;
  109. pillarOutSM[pillar_idx_inBlock][point_idx][6] = 0;
  110. pillarOutSM[pillar_idx_inBlock][point_idx][7] = 0;
  111. pillarOutSM[pillar_idx_inBlock][point_idx][8] = 0;
  112. }
  113. __syncthreads();
  114. for (int i = 0; i < ENCODER_IN_FEATURE_SIZE; i++) {
  115. int outputSMId = pillar_idx_inBlock * MAX_POINT_IN_VOXEL_SIZE * ENCODER_IN_FEATURE_SIZE +
  116. i * MAX_POINT_IN_VOXEL_SIZE + point_idx;
  117. int outputId = pillar_idx * MAX_POINT_IN_VOXEL_SIZE * ENCODER_IN_FEATURE_SIZE +
  118. i * MAX_POINT_IN_VOXEL_SIZE + point_idx;
  119. features[outputId] = ((float *)pillarOutSM)[outputSMId];
  120. }
  121. }
  122. cudaError_t generateFeatures_launch(
  123. const float * voxel_features, const float * voxel_num_points, const int * coords,
  124. const std::size_t num_voxels, const std::size_t max_voxel_size, const float voxel_size_x,
  125. const float voxel_size_y, const float voxel_size_z, const float range_min_x,
  126. const float range_min_y, const float range_min_z, float * features, cudaStream_t stream)
  127. {
  128. dim3 blocks(divup(max_voxel_size, WARPS_PER_BLOCK));
  129. dim3 threads(WARPS_PER_BLOCK * MAX_POINT_IN_VOXEL_SIZE);
  130. generateFeatures_kernel<<<blocks, threads, 0, stream>>>(
  131. voxel_features, voxel_num_points, coords, num_voxels, voxel_size_x, voxel_size_y, voxel_size_z,
  132. range_min_x, range_min_y, range_min_z, features);
  133. return cudaGetLastError();
  134. }
  135. } // namespace centerpoint