123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383 |
- /******************************************************************************
- * Copyright 2020 The Apollo Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *****************************************************************************/
- /*
- * Copyright 2018-2019 Autoware Foundation. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- /**
- * @author Kosuke Murakami
- * @date 2019/02/26
- */
- /**
- * @author Yan haixu
- * Contact: just github.com/hova88
- * @date 2021/04/30
- */
- // headers in CUDA
- #include <thrust/sort.h>
- // headers in local files
- #include "common.h"
- #include "postprocess.h"
- #include <stdio.h>
- // sigmoid_filter_warp
- __device__ void box_decode_warp(int head_offset, const float* box_pred,
- int tid , int num_anchors_per_head , int counter, float* filtered_box)
- {
- filtered_box[blockIdx.z * num_anchors_per_head * 7 + counter * 7 + 0] = box_pred[ head_offset + tid * 9 + 0];
- filtered_box[blockIdx.z * num_anchors_per_head * 7 + counter * 7 + 1] = box_pred[ head_offset + tid * 9 + 1];
- filtered_box[blockIdx.z * num_anchors_per_head * 7 + counter * 7 + 2] = box_pred[ head_offset + tid * 9 + 2];
- filtered_box[blockIdx.z * num_anchors_per_head * 7 + counter * 7 + 3] = box_pred[ head_offset + tid * 9 + 3];
- filtered_box[blockIdx.z * num_anchors_per_head * 7 + counter * 7 + 4] = box_pred[ head_offset + tid * 9 + 4];
- filtered_box[blockIdx.z * num_anchors_per_head * 7 + counter * 7 + 5] = box_pred[ head_offset + tid * 9 + 5];
- filtered_box[blockIdx.z * num_anchors_per_head * 7 + counter * 7 + 6] = box_pred[ head_offset + tid * 9 + 6];
- }
- __global__ void sigmoid_filter_kernel(
- float* cls_pred_0,
- float* cls_pred_12,
- float* cls_pred_34,
- float* cls_pred_5,
- float* cls_pred_67,
- float* cls_pred_89,
- const float* box_pred_0,
- const float* box_pred_1,
- const float* box_pred_2,
- const float* box_pred_3,
- const float* box_pred_4,
- const float* box_pred_5,
- const float* box_pred_6,
- const float* box_pred_7,
- const float* box_pred_8,
- const float* box_pred_9,
- float* filtered_box,
- float* filtered_score,
- int* filter_count,
- const float score_threshold) {
- // cls_pred_34
- // 32768*2 , 2
- int num_anchors_per_head = gridDim.x * gridDim.y * blockDim.x;
- // 16 * 4 * 512 = 32768
- extern __shared__ float cls_score[];
- cls_score[threadIdx.x + blockDim.x] = -1.0f;
- int tid = blockIdx.x * gridDim.y * blockDim.x + blockIdx.y * blockDim.x + threadIdx.x;
- if ( blockIdx.z == 0) cls_score[ threadIdx.x ] = 1 / (1 + expf(-cls_pred_0[ tid ]));
- if ( blockIdx.z == 1) {
- cls_score[ threadIdx.x ] = 1 / (1 + expf(-cls_pred_12[ tid * 2 ]));
- cls_score[ threadIdx.x + blockDim.x] = 1 / (1 + expf(-cls_pred_12[ (num_anchors_per_head + tid) * 2]));}
- if ( blockIdx.z == 2) {
- cls_score[ threadIdx.x ] = 1 / (1 + expf(-cls_pred_12[ tid * 2 + 1]));
- cls_score[ threadIdx.x + blockDim.x] = 1 / (1 + expf(-cls_pred_12[ (num_anchors_per_head + tid) * 2 + 1]));}
- if ( blockIdx.z == 3) {
- cls_score[ threadIdx.x ] = 1 / (1 + expf(-cls_pred_34[ tid * 2 ]));
- cls_score[ threadIdx.x + blockDim.x] = 1 / (1 + expf(-cls_pred_34[ (num_anchors_per_head + tid) * 2]));}
- if ( blockIdx.z == 4) {
- cls_score[ threadIdx.x ] = 1 / (1 + expf(-cls_pred_34[ tid * 2 + 1 ]));
- cls_score[ threadIdx.x + blockDim.x] = 1 / (1 + expf(-cls_pred_34[ (num_anchors_per_head + tid) * 2 + 1]));}
- if ( blockIdx.z == 5) cls_score[ threadIdx.x ] = 1 / (1 + expf(-cls_pred_5[ tid ]));
- if ( blockIdx.z == 6) {
- cls_score[ threadIdx.x ] = 1 / (1 + expf(-cls_pred_67[ tid * 2 ]));
- cls_score[ threadIdx.x + blockDim.x] = 1 / (1 + expf(-cls_pred_67[ (num_anchors_per_head + tid) * 2]));}
- if ( blockIdx.z == 7) {
- cls_score[ threadIdx.x ] = 1 / (1 + expf(-cls_pred_67[ tid * 2 + 1 ]));
- cls_score[ threadIdx.x + blockDim.x] = 1 / (1 + expf(-cls_pred_67[ (num_anchors_per_head + tid) * 2 + 1]));}
- if ( blockIdx.z == 8) {
- cls_score[ threadIdx.x ] = 1 / (1 + expf(-cls_pred_89[ tid * 2 ]));
- cls_score[ threadIdx.x + blockDim.x] = 1 / (1 + expf(-cls_pred_89[ (num_anchors_per_head + tid) * 2]));}
- if ( blockIdx.z == 9) {
- cls_score[ threadIdx.x ] = 1 / (1 + expf(-cls_pred_89[ tid * 2 + 1 ]));
- cls_score[ threadIdx.x + blockDim.x] = 1 / (1 + expf(-cls_pred_89[ (num_anchors_per_head + tid) * 2 + 1]));}
-
- __syncthreads();
-
- if( cls_score[ threadIdx.x ] > score_threshold)
- {
- int counter = atomicAdd(&filter_count[blockIdx.z], 1);
- if ( blockIdx.z == 0) {
- box_decode_warp(0 ,box_pred_0 , tid , num_anchors_per_head , counter , filtered_box);
- filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];
- }else
- if ( blockIdx.z == 1) {
- box_decode_warp(0 ,box_pred_1 , tid , num_anchors_per_head , counter , filtered_box);
- filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];
- }else
- if ( blockIdx.z == 2) {
- box_decode_warp(0 ,box_pred_1 , tid , num_anchors_per_head , counter , filtered_box);
- filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];
- }else
- if ( blockIdx.z == 3) {
- box_decode_warp(0 ,box_pred_3 , tid , num_anchors_per_head , counter , filtered_box);
- filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];
- }else
- if (blockIdx.z == 4) {
- box_decode_warp(0 ,box_pred_3 , tid , num_anchors_per_head , counter , filtered_box);
- filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];
- }else
- if ( blockIdx.z == 5) {
- box_decode_warp(0 ,box_pred_5 , tid , num_anchors_per_head , counter , filtered_box);
- filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];
- }else
- if ( blockIdx.z == 6) {
- box_decode_warp(0 ,box_pred_6 , tid , num_anchors_per_head , counter , filtered_box);
- filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];
- }else
- if ( blockIdx.z == 7) {
- box_decode_warp(0 ,box_pred_6 , tid , num_anchors_per_head , counter , filtered_box);
- filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];
- }else
- if ( blockIdx.z == 8) {
- box_decode_warp(0 ,box_pred_8 , tid , num_anchors_per_head , counter , filtered_box);
- filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];
- }else
- if ( blockIdx.z == 9) {
- box_decode_warp(0 ,box_pred_8 , tid , num_anchors_per_head , counter , filtered_box);
- filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];
- }
- }
- __syncthreads();
- if( cls_score[ threadIdx.x + blockDim.x ] > score_threshold) {
- int counter = atomicAdd(&filter_count[blockIdx.z], 1);
- // printf("counter : %d \n" , counter);
- if (blockIdx.z == 1) {
- box_decode_warp(0 ,box_pred_2 , tid , num_anchors_per_head , counter , filtered_box);
- filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];
- }else
- if (blockIdx.z == 2) {
- box_decode_warp(0 ,box_pred_2 , tid , num_anchors_per_head , counter , filtered_box);
- filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];
- }else
- if (blockIdx.z == 3) {
- box_decode_warp(0 ,box_pred_4 , tid , num_anchors_per_head , counter , filtered_box);
- filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];
- }else
- if (blockIdx.z == 4) {
- box_decode_warp(0 ,box_pred_4 , tid , num_anchors_per_head , counter , filtered_box);
- filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];
- }else
- if (blockIdx.z == 6) {
- box_decode_warp(0 ,box_pred_7 , tid , num_anchors_per_head , counter , filtered_box);
- filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];
- }else
- if (blockIdx.z == 7) {
- box_decode_warp(0 ,box_pred_7 , tid , num_anchors_per_head , counter , filtered_box);
- filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];
- }else
- if (blockIdx.z == 8) {
- box_decode_warp(0 ,box_pred_9 , tid , num_anchors_per_head , counter , filtered_box);
- filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];
- }else
- if (blockIdx.z == 9) {
- box_decode_warp(0 ,box_pred_9 , tid , num_anchors_per_head , counter , filtered_box);
- filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];
- }
- }
- }
- __global__ void sort_boxes_by_indexes_kernel(float* filtered_box, float* filtered_scores, int* indexes, int filter_count,
- float* sorted_filtered_boxes, float* sorted_filtered_scores,
- const int num_output_box_feature)
- {
- int tid = threadIdx.x + blockIdx.x * blockDim.x;
- if(tid < filter_count) {
- int sort_index = indexes[tid];
- sorted_filtered_boxes[tid * num_output_box_feature + 0] = filtered_box[sort_index * num_output_box_feature + 0];
- sorted_filtered_boxes[tid * num_output_box_feature + 1] = filtered_box[sort_index * num_output_box_feature + 1];
- sorted_filtered_boxes[tid * num_output_box_feature + 2] = filtered_box[sort_index * num_output_box_feature + 2];
- sorted_filtered_boxes[tid * num_output_box_feature + 3] = filtered_box[sort_index * num_output_box_feature + 3];
- sorted_filtered_boxes[tid * num_output_box_feature + 4] = filtered_box[sort_index * num_output_box_feature + 4];
- sorted_filtered_boxes[tid * num_output_box_feature + 5] = filtered_box[sort_index * num_output_box_feature + 5];
- sorted_filtered_boxes[tid * num_output_box_feature + 6] = filtered_box[sort_index * num_output_box_feature + 6];
- // sorted_filtered_dir[tid] = filtered_dir[sort_index];
- sorted_filtered_scores[tid] = filtered_scores[sort_index];
- }
- }
- PostprocessCuda::PostprocessCuda(const int num_threads, const float float_min, const float float_max,
- const int num_class,const int num_anchor_per_cls,
- const std::vector<std::vector<int>> multihead_label_mapping,
- const float score_threshold, const float nms_overlap_threshold,
- const int nms_pre_maxsize, const int nms_post_maxsize,
- const int num_box_corners,
- const int num_input_box_feature,
- const int num_output_box_feature)
- : num_threads_(num_threads),
- float_min_(float_min),
- float_max_(float_max),
- num_class_(num_class),
- num_anchor_per_cls_(num_anchor_per_cls),
- multihead_label_mapping_(multihead_label_mapping),
- score_threshold_(score_threshold),
- nms_overlap_threshold_(nms_overlap_threshold),
- nms_pre_maxsize_(nms_pre_maxsize),
- nms_post_maxsize_(nms_post_maxsize),
- num_box_corners_(num_box_corners),
- num_input_box_feature_(num_input_box_feature),
- num_output_box_feature_(num_output_box_feature) {
- nms_cuda_ptr_.reset(
- new NmsCuda(num_threads_, num_box_corners_, nms_overlap_threshold_));
- }
- void PostprocessCuda::DoPostprocessCuda(
- float* cls_pred_0,
- float* cls_pred_12,
- float* cls_pred_34,
- float* cls_pred_5,
- float* cls_pred_67,
- float* cls_pred_89,
- const float* box_preds,
-
- float* dev_filtered_box,
- float* dev_filtered_score,
- int* dev_filter_count,
- std::vector<float>& out_detection, std::vector<int>& out_label , std::vector<float>& out_score) {
- // 在此之前,先进行rpn_box_output的concat.
- // 128x128 的feature map, cls_pred 的shape为(32768,1),(32768,1),(32768,1),(65536,2),(32768,1)
- dim3 gridsize(16, 4 , 10); //16 * 4 * 512 = 32768 代表一个head的anchors
- sigmoid_filter_kernel<<< gridsize, 512 , 512 * 2 * sizeof(float)>>>(
- cls_pred_0,
- cls_pred_12,
- cls_pred_34,
- cls_pred_5,
- cls_pred_67,
- cls_pred_89,
- &box_preds[0 * 32768 * 9],
- &box_preds[1 * 32768 * 9],
- &box_preds[2 * 32768 * 9],
- &box_preds[3 * 32768 * 9],
- &box_preds[4 * 32768 * 9],
- &box_preds[5 * 32768 * 9],
- &box_preds[6 * 32768 * 9],
- &box_preds[7 * 32768 * 9],
- &box_preds[8 * 32768 * 9],
- &box_preds[9 * 32768 * 9],
- dev_filtered_box,
- dev_filtered_score,
- dev_filter_count,
-
- score_threshold_);
- cudaDeviceSynchronize();
-
- int host_filter_count[num_class_] = {0};
- GPU_CHECK(cudaMemcpy(host_filter_count, dev_filter_count, num_class_ * sizeof(int), cudaMemcpyDeviceToHost));
-
- for (int i = 0; i < num_class_; ++ i) {
- if(host_filter_count[i] <= 0) continue;
- int* dev_indexes;
- float* dev_sorted_filtered_box;
- float* dev_sorted_filtered_scores;
- GPU_CHECK(cudaMalloc((void**)&dev_indexes, host_filter_count[i] * sizeof(int)));
- GPU_CHECK(cudaMalloc((void**)&dev_sorted_filtered_box, host_filter_count[i] * num_output_box_feature_ * sizeof(float)));
- GPU_CHECK(cudaMalloc((void**)&dev_sorted_filtered_scores, host_filter_count[i]*sizeof(float)));
- // GPU_CHECK(cudaMalloc((void**)&dev_sorted_box_for_nms, NUM_BOX_CORNERS_*host_filter_count[i]*sizeof(float)));
- thrust::sequence(thrust::device, dev_indexes, dev_indexes + host_filter_count[i]);
- thrust::sort_by_key(thrust::device,
- &dev_filtered_score[i * num_anchor_per_cls_],
- &dev_filtered_score[i * num_anchor_per_cls_ + host_filter_count[i]],
- dev_indexes,
- thrust::greater<float>());
- const int num_blocks = DIVUP(host_filter_count[i], num_threads_);
- sort_boxes_by_indexes_kernel<<<num_blocks, num_threads_>>>(
- &dev_filtered_box[i * num_anchor_per_cls_ * num_output_box_feature_],
- &dev_filtered_score[i * num_anchor_per_cls_],
- dev_indexes,
- host_filter_count[i],
- dev_sorted_filtered_box,
- dev_sorted_filtered_scores,
- num_output_box_feature_);
- int num_box_for_nms = min(nms_pre_maxsize_, host_filter_count[i]);
- long* keep_inds = new long[num_box_for_nms]; // index of kept box
- memset(keep_inds, 0, num_box_for_nms * sizeof(int));
- int num_out = 0;
- nms_cuda_ptr_->DoNmsCuda(num_box_for_nms, dev_sorted_filtered_box, keep_inds, &num_out);
- num_out = min(num_out, nms_post_maxsize_);
- float* host_filtered_box = new float[host_filter_count[i] * num_output_box_feature_]();
- float* host_filtered_scores = new float[host_filter_count[i]]();
- cudaMemcpy(host_filtered_box, dev_sorted_filtered_box, host_filter_count[i] * num_output_box_feature_ * sizeof(float), cudaMemcpyDeviceToHost);
- cudaMemcpy(host_filtered_scores, dev_sorted_filtered_scores, host_filter_count[i] * sizeof(float), cudaMemcpyDeviceToHost);
- for (int j = 0; j < num_out; ++j) {
- out_detection.emplace_back(host_filtered_box[keep_inds[j] * num_output_box_feature_ + 0]);
- out_detection.emplace_back(host_filtered_box[keep_inds[j] * num_output_box_feature_ + 1]);
- out_detection.emplace_back(host_filtered_box[keep_inds[j] * num_output_box_feature_ + 2]);
- out_detection.emplace_back(host_filtered_box[keep_inds[j] * num_output_box_feature_ + 3]);
- out_detection.emplace_back(host_filtered_box[keep_inds[j] * num_output_box_feature_ + 4]);
- out_detection.emplace_back(host_filtered_box[keep_inds[j] * num_output_box_feature_ + 5]);
- out_detection.emplace_back(host_filtered_box[keep_inds[j] * num_output_box_feature_ + 6]);
- out_score.emplace_back(host_filtered_scores[keep_inds[j]]);
- out_label.emplace_back(i);
- }
- delete[] keep_inds;
- delete[] host_filtered_scores;
- delete[] host_filtered_box;
- GPU_CHECK(cudaFree(dev_indexes));
- GPU_CHECK(cudaFree(dev_sorted_filtered_box));
- }
- }
|