
Add detection/detection_lidar_PointPillars_MultiHead; compiles OK on AGX.

yuchuli committed 3 years ago
commit 6ccf49c9fc

+ 73 - 0
src/detection/detection_lidar_PointPillars_MultiHead/.gitignore

@@ -0,0 +1,73 @@
+# This file is used to ignore files which are generated
+# ----------------------------------------------------------------------------
+
+*~
+*.autosave
+*.a
+*.core
+*.moc
+*.o
+*.obj
+*.orig
+*.rej
+*.so
+*.so.*
+*_pch.h.cpp
+*_resource.rc
+*.qm
+.#*
+*.*#
+core
+!core/
+tags
+.DS_Store
+.directory
+*.debug
+Makefile*
+*.prl
+*.app
+moc_*.cpp
+ui_*.h
+qrc_*.cpp
+Thumbs.db
+*.res
+*.rc
+/.qmake.cache
+/.qmake.stash
+
+# qtcreator generated files
+*.pro.user*
+
+# xemacs temporary files
+*.flc
+
+# Vim temporary files
+.*.swp
+
+# Visual Studio generated files
+*.ib_pdb_index
+*.idb
+*.ilk
+*.pdb
+*.sln
+*.suo
+*.vcproj
+*vcproj.*.*.user
+*.ncb
+*.sdf
+*.opensdf
+*.vcxproj
+*vcxproj.*
+
+# MinGW generated files
+*.Debug
+*.Release
+
+# Python byte code
+*.pyc
+
+# Binaries
+# --------
+*.dll
+*.exe
+

+ 246 - 0
src/detection/detection_lidar_PointPillars_MultiHead/cfgs/cbgs_pp_multihead.yaml

@@ -0,0 +1,246 @@
+CLASS_NAMES: ['car','truck', 'construction_vehicle', 'bus', 'trailer',
+              'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone']
+
+DATA_CONFIG:
+    _BASE_CONFIG_: cfgs/dataset_configs/nuscenes_dataset.yaml
+
+    POINT_CLOUD_RANGE: [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+    DATA_PROCESSOR:
+        -   NAME: mask_points_and_boxes_outside_range
+            REMOVE_OUTSIDE_BOXES: True
+
+        -   NAME: shuffle_points
+            SHUFFLE_ENABLED: {
+                'train': True,
+                'test': True
+            }
+
+        -   NAME: transform_points_to_voxels
+            VOXEL_SIZE: [0.2, 0.2, 8.0]
+            MAX_POINTS_PER_VOXEL: 20
+            MAX_NUMBER_OF_VOXELS: {
+                'train': 30000,
+                'test': 30000
+            }
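+        # Note: pointpillars.cc (InitParams) indexes this list positionally,
+        # e.g. DATA_PROCESSOR[2] for VOXEL_SIZE, so the order of these three
+        # entries matters.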
+
+MODEL:
+    NAME: PointPillar
+
+    VFE:
+        NAME: PillarVFE
+        WITH_DISTANCE: False
+        USE_ABSLOTE_XYZ: True
+        USE_NORM: True
+        NUM_FILTERS: [64]
+
+    MAP_TO_BEV:
+        NAME: PointPillarScatter
+        NUM_BEV_FEATURES: 64
+
+    BACKBONE_2D:
+        NAME: BaseBEVBackbone
+        LAYER_NUMS: [3, 5, 5]
+        LAYER_STRIDES: [2, 2, 2]
+        NUM_FILTERS: [64, 128, 256]
+        UPSAMPLE_STRIDES: [0.5, 1, 2]
+        NUM_UPSAMPLE_FILTERS: [128, 128, 128]
+
+    DENSE_HEAD:
+        NAME: AnchorHeadMulti
+        CLASS_AGNOSTIC: False
+
+        DIR_OFFSET: 0.78539
+        DIR_LIMIT_OFFSET: 0.0
+        NUM_DIR_BINS: 2
+
+        USE_MULTIHEAD: True
+        SEPARATE_MULTIHEAD: True
+        ANCHOR_GENERATOR_CONFIG: [
+            {
+                'class_name': car,
+                'anchor_sizes': [[4.63, 1.97, 1.74]],
+                'anchor_rotations': [0, 1.57],
+                'anchor_bottom_heights': [-0.95],
+                'align_center': False,
+                'feature_map_stride': 4,
+                'matched_threshold': 0.6,
+                'unmatched_threshold': 0.45
+            },
+            {
+                'class_name': truck,
+                'anchor_sizes': [[6.93, 2.51, 2.84]],
+                'anchor_rotations': [0, 1.57],
+                'anchor_bottom_heights': [-0.6],
+                'align_center': False,
+                'feature_map_stride': 4,
+                'matched_threshold': 0.55,
+                'unmatched_threshold': 0.4
+            },
+            {
+                'class_name': construction_vehicle,
+                'anchor_sizes': [[6.37, 2.85, 3.19]],
+                'anchor_rotations': [0, 1.57],
+                'anchor_bottom_heights': [-0.225],
+                'align_center': False,
+                'feature_map_stride': 4,
+                'matched_threshold': 0.5,
+                'unmatched_threshold': 0.35
+            },
+            {
+                'class_name': bus,
+                'anchor_sizes': [[10.5, 2.94, 3.47]],
+                'anchor_rotations': [0, 1.57],
+                'anchor_bottom_heights': [-0.085],
+                'align_center': False,
+                'feature_map_stride': 4,
+                'matched_threshold': 0.55,
+                'unmatched_threshold': 0.4
+            },
+            {
+                'class_name': trailer,
+                'anchor_sizes': [[12.29, 2.90, 3.87]],
+                'anchor_rotations': [0, 1.57],
+                'anchor_bottom_heights': [0.115],
+                'align_center': False,
+                'feature_map_stride': 4,
+                'matched_threshold': 0.5,
+                'unmatched_threshold': 0.35
+            },
+            {
+                'class_name': barrier,
+                'anchor_sizes': [[0.50, 2.53, 0.98]],
+                'anchor_rotations': [0, 1.57],
+                'anchor_bottom_heights': [-1.33],
+                'align_center': False,
+                'feature_map_stride': 4,
+                'matched_threshold': 0.55,
+                'unmatched_threshold': 0.4
+            },
+            {
+                'class_name': motorcycle,
+                'anchor_sizes': [[2.11, 0.77, 1.47]],
+                'anchor_rotations': [0, 1.57],
+                'anchor_bottom_heights': [-1.085],
+                'align_center': False,
+                'feature_map_stride': 4,
+                'matched_threshold': 0.5,
+                'unmatched_threshold': 0.3
+            },
+            {
+                'class_name': bicycle,
+                'anchor_sizes': [[1.70, 0.60, 1.28]],
+                'anchor_rotations': [0, 1.57],
+                'anchor_bottom_heights': [-1.18],
+                'align_center': False,
+                'feature_map_stride': 4,
+                'matched_threshold': 0.5,
+                'unmatched_threshold': 0.35
+            },
+            {
+                'class_name': pedestrian,
+                'anchor_sizes': [[0.73, 0.67, 1.77]],
+                'anchor_rotations': [0, 1.57],
+                'anchor_bottom_heights': [-0.935],
+                'align_center': False,
+                'feature_map_stride': 4,
+                'matched_threshold': 0.6,
+                'unmatched_threshold': 0.4
+            },
+            {
+                'class_name': traffic_cone,
+                'anchor_sizes': [[0.41, 0.41, 1.07]],
+                'anchor_rotations': [0, 1.57],
+                'anchor_bottom_heights': [-1.285],
+                'align_center': False,
+                'feature_map_stride': 4,
+                'matched_threshold': 0.6,
+                'unmatched_threshold': 0.4
+            },
+        ]
+
+        SHARED_CONV_NUM_FILTER: 64
+
+        RPN_HEAD_CFGS: [
+            {
+                'HEAD_CLS_NAME': ['car'],
+            },
+            {
+                'HEAD_CLS_NAME': ['truck', 'construction_vehicle'],
+            },
+            {
+                'HEAD_CLS_NAME': ['bus', 'trailer'],
+            },
+            {
+                'HEAD_CLS_NAME': ['barrier'],
+            },
+            {
+                'HEAD_CLS_NAME': ['motorcycle', 'bicycle'],
+            },
+            {
+                'HEAD_CLS_NAME': ['pedestrian', 'traffic_cone'],
+            },
+        ]
+        SEPARATE_REG_CONFIG: 
+            NUM_MIDDLE_CONV: 1
+            NUM_MIDDLE_FILTER: 64
+            REG_LIST: ['reg:2', 'height:1', 'size:3', 'angle:2', 'velo:2']
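+            # 2 + 1 + 3 + 2 + 2 = 10 regression channels: code_size 9, plus one
+            # extra channel because the angle is encoded as (sin, cos).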
+
+        TARGET_ASSIGNER_CONFIG:
+            NAME: AxisAlignedTargetAssigner
+            POS_FRACTION: -1.0
+            SAMPLE_SIZE: 512
+            NORM_BY_NUM_EXAMPLES: False
+            MATCH_HEIGHT: False
+            BOX_CODER: ResidualCoder
+            BOX_CODER_CONFIG: {
+                'code_size': 9,
+                'encode_angle_by_sincos': True
+            }
+
+
+        LOSS_CONFIG:
+            REG_LOSS_TYPE: WeightedL1Loss
+            LOSS_WEIGHTS: {
+                'pos_cls_weight': 1.0,
+                'neg_cls_weight': 2.0,
+                'cls_weight': 1.0,
+                'loc_weight': 0.25,
+                'dir_weight': 0.2,
+                'code_weights': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2]
+            }
+
+    POST_PROCESSING:
+        RECALL_THRESH_LIST: [0.3, 0.5, 0.7]
+        SCORE_THRESH: 0.1
+        OUTPUT_RAW_SCORE: False
+
+        EVAL_METRIC: kitti
+
+        NMS_CONFIG:
+            MULTI_CLASSES_NMS: True
+            NMS_TYPE: nms_gpu
+            NMS_THRESH: 0.2
+            NMS_PRE_MAXSIZE: 1000
+            NMS_POST_MAXSIZE: 83
+
+
+OPTIMIZATION:
+    BATCH_SIZE_PER_GPU: 4
+    NUM_EPOCHS: 20
+
+    OPTIMIZER: adam_onecycle
+    LR: 0.001
+    WEIGHT_DECAY: 0.01
+    MOMENTUM: 0.9
+
+    MOMS: [0.95, 0.85]
+    PCT_START: 0.4
+    DIV_FACTOR: 10
+    DECAY_STEP_LIST: [35, 45]
+    LR_DECAY: 0.1
+    LR_CLIP: 0.0000001
+
+    LR_WARMUP: False
+    WARMUP_EPOCH: 1
+
+    GRAD_NORM_CLIP: 10

+ 141 - 0
src/detection/detection_lidar_PointPillars_MultiHead/common.h

@@ -0,0 +1,141 @@
+/******************************************************************************
+ * Copyright 2020 The Apollo Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *****************************************************************************/
+
+/*
+ * Copyright 2018-2019 Autoware Foundation. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @author Kosuke Murakami
+ * @date 2019/02/26
+ */
+
+/**
+* @author Yan haixu
+* Contact: just github.com/hova88
+* @date 2021/04/30
+*/
+
+
+
+#pragma once
+
+// headers in STL
+#include <stdio.h>
+#include <assert.h>
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <vector>
+// headers in CUDA
+#include "cuda_runtime_api.h"
+
+// using MACRO to allocate memory inside CUDA kernel
+#define NUM_3D_BOX_CORNERS_MACRO 8
+
+#define NUM_2D_BOX_CORNERS_MACRO 4
+
+#define NUM_THREADS_MACRO 64
+
+// need to be changed when num_threads_ is changed
+
+#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
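+// e.g. DIVUP(1000, 64) == 16: fifteen full blocks of 64 plus one partial block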
+
+#define GPU_CHECK(ans)                    \
+  {                                       \
+    GPUAssert((ans), __FILE__, __LINE__); \
+  }
+inline void GPUAssert(cudaError_t code, const char *file, int line,
+                      bool abort = true)
+{
+  if (code != cudaSuccess)
+  {
+    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file,
+            line);
+    if (abort)
+      exit(code);
+  }
+};
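+// Usage: wrap any CUDA runtime call that returns cudaError_t, e.g.
+//   GPU_CHECK(cudaMalloc(&ptr, bytes));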
+
+template <typename T>
+void HOST_SAVE(T *array, int size, std::string filename, std::string root = "../test/result", std::string postfix = ".txt")
+{
+  std::string filepath = root + "/" + filename + postfix;
+  if (postfix == ".bin")
+  {
+    std::fstream file(filepath, std::ios::out | std::ios::binary);
+    // write the full payload: size elements of sizeof(T) bytes each
+    file.write(reinterpret_cast<char *>(array), size * sizeof(T));
+    file.close();
+    std::cout << "|>>>|  Data has been written in " << filepath << "  |<<<|" << std::endl;
+    return;
+  }
+  else if (postfix == ".txt")
+  {
+    std::ofstream file(filepath, std::ios::out);
+    for (int i = 0; i < size; ++i)
+      file << array[i] << " ";
+    file.close();
+    std::cout << "|>>>|  Data has been written in " << filepath << "  |<<<|" << std::endl;
+    return;
+  }
+}
+
+template <typename T>
+void DEVICE_SAVE(T *array, int size, std::string filename, std::string root = "../test/result", std::string postfix = ".txt")
+{
+  T *temp_ = new T[size];
+  GPU_CHECK(cudaMemcpy(temp_, array, size * sizeof(T), cudaMemcpyDeviceToHost));
+  HOST_SAVE<T>(temp_, size, filename, root, postfix);
+  delete[] temp_;
+}
+
+
+// int TXTtoArrary( float* &points_array , string file_name , int num_feature = 4)
+// {
+//   ifstream InFile;
+//   InFile.open(file_name.data());
+//   assert(InFile.is_open());
+
+//   vector<float> temp_points;
+//   string c;
+
+//   while (!InFile.eof())
+//   {
+//       InFile >> c;
+
+//       temp_points.push_back(atof(c.c_str()));
+//   }
+//   points_array = new float[temp_points.size()];
+//   for (int i = 0 ; i < temp_points.size() ; ++i) {
+//     points_array[i] = temp_points[i];
+//   }
+
+//   InFile.close();  
+//   return temp_points.size() / num_feature;
+// };
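+
+// Example (illustrative sketch, not called anywhere in this commit): dumping a
+// device buffer for offline inspection. Assumes ../test/result exists and that
+// dev_buf holds n floats on the GPU:
+//
+//   float* dev_buf = nullptr;
+//   const int n = 1024;
+//   GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_buf), n * sizeof(float)));
+//   DEVICE_SAVE<float>(dev_buf, n, "example_feature");  // -> ../test/result/example_feature.txt
+//   GPU_CHECK(cudaFree(dev_buf));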

+ 129 - 0
src/detection/detection_lidar_PointPillars_MultiHead/detection_lidar_PointPillars_MultiHead.pro

@@ -0,0 +1,129 @@
+QT -= gui
+
+CONFIG += c++11 console
+CONFIG -= app_bundle
+
+QMAKE_CXXFLAGS += -std=gnu++17
+QMAKE_LFLAGS += -no-pie  -Wl,--no-as-needed
+
+# The following define makes your compiler emit warnings if you use
+# any feature of Qt which has been marked deprecated (the exact warnings
+# depend on your compiler). Please consult the documentation of the
+# deprecated API in order to know how to port your code away from it.
+DEFINES += QT_DEPRECATED_WARNINGS
+
+# You can also make your code fail to compile if you use deprecated APIs.
+# In order to do so, uncomment the following line.
+# You can also select to disable deprecated APIs only up to a certain version of Qt.
+#DEFINES += QT_DISABLE_DEPRECATED_BEFORE=0x060000    # disables all the APIs deprecated before Qt 6.0.0
+
+SOURCES += main.cpp \
+    pointpillars.cc
+
+DISTFILES += \
+    nms.cu \
+    postprocess.cu \
+    preprocess.cu \
+    scatter.cu
+
+HEADERS += \
+    common.h \
+    nms.h \
+    pointpillars.h \
+    postprocess.h \
+    preprocess.h \
+    scatter.h
+
+
+CUDA_SOURCES +=  \
+    nms.cu \
+    postprocess.cu \
+    preprocess.cu \
+    scatter.cu
+
+CUDA_SDK = "/usr/local/cuda/"   # CUDA SDK path
+
+CUDA_DIR = "/usr/local/cuda/"            # CUDA toolkit path
+
+SYSTEM_NAME = linux         # your system environment, e.g. 'Win32', 'x64', or 'Win64'
+
+SYSTEM_TYPE = 64           # OS word size: '32' or '64'
+
+CUDA_ARCH = sm_72         # CUDA architecture, for example 'compute_10', 'compute_11', 'sm_10'
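+# (sm_72 is the compute capability of the Jetson AGX Xavier this commit targets)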
+
+NVCC_OPTIONS = --use_fast_math --compiler-options "-fPIC"
+
+
+# include paths
+
+INCLUDEPATH += $$CUDA_DIR/include
+#INCLUDEPATH += /usr/local/cuda-10.0/targets/aarch64-linux/include/crt
+
+# library directories
+
+QMAKE_LIBDIR += $$CUDA_DIR/lib/
+
+CUDA_OBJECTS_DIR = ./
+
+# The following library conflicts with something in Cuda
+
+#QMAKE_LFLAGS_RELEASE = /NODEFAULTLIB:msvcrt.lib
+
+#QMAKE_LFLAGS_DEBUG   = /NODEFAULTLIB:msvcrtd.lib
+
+# Add the necessary libraries
+
+CUDA_LIBS =  cudart cufft
+
+# The following makes sure all path names (which often include spaces) are put between quotation marks
+
+CUDA_INC = $$join(INCLUDEPATH,'" -I"','-I"','"')
+
+NVCC_LIBS = $$join(CUDA_LIBS,' -l','-l', '')
+
+#LIBS += $$join(CUDA_LIBS,'.so ', '', '.so')
+
+# Configuration of the Cuda compiler
+
+CONFIG(debug, debug|release) {
+
+    # Debug mode
+
+    cuda_d.input = CUDA_SOURCES
+
+    cuda_d.output = $$CUDA_OBJECTS_DIR/${QMAKE_FILE_BASE}_cuda.o
+
+    cuda_d.commands = $$CUDA_DIR/bin/nvcc -D_DEBUG $$NVCC_OPTIONS $$CUDA_INC $$NVCC_LIBS --machine $$SYSTEM_TYPE -arch=$$CUDA_ARCH -c -o ${QMAKE_FILE_OUT} ${QMAKE_FILE_NAME}
+
+    cuda_d.dependency_type = TYPE_C
+
+    QMAKE_EXTRA_COMPILERS += cuda_d
+
+}
+
+else {
+
+    # Release mode
+
+    cuda.input = CUDA_SOURCES
+
+    cuda.output = $$CUDA_OBJECTS_DIR/${QMAKE_FILE_BASE}_cuda.o
+
+    cuda.commands = $$CUDA_DIR/bin/nvcc $$NVCC_OPTIONS $$CUDA_INC $$NVCC_LIBS --machine $$SYSTEM_TYPE -arch=$$CUDA_ARCH -O3 -c -o ${QMAKE_FILE_OUT} ${QMAKE_FILE_NAME}
+
+    cuda.dependency_type = TYPE_C
+
+    QMAKE_EXTRA_COMPILERS += cuda
+
+}
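+
+# For reference, the release rule above expands to roughly the following nvcc
+# invocation per .cu file (illustrative; the -I list follows INCLUDEPATH):
+#   /usr/local/cuda/bin/nvcc --use_fast_math --compiler-options "-fPIC" \
+#       -I"/usr/local/cuda/include" -lcudart -lcufft --machine 64 \
+#       -arch=sm_72 -O3 -c -o nms_cuda.o nms.cu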
+
+
+LIBS += -L/usr/local/cuda-10.2/targets/aarch64-linux/lib
+
+LIBS += -lcudart -lcufft -lyaml-cpp
+
+LIBS += -L/home/adc/soft/cudnn-10.2-linux-x64-v7.6.5.32/cuda/lib64 -lcudnn
+
+LIBS +=  -lmyelin -lnvinfer -lnvonnxparser -lnvcaffe_parser
+
+LIBS += -L/home/nvidia/git/libtorch_gpu-1.6.0-linux-aarch64/lib -ltorch_cuda  -ltorch -lc10 -ltorch_cpu

+ 8 - 0
src/detection/detection_lidar_PointPillars_MultiHead/main.cpp

@@ -0,0 +1,8 @@
+#include <QCoreApplication>
+
+int main(int argc, char *argv[])
+{
+    QCoreApplication a(argc, argv);
+
+    return a.exec();
+}

+ 422 - 0
src/detection/detection_lidar_PointPillars_MultiHead/nms.cu

@@ -0,0 +1,422 @@
+/*
+3D IoU Calculation and Rotated NMS (modified from 2D NMS written by others)
+Written by Shaoshuai Shi
+All Rights Reserved 2019-2020.
+*/
+
+/**
+* @author Yan haixu
+* Contact: just github.com/hova88
+* @date 2021/04/30
+*/
+
+#include <stdio.h>
+#include "nms.h"
+#include "common.h"
+#define THREADS_PER_BLOCK 16
+#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
+
+// #define DEBUG
+const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8;
+const float EPS = 1e-8;
+struct Point {
+    float x, y;
+    __device__ Point() {}
+    __device__ Point(double _x, double _y){
+        x = _x, y = _y;
+    }
+
+    __device__ void set(float _x, float _y){
+        x = _x; y = _y;
+    }
+
+    __device__ Point operator +(const Point &b)const{
+        return Point(x + b.x, y + b.y);
+    }
+
+    __device__ Point operator -(const Point &b)const{
+        return Point(x - b.x, y - b.y);
+    }
+};
+
+__device__ inline float cross(const Point &a, const Point &b){
+    return a.x * b.y - a.y * b.x;
+}
+
+__device__ inline float cross(const Point &p1, const Point &p2, const Point &p0){
+    return (p1.x - p0.x) * (p2.y - p0.y) - (p2.x - p0.x) * (p1.y - p0.y);
+}
+
+__device__ int check_rect_cross(const Point &p1, const Point &p2, const Point &q1, const Point &q2){
+    int ret = min(p1.x,p2.x) <= max(q1.x,q2.x)  &&
+              min(q1.x,q2.x) <= max(p1.x,p2.x) &&
+              min(p1.y,p2.y) <= max(q1.y,q2.y) &&
+              min(q1.y,q2.y) <= max(p1.y,p2.y);
+    return ret;
+}
+
+__device__ inline int check_in_box2d(const float *box, const Point &p){
+    //params: (7) [x, y, z, dx, dy, dz, heading]
+    const float MARGIN = 1e-2;
+
+    float center_x = box[0], center_y = box[1];
+    float angle_cos = cos(-box[6]), angle_sin = sin(-box[6]);  // rotate the point in the opposite direction of box
+    float rot_x = (p.x - center_x) * angle_cos + (p.y - center_y) * (-angle_sin);
+    float rot_y = (p.x - center_x) * angle_sin + (p.y - center_y) * angle_cos;
+
+    return (fabs(rot_x) < box[3] / 2 + MARGIN && fabs(rot_y) < box[4] / 2 + MARGIN);
+}
+
+__device__ inline int intersection(const Point &p1, const Point &p0, const Point &q1, const Point &q0, Point &ans){
+    // fast exclusion
+    if (check_rect_cross(p0, p1, q0, q1) == 0) return 0;
+
+    // check cross standing
+    float s1 = cross(q0, p1, p0);
+    float s2 = cross(p1, q1, p0);
+    float s3 = cross(p0, q1, q0);
+    float s4 = cross(q1, p1, q0);
+
+    if (!(s1 * s2 > 0 && s3 * s4 > 0)) return 0;
+
+    // calculate intersection of two lines
+    float s5 = cross(q1, p1, p0);
+    if(fabs(s5 - s1) > EPS){
+        ans.x = (s5 * q0.x - s1 * q1.x) / (s5 - s1);
+        ans.y = (s5 * q0.y - s1 * q1.y) / (s5 - s1);
+
+    }
+    else{
+        float a0 = p0.y - p1.y, b0 = p1.x - p0.x, c0 = p0.x * p1.y - p1.x * p0.y;
+        float a1 = q0.y - q1.y, b1 = q1.x - q0.x, c1 = q0.x * q1.y - q1.x * q0.y;
+        float D = a0 * b1 - a1 * b0;
+
+        ans.x = (b0 * c1 - b1 * c0) / D;
+        ans.y = (a1 * c0 - a0 * c1) / D;
+    }
+
+    return 1;
+}
+
+__device__ inline void rotate_around_center(const Point &center, const float angle_cos, const float angle_sin, Point &p){
+    float new_x = (p.x - center.x) * angle_cos + (p.y - center.y) * (-angle_sin) + center.x;
+    float new_y = (p.x - center.x) * angle_sin + (p.y - center.y) * angle_cos + center.y;
+    p.set(new_x, new_y);
+}
+
+__device__ inline int point_cmp(const Point &a, const Point &b, const Point &center){
+    return atan2(a.y - center.y, a.x - center.x) > atan2(b.y - center.y, b.x - center.x);
+}
+
+__device__ inline float box_overlap(const float *box_a, const float *box_b){
+    // params box_a: [x, y, z, dx, dy, dz, heading]
+    // params box_b: [x, y, z, dx, dy, dz, heading]
+
+    float a_angle = box_a[6], b_angle = box_b[6];
+    float a_dx_half = box_a[3] / 2, b_dx_half = box_b[3] / 2, a_dy_half = box_a[4] / 2, b_dy_half = box_b[4] / 2;
+    float a_x1 = box_a[0] - a_dx_half, a_y1 = box_a[1] - a_dy_half;
+    float a_x2 = box_a[0] + a_dx_half, a_y2 = box_a[1] + a_dy_half;
+    float b_x1 = box_b[0] - b_dx_half, b_y1 = box_b[1] - b_dy_half;
+    float b_x2 = box_b[0] + b_dx_half, b_y2 = box_b[1] + b_dy_half;
+
+    Point center_a(box_a[0], box_a[1]);
+    Point center_b(box_b[0], box_b[1]);
+
+#ifdef DEBUG
+    printf("a: (%.3f, %.3f, %.3f, %.3f, %.3f), b: (%.3f, %.3f, %.3f, %.3f, %.3f)\n", a_x1, a_y1, a_x2, a_y2, a_angle,
+           b_x1, b_y1, b_x2, b_y2, b_angle);
+    printf("center a: (%.3f, %.3f), b: (%.3f, %.3f)\n", center_a.x, center_a.y, center_b.x, center_b.y);
+#endif
+
+    Point box_a_corners[5];
+    box_a_corners[0].set(a_x1, a_y1);
+    box_a_corners[1].set(a_x2, a_y1);
+    box_a_corners[2].set(a_x2, a_y2);
+    box_a_corners[3].set(a_x1, a_y2);
+
+    Point box_b_corners[5];
+    box_b_corners[0].set(b_x1, b_y1);
+    box_b_corners[1].set(b_x2, b_y1);
+    box_b_corners[2].set(b_x2, b_y2);
+    box_b_corners[3].set(b_x1, b_y2);
+
+    // get oriented corners
+    float a_angle_cos = cos(a_angle), a_angle_sin = sin(a_angle);
+    float b_angle_cos = cos(b_angle), b_angle_sin = sin(b_angle);
+
+    for (int k = 0; k < 4; k++){
+#ifdef DEBUG
+        printf("before corner %d: a(%.3f, %.3f), b(%.3f, %.3f) \n", k, box_a_corners[k].x, box_a_corners[k].y, box_b_corners[k].x, box_b_corners[k].y);
+#endif
+        rotate_around_center(center_a, a_angle_cos, a_angle_sin, box_a_corners[k]);
+        rotate_around_center(center_b, b_angle_cos, b_angle_sin, box_b_corners[k]);
+#ifdef DEBUG
+        printf("corner %d: a(%.3f, %.3f), b(%.3f, %.3f) \n", k, box_a_corners[k].x, box_a_corners[k].y, box_b_corners[k].x, box_b_corners[k].y);
+#endif
+    }
+
+    box_a_corners[4] = box_a_corners[0];
+    box_b_corners[4] = box_b_corners[0];
+
+    // get intersection of lines
+    Point cross_points[16];
+    Point poly_center;
+    int cnt = 0, flag = 0;
+
+    poly_center.set(0, 0);
+    for (int i = 0; i < 4; i++){
+        for (int j = 0; j < 4; j++){
+            flag = intersection(box_a_corners[i + 1], box_a_corners[i], box_b_corners[j + 1], box_b_corners[j], cross_points[cnt]);
+            if (flag){
+                poly_center = poly_center + cross_points[cnt];
+                cnt++;
+#ifdef DEBUG
+                printf("Cross points (%.3f, %.3f): a(%.3f, %.3f)->(%.3f, %.3f), b(%.3f, %.3f)->(%.3f, %.3f) \n",
+                    cross_points[cnt - 1].x, cross_points[cnt - 1].y,
+                    box_a_corners[i].x, box_a_corners[i].y, box_a_corners[i + 1].x, box_a_corners[i + 1].y,
+                    box_b_corners[j].x, box_b_corners[j].y, box_b_corners[j + 1].x, box_b_corners[j + 1].y);
+#endif
+            }
+        }
+    }
+
+    // check corners
+    for (int k = 0; k < 4; k++){
+        if (check_in_box2d(box_a, box_b_corners[k])){
+            poly_center = poly_center + box_b_corners[k];
+            cross_points[cnt] = box_b_corners[k];
+            cnt++;
+#ifdef DEBUG
+                printf("b corners in a: corner_b(%.3f, %.3f)", cross_points[cnt - 1].x, cross_points[cnt - 1].y);
+#endif
+        }
+        if (check_in_box2d(box_b, box_a_corners[k])){
+            poly_center = poly_center + box_a_corners[k];
+            cross_points[cnt] = box_a_corners[k];
+            cnt++;
+#ifdef DEBUG
+                printf("a corners in b: corner_a(%.3f, %.3f)", cross_points[cnt - 1].x, cross_points[cnt - 1].y);
+#endif
+        }
+    }
+
+    poly_center.x /= cnt;
+    poly_center.y /= cnt;
+
+    // sort the points of polygon
+    Point temp;
+    for (int j = 0; j < cnt - 1; j++){
+        for (int i = 0; i < cnt - j - 1; i++){
+            if (point_cmp(cross_points[i], cross_points[i + 1], poly_center)){
+                temp = cross_points[i];
+                cross_points[i] = cross_points[i + 1];
+                cross_points[i + 1] = temp;
+            }
+        }
+    }
+
+#ifdef DEBUG
+    printf("cnt=%d\n", cnt);
+    for (int i = 0; i < cnt; i++){
+        printf("All cross point %d: (%.3f, %.3f)\n", i, cross_points[i].x, cross_points[i].y);
+    }
+#endif
+
+    // get the overlap areas
+    float area = 0;
+    for (int k = 0; k < cnt - 1; k++){
+        area += cross(cross_points[k] - cross_points[0], cross_points[k + 1] - cross_points[0]);
+    }
+
+    return fabs(area) / 2.0;
+}
+
+__device__ inline float iou_bev(const float *box_a, const float *box_b){
+    // params box_a: [x, y, z, dx, dy, dz, heading]
+    // params box_b: [x, y, z, dx, dy, dz, heading]
+    float sa = box_a[3] * box_a[4];
+    float sb = box_b[3] * box_b[4];
+    float s_overlap = box_overlap(box_a, box_b);
+    return s_overlap / fmaxf(sa + sb - s_overlap, EPS);
+}
+
+__global__ void boxes_overlap_kernel(const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_overlap){
+    // params boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading]
+    // params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading]
+    const int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y;
+    const int b_idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
+
+    if (a_idx >= num_a || b_idx >= num_b){
+        return;
+    }
+    const float * cur_box_a = boxes_a + a_idx * 7;
+    const float * cur_box_b = boxes_b + b_idx * 7;
+    float s_overlap = box_overlap(cur_box_a, cur_box_b);
+    ans_overlap[a_idx * num_b + b_idx] = s_overlap;
+}
+
+__global__ void boxes_iou_bev_kernel(const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_iou){
+    // params boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading]
+    // params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading]
+    const int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y;
+    const int b_idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
+
+    if (a_idx >= num_a || b_idx >= num_b){
+        return;
+    }
+
+    const float * cur_box_a = boxes_a + a_idx * 7;
+    const float * cur_box_b = boxes_b + b_idx * 7;
+    float cur_iou_bev = iou_bev(cur_box_a, cur_box_b);
+    ans_iou[a_idx * num_b + b_idx] = cur_iou_bev;
+}
+
+__global__ void nms_kernel(const int boxes_num, const float nms_overlap_thresh,
+                           const float *boxes, unsigned long long *mask){
+    //params: boxes (N, 7) [x, y, z, dx, dy, dz, heading]
+    //params: mask (N, N/THREADS_PER_BLOCK_NMS)
+
+    const int row_start = blockIdx.y;
+    const int col_start = blockIdx.x;
+
+    // if (row_start > col_start) return;
+
+    const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS);
+    const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS);
+
+    __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 7];
+
+    if (threadIdx.x < col_size) {
+        block_boxes[threadIdx.x * 7 + 0] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 0];
+        block_boxes[threadIdx.x * 7 + 1] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 1];
+        block_boxes[threadIdx.x * 7 + 2] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 2];
+        block_boxes[threadIdx.x * 7 + 3] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 3];
+        block_boxes[threadIdx.x * 7 + 4] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 4];
+        block_boxes[threadIdx.x * 7 + 5] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 5];
+        block_boxes[threadIdx.x * 7 + 6] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 6];
+    }
+    __syncthreads();
+
+    if (threadIdx.x < row_size) {
+        const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;
+        const float *cur_box = boxes + cur_box_idx * 7;
+
+        int i = 0;
+        unsigned long long t = 0;
+        int start = 0;
+        if (row_start == col_start) {
+          start = threadIdx.x + 1;
+        }
+        for (i = start; i < col_size; i++) {
+            if (iou_bev(cur_box, block_boxes + i * 7) > nms_overlap_thresh){
+                t |= 1ULL << i;
+            }
+        }
+        const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
+        mask[cur_box_idx * col_blocks + col_start] = t;
+    }
+}
+
+
+__device__ inline float iou_normal(float const * const a, float const * const b) {
+    //params: a: [x, y, z, dx, dy, dz, heading]
+    //params: b: [x, y, z, dx, dy, dz, heading]
+
+    float left = fmaxf(a[0] - a[3] / 2, b[0] - b[3] / 2), right = fminf(a[0] + a[3] / 2, b[0] + b[3] / 2);
+    float top = fmaxf(a[1] - a[4] / 2, b[1] - b[4] / 2), bottom = fminf(a[1] + a[4] / 2, b[1] + b[4] / 2);
+    float width = fmaxf(right - left, 0.f), height = fmaxf(bottom - top, 0.f);
+    float interS = width * height;
+    float Sa = a[3] * a[4];
+    float Sb = b[3] * b[4];
+    return interS / fmaxf(Sa + Sb - interS, EPS);
+}
+
+
+__global__ void nms_normal_kernel(const int boxes_num, const float nms_overlap_thresh,
+                           const float *boxes, unsigned long long *mask){
+    //params: boxes (N, 7) [x, y, z, dx, dy, dz, heading]
+    //params: mask (N, N/THREADS_PER_BLOCK_NMS)
+
+    const int row_start = blockIdx.y;
+    const int col_start = blockIdx.x;
+
+    // if (row_start > col_start) return;
+
+    const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS);
+    const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS);
+
+    __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 7];
+
+    if (threadIdx.x < col_size) {
+        block_boxes[threadIdx.x * 7 + 0] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 0];
+        block_boxes[threadIdx.x * 7 + 1] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 1];
+        block_boxes[threadIdx.x * 7 + 2] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 2];
+        block_boxes[threadIdx.x * 7 + 3] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 3];
+        block_boxes[threadIdx.x * 7 + 4] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 4];
+        block_boxes[threadIdx.x * 7 + 5] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 5];
+        block_boxes[threadIdx.x * 7 + 6] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 6];
+    }
+    __syncthreads();
+
+    if (threadIdx.x < row_size) {
+        const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;
+        const float *cur_box = boxes + cur_box_idx * 7;
+
+        int i = 0;
+        unsigned long long t = 0;
+        int start = 0;
+        if (row_start == col_start) {
+          start = threadIdx.x + 1;
+        }
+        for (i = start; i < col_size; i++) {
+            if (iou_normal(cur_box, block_boxes + i * 7) > nms_overlap_thresh){
+                t |= 1ULL << i;
+            }
+        }
+        const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
+        mask[cur_box_idx * col_blocks + col_start] = t;
+    }
+}
+
+
+NmsCuda::NmsCuda(const int num_threads, const int num_box_corners,
+    const float nms_overlap_threshold)
+: num_threads_(num_threads),
+  num_box_corners_(num_box_corners),
+  nms_overlap_threshold_(nms_overlap_threshold) {}
+
+void NmsCuda::DoNmsCuda(const int host_filter_count,
+    float *dev_sorted_box_for_nms, long *out_keep_inds,
+    int *out_num_to_keep) {
+
+    const int col_blocks = DIVUP(host_filter_count, num_threads_);
+    unsigned long long *dev_mask = NULL;
+    GPU_CHECK(cudaMalloc(&dev_mask, host_filter_count * col_blocks * sizeof(unsigned long long)));    
+
+    dim3 blocks(DIVUP(host_filter_count, num_threads_),
+                DIVUP(host_filter_count, num_threads_));
+    dim3 threads(num_threads_);
+
+    nms_kernel<<<blocks, threads>>>(host_filter_count, nms_overlap_threshold_, dev_sorted_box_for_nms, dev_mask);
+    // postprocess for nms output
+    std::vector<unsigned long long> host_mask(host_filter_count * col_blocks);
+    GPU_CHECK(cudaMemcpy(&host_mask[0], dev_mask,
+            sizeof(unsigned long long) * host_filter_count * col_blocks,
+            cudaMemcpyDeviceToHost));
+    std::vector<unsigned long long> remv(col_blocks);
+    memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
+
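+    // Unpack the suppression bitmask, walking boxes in score order and keeping
+    // a box unless a previously kept box suppressed it. Layout example: with
+    // 64-bit words and host_filter_count = 150, col_blocks = 3, and bit j of
+    // host_mask[i * 3 + b] set means box (b * 64 + j) overlaps box i above
+    // nms_overlap_threshold_.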
+    for (int i = 0; i < host_filter_count; ++i) {
+        int nblock = i / num_threads_;
+        int inblock = i % num_threads_;
+
+        if (!(remv[nblock] & (1ULL << inblock))) {
+            out_keep_inds[(*out_num_to_keep)++] = i;
+            unsigned long long *p = &host_mask[0] + i * col_blocks;
+            for (int j = nblock; j < col_blocks; ++j) {
+                remv[j] |= p[j];
+            }
+        }
+    }
+    GPU_CHECK(cudaFree(dev_mask));
+}

+ 64 - 0
src/detection/detection_lidar_PointPillars_MultiHead/nms.h

@@ -0,0 +1,64 @@
+
+/*
+3D IoU Calculation and Rotated NMS (modified from 2D NMS written by others)
+Written by Shaoshuai Shi
+All Rights Reserved 2019-2020.
+*/
+
+/*
+ * Copyright 2018-2019 Autoware Foundation. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @author Kosuke Murakami
+ * @date 2019/02/26
+ */
+
+/**
+* @author Yan haixu
+* Contact: just github.com/hova88
+* @date 2021/04/30
+*/
+
+
+class NmsCuda {
+ private:
+  const int num_threads_;
+  const int num_box_corners_;
+  const float nms_overlap_threshold_;
+
+ public:
+  /**
+   * @brief Constructor
+   * @param[in] num_threads Number of threads when launching cuda kernel
+   * @param[in] num_box_corners Number of corners for 2D box
+   * @param[in] nms_overlap_threshold IOU threshold for NMS
+   * @details Capital variables never change after compile time; non-capital
+   * variables can be changed through rosparam
+   */
+  NmsCuda(const int num_threads, const int num_box_corners,
+          const float nms_overlap_threshold);
+
+  /**
+   * @brief GPU Non-Maximum Suppression for network output
+   * @param[in] host_filter_count Number of filtered output
+   * @param[in] dev_sorted_box_for_nms Bounding box output sorted by score
+   * @param[out] out_keep_inds Indexes of selected bounding box
+   * @param[out] out_num_to_keep Number of kept bounding boxes
+   * @details NMS on the GPU and post-processing to select boxes on the CPU
+   */
+  void DoNmsCuda(const int host_filter_count, float* dev_sorted_box_for_nms,
+                 long* out_keep_inds, int* out_num_to_keep);
+};
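+
+// Example (illustrative sketch): typical call pattern, assuming dev_boxes is a
+// device buffer of `count` candidate boxes (7 floats each) already sorted by
+// score:
+//
+//   NmsCuda nms(/*num_threads=*/64, /*num_box_corners=*/4,
+//               /*nms_overlap_threshold=*/0.2f);
+//   std::vector<long> keep_inds(count, 0);
+//   int num_to_keep = 0;
+//   nms.DoNmsCuda(count, dev_boxes, keep_inds.data(), &num_to_keep);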

+ 495 - 0
src/detection/detection_lidar_PointPillars_MultiHead/pointpillars.cc

@@ -0,0 +1,495 @@
+/******************************************************************************
+ * Copyright 2020 The Apollo Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *****************************************************************************/
+
+/*
+ * Copyright 2018-2019 Autoware Foundation. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @author Kosuke Murakami
+ * @date 2019/02/26
+ */
+
+/**
+* @author Yan haixu
+* Contact: just github.com/hova88
+* @date 2021/04/30
+*/
+
+
+
+
+#include "pointpillars.h"
+
+#include <chrono>
+#include <iostream>
+
+PointPillars::PointPillars(const float score_threshold,
+                           const float nms_overlap_threshold,
+                           const bool use_onnx,
+                           const std::string pfe_file,
+                           const std::string backbone_file,
+                           const std::string pp_config)
+    : score_threshold_(score_threshold),
+      nms_overlap_threshold_(nms_overlap_threshold),
+      use_onnx_(use_onnx),
+      pfe_file_(pfe_file),
+      backbone_file_(backbone_file),
+      pp_config_(pp_config)
+{
+    InitParams();
+    InitTRT(use_onnx_);
+    DeviceMemoryMalloc();
+
+    preprocess_points_cuda_ptr_.reset(new PreprocessPointsCuda(
+        kNumThreads,
+        kMaxNumPillars,
+        kMaxNumPointsPerPillar,
+        kNumPointFeature,
+        kNumIndsForScan,
+        kGridXSize,kGridYSize, kGridZSize,
+        kPillarXSize,kPillarYSize, kPillarZSize,
+        kMinXRange, kMinYRange, kMinZRange));
+
+    scatter_cuda_ptr_.reset(new ScatterCuda(kNumThreads, kGridXSize, kGridYSize));
+
+    const float float_min = std::numeric_limits<float>::lowest();
+    const float float_max = std::numeric_limits<float>::max();
+    postprocess_cuda_ptr_.reset(
+      new PostprocessCuda(kNumThreads,
+                          float_min, float_max, 
+                          kNumClass,kNumAnchorPerCls,
+                          kMultiheadLabelMapping,
+                          score_threshold_, 
+                          nms_overlap_threshold_,
+                          kNmsPreMaxsize, 
+                          kNmsPostMaxsize,
+                          kNumBoxCorners, 
+                          kNumInputBoxFeature,
+                          7));  /*kNumOutputBoxFeature*/
+    
+}
+
+PointPillars::~PointPillars() {
+    // for pillars 
+    GPU_CHECK(cudaFree(dev_num_points_per_pillar_));
+    GPU_CHECK(cudaFree(dev_x_coors_));
+    GPU_CHECK(cudaFree(dev_y_coors_));
+    GPU_CHECK(cudaFree(dev_pillar_point_feature_));
+    GPU_CHECK(cudaFree(dev_pillar_coors_));
+    // for sparse map
+    GPU_CHECK(cudaFree(dev_sparse_pillar_map_));    
+    GPU_CHECK(cudaFree(dev_cumsum_along_x_));
+    GPU_CHECK(cudaFree(dev_cumsum_along_y_));
+    // for pfe forward
+    GPU_CHECK(cudaFree(dev_pfe_gather_feature_));
+      
+    GPU_CHECK(cudaFree(pfe_buffers_[0]));
+    GPU_CHECK(cudaFree(pfe_buffers_[1]));
+
+    GPU_CHECK(cudaFree(rpn_buffers_[0]));
+    GPU_CHECK(cudaFree(rpn_buffers_[1]));
+    GPU_CHECK(cudaFree(rpn_buffers_[2]));
+    GPU_CHECK(cudaFree(rpn_buffers_[3]));
+    GPU_CHECK(cudaFree(rpn_buffers_[4]));
+    GPU_CHECK(cudaFree(rpn_buffers_[5]));
+    GPU_CHECK(cudaFree(rpn_buffers_[6]));
+    GPU_CHECK(cudaFree(rpn_buffers_[7]));
+    pfe_context_->destroy();
+    backbone_context_->destroy();
+    pfe_engine_->destroy();
+    backbone_engine_->destroy();
+    // for post process
+    GPU_CHECK(cudaFree(dev_scattered_feature_));
+    GPU_CHECK(cudaFree(dev_filtered_box_));
+    GPU_CHECK(cudaFree(dev_filtered_score_));
+    GPU_CHECK(cudaFree(dev_filtered_label_));
+    GPU_CHECK(cudaFree(dev_filtered_dir_));
+    GPU_CHECK(cudaFree(dev_box_for_nms_));
+    GPU_CHECK(cudaFree(dev_filter_count_));
+
+
+}
+
+void PointPillars::InitParams()
+{
+    YAML::Node params = YAML::LoadFile(pp_config_);
+    kPillarXSize = params["DATA_CONFIG"]["DATA_PROCESSOR"][2]["VOXEL_SIZE"][0].as<float>();
+    kPillarYSize = params["DATA_CONFIG"]["DATA_PROCESSOR"][2]["VOXEL_SIZE"][1].as<float>();
+    kPillarZSize = params["DATA_CONFIG"]["DATA_PROCESSOR"][2]["VOXEL_SIZE"][2].as<float>();
+    kMinXRange = params["DATA_CONFIG"]["POINT_CLOUD_RANGE"][0].as<float>();
+    kMinYRange = params["DATA_CONFIG"]["POINT_CLOUD_RANGE"][1].as<float>();
+    kMinZRange = params["DATA_CONFIG"]["POINT_CLOUD_RANGE"][2].as<float>();
+    kMaxXRange = params["DATA_CONFIG"]["POINT_CLOUD_RANGE"][3].as<float>();
+    kMaxYRange = params["DATA_CONFIG"]["POINT_CLOUD_RANGE"][4].as<float>();
+    kMaxZRange = params["DATA_CONFIG"]["POINT_CLOUD_RANGE"][5].as<float>();
+    kNumClass = params["CLASS_NAMES"].size();
+    kMaxNumPillars = params["DATA_CONFIG"]["DATA_PROCESSOR"][2]["MAX_NUMBER_OF_VOXELS"]["test"].as<int>();
+    kMaxNumPointsPerPillar = params["DATA_CONFIG"]["DATA_PROCESSOR"][2]["MAX_POINTS_PER_VOXEL"].as<int>();
+    kNumPointFeature = 5; // [x, y, z, intensity, 0]
+    kNumInputBoxFeature = 7;
+    kNumOutputBoxFeature = params["MODEL"]["DENSE_HEAD"]["TARGET_ASSIGNER_CONFIG"]["BOX_CODER_CONFIG"]["code_size"].as<int>();
+    kBatchSize = 1;
+    kNumIndsForScan = 1024;
+    kNumThreads = 64;
+    kNumBoxCorners = 8;
+    kAnchorStrides = 4;
+    kNmsPreMaxsize = params["MODEL"]["POST_PROCESSING"]["NMS_CONFIG"]["NMS_PRE_MAXSIZE"].as<int>();
+    kNmsPostMaxsize = params["MODEL"]["POST_PROCESSING"]["NMS_CONFIG"]["NMS_POST_MAXSIZE"].as<int>();
+    //params for initialize anchors
+    //Adapt to OpenPCDet
+    kAnchorNames = params["CLASS_NAMES"].as<std::vector<std::string>>();
+    for (int i = 0; i < kAnchorNames.size(); ++i)
+    {
+        kAnchorDxSizes.emplace_back(params["MODEL"]["DENSE_HEAD"]["ANCHOR_GENERATOR_CONFIG"][i]["anchor_sizes"][0][0].as<float>());
+        kAnchorDySizes.emplace_back(params["MODEL"]["DENSE_HEAD"]["ANCHOR_GENERATOR_CONFIG"][i]["anchor_sizes"][0][1].as<float>());
+        kAnchorDzSizes.emplace_back(params["MODEL"]["DENSE_HEAD"]["ANCHOR_GENERATOR_CONFIG"][i]["anchor_sizes"][0][2].as<float>());
+        kAnchorBottom.emplace_back(params["MODEL"]["DENSE_HEAD"]["ANCHOR_GENERATOR_CONFIG"][i]["anchor_bottom_heights"][0].as<float>());
+    }
+    int cls_offset = 0;
+    for (int idx_head = 0; idx_head < params["MODEL"]["DENSE_HEAD"]["RPN_HEAD_CFGS"].size(); ++idx_head)
+    {
+        int num_cls_per_head = params["MODEL"]["DENSE_HEAD"]["RPN_HEAD_CFGS"][idx_head]["HEAD_CLS_NAME"].size();
+        // map each head to a contiguous range of global class indices,
+        // e.g. head sizes [1,2,2,1,2,2] -> [[0],[1,2],[3,4],[5],[6,7],[8,9]]
+        std::vector<int> value;
+        for (int i = 0; i < num_cls_per_head; ++i)
+        {
+            value.emplace_back(cls_offset + i);
+        }
+        cls_offset += num_cls_per_head;
+        kMultiheadLabelMapping.emplace_back(value);
+    }
+
+    // Generate secondary parameters based on above.
+    kGridXSize = static_cast<int>((kMaxXRange - kMinXRange) / kPillarXSize); //512
+    kGridYSize = static_cast<int>((kMaxYRange - kMinYRange) / kPillarYSize); //512
+    kGridZSize = static_cast<int>((kMaxZRange - kMinZRange) / kPillarZSize); //1
+    kRpnInputSize = 64 * kGridYSize * kGridXSize;
+
+    kNumAnchorXinds = static_cast<int>(kGridXSize / kAnchorStrides); //Width
+    kNumAnchorYinds = static_cast<int>(kGridYSize / kAnchorStrides); //Height
+    kNumAnchor = kNumAnchorXinds * kNumAnchorYinds * 2 * kNumClass;  // H * W * Ro * N = 327680
+
+    kNumAnchorPerCls = kNumAnchorXinds * kNumAnchorYinds * 2; //H * W * Ro = 32768
+    kRpnBoxOutputSize = kNumAnchor * kNumOutputBoxFeature;
+    kRpnClsOutputSize = kNumAnchor * kNumClass;
+    kRpnDirOutputSize = kNumAnchor * 2;
+}
+
+void PointPillars::DeviceMemoryMalloc() {
+    // for pillars 
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_num_points_per_pillar_), kMaxNumPillars * sizeof(float))); // M
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_x_coors_), kMaxNumPillars * sizeof(int))); // M
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_y_coors_), kMaxNumPillars * sizeof(int))); // M
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_pillar_point_feature_), kMaxNumPillars * kMaxNumPointsPerPillar * kNumPointFeature * sizeof(float))); // [M , m , 4]
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_pillar_coors_),  kMaxNumPillars * 4 * sizeof(float))); // [M , 4]
+    // for sparse map
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_sparse_pillar_map_), kNumIndsForScan * kNumIndsForScan * sizeof(int))); // [1024 , 1024]
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_cumsum_along_x_), kNumIndsForScan * kNumIndsForScan * sizeof(int))); // [1024 , 1024]
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_cumsum_along_y_), kNumIndsForScan * kNumIndsForScan * sizeof(int)));// [1024 , 1024]
+
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_pfe_gather_feature_),
+                        kMaxNumPillars * kMaxNumPointsPerPillar *
+                            kNumGatherPointFeature * sizeof(float)));
+    // for trt inference
+    // create GPU buffers and a stream
+
+    GPU_CHECK(
+        cudaMalloc(&pfe_buffers_[0], kMaxNumPillars * kMaxNumPointsPerPillar *
+                                        kNumGatherPointFeature * sizeof(float)));
+    GPU_CHECK(cudaMalloc(&pfe_buffers_[1], kMaxNumPillars * 64 * sizeof(float)));
+
+    GPU_CHECK(cudaMalloc(&rpn_buffers_[0],  kRpnInputSize * sizeof(float)));
+
+    GPU_CHECK(cudaMalloc(&rpn_buffers_[1],  kNumAnchorPerCls  * sizeof(float)));  //classes
+    GPU_CHECK(cudaMalloc(&rpn_buffers_[2],  kNumAnchorPerCls  * 2 * 2 * sizeof(float)));
+    GPU_CHECK(cudaMalloc(&rpn_buffers_[3],  kNumAnchorPerCls  * 2 * 2 * sizeof(float)));
+    GPU_CHECK(cudaMalloc(&rpn_buffers_[4],  kNumAnchorPerCls  * sizeof(float)));
+    GPU_CHECK(cudaMalloc(&rpn_buffers_[5],  kNumAnchorPerCls  * 2 * 2 * sizeof(float)));
+    GPU_CHECK(cudaMalloc(&rpn_buffers_[6],  kNumAnchorPerCls  * 2 * 2 * sizeof(float)));
+    
+    GPU_CHECK(cudaMalloc(&rpn_buffers_[7],  kNumAnchorPerCls * kNumClass * kNumOutputBoxFeature * sizeof(float))); //boxes
+
+    // for scatter kernel
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_scattered_feature_),
+                        kNumThreads * kGridYSize * kGridXSize * sizeof(float)));
+    // for filter
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_filtered_box_),
+                        kNumAnchor * kNumOutputBoxFeature * sizeof(float)));
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_filtered_score_),
+                        kNumAnchor * sizeof(float)));
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_filtered_label_),
+                        kNumAnchor * sizeof(int)));
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_filtered_dir_),
+                        kNumAnchor * sizeof(int)));
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_box_for_nms_),
+                        kNumAnchor * kNumBoxCorners * sizeof(float)));
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_filter_count_), kNumClass * sizeof(int)));
+
+}
+
+void PointPillars::SetDeviceMemoryToZero() {
+
+    GPU_CHECK(cudaMemset(dev_num_points_per_pillar_, 0, kMaxNumPillars * sizeof(float)));
+    GPU_CHECK(cudaMemset(dev_x_coors_,               0, kMaxNumPillars * sizeof(int)));
+    GPU_CHECK(cudaMemset(dev_y_coors_,               0, kMaxNumPillars * sizeof(int)));
+    GPU_CHECK(cudaMemset(dev_pillar_point_feature_,  0, kMaxNumPillars * kMaxNumPointsPerPillar * kNumPointFeature * sizeof(float)));
+    GPU_CHECK(cudaMemset(dev_pillar_coors_,          0, kMaxNumPillars * 4 * sizeof(float)));
+    // GPU_CHECK(cudaMemset(dev_sparse_pillar_map_,     0, kNumIndsForScan * kNumIndsForScan * sizeof(int)));
+    GPU_CHECK(cudaMemset(dev_pfe_gather_feature_,    0, kMaxNumPillars * kMaxNumPointsPerPillar * kNumGatherPointFeature * sizeof(float)));
+    
+    // GPU_CHECK(cudaMemset(pfe_buffers_[0],       0, kMaxNumPillars * kMaxNumPointsPerPillar * kNumGatherPointFeature * sizeof(float)));
+    // GPU_CHECK(cudaMemset(pfe_buffers_[1],       0, kMaxNumPillars * 64 * sizeof(float)));
+
+    GPU_CHECK(cudaMemset(dev_scattered_feature_,    0, kNumThreads * kGridYSize * kGridXSize * sizeof(float)));
+    
+    // GPU_CHECK(cudaMemset(rpn_buffers_[0],    0, kRpnInputSize * sizeof(float)));
+    // GPU_CHECK(cudaMemset(rpn_buffers_[1],    0, kNumAnchorPerCls * kNumOutputBoxFeature * sizeof(float)));
+    // GPU_CHECK(cudaMemset(rpn_buffers_[2],    0, kNumAnchorPerCls     * sizeof(float)));
+    // GPU_CHECK(cudaMemset(rpn_buffers_[3],    0, kNumAnchorPerCls * 2 * kNumOutputBoxFeature * sizeof(float)));
+    // GPU_CHECK(cudaMemset(rpn_buffers_[4],    0, kNumAnchorPerCls * 4 * sizeof(float)));
+    // GPU_CHECK(cudaMemset(rpn_buffers_[5],    0, kNumAnchorPerCls * kNumOutputBoxFeature * sizeof(float)));
+    // GPU_CHECK(cudaMemset(rpn_buffers_[6],    0, kNumAnchorPerCls     * sizeof(float)));
+    // GPU_CHECK(cudaMemset(rpn_buffers_[7],    0, kNumAnchorPerCls * 2 * kNumOutputBoxFeature * sizeof(float)));
+    // GPU_CHECK(cudaMemset(rpn_buffers_[8],    0, kNumAnchorPerCls * 4 * sizeof(float)));
+    // GPU_CHECK(cudaMemset(rpn_buffers_[9],    0, kNumAnchorPerCls * kNumOutputBoxFeature * sizeof(float)));
+    // GPU_CHECK(cudaMemset(rpn_buffers_[10],   0, kNumAnchorPerCls     * sizeof(float)));
+
+    GPU_CHECK(cudaMemset(dev_filtered_box_,     0, kNumAnchor * kNumOutputBoxFeature * sizeof(float)));
+    GPU_CHECK(cudaMemset(dev_filtered_score_,   0, kNumAnchor * sizeof(float)));
+    GPU_CHECK(cudaMemset(dev_filter_count_,     0, kNumClass * sizeof(int)));
+}
+
+
+
+
+
+void PointPillars::InitTRT(const bool use_onnx) {
+  if (use_onnx_) {
+    // create a TensorRT model from the onnx model and load it into an engine
+    OnnxToTRTModel(pfe_file_, &pfe_engine_);
+    OnnxToTRTModel(backbone_file_, &backbone_engine_);
+  }else {
+    EngineToTRTModel(pfe_file_, &pfe_engine_);
+    EngineToTRTModel(backbone_file_, &backbone_engine_);
+  }
+    if (pfe_engine_ == nullptr || backbone_engine_ == nullptr) {
+        std::cerr << "Failed to load the PFE or backbone model." << std::endl;
+    }
+
+    // create execution context from the engine
+    pfe_context_ = pfe_engine_->createExecutionContext();
+    backbone_context_ = backbone_engine_->createExecutionContext();
+    if (pfe_context_ == nullptr || backbone_context_ == nullptr) {
+        std::cerr << "Failed to create TensorRT Execution Context.";
+    }
+  
+}
+
+void PointPillars::OnnxToTRTModel(
+    const std::string& model_file,  // name of the onnx model
+    nvinfer1::ICudaEngine** engine_ptr) {
+    int verbosity = static_cast<int>(nvinfer1::ILogger::Severity::kWARNING);
+
+    // create the builder
+    const auto explicit_batch =
+        static_cast<uint32_t>(kBatchSize) << static_cast<uint32_t>(
+            nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
+    nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(g_logger_);
+    nvinfer1::INetworkDefinition* network =
+        builder->createNetworkV2(explicit_batch);
+
+    // parse onnx model
+    auto parser = nvonnxparser::createParser(*network, g_logger_);
+    if (!parser->parseFromFile(model_file.c_str(), verbosity)) {
+        std::string msg("failed to parse onnx file");
+        g_logger_.log(nvinfer1::ILogger::Severity::kERROR, msg.c_str());
+        exit(EXIT_FAILURE);
+    }
+
+    // Build the engine
+    builder->setMaxBatchSize(kBatchSize);
+    builder->setHalf2Mode(true);
+    nvinfer1::IBuilderConfig* config = builder->createBuilderConfig();
+    config->setMaxWorkspaceSize(1 << 25);
+    nvinfer1::ICudaEngine* engine =
+        builder->buildEngineWithConfig(*network, *config);
+
+    *engine_ptr = engine;
+    parser->destroy();
+    network->destroy();
+    config->destroy();
+    builder->destroy();
+}
+
+
+void PointPillars::EngineToTRTModel(
+    const std::string &engine_file ,     
+    nvinfer1::ICudaEngine** engine_ptr)  {
+    int verbosity = static_cast<int>(nvinfer1::ILogger::Severity::kWARNING);
+    std::stringstream gieModelStream; 
+    gieModelStream.seekg(0, gieModelStream.beg); 
+
+    std::ifstream cache(engine_file); 
+    gieModelStream << cache.rdbuf();
+    cache.close(); 
+    nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(g_logger_); 
+
+    if (runtime == nullptr) {
+        std::string msg("failed to build runtime parser");
+        g_logger_.log(nvinfer1::ILogger::Severity::kERROR, msg.c_str());
+        exit(EXIT_FAILURE);
+    }
+    gieModelStream.seekg(0, std::ios::end);
+    const int modelSize = gieModelStream.tellg(); 
+
+    gieModelStream.seekg(0, std::ios::beg);
+    void* modelMem = malloc(modelSize); 
+    gieModelStream.read((char*)modelMem, modelSize);
+
+    std::cout << "                                                                  "<< std::endl;
+    std::cout << "------------------------------------------------------------------"<< std::endl;
+    std::cout << ">>>>                                                          >>>>"<< std::endl;
+    std::cout << "                                                                  "<< std::endl;
+    std::cout << "Input filename:   " << engine_file << std::endl;
+    std::cout << "                                                                  "<< std::endl;
+    std::cout << ">>>>                                                          >>>>"<< std::endl;
+    std::cout << "------------------------------------------------------------------"<< std::endl;
+    std::cout << "                                                                  "<< std::endl;
+
+    nvinfer1::ICudaEngine* engine = runtime->deserializeCudaEngine(modelMem, modelSize, NULL);
+    free(modelMem);  // deserializeCudaEngine copies the buffer, so release it here
+    if (engine == nullptr) {
+        std::string msg("failed to deserialize the engine file");
+        g_logger_.log(nvinfer1::ILogger::Severity::kERROR, msg.c_str());
+        exit(EXIT_FAILURE);
+    }
+    *engine_ptr = engine;
+
+}
+
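+// Example (illustrative sketch): constructing the detector and running one
+// frame. The engine file names and the host-side point buffer are assumptions
+// for illustration; points are packed as kNumPointFeature (5) floats each:
+//
+//   PointPillars pp(/*score_threshold=*/0.1f, /*nms_overlap_threshold=*/0.2f,
+//                   /*use_onnx=*/false, "pfe.trt", "backbone.trt",
+//                   "cfgs/cbgs_pp_multihead.yaml");
+//   std::vector<float> detections; std::vector<int> labels; std::vector<float> scores;
+//   pp.DoInference(points.data(), num_points, &detections, &labels, &scores);
+//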
+void PointPillars::DoInference(const float* in_points_array,
+                                const int in_num_points,
+                                std::vector<float>* out_detections,
+                                std::vector<int>* out_labels,
+                                std::vector<float>* out_scores) 
+{
+    SetDeviceMemoryToZero();
+    cudaDeviceSynchronize();
+    // [STEP 1] : load pointcloud
+    float* dev_points;
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_points),
+                        in_num_points * kNumPointFeature * sizeof(float))); // in_num_points , 5
+    GPU_CHECK(cudaMemset(dev_points, 0, in_num_points * kNumPointFeature * sizeof(float)));
+    GPU_CHECK(cudaMemcpy(dev_points, in_points_array,
+                        in_num_points * kNumPointFeature * sizeof(float),
+                        cudaMemcpyHostToDevice));
+    
+    // [STEP 2] : preprocess
+    host_pillar_count_[0] = 0;
+    auto preprocess_start = std::chrono::high_resolution_clock::now();
+    preprocess_points_cuda_ptr_->DoPreprocessPointsCuda(
+          dev_points, in_num_points, dev_x_coors_, dev_y_coors_,
+          dev_num_points_per_pillar_, dev_pillar_point_feature_, dev_pillar_coors_,
+          dev_sparse_pillar_map_, host_pillar_count_ ,
+          dev_pfe_gather_feature_ );
+    cudaDeviceSynchronize();
+    auto preprocess_end = std::chrono::high_resolution_clock::now();
+    // DEVICE_SAVE<float>(dev_pfe_gather_feature_,  kMaxNumPillars * kMaxNumPointsPerPillar * kNumGatherPointFeature  , "0_Model_pfe_input_gather_feature");
+
+    // [STEP 3] : pfe forward
+    cudaStream_t stream;
+    GPU_CHECK(cudaStreamCreate(&stream));
+    auto pfe_start = std::chrono::high_resolution_clock::now();
+    GPU_CHECK(cudaMemcpyAsync(pfe_buffers_[0], dev_pfe_gather_feature_,
+                            kMaxNumPillars * kMaxNumPointsPerPillar * kNumGatherPointFeature * sizeof(float), ///kNumGatherPointFeature
+                            cudaMemcpyDeviceToDevice, stream));
+    pfe_context_->enqueueV2(pfe_buffers_, stream, nullptr);
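+    // enqueueV2 is asynchronous; the synchronize below is what makes the
+    // host-side timers around this step meaningful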
+    cudaDeviceSynchronize();
+    auto pfe_end = std::chrono::high_resolution_clock::now();
+    // DEVICE_SAVE<float>(reinterpret_cast<float*>(pfe_buffers_[1]),  kMaxNumPillars * 64 , "1_Model_pfe_output_buffers_[1]");
+
+    // [STEP 4] : scatter pillar feature
+    auto scatter_start = std::chrono::high_resolution_clock::now();
+    scatter_cuda_ptr_->DoScatterCuda(
+        host_pillar_count_[0], dev_x_coors_, dev_y_coors_,
+        reinterpret_cast<float*>(pfe_buffers_[1]), dev_scattered_feature_);
+    cudaDeviceSynchronize();
+    auto scatter_end = std::chrono::high_resolution_clock::now();   
+    // DEVICE_SAVE<float>(dev_scattered_feature_ ,  kRpnInputSize,"2_Model_backbone_input_dev_scattered_feature");
+
+    // [STEP 5] : backbone forward
+    auto backbone_start = std::chrono::high_resolution_clock::now();
+    GPU_CHECK(cudaMemcpyAsync(rpn_buffers_[0], dev_scattered_feature_,
+                            kBatchSize * kRpnInputSize * sizeof(float),
+                            cudaMemcpyDeviceToDevice, stream));
+    backbone_context_->enqueueV2(rpn_buffers_, stream, nullptr);
+    cudaDeviceSynchronize();
+    auto backbone_end = std::chrono::high_resolution_clock::now();
+    // DEVICE_SAVE<float>(reinterpret_cast<float*>(rpn_buffers_[1]) ,  kNumAnchorPerCls    ,"3_rpn_buffers_[1]");
+    // DEVICE_SAVE<float>(reinterpret_cast<float*>(rpn_buffers_[2]) ,  kNumAnchorPerCls * 4,"3_rpn_buffers_[2]");
+    // DEVICE_SAVE<float>(reinterpret_cast<float*>(rpn_buffers_[3]) ,  kNumAnchorPerCls * 4,"3_rpn_buffers_[3]");
+    // DEVICE_SAVE<float>(reinterpret_cast<float*>(rpn_buffers_[4]) ,  kNumAnchorPerCls    ,"3_rpn_buffers_[4]");
+    // DEVICE_SAVE<float>(reinterpret_cast<float*>(rpn_buffers_[5]) ,  kNumAnchorPerCls * 4,"3_rpn_buffers_[5]");
+    // DEVICE_SAVE<float>(reinterpret_cast<float*>(rpn_buffers_[6]) ,  kNumAnchorPerCls * 4,"3_rpn_buffers_[6]");
+    // DEVICE_SAVE<float>(reinterpret_cast<float*>(rpn_buffers_[7]) ,  kNumAnchorPerCls * kNumClass * 9 ,"3_rpn_buffers_[7]");
+
+    // [STEP 6]: postprocess (multihead)
+    auto postprocess_start = std::chrono::high_resolution_clock::now();
+    GPU_CHECK(cudaMemset(dev_filter_count_, 0, kNumClass * sizeof(int)));
+    postprocess_cuda_ptr_->DoPostprocessCuda(
+        reinterpret_cast<float*>(rpn_buffers_[1]), // [cls]   kNumAnchorPerCls 
+        reinterpret_cast<float*>(rpn_buffers_[2]), // [cls]   kNumAnchorPerCls * 2 * 2
+        reinterpret_cast<float*>(rpn_buffers_[3]), // [cls]   kNumAnchorPerCls * 2 * 2
+        reinterpret_cast<float*>(rpn_buffers_[4]), // [cls]   kNumAnchorPerCls 
+        reinterpret_cast<float*>(rpn_buffers_[5]), // [cls]   kNumAnchorPerCls * 2 * 2
+        reinterpret_cast<float*>(rpn_buffers_[6]), // [cls]   kNumAnchorPerCls * 2 * 2
+        reinterpret_cast<float*>(rpn_buffers_[7]), // [boxes] kNumAnchorPerCls * kNumClass * kNumOutputBoxFeature
+        dev_filtered_box_, dev_filtered_score_, dev_filter_count_,
+        *out_detections, *out_labels , *out_scores);
+    cudaDeviceSynchronize();
+    auto postprocess_end = std::chrono::high_resolution_clock::now();
+
+    // compute per-module timings (the stream and input buffer are released below)
+    std::chrono::duration<double> preprocess_cost = preprocess_end - preprocess_start;
+    std::chrono::duration<double> pfe_cost = pfe_end - pfe_start;
+    std::chrono::duration<double> scatter_cost = scatter_end - scatter_start;
+    std::chrono::duration<double> backbone_cost = backbone_end - backbone_start;
+    std::chrono::duration<double> postprocess_cost = postprocess_end - postprocess_start;
+
+    std::chrono::duration<double> pointpillars_cost = postprocess_end - preprocess_start;
+    std::cout << "------------------------------------" << std::endl;
+    std::cout << setiosflags(ios::left)  << setw(14) << "Module" << setw(12)  << "Time"  << resetiosflags(ios::left) << std::endl;
+    std::cout << "------------------------------------" << std::endl;
+    std::string Modules[] = {"Preprocess" , "Pfe" , "Scatter" , "Backbone" , "Postprocess" , "Summary"};
+    double Times[] = {preprocess_cost.count() , pfe_cost.count() , scatter_cost.count() , backbone_cost.count() , postprocess_cost.count() , pointpillars_cost.count()}; 
+
+    for (int i = 0; i < 6; ++i) {
+        std::cout << setiosflags(ios::left) << setw(14) << Modules[i]  << setw(8)  << Times[i] * 1000 << " ms" << resetiosflags(ios::left) << std::endl;
+    }
+    std::cout << "------------------------------------" << std::endl;
+    cudaStreamDestroy(stream);
+    GPU_CHECK(cudaFree(dev_points));  // release the input point buffer allocated in STEP 1
+
+}
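+
+// Usage sketch (illustrative only, not part of this file's API):
+// the caller flattens the cloud into in_num_points * kNumPointFeature floats
+// and reads each detection back as 7 floats (x, y, z, dx, dy, dz, yaw).
+//
+//   std::vector<float> detections; std::vector<int> labels; std::vector<float> scores;
+//   point_pillars_ptr->DoInference(points_array, num_points, &detections, &labels, &scores);
+//   for (size_t k = 0; k < labels.size(); ++k) {
+//       const float* box = &detections[k * 7];
+//   }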

+ 286 - 0
src/detection/detection_lidar_PointPillars_MultiHead/pointpillars.h

@@ -0,0 +1,286 @@
+/******************************************************************************
+ * Copyright 2020 The Apollo Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *****************************************************************************/
+
+/*
+ * Copyright 2018-2019 Autoware Foundation. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @author Kosuke Murakami
+ * @date 2019/02/26
+ */
+
+/**
+* @author Yan haixu
+* Contact: just github.com/hova88
+* @date 2021/04/30
+*/
+
+
+#pragma once
+
+// headers in STL
+#include <algorithm>
+#include <cmath>
+#include <iomanip>
+#include <limits>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+#include <iostream>
+#include <sstream>
+#include <fstream>
+// headers in TensorRT
+#include "NvInfer.h"
+#include "NvOnnxParser.h"
+
+// headers in local files
+// #include "params.h"
+#include "common.h"
+#include <yaml-cpp/yaml.h>
+#include "preprocess.h"
+#include "scatter.h"
+#include "postprocess.h"
+
+using namespace std;
+
+// Logger for TensorRT info/warning/errors
+class Logger : public nvinfer1::ILogger {
+ public:
+  explicit Logger(Severity severity = Severity::kWARNING)
+      : reportable_severity(severity) {}
+
+  void log(Severity severity, const char* msg) override {
+    // suppress messages with severity enum value greater than the reportable
+    if (severity > reportable_severity) return;
+
+    switch (severity) {
+      case Severity::kINTERNAL_ERROR:
+        std::cerr << "INTERNAL_ERROR: ";
+        break;
+      case Severity::kERROR:
+        std::cerr << "ERROR: ";
+        break;
+      case Severity::kWARNING:
+        std::cerr << "WARNING: ";
+        break;
+      case Severity::kINFO:
+        std::cerr << "INFO: ";
+        break;
+      default:
+        std::cerr << "UNKNOWN: ";
+        break;
+    }
+    std::cerr << msg << std::endl;
+  }
+
+  Severity reportable_severity;
+};
+
+
+
+class PointPillars {
+ private:
+    // initialize in initializer list
+    const float score_threshold_;
+    const float nms_overlap_threshold_;
+    const bool use_onnx_;
+    const std::string pfe_file_;
+    const std::string backbone_file_;
+    const std::string pp_config_;
+    // end initializer list
+    // voxel size
+    float kPillarXSize;
+    float kPillarYSize;
+    float kPillarZSize;
+    // point cloud range
+    float kMinXRange;
+    float kMinYRange;
+    float kMinZRange;
+    float kMaxXRange;
+    float kMaxYRange;
+    float kMaxZRange;
+    // hyper parameters
+    int kNumClass;
+    int kMaxNumPillars;
+    int kMaxNumPointsPerPillar;
+    int kNumPointFeature;
+    int kNumGatherPointFeature = 11;
+    int kGridXSize;
+    int kGridYSize;
+    int kGridZSize;
+    int kNumAnchorXinds;
+    int kNumAnchorYinds;
+    int kRpnInputSize;
+    int kNumAnchor;
+    int kNumInputBoxFeature;
+    int kNumOutputBoxFeature;
+    int kRpnBoxOutputSize;
+    int kRpnClsOutputSize;
+    int kRpnDirOutputSize;
+    int kBatchSize;
+    int kNumIndsForScan;
+    int kNumThreads;
+    // if you change kNumThreads, need to modify NUM_THREADS_MACRO in
+    // common.h
+    int kNumBoxCorners;
+    int kNmsPreMaxsize;
+    int kNmsPostMaxsize;
+    //params for initialize anchors
+    //Adapt to OpenPCDet
+    int kAnchorStrides;
+    std::vector<string> kAnchorNames;
+    std::vector<float> kAnchorDxSizes;
+    std::vector<float> kAnchorDySizes;
+    std::vector<float> kAnchorDzSizes;
+    std::vector<float> kAnchorBottom;
+    std::vector<std::vector<int>> kMultiheadLabelMapping;
+    int kNumAnchorPerCls;
+    int host_pillar_count_[1];
+
+    int* dev_x_coors_;
+    int* dev_y_coors_;
+    float* dev_num_points_per_pillar_;
+    int* dev_sparse_pillar_map_;
+    int* dev_cumsum_along_x_;
+    int* dev_cumsum_along_y_;
+
+    float* dev_pillar_point_feature_;
+    float* dev_pillar_coors_;
+    float* dev_points_mean_;
+
+    float* dev_pfe_gather_feature_;
+    void* pfe_buffers_[2];
+    //variable for doPostprocessCudaMultiHead
+    void* rpn_buffers_[8];
+    
+    std::vector<float*> rpn_box_output_; 
+    std::vector<float*> rpn_cls_output_;
+
+    float* dev_scattered_feature_;
+
+    float* dev_filtered_box_;
+    float* dev_filtered_score_;
+    int*   dev_filtered_label_;
+    int*   dev_filtered_dir_;
+    float* dev_box_for_nms_;
+    int*   dev_filter_count_;
+
+    std::unique_ptr<PreprocessPointsCuda> preprocess_points_cuda_ptr_;
+    std::unique_ptr<ScatterCuda> scatter_cuda_ptr_;
+    std::unique_ptr<PostprocessCuda> postprocess_cuda_ptr_;
+
+    Logger g_logger_;
+    nvinfer1::ICudaEngine* pfe_engine_;
+    nvinfer1::ICudaEngine* backbone_engine_;
+    nvinfer1::IExecutionContext* pfe_context_;
+    nvinfer1::IExecutionContext* backbone_context_;
+
+    /**
+     * @brief Memory allocation for device memory
+     * @details Called in the constructor
+     */
+    void DeviceMemoryMalloc();
+
+    /**
+     * @brief Memory set to 0 for device memory
+     * @details Called in the DoInference
+     */
+    void SetDeviceMemoryToZero();
+
+    /**
+     * @brief Initialize parameters from pointpillars.yaml
+     * @details Called in the constructor
+     */
+    void InitParams();
+    /**
+     * @brief Initializing TensorRT instances
+     * @param[in] use_onnx If true, parse the ONNX models; otherwise load serialized engines
+     * @details Called in the constructor
+     */
+    void InitTRT(const bool use_onnx);
+    /**
+     * @brief Convert ONNX to TensorRT model
+     * @param[in] model_file ONNX model file path
+     * @param[out] engine_ptr TensorRT model engine made out of ONNX model
+     * @details Load ONNX model, and convert it to TensorRT model
+     */
+    void OnnxToTRTModel(const std::string& model_file,
+                        nvinfer1::ICudaEngine** engine_ptr);
+
+    /**
+     * @brief Load a serialized TensorRT engine
+     * @param[in] engine_file Serialized TensorRT engine file path
+     * @param[out] engine_ptr Deserialized TensorRT engine
+     * @details Read the engine file from disk and deserialize it
+     */
+    void EngineToTRTModel(const std::string& engine_file,
+                        nvinfer1::ICudaEngine** engine_ptr);
+
+    /**
+     * @brief Preprocess points
+     * @param[in] in_points_array Point cloud array
+     * @param[in] in_num_points Number of points
+     * @details Call CPU or GPU preprocess
+     */
+    void Preprocess(const float* in_points_array, const int in_num_points);
+
+ public:
+    /**
+     * @brief Constructor
+     * @param[in] score_threshold Score threshold for filtering output
+     * @param[in] nms_overlap_threshold IOU threshold for NMS
+     * @param[in] use_onnx If true, build from ONNX files; otherwise load serialized engine files
+     * @param[in] pfe_file Pillar Feature Extractor ONNX file path
+     * @param[in] rpn_file Region Proposal Network ONNX file path
+     * @details Variables could be changed through point_pillars_detection
+     */
+    PointPillars(const float score_threshold,
+                const float nms_overlap_threshold,
+                const bool use_onnx,
+                const std::string pfe_file,
+                const std::string rpn_file,
+                const std::string pp_config);
+    ~PointPillars();
+
+    /**
+     * @brief Call PointPillars for the inference
+     * @param[in] in_points_array Point cloud array
+     * @param[in] in_num_points Number of points
+     * @param[out] out_detections Network output bounding boxes
+     * @param[out] out_labels Network output object labels
+     * @param[out] out_scores Network output object scores
+     * @details This is an interface for the algorithm
+     */
+    void DoInference(const float* in_points_array,
+                    const int in_num_points,
+                    std::vector<float>* out_detections,
+                    std::vector<int>* out_labels,
+                    std::vector<float>* out_scores);
+};
+

+ 384 - 0
src/detection/detection_lidar_PointPillars_MultiHead/postprocess.cu

@@ -0,0 +1,384 @@
+/******************************************************************************
+ * Copyright 2020 The Apollo Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *****************************************************************************/
+
+/*
+ * Copyright 2018-2019 Autoware Foundation. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @author Kosuke Murakami
+ * @date 2019/02/26
+ */
+
+/**
+* @author Yan haixu
+* Contact: just github.com/hova88
+* @date 2021/04/30
+*/
+
+
+// headers in CUDA
+#include <thrust/sort.h>
+
+// headers in local files
+#include "common.h"
+#include "postprocess.h"
+#include <stdio.h>
+
+
+// copy the 7 box regression values (x, y, z, dx, dy, dz, yaw) of one kept anchor
+__device__ void box_decode_warp(int head_offset, const float* box_pred, 
+    int tid , int num_anchors_per_head , int counter, float* filtered_box) 
+{
+    #pragma unroll
+    for (int k = 0; k < 7; ++k) {
+        filtered_box[blockIdx.z * num_anchors_per_head * 7 + counter * 7 + k] =
+            box_pred[head_offset + tid * 9 + k];
+    }
+}
+
+
+__global__ void sigmoid_filter_kernel(
+
+    float* cls_pred_0,
+    float* cls_pred_12,
+    float* cls_pred_34,
+    float* cls_pred_5,
+    float* cls_pred_67,
+    float* cls_pred_89,
+
+    const float* box_pred_0,
+
+    const float* box_pred_1,
+    const float* box_pred_2,
+
+    const float* box_pred_3,
+    const float* box_pred_4,
+
+    const float* box_pred_5,
+
+    const float* box_pred_6,
+    const float* box_pred_7,
+
+    const float* box_pred_8,
+    const float* box_pred_9,
+
+    float* filtered_box, 
+    float* filtered_score, 
+    int* filter_count,
+
+    const float score_threshold) {   
+
+    // e.g. cls_pred_34: shape (32768*2, 2) -- two heads sharing one tensor,
+    // two classes per head
+
+    int num_anchors_per_head = gridDim.x * gridDim.y * blockDim.x;
+    // 16 * 4 * 512 = 32768
+    extern __shared__ float cls_score[];
+    cls_score[threadIdx.x + blockDim.x] = -1.0f;
+
+    int tid = blockIdx.x * gridDim.y * blockDim.x + blockIdx.y *  blockDim.x + threadIdx.x; 
+
+
+    if ( blockIdx.z == 0) cls_score[ threadIdx.x ] = 1 / (1 + expf(-cls_pred_0[ tid ]));
+    if ( blockIdx.z == 1) {
+        cls_score[ threadIdx.x ] = 1 / (1 + expf(-cls_pred_12[ tid * 2 ]));
+        cls_score[ threadIdx.x + blockDim.x] = 1 / (1 + expf(-cls_pred_12[ (num_anchors_per_head + tid) * 2]));}
+    if ( blockIdx.z == 2) {
+        cls_score[ threadIdx.x ] = 1 / (1 + expf(-cls_pred_12[ tid * 2 + 1]));
+        cls_score[ threadIdx.x + blockDim.x] = 1 / (1 + expf(-cls_pred_12[ (num_anchors_per_head + tid) * 2 + 1]));}
+
+    if ( blockIdx.z == 3) {
+        cls_score[ threadIdx.x ] = 1 / (1 + expf(-cls_pred_34[ tid * 2 ]));
+        cls_score[ threadIdx.x + blockDim.x] = 1 / (1 + expf(-cls_pred_34[ (num_anchors_per_head + tid) * 2]));}
+    if ( blockIdx.z == 4) {
+        cls_score[ threadIdx.x ] = 1 / (1 + expf(-cls_pred_34[ tid * 2 + 1 ]));
+        cls_score[ threadIdx.x + blockDim.x] = 1 / (1 + expf(-cls_pred_34[ (num_anchors_per_head + tid) * 2 + 1]));}
+
+    if ( blockIdx.z == 5) cls_score[ threadIdx.x ] = 1 / (1 + expf(-cls_pred_5[ tid ]));
+
+    if ( blockIdx.z == 6) {
+        cls_score[ threadIdx.x ] = 1 / (1 + expf(-cls_pred_67[ tid * 2 ]));
+        cls_score[ threadIdx.x + blockDim.x] = 1 / (1 + expf(-cls_pred_67[ (num_anchors_per_head + tid) * 2]));}
+    if ( blockIdx.z == 7) {
+        cls_score[ threadIdx.x ] = 1 / (1 + expf(-cls_pred_67[ tid * 2 + 1 ]));
+        cls_score[ threadIdx.x + blockDim.x] = 1 / (1 + expf(-cls_pred_67[ (num_anchors_per_head + tid) * 2 + 1]));}
+
+    if ( blockIdx.z == 8) {
+        cls_score[ threadIdx.x ] = 1 / (1 + expf(-cls_pred_89[ tid * 2 ]));
+        cls_score[ threadIdx.x + blockDim.x] = 1 / (1 + expf(-cls_pred_89[ (num_anchors_per_head + tid) * 2]));}
+    if ( blockIdx.z == 9) {
+        cls_score[ threadIdx.x ] = 1 / (1 + expf(-cls_pred_89[ tid * 2 + 1 ]));
+        cls_score[ threadIdx.x + blockDim.x] = 1 / (1 + expf(-cls_pred_89[ (num_anchors_per_head + tid) * 2 + 1]));}
+    
+    __syncthreads();
+    
+    if( cls_score[ threadIdx.x ] > score_threshold) 
+    {
+        int counter = atomicAdd(&filter_count[blockIdx.z], 1);
+        if ( blockIdx.z == 0) {
+            box_decode_warp(0 ,box_pred_0 , tid , num_anchors_per_head , counter , filtered_box);
+            filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];
+        }else
+        if ( blockIdx.z == 1) {
+            box_decode_warp(0 ,box_pred_1 , tid , num_anchors_per_head , counter , filtered_box);
+            filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];
+        }else
+        if ( blockIdx.z == 2) {
+            box_decode_warp(0 ,box_pred_1 , tid , num_anchors_per_head , counter , filtered_box);
+            filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];
+        }else
+        if ( blockIdx.z == 3) {
+            box_decode_warp(0 ,box_pred_3 , tid , num_anchors_per_head , counter , filtered_box);
+            filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];
+        }else 
+        if (blockIdx.z == 4) {
+            box_decode_warp(0 ,box_pred_3 , tid , num_anchors_per_head , counter , filtered_box);
+            filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];            
+        }else
+        if ( blockIdx.z == 5) {
+            box_decode_warp(0 ,box_pred_5 , tid , num_anchors_per_head , counter , filtered_box);
+            filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];
+        }else
+        if ( blockIdx.z == 6) {
+            box_decode_warp(0 ,box_pred_6 , tid , num_anchors_per_head , counter , filtered_box);
+            filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];
+        }else
+        if ( blockIdx.z == 7) {
+            box_decode_warp(0 ,box_pred_6 , tid , num_anchors_per_head , counter , filtered_box);
+            filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];
+        }else
+        if ( blockIdx.z == 8) {
+
+            box_decode_warp(0 ,box_pred_8 , tid , num_anchors_per_head , counter , filtered_box);
+            filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];
+        }else
+        if ( blockIdx.z == 9) {
+            box_decode_warp(0 ,box_pred_8 , tid , num_anchors_per_head , counter , filtered_box);
+            filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];
+        }
+    }
+    __syncthreads();  
+    if( cls_score[ threadIdx.x + blockDim.x ] > score_threshold)  {
+            int counter = atomicAdd(&filter_count[blockIdx.z], 1);
+            // printf("counter : %d \n" , counter);
+            // store the second-half score, i.e. the second class of a two-class head
+            if (blockIdx.z == 1) {
+                box_decode_warp(0 ,box_pred_2 , tid , num_anchors_per_head , counter , filtered_box);
+                filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x + blockDim.x ];
+            }else 
+            if (blockIdx.z == 2) {
+                box_decode_warp(0 ,box_pred_2 , tid , num_anchors_per_head , counter , filtered_box);
+                filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x + blockDim.x ];
+            }else 
+            if (blockIdx.z == 3) {
+                box_decode_warp(0 ,box_pred_4 , tid , num_anchors_per_head , counter , filtered_box);
+                filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x + blockDim.x ];
+            }else 
+            if (blockIdx.z == 4) {
+                box_decode_warp(0 ,box_pred_4 , tid , num_anchors_per_head , counter , filtered_box);
+                filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x + blockDim.x ];
+            }else 
+            if (blockIdx.z == 6) {
+                box_decode_warp(0 ,box_pred_7 , tid , num_anchors_per_head , counter , filtered_box);
+                filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x + blockDim.x ];
+            }else 
+            if (blockIdx.z == 7) {
+                box_decode_warp(0 ,box_pred_7 , tid , num_anchors_per_head , counter , filtered_box);
+                filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x + blockDim.x ];
+            }else 
+            if (blockIdx.z == 8) {
+                box_decode_warp(0 ,box_pred_9 , tid , num_anchors_per_head , counter , filtered_box);
+                filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x + blockDim.x ];
+            }else 
+            if (blockIdx.z == 9) {
+                box_decode_warp(0 ,box_pred_9 , tid , num_anchors_per_head , counter , filtered_box);
+                filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x + blockDim.x ];
+            }
+    }
+}
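+
+// Launch shape (set in DoPostprocessCuda): grid (16, 4, 10), block 512, so
+// blockIdx.z picks one of the 10 classes and the remaining indices enumerate
+// the 32768 anchors of that class's head. Shared memory holds up to two
+// sigmoid scores per thread because some heads predict two classes at once.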
+
+__global__ void sort_boxes_by_indexes_kernel(float* filtered_box, float* filtered_scores, int* indexes, int filter_count,
+    float* sorted_filtered_boxes, float* sorted_filtered_scores,
+    const int num_output_box_feature)
+{
+    int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    if(tid < filter_count)  {
+
+        int sort_index = indexes[tid];
+        for (int k = 0; k < num_output_box_feature; ++k) {
+            sorted_filtered_boxes[tid * num_output_box_feature + k] =
+                filtered_box[sort_index * num_output_box_feature + k];
+        }
+
+        // sorted_filtered_dir[tid] = filtered_dir[sort_index];
+        sorted_filtered_scores[tid] = filtered_scores[sort_index];
+    }
+}
+
+
+
+PostprocessCuda::PostprocessCuda(const int num_threads, const float float_min, const float float_max,
+    const int num_class,const int num_anchor_per_cls,
+    const std::vector<std::vector<int>> multihead_label_mapping,
+    const float score_threshold,  const float nms_overlap_threshold, 
+    const int nms_pre_maxsize, const int nms_post_maxsize,
+    const int num_box_corners, 
+    const int num_input_box_feature,
+    const int num_output_box_feature)
+: num_threads_(num_threads),
+  float_min_(float_min),
+  float_max_(float_max),
+  num_class_(num_class),
+  num_anchor_per_cls_(num_anchor_per_cls),
+  multihead_label_mapping_(multihead_label_mapping),
+  score_threshold_(score_threshold),
+  nms_overlap_threshold_(nms_overlap_threshold),
+  nms_pre_maxsize_(nms_pre_maxsize),
+  nms_post_maxsize_(nms_post_maxsize),
+  num_box_corners_(num_box_corners),
+  num_input_box_feature_(num_input_box_feature),
+  num_output_box_feature_(num_output_box_feature) {
+    nms_cuda_ptr_.reset(
+    new NmsCuda(num_threads_, num_box_corners_, nms_overlap_threshold_));
+
+}
+
+
+void PostprocessCuda::DoPostprocessCuda(
+    float* cls_pred_0,
+    float* cls_pred_12,
+    float* cls_pred_34,
+    float* cls_pred_5,
+    float* cls_pred_67,
+    float* cls_pred_89,
+
+    const float* box_preds,
+   
+    float* dev_filtered_box, 
+    float* dev_filtered_score, 
+    int* dev_filter_count,
+    std::vector<float>& out_detection, std::vector<int>& out_label , std::vector<float>& out_score) {
+    // the rpn_box_output tensors have already been concatenated before this point
+    // for a 128x128 feature map, the cls_pred shapes are (32768,1),(32768,1),(32768,1),(65536,2),(32768,1)
+    dim3 gridsize(16, 4 , 10);  // 16 * 4 * 512 = 32768 anchors per head
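+    // block of 512 threads; shared memory holds two score slots per thread
+    // (512 * 2 floats), one for each class of the two-class heads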
+    sigmoid_filter_kernel<<< gridsize, 512 , 512 * 2 * sizeof(float)>>>(
+        cls_pred_0,
+        cls_pred_12, 
+        cls_pred_34, 
+        cls_pred_5, 
+        cls_pred_67, 
+        cls_pred_89,
+
+        &box_preds[0 * 32768 * 9],
+        &box_preds[1 * 32768 * 9],
+        &box_preds[2 * 32768 * 9],
+        &box_preds[3 * 32768 * 9],
+        &box_preds[4 * 32768 * 9],
+        &box_preds[5 * 32768 * 9],
+        &box_preds[6 * 32768 * 9],
+        &box_preds[7 * 32768 * 9],
+        &box_preds[8 * 32768 * 9],
+        &box_preds[9 * 32768 * 9],
+
+        dev_filtered_box, 
+        dev_filtered_score,  
+        dev_filter_count, 
+    
+        score_threshold_);
+    cudaDeviceSynchronize();
+    
+    std::vector<int> host_filter_count(num_class_, 0);  // per-class count of boxes that passed the score filter
+    GPU_CHECK(cudaMemcpy(host_filter_count.data(), dev_filter_count, num_class_ * sizeof(int), cudaMemcpyDeviceToHost));
+    
+    for (int i = 0; i < num_class_; ++ i) {
+        if(host_filter_count[i] <= 0) continue;
+
+        int* dev_indexes;
+        float* dev_sorted_filtered_box;
+        float* dev_sorted_filtered_scores;
+        GPU_CHECK(cudaMalloc((void**)&dev_indexes, host_filter_count[i] * sizeof(int)));
+        GPU_CHECK(cudaMalloc((void**)&dev_sorted_filtered_box, host_filter_count[i] * num_output_box_feature_ * sizeof(float)));
+        GPU_CHECK(cudaMalloc((void**)&dev_sorted_filtered_scores, host_filter_count[i]*sizeof(float)));
+        // GPU_CHECK(cudaMalloc((void**)&dev_sorted_box_for_nms, NUM_BOX_CORNERS_*host_filter_count[i]*sizeof(float)));
+        thrust::sequence(thrust::device, dev_indexes, dev_indexes + host_filter_count[i]);
+        thrust::sort_by_key(thrust::device, 
+                            &dev_filtered_score[i * num_anchor_per_cls_], 
+                            &dev_filtered_score[i * num_anchor_per_cls_ + host_filter_count[i]],
+                            dev_indexes, 
+                            thrust::greater<float>());
+
+        const int num_blocks = DIVUP(host_filter_count[i], num_threads_);
+
+        sort_boxes_by_indexes_kernel<<<num_blocks, num_threads_>>>(
+            &dev_filtered_box[i * num_anchor_per_cls_ * num_output_box_feature_], 
+            &dev_filtered_score[i * num_anchor_per_cls_], 
+            dev_indexes, 
+            host_filter_count[i],
+            dev_sorted_filtered_box, 
+            dev_sorted_filtered_scores,
+            num_output_box_feature_);
+
+        int num_box_for_nms = min(nms_pre_maxsize_, host_filter_count[i]);
+        long* keep_inds = new long[num_box_for_nms];  // indexes of the kept boxes
+        memset(keep_inds, 0, num_box_for_nms * sizeof(long));
+        int num_out = 0;
+        nms_cuda_ptr_->DoNmsCuda(num_box_for_nms, dev_sorted_filtered_box, keep_inds, &num_out);
+
+        num_out = min(num_out, nms_post_maxsize_);
+
+        float* host_filtered_box = new float[host_filter_count[i] * num_output_box_feature_]();
+        float* host_filtered_scores = new float[host_filter_count[i]]();
+
+
+        cudaMemcpy(host_filtered_box, dev_sorted_filtered_box, host_filter_count[i] * num_output_box_feature_ * sizeof(float), cudaMemcpyDeviceToHost);
+        cudaMemcpy(host_filtered_scores, dev_sorted_filtered_scores, host_filter_count[i] * sizeof(float), cudaMemcpyDeviceToHost);
+        for (int j = 0; j < num_out; ++j)  {
+            out_detection.emplace_back(host_filtered_box[keep_inds[j] * num_output_box_feature_ + 0]);
+            out_detection.emplace_back(host_filtered_box[keep_inds[j] * num_output_box_feature_ + 1]);
+            out_detection.emplace_back(host_filtered_box[keep_inds[j] * num_output_box_feature_ + 2]);
+            out_detection.emplace_back(host_filtered_box[keep_inds[j] * num_output_box_feature_ + 3]);
+            out_detection.emplace_back(host_filtered_box[keep_inds[j] * num_output_box_feature_ + 4]);
+            out_detection.emplace_back(host_filtered_box[keep_inds[j] * num_output_box_feature_ + 5]);
+            out_detection.emplace_back(host_filtered_box[keep_inds[j] * num_output_box_feature_ + 6]);
+            out_score.emplace_back(host_filtered_scores[keep_inds[j]]);
+            out_label.emplace_back(i);  // i is the class index of this head
+        }
+        delete[] keep_inds;
+        delete[] host_filtered_scores;
+        delete[] host_filtered_box;
+
+        GPU_CHECK(cudaFree(dev_indexes));
+        GPU_CHECK(cudaFree(dev_sorted_filtered_box));
+        GPU_CHECK(cudaFree(dev_sorted_filtered_scores));
+    }
+}

+ 125 - 0
src/detection/detection_lidar_PointPillars_MultiHead/postprocess.h

@@ -0,0 +1,125 @@
+/******************************************************************************
+ * Copyright 2020 The Apollo Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *****************************************************************************/
+
+/*
+ * Copyright 2018-2019 Autoware Foundation. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @author Kosuke Murakami
+ * @date 2019/02/26
+ */
+
+/**
+* @author Yan haixu
+* Contact: just github.com/hova88
+* @date 2021/04/30
+*/
+
+#pragma once
+#include <memory>
+#include <vector>
+#include "nms.h"
+
+class PostprocessCuda {
+ private:
+    // initializer list
+
+    const int num_threads_;
+    const float float_min_;
+    const float float_max_;
+    const int num_class_;
+    const int num_anchor_per_cls_;
+    const float score_threshold_;
+    const float nms_overlap_threshold_;
+    const int nms_pre_maxsize_;
+    const int nms_post_maxsize_;
+    const int num_box_corners_;
+    const int num_input_box_feature_;
+    const int num_output_box_feature_;
+    const std::vector<std::vector<int>> multihead_label_mapping_;
+    // end initializer list
+
+    std::unique_ptr<NmsCuda> nms_cuda_ptr_;
+  public:
+  /**
+   * @brief Constructor
+   * @param[in] num_threads Number of threads when launching cuda kernel
+   * @param[in] float_min The lowest float value
+   * @param[in] float_max The maximum float value
+   * @param[in] num_class Number of classes 
+   * @param[in] num_anchor_per_cls Number of anchors per category
+   * @param[in] multihead_label_mapping Mapping from each head to its class labels
+   * @param[in] score_threshold Score threshold for filtering output
+   * @param[in] nms_overlap_threshold IOU threshold for NMS
+   * @param[in] nms_pre_maxsize Maximum number of boxes into NMS
+   * @param[in] nms_post_maxsize Maximum number of boxes after NMS
+   * @param[in] num_box_corners Number of box corners
+   * @param[in] num_output_box_feature Number of output box features
+   * @details Capital variables never change after compile; non-capital
+   * variables can be changed through rosparam
+   */
+  PostprocessCuda(const int num_threads, 
+                  const float float_min, const float float_max,
+                  const int num_class, const int num_anchor_per_cls, 
+                  const std::vector<std::vector<int>> multihead_label_mapping,
+                  const float score_threshold,  
+                  const float nms_overlap_threshold, 
+                  const int nms_pre_maxsize, 
+                  const int nms_post_maxsize,
+                  const int num_box_corners,
+                  const int num_input_box_feature, 
+                  const int num_output_box_feature);
+  ~PostprocessCuda(){}
+
+  /**
+   * @brief Postprocessing for the network output
+   * @param[in] cls_pred_0 ... cls_pred_89 Class predictions from the network, grouped by head
+   * @param[in] box_preds Box predictions from the network output
+   * @param[in] dev_filtered_box Filtered box predictions
+   * @param[in] dev_filtered_score Filtered score predictions
+   * @param[in] dev_filter_count The number of filtered boxes per class
+   * @param[out] out_detection Output bounding boxes
+   * @param[out] out_label Output labels of objects
+   * @param[out] out_score Output scores of objects
+   * @details dev_* represents device memory allocated variables
+   */
+  void DoPostprocessCuda(
+    float* cls_pred_0,
+    float* cls_pred_12,
+    float* cls_pred_34,
+    float* cls_pred_5,
+    float* cls_pred_67,
+    float* cls_pred_89,
+    
+    const float* box_preds,
+    float* dev_filtered_box, 
+    float* dev_filtered_score, 
+    int* dev_filter_count,
+    std::vector<float>& out_detection, std::vector<int>& out_label , std::vector<float>& out_score);
+};

+ 410 - 0
src/detection/detection_lidar_PointPillars_MultiHead/preprocess.cu

@@ -0,0 +1,410 @@
+/******************************************************************************
+ * Copyright 2020 The Apollo Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *****************************************************************************/
+
+/*
+ * Copyright 2018-2019 Autoware Foundation. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @author Kosuke Murakami
+ * @date 2019/02/26
+ */
+
+/**
+* @author Yan haixu
+* Contact: just github.com/hova88
+* @date 2021/04/30
+*/
+
+
+
+// headers in STL
+#include <stdio.h>
+
+// headers in local files
+#include "common.h"
+#include "preprocess.h"
+
+__global__ void make_pillar_histo_kernel(
+    const float* dev_points, float* dev_pillar_point_feature_in_coors,
+    int* pillar_count_histo, const int num_points,
+    const int max_points_per_pillar, const int grid_x_size,
+    const int grid_y_size, const int grid_z_size, const float min_x_range,
+    const float min_y_range, const float min_z_range, const float pillar_x_size,
+    const float pillar_y_size, const float pillar_z_size,
+    const int num_point_feature) {
+  int th_i = blockIdx.x * blockDim.x +  threadIdx.x ;
+  if (th_i >= num_points) {
+    return;
+  }
+  int x_coor = floor((dev_points[th_i * num_point_feature + 0] - min_x_range) / pillar_x_size);
+  int y_coor = floor((dev_points[th_i * num_point_feature + 1] - min_y_range) / pillar_y_size);
+  int z_coor = floor((dev_points[th_i * num_point_feature + 2] - min_z_range) / pillar_z_size);
+
+  if (x_coor >= 0 && x_coor < grid_x_size && y_coor >= 0 &&
+      y_coor < grid_y_size && z_coor >= 0 && z_coor < grid_z_size) {
+    int count =
+        atomicAdd(&pillar_count_histo[y_coor * grid_x_size + x_coor], 1);
+    if (count < max_points_per_pillar) {
+      int ind =
+          y_coor * grid_x_size * max_points_per_pillar * num_point_feature +
+          x_coor * max_points_per_pillar * num_point_feature +
+          count * num_point_feature;
+ 
+      for (int i = 0; i < num_point_feature; ++i) {
+        dev_pillar_point_feature_in_coors[ind + i] =
+            dev_points[th_i * num_point_feature + i];
+      }
+    }
+  }
+}
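+
+// Note: only the first max_points_per_pillar points of a cell are stored
+// above; pillar_count_histo keeps the raw per-cell count, which
+// make_pillar_index_kernel later clamps when filling dev_num_points_per_pillar.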
+
+__global__ void make_pillar_index_kernel(
+    int* dev_pillar_count_histo, int* dev_counter, int* dev_pillar_count,
+    int* dev_x_coors, int* dev_y_coors, float* dev_num_points_per_pillar,
+    int* dev_sparse_pillar_map, const int max_pillars,
+    const int max_points_per_pillar, const int grid_x_size,
+    const int num_inds_for_scan) {
+  int x = blockIdx.x;
+  int y = threadIdx.x;
+  int num_points_at_this_pillar = dev_pillar_count_histo[y * grid_x_size + x];
+  if (num_points_at_this_pillar == 0) {
+    return;
+  }
+
+  int count = atomicAdd(dev_counter, 1);
+  if (count < max_pillars) {
+    atomicAdd(dev_pillar_count, 1);
+    if (num_points_at_this_pillar >= max_points_per_pillar) {
+      dev_num_points_per_pillar[count] = max_points_per_pillar;
+    } else {
+      dev_num_points_per_pillar[count] = num_points_at_this_pillar;
+    }
+    dev_x_coors[count] = x;
+    dev_y_coors[count] = y;
+    dev_sparse_pillar_map[y * num_inds_for_scan + x] = 1;
+  }
+}
+
+__global__ void make_pillar_feature_kernel(
+    float* dev_pillar_point_feature_in_coors, float* dev_pillar_point_feature,
+    float* dev_pillar_coors, int* dev_x_coors, int* dev_y_coors,
+    float* dev_num_points_per_pillar, const int max_points,
+    const int num_point_feature, const int grid_x_size) {
+  int ith_pillar = blockIdx.x;
+  int num_points_at_this_pillar = dev_num_points_per_pillar[ith_pillar];
+  int ith_point = threadIdx.x;
+  if (ith_point >= num_points_at_this_pillar) {
+    return;
+  }
+  int x_ind = dev_x_coors[ith_pillar];
+  int y_ind = dev_y_coors[ith_pillar];
+  int pillar_ind = ith_pillar * max_points * num_point_feature +
+                   ith_point * num_point_feature;
+  int coors_ind = y_ind * grid_x_size * max_points * num_point_feature +
+                  x_ind * max_points * num_point_feature +
+                  ith_point * num_point_feature;
+  #pragma unroll 
+  for (int i = 0; i < num_point_feature; ++i) {
+    dev_pillar_point_feature[pillar_ind + i] =
+        dev_pillar_point_feature_in_coors[coors_ind + i];
+  }
+
+  float coor_x = static_cast<float>(x_ind);
+  float coor_y = static_cast<float>(y_ind);
+  dev_pillar_coors[ith_pillar * 4 + 0] = 0;  // batch idx
+  dev_pillar_coors[ith_pillar * 4 + 1] = 0;  // z
+  dev_pillar_coors[ith_pillar * 4 + 2] = coor_y;
+  dev_pillar_coors[ith_pillar * 4 + 3] = coor_x;
+}
+
+
+
+__global__ void pillar_mean_kernel(
+  float* dev_points_mean, 
+  const int num_point_feature,
+  const float* dev_pillar_point_feature, 
+  const float* dev_num_points_per_pillar, 
+  int max_pillars , 
+  int max_points_per_pillar) {
+
+    extern __shared__ float temp[];
+    int ith_pillar = blockIdx.x; 
+    int ith_point  = threadIdx.x;
+    int axis = threadIdx.y;
+  
+    int reduce_size = max_points_per_pillar > 32 ? 64 : 32;
+    temp[threadIdx.x * 3 + axis] =  dev_pillar_point_feature[ith_pillar * max_points_per_pillar * num_point_feature + ith_point * num_point_feature + axis];  
+    if (threadIdx.x < reduce_size - max_points_per_pillar) {
+        temp[(threadIdx.x + max_points_per_pillar) * 3 + axis] = 0.0f; // zero-pad the unused slots
+    }
+    __syncthreads();
+    int num_points_at_this_pillar = dev_num_points_per_pillar[ith_pillar];
+
+    if (ith_point >= num_points_at_this_pillar) {
+          return;
+    }
+
+    for (unsigned int d = reduce_size >> 1; d > 0; d >>= 1) {  // tree reduction in shared memory
+        if (ith_point < d) {
+            temp[ith_point*3 +axis] += temp[(ith_point + d) * 3 + axis];
+        }
+        __syncthreads();
+    }
+
+    if (ith_point == 0) {
+        dev_points_mean[ith_pillar * 3 + axis] = temp[axis] / num_points_at_this_pillar;
+    }
+}
+
+
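+// Warp-synchronous tree reduction (offsets 8, 4, 2, 1) over blockDim.y-strided
+// slots. The volatile idiom relies on implicit intra-warp synchronization,
+// a pre-Volta assumption; on newer GPUs __syncwarp() would be safer.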
+__device__ void warpReduce(volatile float* sdata , int ith_point , int axis) {
+    sdata[ith_point * blockDim.y + axis] += sdata[(ith_point + 8) * blockDim.y + axis];
+    sdata[ith_point * blockDim.y + axis] += sdata[(ith_point + 4) * blockDim.y + axis];
+    sdata[ith_point * blockDim.y + axis] += sdata[(ith_point + 2) * blockDim.y + axis];
+    sdata[ith_point * blockDim.y + axis] += sdata[(ith_point + 1) * blockDim.y + axis];
+}
+
+
+
+
+
+__global__ void make_pillar_mean_kernel(
+  float* dev_points_mean, 
+  const int num_point_feature,
+  const float* dev_pillar_point_feature, 
+  const float* dev_num_points_per_pillar, 
+  int max_pillars , 
+  int max_points_pre_pillar) {
+    extern __shared__ float temp[];
+    unsigned int ith_pillar = blockIdx.x;  // { 0 , 1, 2, ... , 10000+}
+    unsigned int ith_point  = threadIdx.x; // { 0 , 1, 2, ...,9}
+    unsigned int axis = threadIdx.y; 
+    unsigned int idx_pre  = ith_pillar * max_points_pre_pillar * num_point_feature \
+                     + ith_point  * num_point_feature;
+    unsigned int idx_post = ith_pillar * max_points_pre_pillar * num_point_feature \
+                     + (ith_point + blockDim.x)  * num_point_feature;
+
+    temp[ith_point * blockDim.y + axis] = 0.0;
+    unsigned int num_points_at_this_pillar = dev_num_points_per_pillar[ith_pillar];
+
+    // if (ith_point < num_points_at_this_pillar / 2) {
+      temp[ith_point * blockDim.y + axis] = dev_pillar_point_feature[idx_pre  + axis] 
+                                          + dev_pillar_point_feature[idx_post + axis];
+    // }
+    __syncthreads();
+
+    // do reduction in shared mem
+    // Sequential addressing. This solves the bank conflicts as
+    // the threads now access shared memory with a stride of one
+    // 32-bit word (unsigned int) now, which does not cause bank 
+    // conflicts
+    warpReduce(temp , ith_point , axis);
+
+    // write result for this block to global mem
+    if (ith_point == 0)
+    dev_points_mean[ith_pillar * blockDim.y + axis] = temp[ith_point * blockDim.y + axis] / num_points_at_this_pillar ;
+}
+
+
+__global__ void gather_point_feature_kernel(
+  const int max_num_pillars_,const int max_num_points_per_pillar,const int num_point_feature,
+  const float min_x_range, const float min_y_range, const float min_z_range, 
+  const float pillar_x_size,  const float pillar_y_size, const float pillar_z_size,
+  const float* dev_pillar_point_feature, const float* dev_num_points_per_pillar, 
+  const float* dev_pillar_coors,
+  float* dev_points_mean, 
+  float* dev_pfe_gather_feature_){
+
+  int ith_pillar = blockIdx.x; 
+  int ith_point = threadIdx.x;
+  // int kNumPointFeature = 5;
+  int num_gather_feature = 11;
+  int num_points_at_this_pillar = dev_num_points_per_pillar[ith_pillar];
+
+  if (ith_point >= num_points_at_this_pillar){
+        return;
+    }
+
+
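+    // PFE input layout per point, matching kNumGatherPointFeature == 11:
+    //   [0..4]  raw point features (x, y, z, intensity, timestamp for nuScenes)
+    //   [5..7]  f_cluster: offset from the pillar's xyz mean
+    //   [8..10] f_center: offset from the pillar's geometric center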
+    dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 0] 
+    =  dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 0]; 
+  
+    dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 1]  
+    =  dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 1];
+  
+    dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 2]  
+    =  dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 2];
+  
+    dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 3]  
+    =  dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 3];
+
+    dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 4]  
+    =  dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 4];
+  
+    // dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 4]  =  0.0f;
+    //   f_cluster = voxel_features[:, :, :3] - points_mean
+    dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 5]  
+    =  dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 0] - dev_points_mean[ith_pillar * 3 + 0 ];
+
+    dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 6] 
+    =  dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 1] - dev_points_mean[ith_pillar * 3 + 1 ];
+  
+    dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 7]  
+    =  dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 2] - dev_points_mean[ith_pillar * 3 + 2 ];
+
+    // f_center[:, :, 0] = voxel_features[:, :, 0] - (coords[:, 3].to(voxel_features.dtype).unsqueeze(1) * self.voxel_x + self.x_offset)
+    dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 8]  
+    =  dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 0] - (dev_pillar_coors[ith_pillar * 4 + 3] * pillar_x_size + (pillar_x_size/2 + min_x_range));
+  
+    dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 9]  
+    =  dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 1] - (dev_pillar_coors[ith_pillar * 4 + 2] * pillar_y_size + (pillar_y_size/2 + min_y_range));
+  
+    dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 10] 
+    =  dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 2] - (dev_pillar_coors[ith_pillar * 4 + 1] * pillar_z_size + (pillar_z_size/2 + min_z_range));
+
+}
+
+
+
+
+PreprocessPointsCuda::PreprocessPointsCuda(
+    const int num_threads, const int max_num_pillars,
+    const int max_points_per_pillar, const int num_point_feature,
+    const int num_inds_for_scan, const int grid_x_size, const int grid_y_size,
+    const int grid_z_size, const float pillar_x_size, const float pillar_y_size,
+    const float pillar_z_size, const float min_x_range, const float min_y_range,
+    const float min_z_range)
+    : num_threads_(num_threads),
+      max_num_pillars_(max_num_pillars),
+      max_num_points_per_pillar_(max_points_per_pillar),
+      num_point_feature_(num_point_feature),
+      num_inds_for_scan_(num_inds_for_scan),
+      grid_x_size_(grid_x_size),
+      grid_y_size_(grid_y_size),
+      grid_z_size_(grid_z_size),
+      pillar_x_size_(pillar_x_size),
+      pillar_y_size_(pillar_y_size),
+      pillar_z_size_(pillar_z_size),
+      min_x_range_(min_x_range),
+      min_y_range_(min_y_range),
+      min_z_range_(min_z_range) {
+    
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_pillar_point_feature_in_coors_),
+        grid_y_size_ * grid_x_size_ * max_num_points_per_pillar_ *  num_point_feature_ * sizeof(float)));
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_pillar_count_histo_),
+        grid_y_size_ * grid_x_size_ * sizeof(int)));
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_counter_), sizeof(int)));
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_pillar_count_), sizeof(int)));    
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_points_mean_), max_num_pillars_ * 3 *sizeof(float)));  
+    }
+
+PreprocessPointsCuda::~PreprocessPointsCuda() {
+    GPU_CHECK(cudaFree(dev_pillar_point_feature_in_coors_));
+    GPU_CHECK(cudaFree(dev_pillar_count_histo_));
+    GPU_CHECK(cudaFree(dev_counter_));
+    GPU_CHECK(cudaFree(dev_pillar_count_));
+    GPU_CHECK(cudaFree(dev_points_mean_));
+  }
+
+
+void PreprocessPointsCuda::DoPreprocessPointsCuda(
+    const float* dev_points, const int in_num_points, 
+    int* dev_x_coors,int* dev_y_coors, 
+    float* dev_num_points_per_pillar,
+    float* dev_pillar_point_feature, float* dev_pillar_coors,
+    int* dev_sparse_pillar_map, int* host_pillar_count , float* dev_pfe_gather_feature) {
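+    // Pipeline: histogram points into BEV cells -> assign pillar indices and
+    // coords -> gather per-pillar point features -> per-pillar xyz mean ->
+    // build the 11-channel PFE input tensor.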
+    // reset the device buffers
+    GPU_CHECK(cudaMemset(dev_pillar_point_feature_in_coors_, 0 , grid_y_size_ * grid_x_size_ * max_num_points_per_pillar_ *  num_point_feature_ * sizeof(float)));
+    GPU_CHECK(cudaMemset(dev_pillar_count_histo_, 0 , grid_y_size_ * grid_x_size_ * sizeof(int)));
+    GPU_CHECK(cudaMemset(dev_counter_, 0, sizeof(int)));
+    GPU_CHECK(cudaMemset(dev_pillar_count_, 0, sizeof(int)));
+    GPU_CHECK(cudaMemset(dev_points_mean_, 0,  max_num_pillars_ * 3 * sizeof(float)));
+    int num_block = DIVUP(in_num_points , num_threads_);
+    make_pillar_histo_kernel<<<num_block , num_threads_>>>(
+        dev_points, dev_pillar_point_feature_in_coors_, dev_pillar_count_histo_,
+        in_num_points, max_num_points_per_pillar_, grid_x_size_, grid_y_size_,
+        grid_z_size_, min_x_range_, min_y_range_, min_z_range_, pillar_x_size_,
+        pillar_y_size_, pillar_z_size_, num_point_feature_);
+    
+    make_pillar_index_kernel<<<grid_x_size_, grid_y_size_>>>(
+        dev_pillar_count_histo_, dev_counter_, dev_pillar_count_, dev_x_coors,
+        dev_y_coors, dev_num_points_per_pillar, dev_sparse_pillar_map,
+        max_num_pillars_, max_num_points_per_pillar_, grid_x_size_,
+        num_inds_for_scan_);  
+
+    GPU_CHECK(cudaMemcpy(host_pillar_count, dev_pillar_count_, 1 * sizeof(int),
+        cudaMemcpyDeviceToHost));
+    make_pillar_feature_kernel<<<host_pillar_count[0],max_num_points_per_pillar_>>>(
+        dev_pillar_point_feature_in_coors_, dev_pillar_point_feature,
+        dev_pillar_coors, dev_x_coors, dev_y_coors, dev_num_points_per_pillar,
+        max_num_points_per_pillar_, num_point_feature_, grid_x_size_);
+    
+
+    dim3 mean_block(max_num_points_per_pillar_,3); //(32,3)
+
+    pillar_mean_kernel<<<host_pillar_count[0],mean_block,64 * 3 *sizeof(float)>>>(
+      dev_points_mean_  ,num_point_feature_, dev_pillar_point_feature, dev_num_points_per_pillar, 
+        max_num_pillars_ , max_num_points_per_pillar_);
+
+    // dim3 mean_block(10,3); // Unrolling the Last Warp
+    // make_pillar_mean_kernel<<<host_pillar_count[0], mean_block , 32 * 3 *sizeof(float)>>>(
+    //       dev_points_mean_  ,num_point_feature_, dev_pillar_point_feature, dev_num_points_per_pillar, 
+    //       max_num_pillars_ , max_num_points_per_pillar_);
+
+    gather_point_feature_kernel<<<max_num_pillars_, max_num_points_per_pillar_>>>(
+      max_num_pillars_,max_num_points_per_pillar_,num_point_feature_,
+      min_x_range_, min_y_range_, min_z_range_,
+      pillar_x_size_, pillar_y_size_, pillar_z_size_, 
+      dev_pillar_point_feature, dev_num_points_per_pillar, dev_pillar_coors,
+      dev_points_mean_,
+      dev_pfe_gather_feature);
+
+    // DEVICE_SAVE<float>(dev_pillar_point_feature , \
+    //     max_num_pillars_ * max_num_points_per_pillar_ * num_point_feature_ , "dev_pillar_point_feature");
+    // DEVICE_SAVE<float>(dev_num_points_per_pillar , \
+    //   max_num_pillars_ , "dev_num_points_per_pillar");
+    // DEVICE_SAVE<float>(dev_pfe_gather_feature , \
+    //   max_num_pillars_ * 11, "dev_pfe_gather_feature");
+}
+
+

+ 138 - 0
src/detection/detection_lidar_PointPillars_MultiHead/preprocess.h

@@ -0,0 +1,138 @@
+/******************************************************************************
+ * Copyright 2020 The Apollo Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *****************************************************************************/
+
+/*
+ * Copyright 2018-2019 Autoware Foundation. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file preprocess.h
+ * @brief GPU version of preprocess points
+ * @author Kosuke Murakami
+ * @date 2019/02/26
+ */
+
+/**
+* @author Yan haixu
+* Contact: just github.com/hova88
+* @date 2021/04/30
+*/
+
+
+#pragma once
+
+
+class PreprocessPointsCuda {
+ private:
+    // initializer list
+    const int num_threads_;
+    const int max_num_pillars_;
+    const int max_num_points_per_pillar_;
+    const int num_point_feature_;
+    const int num_inds_for_scan_;
+    const int grid_x_size_;
+    const int grid_y_size_;
+    const int grid_z_size_;
+    const float pillar_x_size_;
+    const float pillar_y_size_;
+    const float pillar_z_size_;
+    const float min_x_range_;
+    const float min_y_range_;
+    const float min_z_range_;
+    // end initializer list
+
+    float* dev_pillar_point_feature_in_coors_;
+    int* dev_pillar_count_histo_;
+
+    int* dev_counter_;
+    int* dev_pillar_count_;
+    float* dev_points_mean_;
+
+
+
+ public:
+  /**
+   * @brief Constructor
+   * @param[in] num_threads Number of threads used when launching the CUDA kernels
+   * @param[in] num_point_feature Number of features per point
+   * @param[in] num_inds_for_scan Number of indexes for the scan (cumsum)
+   * 
+   * @param[in] max_num_pillars Maximum number of pillars
+   * @param[in] max_points_per_pillar Maximum number of points per pillar
+
+   * @param[in] grid_x_size Number of pillars in x-coordinate
+   * @param[in] grid_y_size Number of pillars in y-coordinate
+   * @param[in] grid_z_size Number of pillars in z-coordinate
+   * 
+   * @param[in] pillar_x_size Size of x-dimension for a pillar
+   * @param[in] pillar_y_size Size of y-dimension for a pillar
+   * @param[in] pillar_z_size Size of z-dimension for a pillar
+   * 
+   * @param[in] min_x_range Minimum x value for point cloud
+   * @param[in] min_y_range Minimum y value for point cloud
+   * @param[in] min_z_range Minimum z value for point cloud
+   * @details These configuration values are fixed at construction and never change
+   */
+  PreprocessPointsCuda(const int num_threads, const int num_point_feature, const int num_inds_for_scan,
+                       const int max_num_pillars, const int max_points_per_pillar,  
+                       const int grid_x_size, const int grid_y_size, const int grid_z_size,  // grid size
+                       const float pillar_x_size, const float pillar_y_size, const float pillar_z_size, //voxel size
+                       const float min_x_range, const float min_y_range, const float min_z_range); // point cloud range
+  ~PreprocessPointsCuda();
+
+  /**
+   * @brief CUDA preprocessing for input point cloud
+   * @param[in] dev_points Point cloud array
+   * @param[in] in_num_points The number of points
+   * @param[mid] dev_x_coors X-coordinate indexes for corresponding pillars
+   * @param[mid] dev_y_coors Y-coordinate indexes for corresponding pillars
+   * @param[mid] dev_num_points_per_pillar
+   *   Number of points in corresponding pillars
+   * @param[mid] dev_pillar_point_feature
+   *   Values of the point features in each pillar
+   * @param[mid] dev_pillar_coors Array of coordinates for the pillars
+   * @param[mid] dev_points_mean Array used to calculate the point centers
+   * @param[out] dev_sparse_pillar_map
+   *   Grid map representation for pillar-occupancy
+   * @param[out] host_pillar_count
+   *   The number of valid pillars for an input point cloud
+   * @param[out] dev_pfe_gather_feature
+   *   11-dimensional feature for the PFE input channel
+   * @details Convert point cloud to pillar representation
+   */
+  void DoPreprocessPointsCuda(const float* dev_points,
+                              const int in_num_points,
+                              int* dev_x_coors,
+                              int* dev_y_coors,
+                              float* dev_num_points_per_pillar,
+                              float* dev_pillar_point_feature,
+                              float* dev_pillar_coors,
+                              int* dev_sparse_pillar_map,
+                              int* host_pillar_count,
+                              float* dev_pfe_gather_feature);
+};
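Editor's note: a host-side wiring sketch for this class, to make the buffer contract concrete. The constants and buffer sizes below are illustrative assumptions (a nuScenes-style configuration), not values read from this commit; real callers also zero the intermediate device buffers each frame, which is omitted here.

// Illustrative only; all constants and sizes are assumptions for this sketch.
#include <cuda_runtime.h>
#include "preprocess.h"

void run_preprocess_sketch(const float* dev_points, int in_num_points) {
  const int kNumThreads = 64, kNumPointFeature = 5, kNumIndsForScan = 1024;
  const int kMaxPillars = 30000, kMaxPointsPerPillar = 32;
  const int kGridX = 512, kGridY = 512, kGridZ = 1;

  PreprocessPointsCuda pre(kNumThreads, kNumPointFeature, kNumIndsForScan,
                           kMaxPillars, kMaxPointsPerPillar,
                           kGridX, kGridY, kGridZ,
                           0.2f, 0.2f, 8.0f,        // pillar size (m), assumed
                           -51.2f, -51.2f, -5.0f);  // range minimum, assumed

  int *dev_x_coors, *dev_y_coors, *dev_sparse_pillar_map;
  float *dev_num_points_per_pillar, *dev_pillar_point_feature;
  float *dev_pillar_coors, *dev_pfe_gather_feature;
  int host_pillar_count[1] = {0};
  cudaMalloc(&dev_x_coors, kMaxPillars * sizeof(int));
  cudaMalloc(&dev_y_coors, kMaxPillars * sizeof(int));
  cudaMalloc(&dev_num_points_per_pillar, kMaxPillars * sizeof(float));
  cudaMalloc(&dev_pillar_point_feature,
             kMaxPillars * kMaxPointsPerPillar * kNumPointFeature * sizeof(float));
  cudaMalloc(&dev_pillar_coors, kMaxPillars * 4 * sizeof(float));
  cudaMalloc(&dev_sparse_pillar_map, kNumIndsForScan * kNumIndsForScan * sizeof(int));
  cudaMalloc(&dev_pfe_gather_feature,
             kMaxPillars * kMaxPointsPerPillar * 11 * sizeof(float));

  pre.DoPreprocessPointsCuda(dev_points, in_num_points, dev_x_coors, dev_y_coors,
                             dev_num_points_per_pillar, dev_pillar_point_feature,
                             dev_pillar_coors, dev_sparse_pillar_map,
                             host_pillar_count, dev_pfe_gather_feature);
}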

+ 73 - 0
src/detection/detection_lidar_PointPillars_MultiHead/scatter.cu

@@ -0,0 +1,73 @@
+/******************************************************************************
+ * Copyright 2020 The Apollo Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *****************************************************************************/
+
+/*
+ * Copyright 2018-2019 Autoware Foundation. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @author Kosuke Murakami
+ * @date 2019/02/26
+ */
+
+/**
+* @author Yan haixu
+* Contact: just github.com/hova88
+* @date 2021/04/30
+*/
+
+
+
+//headers in local files
+#include "scatter.h"
+
+__global__ void scatter_kernel(int *x_coors, int *y_coors, float *pfe_output,
+                               float *scattered_feature, const int grid_x_size,
+                               const int grid_y_size) {
+  // One block per pillar, one thread per feature channel (PFE width is 64).
+  int i_pillar = blockIdx.x;
+  int i_feature = threadIdx.x;
+  int x_ind = x_coors[i_pillar];
+  int y_ind = y_coors[i_pillar];
+  // Write the pillar's feature into the dense (C, H, W) BEV tensor at (x, y).
+  float feature = pfe_output[i_pillar * 64 + i_feature];
+  scattered_feature[i_feature * grid_y_size * grid_x_size +
+                    y_ind * grid_x_size + x_ind] = feature;
+}
+
+ScatterCuda::ScatterCuda(const int num_threads, const int grid_x_size,
+                         const int grid_y_size)
+    : num_threads_(num_threads),
+      grid_x_size_(grid_x_size),
+      grid_y_size_(grid_y_size) {}
+
+void ScatterCuda::DoScatterCuda(const int pillar_count, int *x_coors,
+                                int *y_coors, float *pfe_output,
+                                float *scattered_feature) {
+  scatter_kernel<<<pillar_count, num_threads_>>>(x_coors, y_coors, pfe_output,
+                                                 scattered_feature,
+                                                 grid_x_size_, grid_y_size_);
+}
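Editor's note: scatter_kernel indexes pfe_output with a hard-coded stride of 64, so the num_threads value handed to the constructor must also be 64 (the PFE output width) for the launch configuration and the indexing to agree. A hedged usage sketch, with illustrative buffer names that are not part of this source:

// Illustrative only; buffer names and grid sizes are assumptions.
const int kNumFeatures = 64;   // must match the hard-coded PFE width in the kernel
const int kGridX = 512, kGridY = 512;
ScatterCuda scatter(kNumFeatures, kGridX, kGridY);

float* dev_scattered_feature;  // dense (C, H, W) BEV tensor, zeroed each frame
cudaMalloc(&dev_scattered_feature, kNumFeatures * kGridY * kGridX * sizeof(float));
cudaMemset(dev_scattered_feature, 0, kNumFeatures * kGridY * kGridX * sizeof(float));

// dev_pfe_output holds (pillar_count, 64) features; dev_x_coors / dev_y_coors
// are the pillar grid indices produced by the preprocessing stage above.
scatter.DoScatterCuda(host_pillar_count[0], dev_x_coors, dev_y_coors,
                      dev_pfe_output, dev_scattered_feature);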

+ 77 - 0
src/detection/detection_lidar_PointPillars_MultiHead/scatter.h

@@ -0,0 +1,77 @@
+/******************************************************************************
+ * Copyright 2020 The Apollo Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *****************************************************************************/
+
+/*
+ * Copyright 2018-2019 Autoware Foundation. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @author Kosuke Murakami
+ * @date 2019/02/26
+ */
+
+/**
+* @author Yan haixu
+* Contact: just github.com/hova88
+* @date 2021/04/30
+*/
+
+
+#pragma once
+
+class ScatterCuda {
+ private:
+  const int num_threads_;
+  const int grid_x_size_;
+  const int grid_y_size_;
+
+ public:
+  /**
+   * @brief Constructor
+   * @param[in] num_threads The number of threads used to launch the CUDA kernel
+   * @param[in] grid_x_size Number of pillars in x-coordinate
+   * @param[in] grid_y_size Number of pillars in y-coordinate
+   * @details These configuration values are fixed at construction and never change
+   */
+  ScatterCuda(const int num_threads, const int grid_x_size,
+              const int grid_y_size);
+
+  /**
+   * @brief Call scatter cuda kernel
+   * @param[in] pillar_count The valid number of pillars
+   * @param[in] x_coors X-coordinate indexes for corresponding pillars
+   * @param[in] y_coors Y-coordinate indexes for corresponding pillars
+   * @param[in] pfe_output Output from the Pillar Feature Extractor (PFE)
+   * @param[out] scattered_feature Grid-map representation of the pillar features
+   * @details Scatters each pillar's feature vector into the grid map at its
+   * (x, y) index
+   */
+  void DoScatterCuda(const int pillar_count, int* x_coors, int* y_coors,
+                     float* pfe_output, float* scattered_feature);
+};
+

+ 1 - 2
src/detection/detection_lidar_point_pillars_onnx/detection_lidar_point_pillars_onnx.pro

@@ -132,10 +132,9 @@ else {
 
 }
 
-
 LIBS += -lcudart -lcufft
 
-LIBS += -L/home/adc/soft/cudnn-10.2-linux-x64-v7.6.5.32/cuda/lib64 -lcudnn
+#LIBS += -L/home/adc/soft/cudnn-10.2-linux-x64-v7.6.5.32/cuda/lib64 -lcudnn
 
 INCLUDEPATH += /home/adc/soft/TensorRT-7.0.0.11.Ubuntu-18.04.x86_64-gnu.cuda-10.2.cudnn7.6/TensorRT-7.0.0.11/include