
Add detection/detection_lidar_PointPillars_MultiHead; compiles OK on AGX.

yuchuli committed 3 years ago
commit 6ccf49c9fc

+ 73 - 0
src/detection/detection_lidar_PointPillars_MultiHead/.gitignore

@@ -0,0 +1,73 @@
+# This file is used to ignore files which are generated
+# ----------------------------------------------------------------------------
+
+*~
+*.autosave
+*.a
+*.core
+*.moc
+*.o
+*.obj
+*.orig
+*.rej
+*.so
+*.so.*
+*_pch.h.cpp
+*_resource.rc
+*.qm
+.#*
+*.*#
+core
+!core/
+tags
+.DS_Store
+.directory
+*.debug
+Makefile*
+*.prl
+*.app
+moc_*.cpp
+ui_*.h
+qrc_*.cpp
+Thumbs.db
+*.res
+*.rc
+/.qmake.cache
+/.qmake.stash
+
+# qtcreator generated files
+*.pro.user*
+
+# xemacs temporary files
+*.flc
+
+# Vim temporary files
+.*.swp
+
+# Visual Studio generated files
+*.ib_pdb_index
+*.idb
+*.ilk
+*.pdb
+*.sln
+*.suo
+*.vcproj
+*vcproj.*.*.user
+*.ncb
+*.sdf
+*.opensdf
+*.vcxproj
+*vcxproj.*
+
+# MinGW generated files
+*.Debug
+*.Release
+
+# Python byte code
+*.pyc
+
+# Binaries
+# --------
+*.dll
+*.exe
+

+ 246 - 0
src/detection/detection_lidar_PointPillars_MultiHead/cfgs/cbgs_pp_multihead.yaml

@@ -0,0 +1,246 @@
+CLASS_NAMES: ['car','truck', 'construction_vehicle', 'bus', 'trailer',
+              'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone']
+
+DATA_CONFIG:
+    _BASE_CONFIG_: cfgs/dataset_configs/nuscenes_dataset.yaml
+
+    POINT_CLOUD_RANGE: [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+    DATA_PROCESSOR:
+        -   NAME: mask_points_and_boxes_outside_range
+            REMOVE_OUTSIDE_BOXES: True
+
+        -   NAME: shuffle_points
+            SHUFFLE_ENABLED: {
+                'train': True,
+                'test': True
+            }
+
+        -   NAME: transform_points_to_voxels
+            VOXEL_SIZE: [0.2, 0.2, 8.0]
+            MAX_POINTS_PER_VOXEL: 20
+            MAX_NUMBER_OF_VOXELS: {
+                'train': 30000,
+                'test': 30000
+            }
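+        # Note: pointpillars.cc (InitParams) indexes this list positionally,
+        # e.g. DATA_PROCESSOR[2] for VOXEL_SIZE, so the order of these three
+        # entries matters.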
+
+MODEL:
+    NAME: PointPillar
+
+    VFE:
+        NAME: PillarVFE
+        WITH_DISTANCE: False
+        USE_ABSLOTE_XYZ: True
+        USE_NORM: True
+        NUM_FILTERS: [64]
+
+    MAP_TO_BEV:
+        NAME: PointPillarScatter
+        NUM_BEV_FEATURES: 64
+
+    BACKBONE_2D:
+        NAME: BaseBEVBackbone
+        LAYER_NUMS: [3, 5, 5]
+        LAYER_STRIDES: [2, 2, 2]
+        NUM_FILTERS: [64, 128, 256]
+        UPSAMPLE_STRIDES: [0.5, 1, 2]
+        NUM_UPSAMPLE_FILTERS: [128, 128, 128]
+
+    DENSE_HEAD:
+        NAME: AnchorHeadMulti
+        CLASS_AGNOSTIC: False
+
+        DIR_OFFSET: 0.78539
+        DIR_LIMIT_OFFSET: 0.0
+        NUM_DIR_BINS: 2
+
+        USE_MULTIHEAD: True
+        SEPARATE_MULTIHEAD: True
+        ANCHOR_GENERATOR_CONFIG: [
+            {
+                'class_name': car,
+                'anchor_sizes': [[4.63, 1.97, 1.74]],
+                'anchor_rotations': [0, 1.57],
+                'anchor_bottom_heights': [-0.95],
+                'align_center': False,
+                'feature_map_stride': 4,
+                'matched_threshold': 0.6,
+                'unmatched_threshold': 0.45
+            },
+            {
+                'class_name': truck,
+                'anchor_sizes': [[6.93, 2.51, 2.84]],
+                'anchor_rotations': [0, 1.57],
+                'anchor_bottom_heights': [-0.6],
+                'align_center': False,
+                'feature_map_stride': 4,
+                'matched_threshold': 0.55,
+                'unmatched_threshold': 0.4
+            },
+            {
+                'class_name': construction_vehicle,
+                'anchor_sizes': [[6.37, 2.85, 3.19]],
+                'anchor_rotations': [0, 1.57],
+                'anchor_bottom_heights': [-0.225],
+                'align_center': False,
+                'feature_map_stride': 4,
+                'matched_threshold': 0.5,
+                'unmatched_threshold': 0.35
+            },
+            {
+                'class_name': bus,
+                'anchor_sizes': [[10.5, 2.94, 3.47]],
+                'anchor_rotations': [0, 1.57],
+                'anchor_bottom_heights': [-0.085],
+                'align_center': False,
+                'feature_map_stride': 4,
+                'matched_threshold': 0.55,
+                'unmatched_threshold': 0.4
+            },
+            {
+                'class_name': trailer,
+                'anchor_sizes': [[12.29, 2.90, 3.87]],
+                'anchor_rotations': [0, 1.57],
+                'anchor_bottom_heights': [0.115],
+                'align_center': False,
+                'feature_map_stride': 4,
+                'matched_threshold': 0.5,
+                'unmatched_threshold': 0.35
+            },
+            {
+                'class_name': barrier,
+                'anchor_sizes': [[0.50, 2.53, 0.98]],
+                'anchor_rotations': [0, 1.57],
+                'anchor_bottom_heights': [-1.33],
+                'align_center': False,
+                'feature_map_stride': 4,
+                'matched_threshold': 0.55,
+                'unmatched_threshold': 0.4
+            },
+            {
+                'class_name': motorcycle,
+                'anchor_sizes': [[2.11, 0.77, 1.47]],
+                'anchor_rotations': [0, 1.57],
+                'anchor_bottom_heights': [-1.085],
+                'align_center': False,
+                'feature_map_stride': 4,
+                'matched_threshold': 0.5,
+                'unmatched_threshold': 0.3
+            },
+            {
+                'class_name': bicycle,
+                'anchor_sizes': [[1.70, 0.60, 1.28]],
+                'anchor_rotations': [0, 1.57],
+                'anchor_bottom_heights': [-1.18],
+                'align_center': False,
+                'feature_map_stride': 4,
+                'matched_threshold': 0.5,
+                'unmatched_threshold': 0.35
+            },
+            {
+                'class_name': pedestrian,
+                'anchor_sizes': [[0.73, 0.67, 1.77]],
+                'anchor_rotations': [0, 1.57],
+                'anchor_bottom_heights': [-0.935],
+                'align_center': False,
+                'feature_map_stride': 4,
+                'matched_threshold': 0.6,
+                'unmatched_threshold': 0.4
+            },
+            {
+                'class_name': traffic_cone,
+                'anchor_sizes': [[0.41, 0.41, 1.07]],
+                'anchor_rotations': [0, 1.57],
+                'anchor_bottom_heights': [-1.285],
+                'align_center': False,
+                'feature_map_stride': 4,
+                'matched_threshold': 0.6,
+                'unmatched_threshold': 0.4
+            },
+        ]
+
+        SHARED_CONV_NUM_FILTER: 64
+
+        RPN_HEAD_CFGS: [
+            {
+                'HEAD_CLS_NAME': ['car'],
+            },
+            {
+                'HEAD_CLS_NAME': ['truck', 'construction_vehicle'],
+            },
+            {
+                'HEAD_CLS_NAME': ['bus', 'trailer'],
+            },
+            {
+                'HEAD_CLS_NAME': ['barrier'],
+            },
+            {
+                'HEAD_CLS_NAME': ['motorcycle', 'bicycle'],
+            },
+            {
+                'HEAD_CLS_NAME': ['pedestrian', 'traffic_cone'],
+            },
+        ]
+        SEPARATE_REG_CONFIG: 
+            NUM_MIDDLE_CONV: 1
+            NUM_MIDDLE_FILTER: 64
+            REG_LIST: ['reg:2', 'height:1', 'size:3', 'angle:2', 'velo:2']
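+            # 2 + 1 + 3 + 2 + 2 = 10 regression channels: code_size 9, plus one
+            # extra channel because the angle is encoded as (sin, cos).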
+
+        TARGET_ASSIGNER_CONFIG:
+            NAME: AxisAlignedTargetAssigner
+            POS_FRACTION: -1.0
+            SAMPLE_SIZE: 512
+            NORM_BY_NUM_EXAMPLES: False
+            MATCH_HEIGHT: False
+            BOX_CODER: ResidualCoder
+            BOX_CODER_CONFIG: {
+                'code_size': 9,
+                'encode_angle_by_sincos': True
+            }
+
+
+        LOSS_CONFIG:
+            REG_LOSS_TYPE: WeightedL1Loss
+            LOSS_WEIGHTS: {
+                'pos_cls_weight': 1.0,
+                'neg_cls_weight': 2.0,
+                'cls_weight': 1.0,
+                'loc_weight': 0.25,
+                'dir_weight': 0.2,
+                'code_weights': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2]
+            }
+
+    POST_PROCESSING:
+        RECALL_THRESH_LIST: [0.3, 0.5, 0.7]
+        SCORE_THRESH: 0.1
+        OUTPUT_RAW_SCORE: False
+
+        EVAL_METRIC: kitti
+
+        NMS_CONFIG:
+            MULTI_CLASSES_NMS: True
+            NMS_TYPE: nms_gpu
+            NMS_THRESH: 0.2
+            NMS_PRE_MAXSIZE: 1000
+            NMS_POST_MAXSIZE: 83
+
+
+OPTIMIZATION:
+    BATCH_SIZE_PER_GPU: 4
+    NUM_EPOCHS: 20
+
+    OPTIMIZER: adam_onecycle
+    LR: 0.001
+    WEIGHT_DECAY: 0.01
+    MOMENTUM: 0.9
+
+    MOMS: [0.95, 0.85]
+    PCT_START: 0.4
+    DIV_FACTOR: 10
+    DECAY_STEP_LIST: [35, 45]
+    LR_DECAY: 0.1
+    LR_CLIP: 0.0000001
+
+    LR_WARMUP: False
+    WARMUP_EPOCH: 1
+
+    GRAD_NORM_CLIP: 10

+ 141 - 0
src/detection/detection_lidar_PointPillars_MultiHead/common.h

@@ -0,0 +1,141 @@
+/******************************************************************************
+ * Copyright 2020 The Apollo Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *****************************************************************************/
+
+/*
+ * Copyright 2018-2019 Autoware Foundation. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @author Kosuke Murakami
+ * @date 2019/02/26
+ */
+
+/**
+* @author Yan haixu
+* Contact: just github.com/hova88
+* @date 2021/04/30
+*/
+
+
+
+#pragma once
+
+// headers in STL
+#include <stdio.h>
+#include <assert.h>
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <vector>
+// headers in CUDA
+#include "cuda_runtime_api.h"
+
+// using MACRO to allocate memory inside CUDA kernel
+#define NUM_3D_BOX_CORNERS_MACRO 8
+
+#define NUM_2D_BOX_CORNERS_MACRO 4
+
+#define NUM_THREADS_MACRO 64
+
+// need to be changed when num_threads_ is changed
+
+#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
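+// e.g. DIVUP(1000, 64) == 16: fifteen full blocks of 64 plus one partial block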
+
+#define GPU_CHECK(ans)                    \
+  {                                       \
+    GPUAssert((ans), __FILE__, __LINE__); \
+  }
+inline void GPUAssert(cudaError_t code, const char *file, int line,
+                      bool abort = true)
+{
+  if (code != cudaSuccess)
+  {
+    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file,
+            line);
+    if (abort)
+      exit(code);
+  }
+};
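+// Usage: wrap any CUDA runtime call that returns cudaError_t, e.g.
+//   GPU_CHECK(cudaMalloc(&ptr, bytes));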
+
+template <typename T>
+void HOST_SAVE(T *array, int size, std::string filename, std::string root = "../test/result", std::string postfix = ".txt")
+{
+  std::string filepath = root + "/" + filename + postfix;
+  if (postfix == ".bin")
+  {
+    std::fstream file(filepath, std::ios::out | std::ios::binary);
+    // write the full payload: size elements of sizeof(T) bytes each
+    file.write(reinterpret_cast<char *>(array), size * sizeof(T));
+    file.close();
+    std::cout << "|>>>|  Data has been written in " << filepath << "  |<<<|" << std::endl;
+    return;
+  }
+  else if (postfix == ".txt")
+  {
+    std::ofstream file(filepath, std::ios::out);
+    for (int i = 0; i < size; ++i)
+      file << array[i] << " ";
+    file.close();
+    std::cout << "|>>>|  Data has been written in " << filepath << "  |<<<|" << std::endl;
+    return;
+  }
+}
+
+template <typename T>
+void DEVICE_SAVE(T *array, int size, std::string filename, std::string root = "../test/result", std::string postfix = ".txt")
+{
+  T *temp_ = new T[size];
+  GPU_CHECK(cudaMemcpy(temp_, array, size * sizeof(T), cudaMemcpyDeviceToHost));
+  HOST_SAVE<T>(temp_, size, filename, root, postfix);
+  delete[] temp_;
+}
+
+
+// int TXTtoArrary( float* &points_array , string file_name , int num_feature = 4)
+// {
+//   ifstream InFile;
+//   InFile.open(file_name.data());
+//   assert(InFile.is_open());
+
+//   vector<float> temp_points;
+//   string c;
+
+//   while (!InFile.eof())
+//   {
+//       InFile >> c;
+
+//       temp_points.push_back(atof(c.c_str()));
+//   }
+//   points_array = new float[temp_points.size()];
+//   for (int i = 0 ; i < temp_points.size() ; ++i) {
+//     points_array[i] = temp_points[i];
+//   }
+
+//   InFile.close();  
+//   return temp_points.size() / num_feature;
+// };
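+
+// Example (illustrative sketch, not called anywhere in this commit): dumping a
+// device buffer for offline inspection. Assumes ../test/result exists and that
+// dev_buf holds n floats on the GPU:
+//
+//   float* dev_buf = nullptr;
+//   const int n = 1024;
+//   GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_buf), n * sizeof(float)));
+//   DEVICE_SAVE<float>(dev_buf, n, "example_feature");  // -> ../test/result/example_feature.txt
+//   GPU_CHECK(cudaFree(dev_buf));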

+ 129 - 0
src/detection/detection_lidar_PointPillars_MultiHead/detection_lidar_PointPillars_MultiHead.pro

@@ -0,0 +1,129 @@
+QT -= gui
+
+CONFIG += c++11 console
+CONFIG -= app_bundle
+
+QMAKE_CXXFLAGS += -std=gnu++17
+QMAKE_LFLAGS += -no-pie  -Wl,--no-as-needed
+
+# The following define makes your compiler emit warnings if you use
+# any feature of Qt which has been marked deprecated (the exact warnings
+# depend on your compiler). Please consult the documentation of the
+# deprecated API in order to know how to port your code away from it.
+DEFINES += QT_DEPRECATED_WARNINGS
+
+# You can also make your code fail to compile if you use deprecated APIs.
+# In order to do so, uncomment the following line.
+# You can also select to disable deprecated APIs only up to a certain version of Qt.
+#DEFINES += QT_DISABLE_DEPRECATED_BEFORE=0x060000    # disables all the APIs deprecated before Qt 6.0.0
+
+SOURCES += main.cpp \
+    pointpillars.cc
+
+DISTFILES += \
+    nms.cu \
+    postprocess.cu \
+    preprocess.cu \
+    scatter.cu
+
+HEADERS += \
+    common.h \
+    nms.h \
+    pointpillars.h \
+    postprocess.h \
+    preprocess.h \
+    scatter.h
+
+
+CUDA_SOURCES +=  \
+    nms.cu \
+    postprocess.cu \
+    preprocess.cu \
+    scatter.cu
+
+CUDA_SDK = "/usr/local/cuda/"   # CUDA SDK path
+
+CUDA_DIR = "/usr/local/cuda/"            # CUDA toolkit path
+
+SYSTEM_NAME = linux         # your system environment, e.g. 'Win32', 'x64', or 'Win64'
+
+SYSTEM_TYPE = 64           # OS word size: '32' or '64'
+
+CUDA_ARCH = sm_72         # CUDA architecture, for example 'compute_10', 'compute_11', 'sm_10'
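+# (sm_72 is the compute capability of the Jetson AGX Xavier this commit targets)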
+
+NVCC_OPTIONS = --use_fast_math --compiler-options "-fPIC"
+
+
+# include paths
+
+INCLUDEPATH += $$CUDA_DIR/include
+#INCLUDEPATH += /usr/local/cuda-10.0/targets/aarch64-linux/include/crt
+
+# library directories
+
+QMAKE_LIBDIR += $$CUDA_DIR/lib/
+
+CUDA_OBJECTS_DIR = ./
+
+# The following library conflicts with something in Cuda
+
+#QMAKE_LFLAGS_RELEASE = /NODEFAULTLIB:msvcrt.lib
+
+#QMAKE_LFLAGS_DEBUG   = /NODEFAULTLIB:msvcrtd.lib
+
+# Add the necessary libraries
+
+CUDA_LIBS =  cudart cufft
+
+# The following makes sure all path names (which often include spaces) are put between quotation marks
+
+CUDA_INC = $$join(INCLUDEPATH,'" -I"','-I"','"')
+
+NVCC_LIBS = $$join(CUDA_LIBS,' -l','-l', '')
+
+#LIBS += $$join(CUDA_LIBS,'.so ', '', '.so')
+
+# Configuration of the Cuda compiler
+
+CONFIG(debug, debug|release) {
+
+    # Debug mode
+
+    cuda_d.input = CUDA_SOURCES
+
+    cuda_d.output = $$CUDA_OBJECTS_DIR/${QMAKE_FILE_BASE}_cuda.o
+
+    cuda_d.commands = $$CUDA_DIR/bin/nvcc -D_DEBUG $$NVCC_OPTIONS $$CUDA_INC $$NVCC_LIBS --machine $$SYSTEM_TYPE -arch=$$CUDA_ARCH -c -o ${QMAKE_FILE_OUT} ${QMAKE_FILE_NAME}
+
+    cuda_d.dependency_type = TYPE_C
+
+    QMAKE_EXTRA_COMPILERS += cuda_d
+
+}
+
+else {
+
+    # Release mode
+
+    cuda.input = CUDA_SOURCES
+
+    cuda.output = $$CUDA_OBJECTS_DIR/${QMAKE_FILE_BASE}_cuda.o
+
+    cuda.commands = $$CUDA_DIR/bin/nvcc $$NVCC_OPTIONS $$CUDA_INC $$NVCC_LIBS --machine $$SYSTEM_TYPE -arch=$$CUDA_ARCH -O3 -c -o ${QMAKE_FILE_OUT} ${QMAKE_FILE_NAME}
+
+    cuda.dependency_type = TYPE_C
+
+    QMAKE_EXTRA_COMPILERS += cuda
+
+}
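+
+# For reference, the release rule above expands to roughly the following nvcc
+# invocation per .cu file (illustrative; the -I list follows INCLUDEPATH):
+#   /usr/local/cuda/bin/nvcc --use_fast_math --compiler-options "-fPIC" \
+#       -I"/usr/local/cuda/include" -lcudart -lcufft --machine 64 \
+#       -arch=sm_72 -O3 -c -o nms_cuda.o nms.cu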
+
+
+LIBS += -L/usr/local/cuda-10.2/targets/aarch64-linux/lib
+
+LIBS += -lcudart -lcufft -lyaml-cpp
+
+LIBS += -L/home/adc/soft/cudnn-10.2-linux-x64-v7.6.5.32/cuda/lib64 -lcudnn
+
+LIBS +=  -lmyelin -lnvinfer -lnvonnxparser -lnvcaffe_parser
+
+LIBS += -L/home/nvidia/git/libtorch_gpu-1.6.0-linux-aarch64/lib -ltorch_cuda  -ltorch -lc10 -ltorch_cpu

+ 8 - 0
src/detection/detection_lidar_PointPillars_MultiHead/main.cpp

@@ -0,0 +1,8 @@
+#include <QCoreApplication>
+
+int main(int argc, char *argv[])
+{
+    QCoreApplication a(argc, argv);
+
+    return a.exec();
+}

+ 422 - 0
src/detection/detection_lidar_PointPillars_MultiHead/nms.cu

@@ -0,0 +1,422 @@
+/*
+3D IoU Calculation and Rotated NMS (modified from 2D NMS written by others)
+Written by Shaoshuai Shi
+All Rights Reserved 2019-2020.
+*/
+
+/**
+* @author Yan haixu
+* Contact: just github.com/hova88
+* @date 2021/04/30
+*/
+
+#include <stdio.h>
+#include "nms.h"
+#include "common.h"
+#define THREADS_PER_BLOCK 16
+#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
+
+// #define DEBUG
+const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8;
+const float EPS = 1e-8;
+struct Point {
+    float x, y;
+    __device__ Point() {}
+    __device__ Point(double _x, double _y){
+        x = _x, y = _y;
+    }
+
+    __device__ void set(float _x, float _y){
+        x = _x; y = _y;
+    }
+
+    __device__ Point operator +(const Point &b)const{
+        return Point(x + b.x, y + b.y);
+    }
+
+    __device__ Point operator -(const Point &b)const{
+        return Point(x - b.x, y - b.y);
+    }
+};
+
+__device__ inline float cross(const Point &a, const Point &b){
+    return a.x * b.y - a.y * b.x;
+}
+
+__device__ inline float cross(const Point &p1, const Point &p2, const Point &p0){
+    return (p1.x - p0.x) * (p2.y - p0.y) - (p2.x - p0.x) * (p1.y - p0.y);
+}
+
+__device__ int check_rect_cross(const Point &p1, const Point &p2, const Point &q1, const Point &q2){
+    int ret = min(p1.x,p2.x) <= max(q1.x,q2.x)  &&
+              min(q1.x,q2.x) <= max(p1.x,p2.x) &&
+              min(p1.y,p2.y) <= max(q1.y,q2.y) &&
+              min(q1.y,q2.y) <= max(p1.y,p2.y);
+    return ret;
+}
+
+__device__ inline int check_in_box2d(const float *box, const Point &p){
+    //params: (7) [x, y, z, dx, dy, dz, heading]
+    const float MARGIN = 1e-2;
+
+    float center_x = box[0], center_y = box[1];
+    float angle_cos = cos(-box[6]), angle_sin = sin(-box[6]);  // rotate the point in the opposite direction of box
+    float rot_x = (p.x - center_x) * angle_cos + (p.y - center_y) * (-angle_sin);
+    float rot_y = (p.x - center_x) * angle_sin + (p.y - center_y) * angle_cos;
+
+    return (fabs(rot_x) < box[3] / 2 + MARGIN && fabs(rot_y) < box[4] / 2 + MARGIN);
+}
+
+__device__ inline int intersection(const Point &p1, const Point &p0, const Point &q1, const Point &q0, Point &ans){
+    // fast exclusion
+    if (check_rect_cross(p0, p1, q0, q1) == 0) return 0;
+
+    // check cross standing
+    float s1 = cross(q0, p1, p0);
+    float s2 = cross(p1, q1, p0);
+    float s3 = cross(p0, q1, q0);
+    float s4 = cross(q1, p1, q0);
+
+    if (!(s1 * s2 > 0 && s3 * s4 > 0)) return 0;
+
+    // calculate intersection of two lines
+    float s5 = cross(q1, p1, p0);
+    if(fabs(s5 - s1) > EPS){
+        ans.x = (s5 * q0.x - s1 * q1.x) / (s5 - s1);
+        ans.y = (s5 * q0.y - s1 * q1.y) / (s5 - s1);
+
+    }
+    else{
+        float a0 = p0.y - p1.y, b0 = p1.x - p0.x, c0 = p0.x * p1.y - p1.x * p0.y;
+        float a1 = q0.y - q1.y, b1 = q1.x - q0.x, c1 = q0.x * q1.y - q1.x * q0.y;
+        float D = a0 * b1 - a1 * b0;
+
+        ans.x = (b0 * c1 - b1 * c0) / D;
+        ans.y = (a1 * c0 - a0 * c1) / D;
+    }
+
+    return 1;
+}
+
+__device__ inline void rotate_around_center(const Point &center, const float angle_cos, const float angle_sin, Point &p){
+    float new_x = (p.x - center.x) * angle_cos + (p.y - center.y) * (-angle_sin) + center.x;
+    float new_y = (p.x - center.x) * angle_sin + (p.y - center.y) * angle_cos + center.y;
+    p.set(new_x, new_y);
+}
+
+__device__ inline int point_cmp(const Point &a, const Point &b, const Point &center){
+    return atan2(a.y - center.y, a.x - center.x) > atan2(b.y - center.y, b.x - center.x);
+}
+
+__device__ inline float box_overlap(const float *box_a, const float *box_b){
+    // params box_a: [x, y, z, dx, dy, dz, heading]
+    // params box_b: [x, y, z, dx, dy, dz, heading]
+
+    float a_angle = box_a[6], b_angle = box_b[6];
+    float a_dx_half = box_a[3] / 2, b_dx_half = box_b[3] / 2, a_dy_half = box_a[4] / 2, b_dy_half = box_b[4] / 2;
+    float a_x1 = box_a[0] - a_dx_half, a_y1 = box_a[1] - a_dy_half;
+    float a_x2 = box_a[0] + a_dx_half, a_y2 = box_a[1] + a_dy_half;
+    float b_x1 = box_b[0] - b_dx_half, b_y1 = box_b[1] - b_dy_half;
+    float b_x2 = box_b[0] + b_dx_half, b_y2 = box_b[1] + b_dy_half;
+
+    Point center_a(box_a[0], box_a[1]);
+    Point center_b(box_b[0], box_b[1]);
+
+#ifdef DEBUG
+    printf("a: (%.3f, %.3f, %.3f, %.3f, %.3f), b: (%.3f, %.3f, %.3f, %.3f, %.3f)\n", a_x1, a_y1, a_x2, a_y2, a_angle,
+           b_x1, b_y1, b_x2, b_y2, b_angle);
+    printf("center a: (%.3f, %.3f), b: (%.3f, %.3f)\n", center_a.x, center_a.y, center_b.x, center_b.y);
+#endif
+
+    Point box_a_corners[5];
+    box_a_corners[0].set(a_x1, a_y1);
+    box_a_corners[1].set(a_x2, a_y1);
+    box_a_corners[2].set(a_x2, a_y2);
+    box_a_corners[3].set(a_x1, a_y2);
+
+    Point box_b_corners[5];
+    box_b_corners[0].set(b_x1, b_y1);
+    box_b_corners[1].set(b_x2, b_y1);
+    box_b_corners[2].set(b_x2, b_y2);
+    box_b_corners[3].set(b_x1, b_y2);
+
+    // get oriented corners
+    float a_angle_cos = cos(a_angle), a_angle_sin = sin(a_angle);
+    float b_angle_cos = cos(b_angle), b_angle_sin = sin(b_angle);
+
+    for (int k = 0; k < 4; k++){
+#ifdef DEBUG
+        printf("before corner %d: a(%.3f, %.3f), b(%.3f, %.3f) \n", k, box_a_corners[k].x, box_a_corners[k].y, box_b_corners[k].x, box_b_corners[k].y);
+#endif
+        rotate_around_center(center_a, a_angle_cos, a_angle_sin, box_a_corners[k]);
+        rotate_around_center(center_b, b_angle_cos, b_angle_sin, box_b_corners[k]);
+#ifdef DEBUG
+        printf("corner %d: a(%.3f, %.3f), b(%.3f, %.3f) \n", k, box_a_corners[k].x, box_a_corners[k].y, box_b_corners[k].x, box_b_corners[k].y);
+#endif
+    }
+
+    box_a_corners[4] = box_a_corners[0];
+    box_b_corners[4] = box_b_corners[0];
+
+    // get intersection of lines
+    Point cross_points[16];
+    Point poly_center;
+    int cnt = 0, flag = 0;
+
+    poly_center.set(0, 0);
+    for (int i = 0; i < 4; i++){
+        for (int j = 0; j < 4; j++){
+            flag = intersection(box_a_corners[i + 1], box_a_corners[i], box_b_corners[j + 1], box_b_corners[j], cross_points[cnt]);
+            if (flag){
+                poly_center = poly_center + cross_points[cnt];
+                cnt++;
+#ifdef DEBUG
+                printf("Cross points (%.3f, %.3f): a(%.3f, %.3f)->(%.3f, %.3f), b(%.3f, %.3f)->(%.3f, %.3f) \n",
+                    cross_points[cnt - 1].x, cross_points[cnt - 1].y,
+                    box_a_corners[i].x, box_a_corners[i].y, box_a_corners[i + 1].x, box_a_corners[i + 1].y,
+                    box_b_corners[j].x, box_b_corners[j].y, box_b_corners[j + 1].x, box_b_corners[j + 1].y);
+#endif
+            }
+        }
+    }
+
+    // check corners
+    for (int k = 0; k < 4; k++){
+        if (check_in_box2d(box_a, box_b_corners[k])){
+            poly_center = poly_center + box_b_corners[k];
+            cross_points[cnt] = box_b_corners[k];
+            cnt++;
+#ifdef DEBUG
+                printf("b corners in a: corner_b(%.3f, %.3f)", cross_points[cnt - 1].x, cross_points[cnt - 1].y);
+#endif
+        }
+        if (check_in_box2d(box_b, box_a_corners[k])){
+            poly_center = poly_center + box_a_corners[k];
+            cross_points[cnt] = box_a_corners[k];
+            cnt++;
+#ifdef DEBUG
+                printf("a corners in b: corner_a(%.3f, %.3f)", cross_points[cnt - 1].x, cross_points[cnt - 1].y);
+#endif
+        }
+    }
+
+    poly_center.x /= cnt;
+    poly_center.y /= cnt;
+
+    // sort the points of polygon
+    Point temp;
+    for (int j = 0; j < cnt - 1; j++){
+        for (int i = 0; i < cnt - j - 1; i++){
+            if (point_cmp(cross_points[i], cross_points[i + 1], poly_center)){
+                temp = cross_points[i];
+                cross_points[i] = cross_points[i + 1];
+                cross_points[i + 1] = temp;
+            }
+        }
+    }
+
+#ifdef DEBUG
+    printf("cnt=%d\n", cnt);
+    for (int i = 0; i < cnt; i++){
+        printf("All cross point %d: (%.3f, %.3f)\n", i, cross_points[i].x, cross_points[i].y);
+    }
+#endif
+
+    // get the overlap areas
+    float area = 0;
+    for (int k = 0; k < cnt - 1; k++){
+        area += cross(cross_points[k] - cross_points[0], cross_points[k + 1] - cross_points[0]);
+    }
+
+    return fabs(area) / 2.0;
+}
+
+__device__ inline float iou_bev(const float *box_a, const float *box_b){
+    // params box_a: [x, y, z, dx, dy, dz, heading]
+    // params box_b: [x, y, z, dx, dy, dz, heading]
+    float sa = box_a[3] * box_a[4];
+    float sb = box_b[3] * box_b[4];
+    float s_overlap = box_overlap(box_a, box_b);
+    return s_overlap / fmaxf(sa + sb - s_overlap, EPS);
+}
+
+__global__ void boxes_overlap_kernel(const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_overlap){
+    // params boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading]
+    // params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading]
+    const int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y;
+    const int b_idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
+
+    if (a_idx >= num_a || b_idx >= num_b){
+        return;
+    }
+    const float * cur_box_a = boxes_a + a_idx * 7;
+    const float * cur_box_b = boxes_b + b_idx * 7;
+    float s_overlap = box_overlap(cur_box_a, cur_box_b);
+    ans_overlap[a_idx * num_b + b_idx] = s_overlap;
+}
+
+__global__ void boxes_iou_bev_kernel(const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_iou){
+    // params boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading]
+    // params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading]
+    const int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y;
+    const int b_idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
+
+    if (a_idx >= num_a || b_idx >= num_b){
+        return;
+    }
+
+    const float * cur_box_a = boxes_a + a_idx * 7;
+    const float * cur_box_b = boxes_b + b_idx * 7;
+    float cur_iou_bev = iou_bev(cur_box_a, cur_box_b);
+    ans_iou[a_idx * num_b + b_idx] = cur_iou_bev;
+}
+
+__global__ void nms_kernel(const int boxes_num, const float nms_overlap_thresh,
+                           const float *boxes, unsigned long long *mask){
+    //params: boxes (N, 7) [x, y, z, dx, dy, dz, heading]
+    //params: mask (N, N/THREADS_PER_BLOCK_NMS)
+
+    const int row_start = blockIdx.y;
+    const int col_start = blockIdx.x;
+
+    // if (row_start > col_start) return;
+
+    const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS);
+    const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS);
+
+    __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 7];
+
+    if (threadIdx.x < col_size) {
+        block_boxes[threadIdx.x * 7 + 0] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 0];
+        block_boxes[threadIdx.x * 7 + 1] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 1];
+        block_boxes[threadIdx.x * 7 + 2] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 2];
+        block_boxes[threadIdx.x * 7 + 3] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 3];
+        block_boxes[threadIdx.x * 7 + 4] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 4];
+        block_boxes[threadIdx.x * 7 + 5] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 5];
+        block_boxes[threadIdx.x * 7 + 6] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 6];
+    }
+    __syncthreads();
+
+    if (threadIdx.x < row_size) {
+        const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;
+        const float *cur_box = boxes + cur_box_idx * 7;
+
+        int i = 0;
+        unsigned long long t = 0;
+        int start = 0;
+        if (row_start == col_start) {
+          start = threadIdx.x + 1;
+        }
+        for (i = start; i < col_size; i++) {
+            if (iou_bev(cur_box, block_boxes + i * 7) > nms_overlap_thresh){
+                t |= 1ULL << i;
+            }
+        }
+        const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
+        mask[cur_box_idx * col_blocks + col_start] = t;
+    }
+}
+
+
+__device__ inline float iou_normal(float const * const a, float const * const b) {
+    //params: a: [x, y, z, dx, dy, dz, heading]
+    //params: b: [x, y, z, dx, dy, dz, heading]
+
+    float left = fmaxf(a[0] - a[3] / 2, b[0] - b[3] / 2), right = fminf(a[0] + a[3] / 2, b[0] + b[3] / 2);
+    float top = fmaxf(a[1] - a[4] / 2, b[1] - b[4] / 2), bottom = fminf(a[1] + a[4] / 2, b[1] + b[4] / 2);
+    float width = fmaxf(right - left, 0.f), height = fmaxf(bottom - top, 0.f);
+    float interS = width * height;
+    float Sa = a[3] * a[4];
+    float Sb = b[3] * b[4];
+    return interS / fmaxf(Sa + Sb - interS, EPS);
+}
+
+
+__global__ void nms_normal_kernel(const int boxes_num, const float nms_overlap_thresh,
+                           const float *boxes, unsigned long long *mask){
+    //params: boxes (N, 7) [x, y, z, dx, dy, dz, heading]
+    //params: mask (N, N/THREADS_PER_BLOCK_NMS)
+
+    const int row_start = blockIdx.y;
+    const int col_start = blockIdx.x;
+
+    // if (row_start > col_start) return;
+
+    const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS);
+    const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS);
+
+    __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 7];
+
+    if (threadIdx.x < col_size) {
+        block_boxes[threadIdx.x * 7 + 0] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 0];
+        block_boxes[threadIdx.x * 7 + 1] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 1];
+        block_boxes[threadIdx.x * 7 + 2] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 2];
+        block_boxes[threadIdx.x * 7 + 3] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 3];
+        block_boxes[threadIdx.x * 7 + 4] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 4];
+        block_boxes[threadIdx.x * 7 + 5] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 5];
+        block_boxes[threadIdx.x * 7 + 6] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 6];
+    }
+    __syncthreads();
+
+    if (threadIdx.x < row_size) {
+        const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;
+        const float *cur_box = boxes + cur_box_idx * 7;
+
+        int i = 0;
+        unsigned long long t = 0;
+        int start = 0;
+        if (row_start == col_start) {
+          start = threadIdx.x + 1;
+        }
+        for (i = start; i < col_size; i++) {
+            if (iou_normal(cur_box, block_boxes + i * 7) > nms_overlap_thresh){
+                t |= 1ULL << i;
+            }
+        }
+        const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
+        mask[cur_box_idx * col_blocks + col_start] = t;
+    }
+}
+
+
+NmsCuda::NmsCuda(const int num_threads, const int num_box_corners,
+    const float nms_overlap_threshold)
+: num_threads_(num_threads),
+  num_box_corners_(num_box_corners),
+  nms_overlap_threshold_(nms_overlap_threshold) {}
+
+void NmsCuda::DoNmsCuda(const int host_filter_count,
+    float *dev_sorted_box_for_nms, long *out_keep_inds,
+    int *out_num_to_keep) {
+
+    const int col_blocks = DIVUP(host_filter_count, num_threads_);
+    unsigned long long *dev_mask = NULL;
+    GPU_CHECK(cudaMalloc(&dev_mask, host_filter_count * col_blocks * sizeof(unsigned long long)));    
+
+    dim3 blocks(DIVUP(host_filter_count, num_threads_),
+                DIVUP(host_filter_count, num_threads_));
+    dim3 threads(num_threads_);
+
+    nms_kernel<<<blocks, threads>>>(host_filter_count, nms_overlap_threshold_, dev_sorted_box_for_nms, dev_mask);
+    // postprocess for nms output
+    std::vector<unsigned long long> host_mask(host_filter_count * col_blocks);
+    GPU_CHECK(cudaMemcpy(&host_mask[0], dev_mask,
+            sizeof(unsigned long long) * host_filter_count * col_blocks,
+            cudaMemcpyDeviceToHost));
+    std::vector<unsigned long long> remv(col_blocks);
+    memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
+
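+    // Unpack the suppression bitmask, walking boxes in score order and keeping
+    // a box unless a previously kept box suppressed it. Layout example: with
+    // 64-bit words and host_filter_count = 150, col_blocks = 3, and bit j of
+    // host_mask[i * 3 + b] set means box (b * 64 + j) overlaps box i above
+    // nms_overlap_threshold_.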
+    for (int i = 0; i < host_filter_count; ++i) {
+        int nblock = i / num_threads_;
+        int inblock = i % num_threads_;
+
+        if (!(remv[nblock] & (1ULL << inblock))) {
+            out_keep_inds[(*out_num_to_keep)++] = i;
+            unsigned long long *p = &host_mask[0] + i * col_blocks;
+            for (int j = nblock; j < col_blocks; ++j) {
+                remv[j] |= p[j];
+            }
+        }
+    }
+    GPU_CHECK(cudaFree(dev_mask));
+}

+ 64 - 0
src/detection/detection_lidar_PointPillars_MultiHead/nms.h

@@ -0,0 +1,64 @@
+
+/*
+3D IoU Calculation and Rotated NMS (modified from 2D NMS written by others)
+Written by Shaoshuai Shi
+All Rights Reserved 2019-2020.
+*/
+
+/*
+ * Copyright 2018-2019 Autoware Foundation. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @author Kosuke Murakami
+ * @date 2019/02/26
+ */
+
+/**
+* @author Yan haixu
+* Contact: just github.com/hova88
+* @date 2021/04/30
+*/
+
+
+class NmsCuda {
+ private:
+  const int num_threads_;
+  const int num_box_corners_;
+  const float nms_overlap_threshold_;
+
+ public:
+  /**
+   * @brief Constructor
+   * @param[in] num_threads Number of threads when launching cuda kernel
+   * @param[in] num_box_corners Number of corners for 2D box
+   * @param[in] nms_overlap_threshold IOU threshold for NMS
+   * @details Capital variables never change after compile time; non-capital
+   * variables can be changed through rosparam
+   */
+  NmsCuda(const int num_threads, const int num_box_corners,
+          const float nms_overlap_threshold);
+
+  /**
+   * @brief GPU Non-Maximum Suppression for network output
+   * @param[in] host_filter_count Number of filtered output
+   * @param[in] dev_sorted_box_for_nms Bounding box output sorted by score
+   * @param[out] out_keep_inds Indexes of selected bounding box
+   * @param[out] out_num_to_keep Number of kept bounding boxes
+   * @details NMS on the GPU and post-processing to select boxes on the CPU
+   */
+  void DoNmsCuda(const int host_filter_count, float* dev_sorted_box_for_nms,
+                 long* out_keep_inds, int* out_num_to_keep);
+};
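+
+// Example (illustrative sketch): typical call pattern, assuming dev_boxes is a
+// device buffer of `count` candidate boxes (7 floats each) already sorted by
+// score:
+//
+//   NmsCuda nms(/*num_threads=*/64, /*num_box_corners=*/4,
+//               /*nms_overlap_threshold=*/0.2f);
+//   std::vector<long> keep_inds(count, 0);
+//   int num_to_keep = 0;
+//   nms.DoNmsCuda(count, dev_boxes, keep_inds.data(), &num_to_keep);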

+ 495 - 0
src/detection/detection_lidar_PointPillars_MultiHead/pointpillars.cc

@@ -0,0 +1,495 @@
+/******************************************************************************
+ * Copyright 2020 The Apollo Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *****************************************************************************/
+
+/*
+ * Copyright 2018-2019 Autoware Foundation. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @author Kosuke Murakami
+ * @date 2019/02/26
+ */
+
+/**
+* @author Yan haixu
+* Contact: just github.com/hova88
+* @date 2021/04/30
+*/
+
+
+
+
+#include "pointpillars.h"
+
+#include <chrono>
+#include <iostream>
+
+PointPillars::PointPillars(const float score_threshold,
+                           const float nms_overlap_threshold,
+                           const bool use_onnx,
+                           const std::string pfe_file,
+                           const std::string backbone_file,
+                           const std::string pp_config)
+    : score_threshold_(score_threshold),
+      nms_overlap_threshold_(nms_overlap_threshold),
+      use_onnx_(use_onnx),
+      pfe_file_(pfe_file),
+      backbone_file_(backbone_file),
+      pp_config_(pp_config)
+{
+    InitParams();
+    InitTRT(use_onnx_);
+    DeviceMemoryMalloc();
+
+    preprocess_points_cuda_ptr_.reset(new PreprocessPointsCuda(
+        kNumThreads,
+        kMaxNumPillars,
+        kMaxNumPointsPerPillar,
+        kNumPointFeature,
+        kNumIndsForScan,
+        kGridXSize,kGridYSize, kGridZSize,
+        kPillarXSize,kPillarYSize, kPillarZSize,
+        kMinXRange, kMinYRange, kMinZRange));
+
+    scatter_cuda_ptr_.reset(new ScatterCuda(kNumThreads, kGridXSize, kGridYSize));
+
+    const float float_min = std::numeric_limits<float>::lowest();
+    const float float_max = std::numeric_limits<float>::max();
+    postprocess_cuda_ptr_.reset(
+      new PostprocessCuda(kNumThreads,
+                          float_min, float_max, 
+                          kNumClass,kNumAnchorPerCls,
+                          kMultiheadLabelMapping,
+                          score_threshold_, 
+                          nms_overlap_threshold_,
+                          kNmsPreMaxsize, 
+                          kNmsPostMaxsize,
+                          kNumBoxCorners, 
+                          kNumInputBoxFeature,
+                          7));  /*kNumOutputBoxFeature*/
+    
+}
+
+PointPillars::~PointPillars() {
+    // for pillars 
+    GPU_CHECK(cudaFree(dev_num_points_per_pillar_));
+    GPU_CHECK(cudaFree(dev_x_coors_));
+    GPU_CHECK(cudaFree(dev_y_coors_));
+    GPU_CHECK(cudaFree(dev_pillar_point_feature_));
+    GPU_CHECK(cudaFree(dev_pillar_coors_));
+    // for sparse map
+    GPU_CHECK(cudaFree(dev_sparse_pillar_map_));    
+    GPU_CHECK(cudaFree(dev_cumsum_along_x_));
+    GPU_CHECK(cudaFree(dev_cumsum_along_y_));
+    // for pfe forward
+    GPU_CHECK(cudaFree(dev_pfe_gather_feature_));
+      
+    GPU_CHECK(cudaFree(pfe_buffers_[0]));
+    GPU_CHECK(cudaFree(pfe_buffers_[1]));
+
+    GPU_CHECK(cudaFree(rpn_buffers_[0]));
+    GPU_CHECK(cudaFree(rpn_buffers_[1]));
+    GPU_CHECK(cudaFree(rpn_buffers_[2]));
+    GPU_CHECK(cudaFree(rpn_buffers_[3]));
+    GPU_CHECK(cudaFree(rpn_buffers_[4]));
+    GPU_CHECK(cudaFree(rpn_buffers_[5]));
+    GPU_CHECK(cudaFree(rpn_buffers_[6]));
+    GPU_CHECK(cudaFree(rpn_buffers_[7]));
+    pfe_context_->destroy();
+    backbone_context_->destroy();
+    pfe_engine_->destroy();
+    backbone_engine_->destroy();
+    // for post process
+    GPU_CHECK(cudaFree(dev_scattered_feature_));
+    GPU_CHECK(cudaFree(dev_filtered_box_));
+    GPU_CHECK(cudaFree(dev_filtered_score_));
+    GPU_CHECK(cudaFree(dev_filtered_label_));
+    GPU_CHECK(cudaFree(dev_filtered_dir_));
+    GPU_CHECK(cudaFree(dev_box_for_nms_));
+    GPU_CHECK(cudaFree(dev_filter_count_));
+
+
+}
+
+void PointPillars::InitParams()
+{
+    YAML::Node params = YAML::LoadFile(pp_config_);
+    kPillarXSize = params["DATA_CONFIG"]["DATA_PROCESSOR"][2]["VOXEL_SIZE"][0].as<float>();
+    kPillarYSize = params["DATA_CONFIG"]["DATA_PROCESSOR"][2]["VOXEL_SIZE"][1].as<float>();
+    kPillarZSize = params["DATA_CONFIG"]["DATA_PROCESSOR"][2]["VOXEL_SIZE"][2].as<float>();
+    kMinXRange = params["DATA_CONFIG"]["POINT_CLOUD_RANGE"][0].as<float>();
+    kMinYRange = params["DATA_CONFIG"]["POINT_CLOUD_RANGE"][1].as<float>();
+    kMinZRange = params["DATA_CONFIG"]["POINT_CLOUD_RANGE"][2].as<float>();
+    kMaxXRange = params["DATA_CONFIG"]["POINT_CLOUD_RANGE"][3].as<float>();
+    kMaxYRange = params["DATA_CONFIG"]["POINT_CLOUD_RANGE"][4].as<float>();
+    kMaxZRange = params["DATA_CONFIG"]["POINT_CLOUD_RANGE"][5].as<float>();
+    kNumClass = params["CLASS_NAMES"].size();
+    kMaxNumPillars = params["DATA_CONFIG"]["DATA_PROCESSOR"][2]["MAX_NUMBER_OF_VOXELS"]["test"].as<int>();
+    kMaxNumPointsPerPillar = params["DATA_CONFIG"]["DATA_PROCESSOR"][2]["MAX_POINTS_PER_VOXEL"].as<int>();
+    kNumPointFeature = 5; // [x, y, z, intensity, 0]
+    kNumInputBoxFeature = 7;
+    kNumOutputBoxFeature = params["MODEL"]["DENSE_HEAD"]["TARGET_ASSIGNER_CONFIG"]["BOX_CODER_CONFIG"]["code_size"].as<int>();
+    kBatchSize = 1;
+    kNumIndsForScan = 1024;
+    kNumThreads = 64;
+    kNumBoxCorners = 8;
+    kAnchorStrides = 4;
+    kNmsPreMaxsize = params["MODEL"]["POST_PROCESSING"]["NMS_CONFIG"]["NMS_PRE_MAXSIZE"].as<int>();
+    kNmsPostMaxsize = params["MODEL"]["POST_PROCESSING"]["NMS_CONFIG"]["NMS_POST_MAXSIZE"].as<int>();
+    //params for initialize anchors
+    //Adapt to OpenPCDet
+    kAnchorNames = params["CLASS_NAMES"].as<std::vector<std::string>>();
+    for (int i = 0; i < kAnchorNames.size(); ++i)
+    {
+        kAnchorDxSizes.emplace_back(params["MODEL"]["DENSE_HEAD"]["ANCHOR_GENERATOR_CONFIG"][i]["anchor_sizes"][0][0].as<float>());
+        kAnchorDySizes.emplace_back(params["MODEL"]["DENSE_HEAD"]["ANCHOR_GENERATOR_CONFIG"][i]["anchor_sizes"][0][1].as<float>());
+        kAnchorDzSizes.emplace_back(params["MODEL"]["DENSE_HEAD"]["ANCHOR_GENERATOR_CONFIG"][i]["anchor_sizes"][0][2].as<float>());
+        kAnchorBottom.emplace_back(params["MODEL"]["DENSE_HEAD"]["ANCHOR_GENERATOR_CONFIG"][i]["anchor_bottom_heights"][0].as<float>());
+    }
+    int cls_offset = 0;
+    for (int idx_head = 0; idx_head < params["MODEL"]["DENSE_HEAD"]["RPN_HEAD_CFGS"].size(); ++idx_head)
+    {
+        int num_cls_per_head = params["MODEL"]["DENSE_HEAD"]["RPN_HEAD_CFGS"][idx_head]["HEAD_CLS_NAME"].size();
+        // map each head to a contiguous range of global class indices,
+        // e.g. head sizes [1,2,2,1,2,2] -> [[0],[1,2],[3,4],[5],[6,7],[8,9]]
+        std::vector<int> value;
+        for (int i = 0; i < num_cls_per_head; ++i)
+        {
+            value.emplace_back(cls_offset + i);
+        }
+        cls_offset += num_cls_per_head;
+        kMultiheadLabelMapping.emplace_back(value);
+    }
+
+    // Generate secondary parameters based on above.
+    kGridXSize = static_cast<int>((kMaxXRange - kMinXRange) / kPillarXSize); //512
+    kGridYSize = static_cast<int>((kMaxYRange - kMinYRange) / kPillarYSize); //512
+    kGridZSize = static_cast<int>((kMaxZRange - kMinZRange) / kPillarZSize); //1
+    kRpnInputSize = 64 * kGridYSize * kGridXSize;
+
+    kNumAnchorXinds = static_cast<int>(kGridXSize / kAnchorStrides); //Width
+    kNumAnchorYinds = static_cast<int>(kGridYSize / kAnchorStrides); //Height
+    kNumAnchor = kNumAnchorXinds * kNumAnchorYinds * 2 * kNumClass;  // H * W * Ro * N = 327680
+
+    kNumAnchorPerCls = kNumAnchorXinds * kNumAnchorYinds * 2; //H * W * Ro = 32768
+    kRpnBoxOutputSize = kNumAnchor * kNumOutputBoxFeature;
+    kRpnClsOutputSize = kNumAnchor * kNumClass;
+    kRpnDirOutputSize = kNumAnchor * 2;
+}
+
+void PointPillars::DeviceMemoryMalloc() {
+    // for pillars 
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_num_points_per_pillar_), kMaxNumPillars * sizeof(float))); // M
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_x_coors_), kMaxNumPillars * sizeof(int))); // M
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_y_coors_), kMaxNumPillars * sizeof(int))); // M
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_pillar_point_feature_), kMaxNumPillars * kMaxNumPointsPerPillar * kNumPointFeature * sizeof(float))); // [M , m , 4]
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_pillar_coors_),  kMaxNumPillars * 4 * sizeof(float))); // [M , 4]
+    // for sparse map
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_sparse_pillar_map_), kNumIndsForScan * kNumIndsForScan * sizeof(int))); // [1024 , 1024]
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_cumsum_along_x_), kNumIndsForScan * kNumIndsForScan * sizeof(int))); // [1024 , 1024]
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_cumsum_along_y_), kNumIndsForScan * kNumIndsForScan * sizeof(int)));// [1024 , 1024]
+
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_pfe_gather_feature_),
+                        kMaxNumPillars * kMaxNumPointsPerPillar *
+                            kNumGatherPointFeature * sizeof(float)));
+    // for trt inference
+    // create GPU buffers and a stream
+
+    GPU_CHECK(
+        cudaMalloc(&pfe_buffers_[0], kMaxNumPillars * kMaxNumPointsPerPillar *
+                                        kNumGatherPointFeature * sizeof(float)));
+    GPU_CHECK(cudaMalloc(&pfe_buffers_[1], kMaxNumPillars * 64 * sizeof(float)));
+
+    GPU_CHECK(cudaMalloc(&rpn_buffers_[0],  kRpnInputSize * sizeof(float)));
+
+    GPU_CHECK(cudaMalloc(&rpn_buffers_[1],  kNumAnchorPerCls  * sizeof(float)));  //classes
+    GPU_CHECK(cudaMalloc(&rpn_buffers_[2],  kNumAnchorPerCls  * 2 * 2 * sizeof(float)));
+    GPU_CHECK(cudaMalloc(&rpn_buffers_[3],  kNumAnchorPerCls  * 2 * 2 * sizeof(float)));
+    GPU_CHECK(cudaMalloc(&rpn_buffers_[4],  kNumAnchorPerCls  * sizeof(float)));
+    GPU_CHECK(cudaMalloc(&rpn_buffers_[5],  kNumAnchorPerCls  * 2 * 2 * sizeof(float)));
+    GPU_CHECK(cudaMalloc(&rpn_buffers_[6],  kNumAnchorPerCls  * 2 * 2 * sizeof(float)));
+    
+    GPU_CHECK(cudaMalloc(&rpn_buffers_[7],  kNumAnchorPerCls * kNumClass * kNumOutputBoxFeature * sizeof(float))); //boxes
+
+    // for scatter kernel
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_scattered_feature_),
+                        kNumThreads * kGridYSize * kGridXSize * sizeof(float)));
+    // for filter
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_filtered_box_),
+                        kNumAnchor * kNumOutputBoxFeature * sizeof(float)));
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_filtered_score_),
+                        kNumAnchor * sizeof(float)));
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_filtered_label_),
+                        kNumAnchor * sizeof(int)));
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_filtered_dir_),
+                        kNumAnchor * sizeof(int)));
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_box_for_nms_),
+                        kNumAnchor * kNumBoxCorners * sizeof(float)));
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_filter_count_), kNumClass * sizeof(int)));
+
+}
+
+void PointPillars::SetDeviceMemoryToZero() {
+
+    GPU_CHECK(cudaMemset(dev_num_points_per_pillar_, 0, kMaxNumPillars * sizeof(float)));
+    GPU_CHECK(cudaMemset(dev_x_coors_,               0, kMaxNumPillars * sizeof(int)));
+    GPU_CHECK(cudaMemset(dev_y_coors_,               0, kMaxNumPillars * sizeof(int)));
+    GPU_CHECK(cudaMemset(dev_pillar_point_feature_,  0, kMaxNumPillars * kMaxNumPointsPerPillar * kNumPointFeature * sizeof(float)));
+    GPU_CHECK(cudaMemset(dev_pillar_coors_,          0, kMaxNumPillars * 4 * sizeof(float)));
+    // GPU_CHECK(cudaMemset(dev_sparse_pillar_map_,     0, kNumIndsForScan * kNumIndsForScan * sizeof(int)));
+    GPU_CHECK(cudaMemset(dev_pfe_gather_feature_,    0, kMaxNumPillars * kMaxNumPointsPerPillar * kNumGatherPointFeature * sizeof(float)));
+    
+    // GPU_CHECK(cudaMemset(pfe_buffers_[0],       0, kMaxNumPillars * kMaxNumPointsPerPillar * kNumGatherPointFeature * sizeof(float)));
+    // GPU_CHECK(cudaMemset(pfe_buffers_[1],       0, kMaxNumPillars * 64 * sizeof(float)));
+
+    GPU_CHECK(cudaMemset(dev_scattered_feature_,    0, kNumThreads * kGridYSize * kGridXSize * sizeof(float)));
+    
+    // GPU_CHECK(cudaMemset(rpn_buffers_[0],    0, kRpnInputSize * sizeof(float)));
+    // GPU_CHECK(cudaMemset(rpn_buffers_[1],    0, kNumAnchorPerCls * kNumOutputBoxFeature * sizeof(float)));
+    // GPU_CHECK(cudaMemset(rpn_buffers_[2],    0, kNumAnchorPerCls     * sizeof(float)));
+    // GPU_CHECK(cudaMemset(rpn_buffers_[3],    0, kNumAnchorPerCls * 2 * kNumOutputBoxFeature * sizeof(float)));
+    // GPU_CHECK(cudaMemset(rpn_buffers_[4],    0, kNumAnchorPerCls * 4 * sizeof(float)));
+    // GPU_CHECK(cudaMemset(rpn_buffers_[5],    0, kNumAnchorPerCls * kNumOutputBoxFeature * sizeof(float)));
+    // GPU_CHECK(cudaMemset(rpn_buffers_[6],    0, kNumAnchorPerCls     * sizeof(float)));
+    // GPU_CHECK(cudaMemset(rpn_buffers_[7],    0, kNumAnchorPerCls * 2 * kNumOutputBoxFeature * sizeof(float)));
+    // GPU_CHECK(cudaMemset(rpn_buffers_[8],    0, kNumAnchorPerCls * 4 * sizeof(float)));
+    // GPU_CHECK(cudaMemset(rpn_buffers_[9],    0, kNumAnchorPerCls * kNumOutputBoxFeature * sizeof(float)));
+    // GPU_CHECK(cudaMemset(rpn_buffers_[10],   0, kNumAnchorPerCls     * sizeof(float)));
+
+    GPU_CHECK(cudaMemset(dev_filtered_box_,     0, kNumAnchor * kNumOutputBoxFeature * sizeof(float)));
+    GPU_CHECK(cudaMemset(dev_filtered_score_,   0, kNumAnchor * sizeof(float)));
+    GPU_CHECK(cudaMemset(dev_filter_count_,     0, kNumClass * sizeof(int)));
+}
+
+
+
+
+
+void PointPillars::InitTRT(const bool use_onnx) {
+  if (use_onnx_) {
+    // create a TensorRT model from the onnx model and load it into an engine
+    OnnxToTRTModel(pfe_file_, &pfe_engine_);
+    OnnxToTRTModel(backbone_file_, &backbone_engine_);
+  }else {
+    EngineToTRTModel(pfe_file_, &pfe_engine_);
+    EngineToTRTModel(backbone_file_, &backbone_engine_);
+  }
+    if (pfe_engine_ == nullptr || backbone_engine_ == nullptr) {
+        std::cerr << "Failed to load the PFE or backbone model." << std::endl;
+    }
+
+    // create execution context from the engine
+    pfe_context_ = pfe_engine_->createExecutionContext();
+    backbone_context_ = backbone_engine_->createExecutionContext();
+    if (pfe_context_ == nullptr || backbone_context_ == nullptr) {
+        std::cerr << "Failed to create TensorRT Execution Context.";
+    }
+  
+}
+
+void PointPillars::OnnxToTRTModel(
+    const std::string& model_file,  // name of the onnx model
+    nvinfer1::ICudaEngine** engine_ptr) {
+    int verbosity = static_cast<int>(nvinfer1::ILogger::Severity::kWARNING);
+
+    // create the builder
+    const auto explicit_batch =
+        static_cast<uint32_t>(kBatchSize) << static_cast<uint32_t>(
+            nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
+    nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(g_logger_);
+    nvinfer1::INetworkDefinition* network =
+        builder->createNetworkV2(explicit_batch);
+
+    // parse onnx model
+    auto parser = nvonnxparser::createParser(*network, g_logger_);
+    if (!parser->parseFromFile(model_file.c_str(), verbosity)) {
+        std::string msg("failed to parse onnx file");
+        g_logger_.log(nvinfer1::ILogger::Severity::kERROR, msg.c_str());
+        exit(EXIT_FAILURE);
+    }
+
+    // Build the engine
+    builder->setMaxBatchSize(kBatchSize);
+    builder->setHalf2Mode(true);
+    nvinfer1::IBuilderConfig* config = builder->createBuilderConfig();
+    config->setMaxWorkspaceSize(1 << 25);
+    nvinfer1::ICudaEngine* engine =
+        builder->buildEngineWithConfig(*network, *config);
+
+    *engine_ptr = engine;
+    parser->destroy();
+    network->destroy();
+    config->destroy();
+    builder->destroy();
+}
+
+
+void PointPillars::EngineToTRTModel(
+    const std::string &engine_file ,     
+    nvinfer1::ICudaEngine** engine_ptr)  {
+    int verbosity = static_cast<int>(nvinfer1::ILogger::Severity::kWARNING);
+    std::stringstream gieModelStream; 
+    gieModelStream.seekg(0, gieModelStream.beg); 
+
+    std::ifstream cache(engine_file); 
+    gieModelStream << cache.rdbuf();
+    cache.close(); 
+    nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(g_logger_); 
+
+    if (runtime == nullptr) {
+        std::string msg("failed to build runtime parser");
+        g_logger_.log(nvinfer1::ILogger::Severity::kERROR, msg.c_str());
+        exit(EXIT_FAILURE);
+    }
+    gieModelStream.seekg(0, std::ios::end);
+    const int modelSize = gieModelStream.tellg(); 
+
+    gieModelStream.seekg(0, std::ios::beg);
+    void* modelMem = malloc(modelSize); 
+    gieModelStream.read((char*)modelMem, modelSize);
+
+    std::cout << "                                                                  "<< std::endl;
+    std::cout << "------------------------------------------------------------------"<< std::endl;
+    std::cout << ">>>>                                                          >>>>"<< std::endl;
+    std::cout << "                                                                  "<< std::endl;
+    std::cout << "Input filename:   " << engine_file << std::endl;
+    std::cout << "                                                                  "<< std::endl;
+    std::cout << ">>>>                                                          >>>>"<< std::endl;
+    std::cout << "------------------------------------------------------------------"<< std::endl;
+    std::cout << "                                                                  "<< std::endl;
+
+    nvinfer1::ICudaEngine* engine = runtime->deserializeCudaEngine(modelMem, modelSize, NULL);
+    free(modelMem);  // deserializeCudaEngine copies the buffer, so release it here
+    if (engine == nullptr) {
+        std::string msg("failed to deserialize the engine file");
+        g_logger_.log(nvinfer1::ILogger::Severity::kERROR, msg.c_str());
+        exit(EXIT_FAILURE);
+    }
+    *engine_ptr = engine;
+
+}
+
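+// Example (illustrative sketch): constructing the detector and running one
+// frame. The engine file names and the host-side point buffer are assumptions
+// for illustration; points are packed as kNumPointFeature (5) floats each:
+//
+//   PointPillars pp(/*score_threshold=*/0.1f, /*nms_overlap_threshold=*/0.2f,
+//                   /*use_onnx=*/false, "pfe.trt", "backbone.trt",
+//                   "cfgs/cbgs_pp_multihead.yaml");
+//   std::vector<float> detections; std::vector<int> labels; std::vector<float> scores;
+//   pp.DoInference(points.data(), num_points, &detections, &labels, &scores);
+//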
+void PointPillars::DoInference(const float* in_points_array,
+                                const int in_num_points,
+                                std::vector<float>* out_detections,
+                                std::vector<int>* out_labels,
+                                std::vector<float>* out_scores) 
+{
+    SetDeviceMemoryToZero();
+    cudaDeviceSynchronize();
+    // [STEP 1] : load pointcloud
+    float* dev_points;
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_points),
+                        in_num_points * kNumPointFeature * sizeof(float))); // in_num_points , 5
+    GPU_CHECK(cudaMemset(dev_points, 0, in_num_points * kNumPointFeature * sizeof(float)));
+    GPU_CHECK(cudaMemcpy(dev_points, in_points_array,
+                        in_num_points * kNumPointFeature * sizeof(float),
+                        cudaMemcpyHostToDevice));
+    
+    // [STEP 2] : preprocess
+    host_pillar_count_[0] = 0;
+    auto preprocess_start = std::chrono::high_resolution_clock::now();
+    preprocess_points_cuda_ptr_->DoPreprocessPointsCuda(
+          dev_points, in_num_points, dev_x_coors_, dev_y_coors_,
+          dev_num_points_per_pillar_, dev_pillar_point_feature_, dev_pillar_coors_,
+          dev_sparse_pillar_map_, host_pillar_count_ ,
+          dev_pfe_gather_feature_ );
+    cudaDeviceSynchronize();
+    auto preprocess_end = std::chrono::high_resolution_clock::now();
+    // DEVICE_SAVE<float>(dev_pfe_gather_feature_,  kMaxNumPillars * kMaxNumPointsPerPillar * kNumGatherPointFeature  , "0_Model_pfe_input_gather_feature");
+
+    // [STEP 3] : pfe forward
+    cudaStream_t stream;
+    GPU_CHECK(cudaStreamCreate(&stream));
+    auto pfe_start = std::chrono::high_resolution_clock::now();
+    GPU_CHECK(cudaMemcpyAsync(pfe_buffers_[0], dev_pfe_gather_feature_,
+                            kMaxNumPillars * kMaxNumPointsPerPillar * kNumGatherPointFeature * sizeof(float), ///kNumGatherPointFeature
+                            cudaMemcpyDeviceToDevice, stream));
+    pfe_context_->enqueueV2(pfe_buffers_, stream, nullptr);
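+    // enqueueV2 is asynchronous; the synchronize below is what makes the
+    // host-side timers around this step meaningful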
+    cudaDeviceSynchronize();
+    auto pfe_end = std::chrono::high_resolution_clock::now();
+    // DEVICE_SAVE<float>(reinterpret_cast<float*>(pfe_buffers_[1]),  kMaxNumPillars * 64 , "1_Model_pfe_output_buffers_[1]");
+
+    // [STEP 4] : scatter pillar feature
+    auto scatter_start = std::chrono::high_resolution_clock::now();
+    scatter_cuda_ptr_->DoScatterCuda(
+        host_pillar_count_[0], dev_x_coors_, dev_y_coors_,
+        reinterpret_cast<float*>(pfe_buffers_[1]), dev_scattered_feature_);
+    cudaDeviceSynchronize();
+    auto scatter_end = std::chrono::high_resolution_clock::now();   
+    // DEVICE_SAVE<float>(dev_scattered_feature_ ,  kRpnInputSize,"2_Model_backbone_input_dev_scattered_feature");
+
+    // [STEP 5] : backbone forward
+    auto backbone_start = std::chrono::high_resolution_clock::now();
+    GPU_CHECK(cudaMemcpyAsync(rpn_buffers_[0], dev_scattered_feature_,
+                            kBatchSize * kRpnInputSize * sizeof(float),
+                            cudaMemcpyDeviceToDevice, stream));
+    backbone_context_->enqueueV2(rpn_buffers_, stream, nullptr);
+    cudaDeviceSynchronize();
+    auto backbone_end = std::chrono::high_resolution_clock::now();
+    // DEVICE_SAVE<float>(reinterpret_cast<float*>(rpn_buffers_[1]) ,  kNumAnchorPerCls    ,"3_rpn_buffers_[1]");
+    // DEVICE_SAVE<float>(reinterpret_cast<float*>(rpn_buffers_[2]) ,  kNumAnchorPerCls * 4,"3_rpn_buffers_[2]");
+    // DEVICE_SAVE<float>(reinterpret_cast<float*>(rpn_buffers_[3]) ,  kNumAnchorPerCls * 4,"3_rpn_buffers_[3]");
+    // DEVICE_SAVE<float>(reinterpret_cast<float*>(rpn_buffers_[4]) ,  kNumAnchorPerCls    ,"3_rpn_buffers_[4]");
+    // DEVICE_SAVE<float>(reinterpret_cast<float*>(rpn_buffers_[5]) ,  kNumAnchorPerCls * 4,"3_rpn_buffers_[5]");
+    // DEVICE_SAVE<float>(reinterpret_cast<float*>(rpn_buffers_[6]) ,  kNumAnchorPerCls * 4,"3_rpn_buffers_[6]");
+    // DEVICE_SAVE<float>(reinterpret_cast<float*>(rpn_buffers_[7]) ,  kNumAnchorPerCls * kNumClass * 9 ,"3_rpn_buffers_[7]");
+
+    // [STEP 6]: postprocess (multihead)
+    auto postprocess_start = std::chrono::high_resolution_clock::now();
+    GPU_CHECK(cudaMemset(dev_filter_count_, 0, kNumClass * sizeof(int)));
+    postprocess_cuda_ptr_->DoPostprocessCuda(
+        reinterpret_cast<float*>(rpn_buffers_[1]), // [cls]   kNumAnchorPerCls 
+        reinterpret_cast<float*>(rpn_buffers_[2]), // [cls]   kNumAnchorPerCls * 2 * 2
+        reinterpret_cast<float*>(rpn_buffers_[3]), // [cls]   kNumAnchorPerCls * 2 * 2
+        reinterpret_cast<float*>(rpn_buffers_[4]), // [cls]   kNumAnchorPerCls 
+        reinterpret_cast<float*>(rpn_buffers_[5]), // [cls]   kNumAnchorPerCls * 2 * 2
+        reinterpret_cast<float*>(rpn_buffers_[6]), // [cls]   kNumAnchorPerCls * 2 * 2
+        reinterpret_cast<float*>(rpn_buffers_[7]), // [boxes] kNumAnchorPerCls * kNumClass * kNumOutputBoxFeature
+        dev_filtered_box_, dev_filtered_score_, dev_filter_count_,
+        *out_detections, *out_labels , *out_scores);
+    cudaDeviceSynchronize();
+    auto postprocess_end = std::chrono::high_resolution_clock::now();
+
+    // compute per-module timings (the stream and input buffer are released below)
+    std::chrono::duration<double> preprocess_cost = preprocess_end - preprocess_start;
+    std::chrono::duration<double> pfe_cost = pfe_end - pfe_start;
+    std::chrono::duration<double> scatter_cost = scatter_end - scatter_start;
+    std::chrono::duration<double> backbone_cost = backbone_end - backbone_start;
+    std::chrono::duration<double> postprocess_cost = postprocess_end - postprocess_start;
+
+    std::chrono::duration<double> pointpillars_cost = postprocess_end - preprocess_start;
+    std::cout << "------------------------------------" << std::endl;
+    std::cout << setiosflags(ios::left)  << setw(14) << "Module" << setw(12)  << "Time"  << resetiosflags(ios::left) << std::endl;
+    std::cout << "------------------------------------" << std::endl;
+    std::string Modules[] = {"Preprocess" , "Pfe" , "Scatter" , "Backbone" , "Postprocess" , "Summary"};
+    double Times[] = {preprocess_cost.count() , pfe_cost.count() , scatter_cost.count() , backbone_cost.count() , postprocess_cost.count() , pointpillars_cost.count()}; 
+
+    for (int i = 0; i < 6; ++i) {
+        std::cout << setiosflags(ios::left) << setw(14) << Modules[i]  << setw(8)  << Times[i] * 1000 << " ms" << resetiosflags(ios::left) << std::endl;
+    }
+    std::cout << "------------------------------------" << std::endl;
+    cudaStreamDestroy(stream);
+    GPU_CHECK(cudaFree(dev_points));  // release the input point buffer allocated in STEP 1
+
+}
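+
+// Usage sketch (illustrative only, not part of this file's API):
+// the caller flattens the cloud into in_num_points * kNumPointFeature floats
+// and reads each detection back as 7 floats (x, y, z, dx, dy, dz, yaw).
+//
+//   std::vector<float> detections; std::vector<int> labels; std::vector<float> scores;
+//   point_pillars_ptr->DoInference(points_array, num_points, &detections, &labels, &scores);
+//   for (size_t k = 0; k < labels.size(); ++k) {
+//       const float* box = &detections[k * 7];
+//   }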

+ 286 - 0
src/detection/detection_lidar_PointPillars_MultiHead/pointpillars.h

@@ -0,0 +1,286 @@
+/******************************************************************************
+ * Copyright 2020 The Apollo Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *****************************************************************************/
+
+/*
+ * Copyright 2018-2019 Autoware Foundation. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @author Kosuke Murakami
+ * @date 2019/02/26
+ */
+
+/**
+* @author Yan haixu
+* Contact: just github.com/hova88
+* @date 2021/04/30
+*/
+
+
+#pragma once
+
+// headers in STL
+#include <algorithm>
+#include <cmath>
+#include <iomanip>
+#include <limits>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+#include <iostream>
+#include <sstream>
+#include <fstream>
+// headers in TensorRT
+#include "NvInfer.h"
+#include "NvOnnxParser.h"
+
+// headers in local files
+// #include "params.h"
+#include "common.h"
+#include <yaml-cpp/yaml.h>
+#include "preprocess.h"
+#include "scatter.h"
+#include "postprocess.h"
+
+using namespace std;
+
+// Logger for TensorRT info/warning/errors
+class Logger : public nvinfer1::ILogger {
+ public:
+  explicit Logger(Severity severity = Severity::kWARNING)
+      : reportable_severity(severity) {}
+
+  void log(Severity severity, const char* msg) override {
+    // suppress messages with severity enum value greater than the reportable
+    if (severity > reportable_severity) return;
+
+    switch (severity) {
+      case Severity::kINTERNAL_ERROR:
+        std::cerr << "INTERNAL_ERROR: ";
+        break;
+      case Severity::kERROR:
+        std::cerr << "ERROR: ";
+        break;
+      case Severity::kWARNING:
+        std::cerr << "WARNING: ";
+        break;
+      case Severity::kINFO:
+        std::cerr << "INFO: ";
+        break;
+      default:
+        std::cerr << "UNKNOWN: ";
+        break;
+    }
+    std::cerr << msg << std::endl;
+  }
+
+  Severity reportable_severity;
+};
+
+
+
+class PointPillars {
+ private:
+    // initialize in initializer list
+    const float score_threshold_;
+    const float nms_overlap_threshold_;
+    const bool use_onnx_;
+    const std::string pfe_file_;
+    const std::string backbone_file_;
+    const std::string pp_config_;
+    // end initializer list
+    // voxel size
+    float kPillarXSize;
+    float kPillarYSize;
+    float kPillarZSize;
+    // point cloud range
+    float kMinXRange;
+    float kMinYRange;
+    float kMinZRange;
+    float kMaxXRange;
+    float kMaxYRange;
+    float kMaxZRange;
+    // hyper parameters
+    int kNumClass;
+    int kMaxNumPillars;
+    int kMaxNumPointsPerPillar;
+    int kNumPointFeature;
+    int kNumGatherPointFeature = 11;
+    int kGridXSize;
+    int kGridYSize;
+    int kGridZSize;
+    int kNumAnchorXinds;
+    int kNumAnchorYinds;
+    int kRpnInputSize;
+    int kNumAnchor;
+    int kNumInputBoxFeature;
+    int kNumOutputBoxFeature;
+    int kRpnBoxOutputSize;
+    int kRpnClsOutputSize;
+    int kRpnDirOutputSize;
+    int kBatchSize;
+    int kNumIndsForScan;
+    int kNumThreads;
+    // if you change kNumThreads, need to modify NUM_THREADS_MACRO in
+    // common.h
+    int kNumBoxCorners;
+    int kNmsPreMaxsize;
+    int kNmsPostMaxsize;
+    //params for initialize anchors
+    //Adapt to OpenPCDet
+    int kAnchorStrides;
+    std::vector<string> kAnchorNames;
+    std::vector<float> kAnchorDxSizes;
+    std::vector<float> kAnchorDySizes;
+    std::vector<float> kAnchorDzSizes;
+    std::vector<float> kAnchorBottom;
+    std::vector<std::vector<int>> kMultiheadLabelMapping;
+    int kNumAnchorPerCls;
+    int host_pillar_count_[1];
+
+    int* dev_x_coors_;
+    int* dev_y_coors_;
+    float* dev_num_points_per_pillar_;
+    int* dev_sparse_pillar_map_;
+    int* dev_cumsum_along_x_;
+    int* dev_cumsum_along_y_;
+
+    float* dev_pillar_point_feature_;
+    float* dev_pillar_coors_;
+    float* dev_points_mean_;
+
+    float* dev_pfe_gather_feature_;
+    void* pfe_buffers_[2];
+    //variable for doPostprocessCudaMultiHead
+    void* rpn_buffers_[8];
+    
+    std::vector<float*> rpn_box_output_; 
+    std::vector<float*> rpn_cls_output_;
+
+    float* dev_scattered_feature_;
+
+    float* dev_filtered_box_;
+    float* dev_filtered_score_;
+    int*   dev_filtered_label_;
+    int*   dev_filtered_dir_;
+    float* dev_box_for_nms_;
+    int*   dev_filter_count_;
+
+    std::unique_ptr<PreprocessPointsCuda> preprocess_points_cuda_ptr_;
+    std::unique_ptr<ScatterCuda> scatter_cuda_ptr_;
+    std::unique_ptr<PostprocessCuda> postprocess_cuda_ptr_;
+
+    Logger g_logger_;
+    nvinfer1::ICudaEngine* pfe_engine_;
+    nvinfer1::ICudaEngine* backbone_engine_;
+    nvinfer1::IExecutionContext* pfe_context_;
+    nvinfer1::IExecutionContext* backbone_context_;
+
+    /**
+     * @brief Memory allocation for device memory
+     * @details Called in the constructor
+     */
+    void DeviceMemoryMalloc();
+
+    /**
+     * @brief Memory set to 0 for device memory
+     * @details Called in the DoInference
+     */
+    void SetDeviceMemoryToZero();
+
+    /**
+     * @brief Initialize parameters from pointpillars.yaml
+     * @details Called in the constructor
+     */
+    void InitParams();
+    /**
+     * @brief Initializing TensorRT instances
+     * @param[in] use_onnx If true, parse the ONNX models; otherwise load serialized engines
+     * @details Called in the constructor
+     */
+    void InitTRT(const bool use_onnx);
+    /**
+     * @brief Convert ONNX to TensorRT model
+     * @param[in] model_file ONNX model file path
+     * @param[out] engine_ptr TensorRT model engine made out of ONNX model
+     * @details Load ONNX model, and convert it to TensorRT model
+     */
+    void OnnxToTRTModel(const std::string& model_file,
+                        nvinfer1::ICudaEngine** engine_ptr);
+
+    /**
+     * @brief Load a serialized TensorRT engine
+     * @param[in] engine_file Serialized TensorRT engine file path
+     * @param[out] engine_ptr Deserialized TensorRT engine
+     * @details Read the engine file from disk and deserialize it
+     */
+    void EngineToTRTModel(const std::string& engine_file,
+                        nvinfer1::ICudaEngine** engine_ptr);
+
+    /**
+     * @brief Preprocess points
+     * @param[in] in_points_array Point cloud array
+     * @param[in] in_num_points Number of points
+     * @details Call CPU or GPU preprocess
+     */
+    void Preprocess(const float* in_points_array, const int in_num_points);
+
+ public:
+    /**
+     * @brief Constructor
+     * @param[in] score_threshold Score threshold for filtering output
+     * @param[in] nms_overlap_threshold IOU threshold for NMS
+     * @param[in] use_onnx If true, build from ONNX files; otherwise load serialized engine files
+     * @param[in] pfe_file Pillar Feature Extractor ONNX file path
+     * @param[in] rpn_file Region Proposal Network ONNX file path
+     * @details Variables could be changed through point_pillars_detection
+     */
+    PointPillars(const float score_threshold,
+                const float nms_overlap_threshold,
+                const bool use_onnx,
+                const std::string pfe_file,
+                const std::string rpn_file,
+                const std::string pp_config);
+    ~PointPillars();
+
+    /**
+     * @brief Call PointPillars for the inference
+     * @param[in] in_points_array Point cloud array
+     * @param[in] in_num_points Number of points
+     * @param[out] out_detections Network output bounding boxes
+     * @param[out] out_labels Network output object labels
+     * @param[out] out_scores Network output object scores
+     * @details This is an interface for the algorithm
+     */
+    void DoInference(const float* in_points_array,
+                    const int in_num_points,
+                    std::vector<float>* out_detections,
+                    std::vector<int>* out_labels,
+                    std::vector<float>* out_scores);
+};
+

+ 384 - 0
src/detection/detection_lidar_PointPillars_MultiHead/postprocess.cu

@@ -0,0 +1,384 @@
+/******************************************************************************
+ * Copyright 2020 The Apollo Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *****************************************************************************/
+
+/*
+ * Copyright 2018-2019 Autoware Foundation. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @author Kosuke Murakami
+ * @date 2019/02/26
+ */
+
+/**
+* @author Yan haixu
+* Contact: just github.com/hova88
+* @date 2021/04/30
+*/
+
+
+// headers in CUDA
+#include <thrust/sort.h>
+
+// headers in local files
+#include "common.h"
+#include "postprocess.h"
+#include <stdio.h>
+
+
+// copy the 7 box regression values (x, y, z, dx, dy, dz, yaw) of one kept anchor
+__device__ void box_decode_warp(int head_offset, const float* box_pred, 
+    int tid , int num_anchors_per_head , int counter, float* filtered_box) 
+{
+    #pragma unroll
+    for (int k = 0; k < 7; ++k) {
+        filtered_box[blockIdx.z * num_anchors_per_head * 7 + counter * 7 + k] =
+            box_pred[head_offset + tid * 9 + k];
+    }
+}
+
+
+__global__ void sigmoid_filter_kernel(
+
+    float* cls_pred_0,
+    float* cls_pred_12,
+    float* cls_pred_34,
+    float* cls_pred_5,
+    float* cls_pred_67,
+    float* cls_pred_89,
+
+    const float* box_pred_0,
+
+    const float* box_pred_1,
+    const float* box_pred_2,
+
+    const float* box_pred_3,
+    const float* box_pred_4,
+
+    const float* box_pred_5,
+
+    const float* box_pred_6,
+    const float* box_pred_7,
+
+    const float* box_pred_8,
+    const float* box_pred_9,
+
+    float* filtered_box, 
+    float* filtered_score, 
+    int* filter_count,
+
+    const float score_threshold) {   
+
+    // e.g. cls_pred_34: shape (32768*2, 2) -- two heads sharing one tensor,
+    // two classes per head
+
+    int num_anchors_per_head = gridDim.x * gridDim.y * blockDim.x;
+    // 16 * 4 * 512 = 32768
+    extern __shared__ float cls_score[];
+    cls_score[threadIdx.x + blockDim.x] = -1.0f;
+
+    int tid = blockIdx.x * gridDim.y * blockDim.x + blockIdx.y *  blockDim.x + threadIdx.x; 
+
+
+    if ( blockIdx.z == 0) cls_score[ threadIdx.x ] = 1 / (1 + expf(-cls_pred_0[ tid ]));
+    if ( blockIdx.z == 1) {
+        cls_score[ threadIdx.x ] = 1 / (1 + expf(-cls_pred_12[ tid * 2 ]));
+        cls_score[ threadIdx.x + blockDim.x] = 1 / (1 + expf(-cls_pred_12[ (num_anchors_per_head + tid) * 2]));}
+    if ( blockIdx.z == 2) {
+        cls_score[ threadIdx.x ] = 1 / (1 + expf(-cls_pred_12[ tid * 2 + 1]));
+        cls_score[ threadIdx.x + blockDim.x] = 1 / (1 + expf(-cls_pred_12[ (num_anchors_per_head + tid) * 2 + 1]));}
+
+    if ( blockIdx.z == 3) {
+        cls_score[ threadIdx.x ] = 1 / (1 + expf(-cls_pred_34[ tid * 2 ]));
+        cls_score[ threadIdx.x + blockDim.x] = 1 / (1 + expf(-cls_pred_34[ (num_anchors_per_head + tid) * 2]));}
+    if ( blockIdx.z == 4) {
+        cls_score[ threadIdx.x ] = 1 / (1 + expf(-cls_pred_34[ tid * 2 + 1 ]));
+        cls_score[ threadIdx.x + blockDim.x] = 1 / (1 + expf(-cls_pred_34[ (num_anchors_per_head + tid) * 2 + 1]));}
+
+    if ( blockIdx.z == 5) cls_score[ threadIdx.x ] = 1 / (1 + expf(-cls_pred_5[ tid ]));
+
+    if ( blockIdx.z == 6) {
+        cls_score[ threadIdx.x ] = 1 / (1 + expf(-cls_pred_67[ tid * 2 ]));
+        cls_score[ threadIdx.x + blockDim.x] = 1 / (1 + expf(-cls_pred_67[ (num_anchors_per_head + tid) * 2]));}
+    if ( blockIdx.z == 7) {
+        cls_score[ threadIdx.x ] = 1 / (1 + expf(-cls_pred_67[ tid * 2 + 1 ]));
+        cls_score[ threadIdx.x + blockDim.x] = 1 / (1 + expf(-cls_pred_67[ (num_anchors_per_head + tid) * 2 + 1]));}
+
+    if ( blockIdx.z == 8) {
+        cls_score[ threadIdx.x ] = 1 / (1 + expf(-cls_pred_89[ tid * 2 ]));
+        cls_score[ threadIdx.x + blockDim.x] = 1 / (1 + expf(-cls_pred_89[ (num_anchors_per_head + tid) * 2]));}
+    if ( blockIdx.z == 9) {
+        cls_score[ threadIdx.x ] = 1 / (1 + expf(-cls_pred_89[ tid * 2 + 1 ]));
+        cls_score[ threadIdx.x + blockDim.x] = 1 / (1 + expf(-cls_pred_89[ (num_anchors_per_head + tid) * 2 + 1]));}
+    
+    __syncthreads();
+    
+    if( cls_score[ threadIdx.x ] > score_threshold) 
+    {
+        int counter = atomicAdd(&filter_count[blockIdx.z], 1);
+        if ( blockIdx.z == 0) {
+            box_decode_warp(0 ,box_pred_0 , tid , num_anchors_per_head , counter , filtered_box);
+            filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];
+        }else
+        if ( blockIdx.z == 1) {
+            box_decode_warp(0 ,box_pred_1 , tid , num_anchors_per_head , counter , filtered_box);
+            filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];
+        }else
+        if ( blockIdx.z == 2) {
+            box_decode_warp(0 ,box_pred_1 , tid , num_anchors_per_head , counter , filtered_box);
+            filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];
+        }else
+        if ( blockIdx.z == 3) {
+            box_decode_warp(0 ,box_pred_3 , tid , num_anchors_per_head , counter , filtered_box);
+            filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];
+        }else 
+        if (blockIdx.z == 4) {
+            box_decode_warp(0 ,box_pred_3 , tid , num_anchors_per_head , counter , filtered_box);
+            filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];            
+        }else
+        if ( blockIdx.z == 5) {
+            box_decode_warp(0 ,box_pred_5 , tid , num_anchors_per_head , counter , filtered_box);
+            filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];
+        }else
+        if ( blockIdx.z == 6) {
+            box_decode_warp(0 ,box_pred_6 , tid , num_anchors_per_head , counter , filtered_box);
+            filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];
+        }else
+        if ( blockIdx.z == 7) {
+            box_decode_warp(0 ,box_pred_6 , tid , num_anchors_per_head , counter , filtered_box);
+            filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];
+        }else
+        if ( blockIdx.z == 8) {
+
+            box_decode_warp(0 ,box_pred_8 , tid , num_anchors_per_head , counter , filtered_box);
+            filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];
+        }else
+        if ( blockIdx.z == 9) {
+            box_decode_warp(0 ,box_pred_8 , tid , num_anchors_per_head , counter , filtered_box);
+            filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];
+        }
+    }
+    __syncthreads();  
+    if( cls_score[ threadIdx.x + blockDim.x ] > score_threshold)  {
+            int counter = atomicAdd(&filter_count[blockIdx.z], 1);
+            // printf("counter : %d \n" , counter);
+            // store the second-half score, i.e. the second class of a two-class head
+            if (blockIdx.z == 1) {
+                box_decode_warp(0 ,box_pred_2 , tid , num_anchors_per_head , counter , filtered_box);
+                filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x + blockDim.x ];
+            }else 
+            if (blockIdx.z == 2) {
+                box_decode_warp(0 ,box_pred_2 , tid , num_anchors_per_head , counter , filtered_box);
+                filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x + blockDim.x ];
+            }else 
+            if (blockIdx.z == 3) {
+                box_decode_warp(0 ,box_pred_4 , tid , num_anchors_per_head , counter , filtered_box);
+                filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x + blockDim.x ];
+            }else 
+            if (blockIdx.z == 4) {
+                box_decode_warp(0 ,box_pred_4 , tid , num_anchors_per_head , counter , filtered_box);
+                filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x + blockDim.x ];
+            }else 
+            if (blockIdx.z == 6) {
+                box_decode_warp(0 ,box_pred_7 , tid , num_anchors_per_head , counter , filtered_box);
+                filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x + blockDim.x ];
+            }else 
+            if (blockIdx.z == 7) {
+                box_decode_warp(0 ,box_pred_7 , tid , num_anchors_per_head , counter , filtered_box);
+                filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x + blockDim.x ];
+            }else 
+            if (blockIdx.z == 8) {
+                box_decode_warp(0 ,box_pred_9 , tid , num_anchors_per_head , counter , filtered_box);
+                filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x + blockDim.x ];
+            }else 
+            if (blockIdx.z == 9) {
+                box_decode_warp(0 ,box_pred_9 , tid , num_anchors_per_head , counter , filtered_box);
+                filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x + blockDim.x ];
+            }
+    }
+}
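+
+// Launch shape (set in DoPostprocessCuda): grid (16, 4, 10), block 512, so
+// blockIdx.z picks one of the 10 classes and the remaining indices enumerate
+// the 32768 anchors of that class's head. Shared memory holds up to two
+// sigmoid scores per thread because some heads predict two classes at once.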
+
+__global__ void sort_boxes_by_indexes_kernel(float* filtered_box, float* filtered_scores, int* indexes, int filter_count,
+    float* sorted_filtered_boxes, float* sorted_filtered_scores,
+    const int num_output_box_feature)
+{
+    int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    if(tid < filter_count)  {
+
+        int sort_index = indexes[tid];
+        for (int k = 0; k < num_output_box_feature; ++k) {
+            sorted_filtered_boxes[tid * num_output_box_feature + k] =
+                filtered_box[sort_index * num_output_box_feature + k];
+        }
+
+        // sorted_filtered_dir[tid] = filtered_dir[sort_index];
+        sorted_filtered_scores[tid] = filtered_scores[sort_index];
+    }
+}
+
+
+
+PostprocessCuda::PostprocessCuda(const int num_threads, const float float_min, const float float_max,
+    const int num_class,const int num_anchor_per_cls,
+    const std::vector<std::vector<int>> multihead_label_mapping,
+    const float score_threshold,  const float nms_overlap_threshold, 
+    const int nms_pre_maxsize, const int nms_post_maxsize,
+    const int num_box_corners, 
+    const int num_input_box_feature,
+    const int num_output_box_feature)
+: num_threads_(num_threads),
+  float_min_(float_min),
+  float_max_(float_max),
+  num_class_(num_class),
+  num_anchor_per_cls_(num_anchor_per_cls),
+  multihead_label_mapping_(multihead_label_mapping),
+  score_threshold_(score_threshold),
+  nms_overlap_threshold_(nms_overlap_threshold),
+  nms_pre_maxsize_(nms_pre_maxsize),
+  nms_post_maxsize_(nms_post_maxsize),
+  num_box_corners_(num_box_corners),
+  num_input_box_feature_(num_input_box_feature),
+  num_output_box_feature_(num_output_box_feature) {
+    nms_cuda_ptr_.reset(
+    new NmsCuda(num_threads_, num_box_corners_, nms_overlap_threshold_));
+
+}
+
+
+void PostprocessCuda::DoPostprocessCuda(
+    float* cls_pred_0,
+    float* cls_pred_12,
+    float* cls_pred_34,
+    float* cls_pred_5,
+    float* cls_pred_67,
+    float* cls_pred_89,
+
+    const float* box_preds,
+   
+    float* dev_filtered_box, 
+    float* dev_filtered_score, 
+    int* dev_filter_count,
+    std::vector<float>& out_detection, std::vector<int>& out_label , std::vector<float>& out_score) {
+    // the rpn_box_output tensors have already been concatenated before this point
+    // for a 128x128 feature map, the cls_pred shapes are (32768,1),(32768,1),(32768,1),(65536,2),(32768,1)
+    dim3 gridsize(16, 4 , 10);  // 16 * 4 * 512 = 32768 anchors per head
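+    // block of 512 threads; shared memory holds two score slots per thread
+    // (512 * 2 floats), one for each class of the two-class heads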
+    sigmoid_filter_kernel<<< gridsize, 512 , 512 * 2 * sizeof(float)>>>(
+        cls_pred_0,
+        cls_pred_12, 
+        cls_pred_34, 
+        cls_pred_5, 
+        cls_pred_67, 
+        cls_pred_89,
+
+        &box_preds[0 * 32768 * 9],
+        &box_preds[1 * 32768 * 9],
+        &box_preds[2 * 32768 * 9],
+        &box_preds[3 * 32768 * 9],
+        &box_preds[4 * 32768 * 9],
+        &box_preds[5 * 32768 * 9],
+        &box_preds[6 * 32768 * 9],
+        &box_preds[7 * 32768 * 9],
+        &box_preds[8 * 32768 * 9],
+        &box_preds[9 * 32768 * 9],
+
+        dev_filtered_box, 
+        dev_filtered_score,  
+        dev_filter_count, 
+    
+        score_threshold_);
+    cudaDeviceSynchronize();
+    
+    std::vector<int> host_filter_count(num_class_, 0);  // per-class count of boxes that passed the score filter
+    GPU_CHECK(cudaMemcpy(host_filter_count.data(), dev_filter_count, num_class_ * sizeof(int), cudaMemcpyDeviceToHost));
+    
+    for (int i = 0; i < num_class_; ++ i) {
+        if(host_filter_count[i] <= 0) continue;
+
+        int* dev_indexes;
+        float* dev_sorted_filtered_box;
+        float* dev_sorted_filtered_scores;
+        GPU_CHECK(cudaMalloc((void**)&dev_indexes, host_filter_count[i] * sizeof(int)));
+        GPU_CHECK(cudaMalloc((void**)&dev_sorted_filtered_box, host_filter_count[i] * num_output_box_feature_ * sizeof(float)));
+        GPU_CHECK(cudaMalloc((void**)&dev_sorted_filtered_scores, host_filter_count[i]*sizeof(float)));
+        // GPU_CHECK(cudaMalloc((void**)&dev_sorted_box_for_nms, NUM_BOX_CORNERS_*host_filter_count[i]*sizeof(float)));
+        thrust::sequence(thrust::device, dev_indexes, dev_indexes + host_filter_count[i]);
+        thrust::sort_by_key(thrust::device, 
+                            &dev_filtered_score[i * num_anchor_per_cls_], 
+                            &dev_filtered_score[i * num_anchor_per_cls_ + host_filter_count[i]],
+                            dev_indexes, 
+                            thrust::greater<float>());
+
+        const int num_blocks = DIVUP(host_filter_count[i], num_threads_);
+
+        sort_boxes_by_indexes_kernel<<<num_blocks, num_threads_>>>(
+            &dev_filtered_box[i * num_anchor_per_cls_ * num_output_box_feature_], 
+            &dev_filtered_score[i * num_anchor_per_cls_], 
+            dev_indexes, 
+            host_filter_count[i],
+            dev_sorted_filtered_box, 
+            dev_sorted_filtered_scores,
+            num_output_box_feature_);
+
+        int num_box_for_nms = min(nms_pre_maxsize_, host_filter_count[i]);
+        long* keep_inds = new long[num_box_for_nms];  // indexes of the kept boxes
+        memset(keep_inds, 0, num_box_for_nms * sizeof(long));
+        int num_out = 0;
+        nms_cuda_ptr_->DoNmsCuda(num_box_for_nms, dev_sorted_filtered_box, keep_inds, &num_out);
+
+        num_out = min(num_out, nms_post_maxsize_);
+
+        float* host_filtered_box = new float[host_filter_count[i] * num_output_box_feature_]();
+        float* host_filtered_scores = new float[host_filter_count[i]]();
+
+
+        cudaMemcpy(host_filtered_box, dev_sorted_filtered_box, host_filter_count[i] * num_output_box_feature_ * sizeof(float), cudaMemcpyDeviceToHost);
+        cudaMemcpy(host_filtered_scores, dev_sorted_filtered_scores, host_filter_count[i] * sizeof(float), cudaMemcpyDeviceToHost);
+        for (int j = 0; j < num_out; ++j)  {
+            out_detection.emplace_back(host_filtered_box[keep_inds[j] * num_output_box_feature_ + 0]);
+            out_detection.emplace_back(host_filtered_box[keep_inds[j] * num_output_box_feature_ + 1]);
+            out_detection.emplace_back(host_filtered_box[keep_inds[j] * num_output_box_feature_ + 2]);
+            out_detection.emplace_back(host_filtered_box[keep_inds[j] * num_output_box_feature_ + 3]);
+            out_detection.emplace_back(host_filtered_box[keep_inds[j] * num_output_box_feature_ + 4]);
+            out_detection.emplace_back(host_filtered_box[keep_inds[j] * num_output_box_feature_ + 5]);
+            out_detection.emplace_back(host_filtered_box[keep_inds[j] * num_output_box_feature_ + 6]);
+            out_score.emplace_back(host_filtered_scores[keep_inds[j]]);
+            out_label.emplace_back(i);  // i is the class index of this head
+        }
+        delete[] keep_inds;
+        delete[] host_filtered_scores;
+        delete[] host_filtered_box;
+
+        GPU_CHECK(cudaFree(dev_indexes));
+        GPU_CHECK(cudaFree(dev_sorted_filtered_box));
+        GPU_CHECK(cudaFree(dev_sorted_filtered_scores));
+    }
+}

+ 125 - 0
src/detection/detection_lidar_PointPillars_MultiHead/postprocess.h

@@ -0,0 +1,125 @@
+/******************************************************************************
+ * Copyright 2020 The Apollo Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *****************************************************************************/
+
+/*
+ * Copyright 2018-2019 Autoware Foundation. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @author Kosuke Murakami
+ * @date 2019/02/26
+ */
+
+/**
+* @author Yan haixu
+* Contact: just github.com/hova88
+* @date 2021/04/30
+*/
+
+#pragma once
+#include <memory>
+#include <vector>
+#include "nms.h"
+
+class PostprocessCuda {
+ private:
+    // initializer list
+
+    const int num_threads_;
+    const float float_min_;
+    const float float_max_;
+    const int num_class_;
+    const int num_anchor_per_cls_;
+    const float score_threshold_;
+    const float nms_overlap_threshold_;
+    const int nms_pre_maxsize_;
+    const int nms_post_maxsize_;
+    const int num_box_corners_;
+    const int num_input_box_feature_;
+    const int num_output_box_feature_;
+    const std::vector<std::vector<int>> multihead_label_mapping_;
+    // end initializer list
+
+    std::unique_ptr<NmsCuda> nms_cuda_ptr_;
+  public:
+  /**
+   * @brief Constructor
+   * @param[in] num_threads Number of threads when launching cuda kernel
+   * @param[in] float_min The lowest float value
+   * @param[in] float_max The maximum float value
+   * @param[in] num_class Number of classes 
+   * @param[in] num_anchor_per_cls Number of anchors per category
+   * @param[in] multihead_label_mapping Mapping from each head to its class labels
+   * @param[in] score_threshold Score threshold for filtering output
+   * @param[in] nms_overlap_threshold IOU threshold for NMS
+   * @param[in] nms_pre_maxsize Maximum number of boxes into NMS
+   * @param[in] nms_post_maxsize Maximum number of boxes after NMS
+   * @param[in] num_box_corners Number of box corners
+   * @param[in] num_output_box_feature Number of output box features
+   * @details Capital variables never change after compile; non-capital
+   * variables can be changed through rosparam
+   */
+  PostprocessCuda(const int num_threads, 
+                  const float float_min, const float float_max,
+                  const int num_class, const int num_anchor_per_cls, 
+                  const std::vector<std::vector<int>> multihead_label_mapping,
+                  const float score_threshold,  
+                  const float nms_overlap_threshold, 
+                  const int nms_pre_maxsize, 
+                  const int nms_post_maxsize,
+                  const int num_box_corners,
+                  const int num_input_box_feature, 
+                  const int num_output_box_feature);
+  ~PostprocessCuda(){}
+
+  /**
+   * @brief Postprocessing for the network output
+   * @param[in] cls_pred_0 ... cls_pred_89 Class predictions from the network, grouped by head
+   * @param[in] box_preds Box predictions from the network output
+   * @param[in] dev_filtered_box Filtered box predictions
+   * @param[in] dev_filtered_score Filtered score predictions
+   * @param[in] dev_filter_count The number of filtered boxes per class
+   * @param[out] out_detection Output bounding boxes
+   * @param[out] out_label Output labels of objects
+   * @param[out] out_score Output scores of objects
+   * @details dev_* represents device memory allocated variables
+   */
+  void DoPostprocessCuda(
+    float* cls_pred_0,
+    float* cls_pred_12,
+    float* cls_pred_34,
+    float* cls_pred_5,
+    float* cls_pred_67,
+    float* cls_pred_89,
+    
+    const float* box_preds,
+    float* dev_filtered_box, 
+    float* dev_filtered_score, 
+    int* dev_filter_count,
+    std::vector<float>& out_detection, std::vector<int>& out_label , std::vector<float>& out_score);
+};

+ 410 - 0
src/detection/detection_lidar_PointPillars_MultiHead/preprocess.cu

@@ -0,0 +1,410 @@
+/******************************************************************************
+ * Copyright 2020 The Apollo Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *****************************************************************************/
+
+/*
+ * Copyright 2018-2019 Autoware Foundation. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @author Kosuke Murakami
+ * @date 2019/02/26
+ */
+
+/**
+* @author Yan haixu
+* Contact: just github.com/hova88
+* @date 2021/04/30
+*/
+
+
+
+// headers in STL
+#include <stdio.h>
+
+// headers in local files
+#include "common.h"
+#include "preprocess.h"
+
+__global__ void make_pillar_histo_kernel(
+    const float* dev_points, float* dev_pillar_point_feature_in_coors,
+    int* pillar_count_histo, const int num_points,
+    const int max_points_per_pillar, const int grid_x_size,
+    const int grid_y_size, const int grid_z_size, const float min_x_range,
+    const float min_y_range, const float min_z_range, const float pillar_x_size,
+    const float pillar_y_size, const float pillar_z_size,
+    const int num_point_feature) {
+  int th_i = blockIdx.x * blockDim.x +  threadIdx.x ;
+  if (th_i >= num_points) {
+    return;
+  }
+  int x_coor = floor((dev_points[th_i * num_point_feature + 0] - min_x_range) / pillar_x_size);
+  int y_coor = floor((dev_points[th_i * num_point_feature + 1] - min_y_range) / pillar_y_size);
+  int z_coor = floor((dev_points[th_i * num_point_feature + 2] - min_z_range) / pillar_z_size);
+
+  if (x_coor >= 0 && x_coor < grid_x_size && y_coor >= 0 &&
+      y_coor < grid_y_size && z_coor >= 0 && z_coor < grid_z_size) {
+    int count =
+        atomicAdd(&pillar_count_histo[y_coor * grid_x_size + x_coor], 1);
+    if (count < max_points_per_pillar) {
+      int ind =
+          y_coor * grid_x_size * max_points_per_pillar * num_point_feature +
+          x_coor * max_points_per_pillar * num_point_feature +
+          count * num_point_feature;
+ 
+      for (int i = 0; i < num_point_feature; ++i) {
+        dev_pillar_point_feature_in_coors[ind + i] =
+            dev_points[th_i * num_point_feature + i];
+      }
+    }
+  }
+}
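+
+// Note: only the first max_points_per_pillar points of a cell are stored
+// above; pillar_count_histo keeps the raw per-cell count, which
+// make_pillar_index_kernel later clamps when filling dev_num_points_per_pillar.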
+
+__global__ void make_pillar_index_kernel(
+    int* dev_pillar_count_histo, int* dev_counter, int* dev_pillar_count,
+    int* dev_x_coors, int* dev_y_coors, float* dev_num_points_per_pillar,
+    int* dev_sparse_pillar_map, const int max_pillars,
+    const int max_points_per_pillar, const int grid_x_size,
+    const int num_inds_for_scan) {
+  int x = blockIdx.x;
+  int y = threadIdx.x;
+  int num_points_at_this_pillar = dev_pillar_count_histo[y * grid_x_size + x];
+  if (num_points_at_this_pillar == 0) {
+    return;
+  }
+
+  int count = atomicAdd(dev_counter, 1);
+  if (count < max_pillars) {
+    atomicAdd(dev_pillar_count, 1);
+    if (num_points_at_this_pillar >= max_points_per_pillar) {
+      dev_num_points_per_pillar[count] = max_points_per_pillar;
+    } else {
+      dev_num_points_per_pillar[count] = num_points_at_this_pillar;
+    }
+    dev_x_coors[count] = x;
+    dev_y_coors[count] = y;
+    dev_sparse_pillar_map[y * num_inds_for_scan + x] = 1;
+  }
+}
+
+__global__ void make_pillar_feature_kernel(
+    float* dev_pillar_point_feature_in_coors, float* dev_pillar_point_feature,
+    float* dev_pillar_coors, int* dev_x_coors, int* dev_y_coors,
+    float* dev_num_points_per_pillar, const int max_points,
+    const int num_point_feature, const int grid_x_size) {
+  int ith_pillar = blockIdx.x;
+  int num_points_at_this_pillar = dev_num_points_per_pillar[ith_pillar];
+  int ith_point = threadIdx.x;
+  if (ith_point >= num_points_at_this_pillar) {
+    return;
+  }
+  int x_ind = dev_x_coors[ith_pillar];
+  int y_ind = dev_y_coors[ith_pillar];
+  int pillar_ind = ith_pillar * max_points * num_point_feature +
+                   ith_point * num_point_feature;
+  int coors_ind = y_ind * grid_x_size * max_points * num_point_feature +
+                  x_ind * max_points * num_point_feature +
+                  ith_point * num_point_feature;
+  #pragma unroll 
+  for (int i = 0; i < num_point_feature; ++i) {
+    dev_pillar_point_feature[pillar_ind + i] =
+        dev_pillar_point_feature_in_coors[coors_ind + i];
+  }
+
+  float coor_x = static_cast<float>(x_ind);
+  float coor_y = static_cast<float>(y_ind);
+  dev_pillar_coors[ith_pillar * 4 + 0] = 0;  // batch idx
+  dev_pillar_coors[ith_pillar * 4 + 1] = 0;  // z
+  dev_pillar_coors[ith_pillar * 4 + 2] = coor_y;
+  dev_pillar_coors[ith_pillar * 4 + 3] = coor_x;
+}
+
+
+
+__global__ void pillar_mean_kernel(
+  float* dev_points_mean, 
+  const int num_point_feature,
+  const float* dev_pillar_point_feature, 
+  const float* dev_num_points_per_pillar, 
+  int max_pillars , 
+  int max_points_per_pillar) {
+
+    extern __shared__ float temp[];
+    int ith_pillar = blockIdx.x; 
+    int ith_point  = threadIdx.x;
+    int axis = threadIdx.y;
+  
+    int reduce_size = max_points_per_pillar > 32 ? 64 : 32;
+    temp[threadIdx.x * 3 + axis] =  dev_pillar_point_feature[ith_pillar * max_points_per_pillar * num_point_feature + ith_point * num_point_feature + axis];  
+    if (threadIdx.x < reduce_size - max_points_per_pillar) {
+        temp[(threadIdx.x + max_points_per_pillar) * 3 + axis] = 0.0f; // zero-pad the unused slots
+    }
+    __syncthreads();
+    int num_points_at_this_pillar = dev_num_points_per_pillar[ith_pillar];
+
+    if (ith_point >= num_points_at_this_pillar) {
+          return;
+    }
+
+    for (unsigned int d = reduce_size >> 1; d > 0; d >>= 1) {  // tree reduction in shared memory
+        if (ith_point < d) {
+            temp[ith_point*3 +axis] += temp[(ith_point + d) * 3 + axis];
+        }
+        __syncthreads();
+    }
+
+    if (ith_point == 0) {
+        dev_points_mean[ith_pillar * 3 + axis] = temp[axis] / num_points_at_this_pillar;
+    }
+}
+
+
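+// Warp-synchronous tree reduction (offsets 8, 4, 2, 1) over blockDim.y-strided
+// slots. The volatile idiom relies on implicit intra-warp synchronization,
+// a pre-Volta assumption; on newer GPUs __syncwarp() would be safer.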
+__device__ void warpReduce(volatile float* sdata , int ith_point , int axis) {
+    sdata[ith_point * blockDim.y + axis] += sdata[(ith_point + 8) * blockDim.y + axis];
+    sdata[ith_point * blockDim.y + axis] += sdata[(ith_point + 4) * blockDim.y + axis];
+    sdata[ith_point * blockDim.y + axis] += sdata[(ith_point + 2) * blockDim.y + axis];
+    sdata[ith_point * blockDim.y + axis] += sdata[(ith_point + 1) * blockDim.y + axis];
+}
+
+
+
+
+
+__global__ void make_pillar_mean_kernel(
+  float* dev_points_mean, 
+  const int num_point_feature,
+  const float* dev_pillar_point_feature, 
+  const float* dev_num_points_per_pillar, 
+  int max_pillars , 
+  int max_points_pre_pillar) {
+    extern __shared__ float temp[];
+    unsigned int ith_pillar = blockIdx.x;  // { 0 , 1, 2, ... , 10000+}
+    unsigned int ith_point  = threadIdx.x; // { 0 , 1, 2, ...,9}
+    unsigned int axis = threadIdx.y; 
+    unsigned int idx_pre  = ith_pillar * max_points_pre_pillar * num_point_feature \
+                     + ith_point  * num_point_feature;
+    unsigned int idx_post = ith_pillar * max_points_pre_pillar * num_point_feature \
+                     + (ith_point + blockDim.x)  * num_point_feature;
+
+    temp[ith_point * blockDim.y + axis] = 0.0;
+    unsigned int num_points_at_this_pillar = dev_num_points_per_pillar[ith_pillar];
+
+    // if (ith_point < num_points_at_this_pillar / 2) {
+      temp[ith_point * blockDim.y + axis] = dev_pillar_point_feature[idx_pre  + axis] 
+                                          + dev_pillar_point_feature[idx_post + axis];
+    // }
+    __syncthreads();
+
+    // do reduction in shared mem
+    // Sequential addressing. This solves the bank conflicts as
+    // the threads now access shared memory with a stride of one
+    // 32-bit word (unsigned int) now, which does not cause bank 
+    // conflicts
+    warpReduce(temp , ith_point , axis);
+
+    // write result for this block to global mem
+    if (ith_point == 0)
+    dev_points_mean[ith_pillar * blockDim.y + axis] = temp[ith_point * blockDim.y + axis] / num_points_at_this_pillar ;
+}
+
+
+__global__ void gather_point_feature_kernel(
+  const int max_num_pillars_,const int max_num_points_per_pillar,const int num_point_feature,
+  const float min_x_range, const float min_y_range, const float min_z_range, 
+  const float pillar_x_size,  const float pillar_y_size, const float pillar_z_size,
+  const float* dev_pillar_point_feature, const float* dev_num_points_per_pillar, 
+  const float* dev_pillar_coors,
+  float* dev_points_mean, 
+  float* dev_pfe_gather_feature_){
+
+  int ith_pillar = blockIdx.x; 
+  int ith_point = threadIdx.x;
+  // int kNumPointFeature = 5;
+  int num_gather_feature = 11;
+  int num_points_at_this_pillar = dev_num_points_per_pillar[ith_pillar];
+
+  if (ith_point >= num_points_at_this_pillar){
+        return;
+    }
+
+
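+    // PFE input layout per point, matching kNumGatherPointFeature == 11:
+    //   [0..4]  raw point features (x, y, z, intensity, timestamp for nuScenes)
+    //   [5..7]  f_cluster: offset from the pillar's xyz mean
+    //   [8..10] f_center: offset from the pillar's geometric center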
+    dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 0] 
+    =  dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 0]; 
+  
+    dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 1]  
+    =  dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 1];
+  
+    dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 2]  
+    =  dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 2];
+  
+    dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 3]  
+    =  dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 3];
+
+    dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 4]  
+    =  dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 4];
+  
+    // dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 4]  =  0.0f;
+    //   f_cluster = voxel_features[:, :, :3] - points_mean
+    dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 5]  
+    =  dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 0] - dev_points_mean[ith_pillar * 3 + 0 ];
+
+    dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 6] 
+    =  dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 1] - dev_points_mean[ith_pillar * 3 + 1 ];
+  
+    dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 7]  
+    =  dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 2] - dev_points_mean[ith_pillar * 3 + 2 ];
+
+    // f_center[:, :, 0] = voxel_features[:, :, 0] - (coords[:, 3].to(voxel_features.dtype).unsqueeze(1) * self.voxel_x + self.x_offset)
+    dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 8]  
+    =  dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 0] - (dev_pillar_coors[ith_pillar * 4 + 3] * pillar_x_size + (pillar_x_size/2 + min_x_range));
+  
+    dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 9]  
+    =  dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 1] - (dev_pillar_coors[ith_pillar * 4 + 2] * pillar_y_size + (pillar_y_size/2 + min_y_range));
+  
+    dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 10] 
+    =  dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 2] - (dev_pillar_coors[ith_pillar * 4 + 1] * pillar_z_size + (pillar_z_size/2 + min_z_range));
+
+}
+
+
+
+
+PreprocessPointsCuda::PreprocessPointsCuda(
+    const int num_threads, const int max_num_pillars,
+    const int max_points_per_pillar, const int num_point_feature,
+    const int num_inds_for_scan, const int grid_x_size, const int grid_y_size,
+    const int grid_z_size, const float pillar_x_size, const float pillar_y_size,
+    const float pillar_z_size, const float min_x_range, const float min_y_range,
+    const float min_z_range)
+    : num_threads_(num_threads),
+      max_num_pillars_(max_num_pillars),
+      max_num_points_per_pillar_(max_points_per_pillar),
+      num_point_feature_(num_point_feature),
+      num_inds_for_scan_(num_inds_for_scan),
+      grid_x_size_(grid_x_size),
+      grid_y_size_(grid_y_size),
+      grid_z_size_(grid_z_size),
+      pillar_x_size_(pillar_x_size),
+      pillar_y_size_(pillar_y_size),
+      pillar_z_size_(pillar_z_size),
+      min_x_range_(min_x_range),
+      min_y_range_(min_y_range),
+      min_z_range_(min_z_range) {
+    
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_pillar_point_feature_in_coors_),
+        grid_y_size_ * grid_x_size_ * max_num_points_per_pillar_ *  num_point_feature_ * sizeof(float)));
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_pillar_count_histo_),
+        grid_y_size_ * grid_x_size_ * sizeof(int)));
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_counter_), sizeof(int)));
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_pillar_count_), sizeof(int)));    
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_points_mean_), max_num_pillars_ * 3 *sizeof(float)));  
+    }
+
+PreprocessPointsCuda::~PreprocessPointsCuda() {
+    GPU_CHECK(cudaFree(dev_pillar_point_feature_in_coors_));
+    GPU_CHECK(cudaFree(dev_pillar_count_histo_));
+    GPU_CHECK(cudaFree(dev_counter_));
+    GPU_CHECK(cudaFree(dev_pillar_count_));
+    GPU_CHECK(cudaFree(dev_points_mean_));
+  }
+
+
+void PreprocessPointsCuda::DoPreprocessPointsCuda(
+    const float* dev_points, const int in_num_points, 
+    int* dev_x_coors,int* dev_y_coors, 
+    float* dev_num_points_per_pillar,
+    float* dev_pillar_point_feature, float* dev_pillar_coors,
+    int* dev_sparse_pillar_map, int* host_pillar_count , float* dev_pfe_gather_feature) {
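+    // Pipeline: histogram points into BEV cells -> assign pillar indices and
+    // coords -> gather per-pillar point features -> per-pillar xyz mean ->
+    // build the 11-channel PFE input tensor.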
+    // reset the device buffers
+    GPU_CHECK(cudaMemset(dev_pillar_point_feature_in_coors_, 0 , grid_y_size_ * grid_x_size_ * max_num_points_per_pillar_ *  num_point_feature_ * sizeof(float)));
+    GPU_CHECK(cudaMemset(dev_pillar_count_histo_, 0 , grid_y_size_ * grid_x_size_ * sizeof(int)));
+    GPU_CHECK(cudaMemset(dev_counter_, 0, sizeof(int)));
+    GPU_CHECK(cudaMemset(dev_pillar_count_, 0, sizeof(int)));
+    GPU_CHECK(cudaMemset(dev_points_mean_, 0,  max_num_pillars_ * 3 * sizeof(float)));
+    int num_block = DIVUP(in_num_points , num_threads_);
+    make_pillar_histo_kernel<<<num_block , num_threads_>>>(
+        dev_points, dev_pillar_point_feature_in_coors_, dev_pillar_count_histo_,
+        in_num_points, max_num_points_per_pillar_, grid_x_size_, grid_y_size_,
+        grid_z_size_, min_x_range_, min_y_range_, min_z_range_, pillar_x_size_,
+        pillar_y_size_, pillar_z_size_, num_point_feature_);
+    
+    make_pillar_index_kernel<<<grid_x_size_, grid_y_size_>>>(
+        dev_pillar_count_histo_, dev_counter_, dev_pillar_count_, dev_x_coors,
+        dev_y_coors, dev_num_points_per_pillar, dev_sparse_pillar_map,
+        max_num_pillars_, max_num_points_per_pillar_, grid_x_size_,
+        num_inds_for_scan_);  
+
+    GPU_CHECK(cudaMemcpy(host_pillar_count, dev_pillar_count_, 1 * sizeof(int),
+        cudaMemcpyDeviceToHost));
+    make_pillar_feature_kernel<<<host_pillar_count[0],max_num_points_per_pillar_>>>(
+        dev_pillar_point_feature_in_coors_, dev_pillar_point_feature,
+        dev_pillar_coors, dev_x_coors, dev_y_coors, dev_num_points_per_pillar,
+        max_num_points_per_pillar_, num_point_feature_, grid_x_size_);
+    
+
+    dim3 mean_block(max_num_points_per_pillar_,3); //(32,3)
+
+    pillar_mean_kernel<<<host_pillar_count[0],mean_block,64 * 3 *sizeof(float)>>>(
+      dev_points_mean_  ,num_point_feature_, dev_pillar_point_feature, dev_num_points_per_pillar, 
+        max_num_pillars_ , max_num_points_per_pillar_);
+
+    // dim3 mean_block(10,3); // Unrolling the Last Warp
+    // make_pillar_mean_kernel<<<host_pillar_count[0], mean_block , 32 * 3 *sizeof(float)>>>(
+    //       dev_points_mean_  ,num_point_feature_, dev_pillar_point_feature, dev_num_points_per_pillar, 
+    //       max_num_pillars_ , max_num_points_per_pillar_);
+
+    gather_point_feature_kernel<<<max_num_pillars_, max_num_points_per_pillar_>>>(
+      max_num_pillars_,max_num_points_per_pillar_,num_point_feature_,
+      min_x_range_, min_y_range_, min_z_range_,
+      pillar_x_size_, pillar_y_size_, pillar_z_size_, 
+      dev_pillar_point_feature, dev_num_points_per_pillar, dev_pillar_coors,
+      dev_points_mean_,
+      dev_pfe_gather_feature);
+
+    // DEVICE_SAVE<float>(dev_pillar_point_feature , \
+    //     max_num_pillars_ * max_num_points_per_pillar_ * num_point_feature_ , "dev_pillar_point_feature");
+    // DEVICE_SAVE<float>(dev_num_points_per_pillar , \
+    //   max_num_pillars_ , "dev_num_points_per_pillar");
+    // DEVICE_SAVE<float>(dev_pfe_gather_feature , \
+    //   max_num_pillars_ * 11, "dev_pfe_gather_feature");
+}
+
+

+ 138 - 0
src/detection/detection_lidar_PointPillars_MultiHead/preprocess.h

@@ -0,0 +1,138 @@
+/******************************************************************************
+ * Copyright 2020 The Apollo Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *****************************************************************************/
+
+/*
+ * Copyright 2018-2019 Autoware Foundation. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file preprocess.h
+ * @brief GPU version of preprocess points
+ * @author Kosuke Murakami
+ * @date 2019/02/26
+ */
+
+/**
+* @author Yan haixu
+* Contact: just github.com/hova88
+* @date 2021/04/30
+*/
+
+
+#pragma once
+
+
+class PreprocessPointsCuda {
+ private:
+    // initializer list
+    const int num_threads_;
+    const int max_num_pillars_;
+    const int max_num_points_per_pillar_;
+    const int num_point_feature_;
+    const int num_inds_for_scan_;
+    const int grid_x_size_;
+    const int grid_y_size_;
+    const int grid_z_size_;
+    const float pillar_x_size_;
+    const float pillar_y_size_;
+    const float pillar_z_size_;
+    const float min_x_range_;
+    const float min_y_range_;
+    const float min_z_range_;
+    // end initializer list
+
+    float* dev_pillar_point_feature_in_coors_;
+    int* dev_pillar_count_histo_;
+
+    int* dev_counter_;
+    int* dev_pillar_count_;
+    float* dev_points_mean_;
+
+
+
+ public:
+  /**
+   * @brief Constructor
+   * @param[in] num_threads Number of threads used when launching the CUDA kernels
+   * @param[in] num_point_feature Number of features per point
+   * @param[in] num_inds_for_scan Number of indexes for the scan (cumsum)
+   * 
+   * @param[in] max_num_pillars Maximum number of pillars
+   * @param[in] max_points_per_pillar Maximum number of points per pillar
+
+   * @param[in] grid_x_size Number of pillars in x-coordinate
+   * @param[in] grid_y_size Number of pillars in y-coordinate
+   * @param[in] grid_z_size Number of pillars in z-coordinate
+   * 
+   * @param[in] pillar_x_size Size of x-dimension for a pillar
+   * @param[in] pillar_y_size Size of y-dimension for a pillar
+   * @param[in] pillar_z_size Size of z-dimension for a pillar
+   * 
+   * @param[in] min_x_range Minimum x value for point cloud
+   * @param[in] min_y_range Minimum y value for point cloud
+   * @param[in] min_z_range Minimum z value for point cloud
+   * @details These configuration values are fixed at construction and never change
+   */
+  PreprocessPointsCuda(const int num_threads, const int num_point_feature, const int num_inds_for_scan,
+                       const int max_num_pillars, const int max_points_per_pillar,  
+                       const int grid_x_size, const int grid_y_size, const int grid_z_size,  // grid size
+                       const float pillar_x_size, const float pillar_y_size, const float pillar_z_size, //voxel size
+                       const float min_x_range, const float min_y_range, const float min_z_range); // point cloud range
+  ~PreprocessPointsCuda();
+
+  /**
+   * @brief CUDA preprocessing for input point cloud
+   * @param[in] dev_points Point cloud array
+   * @param[in] in_num_points The number of points
+   * @param[mid] dev_x_coors X-coordinate indexes for corresponding pillars
+   * @param[mid] dev_y_coors Y-coordinate indexes for corresponding pillars
+   * @param[mid] dev_num_points_per_pillar
+   *   Number of points in corresponding pillars
+   * @param[mid] dev_pillar_point_feature
+   *   Values of the point features in each pillar
+   * @param[mid] dev_pillar_coors Array of coordinates for the pillars
+   * @param[mid] dev_points_mean Array used to calculate the point centers
+   * @param[out] dev_sparse_pillar_map
+   *   Grid map representation for pillar-occupancy
+   * @param[out] host_pillar_count
+   *   The number of valid pillars for an input point cloud
+   * @param[out] dev_pfe_gather_feature
+   *   11-dimensional feature for the PFE input channel
+   * @details Convert point cloud to pillar representation
+   */
+  void DoPreprocessPointsCuda(const float* dev_points,
+                              const int in_num_points,
+                              int* dev_x_coors,
+                              int* dev_y_coors,
+                              float* dev_num_points_per_pillar,
+                              float* dev_pillar_point_feature,
+                              float* dev_pillar_coors,
+                              int* dev_sparse_pillar_map,
+                              int* host_pillar_count,
+                              float* dev_pfe_gather_feature);
+};
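Editor's note: a host-side wiring sketch for this class, to make the buffer contract concrete. The constants and buffer sizes below are illustrative assumptions (a nuScenes-style configuration), not values read from this commit; real callers also zero the intermediate device buffers each frame, which is omitted here.

// Illustrative only; all constants and sizes are assumptions for this sketch.
#include <cuda_runtime.h>
#include "preprocess.h"

void run_preprocess_sketch(const float* dev_points, int in_num_points) {
  const int kNumThreads = 64, kNumPointFeature = 5, kNumIndsForScan = 1024;
  const int kMaxPillars = 30000, kMaxPointsPerPillar = 32;
  const int kGridX = 512, kGridY = 512, kGridZ = 1;

  PreprocessPointsCuda pre(kNumThreads, kNumPointFeature, kNumIndsForScan,
                           kMaxPillars, kMaxPointsPerPillar,
                           kGridX, kGridY, kGridZ,
                           0.2f, 0.2f, 8.0f,        // pillar size (m), assumed
                           -51.2f, -51.2f, -5.0f);  // range minimum, assumed

  int *dev_x_coors, *dev_y_coors, *dev_sparse_pillar_map;
  float *dev_num_points_per_pillar, *dev_pillar_point_feature;
  float *dev_pillar_coors, *dev_pfe_gather_feature;
  int host_pillar_count[1] = {0};
  cudaMalloc(&dev_x_coors, kMaxPillars * sizeof(int));
  cudaMalloc(&dev_y_coors, kMaxPillars * sizeof(int));
  cudaMalloc(&dev_num_points_per_pillar, kMaxPillars * sizeof(float));
  cudaMalloc(&dev_pillar_point_feature,
             kMaxPillars * kMaxPointsPerPillar * kNumPointFeature * sizeof(float));
  cudaMalloc(&dev_pillar_coors, kMaxPillars * 4 * sizeof(float));
  cudaMalloc(&dev_sparse_pillar_map, kNumIndsForScan * kNumIndsForScan * sizeof(int));
  cudaMalloc(&dev_pfe_gather_feature,
             kMaxPillars * kMaxPointsPerPillar * 11 * sizeof(float));

  pre.DoPreprocessPointsCuda(dev_points, in_num_points, dev_x_coors, dev_y_coors,
                             dev_num_points_per_pillar, dev_pillar_point_feature,
                             dev_pillar_coors, dev_sparse_pillar_map,
                             host_pillar_count, dev_pfe_gather_feature);
}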

+ 73 - 0
src/detection/detection_lidar_PointPillars_MultiHead/scatter.cu

@@ -0,0 +1,73 @@
+/******************************************************************************
+ * Copyright 2020 The Apollo Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *****************************************************************************/
+
+/*
+ * Copyright 2018-2019 Autoware Foundation. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @author Kosuke Murakami
+ * @date 2019/02/26
+ */
+
+/**
+* @author Yan haixu
+* Contact: just github.com/hova88
+* @date 2021/04/30
+*/
+
+
+
+//headers in local files
+#include "scatter.h"
+
+__global__ void scatter_kernel(int *x_coors, int *y_coors, float *pfe_output,
+                               float *scattered_feature, const int grid_x_size,
+                               const int grid_y_size) {
+  // One block per pillar, one thread per feature channel (PFE width is 64).
+  int i_pillar = blockIdx.x;
+  int i_feature = threadIdx.x;
+  int x_ind = x_coors[i_pillar];
+  int y_ind = y_coors[i_pillar];
+  // Write the pillar's feature into the dense (C, H, W) BEV tensor at (x, y).
+  float feature = pfe_output[i_pillar * 64 + i_feature];
+  scattered_feature[i_feature * grid_y_size * grid_x_size +
+                    y_ind * grid_x_size + x_ind] = feature;
+}
+
+ScatterCuda::ScatterCuda(const int num_threads, const int grid_x_size,
+                         const int grid_y_size)
+    : num_threads_(num_threads),
+      grid_x_size_(grid_x_size),
+      grid_y_size_(grid_y_size) {}
+
+void ScatterCuda::DoScatterCuda(const int pillar_count, int *x_coors,
+                                int *y_coors, float *pfe_output,
+                                float *scattered_feature) {
+  scatter_kernel<<<pillar_count, num_threads_>>>(x_coors, y_coors, pfe_output,
+                                                 scattered_feature,
+                                                 grid_x_size_, grid_y_size_);
+}
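Editor's note: scatter_kernel indexes pfe_output with a hard-coded stride of 64, so the num_threads value handed to the constructor must also be 64 (the PFE output width) for the launch configuration and the indexing to agree. A hedged usage sketch, with illustrative buffer names that are not part of this source:

// Illustrative only; buffer names and grid sizes are assumptions.
const int kNumFeatures = 64;   // must match the hard-coded PFE width in the kernel
const int kGridX = 512, kGridY = 512;
ScatterCuda scatter(kNumFeatures, kGridX, kGridY);

float* dev_scattered_feature;  // dense (C, H, W) BEV tensor, zeroed each frame
cudaMalloc(&dev_scattered_feature, kNumFeatures * kGridY * kGridX * sizeof(float));
cudaMemset(dev_scattered_feature, 0, kNumFeatures * kGridY * kGridX * sizeof(float));

// dev_pfe_output holds (pillar_count, 64) features; dev_x_coors / dev_y_coors
// are the pillar grid indices produced by the preprocessing stage above.
scatter.DoScatterCuda(host_pillar_count[0], dev_x_coors, dev_y_coors,
                      dev_pfe_output, dev_scattered_feature);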

+ 77 - 0
src/detection/detection_lidar_PointPillars_MultiHead/scatter.h

@@ -0,0 +1,77 @@
+/******************************************************************************
+ * Copyright 2020 The Apollo Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *****************************************************************************/
+
+/*
+ * Copyright 2018-2019 Autoware Foundation. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @author Kosuke Murakami
+ * @date 2019/02/26
+ */
+
+/**
+* @author Yan haixu
+* Contact: just github.com/hova88
+* @date 2021/04/30
+*/
+
+
+#pragma once
+
+class ScatterCuda {
+ private:
+  const int num_threads_;
+  const int grid_x_size_;
+  const int grid_y_size_;
+
+ public:
+  /**
+   * @brief Constructor
+   * @param[in] num_threads The number of threads used to launch the CUDA kernel
+   * @param[in] grid_x_size Number of pillars in x-coordinate
+   * @param[in] grid_y_size Number of pillars in y-coordinate
+   * @details These configuration values are fixed at construction and never change
+   */
+  ScatterCuda(const int num_threads, const int grid_x_size,
+              const int grid_y_size);
+
+  /**
+   * @brief Call scatter cuda kernel
+   * @param[in] pillar_count The valid number of pillars
+   * @param[in] x_coors X-coordinate indexes for corresponding pillars
+   * @param[in] y_coors Y-coordinate indexes for corresponding pillars
+   * @param[in] pfe_output Output from the Pillar Feature Extractor (PFE)
+   * @param[out] scattered_feature Grid-map representation of the pillar features
+   * @details Scatters each pillar's feature vector into the grid map at its
+   * (x, y) index
+   */
+  void DoScatterCuda(const int pillar_count, int* x_coors, int* y_coors,
+                     float* pfe_output, float* scattered_feature);
+};
+

+ 1 - 2
src/detection/detection_lidar_point_pillars_onnx/detection_lidar_point_pillars_onnx.pro

@@ -132,10 +132,9 @@ else {
 
 }
 
-
 LIBS += -lcudart -lcufft
 
-LIBS += -L/home/adc/soft/cudnn-10.2-linux-x64-v7.6.5.32/cuda/lib64 -lcudnn
+#LIBS += -L/home/adc/soft/cudnn-10.2-linux-x64-v7.6.5.32/cuda/lib64 -lcudnn
 
 INCLUDEPATH += /home/adc/soft/TensorRT-7.0.0.11.Ubuntu-18.04.x86_64-gnu.cuda-10.2.cudnn7.6/TensorRT-7.0.0.11/include