Browse Source

Add detection_lidar_pointpillars_cuda. Not yet tested successfully; needs more work.

yuchuli 2 years ago
parent
commit
7b2612868c

+ 73 - 0
src/detection/detection_lidar_pointpillars_cuda/.gitignore

@@ -0,0 +1,73 @@
# This file is used to ignore files which are generated
# ----------------------------------------------------------------------------

*~
*.autosave
*.a
*.core
*.moc
*.o
*.obj
*.orig
*.rej
*.so
*.so.*
*_pch.h.cpp
*_resource.rc
*.qm
.#*
*.*#
# ignore core-dump files, but keep any "core/" source directory
core
!core/
tags
.DS_Store
.directory
*.debug
Makefile*
*.prl
*.app
moc_*.cpp
ui_*.h
qrc_*.cpp
Thumbs.db
*.res
*.rc
/.qmake.cache
/.qmake.stash

# qtcreator generated files
*.pro.user*

# xemacs temporary files
*.flc

# Vim temporary files
.*.swp

# Visual Studio generated files
*.ib_pdb_index
*.idb
*.ilk
*.pdb
*.sln
*.suo
*.vcproj
*vcproj.*.*.user
*.ncb
*.sdf
*.opensdf
*.vcxproj
*vcxproj.*

# MinGW generated files
*.Debug
*.Release

# Python byte code
*.pyc

# Binaries
# --------
*.dll
*.exe
+

+ 152 - 0
src/detection/detection_lidar_pointpillars_cuda/detection_lidar_pointpillars_cuda.pro

@@ -0,0 +1,152 @@
QT -= gui

CONFIG += c++14 #console
CONFIG -= app_bundle

# The following define makes your compiler emit warnings if you use
# any Qt feature that has been marked deprecated (the exact warnings
# depend on your compiler). Please consult the documentation of the
# deprecated API in order to know how to port your code away from it.
DEFINES += QT_DEPRECATED_WARNINGS

# You can also make your code fail to compile if it uses deprecated APIs.
# In order to do so, uncomment the following line.
# You can also select to disable deprecated APIs only up to a certain version of Qt.
#DEFINES += QT_DISABLE_DEPRECATED_BEFORE=0x060000    # disables all the APIs deprecated before Qt 6.0.0

SOURCES += \
        main.cpp \
        src/pillarScatter.cpp \
        src/pointpillar.cpp \
        src/postprocess.cpp \
        src/preprocess.cpp

# Default rules for deployment.
qnx: target.path = /tmp/$${TARGET}/bin
else: unix:!android: target.path = /opt/$${TARGET}/bin
!isEmpty(target.path): INSTALLS += target

# .cu files appear in DISTFILES only so Qt Creator lists them; they are
# compiled by the nvcc extra-compiler rules below (see CUDA_SOURCES).
DISTFILES += \
    src/pillarScatterKernels.cu \
    src/postprocess_kernels.cu \
    src/preprocess_kernels.cu

HEADERS += \
    include/kernel.h \
    include/params.h \
    include/pillarScatter.h \
    include/pointpillar.h \
    include/postprocess.h \
    include/preprocess.h


INCLUDEPATH += $$PWD/include

# Several PCL versions are listed; only the directory that actually exists
# on the build host takes effect.
INCLUDEPATH += /usr/include/pcl-1.7
INCLUDEPATH += /usr/include/pcl-1.8
INCLUDEPATH += /usr/include/pcl-1.10
INCLUDEPATH += /usr/include/eigen3


INCLUDEPATH += /usr/local/cuda/targets/aarch64-linux/include

LIBS += -L/usr/local/cuda/targets/aarch64-linux/lib  # -lcublas



CUDA_SOURCES += \
    src/pillarScatterKernels.cu \
    src/postprocess_kernels.cu \
    src/preprocess_kernels.cu

LIBS += -L"/usr/local/lib" \
        -L"/usr/local/cuda/lib64" \
        -lcudart \
        -lcufft

CUDA_SDK = "/usr/local/cuda/"   # path to the CUDA SDK

CUDA_DIR = "/usr/local/cuda/"            # path to the CUDA toolkit

SYSTEM_NAME = linux         # host system environment: 'Win32', 'x64', or 'Win64'

SYSTEM_TYPE = 64           # OS word size: '32' or '64'

CUDA_ARCH = sm_72 #sm_87 sm_62 # xavier sm_72          # CUDA architecture, for example 'compute_10', 'compute_11', 'sm_10'

NVCC_OPTIONS = --use_fast_math --compiler-options "-fPIC"

# include paths

INCLUDEPATH += $$CUDA_DIR/include
#INCLUDEPATH += /usr/local/cuda-10.0/targets/aarch64-linux/include/crt

# library directories

QMAKE_LIBDIR += $$CUDA_DIR/lib/

CUDA_OBJECTS_DIR = ./

# The following library conflicts with something in Cuda

#QMAKE_LFLAGS_RELEASE = /NODEFAULTLIB:msvcrt.lib

#QMAKE_LFLAGS_DEBUG   = /NODEFAULTLIB:msvcrtd.lib

# Add the necessary libraries

CUDA_LIBS =  cudart cufft

# The following makes sure all path names (which often include spaces) are put between quotation marks

CUDA_INC = $$join(INCLUDEPATH,'" -I"','-I"','"')

NVCC_LIBS = $$join(CUDA_LIBS,' -l','-l', '')

#LIBS += $$join(CUDA_LIBS,'.so ', '', '.so')

# Configuration of the Cuda compiler

CONFIG(debug, debug|release) {

    # Debug mode

    cuda_d.input = CUDA_SOURCES

    cuda_d.output = $$CUDA_OBJECTS_DIR/${QMAKE_FILE_BASE}_cuda.o

    cuda_d.commands = $$CUDA_DIR/bin/nvcc -D_DEBUG $$NVCC_OPTIONS $$CUDA_INC $$NVCC_LIBS --machine $$SYSTEM_TYPE -arch=$$CUDA_ARCH -c -o ${QMAKE_FILE_OUT} ${QMAKE_FILE_NAME}

    cuda_d.dependency_type = TYPE_C

    QMAKE_EXTRA_COMPILERS += cuda_d

}

else {

    # Release mode

    cuda.input = CUDA_SOURCES

    cuda.output = $$CUDA_OBJECTS_DIR/${QMAKE_FILE_BASE}_cuda.o

    cuda.commands = $$CUDA_DIR/bin/nvcc $$NVCC_OPTIONS $$CUDA_INC $$NVCC_LIBS --machine $$SYSTEM_TYPE -arch=$$CUDA_ARCH -O3 -c -o ${QMAKE_FILE_OUT} ${QMAKE_FILE_NAME}

    cuda.dependency_type = TYPE_C

    QMAKE_EXTRA_COMPILERS += cuda

}


LIBS += -lnvinfer -lnvonnxparser

!include(../../../include/common.pri ) {
    error( "Couldn't find the common.pri file!" )
}

!include(../../../include/ivprotobuf.pri ) {
    error( "Couldn't find the ivprotobuf.pri file!" )
}
+

+ 122 - 0
src/detection/detection_lidar_pointpillars_cuda/include/kernel.h

@@ -0,0 +1,122 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _KERNEL_H_
+#define _KERNEL_H_
+
+#include <iostream>
+#include <cuda.h>
+#include <cuda_fp16.h>
+#include <cuda_runtime_api.h>
+#include "params.h"
+
+const int THREADS_FOR_VOXEL = 256;    // threads number for a block
+const int POINTS_PER_VOXEL = 32;      // depands on "params.h"
+const int WARP_SIZE = 32;             // one warp(32 threads) for one pillar
+const int WARPS_PER_BLOCK = 4;        // four warp for one block
+const int FEATURES_SIZE = 10;         // features maps number depands on "params.h"
+const int PILLARS_PER_BLOCK = 64;     // one thread deals with one pillar and a block has PILLARS_PER_BLOCK threads
+const int PILLAR_FEATURE_SIZE = 64;   // feature count for one pillar depands on "params.h"
+
+typedef enum
+{
+    STATUS_SUCCESS = 0,
+    STATUS_FAILURE = 1,
+    STATUS_BAD_PARAM = 2,
+    STATUS_NOT_SUPPORTED = 3,
+    STATUS_NOT_INITIALIZED = 4
+} pluginStatus_t;
+
+#define checkCudaErrors(status)                                   \
+{                                                                 \
+  if (status != 0)                                                \
+  {                                                               \
+    std::cout << "Cuda failure: " << cudaGetErrorString(status)   \
+              << " at line " << __LINE__                          \
+              << " in file " << __FILE__                          \
+              << " error status: " << status                      \
+              << std::endl;                                       \
+              abort();                                            \
+    }                                                             \
+}
+
+cudaError_t generateVoxels_random_launch(float *points, size_t points_size,
+        float min_x_range, float max_x_range,
+        float min_y_range, float max_y_range,
+        float min_z_range, float max_z_range,
+        float pillar_x_size, float pillar_y_size, float pillar_z_size,
+        int grid_y_size, int grid_x_size,
+        unsigned int *mask, float *voxels,
+        cudaStream_t stream = 0);
+
+cudaError_t generateBaseFeatures_launch(unsigned int *mask, float *voxels,
+        int grid_y_size, int grid_x_size,
+        unsigned int *pillar_num,
+        float *voxel_features,
+        unsigned int *voxel_num,
+        unsigned int *voxel_idxs,
+        cudaStream_t stream = 0);
+
+cudaError_t generateFeatures_launch(float* voxel_features,
+    unsigned int *voxel_num,
+    unsigned int *voxel_idxs,
+    unsigned int *params,
+    float voxel_x, float voxel_y, float voxel_z,
+    float range_min_x, float range_min_y, float range_min_z,
+    float* features,
+    cudaStream_t stream = 0);
+
+int pillarScatterHalfKernelLaunch(
+  int max_pillar_num,
+  int num_features,
+  const half *pillar_features_data,
+  const unsigned int *coords_data,
+  const unsigned int *params_data,
+  unsigned int featureX, unsigned int featureY,
+  half *spatial_feature_data,
+  cudaStream_t stream);
+
+int pillarScatterFloatKernelLaunch(
+  int max_pillar_num,
+  int num_features,
+  const float *pillar_features_data,
+  const unsigned int *coords_data,
+  const unsigned int *params_data,
+  unsigned int featureX, unsigned int featureY,
+  float *spatial_feature_data,
+  cudaStream_t stream);
+
+cudaError_t postprocess_launch(const float *cls_input,
+                      float *box_input,
+                      const float *dir_cls_input,
+                      float *anchors,
+                      float *anchor_bottom_heights,
+                      float *bndbox_output,
+                      int *object_counter,
+                      const float min_x_range,
+                      const float max_x_range,
+                      const float min_y_range,
+                      const float max_y_range,
+                      const int feature_x_size,
+                      const int feature_y_size,
+                      const int num_anchors,
+                      const int num_classes,
+                      const int num_box_values,
+                      const float score_thresh,
+                      const float dir_offset,
+                      cudaStream_t stream = 0);
+#endif

+ 71 - 0
src/detection/detection_lidar_pointpillars_cuda/include/params.h

@@ -0,0 +1,71 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
#ifndef PARAMS_H_
#define PARAMS_H_
// Upper bound on the number of pillars/voxels produced by preprocessing.
const int MAX_VOXELS = 40000;
// Static configuration of the PointPillars pipeline: detection range,
// pillar geometry, anchor set, and post-processing thresholds.
class Params
{
  public:
    static const int num_classes = 3;
    const char *class_name [num_classes] = { "Car","Pedestrian","Cyclist",};
    // detection range in meters
    const float min_x_range = -80;
    const float max_x_range = 80;
    const float min_y_range = -80;
    const float max_y_range = 80;
    const float min_z_range = -3.0;
    const float max_z_range = 1.0;
    // the size of a pillar
    const float pillar_x_size = 0.4;
    const float pillar_y_size = 0.4;
    const float pillar_z_size = 4.0;
    const int max_num_points_per_pillar = 16;
    const int num_point_values = 4;
    // the number of feature maps for pillar scatter
    const int num_feature_scatter = 64;
    // direction-classifier offset (approximately pi/4)
    const float dir_offset = 0.78539;
    const float dir_limit_offset = 0.0;
    // the num of direction classes(bins)
    const int num_dir_bins = 2;
    // anchors decode by (x, y, z, dir)
    static const int num_anchors = num_classes * 2;
    static const int len_per_anchor = 4;
    // One row of 4 values per anchor, two yaw orientations per class.
    // Rows appear to be {dx, dy, dz, yaw} -- TODO confirm against the
    // training configuration.
    const float anchors[num_anchors * len_per_anchor] = {
      3.9,1.6,1.56,0.0,
      3.9,1.6,1.56,1.57,
      0.8,0.6,1.73,0.0,
      0.8,0.6,1.73,1.57,
      1.76,0.6,1.73,0.0,
      1.76,0.6,1.73,1.57,
      };
    const float anchor_bottom_heights[num_classes] = {-1.78,-0.6,-0.6,};
    // the score threshold for classification
    const float score_thresh = 0.1;
    const float nms_thresh = 0.01;
    const int max_num_pillars = MAX_VOXELS;
    const int pillarPoints_bev = max_num_points_per_pillar * max_num_pillars;
    // the detected boxes result decode by (x, y, z, w, l, h, yaw)
    const int num_box_values = 7;
    // the input size of the 2D backbone network
    // (float division truncated to int; ranges must divide evenly by the
    // pillar size for these to be exact)
    const int grid_x_size = (max_x_range - min_x_range) / pillar_x_size;
    const int grid_y_size = (max_y_range - min_y_range) / pillar_y_size;
    const int grid_z_size = (max_z_range - min_z_range) / pillar_z_size;
    // the output size of the 2D backbone network
    const int feature_x_size = grid_x_size / 2;
    const int feature_y_size = grid_y_size / 2;
    Params() {};
};
#endif

+ 98 - 0
src/detection/detection_lidar_pointpillars_cuda/include/pillarScatter.h

@@ -0,0 +1,98 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
#ifndef _PILLAR_SCATTER_H_
#define _PILLAR_SCATTER_H_

#include <vector>
#include <string>
#include <cuda_runtime_api.h>
#include "NvInferPlugin.h"
#include "kernel.h"

namespace nvinfer1
{
namespace plugin
{

// TensorRT dynamic plugin that scatters per-pillar feature vectors onto the
// dense BEV grid (feature_y_size_ x feature_x_size_) consumed by the 2D
// backbone. Implements the IPluginV2DynamicExt interface.
class PPScatterPlugin : public nvinfer1::IPluginV2DynamicExt
{
public:
    PPScatterPlugin() = delete;
    // Deserialization constructor; `data` is a buffer written by serialize().
    PPScatterPlugin(const void* data, size_t length);
    // h / w: output BEV grid height and width.
    PPScatterPlugin(size_t h, size_t w);
    // IPluginV2DynamicExt Methods
    nvinfer1::IPluginV2DynamicExt* clone() const noexcept override;
    nvinfer1::DimsExprs getOutputDimensions(int outputIndex, 
        const nvinfer1::DimsExprs* inputs, int nbInputs,
        nvinfer1::IExprBuilder& exprBuilder) noexcept override;
    bool supportsFormatCombination(
        int pos, const nvinfer1::PluginTensorDesc* inOut, 
        int nbInputs, int nbOutputs) noexcept override;
    void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs,
        const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) noexcept override;
    size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs,
        const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const noexcept override;
    int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, 
        const nvinfer1::PluginTensorDesc* outputDesc,
        const void* const* inputs, void* const* outputs, 
        void* workspace, cudaStream_t stream) noexcept override;
    // IPluginV2Ext Methods
    nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, 
        int nbInputs) const noexcept override;
    // IPluginV2 Methods
    const char* getPluginType() const noexcept override;
    const char* getPluginVersion() const noexcept override;
    int getNbOutputs() const noexcept override;
    int initialize() noexcept override;
    void terminate() noexcept override;
    size_t getSerializationSize() const noexcept override;
    void serialize(void* buffer) const noexcept override;
    void destroy() noexcept override;
    void setPluginNamespace(const char* pluginNamespace) noexcept override;
    const char* getPluginNamespace() const noexcept override;

private:
    std::string mNamespace;
    // the y -- output size of the 2D backbone network
    size_t feature_y_size_;
    // the x -- output size of the 2D backbone network
    size_t feature_x_size_;
};

// Factory registered with TensorRT: creates and deserializes PPScatterPlugin
// instances by plugin name and version.
class PPScatterPluginCreator : public nvinfer1::IPluginCreator
{
public:
    PPScatterPluginCreator();
    const char* getPluginName() const noexcept override;
    const char* getPluginVersion() const noexcept override;
    const nvinfer1::PluginFieldCollection* getFieldNames() noexcept override;
    nvinfer1::IPluginV2* createPlugin(const char* name, const nvinfer1::PluginFieldCollection* fc) noexcept override;
    nvinfer1::IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) noexcept override;
    void setPluginNamespace(const char* pluginNamespace) noexcept override;
    const char* getPluginNamespace() const noexcept override;

private:
    static nvinfer1::PluginFieldCollection mFC;
    static std::vector<nvinfer1::PluginField> mPluginAttributes;
    std::string mNamespace;
};

} // namespace plugin
} // namespace nvinfer1

#endif

+ 105 - 0
src/detection/detection_lidar_pointpillars_cuda/include/pointpillar.h

@@ -0,0 +1,105 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <memory>
+
+#include "cuda_runtime.h"
+#include "NvInfer.h"
+#include "NvOnnxConfig.h"
+#include "NvOnnxParser.h"
+#include "NvInferRuntime.h"
+
+#include "postprocess.h"
+#include "preprocess.h"
+
+#define PERFORMANCE_LOG 1
+
+// Logger for TensorRT
+class Logger : public nvinfer1::ILogger {
+  public:
+    void log(Severity severity, const char* msg) noexcept override {
+        // suppress info-level message
+        //if (severity == Severity::kERROR || severity == Severity::kINTERNAL_ERROR || severity == Severity::kINFO ) {
+        if (severity == Severity::kERROR || severity == Severity::kINTERNAL_ERROR) {
+            std::cerr << "trt_infer: " << msg << std::endl;
+        }
+    }
+};
+
// Thin wrapper around a TensorRT engine: holds the engine and execution
// context built/loaded from `modelFile` (construction logic lives in
// pointpillar.cpp) and runs one inference over caller-provided bindings.
class TRT {
  private:
    Params params_;

    // events used for optional performance timing (see PERFORMANCE_LOG)
    cudaEvent_t start_, stop_;

    Logger gLogger_;
    nvinfer1::IExecutionContext *context_ = nullptr;
    nvinfer1::ICudaEngine *engine_ = nullptr;

    // CUDA stream all inference work is issued on
    cudaStream_t stream_ = 0;
  public:
    // modelFile: path to the model file; stream: CUDA stream used for
    // inference (legacy default stream when omitted).
    TRT(std::string modelFile, cudaStream_t stream = 0);
    ~TRT(void);

    // Run one inference; `buffers` is the engine's binding array
    // (input/output device pointers in binding order).
    int doinfer(void**buffers);
};
+
// End-to-end PointPillars detector: GPU preprocessing (voxelization and
// feature generation), TensorRT inference, and post-processing.
class PointPillar {
  private:
    Params params_;

    // events for timing one inference
    cudaEvent_t start_, stop_;
    cudaStream_t stream_;

    // pipeline stages
    std::shared_ptr<PreProcessCuda> pre_;
    std::shared_ptr<TRT> trt_;
    std::shared_ptr<PostProcessCuda> post_;

    //input of pre-process
    float *voxel_features_ = nullptr;
    unsigned int *voxel_num_ = nullptr;
    unsigned int *voxel_idxs_ = nullptr;
    unsigned int *pillar_num_ = nullptr;

    // buffer sizes in bytes for the pointers above
    unsigned int voxel_features_size_ = 0;
    unsigned int voxel_num_size_ = 0;
    unsigned int voxel_idxs_size_ = 0;

    //TRT-input
    float *features_input_ = nullptr;
    unsigned int *params_input_ = nullptr;
    unsigned int features_input_size_ = 0;

    //output of TRT -- input of post-process
    float *cls_output_ = nullptr;
    float *box_output_ = nullptr;
    float *dir_cls_output_ = nullptr;
    unsigned int cls_size_;
    unsigned int box_size_;
    unsigned int dir_cls_size_;

    //output of post-process
    float *bndbox_output_ = nullptr;
    unsigned int bndbox_size_ = 0;

    std::vector<Bndbox> res_;

  public:
    // modelFile: path to the model; stream: CUDA stream for the whole pipeline.
    PointPillar(std::string modelFile, cudaStream_t stream = 0);
    ~PointPillar(void);
    // points: device-accessible buffer of point_size points, 4 floats each
    // (x, y, z, intensity -- see the caller in main.cpp). Detected boxes are
    // appended to res.
    int doinfer(void*points, unsigned int point_size, std::vector<Bndbox> &res);
};
+

+ 58 - 0
src/detection/detection_lidar_pointpillars_cuda/include/postprocess.h

@@ -0,0 +1,58 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef POSTPROCESS_H_
+#define POSTPROCESS_H_
+
+#include <vector>
+#include "kernel.h"
+
/*
box_encodings: (B, N, 7 + C) or (N, 7 + C) [x, y, z, dx, dy, dz, heading or *[cos, sin], ...]
anchors: (B, N, 7 + C) or (N, 7 + C) [x, y, z, dx, dy, dz, heading, ...]
*/
// One detected 3D bounding box: center (x, y, z), size (w, l, h), heading
// angle rt, class id, and classification score.
struct Bndbox {
    float x = 0.f;
    float y = 0.f;
    float z = 0.f;
    float w = 0.f;
    float l = 0.f;
    float h = 0.f;
    float rt = 0.f;
    int id = -1;      // -1 marks "no class"; valid ids index Params::class_name
    float score = 0.f;
    // Default construction now yields a well-defined (zeroed, id = -1) box;
    // the original left every member indeterminate.
    Bndbox() {}
    Bndbox(float x_, float y_, float z_, float w_, float l_, float h_, float rt_, int id_, float score_)
        : x(x_), y(y_), z(z_), w(w_), l(l_), h(h_), rt(rt_), id(id_), score(score_) {}
};
+
// GPU post-processing: decodes the network's cls/box/dir outputs into
// candidate bounding boxes (NMS is done separately on the CPU, below).
class PostProcessCuda {
  private:
    Params params_;
    float *anchors_;                // anchor table (presumably a device copy of Params::anchors -- confirm in postprocess.cpp)
    float *anchor_bottom_heights_;  // per-class anchor bottom heights (same caveat)
    int *object_counter_;           // counter of boxes emitted by the decode kernel
    cudaStream_t stream_ = 0;
  public:
    PostProcessCuda(cudaStream_t stream = 0);
    ~PostProcessCuda();

    // Decode cls/box/dir tensors into bndbox_output; returns a status code.
    int doPostprocessCuda(const float *cls_input, float *box_input, const float *dir_cls_input, float *bndbox_output);
};

// CPU non-maximum suppression over decoded boxes; surviving boxes are
// appended to nms_pred.
int nms_cpu(std::vector<Bndbox> bndboxes, const float nms_thresh, std::vector<Bndbox> &nms_pred);

#endif

+ 47 - 0
src/detection/detection_lidar_pointpillars_cuda/include/preprocess.h

@@ -0,0 +1,47 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel.h"
+
// GPU preprocessing: converts a raw point cloud into the pillar/voxel
// feature tensors consumed by the TensorRT network.
// NOTE(review): unlike the sibling headers, this file has no include guard
// or #pragma once -- worth adding one.
class PreProcessCuda {
  private:
    Params params_;
    unsigned int *mask_;    // per-cell occupancy mask (presumably device memory -- confirm in preprocess.cpp)
    float *voxels_;         // scratch buffer of voxelized points
    int *voxelsList_;
    float *params_cuda_;
    cudaStream_t stream_ = 0;

  public:
    PreProcessCuda(cudaStream_t stream_ = 0);
    ~PreProcessCuda();

    //points cloud -> voxels (BEV) -> feature*4
    // Voxelize points (x,y,z,intensity) into at most MAX_VOXELS pillars;
    // outputs the pillar count, per-pillar features, per-pillar point
    // counts, and grid indices.
    int generateVoxels(float *points, size_t points_size,
        unsigned int *pillar_num,
        float *voxel_features,
        unsigned int *voxel_num,
        unsigned int *voxel_idxs);

    //feature*4 -> feature * 10
    // Augment the 4-value point features to the 10-value features
    // (FEATURES_SIZE in kernel.h).
    int generateFeatures(float* voxel_features,
          unsigned int *voxel_num,
          unsigned int* voxel_idxs,
          unsigned int *params,
          float* features);
};
+

+ 189 - 0
src/detection/detection_lidar_pointpillars_cuda/main.cpp

@@ -0,0 +1,189 @@
+#include <QCoreApplication>
+
+
+#include <iostream>
+#include <sstream>
+#include <fstream>
+#include <pcl/point_types.h>
+#include <pcl/point_cloud.h>
+
+#include "cuda_runtime.h"
+
+#include "./params.h"
+#include "./pointpillar.h"
+
+#include "modulecomm.h"
+
+std::string Model_File = "/home/nvidia/models/lidar/pointpillar.onnx";
+
+PointPillar * gpPointPillar;
+
+cudaEvent_t start, stop;
+float elapsedTime = 0.0f;
+cudaStream_t stream = NULL;
+std::vector<Bndbox> nms_pred;
+
+#define checkCudaErrors(status)                                   \
+{                                                                 \
+  if (status != 0)                                                \
+  {                                                               \
+    std::cout << "Cuda failure: " << cudaGetErrorString(status)   \
+              << " at line " << __LINE__                          \
+              << " in file " << __FILE__                          \
+              << " error status: " << status                      \
+              << std::endl;                                       \
+              abort();                                            \
+    }                                                             \
+}
+
+void Getinfo(void)
+{
+  cudaDeviceProp prop;
+
+  int count = 0;
+  cudaGetDeviceCount(&count);
+  printf("\nGPU has cuda devices: %d\n", count);
+  for (int i = 0; i < count; ++i) {
+    cudaGetDeviceProperties(&prop, i);
+    printf("----device id: %d info----\n", i);
+    printf("  GPU : %s \n", prop.name);
+    printf("  Capbility: %d.%d\n", prop.major, prop.minor);
+    printf("  Global memory: %luMB\n", prop.totalGlobalMem >> 20);
+    printf("  Const memory: %luKB\n", prop.totalConstMem  >> 10);
+    printf("  SM in a block: %luKB\n", prop.sharedMemPerBlock >> 10);
+    printf("  warp size: %d\n", prop.warpSize);
+    printf("  threads in a block: %d\n", prop.maxThreadsPerBlock);
+    printf("  block dim: (%d,%d,%d)\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
+    printf("  grid dim: (%d,%d,%d)\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
+  }
+  printf("\n");
+}
+
+
+//void DectectOneBin(std::shared_ptr<float> & pbin_ptr,const int points_size)
+void DectectOneBin(float * pdata,const int points_size)
+{
+    float *points_data = nullptr;
+    unsigned int points_data_size = points_size * 4 * sizeof(float);
+    checkCudaErrors(cudaMallocManaged((void **)&points_data, points_data_size));
+ //   checkCudaErrors(cudaMemcpy(points_data, pbin_ptr.get(), points_data_size, cudaMemcpyDefault));
+    checkCudaErrors(cudaMemcpy(points_data, pdata, points_data_size, cudaMemcpyDefault));
+    checkCudaErrors(cudaDeviceSynchronize());
+
+    cudaEventRecord(start, stream);
+
+    gpPointPillar->doinfer(points_data, points_size, nms_pred);
+    cudaEventRecord(stop, stream);
+    cudaEventSynchronize(stop);
+    cudaEventElapsedTime(&elapsedTime, start, stop);
+    std::cout<<"TIME: pointpillar: "<< elapsedTime <<" ms." <<std::endl;
+
+    checkCudaErrors(cudaFree(points_data));
+
+    std::cout<<"Bndbox objs: "<< nms_pred.size()<<std::endl;
+//    std::string save_file_name = Save_Dir + index_str + ".txt";
+//    SaveBoxPred(nms_pred, save_file_name);
+
+    for (const auto box : nms_pred) {
+
+        if(box.id != 0)continue;
+      std::cout << box.x << " ";
+      std::cout << box.y << " ";
+      std::cout << box.z << " ";
+      std::cout << box.w << " ";
+      std::cout << box.l << " ";
+      std::cout << box.h << " ";
+      std::cout << box.rt << " ";
+      std::cout << box.id << " ";
+      std::cout << box.score << " ";
+      std::cout << std::endl;
+    }
+
+    nms_pred.clear();
+
+    delete pdata;
+
+    std::cout << ">>>>>>>>>>>" <<std::endl;
+}
+
+void ListenPointCloud(const char * strdata,const unsigned int nSize,const unsigned int index,const QDateTime * dt,const char * strmemname)
+{
+    if(nSize <=16)return;
+    unsigned int * pHeadSize = (unsigned int *)strdata;
+    if(*pHeadSize > nSize)
+    {
+        std::cout<<"ListenPointCloud data is small headsize ="<<*pHeadSize<<"  data size is"<<nSize<<std::endl;
+    }
+
+
+    int nNameSize;
+    nNameSize = *pHeadSize - 4-4-8;
+    char * strName = new char[nNameSize+1];strName[nNameSize] = 0;
+    memcpy(strName,(char *)((char *)strdata +4),nNameSize);
+    int nPCount = (nSize - *pHeadSize)/sizeof(pcl::PointXYZI);
+    int i;
+    pcl::PointXYZI * p;
+    p = (pcl::PointXYZI *)((char *)strdata + *pHeadSize);
+
+    std::shared_ptr<float> pbin_ptr = std::shared_ptr<float>(new float[nPCount*4]);
+
+    float * pdata = new float[nPCount * 4];// pbin_ptr.get();
+
+    for(i=0;i<nPCount;i++)
+    {
+        pcl::PointXYZI xp;
+
+        memcpy(&xp,p,sizeof(pcl::PointXYZI));
+        xp.z = xp.z;
+
+        pdata[4*i + 0]= static_cast<float>(xp.x);
+        pdata[4*i + 1]= static_cast<float>(xp.y);
+        pdata[4*i + 2]= static_cast<float>(xp.z);
+        pdata[4*i + 3]= static_cast<float>(xp.intensity);
+
+//        pdata[4*i + 0]= 0;
+//        pdata[4*i + 1]= 0;
+//        pdata[4*i + 2]= 0;
+//        pdata[4*i + 3]= 0;
+
+        p++;
+
+    }
+
+    DectectOneBin(pdata,nPCount);
+ //   DectectOneBin(pbin_ptr,nPCount);
+
+
+
+}
+
+
+int main(int argc, char *argv[])
+{
+    QCoreApplication a(argc, argv);
+
+    Getinfo();
+
+
+
+    checkCudaErrors(cudaEventCreate(&start));
+    checkCudaErrors(cudaEventCreate(&stop));
+    checkCudaErrors(cudaStreamCreate(&stream));
+
+    Params params_;
+
+
+    nms_pred.reserve(100);
+
+    PointPillar pointpillar(Model_File, stream);
+
+    gpPointPillar = &pointpillar;
+
+    void * pa = iv::modulecomm::RegisterRecv("lidarpc_center",ListenPointCloud);
+
+    return a.exec();
+
+    checkCudaErrors(cudaEventDestroy(start));
+    checkCudaErrors(cudaEventDestroy(stop));
+    checkCudaErrors(cudaStreamDestroy(stream));
+}

+ 306 - 0
src/detection/detection_lidar_pointpillars_cuda/src/pillarScatter.cpp

@@ -0,0 +1,306 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <cassert>
+#include <cstring>
+#include "pillarScatter.h"
+
+using namespace nvinfer1;
+using nvinfer1::plugin::PPScatterPlugin;
+using nvinfer1::plugin::PPScatterPluginCreator;
+
+static const char* PLUGIN_VERSION{"1"};
+static const char* PLUGIN_NAME{"PPScatterPlugin"};
+
+// Static class fields initialization
+PluginFieldCollection PPScatterPluginCreator::mFC{};
+std::vector<PluginField> PPScatterPluginCreator::mPluginAttributes;
+
+// Helper function for serializing plugin
// Helper for serializing plugin state.
// Copies `val` into the byte buffer and advances `buffer` past it.
// BUGFIX: the original wrote through `reinterpret_cast<T*>(buffer)`, which is
// undefined behavior when the char buffer is not suitably aligned for T (and
// violates strict aliasing); std::memcpy is the well-defined equivalent.
template <typename T>
void writeToBuffer(char*& buffer, const T& val)
{
    std::memcpy(buffer, &val, sizeof(T));
    buffer += sizeof(T);
}
+
+// Helper function for deserializing plugin
// Helper for deserializing plugin state.
// Reads a T from the byte buffer and advances `buffer` past it.
// BUGFIX: the original read through `reinterpret_cast<const T*>(buffer)`,
// which is undefined behavior for misaligned buffers; std::memcpy is the
// well-defined equivalent and compiles to the same code on common targets.
template <typename T>
T readFromBuffer(const char*& buffer)
{
    T val;
    std::memcpy(&val, buffer, sizeof(T));
    buffer += sizeof(T);
    return val;
}
+
// Build-time constructor: the scattered feature-map height/width come from
// the ONNX "dense_shape" attribute via the plugin creator.
PPScatterPlugin::PPScatterPlugin(size_t h, size_t w)
  : feature_y_size_(h), feature_x_size_(w)
{
}

// Deserialization constructor: reads back the two size_t fields written by
// serialize(), in the same order (height first, then width).
PPScatterPlugin::PPScatterPlugin(const void* data, size_t length)
{
    const char* d = reinterpret_cast<const char*>(data);
    feature_y_size_ = readFromBuffer<size_t>(d);
    feature_x_size_ = readFromBuffer<size_t>(d);
}
+
// Returns an independent copy of the plugin carrying the same feature-map
// shape and plugin namespace; TensorRT takes ownership of the new object.
nvinfer1::IPluginV2DynamicExt* PPScatterPlugin::clone() const noexcept
{
    auto* plugin = new PPScatterPlugin(feature_y_size_, feature_x_size_);
    plugin->setPluginNamespace(mNamespace.c_str());
    return plugin;
}
+
// Output shape is [1, C, H, W]: batch 1, channel count taken from the pillar
// feature input (inputs[0].d[1]), spatial extent fixed to the configured
// feature map size.
nvinfer1::DimsExprs PPScatterPlugin::getOutputDimensions(
    int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, nvinfer1::IExprBuilder& exprBuilder) noexcept
{
    assert(outputIndex == 0);
    nvinfer1::DimsExprs output;
    output.nbDims = 4;
    output.d[0] = exprBuilder.constant(1);
    output.d[1] = inputs[0].d[1];
    output.d[2] = exprBuilder.constant(feature_y_size_);
    output.d[3] = exprBuilder.constant(feature_x_size_);
    return output;
}
+
+bool PPScatterPlugin::supportsFormatCombination(
+    int pos, const nvinfer1::PluginTensorDesc* inOut, int nbInputs, int nbOutputs) noexcept
+{
+    assert(nbInputs == 3);
+    assert(nbOutputs == 1);
+    const PluginTensorDesc& in = inOut[pos];
+    if (pos == 0)
+    {
+        return (in.type == nvinfer1::DataType::kFLOAT || in.type == nvinfer1::DataType::kHALF) && (in.format == TensorFormat::kLINEAR);
+    }
+    if (pos == 1)
+    {
+        return (in.type == nvinfer1::DataType::kINT32) && (in.format == TensorFormat::kLINEAR);
+    }
+    if (pos == 2)
+    {
+        return (in.type == nvinfer1::DataType::kINT32) && (in.format == TensorFormat::kLINEAR);
+    }
+    if (pos == 3)
+    {
+        return (inOut[0].type == nvinfer1::DataType::kFLOAT && in.type == nvinfer1::DataType::kFLOAT && in.format == TensorFormat::kLINEAR)
+            || (inOut[0].type == nvinfer1::DataType::kHALF  && in.type == nvinfer1::DataType::kHALF  && in.format == TensorFormat::kHWC8);
+    }
+    return false;
+}
+
// No shape-dependent state to precompute; intentionally a no-op.
void PPScatterPlugin::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs,
    const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) noexcept
{
    return;
}

// The scatter kernel works entirely in-place on the bindings; no scratch
// workspace is requested from TensorRT.
size_t PPScatterPlugin::getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs,
    const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const noexcept
{
    return 0;
}
// Runtime hook: zero the dense feature map, then scatter each pillar's
// feature vector to its (y, x) cell. Dispatches to the half or float kernel
// based on the input binding's runtime type. Returns 0 on success, -1 on
// failure (including unsupported input types).
int PPScatterPlugin::enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
    const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs, void* const* outputs, void* workspace,
    cudaStream_t stream) noexcept
{
    try
    {
        int maxPillarNum = inputDesc[0].dims.d[0];
        int numFeatures = inputDesc[0].dims.d[1];
        
        nvinfer1::DataType inputType = inputDesc[0].type;

        // inputs[1]: voxel coordinates, inputs[2]: valid pillar count
        // (read on-device by the kernel; not dereferenced on the host here).
        auto coords_data = static_cast<const unsigned int *>(inputs[1]);
        auto params_data = static_cast<const unsigned int *>(inputs[2]);

        unsigned int featureY = feature_y_size_;
        unsigned int featureX = feature_x_size_;

        int status = -1;

        if(inputType == nvinfer1::DataType::kHALF){
            auto pillar_features_data = static_cast<const half *>(inputs[0]);
            auto spatial_feature_data = static_cast<half *>(outputs[0]);
            // NOTE(review): cudaMemsetAsync return value is not checked here
            // (nor in the float path below) — consider wrapping it.
            cudaMemsetAsync(spatial_feature_data, 0, numFeatures*featureY*featureX * sizeof(half), stream);
            status = pillarScatterHalfKernelLaunch(
                maxPillarNum,
                numFeatures,
                pillar_features_data,
                coords_data,
                params_data,
                featureX,
                featureY,
                spatial_feature_data,
                stream
                );
            assert(status == STATUS_SUCCESS);
            return status;
        }
        else if(inputType == nvinfer1::DataType::kFLOAT){
            auto pillar_features_data = static_cast<const float *>(inputs[0]);
            auto spatial_feature_data = static_cast<float *>(outputs[0]);
            cudaMemsetAsync(spatial_feature_data, 0, numFeatures*featureY*featureX * sizeof(float), stream);
            status = pillarScatterFloatKernelLaunch(
                maxPillarNum,
                numFeatures,
                pillar_features_data,
                coords_data,
                params_data,
                featureX,
                featureY,
                spatial_feature_data,
                stream
                );
            assert(status == STATUS_SUCCESS);
            return status;
        }
        else{
            // Unsupported data type: status is still -1, so in debug builds
            // this assert always fires; release builds just return -1.
            assert(status == STATUS_SUCCESS);
            return status;
        }
    }
    catch (const std::exception& e)
    {
        std::cerr << e.what() << std::endl;
    }
    return -1;
}
+
// Output tensor has the same data type as the pillar-feature input.
nvinfer1::DataType PPScatterPlugin::getOutputDataType(
    int index, const nvinfer1::DataType* inputTypes, int nbInputs) const noexcept
{
    return inputTypes[0];
}

// Plugin identity used by TensorRT to match serialized engines to creators.
const char* PPScatterPlugin::getPluginType() const noexcept
{
    return PLUGIN_NAME;
}

const char* PPScatterPlugin::getPluginVersion() const noexcept
{
    return PLUGIN_VERSION;
}

// Single output: the dense BEV feature map.
int PPScatterPlugin::getNbOutputs() const noexcept
{
    return 1;
}

// No deferred setup needed; resources are created in the constructor.
int PPScatterPlugin::initialize() noexcept
{
    return 0;
}

// Nothing to release outside the destructor.
void PPScatterPlugin::terminate() noexcept
{
}
+
+size_t PPScatterPlugin::getSerializationSize() const noexcept
+{
+    return 3 * sizeof(size_t);
+}
+
// Writes the two size_t fields (height first, then width) in the same order
// the deserialization constructor reads them.
void PPScatterPlugin::serialize(void* buffer) const noexcept
{
    char* d = reinterpret_cast<char*>(buffer);
    writeToBuffer<size_t>(d, feature_y_size_);
    writeToBuffer<size_t>(d, feature_x_size_);
}
+
// Called by TensorRT when the engine owning this plugin is destroyed.
void PPScatterPlugin::destroy() noexcept
{
    delete this;
}

// Namespace accessors required by the IPluginV2 interface.
void PPScatterPlugin::setPluginNamespace(const char* libNamespace) noexcept
{
    mNamespace = libNamespace;
}

const char* PPScatterPlugin::getPluginNamespace() const noexcept
{
    return mNamespace.c_str();
}
+
// Registers the single plugin attribute ("dense_shape") that createPlugin()
// expects to find in the ONNX node.
PPScatterPluginCreator::PPScatterPluginCreator()
{
    mPluginAttributes.clear();
    mPluginAttributes.emplace_back(PluginField("dense_shape", nullptr, PluginFieldType::kINT32, 1));
    mFC.nbFields = mPluginAttributes.size();
    mFC.fields = mPluginAttributes.data();
}

// Creator identity: must match the plugin's own name/version so TensorRT can
// pair serialized engines with this creator.
const char* PPScatterPluginCreator::getPluginName() const noexcept
{
    return PLUGIN_NAME;
}

const char* PPScatterPluginCreator::getPluginVersion() const noexcept
{
    return PLUGIN_VERSION;
}

const PluginFieldCollection* PPScatterPluginCreator::getFieldNames() noexcept
{
    return &mFC;
}
+
// Builds a plugin instance from the ONNX node's "dense_shape" attribute:
// ts[0] = feature map height, ts[1] = feature map width.
// NOTE(review): the attribute is registered with length 1 in the creator
// ctor but two ints are read here — presumably the exporter emits two;
// verify against the ONNX graph. If the field is absent, the plugin is
// created with a 0x0 feature map.
IPluginV2* PPScatterPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) noexcept
{
    const PluginField* fields = fc->fields;
    int nbFields = fc->nbFields;
    int target_h = 0;
    int target_w = 0;
    for (int i = 0; i < nbFields; ++i)
    {
        const char* attr_name = fields[i].name;
        if (!strcmp(attr_name, "dense_shape"))
        {
            const int* ts = static_cast<const int*>(fields[i].data);
            target_h = ts[0];
            target_w = ts[1];
        }
    }
    auto* plugin = new PPScatterPlugin(
        target_h,
        target_w
    );
    return plugin;
}
+
// Reconstructs a plugin from a serialized engine blob.
IPluginV2* PPScatterPluginCreator::deserializePlugin(
    const char* name, const void* serialData, size_t serialLength) noexcept
{
    // This object will be deleted when the network is destroyed,
    auto* plugin = new PPScatterPlugin(serialData, serialLength);
    return plugin;
}

// Namespace accessors required by the IPluginCreator interface.
void PPScatterPluginCreator::setPluginNamespace(const char* libNamespace) noexcept
{
    mNamespace = libNamespace;
}

const char* PPScatterPluginCreator::getPluginNamespace() const noexcept
{
    return mNamespace.c_str();
}

// Makes the creator discoverable by the ONNX parser at load time.
REGISTER_TENSORRT_PLUGIN(PPScatterPluginCreator);

+ 126 - 0
src/detection/detection_lidar_pointpillars_cuda/src/pillarScatterKernels.cu

@@ -0,0 +1,126 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel.h"
+
// Scatters per-pillar fp16 feature vectors into a dense BEV map laid out as
// kHWC8: [1][H][W][(C+7)/8*8].
// Launch contract: blockDim.x == PILLARS_PER_BLOCK; threadIdx.x doubles as
// the feature index during the shared-memory load and as the pillar index
// afterwards, so this assumes PILLARS_PER_BLOCK == PILLAR_FEATURE_SIZE —
// TODO confirm against kernel.h.
__global__ void pillarScatterHalfkernel(const half *pillar_features_data,
                                        const unsigned int *coords_data, const unsigned int *params_data,
                                        unsigned int featureX, unsigned int featureY,
                                        half *spatial_feature_data)
{
    int pillar_idx = blockIdx.x * PILLARS_PER_BLOCK + threadIdx.x;
    int valid_pillars_inBlock = PILLARS_PER_BLOCK;
    // params_data[0] holds the number of valid pillars produced upstream.
    const int num_pillars = params_data[0];
    int valid_blocks = (num_pillars+PILLARS_PER_BLOCK-1)/PILLARS_PER_BLOCK;
    if(blockIdx.x >= valid_blocks) return;
    if(blockIdx.x == (valid_blocks-1)) {
        valid_pillars_inBlock = num_pillars % PILLARS_PER_BLOCK;
    }
    valid_pillars_inBlock = (valid_pillars_inBlock==0) ? PILLARS_PER_BLOCK : valid_pillars_inBlock;
    __shared__ half pillarSM[PILLARS_PER_BLOCK][PILLAR_FEATURE_SIZE]; //pillar*64
    // Cooperative load: each thread fetches feature channel threadIdx.x of
    // every valid pillar in this block into shared memory.
    for (int i = 0; i < valid_pillars_inBlock; i++)
    {
        pillarSM[i][threadIdx.x] = pillar_features_data[ (blockIdx.x * PILLARS_PER_BLOCK +i)*PILLAR_FEATURE_SIZE + threadIdx.x];
    }
    // Barrier: the write/read roles of threadIdx.x swap below, so all loads
    // must land before any thread reads another thread's column.
    __syncthreads();
    if(pillar_idx >= num_pillars) return;
    // coords_data packs 4 uints per pillar; .z = y cell, .w = x cell.
    uint4 coord = ((const uint4 *)coords_data)[pillar_idx];
    unsigned int x = coord.w;
    unsigned int y = coord.z;

    // Output tensor format : kHWC8, [N][H][W][(C+7)/8*8]
    int C_stride = (PILLAR_FEATURE_SIZE+7)/8*8;
    for (int i = 0; i < PILLAR_FEATURE_SIZE; i++)
    {
        spatial_feature_data[y*featureX*C_stride + x*C_stride + i] = pillarSM[threadIdx.x][i];
    }
}
+
// fp32 twin of pillarScatterHalfkernel, but the output is channel-major
// linear (kLINEAR): [1][C][H][W]. Same launch contract: assumes
// PILLARS_PER_BLOCK == PILLAR_FEATURE_SIZE — TODO confirm against kernel.h.
__global__ void pillarScatterFloatkernel(const float *pillar_features_data,
                                         const unsigned int *coords_data, const unsigned int *params_data,
                                         unsigned int featureX, unsigned int featureY,
                                         float *spatial_feature_data)
{
    int pillar_idx = blockIdx.x * PILLARS_PER_BLOCK + threadIdx.x;
    int valid_pillars_inBlock = PILLARS_PER_BLOCK;
    // params_data[0] holds the number of valid pillars produced upstream.
    const int num_pillars = params_data[0];
    int valid_blocks = (num_pillars+PILLARS_PER_BLOCK-1)/PILLARS_PER_BLOCK;
    if(blockIdx.x >= valid_blocks) return;
    if(blockIdx.x == (valid_blocks-1)) {
        valid_pillars_inBlock = num_pillars % PILLARS_PER_BLOCK;
    }
    valid_pillars_inBlock = (valid_pillars_inBlock==0) ? PILLARS_PER_BLOCK : valid_pillars_inBlock;
    __shared__ float pillarSM[PILLARS_PER_BLOCK][PILLAR_FEATURE_SIZE]; //pillar*64
    // Cooperative load: each thread fetches feature channel threadIdx.x of
    // every valid pillar in this block into shared memory.
    for (int i = 0; i < valid_pillars_inBlock; i++)
    {
        pillarSM[i][threadIdx.x] = pillar_features_data[ (blockIdx.x * PILLARS_PER_BLOCK +i)*PILLAR_FEATURE_SIZE + threadIdx.x];
    }
    __syncthreads();
    if(pillar_idx >= num_pillars) return;
    // coords_data packs 4 uints per pillar; .z = y cell, .w = x cell.
    uint4 coord = ((const uint4 *)coords_data)[pillar_idx];
    unsigned int x = coord.w;
    unsigned int y = coord.z;

    // Channel-major scatter: plane i, row y, column x.
    for (int i = 0; i < PILLAR_FEATURE_SIZE; i++)
    {
        spatial_feature_data[i*featureY*featureX + y*featureX + x] = pillarSM[threadIdx.x][i];
    }
}
+
// Host-side launcher for the fp16 scatter kernel on the given stream.
// Grid is sized over the full feature map (the kernel early-outs past the
// valid pillar count). Returns 0 on success, -1 on a launch error.
int pillarScatterHalfKernelLaunch(int max_pillar_num,
                                  int num_features,
                                  const half *pillar_features_data,
                                  const unsigned int *coords_data,
                                  const unsigned int *params_data,
                                  unsigned int featureX, unsigned int featureY,
                                  half *spatial_feature_data,
                                  cudaStream_t stream)
{
    const unsigned int totalCells = featureX * featureY;
    dim3 gridDim((totalCells + PILLARS_PER_BLOCK - 1) / PILLARS_PER_BLOCK);
    dim3 blockDim(PILLARS_PER_BLOCK);

    pillarScatterHalfkernel<<<gridDim, blockDim, 0, stream>>>(
        pillar_features_data, coords_data, params_data,
        featureX, featureY, spatial_feature_data);

    cudaError_t status = cudaGetLastError();
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(status));
        return -1;
    }
    return 0;
}
+
// Host-side launcher for the fp32 scatter kernel on the given stream.
// Mirrors pillarScatterHalfKernelLaunch; returns 0 on success, -1 on a
// launch error.
int pillarScatterFloatKernelLaunch(int max_pillar_num,
                                   int num_features,
                                   const float *pillar_features_data,
                                   const unsigned int *coords_data,
                                   const unsigned int *params_data,
                                   unsigned int featureX, unsigned int featureY,
                                   float *spatial_feature_data,
                                   cudaStream_t stream)
{
    const unsigned int totalCells = featureX * featureY;
    dim3 gridDim((totalCells + PILLARS_PER_BLOCK - 1) / PILLARS_PER_BLOCK);
    dim3 blockDim(PILLARS_PER_BLOCK);

    pillarScatterFloatkernel<<<gridDim, blockDim, 0, stream>>>(
        pillar_features_data, coords_data, params_data,
        featureX, featureY, spatial_feature_data);

    cudaError_t status = cudaGetLastError();
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(status));
        return -1;
    }
    return 0;
}

+ 312 - 0
src/detection/detection_lidar_pointpillars_cuda/src/pointpillar.cpp

@@ -0,0 +1,312 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pointpillar.h"
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <cuda_runtime.h>
+#include "NvInfer.h"
+#include "NvOnnxConfig.h"
+#include "NvOnnxParser.h"
+#include "NvInferRuntime.h"
+
// Releases the execution context, engine, and timing events.
// NOTE(review): `delete` on engine/context requires TensorRT 8+; earlier
// versions need ->destroy(). Confirm against the linked TRT version.
TRT::~TRT(void)
{
  delete(context_);
  delete(engine_);
  checkCudaErrors(cudaEventDestroy(start_));
  checkCudaErrors(cudaEventDestroy(stop_));
  return;
}
+
// Builds (or loads) the TensorRT engine for the given ONNX model.
// If "<modelFile>.cache" exists it is deserialized directly; otherwise the
// ONNX file is parsed, an engine is built, and the serialized engine is
// written to the cache for subsequent runs.
TRT::TRT(std::string modelFile, cudaStream_t stream):stream_(stream)
{
  std::string modelCache = modelFile + ".cache";
  std::fstream trtCache(modelCache, std::ifstream::in);
  checkCudaErrors(cudaEventCreate(&start_));
  checkCudaErrors(cudaEventCreate(&stop_));
  if (!trtCache.is_open())
  {
	  std::cout << "Building TRT engine."<<std::endl;
    // define builder
    auto builder = (nvinfer1::createInferBuilder(gLogger_));

    // define network
    const auto explicitBatch = 1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
    auto network = (builder->createNetworkV2(explicitBatch));

    // define onnxparser
    auto parser = (nvonnxparser::createParser(*network, gLogger_));
    if (!parser->parseFromFile(modelFile.data(), static_cast<int>(nvinfer1::ILogger::Severity::kWARNING)))
    {
        std::cerr << ": failed to parse onnx model file, please check the onnx version and trt support op!"
                  << std::endl;
        exit(-1);
    }

    // define config
    auto networkConfig = builder->createBuilderConfig();
#if defined (__arm64__) || defined (__aarch64__) 
    // NOTE(review): the kFP16 flag below is commented out, so the log line
    // claims fp16 is enabled while the build is actually fp32.
//    networkConfig->setFlag(nvinfer1::BuilderFlag::kFP16);
    std::cout << "Enable fp16!" << std::endl;
#endif
    // set max batch size
    builder->setMaxBatchSize(1);
    // set max workspace
    networkConfig->setMaxWorkspaceSize(size_t(1) << 30);

    engine_ = (builder->buildEngineWithConfig(*network, *networkConfig));

    if (engine_ == nullptr)
    {
      std::cerr << ": engine init null!" << std::endl;
      exit(-1);
    }

    // serialize the engine, then close everything down
    auto trtModelStream = (engine_->serialize());
    std::fstream trtOut(modelCache, std::ifstream::out);
    if (!trtOut.is_open())
    {
       std::cout << "Can't store trt cache.\n";
       exit(-1);
    }

    trtOut.write((char*)trtModelStream->data(), trtModelStream->size());
    trtOut.close();
    trtModelStream->destroy();

    networkConfig->destroy();
    parser->destroy();
    network->destroy();
    builder->destroy();

  } else {
	  std::cout << "load TRT cache."<<std::endl;
    char *data;
    unsigned int length;

    // get length of file:
    trtCache.seekg(0, trtCache.end);
    length = trtCache.tellg();
    trtCache.seekg(0, trtCache.beg);

    data = (char *)malloc(length);
    if (data == NULL ) {
       std::cout << "Can't malloc data.\n";
       exit(-1);
    }

    trtCache.read(data, length);
    // create context
    auto runtime = nvinfer1::createInferRuntime(gLogger_);

    if (runtime == nullptr) {	  std::cout << "load TRT cache0."<<std::endl;
        std::cerr << ": runtime null!" << std::endl;
        exit(-1);
    }
    //plugin_ = nvonnxparser::createPluginFactory(gLogger_);
    // NOTE(review): the 3-argument deserializeCudaEngine overload is the
    // deprecated pre-TRT8 form — confirm against the linked TRT version.
    engine_ = (runtime->deserializeCudaEngine(data, length, 0));
    if (engine_ == nullptr) {
        std::cerr << ": engine null!" << std::endl;
        exit(-1);
    }
    free(data);
    trtCache.close();
  }

  context_ = engine_->createExecutionContext();
  return;
}
+
+int TRT::doinfer(void**buffers)
+{
+  int status;
+
+  status = context_->enqueueV2(buffers, stream_, &start_);
+
+  if (!status)
+  {
+      return -1;
+  }
+
+  return 0;
+}
+
// Sets up the full pipeline (pre-process, TRT engine, post-process) and
// allocates all intermediate buffers as CUDA managed memory so both host
// and device code can touch them.
PointPillar::PointPillar(std::string modelFile, cudaStream_t stream):stream_(stream)
{
  checkCudaErrors(cudaEventCreate(&start_));
  checkCudaErrors(cudaEventCreate(&stop_));

  pre_.reset(new PreProcessCuda(stream_));
  trt_.reset(new TRT(modelFile, stream_));
  post_.reset(new PostProcessCuda(stream_));

  //point cloud to voxels
  // 4 floats per point (x, y, z, intensity) per pillar slot.
  voxel_features_size_ = MAX_VOXELS * params_.max_num_points_per_pillar * 4 * sizeof(float);
  voxel_num_size_ = MAX_VOXELS * sizeof(unsigned int);
  voxel_idxs_size_ = MAX_VOXELS* 4 * sizeof(unsigned int);

  checkCudaErrors(cudaMallocManaged((void **)&voxel_features_, voxel_features_size_));
  checkCudaErrors(cudaMallocManaged((void **)&voxel_num_, voxel_num_size_));
  checkCudaErrors(cudaMallocManaged((void **)&voxel_idxs_, voxel_idxs_size_));

  checkCudaErrors(cudaMemsetAsync(voxel_features_, 0, voxel_features_size_, stream_));
  checkCudaErrors(cudaMemsetAsync(voxel_num_, 0, voxel_num_size_, stream_));
  checkCudaErrors(cudaMemsetAsync(voxel_idxs_, 0, voxel_idxs_size_, stream_));

  //TRT-input
  // 10 encoded features per point after generateFeatures().
  features_input_size_ = MAX_VOXELS * params_.max_num_points_per_pillar * 10 * sizeof(float);
  checkCudaErrors(cudaMallocManaged((void **)&features_input_, features_input_size_));
  checkCudaErrors(cudaMallocManaged((void **)&params_input_, sizeof(unsigned int)));

  checkCudaErrors(cudaMemsetAsync(features_input_, 0, features_input_size_, stream_));
  checkCudaErrors(cudaMemsetAsync(params_input_, 0, sizeof(unsigned int), stream_));

  //output of TRT -- input of post-process
  cls_size_ = params_.feature_x_size * params_.feature_y_size * params_.num_classes * params_.num_anchors * sizeof(float);
  box_size_ = params_.feature_x_size * params_.feature_y_size * params_.num_box_values * params_.num_anchors * sizeof(float);
  dir_cls_size_ = params_.feature_x_size * params_.feature_y_size * params_.num_dir_bins * params_.num_anchors * sizeof(float);
  checkCudaErrors(cudaMallocManaged((void **)&cls_output_, cls_size_));
  checkCudaErrors(cudaMallocManaged((void **)&box_output_, box_size_));
  checkCudaErrors(cudaMallocManaged((void **)&dir_cls_output_, dir_cls_size_));

  //output of post-process
  // 9 floats per candidate box, plus a leading count slot.
  bndbox_size_ = (params_.feature_x_size * params_.feature_y_size * params_.num_anchors * 9 + 1) * sizeof(float);
  checkCudaErrors(cudaMallocManaged((void **)&bndbox_output_, bndbox_size_));

  res_.reserve(100);
  return;
}
+
// Tears down pipeline stages first, then frees every managed buffer and the
// timing events.
PointPillar::~PointPillar(void)
{
  pre_.reset();
  trt_.reset();
  post_.reset();

  checkCudaErrors(cudaFree(voxel_features_));
  checkCudaErrors(cudaFree(voxel_num_));
  checkCudaErrors(cudaFree(voxel_idxs_));

  checkCudaErrors(cudaFree(features_input_));
  checkCudaErrors(cudaFree(params_input_));

  checkCudaErrors(cudaFree(cls_output_));
  checkCudaErrors(cudaFree(box_output_));
  checkCudaErrors(cudaFree(dir_cls_output_));

  checkCudaErrors(cudaFree(bndbox_output_));

  checkCudaErrors(cudaEventDestroy(start_));
  checkCudaErrors(cudaEventDestroy(stop_));
  return;
}
+
// Runs the full detection pipeline on one point cloud:
// voxelize -> encode features -> TRT inference -> decode boxes -> CPU NMS.
// points_data: device/managed buffer of points_size points (4 floats each);
// surviving boxes are appended to nms_pred. Timing sections are compiled in
// only when PERFORMANCE_LOG is set.
int PointPillar::doinfer(void*points_data, unsigned int points_size, std::vector<Bndbox> &nms_pred)
{
#if PERFORMANCE_LOG
  float generateVoxelsTime = 0.0f;
  checkCudaErrors(cudaEventRecord(start_, stream_));
#endif

  pre_->generateVoxels((float*)points_data, points_size,
        params_input_,
        voxel_features_, 
        voxel_num_,
        voxel_idxs_);

#if PERFORMANCE_LOG
  checkCudaErrors(cudaEventRecord(stop_, stream_));
  checkCudaErrors(cudaDeviceSynchronize());
  checkCudaErrors(cudaEventElapsedTime(&generateVoxelsTime, start_, stop_));
  unsigned int params_input_cpu;
  checkCudaErrors(cudaMemcpy(&params_input_cpu, params_input_, sizeof(unsigned int), cudaMemcpyDefault));
  std::cout<<"find pillar_num: "<< params_input_cpu <<std::endl;
#endif

#if PERFORMANCE_LOG
  float generateFeaturesTime = 0.0f;
  checkCudaErrors(cudaEventRecord(start_, stream_));
#endif

  pre_->generateFeatures(voxel_features_,
      voxel_num_,
      voxel_idxs_,
      params_input_,
      features_input_);

#if PERFORMANCE_LOG
  checkCudaErrors(cudaEventRecord(stop_, stream_));
  checkCudaErrors(cudaEventSynchronize(stop_));
  checkCudaErrors(cudaEventElapsedTime(&generateFeaturesTime, start_, stop_));
#endif

#if PERFORMANCE_LOG
  float doinferTime = 0.0f;
  checkCudaErrors(cudaEventRecord(start_, stream_));
#endif

  // Binding order must match the engine: 3 inputs then 3 outputs.
  void *buffers[] = {features_input_, voxel_idxs_, params_input_, cls_output_, box_output_, dir_cls_output_};
  trt_->doinfer(buffers);
  // Reset the pillar counter for the next frame.
  checkCudaErrors(cudaMemsetAsync(params_input_, 0, sizeof(unsigned int), stream_));

#if PERFORMANCE_LOG
  checkCudaErrors(cudaEventRecord(stop_, stream_));
  checkCudaErrors(cudaEventSynchronize(stop_));
  checkCudaErrors(cudaEventElapsedTime(&doinferTime, start_, stop_));
#endif

#if PERFORMANCE_LOG
  float doPostprocessCudaTime = 0.0f;
  checkCudaErrors(cudaEventRecord(start_, stream_));
#endif

  post_->doPostprocessCuda(cls_output_, box_output_, dir_cls_output_,
                          bndbox_output_);
  // Full sync so the managed bndbox_output_ buffer is safe to read on host.
  checkCudaErrors(cudaDeviceSynchronize());
  // bndbox_output_[0] holds the detection count; boxes follow, 9 floats each.
  float obj_count = bndbox_output_[0];

  int num_obj = static_cast<int>(obj_count);
  auto output = bndbox_output_ + 1;

  for (int i = 0; i < num_obj; i++) {
    auto Bb = Bndbox(output[i * 9],
                    output[i * 9 + 1], output[i * 9 + 2], output[i * 9 + 3],
                    output[i * 9 + 4], output[i * 9 + 5], output[i * 9 + 6],
                    static_cast<int>(output[i * 9 + 7]),
                    output[i * 9 + 8]);
    res_.push_back(Bb);
  }


  nms_cpu(res_, params_.nms_thresh, nms_pred);
  res_.clear();

#if PERFORMANCE_LOG
  checkCudaErrors(cudaDeviceSynchronize());
  checkCudaErrors(cudaEventRecord(stop_, stream_));
  checkCudaErrors(cudaEventSynchronize(stop_));
  checkCudaErrors(cudaEventElapsedTime(&doPostprocessCudaTime, start_, stop_));
  std::cout<<"TIME: generateVoxels: "<< generateVoxelsTime <<" ms." <<std::endl;
  std::cout<<"TIME: generateFeatures: "<< generateFeaturesTime <<" ms." <<std::endl;
  std::cout<<"TIME: doinfer: "<< doinferTime <<" ms." <<std::endl;
  std::cout<<"TIME: doPostprocessCuda: "<< doPostprocessCudaTime <<" ms." <<std::endl;
#endif
  return 0;
}

+ 251 - 0
src/detection/detection_lidar_pointpillars_cuda/src/postprocess.cpp

@@ -0,0 +1,251 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "postprocess.h"
+#include <iostream>
+#include <vector>
+#include <algorithm>
+#include <math.h>
+
+const float ThresHold = 1e-8;
+
+inline float cross(const float2 p1, const float2 p2, const float2 p0) {
+    return (p1.x - p0.x) * (p2.y - p0.y) - (p2.x - p0.x) * (p1.y - p0.y);
+}
+
+inline int check_box2d(const Bndbox box, const float2 p) {
+    const float MARGIN = 1e-2;
+    float center_x = box.x;
+    float center_y = box.y;
+    float angle_cos = cos(-box.rt);
+    float angle_sin = sin(-box.rt);
+    float rot_x = (p.x - center_x) * angle_cos + (p.y - center_y) * (-angle_sin);
+    float rot_y = (p.x - center_x) * angle_sin + (p.y - center_y) * angle_cos;
+
+    return (fabs(rot_x) < box.w / 2 + MARGIN && fabs(rot_y) < box.l / 2 + MARGIN);
+}
+
// Computes the intersection point of segments (p0, p1) and (q0, q1).
// Returns true and writes the point to `ans` when they properly cross;
// false when the bounding boxes are disjoint or the cross-product signs
// show no crossing.
bool intersection(const float2 p1, const float2 p0, const float2 q1, const float2 q0, float2 &ans) {

    // Quick reject: axis-aligned bounding boxes must overlap.
    if (( std::min(p0.x, p1.x) <= std::max(q0.x, q1.x) &&
          std::min(q0.x, q1.x) <= std::max(p0.x, p1.x) &&
          std::min(p0.y, p1.y) <= std::max(q0.y, q1.y) &&
          std::min(q0.y, q1.y) <= std::max(p0.y, p1.y) ) == 0)
        return false;


    // Both endpoints of each segment must lie on opposite sides of the other.
    float s1 = cross(q0, p1, p0);
    float s2 = cross(p1, q1, p0);
    float s3 = cross(p0, q1, q0);
    float s4 = cross(q1, p1, q0);

    if (!(s1 * s2 > 0 && s3 * s4 > 0))
        return false;

    // Interpolate along (q0, q1) using the signed areas s1, s5.
    float s5 = cross(q1, p1, p0);
    if (fabs(s5 - s1) > ThresHold) {
        ans.x = (s5 * q0.x - s1 * q1.x) / (s5 - s1);
        ans.y = (s5 * q0.y - s1 * q1.y) / (s5 - s1);

    } else {
        // Near-degenerate case: fall back to the line-line formula.
        // NOTE(review): D is not checked against zero; for nearly parallel
        // segments this can divide by ~0 — confirm upstream guards suffice.
        float a0 = p0.y - p1.y, b0 = p1.x - p0.x, c0 = p0.x * p1.y - p1.x * p0.y;
        float a1 = q0.y - q1.y, b1 = q1.x - q0.x, c1 = q0.x * q1.y - q1.x * q0.y;
        float D = a0 * b1 - a1 * b0;

        ans.x = (b0 * c1 - b1 * c0) / D;
        ans.y = (a1 * c0 - a0 * c1) / D;
    }

    return true;
}
+
+inline void rotate_around_center(const float2 &center, const float angle_cos, const float angle_sin, float2 &p) {
+    float new_x = (p.x - center.x) * angle_cos + (p.y - center.y) * (-angle_sin) + center.x;
+    float new_y = (p.x - center.x) * angle_sin + (p.y - center.y) * angle_cos + center.y;
+    p = float2 {new_x, new_y};
+    return;
+}
+
// Computes the intersection area of two rotated 2D boxes:
// 1) build both boxes' rotated corners, 2) collect all edge-edge
// intersection points and contained corners, 3) sort them by angle around
// their centroid, 4) accumulate the polygon area via the shoelace formula.
inline float box_overlap(const Bndbox &box_a, const Bndbox &box_b) {
    float a_angle = box_a.rt, b_angle = box_b.rt;
    float a_dx_half = box_a.w / 2, b_dx_half = box_b.w / 2, a_dy_half = box_a.l / 2, b_dy_half = box_b.l / 2;
    float a_x1 = box_a.x - a_dx_half, a_y1 = box_a.y - a_dy_half;
    float a_x2 = box_a.x + a_dx_half, a_y2 = box_a.y + a_dy_half;
    float b_x1 = box_b.x - b_dx_half, b_y1 = box_b.y - b_dy_half;
    float b_x2 = box_b.x + b_dx_half, b_y2 = box_b.y + b_dy_half;
    float2 box_a_corners[5];
    float2 box_b_corners[5];

    float2 center_a = float2 {box_a.x, box_a.y};
    float2 center_b = float2 {box_b.x, box_b.y};

    // Up to 4x4 edge intersections (the array leaves headroom for corners).
    float2 cross_points[16];
    float2 poly_center =  {0, 0};
    int cnt = 0;
    bool flag = false;

    // Axis-aligned corners before rotation.
    box_a_corners[0] = float2 {a_x1, a_y1};
    box_a_corners[1] = float2 {a_x2, a_y1};
    box_a_corners[2] = float2 {a_x2, a_y2};
    box_a_corners[3] = float2 {a_x1, a_y2};

    box_b_corners[0] = float2 {b_x1, b_y1};
    box_b_corners[1] = float2 {b_x2, b_y1};
    box_b_corners[2] = float2 {b_x2, b_y2};
    box_b_corners[3] = float2 {b_x1, b_y2};

    float a_angle_cos = cos(a_angle), a_angle_sin = sin(a_angle);
    float b_angle_cos = cos(b_angle), b_angle_sin = sin(b_angle);

    for (int k = 0; k < 4; k++) {
        rotate_around_center(center_a, a_angle_cos, a_angle_sin, box_a_corners[k]);
        rotate_around_center(center_b, b_angle_cos, b_angle_sin, box_b_corners[k]);
    }

    // Close the polygons so edge i is (corner[i], corner[i+1]).
    box_a_corners[4] = box_a_corners[0];
    box_b_corners[4] = box_b_corners[0];

    // Collect all edge-edge intersection points.
    for (int i = 0; i < 4; i++) {
        for (int j = 0; j < 4; j++) {
            flag = intersection(box_a_corners[i + 1], box_a_corners[i],
                                box_b_corners[j + 1], box_b_corners[j],
                                cross_points[cnt]);
            if (flag) {
                poly_center = {poly_center.x + cross_points[cnt].x, poly_center.y + cross_points[cnt].y};
                cnt++;
            }
        }
    }

    // Add corners of either box that lie inside the other.
    for (int k = 0; k < 4; k++) {
        if (check_box2d(box_a, box_b_corners[k])) {
            poly_center = {poly_center.x + box_b_corners[k].x, poly_center.y + box_b_corners[k].y};
            cross_points[cnt] = box_b_corners[k];
            cnt++;
        }
        if (check_box2d(box_b, box_a_corners[k])) {
            poly_center = {poly_center.x + box_a_corners[k].x, poly_center.y + box_a_corners[k].y};
            cross_points[cnt] = box_a_corners[k];
            cnt++;
        }
    }

    // NOTE(review): when the boxes do not overlap at all, cnt == 0 and this
    // divides by zero (NaN center); verify callers tolerate the resulting
    // 0-area path or guard cnt here.
    poly_center.x /= cnt;
    poly_center.y /= cnt;

    // Bubble-sort the vertices by polar angle around the centroid so they
    // form a convex polygon in order.
    float2 temp;
    for (int j = 0; j < cnt - 1; j++) {
        for (int i = 0; i < cnt - j - 1; i++) {
            if (atan2(cross_points[i].y - poly_center.y, cross_points[i].x - poly_center.x) >
                atan2(cross_points[i+1].y - poly_center.y, cross_points[i+1].x - poly_center.x)
                ) {
                temp = cross_points[i];
                cross_points[i] = cross_points[i + 1];
                cross_points[i + 1] = temp;
            }
        }
    }

    // Shoelace formula over the ordered vertex fan.
    float area = 0;
    for (int k = 0; k < cnt - 1; k++) {
        float2 a = {cross_points[k].x - cross_points[0].x,
                    cross_points[k].y - cross_points[0].y};
        float2 b = {cross_points[k + 1].x - cross_points[0].x,
                    cross_points[k + 1].y - cross_points[0].y};
        area += (a.x * b.y - a.y * b.x);
    }
    return fabs(area) / 2.0;
}
+
+int nms_cpu(std::vector<Bndbox> bndboxes, const float nms_thresh, std::vector<Bndbox> &nms_pred)
+{
+    std::sort(bndboxes.begin(), bndboxes.end(),
+              [](Bndbox boxes1, Bndbox boxes2) { return boxes1.score > boxes2.score; });
+    std::vector<int> suppressed(bndboxes.size(), 0);
+    for (size_t i = 0; i < bndboxes.size(); i++) {
+        if (suppressed[i] == 1) {
+            continue;
+        }
+        nms_pred.emplace_back(bndboxes[i]);
+        for (size_t j = i + 1; j < bndboxes.size(); j++) {
+            if (suppressed[j] == 1) {
+                continue;
+            }
+
+            float sa = bndboxes[i].w * bndboxes[i].l;
+            float sb = bndboxes[j].w * bndboxes[j].l;
+            float s_overlap = box_overlap(bndboxes[i], bndboxes[j]);
+            float iou = s_overlap / fmaxf(sa + sb - s_overlap, ThresHold);
+
+            if (iou >= nms_thresh) {
+                suppressed[j] = 1;
+            }
+        }
+    }
+    return 0;
+}
+
// Allocates device buffers for the anchor table, per-class anchor bottom
// heights, and the detection counter, then uploads the anchor constants
// asynchronously on the owned stream.
PostProcessCuda::PostProcessCuda(cudaStream_t stream)
{
  stream_ = stream;

  checkCudaErrors(cudaMalloc((void **)&anchors_, params_.num_anchors * params_.len_per_anchor * sizeof(float)));
  checkCudaErrors(cudaMalloc((void **)&anchor_bottom_heights_, params_.num_classes * sizeof(float)));
  checkCudaErrors(cudaMalloc((void **)&object_counter_, sizeof(int)));

  checkCudaErrors(cudaMemcpyAsync(anchors_, params_.anchors,
        params_.num_anchors * params_.len_per_anchor * sizeof(float), cudaMemcpyDefault, stream_));
  checkCudaErrors(cudaMemcpyAsync(anchor_bottom_heights_, params_.anchor_bottom_heights,
                     params_.num_classes * sizeof(float), cudaMemcpyDefault, stream_));
  checkCudaErrors(cudaMemsetAsync(object_counter_, 0, sizeof(int), stream_));
  return;
}
+
// Frees the device-side anchor and counter buffers.
PostProcessCuda::~PostProcessCuda()
{
  checkCudaErrors(cudaFree(anchors_));
  checkCudaErrors(cudaFree(anchor_bottom_heights_));
  checkCudaErrors(cudaFree(object_counter_));
  return;
}
+
+int PostProcessCuda::doPostprocessCuda(const float *cls_input, float *box_input, const float *dir_cls_input,
+                                        float *bndbox_output)
+{
+  checkCudaErrors(cudaMemsetAsync(object_counter_, 0, sizeof(int)));
+  checkCudaErrors(postprocess_launch(cls_input,
+                     box_input,
+                     dir_cls_input,
+                     anchors_,
+                     anchor_bottom_heights_,
+                     bndbox_output,
+                     object_counter_,
+                     params_.min_x_range,
+                     params_.max_x_range,
+                     params_.min_y_range,
+                     params_.max_y_range,
+                     params_.feature_x_size,
+                     params_.feature_y_size,
+                     params_.num_anchors,
+                     params_.num_classes,
+                     params_.num_box_values,
+                     params_.score_thresh,
+                     params_.dir_offset,
+                     stream_
+                     ));
+  return 0;
+}

+ 159 - 0
src/detection/detection_lidar_pointpillars_cuda/src/postprocess_kernels.cu

@@ -0,0 +1,159 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel.h"
+
+__device__ float sigmoid(const float x) { return 1.0f / (1.0f + expf(-x)); }
+
// Per-anchor decode kernel. Launch contract: one block per BEV feature-map
// cell (gridDim.x == feature_x_size * feature_y_size), one thread per anchor
// (blockDim.x >= num_anchors). For every anchor whose best sigmoid class
// score clears score_thresh, the box residuals are decoded in place against
// the anchor and appended to bndbox_output as
// [x, y, z, dx, dy, dz, yaw, class_id, score], with the running count kept
// in object_counter.
__global__ void postprocess_kernal(const float *cls_input,
                                        float *box_input,
                                        const float *dir_cls_input,
                                        float *anchors,
                                        float *anchor_bottom_heights,
                                        float *bndbox_output,
                                        int *object_counter,
                                        const float min_x_range,
                                        const float max_x_range,
                                        const float min_y_range,
                                        const float max_y_range,
                                        const int feature_x_size,
                                        const int feature_y_size,
                                        const int num_anchors,
                                        const int num_classes,
                                        const int num_box_values,
                                        const float score_thresh,
                                        const float dir_offset)
{
  int loc_index = blockIdx.x;
  int ith_anchor = threadIdx.x;
  if (ith_anchor >= num_anchors)
  {
      return;
  }
  // Anchor center in metric coordinates: cells map linearly onto the range.
  int col = loc_index % feature_x_size;
  int row = loc_index / feature_x_size;
  float x_offset = min_x_range + col * (max_x_range - min_x_range) / (feature_x_size - 1);
  float y_offset = min_y_range + row * (max_y_range - min_y_range) / (feature_y_size - 1);

  // Pick the best class for this anchor (raw logits -> sigmoid).
  int cls_offset = loc_index * num_anchors * num_classes + ith_anchor * num_classes;
  const float *scores = cls_input + cls_offset;
  float max_score = sigmoid(scores[0]);
  int cls_id = 0;
  for (int i = 1; i < num_classes; i++) {
    float cls_score = sigmoid(scores[i]);
    if (cls_score > max_score) {
      max_score = cls_score;
      cls_id = i;
    }
  }

  if (max_score >= score_thresh)
  {
    int box_offset = loc_index * num_anchors * num_box_values + ith_anchor * num_box_values;
    int dir_cls_offset = loc_index * num_anchors * 2 + ith_anchor * 2;
    // Anchor layout: {dx, dy, dz, rot}; bottom heights are indexed per
    // anchor pair (ith_anchor / 2).
    float *anchor_ptr = anchors + ith_anchor * 4;
    float z_offset = anchor_ptr[2] / 2 + anchor_bottom_heights[ith_anchor / 2];
    float anchor[7] = {x_offset, y_offset, z_offset, anchor_ptr[0], anchor_ptr[1], anchor_ptr[2], anchor_ptr[3]};
    float *box_encodings = box_input + box_offset;

    // Decode the regression residuals against the anchor, in place.
    float xa = anchor[0];
    float ya = anchor[1];
    float za = anchor[2];
    float dxa = anchor[3];
    float dya = anchor[4];
    float dza = anchor[5];
    float ra = anchor[6];
    float diagonal = sqrtf(dxa * dxa + dya * dya);
    box_encodings[0] = box_encodings[0] * diagonal + xa;
    box_encodings[1] = box_encodings[1] * diagonal + ya;
    box_encodings[2] = box_encodings[2] * dza + za;
    box_encodings[3] = expf(box_encodings[3]) * dxa;
    box_encodings[4] = expf(box_encodings[4]) * dya;
    box_encodings[5] = expf(box_encodings[5]) * dza;
    box_encodings[6] = box_encodings[6] + ra;

    // Resolve heading: fold the angle into [0, period) and add the
    // direction bin. Kept entirely in single precision -- the original used
    // double `floor`, a double `1e-8` literal and a double `2 * M_PI / 2`
    // expression, forcing fp64 arithmetic on every thread.
    int dir_label = dir_cls_input[dir_cls_offset] > dir_cls_input[dir_cls_offset + 1] ? 0 : 1;
    const float period = (float)M_PI;  // 2 * pi / 2
    float val = box_input[box_offset + 6] - dir_offset;
    float dir_rot = val - floorf(val / (period + 1e-8f)) * period;
    float yaw = dir_rot + dir_offset + period * dir_label;

    // Claim an output slot and append the detection.
    int resCount = (int)atomicAdd(object_counter, 1);
    // NOTE(review): concurrent threads race on bndbox_output[0]; the last
    // writer is not guaranteed to carry the largest count, so the final
    // value may under-report. Reading object_counter on the host after
    // synchronization is the reliable count -- confirm what the caller uses.
    bndbox_output[0] = resCount + 1;
    float *data = bndbox_output + 1 + resCount * 9;
    data[0] = box_input[box_offset];
    data[1] = box_input[box_offset + 1];
    data[2] = box_input[box_offset + 2];
    data[3] = box_input[box_offset + 3];
    data[4] = box_input[box_offset + 4];
    data[5] = box_input[box_offset + 5];
    data[6] = yaw;
    data[7] = (float)cls_id;
    data[8] = max_score;
  }
}
+
// Host-side launcher for postprocess_kernal. Grid: one block per BEV cell
// (feature_x_size * feature_y_size blocks); block: one thread per anchor.
// Assumes num_anchors does not exceed the device's max threads per block
// (1024 on current GPUs) -- TODO confirm against the model's anchor config.
// Work is enqueued on `stream`; the returned cudaError_t reflects only
// launch/configuration failures, not asynchronous execution errors.
cudaError_t postprocess_launch(const float *cls_input,
                      float *box_input,
                      const float *dir_cls_input,
                      float *anchors,
                      float *anchor_bottom_heights,
                      float *bndbox_output,
                      int *object_counter,
                      const float min_x_range,
                      const float max_x_range,
                      const float min_y_range,
                      const float max_y_range,
                      const int feature_x_size,
                      const int feature_y_size,
                      const int num_anchors,
                      const int num_classes,
                      const int num_box_values,
                      const float score_thresh,
                      const float dir_offset,
                      cudaStream_t stream)
{
  int bev_size = feature_x_size * feature_y_size;
  dim3 threads (num_anchors);
  dim3 blocks (bev_size);

  postprocess_kernal<<<blocks, threads, 0, stream>>>
                (cls_input,
                 box_input,
                 dir_cls_input,
                 anchors,
                 anchor_bottom_heights,
                 bndbox_output,
                 object_counter,
                 min_x_range,
                 max_x_range,
                 min_y_range,
                 max_y_range,
                 feature_x_size,
                 feature_y_size,
                 num_anchors,
                 num_classes,
                 num_box_values,
                 score_thresh,
                 dir_offset);
  // Report launch-time errors (e.g. invalid configuration) to the caller.
  return cudaGetLastError();
}
+

+ 122 - 0
src/detection/detection_lidar_pointpillars_cuda/src/preprocess.cpp

@@ -0,0 +1,122 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "preprocess.h"
+#include <assert.h>
+#include <iostream>
+#include <math.h>
+
+PreProcessCuda::PreProcessCuda(cudaStream_t stream)
+{
+  stream_ = stream;
+
+  unsigned int mask_size = params_.grid_z_size
+              * params_.grid_y_size
+              * params_.grid_x_size
+              * sizeof(unsigned int);
+
+  unsigned int voxels_size = params_.grid_z_size
+              * params_.grid_y_size
+              * params_.grid_x_size
+              * params_.max_num_points_per_pillar
+              * params_.num_point_values
+              * sizeof(float);
+
+  checkCudaErrors(cudaMallocManaged((void **)&mask_, mask_size));
+  checkCudaErrors(cudaMallocManaged((void **)&voxels_, voxels_size));
+  checkCudaErrors(cudaMallocManaged((void **)&voxelsList_, 300000l*sizeof(int)));
+
+  checkCudaErrors(cudaMemsetAsync(mask_, 0, mask_size, stream_));
+  checkCudaErrors(cudaMemsetAsync(voxels_, 0, voxels_size, stream_));
+  checkCudaErrors(cudaMemsetAsync(voxelsList_, 0, 300000l*sizeof(int), stream_));
+
+  return;
+}
+
+PreProcessCuda::~PreProcessCuda()
+{
+  checkCudaErrors(cudaFree(mask_));
+  checkCudaErrors(cudaFree(voxels_));
+  checkCudaErrors(cudaFree(voxelsList_));
+  return;
+}
+
+int PreProcessCuda::generateVoxels(float *points, size_t points_size,
+        unsigned int *pillar_num,
+        float *voxel_features,
+        unsigned int *voxel_num,
+        unsigned int *voxel_idxs)
+{
+  int grid_x_size = params_.grid_x_size;
+  int grid_y_size = params_.grid_y_size;
+  int grid_z_size = params_.grid_z_size;
+  assert(grid_z_size ==1);
+  float min_x_range = params_.min_x_range;
+  float max_x_range = params_.max_x_range;
+  float min_y_range = params_.min_y_range;
+  float max_y_range = params_.max_y_range;
+  float min_z_range = params_.min_z_range;
+  float max_z_range = params_.max_z_range;
+  float pillar_x_size = params_.pillar_x_size;
+  float pillar_y_size = params_.pillar_y_size;
+  float pillar_z_size = params_.pillar_z_size;
+
+  checkCudaErrors(generateVoxels_random_launch(points, points_size,
+        min_x_range, max_x_range,
+        min_y_range, max_y_range,
+        min_z_range, max_z_range,
+        pillar_x_size, pillar_y_size, pillar_z_size,
+        grid_y_size, grid_x_size,
+        mask_, voxels_, stream_));
+
+  checkCudaErrors(generateBaseFeatures_launch(mask_, voxels_,
+      grid_y_size, grid_x_size,
+      pillar_num,
+      voxel_features,
+      voxel_num,
+      voxel_idxs, stream_));
+
+  return 0;
+}
+
+int PreProcessCuda::generateFeatures(float* voxel_features,
+          unsigned int* voxel_num,
+          unsigned int* voxel_idxs,
+          unsigned int *params,
+          float* features)
+{
+  int grid_z_size = params_.grid_z_size;
+  assert(grid_z_size ==1);
+  float range_min_x = params_.min_x_range;
+  float range_min_y = params_.min_y_range;
+  float range_min_z = params_.min_z_range;
+
+  float voxel_x = params_.pillar_x_size;
+  float voxel_y = params_.pillar_y_size;
+  float voxel_z = params_.pillar_z_size;
+
+  checkCudaErrors(generateFeatures_launch(voxel_features,
+      voxel_num,
+      voxel_idxs,
+      params,
+      voxel_x, voxel_y, voxel_z,
+      range_min_x, range_min_y, range_min_z,
+      features, stream_));
+
+  return 0;
+}
+

+ 311 - 0
src/detection/detection_lidar_pointpillars_cuda/src/preprocess_kernels.cu

@@ -0,0 +1,311 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel.h"
+
// Scatter pass: one thread per input point (x, y, z, intensity). In-range
// points are appended to their pillar's slots in the dense `voxels` grid;
// `mask` accumulates the RAW per-pillar point count, which may exceed
// POINTS_PER_VOXEL (generateBaseFeatures_kernel clamps it later).
// Assumes `points` is tightly packed float4 and 16-byte aligned for the
// vector load -- TODO confirm against the caller's buffer.
__global__ void generateVoxels_random_kernel(float *points, size_t points_size,
        float min_x_range, float max_x_range,
        float min_y_range, float max_y_range,
        float min_z_range, float max_z_range,
        float pillar_x_size, float pillar_y_size, float pillar_z_size,
        int grid_y_size, int grid_x_size,
        unsigned int *mask, float *voxels)
{
  int point_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if(point_idx >= points_size) return;

  float4 point = ((float4*)points)[point_idx];

  // Drop points outside the configured detection range.
  if(point.x<min_x_range||point.x>=max_x_range
    || point.y<min_y_range||point.y>=max_y_range
    || point.z<min_z_range||point.z>=max_z_range) return;

  // Flat pillar index in the (y, x) plane; z is collapsed.
  int voxel_idx = floorf((point.x - min_x_range)/pillar_x_size);
  int voxel_idy = floorf((point.y - min_y_range)/pillar_y_size);
  unsigned int voxel_index = voxel_idy * grid_x_size
                            + voxel_idx;

  // Reserve a slot in this pillar; the returned old value is the slot id.
  unsigned int point_id = atomicAdd(&(mask[voxel_index]), 1);

  // Pillar already full: the count was recorded, the point data is dropped.
  if(point_id >= POINTS_PER_VOXEL) return;
  float *address = voxels + (voxel_index*POINTS_PER_VOXEL + point_id)*4;
  // atomicExch used as a plain device-wide store of the point's 4 channels.
  atomicExch(address+0, point.x);
  atomicExch(address+1, point.y);
  atomicExch(address+2, point.z);
  atomicExch(address+3, point.w);
}
+
// Host-side launcher for generateVoxels_random_kernel: one thread per point,
// THREADS_FOR_VOXEL threads per block, enqueued on `stream`. Returns the
// launch status only (asynchronous execution errors surface later).
cudaError_t generateVoxels_random_launch(float *points, size_t points_size,
        float min_x_range, float max_x_range,
        float min_y_range, float max_y_range,
        float min_z_range, float max_z_range,
        float pillar_x_size, float pillar_y_size, float pillar_z_size,
        int grid_y_size, int grid_x_size,
        unsigned int *mask, float *voxels,
        cudaStream_t stream)
{
  // An empty cloud would produce a zero-block grid, which CUDA rejects with
  // cudaErrorInvalidConfiguration; treat it as a successful no-op instead.
  if (points_size == 0)
    return cudaSuccess;

  int threadNum = THREADS_FOR_VOXEL;
  dim3 blocks((points_size+threadNum-1)/threadNum);
  dim3 threads(threadNum);
  generateVoxels_random_kernel<<<blocks, threads, 0, stream>>>
    (points, points_size,
        min_x_range, max_x_range,
        min_y_range, max_y_range,
        min_z_range, max_z_range,
        pillar_x_size, pillar_y_size, pillar_z_size,
        grid_y_size, grid_x_size,
        mask, voxels);
  cudaError_t err = cudaGetLastError();
  return err;
}
+
// List-based voxelization, pass 1 of 2: for each input point, record the
// flat pillar index it falls into (-1 when out of range) and bump that
// pillar's counter in `mask`. Pass 2 (generateVoxels_kernel below) replays
// the list to copy point data. Not referenced by preprocess.cpp in this
// revision -- generateVoxels() uses the single-pass random path instead.
__global__ void generateVoxelsList_kernel(float *points, size_t points_size,
        float min_x_range, float max_x_range,
        float min_y_range, float max_y_range,
        float min_z_range, float max_z_range,
        float pillar_x_size, float pillar_y_size, float pillar_z_size,
        int grid_y_size, int grid_x_size,
        unsigned int *mask, int *voxelsList)
{
  // One thread per point.
  int point_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if(point_idx >= points_size) return;

  float4 point = ((float4*)points)[point_idx];

  // Out-of-range points are marked -1 so pass 2 can skip them.
  if(point.x<min_x_range||point.x>=max_x_range
    || point.y<min_y_range||point.y>=max_y_range
    || point.z<min_z_range||point.z>=max_z_range)
  {
    voxelsList[point_idx] = -1;
    return;
  }

  int voxel_idx = floorf((point.x - min_x_range)/pillar_x_size);
  int voxel_idy = floorf((point.y - min_y_range)/pillar_y_size);
  unsigned int voxel_index = voxel_idy * grid_x_size
                            + voxel_idx;

  atomicAdd(&(mask[voxel_index]), 1);
  voxelsList[point_idx] = voxel_index;

}
+
// List-based voxelization, pass 2 of 2: copy each in-range point's four
// channels into its pillar's next free slot, using `mask` as the per-pillar
// write cursor. NOTE(review): pass 1 above also increments `mask`, so the
// caller must zero it between the passes for slot ids to start at 0 --
// confirm against the (currently unused) caller of this path.
__global__ void generateVoxels_kernel(float *points, size_t points_size,
        int *voxelsList,
        unsigned int *mask, float *voxels)
{
  int point_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if(point_idx >= points_size) return;

  int voxel_index = voxelsList[point_idx];

  // -1 marks a point outside the detection range.
  if (voxel_index == -1) return;
  int point_id = atomicAdd(&(mask[voxel_index]), 1);

  // Pillar full: drop the point.
  if(point_id >= POINTS_PER_VOXEL) return;
  float *address = voxels + (voxel_index*POINTS_PER_VOXEL + point_id)*4;
  float4 point = ((float4*)points)[point_idx];
  // atomicExch used as a plain device-wide store of the point's channels.
  atomicExch(address+0, point.x);
  atomicExch(address+1, point.y);
  atomicExch(address+2, point.z);
  atomicExch(address+3, point.w);
}
+
// Compaction pass: one thread per BEV grid cell. Every non-empty cell claims
// a dense pillar slot via atomicAdd(pillar_num), copies up to
// POINTS_PER_VOXEL points into voxel_features, stores the clamped count in
// voxel_num and the cell coordinate (0, 0, y, x) in voxel_idxs, then zeroes
// its counter so the next frame starts from a clean grid.
// NOTE(review): pillar output order is nondeterministic (atomic slot
// assignment), and there is no bound check on current_pillarId before the
// writes -- confirm the compact buffers are sized for the worst case.
__global__ void generateBaseFeatures_kernel(unsigned int *mask, float *voxels,
        int grid_y_size, int grid_x_size,
        unsigned int *pillar_num,
        float *voxel_features,
        unsigned int *voxel_num,
        unsigned int *voxel_idxs)
{
  unsigned int voxel_idx = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int voxel_idy = blockIdx.y * blockDim.y + threadIdx.y;

  if(voxel_idx >= grid_x_size ||voxel_idy >= grid_y_size) return;

  unsigned int voxel_index = voxel_idy * grid_x_size
                           + voxel_idx;
  unsigned int count = mask[voxel_index];
  if( !(count>0) ) return;
  // The scatter pass stores the raw count; clamp to the pillar capacity.
  count = count<POINTS_PER_VOXEL?count:POINTS_PER_VOXEL;

  // Claim the next free compact pillar slot.
  unsigned int current_pillarId = 0;
  current_pillarId = atomicAdd(pillar_num, 1);

  voxel_num[current_pillarId] = count;

  uint4 idx = {0, 0, voxel_idy, voxel_idx};
  ((uint4*)voxel_idxs)[current_pillarId] = idx;

  // Copy this cell's points (float4 each) into the compact buffer.
  for (int i=0; i<count; i++){
    int inIndex = voxel_index*POINTS_PER_VOXEL + i;
    int outIndex = current_pillarId*POINTS_PER_VOXEL + i;
    ((float4*)voxel_features)[outIndex] = ((float4*)voxels)[inIndex];
  }

  // clear buffer for next infer
  atomicExch(mask + voxel_index, 0);
}
+
// Launches the compaction kernel (4-channel pillars): one thread per grid
// cell in 32x32 tiles, rounding the grid up so the whole BEV plane is
// covered. Enqueued on `stream`; returns the launch status only.
cudaError_t generateBaseFeatures_launch(unsigned int *mask, float *voxels,
        int grid_y_size, int grid_x_size,
        unsigned int *pillar_num,
        float *voxel_features,
        unsigned int *voxel_num,
        unsigned int *voxel_idxs,
        cudaStream_t stream)
{
  dim3 threads(32, 32);
  dim3 blocks((grid_x_size + threads.x - 1) / threads.x,
              (grid_y_size + threads.y - 1) / threads.y);

  generateBaseFeatures_kernel<<<blocks, threads, 0, stream>>>(
      mask, voxels, grid_y_size, grid_x_size,
      pillar_num, voxel_features, voxel_num, voxel_idxs);

  return cudaGetLastError();
}
+
// 4 channels -> 10 channels
// Expands each compact pillar's raw points (x, y, z, w) into the 10-float
// PointPillars input feature: the raw point (4), its offset from the
// pillar's point mean (3), and its offset from the pillar's geometric
// center (3). Launch contract: one warp per pillar, WARP_SIZE point slots
// per pillar, WARPS_PER_BLOCK pillars per block (see generateFeatures_launch).
// NOTE(review): warps whose pillar_idx >= params[0] return before the
// __syncthreads() calls below, and the shared-memory preload indexes
// voxel_num / voxel_idxs past the live pillar count for tail warps; both
// rely on whole-warp exit and on the buffers being sized for MAX_VOXELS --
// confirm on the target architecture.
__global__ void generateFeatures_kernel(float* voxel_features,
    unsigned int* voxel_num, unsigned int* voxel_idxs, unsigned int *params,
    float voxel_x, float voxel_y, float voxel_z,
    float range_min_x, float range_min_y, float range_min_z,
    float* features)
{
    // One warp handles one pillar; lanes map to point slots.
    int pillar_idx = blockIdx.x * WARPS_PER_BLOCK + threadIdx.x/WARP_SIZE;
    int point_idx = threadIdx.x % WARP_SIZE;

    int pillar_idx_inBlock = threadIdx.x/32;
    // params[0] holds the live pillar count written by the compaction pass.
    unsigned int num_pillars = params[0];

    if (pillar_idx >= num_pillars) return;

    // Per-warp staging buffers in shared memory.
    __shared__ float4 pillarSM[WARPS_PER_BLOCK][WARP_SIZE];
    __shared__ float4 pillarSumSM[WARPS_PER_BLOCK];
    __shared__ uint4 idxsSM[WARPS_PER_BLOCK];
    __shared__ int pointsNumSM[WARPS_PER_BLOCK];
    __shared__ float pillarOutSM[WARPS_PER_BLOCK][WARP_SIZE][FEATURES_SIZE];

    // First WARPS_PER_BLOCK threads preload per-pillar metadata.
    if (threadIdx.x < WARPS_PER_BLOCK) {
      pointsNumSM[threadIdx.x] = voxel_num[blockIdx.x * WARPS_PER_BLOCK + threadIdx.x];
      idxsSM[threadIdx.x] = ((uint4*)voxel_idxs)[blockIdx.x * WARPS_PER_BLOCK + threadIdx.x];
      pillarSumSM[threadIdx.x] = {0,0,0,0};
    }

    // Each lane loads its point slot (4 channels) into shared memory.
    pillarSM[pillar_idx_inBlock][point_idx] = ((float4*)voxel_features)[pillar_idx*WARP_SIZE + point_idx];
    __syncthreads();

    // Accumulate the per-pillar coordinate sum (valid slots only) via
    // shared-memory atomics.
    if (point_idx < pointsNumSM[pillar_idx_inBlock]) {
      atomicAdd(&(pillarSumSM[pillar_idx_inBlock].x),  pillarSM[pillar_idx_inBlock][point_idx].x);
      atomicAdd(&(pillarSumSM[pillar_idx_inBlock].y),  pillarSM[pillar_idx_inBlock][point_idx].y);
      atomicAdd(&(pillarSumSM[pillar_idx_inBlock].z),  pillarSM[pillar_idx_inBlock][point_idx].z);
    }
    __syncthreads();

    // Offset of this point from the pillar's point mean (reuses `mean`).
    float4 mean;
    float validPoints = pointsNumSM[pillar_idx_inBlock];
    mean.x = pillarSumSM[pillar_idx_inBlock].x / validPoints;
    mean.y = pillarSumSM[pillar_idx_inBlock].y / validPoints;
    mean.z = pillarSumSM[pillar_idx_inBlock].z / validPoints;

    mean.x  = pillarSM[pillar_idx_inBlock][point_idx].x - mean.x;
    mean.y  = pillarSM[pillar_idx_inBlock][point_idx].y - mean.y;
    mean.z  = pillarSM[pillar_idx_inBlock][point_idx].z - mean.z;


    // Metric center of the pillar cell (idxsSM holds (0, 0, y, x)).
    float x_offset = voxel_x / 2 + idxsSM[pillar_idx_inBlock].w * voxel_x + range_min_x;
    float y_offset = voxel_y / 2 + idxsSM[pillar_idx_inBlock].z * voxel_y + range_min_y;
    float z_offset = voxel_z / 2 + idxsSM[pillar_idx_inBlock].y * voxel_z + range_min_z;

    // Offset of this point from the pillar's cell center.
    float4 center;
    center.x  = pillarSM[pillar_idx_inBlock][point_idx].x - x_offset;
    center.y  = pillarSM[pillar_idx_inBlock][point_idx].y - y_offset;
    center.z  = pillarSM[pillar_idx_inBlock][point_idx].z - z_offset;

    // Assemble the 10 output channels; empty slots are zero-padded.
    if (point_idx < pointsNumSM[pillar_idx_inBlock]) {
      pillarOutSM[pillar_idx_inBlock][point_idx][0] = pillarSM[pillar_idx_inBlock][point_idx].x;
      pillarOutSM[pillar_idx_inBlock][point_idx][1] = pillarSM[pillar_idx_inBlock][point_idx].y;
      pillarOutSM[pillar_idx_inBlock][point_idx][2] = pillarSM[pillar_idx_inBlock][point_idx].z;
      pillarOutSM[pillar_idx_inBlock][point_idx][3] = pillarSM[pillar_idx_inBlock][point_idx].w;

      pillarOutSM[pillar_idx_inBlock][point_idx][4] = mean.x;
      pillarOutSM[pillar_idx_inBlock][point_idx][5] = mean.y;
      pillarOutSM[pillar_idx_inBlock][point_idx][6] = mean.z;

      pillarOutSM[pillar_idx_inBlock][point_idx][7] = center.x;
      pillarOutSM[pillar_idx_inBlock][point_idx][8] = center.y;
      pillarOutSM[pillar_idx_inBlock][point_idx][9] = center.z;

    } else {
      pillarOutSM[pillar_idx_inBlock][point_idx][0] = 0;
      pillarOutSM[pillar_idx_inBlock][point_idx][1] = 0;
      pillarOutSM[pillar_idx_inBlock][point_idx][2] = 0;
      pillarOutSM[pillar_idx_inBlock][point_idx][3] = 0;

      pillarOutSM[pillar_idx_inBlock][point_idx][4] = 0;
      pillarOutSM[pillar_idx_inBlock][point_idx][5] = 0;
      pillarOutSM[pillar_idx_inBlock][point_idx][6] = 0;

      pillarOutSM[pillar_idx_inBlock][point_idx][7] = 0;
      pillarOutSM[pillar_idx_inBlock][point_idx][8] = 0;
      pillarOutSM[pillar_idx_inBlock][point_idx][9] = 0;
    }

    __syncthreads();

    // Flush the staged features to global memory, channel-major per pillar.
    for(int i = 0; i < FEATURES_SIZE; i ++) {
      int outputSMId = pillar_idx_inBlock*WARP_SIZE*FEATURES_SIZE + i* WARP_SIZE + point_idx;
      int outputId = pillar_idx*WARP_SIZE*FEATURES_SIZE + i* WARP_SIZE + point_idx;
      features[outputId] = ((float*)pillarOutSM)[outputSMId] ;
    }

}
+
// Launches the 4->10 channel feature kernel sized for the worst case
// (MAX_VOXELS pillars): each block runs WARPS_PER_BLOCK warps, one warp per
// pillar; the kernel itself skips pillars beyond the live count in params.
// Enqueued on `stream`; returns the launch status only.
cudaError_t generateFeatures_launch(float* voxel_features,
    unsigned int * voxel_num,
    unsigned int* voxel_idxs,
    unsigned int *params,
    float voxel_x, float voxel_y, float voxel_z,
    float range_min_x, float range_min_y, float range_min_z,
    float* features,
    cudaStream_t stream)
{
    const int blockCount = (MAX_VOXELS + WARPS_PER_BLOCK - 1) / WARPS_PER_BLOCK;
    dim3 blocks(blockCount);
    dim3 threads(WARPS_PER_BLOCK * WARP_SIZE);

    generateFeatures_kernel<<<blocks, threads, 0, stream>>>(
        voxel_features, voxel_num, voxel_idxs, params,
        voxel_x, voxel_y, voxel_z,
        range_min_x, range_min_y, range_min_z,
        features);

    return cudaGetLastError();
}

+ 2 - 2
src/tool/pointcloudviewer/main.cpp

@@ -189,9 +189,9 @@ int main(int argc, char *argv[])
     showversion("pointcloudviewer");
     QCoreApplication a(argc, argv);
 
-    snprintf(gstr_memname,255,"lidar_pc");
+//    snprintf(gstr_memname,255,"lidar_pc");
 //    snprintf(gstr_memname,255,"lidar_pc_filter");
-//    snprintf(gstr_memname,255,"lidarpc_center");
+    snprintf(gstr_memname,255,"lidarpc_center");
 
 
     int nRtn = GetOptLong(argc,argv);