Browse Source

Add detection_lidar_pointpillars_cuda. Not yet tested successfully; needs more work.

yuchuli 2 years ago
parent
commit
7b2612868c

+ 73 - 0
src/detection/detection_lidar_pointpillars_cuda/.gitignore

@@ -0,0 +1,73 @@
# This file is used to ignore files which are generated
# ----------------------------------------------------------------------------

*~
*.autosave
*.a
*.core
*.moc
*.o
*.obj
*.orig
*.rej
*.so
*.so.*
*_pch.h.cpp
*_resource.rc
*.qm
.#*
*.*#
# ignore core-dump files, but keep any "core/" source directory
core
!core/
tags
.DS_Store
.directory
*.debug
Makefile*
*.prl
*.app
moc_*.cpp
ui_*.h
qrc_*.cpp
Thumbs.db
*.res
*.rc
/.qmake.cache
/.qmake.stash

# qtcreator generated files
*.pro.user*

# xemacs temporary files
*.flc

# Vim temporary files
.*.swp

# Visual Studio generated files
*.ib_pdb_index
*.idb
*.ilk
*.pdb
*.sln
*.suo
*.vcproj
*vcproj.*.*.user
*.ncb
*.sdf
*.opensdf
*.vcxproj
*vcxproj.*

# MinGW generated files
*.Debug
*.Release

# Python byte code
*.pyc

# Binaries
# --------
*.dll
*.exe
+

+ 152 - 0
src/detection/detection_lidar_pointpillars_cuda/detection_lidar_pointpillars_cuda.pro

@@ -0,0 +1,152 @@
QT -= gui

CONFIG += c++14 #console
CONFIG -= app_bundle

# The following define makes your compiler emit warnings if you use
# any Qt feature that has been marked deprecated (the exact warnings
# depend on your compiler). Please consult the documentation of the
# deprecated API in order to know how to port your code away from it.
DEFINES += QT_DEPRECATED_WARNINGS

# You can also make your code fail to compile if it uses deprecated APIs.
# In order to do so, uncomment the following line.
# You can also select to disable deprecated APIs only up to a certain version of Qt.
#DEFINES += QT_DISABLE_DEPRECATED_BEFORE=0x060000    # disables all the APIs deprecated before Qt 6.0.0

SOURCES += \
        main.cpp \
        src/pillarScatter.cpp \
        src/pointpillar.cpp \
        src/postprocess.cpp \
        src/preprocess.cpp

# Default rules for deployment.
qnx: target.path = /tmp/$${TARGET}/bin
else: unix:!android: target.path = /opt/$${TARGET}/bin
!isEmpty(target.path): INSTALLS += target

# .cu files appear in DISTFILES only so Qt Creator lists them; they are
# compiled by the nvcc extra-compiler rules below (see CUDA_SOURCES).
DISTFILES += \
    src/pillarScatterKernels.cu \
    src/postprocess_kernels.cu \
    src/preprocess_kernels.cu

HEADERS += \
    include/kernel.h \
    include/params.h \
    include/pillarScatter.h \
    include/pointpillar.h \
    include/postprocess.h \
    include/preprocess.h


INCLUDEPATH += $$PWD/include

# Several PCL versions are listed; only the directory that actually exists
# on the build host takes effect.
INCLUDEPATH += /usr/include/pcl-1.7
INCLUDEPATH += /usr/include/pcl-1.8
INCLUDEPATH += /usr/include/pcl-1.10
INCLUDEPATH += /usr/include/eigen3


INCLUDEPATH += /usr/local/cuda/targets/aarch64-linux/include

LIBS += -L/usr/local/cuda/targets/aarch64-linux/lib  # -lcublas



CUDA_SOURCES += \
    src/pillarScatterKernels.cu \
    src/postprocess_kernels.cu \
    src/preprocess_kernels.cu

LIBS += -L"/usr/local/lib" \
        -L"/usr/local/cuda/lib64" \
        -lcudart \
        -lcufft

CUDA_SDK = "/usr/local/cuda/"   # path to the CUDA SDK

CUDA_DIR = "/usr/local/cuda/"            # path to the CUDA toolkit

SYSTEM_NAME = linux         # host system environment: 'Win32', 'x64', or 'Win64'

SYSTEM_TYPE = 64           # OS word size: '32' or '64'

CUDA_ARCH = sm_72 #sm_87 sm_62 # xavier sm_72          # CUDA architecture, for example 'compute_10', 'compute_11', 'sm_10'

NVCC_OPTIONS = --use_fast_math --compiler-options "-fPIC"

# include paths

INCLUDEPATH += $$CUDA_DIR/include
#INCLUDEPATH += /usr/local/cuda-10.0/targets/aarch64-linux/include/crt

# library directories

QMAKE_LIBDIR += $$CUDA_DIR/lib/

CUDA_OBJECTS_DIR = ./

# The following library conflicts with something in Cuda

#QMAKE_LFLAGS_RELEASE = /NODEFAULTLIB:msvcrt.lib

#QMAKE_LFLAGS_DEBUG   = /NODEFAULTLIB:msvcrtd.lib

# Add the necessary libraries

CUDA_LIBS =  cudart cufft

# The following makes sure all path names (which often include spaces) are put between quotation marks

CUDA_INC = $$join(INCLUDEPATH,'" -I"','-I"','"')

NVCC_LIBS = $$join(CUDA_LIBS,' -l','-l', '')

#LIBS += $$join(CUDA_LIBS,'.so ', '', '.so')

# Configuration of the Cuda compiler

CONFIG(debug, debug|release) {

    # Debug mode

    cuda_d.input = CUDA_SOURCES

    cuda_d.output = $$CUDA_OBJECTS_DIR/${QMAKE_FILE_BASE}_cuda.o

    cuda_d.commands = $$CUDA_DIR/bin/nvcc -D_DEBUG $$NVCC_OPTIONS $$CUDA_INC $$NVCC_LIBS --machine $$SYSTEM_TYPE -arch=$$CUDA_ARCH -c -o ${QMAKE_FILE_OUT} ${QMAKE_FILE_NAME}

    cuda_d.dependency_type = TYPE_C

    QMAKE_EXTRA_COMPILERS += cuda_d

}

else {

    # Release mode

    cuda.input = CUDA_SOURCES

    cuda.output = $$CUDA_OBJECTS_DIR/${QMAKE_FILE_BASE}_cuda.o

    cuda.commands = $$CUDA_DIR/bin/nvcc $$NVCC_OPTIONS $$CUDA_INC $$NVCC_LIBS --machine $$SYSTEM_TYPE -arch=$$CUDA_ARCH -O3 -c -o ${QMAKE_FILE_OUT} ${QMAKE_FILE_NAME}

    cuda.dependency_type = TYPE_C

    QMAKE_EXTRA_COMPILERS += cuda

}


LIBS += -lnvinfer -lnvonnxparser

!include(../../../include/common.pri ) {
    error( "Couldn't find the common.pri file!" )
}

!include(../../../include/ivprotobuf.pri ) {
    error( "Couldn't find the ivprotobuf.pri file!" )
}
+

+ 122 - 0
src/detection/detection_lidar_pointpillars_cuda/include/kernel.h

@@ -0,0 +1,122 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _KERNEL_H_
+#define _KERNEL_H_
+
+#include <iostream>
+#include <cuda.h>
+#include <cuda_fp16.h>
+#include <cuda_runtime_api.h>
+#include "params.h"
+
+const int THREADS_FOR_VOXEL = 256;    // threads number for a block
+const int POINTS_PER_VOXEL = 32;      // depands on "params.h"
+const int WARP_SIZE = 32;             // one warp(32 threads) for one pillar
+const int WARPS_PER_BLOCK = 4;        // four warp for one block
+const int FEATURES_SIZE = 10;         // features maps number depands on "params.h"
+const int PILLARS_PER_BLOCK = 64;     // one thread deals with one pillar and a block has PILLARS_PER_BLOCK threads
+const int PILLAR_FEATURE_SIZE = 64;   // feature count for one pillar depands on "params.h"
+
+typedef enum
+{
+    STATUS_SUCCESS = 0,
+    STATUS_FAILURE = 1,
+    STATUS_BAD_PARAM = 2,
+    STATUS_NOT_SUPPORTED = 3,
+    STATUS_NOT_INITIALIZED = 4
+} pluginStatus_t;
+
+#define checkCudaErrors(status)                                   \
+{                                                                 \
+  if (status != 0)                                                \
+  {                                                               \
+    std::cout << "Cuda failure: " << cudaGetErrorString(status)   \
+              << " at line " << __LINE__                          \
+              << " in file " << __FILE__                          \
+              << " error status: " << status                      \
+              << std::endl;                                       \
+              abort();                                            \
+    }                                                             \
+}
+
+cudaError_t generateVoxels_random_launch(float *points, size_t points_size,
+        float min_x_range, float max_x_range,
+        float min_y_range, float max_y_range,
+        float min_z_range, float max_z_range,
+        float pillar_x_size, float pillar_y_size, float pillar_z_size,
+        int grid_y_size, int grid_x_size,
+        unsigned int *mask, float *voxels,
+        cudaStream_t stream = 0);
+
+cudaError_t generateBaseFeatures_launch(unsigned int *mask, float *voxels,
+        int grid_y_size, int grid_x_size,
+        unsigned int *pillar_num,
+        float *voxel_features,
+        unsigned int *voxel_num,
+        unsigned int *voxel_idxs,
+        cudaStream_t stream = 0);
+
+cudaError_t generateFeatures_launch(float* voxel_features,
+    unsigned int *voxel_num,
+    unsigned int *voxel_idxs,
+    unsigned int *params,
+    float voxel_x, float voxel_y, float voxel_z,
+    float range_min_x, float range_min_y, float range_min_z,
+    float* features,
+    cudaStream_t stream = 0);
+
+int pillarScatterHalfKernelLaunch(
+  int max_pillar_num,
+  int num_features,
+  const half *pillar_features_data,
+  const unsigned int *coords_data,
+  const unsigned int *params_data,
+  unsigned int featureX, unsigned int featureY,
+  half *spatial_feature_data,
+  cudaStream_t stream);
+
+int pillarScatterFloatKernelLaunch(
+  int max_pillar_num,
+  int num_features,
+  const float *pillar_features_data,
+  const unsigned int *coords_data,
+  const unsigned int *params_data,
+  unsigned int featureX, unsigned int featureY,
+  float *spatial_feature_data,
+  cudaStream_t stream);
+
+cudaError_t postprocess_launch(const float *cls_input,
+                      float *box_input,
+                      const float *dir_cls_input,
+                      float *anchors,
+                      float *anchor_bottom_heights,
+                      float *bndbox_output,
+                      int *object_counter,
+                      const float min_x_range,
+                      const float max_x_range,
+                      const float min_y_range,
+                      const float max_y_range,
+                      const int feature_x_size,
+                      const int feature_y_size,
+                      const int num_anchors,
+                      const int num_classes,
+                      const int num_box_values,
+                      const float score_thresh,
+                      const float dir_offset,
+                      cudaStream_t stream = 0);
+#endif

+ 71 - 0
src/detection/detection_lidar_pointpillars_cuda/include/params.h

@@ -0,0 +1,71 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
#ifndef PARAMS_H_
#define PARAMS_H_
// Upper bound on the number of pillars/voxels produced by preprocessing.
const int MAX_VOXELS = 40000;
// Static configuration of the PointPillars pipeline: detection range,
// pillar geometry, anchor set, and post-processing thresholds.
class Params
{
  public:
    static const int num_classes = 3;
    const char *class_name [num_classes] = { "Car","Pedestrian","Cyclist",};
    // detection range in meters
    const float min_x_range = -80;
    const float max_x_range = 80;
    const float min_y_range = -80;
    const float max_y_range = 80;
    const float min_z_range = -3.0;
    const float max_z_range = 1.0;
    // the size of a pillar
    const float pillar_x_size = 0.4;
    const float pillar_y_size = 0.4;
    const float pillar_z_size = 4.0;
    const int max_num_points_per_pillar = 16;
    const int num_point_values = 4;
    // the number of feature maps for pillar scatter
    const int num_feature_scatter = 64;
    // direction-classifier offset (approximately pi/4)
    const float dir_offset = 0.78539;
    const float dir_limit_offset = 0.0;
    // the num of direction classes(bins)
    const int num_dir_bins = 2;
    // anchors decode by (x, y, z, dir)
    static const int num_anchors = num_classes * 2;
    static const int len_per_anchor = 4;
    // One row of 4 values per anchor, two yaw orientations per class.
    // Rows appear to be {dx, dy, dz, yaw} -- TODO confirm against the
    // training configuration.
    const float anchors[num_anchors * len_per_anchor] = {
      3.9,1.6,1.56,0.0,
      3.9,1.6,1.56,1.57,
      0.8,0.6,1.73,0.0,
      0.8,0.6,1.73,1.57,
      1.76,0.6,1.73,0.0,
      1.76,0.6,1.73,1.57,
      };
    const float anchor_bottom_heights[num_classes] = {-1.78,-0.6,-0.6,};
    // the score threshold for classification
    const float score_thresh = 0.1;
    const float nms_thresh = 0.01;
    const int max_num_pillars = MAX_VOXELS;
    const int pillarPoints_bev = max_num_points_per_pillar * max_num_pillars;
    // the detected boxes result decode by (x, y, z, w, l, h, yaw)
    const int num_box_values = 7;
    // the input size of the 2D backbone network
    // (float division truncated to int; ranges must divide evenly by the
    // pillar size for these to be exact)
    const int grid_x_size = (max_x_range - min_x_range) / pillar_x_size;
    const int grid_y_size = (max_y_range - min_y_range) / pillar_y_size;
    const int grid_z_size = (max_z_range - min_z_range) / pillar_z_size;
    // the output size of the 2D backbone network
    const int feature_x_size = grid_x_size / 2;
    const int feature_y_size = grid_y_size / 2;
    Params() {};
};
#endif

+ 98 - 0
src/detection/detection_lidar_pointpillars_cuda/include/pillarScatter.h

@@ -0,0 +1,98 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
#ifndef _PILLAR_SCATTER_H_
#define _PILLAR_SCATTER_H_

#include <vector>
#include <string>
#include <cuda_runtime_api.h>
#include "NvInferPlugin.h"
#include "kernel.h"

namespace nvinfer1
{
namespace plugin
{

// TensorRT dynamic plugin that scatters per-pillar feature vectors onto the
// dense BEV grid (feature_y_size_ x feature_x_size_) consumed by the 2D
// backbone. Implements the IPluginV2DynamicExt interface.
class PPScatterPlugin : public nvinfer1::IPluginV2DynamicExt
{
public:
    PPScatterPlugin() = delete;
    // Deserialization constructor; `data` is a buffer written by serialize().
    PPScatterPlugin(const void* data, size_t length);
    // h / w: output BEV grid height and width.
    PPScatterPlugin(size_t h, size_t w);
    // IPluginV2DynamicExt Methods
    nvinfer1::IPluginV2DynamicExt* clone() const noexcept override;
    nvinfer1::DimsExprs getOutputDimensions(int outputIndex, 
        const nvinfer1::DimsExprs* inputs, int nbInputs,
        nvinfer1::IExprBuilder& exprBuilder) noexcept override;
    bool supportsFormatCombination(
        int pos, const nvinfer1::PluginTensorDesc* inOut, 
        int nbInputs, int nbOutputs) noexcept override;
    void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs,
        const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) noexcept override;
    size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs,
        const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const noexcept override;
    int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, 
        const nvinfer1::PluginTensorDesc* outputDesc,
        const void* const* inputs, void* const* outputs, 
        void* workspace, cudaStream_t stream) noexcept override;
    // IPluginV2Ext Methods
    nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, 
        int nbInputs) const noexcept override;
    // IPluginV2 Methods
    const char* getPluginType() const noexcept override;
    const char* getPluginVersion() const noexcept override;
    int getNbOutputs() const noexcept override;
    int initialize() noexcept override;
    void terminate() noexcept override;
    size_t getSerializationSize() const noexcept override;
    void serialize(void* buffer) const noexcept override;
    void destroy() noexcept override;
    void setPluginNamespace(const char* pluginNamespace) noexcept override;
    const char* getPluginNamespace() const noexcept override;

private:
    std::string mNamespace;
    // the y -- output size of the 2D backbone network
    size_t feature_y_size_;
    // the x -- output size of the 2D backbone network
    size_t feature_x_size_;
};

// Factory registered with TensorRT: creates and deserializes PPScatterPlugin
// instances by plugin name and version.
class PPScatterPluginCreator : public nvinfer1::IPluginCreator
{
public:
    PPScatterPluginCreator();
    const char* getPluginName() const noexcept override;
    const char* getPluginVersion() const noexcept override;
    const nvinfer1::PluginFieldCollection* getFieldNames() noexcept override;
    nvinfer1::IPluginV2* createPlugin(const char* name, const nvinfer1::PluginFieldCollection* fc) noexcept override;
    nvinfer1::IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) noexcept override;
    void setPluginNamespace(const char* pluginNamespace) noexcept override;
    const char* getPluginNamespace() const noexcept override;

private:
    static nvinfer1::PluginFieldCollection mFC;
    static std::vector<nvinfer1::PluginField> mPluginAttributes;
    std::string mNamespace;
};

} // namespace plugin
} // namespace nvinfer1

#endif

+ 105 - 0
src/detection/detection_lidar_pointpillars_cuda/include/pointpillar.h

@@ -0,0 +1,105 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <memory>
+
+#include "cuda_runtime.h"
+#include "NvInfer.h"
+#include "NvOnnxConfig.h"
+#include "NvOnnxParser.h"
+#include "NvInferRuntime.h"
+
+#include "postprocess.h"
+#include "preprocess.h"
+
+#define PERFORMANCE_LOG 1
+
+// Logger for TensorRT
+class Logger : public nvinfer1::ILogger {
+  public:
+    void log(Severity severity, const char* msg) noexcept override {
+        // suppress info-level message
+        //if (severity == Severity::kERROR || severity == Severity::kINTERNAL_ERROR || severity == Severity::kINFO ) {
+        if (severity == Severity::kERROR || severity == Severity::kINTERNAL_ERROR) {
+            std::cerr << "trt_infer: " << msg << std::endl;
+        }
+    }
+};
+
// Thin wrapper around a TensorRT engine: holds the engine and execution
// context built/loaded from `modelFile` (construction logic lives in
// pointpillar.cpp) and runs one inference over caller-provided bindings.
class TRT {
  private:
    Params params_;

    // events used for optional performance timing (see PERFORMANCE_LOG)
    cudaEvent_t start_, stop_;

    Logger gLogger_;
    nvinfer1::IExecutionContext *context_ = nullptr;
    nvinfer1::ICudaEngine *engine_ = nullptr;

    // CUDA stream all inference work is issued on
    cudaStream_t stream_ = 0;
  public:
    // modelFile: path to the model file; stream: CUDA stream used for
    // inference (legacy default stream when omitted).
    TRT(std::string modelFile, cudaStream_t stream = 0);
    ~TRT(void);

    // Run one inference; `buffers` is the engine's binding array
    // (input/output device pointers in binding order).
    int doinfer(void**buffers);
};
+
// End-to-end PointPillars detector: GPU preprocessing (voxelization and
// feature generation), TensorRT inference, and post-processing.
class PointPillar {
  private:
    Params params_;

    // events for timing one inference
    cudaEvent_t start_, stop_;
    cudaStream_t stream_;

    // pipeline stages
    std::shared_ptr<PreProcessCuda> pre_;
    std::shared_ptr<TRT> trt_;
    std::shared_ptr<PostProcessCuda> post_;

    //input of pre-process
    float *voxel_features_ = nullptr;
    unsigned int *voxel_num_ = nullptr;
    unsigned int *voxel_idxs_ = nullptr;
    unsigned int *pillar_num_ = nullptr;

    // buffer sizes in bytes for the pointers above
    unsigned int voxel_features_size_ = 0;
    unsigned int voxel_num_size_ = 0;
    unsigned int voxel_idxs_size_ = 0;

    //TRT-input
    float *features_input_ = nullptr;
    unsigned int *params_input_ = nullptr;
    unsigned int features_input_size_ = 0;

    //output of TRT -- input of post-process
    float *cls_output_ = nullptr;
    float *box_output_ = nullptr;
    float *dir_cls_output_ = nullptr;
    unsigned int cls_size_;
    unsigned int box_size_;
    unsigned int dir_cls_size_;

    //output of post-process
    float *bndbox_output_ = nullptr;
    unsigned int bndbox_size_ = 0;

    std::vector<Bndbox> res_;

  public:
    // modelFile: path to the model; stream: CUDA stream for the whole pipeline.
    PointPillar(std::string modelFile, cudaStream_t stream = 0);
    ~PointPillar(void);
    // points: device-accessible buffer of point_size points, 4 floats each
    // (x, y, z, intensity -- see the caller in main.cpp). Detected boxes are
    // appended to res.
    int doinfer(void*points, unsigned int point_size, std::vector<Bndbox> &res);
};
+

+ 58 - 0
src/detection/detection_lidar_pointpillars_cuda/include/postprocess.h

@@ -0,0 +1,58 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef POSTPROCESS_H_
+#define POSTPROCESS_H_
+
+#include <vector>
+#include "kernel.h"
+
/*
box_encodings: (B, N, 7 + C) or (N, 7 + C) [x, y, z, dx, dy, dz, heading or *[cos, sin], ...]
anchors: (B, N, 7 + C) or (N, 7 + C) [x, y, z, dx, dy, dz, heading, ...]
*/
// One detected 3D bounding box: center (x, y, z), size (w, l, h), heading
// angle rt, class id, and classification score.
struct Bndbox {
    float x = 0.f;
    float y = 0.f;
    float z = 0.f;
    float w = 0.f;
    float l = 0.f;
    float h = 0.f;
    float rt = 0.f;
    int id = -1;      // -1 marks "no class"; valid ids index Params::class_name
    float score = 0.f;
    // Default construction now yields a well-defined (zeroed, id = -1) box;
    // the original left every member indeterminate.
    Bndbox() {}
    Bndbox(float x_, float y_, float z_, float w_, float l_, float h_, float rt_, int id_, float score_)
        : x(x_), y(y_), z(z_), w(w_), l(l_), h(h_), rt(rt_), id(id_), score(score_) {}
};
+
// GPU post-processing: decodes the network's cls/box/dir outputs into
// candidate bounding boxes (NMS is done separately on the CPU, below).
class PostProcessCuda {
  private:
    Params params_;
    float *anchors_;                // anchor table (presumably a device copy of Params::anchors -- confirm in postprocess.cpp)
    float *anchor_bottom_heights_;  // per-class anchor bottom heights (same caveat)
    int *object_counter_;           // counter of boxes emitted by the decode kernel
    cudaStream_t stream_ = 0;
  public:
    PostProcessCuda(cudaStream_t stream = 0);
    ~PostProcessCuda();

    // Decode cls/box/dir tensors into bndbox_output; returns a status code.
    int doPostprocessCuda(const float *cls_input, float *box_input, const float *dir_cls_input, float *bndbox_output);
};

// CPU non-maximum suppression over decoded boxes; surviving boxes are
// appended to nms_pred.
int nms_cpu(std::vector<Bndbox> bndboxes, const float nms_thresh, std::vector<Bndbox> &nms_pred);

#endif

+ 47 - 0
src/detection/detection_lidar_pointpillars_cuda/include/preprocess.h

@@ -0,0 +1,47 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel.h"
+
// GPU preprocessing: converts a raw point cloud into the pillar/voxel
// feature tensors consumed by the TensorRT network.
// NOTE(review): unlike the sibling headers, this file has no include guard
// or #pragma once -- worth adding one.
class PreProcessCuda {
  private:
    Params params_;
    unsigned int *mask_;    // per-cell occupancy mask (presumably device memory -- confirm in preprocess.cpp)
    float *voxels_;         // scratch buffer of voxelized points
    int *voxelsList_;
    float *params_cuda_;
    cudaStream_t stream_ = 0;

  public:
    PreProcessCuda(cudaStream_t stream_ = 0);
    ~PreProcessCuda();

    //points cloud -> voxels (BEV) -> feature*4
    // Voxelize points (x,y,z,intensity) into at most MAX_VOXELS pillars;
    // outputs the pillar count, per-pillar features, per-pillar point
    // counts, and grid indices.
    int generateVoxels(float *points, size_t points_size,
        unsigned int *pillar_num,
        float *voxel_features,
        unsigned int *voxel_num,
        unsigned int *voxel_idxs);

    //feature*4 -> feature * 10
    // Augment the 4-value point features to the 10-value features
    // (FEATURES_SIZE in kernel.h).
    int generateFeatures(float* voxel_features,
          unsigned int *voxel_num,
          unsigned int* voxel_idxs,
          unsigned int *params,
          float* features);
};
+

+ 189 - 0
src/detection/detection_lidar_pointpillars_cuda/main.cpp

@@ -0,0 +1,189 @@
+#include <QCoreApplication>
+
+
+#include <iostream>
+#include <sstream>
+#include <fstream>
+#include <pcl/point_types.h>
+#include <pcl/point_cloud.h>
+
+#include "cuda_runtime.h"
+
+#include "./params.h"
+#include "./pointpillar.h"
+
+#include "modulecomm.h"
+
+std::string Model_File = "/home/nvidia/models/lidar/pointpillar.onnx";
+
+PointPillar * gpPointPillar;
+
+cudaEvent_t start, stop;
+float elapsedTime = 0.0f;
+cudaStream_t stream = NULL;
+std::vector<Bndbox> nms_pred;
+
+#define checkCudaErrors(status)                                   \
+{                                                                 \
+  if (status != 0)                                                \
+  {                                                               \
+    std::cout << "Cuda failure: " << cudaGetErrorString(status)   \
+              << " at line " << __LINE__                          \
+              << " in file " << __FILE__                          \
+              << " error status: " << status                      \
+              << std::endl;                                       \
+              abort();                                            \
+    }                                                             \
+}
+
+void Getinfo(void)
+{
+  cudaDeviceProp prop;
+
+  int count = 0;
+  cudaGetDeviceCount(&count);
+  printf("\nGPU has cuda devices: %d\n", count);
+  for (int i = 0; i < count; ++i) {
+    cudaGetDeviceProperties(&prop, i);
+    printf("----device id: %d info----\n", i);
+    printf("  GPU : %s \n", prop.name);
+    printf("  Capbility: %d.%d\n", prop.major, prop.minor);
+    printf("  Global memory: %luMB\n", prop.totalGlobalMem >> 20);
+    printf("  Const memory: %luKB\n", prop.totalConstMem  >> 10);
+    printf("  SM in a block: %luKB\n", prop.sharedMemPerBlock >> 10);
+    printf("  warp size: %d\n", prop.warpSize);
+    printf("  threads in a block: %d\n", prop.maxThreadsPerBlock);
+    printf("  block dim: (%d,%d,%d)\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
+    printf("  grid dim: (%d,%d,%d)\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
+  }
+  printf("\n");
+}
+
+
+//void DectectOneBin(std::shared_ptr<float> & pbin_ptr,const int points_size)
+void DectectOneBin(float * pdata,const int points_size)
+{
+    float *points_data = nullptr;
+    unsigned int points_data_size = points_size * 4 * sizeof(float);
+    checkCudaErrors(cudaMallocManaged((void **)&points_data, points_data_size));
+ //   checkCudaErrors(cudaMemcpy(points_data, pbin_ptr.get(), points_data_size, cudaMemcpyDefault));
+    checkCudaErrors(cudaMemcpy(points_data, pdata, points_data_size, cudaMemcpyDefault));
+    checkCudaErrors(cudaDeviceSynchronize());
+
+    cudaEventRecord(start, stream);
+
+    gpPointPillar->doinfer(points_data, points_size, nms_pred);
+    cudaEventRecord(stop, stream);
+    cudaEventSynchronize(stop);
+    cudaEventElapsedTime(&elapsedTime, start, stop);
+    std::cout<<"TIME: pointpillar: "<< elapsedTime <<" ms." <<std::endl;
+
+    checkCudaErrors(cudaFree(points_data));
+
+    std::cout<<"Bndbox objs: "<< nms_pred.size()<<std::endl;
+//    std::string save_file_name = Save_Dir + index_str + ".txt";
+//    SaveBoxPred(nms_pred, save_file_name);
+
+    for (const auto box : nms_pred) {
+
+        if(box.id != 0)continue;
+      std::cout << box.x << " ";
+      std::cout << box.y << " ";
+      std::cout << box.z << " ";
+      std::cout << box.w << " ";
+      std::cout << box.l << " ";
+      std::cout << box.h << " ";
+      std::cout << box.rt << " ";
+      std::cout << box.id << " ";
+      std::cout << box.score << " ";
+      std::cout << std::endl;
+    }
+
+    nms_pred.clear();
+
+    delete pdata;
+
+    std::cout << ">>>>>>>>>>>" <<std::endl;
+}
+
+void ListenPointCloud(const char * strdata,const unsigned int nSize,const unsigned int index,const QDateTime * dt,const char * strmemname)
+{
+    if(nSize <=16)return;
+    unsigned int * pHeadSize = (unsigned int *)strdata;
+    if(*pHeadSize > nSize)
+    {
+        std::cout<<"ListenPointCloud data is small headsize ="<<*pHeadSize<<"  data size is"<<nSize<<std::endl;
+    }
+
+
+    int nNameSize;
+    nNameSize = *pHeadSize - 4-4-8;
+    char * strName = new char[nNameSize+1];strName[nNameSize] = 0;
+    memcpy(strName,(char *)((char *)strdata +4),nNameSize);
+    int nPCount = (nSize - *pHeadSize)/sizeof(pcl::PointXYZI);
+    int i;
+    pcl::PointXYZI * p;
+    p = (pcl::PointXYZI *)((char *)strdata + *pHeadSize);
+
+    std::shared_ptr<float> pbin_ptr = std::shared_ptr<float>(new float[nPCount*4]);
+
+    float * pdata = new float[nPCount * 4];// pbin_ptr.get();
+
+    for(i=0;i<nPCount;i++)
+    {
+        pcl::PointXYZI xp;
+
+        memcpy(&xp,p,sizeof(pcl::PointXYZI));
+        xp.z = xp.z;
+
+        pdata[4*i + 0]= static_cast<float>(xp.x);
+        pdata[4*i + 1]= static_cast<float>(xp.y);
+        pdata[4*i + 2]= static_cast<float>(xp.z);
+        pdata[4*i + 3]= static_cast<float>(xp.intensity);
+
+//        pdata[4*i + 0]= 0;
+//        pdata[4*i + 1]= 0;
+//        pdata[4*i + 2]= 0;
+//        pdata[4*i + 3]= 0;
+
+        p++;
+
+    }
+
+    DectectOneBin(pdata,nPCount);
+ //   DectectOneBin(pbin_ptr,nPCount);
+
+
+
+}
+
+
+int main(int argc, char *argv[])
+{
+    QCoreApplication a(argc, argv);
+
+    Getinfo();
+
+
+
+    checkCudaErrors(cudaEventCreate(&start));
+    checkCudaErrors(cudaEventCreate(&stop));
+    checkCudaErrors(cudaStreamCreate(&stream));
+
+    Params params_;
+
+
+    nms_pred.reserve(100);
+
+    PointPillar pointpillar(Model_File, stream);
+
+    gpPointPillar = &pointpillar;
+
+    void * pa = iv::modulecomm::RegisterRecv("lidarpc_center",ListenPointCloud);
+
+    return a.exec();
+
+    checkCudaErrors(cudaEventDestroy(start));
+    checkCudaErrors(cudaEventDestroy(stop));
+    checkCudaErrors(cudaStreamDestroy(stream));
+}

+ 306 - 0
src/detection/detection_lidar_pointpillars_cuda/src/pillarScatter.cpp

@@ -0,0 +1,306 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <cassert>
+#include <cstring>
+#include "pillarScatter.h"
+
+using namespace nvinfer1;
+using nvinfer1::plugin::PPScatterPlugin;
+using nvinfer1::plugin::PPScatterPluginCreator;
+
+static const char* PLUGIN_VERSION{"1"};
+static const char* PLUGIN_NAME{"PPScatterPlugin"};
+
+// Static class fields initialization
+PluginFieldCollection PPScatterPluginCreator::mFC{};
+std::vector<PluginField> PPScatterPluginCreator::mPluginAttributes;
+
+// Helper function for serializing plugin
// Helper for serializing plugin state.
// Copies `val` into the byte buffer and advances `buffer` past it.
// BUGFIX: the original wrote through `reinterpret_cast<T*>(buffer)`, which is
// undefined behavior when the char buffer is not suitably aligned for T (and
// violates strict aliasing); std::memcpy is the well-defined equivalent.
template <typename T>
void writeToBuffer(char*& buffer, const T& val)
{
    std::memcpy(buffer, &val, sizeof(T));
    buffer += sizeof(T);
}
+
+// Helper function for deserializing plugin
// Helper for deserializing plugin state.
// Reads a T from the byte buffer and advances `buffer` past it.
// BUGFIX: the original read through `reinterpret_cast<const T*>(buffer)`,
// which is undefined behavior for misaligned buffers; std::memcpy is the
// well-defined equivalent and compiles to the same code on common targets.
template <typename T>
T readFromBuffer(const char*& buffer)
{
    T val;
    std::memcpy(&val, buffer, sizeof(T));
    buffer += sizeof(T);
    return val;
}
+
// Build-time constructor: the scattered feature-map height/width come from
// the ONNX "dense_shape" attribute via the plugin creator.
PPScatterPlugin::PPScatterPlugin(size_t h, size_t w)
  : feature_y_size_(h), feature_x_size_(w)
{
}

// Deserialization constructor: reads back the two size_t fields written by
// serialize(), in the same order (height first, then width).
PPScatterPlugin::PPScatterPlugin(const void* data, size_t length)
{
    const char* d = reinterpret_cast<const char*>(data);
    feature_y_size_ = readFromBuffer<size_t>(d);
    feature_x_size_ = readFromBuffer<size_t>(d);
}
+
// Returns an independent copy of the plugin carrying the same feature-map
// shape and plugin namespace; TensorRT takes ownership of the new object.
nvinfer1::IPluginV2DynamicExt* PPScatterPlugin::clone() const noexcept
{
    auto* plugin = new PPScatterPlugin(feature_y_size_, feature_x_size_);
    plugin->setPluginNamespace(mNamespace.c_str());
    return plugin;
}
+
// Output shape is [1, C, H, W]: batch 1, channel count taken from the pillar
// feature input (inputs[0].d[1]), spatial extent fixed to the configured
// feature map size.
nvinfer1::DimsExprs PPScatterPlugin::getOutputDimensions(
    int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, nvinfer1::IExprBuilder& exprBuilder) noexcept
{
    assert(outputIndex == 0);
    nvinfer1::DimsExprs output;
    output.nbDims = 4;
    output.d[0] = exprBuilder.constant(1);
    output.d[1] = inputs[0].d[1];
    output.d[2] = exprBuilder.constant(feature_y_size_);
    output.d[3] = exprBuilder.constant(feature_x_size_);
    return output;
}
+
+bool PPScatterPlugin::supportsFormatCombination(
+    int pos, const nvinfer1::PluginTensorDesc* inOut, int nbInputs, int nbOutputs) noexcept
+{
+    assert(nbInputs == 3);
+    assert(nbOutputs == 1);
+    const PluginTensorDesc& in = inOut[pos];
+    if (pos == 0)
+    {
+        return (in.type == nvinfer1::DataType::kFLOAT || in.type == nvinfer1::DataType::kHALF) && (in.format == TensorFormat::kLINEAR);
+    }
+    if (pos == 1)
+    {
+        return (in.type == nvinfer1::DataType::kINT32) && (in.format == TensorFormat::kLINEAR);
+    }
+    if (pos == 2)
+    {
+        return (in.type == nvinfer1::DataType::kINT32) && (in.format == TensorFormat::kLINEAR);
+    }
+    if (pos == 3)
+    {
+        return (inOut[0].type == nvinfer1::DataType::kFLOAT && in.type == nvinfer1::DataType::kFLOAT && in.format == TensorFormat::kLINEAR)
+            || (inOut[0].type == nvinfer1::DataType::kHALF  && in.type == nvinfer1::DataType::kHALF  && in.format == TensorFormat::kHWC8);
+    }
+    return false;
+}
+
// No shape-dependent state to precompute; intentionally a no-op.
void PPScatterPlugin::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs,
    const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) noexcept
{
    return;
}

// The scatter kernel works entirely in-place on the bindings; no scratch
// workspace is requested from TensorRT.
size_t PPScatterPlugin::getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs,
    const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const noexcept
{
    return 0;
}
// Runtime hook: zero the dense feature map, then scatter each pillar's
// feature vector to its (y, x) cell. Dispatches to the half or float kernel
// based on the input binding's runtime type. Returns 0 on success, -1 on
// failure (including unsupported input types).
int PPScatterPlugin::enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
    const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs, void* const* outputs, void* workspace,
    cudaStream_t stream) noexcept
{
    try
    {
        int maxPillarNum = inputDesc[0].dims.d[0];
        int numFeatures = inputDesc[0].dims.d[1];
        
        nvinfer1::DataType inputType = inputDesc[0].type;

        // inputs[1]: voxel coordinates, inputs[2]: valid pillar count
        // (read on-device by the kernel; not dereferenced on the host here).
        auto coords_data = static_cast<const unsigned int *>(inputs[1]);
        auto params_data = static_cast<const unsigned int *>(inputs[2]);

        unsigned int featureY = feature_y_size_;
        unsigned int featureX = feature_x_size_;

        int status = -1;

        if(inputType == nvinfer1::DataType::kHALF){
            auto pillar_features_data = static_cast<const half *>(inputs[0]);
            auto spatial_feature_data = static_cast<half *>(outputs[0]);
            // NOTE(review): cudaMemsetAsync return value is not checked here
            // (nor in the float path below) — consider wrapping it.
            cudaMemsetAsync(spatial_feature_data, 0, numFeatures*featureY*featureX * sizeof(half), stream);
            status = pillarScatterHalfKernelLaunch(
                maxPillarNum,
                numFeatures,
                pillar_features_data,
                coords_data,
                params_data,
                featureX,
                featureY,
                spatial_feature_data,
                stream
                );
            assert(status == STATUS_SUCCESS);
            return status;
        }
        else if(inputType == nvinfer1::DataType::kFLOAT){
            auto pillar_features_data = static_cast<const float *>(inputs[0]);
            auto spatial_feature_data = static_cast<float *>(outputs[0]);
            cudaMemsetAsync(spatial_feature_data, 0, numFeatures*featureY*featureX * sizeof(float), stream);
            status = pillarScatterFloatKernelLaunch(
                maxPillarNum,
                numFeatures,
                pillar_features_data,
                coords_data,
                params_data,
                featureX,
                featureY,
                spatial_feature_data,
                stream
                );
            assert(status == STATUS_SUCCESS);
            return status;
        }
        else{
            // Unsupported data type: status is still -1, so in debug builds
            // this assert always fires; release builds just return -1.
            assert(status == STATUS_SUCCESS);
            return status;
        }
    }
    catch (const std::exception& e)
    {
        std::cerr << e.what() << std::endl;
    }
    return -1;
}
+
// Output tensor has the same data type as the pillar-feature input.
nvinfer1::DataType PPScatterPlugin::getOutputDataType(
    int index, const nvinfer1::DataType* inputTypes, int nbInputs) const noexcept
{
    return inputTypes[0];
}

// Plugin identity used by TensorRT to match serialized engines to creators.
const char* PPScatterPlugin::getPluginType() const noexcept
{
    return PLUGIN_NAME;
}

const char* PPScatterPlugin::getPluginVersion() const noexcept
{
    return PLUGIN_VERSION;
}

// Single output: the dense BEV feature map.
int PPScatterPlugin::getNbOutputs() const noexcept
{
    return 1;
}

// No deferred setup needed; resources are created in the constructor.
int PPScatterPlugin::initialize() noexcept
{
    return 0;
}

// Nothing to release outside the destructor.
void PPScatterPlugin::terminate() noexcept
{
}
+
+size_t PPScatterPlugin::getSerializationSize() const noexcept
+{
+    return 3 * sizeof(size_t);
+}
+
// Writes the two size_t fields (height first, then width) in the same order
// the deserialization constructor reads them.
void PPScatterPlugin::serialize(void* buffer) const noexcept
{
    char* d = reinterpret_cast<char*>(buffer);
    writeToBuffer<size_t>(d, feature_y_size_);
    writeToBuffer<size_t>(d, feature_x_size_);
}
+
// Called by TensorRT when the engine owning this plugin is destroyed.
void PPScatterPlugin::destroy() noexcept
{
    delete this;
}

// Namespace accessors required by the IPluginV2 interface.
void PPScatterPlugin::setPluginNamespace(const char* libNamespace) noexcept
{
    mNamespace = libNamespace;
}

const char* PPScatterPlugin::getPluginNamespace() const noexcept
{
    return mNamespace.c_str();
}
+
// Registers the single plugin attribute ("dense_shape") that createPlugin()
// expects to find in the ONNX node.
PPScatterPluginCreator::PPScatterPluginCreator()
{
    mPluginAttributes.clear();
    mPluginAttributes.emplace_back(PluginField("dense_shape", nullptr, PluginFieldType::kINT32, 1));
    mFC.nbFields = mPluginAttributes.size();
    mFC.fields = mPluginAttributes.data();
}

// Creator identity: must match the plugin's own name/version so TensorRT can
// pair serialized engines with this creator.
const char* PPScatterPluginCreator::getPluginName() const noexcept
{
    return PLUGIN_NAME;
}

const char* PPScatterPluginCreator::getPluginVersion() const noexcept
{
    return PLUGIN_VERSION;
}

const PluginFieldCollection* PPScatterPluginCreator::getFieldNames() noexcept
{
    return &mFC;
}
+
// Builds a plugin instance from the ONNX node's "dense_shape" attribute:
// ts[0] = feature map height, ts[1] = feature map width.
// NOTE(review): the attribute is registered with length 1 in the creator
// ctor but two ints are read here — presumably the exporter emits two;
// verify against the ONNX graph. If the field is absent, the plugin is
// created with a 0x0 feature map.
IPluginV2* PPScatterPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) noexcept
{
    const PluginField* fields = fc->fields;
    int nbFields = fc->nbFields;
    int target_h = 0;
    int target_w = 0;
    for (int i = 0; i < nbFields; ++i)
    {
        const char* attr_name = fields[i].name;
        if (!strcmp(attr_name, "dense_shape"))
        {
            const int* ts = static_cast<const int*>(fields[i].data);
            target_h = ts[0];
            target_w = ts[1];
        }
    }
    auto* plugin = new PPScatterPlugin(
        target_h,
        target_w
    );
    return plugin;
}
+
// Reconstructs a plugin from a serialized engine blob.
IPluginV2* PPScatterPluginCreator::deserializePlugin(
    const char* name, const void* serialData, size_t serialLength) noexcept
{
    // This object will be deleted when the network is destroyed,
    auto* plugin = new PPScatterPlugin(serialData, serialLength);
    return plugin;
}

// Namespace accessors required by the IPluginCreator interface.
void PPScatterPluginCreator::setPluginNamespace(const char* libNamespace) noexcept
{
    mNamespace = libNamespace;
}

const char* PPScatterPluginCreator::getPluginNamespace() const noexcept
{
    return mNamespace.c_str();
}

// Makes the creator discoverable by the ONNX parser at load time.
REGISTER_TENSORRT_PLUGIN(PPScatterPluginCreator);

+ 126 - 0
src/detection/detection_lidar_pointpillars_cuda/src/pillarScatterKernels.cu

@@ -0,0 +1,126 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel.h"
+
// Scatters per-pillar fp16 feature vectors into a dense BEV map laid out as
// kHWC8: [1][H][W][(C+7)/8*8].
// Launch contract: blockDim.x == PILLARS_PER_BLOCK; threadIdx.x doubles as
// the feature index during the shared-memory load and as the pillar index
// afterwards, so this assumes PILLARS_PER_BLOCK == PILLAR_FEATURE_SIZE —
// TODO confirm against kernel.h.
__global__ void pillarScatterHalfkernel(const half *pillar_features_data,
                                        const unsigned int *coords_data, const unsigned int *params_data,
                                        unsigned int featureX, unsigned int featureY,
                                        half *spatial_feature_data)
{
    int pillar_idx = blockIdx.x * PILLARS_PER_BLOCK + threadIdx.x;
    int valid_pillars_inBlock = PILLARS_PER_BLOCK;
    // params_data[0] holds the number of valid pillars produced upstream.
    const int num_pillars = params_data[0];
    int valid_blocks = (num_pillars+PILLARS_PER_BLOCK-1)/PILLARS_PER_BLOCK;
    if(blockIdx.x >= valid_blocks) return;
    if(blockIdx.x == (valid_blocks-1)) {
        valid_pillars_inBlock = num_pillars % PILLARS_PER_BLOCK;
    }
    valid_pillars_inBlock = (valid_pillars_inBlock==0) ? PILLARS_PER_BLOCK : valid_pillars_inBlock;
    __shared__ half pillarSM[PILLARS_PER_BLOCK][PILLAR_FEATURE_SIZE]; //pillar*64
    // Cooperative load: each thread fetches feature channel threadIdx.x of
    // every valid pillar in this block into shared memory.
    for (int i = 0; i < valid_pillars_inBlock; i++)
    {
        pillarSM[i][threadIdx.x] = pillar_features_data[ (blockIdx.x * PILLARS_PER_BLOCK +i)*PILLAR_FEATURE_SIZE + threadIdx.x];
    }
    // Barrier: the write/read roles of threadIdx.x swap below, so all loads
    // must land before any thread reads another thread's column.
    __syncthreads();
    if(pillar_idx >= num_pillars) return;
    // coords_data packs 4 uints per pillar; .z = y cell, .w = x cell.
    uint4 coord = ((const uint4 *)coords_data)[pillar_idx];
    unsigned int x = coord.w;
    unsigned int y = coord.z;

    // Output tensor format : kHWC8, [N][H][W][(C+7)/8*8]
    int C_stride = (PILLAR_FEATURE_SIZE+7)/8*8;
    for (int i = 0; i < PILLAR_FEATURE_SIZE; i++)
    {
        spatial_feature_data[y*featureX*C_stride + x*C_stride + i] = pillarSM[threadIdx.x][i];
    }
}
+
// fp32 twin of pillarScatterHalfkernel, but the output is channel-major
// linear (kLINEAR): [1][C][H][W]. Same launch contract: assumes
// PILLARS_PER_BLOCK == PILLAR_FEATURE_SIZE — TODO confirm against kernel.h.
__global__ void pillarScatterFloatkernel(const float *pillar_features_data,
                                         const unsigned int *coords_data, const unsigned int *params_data,
                                         unsigned int featureX, unsigned int featureY,
                                         float *spatial_feature_data)
{
    int pillar_idx = blockIdx.x * PILLARS_PER_BLOCK + threadIdx.x;
    int valid_pillars_inBlock = PILLARS_PER_BLOCK;
    // params_data[0] holds the number of valid pillars produced upstream.
    const int num_pillars = params_data[0];
    int valid_blocks = (num_pillars+PILLARS_PER_BLOCK-1)/PILLARS_PER_BLOCK;
    if(blockIdx.x >= valid_blocks) return;
    if(blockIdx.x == (valid_blocks-1)) {
        valid_pillars_inBlock = num_pillars % PILLARS_PER_BLOCK;
    }
    valid_pillars_inBlock = (valid_pillars_inBlock==0) ? PILLARS_PER_BLOCK : valid_pillars_inBlock;
    __shared__ float pillarSM[PILLARS_PER_BLOCK][PILLAR_FEATURE_SIZE]; //pillar*64
    // Cooperative load: each thread fetches feature channel threadIdx.x of
    // every valid pillar in this block into shared memory.
    for (int i = 0; i < valid_pillars_inBlock; i++)
    {
        pillarSM[i][threadIdx.x] = pillar_features_data[ (blockIdx.x * PILLARS_PER_BLOCK +i)*PILLAR_FEATURE_SIZE + threadIdx.x];
    }
    __syncthreads();
    if(pillar_idx >= num_pillars) return;
    // coords_data packs 4 uints per pillar; .z = y cell, .w = x cell.
    uint4 coord = ((const uint4 *)coords_data)[pillar_idx];
    unsigned int x = coord.w;
    unsigned int y = coord.z;

    // Channel-major scatter: plane i, row y, column x.
    for (int i = 0; i < PILLAR_FEATURE_SIZE; i++)
    {
        spatial_feature_data[i*featureY*featureX + y*featureX + x] = pillarSM[threadIdx.x][i];
    }
}
+
// Host-side launcher for the fp16 scatter kernel on the given stream.
// Grid is sized over the full feature map (the kernel early-outs past the
// valid pillar count). Returns 0 on success, -1 on a launch error.
int pillarScatterHalfKernelLaunch(int max_pillar_num,
                                  int num_features,
                                  const half *pillar_features_data,
                                  const unsigned int *coords_data,
                                  const unsigned int *params_data,
                                  unsigned int featureX, unsigned int featureY,
                                  half *spatial_feature_data,
                                  cudaStream_t stream)
{
    const unsigned int totalCells = featureX * featureY;
    dim3 gridDim((totalCells + PILLARS_PER_BLOCK - 1) / PILLARS_PER_BLOCK);
    dim3 blockDim(PILLARS_PER_BLOCK);

    pillarScatterHalfkernel<<<gridDim, blockDim, 0, stream>>>(
        pillar_features_data, coords_data, params_data,
        featureX, featureY, spatial_feature_data);

    cudaError_t status = cudaGetLastError();
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(status));
        return -1;
    }
    return 0;
}
+
// Host-side launcher for the fp32 scatter kernel on the given stream.
// Mirrors pillarScatterHalfKernelLaunch; returns 0 on success, -1 on a
// launch error.
int pillarScatterFloatKernelLaunch(int max_pillar_num,
                                   int num_features,
                                   const float *pillar_features_data,
                                   const unsigned int *coords_data,
                                   const unsigned int *params_data,
                                   unsigned int featureX, unsigned int featureY,
                                   float *spatial_feature_data,
                                   cudaStream_t stream)
{
    const unsigned int totalCells = featureX * featureY;
    dim3 gridDim((totalCells + PILLARS_PER_BLOCK - 1) / PILLARS_PER_BLOCK);
    dim3 blockDim(PILLARS_PER_BLOCK);

    pillarScatterFloatkernel<<<gridDim, blockDim, 0, stream>>>(
        pillar_features_data, coords_data, params_data,
        featureX, featureY, spatial_feature_data);

    cudaError_t status = cudaGetLastError();
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(status));
        return -1;
    }
    return 0;
}

+ 312 - 0
src/detection/detection_lidar_pointpillars_cuda/src/pointpillar.cpp

@@ -0,0 +1,312 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pointpillar.h"
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <cuda_runtime.h>
+#include "NvInfer.h"
+#include "NvOnnxConfig.h"
+#include "NvOnnxParser.h"
+#include "NvInferRuntime.h"
+
// Releases the execution context, engine, and timing events.
// NOTE(review): `delete` on engine/context requires TensorRT 8+; earlier
// versions need ->destroy(). Confirm against the linked TRT version.
TRT::~TRT(void)
{
  delete(context_);
  delete(engine_);
  checkCudaErrors(cudaEventDestroy(start_));
  checkCudaErrors(cudaEventDestroy(stop_));
  return;
}
+
// Builds (or loads) the TensorRT engine for the given ONNX model.
// If "<modelFile>.cache" exists it is deserialized directly; otherwise the
// ONNX file is parsed, an engine is built, and the serialized engine is
// written to the cache for subsequent runs.
TRT::TRT(std::string modelFile, cudaStream_t stream):stream_(stream)
{
  std::string modelCache = modelFile + ".cache";
  std::fstream trtCache(modelCache, std::ifstream::in);
  checkCudaErrors(cudaEventCreate(&start_));
  checkCudaErrors(cudaEventCreate(&stop_));
  if (!trtCache.is_open())
  {
	  std::cout << "Building TRT engine."<<std::endl;
    // define builder
    auto builder = (nvinfer1::createInferBuilder(gLogger_));

    // define network
    const auto explicitBatch = 1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
    auto network = (builder->createNetworkV2(explicitBatch));

    // define onnxparser
    auto parser = (nvonnxparser::createParser(*network, gLogger_));
    if (!parser->parseFromFile(modelFile.data(), static_cast<int>(nvinfer1::ILogger::Severity::kWARNING)))
    {
        std::cerr << ": failed to parse onnx model file, please check the onnx version and trt support op!"
                  << std::endl;
        exit(-1);
    }

    // define config
    auto networkConfig = builder->createBuilderConfig();
#if defined (__arm64__) || defined (__aarch64__) 
    // NOTE(review): the kFP16 flag below is commented out, so the log line
    // claims fp16 is enabled while the build is actually fp32.
//    networkConfig->setFlag(nvinfer1::BuilderFlag::kFP16);
    std::cout << "Enable fp16!" << std::endl;
#endif
    // set max batch size
    builder->setMaxBatchSize(1);
    // set max workspace
    networkConfig->setMaxWorkspaceSize(size_t(1) << 30);

    engine_ = (builder->buildEngineWithConfig(*network, *networkConfig));

    if (engine_ == nullptr)
    {
      std::cerr << ": engine init null!" << std::endl;
      exit(-1);
    }

    // serialize the engine, then close everything down
    auto trtModelStream = (engine_->serialize());
    std::fstream trtOut(modelCache, std::ifstream::out);
    if (!trtOut.is_open())
    {
       std::cout << "Can't store trt cache.\n";
       exit(-1);
    }

    trtOut.write((char*)trtModelStream->data(), trtModelStream->size());
    trtOut.close();
    trtModelStream->destroy();

    networkConfig->destroy();
    parser->destroy();
    network->destroy();
    builder->destroy();

  } else {
	  std::cout << "load TRT cache."<<std::endl;
    char *data;
    unsigned int length;

    // get length of file:
    trtCache.seekg(0, trtCache.end);
    length = trtCache.tellg();
    trtCache.seekg(0, trtCache.beg);

    data = (char *)malloc(length);
    if (data == NULL ) {
       std::cout << "Can't malloc data.\n";
       exit(-1);
    }

    trtCache.read(data, length);
    // create context
    auto runtime = nvinfer1::createInferRuntime(gLogger_);

    if (runtime == nullptr) {	  std::cout << "load TRT cache0."<<std::endl;
        std::cerr << ": runtime null!" << std::endl;
        exit(-1);
    }
    //plugin_ = nvonnxparser::createPluginFactory(gLogger_);
    // NOTE(review): the 3-argument deserializeCudaEngine overload is the
    // deprecated pre-TRT8 form — confirm against the linked TRT version.
    engine_ = (runtime->deserializeCudaEngine(data, length, 0));
    if (engine_ == nullptr) {
        std::cerr << ": engine null!" << std::endl;
        exit(-1);
    }
    free(data);
    trtCache.close();
  }

  context_ = engine_->createExecutionContext();
  return;
}
+
+int TRT::doinfer(void**buffers)
+{
+  int status;
+
+  status = context_->enqueueV2(buffers, stream_, &start_);
+
+  if (!status)
+  {
+      return -1;
+  }
+
+  return 0;
+}
+
// Sets up the full pipeline (pre-process, TRT engine, post-process) and
// allocates all intermediate buffers as CUDA managed memory so both host
// and device code can touch them.
PointPillar::PointPillar(std::string modelFile, cudaStream_t stream):stream_(stream)
{
  checkCudaErrors(cudaEventCreate(&start_));
  checkCudaErrors(cudaEventCreate(&stop_));

  pre_.reset(new PreProcessCuda(stream_));
  trt_.reset(new TRT(modelFile, stream_));
  post_.reset(new PostProcessCuda(stream_));

  //point cloud to voxels
  // 4 floats per point (x, y, z, intensity) per pillar slot.
  voxel_features_size_ = MAX_VOXELS * params_.max_num_points_per_pillar * 4 * sizeof(float);
  voxel_num_size_ = MAX_VOXELS * sizeof(unsigned int);
  voxel_idxs_size_ = MAX_VOXELS* 4 * sizeof(unsigned int);

  checkCudaErrors(cudaMallocManaged((void **)&voxel_features_, voxel_features_size_));
  checkCudaErrors(cudaMallocManaged((void **)&voxel_num_, voxel_num_size_));
  checkCudaErrors(cudaMallocManaged((void **)&voxel_idxs_, voxel_idxs_size_));

  checkCudaErrors(cudaMemsetAsync(voxel_features_, 0, voxel_features_size_, stream_));
  checkCudaErrors(cudaMemsetAsync(voxel_num_, 0, voxel_num_size_, stream_));
  checkCudaErrors(cudaMemsetAsync(voxel_idxs_, 0, voxel_idxs_size_, stream_));

  //TRT-input
  // 10 encoded features per point after generateFeatures().
  features_input_size_ = MAX_VOXELS * params_.max_num_points_per_pillar * 10 * sizeof(float);
  checkCudaErrors(cudaMallocManaged((void **)&features_input_, features_input_size_));
  checkCudaErrors(cudaMallocManaged((void **)&params_input_, sizeof(unsigned int)));

  checkCudaErrors(cudaMemsetAsync(features_input_, 0, features_input_size_, stream_));
  checkCudaErrors(cudaMemsetAsync(params_input_, 0, sizeof(unsigned int), stream_));

  //output of TRT -- input of post-process
  cls_size_ = params_.feature_x_size * params_.feature_y_size * params_.num_classes * params_.num_anchors * sizeof(float);
  box_size_ = params_.feature_x_size * params_.feature_y_size * params_.num_box_values * params_.num_anchors * sizeof(float);
  dir_cls_size_ = params_.feature_x_size * params_.feature_y_size * params_.num_dir_bins * params_.num_anchors * sizeof(float);
  checkCudaErrors(cudaMallocManaged((void **)&cls_output_, cls_size_));
  checkCudaErrors(cudaMallocManaged((void **)&box_output_, box_size_));
  checkCudaErrors(cudaMallocManaged((void **)&dir_cls_output_, dir_cls_size_));

  //output of post-process
  // 9 floats per candidate box, plus a leading count slot.
  bndbox_size_ = (params_.feature_x_size * params_.feature_y_size * params_.num_anchors * 9 + 1) * sizeof(float);
  checkCudaErrors(cudaMallocManaged((void **)&bndbox_output_, bndbox_size_));

  res_.reserve(100);
  return;
}
+
// Tears down pipeline stages first, then frees every managed buffer and the
// timing events.
PointPillar::~PointPillar(void)
{
  pre_.reset();
  trt_.reset();
  post_.reset();

  checkCudaErrors(cudaFree(voxel_features_));
  checkCudaErrors(cudaFree(voxel_num_));
  checkCudaErrors(cudaFree(voxel_idxs_));

  checkCudaErrors(cudaFree(features_input_));
  checkCudaErrors(cudaFree(params_input_));

  checkCudaErrors(cudaFree(cls_output_));
  checkCudaErrors(cudaFree(box_output_));
  checkCudaErrors(cudaFree(dir_cls_output_));

  checkCudaErrors(cudaFree(bndbox_output_));

  checkCudaErrors(cudaEventDestroy(start_));
  checkCudaErrors(cudaEventDestroy(stop_));
  return;
}
+
// Runs the full detection pipeline on one point cloud:
// voxelize -> encode features -> TRT inference -> decode boxes -> CPU NMS.
// points_data: device/managed buffer of points_size points (4 floats each);
// surviving boxes are appended to nms_pred. Timing sections are compiled in
// only when PERFORMANCE_LOG is set.
int PointPillar::doinfer(void*points_data, unsigned int points_size, std::vector<Bndbox> &nms_pred)
{
#if PERFORMANCE_LOG
  float generateVoxelsTime = 0.0f;
  checkCudaErrors(cudaEventRecord(start_, stream_));
#endif

  pre_->generateVoxels((float*)points_data, points_size,
        params_input_,
        voxel_features_, 
        voxel_num_,
        voxel_idxs_);

#if PERFORMANCE_LOG
  checkCudaErrors(cudaEventRecord(stop_, stream_));
  checkCudaErrors(cudaDeviceSynchronize());
  checkCudaErrors(cudaEventElapsedTime(&generateVoxelsTime, start_, stop_));
  unsigned int params_input_cpu;
  checkCudaErrors(cudaMemcpy(&params_input_cpu, params_input_, sizeof(unsigned int), cudaMemcpyDefault));
  std::cout<<"find pillar_num: "<< params_input_cpu <<std::endl;
#endif

#if PERFORMANCE_LOG
  float generateFeaturesTime = 0.0f;
  checkCudaErrors(cudaEventRecord(start_, stream_));
#endif

  pre_->generateFeatures(voxel_features_,
      voxel_num_,
      voxel_idxs_,
      params_input_,
      features_input_);

#if PERFORMANCE_LOG
  checkCudaErrors(cudaEventRecord(stop_, stream_));
  checkCudaErrors(cudaEventSynchronize(stop_));
  checkCudaErrors(cudaEventElapsedTime(&generateFeaturesTime, start_, stop_));
#endif

#if PERFORMANCE_LOG
  float doinferTime = 0.0f;
  checkCudaErrors(cudaEventRecord(start_, stream_));
#endif

  // Binding order must match the engine: 3 inputs then 3 outputs.
  void *buffers[] = {features_input_, voxel_idxs_, params_input_, cls_output_, box_output_, dir_cls_output_};
  trt_->doinfer(buffers);
  // Reset the pillar counter for the next frame.
  checkCudaErrors(cudaMemsetAsync(params_input_, 0, sizeof(unsigned int), stream_));

#if PERFORMANCE_LOG
  checkCudaErrors(cudaEventRecord(stop_, stream_));
  checkCudaErrors(cudaEventSynchronize(stop_));
  checkCudaErrors(cudaEventElapsedTime(&doinferTime, start_, stop_));
#endif

#if PERFORMANCE_LOG
  float doPostprocessCudaTime = 0.0f;
  checkCudaErrors(cudaEventRecord(start_, stream_));
#endif

  post_->doPostprocessCuda(cls_output_, box_output_, dir_cls_output_,
                          bndbox_output_);
  // Full sync so the managed bndbox_output_ buffer is safe to read on host.
  checkCudaErrors(cudaDeviceSynchronize());
  // bndbox_output_[0] holds the detection count; boxes follow, 9 floats each.
  float obj_count = bndbox_output_[0];

  int num_obj = static_cast<int>(obj_count);
  auto output = bndbox_output_ + 1;

  for (int i = 0; i < num_obj; i++) {
    auto Bb = Bndbox(output[i * 9],
                    output[i * 9 + 1], output[i * 9 + 2], output[i * 9 + 3],
                    output[i * 9 + 4], output[i * 9 + 5], output[i * 9 + 6],
                    static_cast<int>(output[i * 9 + 7]),
                    output[i * 9 + 8]);
    res_.push_back(Bb);
  }


  nms_cpu(res_, params_.nms_thresh, nms_pred);
  res_.clear();

#if PERFORMANCE_LOG
  checkCudaErrors(cudaDeviceSynchronize());
  checkCudaErrors(cudaEventRecord(stop_, stream_));
  checkCudaErrors(cudaEventSynchronize(stop_));
  checkCudaErrors(cudaEventElapsedTime(&doPostprocessCudaTime, start_, stop_));
  std::cout<<"TIME: generateVoxels: "<< generateVoxelsTime <<" ms." <<std::endl;
  std::cout<<"TIME: generateFeatures: "<< generateFeaturesTime <<" ms." <<std::endl;
  std::cout<<"TIME: doinfer: "<< doinferTime <<" ms." <<std::endl;
  std::cout<<"TIME: doPostprocessCuda: "<< doPostprocessCudaTime <<" ms." <<std::endl;
#endif
  return 0;
}

+ 251 - 0
src/detection/detection_lidar_pointpillars_cuda/src/postprocess.cpp

@@ -0,0 +1,251 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "postprocess.h"
+#include <iostream>
+#include <vector>
+#include <algorithm>
+#include <math.h>
+
+const float ThresHold = 1e-8;
+
+inline float cross(const float2 p1, const float2 p2, const float2 p0) {
+    return (p1.x - p0.x) * (p2.y - p0.y) - (p2.x - p0.x) * (p1.y - p0.y);
+}
+
+inline int check_box2d(const Bndbox box, const float2 p) {
+    const float MARGIN = 1e-2;
+    float center_x = box.x;
+    float center_y = box.y;
+    float angle_cos = cos(-box.rt);
+    float angle_sin = sin(-box.rt);
+    float rot_x = (p.x - center_x) * angle_cos + (p.y - center_y) * (-angle_sin);
+    float rot_y = (p.x - center_x) * angle_sin + (p.y - center_y) * angle_cos;
+
+    return (fabs(rot_x) < box.w / 2 + MARGIN && fabs(rot_y) < box.l / 2 + MARGIN);
+}
+
// Computes the intersection point of segments (p0, p1) and (q0, q1).
// Returns true and writes the point to `ans` when they properly cross;
// false when the bounding boxes are disjoint or the cross-product signs
// show no crossing.
bool intersection(const float2 p1, const float2 p0, const float2 q1, const float2 q0, float2 &ans) {

    // Quick reject: axis-aligned bounding boxes must overlap.
    if (( std::min(p0.x, p1.x) <= std::max(q0.x, q1.x) &&
          std::min(q0.x, q1.x) <= std::max(p0.x, p1.x) &&
          std::min(p0.y, p1.y) <= std::max(q0.y, q1.y) &&
          std::min(q0.y, q1.y) <= std::max(p0.y, p1.y) ) == 0)
        return false;


    // Both endpoints of each segment must lie on opposite sides of the other.
    float s1 = cross(q0, p1, p0);
    float s2 = cross(p1, q1, p0);
    float s3 = cross(p0, q1, q0);
    float s4 = cross(q1, p1, q0);

    if (!(s1 * s2 > 0 && s3 * s4 > 0))
        return false;

    // Interpolate along (q0, q1) using the signed areas s1, s5.
    float s5 = cross(q1, p1, p0);
    if (fabs(s5 - s1) > ThresHold) {
        ans.x = (s5 * q0.x - s1 * q1.x) / (s5 - s1);
        ans.y = (s5 * q0.y - s1 * q1.y) / (s5 - s1);

    } else {
        // Near-degenerate case: fall back to the line-line formula.
        // NOTE(review): D is not checked against zero; for nearly parallel
        // segments this can divide by ~0 — confirm upstream guards suffice.
        float a0 = p0.y - p1.y, b0 = p1.x - p0.x, c0 = p0.x * p1.y - p1.x * p0.y;
        float a1 = q0.y - q1.y, b1 = q1.x - q0.x, c1 = q0.x * q1.y - q1.x * q0.y;
        float D = a0 * b1 - a1 * b0;

        ans.x = (b0 * c1 - b1 * c0) / D;
        ans.y = (a1 * c0 - a0 * c1) / D;
    }

    return true;
}
+
+inline void rotate_around_center(const float2 &center, const float angle_cos, const float angle_sin, float2 &p) {
+    float new_x = (p.x - center.x) * angle_cos + (p.y - center.y) * (-angle_sin) + center.x;
+    float new_y = (p.x - center.x) * angle_sin + (p.y - center.y) * angle_cos + center.y;
+    p = float2 {new_x, new_y};
+    return;
+}
+
// Computes the intersection area of two rotated 2D boxes:
// 1) build both boxes' rotated corners, 2) collect all edge-edge
// intersection points and contained corners, 3) sort them by angle around
// their centroid, 4) accumulate the polygon area via the shoelace formula.
inline float box_overlap(const Bndbox &box_a, const Bndbox &box_b) {
    float a_angle = box_a.rt, b_angle = box_b.rt;
    float a_dx_half = box_a.w / 2, b_dx_half = box_b.w / 2, a_dy_half = box_a.l / 2, b_dy_half = box_b.l / 2;
    float a_x1 = box_a.x - a_dx_half, a_y1 = box_a.y - a_dy_half;
    float a_x2 = box_a.x + a_dx_half, a_y2 = box_a.y + a_dy_half;
    float b_x1 = box_b.x - b_dx_half, b_y1 = box_b.y - b_dy_half;
    float b_x2 = box_b.x + b_dx_half, b_y2 = box_b.y + b_dy_half;
    float2 box_a_corners[5];
    float2 box_b_corners[5];

    float2 center_a = float2 {box_a.x, box_a.y};
    float2 center_b = float2 {box_b.x, box_b.y};

    // Up to 4x4 edge intersections (the array leaves headroom for corners).
    float2 cross_points[16];
    float2 poly_center =  {0, 0};
    int cnt = 0;
    bool flag = false;

    // Axis-aligned corners before rotation.
    box_a_corners[0] = float2 {a_x1, a_y1};
    box_a_corners[1] = float2 {a_x2, a_y1};
    box_a_corners[2] = float2 {a_x2, a_y2};
    box_a_corners[3] = float2 {a_x1, a_y2};

    box_b_corners[0] = float2 {b_x1, b_y1};
    box_b_corners[1] = float2 {b_x2, b_y1};
    box_b_corners[2] = float2 {b_x2, b_y2};
    box_b_corners[3] = float2 {b_x1, b_y2};

    float a_angle_cos = cos(a_angle), a_angle_sin = sin(a_angle);
    float b_angle_cos = cos(b_angle), b_angle_sin = sin(b_angle);

    for (int k = 0; k < 4; k++) {
        rotate_around_center(center_a, a_angle_cos, a_angle_sin, box_a_corners[k]);
        rotate_around_center(center_b, b_angle_cos, b_angle_sin, box_b_corners[k]);
    }

    // Close the polygons so edge i is (corner[i], corner[i+1]).
    box_a_corners[4] = box_a_corners[0];
    box_b_corners[4] = box_b_corners[0];

    // Collect all edge-edge intersection points.
    for (int i = 0; i < 4; i++) {
        for (int j = 0; j < 4; j++) {
            flag = intersection(box_a_corners[i + 1], box_a_corners[i],
                                box_b_corners[j + 1], box_b_corners[j],
                                cross_points[cnt]);
            if (flag) {
                poly_center = {poly_center.x + cross_points[cnt].x, poly_center.y + cross_points[cnt].y};
                cnt++;
            }
        }
    }

    // Add corners of either box that lie inside the other.
    for (int k = 0; k < 4; k++) {
        if (check_box2d(box_a, box_b_corners[k])) {
            poly_center = {poly_center.x + box_b_corners[k].x, poly_center.y + box_b_corners[k].y};
            cross_points[cnt] = box_b_corners[k];
            cnt++;
        }
        if (check_box2d(box_b, box_a_corners[k])) {
            poly_center = {poly_center.x + box_a_corners[k].x, poly_center.y + box_a_corners[k].y};
            cross_points[cnt] = box_a_corners[k];
            cnt++;
        }
    }

    // NOTE(review): when the boxes do not overlap at all, cnt == 0 and this
    // divides by zero (NaN center); verify callers tolerate the resulting
    // 0-area path or guard cnt here.
    poly_center.x /= cnt;
    poly_center.y /= cnt;

    // Bubble-sort the vertices by polar angle around the centroid so they
    // form a convex polygon in order.
    float2 temp;
    for (int j = 0; j < cnt - 1; j++) {
        for (int i = 0; i < cnt - j - 1; i++) {
            if (atan2(cross_points[i].y - poly_center.y, cross_points[i].x - poly_center.x) >
                atan2(cross_points[i+1].y - poly_center.y, cross_points[i+1].x - poly_center.x)
                ) {
                temp = cross_points[i];
                cross_points[i] = cross_points[i + 1];
                cross_points[i + 1] = temp;
            }
        }
    }

    // Shoelace formula over the ordered vertex fan.
    float area = 0;
    for (int k = 0; k < cnt - 1; k++) {
        float2 a = {cross_points[k].x - cross_points[0].x,
                    cross_points[k].y - cross_points[0].y};
        float2 b = {cross_points[k + 1].x - cross_points[0].x,
                    cross_points[k + 1].y - cross_points[0].y};
        area += (a.x * b.y - a.y * b.x);
    }
    return fabs(area) / 2.0;
}
+
+int nms_cpu(std::vector<Bndbox> bndboxes, const float nms_thresh, std::vector<Bndbox> &nms_pred)
+{
+    std::sort(bndboxes.begin(), bndboxes.end(),
+              [](Bndbox boxes1, Bndbox boxes2) { return boxes1.score > boxes2.score; });
+    std::vector<int> suppressed(bndboxes.size(), 0);
+    for (size_t i = 0; i < bndboxes.size(); i++) {
+        if (suppressed[i] == 1) {
+            continue;
+        }
+        nms_pred.emplace_back(bndboxes[i]);
+        for (size_t j = i + 1; j < bndboxes.size(); j++) {
+            if (suppressed[j] == 1) {
+                continue;
+            }
+
+            float sa = bndboxes[i].w * bndboxes[i].l;
+            float sb = bndboxes[j].w * bndboxes[j].l;
+            float s_overlap = box_overlap(bndboxes[i], bndboxes[j]);
+            float iou = s_overlap / fmaxf(sa + sb - s_overlap, ThresHold);
+
+            if (iou >= nms_thresh) {
+                suppressed[j] = 1;
+            }
+        }
+    }
+    return 0;
+}
+
// Allocates device buffers for the anchor table, per-class anchor bottom
// heights, and the detection counter, then uploads the anchor constants
// asynchronously on the owned stream.
PostProcessCuda::PostProcessCuda(cudaStream_t stream)
{
  stream_ = stream;

  checkCudaErrors(cudaMalloc((void **)&anchors_, params_.num_anchors * params_.len_per_anchor * sizeof(float)));
  checkCudaErrors(cudaMalloc((void **)&anchor_bottom_heights_, params_.num_classes * sizeof(float)));
  checkCudaErrors(cudaMalloc((void **)&object_counter_, sizeof(int)));

  checkCudaErrors(cudaMemcpyAsync(anchors_, params_.anchors,
        params_.num_anchors * params_.len_per_anchor * sizeof(float), cudaMemcpyDefault, stream_));
  checkCudaErrors(cudaMemcpyAsync(anchor_bottom_heights_, params_.anchor_bottom_heights,
                     params_.num_classes * sizeof(float), cudaMemcpyDefault, stream_));
  checkCudaErrors(cudaMemsetAsync(object_counter_, 0, sizeof(int), stream_));
  return;
}
+
// Frees the device-side anchor and counter buffers.
PostProcessCuda::~PostProcessCuda()
{
  checkCudaErrors(cudaFree(anchors_));
  checkCudaErrors(cudaFree(anchor_bottom_heights_));
  checkCudaErrors(cudaFree(object_counter_));
  return;
}
+
+int PostProcessCuda::doPostprocessCuda(const float *cls_input, float *box_input, const float *dir_cls_input,
+                                        float *bndbox_output)
+{
+  checkCudaErrors(cudaMemsetAsync(object_counter_, 0, sizeof(int)));
+  checkCudaErrors(postprocess_launch(cls_input,
+                     box_input,
+                     dir_cls_input,
+                     anchors_,
+                     anchor_bottom_heights_,
+                     bndbox_output,
+                     object_counter_,
+                     params_.min_x_range,
+                     params_.max_x_range,
+                     params_.min_y_range,
+                     params_.max_y_range,
+                     params_.feature_x_size,
+                     params_.feature_y_size,
+                     params_.num_anchors,
+                     params_.num_classes,
+                     params_.num_box_values,
+                     params_.score_thresh,
+                     params_.dir_offset,
+                     stream_
+                     ));
+  return 0;
+}

+ 159 - 0
src/detection/detection_lidar_pointpillars_cuda/src/postprocess_kernels.cu

@@ -0,0 +1,159 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel.h"
+
+__device__ float sigmoid(const float x) { return 1.0f / (1.0f + expf(-x)); }
+
// Per-anchor decode kernel. Launch contract: one block per BEV feature-map
// cell (gridDim.x == feature_x_size * feature_y_size), one thread per anchor
// (blockDim.x >= num_anchors). For every anchor whose best sigmoid class
// score clears score_thresh, the box residuals are decoded in place against
// the anchor and appended to bndbox_output as
// [x, y, z, dx, dy, dz, yaw, class_id, score], with the running count kept
// in object_counter.
__global__ void postprocess_kernal(const float *cls_input,
                                        float *box_input,
                                        const float *dir_cls_input,
                                        float *anchors,
                                        float *anchor_bottom_heights,
                                        float *bndbox_output,
                                        int *object_counter,
                                        const float min_x_range,
                                        const float max_x_range,
                                        const float min_y_range,
                                        const float max_y_range,
                                        const int feature_x_size,
                                        const int feature_y_size,
                                        const int num_anchors,
                                        const int num_classes,
                                        const int num_box_values,
                                        const float score_thresh,
                                        const float dir_offset)
{
  int loc_index = blockIdx.x;
  int ith_anchor = threadIdx.x;
  if (ith_anchor >= num_anchors)
  {
      return;
  }
  // Anchor center in metric coordinates: cells map linearly onto the range.
  int col = loc_index % feature_x_size;
  int row = loc_index / feature_x_size;
  float x_offset = min_x_range + col * (max_x_range - min_x_range) / (feature_x_size - 1);
  float y_offset = min_y_range + row * (max_y_range - min_y_range) / (feature_y_size - 1);

  // Pick the best class for this anchor (raw logits -> sigmoid).
  int cls_offset = loc_index * num_anchors * num_classes + ith_anchor * num_classes;
  const float *scores = cls_input + cls_offset;
  float max_score = sigmoid(scores[0]);
  int cls_id = 0;
  for (int i = 1; i < num_classes; i++) {
    float cls_score = sigmoid(scores[i]);
    if (cls_score > max_score) {
      max_score = cls_score;
      cls_id = i;
    }
  }

  if (max_score >= score_thresh)
  {
    int box_offset = loc_index * num_anchors * num_box_values + ith_anchor * num_box_values;
    int dir_cls_offset = loc_index * num_anchors * 2 + ith_anchor * 2;
    // Anchor layout: {dx, dy, dz, rot}; bottom heights are indexed per
    // anchor pair (ith_anchor / 2).
    float *anchor_ptr = anchors + ith_anchor * 4;
    float z_offset = anchor_ptr[2] / 2 + anchor_bottom_heights[ith_anchor / 2];
    float anchor[7] = {x_offset, y_offset, z_offset, anchor_ptr[0], anchor_ptr[1], anchor_ptr[2], anchor_ptr[3]};
    float *box_encodings = box_input + box_offset;

    // Decode the regression residuals against the anchor, in place.
    float xa = anchor[0];
    float ya = anchor[1];
    float za = anchor[2];
    float dxa = anchor[3];
    float dya = anchor[4];
    float dza = anchor[5];
    float ra = anchor[6];
    float diagonal = sqrtf(dxa * dxa + dya * dya);
    box_encodings[0] = box_encodings[0] * diagonal + xa;
    box_encodings[1] = box_encodings[1] * diagonal + ya;
    box_encodings[2] = box_encodings[2] * dza + za;
    box_encodings[3] = expf(box_encodings[3]) * dxa;
    box_encodings[4] = expf(box_encodings[4]) * dya;
    box_encodings[5] = expf(box_encodings[5]) * dza;
    box_encodings[6] = box_encodings[6] + ra;

    // Resolve heading: fold the angle into [0, period) and add the
    // direction bin. Kept entirely in single precision -- the original used
    // double `floor`, a double `1e-8` literal and a double `2 * M_PI / 2`
    // expression, forcing fp64 arithmetic on every thread.
    int dir_label = dir_cls_input[dir_cls_offset] > dir_cls_input[dir_cls_offset + 1] ? 0 : 1;
    const float period = (float)M_PI;  // 2 * pi / 2
    float val = box_input[box_offset + 6] - dir_offset;
    float dir_rot = val - floorf(val / (period + 1e-8f)) * period;
    float yaw = dir_rot + dir_offset + period * dir_label;

    // Claim an output slot and append the detection.
    int resCount = (int)atomicAdd(object_counter, 1);
    // NOTE(review): concurrent threads race on bndbox_output[0]; the last
    // writer is not guaranteed to carry the largest count, so the final
    // value may under-report. Reading object_counter on the host after
    // synchronization is the reliable count -- confirm what the caller uses.
    bndbox_output[0] = resCount + 1;
    float *data = bndbox_output + 1 + resCount * 9;
    data[0] = box_input[box_offset];
    data[1] = box_input[box_offset + 1];
    data[2] = box_input[box_offset + 2];
    data[3] = box_input[box_offset + 3];
    data[4] = box_input[box_offset + 4];
    data[5] = box_input[box_offset + 5];
    data[6] = yaw;
    data[7] = (float)cls_id;
    data[8] = max_score;
  }
}
+
// Host-side launcher for postprocess_kernal. Grid: one block per BEV cell
// (feature_x_size * feature_y_size blocks); block: one thread per anchor.
// Assumes num_anchors does not exceed the device's max threads per block
// (1024 on current GPUs) -- TODO confirm against the model's anchor config.
// Work is enqueued on `stream`; the returned cudaError_t reflects only
// launch/configuration failures, not asynchronous execution errors.
cudaError_t postprocess_launch(const float *cls_input,
                      float *box_input,
                      const float *dir_cls_input,
                      float *anchors,
                      float *anchor_bottom_heights,
                      float *bndbox_output,
                      int *object_counter,
                      const float min_x_range,
                      const float max_x_range,
                      const float min_y_range,
                      const float max_y_range,
                      const int feature_x_size,
                      const int feature_y_size,
                      const int num_anchors,
                      const int num_classes,
                      const int num_box_values,
                      const float score_thresh,
                      const float dir_offset,
                      cudaStream_t stream)
{
  int bev_size = feature_x_size * feature_y_size;
  dim3 threads (num_anchors);
  dim3 blocks (bev_size);

  postprocess_kernal<<<blocks, threads, 0, stream>>>
                (cls_input,
                 box_input,
                 dir_cls_input,
                 anchors,
                 anchor_bottom_heights,
                 bndbox_output,
                 object_counter,
                 min_x_range,
                 max_x_range,
                 min_y_range,
                 max_y_range,
                 feature_x_size,
                 feature_y_size,
                 num_anchors,
                 num_classes,
                 num_box_values,
                 score_thresh,
                 dir_offset);
  // Report launch-time errors (e.g. invalid configuration) to the caller.
  return cudaGetLastError();
}
+

+ 122 - 0
src/detection/detection_lidar_pointpillars_cuda/src/preprocess.cpp

@@ -0,0 +1,122 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "preprocess.h"
+#include <assert.h>
+#include <iostream>
+#include <math.h>
+
+PreProcessCuda::PreProcessCuda(cudaStream_t stream)
+{
+  stream_ = stream;
+
+  unsigned int mask_size = params_.grid_z_size
+              * params_.grid_y_size
+              * params_.grid_x_size
+              * sizeof(unsigned int);
+
+  unsigned int voxels_size = params_.grid_z_size
+              * params_.grid_y_size
+              * params_.grid_x_size
+              * params_.max_num_points_per_pillar
+              * params_.num_point_values
+              * sizeof(float);
+
+  checkCudaErrors(cudaMallocManaged((void **)&mask_, mask_size));
+  checkCudaErrors(cudaMallocManaged((void **)&voxels_, voxels_size));
+  checkCudaErrors(cudaMallocManaged((void **)&voxelsList_, 300000l*sizeof(int)));
+
+  checkCudaErrors(cudaMemsetAsync(mask_, 0, mask_size, stream_));
+  checkCudaErrors(cudaMemsetAsync(voxels_, 0, voxels_size, stream_));
+  checkCudaErrors(cudaMemsetAsync(voxelsList_, 0, 300000l*sizeof(int), stream_));
+
+  return;
+}
+
+PreProcessCuda::~PreProcessCuda()
+{
+  checkCudaErrors(cudaFree(mask_));
+  checkCudaErrors(cudaFree(voxels_));
+  checkCudaErrors(cudaFree(voxelsList_));
+  return;
+}
+
+int PreProcessCuda::generateVoxels(float *points, size_t points_size,
+        unsigned int *pillar_num,
+        float *voxel_features,
+        unsigned int *voxel_num,
+        unsigned int *voxel_idxs)
+{
+  int grid_x_size = params_.grid_x_size;
+  int grid_y_size = params_.grid_y_size;
+  int grid_z_size = params_.grid_z_size;
+  assert(grid_z_size ==1);
+  float min_x_range = params_.min_x_range;
+  float max_x_range = params_.max_x_range;
+  float min_y_range = params_.min_y_range;
+  float max_y_range = params_.max_y_range;
+  float min_z_range = params_.min_z_range;
+  float max_z_range = params_.max_z_range;
+  float pillar_x_size = params_.pillar_x_size;
+  float pillar_y_size = params_.pillar_y_size;
+  float pillar_z_size = params_.pillar_z_size;
+
+  checkCudaErrors(generateVoxels_random_launch(points, points_size,
+        min_x_range, max_x_range,
+        min_y_range, max_y_range,
+        min_z_range, max_z_range,
+        pillar_x_size, pillar_y_size, pillar_z_size,
+        grid_y_size, grid_x_size,
+        mask_, voxels_, stream_));
+
+  checkCudaErrors(generateBaseFeatures_launch(mask_, voxels_,
+      grid_y_size, grid_x_size,
+      pillar_num,
+      voxel_features,
+      voxel_num,
+      voxel_idxs, stream_));
+
+  return 0;
+}
+
+int PreProcessCuda::generateFeatures(float* voxel_features,
+          unsigned int* voxel_num,
+          unsigned int* voxel_idxs,
+          unsigned int *params,
+          float* features)
+{
+  int grid_z_size = params_.grid_z_size;
+  assert(grid_z_size ==1);
+  float range_min_x = params_.min_x_range;
+  float range_min_y = params_.min_y_range;
+  float range_min_z = params_.min_z_range;
+
+  float voxel_x = params_.pillar_x_size;
+  float voxel_y = params_.pillar_y_size;
+  float voxel_z = params_.pillar_z_size;
+
+  checkCudaErrors(generateFeatures_launch(voxel_features,
+      voxel_num,
+      voxel_idxs,
+      params,
+      voxel_x, voxel_y, voxel_z,
+      range_min_x, range_min_y, range_min_z,
+      features, stream_));
+
+  return 0;
+}
+

+ 311 - 0
src/detection/detection_lidar_pointpillars_cuda/src/preprocess_kernels.cu

@@ -0,0 +1,311 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel.h"
+
// Scatter pass: one thread per input point (x, y, z, intensity). In-range
// points are appended to their pillar's slots in the dense `voxels` grid;
// `mask` accumulates the RAW per-pillar point count, which may exceed
// POINTS_PER_VOXEL (generateBaseFeatures_kernel clamps it later).
// Assumes `points` is tightly packed float4 and 16-byte aligned for the
// vector load -- TODO confirm against the caller's buffer.
__global__ void generateVoxels_random_kernel(float *points, size_t points_size,
        float min_x_range, float max_x_range,
        float min_y_range, float max_y_range,
        float min_z_range, float max_z_range,
        float pillar_x_size, float pillar_y_size, float pillar_z_size,
        int grid_y_size, int grid_x_size,
        unsigned int *mask, float *voxels)
{
  int point_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if(point_idx >= points_size) return;

  float4 point = ((float4*)points)[point_idx];

  // Drop points outside the configured detection range.
  if(point.x<min_x_range||point.x>=max_x_range
    || point.y<min_y_range||point.y>=max_y_range
    || point.z<min_z_range||point.z>=max_z_range) return;

  // Flat pillar index in the (y, x) plane; z is collapsed.
  int voxel_idx = floorf((point.x - min_x_range)/pillar_x_size);
  int voxel_idy = floorf((point.y - min_y_range)/pillar_y_size);
  unsigned int voxel_index = voxel_idy * grid_x_size
                            + voxel_idx;

  // Reserve a slot in this pillar; the returned old value is the slot id.
  unsigned int point_id = atomicAdd(&(mask[voxel_index]), 1);

  // Pillar already full: the count was recorded, the point data is dropped.
  if(point_id >= POINTS_PER_VOXEL) return;
  float *address = voxels + (voxel_index*POINTS_PER_VOXEL + point_id)*4;
  // atomicExch used as a plain device-wide store of the point's 4 channels.
  atomicExch(address+0, point.x);
  atomicExch(address+1, point.y);
  atomicExch(address+2, point.z);
  atomicExch(address+3, point.w);
}
+
// Host-side launcher for generateVoxels_random_kernel: one thread per point,
// THREADS_FOR_VOXEL threads per block, enqueued on `stream`. Returns the
// launch status only (asynchronous execution errors surface later).
cudaError_t generateVoxels_random_launch(float *points, size_t points_size,
        float min_x_range, float max_x_range,
        float min_y_range, float max_y_range,
        float min_z_range, float max_z_range,
        float pillar_x_size, float pillar_y_size, float pillar_z_size,
        int grid_y_size, int grid_x_size,
        unsigned int *mask, float *voxels,
        cudaStream_t stream)
{
  // An empty cloud would produce a zero-block grid, which CUDA rejects with
  // cudaErrorInvalidConfiguration; treat it as a successful no-op instead.
  if (points_size == 0)
    return cudaSuccess;

  int threadNum = THREADS_FOR_VOXEL;
  dim3 blocks((points_size+threadNum-1)/threadNum);
  dim3 threads(threadNum);
  generateVoxels_random_kernel<<<blocks, threads, 0, stream>>>
    (points, points_size,
        min_x_range, max_x_range,
        min_y_range, max_y_range,
        min_z_range, max_z_range,
        pillar_x_size, pillar_y_size, pillar_z_size,
        grid_y_size, grid_x_size,
        mask, voxels);
  cudaError_t err = cudaGetLastError();
  return err;
}
+
// List-based voxelization, pass 1 of 2: for each input point, record the
// flat pillar index it falls into (-1 when out of range) and bump that
// pillar's counter in `mask`. Pass 2 (generateVoxels_kernel below) replays
// the list to copy point data. Not referenced by preprocess.cpp in this
// revision -- generateVoxels() uses the single-pass random path instead.
__global__ void generateVoxelsList_kernel(float *points, size_t points_size,
        float min_x_range, float max_x_range,
        float min_y_range, float max_y_range,
        float min_z_range, float max_z_range,
        float pillar_x_size, float pillar_y_size, float pillar_z_size,
        int grid_y_size, int grid_x_size,
        unsigned int *mask, int *voxelsList)
{
  // One thread per point.
  int point_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if(point_idx >= points_size) return;

  float4 point = ((float4*)points)[point_idx];

  // Out-of-range points are marked -1 so pass 2 can skip them.
  if(point.x<min_x_range||point.x>=max_x_range
    || point.y<min_y_range||point.y>=max_y_range
    || point.z<min_z_range||point.z>=max_z_range)
  {
    voxelsList[point_idx] = -1;
    return;
  }

  int voxel_idx = floorf((point.x - min_x_range)/pillar_x_size);
  int voxel_idy = floorf((point.y - min_y_range)/pillar_y_size);
  unsigned int voxel_index = voxel_idy * grid_x_size
                            + voxel_idx;

  atomicAdd(&(mask[voxel_index]), 1);
  voxelsList[point_idx] = voxel_index;

}
+
// List-based voxelization, pass 2 of 2: copy each in-range point's four
// channels into its pillar's next free slot, using `mask` as the per-pillar
// write cursor. NOTE(review): pass 1 above also increments `mask`, so the
// caller must zero it between the passes for slot ids to start at 0 --
// confirm against the (currently unused) caller of this path.
__global__ void generateVoxels_kernel(float *points, size_t points_size,
        int *voxelsList,
        unsigned int *mask, float *voxels)
{
  int point_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if(point_idx >= points_size) return;

  int voxel_index = voxelsList[point_idx];

  // -1 marks a point outside the detection range.
  if (voxel_index == -1) return;
  int point_id = atomicAdd(&(mask[voxel_index]), 1);

  // Pillar full: drop the point.
  if(point_id >= POINTS_PER_VOXEL) return;
  float *address = voxels + (voxel_index*POINTS_PER_VOXEL + point_id)*4;
  float4 point = ((float4*)points)[point_idx];
  // atomicExch used as a plain device-wide store of the point's channels.
  atomicExch(address+0, point.x);
  atomicExch(address+1, point.y);
  atomicExch(address+2, point.z);
  atomicExch(address+3, point.w);
}
+
// Compaction pass: one thread per BEV grid cell. Every non-empty cell claims
// a dense pillar slot via atomicAdd(pillar_num), copies up to
// POINTS_PER_VOXEL points into voxel_features, stores the clamped count in
// voxel_num and the cell coordinate (0, 0, y, x) in voxel_idxs, then zeroes
// its counter so the next frame starts from a clean grid.
// NOTE(review): pillar output order is nondeterministic (atomic slot
// assignment), and there is no bound check on current_pillarId before the
// writes -- confirm the compact buffers are sized for the worst case.
__global__ void generateBaseFeatures_kernel(unsigned int *mask, float *voxels,
        int grid_y_size, int grid_x_size,
        unsigned int *pillar_num,
        float *voxel_features,
        unsigned int *voxel_num,
        unsigned int *voxel_idxs)
{
  unsigned int voxel_idx = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int voxel_idy = blockIdx.y * blockDim.y + threadIdx.y;

  if(voxel_idx >= grid_x_size ||voxel_idy >= grid_y_size) return;

  unsigned int voxel_index = voxel_idy * grid_x_size
                           + voxel_idx;
  unsigned int count = mask[voxel_index];
  if( !(count>0) ) return;
  // The scatter pass stores the raw count; clamp to the pillar capacity.
  count = count<POINTS_PER_VOXEL?count:POINTS_PER_VOXEL;

  // Claim the next free compact pillar slot.
  unsigned int current_pillarId = 0;
  current_pillarId = atomicAdd(pillar_num, 1);

  voxel_num[current_pillarId] = count;

  uint4 idx = {0, 0, voxel_idy, voxel_idx};
  ((uint4*)voxel_idxs)[current_pillarId] = idx;

  // Copy this cell's points (float4 each) into the compact buffer.
  for (int i=0; i<count; i++){
    int inIndex = voxel_index*POINTS_PER_VOXEL + i;
    int outIndex = current_pillarId*POINTS_PER_VOXEL + i;
    ((float4*)voxel_features)[outIndex] = ((float4*)voxels)[inIndex];
  }

  // clear buffer for next infer
  atomicExch(mask + voxel_index, 0);
}
+
// Launches the compaction kernel (4-channel pillars): one thread per grid
// cell in 32x32 tiles, rounding the grid up so the whole BEV plane is
// covered. Enqueued on `stream`; returns the launch status only.
cudaError_t generateBaseFeatures_launch(unsigned int *mask, float *voxels,
        int grid_y_size, int grid_x_size,
        unsigned int *pillar_num,
        float *voxel_features,
        unsigned int *voxel_num,
        unsigned int *voxel_idxs,
        cudaStream_t stream)
{
  dim3 threads(32, 32);
  dim3 blocks((grid_x_size + threads.x - 1) / threads.x,
              (grid_y_size + threads.y - 1) / threads.y);

  generateBaseFeatures_kernel<<<blocks, threads, 0, stream>>>(
      mask, voxels, grid_y_size, grid_x_size,
      pillar_num, voxel_features, voxel_num, voxel_idxs);

  return cudaGetLastError();
}
+
// 4 channels -> 10 channels
// Expands each compact pillar's raw points (x, y, z, w) into the 10-float
// PointPillars input feature: the raw point (4), its offset from the
// pillar's point mean (3), and its offset from the pillar's geometric
// center (3). Launch contract: one warp per pillar, WARP_SIZE point slots
// per pillar, WARPS_PER_BLOCK pillars per block (see generateFeatures_launch).
// NOTE(review): warps whose pillar_idx >= params[0] return before the
// __syncthreads() calls below, and the shared-memory preload indexes
// voxel_num / voxel_idxs past the live pillar count for tail warps; both
// rely on whole-warp exit and on the buffers being sized for MAX_VOXELS --
// confirm on the target architecture.
__global__ void generateFeatures_kernel(float* voxel_features,
    unsigned int* voxel_num, unsigned int* voxel_idxs, unsigned int *params,
    float voxel_x, float voxel_y, float voxel_z,
    float range_min_x, float range_min_y, float range_min_z,
    float* features)
{
    // One warp handles one pillar; lanes map to point slots.
    int pillar_idx = blockIdx.x * WARPS_PER_BLOCK + threadIdx.x/WARP_SIZE;
    int point_idx = threadIdx.x % WARP_SIZE;

    int pillar_idx_inBlock = threadIdx.x/32;
    // params[0] holds the live pillar count written by the compaction pass.
    unsigned int num_pillars = params[0];

    if (pillar_idx >= num_pillars) return;

    // Per-warp staging buffers in shared memory.
    __shared__ float4 pillarSM[WARPS_PER_BLOCK][WARP_SIZE];
    __shared__ float4 pillarSumSM[WARPS_PER_BLOCK];
    __shared__ uint4 idxsSM[WARPS_PER_BLOCK];
    __shared__ int pointsNumSM[WARPS_PER_BLOCK];
    __shared__ float pillarOutSM[WARPS_PER_BLOCK][WARP_SIZE][FEATURES_SIZE];

    // First WARPS_PER_BLOCK threads preload per-pillar metadata.
    if (threadIdx.x < WARPS_PER_BLOCK) {
      pointsNumSM[threadIdx.x] = voxel_num[blockIdx.x * WARPS_PER_BLOCK + threadIdx.x];
      idxsSM[threadIdx.x] = ((uint4*)voxel_idxs)[blockIdx.x * WARPS_PER_BLOCK + threadIdx.x];
      pillarSumSM[threadIdx.x] = {0,0,0,0};
    }

    // Each lane loads its point slot (4 channels) into shared memory.
    pillarSM[pillar_idx_inBlock][point_idx] = ((float4*)voxel_features)[pillar_idx*WARP_SIZE + point_idx];
    __syncthreads();

    // Accumulate the per-pillar coordinate sum (valid slots only) via
    // shared-memory atomics.
    if (point_idx < pointsNumSM[pillar_idx_inBlock]) {
      atomicAdd(&(pillarSumSM[pillar_idx_inBlock].x),  pillarSM[pillar_idx_inBlock][point_idx].x);
      atomicAdd(&(pillarSumSM[pillar_idx_inBlock].y),  pillarSM[pillar_idx_inBlock][point_idx].y);
      atomicAdd(&(pillarSumSM[pillar_idx_inBlock].z),  pillarSM[pillar_idx_inBlock][point_idx].z);
    }
    __syncthreads();

    // Offset of this point from the pillar's point mean (reuses `mean`).
    float4 mean;
    float validPoints = pointsNumSM[pillar_idx_inBlock];
    mean.x = pillarSumSM[pillar_idx_inBlock].x / validPoints;
    mean.y = pillarSumSM[pillar_idx_inBlock].y / validPoints;
    mean.z = pillarSumSM[pillar_idx_inBlock].z / validPoints;

    mean.x  = pillarSM[pillar_idx_inBlock][point_idx].x - mean.x;
    mean.y  = pillarSM[pillar_idx_inBlock][point_idx].y - mean.y;
    mean.z  = pillarSM[pillar_idx_inBlock][point_idx].z - mean.z;


    // Metric center of the pillar cell (idxsSM holds (0, 0, y, x)).
    float x_offset = voxel_x / 2 + idxsSM[pillar_idx_inBlock].w * voxel_x + range_min_x;
    float y_offset = voxel_y / 2 + idxsSM[pillar_idx_inBlock].z * voxel_y + range_min_y;
    float z_offset = voxel_z / 2 + idxsSM[pillar_idx_inBlock].y * voxel_z + range_min_z;

    // Offset of this point from the pillar's cell center.
    float4 center;
    center.x  = pillarSM[pillar_idx_inBlock][point_idx].x - x_offset;
    center.y  = pillarSM[pillar_idx_inBlock][point_idx].y - y_offset;
    center.z  = pillarSM[pillar_idx_inBlock][point_idx].z - z_offset;

    // Assemble the 10 output channels; empty slots are zero-padded.
    if (point_idx < pointsNumSM[pillar_idx_inBlock]) {
      pillarOutSM[pillar_idx_inBlock][point_idx][0] = pillarSM[pillar_idx_inBlock][point_idx].x;
      pillarOutSM[pillar_idx_inBlock][point_idx][1] = pillarSM[pillar_idx_inBlock][point_idx].y;
      pillarOutSM[pillar_idx_inBlock][point_idx][2] = pillarSM[pillar_idx_inBlock][point_idx].z;
      pillarOutSM[pillar_idx_inBlock][point_idx][3] = pillarSM[pillar_idx_inBlock][point_idx].w;

      pillarOutSM[pillar_idx_inBlock][point_idx][4] = mean.x;
      pillarOutSM[pillar_idx_inBlock][point_idx][5] = mean.y;
      pillarOutSM[pillar_idx_inBlock][point_idx][6] = mean.z;

      pillarOutSM[pillar_idx_inBlock][point_idx][7] = center.x;
      pillarOutSM[pillar_idx_inBlock][point_idx][8] = center.y;
      pillarOutSM[pillar_idx_inBlock][point_idx][9] = center.z;

    } else {
      pillarOutSM[pillar_idx_inBlock][point_idx][0] = 0;
      pillarOutSM[pillar_idx_inBlock][point_idx][1] = 0;
      pillarOutSM[pillar_idx_inBlock][point_idx][2] = 0;
      pillarOutSM[pillar_idx_inBlock][point_idx][3] = 0;

      pillarOutSM[pillar_idx_inBlock][point_idx][4] = 0;
      pillarOutSM[pillar_idx_inBlock][point_idx][5] = 0;
      pillarOutSM[pillar_idx_inBlock][point_idx][6] = 0;

      pillarOutSM[pillar_idx_inBlock][point_idx][7] = 0;
      pillarOutSM[pillar_idx_inBlock][point_idx][8] = 0;
      pillarOutSM[pillar_idx_inBlock][point_idx][9] = 0;
    }

    __syncthreads();

    // Flush the staged features to global memory, channel-major per pillar.
    for(int i = 0; i < FEATURES_SIZE; i ++) {
      int outputSMId = pillar_idx_inBlock*WARP_SIZE*FEATURES_SIZE + i* WARP_SIZE + point_idx;
      int outputId = pillar_idx*WARP_SIZE*FEATURES_SIZE + i* WARP_SIZE + point_idx;
      features[outputId] = ((float*)pillarOutSM)[outputSMId] ;
    }

}
+
// Launches the 4->10 channel feature kernel sized for the worst case
// (MAX_VOXELS pillars): each block runs WARPS_PER_BLOCK warps, one warp per
// pillar; the kernel itself skips pillars beyond the live count in params.
// Enqueued on `stream`; returns the launch status only.
cudaError_t generateFeatures_launch(float* voxel_features,
    unsigned int * voxel_num,
    unsigned int* voxel_idxs,
    unsigned int *params,
    float voxel_x, float voxel_y, float voxel_z,
    float range_min_x, float range_min_y, float range_min_z,
    float* features,
    cudaStream_t stream)
{
    const int blockCount = (MAX_VOXELS + WARPS_PER_BLOCK - 1) / WARPS_PER_BLOCK;
    dim3 blocks(blockCount);
    dim3 threads(WARPS_PER_BLOCK * WARP_SIZE);

    generateFeatures_kernel<<<blocks, threads, 0, stream>>>(
        voxel_features, voxel_num, voxel_idxs, params,
        voxel_x, voxel_y, voxel_z,
        range_min_x, range_min_y, range_min_z,
        features);

    return cudaGetLastError();
}

+ 2 - 2
src/tool/pointcloudviewer/main.cpp

@@ -189,9 +189,9 @@ int main(int argc, char *argv[])
     showversion("pointcloudviewer");
     QCoreApplication a(argc, argv);
 
-    snprintf(gstr_memname,255,"lidar_pc");
+//    snprintf(gstr_memname,255,"lidar_pc");
 //    snprintf(gstr_memname,255,"lidar_pc_filter");
-//    snprintf(gstr_memname,255,"lidarpc_center");
+    snprintf(gstr_memname,255,"lidarpc_center");
 
 
     int nRtn = GetOptLong(argc,argv);