
add detection_lidar_transfusion. not complete.

yuchuli, 1 month ago
commit 47014b71f9
25 changed files with 3885 additions and 0 deletions
  1. src/detection/detection_lidar_transfusion/.gitignore (+73 -0)
  2. src/detection/detection_lidar_transfusion/cuda_utils.hpp (+126 -0)
  3. src/detection/detection_lidar_transfusion/detection_lidar_transfusion.pro (+137 -0)
  4. src/detection/detection_lidar_transfusion/include/network/network_trt.hpp (+87 -0)
  5. src/detection/detection_lidar_transfusion/include/postprocess/circle_nms_kernel.hpp (+32 -0)
  6. src/detection/detection_lidar_transfusion/include/postprocess/non_maximum_suppression.hpp (+82 -0)
  7. src/detection/detection_lidar_transfusion/include/postprocess/postprocess_kernel.hpp (+47 -0)
  8. src/detection/detection_lidar_transfusion/include/preprocess/pointcloud_densification.hpp (+101 -0)
  9. src/detection/detection_lidar_transfusion/include/preprocess/preprocess_kernel.hpp (+73 -0)
  10. src/detection/detection_lidar_transfusion/include/preprocess/voxel_generator.hpp (+72 -0)
  11. src/detection/detection_lidar_transfusion/include/tensorrt_common/logger.hpp (+552 -0)
  12. src/detection/detection_lidar_transfusion/include/tensorrt_common/simple_profiler.hpp (+70 -0)
  13. src/detection/detection_lidar_transfusion/include/tensorrt_common/tensorrt_common.hpp (+232 -0)
  14. src/detection/detection_lidar_transfusion/main.cpp (+8 -0)
  15. src/detection/detection_lidar_transfusion/network/network_trt.cpp (+333 -0)
  16. src/detection/detection_lidar_transfusion/postprocess/circle_nms_kernel.cu (+144 -0)
  17. src/detection/detection_lidar_transfusion/postprocess/non_maximum_suppression.cpp (+147 -0)
  18. src/detection/detection_lidar_transfusion/postprocess/postprocess_kernel.cu (+145 -0)
  19. src/detection/detection_lidar_transfusion/preprocess/pointcloud_densification.cpp (+116 -0)
  20. src/detection/detection_lidar_transfusion/preprocess/preprocess_kernel.cu (+221 -0)
  21. src/detection/detection_lidar_transfusion/preprocess/voxel_generator.cpp (+123 -0)
  22. src/detection/detection_lidar_transfusion/simple_profiler.cpp (+132 -0)
  23. src/detection/detection_lidar_transfusion/tensorrt_common.cpp (+605 -0)
  24. src/detection/detection_lidar_transfusion/transfusion_config.hpp (+170 -0)
  25. src/detection/detection_lidar_transfusion/utils.hpp (+57 -0)

+ 73 - 0
src/detection/detection_lidar_transfusion/.gitignore

@@ -0,0 +1,73 @@
+# This file is used to ignore files which are generated
+# ----------------------------------------------------------------------------
+
+*~
+*.autosave
+*.a
+*.core
+*.moc
+*.o
+*.obj
+*.orig
+*.rej
+*.so
+*.so.*
+*_pch.h.cpp
+*_resource.rc
+*.qm
+.#*
+*.*#
+core
+!core/
+tags
+.DS_Store
+.directory
+*.debug
+Makefile*
+*.prl
+*.app
+moc_*.cpp
+ui_*.h
+qrc_*.cpp
+Thumbs.db
+*.res
+*.rc
+/.qmake.cache
+/.qmake.stash
+
+# qtcreator generated files
+*.pro.user*
+
+# xemacs temporary files
+*.flc
+
+# Vim temporary files
+.*.swp
+
+# Visual Studio generated files
+*.ib_pdb_index
+*.idb
+*.ilk
+*.pdb
+*.sln
+*.suo
+*.vcproj
+*vcproj.*.*.user
+*.ncb
+*.sdf
+*.opensdf
+*.vcxproj
+*vcxproj.*
+
+# MinGW generated files
+*.Debug
+*.Release
+
+# Python byte code
+*.pyc
+
+# Binaries
+# --------
+*.dll
+*.exe
+

+ 126 - 0
src/detection/detection_lidar_transfusion/cuda_utils.hpp

@@ -0,0 +1,126 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+/*
+ * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * This code is licensed under CC0 1.0 Universal (Public Domain).
+ * You can use this without any limitation.
+ * https://creativecommons.org/publicdomain/zero/1.0/deed.en
+ */
+
+#ifndef AUTOWARE__LIDAR_TRANSFUSION__CUDA_UTILS_HPP_
+#define AUTOWARE__LIDAR_TRANSFUSION__CUDA_UTILS_HPP_
+
+#include <cuda_runtime_api.h>
+
+#include <memory>
+#include <sstream>
+#include <stdexcept>
+#include <type_traits>
+
+#define CHECK_CUDA_ERROR(e) (cuda::check_error(e, __FILE__, __LINE__))
+
+namespace cuda
+{
+inline void check_error(const ::cudaError_t e, const char * f, int n)
+{
+  if (e != ::cudaSuccess) {
+    ::std::stringstream s;
+    s << ::cudaGetErrorName(e) << " (" << e << ")@" << f << "#L" << n << ": "
+      << ::cudaGetErrorString(e);
+    throw ::std::runtime_error{s.str()};
+  }
+}
+
+struct deleter
+{
+  void operator()(void * p) const { CHECK_CUDA_ERROR(::cudaFree(p)); }
+};
+
+template <typename T>
+using unique_ptr = ::std::unique_ptr<T, deleter>;
+
+template <typename T>
+typename ::std::enable_if<::std::is_array<T>::value, cuda::unique_ptr<T>>::type make_unique(
+  const ::std::size_t n)
+{
+  using U = typename ::std::remove_extent<T>::type;
+  U * p;
+  CHECK_CUDA_ERROR(::cudaMalloc(reinterpret_cast<void **>(&p), sizeof(U) * n));
+  return cuda::unique_ptr<T>{p};
+}
+
+template <typename T>
+cuda::unique_ptr<T> make_unique()
+{
+  T * p;
+  CHECK_CUDA_ERROR(::cudaMalloc(reinterpret_cast<void **>(&p), sizeof(T)));
+  return cuda::unique_ptr<T>{p};
+}
+
+constexpr size_t CUDA_ALIGN = 256;
+
+template <typename T>
+inline size_t get_size_aligned(size_t num_elem)
+{
+  size_t size = num_elem * sizeof(T);
+  size_t extra_align = 0;
+  if (size % CUDA_ALIGN != 0) {
+    extra_align = CUDA_ALIGN - size % CUDA_ALIGN;
+  }
+  return size + extra_align;
+}
+
+template <typename T>
+inline T * get_next_ptr(size_t num_elem, void *& workspace, size_t & workspace_size)
+{
+  size_t size = get_size_aligned<T>(num_elem);
+  if (size > workspace_size) {
+    throw ::std::runtime_error("Workspace is too small!");
+  }
+  workspace_size -= size;
+  T * ptr = reinterpret_cast<T *>(workspace);
+  workspace = reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(workspace) + size);
+  return ptr;
+}
+
+template <typename T>
+void clear_async(T * ptr, size_t num_elem, cudaStream_t stream)
+{
+  CHECK_CUDA_ERROR(::cudaMemsetAsync(ptr, 0, sizeof(T) * num_elem, stream));
+}
+
+}  // namespace cuda
+
+#endif  // AUTOWARE__LIDAR_TRANSFUSION__CUDA_UTILS_HPP_
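
Taken together, these helpers give RAII ownership of device memory plus aligned workspace carving. A minimal host-side sketch of how they compose (buffer sizes and names here are illustrative, not from the commit):

    #include "cuda_utils.hpp"

    #include <cstdint>

    int main()
    {
      cudaStream_t stream;
      CHECK_CUDA_ERROR(cudaStreamCreate(&stream));

      // RAII device allocation: cudaFree runs when points_d goes out of scope.
      auto points_d = cuda::make_unique<float[]>(1024);
      cuda::clear_async(points_d.get(), 1024, stream);

      // Carve two 256-byte-aligned sub-buffers out of a single workspace allocation.
      size_t ws_size = cuda::get_size_aligned<float>(1024) + cuda::get_size_aligned<int>(256);
      auto workspace_d = cuda::make_unique<std::uint8_t[]>(ws_size);
      void * ws = workspace_d.get();
      float * features = cuda::get_next_ptr<float>(1024, ws, ws_size);  // illustrative buffers
      int * indices = cuda::get_next_ptr<int>(256, ws, ws_size);
      (void)features;
      (void)indices;

      CHECK_CUDA_ERROR(cudaStreamSynchronize(stream));
      CHECK_CUDA_ERROR(cudaStreamDestroy(stream));
      return 0;
    }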

+ 137 - 0
src/detection/detection_lidar_transfusion/detection_lidar_transfusion.pro

@@ -0,0 +1,137 @@
+QT -= gui
+
+CONFIG += c++1z console
+CONFIG -= app_bundle
+
+# You can make your code fail to compile if it uses deprecated APIs.
+# In order to do so, uncomment the following line.
+#DEFINES += QT_DISABLE_DEPRECATED_BEFORE=0x060000    # disables all the APIs deprecated before Qt 6.0.0
+
+SOURCES += \
+        main.cpp \
+        network/network_trt.cpp \
+        postprocess/non_maximum_suppression.cpp \
+        simple_profiler.cpp \
+        tensorrt_common.cpp
+
+# Default rules for deployment.
+qnx: target.path = /tmp/$${TARGET}/bin
+else: unix:!android: target.path = /opt/$${TARGET}/bin
+!isEmpty(target.path): INSTALLS += target
+
+DISTFILES += \
+    postprocess/circle_nms_kernel.cu \
+    postprocess/postprocess_kernel.cu
+
+HEADERS += \
+    cuda_utils.hpp \
+    include/network/network_trt.hpp \
+    include/postprocess/circle_nms_kernel.hpp \
+    include/postprocess/non_maximum_suppression.hpp \
+    include/postprocess/postprocess_kernel.hpp \
+    include/preprocess/pointcloud_densification.hpp \
+    include/preprocess/preprocess_kernel.hpp \
+    include/preprocess/voxel_generator.hpp \
+    include/tensorrt_common/logger.hpp \
+    include/tensorrt_common/simple_profiler.hpp \
+    include/tensorrt_common/tensorrt_common.hpp \
+    utils.hpp
+
+INCLUDEPATH += /usr/include/eigen3
+
+INCLUDEPATH += $$PWD/include
+
+INCLUDEPATH += $$CUDA_DIR/include
+
+LIBS += -L"/usr/local/lib" \
+        -L"/usr/local/cuda/lib64" \
+        -lcudart \
+        -lcufft
+
+CUDA_SOURCES += \
+    postprocess/circle_nms_kernel.cu \
+    postprocess/postprocess_kernel.cu
+
+CUDA_SDK = "/usr/local/cuda/"   # CUDA SDK path
+
+CUDA_DIR = "/usr/local/cuda/"            # CUDA toolkit path
+
+SYSTEM_NAME = linux         # your system environment: 'Win32', 'x64', or 'Win64'
+
+SYSTEM_TYPE = 64           # OS bitness: '32' or '64'
+
+CUDA_ARCH = sm_72 # xavier sm_72          # CUDA architecture, for example 'compute_10', 'compute_11', 'sm_10'
+
+NVCC_OPTIONS = --use_fast_math --compiler-options "-fPIC"
+
+
+# include paths
+
+INCLUDEPATH += $$CUDA_DIR/include
+INCLUDEPATH += $$PWD
+#INCLUDEPATH += /usr/local/cuda-10.0/targets/aarch64-linux/include/crt
+
+# library directories
+
+QMAKE_LIBDIR += $$CUDA_DIR/lib/
+
+CUDA_OBJECTS_DIR = ./
+
+# The following library conflicts with something in Cuda
+
+#QMAKE_LFLAGS_RELEASE = /NODEFAULTLIB:msvcrt.lib
+
+#QMAKE_LFLAGS_DEBUG   = /NODEFAULTLIB:msvcrtd.lib
+
+# Add the necessary libraries
+
+CUDA_LIBS =  cudart cufft
+
+# The following makes sure all path names (which often include spaces) are put between quotation marks
+
+CUDA_INC = $$join(INCLUDEPATH,'" -I"','-I"','"')
+
+NVCC_LIBS = $$join(CUDA_LIBS,' -l','-l', '')
+
+#LIBS += $$join(CUDA_LIBS,'.so ', '', '.so')
+
+# Configuration of the Cuda compiler
+
+CONFIG(debug, debug|release) {
+
+    # Debug mode
+
+    cuda_d.input = CUDA_SOURCES
+
+    cuda_d.output = $$CUDA_OBJECTS_DIR/${QMAKE_FILE_BASE}_cuda.o
+
+    cuda_d.commands = $$CUDA_DIR/bin/nvcc -D_DEBUG $$NVCC_OPTIONS $$CUDA_INC $$NVCC_LIBS --machine $$SYSTEM_TYPE -arch=$$CUDA_ARCH -c -o ${QMAKE_FILE_OUT} ${QMAKE_FILE_NAME}
+
+    cuda_d.dependency_type = TYPE_C
+
+    QMAKE_EXTRA_COMPILERS += cuda_d
+
+}
+
+else {
+
+    # Release mode
+
+    cuda.input = CUDA_SOURCES
+
+    cuda.output = $$CUDA_OBJECTS_DIR/${QMAKE_FILE_BASE}_cuda.o
+
+    cuda.commands = $$CUDA_DIR/bin/nvcc $$NVCC_OPTIONS $$CUDA_INC $$NVCC_LIBS --machine $$SYSTEM_TYPE -arch=$$CUDA_ARCH -O3 -c -o ${QMAKE_FILE_OUT} ${QMAKE_FILE_NAME}
+
+    cuda.dependency_type = TYPE_C
+
+    QMAKE_EXTRA_COMPILERS += cuda
+
+}
+
+
+# linked libraries
+
+
+
+LIBS += -lrt -ldl -lnvinfer -lcudnn  -lcudart -lnvparsers -lnvonnxparser -lnvinfer_plugin -lstdc++fs

+ 87 - 0
src/detection/detection_lidar_transfusion/include/network/network_trt.hpp

@@ -0,0 +1,87 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef AUTOWARE__LIDAR_TRANSFUSION__NETWORK__NETWORK_TRT_HPP_
+#define AUTOWARE__LIDAR_TRANSFUSION__NETWORK__NETWORK_TRT_HPP_
+
+#include "transfusion_config.hpp"
+#include "utils.hpp"
+
+#include <tensorrt_common/tensorrt_common.hpp>
+
+#include <NvInfer.h>
+
+#include <array>
+#include <iostream>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+namespace autoware::lidar_transfusion
+{
+
+struct ProfileDimension
+{
+  nvinfer1::Dims min;
+  nvinfer1::Dims opt;
+  nvinfer1::Dims max;
+
+  bool operator!=(const ProfileDimension & rhs) const
+  {
+    return min.nbDims != rhs.min.nbDims || opt.nbDims != rhs.opt.nbDims ||
+           max.nbDims != rhs.max.nbDims || !std::equal(min.d, min.d + min.nbDims, rhs.min.d) ||
+           !std::equal(opt.d, opt.d + opt.nbDims, rhs.opt.d) ||
+           !std::equal(max.d, max.d + max.nbDims, rhs.max.d);
+  }
+};
+
+class NetworkTRT
+{
+public:
+  explicit NetworkTRT(const TransfusionConfig & config);
+  ~NetworkTRT();
+
+  bool init(
+    const std::string & onnx_path, const std::string & engine_path, const std::string & precision);
+  const char * getTensorName(NetworkIO name);
+
+  tensorrt_common::TrtUniquePtr<nvinfer1::ICudaEngine> engine{nullptr};
+  tensorrt_common::TrtUniquePtr<nvinfer1::IExecutionContext> context{nullptr};
+
+private:
+  bool parseONNX(
+    const std::string & onnx_path, const std::string & engine_path, const std::string & precision,
+    size_t workspace_size = (1ULL << 30));
+  bool saveEngine(const std::string & engine_path);
+  bool loadEngine(const std::string & engine_path);
+  bool createContext();
+  bool setProfile(
+    nvinfer1::IBuilder & builder, nvinfer1::INetworkDefinition & network,
+    nvinfer1::IBuilderConfig & config);
+  bool validateNetworkIO();
+  nvinfer1::Dims validateTensorShape(NetworkIO name, const std::vector<int> shape);
+
+  tensorrt_common::TrtUniquePtr<nvinfer1::IRuntime> runtime_{nullptr};
+  tensorrt_common::TrtUniquePtr<nvinfer1::IHostMemory> plan_{nullptr};
+  tensorrt_common::Logger logger_;
+  TransfusionConfig config_;
+  std::vector<const char *> tensors_names_;
+
+  std::array<ProfileDimension, 3> in_profile_dims_;
+};
+
+}  // namespace autoware::lidar_transfusion
+
+#endif  // AUTOWARE__LIDAR_TRANSFUSION__NETWORK__NETWORK_TRT_HPP_
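
A hedged usage sketch of this class (TransfusionConfig construction and the file names are assumptions; as implemented in network_trt.cpp below, init() loads an existing engine when engine_path exists and otherwise parses the ONNX and builds one):

    using autoware::lidar_transfusion::NetworkTRT;
    using autoware::lidar_transfusion::TransfusionConfig;

    TransfusionConfig config;  // assumed default-constructible; see transfusion_config.hpp
    NetworkTRT network(config);
    if (!network.init("transfusion.onnx", "transfusion.engine", "fp16")) {
      // handle failure: missing runtime, unparsable ONNX, or invalid network IO
    }
    // network.engine and network.context are now ready for inference calls.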

+ 32 - 0
src/detection/detection_lidar_transfusion/include/postprocess/circle_nms_kernel.hpp

@@ -0,0 +1,32 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef AUTOWARE__LIDAR_TRANSFUSION__POSTPROCESS__CIRCLE_NMS_KERNEL_HPP_
+#define AUTOWARE__LIDAR_TRANSFUSION__POSTPROCESS__CIRCLE_NMS_KERNEL_HPP_
+
+#include "utils.hpp"
+
+#include <thrust/device_vector.h>
+
+namespace autoware::lidar_transfusion
+{
+// Non-maximum suppression (NMS) uses the distance on the xy plane instead of
+// intersection over union (IoU) to suppress overlapped objects.
+std::size_t circleNMS(
+  thrust::device_vector<Box3D> & boxes3d, const float distance_threshold,
+  thrust::device_vector<bool> & keep_mask, cudaStream_t stream);
+
+}  // namespace autoware::lidar_transfusion
+
+#endif  // AUTOWARE__LIDAR_TRANSFUSION__POSTPROCESS__CIRCLE_NMS_KERNEL_HPP_
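
For intuition, a CPU-side sketch of the same suppression rule (the Box3D layout lives in utils.hpp, which is not shown here, so a stand-in struct with x, y, and a score ordering is assumed; the real kernel runs on device memory):

    #include <cstddef>
    #include <vector>

    struct ScoredBox { float x, y, score; };  // stand-in for Box3D's xy fields

    // Keep a box only if no higher-scored kept box lies within distance_threshold on the xy plane.
    std::vector<bool> circleNMSCpu(const std::vector<ScoredBox> & boxes, float distance_threshold)
    {
      std::vector<bool> keep(boxes.size(), true);  // boxes assumed sorted by descending score
      const float thresh_sq = distance_threshold * distance_threshold;
      for (std::size_t i = 0; i < boxes.size(); ++i) {
        if (!keep[i]) continue;
        for (std::size_t j = i + 1; j < boxes.size(); ++j) {
          const float dx = boxes[i].x - boxes[j].x;
          const float dy = boxes[i].y - boxes[j].y;
          if (dx * dx + dy * dy < thresh_sq) keep[j] = false;
        }
      }
      return keep;
    }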

+ 82 - 0
src/detection/detection_lidar_transfusion/include/postprocess/non_maximum_suppression.hpp

@@ -0,0 +1,82 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef AUTOWARE__LIDAR_TRANSFUSION__POSTPROCESS__NON_MAXIMUM_SUPPRESSION_HPP_
+#define AUTOWARE__LIDAR_TRANSFUSION__POSTPROCESS__NON_MAXIMUM_SUPPRESSION_HPP_
+
+//#include "autoware/lidar_transfusion/ros_utils.hpp"
+
+#include <Eigen/Eigen>
+
+//#include <autoware_perception_msgs/msg/detected_object.hpp>
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+uint8_t getSemanticType(const std::string & class_name);
+
+namespace autoware::lidar_transfusion
+{
+//using autoware_perception_msgs::msg::DetectedObject;
+
+enum class NMS_TYPE {
+  IoU_BEV
+  // IoU_3D
+  // Distance_2D
+  // Distance_3D
+};
+
+struct NMSParams
+{
+  NMS_TYPE nms_type_{};
+  std::vector<std::string> target_class_names_{};
+  double search_distance_2d_{};
+  double iou_threshold_{};
+  // double distance_threshold_{};
+};
+
+inline std::vector<bool> classNamesToBooleanMask(const std::vector<std::string> & class_names)
+{
+  std::vector<bool> mask;
+  constexpr std::size_t num_object_classification = 8;
+  mask.resize(num_object_classification);
+  for (const auto & class_name : class_names) {
+    const auto semantic_type = getSemanticType(class_name);
+    mask.at(semantic_type) = true;
+  }
+
+  return mask;
+}
+
+class NonMaximumSuppression
+{
+public:
+  void setParameters(const NMSParams &);
+
+//  std::vector<DetectedObject> apply(const std::vector<DetectedObject> &);
+
+private:
+  bool isTargetLabel(const std::uint8_t);
+
+//  bool isTargetPairObject(const DetectedObject &, const DetectedObject &);
+
+//  Eigen::MatrixXd generateIoUMatrix(const std::vector<DetectedObject> &);
+
+  NMSParams params_{};
+  std::vector<bool> target_class_mask_{};
+};
+
+}  // namespace autoware::lidar_transfusion
+
+#endif  // AUTOWARE__LIDAR_TRANSFUSION__POSTPROCESS__NON_MAXIMUM_SUPPRESSION_HPP_
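
A usage sketch (the class names are placeholders; which string maps to which label ID depends on getSemanticType(), whose implementation is not part of this header):

    using namespace autoware::lidar_transfusion;

    NMSParams params;
    params.nms_type_ = NMS_TYPE::IoU_BEV;
    params.target_class_names_ = {"CAR", "TRUCK"};  // placeholder class names
    params.search_distance_2d_ = 10.0;
    params.iou_threshold_ = 0.1;

    NonMaximumSuppression nms;
    nms.setParameters(params);  // builds the boolean mask via classNamesToBooleanMask()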

+ 47 - 0
src/detection/detection_lidar_transfusion/include/postprocess/postprocess_kernel.hpp

@@ -0,0 +1,47 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef AUTOWARE__LIDAR_TRANSFUSION__POSTPROCESS__POSTPROCESS_KERNEL_HPP_
+#define AUTOWARE__LIDAR_TRANSFUSION__POSTPROCESS__POSTPROCESS_KERNEL_HPP_
+
+#include "transfusion_config.hpp"
+#include "utils.hpp"
+
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+
+#include <vector>
+
+namespace autoware::lidar_transfusion
+{
+
+class PostprocessCuda
+{
+public:
+  explicit PostprocessCuda(const TransfusionConfig & config, cudaStream_t & stream);
+
+  cudaError_t generateDetectedBoxes3D_launch(
+    const float * cls_output, const float * box_output, const float * dir_cls_output,
+    std::vector<Box3D> & det_boxes3d, cudaStream_t stream);
+
+private:
+  TransfusionConfig config_;
+  cudaStream_t stream_;
+  cudaStream_t stream_event_;
+  cudaEvent_t start_, stop_;
+};
+
+}  // namespace autoware::lidar_transfusion
+
+#endif  // AUTOWARE__LIDAR_TRANSFUSION__POSTPROCESS__POSTPROCESS_KERNEL_HPP_
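
A sketch of the intended call (the cls/box/dir pointers are device buffers sized per TransfusionConfig; CHECK_CUDA_ERROR is the macro from cuda_utils.hpp; names are illustrative):

    using autoware::lidar_transfusion::Box3D;
    using autoware::lidar_transfusion::PostprocessCuda;

    std::vector<Box3D> det_boxes3d;
    PostprocessCuda post(config, stream);
    CHECK_CUDA_ERROR(post.generateDetectedBoxes3D_launch(
      cls_output_d, box_output_d, dir_cls_output_d, det_boxes3d, stream));
    // det_boxes3d now holds decoded detections on the host;
    // circle NMS prunes the overlapping ones downstream.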

+ 101 - 0
src/detection/detection_lidar_transfusion/include/preprocess/pointcloud_densification.hpp

@@ -0,0 +1,101 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef AUTOWARE__LIDAR_TRANSFUSION__PREPROCESS__POINTCLOUD_DENSIFICATION_HPP_
+#define AUTOWARE__LIDAR_TRANSFUSION__PREPROCESS__POINTCLOUD_DENSIFICATION_HPP_
+
+#include "autoware/lidar_transfusion/cuda_utils.hpp"
+
+#include <tf2_ros/buffer.h>
+#include <tf2_ros/transform_listener.h>
+#ifdef ROS_DISTRO_GALACTIC
+#include <tf2_sensor_msgs/tf2_sensor_msgs.h>
+#else
+#include <tf2_sensor_msgs/tf2_sensor_msgs.hpp>
+#endif
+
+#include <list>
+#include <string>
+#include <utility>
+
+namespace autoware::lidar_transfusion
+{
+
+class DensificationParam
+{
+public:
+  DensificationParam(const std::string & world_frame_id, const unsigned int num_past_frames)
+  : world_frame_id_(std::move(world_frame_id)),
+    pointcloud_cache_size_(num_past_frames + /*current frame*/ 1)
+  {
+  }
+
+  std::string world_frame_id() const { return world_frame_id_; }
+  unsigned int pointcloud_cache_size() const { return pointcloud_cache_size_; }
+
+private:
+  std::string world_frame_id_;
+  unsigned int pointcloud_cache_size_{1};
+};
+
+struct PointCloudWithTransform
+{
+  cuda::unique_ptr<uint8_t[]> data_d{nullptr};
+  std_msgs::msg::Header header;
+  size_t num_points{0};
+  Eigen::Affine3f affine_past2world;
+};
+
+class PointCloudDensification
+{
+public:
+  explicit PointCloudDensification(const DensificationParam & param, cudaStream_t & stream);
+
+  bool enqueuePointCloud(
+    const sensor_msgs::msg::PointCloud2 & msg, const tf2_ros::Buffer & tf_buffer);
+
+  double getCurrentTimestamp() const { return current_timestamp_; }
+  Eigen::Affine3f getAffineWorldToCurrent() const { return affine_world2current_; }
+  std::list<PointCloudWithTransform>::iterator getPointCloudCacheIter()
+  {
+    return pointcloud_cache_.begin();
+  }
+  bool isCacheEnd(std::list<PointCloudWithTransform>::iterator iter)
+  {
+    return iter == pointcloud_cache_.end();
+  }
+  size_t getIdx(std::list<PointCloudWithTransform>::iterator iter)
+  {
+    return std::distance(pointcloud_cache_.begin(), iter);
+  }
+  size_t getCacheSize()
+  {
+    return std::distance(pointcloud_cache_.begin(), pointcloud_cache_.end());
+  }
+  unsigned int pointcloud_cache_size() const { return param_.pointcloud_cache_size(); }
+
+private:
+  void enqueue(const sensor_msgs::msg::PointCloud2 & msg, const Eigen::Affine3f & affine);
+  void dequeue();
+
+  DensificationParam param_;
+  double current_timestamp_{0.0};
+  Eigen::Affine3f affine_world2current_;
+  std::list<PointCloudWithTransform> pointcloud_cache_;
+  cudaStream_t stream_;
+};
+
+}  // namespace autoware::lidar_transfusion
+
+#endif  // AUTOWARE__LIDAR_TRANSFUSION__PREPROCESS__POINTCLOUD_DENSIFICATION_HPP_
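
Downstream code walks the cache with the iterator helpers above; a sketch, assuming an already-populated instance pd and a hypothetical processSweep() consumer:

    for (auto iter = pd.getPointCloudCacheIter(); !pd.isCacheEnd(iter); ++iter) {
      const auto & sweep = *iter;
      // sweep.data_d holds the raw point bytes on the device;
      // sweep.affine_past2world maps that past sweep into the world frame.
      processSweep(sweep.data_d.get(), sweep.num_points, sweep.affine_past2world);
    }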

+ 73 - 0
src/detection/detection_lidar_transfusion/include/preprocess/preprocess_kernel.hpp

@@ -0,0 +1,73 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES.
+ * All rights reserved. SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef AUTOWARE__LIDAR_TRANSFUSION__PREPROCESS__PREPROCESS_KERNEL_HPP_
+#define AUTOWARE__LIDAR_TRANSFUSION__PREPROCESS__PREPROCESS_KERNEL_HPP_
+
+#include "autoware/lidar_transfusion/cuda_utils.hpp"
+#include "autoware/lidar_transfusion/transfusion_config.hpp"
+#include "autoware/lidar_transfusion/utils.hpp"
+
+#include <cuda_runtime_api.h>
+
+namespace autoware::lidar_transfusion
+{
+
+class PreprocessCuda
+{
+public:
+  PreprocessCuda(const TransfusionConfig & config, cudaStream_t & stream);
+
+  void generateVoxels(
+    float * points, unsigned int points_size, unsigned int * pillar_num, float * voxel_features,
+    unsigned int * voxel_num, unsigned int * voxel_idxs);
+
+  cudaError_t generateVoxels_random_launch(
+    float * points, unsigned int points_size, unsigned int * mask, float * voxels);
+
+  cudaError_t generateBaseFeatures_launch(
+    unsigned int * mask, float * voxels, unsigned int * pillar_num, float * voxel_features,
+    unsigned int * voxel_num, unsigned int * voxel_idxs);
+
+  cudaError_t generateSweepPoints_launch(
+    const uint8_t * input_data, size_t points_size, int input_point_step, float time_lag,
+    const float * transform, float * output_points);
+
+private:
+  TransfusionConfig config_;
+  cudaStream_t stream_;
+  cuda::unique_ptr<unsigned int[]> mask_{nullptr};
+  cuda::unique_ptr<float[]> voxels_{nullptr};
+  unsigned int mask_size_;
+  unsigned int voxels_size_;
+};
+}  // namespace autoware::lidar_transfusion
+
+#endif  // AUTOWARE__LIDAR_TRANSFUSION__PREPROCESS__PREPROCESS_KERNEL_HPP_
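
The expected call order, as a hedged sketch (that generateVoxels() internally chains the two *_launch kernels is an assumption based on comparable TransFusion ports; every pointer is device memory allocated elsewhere):

    autoware::lidar_transfusion::PreprocessCuda pre(config, stream);

    // 1. Transform each cached sweep into the current frame and append its time-lag feature.
    pre.generateSweepPoints_launch(
      cloud_data_d, num_points, point_step, time_lag, transform_d, points_d);

    // 2. Scatter points into voxels, then compact them into dense pillar features.
    pre.generateVoxels(
      points_d, num_points, pillar_num_d, voxel_features_d, voxel_num_d, voxel_idxs_d);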

+ 72 - 0
src/detection/detection_lidar_transfusion/include/preprocess/voxel_generator.hpp

@@ -0,0 +1,72 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef AUTOWARE__LIDAR_TRANSFUSION__PREPROCESS__VOXEL_GENERATOR_HPP_
+#define AUTOWARE__LIDAR_TRANSFUSION__PREPROCESS__VOXEL_GENERATOR_HPP_
+
+#include "autoware/lidar_transfusion/cuda_utils.hpp"
+#include "autoware/lidar_transfusion/preprocess/pointcloud_densification.hpp"
+#include "autoware/lidar_transfusion/preprocess/preprocess_kernel.hpp"
+#include "autoware/lidar_transfusion/ros_utils.hpp"
+#include "autoware/lidar_transfusion/transfusion_config.hpp"
+
+#ifdef ROS_DISTRO_GALACTIC
+#include <tf2_eigen/tf2_eigen.h>
+#else
+#include <tf2_eigen/tf2_eigen.hpp>
+#endif
+
+#include <autoware_point_types/types.hpp>
+
+#include <sensor_msgs/msg/point_cloud2.hpp>
+
+#include <memory>
+#include <string>
+#include <tuple>
+#include <vector>
+
+namespace autoware::lidar_transfusion
+{
+constexpr size_t AFF_MAT_SIZE = 16;  // 4x4 matrix
+constexpr size_t MAX_CLOUD_STEP_SIZE = sizeof(autoware_point_types::PointXYZIRCAEDT);
+
+class VoxelGenerator
+{
+public:
+  explicit VoxelGenerator(
+    const DensificationParam & densification_param, const TransfusionConfig & config,
+    cudaStream_t & stream);
+  std::size_t generateSweepPoints(
+    const sensor_msgs::msg::PointCloud2 & msg, cuda::unique_ptr<float[]> & points_d);
+  bool enqueuePointCloud(
+    const sensor_msgs::msg::PointCloud2 & msg, const tf2_ros::Buffer & tf_buffer);
+  void initCloudInfo(const sensor_msgs::msg::PointCloud2 & msg);
+  std::tuple<const uint32_t, const uint8_t, const uint8_t> getFieldInfo(
+    const sensor_msgs::msg::PointCloud2 & msg, const std::string & field_name);
+
+private:
+  std::unique_ptr<PointCloudDensification> pd_ptr_{nullptr};
+  std::unique_ptr<PreprocessCuda> pre_ptr_{nullptr};
+  TransfusionConfig config_;
+  cuda::unique_ptr<unsigned char[]> cloud_data_d_{nullptr};
+  cuda::unique_ptr<float[]> affine_past2current_d_{nullptr};
+  std::vector<float> points_;
+  cudaStream_t stream_;
+  CloudInfo cloud_info_;
+  bool is_initialized_{false};
+};
+
+}  // namespace autoware::lidar_transfusion
+
+#endif  // AUTOWARE__LIDAR_TRANSFUSION__PREPROCESS__VOXEL_GENERATOR_HPP_

+ 552 - 0
src/detection/detection_lidar_transfusion/include/tensorrt_common/logger.hpp

@@ -0,0 +1,552 @@
+/*
+ * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TENSORRT_COMMON__LOGGER_HPP_
+#define TENSORRT_COMMON__LOGGER_HPP_
+
+#include "NvInferRuntimeCommon.h"
+
+#include <atomic>
+#include <cassert>
+#include <ctime>
+#include <iomanip>
+#include <iostream>
+#include <ostream>
+#include <sstream>
+#include <string>
+#include <thread>
+
+namespace tensorrt_common
+{
+using Severity = nvinfer1::ILogger::Severity;
+
+class LogStreamConsumerBuffer : public std::stringbuf
+{
+public:
+  LogStreamConsumerBuffer(std::ostream & stream, const std::string & prefix, bool shouldLog)
+  : mOutput(stream), mPrefix(prefix), mShouldLog(shouldLog)
+  {
+  }
+
+  LogStreamConsumerBuffer(LogStreamConsumerBuffer && other)
+  : mOutput(other.mOutput), mPrefix(other.mPrefix), mShouldLog(other.mShouldLog)
+  {
+  }
+
+  ~LogStreamConsumerBuffer()
+  {
+    // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the
+    // output sequence; std::streambuf::pptr() gives a pointer to the current position of the
+    // output sequence. If the two differ, there is buffered output, so call putOutput() to
+    // log it to the stream.
+    if (pbase() != pptr()) {
+      putOutput();
+    }
+  }
+
+  // synchronizes the stream buffer and returns 0 on success
+  // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
+  // resetting the buffer and flushing the stream
+  virtual int sync()
+  {
+    putOutput();
+    return 0;
+  }
+
+  void putOutput()
+  {
+    if (mShouldLog) {
+      // prepend timestamp
+      // std::time_t timestamp = std::time(nullptr);
+      // tm* tm_local = std::localtime(&timestamp);
+      mOutput << mPrefix << str();
+      // set the buffer to empty
+      str("");
+      // flush the stream
+      mOutput.flush();
+    }
+  }
+
+  void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; }
+
+private:
+  std::ostream & mOutput;
+  std::string mPrefix;
+  bool mShouldLog;
+};
+
+//!
+//! \class LogStreamConsumerBase
+//! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in
+//! LogStreamConsumer
+//!
+class LogStreamConsumerBase
+{
+public:
+  LogStreamConsumerBase(std::ostream & stream, const std::string & prefix, bool shouldLog)
+  : mBuffer(stream, prefix, shouldLog)
+  {
+  }
+
+protected:
+  LogStreamConsumerBuffer mBuffer;
+};
+
+//!
+//! \class LogStreamConsumer
+//! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages.
+//!  Order of base classes is LogStreamConsumerBase and then std::ostream.
+//!  This is because the LogStreamConsumerBase class is used to initialize the
+//!  LogStreamConsumerBuffer member field in LogStreamConsumer and then the address of the buffer is
+//!  passed to std::ostream. This is necessary to prevent the address of an uninitialized buffer
+//!  from being passed to std::ostream. Please do not change the order of the parent classes.
+//!
+class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream
+{
+public:
+  //! \brief Creates a LogStreamConsumer which logs messages with level severity.
+  //!  Reportable severity determines if the messages are severe enough to be logged.
+  LogStreamConsumer(Severity reportableSeverity, Severity severity)
+  : LogStreamConsumerBase(
+      severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity),
+    std::ostream(&mBuffer),  // links the stream buffer with the stream
+    mShouldLog(severity <= reportableSeverity),
+    mSeverity(severity)
+  {
+  }
+
+  LogStreamConsumer(LogStreamConsumer && other)
+  : LogStreamConsumerBase(
+      severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog),
+    std::ostream(&mBuffer),  // links the stream buffer with the stream
+    mShouldLog(other.mShouldLog),
+    mSeverity(other.mSeverity)
+  {
+  }
+
+  void setReportableSeverity(Severity reportableSeverity)
+  {
+    mShouldLog = mSeverity <= reportableSeverity;
+    mBuffer.setShouldLog(mShouldLog);
+  }
+
+private:
+  static std::ostream & severityOstream(Severity severity)
+  {
+    return severity >= Severity::kINFO ? std::cout : std::cerr;
+  }
+
+  static std::string severityPrefix(Severity severity)
+  {
+    switch (severity) {
+      case Severity::kINTERNAL_ERROR:
+        return "[F] ";
+      case Severity::kERROR:
+        return "[E] ";
+      case Severity::kWARNING:
+        return "[W] ";
+      case Severity::kINFO:
+        return "[I] ";
+      case Severity::kVERBOSE:
+        return "[V] ";
+      default:
+        assert(0);
+        return "";
+    }
+  }
+
+  bool mShouldLog;
+  Severity mSeverity;
+};
+
+//! \class Logger
+//!
+//! \brief Class which manages logging of TensorRT tools and samples
+//!
+//! \details This class provides a common interface for TensorRT tools and samples to log
+//! information to the console, and supports logging two types of messages:
+//!
+//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
+//! - Test pass/fail messages
+//!
+//! The advantage of having all samples use this class for logging as opposed to emitting directly
+//! to stdout/stderr is that the logic for controlling the verbosity and formatting of sample output
+//! is centralized in one location.
+//!
+//! In the future, this class could be extended to support dumping test results to a file in some
+//! standard format (for example, JUnit XML), and providing additional metadata (e.g. timing the
+//! duration of a test run).
+//!
+//! TODO: For backwards compatibility with existing samples, this class inherits directly from the
+//! nvinfer1::ILogger interface, which is problematic since there isn't a clean separation between
+//! messages coming from the TensorRT library and messages coming from the sample.
+//!
+//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger)
+//! we can refactor the class to eliminate the inheritance and instead make the nvinfer1::ILogger
+//! implementation a member of the Logger object.
+
+class Logger : public nvinfer1::ILogger  // NOLINT
+{
+public:
+  //  Logger(Severity severity = Severity::kWARNING)
+  //  Logger(Severity severity = Severity::kVERBOSE)
+  explicit Logger(Severity severity = Severity::kINFO)
+  : mReportableSeverity(severity), mVerbose(true), mThrottleStopFlag(false)
+  {
+  }
+
+  explicit Logger(const bool verbose, Severity severity = Severity::kINFO)
+  : mReportableSeverity(severity), mVerbose(verbose), mThrottleStopFlag(false)
+  {
+  }
+
+  //!
+  //! \enum TestResult
+  //! \brief Represents the state of a given test
+  //!
+  enum class TestResult {
+    kRUNNING,  //!< The test is running
+    kPASSED,   //!< The test passed
+    kFAILED,   //!< The test failed
+    kWAIVED    //!< The test was waived
+  };
+
+  //!
+  //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this
+  //! Logger \return The nvinfer1::ILogger associated with this Logger
+  //!
+  //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
+  //! we can eliminate the inheritance of Logger from ILogger
+  //!
+  nvinfer1::ILogger & getTRTLogger()
+  {
+    return *this;
+  }
+
+  //!
+  //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
+  //!
+  //! Note samples should not be calling this function directly; it will eventually go away once we
+  //! eliminate the inheritance from nvinfer1::ILogger
+  //!
+  void log(Severity severity, const char * msg) noexcept override
+  {
+    if (mVerbose) {
+      LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl;
+    }
+  }
+
+  /**
+   * @brief Logging with throttle.
+   *
+   * @example
+   * Logger logger;
+   * auto log_thread = logger.log_throttle(nvinfer1::ILogger::Severity::kINFO, "SOME MSG", 1);
+   * // some operation
+   * logger.stop_throttle(log_thread);
+   *
+   * @param severity
+   * @param msg
+   * @param duration
+   * @return std::thread
+   *
+   */
+  std::thread log_throttle(Severity severity, const char * msg, const int duration) noexcept
+  {
+    mThrottleStopFlag.store(false);
+    auto log_func = [this](Severity s, const char * m, const int d) {
+      while (!mThrottleStopFlag.load()) {
+        this->log(s, m);
+        std::this_thread::sleep_for(std::chrono::seconds(d));
+      }
+    };
+
+    std::thread log_thread(log_func, severity, msg, duration);
+    return log_thread;
+  }
+
+  void stop_throttle(std::thread & log_thread) noexcept
+  {
+    mThrottleStopFlag.store(true);
+    log_thread.join();
+  }
+
+  //!
+  //! \brief Method for controlling the verbosity of logging output
+  //!
+  //! \param severity The logger will only emit messages that have severity of this level or higher.
+  //!
+  void setReportableSeverity(Severity severity) { mReportableSeverity = severity; }
+
+  //!
+  //! \brief Opaque handle that holds logging information for a particular test
+  //!
+  //! This object is an opaque handle to information used by the Logger to print test results.
+  //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
+  //! with Logger::reportTest{Start,End}().
+  //!
+  class TestAtom
+  {
+  public:
+    TestAtom(TestAtom &&) = default;
+
+  private:
+    friend class Logger;
+
+    TestAtom(bool started, const std::string & name, const std::string & cmdline)
+    : mStarted(started), mName(name), mCmdline(cmdline)
+    {
+    }
+
+    bool mStarted;
+    std::string mName;
+    std::string mCmdline;
+  };
+
+  //!
+  //! \brief Define a test for logging
+  //!
+  //! \param[in] name The name of the test.  This should be a string starting with
+  //!                  "TensorRT" and containing dot-separated strings containing
+  //!                  the characters [A-Za-z0-9_].
+  //!                  For example, "TensorRT.sample_googlenet"
+  //! \param[in] cmdline The command line used to reproduce the test
+  //
+  //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
+  //!
+  static TestAtom defineTest(const std::string & name, const std::string & cmdline)
+  {
+    return TestAtom(false, name, cmdline);
+  }
+
+  //!
+  //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line
+  //! arguments
+  //!        as input
+  //!
+  //! \param[in] name The name of the test
+  //! \param[in] argc The number of command-line arguments
+  //! \param[in] argv The array of command-line arguments (given as C strings)
+  //!
+  //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
+  static TestAtom defineTest(const std::string & name, int argc, char const * const * argv)
+  {
+    auto cmdline = genCmdlineString(argc, argv);
+    return defineTest(name, cmdline);
+  }
+
+  //!
+  //! \brief Report that a test has started.
+  //!
+  //! \pre reportTestStart() has not been called yet for the given testAtom
+  //!
+  //! \param[in] testAtom The handle to the test that has started
+  //!
+  static void reportTestStart(TestAtom & testAtom)
+  {
+    reportTestResult(testAtom, TestResult::kRUNNING);
+    assert(!testAtom.mStarted);
+    testAtom.mStarted = true;
+  }
+
+  //!
+  //! \brief Report that a test has ended.
+  //!
+  //! \pre reportTestStart() has been called for the given testAtom
+  //!
+  //! \param[in] testAtom The handle to the test that has ended
+  //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
+  //!                   TestResult::kFAILED, TestResult::kWAIVED
+  //!
+  static void reportTestEnd(const TestAtom & testAtom, TestResult result)
+  {
+    assert(result != TestResult::kRUNNING);
+    assert(testAtom.mStarted);
+    reportTestResult(testAtom, result);
+  }
+
+  static int reportPass(const TestAtom & testAtom)
+  {
+    reportTestEnd(testAtom, TestResult::kPASSED);
+    return EXIT_SUCCESS;
+  }
+
+  static int reportFail(const TestAtom & testAtom)
+  {
+    reportTestEnd(testAtom, TestResult::kFAILED);
+    return EXIT_FAILURE;
+  }
+
+  static int reportWaive(const TestAtom & testAtom)
+  {
+    reportTestEnd(testAtom, TestResult::kWAIVED);
+    return EXIT_SUCCESS;
+  }
+
+  static int reportTest(const TestAtom & testAtom, bool pass)
+  {
+    return pass ? reportPass(testAtom) : reportFail(testAtom);
+  }
+
+  Severity getReportableSeverity() const { return mReportableSeverity; }
+
+private:
+  //!
+  //! \brief returns an appropriate string for prefixing a log message with the given severity
+  //!
+  static const char * severityPrefix(Severity severity)
+  {
+    switch (severity) {
+      case Severity::kINTERNAL_ERROR:
+        return "[F] ";
+      case Severity::kERROR:
+        return "[E] ";
+      case Severity::kWARNING:
+        return "[W] ";
+      case Severity::kINFO:
+        return "[I] ";
+      case Severity::kVERBOSE:
+        return "[V] ";
+      default:
+        assert(0);
+        return "";
+    }
+  }
+
+  //!
+  //! \brief returns an appropriate string for prefixing a test result message with the given result
+  //!
+  static const char * testResultString(TestResult result)
+  {
+    switch (result) {
+      case TestResult::kRUNNING:
+        return "RUNNING";
+      case TestResult::kPASSED:
+        return "PASSED";
+      case TestResult::kFAILED:
+        return "FAILED";
+      case TestResult::kWAIVED:
+        return "WAIVED";
+      default:
+        assert(0);
+        return "";
+    }
+  }
+
+  //!
+  //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
+  //!
+  static std::ostream & severityOstream(Severity severity)
+  {
+    return severity >= Severity::kINFO ? std::cout : std::cerr;
+  }
+
+  //!
+  //! \brief method that implements logging test results
+  //!
+  static void reportTestResult(const TestAtom & testAtom, TestResult result)
+  {
+    severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName
+                                     << " # " << testAtom.mCmdline << std::endl;
+  }
+
+  //!
+  //! \brief generate a command line string from the given (argc, argv) values
+  //!
+  static std::string genCmdlineString(int argc, char const * const * argv)
+  {
+    std::stringstream ss;
+    for (int i = 0; i < argc; i++) {
+      if (i > 0) ss << " ";
+      ss << argv[i];
+    }
+    return ss.str();
+  }
+
+  Severity mReportableSeverity;
+  bool mVerbose;
+  std::atomic<bool> mThrottleStopFlag;
+};
+
+namespace
+{
+
+//!
+//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE
+//!
+//! Example usage:
+//!
+//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
+//!
+inline LogStreamConsumer LOG_VERBOSE(const Logger & logger)
+{
+  return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE);
+}
+
+//!
+//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO
+//!
+//! Example usage:
+//!
+//!     LOG_INFO(logger) << "hello world" << std::endl;
+//!
+inline LogStreamConsumer LOG_INFO(const Logger & logger)
+{
+  return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO);
+}
+
+//!
+//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING
+//!
+//! Example usage:
+//!
+//!     LOG_WARN(logger) << "hello world" << std::endl;
+//!
+inline LogStreamConsumer LOG_WARN(const Logger & logger)
+{
+  return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING);
+}
+
+//!
+//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR
+//!
+//! Example usage:
+//!
+//!     LOG_ERROR(logger) << "hello world" << std::endl;
+//!
+inline LogStreamConsumer LOG_ERROR(const Logger & logger)
+{
+  return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR);
+}
+
+//!
+//! \brief produces a LogStreamConsumer object that can be used to log messages of severity
+//! kINTERNAL_ERROR ("fatal" severity)
+//!
+//! Example usage:
+//!
+//!     LOG_FATAL(logger) << "hello world" << std::endl;
+//!
+inline LogStreamConsumer LOG_FATAL(const Logger & logger)
+{
+  return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR);
+}
+
+}  // anonymous namespace
+}  // namespace tensorrt_common
+
+#endif  // TENSORRT_COMMON__LOGGER_HPP_
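
Typical wiring with TensorRT, as a sketch (the Logger doubles as the nvinfer1::ILogger handed to builder and runtime factories; builder ownership handling is omitted here):

    #include <NvInfer.h>

    #include <tensorrt_common/logger.hpp>

    tensorrt_common::Logger logger(/*verbose=*/true, tensorrt_common::Severity::kINFO);
    nvinfer1::IBuilder * builder = nvinfer1::createInferBuilder(logger.getTRTLogger());
    tensorrt_common::LOG_INFO(logger) << "builder ready" << std::endl;
    // wrap the builder in tensorrt_common::TrtUniquePtr (tensorrt_common.hpp) for automatic destruction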

+ 70 - 0
src/detection/detection_lidar_transfusion/include/tensorrt_common/simple_profiler.hpp

@@ -0,0 +1,70 @@
+// Copyright 2023 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef TENSORRT_COMMON__SIMPLE_PROFILER_HPP_
+#define TENSORRT_COMMON__SIMPLE_PROFILER_HPP_
+
+#include <NvInfer.h>
+
+#include <iostream>
+#include <map>
+#include <string>
+#include <vector>
+
+namespace tensorrt_common
+{
+struct LayerInfo
+{
+  int in_c;
+  int out_c;
+  int w;
+  int h;
+  int k;
+  int stride;
+  int groups;
+  nvinfer1::LayerType type;
+};
+
+/**
+ * @class Profiler
+ * @brief Collect per-layer profile information, assuming times are reported in the same order
+ */
+class SimpleProfiler : public nvinfer1::IProfiler
+{
+public:
+  struct Record
+  {
+    float time{0};
+    int count{0};
+    float min_time{-1.0};
+    int index;
+  };
+  SimpleProfiler(
+    std::string name,
+    const std::vector<SimpleProfiler> & src_profilers = std::vector<SimpleProfiler>());
+
+  void reportLayerTime(const char * layerName, float ms) noexcept override;
+
+  void setProfDict(nvinfer1::ILayer * layer) noexcept;
+
+  friend std::ostream & operator<<(std::ostream & out, const SimpleProfiler & value);
+
+private:
+  std::string m_name;
+  std::map<std::string, Record> m_profile;
+  int m_index;
+  std::map<std::string, LayerInfo> m_layer_dict;
+};
+}  // namespace tensorrt_common
+#endif  // TENSORRT_COMMON__SIMPLE_PROFILER_HPP_
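
A sketch of attaching the profiler to an execution context (IExecutionContext::setProfiler() is the standard TensorRT hook; printing relies on the friend operator<< declared above):

    tensorrt_common::SimpleProfiler profiler("transfusion");
    context->setProfiler(&profiler);  // TensorRT reports per-layer times after each enqueue
    // ... run inference ...
    std::cout << profiler;            // dump the accumulated per-layer statistics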

+ 232 - 0
src/detection/detection_lidar_transfusion/include/tensorrt_common/tensorrt_common.hpp

@@ -0,0 +1,232 @@
+// Copyright 2023 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef TENSORRT_COMMON__TENSORRT_COMMON_HPP_
+#define TENSORRT_COMMON__TENSORRT_COMMON_HPP_
+
+//#ifndef YOLOX_STANDALONE
+//#include <rclcpp/rclcpp.hpp>
+//#endif
+
+#include <NvInfer.h>
+#include <NvOnnxParser.h>
+
+#if (defined(_MSC_VER) or (defined(__GNUC__) and (7 <= __GNUC__)))
+#include <filesystem>
+namespace fs = ::std::filesystem;
+#else
+#include <experimental/filesystem>
+namespace fs = ::std::experimental::filesystem;
+#endif
+
+#include <tensorrt_common/logger.hpp>
+#include <tensorrt_common/simple_profiler.hpp>
+
+#include <algorithm>
+#include <array>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <vector>
+
+namespace tensorrt_common
+{
+/**
+ * @struct BuildConfig
+ * @brief Configuration to provide fine control regarding TensorRT builder
+ */
+struct BuildConfig
+{
+  // type for calibration
+  std::string calib_type_str;
+
+  // DLA core ID that the process uses
+  int dla_core_id;
+
+  // flag for partial quantization in first layer
+  bool quantize_first_layer;  // For partial quantization
+
+  // flag for partial quantization in last layer
+  bool quantize_last_layer;  // For partial quantization
+
+  // flag for per-layer profiler using IProfiler
+  bool profile_per_layer;
+
+  // clip value for implicit quantization
+  double clip_value;  // For implicit quantization
+
+  // Supported calibration type
+  const std::array<std::string, 4> valid_calib_type = {"Entropy", "Legacy", "Percentile", "MinMax"};
+
+  BuildConfig()
+  : calib_type_str("MinMax"),
+    dla_core_id(-1),
+    quantize_first_layer(false),
+    quantize_last_layer(false),
+    profile_per_layer(false),
+    clip_value(0.0)
+  {
+  }
+
+  explicit BuildConfig(
+    const std::string & calib_type_str, const int dla_core_id = -1,
+    const bool quantize_first_layer = false, const bool quantize_last_layer = false,
+    const bool profile_per_layer = false, const double clip_value = 0.0)
+  : calib_type_str(calib_type_str),
+    dla_core_id(dla_core_id),
+    quantize_first_layer(quantize_first_layer),
+    quantize_last_layer(quantize_last_layer),
+    profile_per_layer(profile_per_layer),
+    clip_value(clip_value)
+  {
+#ifndef YOLOX_STANDALONE
+    if (
+      std::find(valid_calib_type.begin(), valid_calib_type.end(), calib_type_str) ==
+      valid_calib_type.end()) {
+      std::stringstream message;
+      message << "Invalid calibration type was specified: " << calib_type_str << std::endl
+              << "Valid value is one of: [Entropy, (Legacy | Percentile), MinMax]" << std::endl
+              << "Default calibration type will be used: MinMax" << std::endl;
+      std::cerr << message.str();
+    }
+#endif
+  }
+};
+
+nvinfer1::Dims get_input_dims(const std::string & onnx_file_path);
+
+const std::array<std::string, 3> valid_precisions = {"fp32", "fp16", "int8"};
+bool is_valid_precision_string(const std::string & precision);
+
+template <typename T>
+struct InferDeleter  // NOLINT
+{
+  void operator()(T * obj) const
+  {
+    if (obj) {
+#if TENSORRT_VERSION_MAJOR >= 8
+      delete obj;
+#else
+      obj->destroy();
+#endif
+    }
+  }
+};
+
+template <typename T>
+using TrtUniquePtr = std::unique_ptr<T, InferDeleter<T>>;
+
+using BatchConfig = std::array<int32_t, 3>;
+
+/**
+ * @class TrtCommon
+ * @brief TensorRT common library
+ */
+class TrtCommon  // NOLINT
+{
+public:
+  /**
+   * @brief Construct TrtCommon.
+   * @param[in] model_path path to the ONNX model
+   * @param[in] precision precision for inference
+   * @param[in] calibrator pointer for any type of INT8 calibrator
+   * @param[in] batch_config configuration for batched execution
+   * @param[in] max_workspace_size maximum workspace for building TensorRT engine
+   * @param[in] buildConfig configuration covering the calibration method, DLA core,
+   * partial-quantization flags for the first and last layers, and the per-layer profiler
+   * @param[in] plugin_paths path for custom plugin
+   */
+  TrtCommon(
+    const std::string & model_path, const std::string & precision,
+    std::unique_ptr<nvinfer1::IInt8Calibrator> calibrator = nullptr,
+    const BatchConfig & batch_config = {1, 1, 1}, const size_t max_workspace_size = (16 << 20),
+    const BuildConfig & buildConfig = BuildConfig(),
+    const std::vector<std::string> & plugin_paths = {});
+
+  /**
+   * @brief Deconstruct TrtCommon
+   */
+  ~TrtCommon();
+
+  /**
+   * @brief Load a TensorRT engine
+   * @param[in] engine_file_path path to an engine file
+   * @return true if loading succeeded
+   */
+  bool loadEngine(const std::string & engine_file_path);
+
+  /**
+   * @brief Output layer information including GFLOPs and parameters
+   * @param[in] onnx_file_path path to an ONNX file
+   * @warning This function is based on darknet log.
+   */
+  void printNetworkInfo(const std::string & onnx_file_path);
+
+  /**
+   * @brief Build a TensorRT engine from ONNX
+   * @param[in] onnx_file_path path to an ONNX file
+   * @param[in] output_engine_file_path path to the output engine file
+   */
+  bool buildEngineFromOnnx(
+    const std::string & onnx_file_path, const std::string & output_engine_file_path);
+
+  /**
+   * @brief setup for TensorRT execution including building and loading engine
+   */
+  void setup();
+
+  bool isInitialized();
+
+  nvinfer1::Dims getBindingDimensions(const int32_t index) const;
+  int32_t getNbBindings();
+  bool setBindingDimensions(const int32_t index, const nvinfer1::Dims & dimensions) const;
+  bool enqueueV2(void ** bindings, cudaStream_t stream, cudaEvent_t * input_consumed);
+
+  /**
+   * @brief output per-layer information
+   */
+  void printProfiling(void);
+
+#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSORRT_PATCH >= 8200
+  /**
+   * @brief get per-layer information for trt-engine-profiler
+   */
+  std::string getLayerInformation(nvinfer1::LayerInformationFormat format);
+#endif
+
+private:
+  Logger logger_;
+  fs::path model_file_path_;
+  TrtUniquePtr<nvinfer1::IRuntime> runtime_;
+  TrtUniquePtr<nvinfer1::ICudaEngine> engine_;
+  TrtUniquePtr<nvinfer1::IExecutionContext> context_;
+  std::unique_ptr<nvinfer1::IInt8Calibrator> calibrator_;
+
+  nvinfer1::Dims input_dims_;
+  nvinfer1::Dims output_dims_;
+  std::string precision_;
+  BatchConfig batch_config_;
+  size_t max_workspace_size_;
+  bool is_initialized_{false};
+
+  // profiler for per-layer
+  SimpleProfiler model_profiler_;
+  // profiler for whole model
+  SimpleProfiler host_profiler_;
+
+  std::unique_ptr<const BuildConfig> build_config_;
+};
+
+}  // namespace tensorrt_common
+
+#endif  // TENSORRT_COMMON__TENSORRT_COMMON_HPP_
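
An end-to-end usage sketch, hedged (whether setup() caches the built engine beside the ONNX file is an assumption; the arguments mirror the constructor documentation above):

    tensorrt_common::BuildConfig build_cfg(
      "MinMax", /*dla_core_id=*/-1, /*quantize_first_layer=*/false,
      /*quantize_last_layer=*/false, /*profile_per_layer=*/true);
    tensorrt_common::TrtCommon trt(
      "model.onnx", "fp16", nullptr, {1, 1, 1}, (1ULL << 30), build_cfg);
    trt.setup();  // builds the engine from ONNX, or loads a cached one if available
    if (trt.isInitialized()) {
      // bind input/output buffers, then: trt.enqueueV2(bindings, stream, nullptr);
    }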

+ 8 - 0
src/detection/detection_lidar_transfusion/main.cpp

@@ -0,0 +1,8 @@
+#include <QCoreApplication>
+
+int main(int argc, char *argv[])
+{
+    QCoreApplication a(argc, argv);
+
+    return a.exec();
+}

+ 333 - 0
src/detection/detection_lidar_transfusion/network/network_trt.cpp

@@ -0,0 +1,333 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "include/network/network_trt.hpp"
+
+#include <NvOnnxParser.h>
+
+#include <fstream>
+#include <memory>
+#include <string>
+
+namespace autoware::lidar_transfusion
+{
+
+std::ostream & operator<<(std::ostream & os, const ProfileDimension & profile)
+{
+  std::string delim = "";
+  os << "min->[";
+  for (int i = 0; i < profile.min.nbDims; ++i) {
+    os << delim << profile.min.d[i];
+    delim = ", ";
+  }
+  os << "], opt->[";
+  delim = "";
+  for (int i = 0; i < profile.opt.nbDims; ++i) {
+    os << delim << profile.opt.d[i];
+    delim = ", ";
+  }
+  os << "], max->[";
+  delim = "";
+  for (int i = 0; i < profile.max.nbDims; ++i) {
+    os << delim << profile.max.d[i];
+    delim = ", ";
+  }
+  os << "]";
+  return os;
+}
+
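+// Register one dynamic-shape profile per network input, all sized from the
+// config: voxels [num_voxels, points_per_voxel, point_features],
+// num_points [num_voxels], and coors [num_voxels, coors_dim].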
+NetworkTRT::NetworkTRT(const TransfusionConfig & config) : config_(config)
+{
+  ProfileDimension voxels_dims = {
+    nvinfer1::Dims3(
+      config_.min_voxel_size_, config_.min_point_in_voxel_size_, config_.min_network_feature_size_),
+    nvinfer1::Dims3(
+      config_.opt_voxel_size_, config_.opt_point_in_voxel_size_, config_.opt_network_feature_size_),
+    nvinfer1::Dims3(
+      config_.max_voxel_size_, config_.max_point_in_voxel_size_,
+      config_.max_network_feature_size_)};
+  ProfileDimension num_points_dims = {
+    nvinfer1::Dims{1, {static_cast<int32_t>(config_.min_points_size_)}},
+    nvinfer1::Dims{1, {static_cast<int32_t>(config_.opt_points_size_)}},
+    nvinfer1::Dims{1, {static_cast<int32_t>(config_.max_points_size_)}}};
+  ProfileDimension coors_dims = {
+    nvinfer1::Dims2(config_.min_coors_size_, config_.min_coors_dim_size_),
+    nvinfer1::Dims2(config_.opt_coors_size_, config_.opt_coors_dim_size_),
+    nvinfer1::Dims2(config_.max_coors_size_, config_.max_coors_dim_size_)};
+  in_profile_dims_ = {voxels_dims, num_points_dims, coors_dims};
+}
+
+NetworkTRT::~NetworkTRT()
+{
+  context.reset();
+  engine.reset();
+  plan_.reset();
+  runtime_.reset();
+}
+
+bool NetworkTRT::init(
+  const std::string & onnx_path, const std::string & engine_path, const std::string & precision)
+{
+  runtime_ =
+    tensorrt_common::TrtUniquePtr<nvinfer1::IRuntime>(nvinfer1::createInferRuntime(logger_));
+  if (!runtime_) {
+    tensorrt_common::LOG_ERROR(logger_) << "Failed to create runtime" << std::endl;
+    return false;
+  }
+
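+  // Prefer a prebuilt engine on disk; otherwise build one from the ONNX model
+  // and serialize it to engine_path for subsequent runs.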
+  bool success;
+  std::ifstream engine_file(engine_path);
+  if (engine_file.is_open()) {
+    success = loadEngine(engine_path);
+  } else {
+    auto log_thread = logger_.log_throttle(
+      nvinfer1::ILogger::Severity::kINFO,
+      "Applying optimizations and building TRT CUDA engine. Please wait a minutes...", 5);
+    success = parseONNX(onnx_path, engine_path, precision);
+    logger_.stop_throttle(log_thread);
+  }
+  success &= createContext();
+
+  return success;
+}
+
+bool NetworkTRT::setProfile(
+  nvinfer1::IBuilder & builder, nvinfer1::INetworkDefinition & network,
+  nvinfer1::IBuilderConfig & config)
+{
+  auto profile = builder.createOptimizationProfile();
+
+  auto voxels_name = network.getInput(NetworkIO::voxels)->getName();
+  auto num_points_name = network.getInput(NetworkIO::num_points)->getName();
+  auto coors_name = network.getInput(NetworkIO::coors)->getName();
+
+  profile->setDimensions(
+    voxels_name, nvinfer1::OptProfileSelector::kMIN, in_profile_dims_[NetworkIO::voxels].min);
+  profile->setDimensions(
+    voxels_name, nvinfer1::OptProfileSelector::kOPT, in_profile_dims_[NetworkIO::voxels].opt);
+  profile->setDimensions(
+    voxels_name, nvinfer1::OptProfileSelector::kMAX, in_profile_dims_[NetworkIO::voxels].max);
+
+  profile->setDimensions(
+    num_points_name, nvinfer1::OptProfileSelector::kMIN,
+    in_profile_dims_[NetworkIO::num_points].min);
+  profile->setDimensions(
+    num_points_name, nvinfer1::OptProfileSelector::kOPT,
+    in_profile_dims_[NetworkIO::num_points].opt);
+  profile->setDimensions(
+    num_points_name, nvinfer1::OptProfileSelector::kMAX,
+    in_profile_dims_[NetworkIO::num_points].max);
+
+  profile->setDimensions(
+    coors_name, nvinfer1::OptProfileSelector::kMIN, in_profile_dims_[NetworkIO::coors].min);
+  profile->setDimensions(
+    coors_name, nvinfer1::OptProfileSelector::kOPT, in_profile_dims_[NetworkIO::coors].opt);
+  profile->setDimensions(
+    coors_name, nvinfer1::OptProfileSelector::kMAX, in_profile_dims_[NetworkIO::coors].max);
+
+  config.addOptimizationProfile(profile);
+  return true;
+}
+
+bool NetworkTRT::createContext()
+{
+  if (!engine) {
+    tensorrt_common::LOG_ERROR(logger_)
+      << "Failed to create context: Engine was not created" << std::endl;
+    return false;
+  }
+
+  context =
+    tensorrt_common::TrtUniquePtr<nvinfer1::IExecutionContext>(engine->createExecutionContext());
+  if (!context) {
+    tensorrt_common::LOG_ERROR(logger_) << "Failed to create context" << std::endl;
+    return false;
+  }
+
+  return true;
+}
+
+bool NetworkTRT::parseONNX(
+  const std::string & onnx_path, const std::string & engine_path, const std::string & precision,
+  const size_t workspace_size)
+{
+  auto builder =
+    tensorrt_common::TrtUniquePtr<nvinfer1::IBuilder>(nvinfer1::createInferBuilder(logger_));
+  if (!builder) {
+    tensorrt_common::LOG_ERROR(logger_) << "Failed to create builder" << std::endl;
+    return false;
+  }
+
+  auto config =
+    tensorrt_common::TrtUniquePtr<nvinfer1::IBuilderConfig>(builder->createBuilderConfig());
+  if (!config) {
+    tensorrt_common::LOG_ERROR(logger_) << "Failed to create config" << std::endl;
+    return false;
+  }
+#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSORRT_PATCH >= 8400
+  config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, workspace_size);
+#else
+  config->setMaxWorkspaceSize(workspace_size);
+#endif
+  if (precision == "fp16") {
+    if (builder->platformHasFastFp16()) {
+      tensorrt_common::LOG_INFO(logger_) << "Using TensorRT FP16 Inference" << std::endl;
+      config->setFlag(nvinfer1::BuilderFlag::kFP16);
+    } else {
+      tensorrt_common::LOG_INFO(logger_)
+        << "TensorRT FP16 Inference isn't supported in this environment" << std::endl;
+    }
+  }
+
+  const auto flag =
+    1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
+  auto network =
+    tensorrt_common::TrtUniquePtr<nvinfer1::INetworkDefinition>(builder->createNetworkV2(flag));
+  if (!network) {
+    tensorrt_common::LOG_ERROR(logger_) << "Failed to create network" << std::endl;
+    return false;
+  }
+
+  auto parser = tensorrt_common::TrtUniquePtr<nvonnxparser::IParser>(
+    nvonnxparser::createParser(*network, logger_));
+  if (!parser->parseFromFile(
+        onnx_path.c_str(), static_cast<int>(nvinfer1::ILogger::Severity::kERROR))) {
+    return false;
+  }
+
+  if (!setProfile(*builder, *network, *config)) {
+    tensorrt_common::LOG_ERROR(logger_) << "Failed to set profile" << std::endl;
+    return false;
+  }
+
+  plan_ = tensorrt_common::TrtUniquePtr<nvinfer1::IHostMemory>(
+    builder->buildSerializedNetwork(*network, *config));
+  if (!plan_) {
+    tensorrt_common::LOG_ERROR(logger_) << "Failed to create serialized network" << std::endl;
+    return false;
+  }
+  engine = tensorrt_common::TrtUniquePtr<nvinfer1::ICudaEngine>(
+    runtime_->deserializeCudaEngine(plan_->data(), plan_->size()));
+  if (!engine) {
+    tensorrt_common::LOG_ERROR(logger_) << "Failed to create engine" << std::endl;
+    return false;
+  }
+
+  return saveEngine(engine_path);
+}
+
+bool NetworkTRT::saveEngine(const std::string & engine_path)
+{
+  tensorrt_common::LOG_INFO(logger_) << "Writing to " << engine_path << std::endl;
+  std::ofstream file(engine_path, std::ios::out | std::ios::binary);
+  file.write(reinterpret_cast<const char *>(plan_->data()), plan_->size());
+  return validateNetworkIO();
+}
+
+bool NetworkTRT::loadEngine(const std::string & engine_path)
+{
+  std::ifstream engine_file(engine_path, std::ios::binary);
+  std::stringstream engine_buffer;
+  engine_buffer << engine_file.rdbuf();
+  std::string engine_str = engine_buffer.str();
+  engine = tensorrt_common::TrtUniquePtr<nvinfer1::ICudaEngine>(runtime_->deserializeCudaEngine(
+    reinterpret_cast<const void *>(engine_str.data()), engine_str.size()));
+  if (!engine) {
+    tensorrt_common::LOG_ERROR(logger_) << "Failed to deserialize engine" << std::endl;
+    return false;
+  }
+  tensorrt_common::LOG_INFO(logger_) << "Loaded engine from " << engine_path << std::endl;
+  return validateNetworkIO();
+}
+
+bool NetworkTRT::validateNetworkIO()
+{
+  // Check that the number of IO tensors matches the expected size
+  if (engine->getNbIOTensors() != NetworkIO::ENUM_SIZE) {
+    tensorrt_common::LOG_ERROR(logger_)
+      << "Invalid network IO. Expected size: " << NetworkIO::ENUM_SIZE
+      << ". Actual size: " << engine->getNbIOTensors() << "." << std::endl;
+    throw std::runtime_error("Failed to initialize TRT network.");
+  }
+  for (int i = 0; i < NetworkIO::ENUM_SIZE; ++i) {
+    tensors_names_.push_back(engine->getIOTensorName(i));
+  }
+
+  // Log the network IO
+  std::string tensors = std::accumulate(
+    tensors_names_.begin(), tensors_names_.end(), std::string(),
+    [](const std::string & a, const std::string & b) -> std::string { return a + b + " "; });
+  tensorrt_common::LOG_INFO(logger_) << "Network IO: " << tensors << std::endl;
+
+  // Check that the engine's input profiles match the config input profiles
+  for (int i = 0; i <= NetworkIO::coors; ++i) {
+    ProfileDimension engine_dims{
+      engine->getProfileShape(tensors_names_[i], 0, nvinfer1::OptProfileSelector::kMIN),
+      engine->getProfileShape(tensors_names_[i], 0, nvinfer1::OptProfileSelector::kOPT),
+      engine->getProfileShape(tensors_names_[i], 0, nvinfer1::OptProfileSelector::kMAX)};
+
+    tensorrt_common::LOG_INFO(logger_)
+      << "Profile for " << tensors_names_[i] << ": " << engine_dims << std::endl;
+
+    if (engine_dims != in_profile_dims_[i]) {
+      tensorrt_common::LOG_ERROR(logger_)
+        << "Invalid network input dimension. Config: " << in_profile_dims_[i]
+        << ". Please change the input profile or delete the engine file and build engine again."
+        << std::endl;
+      throw std::runtime_error("Failed to initialize TRT network.");
+    }
+  }
+
+  // Whether the IO tensor shapes match the network config, -1 for dynamic size
+  validateTensorShape(
+    NetworkIO::voxels, {-1, static_cast<int>(config_.points_per_voxel_),
+                        static_cast<int>(config_.num_point_feature_size_)});
+  validateTensorShape(NetworkIO::num_points, {-1});
+  validateTensorShape(NetworkIO::coors, {-1, static_cast<int>(config_.num_point_values_)});
+  auto cls_score = validateTensorShape(
+    NetworkIO::cls_score,
+    {static_cast<int>(config_.batch_size_), static_cast<int>(config_.num_classes_),
+     static_cast<int>(config_.num_proposals_)});
+  tensorrt_common::LOG_INFO(logger_) << "Network num classes: " << cls_score.d[1] << std::endl;
+  validateTensorShape(
+    NetworkIO::dir_pred,
+    {static_cast<int>(config_.batch_size_), 2, static_cast<int>(config_.num_proposals_)});  // x, y
+  validateTensorShape(
+    NetworkIO::bbox_pred,
+    {static_cast<int>(config_.batch_size_), static_cast<int>(config_.num_box_values_),
+     static_cast<int>(config_.num_proposals_)});
+
+  return true;
+}
+
+const char * NetworkTRT::getTensorName(NetworkIO name)
+{
+  return tensors_names_.at(name);
+}
+
+nvinfer1::Dims NetworkTRT::validateTensorShape(NetworkIO name, const std::vector<int> shape)
+{
+  auto tensor_shape = engine->getTensorShape(tensors_names_[name]);
+  if (tensor_shape.nbDims != static_cast<int>(shape.size())) {
+    tensorrt_common::LOG_ERROR(logger_)
+      << "Invalid tensor shape for " << tensors_names_[name] << ". Expected size: " << shape.size()
+      << ". Actual size: " << tensor_shape.nbDims << "." << std::endl;
+    throw std::runtime_error("Failed to initialize TRT network.");
+  }
+  for (int i = 0; i < tensor_shape.nbDims; ++i) {
+    if (tensor_shape.d[i] != static_cast<int>(shape[i])) {
+      tensorrt_common::LOG_ERROR(logger_)
+        << "Invalid tensor shape for " << tensors_names_[name] << ". Expected: " << shape[i]
+        << ". Actual: " << tensor_shape.d[i] << "." << std::endl;
+      throw std::runtime_error("Failed to initialize TRT network.");
+    }
+  }
+  return tensor_shape;
+}
+
+}  // namespace autoware::lidar_transfusion
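+
+// Usage sketch (hypothetical caller; paths and config come from the application):
+//   autoware::lidar_transfusion::NetworkTRT net(config);
+//   if (!net.init(onnx_path, engine_path, "fp16")) { /* handle failure */ }
+//   // after init(), net.engine and net.context are ready for inference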

+ 144 - 0
src/detection/detection_lidar_transfusion/postprocess/circle_nms_kernel.cu

@@ -0,0 +1,144 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Modified from
+// https://github.com/open-mmlab/OpenPCDet/blob/master/pcdet/ops/iou3d_nms/src/iou3d_nms_kernel.cu
+
+/*
+3D IoU Calculation and Rotated NMS(modified from 2D NMS written by others)
+Written by Shaoshuai Shi
+All Rights Reserved 2019-2020.
+*/
+
+#include "cuda_utils.hpp"
+#include "postprocess/circle_nms_kernel.hpp"
+#include "utils.hpp"
+
+#include <thrust/host_vector.h>
+
+namespace
+{
+const std::size_t THREADS_PER_BLOCK_NMS = 16;
+}  // namespace
+
+namespace autoware::lidar_transfusion
+{
+
+__device__ inline float dist2dPow(const Box3D * a, const Box3D * b)
+{
+  return powf(a->x - b->x, 2) + powf(a->y - b->y, 2);
+}
+
+// cspell: ignore divup
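+// Block (col, row) compares one 16-box tile of candidates against another
+// (upper triangle only). Each thread owns one box of the row tile and sets bit
+// i of a 64-bit word when box i of the col tile lies within the distance
+// threshold, producing the N x col_blocks suppression matrix that circleNMS()
+// consumes on the host.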
+__global__ void circleNMS_Kernel(
+  const Box3D * boxes, const std::size_t num_boxes3d, const std::size_t col_blocks,
+  const float dist2d_pow_threshold, std::uint64_t * mask)
+{
+  // params: boxes (N,)
+  // params: mask (N, divup(N/THREADS_PER_BLOCK_NMS))
+
+  const auto row_start = blockIdx.y;
+  const auto col_start = blockIdx.x;
+
+  if (row_start > col_start) return;
+
+  const std::size_t row_size =
+    fminf(num_boxes3d - row_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS);
+  const std::size_t col_size =
+    fminf(num_boxes3d - col_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS);
+
+  __shared__ Box3D block_boxes[THREADS_PER_BLOCK_NMS];
+
+  if (threadIdx.x < col_size) {
+    block_boxes[threadIdx.x] = boxes[THREADS_PER_BLOCK_NMS * col_start + threadIdx.x];
+  }
+  __syncthreads();
+
+  if (threadIdx.x < row_size) {
+    const std::size_t cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;
+    const Box3D * cur_box = boxes + cur_box_idx;
+
+    std::uint64_t t = 0;
+    std::size_t start = 0;
+    if (row_start == col_start) {
+      start = threadIdx.x + 1;
+    }
+    for (std::size_t i = start; i < col_size; i++) {
+      if (dist2dPow(cur_box, block_boxes + i) < dist2d_pow_threshold) {
+        t |= 1ULL << i;
+      }
+    }
+    mask[cur_box_idx * col_blocks + col_start] = t;
+  }
+}
+
+cudaError_t circleNMS_launch(
+  const thrust::device_vector<Box3D> & boxes3d, const std::size_t num_boxes3d,
+  std::size_t col_blocks, const float distance_threshold,
+  thrust::device_vector<std::uint64_t> & mask, cudaStream_t stream)
+{
+  const float dist2d_pow_thres = powf(distance_threshold, 2);
+
+  dim3 blocks(col_blocks, col_blocks);
+  dim3 threads(THREADS_PER_BLOCK_NMS);
+  circleNMS_Kernel<<<blocks, threads, 0, stream>>>(
+    thrust::raw_pointer_cast(boxes3d.data()), num_boxes3d, col_blocks, dist2d_pow_thres,
+    thrust::raw_pointer_cast(mask.data()));
+
+  return cudaGetLastError();
+}
+
+std::size_t circleNMS(
+  thrust::device_vector<Box3D> & boxes3d, const float distance_threshold,
+  thrust::device_vector<bool> & keep_mask, cudaStream_t stream)
+{
+  const auto num_boxes3d = boxes3d.size();
+  const auto col_blocks = divup(num_boxes3d, THREADS_PER_BLOCK_NMS);
+  thrust::device_vector<std::uint64_t> mask_d(num_boxes3d * col_blocks);
+
+  CHECK_CUDA_ERROR(
+    circleNMS_launch(boxes3d, num_boxes3d, col_blocks, distance_threshold, mask_d, stream));
+
+  // memcpy device to host
+  thrust::host_vector<std::uint64_t> mask_h(mask_d.size());
+  thrust::copy(mask_d.begin(), mask_d.end(), mask_h.begin());
+  CHECK_CUDA_ERROR(cudaStreamSynchronize(stream));
+
+  // generate keep_mask
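+  // Greedy pass over the score-sorted boxes: keep a box unless an already-kept
+  // box has marked it in remv_h, then OR the kept box's mask row into remv_h so
+  // it suppresses later (lower-scoring) candidates.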
+  std::vector<std::uint64_t> remv_h(col_blocks);
+  thrust::host_vector<bool> keep_mask_h(keep_mask.size());
+  std::size_t num_to_keep = 0;
+  for (std::size_t i = 0; i < num_boxes3d; i++) {
+    auto nblock = i / THREADS_PER_BLOCK_NMS;
+    auto inblock = i % THREADS_PER_BLOCK_NMS;
+
+    if (!(remv_h[nblock] & (1ULL << inblock))) {
+      keep_mask_h[i] = true;
+      num_to_keep++;
+      std::uint64_t * p = &mask_h[0] + i * col_blocks;
+      for (std::size_t j = nblock; j < col_blocks; j++) {
+        remv_h[j] |= p[j];
+      }
+    } else {
+      keep_mask_h[i] = false;
+    }
+  }
+
+  // memcpy host to device
+  keep_mask = keep_mask_h;
+
+  return num_to_keep;
+}
+
+}  // namespace autoware::lidar_transfusion

+ 147 - 0
src/detection/detection_lidar_transfusion/postprocess/non_maximum_suppression.cpp

@@ -0,0 +1,147 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "postprocess/non_maximum_suppression.hpp"
+
+//#include <autoware/universe_utils/geometry/geometry.hpp>
+//#include <object_recognition_utils/geometry.hpp>
+//#include <object_recognition_utils/object_recognition_utils.hpp>
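+// NOTE: the IoU-based pair check and apply() below stay commented out until the
+// autoware geometry utilities included above are ported to this project.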
+
+// constant declarations
+namespace Label
+{
+static constexpr uint8_t UNKNOWN = 0u;
+static constexpr uint8_t CAR = 1u;
+static constexpr uint8_t TRUCK = 2u;
+static constexpr uint8_t BUS = 3u;
+static constexpr uint8_t TRAILER = 4u;
+static constexpr uint8_t MOTORCYCLE = 5u;
+static constexpr uint8_t BICYCLE = 6u;
+static constexpr uint8_t PEDESTRIAN = 7u;
+}  // namespace Label
+
+
+uint8_t getSemanticType(const std::string & class_name)
+{
+  if (class_name == "CAR") {
+    return Label::CAR;
+  } else if (class_name == "TRUCK") {
+    return Label::TRUCK;
+  } else if (class_name == "BUS") {
+    return Label::BUS;
+  } else if (class_name == "TRAILER") {
+    return Label::TRAILER;
+  } else if (class_name == "MOTORCYCLE") {
+    return Label::MOTORCYCLE;
+  } else if (class_name == "BICYCLE") {
+    return Label::BICYCLE;
+  } else if (class_name == "PEDESTRIAN") {
+    return Label::PEDESTRIAN;
+  } else {  // CONSTRUCTION_VEHICLE, BARRIER, TRAFFIC_CONE
+    return Label::UNKNOWN;
+  }
+}
+
+
+namespace autoware::lidar_transfusion
+{
+
+void NonMaximumSuppression::setParameters(const NMSParams & params)
+{
+  assert(params.search_distance_2d_ >= 0.0);
+  assert(params.iou_threshold_ >= 0.0 && params.iou_threshold_ <= 1.0);
+
+  params_ = params;
+  target_class_mask_ = classNamesToBooleanMask(params.target_class_names_);
+}
+
+bool NonMaximumSuppression::isTargetLabel(const uint8_t label)
+{
+  if (label >= target_class_mask_.size()) {
+    return false;
+  }
+  return target_class_mask_.at(label);
+}
+
+//bool NonMaximumSuppression::isTargetPairObject(
+//  const DetectedObject & object1, const DetectedObject & object2)
+//{
+//  const auto label1 = object_recognition_utils::getHighestProbLabel(object1.classification);
+//  const auto label2 = object_recognition_utils::getHighestProbLabel(object2.classification);
+
+//  if (isTargetLabel(label1) && isTargetLabel(label2)) {
+//    return true;
+//  }
+
+//  const auto search_sqr_dist_2d = params_.search_distance_2d_ * params_.search_distance_2d_;
+//  const auto sqr_dist_2d = autoware::universe_utils::calcSquaredDistance2d(
+//    object_recognition_utils::getPose(object1), object_recognition_utils::getPose(object2));
+//  return sqr_dist_2d <= search_sqr_dist_2d;
+//}
+
+//Eigen::MatrixXd NonMaximumSuppression::generateIoUMatrix(
+//  const std::vector<DetectedObject> & input_objects)
+//{
+//  // NOTE: row = target objects to be suppressed, col = source objects to be compared
+//  Eigen::MatrixXd triangular_matrix =
+//    Eigen::MatrixXd::Zero(input_objects.size(), input_objects.size());
+//  for (std::size_t target_i = 0; target_i < input_objects.size(); ++target_i) {
+//    for (std::size_t source_i = 0; source_i < target_i; ++source_i) {
+//      const auto & target_obj = input_objects.at(target_i);
+//      const auto & source_obj = input_objects.at(source_i);
+//      if (!isTargetPairObject(target_obj, source_obj)) {
+//        continue;
+//      }
+
+//      if (params_.nms_type_ == NMS_TYPE::IoU_BEV) {
+//        const double iou = object_recognition_utils::get2dIoU(target_obj, source_obj);
+//        triangular_matrix(target_i, source_i) = iou;
+//        // NOTE: If the target object has any objects with iou > iou_threshold, it
+//        // will be suppressed regardless of later results.
+//        if (iou > params_.iou_threshold_) {
+//          break;
+//        }
+//      }
+//    }
+//  }
+
+//  return triangular_matrix;
+//}
+
+//std::vector<DetectedObject> NonMaximumSuppression::apply(
+//  const std::vector<DetectedObject> & input_objects)
+//{
+//  Eigen::MatrixXd iou_matrix = generateIoUMatrix(input_objects);
+
+//  std::vector<DetectedObject> output_objects;
+//  output_objects.reserve(input_objects.size());
+//  for (std::size_t i = 0; i < input_objects.size(); ++i) {
+//    const auto value = iou_matrix.row(i).maxCoeff();
+//    if (params_.nms_type_ == NMS_TYPE::IoU_BEV) {
+//      if (value <= params_.iou_threshold_) {
+//        output_objects.emplace_back(input_objects.at(i));
+//      }
+//    }
+//  }
+
+//  return output_objects;
+//}
+}  // namespace autoware::lidar_transfusion

+ 145 - 0
src/detection/detection_lidar_transfusion/postprocess/postprocess_kernel.cu

@@ -0,0 +1,145 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "postprocess/circle_nms_kernel.hpp"
+#include "postprocess/postprocess_kernel.hpp"
+
+#include <thrust/count.h>
+#include <thrust/device_vector.h>
+#include <thrust/sort.h>
+
+namespace autoware::lidar_transfusion
+{
+const size_t THREADS_PER_BLOCK = 256;
+
+struct is_score_greater
+{
+  is_score_greater(float t) : t_(t) {}
+
+  __device__ bool operator()(const Box3D & b) { return b.score > t_; }
+
+private:
+  float t_{0.0};
+};
+
+struct is_kept
+{
+  __device__ bool operator()(const bool keep) { return keep; }
+};
+
+struct score_greater
+{
+  __device__ bool operator()(const Box3D & lb, const Box3D & rb) { return lb.score > rb.score; }
+};
+
+__device__ inline float sigmoid(float x)
+{
+  return 1.0f / (1.0f + expf(-x));
+}
+
+__global__ void generateBoxes3D_kernel(
+  const float * __restrict__ cls_output, const float * __restrict__ box_output,
+  const float * __restrict__ dir_cls_output, const float voxel_size_x, const float voxel_size_y,
+  const float min_x_range, const float min_y_range, const int num_proposals, const int num_classes,
+  const int num_point_values, const float * __restrict__ yaw_norm_thresholds,
+  Box3D * __restrict__ det_boxes3d)
+{
+  int point_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (point_idx >= num_proposals) {
+    return;
+  }
+
+  int class_id = 0;
+  float max_score = cls_output[point_idx];
+
+#pragma unroll
+  for (int i = 1; i < num_classes; i++) {
+    float score = cls_output[i * num_proposals + point_idx];
+    if (score > max_score) {
+      max_score = score;
+      class_id = i;
+    }
+  }
+
+  // yaw validation
+  const float yaw_sin = dir_cls_output[point_idx];
+  const float yaw_cos = dir_cls_output[point_idx + num_proposals];
+  const float yaw_norm = sqrtf(yaw_sin * yaw_sin + yaw_cos * yaw_cos);
+
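+  // Decode the proposal: x/y are predicted in feature-map cells, so scale by
+  // num_point_values (acting as the feature-map stride here) and the voxel size
+  // back to metric range; sizes are log-scale; yaw is recovered from (sin, cos).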
+  det_boxes3d[point_idx].label = class_id;
+  det_boxes3d[point_idx].score = yaw_norm >= yaw_norm_thresholds[class_id] ? max_score : 0.f;
+  det_boxes3d[point_idx].x = box_output[point_idx] * num_point_values * voxel_size_x + min_x_range;
+  det_boxes3d[point_idx].y =
+    box_output[point_idx + num_proposals] * num_point_values * voxel_size_y + min_y_range;
+  det_boxes3d[point_idx].z = box_output[point_idx + 2 * num_proposals];
+  det_boxes3d[point_idx].width = expf(box_output[point_idx + 3 * num_proposals]);
+  det_boxes3d[point_idx].length = expf(box_output[point_idx + 4 * num_proposals]);
+  det_boxes3d[point_idx].height = expf(box_output[point_idx + 5 * num_proposals]);
+  det_boxes3d[point_idx].yaw =
+    atan2f(dir_cls_output[point_idx], dir_cls_output[point_idx + num_proposals]);
+}
+
+PostprocessCuda::PostprocessCuda(const TransfusionConfig & config, cudaStream_t & stream)
+: config_(config), stream_(stream)
+{
+}
+
+// cspell: ignore divup
+cudaError_t PostprocessCuda::generateDetectedBoxes3D_launch(
+  const float * cls_output, const float * box_output, const float * dir_cls_output,
+  std::vector<Box3D> & det_boxes3d, cudaStream_t stream)
+{
+  dim3 threads = {THREADS_PER_BLOCK};
+  dim3 blocks = {divup(config_.num_proposals_, threads.x)};
+
+  auto boxes3d_d = thrust::device_vector<Box3D>(config_.num_proposals_);
+  auto yaw_norm_thresholds_d = thrust::device_vector<float>(
+    config_.yaw_norm_thresholds_.begin(), config_.yaw_norm_thresholds_.end());
+
+  generateBoxes3D_kernel<<<blocks, threads, 0, stream>>>(
+    cls_output, box_output, dir_cls_output, config_.voxel_x_size_, config_.voxel_y_size_,
+    config_.min_x_range_, config_.min_y_range_, config_.num_proposals_, config_.num_classes_,
+    config_.num_point_values_, thrust::raw_pointer_cast(yaw_norm_thresholds_d.data()),
+    thrust::raw_pointer_cast(boxes3d_d.data()));
+
+  // suppress by score
+  const auto num_det_boxes3d = thrust::count_if(
+    thrust::device, boxes3d_d.begin(), boxes3d_d.end(), is_score_greater(config_.score_threshold_));
+  if (num_det_boxes3d == 0) {
+    return cudaGetLastError();
+  }
+  thrust::device_vector<Box3D> det_boxes3d_d(num_det_boxes3d);
+  thrust::copy_if(
+    thrust::device, boxes3d_d.begin(), boxes3d_d.end(), det_boxes3d_d.begin(),
+    is_score_greater(config_.score_threshold_));
+
+  // sort by score
+  thrust::sort(det_boxes3d_d.begin(), det_boxes3d_d.end(), score_greater());
+
+  // suppress by NMS
+  thrust::device_vector<bool> final_keep_mask_d(num_det_boxes3d);
+  const auto num_final_det_boxes3d =
+    circleNMS(det_boxes3d_d, config_.circle_nms_dist_threshold_, final_keep_mask_d, stream);
+  thrust::device_vector<Box3D> final_det_boxes3d_d(num_final_det_boxes3d);
+  thrust::copy_if(
+    thrust::device, det_boxes3d_d.begin(), det_boxes3d_d.end(), final_keep_mask_d.begin(),
+    final_det_boxes3d_d.begin(), is_kept());
+
+  // memcpy device to host
+  det_boxes3d.resize(num_final_det_boxes3d);
+  thrust::copy(final_det_boxes3d_d.begin(), final_det_boxes3d_d.end(), det_boxes3d.begin());
+  return cudaGetLastError();
+}
+
+}  // namespace autoware::lidar_transfusion

+ 116 - 0
src/detection/detection_lidar_transfusion/preprocess/pointcloud_densification.cpp

@@ -0,0 +1,116 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "autoware/lidar_transfusion/preprocess/pointcloud_densification.hpp"
+
+#include <pcl_ros/transforms.hpp>
+
+#include <pcl_conversions/pcl_conversions.h>
+#ifdef ROS_DISTRO_GALACTIC
+#include <tf2_eigen/tf2_eigen.h>
+#else
+#include <tf2_eigen/tf2_eigen.hpp>
+#endif
+
+#include <boost/optional.hpp>
+
+#include <string>
+#include <utility>
+
+namespace
+{
+
+boost::optional<geometry_msgs::msg::Transform> getTransform(
+  const tf2_ros::Buffer & tf_buffer, const std::string & target_frame_id,
+  const std::string & source_frame_id, const rclcpp::Time & time)
+{
+  try {
+    geometry_msgs::msg::TransformStamped transform_stamped;
+    transform_stamped = tf_buffer.lookupTransform(
+      target_frame_id, source_frame_id, time, rclcpp::Duration::from_seconds(0.5));
+    return transform_stamped.transform;
+  } catch (tf2::TransformException & ex) {
+    RCLCPP_WARN_STREAM(rclcpp::get_logger("lidar_transfusion"), ex.what());
+    return boost::none;
+  }
+}
+
+Eigen::Affine3f transformToEigen(const geometry_msgs::msg::Transform & t)
+{
+  Eigen::Affine3f a;
+  a.matrix() = tf2::transformToEigen(t).matrix().cast<float>();
+  return a;
+}
+
+}  // namespace
+
+namespace autoware::lidar_transfusion
+{
+
+PointCloudDensification::PointCloudDensification(
+  const DensificationParam & param, cudaStream_t & stream)
+: param_(param), stream_(stream)
+{
+}
+
+bool PointCloudDensification::enqueuePointCloud(
+  const sensor_msgs::msg::PointCloud2 & pointcloud_msg, const tf2_ros::Buffer & tf_buffer)
+{
+  const auto header = pointcloud_msg.header;
+
+  if (param_.pointcloud_cache_size() > 1) {
+    auto transform_world2current =
+      getTransform(tf_buffer, header.frame_id, param_.world_frame_id(), header.stamp);
+    if (!transform_world2current) {
+      return false;
+    }
+    auto affine_world2current = transformToEigen(transform_world2current.get());
+
+    enqueue(pointcloud_msg, affine_world2current);
+  } else {
+    enqueue(pointcloud_msg, Eigen::Affine3f::Identity());
+  }
+
+  dequeue();
+
+  return true;
+}
+
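+// Copy the raw cloud to device memory asynchronously and cache it together with
+// the inverse transform (past frame -> world), so later calls can re-project
+// this sweep into the then-current frame.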
+void PointCloudDensification::enqueue(
+  const sensor_msgs::msg::PointCloud2 & msg, const Eigen::Affine3f & affine_world2current)
+{
+  affine_world2current_ = affine_world2current;
+  current_timestamp_ = rclcpp::Time(msg.header.stamp).seconds();
+
+  auto data_d = cuda::make_unique<uint8_t[]>(msg.width * msg.height * msg.point_step);
+
+  CHECK_CUDA_ERROR(cudaMemcpyAsync(
+    data_d.get(), msg.data.data(), sizeof(uint8_t) * msg.width * msg.height * msg.point_step,
+    cudaMemcpyHostToDevice, stream_));
+
+  PointCloudWithTransform pointcloud = {
+    std::move(data_d), msg.header, msg.width * msg.height, affine_world2current.inverse()};
+
+  pointcloud_cache_.push_front(std::move(pointcloud));
+}
+
+void PointCloudDensification::dequeue()
+{
+  if (pointcloud_cache_.size() > param_.pointcloud_cache_size()) {
+    pointcloud_cache_.pop_back();
+  }
+}
+
+}  // namespace autoware::lidar_transfusion

+ 221 - 0
src/detection/detection_lidar_transfusion/preprocess/preprocess_kernel.cu

@@ -0,0 +1,221 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES.
+ * All rights reserved. SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "autoware/lidar_transfusion/cuda_utils.hpp"
+#include "autoware/lidar_transfusion/preprocess/preprocess_kernel.hpp"
+
+#include <cstdint>
+
+namespace autoware::lidar_transfusion
+{
+
+PreprocessCuda::PreprocessCuda(const TransfusionConfig & config, cudaStream_t & stream)
+: stream_(stream), config_(config)
+{
+  mask_size_ = config_.grid_z_size_ * config_.grid_y_size_ * config_.grid_x_size_;
+  voxels_size_ = config_.grid_z_size_ * config_.grid_y_size_ * config_.grid_x_size_ *
+                   config_.max_num_points_per_pillar_ * config_.num_point_feature_size_ +
+                 1;
+  mask_ = cuda::make_unique<unsigned int[]>(mask_size_);
+  voxels_ = cuda::make_unique<float[]>(voxels_size_);
+}
+
+void PreprocessCuda::generateVoxels(
+  float * points, unsigned int points_size, unsigned int * pillar_num, float * voxel_features,
+  unsigned int * voxel_num, unsigned int * voxel_idxs)
+{
+  cuda::clear_async(mask_.get(), mask_size_, stream_);
+  cuda::clear_async(voxels_.get(), voxels_size_, stream_);
+  CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_));
+
+  CHECK_CUDA_ERROR(generateVoxels_random_launch(points, points_size, mask_.get(), voxels_.get()));
+  CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_));
+
+  CHECK_CUDA_ERROR(generateBaseFeatures_launch(
+    mask_.get(), voxels_.get(), pillar_num, voxel_features, voxel_num, voxel_idxs));
+  CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_));
+}
+
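+// One thread per input point; each point carries 5 features (x, y, z,
+// intensity, time_lag). A point is hashed to its 2D pillar cell and appended
+// via atomicAdd on the cell counter; points beyond points_per_voxel are dropped.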
+__global__ void generateVoxels_random_kernel(
+  float * points, unsigned int points_size, float min_x_range, float max_x_range, float min_y_range,
+  float max_y_range, float min_z_range, float max_z_range, float pillar_x_size, float pillar_y_size,
+  float pillar_z_size, int grid_y_size, int grid_x_size, int points_per_voxel, unsigned int * mask,
+  float * voxels)
+{
+  int point_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (point_idx >= points_size) return;
+
+  float x = points[point_idx * 5];
+  float y = points[point_idx * 5 + 1];
+  float z = points[point_idx * 5 + 2];
+  float i = points[point_idx * 5 + 3];
+  float t = points[point_idx * 5 + 4];
+
+  if (
+    x <= min_x_range || x >= max_x_range || y <= min_y_range || y >= max_y_range ||
+    z <= min_z_range || z >= max_z_range)
+    return;
+
+  int voxel_idx = floorf((x - min_x_range) / pillar_x_size);
+  int voxel_idy = floorf((y - min_y_range) / pillar_y_size);
+  unsigned int voxel_index = voxel_idy * grid_x_size + voxel_idx;
+
+  unsigned int point_id = atomicAdd(&(mask[voxel_index]), 1);
+
+  if (point_id >= points_per_voxel) return;
+  float * address = voxels + (voxel_index * points_per_voxel + point_id) * 5;
+  atomicExch(address + 0, x);
+  atomicExch(address + 1, y);
+  atomicExch(address + 2, z);
+  atomicExch(address + 3, i);
+  atomicExch(address + 4, t);
+}
+
+cudaError_t PreprocessCuda::generateVoxels_random_launch(
+  float * points, unsigned int points_size, unsigned int * mask, float * voxels)
+{
+  if (points_size == 0) {
+    return cudaGetLastError();
+  }
+  dim3 blocks(divup(points_size, config_.threads_for_voxel_));
+  dim3 threads(config_.threads_for_voxel_);
+
+  generateVoxels_random_kernel<<<blocks, threads, 0, stream_>>>(
+    points, points_size, config_.min_x_range_, config_.max_x_range_, config_.min_y_range_,
+    config_.max_y_range_, config_.min_z_range_, config_.max_z_range_, config_.voxel_x_size_,
+    config_.voxel_y_size_, config_.voxel_z_size_, config_.grid_y_size_, config_.grid_x_size_,
+    config_.points_per_voxel_, mask, voxels);
+  cudaError_t err = cudaGetLastError();
+  return err;
+}
+
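+// Compact the dense 2D grid into a pillar list: each non-empty cell claims a
+// slot via atomicAdd on pillar_num, copies its (clamped) points into
+// voxel_features, records its cell coordinates, and resets its mask counter.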
+__global__ void generateBaseFeatures_kernel(
+  unsigned int * mask, float * voxels, int grid_y_size, int grid_x_size, float points_per_voxel,
+  float max_voxels, unsigned int * pillar_num, float * voxel_features, unsigned int * voxel_num,
+  unsigned int * voxel_idxs)
+{
+  unsigned int voxel_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  unsigned int voxel_idy = blockIdx.y * blockDim.y + threadIdx.y;
+
+  if (voxel_idx >= grid_x_size || voxel_idy >= grid_y_size) return;
+
+  unsigned int voxel_index = voxel_idy * grid_x_size + voxel_idx;
+  unsigned int count = mask[voxel_index];
+  if (!(count > 0)) return;
+  count = count < points_per_voxel ? count : points_per_voxel;
+
+  unsigned int current_pillarId = 0;
+  current_pillarId = atomicAdd(pillar_num, 1);
+  if (current_pillarId >= max_voxels) return;
+
+  voxel_num[current_pillarId] = count;
+
+  uint4 idx = {0, 0, voxel_idy, voxel_idx};
+  ((uint4 *)voxel_idxs)[current_pillarId] = idx;
+
+  for (int i = 0; i < count; i++) {
+    int inIndex = voxel_index * points_per_voxel + i;
+    int outIndex = current_pillarId * points_per_voxel + i;
+    voxel_features[outIndex * 5] = voxels[inIndex * 5];
+    voxel_features[outIndex * 5 + 1] = voxels[inIndex * 5 + 1];
+    voxel_features[outIndex * 5 + 2] = voxels[inIndex * 5 + 2];
+    voxel_features[outIndex * 5 + 3] = voxels[inIndex * 5 + 3];
+    voxel_features[outIndex * 5 + 4] = voxels[inIndex * 5 + 4];
+  }
+
+  // clear buffer for next infer
+  atomicExch(mask + voxel_index, 0);
+}
+
+// Collect per-pillar features (x, y, z, intensity, time_lag) and pillar indices
+cudaError_t PreprocessCuda::generateBaseFeatures_launch(
+  unsigned int * mask, float * voxels, unsigned int * pillar_num, float * voxel_features,
+  unsigned int * voxel_num, unsigned int * voxel_idxs)
+{
+  dim3 threads = {32, 32};
+  dim3 blocks = {divup(config_.grid_x_size_, threads.x), divup(config_.grid_y_size_, threads.y)};
+
+  generateBaseFeatures_kernel<<<blocks, threads, 0, stream_>>>(
+    mask, voxels, config_.grid_y_size_, config_.grid_x_size_, config_.points_per_voxel_,
+    config_.max_voxels_, pillar_num, voxel_features, voxel_num, voxel_idxs);
+  cudaError_t err = cudaGetLastError();
+  return err;
+}
+
+__global__ void generateSweepPoints_kernel(
+  const uint8_t * input_data, size_t points_size, int input_point_step, float time_lag,
+  const float * transform_array, int num_features, float * output_points)
+{
+  int point_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (point_idx >= points_size) return;
+
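+  // Reassemble float32 x/y/z from raw bytes at offsets 0/4/8 (little-endian
+  // layout assumed) and read the uint8 intensity at byte 12, then apply the
+  // column-major 4x4 affine below to map the point into the current frame.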
+  union {
+    uint32_t raw{0};
+    float value;
+  } input_x, input_y, input_z;
+
+#pragma unroll
+  for (int i = 0; i < 4; i++) {  // 4 bytes for float32
+    input_x.raw |= input_data[point_idx * input_point_step + i] << i * 8;
+    input_y.raw |= input_data[point_idx * input_point_step + i + 4] << i * 8;
+    input_z.raw |= input_data[point_idx * input_point_step + i + 8] << i * 8;
+  }
+
+  float input_intensity = static_cast<float>(input_data[point_idx * input_point_step + 12]);
+
+  output_points[point_idx * num_features] =
+    transform_array[0] * input_x.value + transform_array[4] * input_y.value +
+    transform_array[8] * input_z.value + transform_array[12];
+  output_points[point_idx * num_features + 1] =
+    transform_array[1] * input_x.value + transform_array[5] * input_y.value +
+    transform_array[9] * input_z.value + transform_array[13];
+  output_points[point_idx * num_features + 2] =
+    transform_array[2] * input_x.value + transform_array[6] * input_y.value +
+    transform_array[10] * input_z.value + transform_array[14];
+  output_points[point_idx * num_features + 3] = input_intensity;
+  output_points[point_idx * num_features + 4] = time_lag;
+}
+
+cudaError_t PreprocessCuda::generateSweepPoints_launch(
+  const uint8_t * input_data, size_t points_size, int input_point_step, float time_lag,
+  const float * transform_array, float * output_points)
+{
+  dim3 blocks(divup(points_size, config_.threads_for_voxel_));
+  dim3 threads(config_.threads_for_voxel_);
+
+  generateSweepPoints_kernel<<<blocks, threads, 0, stream_>>>(
+    input_data, points_size, input_point_step, time_lag, transform_array,
+    config_.num_point_feature_size_, output_points);
+
+  cudaError_t err = cudaGetLastError();
+  return err;
+}
+
+}  // namespace autoware::lidar_transfusion

+ 123 - 0
src/detection/detection_lidar_transfusion/preprocess/voxel_generator.cpp

@@ -0,0 +1,123 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "autoware/lidar_transfusion/preprocess/voxel_generator.hpp"
+
+#include "autoware/lidar_transfusion/preprocess/preprocess_kernel.hpp"
+
+#include <sensor_msgs/point_cloud2_iterator.hpp>
+
+#include <type_traits>
+
+namespace autoware::lidar_transfusion
+{
+
+VoxelGenerator::VoxelGenerator(
+  const DensificationParam & densification_param, const TransfusionConfig & config,
+  cudaStream_t & stream)
+: config_(config), stream_(stream)
+{
+  pd_ptr_ = std::make_unique<PointCloudDensification>(densification_param, stream_);
+  pre_ptr_ = std::make_unique<PreprocessCuda>(config_, stream_);
+  cloud_data_d_ = cuda::make_unique<unsigned char[]>(config_.cloud_capacity_ * MAX_CLOUD_STEP_SIZE);
+  affine_past2current_d_ = cuda::make_unique<float[]>(AFF_MAT_SIZE);
+}
+
+bool VoxelGenerator::enqueuePointCloud(
+  const sensor_msgs::msg::PointCloud2 & msg, const tf2_ros::Buffer & tf_buffer)
+{
+  return pd_ptr_->enqueuePointCloud(msg, tf_buffer);
+}
+
+std::size_t VoxelGenerator::generateSweepPoints(
+  const sensor_msgs::msg::PointCloud2 & msg, cuda::unique_ptr<float[]> & points_d)
+{
+  if (!is_initialized_) {
+    initCloudInfo(msg);
+    std::stringstream ss;
+    ss << "Input point cloud information: " << std::endl << cloud_info_;
+    RCLCPP_DEBUG_STREAM(rclcpp::get_logger("lidar_transfusion"), ss.str());
+
+    CloudInfo default_cloud_info;
+    if (cloud_info_ != default_cloud_info) {
+      ss << "Expected point cloud information: " << std::endl << default_cloud_info;
+      RCLCPP_ERROR_STREAM(rclcpp::get_logger("lidar_transfusion"), ss.str());
+      throw std::runtime_error("Input point cloud has unsupported format");
+    }
+    is_initialized_ = true;
+  }
+
+  size_t point_counter{0};
+  CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_));
+
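+  // Walk the densification cache from the newest sweep backwards, transforming
+  // each cached cloud into the current frame and appending it to points_d until
+  // the configured cloud capacity would be exceeded.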
+  for (auto pc_cache_iter = pd_ptr_->getPointCloudCacheIter(); !pd_ptr_->isCacheEnd(pc_cache_iter);
+       pc_cache_iter++) {
+    auto sweep_num_points = pc_cache_iter->num_points;
+    if (point_counter + sweep_num_points >= config_.cloud_capacity_) {
+      RCLCPP_WARN_STREAM(
+        rclcpp::get_logger("lidar_transfusion"), "Exceeding cloud capacity. Used "
+                                                   << pd_ptr_->getIdx(pc_cache_iter) << " out of "
+                                                   << pd_ptr_->getCacheSize() << " sweep(s)");
+      break;
+    }
+    auto shift = point_counter * config_.num_point_feature_size_;
+
+    auto affine_past2current =
+      pd_ptr_->getAffineWorldToCurrent() * pc_cache_iter->affine_past2world;
+    static_assert(std::is_same<decltype(affine_past2current.matrix()), Eigen::Matrix4f &>::value);
+    static_assert(!Eigen::Matrix4f::IsRowMajor, "matrices should be col-major.");
+
+    float time_lag = static_cast<float>(
+      pd_ptr_->getCurrentTimestamp() - rclcpp::Time(pc_cache_iter->header.stamp).seconds());
+
+    CHECK_CUDA_ERROR(cudaMemcpyAsync(
+      affine_past2current_d_.get(), affine_past2current.data(), AFF_MAT_SIZE * sizeof(float),
+      cudaMemcpyHostToDevice, stream_));
+    CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_));
+
+    pre_ptr_->generateSweepPoints_launch(
+      pc_cache_iter->data_d.get(), sweep_num_points, cloud_info_.point_step, time_lag,
+      affine_past2current_d_.get(), points_d.get() + shift);
+    point_counter += sweep_num_points;
+  }
+
+  return point_counter;
+}
+
+void VoxelGenerator::initCloudInfo(const sensor_msgs::msg::PointCloud2 & msg)
+{
+  std::tie(cloud_info_.x_offset, cloud_info_.x_datatype, cloud_info_.x_count) =
+    getFieldInfo(msg, "x");
+  std::tie(cloud_info_.y_offset, cloud_info_.y_datatype, cloud_info_.y_count) =
+    getFieldInfo(msg, "y");
+  std::tie(cloud_info_.z_offset, cloud_info_.z_datatype, cloud_info_.z_count) =
+    getFieldInfo(msg, "z");
+  std::tie(
+    cloud_info_.intensity_offset, cloud_info_.intensity_datatype, cloud_info_.intensity_count) =
+    getFieldInfo(msg, "intensity");
+  cloud_info_.point_step = msg.point_step;
+  cloud_info_.is_bigendian = msg.is_bigendian;
+}
+
+std::tuple<const uint32_t, const uint8_t, const uint8_t> VoxelGenerator::getFieldInfo(
+  const sensor_msgs::msg::PointCloud2 & msg, const std::string & field_name)
+{
+  for (const auto & field : msg.fields) {
+    if (field.name == field_name) {
+      return std::make_tuple(field.offset, field.datatype, field.count);
+    }
+  }
+  throw std::runtime_error("Missing field: " + field_name);
+}
+}  // namespace autoware::lidar_transfusion

+ 132 - 0
src/detection/detection_lidar_transfusion/simple_profiler.cpp

@@ -0,0 +1,132 @@
+// Copyright 2023 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <tensorrt_common/simple_profiler.hpp>
+
+#include <iomanip>
+
+namespace tensorrt_common
+{
+
+SimpleProfiler::SimpleProfiler(std::string name, const std::vector<SimpleProfiler> & src_profilers)
+: m_name(name)
+{
+  m_index = 0;
+  for (const auto & src_profiler : src_profilers) {
+    for (const auto & rec : src_profiler.m_profile) {
+      auto it = m_profile.find(rec.first);
+      if (it == m_profile.end()) {
+        m_profile.insert(rec);
+      } else {
+        it->second.time += rec.second.time;
+        it->second.count += rec.second.count;
+      }
+    }
+  }
+}
+
+void SimpleProfiler::reportLayerTime(const char * layerName, float ms) noexcept
+{
+  m_profile[layerName].count++;
+  m_profile[layerName].time += ms;
+  if (m_profile[layerName].min_time == -1.0) {
+    m_profile[layerName].min_time = ms;
+    m_profile[layerName].index = m_index;
+    m_index++;
+  } else if (m_profile[layerName].min_time > ms) {
+    m_profile[layerName].min_time = ms;
+  }
+}
+
+void SimpleProfiler::setProfDict(nvinfer1::ILayer * layer) noexcept
+{
+  std::string name = layer->getName();
+  m_layer_dict[name].type = layer->getType();
+  if (layer->getType() == nvinfer1::LayerType::kCONVOLUTION) {
+    nvinfer1::IConvolutionLayer * conv = (nvinfer1::IConvolutionLayer *)layer;
+    nvinfer1::ITensor * in = layer->getInput(0);
+    nvinfer1::Dims dim_in = in->getDimensions();
+    nvinfer1::ITensor * out = layer->getOutput(0);
+    nvinfer1::Dims dim_out = out->getDimensions();
+    nvinfer1::Dims k_dims = conv->getKernelSizeNd();
+    nvinfer1::Dims s_dims = conv->getStrideNd();
+    int groups = conv->getNbGroups();
+    int stride = s_dims.d[0];
+    int kernel = k_dims.d[0];
+    m_layer_dict[name].in_c = dim_in.d[1];
+    m_layer_dict[name].out_c = dim_out.d[1];
+    m_layer_dict[name].w = dim_in.d[3];
+    m_layer_dict[name].h = dim_in.d[2];
+    m_layer_dict[name].k = kernel;
+    m_layer_dict[name].stride = stride;
+    m_layer_dict[name].groups = groups;
+  }
+}
+
+std::ostream & operator<<(std::ostream & out, const SimpleProfiler & value)
+{
+  out << "========== " << value.m_name << " profile ==========" << std::endl;
+  float totalTime = 0;
+  std::string layerNameStr = "Operation";
+
+  int maxLayerNameLength = static_cast<int>(layerNameStr.size());
+  for (const auto & elem : value.m_profile) {
+    totalTime += elem.second.time;
+    maxLayerNameLength = std::max(maxLayerNameLength, static_cast<int>(elem.first.size()));
+  }
+
+  auto old_settings = out.flags();
+  auto old_precision = out.precision();
+  // Output header
+  {
+    out << "index, " << std::setw(12);
+    out << std::setw(maxLayerNameLength) << layerNameStr << " ";
+    out << std::setw(12) << "Runtime"
+        << "%,"
+        << " ";
+    out << std::setw(12) << "Invocations"
+        << " , ";
+    out << std::setw(12) << "Runtime[ms]"
+        << " , ";
+    out << std::setw(12) << "Avg Runtime[ms]"
+        << " ,";
+    out << std::setw(12) << "Min Runtime[ms]" << std::endl;
+  }
+  int index = value.m_index;
+  for (int i = 0; i < index; i++) {
+    for (const auto & elem : value.m_profile) {
+      if (elem.second.index == i) {
+        out << i << ",   ";
+        out << std::setw(maxLayerNameLength) << elem.first << ",";
+        out << std::setw(12) << std::fixed << std::setprecision(1)
+            << (elem.second.time * 100.0F / totalTime) << "%"
+            << ",";
+        out << std::setw(12) << elem.second.count << ",";
+        out << std::setw(12) << std::fixed << std::setprecision(2) << elem.second.time << ", ";
+        out << std::setw(12) << std::fixed << std::setprecision(2)
+            << elem.second.time / elem.second.count << ", ";
+        out << std::setw(12) << std::fixed << std::setprecision(2) << elem.second.min_time
+            << std::endl;
+      }
+    }
+  }
+  out.flags(old_settings);
+  out.precision(old_precision);
+  out << "========== " << value.m_name << " total runtime = " << totalTime
+      << " ms ==========" << std::endl;
+  return out;
+}
+}  // namespace tensorrt_common

+ 605 - 0
src/detection/detection_lidar_transfusion/tensorrt_common.cpp

@@ -0,0 +1,605 @@
+// Copyright 2023 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <tensorrt_common/tensorrt_common.hpp>
+
+#include <NvInferPlugin.h>
+#include <dlfcn.h>
+
+#include <fstream>
+#include <functional>
+#include <iostream>
+#include <memory>
+#include <string>
+#include <utility>
+
+namespace
+{
+template <class T>
+bool contain(const std::string & s, const T & v)
+{
+  return s.find(v) != std::string::npos;
+}
+}  // anonymous namespace
+
+namespace tensorrt_common
+{
+nvinfer1::Dims get_input_dims(const std::string & onnx_file_path)
+{
+  Logger logger_;
+  auto builder = TrtUniquePtr<nvinfer1::IBuilder>(nvinfer1::createInferBuilder(logger_));
+  if (!builder) {
+    logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to create builder");
+  }
+
+  const auto explicitBatch =
+    1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
+
+  auto network =
+    TrtUniquePtr<nvinfer1::INetworkDefinition>(builder->createNetworkV2(explicitBatch));
+  if (!network) {
+    logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to create network");
+  }
+
+  auto config = TrtUniquePtr<nvinfer1::IBuilderConfig>(builder->createBuilderConfig());
+  if (!config) {
+    logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to create builder config");
+  }
+
+  auto parser = TrtUniquePtr<nvonnxparser::IParser>(nvonnxparser::createParser(*network, logger_));
+  if (!parser->parseFromFile(
+        onnx_file_path.c_str(), static_cast<int>(nvinfer1::ILogger::Severity::kERROR))) {
+    logger_.log(nvinfer1::ILogger::Severity::kERROR, "Failed to parse onnx file");
+  }
+
+  const auto input = network->getInput(0);
+  return input->getDimensions();
+}
+
+bool is_valid_precision_string(const std::string & precision)
+{
+  if (
+    std::find(valid_precisions.begin(), valid_precisions.end(), precision) ==
+    valid_precisions.end()) {
+    std::stringstream message;
+    message << "Invalid precision was specified: " << precision << std::endl
+            << "Valid string is one of: [";
+    for (const auto & s : valid_precisions) {
+      message << s << ", ";
+    }
+    message << "] (case sensitive)" << std::endl;
+    std::cerr << message.str();
+    return false;
+  } else {
+    return true;
+  }
+}
+
+TrtCommon::TrtCommon(
+  const std::string & model_path, const std::string & precision,
+  std::unique_ptr<nvinfer1::IInt8Calibrator> calibrator, const BatchConfig & batch_config,
+  const size_t max_workspace_size, const BuildConfig & build_config,
+  const std::vector<std::string> & plugin_paths)
+: model_file_path_(model_path),
+  calibrator_(std::move(calibrator)),
+  precision_(precision),
+  batch_config_(batch_config),
+  max_workspace_size_(max_workspace_size),
+  model_profiler_("Model"),
+  host_profiler_("Host")
+{
+  // Check given precision is valid one
+  if (!is_valid_precision_string(precision)) {
+    return;
+  }
+  build_config_ = std::make_unique<const BuildConfig>(build_config);
+
+  for (const auto & plugin_path : plugin_paths) {
+    int32_t flags{RTLD_LAZY};
+// cspell: ignore asan
+#if ENABLE_ASAN
+    // https://github.com/google/sanitizers/issues/89
+    // asan doesn't handle module unloading correctly and there are no plans on doing
+    // so. In order to get proper stack traces, don't delete the shared library on
+    // close so that asan can resolve the symbols correctly.
+    flags |= RTLD_NODELETE;
+#endif  // ENABLE_ASAN
+    void * handle = dlopen(plugin_path.c_str(), flags);
+    if (!handle) {
+      logger_.log(nvinfer1::ILogger::Severity::kERROR, "Could not load plugin library");
+    }
+  }
+  runtime_ = TrtUniquePtr<nvinfer1::IRuntime>(nvinfer1::createInferRuntime(logger_));
+  if (build_config_->dla_core_id != -1) {
+    runtime_->setDLACore(build_config_->dla_core_id);
+  }
+  initLibNvInferPlugins(&logger_, "");
+}
+
+TrtCommon::~TrtCommon()
+{
+}
+
+void TrtCommon::setup()
+{
+  if (!fs::exists(model_file_path_)) {
+    is_initialized_ = false;
+    return;
+  }
+  std::string engine_path = model_file_path_;
+  if (model_file_path_.extension() == ".engine") {
+    std::cout << "Load ... " << model_file_path_ << std::endl;
+    loadEngine(model_file_path_);
+  } else if (model_file_path_.extension() == ".onnx") {
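+    // Derive a cache filename next to the ONNX model that encodes the build
+    // options (e.g. "<model>.DLA0-MinMax-int8-batch1.engine") so engines built
+    // with different options do not collide.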
+    fs::path cache_engine_path{model_file_path_};
+    std::string ext;
+    std::string calib_name = "";
+    if (precision_ == "int8") {
+      if (build_config_->calib_type_str == "Entropy") {
+        calib_name = "EntropyV2-";
+      } else if (
+        build_config_->calib_type_str == "Legacy" ||
+        build_config_->calib_type_str == "Percentile") {
+        calib_name = "Legacy-";
+      } else {
+        calib_name = "MinMax-";
+      }
+    }
+    if (build_config_->dla_core_id != -1) {
+      ext = "DLA" + std::to_string(build_config_->dla_core_id) + "-" + calib_name + precision_;
+      if (build_config_->quantize_first_layer) {
+        ext += "-firstFP16";
+      }
+      if (build_config_->quantize_last_layer) {
+        ext += "-lastFP16";
+      }
+      ext += "-batch" + std::to_string(batch_config_[0]) + ".engine";
+    } else {
+      ext = calib_name + precision_;
+      if (build_config_->quantize_first_layer) {
+        ext += "-firstFP16";
+      }
+      if (build_config_->quantize_last_layer) {
+        ext += "-lastFP16";
+      }
+      ext += "-batch" + std::to_string(batch_config_[0]) + ".engine";
+    }
+    cache_engine_path.replace_extension(ext);
+
+    // Output Network Information
+    printNetworkInfo(model_file_path_);
+
+    if (fs::exists(cache_engine_path)) {
+      std::cout << "Loading... " << cache_engine_path << std::endl;
+      loadEngine(cache_engine_path);
+    } else {
+      std::cout << "Building... " << cache_engine_path << std::endl;
+      logger_.log(nvinfer1::ILogger::Severity::kINFO, "Start build engine");
+      auto log_thread = logger_.log_throttle(
+        nvinfer1::ILogger::Severity::kINFO,
+        "Applying optimizations and building TRT CUDA engine. Please wait for a few minutes...", 5);
+      buildEngineFromOnnx(model_file_path_, cache_engine_path);
+      logger_.stop_throttle(log_thread);
+      logger_.log(nvinfer1::ILogger::Severity::kINFO, "End build engine");
+    }
+    engine_path = cache_engine_path;
+  } else {
+    is_initialized_ = false;
+    return;
+  }
+
+  if (!engine_) {
+    logger_.log(nvinfer1::ILogger::Severity::kERROR, "Engine is not initialized");
+    is_initialized_ = false;
+    return;
+  }
+  context_ = TrtUniquePtr<nvinfer1::IExecutionContext>(engine_->createExecutionContext());
+  if (!context_) {
+    logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to create context");
+    is_initialized_ = false;
+    return;
+  }
+
+  if (build_config_->profile_per_layer) {
+    context_->setProfiler(&model_profiler_);
+  }
+#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSORRT_PATCH >= 8200
+  // Write profiles for trt-engine-explorer
+  // See: https://github.com/NVIDIA/TensorRT/tree/main/tools/experimental/trt-engine-explorer
+  std::string j_ext = ".json";
+  fs::path json_path{engine_path};
+  json_path.replace_extension(j_ext);
+  std::string ret = getLayerInformation(nvinfer1::LayerInformationFormat::kJSON);
+  std::ofstream os(json_path, std::ofstream::trunc);
+  os << ret << std::flush;
+#endif
+
+  is_initialized_ = true;
+}
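+
+// Usage sketch (illustrative; the BatchConfig, workspace size and BuildConfig
+// arguments below are assumptions, not values fixed by this file):
+//   tensorrt_common::TrtCommon trt(
+//     "model.onnx", "fp16", nullptr, {1, 1, 1}, 1ULL << 30, {});
+//   trt.setup();  // loads the cached engine or builds one next to the .onnx
+//   if (!trt.isInitialized()) { /* handle failure */ }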
+
+bool TrtCommon::loadEngine(const std::string & engine_file_path)
+{
+  std::ifstream engine_file(engine_file_path, std::ios::binary);
+  if (!engine_file) {
+    logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to open engine file");
+    return false;
+  }
+  std::stringstream engine_buffer;
+  engine_buffer << engine_file.rdbuf();
+  std::string engine_str = engine_buffer.str();
+  engine_ = TrtUniquePtr<nvinfer1::ICudaEngine>(runtime_->deserializeCudaEngine(
+    reinterpret_cast<const void *>(engine_str.data()), engine_str.size()));
+  return engine_ != nullptr;
+}
+
+void TrtCommon::printNetworkInfo(const std::string & onnx_file_path)
+{
+  auto builder = TrtUniquePtr<nvinfer1::IBuilder>(nvinfer1::createInferBuilder(logger_));
+  if (!builder) {
+    logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to create builder");
+    return;
+  }
+
+  const auto explicitBatch =
+    1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
+
+  auto network =
+    TrtUniquePtr<nvinfer1::INetworkDefinition>(builder->createNetworkV2(explicitBatch));
+  if (!network) {
+    logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to create network");
+    return;
+  }
+
+  auto config = TrtUniquePtr<nvinfer1::IBuilderConfig>(builder->createBuilderConfig());
+  if (!config) {
+    logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to create builder config");
+    return;
+  }
+
+  if (precision_ == "fp16" || precision_ == "int8") {
+    config->setFlag(nvinfer1::BuilderFlag::kFP16);
+  }
+#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSORRT_PATCH >= 8400
+  config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, max_workspace_size_);
+#else
+  config->setMaxWorkspaceSize(max_workspace_size_);
+#endif
+
+  auto parser = TrtUniquePtr<nvonnxparser::IParser>(nvonnxparser::createParser(*network, logger_));
+  if (!parser->parseFromFile(
+        onnx_file_path.c_str(), static_cast<int>(nvinfer1::ILogger::Severity::kERROR))) {
+    logger_.log(nvinfer1::ILogger::Severity::kERROR, "Failed to parse onnx file");
+    return;
+  }
+  int num = network->getNbLayers();
+  float total_gflops = 0.0;
+  int total_params = 0;
+  for (int i = 0; i < num; i++) {
+    nvinfer1::ILayer * layer = network->getLayer(i);
+    auto layer_type = layer->getType();
+    if (build_config_->profile_per_layer) {
+      model_profiler_.setProfDict(layer);
+    }
+    if (layer_type == nvinfer1::LayerType::kCONSTANT) {
+      continue;
+    }
+    nvinfer1::ITensor * in = layer->getInput(0);
+    nvinfer1::Dims dim_in = in->getDimensions();
+    nvinfer1::ITensor * out = layer->getOutput(0);
+    nvinfer1::Dims dim_out = out->getDimensions();
+
+    if (layer_type == nvinfer1::LayerType::kCONVOLUTION) {
+      nvinfer1::IConvolutionLayer * conv = static_cast<nvinfer1::IConvolutionLayer *>(layer);
+      nvinfer1::Dims k_dims = conv->getKernelSizeNd();
+      nvinfer1::Dims s_dims = conv->getStrideNd();
+      int groups = conv->getNbGroups();
+      int stride = s_dims.d[0];
+      int num_weights = (dim_in.d[1] / groups) * dim_out.d[1] * k_dims.d[0] * k_dims.d[1];
+      float gflops = (2 * num_weights) * (dim_in.d[3] / stride * dim_in.d[2] / stride / 1e9);
+      total_gflops += gflops;
+      total_params += num_weights;
+      std::cout << "L" << i << " [conv " << k_dims.d[0] << "x" << k_dims.d[1] << " (" << groups
+                << ") "
+                << "/" << s_dims.d[0] << "] " << dim_in.d[3] << "x" << dim_in.d[2] << "x"
+                << dim_in.d[1] << " -> " << dim_out.d[3] << "x" << dim_out.d[2] << "x"
+                << dim_out.d[1];
+      std::cout << " weights:" << num_weights;
+      std::cout << " GFLOPs:" << gflops;
+      std::cout << std::endl;
+    } else if (layer_type == nvinfer1::LayerType::kPOOLING) {
+      nvinfer1::IPoolingLayer * pool = static_cast<nvinfer1::IPoolingLayer *>(layer);
+      auto p_type = pool->getPoolingType();
+      nvinfer1::Dims dim_stride = pool->getStrideNd();
+      nvinfer1::Dims dim_window = pool->getWindowSizeNd();
+
+      std::cout << "L" << i << " [";
+      if (p_type == nvinfer1::PoolingType::kMAX) {
+        std::cout << "max ";
+      } else if (p_type == nvinfer1::PoolingType::kAVERAGE) {
+        std::cout << "avg ";
+      } else if (p_type == nvinfer1::PoolingType::kMAX_AVERAGE_BLEND) {
+        std::cout << "max avg blend ";
+      }
+      float gflops = dim_in.d[1] * dim_window.d[0] / dim_stride.d[0] * dim_window.d[1] /
+                     dim_stride.d[1] * dim_in.d[2] * dim_in.d[3] / 1e9;
+      total_gflops += gflops;
+      std::cout << "pool " << dim_window.d[0] << "x" << dim_window.d[1] << "]";
+      std::cout << " GFLOPs:" << gflops;
+      std::cout << std::endl;
+    } else if (layer_type == nvinfer1::LayerType::kRESIZE) {
+      std::cout << "L" << i << " [resize]" << std::endl;
+    }
+  }
+  std::cout << "Total " << total_gflops << " GFLOPs" << std::endl;
+  std::cout << "Total " << total_params / 1000.0 / 1000.0 << " M params" << std::endl;
+}
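+
+// Worked example (illustrative) for the convolution estimate above: a 3x3
+// convolution from 64 to 128 channels on a 512x512 feature map with stride 1
+// and groups 1 gives num_weights = 64 * 128 * 3 * 3 = 73728 and
+// gflops = 2 * 73728 * (512 * 512) / 1e9 ~= 38.7.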
+
+bool TrtCommon::buildEngineFromOnnx(
+  const std::string & onnx_file_path, const std::string & output_engine_file_path)
+{
+  auto builder = TrtUniquePtr<nvinfer1::IBuilder>(nvinfer1::createInferBuilder(logger_));
+  if (!builder) {
+    logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to create builder");
+    return false;
+  }
+
+  const auto explicitBatch =
+    1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
+
+  auto network =
+    TrtUniquePtr<nvinfer1::INetworkDefinition>(builder->createNetworkV2(explicitBatch));
+  if (!network) {
+    logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to create network");
+    return false;
+  }
+
+  auto config = TrtUniquePtr<nvinfer1::IBuilderConfig>(builder->createBuilderConfig());
+  if (!config) {
+    logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to create builder config");
+    return false;
+  }
+
+  int num_available_dla = builder->getNbDLACores();
+  if (build_config_->dla_core_id != -1) {
+    if (num_available_dla > 0) {
+      std::cout << "### " << num_available_dla << " DLAs are supported! ###" << std::endl;
+    } else {
+      std::cout << "### Warning: No DLA is supported! ###" << std::endl;
+    }
+    config->setDefaultDeviceType(nvinfer1::DeviceType::kDLA);
+    config->setDLACore(build_config_->dla_core_id);
+#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSORRT_PATCH >= 8200
+    config->setFlag(nvinfer1::BuilderFlag::kPREFER_PRECISION_CONSTRAINTS);
+#else
+    config->setFlag(nvinfer1::BuilderFlag::kSTRICT_TYPES);
+#endif
+    config->setFlag(nvinfer1::BuilderFlag::kGPU_FALLBACK);
+  }
+  if (precision_ == "fp16" || precision_ == "int8") {
+    config->setFlag(nvinfer1::BuilderFlag::kFP16);
+  }
+#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH >= 8400
+  config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, max_workspace_size_);
+#else
+  config->setMaxWorkspaceSize(max_workspace_size_);
+#endif
+
+  auto parser = TrtUniquePtr<nvonnxparser::IParser>(nvonnxparser::createParser(*network, logger_));
+  if (!parser->parseFromFile(
+        onnx_file_path.c_str(), static_cast<int>(nvinfer1::ILogger::Severity::kERROR))) {
+    std::cout << "Failed to parse onnx file" << std::endl;
+    return false;
+  }
+
+  const int num = network->getNbLayers();
+  bool first = build_config_->quantize_first_layer;
+  bool last = build_config_->quantize_last_layer;
+  // Partial Quantization
+  if (precision_ == "int8") {
+    network->getInput(0)->setDynamicRange(0, 255.0);
+    for (int i = 0; i < num; i++) {
+      nvinfer1::ILayer * layer = network->getLayer(i);
+      auto layer_type = layer->getType();
+      std::string name = layer->getName();
+      nvinfer1::ITensor * out = layer->getOutput(0);
+      if (build_config_->clip_value > 0.0) {
+        std::cout << "Set max value for outputs : " << build_config_->clip_value << "  " << name
+                  << std::endl;
+        out->setDynamicRange(0.0, build_config_->clip_value);
+      }
+
+      if (layer_type == nvinfer1::LayerType::kCONVOLUTION) {
+        if (first) {
+          layer->setPrecision(nvinfer1::DataType::kHALF);
+          std::cout << "Set kHALF in " << name << std::endl;
+          first = false;
+        }
+        if (last) {
+          // cspell: ignore preds
+          if (
+            contain(name, "reg_preds") || contain(name, "cls_preds") ||
+            contain(name, "obj_preds")) {
+            layer->setPrecision(nvinfer1::DataType::kHALF);
+            std::cout << "Set kHALF in " << name << std::endl;
+          }
+          for (int j = num - 1; j >= 0; j--) {
+            nvinfer1::ILayer * inner_layer = network->getLayer(j);
+            auto inner_layer_type = inner_layer->getType();
+            std::string inner_name = inner_layer->getName();
+            if (inner_layer_type == nvinfer1::LayerType::kCONVOLUTION) {
+              inner_layer->setPrecision(nvinfer1::DataType::kHALF);
+              std::cout << "Set kHALF in " << inner_name << std::endl;
+              break;
+            }
+            if (inner_layer_type == nvinfer1::LayerType::kMATRIX_MULTIPLY) {
+              inner_layer->setPrecision(nvinfer1::DataType::kHALF);
+              std::cout << "Set kHALF in " << inner_name << std::endl;
+              break;
+            }
+          }
+        }
+      }
+    }
+  }
+
+  const auto input = network->getInput(0);
+  const auto input_dims = input->getDimensions();
+  const auto input_channel = input_dims.d[1];
+  const auto input_height = input_dims.d[2];
+  const auto input_width = input_dims.d[3];
+  const auto input_batch = input_dims.d[0];
+
+  if (input_batch > 1) {
+    batch_config_[0] = input_batch;
+  }
+
+  if (batch_config_.at(0) > 1 && (batch_config_.at(0) == batch_config_.at(2))) {
+    // Attention : below API is deprecated in TRT8.4
+    builder->setMaxBatchSize(batch_config_.at(2));
+  } else {
+    if (build_config_->profile_per_layer) {
+      auto profile = builder->createOptimizationProfile();
+      profile->setDimensions(
+        network->getInput(0)->getName(), nvinfer1::OptProfileSelector::kMIN,
+        nvinfer1::Dims4{batch_config_.at(0), input_channel, input_height, input_width});
+      profile->setDimensions(
+        network->getInput(0)->getName(), nvinfer1::OptProfileSelector::kOPT,
+        nvinfer1::Dims4{batch_config_.at(1), input_channel, input_height, input_width});
+      profile->setDimensions(
+        network->getInput(0)->getName(), nvinfer1::OptProfileSelector::kMAX,
+        nvinfer1::Dims4{batch_config_.at(2), input_channel, input_height, input_width});
+      config->addOptimizationProfile(profile);
+    }
+  }
+  if (precision_ == "int8" && calibrator_) {
+    config->setFlag(nvinfer1::BuilderFlag::kINT8);
+#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSORRT_PATCH >= 8200
+    config->setFlag(nvinfer1::BuilderFlag::kPREFER_PRECISION_CONSTRAINTS);
+#else
+    config->setFlag(nvinfer1::BuilderFlag::kSTRICT_TYPES);
+#endif
+    // QAT requires no calibrator.
+    //    assert((calibrator != nullptr) && "Invalid calibrator for INT8 precision");
+    config->setInt8Calibrator(calibrator_.get());
+  }
+  if (build_config_->profile_per_layer) {
+#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSORRT_PATCH >= 8200
+    config->setProfilingVerbosity(nvinfer1::ProfilingVerbosity::kDETAILED);
+#else
+    config->setProfilingVerbosity(nvinfer1::ProfilingVerbosity::kVERBOSE);
+#endif
+  }
+
+#if NV_TENSORRT_MAJOR >= 8
+  auto plan =
+    TrtUniquePtr<nvinfer1::IHostMemory>(builder->buildSerializedNetwork(*network, *config));
+  if (!plan) {
+    logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to create host memory");
+    return false;
+  }
+  engine_ = TrtUniquePtr<nvinfer1::ICudaEngine>(
+    runtime_->deserializeCudaEngine(plan->data(), plan->size()));
+#else
+  engine_ = TrtUniquePtr<nvinfer1::ICudaEngine>(builder->buildEngineWithConfig(*network, *config));
+#endif
+
+  if (!engine_) {
+    logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to create engine");
+    return false;
+  }
+
+  // save engine
+#if NV_TENSORRT_MAJOR < 8
+  auto data = TrtUniquePtr<nvinfer1::IHostMemory>(engine_->serialize());
+#endif
+  std::ofstream file;
+  file.open(output_engine_file_path, std::ios::binary | std::ios::out);
+  if (!file.is_open()) {
+    return false;
+  }
+#if NV_TENSORRT_MAJOR < 8
+  file.write(reinterpret_cast<const char *>(data->data()), data->size());
+#else
+  file.write(reinterpret_cast<const char *>(plan->data()), plan->size());
+#endif
+
+  file.close();
+
+  return true;
+}
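+
+// Note on partial quantization (behavior of the code above, illustrated): when
+// precision_ == "int8", quantize_first_layer keeps the first convolution in
+// kHALF, and quantize_last_layer keeps the last convolution / matrix multiply
+// and the reg/cls/obj prediction heads in kHALF, trading some speed for
+// accuracy at the network boundaries.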
+
+bool TrtCommon::isInitialized()
+{
+  return is_initialized_;
+}
+
+nvinfer1::Dims TrtCommon::getBindingDimensions(const int32_t index) const
+{
+#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + (NV_TENSORRT_PATCH * 10) >= 8500
+  auto const & name = engine_->getIOTensorName(index);
+  auto dims = context_->getTensorShape(name);
+  bool const has_runtime_dim =
+    std::any_of(dims.d, dims.d + dims.nbDims, [](int32_t dim) { return dim == -1; });
+
+  if (has_runtime_dim) {
+    return dims;
+  } else {
+    return context_->getBindingDimensions(index);
+  }
+#else
+  return context_->getBindingDimensions(index);
+#endif
+}
+
+int32_t TrtCommon::getNbBindings()
+{
+  return engine_->getNbBindings();
+}
+
+bool TrtCommon::setBindingDimensions(const int32_t index, const nvinfer1::Dims & dimensions) const
+{
+  return context_->setBindingDimensions(index, dimensions);
+}
+
+bool TrtCommon::enqueueV2(void ** bindings, cudaStream_t stream, cudaEvent_t * input_consumed)
+{
+  if (build_config_->profile_per_layer) {
+    auto inference_start = std::chrono::high_resolution_clock::now();
+
+    bool ret = context_->enqueueV2(bindings, stream, input_consumed);
+
+    auto inference_end = std::chrono::high_resolution_clock::now();
+    host_profiler_.reportLayerTime(
+      "inference",
+      std::chrono::duration<float, std::milli>(inference_end - inference_start).count());
+    return ret;
+  } else {
+    return context_->enqueueV2(bindings, stream, input_consumed);
+  }
+}
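+
+// Usage sketch (illustrative; d_input / d_output are hypothetical device
+// buffers, one pointer per engine binding in binding-index order):
+//   std::vector<void *> bindings{d_input, d_output};
+//   trt.enqueueV2(bindings.data(), stream, nullptr);
+//   cudaStreamSynchronize(stream);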
+
+void TrtCommon::printProfiling()
+{
+  std::cout << host_profiler_;
+  std::cout << std::endl;
+  std::cout << model_profiler_;
+}
+
+#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSORRT_PATCH >= 8200
+std::string TrtCommon::getLayerInformation(nvinfer1::LayerInformationFormat format)
+{
+  auto inspector = std::unique_ptr<nvinfer1::IEngineInspector>(engine_->createEngineInspector());
+  if (context_ != nullptr) {
+    inspector->setExecutionContext(&(*context_));
+  }
+  std::string result = inspector->getEngineInformation(format);
+  return result;
+}
+#endif
+
+}  // namespace tensorrt_common

+ 170 - 0
src/detection/detection_lidar_transfusion/transfusion_config.hpp

@@ -0,0 +1,170 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef AUTOWARE__LIDAR_TRANSFUSION__TRANSFUSION_CONFIG_HPP_
+#define AUTOWARE__LIDAR_TRANSFUSION__TRANSFUSION_CONFIG_HPP_
+
+#include <cstdint>
+#include <vector>
+
+namespace autoware::lidar_transfusion
+{
+
+class TransfusionConfig
+{
+public:
+  TransfusionConfig(
+    const std::size_t cloud_capacity, const std::vector<int64_t> & voxels_num,
+    const std::vector<double> & point_cloud_range, const std::vector<double> & voxel_size,
+    const std::size_t num_proposals, const float circle_nms_dist_threshold,
+    const std::vector<double> & yaw_norm_thresholds, const float score_threshold)
+  {
+    cloud_capacity_ = cloud_capacity;
+
+    if (voxels_num.size() == 3) {
+      max_voxels_ = voxels_num[2];
+
+      voxels_num_[0] = voxels_num[0];
+      voxels_num_[1] = voxels_num[1];
+      voxels_num_[2] = voxels_num[2];
+
+      min_voxel_size_ = voxels_num[0];
+      opt_voxel_size_ = voxels_num[1];
+      max_voxel_size_ = voxels_num[2];
+
+      min_points_size_ = voxels_num[0];
+      opt_points_size_ = voxels_num[1];
+      max_points_size_ = voxels_num[2];
+
+      min_coors_size_ = voxels_num[0];
+      opt_coors_size_ = voxels_num[1];
+      max_coors_size_ = voxels_num[2];
+    }
+    if (point_cloud_range.size() == 6) {
+      min_x_range_ = static_cast<float>(point_cloud_range[0]);
+      min_y_range_ = static_cast<float>(point_cloud_range[1]);
+      min_z_range_ = static_cast<float>(point_cloud_range[2]);
+      max_x_range_ = static_cast<float>(point_cloud_range[3]);
+      max_y_range_ = static_cast<float>(point_cloud_range[4]);
+      max_z_range_ = static_cast<float>(point_cloud_range[5]);
+    }
+    if (voxel_size.size() == 3) {
+      voxel_x_size_ = static_cast<float>(voxel_size[0]);
+      voxel_y_size_ = static_cast<float>(voxel_size[1]);
+      voxel_z_size_ = static_cast<float>(voxel_size[2]);
+    }
+    if (num_proposals > 0) {
+      num_proposals_ = num_proposals;
+    }
+    if (score_threshold > 0.0) {
+      score_threshold_ = score_threshold;
+    }
+    if (circle_nms_dist_threshold > 0.0) {
+      circle_nms_dist_threshold_ = circle_nms_dist_threshold;
+    }
+    yaw_norm_thresholds_ =
+      std::vector<float>(yaw_norm_thresholds.begin(), yaw_norm_thresholds.end());
+    for (auto & yaw_norm_threshold : yaw_norm_thresholds_) {
+      yaw_norm_threshold =
+        (yaw_norm_threshold >= 0.0 && yaw_norm_threshold < 1.0) ? yaw_norm_threshold : 0.0;
+    }
+    grid_x_size_ = static_cast<std::size_t>((max_x_range_ - min_x_range_) / voxel_x_size_);
+    grid_y_size_ = static_cast<std::size_t>((max_y_range_ - min_y_range_) / voxel_y_size_);
+    grid_z_size_ = static_cast<std::size_t>((max_z_range_ - min_z_range_) / voxel_z_size_);
+
+    feature_x_size_ = grid_x_size_ / out_size_factor_;
+    feature_y_size_ = grid_y_size_ / out_size_factor_;
+  }
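+
+  // Example (illustrative values, consistent with the defaults below):
+  //   TransfusionConfig config(
+  //     1000000, {5000, 30000, 60000}, {-76.8, -76.8, -3.0, 76.8, 76.8, 5.0},
+  //     {0.3, 0.3, 8.0}, 200, 0.5f, {0.3, 0.3, 0.3, 0.3, 0.0}, 0.2f);
+  // yields grid_x_size_ = (76.8 - (-76.8)) / 0.3 = 512 and
+  // feature_x_size_ = 512 / 4 = 128.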
+
+  ///// INPUT PARAMETERS /////
+  std::size_t cloud_capacity_{};
+  ///// KERNEL PARAMETERS /////
+  const std::size_t threads_for_voxel_{256};  // number of threads per block
+  const std::size_t points_per_voxel_{20};
+  const std::size_t warp_size_{32};          // one warp (32 threads) handles one pillar
+  const std::size_t pillars_per_block_{64};  // one thread deals with one pillar,
+                                             // so a block has pillars_per_block_ threads
+  const std::size_t pillar_feature_size_{64};
+  std::size_t max_voxels_{60000};
+
+  ///// NETWORK PARAMETERS /////
+  const std::size_t batch_size_{1};
+  const std::size_t num_classes_{5};
+  const std::size_t num_point_feature_size_{5};  // x, y, z, intensity, time lag
+  // the dimension of the input cloud
+  float min_x_range_{-76.8};
+  float max_x_range_{76.8};
+  float min_y_range_{-76.8};
+  float max_y_range_{76.8};
+  float min_z_range_{-3.0};
+  float max_z_range_{5.0};
+  // the size of a pillar
+  float voxel_x_size_{0.3};
+  float voxel_y_size_{0.3};
+  float voxel_z_size_{8.0};
+  const std::size_t out_size_factor_{4};
+  const std::size_t max_num_points_per_pillar_{points_per_voxel_};
+  const std::size_t num_point_values_{4};
+  std::size_t num_proposals_{200};
+  // the number of feature maps for pillar scatter
+  const std::size_t num_feature_scatter_{pillar_feature_size_};
+  // the score threshold for classification
+  float score_threshold_{0.2};
+  float circle_nms_dist_threshold_{0.5};
+  std::vector<float> yaw_norm_thresholds_{0.3, 0.3, 0.3, 0.3, 0.0};
+  std::size_t max_num_pillars_{max_voxels_};
+  const std::size_t pillar_points_bev_{max_num_points_per_pillar_ * max_num_pillars_};
+  // the detected boxes are decoded as (x, y, z, w, l, h, yaw)
+  const std::size_t num_box_values_{8};
+  // the input size of the 2D backbone network
+  std::size_t grid_x_size_{512};
+  std::size_t grid_y_size_{512};
+  std::size_t grid_z_size_{1};
+  // the output size of the 2D backbone network
+  std::size_t feature_x_size_{grid_x_size_ / out_size_factor_};
+  std::size_t feature_y_size_{grid_y_size_ / out_size_factor_};
+
+  ///// RUNTIME DIMENSIONS /////
+  std::vector<std::size_t> voxels_num_{5000, 30000, 60000};
+  // voxels
+  std::size_t min_voxel_size_{voxels_num_[0]};
+  std::size_t opt_voxel_size_{voxels_num_[1]};
+  std::size_t max_voxel_size_{voxels_num_[2]};
+
+  std::size_t min_point_in_voxel_size_{points_per_voxel_};
+  std::size_t opt_point_in_voxel_size_{points_per_voxel_};
+  std::size_t max_point_in_voxel_size_{points_per_voxel_};
+
+  std::size_t min_network_feature_size_{num_point_feature_size_};
+  std::size_t opt_network_feature_size_{num_point_feature_size_};
+  std::size_t max_network_feature_size_{num_point_feature_size_};
+
+  // num_points
+  std::size_t min_points_size_{voxels_num_[0]};
+  std::size_t opt_points_size_{voxels_num_[1]};
+  std::size_t max_points_size_{voxels_num_[2]};
+
+  // coors
+  std::size_t min_coors_size_{voxels_num_[0]};
+  std::size_t opt_coors_size_{voxels_num_[1]};
+  std::size_t max_coors_size_{voxels_num_[2]};
+
+  std::size_t min_coors_dim_size_{num_point_values_};
+  std::size_t opt_coors_dim_size_{num_point_values_};
+  std::size_t max_coors_dim_size_{num_point_values_};
+};
+
+}  // namespace autoware::lidar_transfusion
+
+#endif  // AUTOWARE__LIDAR_TRANSFUSION__TRANSFUSION_CONFIG_HPP_

+ 57 - 0
src/detection/detection_lidar_transfusion/utils.hpp

@@ -0,0 +1,57 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef AUTOWARE__LIDAR_TRANSFUSION__UTILS_HPP_
+#define AUTOWARE__LIDAR_TRANSFUSION__UTILS_HPP_
+
+#include <cstddef>
+#include <iostream>
+#include <stdexcept>
+#include <unordered_map>
+
+namespace autoware::lidar_transfusion
+{
+
+struct Box3D
+{
+  int label;
+  float score;
+  float x;
+  float y;
+  float z;
+  float width;
+  float length;
+  float height;
+  float yaw;
+};
+
+enum NetworkIO { voxels = 0, num_points, coors, cls_score, dir_pred, bbox_pred, ENUM_SIZE };
+
+// cspell: ignore divup
+template <typename T1, typename T2>
+unsigned int divup(const T1 a, const T2 b)
+{
+  if (a == 0) {
+    throw std::runtime_error("The dividend of divup is zero.");
+  }
+  if (b == 0) {
+    throw std::runtime_error("The divisor of divup is zero.");
+  }
+
+  return (a + b - 1) / b;
+}
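+
+// Usage sketch: divup is the usual ceiling division for sizing CUDA launches,
+// e.g. divup(1000, 256) == 4, so a kernel covering 1000 points with
+// 256-thread blocks launches 4 blocks.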
+
+}  // namespace autoware::lidar_transfusion
+
+#endif  // AUTOWARE__LIDAR_TRANSFUSION__UTILS_HPP_