7 月之前 · 47014b71f9
--- a/src/detection/detection_lidar_transfusion/.gitignore
+++ b/src/detection/detection_lidar_transfusion/.gitignore
@@ -0,0 +1,73 @@
 
				+# This file is used to ignore files which are generated
			
 
				+# ----------------------------------------------------------------------------
			
 
				+
			
 
				+*~
			
 
				+*.autosave
			
 
				+*.a
			
 
				+*.core
			
 
				+*.moc
			
 
				+*.o
			
 
				+*.obj
			
 
				+*.orig
			
 
				+*.rej
			
 
				+*.so
			
 
				+*.so.*
			
 
				+*_pch.h.cpp
			
 
				+*_resource.rc
			
 
				+*.qm
			
 
				+.#*
			
 
				+*.*#
			
 
				+core
			
 
				+!core/
			
 
				+tags
			
 
				+.DS_Store
			
 
				+.directory
			
 
				+*.debug
			
 
				+Makefile*
			
 
				+*.prl
			
 
				+*.app
			
 
				+moc_*.cpp
			
 
				+ui_*.h
			
 
				+qrc_*.cpp
			
 
				+Thumbs.db
			
 
				+*.res
			
 
				+*.rc
			
 
				+/.qmake.cache
			
 
				+/.qmake.stash
			
 
				+
			
 
				+# qtcreator generated files
			
 
				+*.pro.user*
			
 
				+
			
 
				+# xemacs temporary files
			
 
				+*.flc
			
 
				+
			
 
				+# Vim temporary files
			
 
				+.*.swp
			
 
				+
			
 
				+# Visual Studio generated files
			
 
				+*.ib_pdb_index
			
 
				+*.idb
			
 
				+*.ilk
			
 
				+*.pdb
			
 
				+*.sln
			
 
				+*.suo
			
 
				+*.vcproj
			
 
				+*vcproj.*.*.user
			
 
				+*.ncb
			
 
				+*.sdf
			
 
				+*.opensdf
			
 
				+*.vcxproj
			
 
				+*vcxproj.*
			
 
				+
			
 
				+# MinGW generated files
			
 
				+*.Debug
			
 
				+*.Release
			
 
				+
			
 
				+# Python byte code
			
 
				+*.pyc
			
 
				+
			
 
				+# Binaries
			
 
				+# --------
			
 
				+*.dll
			
 
				+*.exe
			
 
				+
			
--- a/src/detection/detection_lidar_transfusion/cuda_utils.hpp
+++ b/src/detection/detection_lidar_transfusion/cuda_utils.hpp
@@ -0,0 +1,126 @@
 
				+// Copyright 2024 TIER IV, Inc.
			
 
				+//
			
 
				+// Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+// you may not use this file except in compliance with the License.
			
 
				+// You may obtain a copy of the License at
			
 
				+//
			
 
				+//     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+//
			
 
				+// Unless required by applicable law or agreed to in writing, software
			
 
				+// distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+// See the License for the specific language governing permissions and
			
 
				+// limitations under the License.
			
 
				+/*
			
 
				+ * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
			
 
				+ *
			
 
				+ * Permission is hereby granted, free of charge, to any person obtaining a
			
 
				+ * copy of this software and associated documentation files (the "Software"),
			
 
				+ * to deal in the Software without restriction, including without limitation
			
 
				+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
			
 
				+ * and/or sell copies of the Software, and to permit persons to whom the
			
 
				+ * Software is furnished to do so, subject to the following conditions:
			
 
				+ *
			
 
				+ * The above copyright notice and this permission notice shall be included in
			
 
				+ * all copies or substantial portions of the Software.
			
 
				+ *
			
 
				+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
			
 
				+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
			
 
				+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
			
 
				+ * DEALINGS IN THE SOFTWARE.
			
 
				+ */
			
 
				+
			
 
				+/*
			
 
				+ * This code is licensed under CC0 1.0 Universal (Public Domain).
			
 
				+ * You can use this without any limitation.
			
 
				+ * https://creativecommons.org/publicdomain/zero/1.0/deed.en
			
 
				+ */
			
 
				+
			
 
				+#ifndef AUTOWARE__LIDAR_TRANSFUSION__CUDA_UTILS_HPP_
			
 
				+#define AUTOWARE__LIDAR_TRANSFUSION__CUDA_UTILS_HPP_
			
 
				+
			
 
				+#include <cuda_runtime_api.h>
			
 
				+
			
 
				+#include <memory>
			
 
				+#include <sstream>
			
 
				+#include <stdexcept>
			
 
				+#include <type_traits>
			
 
				+
			
 
				+#define CHECK_CUDA_ERROR(e) (cuda::check_error(e, __FILE__, __LINE__))
			
 
				+
			
 
				+namespace cuda
			
 
				+{
			
 
				+inline void check_error(const ::cudaError_t e, const char * f, int n)
			
 
				+{
			
 
				+  if (e != ::cudaSuccess) {
			
 
				+    ::std::stringstream s;
			
 
				+    s << ::cudaGetErrorName(e) << " (" << e << ")@" << f << "#L" << n << ": "
			
 
				+      << ::cudaGetErrorString(e);
			
 
				+    throw ::std::runtime_error{s.str()};
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+struct deleter
			
 
				+{
			
 
				+  void operator()(void * p) const { CHECK_CUDA_ERROR(::cudaFree(p)); }
			
 
				+};
			
 
				+
			
 
				+template <typename T>
			
 
				+using unique_ptr = ::std::unique_ptr<T, deleter>;
			
 
				+
			
 
				+template <typename T>
			
 
				+typename ::std::enable_if<::std::is_array<T>::value, cuda::unique_ptr<T>>::type make_unique(
			
 
				+  const ::std::size_t n)
			
 
				+{
			
 
				+  using U = typename ::std::remove_extent<T>::type;
			
 
				+  U * p;
			
 
				+  CHECK_CUDA_ERROR(::cudaMalloc(reinterpret_cast<void **>(&p), sizeof(U) * n));
			
 
				+  return cuda::unique_ptr<T>{p};
			
 
				+}
			
 
				+
			
 
				+template <typename T>
			
 
				+cuda::unique_ptr<T> make_unique()
			
 
				+{
			
 
				+  T * p;
			
 
				+  CHECK_CUDA_ERROR(::cudaMalloc(reinterpret_cast<void **>(&p), sizeof(T)));
			
 
				+  return cuda::unique_ptr<T>{p};
			
 
				+}
			
 
				+
			
 
				+constexpr size_t CUDA_ALIGN = 256;
			
 
				+
			
 
				+template <typename T>
			
 
				+inline size_t get_size_aligned(size_t num_elem)
			
 
				+{
			
 
				+  size_t size = num_elem * sizeof(T);
			
 
				+  size_t extra_align = 0;
			
 
				+  if (size % CUDA_ALIGN != 0) {
			
 
				+    extra_align = CUDA_ALIGN - size % CUDA_ALIGN;
			
 
				+  }
			
 
				+  return size + extra_align;
			
 
				+}
			
 
				+
			
 
				+template <typename T>
			
 
				+inline T * get_next_ptr(size_t num_elem, void *& workspace, size_t & workspace_size)
			
 
				+{
			
 
				+  size_t size = get_size_aligned<T>(num_elem);
			
 
				+  if (size > workspace_size) {
			
 
				+    throw ::std::runtime_error("Workspace is too small!");
			
 
				+  }
			
 
				+  workspace_size -= size;
			
 
				+  T * ptr = reinterpret_cast<T *>(workspace);
			
 
				+  workspace = reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(workspace) + size);
			
 
				+  return ptr;
			
 
				+}
			
 
				+
			
 
				+template <typename T>
			
 
				+void clear_async(T * ptr, size_t num_elem, cudaStream_t stream)
			
 
				+{
			
 
				+  CHECK_CUDA_ERROR(::cudaMemsetAsync(ptr, 0, sizeof(T) * num_elem, stream));
			
 
				+}
			
 
				+
			
 
				+}  // namespace cuda
			
 
				+
			
 
				+#endif  // AUTOWARE__LIDAR_TRANSFUSION__CUDA_UTILS_HPP_
			
--- a/src/detection/detection_lidar_transfusion/detection_lidar_transfusion.pro
+++ b/src/detection/detection_lidar_transfusion/detection_lidar_transfusion.pro
@@ -0,0 +1,137 @@
 
				+QT -= gui
			
 
				+
			
 
				+CONFIG += c++1z console
			
 
				+CONFIG -= app_bundle
			
 
				+
			
 
				+# You can make your code fail to compile if it uses deprecated APIs.
			
 
				+# In order to do so, uncomment the following line.
			
 
				+#DEFINES += QT_DISABLE_DEPRECATED_BEFORE=0x060000    # disables all the APIs deprecated before Qt 6.0.0
			
 
				+
			
 
				+SOURCES += \
			
 
				+        main.cpp \
			
 
				+        network/network_trt.cpp \
			
 
				+        postprocess/non_maximum_suppression.cpp \
			
 
				+        simple_profiler.cpp \
			
 
				+        tensorrt_common.cpp
			
 
				+
			
 
				+# Default rules for deployment.
			
 
				+qnx: target.path = /tmp/$${TARGET}/bin
			
 
				+else: unix:!android: target.path = /opt/$${TARGET}/bin
			
 
				+!isEmpty(target.path): INSTALLS += target
			
 
				+
			
 
				+DISTFILES += \
			
 
				+    postprocess/circle_nms_kernel.cu \
			
 
				+    postprocess/postprocess_kernel.cu
			
 
				+
			
 
				+HEADERS += \
			
 
				+    cuda_utils.hpp \
			
 
				+    include/network/network_trt.hpp \
			
 
				+    include/postprocess/circle_nms_kernel.hpp \
			
 
				+    include/postprocess/non_maximum_suppression.hpp \
			
 
				+    include/postprocess/postprocess_kernel.hpp \
			
 
				+    include/preprocess/pointcloud_densification.hpp \
			
 
				+    include/preprocess/preprocess_kernel.hpp \
			
 
				+    include/preprocess/voxel_generator.hpp \
			
 
				+    include/tensorrt_common/logger.hpp \
			
 
				+    include/tensorrt_common/simple_profiler.hpp \
			
 
				+    include/tensorrt_common/tensorrt_common.hpp \
			
 
				+    utils.hpp
			
 
				+
			
 
				+INCLUDEPATH += /usr/include/eigen3
			
 
				+
			
 
				+INCLUDEPATH += $$PWD/include
			
 
				+
			
 
				+INCLUDEPATH += $$CUDA_DIR/include
			
 
				+
			
 
				+LIBS += -L"/usr/local/lib" \
			
 
				+        -L"/usr/local/cuda/lib64" \
			
 
				+        -lcudart \
			
 
				+        -lcufft
			
 
				+
			
 
				+CUDA_SOURCES += \
			
 
				+    postprocess/circle_nms_kernel.cu \
			
 
				+    postprocess/postprocess_kernel.cu
			
 
				+
			
 
				+CUDA_SDK = "/usr/local/cuda/"   # cudaSDK路径
			
 
				+
			
 
				+CUDA_DIR = "/usr/local/cuda/"            # CUDA tookit路径
			
 
				+
			
 
				+SYSTEM_NAME = linux         # 自己系统环境 'Win32', 'x64', or 'Win64'
			
 
				+
			
 
				+SYSTEM_TYPE = 64           #操作系统位数 '32' or '64',
			
 
				+
			
 
				+CUDA_ARCH = sm_72 # xavier sm_72          # cuda架构, for example 'compute_10', 'compute_11', 'sm_10'
			
 
				+
			
 
				+NVCC_OPTIONS = --use_fast_math --compiler-options "-fPIC"
			
 
				+
			
 
				+
			
 
				+# include paths
			
 
				+
			
 
				+INCLUDEPATH += $$CUDA_DIR/include
			
 
				+INCLUDEPATH += $$PWD
			
 
				+#INCLUDEPATH += /usr/local/cuda-10.0/targets/aarch64-linux/include/crt
			
 
				+
			
 
				+# library directories
			
 
				+
			
 
				+QMAKE_LIBDIR += $$CUDA_DIR/lib/
			
 
				+
			
 
				+CUDA_OBJECTS_DIR = ./
			
 
				+
			
 
				+# The following library conflicts with something in Cuda
			
 
				+
			
 
				+#QMAKE_LFLAGS_RELEASE = /NODEFAULTLIB:msvcrt.lib
			
 
				+
			
 
				+#QMAKE_LFLAGS_DEBUG   = /NODEFAULTLIB:msvcrtd.lib
			
 
				+
			
 
				+# Add the necessary libraries
			
 
				+
			
 
				+CUDA_LIBS =  cudart cufft
			
 
				+
			
 
				+# The following makes sure all path names (which often include spaces) are put between quotation marks
			
 
				+
			
 
				+CUDA_INC = $$join(INCLUDEPATH,'" -I"','-I"','"')
			
 
				+
			
 
				+NVCC_LIBS = $$join(CUDA_LIBS,' -l','-l', '')
			
 
				+
			
 
				+#LIBS += $$join(CUDA_LIBS,'.so ', '', '.so')
			
 
				+
			
 
				+# Configuration of the Cuda compiler
			
 
				+
			
 
				+CONFIG(debug, debug|release) {
			
 
				+
			
 
				+    # Debug mode
			
 
				+
			
 
				+    cuda_d.input = CUDA_SOURCES
			
 
				+
			
 
				+    cuda_d.output = $$CUDA_OBJECTS_DIR/${QMAKE_FILE_BASE}_cuda.o
			
 
				+
			
 
				+    cuda_d.commands = $$CUDA_DIR/bin/nvcc -D_DEBUG $$NVCC_OPTIONS $$CUDA_INC $$NVCC_LIBS --machine $$SYSTEM_TYPE -arch=$$CUDA_ARCH -c -o ${QMAKE_FILE_OUT} ${QMAKE_FILE_NAME}
			
 
				+
			
 
				+    cuda_d.dependency_type = TYPE_C
			
 
				+
			
 
				+    QMAKE_EXTRA_COMPILERS += cuda_d
			
 
				+
			
 
				+}
			
 
				+
			
 
				+else {
			
 
				+
			
 
				+    # Release mode
			
 
				+
			
 
				+    cuda.input = CUDA_SOURCES
			
 
				+
			
 
				+    cuda.output = $$CUDA_OBJECTS_DIR/${QMAKE_FILE_BASE}_cuda.o
			
 
				+
			
 
				+    cuda.commands = $$CUDA_DIR/bin/nvcc $$NVCC_OPTIONS $$CUDA_INC $$NVCC_LIBS --machine $$SYSTEM_TYPE -arch=$$CUDA_ARCH -O3 -c -o ${QMAKE_FILE_OUT} ${QMAKE_FILE_NAME}
			
 
				+
			
 
				+    cuda.dependency_type = TYPE_C
			
 
				+
			
 
				+    QMAKE_EXTRA_COMPILERS += cuda
			
 
				+
			
 
				+}
			
 
				+
			
 
				+
			
 
				+# include paths
			
 
				+
			
 
				+
			
 
				+
			
 
				+LIBS += -lrt -ldl -lnvinfer -lcudnn  -lcudart -lnvparsers -lnvonnxparser -lnvinfer_plugin -lstdc++fs
			
--- a/src/detection/detection_lidar_transfusion/include/network/network_trt.hpp
+++ b/src/detection/detection_lidar_transfusion/include/network/network_trt.hpp
@@ -0,0 +1,87 @@
 
				+// Copyright 2024 TIER IV, Inc.
			
 
				+//
			
 
				+// Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+// you may not use this file except in compliance with the License.
			
 
				+// You may obtain a copy of the License at
			
 
				+//
			
 
				+//     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+//
			
 
				+// Unless required by applicable law or agreed to in writing, software
			
 
				+// distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+// See the License for the specific language governing permissions and
			
 
				+// limitations under the License.
			
 
				+
			
 
				+#ifndef AUTOWARE__LIDAR_TRANSFUSION__NETWORK__NETWORK_TRT_HPP_
			
 
				+#define AUTOWARE__LIDAR_TRANSFUSION__NETWORK__NETWORK_TRT_HPP_
			
 
				+
			
 
				+#include "transfusion_config.hpp"
			
 
				+#include "utils.hpp"
			
 
				+
			
 
				+#include <tensorrt_common/tensorrt_common.hpp>
			
 
				+
			
 
				+#include <NvInfer.h>
			
 
				+
			
 
				+#include <array>
			
 
				+#include <iostream>
			
 
				+#include <memory>
			
 
				+#include <string>
			
 
				+#include <unordered_map>
			
 
				+#include <vector>
			
 
				+
			
 
				+namespace autoware::lidar_transfusion
			
 
				+{
			
 
				+
			
 
				+struct ProfileDimension
			
 
				+{
			
 
				+  nvinfer1::Dims min;
			
 
				+  nvinfer1::Dims opt;
			
 
				+  nvinfer1::Dims max;
			
 
				+
			
 
				+  bool operator!=(const ProfileDimension & rhs) const
			
 
				+  {
			
 
				+    return min.nbDims != rhs.min.nbDims || opt.nbDims != rhs.opt.nbDims ||
			
 
				+           max.nbDims != rhs.max.nbDims || !std::equal(min.d, min.d + min.nbDims, rhs.min.d) ||
			
 
				+           !std::equal(opt.d, opt.d + opt.nbDims, rhs.opt.d) ||
			
 
				+           !std::equal(max.d, max.d + max.nbDims, rhs.max.d);
			
 
				+  }
			
 
				+};
			
 
				+
			
 
				+class NetworkTRT
			
 
				+{
			
 
				+public:
			
 
				+  explicit NetworkTRT(const TransfusionConfig & config);
			
 
				+  ~NetworkTRT();
			
 
				+
			
 
				+  bool init(
			
 
				+    const std::string & onnx_path, const std::string & engine_path, const std::string & precision);
			
 
				+  const char * getTensorName(NetworkIO name);
			
 
				+
			
 
				+  tensorrt_common::TrtUniquePtr<nvinfer1::ICudaEngine> engine{nullptr};
			
 
				+  tensorrt_common::TrtUniquePtr<nvinfer1::IExecutionContext> context{nullptr};
			
 
				+
			
 
				+private:
			
 
				+  bool parseONNX(
			
 
				+    const std::string & onnx_path, const std::string & engine_path, const std::string & precision,
			
 
				+    size_t workspace_size = (1ULL << 30));
			
 
				+  bool saveEngine(const std::string & engine_path);
			
 
				+  bool loadEngine(const std::string & engine_path);
			
 
				+  bool createContext();
			
 
				+  bool setProfile(
			
 
				+    nvinfer1::IBuilder & builder, nvinfer1::INetworkDefinition & network,
			
 
				+    nvinfer1::IBuilderConfig & config);
			
 
				+  bool validateNetworkIO();
			
 
				+  nvinfer1::Dims validateTensorShape(NetworkIO name, const std::vector<int> shape);
			
 
				+
			
 
				+  tensorrt_common::TrtUniquePtr<nvinfer1::IRuntime> runtime_{nullptr};
			
 
				+  tensorrt_common::TrtUniquePtr<nvinfer1::IHostMemory> plan_{nullptr};
			
 
				+  tensorrt_common::Logger logger_;
			
 
				+  TransfusionConfig config_;
			
 
				+  std::vector<const char *> tensors_names_;
			
 
				+
			
 
				+  std::array<ProfileDimension, 3> in_profile_dims_;
			
 
				+};
			
 
				+
			
 
				+}  // namespace autoware::lidar_transfusion
			
 
				+
			
 
				+#endif  // AUTOWARE__LIDAR_TRANSFUSION__NETWORK__NETWORK_TRT_HPP_
			
--- a/src/detection/detection_lidar_transfusion/include/postprocess/circle_nms_kernel.hpp
+++ b/src/detection/detection_lidar_transfusion/include/postprocess/circle_nms_kernel.hpp
@@ -0,0 +1,32 @@
 
				+// Copyright 2024 TIER IV, Inc.
			
 
				+//
			
 
				+// Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+// you may not use this file except in compliance with the License.
			
 
				+// You may obtain a copy of the License at
			
 
				+//
			
 
				+//     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+//
			
 
				+// Unless required by applicable law or agreed to in writing, software
			
 
				+// distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+// See the License for the specific language governing permissions and
			
 
				+// limitations under the License.
			
 
				+
			
 
				+#ifndef AUTOWARE__LIDAR_TRANSFUSION__POSTPROCESS__CIRCLE_NMS_KERNEL_HPP_
			
 
				+#define AUTOWARE__LIDAR_TRANSFUSION__POSTPROCESS__CIRCLE_NMS_KERNEL_HPP_
			
 
				+
			
 
				+#include "utils.hpp"
			
 
				+
			
 
				+#include <thrust/device_vector.h>
			
 
				+
			
 
				+namespace autoware::lidar_transfusion
			
 
				+{
			
 
				+// Non-maximum suppression (NMS) uses the distance on the xy plane instead of
			
 
				+// intersection over union (IoU) to suppress overlapped objects.
			
 
				+std::size_t circleNMS(
			
 
				+  thrust::device_vector<Box3D> & boxes3d, const float distance_threshold,
			
 
				+  thrust::device_vector<bool> & keep_mask, cudaStream_t stream);
			
 
				+
			
 
				+}  // namespace autoware::lidar_transfusion
			
 
				+
			
 
				+#endif  // AUTOWARE__LIDAR_TRANSFUSION__POSTPROCESS__CIRCLE_NMS_KERNEL_HPP_
			
--- a/src/detection/detection_lidar_transfusion/include/postprocess/non_maximum_suppression.hpp
+++ b/src/detection/detection_lidar_transfusion/include/postprocess/non_maximum_suppression.hpp
@@ -0,0 +1,82 @@
 
				+// Copyright 2024 TIER IV, Inc.
			
 
				+//
			
 
				+// Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+// you may not use this file except in compliance with the License.
			
 
				+// You may obtain a copy of the License at
			
 
				+//
			
 
				+//     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+//
			
 
				+// Unless required by applicable law or agreed to in writing, software
			
 
				+// distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+// See the License for the specific language governing permissions and
			
 
				+// limitations under the License.
			
 
				+
			
 
				+#ifndef AUTOWARE__LIDAR_TRANSFUSION__POSTPROCESS__NON_MAXIMUM_SUPPRESSION_HPP_
			
 
				+#define AUTOWARE__LIDAR_TRANSFUSION__POSTPROCESS__NON_MAXIMUM_SUPPRESSION_HPP_
			
 
				+
			
 
				+//#include "autoware/lidar_transfusion/ros_utils.hpp"
			
 
				+
			
 
				+#include <Eigen/Eigen>
			
 
				+
			
 
				+//#include <autoware_perception_msgs/msg/detected_object.hpp>
			
 
				+
			
 
				+#include <string>
			
 
				+#include <vector>
			
 
				+
			
 
				+uint8_t getSemanticType(const std::string & class_name);
			
 
				+
			
 
				+namespace autoware::lidar_transfusion
			
 
				+{
			
 
				+//using autoware_perception_msgs::msg::DetectedObject;
			
 
				+
			
 
				+enum class NMS_TYPE {
			
 
				+  IoU_BEV
			
 
				+  // IoU_3D
			
 
				+  // Distance_2D
			
 
				+  // Distance_3D
			
 
				+};
			
 
				+
			
 
				+struct NMSParams
			
 
				+{
			
 
				+  NMS_TYPE nms_type_{};
			
 
				+  std::vector<std::string> target_class_names_{};
			
 
				+  double search_distance_2d_{};
			
 
				+  double iou_threshold_{};
			
 
				+  // double distance_threshold_{};
			
 
				+};
			
 
				+
			
 
				+std::vector<bool> classNamesToBooleanMask(const std::vector<std::string> & class_names)
			
 
				+{
			
 
				+  std::vector<bool> mask;
			
 
				+  constexpr std::size_t num_object_classification = 8;
			
 
				+  mask.resize(num_object_classification);
			
 
				+  for (const auto & class_name : class_names) {
			
 
				+    const auto semantic_type = getSemanticType(class_name);
			
 
				+    mask.at(semantic_type) = true;
			
 
				+  }
			
 
				+
			
 
				+  return mask;
			
 
				+}
			
 
				+
			
 
				+class NonMaximumSuppression
			
 
				+{
			
 
				+public:
			
 
				+  void setParameters(const NMSParams &);
			
 
				+
			
 
				+//  std::vector<DetectedObject> apply(const std::vector<DetectedObject> &);
			
 
				+
			
 
				+private:
			
 
				+  bool isTargetLabel(const std::uint8_t);
			
 
				+
			
 
				+//  bool isTargetPairObject(const DetectedObject &, const DetectedObject &);
			
 
				+
			
 
				+//  Eigen::MatrixXd generateIoUMatrix(const std::vector<DetectedObject> &);
			
 
				+
			
 
				+  NMSParams params_{};
			
 
				+  std::vector<bool> target_class_mask_{};
			
 
				+};
			
 
				+
			
 
				+}  // namespace autoware::lidar_transfusion
			
 
				+
			
 
				+#endif  // AUTOWARE__LIDAR_TRANSFUSION__POSTPROCESS__NON_MAXIMUM_SUPPRESSION_HPP_
			
--- a/src/detection/detection_lidar_transfusion/include/postprocess/postprocess_kernel.hpp
+++ b/src/detection/detection_lidar_transfusion/include/postprocess/postprocess_kernel.hpp
@@ -0,0 +1,47 @@
 
				+// Copyright 2024 TIER IV, Inc.
			
 
				+//
			
 
				+// Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+// you may not use this file except in compliance with the License.
			
 
				+// You may obtain a copy of the License at
			
 
				+//
			
 
				+//     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+//
			
 
				+// Unless required by applicable law or agreed to in writing, software
			
 
				+// distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+// See the License for the specific language governing permissions and
			
 
				+// limitations under the License.
			
 
				+
			
 
				+#ifndef AUTOWARE__LIDAR_TRANSFUSION__POSTPROCESS__POSTPROCESS_KERNEL_HPP_
			
 
				+#define AUTOWARE__LIDAR_TRANSFUSION__POSTPROCESS__POSTPROCESS_KERNEL_HPP_
			
 
				+
			
 
				+#include "transfusion_config.hpp"
			
 
				+#include "utils.hpp"
			
 
				+
			
 
				+#include <cuda.h>
			
 
				+#include <cuda_runtime_api.h>
			
 
				+
			
 
				+#include <vector>
			
 
				+
			
 
				+namespace autoware::lidar_transfusion
			
 
				+{
			
 
				+
			
 
				+class PostprocessCuda
			
 
				+{
			
 
				+public:
			
 
				+  explicit PostprocessCuda(const TransfusionConfig & config, cudaStream_t & stream);
			
 
				+
			
 
				+  cudaError_t generateDetectedBoxes3D_launch(
			
 
				+    const float * cls_output, const float * box_output, const float * dir_cls_output,
			
 
				+    std::vector<Box3D> & det_boxes3d, cudaStream_t stream);
			
 
				+
			
 
				+private:
			
 
				+  TransfusionConfig config_;
			
 
				+  cudaStream_t stream_;
			
 
				+  cudaStream_t stream_event_;
			
 
				+  cudaEvent_t start_, stop_;
			
 
				+};
			
 
				+
			
 
				+}  // namespace autoware::lidar_transfusion
			
 
				+
			
 
				+#endif  // AUTOWARE__LIDAR_TRANSFUSION__POSTPROCESS__POSTPROCESS_KERNEL_HPP_
			
--- a/src/detection/detection_lidar_transfusion/include/preprocess/pointcloud_densification.hpp
+++ b/src/detection/detection_lidar_transfusion/include/preprocess/pointcloud_densification.hpp
@@ -0,0 +1,101 @@
 
				+// Copyright 2024 TIER IV, Inc.
			
 
				+//
			
 
				+// Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+// you may not use this file except in compliance with the License.
			
 
				+// You may obtain a copy of the License at
			
 
				+//
			
 
				+//     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+//
			
 
				+// Unless required by applicable law or agreed to in writing, software
			
 
				+// distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+// See the License for the specific language governing permissions and
			
 
				+// limitations under the License.
			
 
				+
			
 
				+#ifndef AUTOWARE__LIDAR_TRANSFUSION__PREPROCESS__POINTCLOUD_DENSIFICATION_HPP_
			
 
				+#define AUTOWARE__LIDAR_TRANSFUSION__PREPROCESS__POINTCLOUD_DENSIFICATION_HPP_
			
 
				+
			
 
				+#include "autoware/lidar_transfusion/cuda_utils.hpp"
			
 
				+
			
 
				+#include <tf2_ros/buffer.h>
			
 
				+#include <tf2_ros/transform_listener.h>
			
 
				+#ifdef ROS_DISTRO_GALACTIC
			
 
				+#include <tf2_sensor_msgs/tf2_sensor_msgs.h>
			
 
				+#else
			
 
				+#include <tf2_sensor_msgs/tf2_sensor_msgs.hpp>
			
 
				+#endif
			
 
				+
			
 
				+#include <list>
			
 
				+#include <string>
			
 
				+#include <utility>
			
 
				+
			
 
				+namespace autoware::lidar_transfusion
			
 
				+{
			
 
				+
			
 
				+class DensificationParam
			
 
				+{
			
 
				+public:
			
 
				+  DensificationParam(const std::string & world_frame_id, const unsigned int num_past_frames)
			
 
				+  : world_frame_id_(std::move(world_frame_id)),
			
 
				+    pointcloud_cache_size_(num_past_frames + /*current frame*/ 1)
			
 
				+  {
			
 
				+  }
			
 
				+
			
 
				+  std::string world_frame_id() const { return world_frame_id_; }
			
 
				+  unsigned int pointcloud_cache_size() const { return pointcloud_cache_size_; }
			
 
				+
			
 
				+private:
			
 
				+  std::string world_frame_id_;
			
 
				+  unsigned int pointcloud_cache_size_{1};
			
 
				+};
			
 
				+
			
 
				+struct PointCloudWithTransform
			
 
				+{
			
 
				+  cuda::unique_ptr<uint8_t[]> data_d{nullptr};
			
 
				+  std_msgs::msg::Header header;
			
 
				+  size_t num_points{0};
			
 
				+  Eigen::Affine3f affine_past2world;
			
 
				+};
			
 
				+
			
 
				+class PointCloudDensification
			
 
				+{
			
 
				+public:
			
 
				+  explicit PointCloudDensification(const DensificationParam & param, cudaStream_t & stream);
			
 
				+
			
 
				+  bool enqueuePointCloud(
			
 
				+    const sensor_msgs::msg::PointCloud2 & msg, const tf2_ros::Buffer & tf_buffer);
			
 
				+
			
 
				+  double getCurrentTimestamp() const { return current_timestamp_; }
			
 
				+  Eigen::Affine3f getAffineWorldToCurrent() const { return affine_world2current_; }
			
 
				+  std::list<PointCloudWithTransform>::iterator getPointCloudCacheIter()
			
 
				+  {
			
 
				+    return pointcloud_cache_.begin();
			
 
				+  }
			
 
				+  bool isCacheEnd(std::list<PointCloudWithTransform>::iterator iter)
			
 
				+  {
			
 
				+    return iter == pointcloud_cache_.end();
			
 
				+  }
			
 
				+  size_t getIdx(std::list<PointCloudWithTransform>::iterator iter)
			
 
				+  {
			
 
				+    return std::distance(pointcloud_cache_.begin(), iter);
			
 
				+  }
			
 
				+  size_t getCacheSize()
			
 
				+  {
			
 
				+    return std::distance(pointcloud_cache_.begin(), pointcloud_cache_.end());
			
 
				+  }
			
 
				+  unsigned int pointcloud_cache_size() const { return param_.pointcloud_cache_size(); }
			
 
				+
			
 
				+private:
			
 
				+  void enqueue(const sensor_msgs::msg::PointCloud2 & msg, const Eigen::Affine3f & affine);
			
 
				+  void dequeue();
			
 
				+
			
 
				+  DensificationParam param_;
			
 
				+  double current_timestamp_{0.0};
			
 
				+  Eigen::Affine3f affine_world2current_;
			
 
				+  std::list<PointCloudWithTransform> pointcloud_cache_;
			
 
				+  cudaStream_t stream_;
			
 
				+};
			
 
				+
			
 
				+}  // namespace autoware::lidar_transfusion
			
 
				+
			
 
				+#endif  // AUTOWARE__LIDAR_TRANSFUSION__PREPROCESS__POINTCLOUD_DENSIFICATION_HPP_
			
--- a/src/detection/detection_lidar_transfusion/include/preprocess/preprocess_kernel.hpp
+++ b/src/detection/detection_lidar_transfusion/include/preprocess/preprocess_kernel.hpp
@@ -0,0 +1,73 @@
 
				+// Copyright 2024 TIER IV, Inc.
			
 
				+//
			
 
				+// Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+// you may not use this file except in compliance with the License.
			
 
				+// You may obtain a copy of the License at
			
 
				+//
			
 
				+//     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+//
			
 
				+// Unless required by applicable law or agreed to in writing, software
			
 
				+// distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+// See the License for the specific language governing permissions and
			
 
				+// limitations under the License.
			
 
				+/*
			
 
				+ * SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES.
			
 
				+ * All rights reserved. SPDX-License-Identifier: Apache-2.0
			
 
				+ *
			
 
				+ * Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+ * you may not use this file except in compliance with the License.
			
 
				+ * You may obtain a copy of the License at
			
 
				+ *
			
 
				+ * http://www.apache.org/licenses/LICENSE-2.0
			
 
				+ *
			
 
				+ * Unless required by applicable law or agreed to in writing, software
			
 
				+ * distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+ * See the License for the specific language governing permissions and
			
 
				+ * limitations under the License.
			
 
				+ */
			
 
				+
			
 
				+#ifndef AUTOWARE__LIDAR_TRANSFUSION__PREPROCESS__PREPROCESS_KERNEL_HPP_
			
 
				+#define AUTOWARE__LIDAR_TRANSFUSION__PREPROCESS__PREPROCESS_KERNEL_HPP_
			
 
				+
			
 
				+#include "autoware/lidar_transfusion/cuda_utils.hpp"
			
 
				+#include "autoware/lidar_transfusion/transfusion_config.hpp"
			
 
				+#include "autoware/lidar_transfusion/utils.hpp"
			
 
				+
			
 
				+#include <cuda_runtime_api.h>
			
 
				+
			
 
				+namespace autoware::lidar_transfusion
			
 
				+{
			
 
				+
			
 
				+class PreprocessCuda
			
 
				+{
			
 
				+public:
			
 
				+  PreprocessCuda(const TransfusionConfig & config, cudaStream_t & stream);
			
 
				+
			
 
				+  void generateVoxels(
			
 
				+    float * points, unsigned int points_size, unsigned int * pillar_num, float * voxel_features,
			
 
				+    unsigned int * voxel_num, unsigned int * voxel_idxs);
			
 
				+
			
 
				+  cudaError_t generateVoxels_random_launch(
			
 
				+    float * points, unsigned int points_size, unsigned int * mask, float * voxels);
			
 
				+
			
 
				+  cudaError_t generateBaseFeatures_launch(
			
 
				+    unsigned int * mask, float * voxels, unsigned int * pillar_num, float * voxel_features,
			
 
				+    unsigned int * voxel_num, unsigned int * voxel_idxs);
			
 
				+
			
 
				+  cudaError_t generateSweepPoints_launch(
			
 
				+    const uint8_t * input_data, size_t points_size, int input_point_step, float time_lag,
			
 
				+    const float * transform, float * output_points);
			
 
				+
			
 
				+private:
			
 
				+  TransfusionConfig config_;
			
 
				+  cudaStream_t stream_;
			
 
				+  cuda::unique_ptr<unsigned int[]> mask_{nullptr};
			
 
				+  cuda::unique_ptr<float[]> voxels_{nullptr};
			
 
				+  unsigned int mask_size_;
			
 
				+  unsigned int voxels_size_;
			
 
				+};
			
 
				+}  // namespace autoware::lidar_transfusion
			
 
				+
			
 
				+#endif  // AUTOWARE__LIDAR_TRANSFUSION__PREPROCESS__PREPROCESS_KERNEL_HPP_
			
--- a/src/detection/detection_lidar_transfusion/include/preprocess/voxel_generator.hpp
+++ b/src/detection/detection_lidar_transfusion/include/preprocess/voxel_generator.hpp
@@ -0,0 +1,72 @@
 
				+// Copyright 2024 TIER IV, Inc.
			
 
				+//
			
 
				+// Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+// you may not use this file except in compliance with the License.
			
 
				+// You may obtain a copy of the License at
			
 
				+//
			
 
				+//     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+//
			
 
				+// Unless required by applicable law or agreed to in writing, software
			
 
				+// distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+// See the License for the specific language governing permissions and
			
 
				+// limitations under the License.
			
 
				+
			
 
				+#ifndef AUTOWARE__LIDAR_TRANSFUSION__PREPROCESS__VOXEL_GENERATOR_HPP_
			
 
				+#define AUTOWARE__LIDAR_TRANSFUSION__PREPROCESS__VOXEL_GENERATOR_HPP_
			
 
				+
			
 
				+#include "autoware/lidar_transfusion/cuda_utils.hpp"
			
 
				+#include "autoware/lidar_transfusion/preprocess/pointcloud_densification.hpp"
			
 
				+#include "autoware/lidar_transfusion/preprocess/preprocess_kernel.hpp"
			
 
				+#include "autoware/lidar_transfusion/ros_utils.hpp"
			
 
				+#include "autoware/lidar_transfusion/transfusion_config.hpp"
			
 
				+
			
 
				+#ifdef ROS_DISTRO_GALACTIC
			
 
				+#include <tf2_eigen/tf2_eigen.h>
			
 
				+#else
			
 
				+#include <tf2_eigen/tf2_eigen.hpp>
			
 
				+#endif
			
 
				+
			
 
				+#include <autoware_point_types/types.hpp>
			
 
				+
			
 
				+#include <sensor_msgs/msg/point_cloud2.hpp>
			
 
				+
			
 
				+#include <memory>
			
 
				+#include <string>
			
 
				+#include <tuple>
			
 
				+#include <vector>
			
 
				+
			
 
				+namespace autoware::lidar_transfusion
			
 
				+{
			
 
				+constexpr size_t AFF_MAT_SIZE = 16;  // 4x4 matrix
			
 
				+constexpr size_t MAX_CLOUD_STEP_SIZE = sizeof(autoware_point_types::PointXYZIRCAEDT);
			
 
				+
			
 
				+class VoxelGenerator
			
 
				+{
			
 
				+public:
			
 
				+  explicit VoxelGenerator(
			
 
				+    const DensificationParam & densification_param, const TransfusionConfig & config,
			
 
				+    cudaStream_t & stream);
			
 
				+  std::size_t generateSweepPoints(
			
 
				+    const sensor_msgs::msg::PointCloud2 & msg, cuda::unique_ptr<float[]> & points_d);
			
 
				+  bool enqueuePointCloud(
			
 
				+    const sensor_msgs::msg::PointCloud2 & msg, const tf2_ros::Buffer & tf_buffer);
			
 
				+  void initCloudInfo(const sensor_msgs::msg::PointCloud2 & msg);
			
 
				+  std::tuple<const uint32_t, const uint8_t, const uint8_t> getFieldInfo(
			
 
				+    const sensor_msgs::msg::PointCloud2 & msg, const std::string & field_name);
			
 
				+
			
 
				+private:
			
 
				+  std::unique_ptr<PointCloudDensification> pd_ptr_{nullptr};
			
 
				+  std::unique_ptr<PreprocessCuda> pre_ptr_{nullptr};
			
 
				+  TransfusionConfig config_;
			
 
				+  cuda::unique_ptr<unsigned char[]> cloud_data_d_{nullptr};
			
 
				+  cuda::unique_ptr<float[]> affine_past2current_d_{nullptr};
			
 
				+  std::vector<float> points_;
			
 
				+  cudaStream_t stream_;
			
 
				+  CloudInfo cloud_info_;
			
 
				+  bool is_initialized_{false};
			
 
				+};
			
 
				+
			
 
				+}  // namespace autoware::lidar_transfusion
			
 
				+
			
 
				+#endif  // AUTOWARE__LIDAR_TRANSFUSION__PREPROCESS__VOXEL_GENERATOR_HPP_
			
--- a/src/detection/detection_lidar_transfusion/include/tensorrt_common/logger.hpp
+++ b/src/detection/detection_lidar_transfusion/include/tensorrt_common/logger.hpp
@@ -0,0 +1,552 @@
 
				+/*
			
 
				+ * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
			
 
				+ *
			
 
				+ * Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+ * you may not use this file except in compliance with the License.
			
 
				+ * You may obtain a copy of the License at
			
 
				+ *
			
 
				+ *     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+ *
			
 
				+ * Unless required by applicable law or agreed to in writing, software
			
 
				+ * distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+ * See the License for the specific language governing permissions and
			
 
				+ * limitations under the License.
			
 
				+ */
			
 
				+
			
 
				+#ifndef TENSORRT_COMMON__LOGGER_HPP_
			
 
				+#define TENSORRT_COMMON__LOGGER_HPP_
			
 
				+
			
 
				+#include "NvInferRuntimeCommon.h"
			
 
				+
			
 
				+#include <atomic>
			
 
				+#include <cassert>
			
 
				+#include <ctime>
			
 
				+#include <iomanip>
			
 
				+#include <iostream>
			
 
				+#include <ostream>
			
 
				+#include <sstream>
			
 
				+#include <string>
			
 
				+#include <thread>
			
 
				+
			
 
				+namespace tensorrt_common
			
 
				+{
			
 
				+using Severity = nvinfer1::ILogger::Severity;
			
 
				+
			
 
				+class LogStreamConsumerBuffer : public std::stringbuf
			
 
				+{
			
 
				+public:
			
 
				+  LogStreamConsumerBuffer(std::ostream & stream, const std::string & prefix, bool shouldLog)
			
 
				+  : mOutput(stream), mPrefix(prefix), mShouldLog(shouldLog)
			
 
				+  {
			
 
				+  }
			
 
				+
			
 
				+  LogStreamConsumerBuffer(LogStreamConsumerBuffer && other) : mOutput(other.mOutput) {}
			
 
				+
			
 
				+  ~LogStreamConsumerBuffer()
			
 
				+  {
			
 
				+    // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output
			
 
				+    // sequence std::streambuf::pptr() gives a pointer to the current position of the output
			
 
				+    // sequence if the pointer to the beginning is not equal to the pointer to the current position,
			
 
				+    // call putOutput() to log the output to the stream
			
 
				+    if (pbase() != pptr()) {
			
 
				+      putOutput();
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  // synchronizes the stream buffer and returns 0 on success
			
 
				+  // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
			
 
				+  // resetting the buffer and flushing the stream
			
 
				+  virtual int sync()
			
 
				+  {
			
 
				+    putOutput();
			
 
				+    return 0;
			
 
				+  }
			
 
				+
			
 
				+  void putOutput()
			
 
				+  {
			
 
				+    if (mShouldLog) {
			
 
				+      // prepend timestamp
			
 
				+      // std::time_t timestamp = std::time(nullptr);
			
 
				+      // tm* tm_local = std::localtime(&timestamp);
			
 
				+      mOutput << mPrefix << str();
			
 
				+      // set the buffer to empty
			
 
				+      str("");
			
 
				+      // flush the stream
			
 
				+      mOutput.flush();
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; }
			
 
				+
			
 
				+private:
			
 
				+  std::ostream & mOutput;
			
 
				+  std::string mPrefix;
			
 
				+  bool mShouldLog;
			
 
				+};
			
 
				+
			
 
				+//!
			
 
				+//! \class LogStreamConsumerBase
			
 
				+//! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in
			
 
				+//! LogStreamConsumer
			
 
				+//!
			
 
				+class LogStreamConsumerBase
			
 
				+{
			
 
				+public:
			
 
				+  LogStreamConsumerBase(std::ostream & stream, const std::string & prefix, bool shouldLog)
			
 
				+  : mBuffer(stream, prefix, shouldLog)
			
 
				+  {
			
 
				+  }
			
 
				+
			
 
				+protected:
			
 
				+  LogStreamConsumerBuffer mBuffer;
			
 
				+};
			
 
				+
			
 
				+//!
			
 
				+//! \class LogStreamConsumer
			
 
				+//! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages.
			
 
				+//!  Order of base classes is LogStreamConsumerBase and then std::ostream.
			
 
				+//!  This is because the LogStreamConsumerBase class is used to initialize the
			
 
				+//!  LogStreamConsumerBuffer member field in LogStreamConsumer and then the address of the buffer is
			
 
				+//!  passed to std::ostream. This is necessary to prevent the address of an uninitialized buffer
			
 
				+//!  from being passed to std::ostream. Please do not change the order of the parent classes.
			
 
				+//!
			
 
				+class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream
			
 
				+{
			
 
				+public:
			
 
				+  //! \brief Creates a LogStreamConsumer which logs messages with level severity.
			
 
				+  //!  Reportable severity determines if the messages are severe enough to be logged.
			
 
				+  LogStreamConsumer(Severity reportableSeverity, Severity severity)
			
 
				+  : LogStreamConsumerBase(
			
 
				+      severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity),
			
 
				+    std::ostream(&mBuffer)  // links the stream buffer with the stream
			
 
				+    ,
			
 
				+    mShouldLog(severity <= reportableSeverity),
			
 
				+    mSeverity(severity)
			
 
				+  {
			
 
				+  }
			
 
				+
			
 
				+  LogStreamConsumer(LogStreamConsumer && other)
			
 
				+  : LogStreamConsumerBase(
			
 
				+      severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog),
			
 
				+    std::ostream(&mBuffer)  // links the stream buffer with the stream
			
 
				+    ,
			
 
				+    mShouldLog(other.mShouldLog),
			
 
				+    mSeverity(other.mSeverity)
			
 
				+  {
			
 
				+  }
			
 
				+
			
 
				+  void setReportableSeverity(Severity reportableSeverity)
			
 
				+  {
			
 
				+    mShouldLog = mSeverity <= reportableSeverity;
			
 
				+    mBuffer.setShouldLog(mShouldLog);
			
 
				+  }
			
 
				+
			
 
				+private:
			
 
				+  static std::ostream & severityOstream(Severity severity)
			
 
				+  {
			
 
				+    return severity >= Severity::kINFO ? std::cout : std::cerr;
			
 
				+  }
			
 
				+
			
 
				+  static std::string severityPrefix(Severity severity)
			
 
				+  {
			
 
				+    switch (severity) {
			
 
				+      case Severity::kINTERNAL_ERROR:
			
 
				+        return "[F] ";
			
 
				+      case Severity::kERROR:
			
 
				+        return "[E] ";
			
 
				+      case Severity::kWARNING:
			
 
				+        return "[W] ";
			
 
				+      case Severity::kINFO:
			
 
				+        return "[I] ";
			
 
				+      case Severity::kVERBOSE:
			
 
				+        return "[V] ";
			
 
				+      default:
			
 
				+        assert(0);
			
 
				+        return "";
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  bool mShouldLog;
			
 
				+  Severity mSeverity;
			
 
				+};
			
 
				+
			
 
				+//! \class Logger
			
 
				+//!
			
 
				+//! \brief Class which manages logging of TensorRT tools and samples
			
 
				+//!
			
 
				+//! \details This class provides a common interface for TensorRT tools and samples to log
			
 
				+//! information to the console, and supports logging two types of messages:
			
 
				+//!
			
 
				+//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
			
 
				+//! - Test pass/fail messages
			
 
				+//!
			
 
				+//! The advantage of having all samples use this class for logging as opposed to emitting directly
			
 
				+//! to stdout/stderr is that the logic for controlling the verbosity and formatting of sample output
			
 
				+//! is centralized in one location.
			
 
				+//!
			
 
				+//! In the future, this class could be extended to support dumping test results to a file in some
			
 
				+//! standard format (for example, JUnit XML), and providing additional metadata (e.g. timing the
			
 
				+//! duration of a test run).
			
 
				+//!
			
 
				+//! TODO: For backwards compatibility with existing samples, this class inherits directly from the
			
 
				+//! nvinfer1::ILogger interface, which is problematic since there isn't a clean separation between
			
 
				+//! messages coming from the TensorRT library and messages coming from the sample.
			
 
				+//!
			
 
				+//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger)
			
 
				+//! we can refactor the class to eliminate the inheritance and instead make the nvinfer1::ILogger
			
 
				+//! implementation a member of the Logger object.
			
 
				+
			
 
				+class Logger : public nvinfer1::ILogger  // NOLINT
			
 
				+{
			
 
				+public:
			
 
				+  //  Logger(Severity severity = Severity::kWARNING)
			
 
				+  //  Logger(Severity severity = Severity::kVERBOSE)
			
 
				+  explicit Logger(Severity severity = Severity::kINFO)
			
 
				+  : mReportableSeverity(severity), mVerbose(true), mThrottleStopFlag(false)
			
 
				+  {
			
 
				+  }
			
 
				+
			
 
				+  explicit Logger(const bool verbose, Severity severity = Severity::kINFO)
			
 
				+  : mReportableSeverity(severity), mVerbose(verbose), mThrottleStopFlag(false)
			
 
				+  {
			
 
				+  }
			
 
				+
			
 
				+  //!
			
 
				+  //! \enum TestResult
			
 
				+  //! \brief Represents the state of a given test
			
 
				+  //!
			
 
				+  enum class TestResult {
			
 
				+    kRUNNING,  //!< The test is running
			
 
				+    kPASSED,   //!< The test passed
			
 
				+    kFAILED,   //!< The test failed
			
 
				+    kWAIVED    //!< The test was waived
			
 
				+  };
			
 
				+
			
 
				+  //!
			
 
				+  //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this
			
 
				+  //! Logger \return The nvinfer1::ILogger associated with this Logger
			
 
				+  //!
			
 
				+  //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
			
 
				+  //! we can eliminate the inheritance of Logger from ILogger
			
 
				+  //!
			
 
				+  nvinfer1::ILogger & getTRTLogger()
			
 
				+  {
			
 
				+    printf("verbose\n");
			
 
				+    return *this;
			
 
				+  }
			
 
				+
			
 
				+  //!
			
 
				+  //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
			
 
				+  //!
			
 
				+  //! Note samples should not be calling this function directly; it will eventually go away once we
			
 
				+  //! eliminate the inheritance from nvinfer1::ILogger
			
 
				+  //!
			
 
				+  void log(Severity severity, const char * msg) noexcept override
			
 
				+  {
			
 
				+    if (mVerbose) {
			
 
				+      LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl;
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  /**
			
 
				+   * @brief Logging with throttle.
			
 
				+   *
			
 
				+   * @example
			
 
				+   * Logger logger();
			
 
				+   * auto log_thread = logger.log_throttle(nvinfer1::ILogger::Severity::kINFO, "SOME MSG", 1);
			
 
				+   * // some operation
			
 
				+   * logger.stop_throttle(log_thread);
			
 
				+   *
			
 
				+   * @param severity
			
 
				+   * @param msg
			
 
				+   * @param duration
			
 
				+   * @return std::thread
			
 
				+   *
			
 
				+   */
			
 
				+  std::thread log_throttle(Severity severity, const char * msg, const int duration) noexcept
			
 
				+  {
			
 
				+    mThrottleStopFlag.store(false);
			
 
				+    auto log_func = [this](Severity s, const char * m, const int d) {
			
 
				+      while (!mThrottleStopFlag.load()) {
			
 
				+        this->log(s, m);
			
 
				+        std::this_thread::sleep_for(std::chrono::seconds(d));
			
 
				+      }
			
 
				+    };
			
 
				+
			
 
				+    std::thread log_thread(log_func, severity, msg, duration);
			
 
				+    return log_thread;
			
 
				+  }
			
 
				+
			
 
				+  void stop_throttle(std::thread & log_thread) noexcept
			
 
				+  {
			
 
				+    mThrottleStopFlag.store(true);
			
 
				+    log_thread.join();
			
 
				+  }
			
 
				+
			
 
				+  //!
			
 
				+  //! \brief Method for controlling the verbosity of logging output
			
 
				+  //!
			
 
				+  //! \param severity The logger will only emit messages that have severity of this level or higher.
			
 
				+  //!
			
 
				+  void setReportableSeverity(Severity severity) { mReportableSeverity = severity; }
			
 
				+
			
 
				+  //!
			
 
				+  //! \brief Opaque handle that holds logging information for a particular test
			
 
				+  //!
			
 
				+  //! This object is an opaque handle to information used by the Logger to print test results.
			
 
				+  //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
			
 
				+  //! with Logger::reportTest{Start,End}().
			
 
				+  //!
			
 
				+  class TestAtom
			
 
				+  {
			
 
				+  public:
			
 
				+    TestAtom(TestAtom &&) = default;
			
 
				+
			
 
				+  private:
			
 
				+    friend class Logger;
			
 
				+
			
 
				+    TestAtom(bool started, const std::string & name, const std::string & cmdline)
			
 
				+    : mStarted(started), mName(name), mCmdline(cmdline)
			
 
				+    {
			
 
				+    }
			
 
				+
			
 
				+    bool mStarted;
			
 
				+    std::string mName;
			
 
				+    std::string mCmdline;
			
 
				+  };
			
 
				+
			
 
				+  //!
			
 
				+  //! \brief Define a test for logging
			
 
				+  //!
			
 
				+  //! \param[in] name The name of the test.  This should be a string starting with
			
 
				+  //!                  "TensorRT" and containing dot-separated strings containing
			
 
				+  //!                  the characters [A-Za-z0-9_].
			
 
				+  //!                  For example, "TensorRT.sample_googlenet"
			
 
				+  //! \param[in] cmdline The command line used to reproduce the test
			
 
				+  //
			
 
				+  //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
			
 
				+  //!
			
 
				+  static TestAtom defineTest(const std::string & name, const std::string & cmdline)
			
 
				+  {
			
 
				+    return TestAtom(false, name, cmdline);
			
 
				+  }
			
 
				+
			
 
				+  //!
			
 
				+  //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line
			
 
				+  //! arguments
			
 
				+  //!        as input
			
 
				+  //!
			
 
				+  //! \param[in] name The name of the test
			
 
				+  //! \param[in] argc The number of command-line arguments
			
 
				+  //! \param[in] argv The array of command-line arguments (given as C strings)
			
 
				+  //!
			
 
				+  //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
			
 
				+  static TestAtom defineTest(const std::string & name, int argc, char const * const * argv)
			
 
				+  {
			
 
				+    auto cmdline = genCmdlineString(argc, argv);
			
 
				+    return defineTest(name, cmdline);
			
 
				+  }
			
 
				+
			
 
				+  //!
			
 
				+  //! \brief Report that a test has started.
			
 
				+  //!
			
 
				+  //! \pre reportTestStart() has not been called yet for the given testAtom
			
 
				+  //!
			
 
				+  //! \param[in] testAtom The handle to the test that has started
			
 
				+  //!
			
 
				+  static void reportTestStart(TestAtom & testAtom)
			
 
				+  {
			
 
				+    reportTestResult(testAtom, TestResult::kRUNNING);
			
 
				+    assert(!testAtom.mStarted);
			
 
				+    testAtom.mStarted = true;
			
 
				+  }
			
 
				+
			
 
				+  //!
			
 
				+  //! \brief Report that a test has ended.
			
 
				+  //!
			
 
				+  //! \pre reportTestStart() has been called for the given testAtom
			
 
				+  //!
			
 
				+  //! \param[in] testAtom The handle to the test that has ended
			
 
				+  //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
			
 
				+  //!                   TestResult::kFAILED, TestResult::kWAIVED
			
 
				+  //!
			
 
				+  static void reportTestEnd(const TestAtom & testAtom, TestResult result)
			
 
				+  {
			
 
				+    assert(result != TestResult::kRUNNING);
			
 
				+    assert(testAtom.mStarted);
			
 
				+    reportTestResult(testAtom, result);
			
 
				+  }
			
 
				+
			
 
				+  static int reportPass(const TestAtom & testAtom)
			
 
				+  {
			
 
				+    reportTestEnd(testAtom, TestResult::kPASSED);
			
 
				+    return EXIT_SUCCESS;
			
 
				+  }
			
 
				+
			
 
				+  static int reportFail(const TestAtom & testAtom)
			
 
				+  {
			
 
				+    reportTestEnd(testAtom, TestResult::kFAILED);
			
 
				+    return EXIT_FAILURE;
			
 
				+  }
			
 
				+
			
 
				+  static int reportWaive(const TestAtom & testAtom)
			
 
				+  {
			
 
				+    reportTestEnd(testAtom, TestResult::kWAIVED);
			
 
				+    return EXIT_SUCCESS;
			
 
				+  }
			
 
				+
			
 
				+  static int reportTest(const TestAtom & testAtom, bool pass)
			
 
				+  {
			
 
				+    return pass ? reportPass(testAtom) : reportFail(testAtom);
			
 
				+  }
			
 
				+
			
 
				+  Severity getReportableSeverity() const { return mReportableSeverity; }
			
 
				+
			
 
				+private:
			
 
				+  //!
			
 
				+  //! \brief returns an appropriate string for prefixing a log message with the given severity
			
 
				+  //!
			
 
				+  static const char * severityPrefix(Severity severity)
			
 
				+  {
			
 
				+    switch (severity) {
			
 
				+      case Severity::kINTERNAL_ERROR:
			
 
				+        return "[F] ";
			
 
				+      case Severity::kERROR:
			
 
				+        return "[E] ";
			
 
				+      case Severity::kWARNING:
			
 
				+        return "[W] ";
			
 
				+      case Severity::kINFO:
			
 
				+        return "[I] ";
			
 
				+      case Severity::kVERBOSE:
			
 
				+        return "[V] ";
			
 
				+      default:
			
 
				+        assert(0);
			
 
				+        return "";
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  //!
			
 
				+  //! \brief returns an appropriate string for prefixing a test result message with the given result
			
 
				+  //!
			
 
				+  static const char * testResultString(TestResult result)
			
 
				+  {
			
 
				+    switch (result) {
			
 
				+      case TestResult::kRUNNING:
			
 
				+        return "RUNNING";
			
 
				+      case TestResult::kPASSED:
			
 
				+        return "PASSED";
			
 
				+      case TestResult::kFAILED:
			
 
				+        return "FAILED";
			
 
				+      case TestResult::kWAIVED:
			
 
				+        return "WAIVED";
			
 
				+      default:
			
 
				+        assert(0);
			
 
				+        return "";
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  //!
			
 
				+  //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
			
 
				+  //!
			
 
				+  static std::ostream & severityOstream(Severity severity)
			
 
				+  {
			
 
				+    return severity >= Severity::kINFO ? std::cout : std::cerr;
			
 
				+  }
			
 
				+
			
 
				+  //!
			
 
				+  //! \brief method that implements logging test results
			
 
				+  //!
			
 
				+  static void reportTestResult(const TestAtom & testAtom, TestResult result)
			
 
				+  {
			
 
				+    severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName
			
 
				+                                     << " # " << testAtom.mCmdline << std::endl;
			
 
				+  }
			
 
				+
			
 
				+  //!
			
 
				+  //! \brief generate a command line string from the given (argc, argv) values
			
 
				+  //!
			
 
				+  static std::string genCmdlineString(int argc, char const * const * argv)
			
 
				+  {
			
 
				+    std::stringstream ss;
			
 
				+    for (int i = 0; i < argc; i++) {
			
 
				+      if (i > 0) ss << " ";
			
 
				+      ss << argv[i];
			
 
				+    }
			
 
				+    return ss.str();
			
 
				+  }
			
 
				+
			
 
				+  Severity mReportableSeverity;
			
 
				+  bool mVerbose;
			
 
				+  std::atomic<bool> mThrottleStopFlag;
			
 
				+};
			
 
				+
			
 
				+namespace
			
 
				+{
			
 
				+
			
 
				+//!
			
 
				+//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE
			
 
				+//!
			
 
				+//! Example usage:
			
 
				+//!
			
 
				+//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
			
 
				+//!
			
 
				+inline LogStreamConsumer LOG_VERBOSE(const Logger & logger)
			
 
				+{
			
 
				+  return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE);
			
 
				+}
			
 
				+
			
 
				+//!
			
 
				+//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO
			
 
				+//!
			
 
				+//! Example usage:
			
 
				+//!
			
 
				+//!     LOG_INFO(logger) << "hello world" << std::endl;
			
 
				+//!
			
 
				+inline LogStreamConsumer LOG_INFO(const Logger & logger)
			
 
				+{
			
 
				+  return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO);
			
 
				+}
			
 
				+
			
 
				+//!
			
 
				+//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING
			
 
				+//!
			
 
				+//! Example usage:
			
 
				+//!
			
 
				+//!     LOG_WARN(logger) << "hello world" << std::endl;
			
 
				+//!
			
 
				+inline LogStreamConsumer LOG_WARN(const Logger & logger)
			
 
				+{
			
 
				+  return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING);
			
 
				+}
			
 
				+
			
 
				+//!
			
 
				+//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR
			
 
				+//!
			
 
				+//! Example usage:
			
 
				+//!
			
 
				+//!     LOG_ERROR(logger) << "hello world" << std::endl;
			
 
				+//!
			
 
				+inline LogStreamConsumer LOG_ERROR(const Logger & logger)
			
 
				+{
			
 
				+  return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR);
			
 
				+}
			
 
				+
			
 
				+//!
			
 
				+//! \brief produces a LogStreamConsumer object that can be used to log messages of severity
			
 
				+//! kINTERNAL_ERROR
			
 
				+//         ("fatal" severity)
			
 
				+//!
			
 
				+//! Example usage:
			
 
				+//!
			
 
				+//!     LOG_FATAL(logger) << "hello world" << std::endl;
			
 
				+//!
			
 
				+inline LogStreamConsumer LOG_FATAL(const Logger & logger)
			
 
				+{
			
 
				+  return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR);
			
 
				+}
			
 
				+
			
 
				+}  // anonymous namespace
			
 
				+}  // namespace tensorrt_common
			
 
				+
			
 
				+#endif  // TENSORRT_COMMON__LOGGER_HPP_
			
--- a/src/detection/detection_lidar_transfusion/include/tensorrt_common/simple_profiler.hpp
+++ b/src/detection/detection_lidar_transfusion/include/tensorrt_common/simple_profiler.hpp
@@ -0,0 +1,70 @@
 
				+// Copyright 2023 TIER IV, Inc.
			
 
				+//
			
 
				+// Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+// you may not use this file except in compliance with the License.
			
 
				+// You may obtain a copy of the License at
			
 
				+//
			
 
				+//     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+//
			
 
				+// Unless required by applicable law or agreed to in writing, software
			
 
				+// distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+// See the License for the specific language governing permissions and
			
 
				+// limitations under the License.
			
 
				+
			
 
				+#ifndef TENSORRT_COMMON__SIMPLE_PROFILER_HPP_
			
 
				+#define TENSORRT_COMMON__SIMPLE_PROFILER_HPP_
			
 
				+
			
 
				+#include <NvInfer.h>
			
 
				+
			
 
				+#include <iostream>
			
 
				+#include <map>
			
 
				+#include <string>
			
 
				+#include <vector>
			
 
				+
			
 
				+namespace tensorrt_common
			
 
				+{
			
 
				+struct LayerInfo
			
 
				+{
			
 
				+  int in_c;
			
 
				+  int out_c;
			
 
				+  int w;
			
 
				+  int h;
			
 
				+  int k;
			
 
				+  int stride;
			
 
				+  int groups;
			
 
				+  nvinfer1::LayerType type;
			
 
				+};
			
 
				+
			
 
				+/**
			
 
				+ * @class Profiler
			
 
				+ * @brief Collect per-layer profile information, assuming times are reported in the same order
			
 
				+ */
			
 
				+class SimpleProfiler : public nvinfer1::IProfiler
			
 
				+{
			
 
				+public:
			
 
				+  struct Record
			
 
				+  {
			
 
				+    float time{0};
			
 
				+    int count{0};
			
 
				+    float min_time{-1.0};
			
 
				+    int index;
			
 
				+  };
			
 
				+  SimpleProfiler(
			
 
				+    std::string name,
			
 
				+    const std::vector<SimpleProfiler> & src_profilers = std::vector<SimpleProfiler>());
			
 
				+
			
 
				+  void reportLayerTime(const char * layerName, float ms) noexcept override;
			
 
				+
			
 
				+  void setProfDict(nvinfer1::ILayer * layer) noexcept;
			
 
				+
			
 
				+  friend std::ostream & operator<<(std::ostream & out, const SimpleProfiler & value);
			
 
				+
			
 
				+private:
			
 
				+  std::string m_name;
			
 
				+  std::map<std::string, Record> m_profile;
			
 
				+  int m_index;
			
 
				+  std::map<std::string, LayerInfo> m_layer_dict;
			
 
				+};
			
 
				+}  // namespace tensorrt_common
			
 
				+#endif  // TENSORRT_COMMON__SIMPLE_PROFILER_HPP_
			
--- a/src/detection/detection_lidar_transfusion/include/tensorrt_common/tensorrt_common.hpp
+++ b/src/detection/detection_lidar_transfusion/include/tensorrt_common/tensorrt_common.hpp
@@ -0,0 +1,232 @@
 
				+// Copyright 2023 TIER IV, Inc.
			
 
				+//
			
 
				+// Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+// you may not use this file except in compliance with the License.
			
 
				+// You may obtain a copy of the License at
			
 
				+//
			
 
				+//     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+//
			
 
				+// Unless required by applicable law or agreed to in writing, software
			
 
				+// distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+// See the License for the specific language governing permissions and
			
 
				+// limitations under the License.
			
 
				+
			
 
				+#ifndef TENSORRT_COMMON__TENSORRT_COMMON_HPP_
			
 
				+#define TENSORRT_COMMON__TENSORRT_COMMON_HPP_
			
 
				+
			
 
				+//#ifndef YOLOX_STANDALONE
			
 
				+//#include <rclcpp/rclcpp.hpp>
			
 
				+//#endif
			
 
				+
			
 
				+#include <NvInfer.h>
			
 
				+#include <NvOnnxParser.h>
			
 
				+
			
 
				+#if (defined(_MSC_VER) or (defined(__GNUC__) and (7 <= __GNUC_MAJOR__)))
			
 
				+#include <filesystem>
			
 
				+namespace fs = ::std::filesystem;
			
 
				+#else
			
 
				+#include <experimental/filesystem>
			
 
				+namespace fs = ::std::experimental::filesystem;
			
 
				+#endif
			
 
				+
			
 
				+#include <tensorrt_common/logger.hpp>
			
 
				+#include <tensorrt_common/simple_profiler.hpp>
			
 
				+
			
 
				+#include <memory>
			
 
				+#include <sstream>
			
 
				+#include <string>
			
 
				+#include <vector>
			
 
				+#include <algorithm>
			
 
				+
			
 
				+namespace tensorrt_common
			
 
				+{
			
 
				+/**
			
 
				+ * @struct BuildConfig
			
 
				+ * @brief Configuration to provide fine control regarding TensorRT builder
			
 
				+ */
			
 
				+struct BuildConfig
			
 
				+{
			
 
				+  // type for calibration
			
 
				+  std::string calib_type_str;
			
 
				+
			
 
				+  // DLA core ID that the process uses
			
 
				+  int dla_core_id;
			
 
				+
			
 
				+  // flag for partial quantization in first layer
			
 
				+  bool quantize_first_layer;  // For partial quantization
			
 
				+
			
 
				+  // flag for partial quantization in last layer
			
 
				+  bool quantize_last_layer;  // For partial quantization
			
 
				+
			
 
				+  // flag for per-layer profiler using IProfiler
			
 
				+  bool profile_per_layer;
			
 
				+
			
 
				+  // clip value for implicit quantization
			
 
				+  double clip_value;  // For implicit quantization
			
 
				+
			
 
				+  // Supported calibration type
			
 
				+  const std::array<std::string, 4> valid_calib_type = {"Entropy", "Legacy", "Percentile", "MinMax"};
			
 
				+
			
 
				+  BuildConfig()
			
 
				+  : calib_type_str("MinMax"),
			
 
				+    dla_core_id(-1),
			
 
				+    quantize_first_layer(false),
			
 
				+    quantize_last_layer(false),
			
 
				+    profile_per_layer(false),
			
 
				+    clip_value(0.0)
			
 
				+  {
			
 
				+  }
			
 
				+
			
 
				+  explicit BuildConfig(
			
 
				+    const std::string & calib_type_str, const int dla_core_id = -1,
			
 
				+    const bool quantize_first_layer = false, const bool quantize_last_layer = false,
			
 
				+    const bool profile_per_layer = false, const double clip_value = 0.0)
			
 
				+  : calib_type_str(calib_type_str),
			
 
				+    dla_core_id(dla_core_id),
			
 
				+    quantize_first_layer(quantize_first_layer),
			
 
				+    quantize_last_layer(quantize_last_layer),
			
 
				+    profile_per_layer(profile_per_layer),
			
 
				+    clip_value(clip_value)
			
 
				+  {
			
 
				+#ifndef YOLOX_STANDALONE
			
 
				+    if (
			
 
				+      std::find(valid_calib_type.begin(), valid_calib_type.end(), calib_type_str) ==
			
 
				+      valid_calib_type.end()) {
			
 
				+      std::stringstream message;
			
 
				+      message << "Invalid calibration type was specified: " << calib_type_str << std::endl
			
 
				+              << "Valid value is one of: [Entropy, (Legacy | Percentile), MinMax]" << std::endl
			
 
				+              << "Default calibration type will be used: MinMax" << std::endl;
			
 
				+      std::cerr << message.str();
			
 
				+    }
			
 
				+#endif
			
 
				+  }
			
 
				+};
			
 
				+
			
 
				+nvinfer1::Dims get_input_dims(const std::string & onnx_file_path);
			
 
				+
			
 
				+const std::array<std::string, 3> valid_precisions = {"fp32", "fp16", "int8"};
			
 
				+bool is_valid_precision_string(const std::string & precision);
			
 
				+
			
 
				+template <typename T>
			
 
				+struct InferDeleter  // NOLINT
			
 
				+{
			
 
				+  void operator()(T * obj) const
			
 
				+  {
			
 
				+    if (obj) {
			
 
				+#if TENSORRT_VERSION_MAJOR >= 8
			
 
				+      delete obj;
			
 
				+#else
			
 
				+      obj->destroy();
			
 
				+#endif
			
 
				+    }
			
 
				+  }
			
 
				+};
			
 
				+
			
 
				+template <typename T>
			
 
				+using TrtUniquePtr = std::unique_ptr<T, InferDeleter<T>>;
			
 
				+
			
 
				+using BatchConfig = std::array<int32_t, 3>;
			
 
				+
			
 
				+/**
			
 
				+ * @class TrtCommon
			
 
				+ * @brief TensorRT common library
			
 
				+ */
			
 
				+class TrtCommon  // NOLINT
			
 
				+{
			
 
				+public:
			
 
				+  /**
			
 
				+   * @brief Construct TrtCommon.
			
 
				+   * @param[in] mode_path ONNX model_path
			
 
				+   * @param[in] precision precision for inference
			
 
				+   * @param[in] calibrator pointer for any type of INT8 calibrator
			
 
				+   * @param[in] batch_config configuration for batched execution
			
 
				+   * @param[in] max_workspace_size maximum workspace for building TensorRT engine
			
 
				+   * @param[in] buildConfig configuration including precision, calibration method, dla, remaining
			
 
				+   * fp16 for first layer,  remaining fp16 for last layer and profiler for builder
			
 
				+   * @param[in] plugin_paths path for custom plugin
			
 
				+   */
			
 
				+  TrtCommon(
			
 
				+    const std::string & model_path, const std::string & precision,
			
 
				+    std::unique_ptr<nvinfer1::IInt8Calibrator> calibrator = nullptr,
			
 
				+    const BatchConfig & batch_config = {1, 1, 1}, const size_t max_workspace_size = (16 << 20),
			
 
				+    const BuildConfig & buildConfig = BuildConfig(),
			
 
				+    const std::vector<std::string> & plugin_paths = {});
			
 
				+
			
 
				+  /**
			
 
				+   * @brief Deconstruct TrtCommon
			
 
				+   */
			
 
				+  ~TrtCommon();
			
 
				+
			
 
				+  /**
			
 
				+   * @brief Load TensorRT engine
			
 
				+   * @param[in] engine_file_path path for a engine file
			
 
				+   * @return flag for whether loading are succeeded or failed
			
 
				+   */
			
 
				+  bool loadEngine(const std::string & engine_file_path);
			
 
				+
			
 
				+  /**
			
 
				+   * @brief Output layer information including GFLOPs and parameters
			
 
				+   * @param[in] onnx_file_path path for a onnx file
			
 
				+   * @warning This function is based on darknet log.
			
 
				+   */
			
 
				+  void printNetworkInfo(const std::string & onnx_file_path);
			
 
				+
			
 
				+  /**
			
 
				+   * @brief build TensorRT engine from ONNX
			
 
				+   * @param[in] onnx_file_path path for a onnx file
			
 
				+   * @param[in] output_engine_file_path path for a engine file
			
 
				+   */
			
 
				+  bool buildEngineFromOnnx(
			
 
				+    const std::string & onnx_file_path, const std::string & output_engine_file_path);
			
 
				+
			
 
				+  /**
			
 
				+   * @brief setup for TensorRT execution including building and loading engine
			
 
				+   */
			
 
				+  void setup();
			
 
				+
			
 
				+  bool isInitialized();
			
 
				+
			
 
				+  nvinfer1::Dims getBindingDimensions(const int32_t index) const;
			
 
				+  int32_t getNbBindings();
			
 
				+  bool setBindingDimensions(const int32_t index, const nvinfer1::Dims & dimensions) const;
			
 
				+  bool enqueueV2(void ** bindings, cudaStream_t stream, cudaEvent_t * input_consumed);
			
 
				+
			
 
				+  /**
			
 
				+   * @brief output per-layer information
			
 
				+   */
			
 
				+  void printProfiling(void);
			
 
				+
			
 
				+#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH >= 8200
			
 
				+  /**
			
 
				+   * @brief get per-layer information for trt-engine-profiler
			
 
				+   */
			
 
				+  std::string getLayerInformation(nvinfer1::LayerInformationFormat format);
			
 
				+#endif
			
 
				+
			
 
				+private:
			
 
				+  Logger logger_;
			
 
				+  fs::path model_file_path_;
			
 
				+  TrtUniquePtr<nvinfer1::IRuntime> runtime_;
			
 
				+  TrtUniquePtr<nvinfer1::ICudaEngine> engine_;
			
 
				+  TrtUniquePtr<nvinfer1::IExecutionContext> context_;
			
 
				+  std::unique_ptr<nvinfer1::IInt8Calibrator> calibrator_;
			
 
				+
			
 
				+  nvinfer1::Dims input_dims_;
			
 
				+  nvinfer1::Dims output_dims_;
			
 
				+  std::string precision_;
			
 
				+  BatchConfig batch_config_;
			
 
				+  size_t max_workspace_size_;
			
 
				+  bool is_initialized_{false};
			
 
				+
			
 
				+  // profiler for per-layer
			
 
				+  SimpleProfiler model_profiler_;
			
 
				+  // profiler for whole model
			
 
				+  SimpleProfiler host_profiler_;
			
 
				+
			
 
				+  std::unique_ptr<const BuildConfig> build_config_;
			
 
				+};
			
 
				+
			
 
				+}  // namespace tensorrt_common
			
 
				+
			
 
				+#endif  // TENSORRT_COMMON__TENSORRT_COMMON_HPP_
			
--- a/src/detection/detection_lidar_transfusion/main.cpp
+++ b/src/detection/detection_lidar_transfusion/main.cpp
@@ -0,0 +1,8 @@
 
				+#include <QCoreApplication>
			
 
				+
			
 
				+int main(int argc, char *argv[])
			
 
				+{
			
 
				+    QCoreApplication a(argc, argv);
			
 
				+
			
 
				+    return a.exec();
			
 
				+}
			
--- a/src/detection/detection_lidar_transfusion/network/network_trt.cpp
+++ b/src/detection/detection_lidar_transfusion/network/network_trt.cpp
@@ -0,0 +1,333 @@
 
				+// Copyright 2024 TIER IV, Inc.
			
 
				+//
			
 
				+// Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+// you may not use this file except in compliance with the License.
			
 
				+// You may obtain a copy of the License at
			
 
				+//
			
 
				+//     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+//
			
 
				+// Unless required by applicable law or agreed to in writing, software
			
 
				+// distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+// See the License for the specific language governing permissions and
			
 
				+// limitations under the License.
			
 
				+
			
 
				+#include "include/network/network_trt.hpp"
			
 
				+
			
 
				+#include <NvOnnxParser.h>
			
 
				+
			
 
				+#include <fstream>
			
 
				+#include <memory>
			
 
				+#include <string>
			
 
				+#include <bits/stdc++.h>
			
 
				+
			
 
				+namespace autoware::lidar_transfusion
			
 
				+{
			
 
				+
			
 
				+std::ostream & operator<<(std::ostream & os, const ProfileDimension & profile)
			
 
				+{
			
 
				+  std::string delim = "";
			
 
				+  os << "min->[";
			
 
				+  for (int i = 0; i < profile.min.nbDims; ++i) {
			
 
				+    os << delim << profile.min.d[i];
			
 
				+    delim = ", ";
			
 
				+  }
			
 
				+  os << "], opt->[";
			
 
				+  delim = "";
			
 
				+  for (int i = 0; i < profile.opt.nbDims; ++i) {
			
 
				+    os << delim << profile.opt.d[i];
			
 
				+    delim = ", ";
			
 
				+  }
			
 
				+  os << "], max->[";
			
 
				+  delim = "";
			
 
				+  for (int i = 0; i < profile.max.nbDims; ++i) {
			
 
				+    os << delim << profile.max.d[i];
			
 
				+    delim = ", ";
			
 
				+  }
			
 
				+  os << "]";
			
 
				+  return os;
			
 
				+}
			
 
				+
			
 
				+NetworkTRT::NetworkTRT(const TransfusionConfig & config) : config_(config)
			
 
				+{
			
 
				+  ProfileDimension voxels_dims = {
			
 
				+    nvinfer1::Dims3(
			
 
				+      config_.min_voxel_size_, config_.min_point_in_voxel_size_, config_.min_network_feature_size_),
			
 
				+    nvinfer1::Dims3(
			
 
				+      config_.opt_voxel_size_, config_.opt_point_in_voxel_size_, config_.opt_network_feature_size_),
			
 
				+    nvinfer1::Dims3(
			
 
				+      config_.max_voxel_size_, config_.max_point_in_voxel_size_,
			
 
				+      config_.max_network_feature_size_)};
			
 
				+  ProfileDimension num_points_dims = {
			
 
				+    nvinfer1::Dims{1, {static_cast<int32_t>(config_.min_points_size_)}},
			
 
				+    nvinfer1::Dims{1, {static_cast<int32_t>(config_.opt_points_size_)}},
			
 
				+    nvinfer1::Dims{1, {static_cast<int32_t>(config_.max_points_size_)}}};
			
 
				+  ProfileDimension coors_dims = {
			
 
				+    nvinfer1::Dims2(config_.min_coors_size_, config_.min_coors_dim_size_),
			
 
				+    nvinfer1::Dims2(config_.opt_coors_size_, config_.opt_coors_dim_size_),
			
 
				+    nvinfer1::Dims2(config_.max_coors_size_, config_.max_coors_dim_size_)};
			
 
				+  in_profile_dims_ = {voxels_dims, num_points_dims, coors_dims};
			
 
				+}
			
 
				+
			
 
				+NetworkTRT::~NetworkTRT()
			
 
				+{
			
 
				+  context.reset();
			
 
				+  runtime_.reset();
			
 
				+  plan_.reset();
			
 
				+  engine.reset();
			
 
				+}
			
 
				+
			
 
				+bool NetworkTRT::init(
			
 
				+  const std::string & onnx_path, const std::string & engine_path, const std::string & precision)
			
 
				+{
			
 
				+  runtime_ =
			
 
				+    tensorrt_common::TrtUniquePtr<nvinfer1::IRuntime>(nvinfer1::createInferRuntime(logger_));
			
 
				+  if (!runtime_) {
			
 
				+    tensorrt_common::LOG_ERROR(logger_) << "Failed to create runtime" << std::endl;
			
 
				+    return false;
			
 
				+  }
			
 
				+
			
 
				+  bool success;
			
 
				+  std::ifstream engine_file(engine_path);
			
 
				+  if (engine_file.is_open()) {
			
 
				+    success = loadEngine(engine_path);
			
 
				+  } else {
			
 
				+    auto log_thread = logger_.log_throttle(
			
 
				+      nvinfer1::ILogger::Severity::kINFO,
			
 
				+      "Applying optimizations and building TRT CUDA engine. Please wait a minutes...", 5);
			
 
				+    success = parseONNX(onnx_path, engine_path, precision);
			
 
				+    logger_.stop_throttle(log_thread);
			
 
				+  }
			
 
				+  success &= createContext();
			
 
				+
			
 
				+  return success;
			
 
				+}
			
 
				+
			
 
				+bool NetworkTRT::setProfile(
			
 
				+  nvinfer1::IBuilder & builder, nvinfer1::INetworkDefinition & network,
			
 
				+  nvinfer1::IBuilderConfig & config)
			
 
				+{
			
 
				+  auto profile = builder.createOptimizationProfile();
			
 
				+
			
 
				+  auto voxels_name = network.getInput(NetworkIO::voxels)->getName();
			
 
				+  auto num_points_name = network.getInput(NetworkIO::num_points)->getName();
			
 
				+  auto coors_name = network.getInput(NetworkIO::coors)->getName();
			
 
				+
			
 
				+  profile->setDimensions(
			
 
				+    voxels_name, nvinfer1::OptProfileSelector::kMIN, in_profile_dims_[NetworkIO::voxels].min);
			
 
				+  profile->setDimensions(
			
 
				+    voxels_name, nvinfer1::OptProfileSelector::kOPT, in_profile_dims_[NetworkIO::voxels].opt);
			
 
				+  profile->setDimensions(
			
 
				+    voxels_name, nvinfer1::OptProfileSelector::kMAX, in_profile_dims_[NetworkIO::voxels].max);
			
 
				+
			
 
				+  profile->setDimensions(
			
 
				+    num_points_name, nvinfer1::OptProfileSelector::kMIN,
			
 
				+    in_profile_dims_[NetworkIO::num_points].min);
			
 
				+  profile->setDimensions(
			
 
				+    num_points_name, nvinfer1::OptProfileSelector::kOPT,
			
 
				+    in_profile_dims_[NetworkIO::num_points].opt);
			
 
				+  profile->setDimensions(
			
 
				+    num_points_name, nvinfer1::OptProfileSelector::kMAX,
			
 
				+    in_profile_dims_[NetworkIO::num_points].max);
			
 
				+
			
 
				+  profile->setDimensions(
			
 
				+    coors_name, nvinfer1::OptProfileSelector::kMIN, in_profile_dims_[NetworkIO::coors].min);
			
 
				+  profile->setDimensions(
			
 
				+    coors_name, nvinfer1::OptProfileSelector::kOPT, in_profile_dims_[NetworkIO::coors].opt);
			
 
				+  profile->setDimensions(
			
 
				+    coors_name, nvinfer1::OptProfileSelector::kMAX, in_profile_dims_[NetworkIO::coors].max);
			
 
				+
			
 
				+  config.addOptimizationProfile(profile);
			
 
				+  return true;
			
 
				+}
			
 
				+
			
 
				+bool NetworkTRT::createContext()
			
 
				+{
			
 
				+  if (!engine) {
			
 
				+    tensorrt_common::LOG_ERROR(logger_)
			
 
				+      << "Failed to create context: Engine was not created" << std::endl;
			
 
				+    return false;
			
 
				+  }
			
 
				+
			
 
				+  context =
			
 
				+    tensorrt_common::TrtUniquePtr<nvinfer1::IExecutionContext>(engine->createExecutionContext());
			
 
				+  if (!context) {
			
 
				+    tensorrt_common::LOG_ERROR(logger_) << "Failed to create context" << std::endl;
			
 
				+    return false;
			
 
				+  }
			
 
				+
			
 
				+  return true;
			
 
				+}
			
 
				+
			
 
				+bool NetworkTRT::parseONNX(
			
 
				+  const std::string & onnx_path, const std::string & engine_path, const std::string & precision,
			
 
				+  const size_t workspace_size)
			
 
				+{
			
 
				+  auto builder =
			
 
				+    tensorrt_common::TrtUniquePtr<nvinfer1::IBuilder>(nvinfer1::createInferBuilder(logger_));
			
 
				+  if (!builder) {
			
 
				+    tensorrt_common::LOG_ERROR(logger_) << "Failed to create builder" << std::endl;
			
 
				+    return false;
			
 
				+  }
			
 
				+
			
 
				+  auto config =
			
 
				+    tensorrt_common::TrtUniquePtr<nvinfer1::IBuilderConfig>(builder->createBuilderConfig());
			
 
				+  if (!config) {
			
 
				+    tensorrt_common::LOG_ERROR(logger_) << "Failed to create config" << std::endl;
			
 
				+    return false;
			
 
				+  }
			
 
				+#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH >= 8400
			
 
				+  config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, workspace_size);
			
 
				+#else
			
 
				+  config->setMaxWorkspaceSize(workspace_size);
			
 
				+#endif
			
 
				+  if (precision == "fp16") {
			
 
				+    if (builder->platformHasFastFp16()) {
			
 
				+      tensorrt_common::LOG_INFO(logger_) << "Using TensorRT FP16 Inference" << std::endl;
			
 
				+      config->setFlag(nvinfer1::BuilderFlag::kFP16);
			
 
				+    } else {
			
 
				+      tensorrt_common::LOG_INFO(logger_)
			
 
				+        << "TensorRT FP16 Inference isn't supported in this environment" << std::endl;
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  const auto flag =
			
 
				+    1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
			
 
				+  auto network =
			
 
				+    tensorrt_common::TrtUniquePtr<nvinfer1::INetworkDefinition>(builder->createNetworkV2(flag));
			
 
				+  if (!network) {
			
 
				+    tensorrt_common::LOG_ERROR(logger_) << "Failed to create network" << std::endl;
			
 
				+    return false;
			
 
				+  }
			
 
				+
			
 
				+  auto parser = tensorrt_common::TrtUniquePtr<nvonnxparser::IParser>(
			
 
				+    nvonnxparser::createParser(*network, logger_));
			
 
				+  parser->parseFromFile(onnx_path.c_str(), static_cast<int>(nvinfer1::ILogger::Severity::kERROR));
			
 
				+
			
 
				+  if (!setProfile(*builder, *network, *config)) {
			
 
				+    tensorrt_common::LOG_ERROR(logger_) << "Failed to set profile" << std::endl;
			
 
				+    return false;
			
 
				+  }
			
 
				+
			
 
				+  plan_ = tensorrt_common::TrtUniquePtr<nvinfer1::IHostMemory>(
			
 
				+    builder->buildSerializedNetwork(*network, *config));
			
 
				+  if (!plan_) {
			
 
				+    tensorrt_common::LOG_ERROR(logger_) << "Failed to create serialized network" << std::endl;
			
 
				+    return false;
			
 
				+  }
			
 
				+  engine = tensorrt_common::TrtUniquePtr<nvinfer1::ICudaEngine>(
			
 
				+    runtime_->deserializeCudaEngine(plan_->data(), plan_->size()));
			
 
				+  if (!engine) {
			
 
				+    tensorrt_common::LOG_ERROR(logger_) << "Failed to create engine" << std::endl;
			
 
				+    return false;
			
 
				+  }
			
 
				+
			
 
				+  return saveEngine(engine_path);
			
 
				+}
			
 
				+
			
 
				+bool NetworkTRT::saveEngine(const std::string & engine_path)
			
 
				+{
			
 
				+  tensorrt_common::LOG_INFO(logger_) << "Writing to " << engine_path << std::endl;
			
 
				+  std::ofstream file(engine_path, std::ios::out | std::ios::binary);
			
 
				+  file.write(reinterpret_cast<const char *>(plan_->data()), plan_->size());
			
 
				+  return validateNetworkIO();
			
 
				+}
			
 
				+
			
 
				+bool NetworkTRT::loadEngine(const std::string & engine_path)
			
 
				+{
			
 
				+  std::ifstream engine_file(engine_path);
			
 
				+  std::stringstream engine_buffer;
			
 
				+  engine_buffer << engine_file.rdbuf();
			
 
				+  std::string engine_str = engine_buffer.str();
			
 
				+  engine = tensorrt_common::TrtUniquePtr<nvinfer1::ICudaEngine>(runtime_->deserializeCudaEngine(
			
 
				+    reinterpret_cast<const void *>(engine_str.data()), engine_str.size()));
			
 
				+  tensorrt_common::LOG_INFO(logger_) << "Loaded engine from " << engine_path << std::endl;
			
 
				+  return validateNetworkIO();
			
 
				+}
			
 
				+
			
 
				+bool NetworkTRT::validateNetworkIO()
			
 
				+{
			
 
				+  // Whether the number of IO match the expected size
			
 
				+  if (engine->getNbIOTensors() != NetworkIO::ENUM_SIZE) {
			
 
				+    tensorrt_common::LOG_ERROR(logger_)
			
 
				+      << "Invalid network IO. Expected size: " << NetworkIO::ENUM_SIZE
			
 
				+      << ". Actual size: " << engine->getNbIOTensors() << "." << std::endl;
			
 
				+    throw std::runtime_error("Failed to initialize TRT network.");
			
 
				+  }
			
 
				+  for (int i = 0; i < NetworkIO::ENUM_SIZE; ++i) {
			
 
				+    tensors_names_.push_back(engine->getIOTensorName(i));
			
 
				+  }
			
 
				+
			
 
				+  // Log the network IO
			
 
				+  std::string tensors = std::accumulate(
			
 
				+    tensors_names_.begin(), tensors_names_.end(), std::string(),
			
 
				+    [](const std::string & a, const std::string & b) -> std::string { return a + b + " "; });
			
 
				+  tensorrt_common::LOG_INFO(logger_) << "Network IO: " << tensors << std::endl;
			
 
				+
			
 
				+  // Whether the current engine input profile match the config input profile
			
 
				+  for (int i = 0; i <= NetworkIO::coors; ++i) {
			
 
				+    ProfileDimension engine_dims{
			
 
				+      engine->getProfileShape(tensors_names_[i], 0, nvinfer1::OptProfileSelector::kMIN),
			
 
				+      engine->getProfileShape(tensors_names_[i], 0, nvinfer1::OptProfileSelector::kOPT),
			
 
				+      engine->getProfileShape(tensors_names_[i], 0, nvinfer1::OptProfileSelector::kMAX)};
			
 
				+
			
 
				+    tensorrt_common::LOG_INFO(logger_)
			
 
				+      << "Profile for " << tensors_names_[i] << ": " << engine_dims << std::endl;
			
 
				+
			
 
				+    if (engine_dims != in_profile_dims_[i]) {
			
 
				+      tensorrt_common::LOG_ERROR(logger_)
			
 
				+        << "Invalid network input dimension. Config: " << in_profile_dims_[i]
			
 
				+        << ". Please change the input profile or delete the engine file and build engine again."
			
 
				+        << std::endl;
			
 
				+      throw std::runtime_error("Failed to initialize TRT network.");
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  // Whether the IO tensor shapes match the network config, -1 for dynamic size
			
 
				+  validateTensorShape(
			
 
				+    NetworkIO::voxels, {-1, static_cast<int>(config_.points_per_voxel_),
			
 
				+                        static_cast<int>(config_.num_point_feature_size_)});
			
 
				+  validateTensorShape(NetworkIO::num_points, {-1});
			
 
				+  validateTensorShape(NetworkIO::coors, {-1, static_cast<int>(config_.num_point_values_)});
			
 
				+  auto cls_score = validateTensorShape(
			
 
				+    NetworkIO::cls_score,
			
 
				+    {static_cast<int>(config_.batch_size_), static_cast<int>(config_.num_classes_),
			
 
				+     static_cast<int>(config_.num_proposals_)});
			
 
				+  tensorrt_common::LOG_INFO(logger_) << "Network num classes: " << cls_score.d[1] << std::endl;
			
 
				+  validateTensorShape(
			
 
				+    NetworkIO::dir_pred,
			
 
				+    {static_cast<int>(config_.batch_size_), 2, static_cast<int>(config_.num_proposals_)});  // x, y
			
 
				+  validateTensorShape(
			
 
				+    NetworkIO::bbox_pred,
			
 
				+    {static_cast<int>(config_.batch_size_), static_cast<int>(config_.num_box_values_),
			
 
				+     static_cast<int>(config_.num_proposals_)});
			
 
				+
			
 
				+  return true;
			
 
				+}
			
 
				+
			
 
				+const char * NetworkTRT::getTensorName(NetworkIO name)
			
 
				+{
			
 
				+  return tensors_names_.at(name);
			
 
				+}
			
 
				+
			
 
				+nvinfer1::Dims NetworkTRT::validateTensorShape(NetworkIO name, const std::vector<int> shape)
			
 
				+{
			
 
				+  auto tensor_shape = engine->getTensorShape(tensors_names_[name]);
			
 
				+  if (tensor_shape.nbDims != static_cast<int>(shape.size())) {
			
 
				+    tensorrt_common::LOG_ERROR(logger_)
			
 
				+      << "Invalid tensor shape for " << tensors_names_[name] << ". Expected size: " << shape.size()
			
 
				+      << ". Actual size: " << tensor_shape.nbDims << "." << std::endl;
			
 
				+    throw std::runtime_error("Failed to initialize TRT network.");
			
 
				+  }
			
 
				+  for (int i = 0; i < tensor_shape.nbDims; ++i) {
			
 
				+    if (tensor_shape.d[i] != static_cast<int>(shape[i])) {
			
 
				+      tensorrt_common::LOG_ERROR(logger_)
			
 
				+        << "Invalid tensor shape for " << tensors_names_[name] << ". Expected: " << shape[i]
			
 
				+        << ". Actual: " << tensor_shape.d[i] << "." << std::endl;
			
 
				+      throw std::runtime_error("Failed to initialize TRT network.");
			
 
				+    }
			
 
				+  }
			
 
				+  return tensor_shape;
			
 
				+}
			
 
				+
			
 
				+}  // namespace autoware::lidar_transfusion
			
--- a/src/detection/detection_lidar_transfusion/postprocess/circle_nms_kernel.cu
+++ b/src/detection/detection_lidar_transfusion/postprocess/circle_nms_kernel.cu
@@ -0,0 +1,144 @@
 
				+// Copyright 2024 TIER IV, Inc.
			
 
				+//
			
 
				+// Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+// you may not use this file except in compliance with the License.
			
 
				+// You may obtain a copy of the License at
			
 
				+//
			
 
				+//     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+//
			
 
				+// Unless required by applicable law or agreed to in writing, software
			
 
				+// distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+// See the License for the specific language governing permissions and
			
 
				+// limitations under the License.
			
 
				+
			
 
				+// Modified from
			
 
				+// https://github.com/open-mmlab/OpenPCDet/blob/master/pcdet/ops/iou3d_nms/src/iou3d_nms_kernel.cu
			
 
				+
			
 
				+/*
			
 
				+3D IoU Calculation and Rotated NMS(modified from 2D NMS written by others)
			
 
				+Written by Shaoshuai Shi
			
 
				+All Rights Reserved 2019-2020.
			
 
				+*/
			
 
				+
			
 
				+#include "cuda_utils.hpp"
			
 
				+#include "postprocess/circle_nms_kernel.hpp"
			
 
				+#include "utils.hpp"
			
 
				+
			
 
				+#include <thrust/host_vector.h>
			
 
				+
			
 
				+namespace
			
 
				+{
			
 
				+const std::size_t THREADS_PER_BLOCK_NMS = 16;
			
 
				+}  // namespace
			
 
				+
			
 
				+namespace autoware::lidar_transfusion
			
 
				+{
			
 
				+
			
 
				+__device__ inline float dist2dPow(const Box3D * a, const Box3D * b)
			
 
				+{
			
 
				+  return powf(a->x - b->x, 2) + powf(a->y - b->y, 2);
			
 
				+}
			
 
				+
			
 
				+// cspell: ignore divup
			
 
				+__global__ void circleNMS_Kernel(
			
 
				+  const Box3D * boxes, const std::size_t num_boxes3d, const std::size_t col_blocks,
			
 
				+  const float dist2d_pow_threshold, std::uint64_t * mask)
			
 
				+{
			
 
				+  // params: boxes (N,)
			
 
				+  // params: mask (N, divup(N/THREADS_PER_BLOCK_NMS))
			
 
				+
			
 
				+  const auto row_start = blockIdx.y;
			
 
				+  const auto col_start = blockIdx.x;
			
 
				+
			
 
				+  if (row_start > col_start) return;
			
 
				+
			
 
				+  const std::size_t row_size =
			
 
				+    fminf(num_boxes3d - row_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS);
			
 
				+  const std::size_t col_size =
			
 
				+    fminf(num_boxes3d - col_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS);
			
 
				+
			
 
				+  __shared__ Box3D block_boxes[THREADS_PER_BLOCK_NMS];
			
 
				+
			
 
				+  if (threadIdx.x < col_size) {
			
 
				+    block_boxes[threadIdx.x] = boxes[THREADS_PER_BLOCK_NMS * col_start + threadIdx.x];
			
 
				+  }
			
 
				+  __syncthreads();
			
 
				+
			
 
				+  if (threadIdx.x < row_size) {
			
 
				+    const std::size_t cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;
			
 
				+    const Box3D * cur_box = boxes + cur_box_idx;
			
 
				+
			
 
				+    std::uint64_t t = 0;
			
 
				+    std::size_t start = 0;
			
 
				+    if (row_start == col_start) {
			
 
				+      start = threadIdx.x + 1;
			
 
				+    }
			
 
				+    for (std::size_t i = start; i < col_size; i++) {
			
 
				+      if (dist2dPow(cur_box, block_boxes + i) < dist2d_pow_threshold) {
			
 
				+        t |= 1ULL << i;
			
 
				+      }
			
 
				+    }
			
 
				+    mask[cur_box_idx * col_blocks + col_start] = t;
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+cudaError_t circleNMS_launch(
			
 
				+  const thrust::device_vector<Box3D> & boxes3d, const std::size_t num_boxes3d,
			
 
				+  std::size_t col_blocks, const float distance_threshold,
			
 
				+  thrust::device_vector<std::uint64_t> & mask, cudaStream_t stream)
			
 
				+{
			
 
				+  const float dist2d_pow_thres = powf(distance_threshold, 2);
			
 
				+
			
 
				+  dim3 blocks(col_blocks, col_blocks);
			
 
				+  dim3 threads(THREADS_PER_BLOCK_NMS);
			
 
				+  circleNMS_Kernel<<<blocks, threads, 0, stream>>>(
			
 
				+    thrust::raw_pointer_cast(boxes3d.data()), num_boxes3d, col_blocks, dist2d_pow_thres,
			
 
				+    thrust::raw_pointer_cast(mask.data()));
			
 
				+
			
 
				+  return cudaGetLastError();
			
 
				+}
			
 
				+
			
 
				+std::size_t circleNMS(
			
 
				+  thrust::device_vector<Box3D> & boxes3d, const float distance_threshold,
			
 
				+  thrust::device_vector<bool> & keep_mask, cudaStream_t stream)
			
 
				+{
			
 
				+  const auto num_boxes3d = boxes3d.size();
			
 
				+  const auto col_blocks = divup(num_boxes3d, THREADS_PER_BLOCK_NMS);
			
 
				+  thrust::device_vector<std::uint64_t> mask_d(num_boxes3d * col_blocks);
			
 
				+
			
 
				+  CHECK_CUDA_ERROR(
			
 
				+    circleNMS_launch(boxes3d, num_boxes3d, col_blocks, distance_threshold, mask_d, stream));
			
 
				+
			
 
				+  // memcpy device to host
			
 
				+  thrust::host_vector<std::uint64_t> mask_h(mask_d.size());
			
 
				+  thrust::copy(mask_d.begin(), mask_d.end(), mask_h.begin());
			
 
				+  CHECK_CUDA_ERROR(cudaStreamSynchronize(stream));
			
 
				+
			
 
				+  // generate keep_mask
			
 
				+  std::vector<std::uint64_t> remv_h(col_blocks);
			
 
				+  thrust::host_vector<bool> keep_mask_h(keep_mask.size());
			
 
				+  std::size_t num_to_keep = 0;
			
 
				+  for (std::size_t i = 0; i < num_boxes3d; i++) {
			
 
				+    auto nblock = i / THREADS_PER_BLOCK_NMS;
			
 
				+    auto inblock = i % THREADS_PER_BLOCK_NMS;
			
 
				+
			
 
				+    if (!(remv_h[nblock] & (1ULL << inblock))) {
			
 
				+      keep_mask_h[i] = true;
			
 
				+      num_to_keep++;
			
 
				+      std::uint64_t * p = &mask_h[0] + i * col_blocks;
			
 
				+      for (std::size_t j = nblock; j < col_blocks; j++) {
			
 
				+        remv_h[j] |= p[j];
			
 
				+      }
			
 
				+    } else {
			
 
				+      keep_mask_h[i] = false;
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  // memcpy host to device
			
 
				+  keep_mask = keep_mask_h;
			
 
				+
			
 
				+  return num_to_keep;
			
 
				+}
			
 
				+
			
 
				+}  // namespace autoware::lidar_transfusion
			
--- a/src/detection/detection_lidar_transfusion/postprocess/non_maximum_suppression.cpp
+++ b/src/detection/detection_lidar_transfusion/postprocess/non_maximum_suppression.cpp
@@ -0,0 +1,147 @@
 
				+// Copyright 2024 TIER IV, Inc.
			
 
				+//
			
 
				+// Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+// you may not use this file except in compliance with the License.
			
 
				+// You may obtain a copy of the License at
			
 
				+//
			
 
				+//     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+//
			
 
				+// Unless required by applicable law or agreed to in writing, software
			
 
				+// distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+// See the License for the specific language governing permissions and
			
 
				+// limitations under the License.
			
 
				+
			
 
				+#include "postprocess/non_maximum_suppression.hpp"
			
 
				+
			
 
				+//#include <autoware/universe_utils/geometry/geometry.hpp>
			
 
				+//#include <object_recognition_utils/geometry.hpp>
			
 
				+//#include <object_recognition_utils/object_recognition_utils.hpp>
			
 
				+
			
 
				+// constant declarations
			
 
				+namespace Label {
			
 
				+static constexpr uint8_t UNKNOWN =
			
 
				+  0u;
			
 
				+static constexpr uint8_t CAR =
			
 
				+  1u;
			
 
				+static constexpr uint8_t TRUCK =
			
 
				+  2u;
			
 
				+static constexpr uint8_t BUS =
			
 
				+  3u;
			
 
				+static constexpr uint8_t TRAILER =
			
 
				+  4u;
			
 
				+static constexpr uint8_t MOTORCYCLE =
			
 
				+  5u;
			
 
				+static constexpr uint8_t BICYCLE =
			
 
				+  6u;
			
 
				+static constexpr uint8_t PEDESTRIAN =
			
 
				+  7u;
			
 
				+}
			
 
				+
			
 
				+
			
 
				+uint8_t getSemanticType(const std::string & class_name)
			
 
				+{
			
 
				+  if (class_name == "CAR") {
			
 
				+    return Label::CAR;
			
 
				+  } else if (class_name == "TRUCK") {
			
 
				+    return Label::TRUCK;
			
 
				+  } else if (class_name == "BUS") {
			
 
				+    return Label::BUS;
			
 
				+  } else if (class_name == "TRAILER") {
			
 
				+    return Label::TRAILER;
			
 
				+  } else if (class_name == "MOTORCYCLE") {
			
 
				+    return Label::MOTORCYCLE;
			
 
				+  } else if (class_name == "BICYCLE") {
			
 
				+    return Label::BICYCLE;
			
 
				+  } else if (class_name == "PEDESTRIAN") {
			
 
				+    return Label::PEDESTRIAN;
			
 
				+  } else {  // CONSTRUCTION_VEHICLE, BARRIER, TRAFFIC_CONE
			
 
				+    return Label::UNKNOWN;
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+
			
 
				+namespace autoware::lidar_transfusion
			
 
				+{
			
 
				+
			
 
				+void NonMaximumSuppression::setParameters(const NMSParams & params)
			
 
				+{
			
 
				+  assert(params.search_distance_2d_ >= 0.0);
			
 
				+  assert(params.iou_threshold_ >= 0.0 && params.iou_threshold_ <= 1.0);
			
 
				+
			
 
				+  params_ = params;
			
 
				+  target_class_mask_ = classNamesToBooleanMask(params.target_class_names_);
			
 
				+}
			
 
				+
			
 
				+bool NonMaximumSuppression::isTargetLabel(const uint8_t label)
			
 
				+{
			
 
				+  if (label >= target_class_mask_.size()) {
			
 
				+    return false;
			
 
				+  }
			
 
				+  return target_class_mask_.at(label);
			
 
				+}
			
 
				+
			
 
				+//bool NonMaximumSuppression::isTargetPairObject(
			
 
				+//  const DetectedObject & object1, const DetectedObject & object2)
			
 
				+//{
			
 
				+//  const auto label1 = object_recognition_utils::getHighestProbLabel(object1.classification);
			
 
				+//  const auto label2 = object_recognition_utils::getHighestProbLabel(object2.classification);
			
 
				+
			
 
				+//  if (isTargetLabel(label1) && isTargetLabel(label2)) {
			
 
				+//    return true;
			
 
				+//  }
			
 
				+
			
 
				+//  const auto search_sqr_dist_2d = params_.search_distance_2d_ * params_.search_distance_2d_;
			
 
				+//  const auto sqr_dist_2d = autoware::universe_utils::calcSquaredDistance2d(
			
 
				+//    object_recognition_utils::getPose(object1), object_recognition_utils::getPose(object2));
			
 
				+//  return sqr_dist_2d <= search_sqr_dist_2d;
			
 
				+//}
			
 
				+
			
 
				+//Eigen::MatrixXd NonMaximumSuppression::generateIoUMatrix(
			
 
				+//  const std::vector<DetectedObject> & input_objects)
			
 
				+//{
			
 
				+//  // NOTE: row = target objects to be suppressed, col = source objects to be compared
			
 
				+//  Eigen::MatrixXd triangular_matrix =
			
 
				+//    Eigen::MatrixXd::Zero(input_objects.size(), input_objects.size());
			
 
				+//  for (std::size_t target_i = 0; target_i < input_objects.size(); ++target_i) {
			
 
				+//    for (std::size_t source_i = 0; source_i < target_i; ++source_i) {
			
 
				+//      const auto & target_obj = input_objects.at(target_i);
			
 
				+//      const auto & source_obj = input_objects.at(source_i);
			
 
				+//      if (!isTargetPairObject(target_obj, source_obj)) {
			
 
				+//        continue;
			
 
				+//      }
			
 
				+
			
 
				+//      if (params_.nms_type_ == NMS_TYPE::IoU_BEV) {
			
 
				+//        const double iou = object_recognition_utils::get2dIoU(target_obj, source_obj);
			
 
				+//        triangular_matrix(target_i, source_i) = iou;
			
 
				+//        // NOTE: If the target object has any objects with iou > iou_threshold, it
			
 
				+//        // will be suppressed regardless of later results.
			
 
				+//        if (iou > params_.iou_threshold_) {
			
 
				+//          break;
			
 
				+//        }
			
 
				+//      }
			
 
				+//    }
			
 
				+//  }
			
 
				+
			
 
				+//  return triangular_matrix;
			
 
				+//}
			
 
				+
			
 
				+//std::vector<DetectedObject> NonMaximumSuppression::apply(
			
 
				+//  const std::vector<DetectedObject> & input_objects)
			
 
				+//{
			
 
				+//  Eigen::MatrixXd iou_matrix = generateIoUMatrix(input_objects);
			
 
				+
			
 
				+//  std::vector<DetectedObject> output_objects;
			
 
				+//  output_objects.reserve(input_objects.size());
			
 
				+//  for (std::size_t i = 0; i < input_objects.size(); ++i) {
			
 
				+//    const auto value = iou_matrix.row(i).maxCoeff();
			
 
				+//    if (params_.nms_type_ == NMS_TYPE::IoU_BEV) {
			
 
				+//      if (value <= params_.iou_threshold_) {
			
 
				+//        output_objects.emplace_back(input_objects.at(i));
			
 
				+//      }
			
 
				+//    }
			
 
				+//  }
			
 
				+
			
 
				+//  return output_objects;
			
 
				+//}
			
 
				+}  // namespace autoware::lidar_transfusion
			
--- a/src/detection/detection_lidar_transfusion/postprocess/postprocess_kernel.cu
+++ b/src/detection/detection_lidar_transfusion/postprocess/postprocess_kernel.cu
@@ -0,0 +1,145 @@
 
				+// Copyright 2024 TIER IV, Inc.
			
 
				+//
			
 
				+// Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+// you may not use this file except in compliance with the License.
			
 
				+// You may obtain a copy of the License at
			
 
				+//
			
 
				+//     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+//
			
 
				+// Unless required by applicable law or agreed to in writing, software
			
 
				+// distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+// See the License for the specific language governing permissions and
			
 
				+// limitations under the License.
			
 
				+
			
 
				+#include "postprocess/circle_nms_kernel.hpp"
			
 
				+#include "postprocess/postprocess_kernel.hpp"
			
 
				+
			
 
				+#include <thrust/count.h>
			
 
				+#include <thrust/device_vector.h>
			
 
				+#include <thrust/sort.h>
			
 
				+
			
 
				+namespace autoware::lidar_transfusion
			
 
				+{
			
 
				+const size_t THREADS_PER_BLOCK = 256;
			
 
				+
			
 
				+struct is_score_greater
			
 
				+{
			
 
				+  is_score_greater(float t) : t_(t) {}
			
 
				+
			
 
				+  __device__ bool operator()(const Box3D & b) { return b.score > t_; }
			
 
				+
			
 
				+private:
			
 
				+  float t_{0.0};
			
 
				+};
			
 
				+
			
 
				+struct is_kept
			
 
				+{
			
 
				+  __device__ bool operator()(const bool keep) { return keep; }
			
 
				+};
			
 
				+
			
 
				+struct score_greater
			
 
				+{
			
 
				+  __device__ bool operator()(const Box3D & lb, const Box3D & rb) { return lb.score > rb.score; }
			
 
				+};
			
 
				+
			
 
				+__device__ inline float sigmoid(float x)
			
 
				+{
			
 
				+  return 1.0f / (1.0f + expf(-x));
			
 
				+}
			
 
				+
			
 
				+__global__ void generateBoxes3D_kernel(
			
 
				+  const float * __restrict__ cls_output, const float * __restrict__ box_output,
			
 
				+  const float * __restrict__ dir_cls_output, const float voxel_size_x, const float voxel_size_y,
			
 
				+  const float min_x_range, const float min_y_range, const int num_proposals, const int num_classes,
			
 
				+  const int num_point_values, const float * __restrict__ yaw_norm_thresholds,
			
 
				+  Box3D * __restrict__ det_boxes3d)
			
 
				+{
			
 
				+  int point_idx = blockIdx.x * blockDim.x + threadIdx.x;
			
 
				+  if (point_idx >= num_proposals) {
			
 
				+    return;
			
 
				+  }
			
 
				+
			
 
				+  int class_id = 0;
			
 
				+  float max_score = cls_output[point_idx];
			
 
				+
			
 
				+#pragma unroll
			
 
				+  for (int i = 1; i < num_classes; i++) {
			
 
				+    float score = cls_output[i * num_proposals + point_idx];
			
 
				+    if (score > max_score) {
			
 
				+      max_score = score;
			
 
				+      class_id = i;
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  // yaw validation
			
 
				+  const float yaw_sin = dir_cls_output[point_idx];
			
 
				+  const float yaw_cos = dir_cls_output[point_idx + num_proposals];
			
 
				+  const float yaw_norm = sqrtf(yaw_sin * yaw_sin + yaw_cos * yaw_cos);
			
 
				+
			
 
				+  det_boxes3d[point_idx].label = class_id;
			
 
				+  det_boxes3d[point_idx].score = yaw_norm >= yaw_norm_thresholds[class_id] ? max_score : 0.f;
			
 
				+  det_boxes3d[point_idx].x = box_output[point_idx] * num_point_values * voxel_size_x + min_x_range;
			
 
				+  det_boxes3d[point_idx].y =
			
 
				+    box_output[point_idx + num_proposals] * num_point_values * voxel_size_y + min_y_range;
			
 
				+  det_boxes3d[point_idx].z = box_output[point_idx + 2 * num_proposals];
			
 
				+  det_boxes3d[point_idx].width = expf(box_output[point_idx + 3 * num_proposals]);
			
 
				+  det_boxes3d[point_idx].length = expf(box_output[point_idx + 4 * num_proposals]);
			
 
				+  det_boxes3d[point_idx].height = expf(box_output[point_idx + 5 * num_proposals]);
			
 
				+  det_boxes3d[point_idx].yaw =
			
 
				+    atan2f(dir_cls_output[point_idx], dir_cls_output[point_idx + num_proposals]);
			
 
				+}
			
 
				+
			
 
				+PostprocessCuda::PostprocessCuda(const TransfusionConfig & config, cudaStream_t & stream)
			
 
				+: config_(config), stream_(stream)
			
 
				+{
			
 
				+}
			
 
				+
			
 
				+// cspell: ignore divup
			
 
				+cudaError_t PostprocessCuda::generateDetectedBoxes3D_launch(
			
 
				+  const float * cls_output, const float * box_output, const float * dir_cls_output,
			
 
				+  std::vector<Box3D> & det_boxes3d, cudaStream_t stream)
			
 
				+{
			
 
				+  dim3 threads = {THREADS_PER_BLOCK};
			
 
				+  dim3 blocks = {divup(config_.num_proposals_, threads.x)};
			
 
				+
			
 
				+  auto boxes3d_d = thrust::device_vector<Box3D>(config_.num_proposals_);
			
 
				+  auto yaw_norm_thresholds_d = thrust::device_vector<float>(
			
 
				+    config_.yaw_norm_thresholds_.begin(), config_.yaw_norm_thresholds_.end());
			
 
				+
			
 
				+  generateBoxes3D_kernel<<<blocks, threads, 0, stream>>>(
			
 
				+    cls_output, box_output, dir_cls_output, config_.voxel_x_size_, config_.voxel_y_size_,
			
 
				+    config_.min_x_range_, config_.min_y_range_, config_.num_proposals_, config_.num_classes_,
			
 
				+    config_.num_point_values_, thrust::raw_pointer_cast(yaw_norm_thresholds_d.data()),
			
 
				+    thrust::raw_pointer_cast(boxes3d_d.data()));
			
 
				+
			
 
				+  // suppress by score
			
 
				+  const auto num_det_boxes3d = thrust::count_if(
			
 
				+    thrust::device, boxes3d_d.begin(), boxes3d_d.end(), is_score_greater(config_.score_threshold_));
			
 
				+  if (num_det_boxes3d == 0) {
			
 
				+    return cudaGetLastError();
			
 
				+  }
			
 
				+  thrust::device_vector<Box3D> det_boxes3d_d(num_det_boxes3d);
			
 
				+  thrust::copy_if(
			
 
				+    thrust::device, boxes3d_d.begin(), boxes3d_d.end(), det_boxes3d_d.begin(),
			
 
				+    is_score_greater(config_.score_threshold_));
			
 
				+
			
 
				+  // sort by score
			
 
				+  thrust::sort(det_boxes3d_d.begin(), det_boxes3d_d.end(), score_greater());
			
 
				+
			
 
				+  // supress by NMS
			
 
				+  thrust::device_vector<bool> final_keep_mask_d(num_det_boxes3d);
			
 
				+  const auto num_final_det_boxes3d =
			
 
				+    circleNMS(det_boxes3d_d, config_.circle_nms_dist_threshold_, final_keep_mask_d, stream);
			
 
				+  thrust::device_vector<Box3D> final_det_boxes3d_d(num_final_det_boxes3d);
			
 
				+  thrust::copy_if(
			
 
				+    thrust::device, det_boxes3d_d.begin(), det_boxes3d_d.end(), final_keep_mask_d.begin(),
			
 
				+    final_det_boxes3d_d.begin(), is_kept());
			
 
				+
			
 
				+  // memcpy device to host
			
 
				+  det_boxes3d.resize(num_final_det_boxes3d);
			
 
				+  thrust::copy(final_det_boxes3d_d.begin(), final_det_boxes3d_d.end(), det_boxes3d.begin());
			
 
				+  return cudaGetLastError();
			
 
				+}
			
 
				+
			
 
				+}  // namespace autoware::lidar_transfusion
			
--- a/src/detection/detection_lidar_transfusion/preprocess/pointcloud_densification.cpp
+++ b/src/detection/detection_lidar_transfusion/preprocess/pointcloud_densification.cpp
@@ -0,0 +1,116 @@
 
				+// Copyright 2024 TIER IV, Inc.
			
 
				+//
			
 
				+// Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+// you may not use this file except in compliance with the License.
			
 
				+// You may obtain a copy of the License at
			
 
				+//
			
 
				+//     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+//
			
 
				+// Unless required by applicable law or agreed to in writing, software
			
 
				+// distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+// See the License for the specific language governing permissions and
			
 
				+// limitations under the License.
			
 
				+
			
 
				+#include "autoware/lidar_transfusion/preprocess/pointcloud_densification.hpp"
			
 
				+
			
 
				+#include <pcl_ros/transforms.hpp>
			
 
				+
			
 
				+#include <pcl_conversions/pcl_conversions.h>
			
 
				+#ifdef ROS_DISTRO_GALACTIC
			
 
				+#include <tf2_eigen/tf2_eigen.h>
			
 
				+#else
			
 
				+#include <tf2_eigen/tf2_eigen.hpp>
			
 
				+#endif
			
 
				+
			
 
				+#include <boost/optional.hpp>
			
 
				+
			
 
				+#include <string>
			
 
				+#include <utility>
			
 
				+
			
 
				+namespace
			
 
				+{
			
 
				+
			
 
				+boost::optional<geometry_msgs::msg::Transform> getTransform(
			
 
				+  const tf2_ros::Buffer & tf_buffer, const std::string & target_frame_id,
			
 
				+  const std::string & source_frame_id, const rclcpp::Time & time)
			
 
				+{
			
 
				+  try {
			
 
				+    geometry_msgs::msg::TransformStamped transform_stamped;
			
 
				+    transform_stamped = tf_buffer.lookupTransform(
			
 
				+      target_frame_id, source_frame_id, time, rclcpp::Duration::from_seconds(0.5));
			
 
				+    return transform_stamped.transform;
			
 
				+  } catch (tf2::TransformException & ex) {
			
 
				+    RCLCPP_WARN_STREAM(rclcpp::get_logger("lidar_transfusion"), ex.what());
			
 
				+    return boost::none;
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+Eigen::Affine3f transformToEigen(const geometry_msgs::msg::Transform & t)
			
 
				+{
			
 
				+  Eigen::Affine3f a;
			
 
				+  a.matrix() = tf2::transformToEigen(t).matrix().cast<float>();
			
 
				+  return a;
			
 
				+}
			
 
				+
			
 
				+}  // namespace
			
 
				+
			
 
				+namespace autoware::lidar_transfusion
			
 
				+{
			
 
				+
			
 
				+PointCloudDensification::PointCloudDensification(
			
 
				+  const DensificationParam & param, cudaStream_t & stream)
			
 
				+: param_(param), stream_(stream)
			
 
				+{
			
 
				+}
			
 
				+
			
 
				+bool PointCloudDensification::enqueuePointCloud(
			
 
				+  const sensor_msgs::msg::PointCloud2 & pointcloud_msg, const tf2_ros::Buffer & tf_buffer)
			
 
				+{
			
 
				+  const auto header = pointcloud_msg.header;
			
 
				+
			
 
				+  if (param_.pointcloud_cache_size() > 1) {
			
 
				+    auto transform_world2current =
			
 
				+      getTransform(tf_buffer, header.frame_id, param_.world_frame_id(), header.stamp);
			
 
				+    if (!transform_world2current) {
			
 
				+      return false;
			
 
				+    }
			
 
				+    auto affine_world2current = transformToEigen(transform_world2current.get());
			
 
				+
			
 
				+    enqueue(pointcloud_msg, affine_world2current);
			
 
				+  } else {
			
 
				+    enqueue(pointcloud_msg, Eigen::Affine3f::Identity());
			
 
				+  }
			
 
				+
			
 
				+  dequeue();
			
 
				+
			
 
				+  return true;
			
 
				+}
			
 
				+
			
 
				+void PointCloudDensification::enqueue(
			
 
				+  const sensor_msgs::msg::PointCloud2 & msg, const Eigen::Affine3f & affine_world2current)
			
 
				+{
			
 
				+  affine_world2current_ = affine_world2current;
			
 
				+  current_timestamp_ = rclcpp::Time(msg.header.stamp).seconds();
			
 
				+
			
 
				+  auto data_d = cuda::make_unique<uint8_t[]>(
			
 
				+    sizeof(uint8_t) * msg.width * msg.height * msg.point_step / sizeof(uint8_t));
			
 
				+
			
 
				+  CHECK_CUDA_ERROR(cudaMemcpyAsync(
			
 
				+    data_d.get(), msg.data.data(), sizeof(uint8_t) * msg.width * msg.height * msg.point_step,
			
 
				+    cudaMemcpyHostToDevice, stream_));
			
 
				+
			
 
				+  PointCloudWithTransform pointcloud = {
			
 
				+    std::move(data_d), msg.header, msg.width * msg.height, affine_world2current.inverse()};
			
 
				+
			
 
				+  pointcloud_cache_.push_front(std::move(pointcloud));
			
 
				+}
			
 
				+
			
 
				+void PointCloudDensification::dequeue()
			
 
				+{
			
 
				+  if (pointcloud_cache_.size() > param_.pointcloud_cache_size()) {
			
 
				+    pointcloud_cache_.pop_back();
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+}  // namespace autoware::lidar_transfusion
			
--- a/src/detection/detection_lidar_transfusion/preprocess/preprocess_kernel.cu
+++ b/src/detection/detection_lidar_transfusion/preprocess/preprocess_kernel.cu
@@ -0,0 +1,221 @@
 
				+// Copyright 2024 TIER IV, Inc.
			
 
				+//
			
 
				+// Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+// you may not use this file except in compliance with the License.
			
 
				+// You may obtain a copy of the License at
			
 
				+//
			
 
				+//     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+//
			
 
				+// Unless required by applicable law or agreed to in writing, software
			
 
				+// distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+// See the License for the specific language governing permissions and
			
 
				+// limitations under the License.
			
 
				+/*
			
 
				+ * SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES.
			
 
				+ * All rights reserved. SPDX-License-Identifier: Apache-2.0
			
 
				+ *
			
 
				+ * Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+ * you may not use this file except in compliance with the License.
			
 
				+ * You may obtain a copy of the License at
			
 
				+ *
			
 
				+ * http://www.apache.org/licenses/LICENSE-2.0
			
 
				+ *
			
 
				+ * Unless required by applicable law or agreed to in writing, software
			
 
				+ * distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+ * See the License for the specific language governing permissions and
			
 
				+ * limitations under the License.
			
 
				+ */
			
 
				+
			
 
				+#include "autoware/lidar_transfusion/cuda_utils.hpp"
			
 
				+#include "autoware/lidar_transfusion/preprocess/preprocess_kernel.hpp"
			
 
				+
			
 
				+#include <cstdint>
			
 
				+
			
 
				+namespace autoware::lidar_transfusion
			
 
				+{
			
 
				+
			
 
				+PreprocessCuda::PreprocessCuda(const TransfusionConfig & config, cudaStream_t & stream)
			
 
				+: stream_(stream), config_(config)
			
 
				+{
			
 
				+  mask_size_ = config_.grid_z_size_ * config_.grid_y_size_ * config_.grid_x_size_;
			
 
				+  voxels_size_ = config_.grid_z_size_ * config_.grid_y_size_ * config_.grid_x_size_ *
			
 
				+                   config_.max_num_points_per_pillar_ * config_.num_point_feature_size_ +
			
 
				+                 1;
			
 
				+  mask_ = cuda::make_unique<unsigned int[]>(mask_size_);
			
 
				+  voxels_ = cuda::make_unique<float[]>(voxels_size_);
			
 
				+}
			
 
				+
			
 
				+void PreprocessCuda::generateVoxels(
			
 
				+  float * points, unsigned int points_size, unsigned int * pillar_num, float * voxel_features,
			
 
				+  unsigned int * voxel_num, unsigned int * voxel_idxs)
			
 
				+{
			
 
				+  cuda::clear_async(mask_.get(), mask_size_, stream_);
			
 
				+  cuda::clear_async(voxels_.get(), voxels_size_, stream_);
			
 
				+  CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_));
			
 
				+
			
 
				+  CHECK_CUDA_ERROR(generateVoxels_random_launch(points, points_size, mask_.get(), voxels_.get()));
			
 
				+  CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_));
			
 
				+
			
 
				+  CHECK_CUDA_ERROR(generateBaseFeatures_launch(
			
 
				+    mask_.get(), voxels_.get(), pillar_num, voxel_features, voxel_num, voxel_idxs));
			
 
				+  CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_));
			
 
				+}
			
 
				+
			
 
				+__global__ void generateVoxels_random_kernel(
			
 
				+  float * points, unsigned int points_size, float min_x_range, float max_x_range, float min_y_range,
			
 
				+  float max_y_range, float min_z_range, float max_z_range, float pillar_x_size, float pillar_y_size,
			
 
				+  float pillar_z_size, int grid_y_size, int grid_x_size, int points_per_voxel, unsigned int * mask,
			
 
				+  float * voxels)
			
 
				+{
			
 
				+  int point_idx = blockIdx.x * blockDim.x + threadIdx.x;
			
 
				+  if (point_idx >= points_size) return;
			
 
				+
			
 
				+  float x = points[point_idx * 5];
			
 
				+  float y = points[point_idx * 5 + 1];
			
 
				+  float z = points[point_idx * 5 + 2];
			
 
				+  float i = points[point_idx * 5 + 3];
			
 
				+  float t = points[point_idx * 5 + 4];
			
 
				+
			
 
				+  if (
			
 
				+    x <= min_x_range || x >= max_x_range || y <= min_y_range || y >= max_y_range ||
			
 
				+    z <= min_z_range || z >= max_z_range)
			
 
				+    return;
			
 
				+
			
 
				+  int voxel_idx = floorf((x - min_x_range) / pillar_x_size);
			
 
				+  int voxel_idy = floorf((y - min_y_range) / pillar_y_size);
			
 
				+  unsigned int voxel_index = voxel_idy * grid_x_size + voxel_idx;
			
 
				+
			
 
				+  unsigned int point_id = atomicAdd(&(mask[voxel_index]), 1);
			
 
				+
			
 
				+  if (point_id >= points_per_voxel) return;
			
 
				+  float * address = voxels + (voxel_index * points_per_voxel + point_id) * 5;
			
 
				+  atomicExch(address + 0, x);
			
 
				+  atomicExch(address + 1, y);
			
 
				+  atomicExch(address + 2, z);
			
 
				+  atomicExch(address + 3, i);
			
 
				+  atomicExch(address + 4, t);
			
 
				+}
			
 
				+
			
 
				+cudaError_t PreprocessCuda::generateVoxels_random_launch(
			
 
				+  float * points, unsigned int points_size, unsigned int * mask, float * voxels)
			
 
				+{
			
 
				+  if (points_size == 0) {
			
 
				+    return cudaGetLastError();
			
 
				+  }
			
 
				+  dim3 blocks(divup(points_size, config_.threads_for_voxel_));
			
 
				+  dim3 threads(config_.threads_for_voxel_);
			
 
				+
			
 
				+  generateVoxels_random_kernel<<<blocks, threads, 0, stream_>>>(
			
 
				+    points, points_size, config_.min_x_range_, config_.max_x_range_, config_.min_y_range_,
			
 
				+    config_.max_y_range_, config_.min_z_range_, config_.max_z_range_, config_.voxel_x_size_,
			
 
				+    config_.voxel_y_size_, config_.voxel_z_size_, config_.grid_y_size_, config_.grid_x_size_,
			
 
				+    config_.points_per_voxel_, mask, voxels);
			
 
				+  cudaError_t err = cudaGetLastError();
			
 
				+  return err;
			
 
				+}
			
 
				+
			
 
				+__global__ void generateBaseFeatures_kernel(
			
 
				+  unsigned int * mask, float * voxels, int grid_y_size, int grid_x_size, float points_per_voxel,
			
 
				+  float max_voxels, unsigned int * pillar_num, float * voxel_features, unsigned int * voxel_num,
			
 
				+  unsigned int * voxel_idxs)
			
 
				+{
			
 
				+  unsigned int voxel_idx = blockIdx.x * blockDim.x + threadIdx.x;
			
 
				+  unsigned int voxel_idy = blockIdx.y * blockDim.y + threadIdx.y;
			
 
				+
			
 
				+  if (voxel_idx >= grid_x_size || voxel_idy >= grid_y_size) return;
			
 
				+
			
 
				+  unsigned int voxel_index = voxel_idy * grid_x_size + voxel_idx;
			
 
				+  unsigned int count = mask[voxel_index];
			
 
				+  if (!(count > 0)) return;
			
 
				+  count = count < points_per_voxel ? count : points_per_voxel;
			
 
				+
			
 
				+  unsigned int current_pillarId = 0;
			
 
				+  current_pillarId = atomicAdd(pillar_num, 1);
			
 
				+  if (current_pillarId >= max_voxels) return;
			
 
				+
			
 
				+  voxel_num[current_pillarId] = count;
			
 
				+
			
 
				+  uint4 idx = {0, 0, voxel_idy, voxel_idx};
			
 
				+  ((uint4 *)voxel_idxs)[current_pillarId] = idx;
			
 
				+
			
 
				+  for (int i = 0; i < count; i++) {
			
 
				+    int inIndex = voxel_index * points_per_voxel + i;
			
 
				+    int outIndex = current_pillarId * points_per_voxel + i;
			
 
				+    voxel_features[outIndex * 5] = voxels[inIndex * 5];
			
 
				+    voxel_features[outIndex * 5 + 1] = voxels[inIndex * 5 + 1];
			
 
				+    voxel_features[outIndex * 5 + 2] = voxels[inIndex * 5 + 2];
			
 
				+    voxel_features[outIndex * 5 + 3] = voxels[inIndex * 5 + 3];
			
 
				+    voxel_features[outIndex * 5 + 4] = voxels[inIndex * 5 + 4];
			
 
				+  }
			
 
				+
			
 
				+  // clear buffer for next infer
			
 
				+  atomicExch(mask + voxel_index, 0);
			
 
				+}
			
 
				+
			
 
				+// create 4 channels
			
 
				+cudaError_t PreprocessCuda::generateBaseFeatures_launch(
			
 
				+  unsigned int * mask, float * voxels, unsigned int * pillar_num, float * voxel_features,
			
 
				+  unsigned int * voxel_num, unsigned int * voxel_idxs)
			
 
				+{
			
 
				+  dim3 threads = {32, 32};
			
 
				+  dim3 blocks = {divup(config_.grid_x_size_, threads.x), divup(config_.grid_y_size_, threads.y)};
			
 
				+
			
 
				+  generateBaseFeatures_kernel<<<blocks, threads, 0, stream_>>>(
			
 
				+    mask, voxels, config_.grid_y_size_, config_.grid_x_size_, config_.points_per_voxel_,
			
 
				+    config_.max_voxels_, pillar_num, voxel_features, voxel_num, voxel_idxs);
			
 
				+  cudaError_t err = cudaGetLastError();
			
 
				+  return err;
			
 
				+}
			
 
				+
			
 
				+__global__ void generateSweepPoints_kernel(
			
 
				+  const uint8_t * input_data, size_t points_size, int input_point_step, float time_lag,
			
 
				+  const float * transform_array, int num_features, float * output_points)
			
 
				+{
			
 
				+  int point_idx = blockIdx.x * blockDim.x + threadIdx.x;
			
 
				+  if (point_idx >= points_size) return;
			
 
				+
			
 
				+  union {
			
 
				+    uint32_t raw{0};
			
 
				+    float value;
			
 
				+  } input_x, input_y, input_z;
			
 
				+
			
 
				+#pragma unroll
			
 
				+  for (int i = 0; i < 4; i++) {  // 4 bytes for float32
			
 
				+    input_x.raw |= input_data[point_idx * input_point_step + i] << i * 8;
			
 
				+    input_y.raw |= input_data[point_idx * input_point_step + i + 4] << i * 8;
			
 
				+    input_z.raw |= input_data[point_idx * input_point_step + i + 8] << i * 8;
			
 
				+  }
			
 
				+
			
 
				+  float input_intensity = static_cast<float>(input_data[point_idx * input_point_step + 12]);
			
 
				+
			
 
				+  output_points[point_idx * num_features] =
			
 
				+    transform_array[0] * input_x.value + transform_array[4] * input_y.value +
			
 
				+    transform_array[8] * input_z.value + transform_array[12];
			
 
				+  output_points[point_idx * num_features + 1] =
			
 
				+    transform_array[1] * input_x.value + transform_array[5] * input_y.value +
			
 
				+    transform_array[9] * input_z.value + transform_array[13];
			
 
				+  output_points[point_idx * num_features + 2] =
			
 
				+    transform_array[2] * input_x.value + transform_array[6] * input_y.value +
			
 
				+    transform_array[10] * input_z.value + transform_array[14];
			
 
				+  output_points[point_idx * num_features + 3] = input_intensity;
			
 
				+  output_points[point_idx * num_features + 4] = time_lag;
			
 
				+}
			
 
				+
			
 
				+cudaError_t PreprocessCuda::generateSweepPoints_launch(
			
 
				+  const uint8_t * input_data, size_t points_size, int input_point_step, float time_lag,
			
 
				+  const float * transform_array, float * output_points)
			
 
				+{
			
 
				+  dim3 blocks(divup(points_size, config_.threads_for_voxel_));
			
 
				+  dim3 threads(config_.threads_for_voxel_);
			
 
				+
			
 
				+  generateSweepPoints_kernel<<<blocks, threads, 0, stream_>>>(
			
 
				+    input_data, points_size, input_point_step, time_lag, transform_array,
			
 
				+    config_.num_point_feature_size_, output_points);
			
 
				+
			
 
				+  cudaError_t err = cudaGetLastError();
			
 
				+  return err;
			
 
				+}
			
 
				+
			
 
				+}  // namespace autoware::lidar_transfusion
			
--- a/src/detection/detection_lidar_transfusion/preprocess/voxel_generator.cpp
+++ b/src/detection/detection_lidar_transfusion/preprocess/voxel_generator.cpp
@@ -0,0 +1,123 @@
 
				+// Copyright 2024 TIER IV, Inc.
			
 
				+//
			
 
				+// Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+// you may not use this file except in compliance with the License.
			
 
				+// You may obtain a copy of the License at
			
 
				+//
			
 
				+//     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+//
			
 
				+// Unless required by applicable law or agreed to in writing, software
			
 
				+// distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+// See the License for the specific language governing permissions and
			
 
				+// limitations under the License.
			
 
				+
			
 
				+#include "autoware/lidar_transfusion/preprocess/voxel_generator.hpp"
			
 
				+
			
 
				+#include "autoware/lidar_transfusion/preprocess/preprocess_kernel.hpp"
			
 
				+
			
 
				+#include <sensor_msgs/point_cloud2_iterator.hpp>
			
 
				+
			
 
				+#include <type_traits>
			
 
				+
			
 
				+namespace autoware::lidar_transfusion
			
 
				+{
			
 
				+
			
 
				+VoxelGenerator::VoxelGenerator(
			
 
				+  const DensificationParam & densification_param, const TransfusionConfig & config,
			
 
				+  cudaStream_t & stream)
			
 
				+: config_(config), stream_(stream)
			
 
				+{
			
 
				+  pd_ptr_ = std::make_unique<PointCloudDensification>(densification_param, stream_);
			
 
				+  pre_ptr_ = std::make_unique<PreprocessCuda>(config_, stream_);
			
 
				+  cloud_data_d_ = cuda::make_unique<unsigned char[]>(config_.cloud_capacity_ * MAX_CLOUD_STEP_SIZE);
			
 
				+  affine_past2current_d_ = cuda::make_unique<float[]>(AFF_MAT_SIZE);
			
 
				+}
			
 
				+
			
 
				+bool VoxelGenerator::enqueuePointCloud(
			
 
				+  const sensor_msgs::msg::PointCloud2 & msg, const tf2_ros::Buffer & tf_buffer)
			
 
				+{
			
 
				+  return pd_ptr_->enqueuePointCloud(msg, tf_buffer);
			
 
				+}
			
 
				+
			
 
				+std::size_t VoxelGenerator::generateSweepPoints(
			
 
				+  const sensor_msgs::msg::PointCloud2 & msg, cuda::unique_ptr<float[]> & points_d)
			
 
				+{
			
 
				+  if (!is_initialized_) {
			
 
				+    initCloudInfo(msg);
			
 
				+    std::stringstream ss;
			
 
				+    ss << "Input point cloud information: " << std::endl << cloud_info_;
			
 
				+    RCLCPP_DEBUG_STREAM(rclcpp::get_logger("lidar_transfusion"), ss.str());
			
 
				+
			
 
				+    CloudInfo default_cloud_info;
			
 
				+    if (cloud_info_ != default_cloud_info) {
			
 
				+      ss << "Expected point cloud information: " << std::endl << default_cloud_info;
			
 
				+      RCLCPP_ERROR_STREAM(rclcpp::get_logger("lidar_transfusion"), ss.str());
			
 
				+      throw std::runtime_error("Input point cloud has unsupported format");
			
 
				+    }
			
 
				+    is_initialized_ = true;
			
 
				+  }
			
 
				+
			
 
				+  size_t point_counter{0};
			
 
				+  CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_));
			
 
				+
			
 
				+  for (auto pc_cache_iter = pd_ptr_->getPointCloudCacheIter(); !pd_ptr_->isCacheEnd(pc_cache_iter);
			
 
				+       pc_cache_iter++) {
			
 
				+    auto sweep_num_points = pc_cache_iter->num_points;
			
 
				+    if (point_counter + sweep_num_points >= config_.cloud_capacity_) {
			
 
				+      RCLCPP_WARN_STREAM(
			
 
				+        rclcpp::get_logger("lidar_transfusion"), "Exceeding cloud capacity. Used "
			
 
				+                                                   << pd_ptr_->getIdx(pc_cache_iter) << " out of "
			
 
				+                                                   << pd_ptr_->getCacheSize() << " sweep(s)");
			
 
				+      break;
			
 
				+    }
			
 
				+    auto shift = point_counter * config_.num_point_feature_size_;
			
 
				+
			
 
				+    auto affine_past2current =
			
 
				+      pd_ptr_->getAffineWorldToCurrent() * pc_cache_iter->affine_past2world;
			
 
				+    static_assert(std::is_same<decltype(affine_past2current.matrix()), Eigen::Matrix4f &>::value);
			
 
				+    static_assert(!Eigen::Matrix4f::IsRowMajor, "matrices should be col-major.");
			
 
				+
			
 
				+    float time_lag = static_cast<float>(
			
 
				+      pd_ptr_->getCurrentTimestamp() - rclcpp::Time(pc_cache_iter->header.stamp).seconds());
			
 
				+
			
 
				+    CHECK_CUDA_ERROR(cudaMemcpyAsync(
			
 
				+      affine_past2current_d_.get(), affine_past2current.data(), AFF_MAT_SIZE * sizeof(float),
			
 
				+      cudaMemcpyHostToDevice, stream_));
			
 
				+    CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_));
			
 
				+
			
 
				+    pre_ptr_->generateSweepPoints_launch(
			
 
				+      pc_cache_iter->data_d.get(), sweep_num_points, cloud_info_.point_step, time_lag,
			
 
				+      affine_past2current_d_.get(), points_d.get() + shift);
			
 
				+    point_counter += sweep_num_points;
			
 
				+  }
			
 
				+
			
 
				+  return point_counter;
			
 
				+}
			
 
				+
			
 
				+void VoxelGenerator::initCloudInfo(const sensor_msgs::msg::PointCloud2 & msg)
			
 
				+{
			
 
				+  std::tie(cloud_info_.x_offset, cloud_info_.x_datatype, cloud_info_.x_count) =
			
 
				+    getFieldInfo(msg, "x");
			
 
				+  std::tie(cloud_info_.y_offset, cloud_info_.y_datatype, cloud_info_.y_count) =
			
 
				+    getFieldInfo(msg, "y");
			
 
				+  std::tie(cloud_info_.z_offset, cloud_info_.z_datatype, cloud_info_.z_count) =
			
 
				+    getFieldInfo(msg, "z");
			
 
				+  std::tie(
			
 
				+    cloud_info_.intensity_offset, cloud_info_.intensity_datatype, cloud_info_.intensity_count) =
			
 
				+    getFieldInfo(msg, "intensity");
			
 
				+  cloud_info_.point_step = msg.point_step;
			
 
				+  cloud_info_.is_bigendian = msg.is_bigendian;
			
 
				+}
			
 
				+
			
 
				+std::tuple<const uint32_t, const uint8_t, const uint8_t> VoxelGenerator::getFieldInfo(
			
 
				+  const sensor_msgs::msg::PointCloud2 & msg, const std::string & field_name)
			
 
				+{
			
 
				+  for (const auto & field : msg.fields) {
			
 
				+    if (field.name == field_name) {
			
 
				+      return std::make_tuple(field.offset, field.datatype, field.count);
			
 
				+    }
			
 
				+  }
			
 
				+  throw std::runtime_error("Missing field: " + field_name);
			
 
				+}
			
 
				+}  // namespace autoware::lidar_transfusion
			
--- a/src/detection/detection_lidar_transfusion/simple_profiler.cpp
+++ b/src/detection/detection_lidar_transfusion/simple_profiler.cpp
@@ -0,0 +1,132 @@
 
				+// Copyright 2023 TIER IV, Inc.
			
 
				+//
			
 
				+// Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+// you may not use this file except in compliance with the License.
			
 
				+// You may obtain a copy of the License at
			
 
				+//
			
 
				+//     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+//
			
 
				+// Unless required by applicable law or agreed to in writing, software
			
 
				+// distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+// See the License for the specific language governing permissions and
			
 
				+// limitations under the License.
			
 
				+
			
 
				+#include <tensorrt_common/simple_profiler.hpp>
			
 
				+
			
 
				+#include <iomanip>
			
 
				+
			
 
				+namespace tensorrt_common
			
 
				+{
			
 
				+
			
 
				+SimpleProfiler::SimpleProfiler(std::string name, const std::vector<SimpleProfiler> & src_profilers)
			
 
				+: m_name(name)
			
 
				+{
			
 
				+  m_index = 0;
			
 
				+  for (const auto & src_profiler : src_profilers) {
			
 
				+    for (const auto & rec : src_profiler.m_profile) {
			
 
				+      auto it = m_profile.find(rec.first);
			
 
				+      if (it == m_profile.end()) {
			
 
				+        m_profile.insert(rec);
			
 
				+      } else {
			
 
				+        it->second.time += rec.second.time;
			
 
				+        it->second.count += rec.second.count;
			
 
				+      }
			
 
				+    }
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+void SimpleProfiler::reportLayerTime(const char * layerName, float ms) noexcept
			
 
				+{
			
 
				+  m_profile[layerName].count++;
			
 
				+  m_profile[layerName].time += ms;
			
 
				+  if (m_profile[layerName].min_time == -1.0) {
			
 
				+    m_profile[layerName].min_time = ms;
			
 
				+    m_profile[layerName].index = m_index;
			
 
				+    m_index++;
			
 
				+  } else if (m_profile[layerName].min_time > ms) {
			
 
				+    m_profile[layerName].min_time = ms;
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+void SimpleProfiler::setProfDict(nvinfer1::ILayer * layer) noexcept
			
 
				+{
			
 
				+  std::string name = layer->getName();
			
 
				+  m_layer_dict[name];
			
 
				+  m_layer_dict[name].type = layer->getType();
			
 
				+  if (layer->getType() == nvinfer1::LayerType::kCONVOLUTION) {
			
 
				+    nvinfer1::IConvolutionLayer * conv = (nvinfer1::IConvolutionLayer *)layer;
			
 
				+    nvinfer1::ITensor * in = layer->getInput(0);
			
 
				+    nvinfer1::Dims dim_in = in->getDimensions();
			
 
				+    nvinfer1::ITensor * out = layer->getOutput(0);
			
 
				+    nvinfer1::Dims dim_out = out->getDimensions();
			
 
				+    nvinfer1::Dims k_dims = conv->getKernelSizeNd();
			
 
				+    nvinfer1::Dims s_dims = conv->getStrideNd();
			
 
				+    int groups = conv->getNbGroups();
			
 
				+    int stride = s_dims.d[0];
			
 
				+    int kernel = k_dims.d[0];
			
 
				+    m_layer_dict[name].in_c = dim_in.d[1];
			
 
				+    m_layer_dict[name].out_c = dim_out.d[1];
			
 
				+    m_layer_dict[name].w = dim_in.d[3];
			
 
				+    m_layer_dict[name].h = dim_in.d[2];
			
 
				+    m_layer_dict[name].k = kernel;
			
 
				+    ;
			
 
				+    m_layer_dict[name].stride = stride;
			
 
				+    m_layer_dict[name].groups = groups;
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+std::ostream & operator<<(std::ostream & out, const SimpleProfiler & value)
			
 
				+{
			
 
				+  out << "========== " << value.m_name << " profile ==========" << std::endl;
			
 
				+  float totalTime = 0;
			
 
				+  std::string layerNameStr = "Operation";
			
 
				+
			
 
				+  int maxLayerNameLength = static_cast<int>(layerNameStr.size());
			
 
				+  for (const auto & elem : value.m_profile) {
			
 
				+    totalTime += elem.second.time;
			
 
				+    maxLayerNameLength = std::max(maxLayerNameLength, static_cast<int>(elem.first.size()));
			
 
				+  }
			
 
				+
			
 
				+  auto old_settings = out.flags();
			
 
				+  auto old_precision = out.precision();
			
 
				+  // Output header
			
 
				+  {
			
 
				+    out << "index, " << std::setw(12);
			
 
				+    out << std::setw(maxLayerNameLength) << layerNameStr << " ";
			
 
				+    out << std::setw(12) << "Runtime"
			
 
				+        << "%,"
			
 
				+        << " ";
			
 
				+    out << std::setw(12) << "Invocations"
			
 
				+        << " , ";
			
 
				+    out << std::setw(12) << "Runtime[ms]"
			
 
				+        << " , ";
			
 
				+    out << std::setw(12) << "Avg Runtime[ms]"
			
 
				+        << " ,";
			
 
				+    out << std::setw(12) << "Min Runtime[ms]" << std::endl;
			
 
				+  }
			
 
				+  int index = value.m_index;
			
 
				+  for (int i = 0; i < index; i++) {
			
 
				+    for (const auto & elem : value.m_profile) {
			
 
				+      if (elem.second.index == i) {
			
 
				+        out << i << ",   ";
			
 
				+        out << std::setw(maxLayerNameLength) << elem.first << ",";
			
 
				+        out << std::setw(12) << std::fixed << std::setprecision(1)
			
 
				+            << (elem.second.time * 100.0F / totalTime) << "%"
			
 
				+            << ",";
			
 
				+        out << std::setw(12) << elem.second.count << ",";
			
 
				+        out << std::setw(12) << std::fixed << std::setprecision(2) << elem.second.time << ", ";
			
 
				+        out << std::setw(12) << std::fixed << std::setprecision(2)
			
 
				+            << elem.second.time / elem.second.count << ", ";
			
 
				+        out << std::setw(12) << std::fixed << std::setprecision(2) << elem.second.min_time
			
 
				+            << std::endl;
			
 
				+      }
			
 
				+    }
			
 
				+  }
			
 
				+  out.flags(old_settings);
			
 
				+  out.precision(old_precision);
			
 
				+  out << "========== " << value.m_name << " total runtime = " << totalTime
			
 
				+      << " ms ==========" << std::endl;
			
 
				+  return out;
			
 
				+}
			
 
				+}  // namespace tensorrt_common
			
--- a/src/detection/detection_lidar_transfusion/tensorrt_common.cpp
+++ b/src/detection/detection_lidar_transfusion/tensorrt_common.cpp
@@ -0,0 +1,605 @@
 
				+// Copyright 2023 TIER IV, Inc.
			
 
				+//
			
 
				+// Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+// you may not use this file except in compliance with the License.
			
 
				+// You may obtain a copy of the License at
			
 
				+//
			
 
				+//     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+//
			
 
				+// Unless required by applicable law or agreed to in writing, software
			
 
				+// distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+// See the License for the specific language governing permissions and
			
 
				+// limitations under the License.
			
 
				+
			
 
				+#include <tensorrt_common/tensorrt_common.hpp>
			
 
				+
			
 
				+#include <NvInferPlugin.h>
			
 
				+#include <dlfcn.h>
			
 
				+
			
 
				+#include <fstream>
			
 
				+#include <functional>
			
 
				+#include <iostream>
			
 
				+#include <memory>
			
 
				+#include <string>
			
 
				+#include <utility>
			
 
				+
			
 
				+namespace
			
 
				+{
			
 
				+template <class T>
			
 
				+bool contain(const std::string & s, const T & v)
			
 
				+{
			
 
				+  return s.find(v) != std::string::npos;
			
 
				+}
			
 
				+}  // anonymous namespace
			
 
				+
			
 
				+namespace tensorrt_common
			
 
				+{
			
 
				+nvinfer1::Dims get_input_dims(const std::string & onnx_file_path)
			
 
				+{
			
 
				+  Logger logger_;
			
 
				+  auto builder = TrtUniquePtr<nvinfer1::IBuilder>(nvinfer1::createInferBuilder(logger_));
			
 
				+  if (!builder) {
			
 
				+    logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to create builder");
			
 
				+  }
			
 
				+
			
 
				+  const auto explicitBatch =
			
 
				+    1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
			
 
				+
			
 
				+  auto network =
			
 
				+    TrtUniquePtr<nvinfer1::INetworkDefinition>(builder->createNetworkV2(explicitBatch));
			
 
				+  if (!network) {
			
 
				+    logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to create network");
			
 
				+  }
			
 
				+
			
 
				+  auto config = TrtUniquePtr<nvinfer1::IBuilderConfig>(builder->createBuilderConfig());
			
 
				+  if (!config) {
			
 
				+    logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to create builder config");
			
 
				+  }
			
 
				+
			
 
				+  auto parser = TrtUniquePtr<nvonnxparser::IParser>(nvonnxparser::createParser(*network, logger_));
			
 
				+  if (!parser->parseFromFile(
			
 
				+        onnx_file_path.c_str(), static_cast<int>(nvinfer1::ILogger::Severity::kERROR))) {
			
 
				+    logger_.log(nvinfer1::ILogger::Severity::kERROR, "Failed to parse onnx file");
			
 
				+  }
			
 
				+
			
 
				+  const auto input = network->getInput(0);
			
 
				+  return input->getDimensions();
			
 
				+}
			
 
				+
			
 
				+bool is_valid_precision_string(const std::string & precision)
			
 
				+{
			
 
				+  if (
			
 
				+    std::find(valid_precisions.begin(), valid_precisions.end(), precision) ==
			
 
				+    valid_precisions.end()) {
			
 
				+    std::stringstream message;
			
 
				+    message << "Invalid precision was specified: " << precision << std::endl
			
 
				+            << "Valid string is one of: [";
			
 
				+    for (const auto & s : valid_precisions) {
			
 
				+      message << s << ", ";
			
 
				+    }
			
 
				+    message << "] (case sensitive)" << std::endl;
			
 
				+    std::cerr << message.str();
			
 
				+    return false;
			
 
				+  } else {
			
 
				+    return true;
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+TrtCommon::TrtCommon(
			
 
				+  const std::string & model_path, const std::string & precision,
			
 
				+  std::unique_ptr<nvinfer1::IInt8Calibrator> calibrator, const BatchConfig & batch_config,
			
 
				+  const size_t max_workspace_size, const BuildConfig & build_config,
			
 
				+  const std::vector<std::string> & plugin_paths)
			
 
				+: model_file_path_(model_path),
			
 
				+  calibrator_(std::move(calibrator)),
			
 
				+  precision_(precision),
			
 
				+  batch_config_(batch_config),
			
 
				+  max_workspace_size_(max_workspace_size),
			
 
				+  model_profiler_("Model"),
			
 
				+  host_profiler_("Host")
			
 
				+{
			
 
				+  // Check given precision is valid one
			
 
				+  if (!is_valid_precision_string(precision)) {
			
 
				+    return;
			
 
				+  }
			
 
				+  build_config_ = std::make_unique<const BuildConfig>(build_config);
			
 
				+
			
 
				+  for (const auto & plugin_path : plugin_paths) {
			
 
				+    int32_t flags{RTLD_LAZY};
			
 
				+// cspell: ignore asan
			
 
				+#if ENABLE_ASAN
			
 
				+    // https://github.com/google/sanitizers/issues/89
			
 
				+    // asan doesn't handle module unloading correctly and there are no plans on doing
			
 
				+    // so. In order to get proper stack traces, don't delete the shared library on
			
 
				+    // close so that asan can resolve the symbols correctly.
			
 
				+    flags |= RTLD_NODELETE;
			
 
				+#endif  // ENABLE_ASAN
			
 
				+    void * handle = dlopen(plugin_path.c_str(), flags);
			
 
				+    if (!handle) {
			
 
				+      logger_.log(nvinfer1::ILogger::Severity::kERROR, "Could not load plugin library");
			
 
				+    }
			
 
				+  }
			
 
				+  runtime_ = TrtUniquePtr<nvinfer1::IRuntime>(nvinfer1::createInferRuntime(logger_));
			
 
				+  if (build_config_->dla_core_id != -1) {
			
 
				+    runtime_->setDLACore(build_config_->dla_core_id);
			
 
				+  }
			
 
				+  initLibNvInferPlugins(&logger_, "");
			
 
				+}
			
 
				+
			
 
				+TrtCommon::~TrtCommon()
			
 
				+{
			
 
				+}
			
 
				+
			
 
				+void TrtCommon::setup()
			
 
				+{
			
 
				+  if (!fs::exists(model_file_path_)) {
			
 
				+    is_initialized_ = false;
			
 
				+    return;
			
 
				+  }
			
 
				+  std::string engine_path = model_file_path_;
			
 
				+  if (model_file_path_.extension() == ".engine") {
			
 
				+    std::cout << "Load ... " << model_file_path_ << std::endl;
			
 
				+    loadEngine(model_file_path_);
			
 
				+  } else if (model_file_path_.extension() == ".onnx") {
			
 
				+    fs::path cache_engine_path{model_file_path_};
			
 
				+    std::string ext;
			
 
				+    std::string calib_name = "";
			
 
				+    if (precision_ == "int8") {
			
 
				+      if (build_config_->calib_type_str == "Entropy") {
			
 
				+        calib_name = "EntropyV2-";
			
 
				+      } else if (
			
 
				+        build_config_->calib_type_str == "Legacy" ||
			
 
				+        build_config_->calib_type_str == "Percentile") {
			
 
				+        calib_name = "Legacy-";
			
 
				+      } else {
			
 
				+        calib_name = "MinMax-";
			
 
				+      }
			
 
				+    }
			
 
				+    if (build_config_->dla_core_id != -1) {
			
 
				+      ext = "DLA" + std::to_string(build_config_->dla_core_id) + "-" + calib_name + precision_;
			
 
				+      if (build_config_->quantize_first_layer) {
			
 
				+        ext += "-firstFP16";
			
 
				+      }
			
 
				+      if (build_config_->quantize_last_layer) {
			
 
				+        ext += "-lastFP16";
			
 
				+      }
			
 
				+      ext += "-batch" + std::to_string(batch_config_[0]) + ".engine";
			
 
				+    } else {
			
 
				+      ext = calib_name + precision_;
			
 
				+      if (build_config_->quantize_first_layer) {
			
 
				+        ext += "-firstFP16";
			
 
				+      }
			
 
				+      if (build_config_->quantize_last_layer) {
			
 
				+        ext += "-lastFP16";
			
 
				+      }
			
 
				+      ext += "-batch" + std::to_string(batch_config_[0]) + ".engine";
			
 
				+    }
			
 
				+    cache_engine_path.replace_extension(ext);
			
 
				+
			
 
				+    // Output Network Information
			
 
				+    printNetworkInfo(model_file_path_);
			
 
				+
			
 
				+    if (fs::exists(cache_engine_path)) {
			
 
				+      std::cout << "Loading... " << cache_engine_path << std::endl;
			
 
				+      loadEngine(cache_engine_path);
			
 
				+    } else {
			
 
				+      std::cout << "Building... " << cache_engine_path << std::endl;
			
 
				+      logger_.log(nvinfer1::ILogger::Severity::kINFO, "Start build engine");
			
 
				+      auto log_thread = logger_.log_throttle(
			
 
				+        nvinfer1::ILogger::Severity::kINFO,
			
 
				+        "Applying optimizations and building TRT CUDA engine. Please wait for a few minutes...", 5);
			
 
				+      buildEngineFromOnnx(model_file_path_, cache_engine_path);
			
 
				+      logger_.stop_throttle(log_thread);
			
 
				+      logger_.log(nvinfer1::ILogger::Severity::kINFO, "End build engine");
			
 
				+    }
			
 
				+    engine_path = cache_engine_path;
			
 
				+  } else {
			
 
				+    is_initialized_ = false;
			
 
				+    return;
			
 
				+  }
			
 
				+
			
 
				+  context_ = TrtUniquePtr<nvinfer1::IExecutionContext>(engine_->createExecutionContext());
			
 
				+  if (!context_) {
			
 
				+    logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to create context");
			
 
				+    is_initialized_ = false;
			
 
				+    return;
			
 
				+  }
			
 
				+
			
 
				+  if (build_config_->profile_per_layer) {
			
 
				+    context_->setProfiler(&model_profiler_);
			
 
				+  }
			
 
				+#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH >= 8200
			
 
				+  // Write profiles for trt-engine-explorer
			
 
				+  // See: https://github.com/NVIDIA/TensorRT/tree/main/tools/experimental/trt-engine-explorer
			
 
				+  std::string j_ext = ".json";
			
 
				+  fs::path json_path{engine_path};
			
 
				+  json_path.replace_extension(j_ext);
			
 
				+  std::string ret = getLayerInformation(nvinfer1::LayerInformationFormat::kJSON);
			
 
				+  std::ofstream os(json_path, std::ofstream::trunc);
			
 
				+  os << ret << std::flush;
			
 
				+#endif
			
 
				+
			
 
				+  is_initialized_ = true;
			
 
				+}
			
 
				+
			
 
				+bool TrtCommon::loadEngine(const std::string & engine_file_path)
			
 
				+{
			
 
				+  std::ifstream engine_file(engine_file_path);
			
 
				+  std::stringstream engine_buffer;
			
 
				+  engine_buffer << engine_file.rdbuf();
			
 
				+  std::string engine_str = engine_buffer.str();
			
 
				+  engine_ = TrtUniquePtr<nvinfer1::ICudaEngine>(runtime_->deserializeCudaEngine(
			
 
				+    reinterpret_cast<const void *>(engine_str.data()), engine_str.size()));
			
 
				+  return true;
			
 
				+}
			
 
				+
			
 
				+void TrtCommon::printNetworkInfo(const std::string & onnx_file_path)
			
 
				+{
			
 
				+  auto builder = TrtUniquePtr<nvinfer1::IBuilder>(nvinfer1::createInferBuilder(logger_));
			
 
				+  if (!builder) {
			
 
				+    logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to create builder");
			
 
				+    return;
			
 
				+  }
			
 
				+
			
 
				+  const auto explicitBatch =
			
 
				+    1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
			
 
				+
			
 
				+  auto network =
			
 
				+    TrtUniquePtr<nvinfer1::INetworkDefinition>(builder->createNetworkV2(explicitBatch));
			
 
				+  if (!network) {
			
 
				+    logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to create network");
			
 
				+    return;
			
 
				+  }
			
 
				+
			
 
				+  auto config = TrtUniquePtr<nvinfer1::IBuilderConfig>(builder->createBuilderConfig());
			
 
				+  if (!config) {
			
 
				+    logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to create builder config");
			
 
				+    return;
			
 
				+  }
			
 
				+
			
 
				+  if (precision_ == "fp16" || precision_ == "int8") {
			
 
				+    config->setFlag(nvinfer1::BuilderFlag::kFP16);
			
 
				+  }
			
 
				+#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH >= 8400
			
 
				+  config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, max_workspace_size_);
			
 
				+#else
			
 
				+  config->setMaxWorkspaceSize(max_workspace_size_);
			
 
				+#endif
			
 
				+
			
 
				+  auto parser = TrtUniquePtr<nvonnxparser::IParser>(nvonnxparser::createParser(*network, logger_));
			
 
				+  if (!parser->parseFromFile(
			
 
				+        onnx_file_path.c_str(), static_cast<int>(nvinfer1::ILogger::Severity::kERROR))) {
			
 
				+    return;
			
 
				+  }
			
 
				+  int num = network->getNbLayers();
			
 
				+  float total_gflops = 0.0;
			
 
				+  int total_params = 0;
			
 
				+  for (int i = 0; i < num; i++) {
			
 
				+    nvinfer1::ILayer * layer = network->getLayer(i);
			
 
				+    auto layer_type = layer->getType();
			
 
				+    if (build_config_->profile_per_layer) {
			
 
				+      model_profiler_.setProfDict(layer);
			
 
				+    }
			
 
				+    if (layer_type == nvinfer1::LayerType::kCONSTANT) {
			
 
				+      continue;
			
 
				+    }
			
 
				+    nvinfer1::ITensor * in = layer->getInput(0);
			
 
				+    nvinfer1::Dims dim_in = in->getDimensions();
			
 
				+    nvinfer1::ITensor * out = layer->getOutput(0);
			
 
				+    nvinfer1::Dims dim_out = out->getDimensions();
			
 
				+
			
 
				+    if (layer_type == nvinfer1::LayerType::kCONVOLUTION) {
			
 
				+      nvinfer1::IConvolutionLayer * conv = (nvinfer1::IConvolutionLayer *)layer;
			
 
				+      nvinfer1::Dims k_dims = conv->getKernelSizeNd();
			
 
				+      nvinfer1::Dims s_dims = conv->getStrideNd();
			
 
				+      int groups = conv->getNbGroups();
			
 
				+      int stride = s_dims.d[0];
			
 
				+      int num_weights = (dim_in.d[1] / groups) * dim_out.d[1] * k_dims.d[0] * k_dims.d[1];
			
 
				+      float gflops = (2 * num_weights) * (dim_in.d[3] / stride * dim_in.d[2] / stride / 1e9);
			
 
				+      ;
			
 
				+      total_gflops += gflops;
			
 
				+      total_params += num_weights;
			
 
				+      std::cout << "L" << i << " [conv " << k_dims.d[0] << "x" << k_dims.d[1] << " (" << groups
			
 
				+                << ") "
			
 
				+                << "/" << s_dims.d[0] << "] " << dim_in.d[3] << "x" << dim_in.d[2] << "x"
			
 
				+                << dim_in.d[1] << " -> " << dim_out.d[3] << "x" << dim_out.d[2] << "x"
			
 
				+                << dim_out.d[1];
			
 
				+      std::cout << " weights:" << num_weights;
			
 
				+      std::cout << " GFLOPs:" << gflops;
			
 
				+      std::cout << std::endl;
			
 
				+    } else if (layer_type == nvinfer1::LayerType::kPOOLING) {
			
 
				+      nvinfer1::IPoolingLayer * pool = (nvinfer1::IPoolingLayer *)layer;
			
 
				+      auto p_type = pool->getPoolingType();
			
 
				+      nvinfer1::Dims dim_stride = pool->getStrideNd();
			
 
				+      nvinfer1::Dims dim_window = pool->getWindowSizeNd();
			
 
				+
			
 
				+      std::cout << "L" << i << " [";
			
 
				+      if (p_type == nvinfer1::PoolingType::kMAX) {
			
 
				+        std::cout << "max ";
			
 
				+      } else if (p_type == nvinfer1::PoolingType::kAVERAGE) {
			
 
				+        std::cout << "avg ";
			
 
				+      } else if (p_type == nvinfer1::PoolingType::kMAX_AVERAGE_BLEND) {
			
 
				+        std::cout << "max avg blend ";
			
 
				+      }
			
 
				+      float gflops = dim_in.d[1] * dim_window.d[0] / dim_stride.d[0] * dim_window.d[1] /
			
 
				+                     dim_stride.d[1] * dim_in.d[2] * dim_in.d[3] / 1e9;
			
 
				+      total_gflops += gflops;
			
 
				+      std::cout << "pool " << dim_window.d[0] << "x" << dim_window.d[1] << "]";
			
 
				+      std::cout << " GFLOPs:" << gflops;
			
 
				+      std::cout << std::endl;
			
 
				+    } else if (layer_type == nvinfer1::LayerType::kRESIZE) {
			
 
				+      std::cout << "L" << i << " [resize]" << std::endl;
			
 
				+    }
			
 
				+  }
			
 
				+  std::cout << "Total " << total_gflops << " GFLOPs" << std::endl;
			
 
				+  std::cout << "Total " << total_params / 1000.0 / 1000.0 << " M params" << std::endl;
			
 
				+  return;
			
 
				+}
			
 
				+
			
 
				+bool TrtCommon::buildEngineFromOnnx(
			
 
				+  const std::string & onnx_file_path, const std::string & output_engine_file_path)
			
 
				+{
			
 
				+  auto builder = TrtUniquePtr<nvinfer1::IBuilder>(nvinfer1::createInferBuilder(logger_));
			
 
				+  if (!builder) {
			
 
				+    logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to create builder");
			
 
				+    return false;
			
 
				+  }
			
 
				+
			
 
				+  const auto explicitBatch =
			
 
				+    1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
			
 
				+
			
 
				+  auto network =
			
 
				+    TrtUniquePtr<nvinfer1::INetworkDefinition>(builder->createNetworkV2(explicitBatch));
			
 
				+  if (!network) {
			
 
				+    logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to create network");
			
 
				+    return false;
			
 
				+  }
			
 
				+
			
 
				+  auto config = TrtUniquePtr<nvinfer1::IBuilderConfig>(builder->createBuilderConfig());
			
 
				+  if (!config) {
			
 
				+    logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to create builder config");
			
 
				+    return false;
			
 
				+  }
			
 
				+
			
 
				+  int num_available_dla = builder->getNbDLACores();
			
 
				+  if (build_config_->dla_core_id != -1) {
			
 
				+    if (num_available_dla > 0) {
			
 
				+      std::cout << "###" << num_available_dla << " DLAs are supported! ###" << std::endl;
			
 
				+    } else {
			
 
				+      std::cout << "###Warning : "
			
 
				+                << "No DLA is supported! ###" << std::endl;
			
 
				+    }
			
 
				+    config->setDefaultDeviceType(nvinfer1::DeviceType::kDLA);
			
 
				+    config->setDLACore(build_config_->dla_core_id);
			
 
				+#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH >= 8200
			
 
				+    config->setFlag(nvinfer1::BuilderFlag::kPREFER_PRECISION_CONSTRAINTS);
			
 
				+#else
			
 
				+    config->setFlag(nvinfer1::BuilderFlag::kSTRICT_TYPES);
			
 
				+#endif
			
 
				+    config->setFlag(nvinfer1::BuilderFlag::kGPU_FALLBACK);
			
 
				+  }
			
 
				+  if (precision_ == "fp16" || precision_ == "int8") {
			
 
				+    config->setFlag(nvinfer1::BuilderFlag::kFP16);
			
 
				+  }
			
 
				+#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH >= 8400
			
 
				+  config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, max_workspace_size_);
			
 
				+#else
			
 
				+  config->setMaxWorkspaceSize(max_workspace_size_);
			
 
				+#endif
			
 
				+
			
 
				+  auto parser = TrtUniquePtr<nvonnxparser::IParser>(nvonnxparser::createParser(*network, logger_));
			
 
				+  if (!parser->parseFromFile(
			
 
				+        onnx_file_path.c_str(), static_cast<int>(nvinfer1::ILogger::Severity::kERROR))) {
			
 
				+    std::cout << "Failed to parse onnx file" << std::endl;
			
 
				+    return false;
			
 
				+  }
			
 
				+
			
 
				+  const int num = network->getNbLayers();
			
 
				+  bool first = build_config_->quantize_first_layer;
			
 
				+  bool last = build_config_->quantize_last_layer;
			
 
				+  // Partial Quantization
			
 
				+  if (precision_ == "int8") {
			
 
				+    network->getInput(0)->setDynamicRange(0, 255.0);
			
 
				+    for (int i = 0; i < num; i++) {
			
 
				+      nvinfer1::ILayer * layer = network->getLayer(i);
			
 
				+      auto layer_type = layer->getType();
			
 
				+      std::string name = layer->getName();
			
 
				+      nvinfer1::ITensor * out = layer->getOutput(0);
			
 
				+      if (build_config_->clip_value > 0.0) {
			
 
				+        std::cout << "Set max value for outputs : " << build_config_->clip_value << "  " << name
			
 
				+                  << std::endl;
			
 
				+        out->setDynamicRange(0.0, build_config_->clip_value);
			
 
				+      }
			
 
				+
			
 
				+      if (layer_type == nvinfer1::LayerType::kCONVOLUTION) {
			
 
				+        if (first) {
			
 
				+          layer->setPrecision(nvinfer1::DataType::kHALF);
			
 
				+          std::cout << "Set kHALF in " << name << std::endl;
			
 
				+          first = false;
			
 
				+        }
			
 
				+        if (last) {
			
 
				+          // cspell: ignore preds
			
 
				+          if (
			
 
				+            contain(name, "reg_preds") || contain(name, "cls_preds") ||
			
 
				+            contain(name, "obj_preds")) {
			
 
				+            layer->setPrecision(nvinfer1::DataType::kHALF);
			
 
				+            std::cout << "Set kHALF in " << name << std::endl;
			
 
				+          }
			
 
				+          for (int j = num - 1; j >= 0; j--) {
			
 
				+            nvinfer1::ILayer * inner_layer = network->getLayer(j);
			
 
				+            auto inner_layer_type = inner_layer->getType();
			
 
				+            std::string inner_name = inner_layer->getName();
			
 
				+            if (inner_layer_type == nvinfer1::LayerType::kCONVOLUTION) {
			
 
				+              inner_layer->setPrecision(nvinfer1::DataType::kHALF);
			
 
				+              std::cout << "Set kHALF in " << inner_name << std::endl;
			
 
				+              break;
			
 
				+            }
			
 
				+            if (inner_layer_type == nvinfer1::LayerType::kMATRIX_MULTIPLY) {
			
 
				+              inner_layer->setPrecision(nvinfer1::DataType::kHALF);
			
 
				+              std::cout << "Set kHALF in " << inner_name << std::endl;
			
 
				+              break;
			
 
				+            }
			
 
				+          }
			
 
				+        }
			
 
				+      }
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  const auto input = network->getInput(0);
			
 
				+  const auto input_dims = input->getDimensions();
			
 
				+  const auto input_channel = input_dims.d[1];
			
 
				+  const auto input_height = input_dims.d[2];
			
 
				+  const auto input_width = input_dims.d[3];
			
 
				+  const auto input_batch = input_dims.d[0];
			
 
				+
			
 
				+  if (input_batch > 1) {
			
 
				+    batch_config_[0] = input_batch;
			
 
				+  }
			
 
				+
			
 
				+  if (batch_config_.at(0) > 1 && (batch_config_.at(0) == batch_config_.at(2))) {
			
 
				+    // Attention : below API is deprecated in TRT8.4
			
 
				+    builder->setMaxBatchSize(batch_config_.at(2));
			
 
				+  } else {
			
 
				+    if (build_config_->profile_per_layer) {
			
 
				+      auto profile = builder->createOptimizationProfile();
			
 
				+      profile->setDimensions(
			
 
				+        network->getInput(0)->getName(), nvinfer1::OptProfileSelector::kMIN,
			
 
				+        nvinfer1::Dims4{batch_config_.at(0), input_channel, input_height, input_width});
			
 
				+      profile->setDimensions(
			
 
				+        network->getInput(0)->getName(), nvinfer1::OptProfileSelector::kOPT,
			
 
				+        nvinfer1::Dims4{batch_config_.at(1), input_channel, input_height, input_width});
			
 
				+      profile->setDimensions(
			
 
				+        network->getInput(0)->getName(), nvinfer1::OptProfileSelector::kMAX,
			
 
				+        nvinfer1::Dims4{batch_config_.at(2), input_channel, input_height, input_width});
			
 
				+      config->addOptimizationProfile(profile);
			
 
				+    }
			
 
				+  }
			
 
				+  if (precision_ == "int8" && calibrator_) {
			
 
				+    config->setFlag(nvinfer1::BuilderFlag::kINT8);
			
 
				+#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH >= 8200
			
 
				+    config->setFlag(nvinfer1::BuilderFlag::kPREFER_PRECISION_CONSTRAINTS);
			
 
				+#else
			
 
				+    config->setFlag(nvinfer1::BuilderFlag::kSTRICT_TYPES);
			
 
				+#endif
			
 
				+    // QAT requires no calibrator.
			
 
				+    //    assert((calibrator != nullptr) && "Invalid calibrator for INT8 precision");
			
 
				+    config->setInt8Calibrator(calibrator_.get());
			
 
				+  }
			
 
				+  if (build_config_->profile_per_layer) {
			
 
				+#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH >= 8200
			
 
				+    config->setProfilingVerbosity(nvinfer1::ProfilingVerbosity::kDETAILED);
			
 
				+#else
			
 
				+    config->setProfilingVerbosity(nvinfer1::ProfilingVerbosity::kVERBOSE);
			
 
				+#endif
			
 
				+  }
			
 
				+
			
 
				+#if TENSORRT_VERSION_MAJOR >= 8
			
 
				+  auto plan =
			
 
				+    TrtUniquePtr<nvinfer1::IHostMemory>(builder->buildSerializedNetwork(*network, *config));
			
 
				+  if (!plan) {
			
 
				+    logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to create host memory");
			
 
				+    return false;
			
 
				+  }
			
 
				+  engine_ = TrtUniquePtr<nvinfer1::ICudaEngine>(
			
 
				+    runtime_->deserializeCudaEngine(plan->data(), plan->size()));
			
 
				+#else
			
 
				+  engine_ = TrtUniquePtr<nvinfer1::ICudaEngine>(builder->buildEngineWithConfig(*network, *config));
			
 
				+#endif
			
 
				+
			
 
				+  if (!engine_) {
			
 
				+    logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to create engine");
			
 
				+    return false;
			
 
				+  }
			
 
				+
			
 
				+  // save engine
			
 
				+#if TENSORRT_VERSION_MAJOR < 8
			
 
				+  auto data = TrtUniquePtr<nvinfer1::IHostMemory>(engine_->serialize());
			
 
				+#endif
			
 
				+  std::ofstream file;
			
 
				+  file.open(output_engine_file_path, std::ios::binary | std::ios::out);
			
 
				+  if (!file.is_open()) {
			
 
				+    return false;
			
 
				+  }
			
 
				+#if TENSORRT_VERSION_MAJOR < 8
			
 
				+  file.write(reinterpret_cast<const char *>(data->data()), data->size());
			
 
				+#else
			
 
				+  file.write(reinterpret_cast<const char *>(plan->data()), plan->size());
			
 
				+#endif
			
 
				+
			
 
				+  file.close();
			
 
				+
			
 
				+  return true;
			
 
				+}
			
 
				+
			
 
				+bool TrtCommon::isInitialized()
			
 
				+{
			
 
				+  return is_initialized_;
			
 
				+}
			
 
				+
			
 
				+nvinfer1::Dims TrtCommon::getBindingDimensions(const int32_t index) const
			
 
				+{
			
 
				+#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + (NV_TENSOR_PATCH * 10) >= 8500
			
 
				+  auto const & name = engine_->getIOTensorName(index);
			
 
				+  auto dims = context_->getTensorShape(name);
			
 
				+  bool const has_runtime_dim =
			
 
				+    std::any_of(dims.d, dims.d + dims.nbDims, [](int32_t dim) { return dim == -1; });
			
 
				+
			
 
				+  if (has_runtime_dim) {
			
 
				+    return dims;
			
 
				+  } else {
			
 
				+    return context_->getBindingDimensions(index);
			
 
				+  }
			
 
				+#else
			
 
				+  return context_->getBindingDimensions(index);
			
 
				+#endif
			
 
				+}
			
 
				+
			
 
				+int32_t TrtCommon::getNbBindings()
			
 
				+{
			
 
				+  return engine_->getNbBindings();
			
 
				+}
			
 
				+
			
 
				+bool TrtCommon::setBindingDimensions(const int32_t index, const nvinfer1::Dims & dimensions) const
			
 
				+{
			
 
				+  return context_->setBindingDimensions(index, dimensions);
			
 
				+}
			
 
				+
			
 
				+bool TrtCommon::enqueueV2(void ** bindings, cudaStream_t stream, cudaEvent_t * input_consumed)
			
 
				+{
			
 
				+  if (build_config_->profile_per_layer) {
			
 
				+    auto inference_start = std::chrono::high_resolution_clock::now();
			
 
				+
			
 
				+    bool ret = context_->enqueueV2(bindings, stream, input_consumed);
			
 
				+
			
 
				+    auto inference_end = std::chrono::high_resolution_clock::now();
			
 
				+    host_profiler_.reportLayerTime(
			
 
				+      "inference",
			
 
				+      std::chrono::duration<float, std::milli>(inference_end - inference_start).count());
			
 
				+    return ret;
			
 
				+  } else {
			
 
				+    return context_->enqueueV2(bindings, stream, input_consumed);
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+void TrtCommon::printProfiling()
			
 
				+{
			
 
				+  std::cout << host_profiler_;
			
 
				+  std::cout << std::endl;
			
 
				+  std::cout << model_profiler_;
			
 
				+}
			
 
				+
			
 
				+#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH >= 8200
			
 
				+std::string TrtCommon::getLayerInformation(nvinfer1::LayerInformationFormat format)
			
 
				+{
			
 
				+  auto runtime = std::unique_ptr<nvinfer1::IRuntime>(nvinfer1::createInferRuntime(logger_));
			
 
				+  auto inspector = std::unique_ptr<nvinfer1::IEngineInspector>(engine_->createEngineInspector());
			
 
				+  if (context_ != nullptr) {
			
 
				+    inspector->setExecutionContext(&(*context_));
			
 
				+  }
			
 
				+  std::string result = inspector->getEngineInformation(format);
			
 
				+  return result;
			
 
				+}
			
 
				+#endif
			
 
				+
			
 
				+}  // namespace tensorrt_common
			
--- a/src/detection/detection_lidar_transfusion/transfusion_config.hpp
+++ b/src/detection/detection_lidar_transfusion/transfusion_config.hpp
@@ -0,0 +1,170 @@
 
				+// Copyright 2024 TIER IV, Inc.
			
 
				+//
			
 
				+// Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+// you may not use this file except in compliance with the License.
			
 
				+// You may obtain a copy of the License at
			
 
				+//
			
 
				+//     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+//
			
 
				+// Unless required by applicable law or agreed to in writing, software
			
 
				+// distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+// See the License for the specific language governing permissions and
			
 
				+// limitations under the License.
			
 
				+
			
 
				+#ifndef AUTOWARE__LIDAR_TRANSFUSION__TRANSFUSION_CONFIG_HPP_
			
 
				+#define AUTOWARE__LIDAR_TRANSFUSION__TRANSFUSION_CONFIG_HPP_
			
 
				+
			
 
				+#include <cstdint>
			
 
				+#include <vector>
			
 
				+
			
 
				+namespace autoware::lidar_transfusion
			
 
				+{
			
 
				+
			
 
				+class TransfusionConfig
			
 
				+{
			
 
				+public:
			
 
				+  TransfusionConfig(
			
 
				+    const std::size_t cloud_capacity, const std::vector<int64_t> & voxels_num,
			
 
				+    const std::vector<double> & point_cloud_range, const std::vector<double> & voxel_size,
			
 
				+    const std::size_t num_proposals, const float circle_nms_dist_threshold,
			
 
				+    const std::vector<double> & yaw_norm_thresholds, const float score_threshold)
			
 
				+  {
			
 
				+    cloud_capacity_ = cloud_capacity;
			
 
				+
			
 
				+    if (voxels_num.size() == 3) {
			
 
				+      max_voxels_ = voxels_num[2];
			
 
				+
			
 
				+      voxels_num_[0] = voxels_num[0];
			
 
				+      voxels_num_[1] = voxels_num[1];
			
 
				+      voxels_num_[2] = voxels_num[2];
			
 
				+
			
 
				+      min_voxel_size_ = voxels_num[0];
			
 
				+      opt_voxel_size_ = voxels_num[1];
			
 
				+      max_voxel_size_ = voxels_num[2];
			
 
				+
			
 
				+      min_points_size_ = voxels_num[0];
			
 
				+      opt_points_size_ = voxels_num[1];
			
 
				+      max_points_size_ = voxels_num[2];
			
 
				+
			
 
				+      min_coors_size_ = voxels_num[0];
			
 
				+      opt_coors_size_ = voxels_num[1];
			
 
				+      max_coors_size_ = voxels_num[2];
			
 
				+    }
			
 
				+    if (point_cloud_range.size() == 6) {
			
 
				+      min_x_range_ = static_cast<float>(point_cloud_range[0]);
			
 
				+      min_y_range_ = static_cast<float>(point_cloud_range[1]);
			
 
				+      min_z_range_ = static_cast<float>(point_cloud_range[2]);
			
 
				+      max_x_range_ = static_cast<float>(point_cloud_range[3]);
			
 
				+      max_y_range_ = static_cast<float>(point_cloud_range[4]);
			
 
				+      max_z_range_ = static_cast<float>(point_cloud_range[5]);
			
 
				+    }
			
 
				+    if (voxel_size.size() == 3) {
			
 
				+      voxel_x_size_ = static_cast<float>(voxel_size[0]);
			
 
				+      voxel_y_size_ = static_cast<float>(voxel_size[1]);
			
 
				+      voxel_z_size_ = static_cast<float>(voxel_size[2]);
			
 
				+    }
			
 
				+    if (num_proposals > 0) {
			
 
				+      num_proposals_ = num_proposals;
			
 
				+    }
			
 
				+    if (score_threshold > 0.0) {
			
 
				+      score_threshold_ = score_threshold;
			
 
				+    }
			
 
				+    if (circle_nms_dist_threshold > 0.0) {
			
 
				+      circle_nms_dist_threshold_ = circle_nms_dist_threshold;
			
 
				+    }
			
 
				+    yaw_norm_thresholds_ =
			
 
				+      std::vector<float>(yaw_norm_thresholds.begin(), yaw_norm_thresholds.end());
			
 
				+    for (auto & yaw_norm_threshold : yaw_norm_thresholds_) {
			
 
				+      yaw_norm_threshold =
			
 
				+        (yaw_norm_threshold >= 0.0 && yaw_norm_threshold < 1.0) ? yaw_norm_threshold : 0.0;
			
 
				+    }
			
 
				+    grid_x_size_ = static_cast<std::size_t>((max_x_range_ - min_x_range_) / voxel_x_size_);
			
 
				+    grid_y_size_ = static_cast<std::size_t>((max_y_range_ - min_y_range_) / voxel_y_size_);
			
 
				+    grid_z_size_ = static_cast<std::size_t>((max_z_range_ - min_z_range_) / voxel_z_size_);
			
 
				+
			
 
				+    feature_x_size_ = grid_x_size_ / out_size_factor_;
			
 
				+    feature_y_size_ = grid_y_size_ / out_size_factor_;
			
 
				+  }
			
 
				+
			
 
				+  ///// INPUT PARAMETERS /////
			
 
				+  std::size_t cloud_capacity_{};
			
 
				+  ///// KERNEL PARAMETERS /////
			
 
				+  const std::size_t threads_for_voxel_{256};  // threads number for a block
			
 
				+  const std::size_t points_per_voxel_{20};
			
 
				+  const std::size_t warp_size_{32};          // one warp(32 threads) for one pillar
			
 
				+  const std::size_t pillars_per_block_{64};  // one thread deals with one pillar
			
 
				+                                             // and a block has pillars_per_block threads
			
 
				+  const std::size_t pillar_feature_size_{64};
			
 
				+  std::size_t max_voxels_{60000};
			
 
				+
			
 
				+  ///// NETWORK PARAMETERS /////
			
 
				+  const std::size_t batch_size_{1};
			
 
				+  const std::size_t num_classes_{5};
			
 
				+  const std::size_t num_point_feature_size_{5};  // x, y, z, intensity, lag
			
 
				+  // the dimension of the input cloud
			
 
				+  float min_x_range_{-76.8};
			
 
				+  float max_x_range_{76.8};
			
 
				+  float min_y_range_{-76.8};
			
 
				+  float max_y_range_{76.8};
			
 
				+  float min_z_range_{-3.0};
			
 
				+  float max_z_range_{5.0};
			
 
				+  // the size of a pillar
			
 
				+  float voxel_x_size_{0.3};
			
 
				+  float voxel_y_size_{0.3};
			
 
				+  float voxel_z_size_{8.0};
			
 
				+  const std::size_t out_size_factor_{4};
			
 
				+  const std::size_t max_num_points_per_pillar_{points_per_voxel_};
			
 
				+  const std::size_t num_point_values_{4};
			
 
				+  std::size_t num_proposals_{200};
			
 
				+  // the number of feature maps for pillar scatter
			
 
				+  const std::size_t num_feature_scatter_{pillar_feature_size_};
			
 
				+  // the score threshold for classification
			
 
				+  float score_threshold_{0.2};
			
 
				+  float circle_nms_dist_threshold_{0.5};
			
 
				+  std::vector<float> yaw_norm_thresholds_{0.3, 0.3, 0.3, 0.3, 0.0};
			
 
				+  std::size_t max_num_pillars_{max_voxels_};
			
 
				+  const std::size_t pillar_points_bev_{max_num_points_per_pillar_ * max_num_pillars_};
			
 
				+  // the detected boxes result decode by (x, y, z, w, l, h, yaw)
			
 
				+  const std::size_t num_box_values_{8};
			
 
				+  // the input size of the 2D backbone network
			
 
				+  std::size_t grid_x_size_{512};
			
 
				+  std::size_t grid_y_size_{512};
			
 
				+  std::size_t grid_z_size_{1};
			
 
				+  // the output size of the 2D backbone network
			
 
				+  std::size_t feature_x_size_{grid_x_size_ / out_size_factor_};
			
 
				+  std::size_t feature_y_size_{grid_y_size_ / out_size_factor_};
			
 
				+
			
 
				+  ///// RUNTIME DIMENSIONS /////
			
 
				+  std::vector<std::size_t> voxels_num_{5000, 30000, 60000};
			
 
				+  // voxels
			
 
				+  std::size_t min_voxel_size_{voxels_num_[0]};
			
 
				+  std::size_t opt_voxel_size_{voxels_num_[1]};
			
 
				+  std::size_t max_voxel_size_{voxels_num_[2]};
			
 
				+
			
 
				+  std::size_t min_point_in_voxel_size_{points_per_voxel_};
			
 
				+  std::size_t opt_point_in_voxel_size_{points_per_voxel_};
			
 
				+  std::size_t max_point_in_voxel_size_{points_per_voxel_};
			
 
				+
			
 
				+  std::size_t min_network_feature_size_{num_point_feature_size_};
			
 
				+  std::size_t opt_network_feature_size_{num_point_feature_size_};
			
 
				+  std::size_t max_network_feature_size_{num_point_feature_size_};
			
 
				+
			
 
				+  // num_points
			
 
				+  std::size_t min_points_size_{voxels_num_[0]};
			
 
				+  std::size_t opt_points_size_{voxels_num_[1]};
			
 
				+  std::size_t max_points_size_{voxels_num_[2]};
			
 
				+
			
 
				+  // coors
			
 
				+  std::size_t min_coors_size_{voxels_num_[0]};
			
 
				+  std::size_t opt_coors_size_{voxels_num_[1]};
			
 
				+  std::size_t max_coors_size_{voxels_num_[2]};
			
 
				+
			
 
				+  std::size_t min_coors_dim_size_{num_point_values_};
			
 
				+  std::size_t opt_coors_dim_size_{num_point_values_};
			
 
				+  std::size_t max_coors_dim_size_{num_point_values_};
			
 
				+};
			
 
				+
			
 
				+}  // namespace autoware::lidar_transfusion
			
 
				+
			
 
				+#endif  // AUTOWARE__LIDAR_TRANSFUSION__TRANSFUSION_CONFIG_HPP_
			
--- a/src/detection/detection_lidar_transfusion/utils.hpp
+++ b/src/detection/detection_lidar_transfusion/utils.hpp
@@ -0,0 +1,57 @@
 
				+// Copyright 2024 TIER IV, Inc.
			
 
				+//
			
 
				+// Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+// you may not use this file except in compliance with the License.
			
 
				+// You may obtain a copy of the License at
			
 
				+//
			
 
				+//     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+//
			
 
				+// Unless required by applicable law or agreed to in writing, software
			
 
				+// distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+// See the License for the specific language governing permissions and
			
 
				+// limitations under the License.
			
 
				+
			
 
				+#ifndef AUTOWARE__LIDAR_TRANSFUSION__UTILS_HPP_
			
 
				+#define AUTOWARE__LIDAR_TRANSFUSION__UTILS_HPP_
			
 
				+
			
 
				+#include <cstddef>
			
 
				+#include <iostream>
			
 
				+#include <stdexcept>
			
 
				+#include <unordered_map>
			
 
				+
			
 
				+namespace autoware::lidar_transfusion
			
 
				+{
			
 
				+
			
 
				+struct Box3D
			
 
				+{
			
 
				+  int label;
			
 
				+  float score;
			
 
				+  float x;
			
 
				+  float y;
			
 
				+  float z;
			
 
				+  float width;
			
 
				+  float length;
			
 
				+  float height;
			
 
				+  float yaw;
			
 
				+};
			
 
				+
			
 
				+enum NetworkIO { voxels = 0, num_points, coors, cls_score, dir_pred, bbox_pred, ENUM_SIZE };
			
 
				+
			
 
				+// cspell: ignore divup
			
 
				+template <typename T1, typename T2>
			
 
				+unsigned int divup(const T1 a, const T2 b)
			
 
				+{
			
 
				+  if (a == 0) {
			
 
				+    throw std::runtime_error("A dividend of divup isn't positive.");
			
 
				+  }
			
 
				+  if (b == 0) {
			
 
				+    throw std::runtime_error("A divisor of divup isn't positive.");
			
 
				+  }
			
 
				+
			
 
				+  return (a + b - 1) / b;
			
 
				+}
			
 
				+
			
 
				+}  // namespace autoware::lidar_transfusion
			
 
				+
			
 
				+#endif  // AUTOWARE__LIDAR_TRANSFUSION__UTILS_HPP_