
first commit label_sign_detection

liyupeng, 5 months ago
commit 54dcac5a93

+ 114 - 0
src/detection/label_sign_detection/README.md

@@ -0,0 +1,114 @@
+## Bilibili Tutorial Video
+
+https://www.bilibili.com/video/BV1Pa4y1N7HS
+
+## Introduction
+
+- Accelerates **YOLOv8** with **TensorRT**; this project uses the **ONNX-to-TensorRT** conversion approach
+- Supports **Windows 10** and **Linux**
+- Supports **Python/C++**
+
+## YOLOv8
+
+<div align="center">
+<img src="assets/1.png" width=800>
+</div>
+
+## Environment
+
+- **TensorRT 8.4.3**
+- **CUDA 11.6, cuDNN 8.4.1**
+- **onnx 1.12.0**
+
+## Quick Start
+
+Install the **yolov8** package and download the official model.
+
+```
+pip install ultralytics==8.0.5
+pip install onnx==1.12.0
+# download official weights (".pt" file)
+https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8n.pt
+```
+
+Export the **ONNX model** with the official command.
+
+```
+yolo mode=export model=yolov8n.pt format=onnx dynamic=False
+```
+
+Convert the official ONNX model with this repository's **v8_transform.py**; yolov8n.transd.onnx is generated automatically.
+
+```
+python v8_transform.py yolov8n.onnx
+```
+
+Copy the generated **ONNX** model into the **tensorrt/bin** folder and convert it with the official **trtexec** tool. **For FP32 inference, simply remove the `--fp16` flag.**
+
+```
+trtexec --onnx=yolov8n.transd.onnx --saveEngine=yolov8n_fp16.trt --fp16
+```
+
+## C++
+
+Set up the **OpenCV** and **TensorRT** environments; for details see https://github.com/Monday-Leo/Yolov5_Tensorrt_Win10
+
+Open this repository's **CMakeLists.txt**, change the **OpenCV** and **TensorRT** paths to your own, then run cmake.
+
+```
+# change to your own paths
+##################################################
+set(OpenCV_DIR "E:/opencv/build")  
+set(TRT_DIR "E:/TensorRT-8.4.3.1")  
+##################################################
+```
+
+<div align="center">
+<img src="assets/2.png" width=600>
+</div>
+
+Put the **test image zidane.jpg and the model yolov8n_fp16.trt into the exe folder** and run the program directly. **No warmup inference is performed, so the first inference time is not accurate**; modify the code to add a warmup pass if you need precise timing. **To change the model path or image path, edit the main program.**
+
+```
+int main() {
+	std::string img_path = "zidane.jpg";
+	std::string model_path = "yolov8n_fp16.trt";
+	single_inference(img_path,model_path);
+	return 0;
+}
+```
+
+<div align="center">
+<img src="assets/3.jpg" width=600>
+</div>
+
+## Python
+
+In the C++ project from the previous step, right-click the yolov8 target, open Properties, and change the configuration type to a **dynamic link library (DLL)**.
+
+<div align="center">
+<img src="assets/4.png" width=600>
+</div>
+
+Copy this repository's **python_trt.py** into the dll folder.
+
+<div align="center">
+<img src="assets/5.png" width=600>
+</div>
+
+Set the model path, the **dll** path, and the path of the image to predict. Note in particular that the model **path must be passed as a bytes literal (b'')**.
+
+```
+det = Detector(model_path=b"./yolov8n_fp16.trt",dll_path="./yolov8.dll")  # b'' is needed
+img = cv2.imread("./zidane.jpg")
+```
+
+<div align="center">
+<img src="assets/6.png" width=600>
+</div>
+
+## Reference
+
+https://github.com/ultralytics/ultralytics
+
+https://github.com/shouxieai/infer

+ 36 - 0
src/detection/label_sign_detection/include/Hungarian.h

@@ -0,0 +1,36 @@
+//
+// Created by lqx on 20-4-23.
+//
+
+#ifndef TRACK_SORT_HUNGARIAN_H_H
+#define TRACK_SORT_HUNGARIAN_H_H
+
+#include <iostream>
+#include <vector>
+
+using namespace std;
+
+class HungarianAlgorithm
+{
+public:
+    HungarianAlgorithm();
+    ~HungarianAlgorithm();
+    double Solve(vector<vector<double>>& DistMatrix, vector<int>& Assignment);
+
+private:
+    void assignmentoptimal(int *assignment, double *cost, double *distMatrix, int nOfRows, int nOfColumns);
+    void buildassignmentvector(int *assignment, bool *starMatrix, int nOfRows, int nOfColumns);
+    void computeassignmentcost(int *assignment, double *cost, double *distMatrix, int nOfRows);
+    void step2a(int *assignment, double *distMatrix, bool *starMatrix, bool *newStarMatrix, bool *primeMatrix,
+                bool *coveredColumns, bool *coveredRows, int nOfRows, int nOfColumns, int minDim);
+    void step2b(int *assignment, double *distMatrix, bool *starMatrix, bool *newStarMatrix, bool *primeMatrix,
+                bool *coveredColumns, bool *coveredRows, int nOfRows, int nOfColumns, int minDim);
+    void step3(int *assignment, double *distMatrix, bool *starMatrix, bool *newStarMatrix, bool *primeMatrix,
+               bool *coveredColumns, bool *coveredRows, int nOfRows, int nOfColumns, int minDim);
+    void step4(int *assignment, double *distMatrix, bool *starMatrix, bool *newStarMatrix, bool *primeMatrix,
+               bool *coveredColumns, bool *coveredRows, int nOfRows, int nOfColumns, int minDim, int row, int col);
+    void step5(int *assignment, double *distMatrix, bool *starMatrix, bool *newStarMatrix, bool *primeMatrix,
+               bool *coveredColumns, bool *coveredRows, int nOfRows, int nOfColumns, int minDim);
+};
+
+#endif //TRACK_SORT_HUNGARIAN_H_H
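
A minimal usage sketch for `HungarianAlgorithm::Solve` (not part of the committed files; the cost values are made up for illustration). Rows are existing tracks, columns are new detections, and each entry is a matching cost such as `1 - IoU`:

```
#include <iostream>
#include <vector>
#include "Hungarian.h"

int main() {
    // rows = tracks, cols = detections; entries are matching costs (e.g. 1 - IoU)
    std::vector<std::vector<double>> cost = {
        {0.1, 0.9, 0.8},
        {0.7, 0.2, 0.6}
    };

    HungarianAlgorithm solver;
    std::vector<int> assignment;                 // assignment[row] = matched column, or -1
    double total = solver.Solve(cost, assignment);

    for (size_t row = 0; row < assignment.size(); ++row)
        std::cout << "track " << row << " -> detection " << assignment[row] << std::endl;
    std::cout << "total cost: " << total << std::endl;
    return 0;
}
```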

+ 90 - 0
src/detection/label_sign_detection/include/KalmanTracker.h

@@ -0,0 +1,90 @@
+//
+// Created by lqx on 20-4-23.
+//
+
+#ifndef TRACK_SORT_KALMANTRACKER_H
+#define TRACK_SORT_KALMANTRACKER_H
+
+///////////////////////////////////////////////////////////////////////////////
+// KalmanTracker.h: KalmanTracker Class Declaration
+
+#include "opencv2/video/tracking.hpp"
+#include "opencv2/highgui/highgui.hpp"
+
+using namespace std;
+using namespace cv;
+
+#define StateType Rect_<float>
+
+
+// This class represents the internal state of an individual tracked object, observed as a bounding box.
+class KalmanTracker
+{
+public:
+    KalmanTracker()
+    {
+        init_kf(StateType());
+        m_time_since_update = 0;
+        m_hits = 0;
+        m_hit_streak = 0;
+        m_age = 0;
+        m_id = kf_count;
+        //kf_count++;
+    }
+    KalmanTracker(StateType initRect)
+    {
+        init_kf(initRect);
+        m_time_since_update = 0;
+        m_hits = 0;
+        m_hit_streak = 0;
+        m_age = 0;
+        m_id = kf_count;
+        //kf_count++;
+    }
+
+    KalmanTracker(StateType initRect, int classId,float prob)
+    {
+        init_kf(initRect);
+        m_time_since_update = 0;
+        m_hits = 0;
+        m_hit_streak = 0;
+        m_age = 0;
+        m_id = kf_count;
+        //kf_count++;
+        m_class_id = classId;
+        m_prob = prob;
+    }
+
+    ~KalmanTracker()
+    {
+        m_history.clear();
+        m_class_history.clear();
+    }
+
+    StateType predict();
+    void update(StateType stateMat,int classId, float prob);
+
+    StateType get_state();
+    StateType get_rect_xysr(float cx, float cy, float s, float r);
+
+    static int kf_count;
+
+    int m_time_since_update;
+    int m_hits;
+    int m_hit_streak;
+    int m_age;
+    int m_id;
+    int m_class_id;
+    std::vector<int> m_class_history;
+    float m_prob;
+
+private:
+    void init_kf(StateType stateMat);
+
+    cv::KalmanFilter kf;
+    cv::Mat measurement;
+
+    std::vector<StateType> m_history;
+};
+
+#endif //TRACK_SORT_KALMANTRACKER_H
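
A rough sketch of the predict/update cycle a single `KalmanTracker` goes through (in this commit the cycle is driven by `od::TrackObstacle`); the box coordinates below are placeholders:

```
#include "KalmanTracker.h"

// hypothetical single-track example: create a tracker from the first detection,
// then predict and correct it on the next frame
void track_one_box() {
    KalmanTracker tracker(Rect_<float>(100.f, 50.f, 40.f, 40.f), /*classId=*/1, /*prob=*/0.9f);

    // next frame: propagate the box forward in time
    Rect_<float> predicted = tracker.predict();

    // associate a new detection (e.g. via Hungarian matching) and correct the filter
    Rect_<float> detection(104.f, 52.f, 40.f, 40.f);
    tracker.update(detection, /*classId=*/1, /*prob=*/0.85f);

    Rect_<float> current = tracker.get_state();
    (void)predicted; (void)current;
}
```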

+ 153 - 0
src/detection/label_sign_detection/include/cpm.hpp

@@ -0,0 +1,153 @@
+#ifndef __CPM_HPP__
+#define __CPM_HPP__
+
+// Consumer-Producer Model
+
+#include <algorithm>
+#include <condition_variable>
+#include <future>
+#include <memory>
+#include <queue>
+#include <thread>
+
+namespace cpm {
+
+template <typename Result, typename Input, typename Model>
+class Instance {
+ protected:
+  struct Item {
+    Input input;
+    std::shared_ptr<std::promise<Result>> pro;
+  };
+
+  std::condition_variable cond_;
+  std::queue<Item> input_queue_;
+  std::mutex queue_lock_;
+  std::shared_ptr<std::thread> worker_;
+  volatile bool run_ = false;
+  volatile int max_items_processed_ = 0;
+  void *stream_ = nullptr;
+
+ public:
+  virtual ~Instance() { stop(); }
+
+  void stop() {
+    run_ = false;
+    cond_.notify_one();
+    {
+      std::unique_lock<std::mutex> l(queue_lock_);
+      while (!input_queue_.empty()) {
+        auto &item = input_queue_.front();
+        if (item.pro) item.pro->set_value(Result());
+        input_queue_.pop();
+      }
+    };
+
+    if (worker_) {
+      worker_->join();
+      worker_.reset();
+    }
+  }
+
+  virtual std::shared_future<Result> commit(const Input &input) {
+    Item item;
+    item.input = input;
+    item.pro.reset(new std::promise<Result>());
+    {
+      std::unique_lock<std::mutex> __lock_(queue_lock_);
+      input_queue_.push(item);
+    }
+    cond_.notify_one();
+    return item.pro->get_future();
+  }
+
+  virtual std::vector<std::shared_future<Result>> commits(const std::vector<Input> &inputs) {
+    std::vector<std::shared_future<Result>> output;
+    {
+      std::unique_lock<std::mutex> __lock_(queue_lock_);
+      for (int i = 0; i < (int)inputs.size(); ++i) {
+        Item item;
+        item.input = inputs[i];
+        item.pro.reset(new std::promise<Result>());
+        output.emplace_back(item.pro->get_future());
+        input_queue_.push(item);
+      }
+    }
+    cond_.notify_one();
+    return output;
+  }
+
+  template <typename LoadMethod>
+  bool start(const LoadMethod &loadmethod, int max_items_processed = 1, void *stream = nullptr) {
+    stop();
+
+    this->stream_ = stream;
+    this->max_items_processed_ = max_items_processed;
+    std::promise<bool> status;
+    worker_ = std::make_shared<std::thread>(&Instance::worker<LoadMethod>, this,
+                                            std::ref(loadmethod), std::ref(status));
+    return status.get_future().get();
+  }
+
+ private:
+  template <typename LoadMethod>
+  void worker(const LoadMethod &loadmethod, std::promise<bool> &status) {
+    std::shared_ptr<Model> model = loadmethod();
+    if (model == nullptr) {
+      status.set_value(false);
+      return;
+    }
+
+    run_ = true;
+    status.set_value(true);
+
+    std::vector<Item> fetch_items;
+    std::vector<Input> inputs;
+    while (get_items_and_wait(fetch_items, max_items_processed_)) {
+      inputs.resize(fetch_items.size());
+      std::transform(fetch_items.begin(), fetch_items.end(), inputs.begin(),
+                     [](Item &item) { return item.input; });
+
+      auto ret = model->forwards(inputs, stream_);
+      for (int i = 0; i < (int)fetch_items.size(); ++i) {
+        if (i < (int)ret.size()) {
+          fetch_items[i].pro->set_value(ret[i]);
+        } else {
+          fetch_items[i].pro->set_value(Result());
+        }
+      }
+      inputs.clear();
+      fetch_items.clear();
+    }
+    model.reset();
+    run_ = false;
+  }
+
+  virtual bool get_items_and_wait(std::vector<Item> &fetch_items, int max_size) {
+    std::unique_lock<std::mutex> l(queue_lock_);
+    cond_.wait(l, [&]() { return !run_ || !input_queue_.empty(); });
+
+    if (!run_) return false;
+
+    fetch_items.clear();
+    for (int i = 0; i < max_size && !input_queue_.empty(); ++i) {
+      fetch_items.emplace_back(std::move(input_queue_.front()));
+      input_queue_.pop();
+    }
+    return true;
+  }
+
+  virtual bool get_item_and_wait(Item &fetch_item) {
+    std::unique_lock<std::mutex> l(queue_lock_);
+    cond_.wait(l, [&]() { return !run_ || !input_queue_.empty(); });
+
+    if (!run_) return false;
+
+    fetch_item = std::move(input_queue_.front());
+    input_queue_.pop();
+    return true;
+  }
+};
+};  // namespace cpm
+
+#endif  // __CPM_HPP__
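
A sketch of how this consumer-producer wrapper is typically paired with the `yolo::Infer` interface added later in this commit: one worker thread owns the engine and batches queued inputs, while producers simply commit images and wait on futures. The engine file name is illustrative:

```
#include <iostream>
#include <opencv2/opencv.hpp>
#include "cpm.hpp"
#include "yolo.hpp"

int main() {
    // Result = BoxArray, Input = Image, Model = yolo::Infer
    cpm::Instance<yolo::BoxArray, yolo::Image, yolo::Infer> engine;

    // the engine is loaded inside the worker thread; up to 4 queued items per batch
    bool ok = engine.start([] {
        return yolo::load("yolov8n_fp16.trt", yolo::Type::V8);
    }, /*max_items_processed=*/4);
    if (!ok) return -1;

    cv::Mat frame = cv::imread("zidane.jpg");
    // commit returns a future; the worker batches pending inputs and calls forwards()
    auto fut = engine.commit(yolo::Image(frame.data, frame.cols, frame.rows));
    yolo::BoxArray boxes = fut.get();

    for (auto &box : boxes)
        std::cout << box.class_label << " " << box.confidence << std::endl;
    return 0;
}
```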

+ 54 - 0
src/detection/label_sign_detection/include/detect_obstacle.h

@@ -0,0 +1,54 @@
+#ifndef DETECT_TURNSTILE_H
+#define DETECT_TURNSTILE_H
+
+#include "Hungarian.h"
+#include "KalmanTracker.h"
+
+namespace od{
+
+const int COLOR_MAP[3][3]={{0, 0, 255},{0, 255, 0},{0,255,255}};
+const int max_age = 3;
+const int min_hits = 3;
+const double iouThreshold = 0.5;
+
+struct bbox_t {
+    unsigned int x, y, w, h;       // (x,y) - top-left corner, (w, h) - width & height of bounding box
+    float prob;                    // confidence - probability that the object was found correctly
+    unsigned int obj_id;           // class of object - from range [0, classes-1]
+    unsigned int track_id;         // tracking id for video (0 - untracked, 1 - inf - tracked object)
+    unsigned int frames_counter;   // counter of frames on which the object was detected
+    float x_3d, y_3d, z_3d;        // center of object (in Meters) if ZED 3D Camera is used
+};
+
+typedef struct TrackingBox
+{
+    int frame;
+    int id;
+    int class_id;
+    float prob;
+    Rect_<float> box;
+    vector<int> class_history;
+}TrackingBox;
+
+// yolo data to DetectBox
+typedef struct DetectBox
+{
+    int class_id;
+    float prob;
+    Rect_<float> box;
+}DetectBox;
+
+//Computes IOU between two bounding boxes
+double GetIOU(Rect_<float> bb_test, Rect_<float> bb_gt);
+// draw the detection box and related info
+void DrawBoxes(Mat &frame, vector<string> classes, int classId, int turnstileId,float conf, int left, int top, int right, int bottom);
+
+// draw detection results on an image
+void Drawer(Mat &frame, vector<bbox_t> outs, vector<string> classes);
+// draw detection results for a video (tracked objects)
+void Drawer(Mat &frame, vector<od::TrackingBox> &track_result, vector<string> &classes);
+//tracking obstacle
+bool TrackObstacle(int frame_count,vector<KalmanTracker> &trackers,vector<bbox_t> &outs,vector<od::TrackingBox> &track_result);
+}
+
+#endif // DETECT_TURNSTILE_H
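
A condensed sketch of the per-frame tracking step these declarations support (main.cpp in this commit contains the full version): detections converted to `od::bbox_t` go in, smoothed `od::TrackingBox` results come out:

```
#include <iostream>
#include <vector>
#include "detect_obstacle.h"

// hypothetical per-frame tracking step; detections come from the detector
void track_frame(int frame_count,
                 std::vector<KalmanTracker> &trackers,
                 std::vector<od::bbox_t> &detections)
{
    std::vector<od::TrackingBox> track_result;
    if (od::TrackObstacle(frame_count, trackers, detections, track_result)) {
        for (const auto &t : track_result)
            std::cout << "id " << t.id << " class " << t.class_id
                      << " prob " << t.prob << std::endl;
    }
}
```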

+ 82 - 0
src/detection/label_sign_detection/include/imageBuffer.h

@@ -0,0 +1,82 @@
+#ifndef IMAGEBUFFER_H
+#define IMAGEBUFFER_H
+
+#include <opencv2/opencv.hpp>
+#include <mutex>
+#include <condition_variable>
+#include <queue>
+template<typename T>
+class ConsumerProducerQueue
+{
+
+public:
+    ConsumerProducerQueue(int mxsz,bool dropFrame) :
+            maxSize(mxsz),dropFrame(dropFrame)
+    { }
+
+    bool add(T request)
+    {
+        std::unique_lock<std::mutex> lock(mutex);
+        if(dropFrame && isFull())
+        {
+            //lock.unlock();
+            //return false;
+            cpq.pop();
+            cpq.push(request);
+            cond.notify_all();
+            return true;
+        }
+        else {
+            cond.wait(lock, [this]() { return !isFull(); });
+            cpq.push(request);
+            //lock.unlock();
+            cond.notify_all();
+            return true;
+        }
+    }
+    void consume(T &request)
+    {
+        std::unique_lock<std::mutex> lock(mutex);
+        cond.wait(lock, [this]()
+        { return !isEmpty(); });
+        request = cpq.front();
+        cpq.pop();
+        //lock.unlock();
+        cond.notify_all();
+
+    }
+
+    bool isFull() const
+    {
+        return cpq.size() >= maxSize;
+    }
+
+    bool isEmpty() const
+    {
+        return cpq.size() == 0;
+    }
+
+    int length() const
+    {
+        return cpq.size();
+    }
+
+    void clear()
+    {
+        std::unique_lock<std::mutex> lock(mutex);
+        while (!isEmpty())
+        {
+            cpq.pop();
+        }
+        lock.unlock();
+        cond.notify_all();
+    }
+
+private:
+    std::condition_variable cond;  // condition variable used to synchronize producer and consumer threads
+    std::mutex mutex;     // mutex protecting the queue from data races
+    std::queue<T> cpq;    // underlying FIFO queue
+    int maxSize;
+    bool dropFrame;
+};
+#endif // IMAGEBUFFER_H
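
A minimal producer/consumer sketch with this queue (the same pattern main.cpp uses: a capture thread adds frames, the inference loop consumes them); the camera index and frame count are illustrative:

```
#include <thread>
#include <opencv2/opencv.hpp>
#include "imageBuffer.h"

int main() {
    // buffer at most 3 frames; drop the oldest frame when full
    ConsumerProducerQueue<cv::Mat> buffer(3, true);

    std::thread producer([&buffer] {
        cv::VideoCapture cap(0);          // illustrative camera index
        cv::Mat frame;
        while (cap.read(frame))
            buffer.add(frame.clone());    // clone so the queued Mat owns its data
    });

    for (int i = 0; i < 100; ++i) {       // consume a fixed number of frames for the sketch
        cv::Mat frame;
        buffer.consume(frame);            // blocks until a frame is available
        cv::imshow("frame", frame);
        cv::waitKey(1);
    }
    producer.detach();
    return 0;
}
```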

+ 98 - 0
src/detection/label_sign_detection/include/infer.hpp

@@ -0,0 +1,98 @@
+#ifndef __INFER_HPP__
+#define __INFER_HPP__
+
+#include <initializer_list>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace trt {
+
+#define INFO(...) trt::__log_func(__FILE__, __LINE__, __VA_ARGS__)
+void __log_func(const char *file, int line, const char *fmt, ...);
+
+enum class DType : int { FLOAT = 0, HALF = 1, INT8 = 2, INT32 = 3, BOOL = 4, UINT8 = 5 };
+
+class Timer {
+ public:
+  Timer();
+  virtual ~Timer();
+  void start(void *stream = nullptr);
+  float stop(const char *prefix = "Timer", bool print = true);
+
+ private:
+  void *start_, *stop_;
+  void *stream_;
+};
+
+class BaseMemory {
+ public:
+  BaseMemory() = default;
+  BaseMemory(void *cpu, size_t cpu_bytes, void *gpu, size_t gpu_bytes);
+  virtual ~BaseMemory();
+  virtual void *gpu_realloc(size_t bytes);
+  virtual void *cpu_realloc(size_t bytes);
+  void release_gpu();
+  void release_cpu();
+  void release();
+  inline bool owner_gpu() const { return owner_gpu_; }
+  inline bool owner_cpu() const { return owner_cpu_; }
+  inline size_t cpu_bytes() const { return cpu_bytes_; }
+  inline size_t gpu_bytes() const { return gpu_bytes_; }
+  virtual inline void *get_gpu() const { return gpu_; }
+  virtual inline void *get_cpu() const { return cpu_; }
+  void reference(void *cpu, size_t cpu_bytes, void *gpu, size_t gpu_bytes);
+
+ protected:
+  void *cpu_ = nullptr;
+  size_t cpu_bytes_ = 0, cpu_capacity_ = 0;
+  bool owner_cpu_ = true;
+
+  void *gpu_ = nullptr;
+  size_t gpu_bytes_ = 0, gpu_capacity_ = 0;
+  bool owner_gpu_ = true;
+};
+
+template <typename _DT>
+class Memory : public BaseMemory {
+ public:
+  Memory() = default;
+  Memory(const Memory &other) = delete;
+  Memory &operator=(const Memory &other) = delete;
+  virtual _DT *gpu(size_t size) { return (_DT *)BaseMemory::gpu_realloc(size * sizeof(_DT)); }
+  virtual _DT *cpu(size_t size) { return (_DT *)BaseMemory::cpu_realloc(size * sizeof(_DT)); }
+
+  inline size_t cpu_size() const { return cpu_bytes_ / sizeof(_DT); }
+  inline size_t gpu_size() const { return gpu_bytes_ / sizeof(_DT); }
+
+  virtual inline _DT *gpu() const { return (_DT *)gpu_; }
+  virtual inline _DT *cpu() const { return (_DT *)cpu_; }
+};
+
+class Infer {
+ public:
+  virtual bool forward(const std::vector<void *> &bindings, void *stream = nullptr,
+                       void *input_consum_event = nullptr) = 0;
+  virtual int index(const std::string &name) = 0;
+  virtual std::vector<int> run_dims(const std::string &name) = 0;
+  virtual std::vector<int> run_dims(int ibinding) = 0;
+  virtual std::vector<int> static_dims(const std::string &name) = 0;
+  virtual std::vector<int> static_dims(int ibinding) = 0;
+  virtual int numel(const std::string &name) = 0;
+  virtual int numel(int ibinding) = 0;
+  virtual int num_bindings() = 0;
+  virtual bool is_input(int ibinding) = 0;
+  virtual bool set_run_dims(const std::string &name, const std::vector<int> &dims) = 0;
+  virtual bool set_run_dims(int ibinding, const std::vector<int> &dims) = 0;
+  virtual DType dtype(const std::string &name) = 0;
+  virtual DType dtype(int ibinding) = 0;
+  virtual bool has_dynamic_dim() = 0;
+  virtual void print() = 0;
+};
+
+std::shared_ptr<Infer> load(const std::string &file);
+std::string format_shape(const std::vector<int> &shape);
+
+}  // namespace trt
+
+#endif  // __INFER_HPP__
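
A sketch of the low-level `trt::Infer` interface on its own (the yolo wrapper in this commit builds on top of it): load a serialized engine and inspect its bindings. The engine path is illustrative:

```
#include <iostream>
#include "infer.hpp"

int main() {
    // load a serialized TensorRT engine (path is illustrative)
    auto engine = trt::load("yolov8n_fp16.trt");
    if (engine == nullptr) {
        std::cout << "failed to load engine" << std::endl;
        return -1;
    }

    engine->print();  // dump binding names, shapes and dtypes
    for (int i = 0; i < engine->num_bindings(); ++i) {
        std::cout << (engine->is_input(i) ? "input  " : "output ")
                  << trt::format_shape(engine->static_dims(i)) << std::endl;
    }
    return 0;
}
```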

+ 75 - 0
src/detection/label_sign_detection/include/yolo.hpp

@@ -0,0 +1,75 @@
+#ifndef __YOLO_HPP__
+#define __YOLO_HPP__
+
+#include <future>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace yolo {
+
+enum class Type : int {
+  V5 = 0,
+  X = 1,
+  V3 = 2,
+  V7 = 3,
+  V8 = 5,
+  V8Seg = 6  // yolov8 instance segmentation
+};
+
+struct InstanceSegmentMap {
+  int width = 0, height = 0;      // width % 8 == 0
+  unsigned char *data = nullptr;  // is width * height memory
+
+  InstanceSegmentMap(int width, int height);
+  virtual ~InstanceSegmentMap();
+};
+
+struct Box {
+  float left, top, right, bottom, confidence;
+  int class_label;
+  std::shared_ptr<InstanceSegmentMap> seg;  // valid only in segment task
+
+  Box() = default;
+  Box(float left, float top, float right, float bottom, float confidence, int class_label)
+      : left(left),
+        top(top),
+        right(right),
+        bottom(bottom),
+        confidence(confidence),
+        class_label(class_label) {}
+};
+
+struct Image {
+  const void *bgrptr = nullptr;
+  int width = 0, height = 0;
+
+  Image() = default;
+  Image(const void *bgrptr, int width, int height) : bgrptr(bgrptr), width(width), height(height) {}
+};
+
+typedef std::vector<Box> BoxArray;
+
+// [Preprocess]: 0.50736 ms
+// [Forward]: 3.96410 ms
+// [BoxDecode]: 0.12016 ms
+// [SegmentDecode]: 0.15610 ms
+class Infer {
+ public:
+  virtual BoxArray forward(const Image &image, void *stream = nullptr) = 0;
+  virtual std::vector<BoxArray> forwards(const std::vector<Image> &images,
+                                         void *stream = nullptr) = 0;
+};
+
+std::shared_ptr<Infer> load(const std::string &engine_file, Type type,
+                            float confidence_threshold = 0.25f, float nms_threshold = 0.5f);
+
+Infer *loadraw(const std::string &engine_file, Type type,
+               float confidence_threshold = 0.25f, float nms_threshold = 0.5f);
+
+const char *type_name(Type type);
+std::tuple<uint8_t, uint8_t, uint8_t> hsv2bgr(float h, float s, float v);
+std::tuple<uint8_t, uint8_t, uint8_t> random_color(int id);
+};  // namespace yolo
+
+#endif  // __YOLO_HPP__

+ 25 - 0
src/detection/label_sign_detection/label_sign_config/label_sign_config.yaml

@@ -0,0 +1,25 @@
+%YAML:1.0
+---
+imshow_flag: true
+ivlog_flag: true
+test_video:
+  open: false
+  video_path: /media/nvidia/Elements-lqx/saveVideoPy/1685957551.1323013.avi
+trackstate: true
+conf_thr: 0.4
+nms_thr : 0.4
+model:
+  onnx_path: ./label_sign_config/best.transd.onnx
+  engine_path: ./label_sign_config/best.transd.engine
+camera:
+  cropstate: true
+  crop_height: 
+    start: 0
+    end: 720
+  crop_width:
+    start: 280
+    end: 1000
+
+signlabels:
+  - "Speed5"
+  - "Park"

+ 134 - 0
src/detection/label_sign_detection/label_signdetect.pro

@@ -0,0 +1,134 @@
+QT -= gui
+
+CONFIG += c++11 console
+CONFIG -= app_bundle
+
+# The following define makes your compiler emit warnings if you use
+# any Qt feature that has been marked deprecated (the exact warnings
+# depend on your compiler). Please consult the documentation of the
+# deprecated API in order to know how to port your code away from it.
+DEFINES += QT_DEPRECATED_WARNINGS
+
+# You can also make your code fail to compile if it uses deprecated APIs.
+# In order to do so, uncomment the following line.
+# You can also select to disable deprecated APIs only up to a certain version of Qt.
+#DEFINES += QT_DISABLE_DEPRECATED_BEFORE=0x060000    # disables all the APIs deprecated before Qt 6.0.0
+
+# Default rules for deployment.
+qnx: target.path = /tmp/$${TARGET}/bin
+else: unix:!android: target.path = /opt/$${TARGET}/bin
+!isEmpty(target.path): INSTALLS += target
+
+
+
+SOURCES += main.cpp \
+           src/Hungarian.cpp \
+           src/KalmanTracker.cpp \
+           src/detect_obstacle.cpp\
+           $$PWD/../../include/msgtype/rawpic.pb.cc \
+           $$PWD/../../include/msgtype/cameraobjectarray.pb.cc \
+           $$PWD/../../include/msgtype/cameraobject.pb.cc
+
+
+
+INCLUDEPATH += include
+
+HEADERS += include/yolo.hpp \
+           include/infer.hpp \
+           include/cpm.hpp \
+           include/imageBuffer.h \
+           include/Hungarian.h \
+           include/KalmanTracker.h \
+           include/detect_obstacle.h
+
+
+CUDA_SOURCES += src/infer.cu \
+                src/yolo.cu
+
+DISTFILES +=  src/infer.cu \
+              src/yolo.cu
+
+LIBS += -L"/usr/local/lib" \
+        -lprotobuf
+
+# opencv
+INCLUDEPATH += /usr/include/opencv4
+LIBS += /usr/lib/aarch64-linux-gnu/libopencv_*.so
+
+
+# tensorrt
+INCLUDEPATH += /usr/src/tensorrt/include \
+               /usr/include/aarch64-linux-gnu \
+               /usr/local/cuda/include \
+               /usr/src/tensorrt/samples/common \
+               /usr/local/include
+
+LIBS += /usr/lib/aarch64-linux-gnu/libnvparsers.so \
+        /usr/lib/aarch64-linux-gnu/libnvinfer.so \
+        /usr/lib/aarch64-linux-gnu/libnvonnxparser.so \
+        /usr/lib/aarch64-linux-gnu/libnvinfer_plugin.so
+
+# c++
+LIBS += -L/usr/lib/aarch64-linux-gnu -lstdc++fs
+# cuda
+CUDA_SDK = "/usr/local/cuda"   # Path to cuda SDK install
+CUDA_DIR = "/usr/local/cuda"            # Path to cuda toolkit install
+
+
+INCLUDEPATH += ../../../include
+
+
+#LIBS +=  $$PWD/../../../bin/libxmlparam.so \
+#         $$PWD/../../../bin/libivlog.so \
+#         $$PWD/../../../bin/libivfault.so \
+#         $$PWD/../../../bin/libmodulecomm.so
+
+!include(../../../include/common.pri ) {
+    error( "Couldn't find the common.pri file!" )
+}
+
+!include(../../../include/ivprotobuf.pri ) {
+    error( "Couldn't find the ivprotobuf.pri file!" )
+}
+!include(../../../include/ivopencv.pri ) {
+    error( "Couldn't find the ivopencv.pri file!" )
+}
+
+INCLUDEPATH += $$PWD/../../include/msgtype
+LIBS += -L/usr/lib/aarch64-linux-gnu/ -lglog
+
+##### system type and compute capability ###########
+SYSTEM_NAME = linux         # Depending on your system either 'Win32', 'x64', or 'Win64'
+SYSTEM_TYPE = 64            # '32' or '64', depending on your system
+CUDA_ARCH = sm_72           # Type of CUDA architecture, for example 'compute_10', 'compute_11', 'sm_10'
+NVCC_OPTIONS = --use_fast_math
+
+INCLUDEPATH += $$CUDA_DIR/include
+QMAKE_LIBDIR += $$CUDA_DIR/lib64/
+
+CUDA_OBJECTS_DIR = ./
+
+# Add the necessary libraries
+CUDA_LIBS = -lcuda -lcudart #-lcublas
+
+# The following makes sure all path names (which often include spaces) are put between quotation marks
+CUDA_INC = $$join(INCLUDEPATH,'" -I"','-I"','"')
+LIBS += $$CUDA_LIBS
+
+# Configuration of the Cuda compiler
+CONFIG(debug, debug|release) {
+    # Debug mode
+    cuda_d.input = CUDA_SOURCES
+    cuda_d.output = $$CUDA_OBJECTS_DIR/${QMAKE_FILE_BASE}.o
+    cuda_d.commands = $$CUDA_DIR/bin/nvcc -D_DEBUG $$NVCC_OPTIONS $$CUDA_INC $$NVCC_LIBS --machine $$SYSTEM_TYPE -arch=$$CUDA_ARCH -c -o ${QMAKE_FILE_OUT} ${QMAKE_FILE_NAME}
+    cuda_d.dependency_type = TYPE_C
+    QMAKE_EXTRA_COMPILERS += cuda_d
+}
+else {
+    # Release mode
+    cuda.input = CUDA_SOURCES
+    cuda.output = $$CUDA_OBJECTS_DIR/${QMAKE_FILE_BASE}.o
+    cuda.commands = $$CUDA_DIR/bin/nvcc $$NVCC_OPTIONS $$CUDA_INC $$NVCC_LIBS --machine $$SYSTEM_TYPE -arch=$$CUDA_ARCH -c -o ${QMAKE_FILE_OUT} ${QMAKE_FILE_NAME}
+    cuda.dependency_type = TYPE_C
+    QMAKE_EXTRA_COMPILERS += cuda
+}

+ 553 - 0
src/detection/label_sign_detection/main.cpp

@@ -0,0 +1,553 @@
+#include "NvInfer.h"
+#include "cuda_runtime_api.h"
+#include <fstream>
+#include <iostream>
+#include <opencv2/opencv.hpp>
+#include "cpm.hpp"
+#include "infer.hpp"
+#include "yolo.hpp"
+
+#include <QCoreApplication>
+#include "modulecomm.h"
+#include "xmlparam.h"
+#include "ivfault.h"
+#include "ivlog.h"
+#include "ivexit.h"
+#include "ivversion.h"
+#include "rawpic.pb.h"
+#include "cameraobject.pb.h"
+#include "cameraobjectarray.pb.h"
+#include <thread>
+#include "qmutex.h"
+#include "condition_variable"
+#include "imageBuffer.h"
+#include "detect_obstacle.h"
+// ONNX parser header, used to build the engine from an ONNX model
+#include "NvOnnxParser.h"
+
+
+using namespace nvonnxparser;
+using namespace nvinfer1;
+using namespace std;
+
+static const char *cocolabels[] = {"Speed10","Speed5","Left","Right","Nopassing","Park"};
+
+std::vector<std::string> signlabels;
+string config_file = "./label_sign_config/label_sign_config.yaml";
+string onnx_path = "./label_sign_config/sign_best.onnx";
+string engine_path = "./label_sign_config/sign_best.engine"; // engine paths
+
+bool imshow_flag = true;
+bool ivlog_flag = false;
+
+
+float conf_thr = 0.5;
+float nms_thr = 0.4;
+
+bool calibrationstate = false;
+string calibration_yamlpath = "./yaml/camera_middle_640_360.yaml";
+
+bool cropstate = true;
+cv::Range crop_height = cv::Range(0,720);
+cv::Range crop_width = cv::Range(280,1000);
+
+
+bool test_video = false;
+//string video_path = "v4l2src device=/dev/video0 ! video/x-raw, width=(int)1280, height=(int)720 ! videoconvert ! appsink";
+string video_path = "/home/nvidia/code/modularization/src/detection/trafficlight_detection/data/camera_test3.mp4";
+
+// whether tracking is enabled
+bool trackstate = true;
+
+// camera input
+void * gpcamera;
+string cameraname="image00";
+ConsumerProducerQueue<cv::Mat> * imageBuffer =  new ConsumerProducerQueue<cv::Mat>(3,true);
+
+// shared memory channels
+void * gpdetect;
+string detectname = "signarray";  // detection results
+
+void * gpresultimg;
+string resultimgname = "signimage"; // result image with detections drawn
+
+iv::Ivfault *gfault = nullptr;
+iv::Ivlog *givlog = nullptr;
+
+cv::Mat camera_matrix,dist_coe,map1,map2;  // calibration parameters
+
+// read video frames in a separate thread
+void ReadFunc(int n)
+{
+    cv::VideoCapture cap(video_path);
+    if(!cap.isOpened())
+    {
+        cout<<"camera failed to open"<<endl;
+    }
+    while(1)
+    {
+        cv::Mat frame;
+        // needed when reading from a video file; remove when reading from a camera
+        if(imageBuffer->isFull())
+        {
+            continue;
+        }
+        if(cap.read(frame))
+        {
+            if(calibrationstate)
+                cv::remap(frame,frame,map1,map2,cv::INTER_LINEAR,cv::BORDER_CONSTANT);
+            if(cropstate)
+                frame = frame(crop_height,crop_width);
+            imageBuffer->add(frame);
+        }
+        else
+        {
+            std::this_thread::sleep_for(std::chrono::milliseconds(1));
+        }
+    }
+}
+void Listenpic(const char * strdata,const unsigned int nSize,const unsigned int index,const QDateTime * dt,const char * strmemname)
+{
+    if(nSize<1000)return;
+    iv::vision::rawpic pic;
+    if(false == pic.ParseFromArray(strdata,nSize))
+    {
+        std::cout<<"picview Listenpic fail."<<std::endl;
+        return;
+    }
+    cv::Mat mat(pic.height(),pic.width(),pic.mattype());
+    if(pic.type() == 1)
+        memcpy(mat.data,pic.picdata().data(),mat.rows*mat.cols*mat.elemSize());
+    else
+    {
+        //     mat.release();
+        std::vector<unsigned char> buff(pic.picdata().data(),pic.picdata().data() + pic.picdata().size());
+        mat = cv::imdecode(buff,cv::IMREAD_COLOR);
+    }
+    if(calibrationstate)
+        cv::remap(mat,mat,map1,map2,cv::INTER_LINEAR,cv::BORDER_CONSTANT);
+    if(cropstate)
+        mat = mat(crop_height,crop_width);
+    imageBuffer->add(mat);
+    mat.release();
+}
+void shareSignMsg(vector<yolo::Box> results)
+{
+    iv::vision::cameraobjectarray sign_array;   // results to publish via shared memory
+    int index=0;
+    for (auto &result : results)
+    {
+        float width = result.right - result.left;
+        float height = result.bottom - result.top;
+        float center_x = result.left + 0.5*width;
+        float center_y = result.top + 0.5*height;
+        /*---------------protobuf----------------*/
+        iv::vision::cameraobject *object = sign_array.add_obj();
+        object->set_id(index);
+        auto type = signlabels[result.class_label];
+        object->set_type(type);
+        object->set_con(result.confidence);
+        object->set_w(width);
+        object->set_h(height);
+        object->set_x(result.left);
+        object->set_y(result.top);
+
+        //cout<<"sign id label: "<<index<<" "<<type<<endl;
+
+        if(ivlog_flag)
+            givlog->verbose("sign id label: %d %s",index,type.data());
+        index ++;
+    }
+    int size = sign_array.ByteSize();
+    char * strdata = new char[sign_array.ByteSize()];
+    if(sign_array.SerializeToArray(strdata, size))
+    {
+        iv::modulecomm::ModuleSendMsg(gpdetect, strdata, size);
+    }
+    else
+    {
+        std::cout<<"sign_array serialize error."<<std::endl;
+    }
+    sign_array.Clear();
+    delete[] strdata;
+
+    /*--------------------test ParseFromArray-------------------*/
+
+//    iv::vision::cameraobjectarray light_array1;
+//    light_array1.ParseFromArray(strdata,size);
+//    cout<<"parsefromarray:"<<std::endl;
+//    cout<<"light_size:"<<light_array1.obj_size()<<endl;
+//    for (int i=0;i<light_array1.obj_size();i++) {
+//        std::cout<<"index: "<<light_array1.obj(i).id()
+//                <<" type: "<<light_array1.obj(i).type()
+//                <<" lefttop: "<<light_array1.obj(i).x()
+//                <<" "<<light_array1.obj(i).y()
+//                <<" "<<light_array1.obj(i).w()
+//                <<" "<<light_array1.obj(i).h()<<std::endl;
+//    }
+//    delete strdata;
+
+
+}
+
+void SendResultImg(cv::Mat &resultimg, void* g_name)
+{
+    iv::vision::rawpic cameraPic;
+    //cameraPic.set_time(img_info.timestamp);
+    cameraPic.set_elemsize(resultimg.elemSize());
+    cameraPic.set_width(resultimg.cols);
+    cameraPic.set_height(resultimg.rows);
+    cameraPic.set_mattype(resultimg.type());
+    std::vector<int> param = std::vector<int>(2);
+    param[0] = cv::IMWRITE_JPEG_QUALITY;
+    param[1] = 95; // default(95) 0-100
+    std::vector<unsigned char> buff;
+    cv::imencode(".jpg",resultimg,buff,param);
+    cameraPic.set_picdata(buff.data(),buff.size());
+    buff.clear();
+    cameraPic.set_type(2);
+    std::string out_img = cameraPic.SerializeAsString();
+    iv::modulecomm::ModuleSendMsg(g_name,out_img.data(),out_img.length());
+}
+
+yolo::Image cvimg(const cv::Mat &image) { return yolo::Image(image.data, image.cols, image.rows); }
+
+
+//static Logger gLogger;
+// TensorRT logger
+class Logger : public ILogger
+{
+    void log(Severity severity, const char* msg) noexcept override
+    {
+        // suppress info-level messages
+        if (severity <= Severity::kWARNING)
+            std::cout << msg << std::endl;
+    }
+} gLogger;
+
+// Create the engine by parsing the ONNX model.
+ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config)
+{
+
+    INetworkDefinition* network = builder->createNetworkV2(1U); // important: 1U (explicit batch); 0U causes problems here
+
+    IParser* parser = createParser(*network, gLogger);
+    parser->parseFromFile(onnx_path.c_str(), static_cast<int32_t>(ILogger::Severity::kWARNING));
+    // report parsing errors, if any
+    for (int32_t i = 0; i < parser->getNbErrors(); ++i) { std::cout << parser->getError(i)->desc() << std::endl; }
+    //std::cout << "successfully parse the onnx model" << std::endl;
+
+    // Build engine
+    builder->setMaxBatchSize(maxBatchSize);
+    config->setMaxWorkspaceSize(1 << 20);
+    //config->setFlag(nvinfer1::BuilderFlag::kFP16); // enable reduced-precision (FP16) mode
+    //config->setFlag(nvinfer1::BuilderFlag::kINT8);
+    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
+    //std::cout << "successfully  convert onnx to  engine!!! " << std::endl;
+
+    // clean up
+    network->destroy();
+    //parser->destroy();
+
+    return engine;
+}
+
+void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream)
+{
+
+    // Create builder
+    IBuilder* builder = createInferBuilder(gLogger);
+    IBuilderConfig* config = builder->createBuilderConfig();
+
+    // Create model to populate the network, then set the outputs and create an engine
+    ICudaEngine* engine = createEngine(maxBatchSize, builder, config);
+
+    assert(engine != nullptr);
+
+
+
+    // Serialize the engine
+    (*modelStream) = engine->serialize();
+    // Close everything down
+    engine->destroy();
+    builder->destroy();
+    config->destroy();
+
+}
+
+int get_trtengine() {
+
+    IHostMemory* modelStream{ nullptr };
+    APIToModel(1, &modelStream);
+    assert(modelStream != nullptr);
+
+    std::ofstream p(engine_path, std::ios::binary);
+    if (!p)
+    {
+        std::cerr << "could not open plan output file" << std::endl;
+        return -1;
+    }
+    p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
+    modelStream->destroy();
+
+    return 0;
+
+}
+
+
+int main(int argc, char** argv)
+{
+    showversion("yolov8");
+
+    QCoreApplication a(argc, argv);
+
+    gfault = new iv::Ivfault("sign_detection");
+    givlog = new iv::Ivlog("sign_detection");
+    gfault->SetFaultState(0,0,"yolov8 initialize.");
+
+    cv::FileStorage config(config_file, cv::FileStorage::READ);
+    bool config_isOpened = config.isOpened();
+    //const char* onnx_path_;
+    if(config_isOpened)
+    {
+        onnx_path = string(config["model"]["onnx_path"]);
+        engine_path = string(config["model"]["engine_path"]);
+        imshow_flag = (string(config["imshow_flag"]) == "true");
+        ivlog_flag = (string(config["ivlog_flag"]) == "true");
+        test_video = (string(config["test_video"]["open"]) == "true");
+        video_path = string(config["test_video"]["video_path"]);
+        trackstate = (string(config["trackstate"]) == "true");
+        conf_thr = float(config["conf_thr"]);
+        nms_thr = float(config["nms_thr"]);
+        cropstate = (string(config["camera"]["cropstate"]) == "true");
+        crop_height = cv::Range(int(config["camera"]["crop_height"]["start"]),
+                int(config["camera"]["crop_height"]["end"]));
+
+        crop_width = cv::Range(int(config["camera"]["crop_width"]["start"]),
+                int(config["camera"]["crop_width"]["end"]));
+
+        cv::FileNode signarraynode = config["signlabels"];
+        if(signarraynode.type() == cv::FileNode::SEQ)
+        {
+            cv::FileNodeIterator it = signarraynode.begin(),it_end = signarraynode.end();
+            for (; it != it_end; it++)
+            {
+                std::string value = (std::string)*it;
+                signlabels.push_back(value);
+            }
+        }
+    }
+    else
+    {
+        std::cout<<"Error: Can't find the label_sign_config.yaml file"<<std::endl;
+        if(ivlog_flag)
+            givlog->verbose("Error: Can't find the label_sign_config.yaml file");
+        return 0;
+    }
+    config.release();
+    if(test_video)
+        std::thread * readthread = new std::thread(ReadFunc,1);
+    else
+        gpcamera= iv::modulecomm::RegisterRecv(&cameraname[0],Listenpic);
+
+    //================================== camera calib init ==========================
+    if (calibrationstate)
+    {
+        cv::FileStorage calib_file(calibration_yamlpath, cv::FileStorage::READ);
+        calib_file["cameraMatrix"]>>camera_matrix;
+        calib_file["distCoeffs"]>>dist_coe;
+        cv::Mat R = cv::Mat::eye(3, 3, CV_64F);
+        cv::Size imgsize=cv::Size(1280,720);
+        cv::initUndistortRectifyMap(camera_matrix, dist_coe, R, camera_matrix,imgsize,CV_16SC2,map1,map2);
+    }
+
+    //共享内存,检测结果
+    gpdetect = iv::modulecomm::RegisterSend(&detectname[0],10000,1);
+
+    //gpresultimg = iv::modulecomm::RegisterSend(&resultimgname[0],1000000,1);
+
+
+    // check whether the engine file exists; build it if not
+    std::ifstream file(engine_path, std::ios::binary);
+    if(!file)
+    {
+        cout<<engine_path<<" not found!"<<endl;
+        cout<<"Build engine to "<< engine_path <<endl;
+        cout<<"Please wait a few minutes !!!"<<endl;
+        if(ivlog_flag)
+            givlog->verbose("Build engine now,Please wait a few minutes !!!");
+        get_trtengine();
+        cout << "Build engine done !!!!"<<endl;
+        if(ivlog_flag)
+            givlog->verbose("Build engine done !!!!");
+    }
+    else file.close();
+
+    // initialize the model
+    auto yolo = yolo::load(engine_path, yolo::Type::V8, conf_thr, nms_thr);
+    if (yolo == nullptr)
+    {
+
+        std::cout <<"Error: load engine failed,yolo == nullptr"<<std::endl;
+        if(ivlog_flag)
+            givlog->verbose("Error: load engine failed,yolo == nullptr");
+        return 0;
+    }
+
+
+    vector<KalmanTracker> trackers_90;
+    KalmanTracker::kf_count = 0; // tracking id relies on this, so we have to reset it in each seq.
+    int frame_count = 0;
+    double waittime = (double)cv::getTickCount();
+    while (1)
+    {
+        if(imageBuffer->isEmpty())
+        {
+            double waittotal = (double)cv::getTickCount() - waittime;
+            double totaltime = waittotal/cv::getTickFrequency();
+            //                    if(totaltime>10.0)
+            //                    {
+            //                        cout<<"Cant't get frame and quit"<<endl;
+            //                        lightstart = false;
+            //                        cv::destroyAllWindows();
+            //                        std::cout<<"------end program------"<<std::endl;
+            //                        break;
+            //                    }
+            cout<<"Wait for frame "<<totaltime<<"s"<<endl;
+            if(ivlog_flag)
+                givlog->verbose("Wait for frame %f s",totaltime);
+            std::this_thread::sleep_for(std::chrono::milliseconds(10));
+            continue;
+        }        
+        auto start = std::chrono::system_clock::now();  // start timer
+        cv::Mat frame;
+        imageBuffer->consume(frame);
+        frame_count++;
+        cv::Mat image;
+        frame.copyTo(image);
+        auto objs = yolo->forward(cvimg(image));
+        auto end = std::chrono::system_clock::now();  // end timer
+
+//        std::cout <<"sign detection infer time: "<<
+//                    std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
+//                 << "ms" << std::endl;
+        //================================== track ==========================
+        std::vector<yolo::Box> objs_track;
+        if (trackstate)
+        {
+            auto start_track = std::chrono::system_clock::now();  // start timer
+            od::bbox_t bbox_t_90;   // converted to the tracking format
+            vector<od::bbox_t> outs_90;
+            for (auto &obj : objs)
+            {
+                //------------- validate the sign box size -----------
+                //------------------ the tracker uses the center point plus width/height --------------------
+                float width = obj.right - obj.left;
+                float height = obj.bottom - obj.top;
+                if(width<0 || height<0)
+                {
+                    std::cout<<"Error: bbox width<0 || height<0 "<<std::endl;
+                    if(ivlog_flag)
+                        givlog->verbose("Error: bbox width<0 || height<0 ");
+                    continue;
+                }
+                bbox_t_90.x = obj.left + 0.5*width;
+                bbox_t_90.y = obj.top + 0.5*height;
+                bbox_t_90.w = width;
+                bbox_t_90.h = height;
+                bbox_t_90.prob = obj.confidence;
+                bbox_t_90.obj_id = obj.class_label;
+                outs_90.push_back(bbox_t_90);
+            }
+            vector<od::TrackingBox>track_result_90;
+            bool track_flag_90 = od::TrackObstacle(frame_count,trackers_90,outs_90,track_result_90);
+            //sort(track_result_90.begin(), track_result_90.end(), comp);  // track ids are already in descending order
+            for(unsigned int i=0;i < track_result_90.size(); i++)
+            {
+                yolo::Box obstacle;
+                float width = track_result_90[i].box.width;
+                float height = track_result_90[i].box.height;
+                obstacle.left = track_result_90[i].box.x - 0.5*width;
+                obstacle.top = track_result_90[i].box.y - 0.5*height;
+                obstacle.right = track_result_90[i].box.x + 0.5*width;
+                obstacle.bottom = track_result_90[i].box.y + 0.5*height;
+                //cout<<"11111: "<<track_result_90[i].id<<endl;
+                //----------------------- decide the final class by voting over the recent frame history (about 5 frames); parameters are set in KalmanTracker.h -------------
+                vector<int> class_history;
+                class_history = track_result_90[i].class_history;
+                if(class_history.size()>0)
+                {
+                    vector<int> color_num(6);   // 6 classes in total
+                    for(int j=0;j<class_history.size();j++)
+                    {
+                        int class_id = class_history[j];
+                        color_num[class_id] += 1;
+                    }
+                    std::vector<int>::iterator biggest = std::max_element(std::begin(color_num),std::end(color_num));
+                    int maxindex = std::distance(std::begin(color_num),biggest);
+                    obstacle.class_label = maxindex;
+                }
+                else {obstacle.class_label = track_result_90[i].class_id;}
+                obstacle.confidence = track_result_90[i].prob;
+                objs_track.push_back(obstacle);
+                //-------------------------------------------------------
+            }
+            auto end_track = std::chrono::system_clock::now();  // end timer
+            //std::cout <<"track: "<< std::chrono::duration_cast<std::chrono::milliseconds>(end_track - start_track).count() << "ms" << std::endl;
+
+        }
+        //================================== track ==========================
+
+        vector<yolo::Box>results_final;
+        results_final = (trackstate)?objs_track:objs;
+        shareSignMsg(results_final);
+        auto end_final = std::chrono::system_clock::now();  // end timer
+        std::cout <<"sign detection total time: "<<
+                    std::chrono::duration_cast<std::chrono::milliseconds>
+                                          (end_final - start).count() << "ms" << std::endl;
+        // visualize the results
+        for (auto &result : results_final)
+        {
+            uint8_t b, g, r;
+            tie(b, g, r) = yolo::random_color(result.class_label);
+            cv::rectangle(image, cv::Point(result.left, result.top),
+                          cv::Point(result.right, result.bottom),
+                          cv::Scalar(b, g, r), 2);
+
+            auto name = signlabels[result.class_label];
+            auto caption = cv::format("%s %.2f", name.data(), result.confidence);
+            int width = int(cv::getTextSize(caption, 0, 1, 1, nullptr).width*0.8);
+            cv::rectangle(image, cv::Point(result.left, result.top - 25),
+                          cv::Point(result.left + width, result.top), cv::Scalar(b, g, r), -1);
+            cv::putText(image, caption, cv::Point(result.left, result.top - 5), 0, 0.7, cv::Scalar::all(0), 1, 16);
+
+        }
+        // send the result image to shared memory
+        //SendResultImg(image,gpresultimg);
+
+
+        if (imshow_flag)
+        {
+
+            cv::namedWindow("Result",cv::WINDOW_NORMAL);
+            cv::imshow("Result",image);
+            if(cv::waitKey(10) == 'q')
+            {
+                cv::destroyAllWindows();
+                //yolo_context->destroy();
+                //start_up = false;
+                break;
+            }
+            if(cv::waitKey(1) == 's')
+                cv::waitKey(0);
+        }
+
+        //writer << frame;
+        waittime = (double)cv::getTickCount();
+
+    }
+
+}
+
+
+

+ 398 - 0
src/detection/label_sign_detection/src/Hungarian.cpp

@@ -0,0 +1,398 @@
+//
+// Created by lqx on 20-4-23.
+//
+///////////////////////////////////////////////////////////////////////////////
+// Hungarian.cpp: Implementation file for Class HungarianAlgorithm.
+//
+// This is a C++ wrapper with slight modification of a hungarian algorithm implementation by Markus Buehren.
+// The original implementation is a few mex-functions for use in MATLAB, found here:
+// http://www.mathworks.com/matlabcentral/fileexchange/6543-functions-for-the-rectangular-assignment-problem
+//
+// Both this code and the original code are published under the BSD license.
+// by Cong Ma, 2016
+//
+
+#include <math.h>
+#include <cfloat>
+#include "Hungarian.h"
+
+
+HungarianAlgorithm::HungarianAlgorithm(){}
+HungarianAlgorithm::~HungarianAlgorithm(){}
+
+
+//********************************************************//
+// A single function wrapper for solving assignment problem.
+//********************************************************//
+double HungarianAlgorithm::Solve(vector<vector<double>>& DistMatrix, vector<int>& Assignment)
+{
+    unsigned int nRows = DistMatrix.size();
+    unsigned int nCols = DistMatrix[0].size();
+
+    double *distMatrixIn = new double[nRows * nCols];
+    int *assignment = new int[nRows];
+    double cost = 0.0;
+
+    // Fill in the distMatrixIn. Mind the index is "i + nRows * j".
+    // Here the cost matrix of size MxN is defined as a double precision array of N*M elements.
+    // In the solving functions matrices are seen to be saved MATLAB-internally in row-order.
+    // (i.e. the matrix [1 2; 3 4] will be stored as a vector [1 3 2 4], NOT [1 2 3 4]).
+    for (unsigned int i = 0; i < nRows; i++)
+        for (unsigned int j = 0; j < nCols; j++)
+            distMatrixIn[i + nRows * j] = DistMatrix[i][j];
+
+    // call solving function
+    assignmentoptimal(assignment, &cost, distMatrixIn, nRows, nCols);
+
+    Assignment.clear();
+    for (unsigned int r = 0; r < nRows; r++)
+        Assignment.push_back(assignment[r]);
+
+    delete[] distMatrixIn;
+    delete[] assignment;
+    return cost;
+}
+
+
+//********************************************************//
+// Solve optimal solution for assignment problem using Munkres algorithm, also known as Hungarian Algorithm.
+//********************************************************//
+void HungarianAlgorithm::assignmentoptimal(int *assignment, double *cost, double *distMatrixIn, int nOfRows, int nOfColumns)
+{
+    double *distMatrix, *distMatrixTemp, *distMatrixEnd, *columnEnd, value, minValue;
+    bool *coveredColumns, *coveredRows, *starMatrix, *newStarMatrix, *primeMatrix;
+    int nOfElements, minDim, row, col;
+
+    /* initialization */
+    *cost = 0;
+    for (row = 0; row<nOfRows; row++)
+        assignment[row] = -1;
+
+    /* generate working copy of distance Matrix */
+    /* check if all matrix elements are positive */
+    nOfElements = nOfRows * nOfColumns;
+    distMatrix = (double *)malloc(nOfElements * sizeof(double));
+    distMatrixEnd = distMatrix + nOfElements;
+
+    for (row = 0; row<nOfElements; row++)
+    {
+        value = distMatrixIn[row];
+        if (value < 0)
+            cerr << "All matrix elements have to be non-negative." << endl;
+        distMatrix[row] = value;
+    }
+
+
+    /* memory allocation */
+    coveredColumns = (bool *)calloc(nOfColumns, sizeof(bool));
+    coveredRows = (bool *)calloc(nOfRows, sizeof(bool));
+    starMatrix = (bool *)calloc(nOfElements, sizeof(bool));
+    primeMatrix = (bool *)calloc(nOfElements, sizeof(bool));
+    newStarMatrix = (bool *)calloc(nOfElements, sizeof(bool)); /* used in step4 */
+
+    /* preliminary steps */
+    if (nOfRows <= nOfColumns)
+    {
+        minDim = nOfRows;
+
+        for (row = 0; row<nOfRows; row++)
+        {
+            /* find the smallest element in the row */
+            distMatrixTemp = distMatrix + row;
+            minValue = *distMatrixTemp;
+            distMatrixTemp += nOfRows;
+            while (distMatrixTemp < distMatrixEnd)
+            {
+                value = *distMatrixTemp;
+                if (value < minValue)
+                    minValue = value;
+                distMatrixTemp += nOfRows;
+            }
+
+            /* subtract the smallest element from each element of the row */
+            distMatrixTemp = distMatrix + row;
+            while (distMatrixTemp < distMatrixEnd)
+            {
+                *distMatrixTemp -= minValue;
+                distMatrixTemp += nOfRows;
+            }
+        }
+
+        /* Steps 1 and 2a */
+        for (row = 0; row<nOfRows; row++)
+            for (col = 0; col<nOfColumns; col++)
+                if (fabs(distMatrix[row + nOfRows*col]) < DBL_EPSILON)
+                    if (!coveredColumns[col])
+                    {
+                        starMatrix[row + nOfRows*col] = true;
+                        coveredColumns[col] = true;
+                        break;
+                    }
+    }
+    else /* if(nOfRows > nOfColumns) */
+    {
+        minDim = nOfColumns;
+
+        for (col = 0; col<nOfColumns; col++)
+        {
+            /* find the smallest element in the column */
+            distMatrixTemp = distMatrix + nOfRows*col;
+            columnEnd = distMatrixTemp + nOfRows;
+
+            minValue = *distMatrixTemp++;
+            while (distMatrixTemp < columnEnd)
+            {
+                value = *distMatrixTemp++;
+                if (value < minValue)
+                    minValue = value;
+            }
+
+            /* subtract the smallest element from each element of the column */
+            distMatrixTemp = distMatrix + nOfRows*col;
+            while (distMatrixTemp < columnEnd)
+                *distMatrixTemp++ -= minValue;
+        }
+
+        /* Steps 1 and 2a */
+        for (col = 0; col<nOfColumns; col++)
+            for (row = 0; row<nOfRows; row++)
+                if (fabs(distMatrix[row + nOfRows*col]) < DBL_EPSILON)
+                    if (!coveredRows[row])
+                    {
+                        starMatrix[row + nOfRows*col] = true;
+                        coveredColumns[col] = true;
+                        coveredRows[row] = true;
+                        break;
+                    }
+        for (row = 0; row<nOfRows; row++)
+            coveredRows[row] = false;
+
+    }
+
+    /* move to step 2b */
+    step2b(assignment, distMatrix, starMatrix, newStarMatrix, primeMatrix, coveredColumns, coveredRows, nOfRows, nOfColumns, minDim);
+
+    /* compute cost and remove invalid assignments */
+    computeassignmentcost(assignment, cost, distMatrixIn, nOfRows);
+
+    /* free allocated memory */
+    free(distMatrix);
+    free(coveredColumns);
+    free(coveredRows);
+    free(starMatrix);
+    free(primeMatrix);
+    free(newStarMatrix);
+
+    return;
+}
+
+/********************************************************/
+void HungarianAlgorithm::buildassignmentvector(int *assignment, bool *starMatrix, int nOfRows, int nOfColumns)
+{
+    int row, col;
+
+    for (row = 0; row<nOfRows; row++)
+        for (col = 0; col<nOfColumns; col++)
+            if (starMatrix[row + nOfRows*col])
+            {
+#ifdef ONE_INDEXING
+                assignment[row] = col + 1; /* MATLAB-Indexing */
+#else
+                assignment[row] = col;
+#endif
+                break;
+            }
+}
+
+/********************************************************/
+void HungarianAlgorithm::computeassignmentcost(int *assignment, double *cost, double *distMatrix, int nOfRows)
+{
+    int row, col;
+
+    for (row = 0; row<nOfRows; row++)
+    {
+        col = assignment[row];
+        if (col >= 0)
+            *cost += distMatrix[row + nOfRows*col];
+    }
+}
+
+/********************************************************/
+void HungarianAlgorithm::step2a(int *assignment, double *distMatrix, bool *starMatrix, bool *newStarMatrix, bool *primeMatrix, bool *coveredColumns, bool *coveredRows, int nOfRows, int nOfColumns, int minDim)
+{
+    bool *starMatrixTemp, *columnEnd;
+    int col;
+
+    /* cover every column containing a starred zero */
+    for (col = 0; col<nOfColumns; col++)
+    {
+        starMatrixTemp = starMatrix + nOfRows*col;
+        columnEnd = starMatrixTemp + nOfRows;
+        while (starMatrixTemp < columnEnd){
+            if (*starMatrixTemp++)
+            {
+                coveredColumns[col] = true;
+                break;
+            }
+        }
+    }
+
+    /* move to step 3 */
+    step2b(assignment, distMatrix, starMatrix, newStarMatrix, primeMatrix, coveredColumns, coveredRows, nOfRows, nOfColumns, minDim);
+}
+
+/********************************************************/
+void HungarianAlgorithm::step2b(int *assignment, double *distMatrix, bool *starMatrix, bool *newStarMatrix, bool *primeMatrix, bool *coveredColumns, bool *coveredRows, int nOfRows, int nOfColumns, int minDim)
+{
+    int col, nOfCoveredColumns;
+
+    /* count covered columns */
+    nOfCoveredColumns = 0;
+    for (col = 0; col<nOfColumns; col++)
+        if (coveredColumns[col])
+            nOfCoveredColumns++;
+
+    if (nOfCoveredColumns == minDim)
+    {
+        /* algorithm finished */
+        buildassignmentvector(assignment, starMatrix, nOfRows, nOfColumns);
+    }
+    else
+    {
+        /* move to step 3 */
+        step3(assignment, distMatrix, starMatrix, newStarMatrix, primeMatrix, coveredColumns, coveredRows, nOfRows, nOfColumns, minDim);
+    }
+
+}
+
+/********************************************************/
+void HungarianAlgorithm::step3(int *assignment, double *distMatrix, bool *starMatrix, bool *newStarMatrix, bool *primeMatrix, bool *coveredColumns, bool *coveredRows, int nOfRows, int nOfColumns, int minDim)
+{
+    bool zerosFound;
+    int row, col, starCol;
+
+    zerosFound = true;
+    while (zerosFound)
+    {
+        zerosFound = false;
+        for (col = 0; col<nOfColumns; col++)
+            if (!coveredColumns[col])
+                for (row = 0; row<nOfRows; row++)
+                    if ((!coveredRows[row]) && (fabs(distMatrix[row + nOfRows*col]) < DBL_EPSILON))
+                    {
+                        /* prime zero */
+                        primeMatrix[row + nOfRows*col] = true;
+
+                        /* find starred zero in current row */
+                        for (starCol = 0; starCol<nOfColumns; starCol++)
+                            if (starMatrix[row + nOfRows*starCol])
+                                break;
+
+                        if (starCol == nOfColumns) /* no starred zero found */
+                        {
+                            /* move to step 4 */
+                            step4(assignment, distMatrix, starMatrix, newStarMatrix, primeMatrix, coveredColumns, coveredRows, nOfRows, nOfColumns, minDim, row, col);
+                            return;
+                        }
+                        else
+                        {
+                            coveredRows[row] = true;
+                            coveredColumns[starCol] = false;
+                            zerosFound = true;
+                            break;
+                        }
+                    }
+    }
+
+    /* move to step 5 */
+    step5(assignment, distMatrix, starMatrix, newStarMatrix, primeMatrix, coveredColumns, coveredRows, nOfRows, nOfColumns, minDim);
+}
+
+/********************************************************/
+void HungarianAlgorithm::step4(int *assignment, double *distMatrix, bool *starMatrix, bool *newStarMatrix, bool *primeMatrix, bool *coveredColumns, bool *coveredRows, int nOfRows, int nOfColumns, int minDim, int row, int col)
+{
+    int n, starRow, starCol, primeRow, primeCol;
+    int nOfElements = nOfRows*nOfColumns;
+
+    /* generate temporary copy of starMatrix */
+    for (n = 0; n<nOfElements; n++)
+        newStarMatrix[n] = starMatrix[n];
+
+    /* star current zero */
+    newStarMatrix[row + nOfRows*col] = true;
+
+    /* find starred zero in current column */
+    starCol = col;
+    for (starRow = 0; starRow<nOfRows; starRow++)
+        if (starMatrix[starRow + nOfRows*starCol])
+            break;
+
+    while (starRow<nOfRows)
+    {
+        /* unstar the starred zero */
+        newStarMatrix[starRow + nOfRows*starCol] = false;
+
+        /* find primed zero in current row */
+        primeRow = starRow;
+        for (primeCol = 0; primeCol<nOfColumns; primeCol++)
+            if (primeMatrix[primeRow + nOfRows*primeCol])
+                break;
+
+        /* star the primed zero */
+        newStarMatrix[primeRow + nOfRows*primeCol] = true;
+
+        /* find starred zero in current column */
+        starCol = primeCol;
+        for (starRow = 0; starRow<nOfRows; starRow++)
+            if (starMatrix[starRow + nOfRows*starCol])
+                break;
+    }
+
+    /* use temporary copy as new starMatrix */
+    /* delete all primes, uncover all rows */
+    for (n = 0; n<nOfElements; n++)
+    {
+        primeMatrix[n] = false;
+        starMatrix[n] = newStarMatrix[n];
+    }
+    for (n = 0; n<nOfRows; n++)
+        coveredRows[n] = false;
+
+    /* move to step 2a */
+    step2a(assignment, distMatrix, starMatrix, newStarMatrix, primeMatrix, coveredColumns, coveredRows, nOfRows, nOfColumns, minDim);
+}
+
+/********************************************************/
+void HungarianAlgorithm::step5(int *assignment, double *distMatrix, bool *starMatrix, bool *newStarMatrix, bool *primeMatrix, bool *coveredColumns, bool *coveredRows, int nOfRows, int nOfColumns, int minDim)
+{
+    double h, value;
+    int row, col;
+
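+    /* Munkres step 5: adding h to every covered row and subtracting it from every uncovered
+       column is equivalent to subtracting h from all uncovered elements (and adding it to the
+       doubly-covered ones), which exposes at least one new uncovered zero while preserving
+       the optimal assignment. */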
+    /* find smallest uncovered element h */
+    h = DBL_MAX;
+    for (row = 0; row<nOfRows; row++)
+        if (!coveredRows[row])
+            for (col = 0; col<nOfColumns; col++)
+                if (!coveredColumns[col])
+                {
+                    value = distMatrix[row + nOfRows*col];
+                    if (value < h)
+                        h = value;
+                }
+
+    /* add h to each covered row */
+    for (row = 0; row<nOfRows; row++)
+        if (coveredRows[row])
+            for (col = 0; col<nOfColumns; col++)
+                distMatrix[row + nOfRows*col] += h;
+
+    /* subtract h from each uncovered column */
+    for (col = 0; col<nOfColumns; col++)
+        if (!coveredColumns[col])
+            for (row = 0; row<nOfRows; row++)
+                distMatrix[row + nOfRows*col] -= h;
+
+    /* move to step 3 */
+    step3(assignment, distMatrix, starMatrix, newStarMatrix, primeMatrix, coveredColumns, coveredRows, nOfRows, nOfColumns, minDim);
+}
+

+ 107 - 0
src/detection/label_sign_detection/src/KalmanTracker.cpp

@@ -0,0 +1,107 @@
+//
+// Created by lqx on 20-4-23.
+//
+
+///////////////////////////////////////////////////////////////////////////////
+// KalmanTracker.cpp: KalmanTracker Class Implementation
+
+#include "KalmanTracker.h"
+#include "detect_obstacle.h"
+#include <iostream>
+
+int KalmanTracker::kf_count = 0;
+
+// initialize Kalman filter
+void KalmanTracker::init_kf(StateType stateMat)
+{
+    int stateNum = 7;
+    int measureNum = 4;
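+    // state vector: [cx, cy, s, r, vx, vy, vs] -- box center, area and aspect ratio plus their
+    // velocities; the transition matrix below encodes a constant-velocity model in which the
+    // aspect ratio r is assumed constant (as in the original SORT tracker)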
+    kf = KalmanFilter(stateNum, measureNum, 0);
+
+    measurement = Mat::zeros(measureNum, 1, CV_32F);
+
+    kf.transitionMatrix = (Mat_<float>(stateNum, stateNum) <<
+            1, 0, 0, 0, 1, 0, 0,
+            0, 1, 0, 0, 0, 1, 0,
+            0, 0, 1, 0, 0, 0, 1,
+            0, 0, 0, 1, 0, 0, 0,
+            0, 0, 0, 0, 1, 0, 0,
+            0, 0, 0, 0, 0, 1, 0,
+            0, 0, 0, 0, 0, 0, 1);
+
+    setIdentity(kf.measurementMatrix);
+    setIdentity(kf.processNoiseCov, Scalar::all(1e-2));
+    setIdentity(kf.measurementNoiseCov, Scalar::all(1e-1));
+    setIdentity(kf.errorCovPost, Scalar::all(1));
+
+    // initialize state vector with bounding box in [cx,cy,s,r] style
+    kf.statePost.at<float>(0, 0) = stateMat.x + stateMat.width / 2;
+    kf.statePost.at<float>(1, 0) = stateMat.y + stateMat.height / 2;
+    kf.statePost.at<float>(2, 0) = stateMat.area();
+    kf.statePost.at<float>(3, 0) = stateMat.width / stateMat.height;
+}
+
+// Predict the estimated bounding box.
+StateType KalmanTracker::predict()
+{
+    // predict
+    Mat p = kf.predict();
+    m_age += 1;
+
+    if (m_time_since_update > od::max_age)
+        m_hit_streak = 0;
+    m_time_since_update += 1;
+    StateType predictBox = get_rect_xysr(p.at<float>(0, 0), p.at<float>(1, 0), p.at<float>(2, 0), p.at<float>(3, 0));
+    m_history.push_back(predictBox);
+    return m_history.back();
+}
+
+// Update the state vector with observed bounding box.
+void KalmanTracker::update(StateType stateMat, int classId, float prob)
+{
+    m_time_since_update = 0;
+    m_history.clear();
+    m_hits += 1;
+    m_hit_streak += 1;
+    m_class_id = classId;
+
+    //-------------- keep only the detection classes from the most recent 5 frames --------------
+    if (m_class_history.size() >= 5)
+        m_class_history.erase(m_class_history.begin());
+    m_class_history.push_back(classId);
+    //-------------- keep only the detection classes from the most recent 5 frames --------------
+
+    m_prob = prob;
+    // measurement
+    measurement.at<float>(0, 0) = stateMat.x + stateMat.width / 2;
+    measurement.at<float>(1, 0) = stateMat.y + stateMat.height / 2;
+    measurement.at<float>(2, 0) = stateMat.area();
+    measurement.at<float>(3, 0) = stateMat.width / stateMat.height;
+
+    // update
+    kf.correct(measurement);
+}
+
+// Return the current state vector
+StateType KalmanTracker::get_state()
+{
+    Mat s = kf.statePost;
+    return get_rect_xysr(s.at<float>(0, 0), s.at<float>(1, 0), s.at<float>(2, 0), s.at<float>(3, 0));
+}
+
+
+// Convert bounding box from [cx,cy,s,r] to [x,y,w,h] style.
+StateType KalmanTracker::get_rect_xysr(float cx, float cy, float s, float r)
+{
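+    // from s = w*h and r = w/h it follows that w = sqrt(s*r) and h = s/w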
+    float w = sqrt(s * r);
+    float h = s / w;
+    float x = (cx - w / 2);
+    float y = (cy - h / 2);
+
+    if (x < 0 && cx > 0)
+        x = 0;
+    if (y < 0 && cy > 0)
+        y = 0;
+
+    return StateType(x, y, w, h);
+}

+ 260 - 0
src/detection/label_sign_detection/src/detect_obstacle.cpp

@@ -0,0 +1,260 @@
+
+#include <set>
+#include "detect_obstacle.h"
+
+namespace od{
+// Computes IOU between two bounding boxes
+double GetIOU(Rect_<float> bb_test, Rect_<float> bb_gt)
+{
+    float in = (bb_test & bb_gt).area();
+    float un = bb_test.area() + bb_gt.area() - in;
+
+    if (un < DBL_EPSILON)
+        return 0;
+
+    return (double)(in / un);
+}
+
+
+// Track obstacles across frames: Kalman prediction + Hungarian (IOU) association, SORT-style.
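+// Usage sketch (assumed caller, not part of this file): keep `trackers` alive across frames
+// and pass the current frame's detections (a std::vector<bbox_t>) in `outs`; confirmed tracks
+// come back in `track_result`:
+//   static std::vector<KalmanTracker> trackers;
+//   std::vector<od::TrackingBox> results;
+//   bool has_tracks = od::TrackObstacle(frame_count, trackers, detections, results);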
+bool TrackObstacle(int frame_count,vector<KalmanTracker> &trackers,vector<bbox_t> &outs,vector<od::TrackingBox> &track_result)
+{
+    // working variables for the association step
+    vector<Rect_<float>> predictedBoxes;
+    vector<vector<double>> iouMatrix;
+    vector<int> assignment;
+    set<int> unmatchedDetections;
+    set<int> unmatchedTrajectories;
+    set<int> allItems;
+    set<int> matchedItems;
+    vector<cv::Point> matchedPairs;
+    unsigned int trkNum = 0;
+    unsigned int detNum = 0;
+    vector<od::DetectBox> detect_outs;
+    //bbox_t to Detect_box
+    for(unsigned int i=0;i<outs.size();i++)
+    {
+        od::DetectBox detect_temp;
+        detect_temp.class_id = outs[i].obj_id;
+        detect_temp.prob = outs[i].prob;
+        float tpx = outs[i].x;
+        float tpy = outs[i].y;
+        float tpw = outs[i].w;
+        float tph = outs[i].h;
+        //detect_temp.box = Rect_<float>(Point_<float>(tpx, tpy),Point_<float>(tpx + tpw, tpy + tph));
+        detect_temp.box = Rect_<float>(tpx,tpy,tpw,tph);
+        detect_outs.push_back(detect_temp);
+    }
+    //tracking
+    if (trackers.size() == 0) // no existing trackers yet (first frame)
+    {
+        // initialize kalman trackers using first detections.
+        for (unsigned int i = 0; i < outs.size(); i++)
+        {
+            KalmanTracker trk = KalmanTracker(detect_outs[i].box,
+                                              detect_outs[i].class_id,
+                                              detect_outs[i].prob);
+            trackers.push_back(trk);
+        }
+        return false;
+    }
+    ///////////////////////////////////////
+    // 3.1. get predicted locations from existing trackers.
+    predictedBoxes.clear();
+
+    for (auto it = trackers.begin(); it != trackers.end();)
+    {
+        Rect_<float> pBox = (*it).predict();
+        if (pBox.x >= 0 && pBox.y >= 0)
+        {
+            predictedBoxes.push_back(pBox);
+            it++;
+        }
+        else
+        {
+            cerr << "Box invalid at frame: " << frame_count << " id " << (*it).m_id + 1 << endl;
+            it = trackers.erase(it);
+        }
+    }
+
+    if (trackers.size() == 0 || detect_outs.size() == 0) return false;
+
+    ///////////////////////////////////////
+    // 3.2. associate detections to tracked objects (both represented as bounding boxes)
+    trkNum = predictedBoxes.size();
+    detNum = outs.size();
+
+    iouMatrix.clear();
+    iouMatrix.resize(trkNum, vector<double>(detNum, 0));
+
+    for (unsigned int i = 0; i < trkNum; i++) // compute iou matrix as a distance matrix
+    {
+        for (unsigned int j = 0; j < detNum; j++)
+        {
+            // use 1-iou because the hungarian algorithm computes a minimum-cost assignment.
+            iouMatrix[i][j] = 1 - GetIOU(predictedBoxes[i], detect_outs[j].box);
+        }
+    }
+    // solve the assignment problem using hungarian algorithm.
+    // the resulting assignment is [track(prediction) : detection], with len=preNum
+    HungarianAlgorithm HungAlgo;
+    assignment.clear();
+    HungAlgo.Solve(iouMatrix, assignment);
+
+    // find matches, unmatched_detections and unmatched_predictions
+    unmatchedTrajectories.clear();
+    unmatchedDetections.clear();
+    allItems.clear();
+    matchedItems.clear();
+    if (detNum > trkNum) //	there are unmatched detections
+    {
+        for (unsigned int n = 0; n < detNum; n++)
+            allItems.insert(n);
+
+        for (unsigned int i = 0; i < trkNum; ++i)
+            matchedItems.insert(assignment[i]);
+
+        set_difference(allItems.begin(), allItems.end(),
+                       matchedItems.begin(), matchedItems.end(),
+                       insert_iterator<set<int>>(unmatchedDetections, unmatchedDetections.begin()));
+    }
+    else if (detNum < trkNum) // there are unmatched trajectory/predictions
+    {
+        for (unsigned int i = 0; i < trkNum; ++i)
+            if (assignment[i] == -1) // unassigned label will be set as -1 in the assignment algorithm
+                unmatchedTrajectories.insert(i);
+    }
+    // filter out matches with low IOU
+    matchedPairs.clear();
+    for (unsigned int i = 0; i < trkNum; ++i)
+    {
+        if (assignment[i] == -1) // pass over invalid values
+            continue;
+        if (1 - iouMatrix[i][assignment[i]] < od::iouThreshold)
+        {
+            unmatchedTrajectories.insert(i);
+            unmatchedDetections.insert(assignment[i]);
+        }
+        else
+            matchedPairs.push_back(cv::Point(i, assignment[i]));
+    }
+    ///////////////////////////////////////
+    // 3.3. updating trackers
+    // update matched trackers with assigned detections.
+    // each prediction is corresponding to a tracker
+    int detIdx, trkIdx;
+    for (unsigned int i = 0; i < matchedPairs.size(); i++)
+    {
+        trkIdx = matchedPairs[i].x;
+        detIdx = matchedPairs[i].y;
+        trackers[trkIdx].update(detect_outs[detIdx].box,
+                                detect_outs[detIdx].class_id,
+                                detect_outs[detIdx].prob);
+    }
+    // create and initialise new trackers for unmatched detections
+    for (auto umd : unmatchedDetections)
+    {
+        KalmanTracker tracker = KalmanTracker(detect_outs[umd].box,
+                                              detect_outs[umd].class_id,
+                                              detect_outs[umd].prob);
+        trackers.push_back(tracker);
+    }
+
+#if 0
+    // get unique trackers: merge duplicate trackers that overlap heavily
+    unsigned int trackers_num = trackers.size();
+    iouMatrix.clear();
+    iouMatrix.resize(trackers_num, vector<double>(trackers_num, 0));
+    for (unsigned int i = 0; i < trackers_num; i++) // compute iou matrix as a distance matrix
+    {
+        for (unsigned int j = 0; j < trackers_num; j++)
+        {
+            // use 1-iou because the hungarian algorithm computes a minimum-cost assignment.
+            if(j==i)
+                iouMatrix[i][j] = 1;
+            else
+                iouMatrix[i][j] = 1 - GetIOU(trackers[i].get_state(), trackers[j].get_state());
+        }
+    }
+    // solve the assignment problem using hungarian algorithm.
+    // the resulting assignment is [track(prediction) : detection], with len=preNum
+    assignment.clear();
+    HungAlgo.Solve(iouMatrix, assignment);
+    // filter out matches with low IOU
+    matchedPairs.clear();
+    for (unsigned int i = 0; i < trackers_num; ++i)
+    {
+        if (assignment[i] == -1) // pass over invalid values
+            continue;
+        if (iouMatrix[i][assignment[i]] < od::iouThreshold)
+        {
+            matchedPairs.push_back(cv::Point(i, assignment[i]));
+        }
+    }
+    int index1,index2;
+    vector<int> delete_index;
+    for (unsigned int i = 0; i < matchedPairs.size(); i++)
+    {
+        index1 = matchedPairs[i].x;
+        index2 = matchedPairs[i].y;
+        if(index1 >= index2)
+            continue;
+        if((trackers[index1].m_id > trackers[index2].m_id) && (trackers[index1].m_class_history.size()>0))
+        {
+            trackers[index1].m_id = trackers[index2].m_id;
+            trackers[index1].m_class_history.insert(trackers[index1].m_class_history.begin(),
+                                                    trackers[index2].m_class_history.begin(),trackers[index2].m_class_history.end());
+            delete_index.push_back(index2);
+        }
+        else if((trackers[index2].m_id > trackers[index1].m_id) && (trackers[index2].m_class_history.size()>0))
+        {
+            trackers[index2].m_id = trackers[index1].m_id;
+            trackers[index2].m_class_history.insert(trackers[index2].m_class_history.begin(),
+                                                    trackers[index1].m_class_history.begin(),trackers[index1].m_class_history.end());
+            delete_index.push_back(index1);
+
+        }
+    }
+    for(unsigned int i = 0; i < delete_index.size(); i++)
+    {
+        int idx = delete_index[i] - i;
+        trackers.erase(trackers.begin() + idx);
+    }
+#endif
+
+    // get trackers' output
+    track_result.clear();
+    for (auto it = trackers.begin(); it != trackers.end();)
+    {
+        if (((*it).m_time_since_update <= od::max_age) &&
+                ((*it).m_hit_streak >= od::min_hits || frame_count <= od::min_hits))
+        {
+            od::TrackingBox res;
+            res.box = (*it).get_state();
+            res.id = (*it).m_id + 1;
+            res.frame = frame_count;
+            res.class_id = (*it).m_class_id;
+            res.prob = (*it).m_prob;
+            res.class_history = (*it).m_class_history;
+            track_result.push_back(res);
+            it++;
+        }
+        else
+            it++;
+        // remove dead tracklet
+        if(it != trackers.end() && (*it).m_time_since_update > od::max_age)
+        {
+            it = trackers.erase(it);
+        }
+    }
+    return track_result.size() > 0;
+}
+}
+
+
+

+ 443 - 0
src/detection/label_sign_detection/src/infer.cu

@@ -0,0 +1,443 @@
+
+#include <NvInfer.h>
+#include <cuda_runtime.h>
+#include <stdarg.h>
+#include <fstream>
+#include <numeric>
+#include <sstream>
+#include <unordered_map>
+#include "../include/infer.hpp"
+
+
+namespace trt {
+
+using namespace std;
+using namespace nvinfer1;
+
+#define checkRuntime(call)                                                                 \
+  do {                                                                                     \
+    auto ___call__ret_code__ = (call);                                                     \
+    if (___call__ret_code__ != cudaSuccess) {                                              \
+      INFO("CUDA Runtime error💥 %s # %s, code = %s [ %d ]", #call,                         \
+           cudaGetErrorString(___call__ret_code__), cudaGetErrorName(___call__ret_code__), \
+           ___call__ret_code__);                                                           \
+      abort();                                                                             \
+    }                                                                                      \
+  } while (0)
+
+#define checkKernel(...)                 \
+  do {                                   \
+    { (__VA_ARGS__); }                   \
+    checkRuntime(cudaPeekAtLastError()); \
+  } while (0)
+
+#define Assert(op)                 \
+  do {                             \
+    bool cond = !(!(op));          \
+    if (!cond) {                   \
+      INFO("Assert failed, " #op); \
+      abort();                     \
+    }                              \
+  } while (0)
+
+#define Assertf(op, ...)                             \
+  do {                                               \
+    bool cond = !(!(op));                            \
+    if (!cond) {                                     \
+      INFO("Assert failed, " #op " : " __VA_ARGS__); \
+      abort();                                       \
+    }                                                \
+  } while (0)
+
+static string file_name(const string &path, bool include_suffix) {
+  if (path.empty()) return "";
+
+  int p = path.rfind('/');
+  int e = path.rfind('\\');
+  p = max(p, e);
+  p += 1;
+
+  // include suffix
+  if (include_suffix) return path.substr(p);
+
+  int u = path.rfind('.');
+  if (u == -1) return path.substr(p);
+
+  if (u <= p) u = path.size();
+  return path.substr(p, u - p);
+}
+
+void __log_func(const char *file, int line, const char *fmt, ...) {
+  va_list vl;
+  va_start(vl, fmt);
+  char buffer[2048];
+  string filename = file_name(file, true);
+  int n = snprintf(buffer, sizeof(buffer), "[%s:%d]: ", filename.c_str(), line);
+  vsnprintf(buffer + n, sizeof(buffer) - n, fmt, vl);
+  fprintf(stdout, "%s\n", buffer);
+}
+
+static std::string format_shape(const Dims &shape) {
+  stringstream output;
+  char buf[64];
+  const char *fmts[] = {"%d", "x%d"};
+  for (int i = 0; i < shape.nbDims; ++i) {
+    snprintf(buf, sizeof(buf), fmts[i != 0], shape.d[i]);
+    output << buf;
+  }
+  return output.str();
+}
+
+Timer::Timer() {
+  checkRuntime(cudaEventCreate((cudaEvent_t *)&start_));
+  checkRuntime(cudaEventCreate((cudaEvent_t *)&stop_));
+}
+
+Timer::~Timer() {
+  checkRuntime(cudaEventDestroy((cudaEvent_t)start_));
+  checkRuntime(cudaEventDestroy((cudaEvent_t)stop_));
+}
+
+void Timer::start(void *stream) {
+  stream_ = stream;
+  checkRuntime(cudaEventRecord((cudaEvent_t)start_, (cudaStream_t)stream_));
+}
+
+float Timer::stop(const char *prefix, bool print) {
+  checkRuntime(cudaEventRecord((cudaEvent_t)stop_, (cudaStream_t)stream_));
+  checkRuntime(cudaEventSynchronize((cudaEvent_t)stop_));
+
+  float latency = 0;
+  checkRuntime(cudaEventElapsedTime(&latency, (cudaEvent_t)start_, (cudaEvent_t)stop_));
+
+  if (print) {
+    printf("[%s]: %.5f ms\n", prefix, latency);
+  }
+  return latency;
+}
+
+BaseMemory::BaseMemory(void *cpu, size_t cpu_bytes, void *gpu, size_t gpu_bytes) {
+  reference(cpu, cpu_bytes, gpu, gpu_bytes);
+}
+
+void BaseMemory::reference(void *cpu, size_t cpu_bytes, void *gpu, size_t gpu_bytes) {
+  release();
+
+  if (cpu == nullptr || cpu_bytes == 0) {
+    cpu = nullptr;
+    cpu_bytes = 0;
+  }
+
+  if (gpu == nullptr || gpu_bytes == 0) {
+    gpu = nullptr;
+    gpu_bytes = 0;
+  }
+
+  this->cpu_ = cpu;
+  this->cpu_capacity_ = cpu_bytes;
+  this->cpu_bytes_ = cpu_bytes;
+  this->gpu_ = gpu;
+  this->gpu_capacity_ = gpu_bytes;
+  this->gpu_bytes_ = gpu_bytes;
+
+  this->owner_cpu_ = !(cpu && cpu_bytes > 0);
+  this->owner_gpu_ = !(gpu && gpu_bytes > 0);
+}
+
+BaseMemory::~BaseMemory() { release(); }
+
+void *BaseMemory::gpu_realloc(size_t bytes) {
+  if (gpu_capacity_ < bytes) {
+    release_gpu();
+
+    gpu_capacity_ = bytes;
+    checkRuntime(cudaMalloc(&gpu_, bytes));
+    // checkRuntime(cudaMemset(gpu_, 0, size));
+  }
+  gpu_bytes_ = bytes;
+  return gpu_;
+}
+
+void *BaseMemory::cpu_realloc(size_t bytes) {
+  if (cpu_capacity_ < bytes) {
+    release_cpu();
+
+    cpu_capacity_ = bytes;
+    checkRuntime(cudaMallocHost(&cpu_, bytes));
+    Assert(cpu_ != nullptr);
+    // memset(cpu_, 0, size);
+  }
+  cpu_bytes_ = bytes;
+  return cpu_;
+}
+
+void BaseMemory::release_cpu() {
+  if (cpu_) {
+    if (owner_cpu_) {
+      checkRuntime(cudaFreeHost(cpu_));
+    }
+    cpu_ = nullptr;
+  }
+  cpu_capacity_ = 0;
+  cpu_bytes_ = 0;
+}
+
+void BaseMemory::release_gpu() {
+  if (gpu_) {
+    if (owner_gpu_) {
+      checkRuntime(cudaFree(gpu_));
+    }
+    gpu_ = nullptr;
+  }
+  gpu_capacity_ = 0;
+  gpu_bytes_ = 0;
+}
+
+void BaseMemory::release() {
+  release_cpu();
+  release_gpu();
+}
+
+class __native_nvinfer_logger : public ILogger {
+ public:
+  virtual void log(Severity severity, const char *msg) noexcept override {
+    if (severity == Severity::kINTERNAL_ERROR) {
+      INFO("NVInfer INTERNAL_ERROR: %s", msg);
+      abort();
+    } else if (severity == Severity::kERROR) {
+      INFO("NVInfer: %s", msg);
+    }
+    // else  if (severity == Severity::kWARNING) {
+    //     INFO("NVInfer: %s", msg);
+    // }
+    // else  if (severity == Severity::kINFO) {
+    //     INFO("NVInfer: %s", msg);
+    // }
+    // else {
+    //     INFO("%s", msg);
+    // }
+  }
+};
+static __native_nvinfer_logger gLogger;
+
+template <typename _T>
+static void destroy_nvidia_pointer(_T *ptr) {
+  if (ptr) ptr->destroy();
+}
+
+static std::vector<uint8_t> load_file(const string &file) {
+  ifstream in(file, ios::in | ios::binary);
+  if (!in.is_open()) return {};
+
+  in.seekg(0, ios::end);
+  size_t length = in.tellg();
+
+  std::vector<uint8_t> data;
+  if (length > 0) {
+    in.seekg(0, ios::beg);
+    data.resize(length);
+
+    in.read((char *)&data[0], length);
+  }
+  in.close();
+  return data;
+}
+
+class __native_engine_context {
+ public:
+  virtual ~__native_engine_context() { destroy(); }
+
+  bool construct(const void *pdata, size_t size) {
+    destroy();
+
+    if (pdata == nullptr || size == 0) return false;
+
+    runtime_ = shared_ptr<IRuntime>(createInferRuntime(gLogger), destroy_nvidia_pointer<IRuntime>);
+    if (runtime_ == nullptr) return false;
+
+    engine_ = shared_ptr<ICudaEngine>(runtime_->deserializeCudaEngine(pdata, size, nullptr),
+                                      destroy_nvidia_pointer<ICudaEngine>);
+    if (engine_ == nullptr) return false;
+
+    context_ = shared_ptr<IExecutionContext>(engine_->createExecutionContext(),
+                                             destroy_nvidia_pointer<IExecutionContext>);
+    return context_ != nullptr;
+  }
+
+ private:
+  void destroy() {
+    context_.reset();
+    engine_.reset();
+    runtime_.reset();
+  }
+
+ public:
+  shared_ptr<IExecutionContext> context_;
+  shared_ptr<ICudaEngine> engine_;
+  shared_ptr<IRuntime> runtime_ = nullptr;
+};
+
+class InferImpl : public Infer {
+ public:
+  shared_ptr<__native_engine_context> context_;
+  unordered_map<string, int> binding_name_to_index_;
+
+  virtual ~InferImpl() = default;
+
+  bool construct(const void *data, size_t size) {
+    context_ = make_shared<__native_engine_context>();
+    if (!context_->construct(data, size)) {
+      return false;
+    }
+
+    setup();
+    return true;
+  }
+
+  bool load(const string &file) {
+    auto data = load_file(file);
+    if (data.empty()) {
+      INFO("An empty file has been loaded. Please confirm your file path: %s", file.c_str());
+      return false;
+    }
+    return this->construct(data.data(), data.size());
+  }
+
+  void setup() {
+    auto engine = this->context_->engine_;
+    int nbBindings = engine->getNbBindings();
+
+    binding_name_to_index_.clear();
+    for (int i = 0; i < nbBindings; ++i) {
+      const char *bindingName = engine->getBindingName(i);
+      binding_name_to_index_[bindingName] = i;
+    }
+  }
+
+  virtual int index(const std::string &name) override {
+    auto iter = binding_name_to_index_.find(name);
+    Assertf(iter != binding_name_to_index_.end(), "Cannot find the binding name: %s",
+            name.c_str());
+    return iter->second;
+  }
+
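+  // Note: `bindings` must be ordered by engine binding index (inputs and outputs exactly as the
+  // engine reports them), because enqueueV2 interprets the pointer array positionally.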
+  virtual bool forward(const std::vector<void *> &bindings, void *stream,
+                       void *input_consum_event) override {
+    return this->context_->context_->enqueueV2((void**)bindings.data(), (cudaStream_t)stream,
+                                               (cudaEvent_t *)input_consum_event);
+  }
+
+  virtual std::vector<int> run_dims(const std::string &name) override {
+    return run_dims(index(name));
+  }
+
+  virtual std::vector<int> run_dims(int ibinding) override {
+    auto dim = this->context_->context_->getBindingDimensions(ibinding);
+    return std::vector<int>(dim.d, dim.d + dim.nbDims);
+  }
+
+  virtual std::vector<int> static_dims(const std::string &name) override {
+    return static_dims(index(name));
+  }
+
+  virtual std::vector<int> static_dims(int ibinding) override {
+    auto dim = this->context_->engine_->getBindingDimensions(ibinding);
+    return std::vector<int>(dim.d, dim.d + dim.nbDims);
+  }
+
+  virtual int num_bindings() override { return this->context_->engine_->getNbBindings(); }
+
+  virtual bool is_input(int ibinding) override {
+    return this->context_->engine_->bindingIsInput(ibinding);
+  }
+
+  virtual bool set_run_dims(const std::string &name, const std::vector<int> &dims) override {
+    return this->set_run_dims(index(name), dims);
+  }
+
+  virtual bool set_run_dims(int ibinding, const std::vector<int> &dims) override {
+    Dims d;
+    memcpy(d.d, dims.data(), sizeof(int) * dims.size());
+    d.nbDims = dims.size();
+    return this->context_->context_->setBindingDimensions(ibinding, d);
+  }
+
+  virtual int numel(const std::string &name) override { return numel(index(name)); }
+
+  virtual int numel(int ibinding) override {
+    auto dim = this->context_->context_->getBindingDimensions(ibinding);
+    return std::accumulate(dim.d, dim.d + dim.nbDims, 1, std::multiplies<int>());
+  }
+
+  virtual DType dtype(const std::string &name) override { return dtype(index(name)); }
+
+  virtual DType dtype(int ibinding) override {
+    return (DType)this->context_->engine_->getBindingDataType(ibinding);
+  }
+
+  virtual bool has_dynamic_dim() override {
+    // check if any input or output bindings have dynamic shapes
+    // code from ChatGPT
+    int numBindings = this->context_->engine_->getNbBindings();
+    for (int i = 0; i < numBindings; ++i) {
+      nvinfer1::Dims dims = this->context_->engine_->getBindingDimensions(i);
+      for (int j = 0; j < dims.nbDims; ++j) {
+        if (dims.d[j] == -1) return true;
+      }
+    }
+    return false;
+  }
+
+  virtual void print() override {
+    INFO("Infer %p [%s]", this, has_dynamic_dim() ? "DynamicShape" : "StaticShape");
+
+    int num_input = 0;
+    int num_output = 0;
+    auto engine = this->context_->engine_;
+    for (int i = 0; i < engine->getNbBindings(); ++i) {
+      if (engine->bindingIsInput(i))
+        num_input++;
+      else
+        num_output++;
+    }
+
+    INFO("Inputs: %d", num_input);
+    for (int i = 0; i < num_input; ++i) {
+      auto name = engine->getBindingName(i);
+      auto dim = engine->getBindingDimensions(i);
+      INFO("\t%d.%s : shape {%s}", i, name, format_shape(dim).c_str());
+    }
+
+    INFO("Outputs: %d", num_output);
+    for (int i = 0; i < num_output; ++i) {
+      auto name = engine->getBindingName(i + num_input);
+      auto dim = engine->getBindingDimensions(i + num_input);
+      INFO("\t%d.%s : shape {%s}", i, name, format_shape(dim).c_str());
+    }
+  }
+};
+
+Infer *loadraw(const std::string &file) {
+  InferImpl *impl = new InferImpl();
+  if (!impl->load(file)) {
+    delete impl;
+    impl = nullptr;
+  }
+  return impl;
+}
+
+std::shared_ptr<Infer> load(const std::string &file) {
+  return std::shared_ptr<InferImpl>((InferImpl *)loadraw(file));
+}
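+// Usage sketch (assumed caller; the engine path is a placeholder):
+//   auto engine = trt::load("yolov8n_fp16.trt");
+//   if (engine) engine->print();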
+
+std::string format_shape(const std::vector<int> &shape) {
+  stringstream output;
+  char buf[64];
+  const char *fmts[] = {"%d", "x%d"};
+  for (int i = 0; i < (int)shape.size(); ++i) {
+    snprintf(buf, sizeof(buf), fmts[i != 0], shape[i]);
+    output << buf;
+  }
+  return output.str();
+}
+};  // namespace trt

+ 740 - 0
src/detection/label_sign_detection/src/yolo.cu

@@ -0,0 +1,740 @@
+#include "../include/infer.hpp"
+#include "../include/yolo.hpp"
+
+namespace yolo {
+
+using namespace std;
+
+#define GPU_BLOCK_THREADS 512
+#define checkRuntime(call)                                                                 \
+  do {                                                                                     \
+    auto ___call__ret_code__ = (call);                                                     \
+    if (___call__ret_code__ != cudaSuccess) {                                              \
+      INFO("CUDA Runtime error💥 %s # %s, code = %s [ %d ]", #call,                         \
+           cudaGetErrorString(___call__ret_code__), cudaGetErrorName(___call__ret_code__), \
+           ___call__ret_code__);                                                           \
+      abort();                                                                             \
+    }                                                                                      \
+  } while (0)
+
+#define checkKernel(...)                 \
+  do {                                   \
+    { (__VA_ARGS__); }                   \
+    checkRuntime(cudaPeekAtLastError()); \
+  } while (0)
+
+enum class NormType : int { None = 0, MeanStd = 1, AlphaBeta = 2 };
+
+enum class ChannelType : int { None = 0, SwapRB = 1 };
+
+/* Normalization options: mean/std, alpha/beta, and RB channel swap */
+struct Norm {
+  float mean[3];
+  float std[3];
+  float alpha, beta;
+  NormType type = NormType::None;
+  ChannelType channel_type = ChannelType::None;
+
+  // out = (x * alpha - mean) / std
+  static Norm mean_std(const float mean[3], const float std[3], float alpha = 1 / 255.0f,
+                       ChannelType channel_type = ChannelType::None);
+
+  // out = x * alpha + beta
+  static Norm alpha_beta(float alpha, float beta = 0, ChannelType channel_type = ChannelType::None);
+
+  // None
+  static Norm None();
+};
+
+Norm Norm::mean_std(const float mean[3], const float std[3], float alpha,
+                    ChannelType channel_type) {
+  Norm out;
+  out.type = NormType::MeanStd;
+  out.alpha = alpha;
+  out.channel_type = channel_type;
+  memcpy(out.mean, mean, sizeof(out.mean));
+  memcpy(out.std, std, sizeof(out.std));
+  return out;
+}
+
+Norm Norm::alpha_beta(float alpha, float beta, ChannelType channel_type) {
+  Norm out;
+  out.type = NormType::AlphaBeta;
+  out.alpha = alpha;
+  out.beta = beta;
+  out.channel_type = channel_type;
+  return out;
+}
+
+Norm Norm::None() { return Norm(); }
+
+const int NUM_BOX_ELEMENT = 8;  // left, top, right, bottom, confidence, class,
+                                // keepflag, row_index(output)
+const int MAX_IMAGE_BOXES = 1024;
+inline int upbound(int n, int align = 32) { return (n + align - 1) / align * align; }
+static __host__ __device__ void affine_project(float *matrix, float x, float y, float *ox,
+                                               float *oy) {
+  *ox = matrix[0] * x + matrix[1] * y + matrix[2];
+  *oy = matrix[3] * x + matrix[4] * y + matrix[5];
+}
+
+static __global__ void decode_kernel_common(float *predict, int num_bboxes, int num_classes,
+                                            int output_cdim, float confidence_threshold,
+                                            float *invert_affine_matrix, float *parray,
+                                            int MAX_IMAGE_BOXES) {
+  int position = blockDim.x * blockIdx.x + threadIdx.x;
+  if (position >= num_bboxes) return;
+
+  float *pitem = predict + output_cdim * position;
+  float objectness = pitem[4];
+  if (objectness < confidence_threshold) return;
+
+  float *class_confidence = pitem + 5;
+  float confidence = *class_confidence++;
+  int label = 0;
+  for (int i = 1; i < num_classes; ++i, ++class_confidence) {
+    if (*class_confidence > confidence) {
+      confidence = *class_confidence;
+      label = i;
+    }
+  }
+
+  confidence *= objectness;
+  if (confidence < confidence_threshold) return;
+
+  int index = atomicAdd(parray, 1);
+  if (index >= MAX_IMAGE_BOXES) return;
+
+  float cx = *pitem++;
+  float cy = *pitem++;
+  float width = *pitem++;
+  float height = *pitem++;
+  float left = cx - width * 0.5f;
+  float top = cy - height * 0.5f;
+  float right = cx + width * 0.5f;
+  float bottom = cy + height * 0.5f;
+  affine_project(invert_affine_matrix, left, top, &left, &top);
+  affine_project(invert_affine_matrix, right, bottom, &right, &bottom);
+
+  float *pout_item = parray + 1 + index * NUM_BOX_ELEMENT;
+  *pout_item++ = left;
+  *pout_item++ = top;
+  *pout_item++ = right;
+  *pout_item++ = bottom;
+  *pout_item++ = confidence;
+  *pout_item++ = label;
+  *pout_item++ = 1;  // 1 = keep, 0 = ignore
+}
+
+static __global__ void decode_kernel_v8(float *predict, int num_bboxes, int num_classes,
+                                        int output_cdim, float confidence_threshold,
+                                        float *invert_affine_matrix, float *parray,
+                                        int MAX_IMAGE_BOXES) {
+  int position = blockDim.x * blockIdx.x + threadIdx.x;
+  if (position >= num_bboxes) return;
+
+  float *pitem = predict + output_cdim * position;
+  float *class_confidence = pitem + 4;
+  float confidence = *class_confidence++;
+  int label = 0;
+  for (int i = 1; i < num_classes; ++i, ++class_confidence) {
+    if (*class_confidence > confidence) {
+      confidence = *class_confidence;
+      label = i;
+    }
+  }
+  if (confidence < confidence_threshold) return;
+
+  int index = atomicAdd(parray, 1);
+  if (index >= MAX_IMAGE_BOXES) return;
+
+  float cx = *pitem++;
+  float cy = *pitem++;
+  float width = *pitem++;
+  float height = *pitem++;
+  float left = cx - width * 0.5f;
+  float top = cy - height * 0.5f;
+  float right = cx + width * 0.5f;
+  float bottom = cy + height * 0.5f;
+  affine_project(invert_affine_matrix, left, top, &left, &top);
+  affine_project(invert_affine_matrix, right, bottom, &right, &bottom);
+
+  float *pout_item = parray + 1 + index * NUM_BOX_ELEMENT;
+  *pout_item++ = left;
+  *pout_item++ = top;
+  *pout_item++ = right;
+  *pout_item++ = bottom;
+  *pout_item++ = confidence;
+  *pout_item++ = label;
+  *pout_item++ = 1;  // 1 = keep, 0 = ignore
+  *pout_item++ = position;
+}
+
+static __device__ float box_iou(float aleft, float atop, float aright, float abottom, float bleft,
+                                float btop, float bright, float bbottom) {
+  float cleft = max(aleft, bleft);
+  float ctop = max(atop, btop);
+  float cright = min(aright, bright);
+  float cbottom = min(abottom, bbottom);
+
+  float c_area = max(cright - cleft, 0.0f) * max(cbottom - ctop, 0.0f);
+  if (c_area == 0.0f) return 0.0f;
+
+  float a_area = max(0.0f, aright - aleft) * max(0.0f, abottom - atop);
+  float b_area = max(0.0f, bright - bleft) * max(0.0f, bbottom - btop);
+  return c_area / (a_area + b_area - c_area);
+}
+
+static __global__ void fast_nms_kernel(float *bboxes, int MAX_IMAGE_BOXES, float threshold) {
+  int position = (blockDim.x * blockIdx.x + threadIdx.x);
+  int count = min((int)*bboxes, MAX_IMAGE_BOXES);
+  if (position >= count) return;
+
+  // left, top, right, bottom, confidence, class, keepflag
+  float *pcurrent = bboxes + 1 + position * NUM_BOX_ELEMENT;
+  for (int i = 0; i < count; ++i) {
+    float *pitem = bboxes + 1 + i * NUM_BOX_ELEMENT;
+    if (i == position || pcurrent[5] != pitem[5]) continue;
+
+    if (pitem[4] >= pcurrent[4]) {
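+      // equal-confidence tie-break: only the later-indexed candidate suppresses, so exactly one
+      // of two identical-score overlapping boxes survives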
+      if (pitem[4] == pcurrent[4] && i < position) continue;
+
+      float iou = box_iou(pcurrent[0], pcurrent[1], pcurrent[2], pcurrent[3], pitem[0], pitem[1],
+                          pitem[2], pitem[3]);
+
+      if (iou > threshold) {
+        pcurrent[6] = 0;  // 1=keep, 0=ignore
+        return;
+      }
+    }
+  }
+}
+
+static dim3 grid_dims(int numJobs) {
+  int numBlockThreads = numJobs < GPU_BLOCK_THREADS ? numJobs : GPU_BLOCK_THREADS;
+  return dim3(((numJobs + numBlockThreads - 1) / (float)numBlockThreads));
+}
+
+static dim3 block_dims(int numJobs) {
+  return numJobs < GPU_BLOCK_THREADS ? numJobs : GPU_BLOCK_THREADS;
+}
+
+static void decode_kernel_invoker(float *predict, int num_bboxes, int num_classes, int output_cdim,
+                                  float confidence_threshold, float nms_threshold,
+                                  float *invert_affine_matrix, float *parray, int MAX_IMAGE_BOXES,
+                                  Type type, cudaStream_t stream) {
+  auto grid = grid_dims(num_bboxes);
+  auto block = block_dims(num_bboxes);
+
+  if (type == Type::V8 || type == Type::V8Seg) {
+    checkKernel(decode_kernel_v8<<<grid, block, 0, stream>>>(
+        predict, num_bboxes, num_classes, output_cdim, confidence_threshold, invert_affine_matrix,
+        parray, MAX_IMAGE_BOXES));
+  } else {
+    checkKernel(decode_kernel_common<<<grid, block, 0, stream>>>(
+        predict, num_bboxes, num_classes, output_cdim, confidence_threshold, invert_affine_matrix,
+        parray, MAX_IMAGE_BOXES));
+  }
+
+  grid = grid_dims(MAX_IMAGE_BOXES);
+  block = block_dims(MAX_IMAGE_BOXES);
+  checkKernel(fast_nms_kernel<<<grid, block, 0, stream>>>(parray, MAX_IMAGE_BOXES, nms_threshold));
+}
+
+static __global__ void warp_affine_bilinear_and_normalize_plane_kernel(
+    uint8_t *src, int src_line_size, int src_width, int src_height, float *dst, int dst_width,
+    int dst_height, uint8_t const_value_st, float *warp_affine_matrix_2_3, Norm norm) {
+  int dx = blockDim.x * blockIdx.x + threadIdx.x;
+  int dy = blockDim.y * blockIdx.y + threadIdx.y;
+  if (dx >= dst_width || dy >= dst_height) return;
+
+  float m_x1 = warp_affine_matrix_2_3[0];
+  float m_y1 = warp_affine_matrix_2_3[1];
+  float m_z1 = warp_affine_matrix_2_3[2];
+  float m_x2 = warp_affine_matrix_2_3[3];
+  float m_y2 = warp_affine_matrix_2_3[4];
+  float m_z2 = warp_affine_matrix_2_3[5];
+
+  float src_x = m_x1 * dx + m_y1 * dy + m_z1;
+  float src_y = m_x2 * dx + m_y2 * dy + m_z2;
+  float c0, c1, c2;
+
+  if (src_x <= -1 || src_x >= src_width || src_y <= -1 || src_y >= src_height) {
+    // out of range
+    c0 = const_value_st;
+    c1 = const_value_st;
+    c2 = const_value_st;
+  } else {
+    int y_low = floorf(src_y);
+    int x_low = floorf(src_x);
+    int y_high = y_low + 1;
+    int x_high = x_low + 1;
+
+    uint8_t const_value[] = {const_value_st, const_value_st, const_value_st};
+    float ly = src_y - y_low;
+    float lx = src_x - x_low;
+    float hy = 1 - ly;
+    float hx = 1 - lx;
+    float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+    uint8_t *v1 = const_value;
+    uint8_t *v2 = const_value;
+    uint8_t *v3 = const_value;
+    uint8_t *v4 = const_value;
+    if (y_low >= 0) {
+      if (x_low >= 0) v1 = src + y_low * src_line_size + x_low * 3;
+
+      if (x_high < src_width) v2 = src + y_low * src_line_size + x_high * 3;
+    }
+
+    if (y_high < src_height) {
+      if (x_low >= 0) v3 = src + y_high * src_line_size + x_low * 3;
+
+      if (x_high < src_width) v4 = src + y_high * src_line_size + x_high * 3;
+    }
+
+    // same rounding as OpenCV
+    c0 = floorf(w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0] + 0.5f);
+    c1 = floorf(w1 * v1[1] + w2 * v2[1] + w3 * v3[1] + w4 * v4[1] + 0.5f);
+    c2 = floorf(w1 * v1[2] + w2 * v2[2] + w3 * v3[2] + w4 * v4[2] + 0.5f);
+  }
+
+  if (norm.channel_type == ChannelType::SwapRB) {
+    float t = c2;
+    c2 = c0;
+    c0 = t;
+  }
+
+  if (norm.type == NormType::MeanStd) {
+    c0 = (c0 * norm.alpha - norm.mean[0]) / norm.std[0];
+    c1 = (c1 * norm.alpha - norm.mean[1]) / norm.std[1];
+    c2 = (c2 * norm.alpha - norm.mean[2]) / norm.std[2];
+  } else if (norm.type == NormType::AlphaBeta) {
+    c0 = c0 * norm.alpha + norm.beta;
+    c1 = c1 * norm.alpha + norm.beta;
+    c2 = c2 * norm.alpha + norm.beta;
+  }
+
+  int area = dst_width * dst_height;
+  float *pdst_c0 = dst + dy * dst_width + dx;
+  float *pdst_c1 = pdst_c0 + area;
+  float *pdst_c2 = pdst_c1 + area;
+  *pdst_c0 = c0;
+  *pdst_c1 = c1;
+  *pdst_c2 = c2;
+}
+
+static void warp_affine_bilinear_and_normalize_plane(uint8_t *src, int src_line_size, int src_width,
+                                                     int src_height, float *dst, int dst_width,
+                                                     int dst_height, float *matrix_2_3,
+                                                     uint8_t const_value, const Norm &norm,
+                                                     cudaStream_t stream) {
+  dim3 grid((dst_width + 31) / 32, (dst_height + 31) / 32);
+  dim3 block(32, 32);
+
+  checkKernel(warp_affine_bilinear_and_normalize_plane_kernel<<<grid, block, 0, stream>>>(
+      src, src_line_size, src_width, src_height, dst, dst_width, dst_height, const_value,
+      matrix_2_3, norm));
+}
+
+static __global__ void decode_single_mask_kernel(int left, int top, float *mask_weights,
+                                                 float *mask_predict, int mask_width,
+                                                 int mask_height, unsigned char *mask_out,
+                                                 int mask_dim, int out_width, int out_height) {
+  // mask_predict to mask_out
+  // mask_weights @ mask_predict
+  int dx = blockDim.x * blockIdx.x + threadIdx.x;
+  int dy = blockDim.y * blockIdx.y + threadIdx.y;
+  if (dx >= out_width || dy >= out_height) return;
+
+  int sx = left + dx;
+  int sy = top + dy;
+  if (sx < 0 || sx >= mask_width || sy < 0 || sy >= mask_height) {
+    mask_out[dy * out_width + dx] = 0;
+    return;
+  }
+
+  float cumprod = 0;
+  for (int ic = 0; ic < mask_dim; ++ic) {
+    float cval = mask_predict[(ic * mask_height + sy) * mask_width + sx];
+    float wval = mask_weights[ic];
+    cumprod += cval * wval;
+  }
+
+  float alpha = 1.0f / (1.0f + exp(-cumprod));
+  mask_out[dy * out_width + dx] = alpha * 255;
+}
+
+static void decode_single_mask(float left, float top, float *mask_weights, float *mask_predict,
+                               int mask_width, int mask_height, unsigned char *mask_out,
+                               int mask_dim, int out_width, int out_height, cudaStream_t stream) {
+  // mask_weights is mask_dim(32 element) gpu pointer
+  dim3 grid((out_width + 31) / 32, (out_height + 31) / 32);
+  dim3 block(32, 32);
+
+  checkKernel(decode_single_mask_kernel<<<grid, block, 0, stream>>>(
+      left, top, mask_weights, mask_predict, mask_width, mask_height, mask_out, mask_dim, out_width,
+      out_height));
+}
+
+const char *type_name(Type type) {
+  switch (type) {
+    case Type::V5:
+      return "YoloV5";
+    case Type::V3:
+      return "YoloV3";
+    case Type::V7:
+      return "YoloV7";
+    case Type::X:
+      return "YoloX";
+    case Type::V8:
+      return "YoloV8";
+    default:
+      return "Unknow";
+  }
+}
+
+struct AffineMatrix {
+  float i2d[6];  // image to dst(network), 2x3 matrix
+  float d2i[6];  // dst to image, 2x3 matrix
+
+  void compute(const std::tuple<int, int> &from, const std::tuple<int, int> &to) {
+    float scale_x = get<0>(to) / (float)get<0>(from);
+    float scale_y = get<1>(to) / (float)get<1>(from);
+    float scale = std::min(scale_x, scale_y);
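+    // letterbox mapping: scale uniformly to fit the network input, center the image, and shift
+    // by scale*0.5 - 0.5 so that pixel centers (rather than corners) stay aligned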
+    i2d[0] = scale;
+    i2d[1] = 0;
+    i2d[2] = -scale * get<0>(from) * 0.5 + get<0>(to) * 0.5 + scale * 0.5 - 0.5;
+    i2d[3] = 0;
+    i2d[4] = scale;
+    i2d[5] = -scale * get<1>(from) * 0.5 + get<1>(to) * 0.5 + scale * 0.5 - 0.5;
+
+    double D = i2d[0] * i2d[4] - i2d[1] * i2d[3];
+    D = D != 0. ? double(1.) / D : double(0.);
+    double A11 = i2d[4] * D, A22 = i2d[0] * D, A12 = -i2d[1] * D, A21 = -i2d[3] * D;
+    double b1 = -A11 * i2d[2] - A12 * i2d[5];
+    double b2 = -A21 * i2d[2] - A22 * i2d[5];
+
+    d2i[0] = A11;
+    d2i[1] = A12;
+    d2i[2] = b1;
+    d2i[3] = A21;
+    d2i[4] = A22;
+    d2i[5] = b2;
+  }
+};
+
+InstanceSegmentMap::InstanceSegmentMap(int width, int height) {
+  this->width = width;
+  this->height = height;
+  checkRuntime(cudaMallocHost(&this->data, width * height));
+}
+
+InstanceSegmentMap::~InstanceSegmentMap() {
+  if (this->data) {
+    checkRuntime(cudaFreeHost(this->data));
+    this->data = nullptr;
+  }
+  this->width = 0;
+  this->height = 0;
+}
+
+class InferImpl : public Infer {
+ public:
+  shared_ptr<trt::Infer> trt_;
+  string engine_file_;
+  Type type_;
+  float confidence_threshold_;
+  float nms_threshold_;
+  vector<shared_ptr<trt::Memory<unsigned char>>> preprocess_buffers_;
+  trt::Memory<float> input_buffer_, bbox_predict_, output_boxarray_;
+  trt::Memory<float> segment_predict_;
+  int network_input_width_, network_input_height_;
+  Norm normalize_;
+  vector<int> bbox_head_dims_;
+  vector<int> segment_head_dims_;
+  int num_classes_ = 0;
+  bool has_segment_ = false;
+  bool isdynamic_model_ = false;
+  vector<shared_ptr<trt::Memory<unsigned char>>> box_segment_cache_;
+
+  virtual ~InferImpl() = default;
+
+  void adjust_memory(int batch_size) {
+    // the inference batch_size
+    size_t input_numel = network_input_width_ * network_input_height_ * 3;
+    input_buffer_.gpu(batch_size * input_numel);
+    bbox_predict_.gpu(batch_size * bbox_head_dims_[1] * bbox_head_dims_[2]);
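+    // per-image output layout: the first float is the box counter written by atomicAdd in the
+    // decode kernels, followed by up to MAX_IMAGE_BOXES boxes of NUM_BOX_ELEMENT floats; the
+    // 32-float header leaves headroom around the counter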
+    output_boxarray_.gpu(batch_size * (32 + MAX_IMAGE_BOXES * NUM_BOX_ELEMENT));
+    output_boxarray_.cpu(batch_size * (32 + MAX_IMAGE_BOXES * NUM_BOX_ELEMENT));
+
+    if (has_segment_)
+      segment_predict_.gpu(batch_size * segment_head_dims_[1] * segment_head_dims_[2] *
+                           segment_head_dims_[3]);
+
+    if ((int)preprocess_buffers_.size() < batch_size) {
+      for (int i = preprocess_buffers_.size(); i < batch_size; ++i)
+        preprocess_buffers_.push_back(make_shared<trt::Memory<unsigned char>>());
+    }
+  }
+
+  void preprocess(int ibatch, const Image &image,
+                  shared_ptr<trt::Memory<unsigned char>> preprocess_buffer, AffineMatrix &affine,
+                  void *stream = nullptr) {
+    affine.compute(make_tuple(image.width, image.height),
+                   make_tuple(network_input_width_, network_input_height_));
+
+    size_t input_numel = network_input_width_ * network_input_height_ * 3;
+    float *input_device = input_buffer_.gpu() + ibatch * input_numel;
+    size_t size_image = image.width * image.height * 3;
+    size_t size_matrix = upbound(sizeof(affine.d2i), 32);
+    uint8_t *gpu_workspace = preprocess_buffer->gpu(size_matrix + size_image);
+    float *affine_matrix_device = (float *)gpu_workspace;
+    uint8_t *image_device = gpu_workspace + size_matrix;
+
+    uint8_t *cpu_workspace = preprocess_buffer->cpu(size_matrix + size_image);
+    float *affine_matrix_host = (float *)cpu_workspace;
+    uint8_t *image_host = cpu_workspace + size_matrix;
+
+    // copy into pinned host buffers, then issue async H2D copies on the user stream
+    cudaStream_t stream_ = (cudaStream_t)stream;
+    memcpy(image_host, image.bgrptr, size_image);
+    memcpy(affine_matrix_host, affine.d2i, sizeof(affine.d2i));
+    checkRuntime(
+        cudaMemcpyAsync(image_device, image_host, size_image, cudaMemcpyHostToDevice, stream_));
+    checkRuntime(cudaMemcpyAsync(affine_matrix_device, affine_matrix_host, sizeof(affine.d2i),
+                                 cudaMemcpyHostToDevice, stream_));
+
+    warp_affine_bilinear_and_normalize_plane(image_device, image.width * 3, image.width,
+                                             image.height, input_device, network_input_width_,
+                                             network_input_height_, affine_matrix_device, 114,
+                                             normalize_, stream_);
+  }
+
+  bool load(const string &engine_file, Type type, float confidence_threshold, float nms_threshold) {
+    trt_ = trt::load(engine_file);
+    if (trt_ == nullptr) return false;
+
+    trt_->print();
+
+    this->type_ = type;
+    this->confidence_threshold_ = confidence_threshold;
+    this->nms_threshold_ = nms_threshold;
+
+    auto input_dim = trt_->static_dims(0);
+    bbox_head_dims_ = trt_->static_dims(1);
+    has_segment_ = type == Type::V8Seg;
+    if (has_segment_) {
+      bbox_head_dims_ = trt_->static_dims(2);
+      segment_head_dims_ = trt_->static_dims(1);
+    }
+    network_input_width_ = input_dim[3];
+    network_input_height_ = input_dim[2];
+    isdynamic_model_ = trt_->has_dynamic_dim();
+
+    if (type == Type::V5 || type == Type::V3 || type == Type::V7) {
+      normalize_ = Norm::alpha_beta(1 / 255.0f, 0.0f, ChannelType::SwapRB);
+      num_classes_ = bbox_head_dims_[2] - 5;
+    } else if (type == Type::V8) {
+      normalize_ = Norm::alpha_beta(1 / 255.0f, 0.0f, ChannelType::SwapRB);
+      num_classes_ = bbox_head_dims_[2] - 4;
+    } else if (type == Type::V8Seg) {
+      normalize_ = Norm::alpha_beta(1 / 255.0f, 0.0f, ChannelType::SwapRB);
+      num_classes_ = bbox_head_dims_[2] - 4 - segment_head_dims_[1];
+    } else if (type == Type::X) {
+      // float mean[] = {0.485, 0.456, 0.406};
+      // float std[]  = {0.229, 0.224, 0.225};
+      // normalize_ = Norm::mean_std(mean, std, 1/255.0f, ChannelType::SwapRB);
+      normalize_ = Norm::None();
+      num_classes_ = bbox_head_dims_[2] - 5;
+    } else {
+      INFO("Unsupport type %d", type);
+    }
+    return true;
+  }
+
+  virtual BoxArray forward(const Image &image, void *stream = nullptr) override {
+    auto output = forwards({image}, stream);
+    if (output.empty()) return {};
+    return output[0];
+  }
+
+  virtual vector<BoxArray> forwards(const vector<Image> &images, void *stream = nullptr) override {
+    int num_image = images.size();
+    if (num_image == 0) return {};
+
+    auto input_dims = trt_->static_dims(0);
+    int infer_batch_size = input_dims[0];
+    if (infer_batch_size != num_image) {
+      if (isdynamic_model_) {
+        infer_batch_size = num_image;
+        input_dims[0] = num_image;
+        if (!trt_->set_run_dims(0, input_dims)) return {};
+      } else {
+        if (infer_batch_size < num_image) {
+          INFO(
+              "When using static shape model, number of images[%d] must be "
+              "less than or equal to the maximum batch[%d].",
+              num_image, infer_batch_size);
+          return {};
+        }
+      }
+    }
+    adjust_memory(infer_batch_size);
+
+    vector<AffineMatrix> affine_matrixs(num_image);
+    cudaStream_t stream_ = (cudaStream_t)stream;
+    for (int i = 0; i < num_image; ++i)
+      preprocess(i, images[i], preprocess_buffers_[i], affine_matrixs[i], stream);
+
+    float *bbox_output_device = bbox_predict_.gpu();
+    vector<void *> bindings{input_buffer_.gpu(), bbox_output_device};
+
+    if (has_segment_) {
+      bindings = {input_buffer_.gpu(), segment_predict_.gpu(), bbox_output_device};
+    }
+
+    if (!trt_->forward(bindings, stream)) {
+      INFO("Failed to tensorRT forward.");
+      return {};
+    }
+
+    for (int ib = 0; ib < num_image; ++ib) {
+      float *boxarray_device =
+          output_boxarray_.gpu() + ib * (32 + MAX_IMAGE_BOXES * NUM_BOX_ELEMENT);
+      float *affine_matrix_device = (float *)preprocess_buffers_[ib]->gpu();
+      float *image_based_bbox_output =
+          bbox_output_device + ib * (bbox_head_dims_[1] * bbox_head_dims_[2]);
+      checkRuntime(cudaMemsetAsync(boxarray_device, 0, sizeof(int), stream_));
+      decode_kernel_invoker(image_based_bbox_output, bbox_head_dims_[1], num_classes_,
+                            bbox_head_dims_[2], confidence_threshold_, nms_threshold_,
+                            affine_matrix_device, boxarray_device, MAX_IMAGE_BOXES, type_, stream_);
+    }
+    checkRuntime(cudaMemcpyAsync(output_boxarray_.cpu(), output_boxarray_.gpu(),
+                                 output_boxarray_.gpu_bytes(), cudaMemcpyDeviceToHost, stream_));
+    checkRuntime(cudaStreamSynchronize(stream_));
+
+    vector<BoxArray> arrout(num_image);
+    int imemory = 0;
+    for (int ib = 0; ib < num_image; ++ib) {
+      float *parray = output_boxarray_.cpu() + ib * (32 + MAX_IMAGE_BOXES * NUM_BOX_ELEMENT);
+      int count = min(MAX_IMAGE_BOXES, (int)*parray);
+      BoxArray &output = arrout[ib];
+      output.reserve(count);
+      for (int i = 0; i < count; ++i) {
+        float *pbox = parray + 1 + i * NUM_BOX_ELEMENT;
+        int label = pbox[5];
+        int keepflag = pbox[6];
+        if (keepflag == 1) {
+          Box result_object_box(pbox[0], pbox[1], pbox[2], pbox[3], pbox[4], label);
+          if (has_segment_) {
+            int row_index = pbox[7];
+            int mask_dim = segment_head_dims_[1];
+            float *mask_weights = bbox_output_device +
+                                  (ib * bbox_head_dims_[1] + row_index) * bbox_head_dims_[2] +
+                                  num_classes_ + 4;
+
+            float *mask_head_predict = segment_predict_.gpu();
+            float left, top, right, bottom;
+            float *i2d = affine_matrixs[ib].i2d;
+            affine_project(i2d, pbox[0], pbox[1], &left, &top);
+            affine_project(i2d, pbox[2], pbox[3], &right, &bottom);
+
+            float box_width = right - left;
+            float box_height = bottom - top;
+
+            float scale_to_predict_x = segment_head_dims_[3] / (float)network_input_width_;
+            float scale_to_predict_y = segment_head_dims_[2] / (float)network_input_height_;
+            int mask_out_width = box_width * scale_to_predict_x + 0.5f;
+            int mask_out_height = box_height * scale_to_predict_y + 0.5f;
+
+            if (mask_out_width > 0 && mask_out_height > 0) {
+              if (imemory >= (int)box_segment_cache_.size()) {
+                box_segment_cache_.push_back(std::make_shared<trt::Memory<unsigned char>>());
+              }
+
+              int bytes_of_mask_out = mask_out_width * mask_out_height;
+              auto box_segment_output_memory = box_segment_cache_[imemory];
+              result_object_box.seg =
+                  make_shared<InstanceSegmentMap>(mask_out_width, mask_out_height);
+
+              unsigned char *mask_out_device = box_segment_output_memory->gpu(bytes_of_mask_out);
+              unsigned char *mask_out_host = result_object_box.seg->data;
+              decode_single_mask(left * scale_to_predict_x, top * scale_to_predict_y, mask_weights,
+                                 mask_head_predict + ib * segment_head_dims_[1] *
+                                                         segment_head_dims_[2] *
+                                                         segment_head_dims_[3],
+                                 segment_head_dims_[3], segment_head_dims_[2], mask_out_device,
+                                 mask_dim, mask_out_width, mask_out_height, stream_);
+              checkRuntime(cudaMemcpyAsync(mask_out_host, mask_out_device,
+                                           box_segment_output_memory->gpu_bytes(),
+                                           cudaMemcpyDeviceToHost, stream_));
+            }
+          }
+          output.emplace_back(result_object_box);
+        }
+      }
+    }
+
+    if (has_segment_) checkRuntime(cudaStreamSynchronize(stream_));
+
+    return arrout;
+  }
+};
+
+Infer *loadraw(const std::string &engine_file, Type type, float confidence_threshold,
+               float nms_threshold) {
+  InferImpl *impl = new InferImpl();
+  if (!impl->load(engine_file, type, confidence_threshold, nms_threshold)) {
+    delete impl;
+    impl = nullptr;
+  }
+  return impl;
+}
+
+shared_ptr<Infer> load(const string &engine_file, Type type, float confidence_threshold,
+                       float nms_threshold) {
+  return std::shared_ptr<InferImpl>(
+      (InferImpl *)loadraw(engine_file, type, confidence_threshold, nms_threshold));
+}
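+// Usage sketch (assumed caller; engine path and thresholds are placeholders):
+//   auto detector = yolo::load("yolov8n_fp16.trt", yolo::Type::V8, 0.25f, 0.5f);
+//   auto boxes = detector->forward(image);  // `image` is a yolo::Image wrapping a BGR frame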
+
+std::tuple<uint8_t, uint8_t, uint8_t> hsv2bgr(float h, float s, float v) {
+  const int h_i = static_cast<int>(h * 6);
+  const float f = h * 6 - h_i;
+  const float p = v * (1 - s);
+  const float q = v * (1 - f * s);
+  const float t = v * (1 - (1 - f) * s);
+  float r, g, b;
+  switch (h_i) {
+    case 0:
+      r = v, g = t, b = p;
+      break;
+    case 1:
+      r = q, g = v, b = p;
+      break;
+    case 2:
+      r = p, g = v, b = t;
+      break;
+    case 3:
+      r = p, g = q, b = v;
+      break;
+    case 4:
+      r = t, g = p, b = v;
+      break;
+    case 5:
+      r = v, g = p, b = q;
+      break;
+    default:
+      r = 1, g = 1, b = 1;
+      break;
+  }
+  return make_tuple(static_cast<uint8_t>(b * 255), static_cast<uint8_t>(g * 255),
+                    static_cast<uint8_t>(r * 255));
+}
+
+std::tuple<uint8_t, uint8_t, uint8_t> random_color(int id) {
+  float h_plane = ((((unsigned int)id << 2) ^ 0x937151) % 100) / 100.0f;
+  float s_plane = ((((unsigned int)id << 3) ^ 0x315793) % 100) / 100.0f;
+  return hsv2bgr(h_plane, s_plane, 1);
+}
+
+};  // namespace yolo