Commit fd5257a8 authored by xin.wang.waytous's avatar xin.wang.waytous

Deepinfer Model

parents
build
test
*.engine
*.onnx
*.wts
*.cache
*.jpg
*.png
{
"configurations": [
{
"name": "Linux",
"includePath": [
"${workspaceFolder}/src/**",
"/usr/local/cuda/include",
"/usr/local/include",
"/usr/include/**",
"/home/wangxin/projects/TensorRT-7.0.0/TensorRT/include",
],
"defines": [],
"compilerPath": "/usr/bin/gcc",
"cStandard": "c11",
"cppStandard": "c++17",
"intelliSenseMode": "linux-gcc-x64"
}
],
"version": 4
}
\ No newline at end of file
{
"files.associations": {
"vector": "cpp",
"cctype": "cpp",
"clocale": "cpp",
"cmath": "cpp",
"cstdarg": "cpp",
"cstddef": "cpp",
"cstdio": "cpp",
"cstdlib": "cpp",
"cstring": "cpp",
"ctime": "cpp",
"cwchar": "cpp",
"cwctype": "cpp",
"array": "cpp",
"atomic": "cpp",
"strstream": "cpp",
"*.tcc": "cpp",
"bitset": "cpp",
"chrono": "cpp",
"complex": "cpp",
"condition_variable": "cpp",
"cstdint": "cpp",
"deque": "cpp",
"list": "cpp",
"unordered_map": "cpp",
"exception": "cpp",
"algorithm": "cpp",
"functional": "cpp",
"optional": "cpp",
"ratio": "cpp",
"string_view": "cpp",
"system_error": "cpp",
"tuple": "cpp",
"type_traits": "cpp",
"fstream": "cpp",
"initializer_list": "cpp",
"iomanip": "cpp",
"iosfwd": "cpp",
"iostream": "cpp",
"istream": "cpp",
"limits": "cpp",
"memory": "cpp",
"mutex": "cpp",
"new": "cpp",
"ostream": "cpp",
"numeric": "cpp",
"sstream": "cpp",
"stdexcept": "cpp",
"streambuf": "cpp",
"thread": "cpp",
"cfenv": "cpp",
"cinttypes": "cpp",
"utility": "cpp",
"typeindex": "cpp",
"typeinfo": "cpp",
"bit": "cpp",
"map": "cpp",
"set": "cpp",
"iterator": "cpp",
"memory_resource": "cpp",
"random": "cpp",
"string": "cpp",
"unordered_set": "cpp"
}
}
\ No newline at end of file
name: TraDeS
inputNames: [cvImage1]
outputNames: [segInstances1]
globalParams:
inferBatchSize: 1
inputWidth: 640
inputHeight: 384
fixAspectRatio: false
units:
-
name: CameraSrc
inputNames: [cvImage1]
outputNames: [uint8Image1]
-
name: ResizeNorm
inputNames: [uint8Image1]
outputNames: [resizeNormImages]
inputMean: [0.40789654, 0.44719302, 0.47026115]
inputStd: [0.28863828, 0.27408164, 0.27809835]
useBGR: true
-
name: TRTInference
inputNames: [resizeNormImages]
outputNames: [hm, reh, wh, conv_weight, seg_feat]
runMode: 1 # 0-fp32 1-fp16 2-int8 (int8 not supported)
weightsPath: "configs/mots/trades_segv3.onnx"
engineFile: "configs/mots/trades_segv3_fp16.engine"
# calibImgPathFile: "configs/mots/trades_calib_imgs.txt"
# calibTableCache: "configs/mots/trades_calib_table.cache"
inferDynamic: false
maxBatchSize: 1
-
name: TraDesPostProcess
inputNames: [hm, reh, wh, conv_weight, seg_feat, uint8Image1]
outputNames: [segInstances1]
scoreThreshold: 0.2
truncatedThreshold: 0.05
classNumber: 3
classNames: [vehicle, person, cycle]
topK: 100
downScale: 4
segDims: 64
maxCntsLength: 500
name: YoloV5
inputNames: [cvImage1, cvImage2]
outputNames: [trackedDetections1, trackedDetections2]
globalParams:
inferBatchSize: 2
inputWidth: 640
inputHeight: 640
fixAspectRatio: true
units:
-
name: CameraSrc
inputNames: [cvImage1]
outputNames: [uint8Image1]
-
name: CameraSrc
inputNames: [cvImage2]
outputNames: [uint8Image2]
-
name: ResizeNorm
inputNames: [uint8Image1, uint8Image2]
outputNames: [resizeNormImages]
inputMean: [0, 0, 0]
inputStd: [1, 1, 1]
useBGR: false
-
name: YoloV5TRTInference
inputNames: [resizeNormImages]
outputNames: [num_detections, nmsed_boxes, nmsed_scores, nmsed_classes]
runMode: 1 # 0-fp32 1-fp16 2-int8 (int8 not supported)
weightsPath: "configs/thermal/yolov5s_640x640_thermal.wts"
engineFile: "configs/thermal/yolov5s_640x640_thermal_fp16.engine"
calibImgPathFile: "configs/thermal/yolov5s_calib_imgs.txt"
calibTableCache: "configs/thermal/yolov5s_calib_table.cache"
inferDynamic: true
maxBatchSize: 4
classNumber: 2
numAnchorsPerLevel: 3
numLevels: 3
startScale: 8
maxOutputBBoxCount: 1000
keepTopK: 100
# used when build engine, always be smaller than scoreThreshold.
# if you want it to take effect, you should delete the ${engineFile}
engineScoreThreshold: 0.1
nmsThreshold: 0.45
# yolov5
# n: 0.33+0.25 s: 0.33+0.5 m: 0.67+0.75 l: 1.0+1.0 x: 1.33+1.25
gd: 0.33
gw: 0.5
-
name: YoloV5PostProcess
inputNames: [num_detections, nmsed_boxes, nmsed_scores, nmsed_classes, uint8Image1, uint8Image2]
outputNames: [detections1, detections2]
scoreThreshold: 0.2 # used when inference, can be modified
truncatedThreshold: 0.05
keepTopK: 100
classNames: [vehicle, person]
-
name: ByteTracker
inputNames: [detections1]
outputNames: [trackedDetections1]
frame_rate: 20
# max saved tracked-objs every track
track_buffer: 30
# if obj.confidence < track_thresh and > detector.score_threshold, it will add to lower detections and used for tracking later.
track_thresh: 0.2
# if obj.confidence < high_thresh, it wouldn't be initialized.
high_thresh: 0.2
# iou threshold, if iou(a,b) > match_thresh, they matched
match_thresh: 0.8
# output detection after it was tracked in x frames
init_frames: 0
#ifndef DEEPINFER_BASE_BLOB_H_
#define DEEPINFER_BASE_BLOB_H_
#include <memory>
#include <limits>
#include "common/common.h"
#include "base/syncmem.h"
namespace waytous{
namespace deepinfer{
namespace base{
constexpr size_t kMaxBlobAxes = 32;
template <typename Dtype>
class Blob {
public:
Blob() : data_(), count_(0), capacity_(0), use_cuda_host_malloc_(false) {}
explicit Blob(bool use_cuda_host_malloc)
: data_(),
count_(0),
capacity_(0),
use_cuda_host_malloc_(use_cuda_host_malloc) {}
explicit Blob(const std::vector<int>& shape,
const bool use_cuda_host_malloc = false):
capacity_(0), use_cuda_host_malloc_(use_cuda_host_malloc) {
Reshape(shape);
};
explicit Blob(const std::vector<int>& shape,
Dtype* cpu_d_,
const bool use_cuda_host_malloc = false
):
capacity_(0), use_cuda_host_malloc_(use_cuda_host_malloc) {
Reshape(shape);
auto d_ = data_->mutable_cpu_data();
memcpy(d_, cpu_d_, data_->size());
};
Blob(const Blob&) = delete;
void operator=(const Blob&) = delete;
void Reshape(const std::vector<int>& shape){
CHECK_LE(shape.size(), kMaxBlobAxes);
count_ = 1;
shape_.resize(shape.size());
if (!shape_data_ || shape_data_->size() < shape.size() * sizeof(int)) {
shape_data_.reset(
new SyncedMemory(shape.size() * sizeof(int), use_cuda_host_malloc_));
}
int* shape_data = static_cast<int*>(shape_data_->mutable_cpu_data());
for (size_t i = 0; i < shape.size(); ++i) {
CHECK_GE(shape[i], 0);
if (count_ != 0) {
CHECK_LE(shape[i], std::numeric_limits<int>::max() / count_)
<< "blob size exceeds std::numeric_limits<int>::max()";
}
count_ *= shape[i];
shape_[i] = shape[i];
shape_data[i] = shape[i];
}
if (count_ > capacity_) {
capacity_ = count_;
data_.reset(
new SyncedMemory(capacity_ * sizeof(Dtype), use_cuda_host_malloc_));
}
};
void ReshapeLike(const Blob& other){
Reshape(other.shape());
};
inline const std::vector<int>& shape() const { return shape_; }
inline int shape(int index) const {
return shape_[CanonicalAxisIndex(index)];
}
inline int CanonicalAxisIndex(int axis_index) const {
CHECK_GE(axis_index, -num_axes())
<< "axis " << axis_index << " out of range for " << num_axes()
<< "-D Blob with shape ";
CHECK_LT(axis_index, num_axes())
<< "axis " << axis_index << " out of range for " << num_axes()
<< "-D Blob with shape ";
if (axis_index < 0) {
return axis_index + num_axes();
}
return axis_index;
}
inline int num_axes() const { return static_cast<int>(shape_.size()); }
inline int count() const { return count_; }
inline const std::shared_ptr<SyncedMemory>& data() const {
CHECK(data_);
return data_;
}
/// @brief Deprecated legacy shape accessor num: use shape(0) instead.
inline int num() const { return LegacyShape(0); }
/// @brief Deprecated legacy shape accessor channels: use shape(1) instead.
inline int channels() const { return LegacyShape(1); }
/// @brief Deprecated legacy shape accessor height: use shape(2) instead.
inline int height() const { return LegacyShape(2); }
/// @brief Deprecated legacy shape accessor width: use shape(3) instead.
inline int width() const { return LegacyShape(3); }
inline int LegacyShape(int index) const {
CHECK_LE(num_axes(), 4)
<< "Cannot use legacy accessors on Blobs with > 4 axes.";
CHECK_LT(index, 4);
CHECK_GE(index, -4);
if (index >= num_axes() || index < -num_axes()) {
// Axis is out of range, but still in [0, 3] (or [-4, -1] for reverse
// indexing) -- this special case simulates the one-padding used to fill
// extraneous axes of legacy blobs.
return 1;
}
return shape(index);
}
inline int offset(const int n, const int c = 0, const int h = 0,
const int w = 0) const {
CHECK_GE(n, 0);
CHECK_LE(n, num());
CHECK_GE(channels(), 0);
CHECK_LE(c, channels());
CHECK_GE(height(), 0);
CHECK_LE(h, height());
CHECK_GE(width(), 0);
CHECK_LE(w, width());
return ((n * channels() + c) * height() + h) * width() + w;
}
inline int offset(const std::vector<int>& indices) const {
CHECK_LE(indices.size(), static_cast<size_t>(num_axes()));
int offset = 0;
for (int i = 0; i < num_axes(); ++i) {
offset *= shape(i);
if (static_cast<int>(indices.size()) > i) {
CHECK_GE(indices[i], 0);
CHECK_LT(indices[i], shape(i));
offset += indices[i];
}
}
return offset;
}
const Dtype* cpu_data() const {
CHECK(data_);
return (const Dtype*)data_->cpu_data();
};
void set_cpu_data(Dtype* data){
CHECK(data);
// Make sure CPU and GPU sizes remain equal
size_t size = count_ * sizeof(Dtype);
if (data_->size() != size) {
data_.reset(new SyncedMemory(size, use_cuda_host_malloc_));
}
data_->set_cpu_data(data);
};
const int* gpu_shape() const{
CHECK(shape_data_);
return (const int*)shape_data_->gpu_data();
};
const Dtype* gpu_data() const {
CHECK(data_);
return (const Dtype*)data_->gpu_data();
};
void set_gpu_data(Dtype* data){
CHECK(data);
// Make sure CPU and GPU sizes remain equal
size_t size = count_ * sizeof(Dtype);
if (data_->size() != size) {
data_.reset(new SyncedMemory(size, use_cuda_host_malloc_));
}
data_->set_gpu_data(data);
};
Dtype* mutable_cpu_data(){
CHECK(data_);
return static_cast<Dtype*>(data_->mutable_cpu_data());
};
Dtype* mutable_gpu_data(){
CHECK(data_);
return static_cast<Dtype*>(data_->mutable_gpu_data());
};
void set_head_gpu() { data_->set_head_gpu(); }
void set_head_cpu() { data_->set_head_cpu(); }
SyncedMemory::SyncedHead head() const { return data_->head(); }
// void CopyFrom(const Blob<Dtype>& source, bool reshape = false);
void ShareData(const Blob& other){
CHECK_EQ(count_, other.count());
data_ = other.data();
};
protected:
std::shared_ptr<SyncedMemory> data_;
std::shared_ptr<SyncedMemory> shape_data_;
std::vector<int> shape_;
int count_;
int capacity_;
bool use_cuda_host_malloc_;
};
template <typename Dtype>
using BlobPtr = std::shared_ptr<Blob<Dtype>>;
template <typename Dtype>
using BlobConstPtr = std::shared_ptr<const Blob<Dtype>>;
} //namespace base
} //namespace deepinfer
} //namespace waytous
#endif
#ifndef DEEPINFER_BASE_IMAGE_H_
#define DEEPINFER_BASE_IMAGE_H_
#pragma once
#include <string>
#include <memory>
#include <map>
#include <opencv2/opencv.hpp>
#include "base/blob.h"
namespace waytous{
namespace deepinfer{
namespace base{
// Pixel layout of an Image8U.
enum class Color {
NONE = 0x00,
GRAY = 0x01,
RGB = 0x02,
BGR = 0x03,
};
// Channel count for each color layout.
const std::map<Color, int> kChannelsMap{
{Color::GRAY, 1}, {Color::RGB, 3}, {Color::BGR, 3}};
// Lightweight view over an HWC uint8 Blob: copies of an Image8U share the
// same blob, and ROI views alias a sub-rectangle via `offset_`.
class Image8U{
public:
Image8U()
: rows_(0),
cols_(0),
type_(Color::NONE),
channels_(0),
width_step_(0),
blob_(nullptr),
offset_(0) {}
// Wraps an existing 3-axis (H, W, C) blob without copying; `offset` is the
// element offset of this view's first pixel within the blob.
Image8U(int rows, int cols, Color type, std::shared_ptr<Blob<uint8_t>> blob,
int offset = 0)
: rows_(rows), cols_(cols), type_(type), blob_(blob), offset_(offset) {
channels_ = kChannelsMap.at(type);
CHECK_EQ(blob_->num_axes(), 3);
CHECK_EQ(blob_->shape(2), channels_);
// Last addressable pixel of the view must fall inside the blob.
CHECK_LE(offset_ + blob_->offset({rows - 1, cols - 1, channels_ - 1}),
(int)(blob_->count()));
// Bytes between the starts of consecutive rows (full-blob row stride).
width_step_ = blob_->offset({1, 0, 0}) * static_cast<int>(sizeof(uint8_t));
}
// Allocates a fresh rows x cols x channels blob owned by this image.
Image8U(int rows, int cols, Color type)
: rows_(rows), cols_(cols), type_(type), offset_(0) {
channels_ = kChannelsMap.at(type);
blob_.reset(new Blob<uint8_t>({rows_, cols_, channels_}));
width_step_ = blob_->offset({1, 0, 0}) * static_cast<int>(sizeof(uint8_t));
}
// use cv::image to init blob_
// NOTE(review): the blob adopts the Mat's pixel buffer via set_cpu_data
// (no copy, no ownership), so `image` must outlive this Image8U; type is
// hard-coded to BGR and a densely packed (continuous) Mat is assumed --
// confirm at call sites.
Image8U(cv::Mat& image){
rows_ = image.rows;
cols_ = image.cols;
type_ = Color::BGR;
offset_ = 0;
channels_ = image.channels();
blob_.reset(new Blob<uint8_t>({rows_, cols_, channels_}));
blob_->set_cpu_data(image.data);
width_step_ = blob_->offset({1, 0, 0}) * static_cast<int>(sizeof(uint8_t));
}
// use cv::image to init blob_
// Pointer overload of the constructor above; same non-owning caveats apply.
Image8U(cv::Mat* image){
rows_ = image->rows;
cols_ = image->cols;
type_ = Color::BGR;
offset_ = 0;
channels_ = image->channels();
blob_.reset(new Blob<uint8_t>({rows_, cols_, channels_}));
blob_->set_cpu_data(image->data);
width_step_ = blob_->offset({1, 0, 0}) * static_cast<int>(sizeof(uint8_t));
}
// convert to opencv img
// NOTE(review): the returned Mat's data pointer is re-targeted at the
// blob's host memory, so it does not own (or keep alive) that memory --
// keep this Image8U alive while the Mat is in use. Channel counts other
// than 1 or 3 leave `image` default-constructed before the data swap.
cv::Mat toCVMat(){
cv::Mat image;
if(channels_ == 1){
image = cv::Mat(rows_, cols_, CV_8UC1);
}else if (channels_ == 3){
image = cv::Mat(rows_, cols_, CV_8UC3);
}
image.data = blob_->mutable_cpu_data();
return image;
}
// Shallow copy: shares the same underlying blob.
Image8U(const Image8U &src)
: rows_(src.rows_),
cols_(src.cols_),
type_(src.type_),
channels_(src.channels_),
width_step_(src.width_step_),
blob_(src.blob_),
offset_(src.offset_) {}
// Shallow assignment: shares the same underlying blob.
Image8U &operator=(const Image8U &src) {
this->rows_ = src.rows_;
this->cols_ = src.cols_;
this->type_ = src.type_;
this->channels_ = src.channels_;
this->width_step_ = src.width_step_;
this->blob_ = src.blob_;
this->offset_ = src.offset_;
return *this;
}
~Image8U() {}
uint8_t *mutable_cpu_data() { return mutable_cpu_ptr(0); }
uint8_t *mutable_gpu_data() { return mutable_gpu_ptr(0); }
const uint8_t *cpu_data() const { return cpu_ptr(0); }
const uint8_t *gpu_data() const { return gpu_ptr(0); }
// Pointer to the first element of `row` within this view.
const uint8_t *cpu_ptr(int row = 0) const {
return blob_->cpu_data() + blob_->offset({row, 0, 0}) + offset_;
}
const uint8_t *gpu_ptr(int row = 0) const {
return blob_->gpu_data() + blob_->offset({row, 0, 0}) + offset_;
}
uint8_t *mutable_cpu_ptr(int row = 0) {
return blob_->mutable_cpu_data() + blob_->offset({row, 0, 0}) + offset_;
}
uint8_t *mutable_gpu_ptr(int row = 0) {
return blob_->mutable_gpu_data() + blob_->offset({row, 0, 0}) + offset_;
}
Color type() const { return type_; }
int rows() const { return rows_; }
int cols() const { return cols_; }
int channels() const { return channels_; }
int width_step() const { return width_step_; }
// @brief: returns the total number of pixels.
int total() const { return rows_ * cols_ * channels_; }
// ROI view sharing the same blob (no copy); row stride stays that of the
// full image.
Image8U operator()(const cv::Rect2i &roi) {
int offset = offset_ + blob_->offset({roi.y, roi.x, 0});
// return Image8U(roi.height, roi.width, type_, blob_, offset);
return Image8U(roi.height, roi.width, type_, blob_, offset);
}
// Heap-allocated ROI view; caller owns the returned pointer.
Image8U* newRoiImage(const std::shared_ptr<cv::Rect2i> roi) {
int offset = offset_ + blob_->offset({roi->y, roi->x, 0});
// return Image8U(roi->height, roi->width, type_, blob_, offset);
return new Image8U(roi->height, roi->width, type_, blob_, offset);
}
std::shared_ptr<Blob<uint8_t>> blob() { return blob_; }
// DONOT return `std::shared_ptr<const Blob<uint8_t>> &` or `const std::... &`
std::shared_ptr<const Blob<uint8_t>> blob() const { return blob_; }
protected:
int rows_;
int cols_;
Color type_;
int channels_;
int width_step_;
std::shared_ptr<Blob<uint8_t>> blob_;
int offset_;
}; // class Image8U
typedef std::shared_ptr<Image8U> Image8UPtr;
typedef std::shared_ptr<const Image8U> Image8UConstPtr;
} //namespace base
} //namespace deepinfer
} //namespace waytous
#endif
#ifndef DEEPINFER_BASE_PIPELINE_H_
#define DEEPINFER_BASE_PIPELINE_H_
#include <algorithm>
#include <iostream>
#include <numeric>
#include <string>
#include <unordered_set>
#include <utility>
#include <vector>
#include <thread>
#include <atomic>
#include <yaml-cpp/yaml.h>
#include "common/register.h"
using namespace std;
namespace waytous {
namespace deepinfer {
namespace base {
// Common lifecycle contract for every pipeline component.
class LifeCycle {
public:
    // Virtual destructor: instances are held and destroyed through
    // base-class (smart) pointers, which is undefined behavior without it.
    virtual ~LifeCycle() = default;
    virtual void init(YAML::Node& node) = 0;
    virtual void startUp() = 0;
    virtual void shutDown() = 0;
};
// A lifecycle-aware processing node that consumes values of type T.
template<typename T>
class Component : public LifeCycle {
public:
virtual std::string getName() = 0;
// Process one input value (implementations forward results downstream).
virtual void execute(T t) = 0;
};
/**
 * @brief Base pipeline node: runs doExecute() on the input and fans the
 * result out to every registered downstream component.
 * @tparam T input type
 * @tparam R result type; void marks a terminal node with no downstream fan-out
 */
template<typename T, typename R>
class AbstractComponent : public Component<T> {
private:
    // Downstream consumers of this component's result.
    std::unordered_set<shared_ptr<Component<R>>> down_stream;
protected:
    const std::unordered_set<shared_ptr<Component<R>>> &getDownStream() {
        return down_stream;
    }
protected:
    // Subclass hook: transform one input into one result.
    virtual R doExecute(T t) = 0;
public:
    void addDownStream(shared_ptr<Component<R>> component) {
        down_stream.insert(component);
    }
    void init(YAML::Node& node) override {
    }
    void execute(T t) override {
        if constexpr (is_same_v<R, void>) {
            // Terminal node: nothing to forward. The result declaration must
            // live inside this constexpr branch -- the original
            // `R r = doExecute(t);` before the check was ill-formed for
            // R = void and could never compile.
            doExecute(t);
            cout << this->getName() + "\treceive\t" << typeid(t).name() << "\t" << t << endl;
            return;
        } else {
            R r = doExecute(t);
            cout << this->getName() + "\treceive\t" << typeid(t).name() << "\t" << t << "\treturn\t" << typeid(r).name()
                 << "\t" << r << endl;
            for (auto &&obj : getDownStream()) {
                obj->execute(r);
            }
        }
    }
    void startUp() override {
        // Start downstream nodes first so they are ready before data flows.
        for (auto &&obj : this->getDownStream()) {
            obj->startUp();
        }
        cout << "------------------ " + this->getName() + " is starting ----------------------" << endl;
    }
    void shutDown() override {
        // Bind by const reference: `auto` here used to copy the whole set.
        const auto &downStreams = this->getDownStream();
        for (auto &&obj : downStreams) {
            obj->shutDown();
        }
        cout << "------------------ " + this->getName() + " is ending ----------------------" << endl;
    }
};
// Role aliases: Source produces, Channel transforms, Sink consumes; all share
// AbstractComponent's fan-out behavior.
template<typename T, typename R>
using Source = AbstractComponent<T, R>;
// DEEPINFER_REGISTER_REGISTERER(Source);
// #define DEEPINFER_REGISTER_INFERENCE_ENGINE(name) \
// DEEPINFER_REGISTER_CLASS(Source, name)
template<typename T, typename R>
using Channel = AbstractComponent<T, R>;
template<typename T, typename R>
using Sink = AbstractComponent<T, R>;
// Owns the source node and drives the whole chain through it.
template<typename R, typename T>
class Pipeline : public LifeCycle {
private:
shared_ptr<Source<R, T>> source;
public:
void setSource(shared_ptr<Source<R, T>> component) {
source = component;
}
void init(YAML::Node& node) override {
}
void startUp() override {
// Fail fast if no source was configured.
assert(source.get() != nullptr);
source->startUp();
}
void shutDown() override {
// NOTE(review): unlike startUp() there is no null check here; calling
// shutDown() before setSource() dereferences null -- confirm call order.
source->shutDown();
}
};
/*
class printSink;
class intStringChannel;
class printSink : public Sink<string, int> {
public:
string getName() override {
return "printSink";
}
protected:
int doExecute(string t) override {
return INT_MIN;
}
};
class intStringChannel : public Channel<int, string> {
public:
void init(std::string config) override {
}
string getName() override {
return "intStringChannel";
}
void startUp() override {
}
void shutDown() override {
}
protected:
string doExecute(int t) override {
return to_string(t + 100);
}
};
class IntSource : public Source<int, int> {
private:
int val = 0;
public:
void init(std::string config) override {
cout << "--------- " + getName() + " init --------- ";
val = 1;
}
string getName() override {
return "Int Source";
}
void startUp() override {
this->execute(val);
}
protected:
int doExecute(int) override {
return val + 1;
}
};
int main() {
pipeline<int, int> p;
// source
auto is = make_shared<IntSource>();
// channel
auto isc = make_shared<intStringChannel>();
// sink
auto ps = make_shared<printSink>();
is->addDownStream(isc);
isc->addDownStream(ps);
// 设置 source
p.setSource(is);
// 启动
p.startUp();
}
*/
} // namespace base
} // namespace deepinfer
} // namespace waytous
#endif
#include "base/syncmem.h"
namespace waytous{
namespace deepinfer{
namespace base{
// Creates an empty (size 0) buffer; use_cuda selects pinned host allocation.
// No memory is allocated until first access (see to_cpu()/to_gpu()).
SyncedMemory::SyncedMemory(bool use_cuda)
: cpu_ptr_(NULL),
gpu_ptr_(NULL),
size_(0),
head_(UNINITIALIZED),
own_cpu_data_(false),
cpu_malloc_use_cuda_(use_cuda),
own_gpu_data_(false),
device_(-1) {
#ifdef PERCEPTION_DEBUG
// Record the current CUDA device for later sanity checks (check_device()).
CUDA_CHECK(cudaGetDevice(&device_));
#endif
}
// Reserves `size` bytes; actual allocation is deferred until first access.
SyncedMemory::SyncedMemory(size_t size, bool use_cuda)
: cpu_ptr_(NULL),
gpu_ptr_(NULL),
size_(size),
head_(UNINITIALIZED),
own_cpu_data_(false),
cpu_malloc_use_cuda_(use_cuda),
own_gpu_data_(false),
device_(-1) {
#ifdef PERCEPTION_DEBUG
CUDA_CHECK(cudaGetDevice(&device_));
#endif
}
// Frees only buffers this object allocated itself; adopted buffers from
// set_cpu_data()/set_gpu_data() (own_*_data_ == false) are left alone.
SyncedMemory::~SyncedMemory() {
check_device();
if (cpu_ptr_ && own_cpu_data_) {
PerceptionFreeHost(cpu_ptr_, cpu_malloc_use_cuda_);
}
if (gpu_ptr_ && own_gpu_data_) {
CUDA_CHECK(cudaFree(gpu_ptr_));
}
}
// Ensures a valid, up-to-date host copy exists.
// UNINITIALIZED: allocate and zero host memory; HEAD_AT_GPU: allocate if
// needed and copy device -> host; HEAD_AT_CPU/SYNCED: nothing to do.
inline void SyncedMemory::to_cpu() {
  check_device();
  switch (head_) {
    case UNINITIALIZED:
      PerceptionMallocHost(&cpu_ptr_, size_, cpu_malloc_use_cuda_);
      if (cpu_ptr_ == nullptr) {
        LOG_ERROR << "cpu_ptr_ is null";
        return;
      }
      memset(cpu_ptr_, 0, size_);
      head_ = HEAD_AT_CPU;
      own_cpu_data_ = true;
      break;
    case HEAD_AT_GPU:
      if (cpu_ptr_ == nullptr) {
        PerceptionMallocHost(&cpu_ptr_, size_, cpu_malloc_use_cuda_);
        // Mirror the UNINITIALIZED branch: bail out instead of handing a
        // null destination to cudaMemcpy when host allocation failed
        // (the original only null-checked the first branch).
        if (cpu_ptr_ == nullptr) {
          LOG_ERROR << "cpu_ptr_ is null";
          return;
        }
        own_cpu_data_ = true;
      }
      CUDA_CHECK(cudaMemcpy(cpu_ptr_, gpu_ptr_, size_, cudaMemcpyDefault));
      head_ = SYNCED;
      break;
    case HEAD_AT_CPU:
    case SYNCED:
      break;
  }
}
// Ensures a valid, up-to-date device copy exists.
// UNINITIALIZED: allocate and zero device memory; HEAD_AT_CPU: allocate if
// needed and copy host -> device; HEAD_AT_GPU/SYNCED: nothing to do.
inline void SyncedMemory::to_gpu() {
check_device();
switch (head_) {
case UNINITIALIZED:
CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_));
CUDA_CHECK(cudaMemset(gpu_ptr_, 0, size_));
head_ = HEAD_AT_GPU;
own_gpu_data_ = true;
// LOG_INFO << "gpu init malloc: " << size_;
break;
case HEAD_AT_CPU:
if (gpu_ptr_ == nullptr) {
CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_));
own_gpu_data_ = true;
// LOG_INFO << "gpu copy malloc: " << size_;
}
CUDA_CHECK(cudaMemcpy(gpu_ptr_, cpu_ptr_, size_, cudaMemcpyDefault));
// LOG_INFO << "gpu copy done";
head_ = SYNCED;
break;
case HEAD_AT_GPU:
case SYNCED:
break;
}
}
// Read-only host view; synchronizes device -> host first when the GPU copy
// is newer. Ownership and head-state semantics are unchanged by the read.
const void* SyncedMemory::cpu_data() {
  check_device();
  to_cpu();
  return cpu_ptr_;
}
// Adopts an externally owned host buffer: frees any buffer we owned, then
// records the new pointer without taking ownership (own_cpu_data_ = false).
// NOTE(review): assumes `data` holds at least size_ bytes -- confirm callers.
void SyncedMemory::set_cpu_data(void* data) {
check_device();
CHECK(data);
if (own_cpu_data_) {
PerceptionFreeHost(cpu_ptr_, cpu_malloc_use_cuda_);
}
cpu_ptr_ = data;
head_ = HEAD_AT_CPU;
own_cpu_data_ = false;
}
// Read-only device view; synchronizes host -> device first when the CPU copy
// is newer. Ownership and head-state semantics are unchanged by the read.
const void* SyncedMemory::gpu_data() {
  check_device();
  to_gpu();
  return gpu_ptr_;
}
// Adopts an externally owned device buffer: frees any buffer we owned, then
// records the new pointer without taking ownership (own_gpu_data_ = false).
// NOTE(review): assumes `data` holds at least size_ bytes -- confirm callers.
void SyncedMemory::set_gpu_data(void* data) {
check_device();
CHECK(data);
if (own_gpu_data_) {
CUDA_CHECK(cudaFree(gpu_ptr_));
}
gpu_ptr_ = data;
head_ = HEAD_AT_GPU;
own_gpu_data_ = false;
}
// Writable host pointer; marks the CPU side as freshest so the next device
// access re-uploads.
void* SyncedMemory::mutable_cpu_data() {
check_device();
to_cpu();
head_ = HEAD_AT_CPU;
return cpu_ptr_;
}
// Writable device pointer; marks the GPU side as freshest so the next host
// access re-downloads.
void* SyncedMemory::mutable_gpu_data() {
check_device();
to_gpu();
head_ = HEAD_AT_GPU;
return gpu_ptr_;
}
// Asynchronously copies host -> device on `stream`. Requires the CPU copy to
// be the freshest (head_ == HEAD_AT_CPU); allocates device memory on demand.
void SyncedMemory::async_gpu_push(const cudaStream_t& stream) {
check_device();
CHECK_EQ(head_, HEAD_AT_CPU);
if (gpu_ptr_ == nullptr) {
CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_));
own_gpu_data_ = true;
LOG_INFO << "gpu async malloc: " << size_;
}
const cudaMemcpyKind put = cudaMemcpyHostToDevice;
CUDA_CHECK(cudaMemcpyAsync(gpu_ptr_, cpu_ptr_, size_, put, stream));
// Assume caller will synchronize on the stream before use
head_ = SYNCED;
}
// Debug-only sanity check: verifies the current CUDA device (and the device
// that owns gpu_ptr_) matches the one recorded at construction. Compiles to
// a no-op unless PERCEPTION_DEBUG is defined.
void SyncedMemory::check_device() {
#ifdef PERCEPTION_DEBUG
int device;
cudaGetDevice(&device);
CHECK_EQ(device, device_);
if (gpu_ptr_ && own_gpu_data_) {
cudaPointerAttributes attributes;
CUDA_CHECK(cudaPointerGetAttributes(&attributes, gpu_ptr_));
CHECK_EQ(attributes.device, device_);
}
#endif
}
} //namespace base
} //namespace deepinfer
} //namespace waytous
#ifndef DEEPINFER_BASE_SYNCMEM_H_
#define DEEPINFER_BASE_SYNCMEM_H_
#include "common/log.h"
#include "common/common.h"
namespace waytous{
namespace deepinfer{
namespace base{
// Allocates `size` bytes of host memory into *ptr: pinned CUDA host memory
// when use_cuda is true, plain malloc otherwise. Aborts via the respective
// check macro when the allocation fails.
inline void PerceptionMallocHost(void** ptr, size_t size, bool use_cuda) {
  if (use_cuda) {
    CUDA_CHECK(cudaMallocHost(ptr, size));
  } else {
    *ptr = malloc(size);
    CHECK(*ptr) << "host allocation of size " << size << " failed";
  }
}
// Releases host memory obtained from PerceptionMallocHost; `use_cuda` must
// match the flag used at allocation time.
inline void PerceptionFreeHost(void* ptr, bool use_cuda) {
  if (use_cuda) {
    CUDA_CHECK(cudaFreeHost(ptr));
  } else {
    free(ptr);
  }
}
/**
 * @brief Manages memory allocation and synchronization between the host (CPU)
 * and device (GPU).
 *
 * Holds a lazily-allocated buffer pair; head_ tracks which side has the
 * freshest data, and the accessors synchronize on demand.
 */
class SyncedMemory {
 public:
  // Which side currently holds the freshest copy of the data.
  enum SyncedHead { UNINITIALIZED, HEAD_AT_CPU, HEAD_AT_GPU, SYNCED };

  /// @param use_cuda allocate host memory with cudaMallocHost (pinned).
  explicit SyncedMemory(bool use_cuda);
  SyncedMemory(size_t size, bool use_cuda);
  SyncedMemory(const SyncedMemory&) = delete;
  void operator=(const SyncedMemory&) = delete;
  ~SyncedMemory();

  // Read-only accessors: synchronize if needed, leave head_ SYNCED.
  const void* cpu_data();
  const void* gpu_data();
  // Adopt an externally owned buffer (never freed by this object).
  void set_cpu_data(void* data);
  void set_gpu_data(void* data);
  // Mutable accessors: mark the returned side as the freshest copy.
  void* mutable_cpu_data();
  void* mutable_gpu_data();

  SyncedHead head() const { return head_; }
  void set_head(SyncedHead head) { head_ = head; }
  void set_head_gpu() { set_head(HEAD_AT_GPU); }
  void set_head_cpu() { set_head(HEAD_AT_CPU); }
  // const-qualified so it can be called through const references (the
  // original non-const accessor could not be).
  size_t size() const { return size_; }

  /// Asynchronous host -> device copy on `stream`; the caller must
  /// synchronize on the stream before using the device data.
  void async_gpu_push(const cudaStream_t& stream);

 private:
  void check_device();
  void to_cpu();
  void to_gpu();

 private:
  void* cpu_ptr_;
  void* gpu_ptr_;
  size_t size_;
  SyncedHead head_;
  bool own_cpu_data_;         // true if this object allocated cpu_ptr_
  bool cpu_malloc_use_cuda_;  // pinned host allocation flag
  bool own_gpu_data_;         // true if this object allocated gpu_ptr_
  int device_;                // device id (checked under PERCEPTION_DEBUG)
};  // class SyncedMemory
} //namespace base
} //namespace deepinfer
} //namespace waytous
#endif
\ No newline at end of file
#ifndef DEEPINFER_COMMON_H_
#define DEEPINFER_COMMON_H_
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <iostream>
#include <assert.h>
// Default CUDA launch configuration: threads per block.
#define BLOCK 512
// NOTE(review): function-like macros evaluate their arguments more than once;
// do not pass expressions with side effects.
#define iMAX(a, b) ((a) > (b) ? (a) : (b))
#define iMIN(a, b) ((a) > (b) ? (b) : (a))
// Logs and asserts when a CUDA runtime call does not return cudaSuccess.
#ifndef CUDA_CHECK
#define CUDA_CHECK(callstr) \
{ \
cudaError_t error_code = callstr; \
if (error_code != cudaSuccess) { \
std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \
assert(0); \
} \
}
#endif
// Computes a grid size covering `n` work items with `block` threads per
// block; spills into a second grid dimension when the x extent would exceed
// the 65535 limit.
inline dim3 cudaGridSize(uint n, uint block)
{
    const uint blocks_needed = (n - 1) / block + 1;
    uint x = blocks_needed;
    uint y = 1;
    if (x > 65535)
    {
        // Split roughly square: x ~= sqrt(blocks), y covers the remainder.
        x = static_cast<uint>(ceil(sqrt(static_cast<double>(x))));
        y = (n - 1) / (x * block) + 1;
    }
    return dim3{x, y, 1};
}
#endif
#include "common/file.h"
namespace waytous{
namespace deepinfer{
namespace common{
// Definition of the lazily-created singleton instance.
std::shared_ptr<ConfigRoot> ConfigRoot::instance = nullptr;
// Returns the configured root path; falls back to "" (with a warning) when
// SetRootPath() was never called.
// NOTE(review): lazy init is not thread-safe -- confirm callers initialize
// this before any concurrent use.
const std::string ConfigRoot::GetRootPath(){
if(instance == nullptr){
SetRootPath("");
LOG_WARN << "Haven't init config root, use default ''.";
}
return instance->rootPath;
}
void ConfigRoot::SetRootPath(const std::string path){
if(instance == nullptr){
instance.reset(new ConfigRoot());
}
instance->rootPath = path;
}
} //namespace common
} //namespace deepinfer
} //namespace waytous
#ifndef WAYTOUS_DEEPINFER_COMMON_FILES_H_
#define WAYTOUS_DEEPINFER_COMMON_FILES_H_
#pragma once
#include <string>
#include <memory>
#include <sys/stat.h>
#include <unistd.h>
#include "common/log.h"
namespace waytous{
namespace deepinfer{
namespace common{
// Process-wide holder of the configuration root directory, used to resolve
// relative config paths (see GetAbsolutePath below).
class ConfigRoot{
public:
// Lazily-created singleton; defined in file.cpp.
static std::shared_ptr<ConfigRoot> instance;
public:
std::string rootPath;
// Returns the root path, defaulting to "" (with a warning) if never set.
static const std::string GetRootPath();
static void SetRootPath(const std::string path);
};
// Returns the process working directory, or "" if getcwd fails (e.g. the
// path exceeds the 4096-byte buffer).
inline std::string GetCurrentPath() {
  char buffer[4096];
  if (getcwd(buffer, sizeof(buffer)) == nullptr) {
    return std::string("");
  }
  return std::string(buffer);
}
// Joins `prefix` and `relative_path` with a single '/'. An empty relative
// path returns the prefix; an absolute relative path (or empty prefix)
// returns the relative path unchanged.
inline std::string GetAbsolutePath(const std::string &prefix,
                                   const std::string &relative_path) {
  if (relative_path.empty()) {
    return prefix;
  }
  // Nothing to prepend, or the path is already absolute.
  if (prefix.empty() || relative_path.front() == '/') {
    return relative_path;
  }
  const bool prefix_has_sep = (prefix.back() == '/');
  return prefix_has_sep ? prefix + relative_path
                        : prefix + "/" + relative_path;
}
// True when `path` exists (file or directory) according to stat(2).
inline bool PathExists(const std::string &path) {
  struct stat info;
  const int ret = stat(path.c_str(), &info);
  return ret == 0;
}
} //namespace common
} //namespace deepinfer
} //namespace waytous
#endif
\ No newline at end of file
#ifndef DEEPINFER_COMMON_LOG_H_
#define DEEPINFER_COMMON_LOG_H_
#pragma once
#include <string>
//sudo apt-get install libgoogle-glog*
#include "glog/logging.h"
// glog-backed stream loggers, used as e.g. `LOG_INFO << "message";`.
#define LOG_INFO google::LogMessage(__FILE__, __LINE__, google::INFO).stream()
#define LOG_WARN google::LogMessage(__FILE__, __LINE__, google::WARNING).stream()
#define LOG_ERROR google::LogMessage(__FILE__, __LINE__, google::ERROR).stream()
// FATAL severity aborts the process after logging (glog behavior).
#define LOG_FATAL google::LogMessage(__FILE__, __LINE__, google::FATAL).stream()
// Verbose debug logging; emitted only when glog verbosity is >= 4 (--v=4).
#define LOG_DEBUG VLOG(4) << "[DEBUG]"
#endif
#include "common/register.h"
namespace waytous {
namespace deepinfer {
namespace common {
// Returns the process-wide base-class -> (name -> factory) registry.
// Function-local static: initialized on first use, thread-safe since C++11.
BaseClassMap &GlobalFactoryMap() {
static BaseClassMap factory_map;
return factory_map;
}
/**
 * @brief Appends the names of all classes registered under
 * `base_class_name` to `registered_derived_classes_names`.
 * @return false (with an error log) when the output pointer is null or the
 * base class has no registrations; true otherwise.
 */
bool GetRegisteredClasses(
    const std::string &base_class_name,
    std::vector<std::string> *registered_derived_classes_names) {
  if (registered_derived_classes_names == nullptr) {
    LOG_ERROR << "registered_derived_classes_names is not available";
    return false;
  }
  BaseClassMap &map = GlobalFactoryMap();
  auto iter = map.find(base_class_name);
  if (iter == map.end()) {
    LOG_ERROR << "class not registered:" << base_class_name;
    return false;
  }
  registered_derived_classes_names->reserve(
      registered_derived_classes_names->size() + iter->second.size());
  // Iterate by const reference: the original copied each map entry
  // (a pair<const string, ObjectFactory*>) per iteration.
  for (const auto &pair : iter->second) {
    registered_derived_classes_names->push_back(pair.first);
  }
  return true;
}
} // namespace common
} // namespace deepinfer
} // namespace waytous
#ifndef DEEPINFER_COMMON_REGISTER_H_
#define DEEPINFER_COMMON_REGISTER_H_
#pragma once
#include <map>
#include <string>
#include <vector>
#include "common/log.h"
namespace waytous {
namespace deepinfer {
namespace common {
// from apollo register
// idea from boost any but make it more simple and don't use type_info.
// Minimal type-erased value holder (simplified boost::any without type_info).
// NOTE: AnyCast performs no type checking; casting to the wrong ValueType is
// undefined behavior.
class Any {
 public:
  Any() : content_(NULL) {}

  template <typename ValueType>
  explicit Any(const ValueType &value)
      : content_(new Holder<ValueType>(value)) {}

  Any(const Any &other)
      : content_(other.content_ ? other.content_->Clone() : nullptr) {}

  // Deep-copy assignment. The class already declared a destructor and copy
  // constructor; without this operator the implicitly-generated memberwise
  // copy aliased content_ in both objects, causing a double delete.
  Any &operator=(const Any &other) {
    if (this != &other) {
      PlaceHolder *copy = other.content_ ? other.content_->Clone() : nullptr;
      delete content_;
      content_ = copy;
    }
    return *this;
  }

  ~Any() { delete content_; }

  // Returns a pointer to the held value, or nullptr when empty.
  template <typename ValueType>
  ValueType *AnyCast() {
    return content_ ? &(static_cast<Holder<ValueType> *>(content_)->held_)
                    : nullptr;
  }

 private:
  class PlaceHolder {
   public:
    virtual ~PlaceHolder() {}
    virtual PlaceHolder *Clone() const = 0;
  };

  template <typename ValueType>
  class Holder : public PlaceHolder {
   public:
    explicit Holder(const ValueType &value) : held_(value) {}
    virtual ~Holder() {}
    virtual PlaceHolder *Clone() const { return new Holder(held_); }
    ValueType held_;
  };

  PlaceHolder *content_;
};
// Base factory: per-class factories generated by DEEPINFER_REGISTER_CLASS
// override NewInstance() to return an Any wrapping a new object pointer.
class ObjectFactory {
 public:
  ObjectFactory() = default;
  ObjectFactory(const ObjectFactory &) = delete;
  ObjectFactory &operator=(const ObjectFactory &) = delete;
  virtual ~ObjectFactory() = default;
  // Default implementation yields an empty Any.
  virtual Any NewInstance() { return Any(); }
};
typedef std::map<std::string, ObjectFactory *> FactoryMap;
typedef std::map<std::string, FactoryMap> BaseClassMap;
BaseClassMap &GlobalFactoryMap();
bool GetRegisteredClasses(
const std::string &base_class_name,
std::vector<std::string> *registered_derived_classes_names);
} // namespace common
} // namespace deepinfer
} // namespace waytous
// Generates a `<base_class>Registerer` class exposing static factory lookups
// over GlobalFactoryMap()[#base_class]: GetInstanceByName, GetAllInstances,
// GetUniqInstanceName/GetUniqInstance (require exactly one registration), and
// IsValid. Returned raw pointers are newly allocated; the caller owns them.
// (Comments cannot be placed inside the macro body: line splicing would pull
// the next line into a // comment.)
#define DEEPINFER_REGISTER_REGISTERER(base_class) \
class base_class##Registerer { \
typedef ::waytous::deepinfer::common::Any Any; \
typedef ::waytous::deepinfer::common::FactoryMap FactoryMap; \
\
public: \
static base_class *GetInstanceByName(const ::std::string &name) { \
FactoryMap &map = \
::waytous::deepinfer::common::GlobalFactoryMap()[#base_class]; \
FactoryMap::iterator iter = map.find(name); \
if (iter == map.end()) { \
for (auto c : map) { \
LOG_ERROR << "Instance:" << c.first; \
} \
LOG_ERROR << "Get instance " << name << " failed."; \
return nullptr; \
} \
Any object = iter->second->NewInstance(); \
return *(object.AnyCast<base_class *>()); \
} \
static std::vector<base_class *> GetAllInstances() { \
std::vector<base_class *> instances; \
FactoryMap &map = \
::waytous::deepinfer::common::GlobalFactoryMap()[#base_class]; \
instances.reserve(map.size()); \
for (auto item : map) { \
Any object = item.second->NewInstance(); \
instances.push_back(*(object.AnyCast<base_class *>())); \
} \
return instances; \
} \
static const ::std::string GetUniqInstanceName() { \
FactoryMap &map = \
::waytous::deepinfer::common::GlobalFactoryMap()[#base_class]; \
CHECK_EQ(map.size(), 1U) << map.size(); \
return map.begin()->first; \
} \
static base_class *GetUniqInstance() { \
FactoryMap &map = \
::waytous::deepinfer::common::GlobalFactoryMap()[#base_class]; \
CHECK_EQ(map.size(), 1U) << map.size(); \
Any object = map.begin()->second->NewInstance(); \
return *(object.AnyCast<base_class *>()); \
} \
static bool IsValid(const ::std::string &name) { \
FactoryMap &map = \
::waytous::deepinfer::common::GlobalFactoryMap()[#base_class]; \
return map.find(name) != map.end(); \
} \
};
// Registers concrete class `name` under base `clazz`: defines a factory in an
// anonymous namespace and inserts it into GlobalFactoryMap()[#clazz] before
// main() runs, via __attribute__((constructor)).
#define DEEPINFER_REGISTER_CLASS(clazz, name) \
namespace { \
class ObjectFactory##name : public waytous::deepinfer::common::ObjectFactory { \
public: \
virtual ~ObjectFactory##name() {} \
virtual ::waytous::deepinfer::common::Any NewInstance() { \
return ::waytous::deepinfer::common::Any(new name()); \
} \
}; \
__attribute__((constructor)) void RegisterFactory##name() { \
::waytous::deepinfer::common::FactoryMap &map = \
::waytous::deepinfer::common::GlobalFactoryMap()[#clazz]; \
if (map.find(#name) == map.end()) map[#name] = new ObjectFactory##name(); \
} \
} // namespace
#endif
#include "interfaces/base_io.h"
namespace waytous {
namespace deepinfer {
namespace interfaces {
// Fetch the IO registered under `ioName`; nullptr when no such entry exists.
BaseIOPtr GetIOPtr(std::string& ioName){
const auto found = GlobalIOMap.find(ioName);
return found == GlobalIOMap.end() ? nullptr : found->second;
}
// Register (or overwrite) the IO stored under `ioName`.
void SetIOPtr(std::string& ioName, BaseIOPtr src){
GlobalIOMap.insert_or_assign(ioName, src);
}
} // namespace interfaces
} // namespace deepinfer
} // namespace waytous
#ifndef DEEPINFER_INTERFACES_BASE_IO_H_
#define DEEPINFER_INTERFACES_BASE_IO_H_
#include <string>
#include <vector>
#include <map>
#include <memory>
#include "common/log.h"
namespace waytous {
namespace deepinfer {
namespace interfaces {
// Base class for all typed payloads exchanged between units; concrete IOs
// (images, blobs, detections, ...) derive from it and are passed around as
// shared_ptr through the global name -> IO registry below.
class BaseIO{
public:
BaseIO() = default;
virtual ~BaseIO() = default;
};
using BaseIOPtr = std::shared_ptr<BaseIO>;
typedef std::map<std::string, BaseIOPtr> BaseIOMap;
// NOTE(review): `static` at namespace scope in a header gives every
// translation unit its OWN copy of this map.  Cross-TU sharing only works
// because all access goes through Get/SetIOPtr compiled once in base_io.cc;
// consider a C++17 `inline` variable instead — confirm intent.
static BaseIOMap GlobalIOMap;
// Look up an IO by name in the global registry (nullptr if absent).
BaseIOPtr GetIOPtr(std::string& ioName);
// Register (or replace) the IO stored under `ioName`.
void SetIOPtr(std::string& ioName, BaseIOPtr src);
} // namespace interfaces
} // namespace deepinfer
} // namespace waytous
#endif
#ifndef DEEPINFER_INTERFACES_BASE_MODEL_H_
#define DEEPINFER_INTERFACES_BASE_MODEL_H_
#include <string>
#include <vector>
#include <memory>
#include <yaml-cpp/yaml.h>
#include "common/log.h"
#include "common/register.h"
#include "common/file.h"
#include "base/image.h"
#include "interfaces/base_io.h"
#include "interfaces/base_unit.h"
namespace waytous {
namespace deepinfer {
namespace interfaces {
// Abstract model pipeline: holds the pipeline-level input/output IO names
// and runs inference on camera images.  Concrete models register through
// DEEPINFER_REGISTER_MODELS below.
class BaseModel{
public:
BaseModel() = default;
virtual ~BaseModel() = default;
// Load the YAML pipeline description located at configPath.
virtual bool Init(std::string& configPath);
// for camera image infer
virtual bool Exec(std::vector<cv::Mat*> inputs, std::vector<BaseIOPtr>& outputs);
virtual bool Exec(std::vector<base::Image8UPtr> inputs, std::vector<BaseIOPtr>& outputs);
virtual std::string Name(){
return "BaseModel";
}
public:
std::vector<std::string> inputNames;   // pipeline-level input IO names
std::vector<std::string> outputNames;  // pipeline-level output IO names
};
DEEPINFER_REGISTER_REGISTERER(BaseModel);
#define DEEPINFER_REGISTER_MODELS(name) \
DEEPINFER_REGISTER_CLASS(BaseModel, name)
} // namespace interfaces
} // namespace deepinfer
} // namespace waytous
#endif
#ifndef WAYTOUS_DEEPINFER_TASK_H_
#define WAYTOUS_DEEPINFER_TASK_H_
#include <string>
#include <opencv2/opencv.hpp>
#include <yaml-cpp/yaml.h>
namespace waytous{
namespace deepinfer{
namespace task{
// A runnable task: parses its YAML config, performs inference, and can draw
// its results onto an image.
class Task{
public:
virtual ~Task() = default;
// Parse the task YAML file into taskNode.
virtual bool Init(const std::string& taskConfigPath);
/*
task input/output
*/
virtual bool Infer() = 0;
/*
draw outputs on image
*/
virtual void Visualize(cv::Mat& image) = 0;
// Drawing color for index `idx` (class/track id) — defined elsewhere.
virtual cv::Scalar get_color(int idx);
public:
YAML::Node taskNode;  // parsed task configuration
};
} //namespace task
} //namspace deepinfer
} //namespace waytous
#endif
#include "interfaces/base_unit.h"
namespace waytous {
namespace deepinfer {
namespace interfaces {
// Read the unit's IO wiring plus the shared preprocessing parameters from
// the YAML configuration.  Returns true on success; yaml-cpp throws on
// missing mandatory keys (inputNames/outputNames/inputWidth/inputHeight).
bool BaseUnit::Init(YAML::Node& node, YAML::Node& globalParamNode){
// Fix: the lists must be fetched via operator[] and then converted.
// yaml-cpp's as<T>(fallback) overload treats the string literal as a
// FALLBACK VALUE, and a vector<string> cannot be built from a const char*.
inputNames = node["inputNames"].as<std::vector<std::string>>();
outputNames = node["outputNames"].as<std::vector<std::string>>();
inferBatchSize = globalParamNode["inferBatchSize"].as<int>(1); // default=1
inputHeight = globalParamNode["inputHeight"].as<int>();
inputWidth = globalParamNode["inputWidth"].as<int>();
// Keep the member's declared default when the key is absent.
fixAspectRatio = globalParamNode["fixAspectRatio"].as<bool>(fixAspectRatio);
return true;
};
} // namespace interfaces
} // namespace deepinfer
} // namespace waytous
#ifndef DEEPINFER_INTERFACES_BASE_UNIT_H_
#define DEEPINFER_INTERFACES_BASE_UNIT_H_
#include <string>
#include <vector>
#include <memory>
#include <yaml-cpp/yaml.h>
#include <opencv2/opencv.hpp>
#include "base/blob.h"
#include "common/log.h"
#include "common/register.h"
namespace waytous {
namespace deepinfer {
namespace interfaces {
// One processing stage of the pipeline (source, preprocess, inference, ...).
// Units are wired together through named IOs and are constructed via the
// BaseUnit factory (DEEPINFER_REGISTER_UNIT below).
class BaseUnit{
public:
BaseUnit() = default;
virtual ~BaseUnit() = default;
// Read common fields (IO names + global size/batch params) from YAML.
virtual bool Init(YAML::Node& node, YAML::Node& globalParamNode);
// Run this stage once; returns false on failure.
virtual bool Exec() = 0;
virtual std::string Name(){
return "BaseUnit";
}
public:
std::vector<std::string> inputNames;
std::vector<std::string> outputNames;
//global params
int inferBatchSize = 1;
int inputWidth, inputHeight;  // NOTE(review): uninitialized until Init() runs
bool fixAspectRatio = true;
};
using BaseUnitPtr = std::shared_ptr<BaseUnit>;
DEEPINFER_REGISTER_REGISTERER(BaseUnit);
#define DEEPINFER_REGISTER_UNIT(name) \
DEEPINFER_REGISTER_CLASS(BaseUnit, name)
} // namespace interfaces
} // namespace deepinfer
} // namespace waytous
#endif
#include "libs/inferences/tensorrt/trt_calibrator.h"
namespace waytous {
namespace deepinfer {
namespace inference {
// Resize `img` to (inputWidth x inputHeight) — letterboxed against the
// top-left corner with zero padding when fixAspect is set — then write a
// normalized planar (CHW) float image into hostImage.  Plane order is
// RGB unless `bgr` is true; each value is (px/255 - mean[c]) / std[c].
void resizeImageCPU(cv::Mat& img, float* hostImage, int inputWidth, int inputHeight,
const float* inputMean, const float* inputStd, bool fixAspect, bool bgr)
{
cv::Mat resized;
if (fixAspect){
const float scale = cv::min(float(inputWidth) / img.cols, float(inputHeight) / img.rows);
const auto scaledSize = cv::Size(img.cols * scale, img.rows * scale);
cv::resize(img, resized, scaledSize);
cv::Mat canvas = cv::Mat::zeros(inputHeight, inputWidth, CV_8UC3);
resized.copyTo(canvas(cv::Rect(0, 0, scaledSize.width, scaledSize.height)));
resized = canvas;
}else{
cv::resize(img, resized, cv::Size(inputWidth, inputHeight));
}
const int plane = inputWidth * inputHeight;
// OpenCV stores pixels as BGR; pick which source channel lands in the
// first/last output plane (mean/std are indexed by source channel).
const int cFirst = bgr ? 0 : 2;
const int cLast = 2 - cFirst;
for(int i = 0; i < plane; i++){
const cv::Vec3b& px = resized.at<cv::Vec3b>(i);
hostImage[i] = (px[cFirst] / 255. - inputMean[cFirst]) / inputStd[cFirst];
hostImage[i + plane] = (px[1] / 255. - inputMean[1]) / inputStd[1];
hostImage[i + 2 * plane] = (px[cLast] / 255. - inputMean[cLast]) / inputStd[cLast];
}
}
// Build an entropy calibrator: loads the list of calibration image paths
// (one per line, relative to the configured root) and allocates the host
// batch buffer plus the matching device buffer handed to TensorRT.
int8EntroyCalibrator::int8EntroyCalibrator(const std::string &imgPath_, const std::string &calibTablePath_,
int batchSize_, int inputWidth_, int inputHeight_, std::vector<float>& inputMean_, std::vector<float>& inputStd_, bool useBGR_, bool fixAspectRatio_):
batchSize(batchSize_), inputWidth(inputWidth_), inputHeight(inputHeight_), imageIndex(0), useBGR(useBGR_), fixAspectRatio(fixAspectRatio_),
// Fix: readCache was never initialized anywhere but is read in
// readCalibrationCache(); default to reusing an existing table.
readCache(true)
{
calibTablePath = common::GetAbsolutePath(common::ConfigRoot::GetRootPath(), calibTablePath_);
// Per-channel normalization constants (expects at least 3 entries each).
for(int ii =0; ii<3; ii++){
inputMean[ii] = inputMean_[ii];
inputStd[ii] = inputStd_[ii];
}
inputCount = 3 * inputWidth * inputHeight;  // floats per image
std::fstream f(common::GetAbsolutePath(common::ConfigRoot::GetRootPath(), imgPath_));
if(f.is_open()){
std::string temp;
while (std::getline(f,temp)) imgPaths.push_back(common::GetAbsolutePath(common::ConfigRoot::GetRootPath(), temp));
}
batchData = new float[batchSize * inputCount];
CUDA_CHECK(cudaMalloc(&deviceInput, batchSize * inputCount * sizeof(float)));
};
// Release the device staging buffer, then the host batch buffer.
int8EntroyCalibrator::~int8EntroyCalibrator(){
CUDA_CHECK(cudaFree(deviceInput));
delete[] batchData;  // delete[] on nullptr is a no-op
};
// Number of images consumed per getBatch() call.
int int8EntroyCalibrator::getBatchSize() const { return batchSize; };
// Preprocess the next `batchSize` images into bindings[0] (device memory).
// Returns false once fewer than batchSize images remain, which tells
// TensorRT that calibration data is exhausted.
bool int8EntroyCalibrator::getBatch(void *bindings[], const char *names[], int nbBindings){
if (imageIndex + batchSize > int(imgPaths.size()))
return false;
// load batch
float* ptr = batchData;
for (size_t j = imageIndex; j < imageIndex + batchSize; ++j)
{
auto img = cv::imread(imgPaths[j]);
LOG_INFO << "load image " << imgPaths[j] << " " << (j+1)*100./imgPaths.size() << "%";
resizeImageCPU(img, ptr, inputWidth, inputHeight, inputMean, inputStd, fixAspectRatio, useBGR);
ptr += inputCount;
}
imageIndex += batchSize;
// Copy the whole host-side batch to the device in one shot.
CUDA_CHECK(cudaMemcpy(deviceInput, batchData, batchSize * inputCount * sizeof(float), cudaMemcpyHostToDevice));
bindings[0] = deviceInput;
return true;
};
// Return the previously written calibration table (nullptr when absent or
// caching is disabled) so TensorRT can skip recalibration.
// NOTE(review): `readCache` is declared without an initializer and the
// constructor never sets it, so this reads an indeterminate bool — it
// should almost certainly default to true; confirm and initialize it.
const void * int8EntroyCalibrator::readCalibrationCache(std::size_t &length){
calibrationCache.clear();
std::ifstream input(calibTablePath, std::ios::binary);
input >> std::noskipws;
if (readCache && input.good())
std::copy(std::istream_iterator<char>(input), std::istream_iterator<char>(),
std::back_inserter(calibrationCache));
length = calibrationCache.size();
return length ? &calibrationCache[0] : nullptr;
};
// Persist the calibration table produced by TensorRT to calibTablePath.
void int8EntroyCalibrator::writeCalibrationCache(const void *cache, std::size_t length){
std::ofstream output(calibTablePath, std::ios::binary);
output.write(reinterpret_cast<const char*>(cache), length);
};
} // namespace inference
} // namespace deepinfer
} // namespace waytous
#ifndef WAYTOUS_DEEPINFER_INFERENCE_TRT_CALIBRATOR_H_
#define WAYTOUS_DEEPINFER_INFERENCE_TRT_CALIBRATOR_H_
#pragma once
#include <vector>
#include <string>
#include <iostream>
#include <fstream>
#include <iterator>
#include <opencv2/opencv.hpp>
#include "NvInfer.h"
#include "cuda.h"
#include "cuda_runtime.h"
#include "common/log.h"
#include "common/file.h"
#include "libs/inferences/tensorrt/trt_utils.h"
namespace waytous {
namespace deepinfer {
namespace inference {
void resizeImageCPU(cv::Mat& img, float* hostImage, int inputWidth, int inputHeight,
const float* inputMean, const float* inputStd, bool fixAspect, bool bgr);
// INT8 entropy calibrator: feeds preprocessed image batches to TensorRT
// during INT8 calibration and caches the resulting table on disk.
class int8EntroyCalibrator : public nvinfer1::IInt8EntropyCalibrator {
public:
int8EntroyCalibrator(const std::string &imgPath_, const std::string &calibTablePath_,
int batchSize_, int inputWidth_, int inputHeight_, std::vector<float>& inputMean_,
std::vector<float>& inputStd_, bool useBGR_, bool fixAspectRatio_);
~int8EntroyCalibrator();
int getBatchSize() const override;
bool getBatch(void *bindings[], const char *names[], int nbBindings) override;
const void *readCalibrationCache(std::size_t &length) override;
void writeCalibrationCache(const void *cache, std::size_t length) override;
private:
int batchSize, inputWidth, inputHeight;
float inputMean[3], inputStd[3];  // per-channel normalization constants
size_t inputCount;                // floats per image (3 * W * H)
size_t imageIndex;                // next unread index into imgPaths
bool useBGR = false;
bool fixAspectRatio = true;
std::string calibTablePath;
std::vector<std::string> imgPaths;
float *batchData = nullptr;       // host staging buffer
void *deviceInput = nullptr;      // device buffer handed to TensorRT
// Fix: was left uninitialized, so readCalibrationCache() branched on an
// indeterminate bool.  Default to reusing an existing calibration table.
bool readCache = true;
std::vector<char> calibrationCache;
};
} // namespace inference
} // namespace deepinfer
} // namespace waytous
#endif
#include "libs/inferences/tensorrt/trt_infer.h"
namespace waytous {
namespace deepinfer {
namespace inference {
// Tear down the inference pipeline.
// Fix: TensorRT objects must be destroyed in reverse order of creation —
// context, then engine, then runtime.  The old code destroyed the runtime
// first, while the engine it deserialized was still alive.
TRTInference::~TRTInference(){
if(mCudaStream){
cudaStreamSynchronize(mCudaStream);
cudaStreamDestroy(mCudaStream);
}
if (mContext)
mContext->destroy();
if (mEngine)
mEngine->destroy();
if (mRunTime)
mRunTime->destroy();
}
// Load (building it first if absent) the serialized TensorRT engine, then
// wire every engine binding to a named Blob in the global IO registry:
// input bindings reuse the Blob already registered by the upstream unit;
// output bindings get freshly allocated Blobs published under the binding
// name.  Binding order in mBindings must match the engine's binding order.
bool TRTInference::Init(YAML::Node& configNode, YAML::Node& globalParamNode){
if(!BaseUnit::Init(configNode, globalParamNode)){
LOG_WARN << "Init trt engine error";
return false;
};
engineFile = configNode["engineFile"].as<std::string>();
inferDynamic = configNode["inferDynamic"].as<bool>();
engineFile = common::GetAbsolutePath(common::ConfigRoot::GetRootPath(), engineFile);
if(!waytous::deepinfer::common::PathExists(engineFile)){
LOG_INFO << "Tensorrt engine haven't been built, built from saved weights.";
BuildEngine(configNode);
}
initLibNvInferPlugins(&gLogger, "");
// Slurp the serialized engine into memory.
std::fstream file;
file.open(engineFile, std::ios::binary | std::ios::in);
if(!file.is_open())
{
LOG_WARN << "Read engine file" << engineFile <<" failed, please check.";
return false;
}
file.seekg(0, std::ios::end);
int length = file.tellg();
file.seekg(0, std::ios::beg);
std::unique_ptr<char[]> data(new char[length]);
file.read(data.get(), length);
file.close();
LOG_INFO << "Deserializing tensorrt engine.";
mRunTime = nvinfer1::createInferRuntime(gLogger);
assert(mRunTime != nullptr);
mEngine= mRunTime->deserializeCudaEngine(data.get(), length, NULL);
assert(mEngine != nullptr);
mContext = mEngine->createExecutionContext();
assert(mContext != nullptr);
mContext->setProfiler(&mProfiler);
// Sanity check: configured IO names must cover every engine binding.
int nbBindings = mEngine->getNbBindings();
if(nbBindings != inputNames.size() + outputNames.size()){
LOG_ERROR << " model input(" << inputNames.size() << ")+output(" <<
outputNames.size() << ") != nbBindings:" << nbBindings;
return false;
}
// Inputs: adopt the Blob already published by the producing unit.
// NOTE(review): if the IO is missing or not a NormalIO, the cast yields
// nullptr and input->data_ dereferences it — consider a guard.
for (auto inputName: inputNames){
auto input = std::dynamic_pointer_cast<ios::NormalIO>(interfaces::GetIOPtr(inputName));
blobs_.insert(std::make_pair(inputName, input->data_));
auto binding = input->data_->mutable_gpu_data();
mBindings.emplace_back(static_cast<void*>(binding));
}
// Outputs: allocate a Blob per non-input binding, shaped
// [inferBatchSize, <engine dims...>], and publish it under the binding name.
for (int i = 0; i < nbBindings; ++i)
{
bool isInput = mEngine->bindingIsInput(i);
if(isInput){
continue;
}
nvinfer1::Dims dims = mEngine->getBindingDimensions(i);
nvinfer1::DataType dtype = mEngine->getBindingDataType(i);
std::string name = mEngine->getBindingName(i);
std::vector<int> shape;
shape.push_back(inferBatchSize);
for(int dindex = 0; dindex < dims.nbDims; dindex++){
shape.push_back(dims.d[dindex]);
};
auto blob = std::make_shared<base::Blob<float>>(base::Blob<float>(shape));
blobs_.insert(std::make_pair(name, blob));
auto output = std::make_shared<ios::NormalIO>(ios::NormalIO(blob));
interfaces::SetIOPtr(name, output);
auto binding = blob->mutable_gpu_data();
mBindings.emplace_back(static_cast<void*>(binding));
}
CUDA_CHECK(cudaStreamCreate(&mCudaStream));
LOG_INFO << "Deserialized tensorrt engine.";
return true;
}
// Build a TensorRT engine from the configured ONNX weights and serialize it
// to engineFile.  Returns false on parse or build failure.
bool TRTInference::BuildEngine(YAML::Node& configNode){
// default convert onnx to engine
int maxBatchSize = configNode["maxBatchSize"].as<int>();
initLibNvInferPlugins(&gLogger, "");
nvinfer1::IHostMemory* modelStream{nullptr};
// int verbosity = (int) nvinfer1::ILogger::Severity::kVERBOSE;
int verbosity = (int) nvinfer1::ILogger::Severity::kINFO;
nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(gLogger);
nvinfer1::INetworkDefinition* network = builder->createNetworkV2(1U);
auto parser = nvonnxparser::createParser(*network, gLogger);
std::string weightsPath = configNode["weightsPath"].as<std::string>();
weightsPath = common::GetAbsolutePath(common::ConfigRoot::GetRootPath(), weightsPath);
if (!parser->parseFromFile(weightsPath.c_str(), verbosity))
{
LOG_ERROR << "Failed to parse weights file: " << weightsPath;
// Fix: release the builder objects on this path (they leaked before).
parser->destroy();
network->destroy();
builder->destroy();
return false;
}
builder->setMaxBatchSize(maxBatchSize);
builder->setMaxWorkspaceSize(1 << 30);// 1G
int8EntroyCalibrator *calibrator = nullptr;
int runMode = configNode["runMode"].as<int>();
if (runMode == 1)//fp16
{
LOG_INFO << "Set FP16 Mode.";
if (!builder->platformHasFastFp16())
LOG_INFO << "Notice: the platform do not has fast for fp16" ;
builder->setFp16Mode(true);
}
else if(runMode == 2){//int8
LOG_ERROR << "No supported int8";
/*LOG_INFO <<"Set Int8 Mode";
if (!builder->platformHasFastInt8())
LOG_INFO << "Notice: the platform do not has fast for int8.";
builder->setInt8Mode(true);
if(configNode["calibImgPathFile"].as<std::string>().size() > 0){
std::vector<float> inputMean = configNode["inputMean"].as<std::vector<float>>();
std::vector<float> inputStd = configNode["inputStd"].as<std::vector<float>>();
calibrator = new int8EntroyCalibrator(
configNode["calibImgPathFile"].as<std::string>(),
configNode["calibTableCache"].as<std::string>(), maxBatchSize,
configNode["inputWidth"].as<int>(), configNode["inputHeight"].as<int>(),
inputMean, inputStd, useBGR, fixAspectRatio
);
builder->setInt8Calibrator(calibrator);
}
else{
LOG_ERROR << "Not imgs for calib int8. " << configNode["calibImgPathFile"].as<std::string>();
return false;
}
*/
}
LOG_INFO << "Begin building engine..." ;
nvinfer1::ICudaEngine* engine = builder->buildCudaEngine(*network);
if (!engine){
std::string error_message ="Unable to create engine";
gLogger.log(nvinfer1::ILogger::Severity::kERROR, error_message.c_str());
// Fix: the old code fell through and dereferenced the null engine via
// engine->serialize().  Clean up and report failure instead.
network->destroy();
builder->destroy();
parser->destroy();
delete calibrator;
return false;
}
LOG_INFO << "End building engine..." ;
// Serialize the engine, then close everything down.
modelStream = engine->serialize();
engine->destroy();
network->destroy();
builder->destroy();
parser->destroy();
assert(modelStream != nullptr);
if(calibrator){
delete calibrator;
calibrator = nullptr;
}
// write
std::ofstream file(engineFile, std::ios::binary);
assert(file);
file.write(static_cast<char*>(modelStream->data()), modelStream->size());
assert(!file.fail());
modelStream->destroy();
// Fix: do not create mCudaStream here — Init() creates it unconditionally
// right after BuildEngine() returns, which leaked the handle created here.
LOG_INFO << "End writing engine";
return true;
}
// Look up the blob bound to `name`; unknown names yield nullptr.
std::shared_ptr<base::Blob<float>> TRTInference::get_blob(const std::string &name) {
const auto found = blobs_.find(name);
return found == blobs_.end() ? nullptr : found->second;
}
// Run one inference pass over the current contents of the input blobs.
// Returns the status reported by TensorRT.
bool TRTInference::Exec(){
CUDA_CHECK(cudaStreamSynchronize(mCudaStream));
// Ensure input blob HEADs are on (or synced to) the GPU before enqueue.
for (auto name : inputNames) {
auto blob = get_blob(name);
if (blob != nullptr) {
blob->gpu_data();
}
}
// If `out_blob->mutable_cpu_data()` is invoked outside,
// HEAD will be set to CPU, and `out_blob->mutable_gpu_data()`
// after `enqueue` will copy data from CPU to GPU,
// which will overwrite the `inference` results.
// `out_blob->gpu_data()` will set HEAD to SYNCED,
// then no copy happends after `enqueue`.
for (auto name : outputNames) {
auto blob = get_blob(name);
if (blob != nullptr) {
blob->gpu_data();
}
}
// Fix: this bool function had NO return statement (undefined behavior);
// also propagate the status TensorRT reports instead of discarding it.
bool ok;
if(inferDynamic){
ok = mContext->enqueue(inferBatchSize, &mBindings[0], mCudaStream, nullptr);
}else{
ok = mContext->executeV2(&mBindings[0]);
}
CUDA_CHECK(cudaStreamSynchronize(mCudaStream));
// Mark output HEADs as GPU so downstream reads copy from device memory.
for (auto name : outputNames) {
auto blob = get_blob(name);
if (blob != nullptr) {
// LOG_INFO << "output name: " << name;
blob->mutable_gpu_data();
}
}
return ok;
}
// Unit name; doubles as the key used by the factory registration below.
std::string TRTInference::Name(){
return "TRTInference";
}
// Registers TRTInference in the global BaseUnit factory at load time.
DEEPINFER_REGISTER_UNIT(TRTInference);
} // namespace inference
} // namespace deepinfer
} // namespace waytous
#ifndef WAYTOUS_DEEPINFER_INFERENCE_TRT_INFER_H_
#define WAYTOUS_DEEPINFER_INFERENCE_TRT_INFER_H_
#pragma once
#include <string>
#include <yaml-cpp/yaml.h>
// GPU TensorRT
#include <cuda_runtime_api.h>
#include "NvInfer.h"
#include "NvInferPlugin.h"
#include "NvOnnxParser.h"
#include "base/blob.h"
#include "common/file.h"
#include "common/register.h"
#include "interfaces/base_unit.h"
#include "libs/ios/normal_ios.h"
#include "libs/inferences/tensorrt/trt_utils.h"
#include "libs/inferences/tensorrt/trt_calibrator.h"
namespace waytous {
namespace deepinfer {
namespace inference {
// NOTE(review): `static` in a header means every translation unit that
// includes this file gets its OWN Logger instance — confirm that is intended
// (a C++17 `inline` variable would share one instance).
static Logger gLogger;
// Named blob registry: engine binding/tensor name -> float blob.
typedef std::map<std::string,
std::shared_ptr<base::Blob<float>>>
BlobMap;
// TensorRT-backed inference unit: deserializes (or builds) an engine, binds
// engine tensors to named Blobs, and runs one enqueue/execute per Exec().
class TRTInference: public interfaces::BaseUnit {
public:
~TRTInference();
bool Init(YAML::Node& node, YAML::Node& globalParamNode) override;
// Convert the configured weights (ONNX by default) into a serialized
// engine file; overridable for network-specific builders.
virtual bool BuildEngine(YAML::Node& node);
bool Exec() override;
virtual std::string Name() override;
// Lookup of a named input/output blob (nullptr if unknown).
std::shared_ptr<base::Blob<float>> get_blob(const std::string &name);
public:
nvinfer1::IExecutionContext* mContext = nullptr;
nvinfer1::ICudaEngine* mEngine = nullptr;
nvinfer1::IRuntime* mRunTime = nullptr;
Profiler mProfiler;
cudaStream_t mCudaStream;       // NOTE(review): unset until Init() runs
std::vector<void*> mBindings;   // device pointers in engine binding order
BlobMap blobs_;                 // binding name -> blob
std::string engineFile;
bool inferDynamic = false;      // true: enqueue(batch); false: executeV2
}; // class TRTInferenceEngine
} // namespace inference
} // namespace deepinfer
} // namespace waytous
#endif // header
#ifndef WAYTOUS_DEEPINFER_INFERENCE_TRT_UTILS_H_
#define WAYTOUS_DEEPINFER_INFERENCE_TRT_UTILS_H_
#pragma once
#include <cstring>
#include <iostream>
#include <iterator>
#include <map>
#include <numeric>
#include <string>
#include <vector>
#include "NvInfer.h"
#include "cuda.h"
#include "cuda_runtime.h"
#include "common/common.h"
namespace waytous {
namespace deepinfer {
namespace inference {
#if NV_TENSORRT_MAJOR >= 8
#define TRT_NOEXCEPT noexcept
#define TRT_CONST_ENQUEUE const
#else
#define TRT_NOEXCEPT
#define TRT_CONST_ENQUEUE
#endif
namespace Tn
{
// Trivially-copyable (de)serialization helpers for plugin buffers.
// Fix: use std::memcpy instead of a reinterpret_cast store/load — the
// buffer position is not guaranteed to be aligned for T, so the old code
// had alignment/strict-aliasing UB.  Advances `buffer` by sizeof(T).
template<typename T>
void write(char*& buffer, const T& val)
{
std::memcpy(buffer, &val, sizeof(T));
buffer += sizeof(T);
}
template<typename T>
void read(const char*& buffer, T& val)
{
std::memcpy(&val, buffer, sizeof(T));
buffer += sizeof(T);
}
} // Tn
namespace Yolo{
static const int maxNumAnchorsPerLevel = 10; // every level can have 10 anchors at most.
// One detection level: feature-map size plus its flattened (w, h) anchor pairs.
struct YoloKernel{
int width;
int height;
float anchors[maxNumAnchorsPerLevel * 2];
};
}//Yolo kernel
class Profiler : public nvinfer1::IProfiler
{
public:
struct Record
{
float time{0};
int count{0};
};
void printTime(const int& runTimes)
{
//std::cout << "========== " << mName << " profile ==========" << std::endl;
float totalTime = 0;
std::string layerNameStr = "TensorRT layer name";
int maxLayerNameLength = std::max(static_cast<int>(layerNameStr.size()), 70);
for (const auto& elem : mProfile)
{
totalTime += elem.second.time;
maxLayerNameLength = std::max(maxLayerNameLength, static_cast<int>(elem.first.size()));
}
std::cout<< " total runtime = " << totalTime/(runTimes + 1e-5) << " ms " << std::endl;
}
virtual void reportLayerTime(const char* layerName, float ms)
{
mProfile[layerName].count++;
mProfile[layerName].time += ms;
}
private:
std::map<std::string, Record> mProfile;
};
// Forwards TensorRT log messages at or above `reportableSeverity` to stderr.
class Logger : public nvinfer1::ILogger
{
public:
Logger(Severity severity = Severity::kWARNING)
: reportableSeverity(severity)
{
}
// TRT_NOEXCEPT matches the noexcept signature TensorRT 8 requires for
// ILogger::log (the macro expands to nothing on TensorRT 7).
void log(Severity severity, const char* msg) TRT_NOEXCEPT override
{
// suppress messages with severity enum value greater than the reportable
if (severity > reportableSeverity)
return;
switch (severity)
{
case Severity::kINTERNAL_ERROR: std::cerr << "INTERNAL_ERROR: "; break;
case Severity::kERROR: std::cerr << "ERROR: "; break;
case Severity::kWARNING: std::cerr << "WARNING: "; break;
case Severity::kINFO: std::cerr << "INFO: "; break;
default: std::cerr << "UNKNOWN: "; break;
}
std::cerr << msg << std::endl;
}
Severity reportableSeverity;
};
// Total number of elements described by `d` (product of all dimensions).
inline int64_t volume(const nvinfer1::Dims& d)
{
// Fix: std::accumulate deduces its accumulator type from the init value.
// With the int literal `1`, every intermediate product was truncated back
// to int; start from an int64_t so large tensors don't overflow.
return std::accumulate(d.d, d.d + d.nbDims, int64_t{1}, std::multiplies<int64_t>());
}
// Size in bytes of one element of the given TensorRT data type.
// Throws std::runtime_error for types not listed (e.g. kBOOL, left
// commented out below).
inline unsigned int getElementSize(nvinfer1::DataType t)
{
switch (t)
{
case nvinfer1::DataType::kINT32: return 4;
case nvinfer1::DataType::kFLOAT: return 4;
case nvinfer1::DataType::kHALF: return 2;
case nvinfer1::DataType::kINT8: return 1;
// case nvinfer1::DataType::kBOOL: return 1;
}
throw std::runtime_error("Invalid DataType.");
return 0;
}
// cudaMalloc wrapper that terminates the process instead of returning on
// failure.  CUDA_CHECK reports a bad status code; the null check is a
// defensive second guard on the returned pointer.
inline void* safeCudaMalloc(size_t memSize)
{
void* deviceMem;
CUDA_CHECK(cudaMalloc(&deviceMem, memSize));
if (deviceMem == nullptr)
{
std::cerr << "Out of memory" << std::endl;
exit(1);
}
return deviceMem;
}
} // namespace inference
} // namespace deepinfer
} // namespace waytous
#endif // header
This diff is collapsed.
#ifndef WAYTOUS_DEEPINFER_INFERENCE_TRT_YOLOV5_INFER_H_
#define WAYTOUS_DEEPINFER_INFERENCE_TRT_YOLOV5_INFER_H_
#include "libs/inferences/tensorrt/trt_infer.h"
#include "libs/inferences/tensorrt/trt_yolov5_layer.h"
using namespace nvinfer1;
namespace waytous {
namespace deepinfer {
namespace inference {
// TensorRT weight files have a simple space delimited format:
// [type] [size] <data x size in hex>
// Returns a map from blob name to kFLOAT Weights.  The value buffers are
// intentionally never freed: TensorRT keeps raw pointers to them until the
// engine has been built.
inline std::map<std::string, nvinfer1::Weights> loadWeights(const std::string file) {
LOG_INFO << "Loading weights: " << file ;
std::map<std::string, nvinfer1::Weights> weightMap;
// Open weights file
std::ifstream input(file);
assert(input.is_open() && "Unable to load weight file. please check if the .wts file path is right!!!!!!");
// Read number of weight blobs
int32_t count;
input >> count;
assert(count > 0 && "Invalid weight map file.");
while (count--)
{
nvinfer1::Weights wt{ nvinfer1::DataType::kFLOAT, nullptr, 0 };
uint32_t size;
// Read name and type of blob
std::string name;
input >> name >> std::dec >> size;
wt.type = nvinfer1::DataType::kFLOAT;
// Load blob.
// Fix: allocate sizeof(uint32_t) per value — the old `sizeof(val)`
// measured the POINTER (8 bytes on 64-bit), over-allocating 2x.
uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
for (uint32_t x = 0, y = size; x < y; ++x)
{
input >> std::hex >> val[x];
}
wt.values = val;
wt.count = size;
weightMap[name] = wt;
}
return weightMap;
}
// Scale a channel count by width multiple `gw`, rounded UP to the nearest
// multiple of `divisor` (yolov5's make_divisible rule).
inline static int get_width(int x, float gw, int divisor = 8) {
const float scaled = x * gw;
return static_cast<int>(ceil(scaled / divisor)) * divisor;
}
// Scale a block repeat count by depth multiple `gd` using yolov5's rounding
// rule: round half away from zero, except an exact .5 above an even base
// rounds down; never returns less than 1.
inline static int get_depth(int x, float gd) {
if (x == 1) return 1;
const float scaled = x * gd;
int depth = round(scaled);
if (scaled - int(scaled) == 0.5 && (int(scaled) % 2) == 0) {
--depth;
}
return std::max<int>(depth, 1);
}
// YOLOv5-specific engine builder: constructs the network layer-by-layer from
// a .wts weight file instead of parsing ONNX.  The private helpers mirror
// yolov5 modules (Focus, C3, SPP/SPPF, ...) — implementations elsewhere.
class YoloV5TRTInference: public TRTInference{
public:
bool BuildEngine(YAML::Node& node) override;
std::string Name() override;
private:
IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, float eps);
ILayer* convBlock(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int outch, int ksize, int s, int g, std::string lname);
ILayer* focus(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int inch, int outch, int ksize, std::string lname, int inputWidth, int inputHeight);
ILayer* bottleneck(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int c1, int c2, bool shortcut, int g, float e, std::string lname);
ILayer* bottleneckCSP(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int c1, int c2, int n, bool shortcut, int g, float e, std::string lname);
ILayer* C3(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int c1, int c2, int n, bool shortcut, int g, float e, std::string lname);
ILayer* SPP(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int c1, int c2, int k1, int k2, int k3, std::string lname);
ILayer* SPPF(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int c1, int c2, int k, std::string lname);
std::vector<std::vector<float>> getAnchors(std::map<std::string, Weights>& weightMap, std::string lname, int numAnchorsPerLevel);
IPluginV2Layer* addYoLoLayer(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, std::string lname, std::vector<IConvolutionLayer*> dets,
int classNumber, int inputWidth, int inputHeight, int maxOutputBBoxCount, int numAnchorsPerLevel, int startScale=8);
IPluginV2Layer *addBatchedNMSLayer(INetworkDefinition *network, IPluginV2Layer *yolo, int num_classes, int top_k, int keep_top_k, float score_thresh,
float iou_thresh, bool is_normalized = false, bool clip_boxes = false);
};
} // namespace inference
} // namespace deepinfer
} // namespace waytous
#endif // header
This diff is collapsed.
#ifndef WAYTOUS_DEEPINFER_INFERENCE_TRT_YOLOV5_LAYER_H_
#define WAYTOUS_DEEPINFER_INFERENCE_TRT_YOLOV5_LAYER_H_
#include <assert.h>
#include <iostream>
#include <vector>
#include <string>
#include <NvInfer.h>
#include "libs/inferences/tensorrt/trt_utils.h"
namespace nvinfer1
{
// Custom TensorRT plugin implementing the YOLOv5 detection head: decodes the
// raw per-level conv outputs with the stored anchors.  Implementations of the
// declared methods live elsewhere (presumably a .cu file).
class YoloV5LayerPlugin : public IPluginV2IOExt
{
public:
YoloV5LayerPlugin(int classCount, int netWidth, int netHeight, int maxOut,
int numAnchorsPerLevel, const std::vector<waytous::deepinfer::inference::Yolo::YoloKernel> &vYoloKernel);
// Deserialization constructor (inverse of serialize()).
YoloV5LayerPlugin(const void *data, size_t length);
~YoloV5LayerPlugin();
int getNbOutputs() const TRT_NOEXCEPT override
{
return 2;
}
Dims getOutputDimensions(int index, const Dims *inputs, int nbInputDims) TRT_NOEXCEPT override;
int initialize() TRT_NOEXCEPT override;
virtual void terminate() TRT_NOEXCEPT override{};
virtual size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override
{
// One int per batch item of scratch space.
return maxBatchSize * sizeof(int);
}
virtual int enqueue(int batchSize, const void *const *inputs, void *TRT_CONST_ENQUEUE *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT override;
virtual size_t getSerializationSize() const TRT_NOEXCEPT override;
virtual void serialize(void *buffer) const TRT_NOEXCEPT override;
// Only linear-format FP32 tensors are supported.
bool supportsFormatCombination(int pos, const PluginTensorDesc *inOut, int nbInputs, int nbOutputs) const TRT_NOEXCEPT override
{
return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT;
}
const char *getPluginType() const TRT_NOEXCEPT override;
const char *getPluginVersion() const TRT_NOEXCEPT override;
void destroy() TRT_NOEXCEPT override;
IPluginV2IOExt *clone() const TRT_NOEXCEPT override;
void setPluginNamespace(const char *pluginNamespace) TRT_NOEXCEPT override;
const char *getPluginNamespace() const TRT_NOEXCEPT override;
DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes, int nbInputs) const TRT_NOEXCEPT override;
bool isOutputBroadcastAcrossBatch(int outputIndex, const bool *inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT override;
bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override;
void attachToContext(
cudnnContext *cudnnContext, cublasContext *cublasContext, IGpuAllocator *gpuAllocator) TRT_NOEXCEPT override;
void configurePlugin(const PluginTensorDesc *in, int nbInput, const PluginTensorDesc *out, int nbOutput) TRT_NOEXCEPT override;
void detachFromContext() TRT_NOEXCEPT override;
private:
// Kernel launcher behind enqueue() — defined elsewhere.
void forwardGpu(const float *const *inputs, void **outputs, void *workspace, cudaStream_t stream, int batchSize = 1);
int mThreadCount = 256;       // CUDA threads per block
const char *mPluginNamespace;
int mKernelCount;             // number of detection levels
int mClassCount;
int mYoloV5NetWidth;
int mYoloV5NetHeight;
int mMaxOutObject;
int mNumAnchorsPerLevel;
std::vector<waytous::deepinfer::inference::Yolo::YoloKernel> mYoloKernel;
void **mAnchor;               // device-side anchor copies
};
// Factory TensorRT uses to (de)serialize YoloV5LayerPlugin instances;
// registered globally via REGISTER_TENSORRT_PLUGIN below.
class YoloV5PluginCreator : public IPluginCreator
{
public:
YoloV5PluginCreator();
~YoloV5PluginCreator() override = default;
const char *getPluginName() const TRT_NOEXCEPT override;
const char *getPluginVersion() const TRT_NOEXCEPT override;
const PluginFieldCollection *getFieldNames() TRT_NOEXCEPT override;
IPluginV2IOExt *createPlugin(const char *name, const PluginFieldCollection *fc) TRT_NOEXCEPT override;
IPluginV2IOExt *deserializePlugin(const char *name, const void *serialData, size_t serialLength) TRT_NOEXCEPT override;
void setPluginNamespace(const char *libNamespace) TRT_NOEXCEPT override
{
mNamespace = libNamespace;
}
const char *getPluginNamespace() const TRT_NOEXCEPT override
{
return mNamespace.c_str();
}
private:
std::string mNamespace;
static PluginFieldCollection mFC;
static std::vector<PluginField> mPluginAttributes;
};
REGISTER_TENSORRT_PLUGIN(YoloV5PluginCreator);
};
#endif
#ifndef DEEPINFER_CAMERA_IOS_H_
#define DEEPINFER_CAMERA_IOS_H_
#include <vector>
#include <string>
#include "base/image.h"
#include "interfaces/base_io.h"
namespace waytous {
namespace deepinfer {
namespace ios {
// Input payload for the CameraSrc unit: a non-owning pointer to the
// caller's cv::Mat frame (the caller must keep the Mat alive).
class CameraSrcIn: public interfaces::BaseIO{
public:
CameraSrcIn(cv::Mat* img): cv_img_(img){};
cv::Mat* cv_img_;  // borrowed, not owned
};
using CameraSrcInPtr = std::shared_ptr<CameraSrcIn>;
// Output payload for the CameraSrc unit: shared ownership of the
// converted 8-bit image.
class CameraSrcOut: public interfaces::BaseIO{
public:
CameraSrcOut(base::Image8UPtr img): img_ptr_(img){};
base::Image8UPtr img_ptr_;
};
using CameraSrcOutPtr = std::shared_ptr<CameraSrcOut>;
} // namespace ios
} // namespace deepinfer
} // namespace waytous
#endif
#include "libs/ios/detection.h"
namespace waytous {
namespace deepinfer {
namespace ios {
// Clamp the box corners into the valid pixel range [0, image_dim - 1].
// NOTE(review): requires image_width/image_height to be set first; both
// default to 0, in which case the upper bound becomes -1 — confirm callers
// always populate them.
void Det2D::validCoordinate(){
x1 = iMIN(image_width - 1, iMAX(0, x1));
x2 = iMIN(image_width - 1, iMAX(0, x2));
y1 = iMIN(image_height - 1, iMAX(0, y1));
y2 = iMIN(image_height - 1, iMAX(0, y2));
}
// Deep copy of this detection, including its instance mask when present.
Det2DPtr Det2D::copy(){
Det2DPtr obj = std::make_shared<Det2D>(Det2D());
// Fix: also carry over the image size the coordinates are relative to.
obj->image_width = this->image_width;
obj->image_height = this->image_height;
obj->x1 = this->x1;
obj->x2 = this->x2;
obj->y1 = this->y1;
obj->y2 = this->y2;
obj->confidence = this->confidence;
obj->class_label = this->class_label;
obj->class_name = this->class_name;
obj->truncated = this->truncated;
obj->track_id = this->track_id;
// Fix: test the SOURCE mask.  The old check inspected obj->mask_ptr,
// which is always nullptr on a fresh object, so the mask was never cloned.
if (this->mask_ptr != nullptr){
obj->mask_ptr.reset(new InstanceMask(this->mask_ptr.get()));
}
// Fix: the function previously fell off the end of a value-returning
// function without a return statement (undefined behavior).
return obj;
}
// Overwrite this detection's fields with those of `obj_`.  The mask is
// updated in place only when both sides actually have one.
void Det2D::update(Det2DPtr obj_){
this->x1 = obj_->x1;
this->x2 = obj_->x2;
this->y1 = obj_->y1;
this->y2 = obj_->y2;
this->confidence = obj_->confidence;
this->class_label = obj_->class_label;
this->class_name = obj_->class_name;
this->truncated = obj_->truncated;
this->track_id = obj_->track_id;
// Fix: also require obj_->mask_ptr — InstanceMask::update dereferences
// its argument, so passing a null source mask crashed here.
if (this->mask_ptr != nullptr && obj_->mask_ptr != nullptr){
this->mask_ptr->update(obj_->mask_ptr.get());
}
};
// Overwrite only the box corners from a [x1, y1, x2, y2] vector.
void Det2D::update(std::vector<float>& tlbr){
x1 = tlbr[0];
y1 = tlbr[1];
x2 = tlbr[2];
y2 = tlbr[3];
};
} // namespace ios
} // namespace deepinfer
} // namespace waytous
#ifndef DEEPINFER_DETECTION_H_
#define DEEPINFER_DETECTION_H_
#include <vector>
#include <string>
#include "common/common.h"
#include "interfaces/base_io.h"
#include "libs/ios/instance_mask.h"
namespace waytous {
namespace deepinfer {
namespace ios {
// Polymorphic base class for detection results.
class Det{
public:
Det() = default;
virtual ~Det() = default;
};
using DetPtr = std::shared_ptr<Det>;
// A single 2D detection: axis-aligned box in original-image pixel
// coordinates, class info, optional RLE instance mask and track id.
// NOTE(review): Det2DPtr is referenced here but its alias appears only
// after this class — presumably forward-declared in an included header
// (e.g. common/common.h); confirm.
class Det2D : public Det{
public:
void validCoordinate();
Det2DPtr copy();
void update(Det2DPtr obj_);
void update(std::vector<float>& tlbr);
public:
// Dimensions of the source image the coordinates refer to.
float image_width = 0, image_height = 0;
// Box corners: (x1, y1) top-left, (x2, y2) bottom-right.
float x1, y1, x2, y2;
float confidence;
int class_label;
std::string class_name;
// True when the box touches the image border (see post-processors).
bool truncated = false;
// Optional per-instance segmentation mask (RLE-encoded).
InstanceMaskPtr mask_ptr = nullptr;
// -1 until a tracker assigns an id.
int track_id = -1;
};
using Det2DPtr = std::shared_ptr<Det2D>;
// IO container carrying all 2D detections produced for one image.
class Detection2Ds: public interfaces::BaseIO{
public:
std::vector<Det2DPtr> detections;
};
using Detection2DsPtr = std::shared_ptr<Detection2Ds>;
} // namespace ios
} // namespace deepinfer
} // namespace waytous
#endif
#include "libs/ios/instance_mask.h"
namespace waytous{
namespace deepinfer{
namespace ios{
// Empty mask: dimensions are unset (-1) until an encode() call fills them.
InstanceMask::InstanceMask()
    : width(-1), height(-1), rle_string("") {}
// Copy-construct from a raw pointer; the caller guarantees mask is non-null.
InstanceMask::InstanceMask(const InstanceMask* mask)
    : width(mask->width),
      height(mask->height),
      rle_string(mask->rle_string) {}
// Overwrites this mask with the contents of another.
// Fixed: tolerate a null source — Det2D::update passes
// obj_->mask_ptr.get(), which may legitimately be null.
void InstanceMask::update(const InstanceMask* mask){
    if (mask == nullptr){
        return;
    }
    this->width = mask->width;
    this->height = mask->height;
    this->rle_string = mask->rle_string;
}
// Construct directly from an already-encoded RLE string.
InstanceMask::InstanceMask(int width, int height, const std::string &rle_string)
    : width(width), height(height), rle_string(rle_string) {}
// Construct by RLE-encoding a binary mask image; encode() also
// records the mask dimensions from the matrix.
InstanceMask::InstanceMask(const cv::Mat &map)
{
    encode(map);
}
// Construct by RLE-encoding a raw width*height binary buffer.
InstanceMask::InstanceMask(int width, int height, const char* data)
    : width(width), height(height)
{
    encode(data);
}
// Construct from pre-computed run-length counts (alternating 0/1 runs).
InstanceMask::InstanceMask(int width, int height, const int* cnts, int length)
    : width(width), height(height)
{
    encode(cnts, length);
}
// Decodes the compressed RLE string back into a binary (0/1) CV_8UC1
// mask of size width x height. The string format matches the
// COCO-style varint scheme: 5 value bits + 1 continuation bit per
// character, biased by 48 so the string stays printable; counts after
// the second are stored as deltas from the count two positions back.
cv::Mat InstanceMask::decode(){
    int m=0, p=0, k; long x; int more;
    std::vector<uint> cnts;
    // Stage 1: varint-decode the run-length counts.
    while(rle_string[p]) {
        x=0; k=0; more=1;
        while(more) {
            char c = rle_string[p] - 48;
            x |= (c & 0x1f) << 5*k;
            more = c & 0x20;
            p++; k++;
            if(!more && (c & 0x10))
                x |= -1 << 5*k;   // sign-extend a negative delta
        }
        if(m>2)
            x += (long) cnts[m-2];   // undo delta coding
        cnts.push_back((uint)x);
        m++;
    }
    // Stage 2: expand the runs, alternating pixel value 0/1.
    bool v=0;
    cv::Mat M = cv::Mat::zeros(cv::Size(width, height), CV_8UC1);
    const int total = width * height;
    int index = 0;
    for( int j=0; j<m; j++ ) {
        // BUG FIX: clamp to the mask size — a corrupt or oversized
        // RLE string used to write past the end of the matrix.
        for( k=0; k<(int)cnts[j] && index<total; k++ ){
            M.at<uchar>(index++) = v;
        }
        v=!v;
    }
    return M;
}
// RLE-encodes a raw width*height binary buffer (row-major scan, runs
// alternate starting with value 0) into the compressed string format
// used by decode().
void InstanceMask::encode(const char* data){
    // BUG FIX: start from an empty string — repeated encode() calls
    // used to append onto the previous encoding.
    rle_string.clear();
    uint count = 0;
    char p = 0;
    std::vector<uint> cnts;
    for (int i = 0; i < height * width; i++){
        if(*(data + i) != p){
            cnts.push_back(count);
            count = 0;
            p = *(data + i);
        }
        count++;
    }
    // BUG FIX: flush the final run — it was silently dropped, so a mask
    // ending in foreground lost its last run on a decode round-trip.
    cnts.push_back(count);
    // Varint-compress the counts (delta-coded after the second count).
    long x; int more;
    int m = cnts.size();
    for(int i=0; i<m; i++ ) {
        x=(long) cnts[i];
        if(i>2)
            x -= (long) cnts[i-2];
        more = 1;
        while( more ) {
            char c=x & 0x1f;
            x >>= 5;
            more = (c & 0x10) ? x!=-1 : x!=0;
            if(more)
                c |= 0x20;
            c += 48;
            rle_string.push_back(c);
        }
    }
}
// Varint-compresses pre-computed run-length counts into rle_string
// (counts after the second are delta-coded, matching decode()).
void InstanceMask::encode(const int* cnts, int length){
    // BUG FIX: start from an empty string — repeated encode() calls
    // used to append onto the previous encoding.
    rle_string.clear();
    long x; int more;
    for(int i=0; i<length; i++ ) {
        x=(long) cnts[i];
        if(i>2)
            x -= (long) cnts[i-2];
        more = 1;
        while( more ) {
            char c=x & 0x1f;
            x >>= 5;
            more = (c & 0x10) ? x!=-1 : x!=0;
            if(more)
                c |= 0x20;
            c += 48;
            rle_string.push_back(c);
        }
    }
}
// RLE-encodes a binary mask image. Records the mask dimensions from
// the matrix, then scans row-major (NOTE(review): COCO's reference RLE
// is column-major — these strings are only guaranteed compatible with
// this class's own decode(); confirm before exporting).
void InstanceMask::encode(const cv::Mat& map){
    height = map.rows;
    width = map.cols;
    // BUG FIX: start from an empty string — repeated encode() calls
    // used to append onto the previous encoding.
    rle_string.clear();
    uint count = 0;
    char p = 0;
    std::vector<uint> cnts;
    for (int i=0; i<height; i++){
        for(int j=0; j<width; j++){
            if(map.at<char>(i, j) != p){
                cnts.push_back(count);
                count = 0;
                p = map.at<char>(i, j);
            }
            count++;
        }
    }
    // BUG FIX: flush the final run — it was silently dropped.
    cnts.push_back(count);
    // Varint-compress the counts (delta-coded after the second count).
    long x; int more;
    int m = cnts.size();
    for(int i=0; i<m; i++ ) {
        x=(long) cnts[i];
        if(i>2)
            x -= (long) cnts[i-2];
        more = 1;
        while( more ) {
            char c=x & 0x1f;
            x >>= 5;
            more = (c & 0x10) ? x!=-1 : x!=0;
            if(more)
                c |= 0x20;
            c += 48;
            rle_string.push_back(c);
        }
    }
}
} //namespace ios
} //namspace deepinfer
} //namespace waytous
#ifndef DEEPINFER_INSTANCE_MASK_H_
#define DEEPINFER_INSTANCE_MASK_H_
#include <string>
#include <memory>
#include <opencv2/opencv.hpp>
namespace waytous{
namespace deepinfer{
namespace ios{
// Run-length-encoded binary instance mask. Stores a width x height
// 0/1 mask as a printable COCO-style RLE string; encode() fills
// rle_string from pixels or counts, decode() reconstructs a CV_8UC1 mat.
class InstanceMask{
public:
InstanceMask();
InstanceMask(const InstanceMask* mask);
InstanceMask(int width, int height, const std::string &rle_string);
InstanceMask(const cv::Mat &map);
InstanceMask(int width, int height, const char* data);
InstanceMask(int width, int height, const int* cnts, int length);
void update(const InstanceMask* mask);
cv::Mat decode();
void encode(const cv::Mat& map);
void encode(const char* data);
void encode(const int* cnts, int length);
public:
int width;
int height;
std::string rle_string; // coco encode mask string
};
using InstanceMaskPtr = std::shared_ptr<InstanceMask>;
} //namespace ios
} //namspace deepinfer
} //namespace waytous
#endif
#ifndef DEEPINFER_NORMAL_IOS_H_
#define DEEPINFER_NORMAL_IOS_H_
#include <string>
#include <utility>
#include <vector>
#include "base/blob.h"
#include "interfaces/base_io.h"
namespace waytous {
namespace deepinfer {
namespace ios {
// IO wrapper around a float blob (e.g. a model input/output tensor).
class NormalIO: public interfaces::BaseIO{
public:
    // Takes the blob by value and moves it into place, avoiding an
    // extra shared_ptr refcount bump on construction.
    NormalIO(base::BlobPtr<float> data): data_(std::move(data)){};
    base::BlobPtr<float> data_;
};
using NormalIOPtr = std::shared_ptr<NormalIO>;
// IO wrapper around an int blob (integer-typed tensors).
class NormalIOI: public interfaces::BaseIO{
public:
    // Takes the blob by value and moves it into place, avoiding an
    // extra shared_ptr refcount bump on construction.
    NormalIOI(base::BlobPtr<int> data): data_(std::move(data)){};
    base::BlobPtr<int> data_;
};
} // namespace ios
} // namespace deepinfer
} // namespace waytous
#endif
This diff is collapsed.
#ifndef DEEPINFER_POSTPROCESS_TRDES_H_
#define DEEPINFER_POSTPROCESS_TRDES_H_
#include <cuda_runtime.h>
#include <cstdint>
#include "interfaces/base_unit.h"
#include "base/image.h"
#include "libs/ios/normal_ios.h"
#include "libs/ios/camera_ios.h"
#include "libs/ios/detection.h"
namespace waytous {
namespace deepinfer {
namespace postprocess {
// Post-processing unit for the TraDeS model: turns the network's
// heatmap/regression/segmentation tensors into detections + RLE masks.
class TraDesPostProcess: public interfaces::BaseUnit{
public:
bool Init(YAML::Node& node, YAML::Node& globalParamNode) override;
bool Exec() override;
std::string Name() override;
protected:
// GPU-side decoding of model heads; writes top-K boxes plus per-box
// mask run-length counts into the output buffers.
void trades_postprocess(const float *hm, const float *reg, const float *wh, const float* seg_weights, const float* seg_feat,
int* output_length, float *output_bboxes, int* output_mask_cnt_lengths, int* output_mask_cnts, const int topK, const int max_cnts_length,
const int w, const int h, const int number_classes, const int kernerl_size, const int seg_dims, const float score_threshold);
public:
base::BlobPtr<int> output_length_ptr;
base::BlobPtr<float> bboxes_ptr;
base::BlobPtr<int> maskCnts_lengths_ptr;
base::BlobPtr<int> maskCnts_ptr;
float scoreThreshold, truncatedThreshold;
int classNumber;
std::vector<std::string> classNames;
// Maximum detections kept per image.
int topK = 100;
// Output stride of the network relative to the input image.
int downScale = 4;
// Channel count of the segmentation feature embedding.
int segDims = 64;
// Per-instance cap on stored RLE counts.
int maxCntsLength = 500;
};
} // namespace postprocess
} // namespace deepinfer
} // namespace waytous
#endif
#include "libs/postprocessors/yolov5_post.h"
namespace waytous {
namespace deepinfer {
namespace postprocess {
// Reads YOLOv5 post-process parameters from the unit's YAML config.
// Returns false if the base unit fails to initialize.
bool YoloV5PostProcess::Init(YAML::Node& node, YAML::Node& globalParamNode) {
    if(!BaseUnit::Init(node, globalParamNode)){
        // BUG FIX: message said "trades" (copy-paste from TraDeS unit).
        LOG_WARN << "Init yolov5 postprocess error";
        return false;
    }
    classNames = node["classNames"].as<std::vector<std::string>>();
    truncatedThreshold = node["truncatedThreshold"].as<float>(); // default 5%
    scoreThreshold = node["scoreThreshold"].as<float>();
    keepTopK = node["keepTopK"].as<int>();
    // BUG FIX: the function fell off the end without returning a value
    // (undefined behavior for a bool-returning function).
    return true;
}
bool YoloV5PostProcess::Exec(){
if (inputNames.size() != 4 + outputNames.size()){
LOG_ERROR << "yolov5 postprocess, inputsize != 4 + ouputsize.";
return false;
}
std::vector<base::BlobPtr<float>> inputs; // model outputs
for(int i=0; i<4; i++){
auto inputName = inputNames[i];
auto ptr = std::dynamic_pointer_cast<ios::NormalIO>(interfaces::GetIOPtr(inputName));
inputs.push_back(ptr->data_);
}
std::vector<base::Image8UPtr> inputImages;
for(int j=4; j<inputNames.size(); j++){
auto iName = inputNames[j];
auto iptr = std::dynamic_pointer_cast<ios::CameraSrcOut>(interfaces::GetIOPtr(iName));
inputImages.push_back(iptr->img_ptr_);
}
//post
int* num_detections = static_cast<int*>(static_cast<void*>(inputs[0]->mutable_cpu_data()));// it's type is int32
const float* nmsed_boxes = inputs[1]->cpu_data();
const float* nmsed_scores = inputs[2]->cpu_data();
const float* nmsed_classes = inputs[3]->cpu_data();
for(int b=0; b<inputImages.size(); b++){
auto outName = outputNames[b];
float img_width = float(inputImages[b]->cols());
float img_height = float(inputImages[b]->rows());
float scalex = inputWidth / img_width;
float scaley = inputHeight / img_height;
if(fixAspectRatio){
scalex = scaley = std::min(scalex, scaley);
}
auto dets = std::make_shared<ios::Detection2Ds>(ios::Detection2Ds());
for(int i = 0; i < num_detections[b]; i++){
if(nmsed_scores[b * keepTopK * 1 + i] < scoreThreshold){
continue;
}
ios::Det2DPtr obj = std::make_shared<ios::Det2D>(ios::Det2D());
obj->confidence = nmsed_scores[b * keepTopK * 1 + i];
obj->class_label = int(nmsed_classes[b * keepTopK * 1 + i]);
obj->class_name = classNames[obj->class_label];
obj->x1= nmsed_boxes[b * keepTopK * 4 + i * 4 + 0] / scalex;
obj->y1 = nmsed_boxes[b * keepTopK * 4 + i * 4 + 1] / scaley;
obj->x2 = nmsed_boxes[b * keepTopK * 4 + i * 4 + 2] / scalex;
obj->y2 = nmsed_boxes[b * keepTopK * 4 + i * 4 + 3] / scaley;
obj->image_height = img_height;
obj->image_width = img_width;
obj->validCoordinate(); //
// LOG_INFO << "box:" << obj->x1 << ","<< obj->y1 << ","<< obj->x2 << ","<< obj->y2;
if((obj->x1 / img_width < truncatedThreshold) || (obj->y1 / img_height < truncatedThreshold) ||
(obj->x2 / img_width > (1 - truncatedThreshold)) || (obj->y2 / img_height > (1 - truncatedThreshold))){
obj->truncated = true;
}
dets->detections.push_back(obj);
}
interfaces::SetIOPtr(outName, dets);
}
}
std::string YoloV5PostProcess::Name() {
    // Identifier used by the unit registry.
    static const std::string kName = "YoloV5PostProcess";
    return kName;
}
DEEPINFER_REGISTER_UNIT(YoloV5PostProcess);
} // namespace postprocess
} // namespace deepinfer
} // namespace waytous
#ifndef DEEPINFER_POSTPROCESS_YOLOV5_H_
#define DEEPINFER_POSTPROCESS_YOLOV5_H_
#include <cuda_runtime.h>
#include <cstdint>
#include "interfaces/base_unit.h"
#include "base/image.h"
#include "libs/ios/normal_ios.h"
#include "libs/ios/camera_ios.h"
#include "libs/ios/detection.h"
namespace waytous {
namespace deepinfer {
namespace postprocess {
// Post-processing unit for YOLOv5 TensorRT NMS outputs: rescales the
// NMSed boxes to original-image coordinates and emits Detection2Ds.
class YoloV5PostProcess: public interfaces::BaseUnit{
public:
bool Init(YAML::Node& node, YAML::Node& globalParamNode) override;
bool Exec() override;
std::string Name() override;
public:
// Per-image detection slots in the NMS output tensors.
int keepTopK = 100;
float scoreThreshold, truncatedThreshold;
std::vector<std::string> classNames;
};
} // namespace postprocess
} // namespace deepinfer
} // namespace waytous
#endif
#include "libs/preprocessors/resize_norm.h"
namespace waytous {
namespace deepinfer {
namespace preprocess {
// Reads resize/normalize parameters from YAML, allocates the shared
// destination blob (NCHW, one slot per batch image) and publishes it
// under outputNames[0]. Returns false if the base unit fails to init.
bool ResizeNorm::Init(YAML::Node& node, YAML::Node& globalParamNode){
    if(!BaseUnit::Init(node, globalParamNode)){
        LOG_WARN << "Init resize_norm error";
        return false;
    }
    useBGR = node["useBGR"].as<bool>();
    std::vector<float> inputMean = node["inputMean"].as<std::vector<float>>();
    std::vector<float> inputStd = node["inputStd"].as<std::vector<float>>();
    dst.reset(new base::Blob<float>({inferBatchSize, 3, inputHeight, inputWidth}));
    auto dst_ptr = std::make_shared<ios::NormalIO>(ios::NormalIO(dst));
    interfaces::SetIOPtr(outputNames[0], dst_ptr);
    // Per-channel mean/std uploaded once; read by the CUDA kernel.
    mean.reset(new base::Blob<float>({3, 1}, inputMean.data()));
    std.reset(new base::Blob<float>({3, 1}, inputStd.data()));
    // BUG FIX: the function fell off the end without returning a value
    // (undefined behavior for a bool-returning function).
    return true;
}
bool ResizeNorm::Exec(){
    // For each bound input image: resize + normalize on the GPU into
    // this unit's shared destination blob, one CHW plane-set per input.
    const int planeElems = 3 * inputHeight * inputWidth;
    for(size_t idx = 0; idx < inputNames.size(); idx++){
        auto in = std::dynamic_pointer_cast<ios::CameraSrcOut>(interfaces::GetIOPtr(inputNames[idx]));
        if(in == nullptr){
            LOG_ERROR << "resize norm input ptr haven't init";
            return false;
        }
        auto image = in->img_ptr_;
        // Offset into the batched output blob for this input. // multi inputs
        float* dstSlot = dst->mutable_gpu_data() + idx * planeElems;
        resizeGPU(image->mutable_gpu_data(),
                  image->cols(),
                  image->rows(),
                  image->width_step(),
                  dstSlot,
                  inputWidth, inputHeight,
                  mean->mutable_gpu_data(),
                  std->mutable_gpu_data(),
                  useBGR, fixAspectRatio, stream_);
    }
    return true;
}
// CUDA kernel: warps one BGR8 source image into a planar float tensor,
// applying affine transform d2s (dst -> src), bilinear sampling,
// optional BGR->RGB swap and per-channel mean/std normalization.
// One thread per destination pixel; `edge` is dst_width * dst_height.
__global__ void warpaffine_kernel(
uint8_t* src, int stepwidth, int src_width,
int src_height, float* dst, int dst_width,
int dst_height, uint8_t const_value_st,
AffineMatrix d2s, int edge, float* input_mean, float* input_std, bool bgr) {
int position = blockDim.x * blockIdx.x + threadIdx.x;
if (position >= edge) return;
// Unpack the 2x3 affine matrix mapping destination to source coords.
float m_x1 = d2s.value[0];
float m_y1 = d2s.value[1];
float m_z1 = d2s.value[2];
float m_x2 = d2s.value[3];
float m_y2 = d2s.value[4];
float m_z2 = d2s.value[5];
// Destination pixel coordinates for this thread.
int dx = position % dst_width;
int dy = position / dst_width;
float src_x = m_x1 * dx + m_y1 * dy + m_z1 + 0.5f;
float src_y = m_x2 * dx + m_y2 * dy + m_z2 + 0.5f;
float c0, c1, c2;
if (src_x <= -1 || src_x >= src_width || src_y <= -1 || src_y >= src_height) {
// out of range
c0 = const_value_st;
c1 = const_value_st;
c2 = const_value_st;
} else {
// Bilinear interpolation over the 2x2 source neighborhood; taps that
// fall outside the image read the constant border value instead.
int y_low = floorf(src_y);
int x_low = floorf(src_x);
int y_high = y_low + 1;
int x_high = x_low + 1;
uint8_t const_value[] = {const_value_st, const_value_st, const_value_st};
float ly = src_y - y_low;
float lx = src_x - x_low;
float hy = 1 - ly;
float hx = 1 - lx;
float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
uint8_t* v1 = const_value;
uint8_t* v2 = const_value;
uint8_t* v3 = const_value;
uint8_t* v4 = const_value;
if (y_low >= 0) {
if (x_low >= 0)
v1 = src + y_low * stepwidth + x_low * 3;
if (x_high < src_width)
v2 = src + y_low * stepwidth + x_high * 3;
}
if (y_high < src_height) {
if (x_low >= 0)
v3 = src + y_high * stepwidth + x_low * 3;
if (x_high < src_width)
v4 = src + y_high * stepwidth + x_high * 3;
}
c0 = w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0];
c1 = w1 * v1[1] + w2 * v2[1] + w3 * v3[1] + w4 * v4[1];
c2 = w1 * v1[2] + w2 * v2[2] + w3 * v3[2] + w4 * v4[2];
}
if(!bgr){
//bgr to rgb
float t = c2;
c2 = c0;
c0 = t;
}
//normalization
c0 = (c0 / 255.0f - input_mean[0]) / input_std[0];
c1 = (c1 / 255.0f - input_mean[1]) / input_std[1];
c2 = (c2 / 255.0f - input_mean[2]) / input_std[2];
//rgbrgbrgb to rrrgggbbb or bgrbgrbgr to bbbgggrrr
int area = dst_width * dst_height;
float* pdst_c0 = dst + dy * dst_width + dx;
float* pdst_c1 = pdst_c0 + area;
float* pdst_c2 = pdst_c1 + area;
*pdst_c0 = c0;
*pdst_c1 = c1;
*pdst_c2 = c2;
}
// Builds the source->destination scaling transform (anchored at the
// top-left corner, optionally aspect-ratio preserving), inverts it to
// get the dst->src mapping the kernel needs, and launches
// warpaffine_kernel with one thread per destination pixel on `stream`.
void ResizeNorm::resizeGPU(uint8_t* src, int src_width, int src_height, int step_width,
float* dst, int dst_width, int dst_height, float* input_mean, float* input_std,
bool bgr, bool resizeFixAspectRatio, cudaStream_t stream){
AffineMatrix s2d, d2s;
float scalex = dst_width / (float)src_width;
float scaley = dst_height / (float)src_height;
if(resizeFixAspectRatio){
scalex = scaley = std::min(scalex, scaley);
}
// Pure scaling, no centering (the commented terms would letterbox-center).
s2d.value[0] = scalex;
s2d.value[1] = 0;
s2d.value[2] = 0; // -scalex * src_width * 0.5 + dst_width * 0.5;
s2d.value[3] = 0;
s2d.value[4] = scaley;
s2d.value[5] = 0; // -scaley * src_height * 0.5 + dst_height * 0.5;
cv::Mat m2x3_s2d(2, 3, CV_32F, s2d.value);
cv::Mat m2x3_d2s(2, 3, CV_32F, d2s.value);
cv::invertAffineTransform(m2x3_s2d, m2x3_d2s);
memcpy(d2s.value, m2x3_d2s.ptr<float>(0), sizeof(d2s.value));
// printf("affine: %f, %f, %f, %f, %f, %f\n", d2s.value[0], d2s.value[1], d2s.value[2], d2s.value[3], d2s.value[4], d2s.value[5]);
int jobs = dst_height * dst_width;
int threads = 256;
int blocks = ceil(jobs / (float)threads);
// Asynchronous launch; caller owns synchronization on `stream`.
warpaffine_kernel<<<blocks, threads, 0, stream>>>(
src, step_width, src_width,
src_height, dst, dst_width,
dst_height, 128, d2s, jobs, input_mean, input_std, bgr);
};
std::string ResizeNorm::Name(){
    // Identifier used by the unit registry.
    static const std::string kName = "ResizeNorm";
    return kName;
}
DEEPINFER_REGISTER_UNIT(ResizeNorm);
} // namespace preprocess
} // namespace deepinfer
} // namespace waytous
#ifndef DEEPINFER_PREPROCESS_RESIZE_NORM_H_
#define DEEPINFER_PREPROCESS_RESIZE_NORM_H_
#include <cuda_runtime.h>
#include <cstdint>
#include "interfaces/base_unit.h"
#include "base/image.h"
#include "libs/sources/camera_src.h"
#include "libs/ios/normal_ios.h"
#include "libs/ios/camera_ios.h"
namespace waytous {
namespace deepinfer {
namespace preprocess {
// 2x3 affine transform stored row-major: [a b tx; c d ty].
struct AffineMatrix{
float value[6];
};
// Preprocessing unit: GPU resize + mean/std normalization of camera
// images into a batched planar float blob for inference.
class ResizeNorm: public interfaces::BaseUnit{
public:
bool Init(YAML::Node& node, YAML::Node& globalParamNode) override;
bool Exec() override;
std::string Name() override;
protected:
// Launches the warp-affine CUDA kernel for one image.
void resizeGPU(uint8_t* src, int src_width, int src_height, int step_width,
float* dst, int dst_width, int dst_height, float* input_mean, float* input_std,
bool bgr, bool resizeFixAspectRatio, cudaStream_t stream);
public:
// NOTE(review): stream_ appears to be left default-initialized here
// (no Init assignment visible) — confirm who sets it.
cudaStream_t stream_;
// dst: batched output blob; mean/std: per-channel normalization params.
base::BlobPtr<float> dst, mean, std;
// When true the network consumes BGR order; otherwise channels are swapped to RGB.
bool useBGR = false;
};
} // namespace preprocess
} // namespace deepinfer
} // namespace waytous
#endif
#include "libs/sources/camera_src.h"
namespace waytous {
namespace deepinfer {
namespace sources {
bool CameraSrc::Exec(){
auto src_input = interfaces::GetIOPtr(inputNames[0]);
auto src = std::dynamic_pointer_cast<ios::CameraSrcIn>(src_input);
auto img = std::make_shared<base::Image8U>(base::Image8U(src->cv_img_));
auto dst = std::make_shared<ios::CameraSrcOut>(ios::CameraSrcOut(img));
interfaces::SetIOPtr(outputNames[0], dst);
return true;
}
std::string CameraSrc::Name(){
    // Identifier used by the unit registry.
    static const std::string kName = "CameraSrc";
    return kName;
}
DEEPINFER_REGISTER_UNIT(CameraSrc);
} // namespace sources
} // namespace deepinfer
} // namespace waytous
#ifndef DEEPINFER_CAMERA_SOURCE_H_
#define DEEPINFER_CAMERA_SOURCE_H_
#include <cuda_runtime.h>
#include <cstdint>
#include "base/image.h"
#include "interfaces/base_unit.h"
#include "libs/ios/camera_ios.h"
namespace waytous {
namespace deepinfer {
namespace sources {
// Source unit: converts the pipeline's raw cv::Mat input into the
// Image8U wrapper the downstream GPU units consume.
class CameraSrc: public interfaces::BaseUnit{
public:
bool Exec() override;
std::string Name() override;
};
using CameraSrcPtr = std::shared_ptr<CameraSrc>;
} // namespace sources
} // namespace deepinfer
} // namespace waytous
#endif
This diff is collapsed.
#ifndef WAYTOUS_DEEPINFER_TRACKER_BYTETRACKER_H_
#define WAYTOUS_DEEPINFER_TRACKER_BYTETRACKER_H_
#include <opencv2/opencv.hpp>
#include "interfaces/base_unit.h"
#include "libs/trackers/tracker_datatype.h"
#include "libs/trackers/track.h"
#include "libs/trackers/lapjv.h"
namespace waytous{
namespace deepinfer{
namespace tracker{
// Multi-object tracker unit implementing the ByteTrack association
// scheme: two-stage IoU matching (high then low confidence) solved
// with the LAPJV linear assignment algorithm, states predicted by a
// per-track Kalman filter.
class ByteTracker: public interfaces::BaseUnit
{
public:
bool Init(YAML::Node& node, YAML::Node& globalParamNode) override;
bool Exec() override;
std::string Name() override;
private:
// Set helpers: union / difference of track lists.
std::vector<Track*> joint_tracks(std::vector<Track*> &tlista, std::vector<Track> &tlistb);
std::vector<Track> joint_tracks(std::vector<Track> &tlista, std::vector<Track> &tlistb);
std::vector<Track> sub_tracks(std::vector<Track> &tlista, std::vector<Track> &tlistb);
void remove_duplicate_tracks(std::vector<Track> &resa, std::vector<Track> &resb, std::vector<Track> &tracksa, std::vector<Track> &tracksb);
// Thresholded assignment on an IoU cost matrix; fills matches and the
// unmatched index lists for both sides.
void linear_assignment(std::vector<std::vector<float> > &cost_matrix, int cost_matrix_size, int cost_matrix_size_size, float thresh,
std::vector<std::vector<int> > &matches, std::vector<int> &unmatched_a, std::vector<int> &unmatched_b);
std::vector<std::vector<float> > iou_distance(std::vector<Track*> &atracks, std::vector<Track> &btracks, int &dist_size, int &dist_size_size);
std::vector<std::vector<float> > iou_distance(std::vector<Track> &atracks, std::vector<Track> &btracks);
std::vector<std::vector<float> > ious(std::vector<std::vector<float> > &atlbrs, std::vector<std::vector<float> > &btlbrs);
// Jonker-Volgenant solver wrapper; returns total assignment cost.
// NOTE(review): cost_limit default is LONG_MAX narrowed to float — confirm intended.
double lapjv(const std::vector<std::vector<float> > &cost, std::vector<int> &rowsol, std::vector<int> &colsol,
bool extend_cost = false, float cost_limit = LONG_MAX, bool return_cost = true);
private:
// Detection-confidence thresholds for the two association stages.
float track_thresh;
float high_thresh;
float match_thresh;
int frame_id;
// Frames a lost track is kept before removal.
int max_time_lost;
int init_frames = 0;
std::vector<Track> tracked_tracks;
std::vector<Track> lost_tracks;
std::vector<Track> removed_tracks;
KalmanFilter kalman_filter;
};
} //namespace tracker
} //namspace deepinfer
} //namespace waytous
#endif
#include "libs/trackers/kalman_filter.h"
namespace waytous{
namespace deepinfer{
namespace tracker{
// 0.95 quantile of the chi-square distribution, indexed by degrees of
// freedom (index 0 unused). Presumably used to gate Mahalanobis
// distances from gating_distance() — confirm against callers.
const double KalmanFilter::chi2inv95[10] = {
0,
3.8415,
5.9915,
7.8147,
9.4877,
11.070,
12.592,
14.067,
15.507,
16.919
};
KalmanFilter::KalmanFilter()
{
    // Constant-velocity motion model over the 8-d state
    // [x, y, a, h, vx, vy, va, vh]: F = I with dt on the velocity
    // off-diagonal; H projects onto the first four components.
    const int kNDim = 4;
    const double kDt = 1.;
    _motion_mat = Eigen::MatrixXf::Identity(8, 8);
    for (int k = 0; k < kNDim; k++) {
        _motion_mat(k, kNDim + k) = kDt;
    }
    _update_mat = Eigen::MatrixXf::Identity(4, 8);
    // Noise weights relative to the box height (DeepSORT convention).
    _std_weight_position = 1. / 20;
    _std_weight_velocity = 1. / 160;
}
KAL_DATA KalmanFilter::initiate(const DETECTBOX &measurement)
{
    // Builds the initial 8-d state from one measurement: position terms
    // copied, velocity terms zero. The covariance is diagonal with
    // standard deviations scaled by the measured box height.
    KAL_MEAN mean;
    for (int k = 0; k < 8; k++) {
        mean(k) = (k < 4) ? measurement(k) : 0;
    }
    const float h = measurement[3];
    KAL_MEAN stddev;
    stddev(0) = 2 * _std_weight_position * h;
    stddev(1) = 2 * _std_weight_position * h;
    stddev(2) = 1e-2;
    stddev(3) = 2 * _std_weight_position * h;
    stddev(4) = 10 * _std_weight_velocity * h;
    stddev(5) = 10 * _std_weight_velocity * h;
    stddev(6) = 1e-5;
    stddev(7) = 10 * _std_weight_velocity * h;
    KAL_MEAN squared = stddev.array().square();
    KAL_COVA covariance = squared.asDiagonal();
    return std::make_pair(mean, covariance);
}
void KalmanFilter::predict(KAL_MEAN &mean, KAL_COVA &covariance)
{
    // Time update: x' = F x, P' = F P F^T + Q, with process noise Q
    // built from height-scaled standard deviations.
    DETECTBOX std_pos;
    std_pos << _std_weight_position * mean(3),
        _std_weight_position * mean(3),
        1e-2,
        _std_weight_position * mean(3);
    DETECTBOX std_vel;
    std_vel << _std_weight_velocity * mean(3),
        _std_weight_velocity * mean(3),
        1e-5,
        _std_weight_velocity * mean(3);
    KAL_MEAN stddev;
    stddev.block<1, 4>(0, 0) = std_pos;
    stddev.block<1, 4>(0, 4) = std_vel;
    stddev = stddev.array().square();
    KAL_COVA motion_cov = stddev.asDiagonal();
    KAL_MEAN predicted_mean = this->_motion_mat * mean.transpose();
    KAL_COVA predicted_cov = this->_motion_mat * covariance * (_motion_mat.transpose());
    predicted_cov += motion_cov;
    // Write the prediction back through the in/out parameters.
    mean = predicted_mean;
    covariance = predicted_cov;
}
KAL_HDATA KalmanFilter::project(const KAL_MEAN &mean, const KAL_COVA &covariance)
{
    // Projects the 8-d state into 4-d measurement space (H x, H P H^T)
    // and adds diagonal measurement noise R scaled by the box height.
    DETECTBOX stddev;
    stddev << _std_weight_position * mean(3), _std_weight_position * mean(3),
        1e-1, _std_weight_position * mean(3);
    KAL_HMEAN projected_mean = _update_mat * mean.transpose();
    KAL_HCOVA projected_cov = _update_mat * covariance * (_update_mat.transpose());
    Eigen::Matrix<float, 4, 4> noise = stddev.asDiagonal();
    noise = noise.array().square().matrix();   // R = diag(stddev^2)
    projected_cov += noise;
    return std::make_pair(projected_mean, projected_cov);
}
// Measurement update: computes the Kalman gain via a Cholesky (LLT)
// solve of the projected covariance and folds the measurement
// innovation into the state. Returns the corrected (mean, covariance).
KAL_DATA
KalmanFilter::update(
const KAL_MEAN &mean,
const KAL_COVA &covariance,
const DETECTBOX &measurement)
{
KAL_HDATA pa = project(mean, covariance);
KAL_HMEAN projected_mean = pa.first;
KAL_HCOVA projected_cov = pa.second;
// Equivalent to scipy's cho_factor/cho_solve on projected_cov:
//chol_factor, lower =
//scipy.linalg.cho_factor(projected_cov, lower=True, check_finite=False)
//kalmain_gain =
//scipy.linalg.cho_solve((cho_factor, lower),
//np.dot(covariance, self._upadte_mat.T).T,
//check_finite=False).T
Eigen::Matrix<float, 4, 8> B = (covariance * (_update_mat.transpose())).transpose();
Eigen::Matrix<float, 8, 4> kalman_gain = (projected_cov.llt().solve(B)).transpose(); // eg.8x4
Eigen::Matrix<float, 1, 4> innovation = measurement - projected_mean; //eg.1x4
auto tmp = innovation * (kalman_gain.transpose());
// x' = x + K * innovation ;  P' = P - K S K^T
KAL_MEAN new_mean = (mean.array() + tmp.array()).matrix();
KAL_COVA new_covariance = covariance - kalman_gain * projected_cov*(kalman_gain.transpose());
return std::make_pair(new_mean, new_covariance);
}
// Squared Mahalanobis distance from the projected state distribution
// to each measurement (one column per measurement). Uses a triangular
// solve against the Cholesky factor instead of an explicit inverse.
// only_position=true is not implemented and aborts the process.
Eigen::Matrix<float, 1, -1>
KalmanFilter::gating_distance(
const KAL_MEAN &mean,
const KAL_COVA &covariance,
const std::vector<DETECTBOX> &measurements,
bool only_position)
{
KAL_HDATA pa = this->project(mean, covariance);
if (only_position) {
printf("not implement!");
exit(0);
}
KAL_HMEAN mean1 = pa.first;
KAL_HCOVA covariance1 = pa.second;
// Stack the innovations (measurement - projected mean) row-wise.
// Eigen::Matrix<float, -1, 4, Eigen::RowMajor> d(size, 4);
DETECTBOXSS d(measurements.size(), 4);
int pos = 0;
for (DETECTBOX box : measurements) {
d.row(pos++) = box - mean1;
}
// Solve L z^T = d^T where S = L L^T, then the per-measurement squared
// Mahalanobis distance is the column-wise sum of z element squares.
Eigen::Matrix<float, -1, -1, Eigen::RowMajor> factor = covariance1.llt().matrixL();
Eigen::Matrix<float, -1, -1> z = factor.triangularView<Eigen::Lower>().solve<Eigen::OnTheRight>(d).transpose();
auto zz = ((z.array())*(z.array())).matrix();
auto square_maha = zz.colwise().sum();
return square_maha;
}
} //namespace tracker
} //namspace deepinfer
} //namespace waytous
#ifndef WAYTOUS_DEEPINFER_TRACKER_KALMAN_FILTER_H_
#define WAYTOUS_DEEPINFER_TRACKER_KALMAN_FILTER_H_
#include <Eigen/Cholesky>
#include "libs/trackers/tracker_datatype.h"
namespace waytous{
namespace deepinfer{
namespace tracker{
// Kalman filter over the 8-d track state [x, y, a, h, vx, vy, va, vh]
// (center, aspect ratio, height + velocities), constant-velocity
// motion model, as used by DeepSORT/ByteTrack-style trackers.
class KalmanFilter
{
public:
// chi-square 0.95 quantiles by degrees of freedom, for distance gating.
static const double chi2inv95[10];
KalmanFilter();
// Creates (mean, covariance) for a first measurement.
KAL_DATA initiate(const DETECTBOX& measurement);
// Time update, in place via the reference parameters.
void predict(KAL_MEAN& mean, KAL_COVA& covariance);
// State -> measurement-space projection (adds measurement noise).
KAL_HDATA project(const KAL_MEAN& mean, const KAL_COVA& covariance);
// Measurement update; returns the corrected (mean, covariance).
KAL_DATA update(const KAL_MEAN& mean,
const KAL_COVA& covariance,
const DETECTBOX& measurement);
// Squared Mahalanobis distance to each measurement.
Eigen::Matrix<float, 1, -1> gating_distance(
const KAL_MEAN& mean,
const KAL_COVA& covariance,
const std::vector<DETECTBOX>& measurements,
bool only_position = false);
private:
Eigen::Matrix<float, 8, 8, Eigen::RowMajor> _motion_mat;   // F
Eigen::Matrix<float, 4, 8, Eigen::RowMajor> _update_mat;   // H
float _std_weight_position;
float _std_weight_velocity;
};
} //namespace tracker
} //namspace deepinfer
} //namespace waytous
#endif
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "libs/trackers/lapjv.h"
namespace waytous{
namespace deepinfer{
namespace tracker{
/** Column-reduction and reduction transfer for a dense cost matrix.
*/
// LAPJV step 1: column reduction (each column's v[j] becomes its
// minimum cost; the minimizing row tentatively claims the column)
// followed by reduction transfer for rows that claimed exactly one
// column. Fills x (row->col), y (col->row), v (dual prices) and the
// list of still-unassigned rows; returns how many rows remain free.
int_t _ccrrt_dense(const uint_t n, cost_t *cost[],
int_t *free_rows, int_t *x, int_t *y, cost_t *v)
{
int_t n_free_rows;
boolean *unique;
for (uint_t i = 0; i < n; i++) {
x[i] = -1;
v[i] = LARGE;
y[i] = 0;
}
// Column reduction: v[j] = min_i cost[i][j], y[j] = argmin row.
for (uint_t i = 0; i < n; i++) {
for (uint_t j = 0; j < n; j++) {
const cost_t c = cost[i][j];
if (c < v[j]) {
v[j] = c;
y[j] = i;
}
PRINTF("i=%d, j=%d, c[i,j]=%f, v[j]=%f y[j]=%d\n", i, j, c, v[j], y[j]);
}
}
PRINT_COST_ARRAY(v, n);
PRINT_INDEX_ARRAY(y, n);
NEW(unique, boolean, n);
memset(unique, TRUE, n);
// Walk columns backwards so each row keeps only one claimed column;
// rows claiming several columns are marked non-unique.
{
int_t j = n;
do {
j--;
const int_t i = y[j];
if (x[i] < 0) {
x[i] = j;
}
else {
unique[i] = FALSE;
y[j] = -1;
}
} while (j > 0);
}
n_free_rows = 0;
// Reduction transfer: lower v on a uniquely-assigned column by the
// row's second-best reduced cost, tightening the dual solution.
for (uint_t i = 0; i < n; i++) {
if (x[i] < 0) {
free_rows[n_free_rows++] = i;
}
else if (unique[i]) {
const int_t j = x[i];
cost_t min = LARGE;
for (uint_t j2 = 0; j2 < n; j2++) {
if (j2 == (uint_t)j) {
continue;
}
const cost_t c = cost[i][j2] - v[j2];
if (c < min) {
min = c;
}
}
PRINTF("v[%d] = %f - %f\n", j, v[j], min);
v[j] -= min;
}
}
FREE(unique);
return n_free_rows;
}
/** Augmenting row reduction for a dense cost matrix.
*/
// LAPJV step 2: augmenting row reduction. Each free row finds its best
// and second-best reduced-cost columns; claiming the best may displace
// a previously assigned row, which re-enters the free list. The
// rr_cnt/current*n check bounds the work to avoid cycling. Returns the
// number of rows still free afterwards.
int_t _carr_dense(
const uint_t n, cost_t *cost[],
const uint_t n_free_rows,
int_t *free_rows, int_t *x, int_t *y, cost_t *v)
{
uint_t current = 0;
int_t new_free_rows = 0;
uint_t rr_cnt = 0;
PRINT_INDEX_ARRAY(x, n);
PRINT_INDEX_ARRAY(y, n);
PRINT_COST_ARRAY(v, n);
PRINT_INDEX_ARRAY(free_rows, n_free_rows);
while (current < n_free_rows) {
int_t i0;
int_t j1, j2;
cost_t v1, v2, v1_new;
boolean v1_lowers;
rr_cnt++;
PRINTF("current = %d rr_cnt = %d\n", current, rr_cnt);
const int_t free_i = free_rows[current++];
// Scan the row for its two smallest reduced costs:
// (j1, v1) = best, (j2, v2) = second best.
j1 = 0;
v1 = cost[free_i][0] - v[0];
j2 = -1;
v2 = LARGE;
for (uint_t j = 1; j < n; j++) {
PRINTF("%d = %f %d = %f\n", j1, v1, j2, v2);
const cost_t c = cost[free_i][j] - v[j];
if (c < v2) {
if (c >= v1) {
v2 = c;
j2 = j;
}
else {
v2 = v1;
v1 = c;
j2 = j1;
j1 = j;
}
}
}
i0 = y[j1];
v1_new = v[j1] - (v2 - v1);
v1_lowers = v1_new < v[j1];
PRINTF("%d %d 1=%d,%f 2=%d,%f v1'=%f(%d,%g) \n", free_i, i0, j1, v1, j2, v2, v1_new, v1_lowers, v[j1] - v1_new);
if (rr_cnt < current * n) {
if (v1_lowers) {
// Price drop on j1: keep j1 and retry the displaced row soon.
v[j1] = v1_new;
}
else if (i0 >= 0 && j2 >= 0) {
// No price drop possible: take the second-best column instead.
j1 = j2;
i0 = y[j2];
}
if (i0 >= 0) {
if (v1_lowers) {
free_rows[--current] = i0;
}
else {
free_rows[new_free_rows++] = i0;
}
}
}
else {
// Work bound exceeded: defer the displaced row to the next phase.
PRINTF("rr_cnt=%d >= %d (current=%d * n=%d)\n", rr_cnt, current * n, current, n);
if (i0 >= 0) {
free_rows[new_free_rows++] = i0;
}
}
// Assign the chosen column to this row (possibly displacing i0).
x[free_i] = j1;
y[j1] = free_i;
}
return new_free_rows;
}
/** Find columns with minimum d[j] and put them on the SCAN list.
*/
// Dijkstra helper: partitions cols[lo..n) so that all columns with the
// minimum tentative distance d[j] sit in cols[lo..hi) (the SCAN list).
// Returns hi, the end of the SCAN region.
uint_t _find_dense(const uint_t n, uint_t lo, cost_t *d, int_t *cols, int_t *y)
{
uint_t hi = lo + 1;
cost_t mind = d[cols[lo]];
for (uint_t k = hi; k < n; k++) {
int_t j = cols[k];
if (d[j] <= mind) {
if (d[j] < mind) {
// Strictly smaller distance: restart the SCAN region at lo.
hi = lo;
mind = d[j];
}
// Swap column j into the SCAN region.
cols[k] = cols[hi];
cols[hi++] = j;
}
}
return hi;
}
// Scan all columns in TODO starting from arbitrary column in SCAN
// and try to decrease d of the TODO columns using the SCAN column.
// Dijkstra helper: relaxes the distances of all TODO columns
// (cols[hi..n)) through each SCAN column (cols[lo..hi)). A relaxed
// column whose distance equals the current minimum and which is
// unassigned (y[j] < 0) is a shortest augmenting-path endpoint and is
// returned immediately; otherwise it joins the SCAN list. Returns -1
// when the SCAN list is exhausted without finding a free column.
int_t _scan_dense(const uint_t n, cost_t *cost[],
uint_t *plo, uint_t*phi,
cost_t *d, int_t *cols, int_t *pred,
int_t *y, cost_t *v)
{
uint_t lo = *plo;
uint_t hi = *phi;
cost_t h, cred_ij;
while (lo != hi) {
int_t j = cols[lo++];
const int_t i = y[j];
const cost_t mind = d[j];
// h = reduced cost offset of the row assigned to the scanned column.
h = cost[i][j] - v[j] - mind;
PRINTF("i=%d j=%d h=%f\n", i, j, h);
// For all columns in TODO
for (uint_t k = hi; k < n; k++) {
j = cols[k];
cred_ij = cost[i][j] - v[j] - h;
if (cred_ij < d[j]) {
d[j] = cred_ij;
pred[j] = i;
if (cred_ij == mind) {
if (y[j] < 0) {
return j;
}
cols[k] = cols[hi];
cols[hi++] = j;
}
}
}
}
*plo = lo;
*phi = hi;
return -1;
}
/** Single iteration of modified Dijkstra shortest path algorithm as explained in the JV paper.
*
* This is a dense matrix version.
*
* \return The closest free column index.
*/
// Modified-Dijkstra shortest augmenting path from row start_i to the
// nearest unassigned column, alternating _find_dense (select minimum-
// distance columns) and _scan_dense (relax through them). On success,
// the dual prices v of all READY columns are updated by their distance
// slack so complementary slackness is preserved. Returns the free
// column index ending the path; pred[] encodes the path for _ca_dense.
int_t find_path_dense(
const uint_t n, cost_t *cost[],
const int_t start_i,
int_t *y, cost_t *v,
int_t *pred)
{
uint_t lo = 0, hi = 0;
int_t final_j = -1;
uint_t n_ready = 0;
int_t *cols;
cost_t *d;
NEW(cols, int_t, n);
NEW(d, cost_t, n);
// Initialize distances from the start row's reduced costs.
for (uint_t i = 0; i < n; i++) {
cols[i] = i;
pred[i] = start_i;
d[i] = cost[start_i][i] - v[i];
}
PRINT_COST_ARRAY(d, n);
while (final_j == -1) {
// No columns left on the SCAN list.
if (lo == hi) {
PRINTF("%d..%d -> find\n", lo, hi);
n_ready = lo;
hi = _find_dense(n, lo, d, cols, y);
PRINTF("check %d..%d\n", lo, hi);
PRINT_INDEX_ARRAY(cols, n);
// A minimum-distance column that is unassigned ends the path.
for (uint_t k = lo; k < hi; k++) {
const int_t j = cols[k];
if (y[j] < 0) {
final_j = j;
}
}
}
if (final_j == -1) {
PRINTF("%d..%d -> scan\n", lo, hi);
final_j = _scan_dense(
n, cost, &lo, &hi, d, cols, pred, y, v);
PRINT_COST_ARRAY(d, n);
PRINT_INDEX_ARRAY(cols, n);
PRINT_INDEX_ARRAY(pred, n);
}
}
PRINTF("found final_j=%d\n", final_j);
PRINT_INDEX_ARRAY(cols, n);
// Dual update for all READY (fully scanned) columns.
{
const cost_t mind = d[cols[lo]];
for (uint_t k = 0; k < n_ready; k++) {
const int_t j = cols[k];
v[j] += d[j] - mind;
}
}
FREE(cols);
FREE(d);
return final_j;
}
/** Augment for a dense cost matrix.
*/
// LAPJV step 3: augmentation. For each remaining free row, find a
// shortest augmenting path and flip the assignments along it
// (following pred[] back from the free column to the starting row).
// The k >= n assertion guards against a malformed/cyclic path.
int_t _ca_dense(
const uint_t n, cost_t *cost[],
const uint_t n_free_rows,
int_t *free_rows, int_t *x, int_t *y, cost_t *v)
{
int_t *pred;
NEW(pred, int_t, n);
for (int_t *pfree_i = free_rows; pfree_i < free_rows + n_free_rows; pfree_i++) {
int_t i = -1, j;
uint_t k = 0;
PRINTF("looking at free_i=%d\n", *pfree_i);
j = find_path_dense(n, cost, *pfree_i, y, v, pred);
ASSERT(j >= 0);
ASSERT(j < n);
// Walk the path backwards, swapping row/column assignments.
while (i != *pfree_i) {
PRINTF("augment %d\n", j);
PRINT_INDEX_ARRAY(pred, n);
i = pred[j];
PRINTF("y[%d]=%d -> %d\n", j, y[j], i);
y[j] = i;
PRINT_INDEX_ARRAY(x, n);
SWAP_INDICES(j, x[i]);
k++;
if (k >= n) {
ASSERT(FALSE);
}
}
}
FREE(pred);
return 0;
}
/** Solve dense sparse LAP.
*/
// Solves the dense n x n linear assignment problem (Jonker-Volgenant):
// column reduction + reduction transfer, at most two augmenting row
// reduction passes, then shortest-augmenting-path augmentation for any
// rows still free. On success x[i] is the column assigned to row i and
// y[j] the row assigned to column j; returns 0 (negative on alloc
// failure via the NEW macro).
int lapjv_internal(
const uint_t n, cost_t *cost[],
int_t *x, int_t *y)
{
int ret;
int_t *free_rows;
cost_t *v;
NEW(free_rows, int_t, n);
NEW(v, cost_t, n);
ret = _ccrrt_dense(n, cost, free_rows, x, y, v);
int i = 0;
// Up to two augmenting-row-reduction sweeps, per the reference LAPJV.
while (ret > 0 && i < 2) {
ret = _carr_dense(n, cost, ret, free_rows, x, y, v);
i++;
}
if (ret > 0) {
ret = _ca_dense(n, cost, ret, free_rows, x, y, v);
}
FREE(v);
FREE(free_rows);
return ret;
}
} //namespace tracker
} //namspace deepinfer
} //namespace waytous
#ifndef WAYTOUS_DEEPINFER_TRACKER_LAPJV_H_
#define WAYTOUS_DEEPINFER_TRACKER_LAPJV_H_
namespace waytous{
namespace deepinfer{
namespace tracker{
// Sentinel cost treated as infinity by the solver.
#define LARGE 1000000
#if !defined TRUE
#define TRUE 1
#endif
#if !defined FALSE
#define FALSE 0
#endif
// Allocation helper: on malloc failure the ENCLOSING function returns -1,
// so NEW may only be used inside functions returning int_t/int.
#define NEW(x, t, n) if ((x = (t *)malloc(sizeof(t) * (n))) == 0) { return -1; }
#define FREE(x) if (x != 0) { free(x); x = 0; }
#define SWAP_INDICES(a, b) { int_t _temp_index = a; a = b; b = _temp_index; }
#if 0
#include <assert.h>
#define ASSERT(cond) assert(cond)
#define PRINTF(fmt, ...) printf(fmt, ##__VA_ARGS__)
#define PRINT_COST_ARRAY(a, n) \
while (1) { \
printf(#a" = ["); \
if ((n) > 0) { \
printf("%f", (a)[0]); \
for (uint_t j = 1; j < n; j++) { \
printf(", %f", (a)[j]); \
} \
} \
printf("]\n"); \
break; \
}
#define PRINT_INDEX_ARRAY(a, n) \
while (1) { \
printf(#a" = ["); \
if ((n) > 0) { \
printf("%d", (a)[0]); \
for (uint_t j = 1; j < n; j++) { \
printf(", %d", (a)[j]); \
} \
} \
printf("]\n"); \
break; \
}
#else
#define ASSERT(cond)
#define PRINTF(fmt, ...)
#define PRINT_COST_ARRAY(a, n)
#define PRINT_INDEX_ARRAY(a, n)
#endif
typedef signed int int_t;
typedef unsigned int uint_t;
typedef double cost_t;
typedef char boolean;
typedef enum fp_t { FP_1 = 1, FP_2 = 2, FP_DYNAMIC = 3 } fp_t;
extern int_t lapjv_internal(
const uint_t n, cost_t *cost[],
int_t *x, int_t *y);
} //namespace tracker
} //namspace deepinfer
} //namespace waytous
#endif
#include "libs/trackers/track.h"
namespace waytous{
namespace deepinfer{
namespace tracker{
// Build a new track from a 2D detection: cache a copy of the detection,
// derive the (top-left x, y, width, height) box from its corners, and start
// the track in the New state with no id assigned yet.
Track::Track(ios::Det2DPtr obj)
{
    obj_ = obj->copy();
    _tlwh.resize(4);
    const float left = obj->x1;
    const float top = obj->y1;
    _tlwh[0] = left;
    _tlwh[1] = top;
    _tlwh[2] = obj->x2 - left;
    _tlwh[3] = obj->y2 - top;
    is_activated = false;
    track_id = 0;
    state = TrackState::New;
    // Size the derived boxes before static_tlwh()/static_tlbr() write them.
    tlwh.resize(4);
    tlbr.resize(4);
    static_tlwh();
    static_tlbr();
    frame_id = 0;
    tracklet_len = 0;
    start_frame = 0;
}
// Nothing to release explicitly: members (vectors, Eigen matrices, the
// shared detection pointer) clean themselves up.
Track::~Track()
{
}
// Start tracking this detection: take the given Kalman filter, assign a
// fresh track id, and seed the filter state from the cached detection box.
// The track becomes Tracked; is_activated is set only on the very first
// frame (later tracks are confirmed via re_activate/update).
void Track::activate(KalmanFilter &kalman_filter, int frame_id)
{
    this->kalman_filter = kalman_filter;
    this->track_id = this->next_id();
    this->obj_->track_id = this->track_id;
    // Convert the cached tlwh box into the filter's measurement space
    // (center-x, center-y, aspect ratio, height).
    std::vector<float> xyah = tlwh_to_xyah(this->_tlwh);
    DETECTBOX xyah_box;
    for (int k = 0; k < 4; k++) {
        xyah_box[k] = xyah[k];
    }
    auto mc = this->kalman_filter.initiate(xyah_box);
    this->mean = mc.first;
    this->covariance = mc.second;
    static_tlwh();
    static_tlbr();
    this->tracklet_len = 0;
    this->state = TrackState::Tracked;
    // Only detections from the first frame of the sequence are confirmed
    // immediately.
    if (frame_id == 1)
    {
        this->is_activated = true;
    }
    this->frame_id = frame_id;
    this->start_frame = frame_id;
}
// Revive a lost track with a fresh detection: fold the new measurement into
// the Kalman state, mark the track Tracked and confirmed, and optionally
// hand it a brand-new id.
void Track::re_activate(Track &new_track, int frame_id, bool new_id)
{
    std::vector<float> measurement = tlwh_to_xyah(new_track.tlwh);
    DETECTBOX xyah_box;
    for (int k = 0; k < 4; k++) {
        xyah_box[k] = measurement[k];
    }
    auto mc = this->kalman_filter.update(this->mean, this->covariance, xyah_box);
    this->mean = mc.first;
    this->covariance = mc.second;
    static_tlwh();
    static_tlbr();
    this->tracklet_len++;
    this->state = TrackState::Tracked;
    this->is_activated = true;
    this->frame_id = frame_id;
    if (new_id) {
        this->track_id = next_id();
        this->obj_->track_id = this->track_id;
    }
    // Propagate the matched detection's attributes, then the smoothed box.
    this->obj_->update(new_track.obj_);
    this->obj_->update(tlbr);
}
// Fold a matched detection into this track: run the Kalman update with the
// new box, refresh the derived boxes, and sync the underlying detection.
void Track::update(Track &new_track, int frame_id)
{
    this->frame_id = frame_id;
    this->tracklet_len++;
    std::vector<float> measurement = tlwh_to_xyah(new_track.tlwh);
    DETECTBOX xyah_box;
    for (int k = 0; k < 4; k++) {
        xyah_box[k] = measurement[k];
    }
    auto mc = this->kalman_filter.update(this->mean, this->covariance, xyah_box);
    this->mean = mc.first;
    this->covariance = mc.second;
    static_tlwh();
    static_tlbr();
    this->state = TrackState::Tracked;
    this->is_activated = true;
    // Keep the raw detection box around for mask visualization.
    this->_tlwh = new_track._tlwh;
    // Propagate the matched detection's attributes, then the smoothed box.
    this->obj_->update(new_track.obj_);
    this->obj_->update(tlbr);
}
// Refresh the public tlwh box. New tracks echo the raw detection box;
// otherwise the Kalman mean (center-x, center-y, aspect, height) is decoded
// back to top-left/width/height form.
void Track::static_tlwh()
{
    if (this->state == TrackState::New)
    {
        for (int k = 0; k < 4; k++) {
            tlwh[k] = _tlwh[k];
        }
        return;
    }
    const float h = mean[3];
    const float w = mean[2] * h; // mean[2] holds the aspect ratio w/h
    tlwh[2] = w;
    tlwh[3] = h;
    tlwh[0] = mean[0] - w / 2;
    tlwh[1] = mean[1] - h / 2;
}
// Derive the (x1, y1, x2, y2) box from the current tlwh box.
void Track::static_tlbr()
{
    tlbr = tlwh;
    tlbr[2] = tlbr[0] + tlbr[2];
    tlbr[3] = tlbr[1] + tlbr[3];
}
// Convert (top-left x, top-left y, w, h) into the Kalman measurement space
// (center-x, center-y, aspect ratio w/h, h). The argument is taken by value
// and left untouched for the caller.
std::vector<float> Track::tlwh_to_xyah(std::vector<float> tlwh_tmp)
{
    std::vector<float> xyah(tlwh_tmp);
    xyah[0] = tlwh_tmp[0] + tlwh_tmp[2] / 2;
    xyah[1] = tlwh_tmp[1] + tlwh_tmp[3] / 2;
    xyah[2] = tlwh_tmp[2] / tlwh_tmp[3];
    return xyah;
}
// Current smoothed box in Kalman measurement space (cx, cy, aspect, h).
std::vector<float> Track::to_xyah()
{
return tlwh_to_xyah(tlwh);
}
// Convert an (x1, y1, x2, y2) box to (x, y, w, h).
// NOTE(review): converts IN PLACE -- the caller's vector is mutated and then
// also returned by value. Callers must not keep using the argument as a
// tlbr box afterwards.
std::vector<float> Track::tlbr_to_tlwh(std::vector<float> &tlbr)
{
tlbr[2] -= tlbr[0];
tlbr[3] -= tlbr[1];
return tlbr;
}
// Transition the track to the Lost state (no longer matched this frame).
void Track::mark_lost()
{
state = TrackState::Lost;
}
// Transition the track to the Removed state (dropped from tracking).
void Track::mark_removed()
{
state = TrackState::Removed;
}
// Hand out monotonically increasing track ids, starting at 1.
// NOTE(review): the counter is a function-local static shared by all Track
// instances and is not synchronized -- assumes single-threaded use.
int Track::next_id()
{
    static int _count = 0;
    return ++_count;
}
// Frame id of the most recent activate/update for this track.
int Track::end_frame()
{
return this->frame_id;
}
// Run the Kalman prediction step for every track. Tracks that are not
// currently Tracked get their last state-velocity component (mean[7],
// presumably the height velocity) zeroed first so stale boxes do not keep
// drifting.
void Track::multi_predict(std::vector<Track*> &tracks, KalmanFilter &kalman_filter)
{
    for (auto *trk : tracks)
    {
        if (trk->state != TrackState::Tracked)
        {
            trk->mean[7] = 0;
        }
        kalman_filter.predict(trk->mean, trk->covariance);
    }
}
} //namespace tracker
} //namespace deepinfer
} //namespace waytous
#ifndef WAYTOUS_DEEPINFER_TRACK_H_
#define WAYTOUS_DEEPINFER_TRACK_H_
#include <vector>
#include "libs/ios/detection.h"
#include "libs/trackers/tracker_datatype.h"
#include "libs/trackers/kalman_filter.h"
namespace waytous{
namespace deepinfer{
namespace tracker{
// Lifecycle states of a single track.
enum TrackState { New = 0, Tracked, Lost, Removed };
// One tracked object: wraps a 2D detection, a Kalman-filtered box state,
// and bookkeeping (id, frame counters, lifecycle state).
class Track
{
public:
// Build a track in the New state from a 2D detection; no id assigned yet.
Track(ios::Det2DPtr obj);
~Track();
// Convert (x1,y1,x2,y2) to (x,y,w,h). NOTE: mutates the argument in place.
std::vector<float> static tlbr_to_tlwh(std::vector<float> &tlbr);
// Run the Kalman prediction step for every track in the list.
void static multi_predict(std::vector<Track*> &tracks, KalmanFilter &kalman_filter);
// Refresh tlwh from the raw detection (New state) or the Kalman mean.
void static_tlwh();
// Refresh tlbr from tlwh.
void static_tlbr();
// Convert (x,y,w,h) to (center-x, center-y, aspect w/h, h).
std::vector<float> tlwh_to_xyah(std::vector<float> tlwh_tmp);
// Current smoothed box in Kalman measurement space.
std::vector<float> to_xyah();
void mark_lost();
void mark_removed();
// Next globally unique track id (shared static counter).
int next_id();
// Frame id of the most recent update.
int end_frame();
// Start tracking: assign an id and seed the Kalman filter.
void activate(KalmanFilter &kalman_filter, int frame_id);
// Revive a lost track with a new detection; optionally assign a new id.
void re_activate(Track &new_track, int frame_id, bool new_id = false);
// Fold a matched detection into the Kalman state.
void update(Track &new_track, int frame_id);
public:
bool is_activated; // true once the track has been confirmed
int track_id;
int state; // holds a TrackState value
std::vector<float> _tlwh; // raw detection box (top-left x, y, w, h)
std::vector<float> tlwh; // smoothed box (top-left x, y, w, h)
std::vector<float> tlbr; // smoothed box (x1, y1, x2, y2)
int frame_id; // frame of the most recent update
int tracklet_len; // consecutive updates since (re)activation
int start_frame; // frame on which the track was activated
KAL_MEAN mean; // Kalman state mean (1x8)
KAL_COVA covariance; // Kalman state covariance (8x8)
ios::Det2DPtr obj_; // underlying detection, kept in sync with the track
private:
KalmanFilter kalman_filter;
};
} //namespace tracker
} //namespace deepinfer
} //namespace waytous
#endif
#ifndef WAYTOUS_DEEPINFER_TRACKER_DATATYPE_H_
#define WAYTOUS_DEEPINFER_TRACKER_DATATYPE_H_
#include <vector>
#include <Eigen/Core>
#include <Eigen/Dense>
namespace waytous{
namespace deepinfer{
namespace tracker{
// Single detection box in (center-x, center-y, aspect, height) order.
typedef Eigen::Matrix<float, 1, 4, Eigen::RowMajor> DETECTBOX; // xyah
// Stack of detection boxes, one row per detection.
typedef Eigen::Matrix<float, -1, 4, Eigen::RowMajor> DETECTBOXSS;
//Kalmanfilter
//typedef Eigen::Matrix<float, 8, 8, Eigen::RowMajor> KAL_FILTER;
typedef Eigen::Matrix<float, 1, 8, Eigen::RowMajor> KAL_MEAN; // state mean
typedef Eigen::Matrix<float, 8, 8, Eigen::RowMajor> KAL_COVA; // state covariance
typedef Eigen::Matrix<float, 1, 4, Eigen::RowMajor> KAL_HMEAN; // projected (measurement) mean
typedef Eigen::Matrix<float, 4, 4, Eigen::RowMajor> KAL_HCOVA; // projected (measurement) covariance
using KAL_DATA = std::pair<KAL_MEAN, KAL_COVA>;
using KAL_HDATA = std::pair<KAL_HMEAN, KAL_HCOVA>;
//tracker:
// A (track index, detection index) match produced by association.
using MATCH_DATA = std::pair<int, int>;
// Result of one association round: matched pairs plus the indices left
// unmatched on each side.
typedef struct t {
std::vector<MATCH_DATA> matches;
std::vector<int> unmatched_tracks;
std::vector<int> unmatched_detections;
}TRACHER_MATCHD;
//linear_assignment:
// Dynamically sized cost matrix used by linear assignment.
typedef Eigen::Matrix<float, -1, -1, Eigen::RowMajor> DYNAMICM;
} //namespace tracker
} //namespace deepinfer
} //namespace waytous
#endif
#include "models/camera_model.h"
namespace waytous {
namespace deepinfer {
namespace camera {
// Load the model's YAML config, read the input/output binding names, and
// construct + initialize every processing unit listed under "units".
// Returns false (with a warning logged) on a missing config file, a bad
// YAML document, or any unit that fails to initialize.
bool CameraModel::Init(std::string& configPath) {
    // CUDA_CHECK(cudaStreamCreate(&stream_));
    std::string cfgPath = common::GetAbsolutePath(common::ConfigRoot::GetRootPath(), configPath);
    if (!common::PathExists(cfgPath)) {
        LOG_WARN << "config_file "<< common::ConfigRoot::GetRootPath() << " " << cfgPath << " not exist.";
        return false;
    }
    modelConfigNode = YAML::LoadFile(cfgPath);
    if (modelConfigNode.IsNull()) {
        LOG_WARN << "Load " << configPath << " failed! please check!";
        return false;
    }
    // BUG FIX: YAML::Node::as<T>(arg) treats its argument as a FALLBACK
    // value, not a lookup key; index the mapping first, then convert.
    inputNames = modelConfigNode["inputNames"].as<std::vector<std::string>>();
    outputNames = modelConfigNode["outputNames"].as<std::vector<std::string>>();
    auto globalParamNode = modelConfigNode["globalParams"];
    // Build each configured unit, in the order listed.
    auto unitNodes = modelConfigNode["units"];
    for (std::size_t i = 0; i < unitNodes.size(); i++) {
        auto unitNode = unitNodes[i];
        std::string unitName = unitNode["name"].as<std::string>();
        // NOTE(review): assumes BaseUnit is copy-constructible from whatever
        // GetInstanceByName returns -- confirm against BaseUnitRegisterer.
        interfaces::BaseUnitPtr unitPtr = std::make_shared<interfaces::BaseUnit>(interfaces::BaseUnitRegisterer::GetInstanceByName(unitName));
        if (!unitPtr->Init(unitNode, globalParamNode)) {
            LOG_WARN << "Init unit " << unitName << " failed!";
            return false;
        }
        units_.push_back(unitPtr);
    }
    return true;
}
// Run the full unit pipeline on raw cv::Mat inputs: bind inputs[i] to the
// configured name inputNames[i], execute every unit in order, then collect
// the IOs registered under the configured output names into `outputs`.
bool CameraModel::Exec(std::vector<cv::Mat*> inputs, std::vector<interfaces::BaseIOPtr>& outputs){
    // BUG FIX: guard against more inputs than configured names, which would
    // index past the end of inputNames (undefined behavior).
    if (inputs.size() > inputNames.size()) {
        LOG_WARN << "Got " << inputs.size() << " inputs but only "
                 << inputNames.size() << " configured input names.";
        return false;
    }
    // Bind each image to its configured source slot.
    for (size_t i = 0; i < inputs.size(); i++) {
        auto input = std::make_shared<ios::CameraSrcIn>(ios::CameraSrcIn(inputs[i]));
        interfaces::SetIOPtr(inputNames[i], input);
    }
    // Execute the unit chain in configured order (by reference: avoid
    // copying the shared_ptr each iteration).
    for (const auto& unit : units_) {
        unit->Exec();
    }
    // Collect the configured outputs.
    for (const auto& outName : outputNames) {
        outputs.push_back(interfaces::GetIOPtr(outName));
    }
    return true;
}
// Run the full unit pipeline on already-loaded device images: bind
// inputs[i] to the configured name inputNames[i], execute every unit in
// order, then collect the configured outputs into `outputs`.
bool CameraModel::Exec(std::vector<base::Image8UPtr> inputs, std::vector<interfaces::BaseIOPtr>& outputs){
    // BUG FIX: guard against more inputs than configured names, which would
    // index past the end of inputNames (undefined behavior).
    if (inputs.size() > inputNames.size()) {
        LOG_WARN << "Got " << inputs.size() << " inputs but only "
                 << inputNames.size() << " configured input names.";
        return false;
    }
    // Bind each image to its configured source slot.
    for (size_t i = 0; i < inputs.size(); i++) {
        auto input = std::make_shared<ios::CameraSrcOut>(ios::CameraSrcOut(inputs[i]));
        interfaces::SetIOPtr(inputNames[i], input);
    }
    // Execute the unit chain in configured order (by reference: avoid
    // copying the shared_ptr each iteration).
    for (const auto& unit : units_) {
        unit->Exec();
    }
    // Collect the configured outputs.
    for (const auto& outName : outputNames) {
        outputs.push_back(interfaces::GetIOPtr(outName));
    }
    return true;
}
// Registered model name used by the model factory/registerer.
std::string CameraModel::Name(){
return "CameraModel";
}
DEEPINFER_REGISTER_MODELS(CameraModel);
} // namespace camera
} // namespace deepinfer
} // namespace waytous
#ifndef DEEPINFER_CAMERA_MODEL_H_
#define DEEPINFER_CAMERA_MODEL_H_
#include <string>
#include <vector>
#include <memory>
#include "libs/sources/camera_src.h"
#include "interfaces/base_model.h"
namespace waytous {
namespace deepinfer {
namespace camera {
// Camera inference model: loads a YAML pipeline description and runs a
// chain of processing units over image inputs.
class CameraModel: public interfaces::BaseModel{
public:
// Parse the YAML at configPath (relative to the config root) and build the
// unit pipeline. Returns false on any failure.
virtual bool Init(std::string& configPath) override;
// Run the pipeline on raw cv::Mat inputs / preloaded device images.
// NOTE(review): not marked override -- presumably these overloads are not
// declared in BaseModel; confirm against the base class.
virtual bool Exec(std::vector<cv::Mat*> inputs, std::vector<interfaces::BaseIOPtr>& outputs);
virtual bool Exec(std::vector<base::Image8UPtr> inputs, std::vector<interfaces::BaseIOPtr>& outputs);
virtual std::string Name() override;
public:
YAML::Node modelConfigNode; // parsed model configuration
std::vector<interfaces::BaseUnitPtr> units_; // pipeline units in execution order
};
} // namespace camera
} // namespace deepinfer
} // namespace waytous
#endif
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment