Commit 7d1e70d6 authored by xin.wang.waytous's avatar xin.wang.waytous

camera-multi-task-model

parent 05bdf66c
......@@ -62,6 +62,11 @@
"random": "cpp",
"string": "cpp",
"unordered_set": "cpp",
"dense": "cpp"
"dense": "cpp",
"csignal": "cpp",
"forward_list": "cpp",
"codecvt": "cpp",
"filesystem": "cpp",
"__nullptr": "cpp"
}
}
\ No newline at end of file
......@@ -7,6 +7,7 @@ project(WaytousDeepInfer)
# cuda opencv
find_package(CUDA REQUIRED)
# find_package(CUDAToolkit REQUIRED)
find_package(OpenCV REQUIRED)
# enable_language(CUDA)
link_directories(${OpenCV_LIBRARIES_DIRS})
......@@ -24,8 +25,10 @@ include_directories(${EIGEN3_INCLUDE_DIRS})
include_directories(include)
# tensorrt
include_directories(/root/TensorRT/TensorRT-7.0.0.11-cuda-10.0/include)
link_directories(/root/TensorRT/TensorRT-7.0.0.11-cuda-10.0/lib)
# include_directories(/root/TensorRT/TensorRT-7.0.0.11-cuda-10.0/include)
# link_directories(/root/TensorRT/TensorRT-7.0.0.11-cuda-10.0/lib)
include_directories(/root/TensorRT/TensorRT-8.2.3.0/include)
link_directories(/root/TensorRT/TensorRT-8.2.3.0/lib)
# aarch64
# # figure out real path
# include_directories(/home/nvidia/.../tensorrt-7.1-aarch64-DCN/include)
......@@ -43,6 +46,7 @@ include_directories(/usr/local/include/yaml-cpp)
include_directories(/usr/include/gflags)
include_directories(/usr/include/glog)
# -D CUDA_nppicom_LIBRARY=/usr/local/cuda/lib64/libnppim.so
set(DEP_LIBS
${CUDA_npp_LIBRARY}
${CUDA_nppc_LIBRARY}
......
# Camera multi-task pipeline: one YOLOv5-style network producing 2D
# detections with instance masks, semantic masks and a depth map.
configRootPath: /home/wangxin/projects/waytous/DeepInfer
device: 0  # GPU device id
modelName: CameraModel
modelConfigPath: configs/tasks/multi/multi_yolov5.yaml
name: CameraModel
inputNames: [cvImage]
outputNames: [out_instances, out_semantics, out_depths]
# outputNames: [detections, segProtos, rawDepths, rawSemantics]
units:
  -
    # Wraps the incoming cv::Mat into the pipeline's 8-bit image IO.
    name: CameraSrc
    inputNames: [cvImage]
    outputNames: [uint8Image]
  -
    # Resize to the network input size and normalize with mean/std.
    name: ResizeNorm
    inputNames: [uint8Image]
    outputNames: [resizeNormImages]
    inputMean: [0, 0, 0]
    inputStd: [1, 1, 1]
    inferBatchSize: 1
    inputWidth: 640
    inputHeight: 640
    fixAspectRatio: false
    useBGR: false
  -
    # TensorRT inference unit (ONNX weights + serialized engine paths).
    name: TRTInference
    inputNames: [resizeNormImages]
    outputNames: [detections, seg_protos, depths, semantics]
    runMode: 1 # 0-fp32 1-fp16 2-int8 (int8 not supported)
    weightsPath: "configs/tasks/multi/isp_68m_last.onnx"
    engineFile: "configs/tasks/multi/isp_68m_last_fp16.engine"
    # calibImgPathFile: "configs/tasks/multi/isp_calib_imgs.txt"
    # calibTableCache: "configs/tasks/multi/isp_calib_table.cache"
    inferDynamic: false
    inferBatchSize: 1
    inputWidth: 640
    inputHeight: 640
    maxBatchSize: 1 # used when build engine
  -
    # Decodes boxes, runs NMS, assembles instance/semantic masks and depth.
    name: MultiPostProcess
    inputNames: [detections, seg_protos, depths, semantics, uint8Image]
    outputNames: [out_instances, out_semantics, out_depths]
    inputWidth: 640
    inputHeight: 640
    fixAspectRatio: false
    # instance-seg
    nmsThreshold: 0.45
    scoreThreshold: 0.2 # used when inference, can be modified
    truncatedThreshold: 0.05  # border ratio marking boxes as truncated
    maxOutputNum: 1000
    rawDetectionLength: 25200
    keepTopK: 100
    segProtoDim: 32
    instanceDownScale: 4
    instanceClassNumber: 9
    instanceClassNames: ["pedestrian", "two_wheel", "car", "truck", "construction_machine", "fence", "stone", "dust", "cone"]
    # semantic-seg
    semanticDownScale: 4
    semanticClassNumber: 2
    semanticClassNames: ["road", "sky"]
    # depth
    depthDownScale: 4
    depthDistanceScale: 256  # raw depth output is multiplied by this in post-process
......@@ -25,6 +25,7 @@ public:
// camera image
virtual bool Exec(std::vector<cv::Mat*> inputs, std::vector<interfaces::BaseIOPtr>& outputs) = 0;
virtual void Visualize(cv::Mat* image, interfaces::BaseIOPtr outs) = 0;
virtual void Visualize(cv::Mat* image, std::vector<interfaces::BaseIOPtr>& outputs){};
virtual cv::Scalar get_color(int idx);
......
#ifndef DEEPINFER_DEPTH_H_
#define DEEPINFER_DEPTH_H_
#include <vector>
#include <string>
#include <opencv2/opencv.hpp>
#include "common/common.h"
#include "interfaces/base_io.h"
namespace waytous {
namespace deepinfer {
namespace ios {
// Pipeline IO carrying the dense depth map produced by MultiPostProcess.
// `depth` is a CV_32FC1 map at the network's downscaled output resolution,
// with raw values already multiplied by depthDistanceScale.
class Depth: public interfaces::BaseIO{
public:
    cv::Mat depth;  // per-pixel depth, CV_32FC1
};
using DepthPtr = std::shared_ptr<Depth>;
} // namespace ios
} // namespace deepinfer
} // namespace waytous
#endif
......@@ -44,6 +44,7 @@ public:
std::string class_name;
bool truncated = false;
InstanceMaskPtr mask_ptr = nullptr;
std::vector<float> mask_coeff;
int track_id = -1;
int camera_id = -1;
};
......
#ifndef DEEPINFER_SEMANTIC_H_
#define DEEPINFER_SEMANTIC_H_
#include <vector>
#include <string>
#include "common/common.h"
#include "interfaces/base_io.h"
#include "libs/ios/instance_mask.h"
namespace waytous {
namespace deepinfer {
namespace ios {
// One semantic-segmentation class map (e.g. "road", "sky").
class SemanticSeg{
public:
    int class_label = 0;                 // index into the configured class list
    std::string class_name;              // human-readable class name
    InstanceMaskPtr mask_ptr = nullptr;  // run-length encoded class mask
    int camera_id = -1;                  // source camera, -1 when unset
};
using SemanticSegPtr = std::shared_ptr<SemanticSeg>;
// Pipeline IO bundling all semantic masks from one inference pass.
class Semantics: public interfaces::BaseIO{
public:
    std::vector<SemanticSegPtr> semantic_segs;
};
using SemanticsPtr = std::shared_ptr<Semantics>;
} // namespace ios
} // namespace deepinfer
} // namespace waytous
#endif
......@@ -73,7 +73,11 @@ int main(int argc, char** argv){
std::chrono::duration_cast<std::chrono::microseconds>(e2 - e1).count() / 1000. / infer_count << " ms" << std::endl;
if(inputs.size() != outputs.size()){
cv::Mat vis = images[0];
if(taskName == "TaskMulti"){
t->Visualize(&vis, outputs);
}else{
t->Visualize(&vis, outputs[0]);
}
cv::imwrite(savePaths[0], vis);
}else{
for(int i=0; i<inputs.size(); i++){
......@@ -124,6 +128,8 @@ int main(int argc, char** argv){
./main TaskBSW image ../configs/tasks/bsw/bsw.yaml ../test/bsw.jpg ../test/bsw_res.jpg
./main TaskMulti image ../configs/tasks/multi/multi_task.yaml ../test/bsw.jpg ../test/multi_res.jpg
*/
......
......@@ -46,7 +46,7 @@ bool BayesianFusioner::Exec(){
[&](int a, int b){ return (all_objects[a]->confidence - float(all_objects[a]->truncated)) > (all_objects[b]->confidence - float(all_objects[b]->truncated));}
);
auto fusioned_objects = std::make_shared<ios::Detection2Ds>(ios::Detection2Ds());
auto fusioned_objects = std::make_shared<ios::Detection2Ds>();
interfaces::SetIOPtr(outputNames[0], fusioned_objects);
for(size_t i=0; i < indices.size(); i++){
if(!deleted[indices[i]]){
......
......@@ -105,11 +105,9 @@ bool TRTInference::Init(YAML::Node& configNode){
for(int dindex = 0; dindex < dims.nbDims; dindex++){
shape.push_back(dims.d[dindex]);
};
//auto blob = std::make_shared<base::Blob<float>>(base::Blob<float>(shape)); // Blob(Blob)=delete
base::BlobPtr<float> blob;
blob.reset(new base::Blob<float>(shape));
auto blob = std::make_shared<base::Blob<float>>(shape);
blobs_.insert(std::make_pair(name, blob));
auto output = std::make_shared<ios::NormalIO>(ios::NormalIO(blob));
auto output = std::make_shared<ios::NormalIO>(blob);
interfaces::SetIOPtr(name, output);
auto binding = blob->mutable_gpu_data();
mBindings.emplace_back(static_cast<void*>(binding));
......
......@@ -18,7 +18,7 @@ void Det2D::validCoordinate(){
Det2DPtr Det2D::copy(){
Det2DPtr obj = std::make_shared<Det2D>(Det2D());
Det2DPtr obj = std::make_shared<Det2D>();
obj->image_height = this->image_height;
obj->image_width = this->image_width;
obj->x1 = this->x1;
......@@ -31,6 +31,7 @@ Det2DPtr Det2D::copy(){
obj->truncated = this->truncated;
obj->track_id = this->track_id;
obj->camera_id = this->camera_id;
obj->mask_coeff = this->mask_coeff;
if (this->mask_ptr != nullptr){
obj->mask_ptr.reset(new InstanceMask(this->mask_ptr.get()));
}
......@@ -50,6 +51,7 @@ void Det2D::update(Det2DPtr obj_){
this->truncated = obj_->truncated;
// this->track_id = obj_->track_id;// don't update track_id, because they matched and get same track_id.
this->camera_id = obj_->camera_id;
this->mask_coeff = obj_->mask_coeff;
if (obj_->mask_ptr != nullptr){
this->mask_ptr->update(obj_->mask_ptr.get());
}
......
......@@ -136,10 +136,11 @@ void InstanceMask::encode(const cv::Mat& map){
std::vector<uint> cnts;
for (int i=0; i<height; i++){
for(int j=0; j<width; j++){
if(map.at<char>(i, j) != p){
auto pj = (map.at<float>(i, j) > 0.5);
if(pj != p){
cnts.push_back(count);
count = 0;
p = map.at<char>(i, j);
p = pj;
}
count++;
}
......
......@@ -40,7 +40,7 @@ bool HomoProject::Exec(){
return false;
}
auto projected_dets = std::make_shared<ios::Detection2Ds>(ios::Detection2Ds());
auto projected_dets = std::make_shared<ios::Detection2Ds>();
Eigen::Matrix<float, 3, 2, Eigen::RowMajor> default_box, pbox;
default_box(4) = 1;
default_box(5) = 1;
......
......@@ -51,9 +51,9 @@ bool MobileFaceNetPostProcess::Exec(){
if(fixAspectRatio){
scalex = scaley = std::min(scalex, scaley);
}
auto face_landmarks = std::make_shared<ios::Landmarks>(ios::Landmarks());
auto face_landmarks = std::make_shared<ios::Landmarks>();
for(int l=0; l < landmarkNumber; l++){
ios::Point2DPtr lptr = std::make_shared<ios::Point2D>(ios::Point2D());
ios::Point2DPtr lptr = std::make_shared<ios::Point2D>();
lptr->x = points[l * 2] * inputWidth / scalex;
lptr->y = points[l * 2 + 1] * inputHeight / scaley;
face_landmarks->landmarks.push_back(lptr);
......
#include "libs/postprocessors/multi_post.h"
namespace waytous {
namespace deepinfer {
namespace postprocess {
// Read all multi-task post-process parameters from the unit's YAML node and
// pre-allocate the blobs that receive the GPU-filtered detections.
// Returns false when the BaseUnit part of the config is invalid; a missing
// key below raises through yaml-cpp's as<T>().
bool MultiPostProcess::Init(YAML::Node& node) {
    if(!BaseUnit::Init(node)){
        LOG_WARN << "Init multitask postprocess error";
        return false;
    };
    inputHeight = node["inputHeight"].as<int>();   // network input resolution
    inputWidth = node["inputWidth"].as<int>();
    fixAspectRatio = node["fixAspectRatio"].as<bool>();
    nmsThreshold = node["nmsThreshold"].as<float>();
    scoreThreshold = node["scoreThreshold"].as<float>();
    truncatedThreshold = node["truncatedThreshold"].as<float>();  // border ratio marking boxes truncated
    maxOutputNum = node["maxOutputNum"].as<int>();         // capacity of the filtered-box buffer
    rawDetectionLength = node["rawDetectionLength"].as<int>();  // raw candidate boxes per image
    keepTopK = node["keepTopK"].as<int>();                 // max detections kept after NMS
    segProtoDim = node["segProtoDim"].as<int>();           // mask prototype coefficients per box
    instanceDownScale = node["instanceDownScale"].as<int>();
    instanceClassNumber = node["instanceClassNumber"].as<int>();
    instanceClassNames = node["instanceClassNames"].as<std::vector<std::string>>();
    // per-box record layout: [cx, cy, w, h, obj, class scores..., mask coeffs...]
    detectionStep = segProtoDim + instanceClassNumber + 5;
    semanticDownScale = node["semanticDownScale"].as<int>();
    semanticClassNumber = node["semanticClassNumber"].as<int>();
    semanticClassNames = node["semanticClassNames"].as<std::vector<std::string>>();
    depthDownScale = node["depthDownScale"].as<int>();
    depthDistanceScale = node["depthDistanceScale"].as<int>();
    output_length_ptr.reset(new base::Blob<int>({inferBatchSize, 1}));
    output_length_ptr->cpu_data();  // touch once so the host buffer is allocated up front
    bboxes_ptr.reset(new base::Blob<float>({inferBatchSize, maxOutputNum, detectionStep})); // xywh s c[] mask[]
    bboxes_ptr->cpu_data(); // init, cpu malloc
    return true;
};
// Convert the four raw network outputs into pipeline IOs:
//   outputNames[0] - ios::Detection2Ds (boxes + per-instance masks)
//   outputNames[1] - ios::Semantics    (one mask per semantic class)
//   outputNames[2] - ios::Depth        (dense depth map)
// Steps: GPU score filter -> decode/scale boxes -> NMS -> assemble instance
// masks from prototypes -> copy semantic maps -> scale depth map.
bool MultiPostProcess::Exec() {
    auto rawDetections = std::dynamic_pointer_cast<ios::NormalIO>(interfaces::GetIOPtr(inputNames[0]));
    if (rawDetections == nullptr){
        LOG_ERROR << "multitask postprocess input " << inputNames[0] << " haven't been init or doesn't exist.";
        return false;
    }
    auto segProtos = std::dynamic_pointer_cast<ios::NormalIO>(interfaces::GetIOPtr(inputNames[1]));
    if (segProtos == nullptr){
        LOG_ERROR << "multitask postprocess input " << inputNames[1] << " haven't been init or doesn't exist.";
        return false;
    }
    auto rawDepths = std::dynamic_pointer_cast<ios::NormalIO>(interfaces::GetIOPtr(inputNames[2]));
    if (rawDepths == nullptr){
        LOG_ERROR << "multitask postprocess input " << inputNames[2] << " haven't been init or doesn't exist.";
        return false;
    }
    auto rawSemantics = std::dynamic_pointer_cast<ios::NormalIO>(interfaces::GetIOPtr(inputNames[3]));
    if (rawSemantics == nullptr){
        LOG_ERROR << "multitask postprocess input " << inputNames[3] << " haven't been init or doesn't exist.";
        return false;
    }
    auto inputImage = std::dynamic_pointer_cast<ios::CameraSrcOut>(interfaces::GetIOPtr(inputNames[4]));
    if (inputImage == nullptr){
        LOG_ERROR << "multitask postprocess input " << inputNames[4] << " haven't been init or doesn't exist.";
        return false;
    }
    // filter detections 25200 x (5+9+32) -> 1000 x (5+9+32)
    // reset output_length=0, otherwise, it will increase after every inference.
    output_length_ptr->mutable_cpu_data()[0] = 0;
    multitask_instance_filter(
        rawDetections->data_->gpu_data(), rawDetectionLength,
        bboxes_ptr->mutable_gpu_data(),
        output_length_ptr->mutable_gpu_data(),
        scoreThreshold, detectionStep, maxOutputNum
    );
    auto outputLength = output_length_ptr->cpu_data();
    auto outputBoxes = bboxes_ptr->mutable_cpu_data();
    auto proto = segProtos->data_->mutable_cpu_data();
    // BUGFIX: the filter kernel's atomic counter keeps incrementing past
    // max_out_num even though only maxOutputNum records are written; clamp
    // before using it as a loop bound or we read past the bboxes buffer.
    int validBoxes = std::min(outputLength[0], maxOutputNum);
    // Detection
    float img_width = float(inputImage->img_ptr_->cols());
    float img_height = float(inputImage->img_ptr_->rows());
    float scalex = inputWidth / img_width;
    float scaley = inputHeight / img_height;
    if(fixAspectRatio){
        scalex = scaley = std::min(scalex, scaley);
    }
    auto dets = std::make_shared<ios::Detection2Ds>();
    for(int i = 0; i < validBoxes; i++){
        float* current_box_info = outputBoxes + i * detectionStep;
        // record layout: [cx, cy, w, h, obj, class scores..., mask coeffs...]
        float max_class_conf = 0.0;
        int class_id = 0;
        for(int ic=0; ic < instanceClassNumber; ic++){
            if(current_box_info[5 + ic] > max_class_conf){
                max_class_conf = current_box_info[5 + ic];
                class_id = ic;
            }
        }
        float confidence = max_class_conf * current_box_info[4];
        if(confidence < scoreThreshold){
            continue;
        }
        auto obj = std::make_shared<ios::Det2D>();
        obj->confidence = confidence;
        obj->class_label = class_id;
        obj->class_name = instanceClassNames[obj->class_label];
        // center/size -> corners, mapped back to original image coordinates
        obj->x1 = (current_box_info[0] - current_box_info[2] / 2) / scalex;
        obj->y1 = (current_box_info[1] - current_box_info[3] / 2) / scaley;
        obj->x2 = (current_box_info[0] + current_box_info[2] / 2) / scalex;
        obj->y2 = (current_box_info[1] + current_box_info[3] / 2) / scaley;
        obj->image_height = img_height;
        obj->image_width = img_width;
        obj->validCoordinate(); // clip corners into the image
        // boxes reaching into the border margin are flagged as truncated
        if((obj->x1 / img_width < truncatedThreshold) || (obj->y1 / img_height < truncatedThreshold) ||
           (obj->x2 / img_width > (1 - truncatedThreshold)) || (obj->y2 / img_height > (1 - truncatedThreshold))){
            obj->truncated = true;
        }
        for(int im=0; im < segProtoDim; im++){
            obj->mask_coeff.push_back(current_box_info[5 + instanceClassNumber + im]);
        }
        dets->detections.push_back(obj);
    }
    // NMS
    LOG_INFO << "before nms:" << dets->detections.size();
    nms_cpu(dets, scoreThreshold, nmsThreshold, instanceClassNumber, keepTopK);
    LOG_INFO << "after nms:" << dets->detections.size();
    // Instance Mask: sigmoid(mask_coeff . prototypes) inside each box.
    // BUGFIX: cv::Mat::zeros takes (rows, cols) = (height, width); the
    // original passed (width, height), correct only for square inputs.
    int instMaskW = inputWidth / instanceDownScale;
    int instMaskH = inputHeight / instanceDownScale;
    for(auto det: dets->detections){
        cv::Mat mask_mat = cv::Mat::zeros(instMaskH, instMaskW, CV_32FC1);
        int x1 = round(det->x1 * scalex / instanceDownScale); // scale to output mask level.
        int x2 = round(det->x2 * scalex / instanceDownScale);
        int y1 = round(det->y1 * scaley / instanceDownScale);
        int y2 = round(det->y2 * scaley / instanceDownScale);
        // BUGFIX: a box touching the right/bottom image border rounds to
        // exactly instMaskW/instMaskH and would index one past the end.
        x1 = std::max(x1, 0);
        y1 = std::max(y1, 0);
        x2 = std::min(x2, instMaskW);
        y2 = std::min(y2, instMaskH);
        for (int x = x1; x < x2; x++) {
            for (int y = y1; y < y2; y++) {
                float e = 0.0f;
                for (int j = 0; j < segProtoDim; j++) {
                    e += det->mask_coeff[j] * proto[j * mask_mat.cols * mask_mat.rows + y * mask_mat.cols + x];
                }
                e = 1.0f / (1.0f + expf(-e)); // sigmoid
                mask_mat.at<float>(y, x) = e;
            }
        }
        det->mask_ptr.reset(
            new ios::InstanceMask(mask_mat)
        );
    }
    interfaces::SetIOPtr(outputNames[0], dets);
    // Semantic Mask: one encoded mask per semantic class. Raw scores are
    // copied as-is; thresholding happens inside InstanceMask::encode.
    auto semantics = std::make_shared<ios::Semantics>();
    auto rawSemanticSegs = rawSemantics->data_->mutable_cpu_data();
    int semMaskW = inputWidth / semanticDownScale;
    int semMaskH = inputHeight / semanticDownScale;
    for(int is=0; is < semanticClassNumber; is++){
        auto seg = std::make_shared<ios::SemanticSeg>();
        seg->class_label = is;
        seg->class_name = semanticClassNames[is];
        cv::Mat mask_mat = cv::Mat::zeros(semMaskH, semMaskW, CV_32FC1);
        for (int x = 0; x < semMaskW; x++) {
            for (int y = 0; y < semMaskH; y++) {
                mask_mat.at<float>(y, x) = rawSemanticSegs[is * semMaskW * semMaskH + y * semMaskW + x];
            }
        }
        seg->mask_ptr.reset(
            new ios::InstanceMask(mask_mat)
        );
        semantics->semantic_segs.push_back(seg);
    }
    interfaces::SetIOPtr(outputNames[1], semantics);
    // Depth: raw output scaled by depthDistanceScale.
    auto depth = std::make_shared<ios::Depth>();
    auto rawDepth = rawDepths->data_->mutable_cpu_data();
    int depthMaskW = inputWidth / depthDownScale;
    int depthMaskH = inputHeight / depthDownScale;
    cv::Mat mask_mat = cv::Mat::zeros(depthMaskH, depthMaskW, CV_32FC1);
    for (int x = 0; x < depthMaskW; x++) {
        for (int y = 0; y < depthMaskH; y++) {
            mask_mat.at<float>(y, x) = rawDepth[y * depthMaskW + x] * depthDistanceScale;
        }
    }
    depth->depth = mask_mat;
    interfaces::SetIOPtr(outputNames[2], depth);
    return true;
};
// Unit name used by the registry/config to reference this post-processor.
std::string MultiPostProcess::Name() {
    return "MultiPostProcess";
};
} // namespace postprocess
} // namespace deepinfer
} // namespace waytous
#ifndef DEEPINFER_POSTPROCESS_MULTI_H_
#define DEEPINFER_POSTPROCESS_MULTI_H_
#include "interfaces/base_unit.h"
#include "base/image.h"
#include "libs/ios/normal_ios.h"
#include "libs/ios/camera_ios.h"
#include "libs/ios/detection.h"
#include "libs/ios/semantic.h"
#include "libs/ios/depth.h"
#include "libs/postprocessors/multi_post_gpu.h"
#include "libs/postprocessors/nms.h"
namespace waytous {
namespace deepinfer {
namespace postprocess {
// Post-processing unit for the camera multi-task network: decodes raw
// detections (box + class scores + mask coefficients), runs NMS, assembles
// per-instance masks from the prototype tensor, and emits semantic masks
// plus a depth map. All parameters are read from YAML in Init().
class MultiPostProcess: public interfaces::BaseUnit{
public:
    bool Init(YAML::Node& node) override;
    bool Exec() override;
    std::string Name() override;
public:
    base::BlobPtr<int> output_length_ptr;  // [batch,1] box count written by the GPU filter
    base::BlobPtr<float> bboxes_ptr;       // [batch, maxOutputNum, detectionStep] filtered boxes
    int inputHeight, inputWidth;           // network input resolution
    int inferBatchSize = 1;
    bool fixAspectRatio = true;            // use one min(scale) for both axes when true
    float nmsThreshold, scoreThreshold, truncatedThreshold;
    int rawDetectionLength;                // raw candidate boxes per image (e.g. 25200)
    int maxOutputNum = 1000;               // capacity of bboxes_ptr
    int keepTopK = 100;                    // max detections kept after NMS
    int segProtoDim = 32;                  // mask prototype coefficients per box
    int instanceDownScale = 4;             // instance-mask stride vs network input
    int instanceClassNumber;
    std::vector<std::string> instanceClassNames;
    int detectionStep;                     // floats per box record: 5 + classes + protos
    int semanticDownScale = 4, semanticClassNumber;
    std::vector<std::string> semanticClassNames;
    int depthDownScale = 4, depthDistanceScale = 256;  // depth = raw * depthDistanceScale
};
DEEPINFER_REGISTER_UNIT(MultiPostProcess);
} // namespace postprocess
} // namespace deepinfer
} // namespace waytous
#endif
#include "libs/postprocessors/multi_post_gpu.h"
namespace waytous {
namespace deepinfer {
namespace postprocess {
// Keep detection records whose objectness score (element 4 of each
// `step`-float record) is >= ignore_threshold. One thread per record;
// survivors are compacted into `outputs` through an atomic cursor in
// `out_length`. NOTE: the counter is bumped before the capacity check, so
// *out_length may end up larger than max_out_num — callers must clamp it
// when reading back (only the first max_out_num records are written).
__global__ void filter_kernel(const float* inputs, const int input_length, float* outputs, int* out_length,
                              float ignore_threshold, int step, int max_out_num) {
    int idx = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;
    // BUGFIX: was `idx > input_length` — the thread with idx == input_length
    // read one full record past the end of `inputs`.
    if (idx >= input_length) return;
    float score = inputs[idx * step + 4];
    if(score < ignore_threshold)
        return;
    int res_count = (int) atomicAdd(out_length, 1);
    if(res_count < max_out_num){
        for(int i=0; i<step; i++){
            outputs[res_count * step + i] = inputs[idx * step + i];
        }
    }
}
// Host wrapper: launch filter_kernel over `input_length` records of `step`
// floats each, compacting those whose objectness passes ignore_threshold
// into `outputs` (capacity max_out_num records). All pointers are device
// pointers; *out_length must be zeroed by the caller beforehand (see
// MultiPostProcess::Exec) and may exceed max_out_num afterwards.
void multitask_instance_filter(const float* inputs, const int input_length, float* outputs, int* out_length,
                               float ignore_threshold, int step, int max_out_num)
{
    uint block = 512;  // threads per block
    dim3 threads = dim3(block, 1, 1);
    filter_kernel<<<common::cudaGridSize(input_length, block), threads>>>(inputs, input_length, outputs, out_length,
                                                                          ignore_threshold, step, max_out_num);
    // block until the kernel finishes so the host can read results immediately
    CUDA_CHECK(cudaDeviceSynchronize());
};
} // namespace postprocess
} // namespace deepinfer
} // namespace waytous
#ifndef DEEPINFER_POSTPROCESS_MULTI_GPU_H_
#define DEEPINFER_POSTPROCESS_MULTI_GPU_H_
#include <cuda_runtime.h>
#include <cstdint>
#include "common/common.h"
namespace waytous {
namespace deepinfer {
namespace postprocess {
// GPU score filter for raw multi-task detections. All pointers are device
// pointers. Compacts records of `step` floats whose objectness (index 4)
// passes `ignore_threshold` from `inputs` into `outputs` (capacity
// max_out_num records). *out_length must be zeroed beforehand and can
// exceed max_out_num after the call, so clamp when reading. Synchronous.
void multitask_instance_filter(const float* inputs, const int input_length, float* outputs, int* out_length, float ignore_threshold, int step, int max_out_num);
} // namespace postprocess
} // namespace deepinfer
} // namespace waytous
#endif
#include "libs/postprocessors/nms.h"
namespace waytous {
namespace deepinfer {
namespace postprocess {
// Intersection-over-union of two axis-aligned boxes in [x1, y1, x2, y2]
// form. Returns 0 for disjoint boxes; a small epsilon in the denominator
// keeps the division safe for degenerate (zero-area) inputs.
float iou(float lbox[4], float rbox[4]) {
    const float left   = (std::max)(lbox[0], rbox[0]);
    const float top    = (std::max)(lbox[1], rbox[1]);
    const float right  = (std::min)(lbox[2], rbox[2]);
    const float bottom = (std::min)(lbox[3], rbox[3]);
    if (top > bottom || left > right)
        return 0.0f;
    const float inter  = (right - left) * (bottom - top);
    const float l_area = (lbox[2] - lbox[0]) * (lbox[3] - lbox[1]);
    const float r_area = (rbox[2] - rbox[0]) * (rbox[3] - rbox[1]);
    return inter / (l_area + r_area - inter + 1e-5);
}
// Greedy per-class non-maximum suppression on CPU.
// Boxes are grouped by class_label and sorted by confidence; within each
// class, any lower-scored box with IoU > nms_thresh against a kept box is
// dropped. Survivors are written back into dets->detections, keeping at
// most keep_top_k boxes in total across all classes.
// `conf_thresh` and `num_classes` are currently unused (score filtering is
// done by the caller) but are kept for interface compatibility.
void nms_cpu(ios::Detection2DsPtr dets, float conf_thresh, float nms_thresh, int num_classes, int keep_top_k){
    std::map<int, std::vector<ios::Det2DPtr>> m;
    for (auto det: dets->detections) {
        if (m.count(det->class_label) == 0)
            m.emplace(det->class_label, std::vector<ios::Det2DPtr>());
        m[det->class_label].push_back(det);
    }
    dets->detections.clear();
    for (auto it = m.begin(); it != m.end(); it++) {
        auto& detections = it->second;
        // take shared_ptrs by const reference: the comparator runs
        // O(n log n) times and copying a shared_ptr is an atomic
        // ref-count update each time.
        std::sort(detections.begin(), detections.end(),
                  [](const ios::Det2DPtr& a, const ios::Det2DPtr& b){
            return a->confidence > b->confidence;
        });
        for (size_t ii = 0; ii < detections.size(); ++ii) {
            auto& item = detections[ii];
            dets->detections.push_back(item);
            // explicit cast avoids the implicit signed/unsigned comparison
            if(dets->detections.size() >= static_cast<size_t>(keep_top_k)){
                return;
            }
            // suppress all remaining same-class boxes overlapping `item`
            for (size_t n = ii + 1; n < detections.size(); ++n) {
                float lbox[4] = {item->x1, item->y1, item->x2, item->y2};
                float rbox[4] = {detections[n]->x1, detections[n]->y1, detections[n]->x2, detections[n]->y2};
                if (iou(lbox, rbox) > nms_thresh) {
                    detections.erase(detections.begin() + n);
                    --n;
                }
            }
        }
    }
}
} // namespace postprocess
} // namespace deepinfer
} // namespace waytous
#ifndef DEEPINFER_POSTPROCESS_NMS_H_
#define DEEPINFER_POSTPROCESS_NMS_H_
#include <cuda_runtime.h>
#include <cstdint>
#include <vector>
#include <map>
#include "libs/ios/detection.h"
#include "common/common.h"
namespace waytous {
namespace deepinfer {
namespace postprocess {
// x1y1x2y2
// Intersection-over-union of two [x1,y1,x2,y2] boxes; 0 when disjoint.
float iou(float lbox[4], float rbox[4]);
// In-place greedy per-class NMS over dets->detections; keeps at most
// keep_top_k boxes overall. conf_thresh and num_classes are currently
// unused by the implementation.
void nms_cpu(ios::Detection2DsPtr dets, float conf_thresh,
             float nms_thresh, int num_classes, int keep_top_k=100);
} // namespace postprocess
} // namespace deepinfer
} // namespace waytous
#endif
......@@ -95,12 +95,12 @@ bool TraDesPostProcess::Exec() {
if(fixAspectRatio){
scalex = scaley = std::min(scalex, scaley);
}
auto dets = std::make_shared<ios::Detection2Ds>(ios::Detection2Ds());
auto dets = std::make_shared<ios::Detection2Ds>();
for(int i = 0; i < outputLength[b]; i++){
if(outputBoxes[b * topK * 6 + i * 6 + 4] < scoreThreshold){
continue;
}
auto obj = std::make_shared<ios::Det2D>(ios::Det2D());
auto obj = std::make_shared<ios::Det2D>();
obj->confidence = outputBoxes[b * topK * 6 + i * 6 + 4];
obj->class_label = int(outputBoxes[b * topK * 6 + i * 6 + 5]);
obj->class_name = classNames[obj->class_label];
......
......@@ -35,7 +35,7 @@ bool WHENetPostProcess::Exec(){
float* pitch = inputs[2]->mutable_cpu_data();
for(int b=0; b<outputNames.size(); b++){
auto headpose = std::make_shared<ios::HeadPose>(ios::HeadPose());
auto headpose = std::make_shared<ios::HeadPose>();
interfaces::SetIOPtr(outputNames[b], headpose);
float yawSum = 0, pitchSum=0, rollSum = 0;
......
......@@ -63,12 +63,12 @@ bool YoloV5PostProcess::Exec(){
if(fixAspectRatio){
scalex = scaley = std::min(scalex, scaley);
}
auto dets = std::make_shared<ios::Detection2Ds>(ios::Detection2Ds());
auto dets = std::make_shared<ios::Detection2Ds>();
for(int i = 0; i < num_detections[b]; i++){
if(nmsed_scores[b * keepTopK * 1 + i] < scoreThreshold){
continue;
}
ios::Det2DPtr obj = std::make_shared<ios::Det2D>(ios::Det2D());
ios::Det2DPtr obj = std::make_shared<ios::Det2D>();
obj->confidence = nmsed_scores[b * keepTopK * 1 + i];
obj->class_label = int(nmsed_classes[b * keepTopK * 1 + i]);
obj->class_name = classNames[obj->class_label];
......
......@@ -27,7 +27,7 @@ bool ResizeNorm::Init(YAML::Node& node){
std::vector<float> inputStd = node["inputStd"].as<std::vector<float>>();
dst.reset(new base::Blob<float>({inferBatchSize, 3, inputHeight, inputWidth}));
auto dst_ptr = std::make_shared<ios::NormalIO>(ios::NormalIO(dst));
auto dst_ptr = std::make_shared<ios::NormalIO>(dst);
interfaces::SetIOPtr(outputNames[0], dst_ptr);
mean.reset(new base::Blob<float>({3, 1}, inputMean.data()));
......@@ -59,7 +59,6 @@ bool ResizeNorm::Exec(){
useBGR, fixAspectRatio
);
}
// ios::NormalIOPtr dst_ptr = std::make_shared<ios::NormalIO>(ios::NormalIO(dst));
return true;
};
......
......@@ -32,8 +32,8 @@ bool Undistort::Init(YAML::Node& node) {
camera_intrinsic, width_,
height_, &d_mapx_, &d_mapy_);
dst_img = std::make_shared<base::Image8U>(base::Image8U(height_, width_, base::Color::BGR));
auto output = std::make_shared<ios::CameraSrcOut>(ios::CameraSrcOut(dst_img));
dst_img = std::make_shared<base::Image8U>(height_, width_, base::Color::BGR);
auto output = std::make_shared<ios::CameraSrcOut>(dst_img);
interfaces::SetIOPtr(outputNames[0], output);
inited_ = true;
return true;
......
......@@ -14,8 +14,8 @@ bool CameraSrc::Exec(){
return false;
}
auto src = std::dynamic_pointer_cast<ios::CameraSrcIn>(src_input);
auto img = std::make_shared<base::Image8U>(base::Image8U(src->cv_img_));
auto dst = std::make_shared<ios::CameraSrcOut>(ios::CameraSrcOut(img));
auto img = std::make_shared<base::Image8U>(src->cv_img_);
auto dst = std::make_shared<ios::CameraSrcOut>(img);
interfaces::SetIOPtr(outputNames[0], dst);
return true;
}
......
......@@ -242,7 +242,7 @@ bool ByteTracker::Exec()
this->lost_tracks.clear();
this->lost_tracks.assign(resb.begin(), resb.end());
auto tracked_bboxes = std::make_shared<ios::Detection2Ds>(ios::Detection2Ds());
auto tracked_bboxes = std::make_shared<ios::Detection2Ds>();
for (int i = 0; i < this->tracked_tracks.size(); i++)
{
if (this->tracked_tracks[i].is_activated && (this->tracked_tracks[i].tracklet_len) >= this->init_frames)
......
......@@ -44,7 +44,7 @@ bool CameraModel::Exec(std::vector<cv::Mat*> inputs, std::vector<interfaces::Bas
auto input_img = inputs[i];
auto inputName = inputNames[i];
LOG_INFO << "CameraModel input: " << inputName;
auto input = std::make_shared<ios::CameraSrcIn>(ios::CameraSrcIn(input_img));
auto input = std::make_shared<ios::CameraSrcIn>(input_img);
interfaces::SetIOPtr(inputName, input);
}
// infer
......@@ -71,7 +71,7 @@ bool CameraModel::Exec(std::vector<base::Image8UPtr> inputs, std::vector<interfa
for (int i=0; i<inputs.size(); i++){
auto input_img = inputs[i];
auto inputName = inputNames[i];
auto input = std::make_shared<ios::CameraSrcOut>(ios::CameraSrcOut(input_img));
auto input = std::make_shared<ios::CameraSrcOut>(input_img);
interfaces::SetIOPtr(inputName, input);
}
......
#include "tasks/task_multi.h"
namespace waytous{
namespace deepinfer{
namespace task{
// Initialize the multi-task pipeline: load the task-level YAML, then create
// and initialize the underlying camera model named by `modelName`.
bool TaskMulti::Init(std::string& taskConfigPath){
    if(!interfaces::BaseTask::Init(taskConfigPath)){
        LOG_ERROR << "Init task Multi error";
        return false;
    }
    std::string modelName = taskNode["modelName"].as<std::string>();
    std::string modelConfigPath = taskNode["modelConfigPath"].as<std::string>();
    MulitTaskModel.reset(interfaces::BaseModelRegisterer::GetInstanceByName(modelName));
    // BUGFIX: GetInstanceByName can return nullptr for an unregistered model
    // name; without this guard the Init call below dereferences null.
    if(MulitTaskModel == nullptr){
        LOG_ERROR << "Task Multi model " << modelName << " is not registered";
        return false;
    }
    if(!MulitTaskModel->Init(modelConfigPath)){
        LOG_ERROR << "Task Multi "<< modelName << " model init problem";
        return false;
    }
    return true;
}
// Forward the raw images to the camera model, which runs the whole unit
// pipeline and fills `outputs` (instances, semantics, depths).
bool TaskMulti::Exec(std::vector<cv::Mat*> inputs, std::vector<interfaces::BaseIOPtr>& outputs){
    if(!MulitTaskModel->Exec(inputs, outputs)){
        LOG_ERROR << "Task Multi Exec error";
        return false;
    };
    return true;
}
// Single-output overload required by the BaseTask interface; intentionally
// a no-op because this task produces three outputs and is drawn through the
// vector overload instead (see main's TaskMulti branch).
void TaskMulti::Visualize(cv::Mat* image, interfaces::BaseIOPtr outs){
}
// Draw detection boxes/labels and blend instance masks, then blend semantic
// masks onto `image` in place. outputs[0] must be ios::Detection2Ds and
// outputs[1] ios::Semantics (outputs[2], the depth map, is not drawn).
void TaskMulti::Visualize(cv::Mat* image, std::vector<interfaces::BaseIOPtr>& outputs){
    auto detections = std::dynamic_pointer_cast<ios::Detection2Ds>(outputs[0])->detections;
    for(auto& obj: detections){
        // color keyed by class and track id (track_id is -1 when untracked)
        cv::Scalar color = get_color(obj->class_label * 100 + obj->track_id);
        cv::putText(*image, std::to_string(obj->class_label) + ":" + common::formatValue(obj->confidence, 2),
            cv::Point(int(obj->x1), int(obj->y1) - 5),
            0, 0.6, cv::Scalar(0, 0, 255), 2, cv::LINE_AA);
        cv::rectangle(*image, cv::Rect(int(obj->x1), int(obj->y1),
            int(obj->x2 - obj->x1), int(obj->y2 - obj->y1)), color, 2);
        // LOG_INFO << obj->mask_ptr->width << ", " << obj->mask_ptr->height << ", " << obj->mask_ptr->rle_string;
        // NOTE(review): the uchar access below assumes decode() returns an
        // 8-bit mask — confirm against InstanceMask::decode.
        cv::Mat instance_mask = obj->mask_ptr->decode();
        cv::resize(instance_mask, instance_mask, image->size());
        // blend only inside the detection box; coords assumed already
        // clipped to the image by validCoordinate() — TODO confirm
        for (int x = obj->x1; x < round(obj->x2); x++) {
            for (int y = obj->y1; y < round(obj->y2); y++) {
                float val = instance_mask.at<uchar>(y, x);
                if (val <= 0.5) continue;  // background pixel
                // LOG_INFO<<val<<" ";
                // 50/50 blend of original pixel and class color
                image->at<cv::Vec3b>(y, x)[0] = image->at<cv::Vec3b>(y, x)[0] / 2 + color[0] / 2;
                image->at<cv::Vec3b>(y, x)[1] = image->at<cv::Vec3b>(y, x)[1] / 2 + color[1] / 2;
                image->at<cv::Vec3b>(y, x)[2] = image->at<cv::Vec3b>(y, x)[2] / 2 + color[2] / 2;
            }
        }
    }
    auto semantics = std::dynamic_pointer_cast<ios::Semantics>(outputs[1])->semantic_segs;
    for(auto& obj: semantics){
        cv::Scalar color = get_color(obj->class_label * 1000);
        // LOG_INFO << obj->mask_ptr->width << ", " << obj->mask_ptr->height << ", " << obj->mask_ptr->rle_string;
        cv::Mat instance_mask = obj->mask_ptr->decode();
        cv::resize(instance_mask, instance_mask, image->size());
        // semantic masks cover the full frame, so blend every pixel
        for (int x = 0; x < image->cols; x++) {
            for (int y = 0; y < image->rows; y++) {
                float val = instance_mask.at<uchar>(y, x);
                if (val <= 0.5) continue;
                image->at<cv::Vec3b>(y, x)[0] = image->at<cv::Vec3b>(y, x)[0] / 2 + color[0] / 2;
                image->at<cv::Vec3b>(y, x)[1] = image->at<cv::Vec3b>(y, x)[1] / 2 + color[1] / 2;
                image->at<cv::Vec3b>(y, x)[2] = image->at<cv::Vec3b>(y, x)[2] / 2 + color[2] / 2;
            }
        }
    }
}
// Task name used by the task registry (matches the CLI `taskName` argument).
std::string TaskMulti::Name(){
    return "TaskMulti";
}
} //namespace task
} //namespace deepinfer
} //namespace waytous
#ifndef WAYTOUS_DEEPINFER_TASK_MULTI_H_
#define WAYTOUS_DEEPINFER_TASK_MULTI_H_
#include "interfaces/base_task.h"
#include "models/camera_model.h"
#include "libs/ios/detection.h"
#include "libs/ios/semantic.h"
#include "libs/ios/depth.h"
namespace waytous{
namespace deepinfer{
namespace task{
// Camera multi-task task: wraps a camera model that produces 2D detections
// with instance masks, semantic masks and a depth map in one pass.
class TaskMulti: public interfaces::BaseTask{
public:
    // Load the task YAML, then create and initialize the underlying model.
    bool Init(std::string& taskConfigPath) override;
    // Run the model; outputs = [Detection2Ds, Semantics, Depth].
    bool Exec(std::vector<cv::Mat*> inputs, std::vector<interfaces::BaseIOPtr>& outputs) override;
    // Single-output overload (no-op for this multi-output task).
    void Visualize(cv::Mat* image, interfaces::BaseIOPtr outs) override;
    // Draw boxes, instance masks and semantic masks onto `image`.
    // BUGFIX: marked override — BaseTask declares this overload as virtual.
    void Visualize(cv::Mat* image, std::vector<interfaces::BaseIOPtr>& outputs) override;
    std::string Name() override;
public:
    // sic: "Mulit" spelling kept — the name is used by task_multi.cpp.
    std::shared_ptr<interfaces::BaseModel> MulitTaskModel;
};
DEEPINFER_REGISTER_TASKS(TaskMulti);
} //namespace task
} //namespace deepinfer
} //namespace waytous
#endif
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment