add batch inference

f3b17ea9 · yangxue · c7156e91 · f3b17ea9 · f3b17ea9 · f3b17ea9
Commit f3b17ea9 authored Sep 06, 2025 by yangxue
25 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
 cmake_minimum_required(VERSION 3.5)
 project(WaytousDeepInfer)
@@ -39,6 +36,8 @@ link_directories(/home/ubuntu/projects/infer/cuda/TensorRT-8.2.3.0/lib)
 # yaml
 find_package(yaml-cpp REQUIRED)
+include_directories(/usr/local/include/yaml-cpp)
+link_directories(/usr/local/lib)
 # glog gflags
 # sudo apt-get install libgoogle-glog*

--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 * @Author: yangxue && xue.yang@waytous.com
 * @Date: 2025-04-07 09:39:32
 * @LastEditors: yangxue xue.yang@waytous.com
- * @LastEditTime: 2025-04-07 09:57:50
+ * @LastEditTime: 2025-09-05 11:05:30
 * @FilePath: /ubuntu/projects/deepinfer/README.md
 * @Description: 
 * 
@@ -30,7 +30,6 @@ include_directories(/usr/include/glog)
 ```
 `mkdir build && cd build`
 `cmake ..` or `cmake -DCUDA_nppicom_LIBRARY=/usr/local/cuda/lib64/libnppim.so ..`

--- a/configs/tasks/multi/multi_task.yaml
+++ b/configs/tasks/multi/multi_task.yaml
-configRootPath: /home/wangxin/projects/waytous/DeepInfer
+configRootPath: /home/wangxin/projects/deepinfer
 device: 0
 modelName: CameraModel
 modelConfigPath: configs/tasks/multi/multi_yolov5.yaml

--- a/configs/tasks/multi/multi_yolov5.yaml
+++ b/configs/tasks/multi/multi_yolov5.yaml
 name: CameraModel
-inputNames: [cvImage]
+inputNames: [cvImage,cvImage2]
-outputNames: [out_instances, out_semantics, out_depths, undistortVisImage]
+outputNames: [out_instances, out_semantics, out_depths, undistortVisImage,undistortVisImage2]
 units:
    -
        name: CameraSrc
-        inputNames: [cvImage]
+        inputNames: [cvImage, cvImage2]
-        outputNames: [uint8Image]
+        outputNames: [uint8Image, uint8Image2]
    -
        name: Undistort
        inputNames: [uint8Image]
@@ -16,12 +16,19 @@ units:
        imageHeight: 540
        IntrinsicPath: configs/tasks/multi/rgb_intrinsic.yaml
    -
+        name: Undistort
+        inputNames: [uint8Image2]
+        outputNames: [undistortVisImage2]
+        imageWidth: 960
+        imageHeight: 540
+        IntrinsicPath: configs/tasks/multi/rgb_intrinsic.yaml
+    -
        name: ResizeNorm
-        inputNames: [undistortVisImage]
+        inputNames: [undistortVisImage, undistortVisImage2]
        outputNames: [resizeNormImages]
        inputMean: [0, 0, 0]
        inputStd: [1, 1, 1]
-        inferBatchSize: 1
+        inferBatchSize: 2
        inputWidth: 960 # 640
        inputHeight: 544 # 640
        fixAspectRatio: false
@@ -33,33 +40,34 @@ units:
        runMode: 1 # 0-fp32 1-fp16 2-int8 (int8 not supported)
        # weightsPath: "configs/tasks/multi/isp_612m_best.onnx"
        # engineFile: "configs/tasks/multi/isp_612m_best_fp16.engine"
-        weightsPath: "configs/tasks/multi/isp_0718m_best.onnx"
+        weightsPath: "configs/tasks/multi/ips_2504m_best.onnx" # projects/deepinfer/configs/tasks/multi/ips_2504m_best.onnx
-        engineFile: "configs/tasks/multi/isp_0718m_best_fp16.engine"
+        engineFile: "configs/tasks/multi/ips_2504m_best_fp16.engine"
        # calibImgPathFile: "configs/tasks/multi/isp_calib_imgs.txt"
        # calibTableCache: "configs/tasks/multi/isp_calib_table.cache"
-        inferDynamic: false
+        inferDynamic: true
-        inferBatchSize: 1
+        inferBatchSize: 2
        inputWidth: 960 # 640
        inputHeight: 544 # 640
-        maxBatchSize: 1 # used when build engine
+        maxBatchSize: 4 # used when build engine
    -
        name: MultiPostProcess
        inputNames: [detections, seg_protos, depths, semantics, undistortVisImage]
        outputNames: [out_instances, out_semantics, out_depths]
+        inferBatchSize: 2
        inputWidth: 960 # 640
        inputHeight: 544 # 640
        fixAspectRatio: false
        # instance-seg
        nmsThreshold: 0.45
-        scoreThreshold: 0.2 # used when inference, can be modified
+        scoreThreshold: 0.1 # used when inference, can be modified
        truncatedThreshold: 0.05
        maxOutputNum: 1000
-        rawDetectionLength: 32130 #25200
+        rawDetectionLength: 10710 # 32130 #25200
        keepTopK: 100
        segProtoDim: 32
        instanceDownScale: 4
-        instanceClassNumber: 9
+        instanceClassNumber: 3
-        instanceClassNames: ["pedestrian", "two_wheel", "car", "truck", "construction_machine", "fence", "stone", "dust", "cone"]
+        instanceClassNames: ["pedestrian", "vehicle", "stone"]
        # semantic-seg
        semanticDownScale: 4
        semanticClassNumber: 2

--- a/include/base/blob.h
+++ b/include/base/blob.h
@@ -216,3 +216,37 @@ using BlobConstPtr = std::shared_ptr<const Blob<Dtype>>;
 #endif
+/*
+Blob 类是一个模板类，用于管理多维数据（如神经网络中层的特征图或参数），其主要功能和设计点如下：
+- 数据存储
+使用模板参数 Dtype 以支持不同类型的数据。
+内部通过智能指针管理两个 SyncedMemory 对象，一个用于存储实际数据（data_），另一个保存数据形状（shape_data_），
+以便在 CPU 和 GPU 内存之间进行同步。
+- 形状管理和重塑（Reshape）
+提供 Reshape() 方法，通过输入一个 int 类型的 vector 来设置数据的维度，同时计算数据总数 count_。
+检查输入形状不超过最大轴数限制（kMaxBlobAxes），并动态分配足够内存存储数据。
+提供 ReshapeLike() 方法，使得当前 Blob 可复制另一个 Blob 的形状。
+- 数据访问接口
+提供 const 和 mutable 版本的 cpu_data() 与 gpu_data() 方法，允许对数据进行只读或读写访问，并自动根据数据状态同步内存。
+同时支持设置外部数据指针（set_cpu_data / set_gpu_data），确保数据在 CPU 和 GPU 端保持一致。
- Legacy 接口支持
+为向后兼容，提供了 num()、channels()、height()、width() 这些 legacy 访问器，这些方法通过对形状的特殊处理模拟低维数据
+（通常为 4D）的访问。
+- 索引与偏移计算
+提供 offset() 函数，用于计算多维数据在一维数组中的具体偏移，也支持通过 vector 提供多个索引实现多维访问。
- 数据共享
+通过 ShareData() 方法，可以在不同 Blob 之间共享数据，即共享同一个 SyncedMemory 的实例，防止数据复制时的冗余开销。
+总体来说，Blob 类将复杂的 CPU/GPU 内存管理、数据同步以及多维张量操作封装起来，使得上层算法可以直接关注数据维度和计算，
+而不用过多关心底层的内存细节。
+*/
\ No newline at end of file
--- a/include/base/syncmem.h
+++ b/include/base/syncmem.h
@@ -32,6 +32,28 @@ inline void PerceptionFreeHost(void* ptr, bool use_cuda) {
 /**
 * @brief Manages memory allocation and synchronization between the host (CPU)
 *        and device (GPU).
+  SyncedMemory 类主要用于管理主机（CPU）和设备（GPU）之间的内存分配和数据同步，具体说明如下：
+    内存分配机制
+    SyncedMemory 内部维护两个指针：一个指向 CPU 内存（cpu_ptr_），另一个指向 GPU 内存（gpu_ptr_）。在 CPU 内存分配时，
    会根据是否使用 CUDA 选用相应的分配函数（malloc 或 cudaMallocHost）。
+    数据状态管理
+    类中定义了一个枚举类型 SyncedHead，用来记录当前有效数据所在位置（例如 UNINITIALIZED、HEAD_AT_CPU、HEAD_AT_GPU、SYNCED）。
    这样在某一端的数据被修改后，可以明确哪一端持有最新数据，从而在需要时进行同步。
+    数据访问接口
+    提供了 cpu_data()、gpu_data()、mutable_cpu_data()、mutable_gpu_data() 以及 set_cpu_data()、set_gpu_data() 等接口，
+    方便用户以只读或可修改的方式访问数据。当数据状态不一致时，内部会自动调用 to_cpu() 或 to_gpu() 方法将数据同步到对应端。
+    异步数据传输
+    async_gpu_push() 方法允许利用 CUDA 流（cudaStream_t）实现异步数据传输，提升数据同步的效率。
+    设备检查
+    内部通过 check_device() 方法检查当前所用的 GPU 设备是否和数据所在的设备一致，避免跨设备数据混乱的问题。
+    总体来说，SyncedMemory 封装了跨 CPU 与 GPU 内存管理和数据同步的逻辑，使得开发人员能够更简单地在混合计算场景中处理数据，
+    而无需手动编写繁琐的内存分配和同步代码。
 */
 class SyncedMemory {
 public:

--- a/include/common/register.h
+++ b/include/common/register.h
@@ -15,6 +15,21 @@ namespace common {
 // from apollo register
 // idea from boost any but make it more simple and don't use type_info.
+/*
+Any 类实现了一个简化版的类型擦除机制，其主要作用包括：
+存储任意类型的数据
+Any 内部持有一个指向 PlaceHolder 的指针，通过模板内部类 Holder 来存储任意类型的对象。这允许在运行时动态保存不同类型的数据，而不需要明确指定类型。
+复制和克隆机制
+Holder 类中实现了 Clone 方法，使得 Any 对象支持深拷贝，即在拷贝构造时可以生成一个独立的副本，确保存储的数据被正确复制。
+类型转换接口
+Any 提供了 AnyCast 模板方法，用于将存储的数据转换回原本的类型。调用者需保证转换类型与实际存储类型相符，否则会返回空指针。
+这种设计类似于 Boost.Any，通过类型擦除来实现对不同类型对象的统一管理，在工厂模式等场景中十分有用。
+*/
 class Any {
 public:
    Any() : content_(NULL) {}
@@ -77,6 +92,22 @@ bool GetRegisteredClasses(
 }  // namespace deepinfer
 }  // namespace waytous
+/*
+这两个宏的作用在于实现自动注册和工厂实例化机制，具体说明如下：
+- **DEEPINFER_REGISTER_REGISTERER(base_class)**  
+  - 定义了一个注册器类，用于提供对某个基类下所有注册的派生类的管理接口。
+  - 提供通过名称获取单例、获取所有实例、验证注册名称等静态方法，便于后续动态创建对象。
+  - 它依赖全局工厂映射（GlobalFactoryMap）来查找注册的对象工厂，从而调用 NewInstance() 创建对象。
+- **DEEPINFER_REGISTER_CLASS(clazz, name)**  
+  - 定义了一个局部（匿名命名空间内）的对象工厂类，该工厂类继承自 ObjectFactory，并实现了 NewInstance() 方法，即实例化具体的派生类。
  - 利用构造函数属性（__attribute__((constructor))），在程序加载时自动执行注册，将具体类的工厂注册到全局工厂映射中，关联键为给定的类名和名称。
+简单来说，前者（REGISTER_REGISTERER）为某个基类提供了对象获取和管理接口，而后者（REGISTER_CLASS）则用于将某个具体的类注册到全局工厂中，从而支持基于名称的动态实例化。
+*/
 #define DEEPINFER_REGISTER_REGISTERER(base_class)                    \
 class base_class##Registerer {                                        \
    typedef ::waytous::deepinfer::common::Any Any;                   \
@@ -146,3 +177,4 @@ __attribute__((constructor)) void RegisterFactory##name() {                     
 #endif
--- a/include/interfaces/base_task.h
+++ b/include/interfaces/base_task.h
+/*
+ * @Author: yangxue && xue.yang@waytous.com
+ * @Date: 2025-09-05 10:57:32
+ * @LastEditors: yangxue xue.yang@waytous.com
+ * @LastEditTime: 2025-09-06 02:39:29
+ * @FilePath: /ubuntu/projects/deepinfer/include/interfaces/base_task.h
+ * @Description: 
+ * 
+ * Copyright (c) 2025 by yangxue xue.yang@waytous.com, All Rights Reserved.
+ */
 #ifndef WAYTOUS_DEEPINFER_TASK_H_
 #define WAYTOUS_DEEPINFER_TASK_H_
@@ -26,6 +36,7 @@ public:
    virtual bool Exec(std::vector<cv::Mat*> inputs, std::vector<interfaces::BaseIOPtr>& outputs) = 0;
    virtual void Visualize(cv::Mat* image, interfaces::BaseIOPtr outs) = 0;
    virtual void Visualize(cv::Mat* image, std::vector<interfaces::BaseIOPtr>& outputs){};
+    virtual void Visualize(std::vector<cv::Mat*> image, std::vector<interfaces::BaseIOPtr>& outputs) {};
    virtual cv::Scalar get_color(int idx);

--- a/include/libs/ios/depth.h
+++ b/include/libs/ios/depth.h
+/*
+ * @Author: yangxue && xue.yang@waytous.com
+ * @Date: 2025-09-05 09:14:47
+ * @LastEditors: yangxue xue.yang@waytous.com
+ * @LastEditTime: 2025-09-05 09:14:48
+ * @FilePath: /ubuntu/projects/deepinfer/include/libs/ios/depth.h
+ * @Description: 
+ * 
+ * Copyright (c) 2025 by yangxue xue.yang@waytous.com, All Rights Reserved.
+ */
 #ifndef DEEPINFER_DEPTH_H_
 #define DEEPINFER_DEPTH_H_
@@ -17,13 +27,16 @@ namespace ios {
 class Depth: public interfaces::BaseIO{
 public:
    cv::Mat depth;
 };
 using DepthPtr = std::shared_ptr<Depth>;
+class BatchDepth: public interfaces::BaseIO{
+public:
+    std::vector<DepthPtr> batch_depths;
+};
+using BatchDepthPtr = std::shared_ptr<BatchDepth>;
 }  // namespace ios
 }  // namespace deepinfer
 }  // namespace waytous

--- a/include/libs/ios/detection.h
+++ b/include/libs/ios/detection.h
+/*
+ * @Author: yangxue && xue.yang@waytous.com
+ * @Date: 2025-09-05 09:13:12
+ * @LastEditors: yangxue xue.yang@waytous.com
+ * @LastEditTime: 2025-09-05 09:18:00
+ * @FilePath: /ubuntu/projects/deepinfer/include/libs/ios/detection.h
+ * @Description: 
+ * 
+ * Copyright (c) 2025 by yangxue xue.yang@waytous.com, All Rights Reserved.
+ */
 #ifndef DEEPINFER_DETECTION_H_
 #define DEEPINFER_DETECTION_H_
@@ -58,6 +68,13 @@ public:
 };
 using Detection2DsPtr = std::shared_ptr<Detection2Ds>;
+class BatchDetection2Ds: public interfaces::BaseIO{
+public:
+    std::vector<Detection2DsPtr> batch_detections;
+};
+using BatchDetection2DsPtr = std::shared_ptr<BatchDetection2Ds>;
 }  // namespace ios
 }  // namespace deepinfer
 }  // namespace waytous

--- a/include/libs/ios/instance_mask.h
+++ b/include/libs/ios/instance_mask.h
+/*
+ * @Author: yangxue && xue.yang@waytous.com
+ * @Date: 2025-09-04 08:27:52
+ * @LastEditors: yangxue xue.yang@waytous.com
+ * @LastEditTime: 2025-09-04 08:27:54
+ * @FilePath: /ubuntu/projects/deepinfer/include/libs/ios/instance_mask.h
+ * @Description: 
+ * 
+ * Copyright (c) 2025 by yangxue xue.yang@waytous.com, All Rights Reserved.
+ */
 #ifndef DEEPINFER_INSTANCE_MASK_H_
 #define DEEPINFER_INSTANCE_MASK_H_
@@ -38,7 +48,7 @@ public:
 public:
    int width;
    int height;
-    std::string rle_string; // coco encode mask string
+    std::string rle_string; // coco encode mask string (run-length encoding)
 };
 using InstanceMaskPtr = std::shared_ptr<InstanceMask>;

--- a/include/libs/ios/semantic.h
+++ b/include/libs/ios/semantic.h
@@ -32,6 +32,12 @@ public:
 };
 using SemanticsPtr = std::shared_ptr<Semantics>;
+class BatchSemantics: public interfaces::BaseIO{
+public:
+    std::vector<SemanticsPtr> batch_semantic_segs;
+};
+using BatchSemanticsPtr = std::shared_ptr<BatchSemantics>;
 }  // namespace ios
 }  // namespace deepinfer

--- a/main.cpp
+++ b/main.cpp
@@ -36,7 +36,7 @@ int main(int argc, char** argv){
    std::string configPath = argv[3];
    std::string srcPath = argv[4];
    std::string savePath = argv[5];
-    int infer_count = 100;
+    int infer_count = 1;
    if(argc >= 7){
        infer_count = std::stoi(argv[6]);
    }
@@ -72,13 +72,27 @@ int main(int argc, char** argv){
        std::cout << "avg infer time: " <<
            std::chrono::duration_cast<std::chrono::microseconds>(e2 - e1).count() / 1000. / infer_count << " ms" << std::endl;
        if(inputs.size() != outputs.size()){
-            cv::Mat vis = images[0];
+            // cv::Mat vis = images[0];    
-            if(taskName == "TaskMulti"){
+            std::cout << "vis 0: ...................." << taskName << std::endl;        
-                t->Visualize(&vis, outputs);
+            int vis_num = inputs.size();
+            std::vector<cv::Mat*> vis;          
+            for(int i = 0; i < vis_num; i++) {                
+                vis.push_back(inputs[i]);
+            }            
+            if(taskName == "TaskMulti"){ // only implement TaskMulti !!!
+                std::cout << "vis taskmulti: ...................." << std::endl;     
+                t->Visualize(vis, outputs);
            }else{
-                t->Visualize(&vis, outputs[0]);
+                // t->Visualize(&vis, outputs[0]);
            }  
-            cv::imwrite(savePaths[0], vis);
+            // write image
+            for(int i = 0; i < vis_num; i++) {
+                std::cout << "write image: " << savePaths[i] << std::endl;
+                cv::imwrite(savePaths[i], *vis[i]);
+            }
        }else{
            for(int i=0; i<inputs.size(); i++){
                cv::Mat vis = images[i];
@@ -130,6 +144,9 @@ int main(int argc, char** argv){
 ./main TaskMulti image ../configs/tasks/multi/multi_task.yaml ../test/multi_test.jpg ../test/multi_res.jpg
+# 测试多图
+./main TaskMulti image ../configs/tasks/multi/multi_task.yaml ../test/multi_test.jpg,../test/multi_test.jpg ../test/multi_res1.jpg,../test/multi_res2.jpg
 */

--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -2,7 +2,6 @@
 project(DeepInfer)
 include_directories(.)
 # include_directories(src/base)

--- a/src/libs/inferences/tensorrt/trt_infer.cpp
+++ b/src/libs/inferences/tensorrt/trt_infer.cpp
@@ -40,6 +40,7 @@ bool TRTInference::Init(YAML::Node& configNode, interfaces::BaseIOMapPtr pmap){
    engineFile = configNode["engineFile"].as<std::string>();
    inferDynamic = configNode["inferDynamic"].as<bool>();
    engineFile = common::GetAbsolutePath(common::ConfigRoot::GetRootPath(), engineFile);
+    LOG_INFO << "engine file: " << engineFile; 
    if(!waytous::deepinfer::common::PathExists(engineFile)){
        LOG_INFO << "Tensorrt engine haven't been built, built from saved weights.";
        BuildEngine(configNode);
@@ -69,7 +70,18 @@ bool TRTInference::Init(YAML::Node& configNode, interfaces::BaseIOMapPtr pmap){
    mContext = mEngine->createExecutionContext();
    assert(mContext != nullptr);
-    mContext->setProfiler(&mProfiler);
+    // mContext->setProfiler(&mProfiler); 
+    // use default profiler    
+    // set input dynamic shape for context
+    // https://docs.nvidia.com/deeplearning/tensorrt-rtx/latest/inference-library/work-with-dynamic-shapes.html#overview
+    if(inferDynamic){
+        nvinfer1::Dims inputDims = mEngine->getBindingDimensions(0);
+        inputDims.d[0] = inferBatchSize;
+        mContext->setBindingDimensions(0, inputDims);
+        LOG_INFO << "set dynamic batch size: " << inferBatchSize;
+    }
    int nbBindings = mEngine->getNbBindings();
    if(nbBindings != inputNames.size() + outputNames.size()){
@@ -102,9 +114,12 @@ bool TRTInference::Init(YAML::Node& configNode, interfaces::BaseIOMapPtr pmap){
        std::vector<int> shape;
        shape.push_back(inferBatchSize);
-        for(int dindex = 0; dindex < dims.nbDims; dindex++){
+        for(int dindex = 1; dindex < dims.nbDims; dindex++){
            shape.push_back(dims.d[dindex]);
        };
+        for(auto dim: shape) {
+            LOG_INFO <<name << " dim: " <<  dim;
+        }
        auto blob = std::make_shared<base::Blob<float>>(shape);
        blobs_.insert(std::make_pair(name, blob));
        auto output = std::make_shared<ios::NormalIO>(blob);
@@ -203,6 +218,13 @@ bool TRTInference::BuildEngine(YAML::Node& configNode){
        // */
        // return false;
    }
+    // 设置输入images的动态形状
+    nvinfer1::IOptimizationProfile* profile = builder->createOptimizationProfile();
+	profile->setDimensions("images", nvinfer1::OptProfileSelector::kMIN, nvinfer1::Dims4(1,3, inputHeight, inputWidth));
+    profile->setDimensions("images", nvinfer1::OptProfileSelector::kOPT, nvinfer1::Dims4(2,3, inputHeight, inputWidth));
+    profile->setDimensions("images", nvinfer1::OptProfileSelector::kMAX, nvinfer1::Dims4(4,3, inputHeight, inputWidth));	//这里的尺寸根据你自己的输入修改（最大尺寸）
+    builderConfig->addOptimizationProfile(profile);	//添加进 IBuilderConfig
    LOG_INFO << "Begin building engine..." ;
    #ifdef TRT_8
@@ -248,6 +270,10 @@ std::shared_ptr<base::Blob<float>> TRTInference::get_blob(const std::string &nam
    return iter->second;
 }
+/*
+输入一个ios::NormalIO (base::Blob)， 表示[b,c,h,w]的图像 -> 
+输出多个ios::NormalIO (base::Blob)，表示detections, seg_protos, depths, semantics
+*/
 bool TRTInference::Exec(){
    CUDA_CHECK(cudaStreamSynchronize(mCudaStream));
@@ -256,6 +282,10 @@ bool TRTInference::Exec(){
        if (blob != nullptr) {
            blob->gpu_data();
        }
+        LOG_INFO << "TRTInference inputName: " << name;
+        for(int i =0; i < blob->num_axes(); i++) {
+            LOG_INFO << "TRTInference input shape [" << i << "] : " << blob->shape(i); 
+        }
    }
    // If `out_blob->mutable_cpu_data()` is invoked outside,
    // HEAD will be set to CPU, and `out_blob->mutable_gpu_data()`
@@ -269,11 +299,14 @@ bool TRTInference::Exec(){
            blob->gpu_data();
        }
    }
    if(inferDynamic){
+        //mContext->enqueueV2(&mBindings[0], mCudaStream, nullptr);
        mContext->enqueue(inferBatchSize, &mBindings[0], mCudaStream, nullptr);
    }else{
        mContext->executeV2(&mBindings[0]);
    }
    CUDA_CHECK(cudaStreamSynchronize(mCudaStream));
    for (auto name : outputNames) {

--- a/src/libs/postprocessors/multi_post.cpp
+++ b/src/libs/postprocessors/multi_post.cpp
@@ -33,6 +33,7 @@ bool MultiPostProcess::Init(YAML::Node& node, interfaces::BaseIOMapPtr pmap) {
    depthDownScale = node["depthDownScale"].as<int>();
    depthDistanceScale = node["depthDistanceScale"].as<int>();
+    inferBatchSize = node["inferBatchSize"].as<int>();
    output_length_ptr.reset(new base::Blob<int>({inferBatchSize, 1}));
    output_length_ptr->cpu_data();
@@ -41,6 +42,10 @@ bool MultiPostProcess::Init(YAML::Node& node, interfaces::BaseIOMapPtr pmap) {
    return true;
 };
+/*
+输入多个ios::NormalIO (base::Blob), 分别表示detections, seg_protos, depths, semantics, undistortVisImage
+输出 ios::Detection2Ds, ios::Semantics, ios::Depth
+*/
 bool MultiPostProcess::Exec() {
@@ -70,160 +75,214 @@ bool MultiPostProcess::Exec() {
        return false;
    }
+    // rawDetections: [1,1, rawDetectionLength, detectionStep]
-    // filter detections 25200 x (5+9+32) -> 1000 x (5+9+32)
+    for(int i = 0; i < rawDetections->data_->num_axes(); i++) {
-    // reset output_length=0, otherwise, it will increase after every inference.
+        LOG_INFO << "rawDetections shape: "<<  i << ":  "<< rawDetections->data_->shape(i);
-    output_length_ptr->mutable_cpu_data()[0] = 0;
+    }   
-    multitask_instance_filter(
+    // segProtos: [1,1,SegProtoDim, 136,240], 1/4 downscaled of input image
-        rawDetections->data_->gpu_data(), rawDetectionLength,
+    for(int i = 0; i < segProtos->data_->num_axes(); i++) {
-        bboxes_ptr->mutable_gpu_data(),
+        LOG_INFO << "segProtos shape: "<<  i << ":  "<< segProtos->data_->shape(i);
-        output_length_ptr->mutable_gpu_data(),
+    }   
-        scoreThreshold, detectionStep, maxOutputNum
+    // rawSemantics: [1,1, semanticClassNumber, 136, 240]
-    );
+    for(int i = 0; i < rawSemantics->data_->num_axes(); i++) {
+        LOG_INFO << "rawSemantics shape: "<<  i << ":  "<< rawSemantics->data_->shape(i);
-    auto outputLength = output_length_ptr->cpu_data();
-    auto outputBoxes = bboxes_ptr->mutable_cpu_data();
-    auto proto = segProtos->data_->mutable_cpu_data();
-    // Detection
-    float img_width = float(inputImage->img_ptr_->cols());
-    float img_height = float(inputImage->img_ptr_->rows());
-    float scalex = inputWidth / img_width;
-    float scaley = inputHeight / img_height;
-    if(fixAspectRatio){
-        scalex = scaley = std::min(scalex, scaley);
    }
-    auto dets = std::make_shared<ios::Detection2Ds>();
+    // rawDepths: [1,1,1,136,240]
-    std::vector<std::vector<float>> mask_coeffs;
+    for(int i = 0; i < rawDepths->data_->num_axes(); i++) {
-    for(int i = 0; i < outputLength[0]; i++){
+        LOG_INFO << "rawDepths shape: "<<  i << ":  "<< rawDepths->data_->shape(i);
-        float* current_box_info = outputBoxes + i * detectionStep;
+    }   
-        float max_class_conf = 0.0;
+    // check rawDetections validity with two same samples
-        int class_id = 0;
+    int rawDetectionArea = rawDetectionLength * detectionStep;
-        for(int ic=0; ic < instanceClassNumber; ic++){
+    int equal_count = 0;
-            if(current_box_info[5 + ic] > max_class_conf){
+    for(int i = 0; i < rawDetectionArea; i++) {
-                max_class_conf = current_box_info[5 + ic];
+        equal_count += rawDetections->data_->cpu_data()[i] == rawDetections->data_->cpu_data()[i + rawDetectionArea]? 1 : 0;
-                class_id = ic;
+    }    
-            }
+    LOG_WARN << "RawDetections equal count: " << equal_count << " / " << rawDetectionArea;
-        }
-        float confidence = max_class_conf * current_box_info[4];
+    // iterate over batch
-        if(confidence < scoreThreshold){
+    auto detection_ptr_array = std::make_shared<std::vector<ios::Detection2DsPtr>>();
-            continue;
+    auto semantic_ptr_array = std::make_shared<std::vector<ios::SemanticsPtr>>();
+    auto depth_ptr_array = std::make_shared<std::vector<ios::DepthPtr>>();
+    for(int bi = 0; bi < inferBatchSize; bi++) {           
+        // filter detections: rawDetectionLength x (5+class_num+segProtoDim) -> maxOutputNum x (5+class_num+segProtoDim)
+        // reset output_length=0, otherwise, it will increase after every inference.
+        output_length_ptr->mutable_cpu_data()[bi] = 0;
+        multitask_instance_filter(
+            rawDetections->data_->gpu_data() + bi * rawDetectionLength * detectionStep, 
+            rawDetectionLength,
+            bboxes_ptr->mutable_gpu_data() + bi * maxOutputNum * detectionStep,
+            output_length_ptr->mutable_gpu_data() + bi,
+            scoreThreshold, detectionStep, maxOutputNum
+        );
+        auto outputLength = output_length_ptr->cpu_data() + bi;
+        auto outputBoxes = bboxes_ptr->mutable_cpu_data() + bi * maxOutputNum * detectionStep;  
+        LOG_INFO << "Num of boxes after filter (outputLength[0]): " << outputLength[0];  
+        LOG_INFO << "Elem of boxes after filter (outputBoxes[0]): " << outputBoxes[0];  
+        // Detection
+        float img_width = float(inputImage->img_ptr_->cols());
+        float img_height = float(inputImage->img_ptr_->rows());
+        float scalex = inputWidth / img_width;
+        float scaley = inputHeight / img_height;
+        if(fixAspectRatio){
+            scalex = scaley = std::min(scalex, scaley);
        }
-        auto obj = std::make_shared<ios::Det2D>();
+        auto dets = std::make_shared<ios::Detection2Ds>();
-        obj->confidence = confidence;
+        std::vector<std::vector<float>> mask_coeffs;
-        obj->class_label = class_id;
+        for(int i = 0; i < outputLength[0]; i++){
-        obj->class_name = instanceClassNames[obj->class_label];
+            float* current_box_info = outputBoxes + i * detectionStep;
-        obj->x1= (current_box_info[0] - current_box_info[2] / 2)  / scalex;
+            float max_class_conf = 0.0;
-        obj->y1 = (current_box_info[1] - current_box_info[3] / 2)  / scaley;
+            int class_id = 0;
-        obj->x2 = (current_box_info[0] + current_box_info[2] / 2) / scalex;
+            for(int ic=0; ic < instanceClassNumber; ic++){
-        obj->y2 = (current_box_info[1] + current_box_info[3] / 2) / scaley;
+                if(current_box_info[5 + ic] > max_class_conf){
-        obj->image_height = img_height;
+                    max_class_conf = current_box_info[5 + ic];
-        obj->image_width = img_width;
+                    class_id = ic;
-        obj->validCoordinate(); //
+                }
-        // LOG_INFO << "box:" << obj->x1 << ","<< obj->y1 << ","<< obj->x2 << ","<< obj->y2;
+            }
-        if((obj->x1 / img_width  < truncatedThreshold) || (obj->y1 / img_height  < truncatedThreshold) ||
+            float confidence = max_class_conf * current_box_info[4];
-        (obj->x2 / img_width  > (1 - truncatedThreshold)) || (obj->y2 / img_height  > (1 - truncatedThreshold))){
+            if(confidence < scoreThreshold){
-            obj->truncated = true;
+                continue;
+            }
+            auto obj = std::make_shared<ios::Det2D>();
+            obj->confidence = confidence;
+            obj->class_label = class_id;
+            obj->class_name = instanceClassNames[obj->class_label];
+            obj->x1= (current_box_info[0] - current_box_info[2] / 2)  / scalex;
+            obj->y1 = (current_box_info[1] - current_box_info[3] / 2)  / scaley;
+            obj->x2 = (current_box_info[0] + current_box_info[2] / 2) / scalex;
+            obj->y2 = (current_box_info[1] + current_box_info[3] / 2) / scaley;
+            obj->image_height = img_height;
+            obj->image_width = img_width;
+            obj->validCoordinate(); //
+            // LOG_INFO << "box:" << obj->x1 << ","<< obj->y1 << ","<< obj->x2 << ","<< obj->y2;
+            if((obj->x1 / img_width  < truncatedThreshold) || (obj->y1 / img_height  < truncatedThreshold) ||
+            (obj->x2 / img_width  > (1 - truncatedThreshold)) || (obj->y2 / img_height  > (1 - truncatedThreshold))){
+                obj->truncated = true;
+            }
+            for(int im=0; im < segProtoDim; im++){
+                obj->mask_coeff.push_back(current_box_info[5 + instanceClassNumber + im]);
+            }
+            dets->detections.push_back(obj);
        }
-        for(int im=0; im < segProtoDim; im++){
+        // NMS
-            obj->mask_coeff.push_back(current_box_info[5 + instanceClassNumber + im]);
+        LOG_INFO << "before nms:" << dets->detections.size();
+        nms_cpu(dets, scoreThreshold, nmsThreshold, instanceClassNumber, keepTopK, true);
+        LOG_INFO << "after nms:" << dets->detections.size();
+        // Instance Mask
+        int mask_width = inputWidth / instanceDownScale;
+        int mask_height = inputHeight / instanceDownScale;
+        auto proto = segProtos->data_->mutable_cpu_data() + bi * mask_width * mask_height * segProtoDim;
+        for(auto det: dets->detections){
+            int x1 = round(det->x1 * scalex / instanceDownScale); // scale to output mask level.
+            int x2 = round(det->x2 * scalex / instanceDownScale);
+            int y1 = round(det->y1 * scaley / instanceDownScale);
+            int y2 = round(det->y2 * scaley / instanceDownScale);
+            cv::Mat mask_mat = cv::Mat::zeros(y2 - y1, x2 - x1, CV_32FC1); // local mask
+            for (int x = x1; x < x2; x++) {
+                for (int y = y1; y < y2; y++) {
+                    float e = 0.0f;
+                    for (int j = 0; j < segProtoDim; j++) {
+                        e += det->mask_coeff[j] * proto[j * mask_width * mask_height + y * mask_width + x];
+                    }
+                    e = 1.0f / (1.0f + expf(-e));
+                    mask_mat.at<float>(y - y1, x - x1) = e;
+                }
+            }
+            // cv::Mat mask_res;
+            // if(fixAspectRatio){
+            //     int w = img_width * scalex / instanceDownScale;
+            //     int h = img_height * scaley / instanceDownScale;
+            //     cv::Rect r(0, 0, w, h);
+            //     cv::resize(mask_mat(r), mask_res, cv::Size(img_width, img_height));
+            // }else{
+            //     cv::resize(mask_mat, mask_res, cv::Size(img_width, img_height));
+            // }
+            det->mask_ptr.reset(
+                new ios::InstanceMask(mask_mat)
+            );
+            // LOG_INFO << x1 << " " << x2 << " " << y1 << " " << y2 <<", " << det->mask_ptr->rle_string;
        }
-        dets->detections.push_back(obj);
+        // pMap->SetIOPtr(outputNames[0], dets);
-    }
+        detection_ptr_array->emplace_back(dets);
-    // NMS
-    LOG_INFO << "before nms:" << dets->detections.size();
+        // Semantic Mask
-    nms_cpu(dets, scoreThreshold, nmsThreshold, instanceClassNumber, keepTopK, true);
+        auto semantics = std::make_shared<ios::Semantics>();
-    LOG_INFO << "after nms:" << dets->detections.size();
+        int semanticHeight = inputHeight / semanticDownScale;
+        int semanticWidth = inputWidth / semanticDownScale;
-    // Instance Mask
+        auto rawSemanticSegs = rawSemantics->data_->mutable_cpu_data() + bi * semanticClassNumber * semanticHeight * semanticWidth;
-    int mask_width = inputWidth / instanceDownScale;
-    int mask_height = inputHeight / instanceDownScale;
+        for(int is=0; is < semanticClassNumber; is++){
-    for(auto det: dets->detections){
+            auto seg = std::make_shared<ios::SemanticSeg>();
-        int x1 = round(det->x1 * scalex / instanceDownScale); // scale to output mask level.
+            seg->class_label = is;
-        int x2 = round(det->x2 * scalex / instanceDownScale);
+            seg->class_name = semanticClassNames[is];
-        int y1 = round(det->y1 * scaley / instanceDownScale);
+            cv::Mat mask_mat = cv::Mat::zeros(semanticHeight, semanticWidth, CV_32FC1);
-        int y2 = round(det->y2 * scaley / instanceDownScale);
+            for (int x = 0; x < semanticWidth; x++) {
-        cv::Mat mask_mat = cv::Mat::zeros(y2 - y1, x2 - x1, CV_32FC1); // local mask
+                for (int y = 0; y < semanticHeight; y++) {
-        for (int x = x1; x < x2; x++) {
+                    float e = rawSemanticSegs[is * mask_mat.cols * mask_mat.rows + y * mask_mat.cols + x];
-            for (int y = y1; y < y2; y++) {
+                    // e = 1.0f / (1.0f + expf(-e));
-                float e = 0.0f;
+                    mask_mat.at<float>(y, x) = e;
-                for (int j = 0; j < segProtoDim; j++) {
-                    e += det->mask_coeff[j] * proto[j * mask_width * mask_height + y * mask_width + x];
                }
-                e = 1.0f / (1.0f + expf(-e));
-                mask_mat.at<float>(y - y1, x - x1) = e;
            }
+            // cv::Mat mask_res;
+            // if(fixAspectRatio){
+            //     int w = img_width * scalex / semanticDownScale;
+            //     int h = img_height * scaley / semanticDownScale;
+            //     cv::Rect r(0, 0, w, h);
+            //     cv::resize(mask_mat(r), mask_res, cv::Size(img_width, img_height));
+            // }else{
+            //     cv::resize(mask_mat, mask_res, cv::Size(img_width, img_height));
+            // }
+            seg->mask_ptr.reset(
+                new ios::InstanceMask(mask_mat)
+            );
+            // LOG_INFO << seg->mask_ptr->rle_string;
+            semantics->semantic_segs.push_back(seg);
        }
-        // cv::Mat mask_res;
+        // pMap->SetIOPtr(outputNames[1], semantics);
-        // if(fixAspectRatio){
+        semantic_ptr_array->emplace_back(semantics);
-        //     int w = img_width * scalex / instanceDownScale;
-        //     int h = img_height * scaley / instanceDownScale;
+        // Depth
-        //     cv::Rect r(0, 0, w, h);
+        auto depth = std::make_shared<ios::Depth>();
-        //     cv::resize(mask_mat(r), mask_res, cv::Size(img_width, img_height));
+        int depthHeight = inputHeight / depthDownScale;
-        // }else{
+        int depthWidth = inputWidth / depthDownScale;
-        //     cv::resize(mask_mat, mask_res, cv::Size(img_width, img_height));
-        // }
+        cv::Mat mask_mat = cv::Mat::zeros(depthHeight, depthWidth, CV_32FC1);
-        det->mask_ptr.reset(
+        auto rawDepth = rawDepths->data_->mutable_cpu_data() + bi * depthHeight * depthWidth;
-            new ios::InstanceMask(mask_mat)
-        );
+        for (int x = 0; x < depthWidth; x++) {
-        // LOG_INFO << x1 << " " << x2 << " " << y1 << " " << y2 <<", " << det->mask_ptr->rle_string;
+            for (int y = 0; y < depthHeight; y++) {
-    }
+                float e = rawDepth[y * mask_mat.cols + x];
-    pMap->SetIOPtr(outputNames[0], dets);
-    // Semantic Mask
-    auto semantics = std::make_shared<ios::Semantics>();
-    auto rawSemanticSegs = rawSemantics->data_->mutable_cpu_data();
-    for(int is=0; is < semanticClassNumber; is++){
-        auto seg = std::make_shared<ios::SemanticSeg>();
-        seg->class_label = is;
-        seg->class_name = semanticClassNames[is];
-        cv::Mat mask_mat = cv::Mat::zeros(inputHeight / semanticDownScale, inputWidth / semanticDownScale, CV_32FC1);
-        for (int x = 0; x < inputWidth / semanticDownScale; x++) {
-            for (int y = 0; y < inputHeight / semanticDownScale; y++) {
-                float e = rawSemanticSegs[is * mask_mat.cols * mask_mat.rows + y * mask_mat.cols + x];
                // e = 1.0f / (1.0f + expf(-e));
-                mask_mat.at<float>(y, x) = e;
+                mask_mat.at<float>(y, x) = e * depthDistanceScale;
            }
        }
        // cv::Mat mask_res;
        // if(fixAspectRatio){
-        //     int w = img_width * scalex / semanticDownScale;
+        //     int w = img_width * scalex / depthDownScale;
-        //     int h = img_height * scaley / semanticDownScale;
+        //     int h = img_height * scaley / depthDownScale;
        //     cv::Rect r(0, 0, w, h);
        //     cv::resize(mask_mat(r), mask_res, cv::Size(img_width, img_height));
        // }else{
        //     cv::resize(mask_mat, mask_res, cv::Size(img_width, img_height));
        // }
-        seg->mask_ptr.reset(
+        depth->depth = mask_mat;
-            new ios::InstanceMask(mask_mat)
+        // pMap->SetIOPtr(outputNames[2], depth);
-        );
+        depth_ptr_array->emplace_back(depth);
-        // LOG_INFO << seg->mask_ptr->rle_string;
-        semantics->semantic_segs.push_back(seg);
-    }
-    pMap->SetIOPtr(outputNames[1], semantics);
-    // Depth
-    auto depth = std::make_shared<ios::Depth>();
-    auto rawDepth = rawDepths->data_->mutable_cpu_data();
-    cv::Mat mask_mat = cv::Mat::zeros(inputHeight / depthDownScale, inputWidth / depthDownScale, CV_32FC1);
-    for (int x = 0; x < inputWidth / depthDownScale; x++) {
-        for (int y = 0; y < inputHeight / depthDownScale; y++) {
-            float e = rawDepth[y * mask_mat.cols + x];
-            // e = 1.0f / (1.0f + expf(-e));
-            mask_mat.at<float>(y, x) = e * depthDistanceScale;
-        }
    }
-    // cv::Mat mask_res;
+    auto batch_detection_ptr = std::make_shared<ios::BatchDetection2Ds>();
-    // if(fixAspectRatio){
+    batch_detection_ptr->batch_detections = *detection_ptr_array;
-    //     int w = img_width * scalex / depthDownScale;
+    auto batch_semantic_ptr =  std::make_shared<ios::BatchSemantics>();
-    //     int h = img_height * scaley / depthDownScale;
+    batch_semantic_ptr->batch_semantic_segs =  *semantic_ptr_array; 
-    //     cv::Rect r(0, 0, w, h);
+    auto batch_depth_ptr = std::make_shared<ios::BatchDepth>();
-    //     cv::resize(mask_mat(r), mask_res, cv::Size(img_width, img_height));
+    batch_depth_ptr->batch_depths = *depth_ptr_array;
-    // }else{
-    //     cv::resize(mask_mat, mask_res, cv::Size(img_width, img_height));
+    pMap->SetIOPtr(outputNames[0], batch_detection_ptr);
-    // }
+    pMap->SetIOPtr(outputNames[1], batch_semantic_ptr);
-    depth->depth = mask_mat;
+    pMap->SetIOPtr(outputNames[2], batch_depth_ptr);
-    pMap->SetIOPtr(outputNames[2], depth);
    return true;

--- a/src/libs/preprocessors/resize_norm.cpp
+++ b/src/libs/preprocessors/resize_norm.cpp
+/*
+ * @Author: yangxue && xue.yang@waytous.com
+ * @Date: 2025-09-05 03:06:58
+ * @LastEditors: yangxue xue.yang@waytous.com
+ * @LastEditTime: 2025-09-06 03:24:47
+ * @FilePath: /ubuntu/projects/deepinfer/src/libs/preprocessors/resize_norm.cpp
+ * @Description: 
+ * 
+ * Copyright (c) 2025 by ${git_name_email}, All Rights Reserved. 
+ */
 #include "libs/preprocessors/resize_norm.h"
@@ -17,6 +27,7 @@ bool ResizeNorm::Init(YAML::Node& node, interfaces::BaseIOMapPtr pmap){
    inferBatchSize = node["inferBatchSize"].as<int>();
    fixAspectRatio = node["fixAspectRatio"].as<bool>();
+    LOG_INFO << "inputNames size / inferBatchSize: " << inputNames.size() << " "  << inferBatchSize;
    if(inputNames.size() != inferBatchSize){
        LOG_ERROR << "Resize norm got wrong inputs number: " << inputNames.size() << " with infer batchsize: " << inferBatchSize;
        return false;
@@ -37,7 +48,9 @@ bool ResizeNorm::Init(YAML::Node& node, interfaces::BaseIOMapPtr pmap){
    return true;
 };
+/*
+Multiple ios::CameraSrcOut (base::Image8U) inputs -> one ios::NormalIO (base::Blob)
+*/
 bool ResizeNorm::Exec(){
    for(int b=0; b < inputNames.size(); b++){
        auto inputName = inputNames[b];
@@ -59,6 +72,14 @@ bool ResizeNorm::Exec(){
            useBGR, fixAspectRatio
        );
    }
+    // LOG_INFO << "resize norm done.";
+    // // debug: download to cpu and save image
+    // for(int i = 0; i < inferBatchSize; i++) {
+    //     cv::Mat mat(inputHeight, inputWidth, CV_32FC3);
+    //     cudaMemcpy(mat.data, dst->cpu_data() + (i * 3 * inputHeight * inputWidth), sizeof(float) * 3 * inputHeight * inputWidth, cudaMemcpyDeviceToHost);
+    //     mat.convertTo(mat, CV_8UC3, 255.0);        
+    //     cv::imwrite("resize_norm_" + std::to_string(i) + ".jpg", mat);
+    // }
    return true;
 };

--- a/src/libs/sources/camera_src.cpp
+++ b/src/libs/sources/camera_src.cpp
+/*
+ * @Author: yangxue && xue.yang@waytous.com
+ * @Date: 2025-09-05 08:58:55
+ * @LastEditors: yangxue xue.yang@waytous.com
+ * @LastEditTime: 2025-09-05 08:58:55
+ * @FilePath: /ubuntu/projects/deepinfer/src/libs/sources/camera_src.cpp
+ * @Description: 
+ * 
+ * Copyright (c) 2025 by ${git_name_email}, All Rights Reserved. 
+ */
 #include "libs/sources/camera_src.h"
@@ -8,15 +18,19 @@ namespace sources {
 bool CameraSrc::Exec(){
-    auto src_input = pMap->GetIOPtr(inputNames[0]);
+    // Convert each ios::CameraSrcIn (cv::Mat) input into an ios::CameraSrcOut
+    // (base::Image8U) and publish it under the output name with the same index.
+    // Returns false as soon as one input is missing or has an unexpected type.
+    if(outputNames.size() < inputNames.size()){
+        // outputNames[i] is indexed with the input index below, so a shorter
+        // output list would be read out of range.
+        LOG_ERROR << "CameraSrc got " << inputNames.size() << " inputs but only " << outputNames.size() << " outputs";
+        return false;
+    }
-    if(src_input == nullptr){
+    for(size_t i = 0; i < inputNames.size(); i++) {
-        LOG_ERROR << "CameraSrc input" << inputNames[0] << " haven't init";
+        auto src_input = pMap->GetIOPtr(inputNames[i]);
-        return false;
+        if(src_input == nullptr){
+            // Report the input that is actually missing, not always the first one.
+            LOG_ERROR << "CameraSrc input" << inputNames[i] << " haven't init";
+            return false;
+        }
+        auto src = std::dynamic_pointer_cast<ios::CameraSrcIn>(src_input);
+        if(src == nullptr){
+            // dynamic_pointer_cast yields null when the stored IO is of another type.
+            LOG_ERROR << "CameraSrc input" << inputNames[i] << " is not an ios::CameraSrcIn";
+            return false;
+        }
+        auto img = std::make_shared<base::Image8U>(src->cv_img_);
+        auto dst = std::make_shared<ios::CameraSrcOut>(img);
+        pMap->SetIOPtr(outputNames[i], dst);
     }
-    auto src = std::dynamic_pointer_cast<ios::CameraSrcIn>(src_input);
-    auto img = std::make_shared<base::Image8U>(src->cv_img_);
-    auto dst = std::make_shared<ios::CameraSrcOut>(img);
-    pMap->SetIOPtr(outputNames[0], dst);
     return true;
 }

--- a/src/libs/trackers/lapjv.cpp
+++ b/src/libs/trackers/lapjv.cpp
@@ -321,6 +321,7 @@ int_t _ca_dense(
 /** Solve dense sparse LAP.
+ * Linear Assignment Problem (LAP) solver using the Jonker-Volgenant algorithm.
 */
 int lapjv_internal(
 	const uint_t n, cost_t *cost[],

--- a/src/libs/trackers/tracker_datatype.h
+++ b/src/libs/trackers/tracker_datatype.h
+/*
+ * @Author: yangxue && xue.yang@waytous.com
+ * @Date: 2025-06-04 08:18:42
+ * @LastEditors: yangxue xue.yang@waytous.com
+ * @LastEditTime: 2025-06-04 08:18:42
+ * @FilePath: /ubuntu/projects/deepinfer/src/libs/trackers/tracker_datatype.h
+ * @Description: 
+ * 
+ * Copyright (c) 2025 by ${git_name_email}, All Rights Reserved. 
+ */
 #ifndef WAYTOUS_DEEPINFER_TRACKER_DATATYPE_H_
 #define WAYTOUS_DEEPINFER_TRACKER_DATATYPE_H_
@@ -13,6 +23,7 @@ typedef Eigen::Matrix<float, 1, 4, Eigen::RowMajor> DETECTBOX; // xyah
 typedef Eigen::Matrix<float, -1, 4, Eigen::RowMajor> DETECTBOXSS;
 //Kalmanfilter
+// state: [cx, cy, a(w/h), h, vx, vy, va, vh]
 //typedef Eigen::Matrix<float, 8, 8, Eigen::RowMajor> KAL_FILTER;
 typedef Eigen::Matrix<float, 1, 8, Eigen::RowMajor> KAL_MEAN;
 typedef Eigen::Matrix<float, 8, 8, Eigen::RowMajor> KAL_COVA;

--- a/src/models/camera_model.cpp
+++ b/src/models/camera_model.cpp
@@ -12,6 +12,7 @@ bool CameraModel::Init(std::string& configPath) {
        LOG_WARN << "Init CameraModel config_file "<< common::ConfigRoot::GetRootPath() << " " << cfgPath << " not exist.";
        return false;
    }
+    LOG_INFO << "config path: " << cfgPath;
    modelConfigNode = YAML::LoadFile(cfgPath);
    if (modelConfigNode.IsNull()) {
        LOG_WARN << "Init CameraModel, Load " << configPath << " failed! please check!";
@@ -19,6 +20,10 @@ bool CameraModel::Init(std::string& configPath) {
    }
    inputNames = modelConfigNode["inputNames"].as<std::vector<std::string>>();
    outputNames = modelConfigNode["outputNames"].as<std::vector<std::string>>();
+    LOG_INFO << "CameraModel input/output size: " << inputNames.size() << " " << outputNames.size();
+    for(auto name: outputNames) {
+        LOG_INFO << "CameraModel outputName: " << name;
+    }
    modelUnitMap = std::make_shared<interfaces::BaseIOMap>();

--- a/src/tasks/task_multi.cpp
+++ b/src/tasks/task_multi.cpp
@@ -13,6 +13,7 @@ bool TaskMulti::Init(std::string& taskConfigPath){
    };
    std::string modelName = taskNode["modelName"].as<std::string>();
    std::string modelConfigPath = taskNode["modelConfigPath"].as<std::string>();
+    LOG_INFO << "modelConfigPath: " << modelConfigPath;
    MulitTaskModel.reset(interfaces::BaseModelRegisterer::GetInstanceByName(modelName));
    if(!MulitTaskModel->Init(modelConfigPath)){
@@ -34,50 +35,61 @@ bool TaskMulti::Exec(std::vector<cv::Mat*> inputs, std::vector<interfaces::BaseI
 void TaskMulti::Visualize(cv::Mat* image, interfaces::BaseIOPtr outs){
 }
+void TaskMulti::Visualize(cv::Mat* image, std::vector<interfaces::BaseIOPtr>& outputs) {
-void TaskMulti::Visualize(cv::Mat* image, std::vector<interfaces::BaseIOPtr>& outputs){
+    // NOTE(review): this single-image overload draws nothing after the batch
+    // refactor; confirm callers have moved to the std::vector<cv::Mat*> overload.
+}
-    auto detections = std::dynamic_pointer_cast<ios::Detection2Ds>(outputs[0])->detections;
+// Draws the batched results onto the caller's images, one image per batch item.
+// Reads outputs[0] as ios::BatchDetection2Ds, outputs[1] as ios::BatchSemantics and
+// the last images.size() entries of outputs as ios::CameraSrcOut (undistorted frames).
+// Each images[i] is overwritten with the undistorted frame, then boxes, labels,
+// instance masks and semantic masks are blended on top. Malformed inputs are
+// logged and skipped instead of being dereferenced.
+void TaskMulti::Visualize(std::vector<cv::Mat*> images, std::vector<interfaces::BaseIOPtr>& outputs){
-    auto undistort_image = std::dynamic_pointer_cast<ios::CameraSrcOut>(outputs[outputs.size()-1])->img_ptr_->toCVMat();
+    const size_t batch_size = images.size();
+    // outputs[0], outputs[1] and the trailing batch_size image slots are all indexed below.
+    if(batch_size == 0 || outputs.size() < batch_size + 2){
+        LOG_ERROR << "TaskMulti::Visualize got " << outputs.size() << " outputs for " << batch_size << " images";
+        return;
+    }
-    undistort_image.copyTo(*image);
+    // Cast once per call instead of once per image.
+    auto batch_dets = std::dynamic_pointer_cast<ios::BatchDetection2Ds>(outputs[0]);
+    auto batch_sems = std::dynamic_pointer_cast<ios::BatchSemantics>(outputs[1]);
+    if(batch_dets == nullptr || batch_sems == nullptr){
+        LOG_ERROR << "TaskMulti::Visualize outputs[0]/outputs[1] are not batch detections/semantics";
+        return;
+    }
+    if(batch_dets->batch_detections.size() < batch_size || batch_sems->batch_semantic_segs.size() < batch_size){
+        LOG_ERROR << "TaskMulti::Visualize batch results are smaller than the number of images: " << batch_size;
+        return;
+    }
-    // image->data = undistort_image.clone().data;
+    for(size_t i = 0; i < batch_size; i++) {
-    for(auto& obj: detections){
+        cv::Mat* image = images[i];
+        auto undistort_src = std::dynamic_pointer_cast<ios::CameraSrcOut>(outputs[outputs.size()-(batch_size-i)]);
+        if(image == nullptr || undistort_src == nullptr
+           || batch_dets->batch_detections[i] == nullptr || batch_sems->batch_semantic_segs[i] == nullptr){
+            LOG_ERROR << "TaskMulti::Visualize skips batch item " << i << ": missing image or result";
+            continue;
+        }
-        cv::Scalar color = get_color(obj->class_label * 100 + obj->track_id);
+        // Bind by reference: the batch containers outlive this loop, no copy is needed.
+        const auto& detections = batch_dets->batch_detections[i]->detections;
-        cv::putText(*image, std::to_string(obj->class_label) + ":" + common::formatValue(obj->confidence, 2), 
+        auto undistort_image = undistort_src->img_ptr_->toCVMat();
-                    cv::Point(int(obj->x1), int(obj->y1) - 5), 
+        undistort_image.copyTo(*image);
-                    0, 0.6, cv::Scalar(0, 0, 255), 2, cv::LINE_AA);
+        // image->data = undistort_image.clone().data;
-        auto rt = cv::Rect(int(obj->x1), int(obj->y1), int(obj->x2 - obj->x1), int(obj->y2 - obj->y1));
+        const cv::Rect image_rect(0, 0, image->cols, image->rows);
-        cv::rectangle(*image, rt, color, 2);
+        for(auto& obj: detections){
-        // LOG_INFO << obj->mask_ptr->width << ", " << obj->mask_ptr->height << ", " << obj->mask_ptr->rle_string;
+            cv::Scalar color = get_color(obj->class_label * 100 + obj->track_id);
-        cv::Mat instance_mask = obj->mask_ptr->decode();
+            cv::putText(*image, std::to_string(obj->class_label) + ":" + common::formatValue(obj->confidence, 2), 
-        cv::resize(instance_mask, instance_mask, rt.size());
+                        cv::Point(int(obj->x1), int(obj->y1) - 5), 
-        for (int x = rt.x; x < rt.x + rt.width; x++) {
+                        0, 0.6, cv::Scalar(0, 0, 255), 2, cv::LINE_AA);
-            for (int y = rt.y; y < rt.y + rt.height; y++) {
+            auto rt = cv::Rect(int(obj->x1), int(obj->y1), int(obj->x2 - obj->x1), int(obj->y2 - obj->y1));
-                float val = instance_mask.at<uchar>(y - rt.y, x - rt.x);
+            cv::rectangle(*image, rt, color, 2);
-                if (val <= 0.5) continue;
+            // LOG_INFO << obj->mask_ptr->width << ", " << obj->mask_ptr->height << ", " << obj->mask_ptr->rle_string;
+            // Only blend the part of the box that lies inside the image; an empty
+            // intersection also covers degenerate boxes, which cv::resize rejects.
+            const cv::Rect visible = rt & image_rect;
+            if(visible.width <= 0 || visible.height <= 0) continue;
-                // LOG_INFO<<val<<" ";
+            cv::Mat instance_mask = obj->mask_ptr->decode();
-                image->at<cv::Vec3b>(y, x)[0] = image->at<cv::Vec3b>(y, x)[0] / 2 + color[0] / 2;
+            cv::resize(instance_mask, instance_mask, rt.size());
-                image->at<cv::Vec3b>(y, x)[1] = image->at<cv::Vec3b>(y, x)[1] / 2 + color[1] / 2;
+            for (int x = visible.x; x < visible.x + visible.width; x++) {
-                image->at<cv::Vec3b>(y, x)[2] = image->at<cv::Vec3b>(y, x)[2] / 2 + color[2] / 2;
+                for (int y = visible.y; y < visible.y + visible.height; y++) {
+                    // The mask is box-local, so index it relative to the unclipped box origin.
+                    float val = instance_mask.at<uchar>(y - rt.y, x - rt.x);
+                    if (val <= 0.5) continue;
+                    // LOG_INFO<<val<<" ";
+                    image->at<cv::Vec3b>(y, x)[0] = image->at<cv::Vec3b>(y, x)[0] / 2 + color[0] / 2;
+                    image->at<cv::Vec3b>(y, x)[1] = image->at<cv::Vec3b>(y, x)[1] / 2 + color[1] / 2;
+                    image->at<cv::Vec3b>(y, x)[2] = image->at<cv::Vec3b>(y, x)[2] / 2 + color[2] / 2;
+                }
             }
         }
-    }
+        // Semantic masks cover the whole frame and are resized to the image size before blending.
-    auto semantics = std::dynamic_pointer_cast<ios::Semantics>(outputs[1])->semantic_segs;
+        const auto& semantics = batch_sems->batch_semantic_segs[i]->semantic_segs;
-    for(auto& obj: semantics){
+        for(auto& obj: semantics){
-        cv::Scalar color = get_color(obj->class_label + 100);
+            cv::Scalar color = get_color(obj->class_label + 100);
-        // LOG_INFO << obj->mask_ptr->width << ", " << obj->mask_ptr->height << ", " << obj->mask_ptr->rle_string;
+            // LOG_INFO << obj->mask_ptr->width << ", " << obj->mask_ptr->height << ", " << obj->mask_ptr->rle_string;
-        cv::Mat semantic_mask = obj->mask_ptr->decode();
+            cv::Mat semantic_mask = obj->mask_ptr->decode();
-        cv::resize(semantic_mask, semantic_mask, image->size());
+            cv::resize(semantic_mask, semantic_mask, image->size());
-        for (int x = 0; x < image->cols; x++) {
+            for (int x = 0; x < image->cols; x++) {
-            for (int y = 0; y < image->rows; y++) {
+                for (int y = 0; y < image->rows; y++) {
-                float val = semantic_mask.at<uchar>(y, x);
+                    float val = semantic_mask.at<uchar>(y, x);
-                if (val <= 0.5) continue;
+                    if (val <= 0.5) continue;
-                image->at<cv::Vec3b>(y, x)[0] = image->at<cv::Vec3b>(y, x)[0] / 2 + color[0] / 2;
+                    image->at<cv::Vec3b>(y, x)[0] = image->at<cv::Vec3b>(y, x)[0] / 2 + color[0] / 2;
-                image->at<cv::Vec3b>(y, x)[1] = image->at<cv::Vec3b>(y, x)[1] / 2 + color[1] / 2;
+                    image->at<cv::Vec3b>(y, x)[1] = image->at<cv::Vec3b>(y, x)[1] / 2 + color[1] / 2;
-                image->at<cv::Vec3b>(y, x)[2] = image->at<cv::Vec3b>(y, x)[2] / 2 + color[2] / 2;
+                    image->at<cv::Vec3b>(y, x)[2] = image->at<cv::Vec3b>(y, x)[2] / 2 + color[2] / 2;
+                }
             }
         }
     }
 }

--- a/src/tasks/task_multi.h
+++ b/src/tasks/task_multi.h
+/*
+ * @Author: yangxue && xue.yang@waytous.com
+ * @Date: 2025-09-05 10:17:55
+ * @LastEditors: yangxue xue.yang@waytous.com
+ * @LastEditTime: 2025-09-06 02:37:51
+ * @FilePath: /ubuntu/projects/deepinfer/src/tasks/task_multi.h
+ * @Description: 
+ * 
+ * Copyright (c) 2025 by ${git_name_email}, All Rights Reserved. 
+ */
 #ifndef WAYTOUS_DEEPINFER_TASK_MULTI_H_
 #define WAYTOUS_DEEPINFER_TASK_MULTI_H_
@@ -20,7 +30,8 @@ public:
    bool Exec(std::vector<cv::Mat*> inputs, std::vector<interfaces::BaseIOPtr>& outputs) override;
    void Visualize(cv::Mat* image, interfaces::BaseIOPtr outs) override;
-    void Visualize(cv::Mat* image, std::vector<interfaces::BaseIOPtr>& outputs);
+    void Visualize(cv::Mat* image, std::vector<interfaces::BaseIOPtr>& outputs) override;
+    void Visualize(std::vector<cv::Mat*> image, std::vector<interfaces::BaseIOPtr>& outputs) override;
    std::string Name() override;
 public:

--- a/test/multi_res.jpg
+++ b/test/multi_res.jpg
--- a/test/multi_res_.jpg
+++ b/test/multi_res_.jpg