Commit f3b17ea9 authored by yangxue's avatar yangxue

add batch inference

parent c7156e91
cmake_minimum_required(VERSION 3.5) cmake_minimum_required(VERSION 3.5)
project(WaytousDeepInfer) project(WaytousDeepInfer)
...@@ -39,6 +36,8 @@ link_directories(/home/ubuntu/projects/infer/cuda/TensorRT-8.2.3.0/lib) ...@@ -39,6 +36,8 @@ link_directories(/home/ubuntu/projects/infer/cuda/TensorRT-8.2.3.0/lib)
# yaml # yaml
find_package(yaml-cpp REQUIRED) find_package(yaml-cpp REQUIRED)
include_directories(/usr/local/include/yaml-cpp)
link_directories(/usr/local/lib)
# glog gflags # glog gflags
# sudo apt-get install libgoogle-glog* # sudo apt-get install libgoogle-glog*
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
* @Author: yangxue && xue.yang@waytous.com * @Author: yangxue && xue.yang@waytous.com
* @Date: 2025-04-07 09:39:32 * @Date: 2025-04-07 09:39:32
* @LastEditors: yangxue xue.yang@waytous.com * @LastEditors: yangxue xue.yang@waytous.com
* @LastEditTime: 2025-04-07 09:57:50 * @LastEditTime: 2025-09-05 11:05:30
* @FilePath: /ubuntu/projects/deepinfer/README.md * @FilePath: /ubuntu/projects/deepinfer/README.md
* @Description: * @Description:
* *
...@@ -30,7 +30,6 @@ include_directories(/usr/include/glog) ...@@ -30,7 +30,6 @@ include_directories(/usr/include/glog)
``` ```
`mkdir build && cd build` `mkdir build && cd build`
`cmake ..` or `cmake -DCUDA_nppicom_LIBRARY=/usr/local/cuda/lib64/libnppim.so ..` `cmake ..` or `cmake -DCUDA_nppicom_LIBRARY=/usr/local/cuda/lib64/libnppim.so ..`
......
configRootPath: /home/wangxin/projects/waytous/DeepInfer configRootPath: /home/wangxin/projects/deepinfer
device: 0 device: 0
modelName: CameraModel modelName: CameraModel
modelConfigPath: configs/tasks/multi/multi_yolov5.yaml modelConfigPath: configs/tasks/multi/multi_yolov5.yaml
......
name: CameraModel name: CameraModel
inputNames: [cvImage] inputNames: [cvImage,cvImage2]
outputNames: [out_instances, out_semantics, out_depths, undistortVisImage] outputNames: [out_instances, out_semantics, out_depths, undistortVisImage,undistortVisImage2]
units: units:
- -
name: CameraSrc name: CameraSrc
inputNames: [cvImage] inputNames: [cvImage, cvImage2]
outputNames: [uint8Image] outputNames: [uint8Image, uint8Image2]
- -
name: Undistort name: Undistort
inputNames: [uint8Image] inputNames: [uint8Image]
...@@ -16,12 +16,19 @@ units: ...@@ -16,12 +16,19 @@ units:
imageHeight: 540 imageHeight: 540
IntrinsicPath: configs/tasks/multi/rgb_intrinsic.yaml IntrinsicPath: configs/tasks/multi/rgb_intrinsic.yaml
- -
name: Undistort
inputNames: [uint8Image2]
outputNames: [undistortVisImage2]
imageWidth: 960
imageHeight: 540
IntrinsicPath: configs/tasks/multi/rgb_intrinsic.yaml
-
name: ResizeNorm name: ResizeNorm
inputNames: [undistortVisImage] inputNames: [undistortVisImage, undistortVisImage2]
outputNames: [resizeNormImages] outputNames: [resizeNormImages]
inputMean: [0, 0, 0] inputMean: [0, 0, 0]
inputStd: [1, 1, 1] inputStd: [1, 1, 1]
inferBatchSize: 1 inferBatchSize: 2
inputWidth: 960 # 640 inputWidth: 960 # 640
inputHeight: 544 # 640 inputHeight: 544 # 640
fixAspectRatio: false fixAspectRatio: false
...@@ -33,33 +40,34 @@ units: ...@@ -33,33 +40,34 @@ units:
runMode: 1 # 0-fp32 1-fp16 2-int8 (int8 not supported) runMode: 1 # 0-fp32 1-fp16 2-int8 (int8 not supported)
# weightsPath: "configs/tasks/multi/isp_612m_best.onnx" # weightsPath: "configs/tasks/multi/isp_612m_best.onnx"
# engineFile: "configs/tasks/multi/isp_612m_best_fp16.engine" # engineFile: "configs/tasks/multi/isp_612m_best_fp16.engine"
weightsPath: "configs/tasks/multi/isp_0718m_best.onnx" weightsPath: "configs/tasks/multi/ips_2504m_best.onnx" # projects/deepinfer/configs/tasks/multi/ips_2504m_best.onnx
engineFile: "configs/tasks/multi/isp_0718m_best_fp16.engine" engineFile: "configs/tasks/multi/ips_2504m_best_fp16.engine"
# calibImgPathFile: "configs/tasks/multi/isp_calib_imgs.txt" # calibImgPathFile: "configs/tasks/multi/isp_calib_imgs.txt"
# calibTableCache: "configs/tasks/multi/isp_calib_table.cache" # calibTableCache: "configs/tasks/multi/isp_calib_table.cache"
inferDynamic: false inferDynamic: true
inferBatchSize: 1 inferBatchSize: 2
inputWidth: 960 # 640 inputWidth: 960 # 640
inputHeight: 544 # 640 inputHeight: 544 # 640
maxBatchSize: 1 # used when build engine maxBatchSize: 4 # used when build engine
- -
name: MultiPostProcess name: MultiPostProcess
inputNames: [detections, seg_protos, depths, semantics, undistortVisImage] inputNames: [detections, seg_protos, depths, semantics, undistortVisImage]
outputNames: [out_instances, out_semantics, out_depths] outputNames: [out_instances, out_semantics, out_depths]
inferBatchSize: 2
inputWidth: 960 # 640 inputWidth: 960 # 640
inputHeight: 544 # 640 inputHeight: 544 # 640
fixAspectRatio: false fixAspectRatio: false
# instance-seg # instance-seg
nmsThreshold: 0.45 nmsThreshold: 0.45
scoreThreshold: 0.2 # used when inference, can be modified scoreThreshold: 0.1 # used when inference, can be modified
truncatedThreshold: 0.05 truncatedThreshold: 0.05
maxOutputNum: 1000 maxOutputNum: 1000
rawDetectionLength: 32130 #25200 rawDetectionLength: 10710 # 32130 #25200
keepTopK: 100 keepTopK: 100
segProtoDim: 32 segProtoDim: 32
instanceDownScale: 4 instanceDownScale: 4
instanceClassNumber: 9 instanceClassNumber: 3
instanceClassNames: ["pedestrian", "two_wheel", "car", "truck", "construction_machine", "fence", "stone", "dust", "cone"] instanceClassNames: ["pedestrian", "vehicle", "stone"]
# semantic-seg # semantic-seg
semanticDownScale: 4 semanticDownScale: 4
semanticClassNumber: 2 semanticClassNumber: 2
......
...@@ -216,3 +216,37 @@ using BlobConstPtr = std::shared_ptr<const Blob<Dtype>>; ...@@ -216,3 +216,37 @@ using BlobConstPtr = std::shared_ptr<const Blob<Dtype>>;
#endif #endif
/*
Blob 类是一个模板类,用于管理多维数据(如神经网络中层的特征图或参数),其主要功能和设计点如下:
- 数据存储
使用模板参数 Dtype 以支持不同类型的数据。
内部通过智能指针管理两个 SyncedMemory 对象,一个用于存储实际数据(data_),另一个保存数据形状(shape_data_),
以便在 CPU 和 GPU 内存之间进行同步。
- 形状管理和重塑(Reshape)
提供 Reshape() 方法,通过输入一个 int 类型的 vector 来设置数据的维度,同时计算数据总数 count_。
检查输入形状不超过最大轴数限制(kMaxBlobAxes),并动态分配足够内存存储数据。
提供 ReshapeLike() 方法,使得当前 Blob 可复制另一个 Blob 的形状。
- 数据访问接口
提供 const 和 mutable 版本的 cpu_data() 与 gpu_data() 方法,允许对数据进行只读或读写访问,并自动根据数据状态同步内存。
同时支持设置外部数据指针(set_cpu_data / set_gpu_data),确保数据在 CPU 和 GPU 端保持一致。
Legacy 接口支持
为向后兼容,提供了 num()、channels()、height()、width() 这些 legacy 访问器,这些方法通过对形状的特殊处理模拟低维数据
(通常为 4D)的访问。
- 索引与偏移计算
提供 offset() 函数,用于计算多维数据在一维数组中的具体偏移,也支持通过 vector 提供多个索引实现多维访问。
数据共享
通过 ShareData() 方法,可以在不同 Blob 之间共享数据,即共享同一个 SyncedMemory 的实例,防止数据复制时的冗余开销。
总体来说,Blob 类将复杂的 CPU/GPU 内存管理、数据同步以及多维张量操作封装起来,使得上层算法可以直接关注数据维度和计算,
而不用过多关心底层的内存细节。
*/
\ No newline at end of file
...@@ -32,6 +32,28 @@ inline void PerceptionFreeHost(void* ptr, bool use_cuda) { ...@@ -32,6 +32,28 @@ inline void PerceptionFreeHost(void* ptr, bool use_cuda) {
/** /**
* @brief Manages memory allocation and synchronization between the host (CPU) * @brief Manages memory allocation and synchronization between the host (CPU)
* and device (GPU). * and device (GPU).
SyncedMemory 类主要用于管理主机(CPU)和设备(GPU)之间的内存分配和数据同步,具体说明如下:
内存分配机制
SyncedMemory 内部维护两个指针:一个指向 CPU 内存(cpu_ptr_),另一个指向 GPU 内存(gpu_ptr_)。在 CPU 内存分配时,
会根据是否使用 CUDA 分配使用相应的函数(malloc 或 cudaMallocHost)。
数据状态管理
类中定义了一个枚举类型 SyncedHead,用来记录当前有效数据所在位置(例如 UNINITIALIZED、HEAD_AT_CPU、HEAD_AT_GPU、SYNCED)。
这样可以确保当修改某一端的数据时,能够明确数据是最新的,从而在需要时进行同步。
数据访问接口
提供了 cpu_data()、gpu_data()、mutable_cpu_data()、mutable_gpu_data() 以及 set_cpu_data()、set_gpu_data() 等接口,
方便用户以只读或可修改的方式访问数据。当数据状态不一致时,内部会自动调用 to_cpu() 或 to_gpu() 方法将数据同步到对应端。
异步数据传输
async_gpu_push() 方法允许利用 CUDA 流(cudaStream_t)实现异步数据传输,提升数据同步的效率。
设备检查
内部通过 check_device() 方法检查当前所用的 GPU 设备是否和数据所在的设备一致,避免跨设备数据混乱的问题。
总体来说,SyncedMemory 封装了跨 CPU 与 GPU 内存管理和数据同步的逻辑,使得开发人员能够更简单地在混合计算场景中处理数据,
而无需手动编写繁琐的内存分配和同步代码。
*/ */
class SyncedMemory { class SyncedMemory {
public: public:
......
...@@ -15,6 +15,21 @@ namespace common { ...@@ -15,6 +15,21 @@ namespace common {
// from apollo register // from apollo register
// idea from boost any but make it more simple and don't use type_info. // idea from boost any but make it more simple and don't use type_info.
/*
Any 类实现了一个简化版的类型擦除机制,其主要作用包括:
存储任意类型的数据
Any 内部持有一个指向 PlaceHolder 的指针,通过模板内部类 Holder 来存储任意类型的对象。这允许在运行时动态保存不同类型的数据,而不需要明确指定类型。
复制和克隆机制
Holder 类中实现了 Clone 方法,使得 Any 对象支持深拷贝,即在拷贝构造时可以生成一个独立的副本,确保存储的数据被正确复制。
类型转换接口
Any 提供了 AnyCast 模板方法,用于将存储的数据转换回原本的类型。调用者需保证转换类型与实际存储类型相符,否则会返回空指针。
这种设计类似于 Boost.Any,通过类型擦除来实现对不同类型对象的统一管理,在工厂模式等场景中十分有用。
*/
class Any { class Any {
public: public:
Any() : content_(NULL) {} Any() : content_(NULL) {}
...@@ -77,6 +92,22 @@ bool GetRegisteredClasses( ...@@ -77,6 +92,22 @@ bool GetRegisteredClasses(
} // namespace deepinfer } // namespace deepinfer
} // namespace waytous } // namespace waytous
/*
这两个宏的作用在于实现自动注册和工厂实例化机制,具体说明如下:
- **DEEPINFER_REGISTER_REGISTERER(base_class)**
- 定义了一个注册器类,用于提供对某个基类下所有注册的派生类的管理接口。
- 提供通过名称获取单例、获取所有实例、验证注册名称等静态方法,便于后续动态创建对象。
- 它依赖全局工厂映射(GlobalFactoryMap)来查找注册的对象工厂,从而调用 NewInstance() 创建对象。
- **DEEPINFER_REGISTER_CLASS(clazz, name)**
- 定义了一个局部(匿名命名空间内)的对象工厂类,该工厂类继承自 ObjectFactory,并实现了 NewInstance() 方法,即实例化具体的派生类。
- 利用构造函数属性(__attribute__((constructor))),在程序加载时自动执行注册,将具体类的工厂注册到全局工厂映射中,关联键为给定的类名和名称。
简单来说,前者(REGISTER_REGISTERER)为某个基类提供了对象获取和管理接口,而后者(REGISTER_CLASS)则用于将某个具体的类注册到全局工厂中,从而支持基于名称的动态实例化。
*/
#define DEEPINFER_REGISTER_REGISTERER(base_class) \ #define DEEPINFER_REGISTER_REGISTERER(base_class) \
class base_class##Registerer { \ class base_class##Registerer { \
typedef ::waytous::deepinfer::common::Any Any; \ typedef ::waytous::deepinfer::common::Any Any; \
...@@ -146,3 +177,4 @@ __attribute__((constructor)) void RegisterFactory##name() { ...@@ -146,3 +177,4 @@ __attribute__((constructor)) void RegisterFactory##name() {
#endif #endif
/*
* @Author: yangxue && xue.yang@waytous.com
* @Date: 2025-09-05 10:57:32
* @LastEditors: yangxue xue.yang@waytous.com
* @LastEditTime: 2025-09-06 02:39:29
* @FilePath: /ubuntu/projects/deepinfer/include/interfaces/base_task.h
* @Description:
*
* Copyright (c) 2025 by ${git_name_email}, All Rights Reserved.
*/
#ifndef WAYTOUS_DEEPINFER_TASK_H_ #ifndef WAYTOUS_DEEPINFER_TASK_H_
#define WAYTOUS_DEEPINFER_TASK_H_ #define WAYTOUS_DEEPINFER_TASK_H_
...@@ -26,6 +36,7 @@ public: ...@@ -26,6 +36,7 @@ public:
virtual bool Exec(std::vector<cv::Mat*> inputs, std::vector<interfaces::BaseIOPtr>& outputs) = 0; virtual bool Exec(std::vector<cv::Mat*> inputs, std::vector<interfaces::BaseIOPtr>& outputs) = 0;
virtual void Visualize(cv::Mat* image, interfaces::BaseIOPtr outs) = 0; virtual void Visualize(cv::Mat* image, interfaces::BaseIOPtr outs) = 0;
virtual void Visualize(cv::Mat* image, std::vector<interfaces::BaseIOPtr>& outputs){}; virtual void Visualize(cv::Mat* image, std::vector<interfaces::BaseIOPtr>& outputs){};
virtual void Visualize(std::vector<cv::Mat*> image, std::vector<interfaces::BaseIOPtr>& outputs) {};
virtual cv::Scalar get_color(int idx); virtual cv::Scalar get_color(int idx);
......
/*
* @Author: yangxue && xue.yang@waytous.com
* @Date: 2025-09-05 09:14:47
* @LastEditors: yangxue xue.yang@waytous.com
* @LastEditTime: 2025-09-05 09:14:48
* @FilePath: /ubuntu/projects/deepinfer/include/libs/ios/depth.h
* @Description:
*
* Copyright (c) 2025 by ${git_name_email}, All Rights Reserved.
*/
#ifndef DEEPINFER_DEPTH_H_ #ifndef DEEPINFER_DEPTH_H_
#define DEEPINFER_DEPTH_H_ #define DEEPINFER_DEPTH_H_
...@@ -17,13 +27,16 @@ namespace ios { ...@@ -17,13 +27,16 @@ namespace ios {
class Depth: public interfaces::BaseIO{ class Depth: public interfaces::BaseIO{
public: public:
cv::Mat depth; cv::Mat depth;
}; };
using DepthPtr = std::shared_ptr<Depth>; using DepthPtr = std::shared_ptr<Depth>;
// Batch wrapper IO: collects the per-image Depth outputs produced by one
// batched inference pass, so downstream units receive them as a single IO.
class BatchDepth: public interfaces::BaseIO{
public:
    // One Depth result per image in the batch, in batch order.
    std::vector<DepthPtr> batch_depths;
};
// Shared handle used throughout the IO map.
using BatchDepthPtr = std::shared_ptr<BatchDepth>;
} // namespace ios } // namespace ios
} // namespace deepinfer } // namespace deepinfer
} // namespace waytous } // namespace waytous
......
/*
* @Author: yangxue && xue.yang@waytous.com
* @Date: 2025-09-05 09:13:12
* @LastEditors: yangxue xue.yang@waytous.com
* @LastEditTime: 2025-09-05 09:18:00
* @FilePath: /ubuntu/projects/deepinfer/include/libs/ios/detection.h
* @Description:
*
* Copyright (c) 2025 by ${git_name_email}, All Rights Reserved.
*/
#ifndef DEEPINFER_DETECTION_H_ #ifndef DEEPINFER_DETECTION_H_
#define DEEPINFER_DETECTION_H_ #define DEEPINFER_DETECTION_H_
...@@ -58,6 +68,13 @@ public: ...@@ -58,6 +68,13 @@ public:
}; };
using Detection2DsPtr = std::shared_ptr<Detection2Ds>; using Detection2DsPtr = std::shared_ptr<Detection2Ds>;
// Batch wrapper IO: collects the per-image 2D detection sets produced by one
// batched inference pass, so downstream units receive them as a single IO.
class BatchDetection2Ds: public interfaces::BaseIO{
public:
    // One Detection2Ds result per image in the batch, in batch order.
    std::vector<Detection2DsPtr> batch_detections;
};
// Shared handle used throughout the IO map.
using BatchDetection2DsPtr = std::shared_ptr<BatchDetection2Ds>;
} // namespace ios } // namespace ios
} // namespace deepinfer } // namespace deepinfer
} // namespace waytous } // namespace waytous
......
/*
* @Author: yangxue && xue.yang@waytous.com
* @Date: 2025-09-04 08:27:52
* @LastEditors: yangxue xue.yang@waytous.com
* @LastEditTime: 2025-09-04 08:27:54
* @FilePath: /ubuntu/projects/deepinfer/include/libs/ios/instance_mask.h
* @Description:
*
* Copyright (c) 2025 by ${git_name_email}, All Rights Reserved.
*/
#ifndef DEEPINFER_INSTANCE_MASK_H_ #ifndef DEEPINFER_INSTANCE_MASK_H_
#define DEEPINFER_INSTANCE_MASK_H_ #define DEEPINFER_INSTANCE_MASK_H_
...@@ -38,7 +48,7 @@ public: ...@@ -38,7 +48,7 @@ public:
public: public:
int width; int width;
int height; int height;
std::string rle_string; // coco encode mask string std::string rle_string; // coco encode mask string (run-length encoding)
}; };
using InstanceMaskPtr = std::shared_ptr<InstanceMask>; using InstanceMaskPtr = std::shared_ptr<InstanceMask>;
......
...@@ -32,6 +32,12 @@ public: ...@@ -32,6 +32,12 @@ public:
}; };
using SemanticsPtr = std::shared_ptr<Semantics>; using SemanticsPtr = std::shared_ptr<Semantics>;
// Batch wrapper IO: collects the per-image semantic-segmentation outputs
// produced by one batched inference pass, exposed as a single IO object.
class BatchSemantics: public interfaces::BaseIO{
public:
    // One Semantics result per image in the batch, in batch order.
    std::vector<SemanticsPtr> batch_semantic_segs;
};
// Shared handle used throughout the IO map.
using BatchSemanticsPtr = std::shared_ptr<BatchSemantics>;
} // namespace ios } // namespace ios
} // namespace deepinfer } // namespace deepinfer
......
...@@ -36,7 +36,7 @@ int main(int argc, char** argv){ ...@@ -36,7 +36,7 @@ int main(int argc, char** argv){
std::string configPath = argv[3]; std::string configPath = argv[3];
std::string srcPath = argv[4]; std::string srcPath = argv[4];
std::string savePath = argv[5]; std::string savePath = argv[5];
int infer_count = 100; int infer_count = 1;
if(argc >= 7){ if(argc >= 7){
infer_count = std::stoi(argv[6]); infer_count = std::stoi(argv[6]);
} }
...@@ -72,13 +72,27 @@ int main(int argc, char** argv){ ...@@ -72,13 +72,27 @@ int main(int argc, char** argv){
std::cout << "avg infer time: " << std::cout << "avg infer time: " <<
std::chrono::duration_cast<std::chrono::microseconds>(e2 - e1).count() / 1000. / infer_count << " ms" << std::endl; std::chrono::duration_cast<std::chrono::microseconds>(e2 - e1).count() / 1000. / infer_count << " ms" << std::endl;
if(inputs.size() != outputs.size()){ if(inputs.size() != outputs.size()){
cv::Mat vis = images[0]; // cv::Mat vis = images[0];
if(taskName == "TaskMulti"){ std::cout << "vis 0: ...................." << taskName << std::endl;
t->Visualize(&vis, outputs);
int vis_num = inputs.size();
std::vector<cv::Mat*> vis;
for(int i = 0; i < vis_num; i++) {
vis.push_back(inputs[i]);
}
if(taskName == "TaskMulti"){ // only implement TaskMulti !!!
std::cout << "vis taskmulti: ...................." << std::endl;
t->Visualize(vis, outputs);
}else{ }else{
t->Visualize(&vis, outputs[0]); // t->Visualize(&vis, outputs[0]);
} }
cv::imwrite(savePaths[0], vis); // write image
for(int i = 0; i < vis_num; i++) {
std::cout << "write image: " << savePaths[i] << std::endl;
cv::imwrite(savePaths[i], *vis[i]);
}
}else{ }else{
for(int i=0; i<inputs.size(); i++){ for(int i=0; i<inputs.size(); i++){
cv::Mat vis = images[i]; cv::Mat vis = images[i];
...@@ -130,6 +144,9 @@ int main(int argc, char** argv){ ...@@ -130,6 +144,9 @@ int main(int argc, char** argv){
./main TaskMulti image ../configs/tasks/multi/multi_task.yaml ../test/multi_test.jpg ../test/multi_res.jpg ./main TaskMulti image ../configs/tasks/multi/multi_task.yaml ../test/multi_test.jpg ../test/multi_res.jpg
# 测试多图
./main TaskMulti image ../configs/tasks/multi/multi_task.yaml ../test/multi_test.jpg,../test/multi_test.jpg ../test/multi_res1.jpg,../test/multi_res2.jpg
*/ */
......
...@@ -2,7 +2,6 @@ ...@@ -2,7 +2,6 @@
project(DeepInfer) project(DeepInfer)
include_directories(.) include_directories(.)
# include_directories(src/base) # include_directories(src/base)
......
...@@ -40,6 +40,7 @@ bool TRTInference::Init(YAML::Node& configNode, interfaces::BaseIOMapPtr pmap){ ...@@ -40,6 +40,7 @@ bool TRTInference::Init(YAML::Node& configNode, interfaces::BaseIOMapPtr pmap){
engineFile = configNode["engineFile"].as<std::string>(); engineFile = configNode["engineFile"].as<std::string>();
inferDynamic = configNode["inferDynamic"].as<bool>(); inferDynamic = configNode["inferDynamic"].as<bool>();
engineFile = common::GetAbsolutePath(common::ConfigRoot::GetRootPath(), engineFile); engineFile = common::GetAbsolutePath(common::ConfigRoot::GetRootPath(), engineFile);
LOG_INFO << "engine file: " << engineFile;
if(!waytous::deepinfer::common::PathExists(engineFile)){ if(!waytous::deepinfer::common::PathExists(engineFile)){
LOG_INFO << "Tensorrt engine haven't been built, built from saved weights."; LOG_INFO << "Tensorrt engine haven't been built, built from saved weights.";
BuildEngine(configNode); BuildEngine(configNode);
...@@ -69,7 +70,18 @@ bool TRTInference::Init(YAML::Node& configNode, interfaces::BaseIOMapPtr pmap){ ...@@ -69,7 +70,18 @@ bool TRTInference::Init(YAML::Node& configNode, interfaces::BaseIOMapPtr pmap){
mContext = mEngine->createExecutionContext(); mContext = mEngine->createExecutionContext();
assert(mContext != nullptr); assert(mContext != nullptr);
mContext->setProfiler(&mProfiler); // mContext->setProfiler(&mProfiler);
// use default profiler
// set input dynamic shape for context
// https://docs.nvidia.com/deeplearning/tensorrt-rtx/latest/inference-library/work-with-dynamic-shapes.html#overview
if(inferDynamic){
nvinfer1::Dims inputDims = mEngine->getBindingDimensions(0);
inputDims.d[0] = inferBatchSize;
mContext->setBindingDimensions(0, inputDims);
LOG_INFO << "set dynamic batch size: " << inferBatchSize;
}
int nbBindings = mEngine->getNbBindings(); int nbBindings = mEngine->getNbBindings();
if(nbBindings != inputNames.size() + outputNames.size()){ if(nbBindings != inputNames.size() + outputNames.size()){
...@@ -102,9 +114,12 @@ bool TRTInference::Init(YAML::Node& configNode, interfaces::BaseIOMapPtr pmap){ ...@@ -102,9 +114,12 @@ bool TRTInference::Init(YAML::Node& configNode, interfaces::BaseIOMapPtr pmap){
std::vector<int> shape; std::vector<int> shape;
shape.push_back(inferBatchSize); shape.push_back(inferBatchSize);
for(int dindex = 0; dindex < dims.nbDims; dindex++){ for(int dindex = 1; dindex < dims.nbDims; dindex++){
shape.push_back(dims.d[dindex]); shape.push_back(dims.d[dindex]);
}; };
for(auto dim: shape) {
LOG_INFO <<name << " dim: " << dim;
}
auto blob = std::make_shared<base::Blob<float>>(shape); auto blob = std::make_shared<base::Blob<float>>(shape);
blobs_.insert(std::make_pair(name, blob)); blobs_.insert(std::make_pair(name, blob));
auto output = std::make_shared<ios::NormalIO>(blob); auto output = std::make_shared<ios::NormalIO>(blob);
...@@ -203,6 +218,13 @@ bool TRTInference::BuildEngine(YAML::Node& configNode){ ...@@ -203,6 +218,13 @@ bool TRTInference::BuildEngine(YAML::Node& configNode){
// */ // */
// return false; // return false;
} }
// 设置输入images的动态形状
nvinfer1::IOptimizationProfile* profile = builder->createOptimizationProfile();
profile->setDimensions("images", nvinfer1::OptProfileSelector::kMIN, nvinfer1::Dims4(1,3, inputHeight, inputWidth));
profile->setDimensions("images", nvinfer1::OptProfileSelector::kOPT, nvinfer1::Dims4(2,3, inputHeight, inputWidth));
profile->setDimensions("images", nvinfer1::OptProfileSelector::kMAX, nvinfer1::Dims4(4,3, inputHeight, inputWidth)); //这里的尺寸更具你自己的输入修改(最大尺寸)
builderConfig->addOptimizationProfile(profile); //添加进 IBuilderConfig
LOG_INFO << "Begin building engine..." ; LOG_INFO << "Begin building engine..." ;
#ifdef TRT_8 #ifdef TRT_8
...@@ -248,6 +270,10 @@ std::shared_ptr<base::Blob<float>> TRTInference::get_blob(const std::string &nam ...@@ -248,6 +270,10 @@ std::shared_ptr<base::Blob<float>> TRTInference::get_blob(const std::string &nam
return iter->second; return iter->second;
} }
/*
输入一个ios::NormalIO (base::Blob), 表示[b,c,h,w]的图像 ->
输出多个ios::NormalIO (base::Blob),表示detections, seg_protos, depths, semantics
*/
bool TRTInference::Exec(){ bool TRTInference::Exec(){
CUDA_CHECK(cudaStreamSynchronize(mCudaStream)); CUDA_CHECK(cudaStreamSynchronize(mCudaStream));
...@@ -256,6 +282,10 @@ bool TRTInference::Exec(){ ...@@ -256,6 +282,10 @@ bool TRTInference::Exec(){
if (blob != nullptr) { if (blob != nullptr) {
blob->gpu_data(); blob->gpu_data();
} }
LOG_INFO << "TRTInference inputName: " << name;
for(int i =0; i < blob->num_axes(); i++) {
LOG_INFO << "TRTInference input shape [" << i << "] : " << blob->shape(i);
}
} }
// If `out_blob->mutable_cpu_data()` is invoked outside, // If `out_blob->mutable_cpu_data()` is invoked outside,
// HEAD will be set to CPU, and `out_blob->mutable_gpu_data()` // HEAD will be set to CPU, and `out_blob->mutable_gpu_data()`
...@@ -269,11 +299,14 @@ bool TRTInference::Exec(){ ...@@ -269,11 +299,14 @@ bool TRTInference::Exec(){
blob->gpu_data(); blob->gpu_data();
} }
} }
if(inferDynamic){ if(inferDynamic){
//mContext->enqueueV2(&mBindings[0], mCudaStream, nullptr);
mContext->enqueue(inferBatchSize, &mBindings[0], mCudaStream, nullptr); mContext->enqueue(inferBatchSize, &mBindings[0], mCudaStream, nullptr);
}else{ }else{
mContext->executeV2(&mBindings[0]); mContext->executeV2(&mBindings[0]);
} }
CUDA_CHECK(cudaStreamSynchronize(mCudaStream)); CUDA_CHECK(cudaStreamSynchronize(mCudaStream));
for (auto name : outputNames) { for (auto name : outputNames) {
......
...@@ -33,6 +33,7 @@ bool MultiPostProcess::Init(YAML::Node& node, interfaces::BaseIOMapPtr pmap) { ...@@ -33,6 +33,7 @@ bool MultiPostProcess::Init(YAML::Node& node, interfaces::BaseIOMapPtr pmap) {
depthDownScale = node["depthDownScale"].as<int>(); depthDownScale = node["depthDownScale"].as<int>();
depthDistanceScale = node["depthDistanceScale"].as<int>(); depthDistanceScale = node["depthDistanceScale"].as<int>();
inferBatchSize = node["inferBatchSize"].as<int>();
output_length_ptr.reset(new base::Blob<int>({inferBatchSize, 1})); output_length_ptr.reset(new base::Blob<int>({inferBatchSize, 1}));
output_length_ptr->cpu_data(); output_length_ptr->cpu_data();
...@@ -41,6 +42,10 @@ bool MultiPostProcess::Init(YAML::Node& node, interfaces::BaseIOMapPtr pmap) { ...@@ -41,6 +42,10 @@ bool MultiPostProcess::Init(YAML::Node& node, interfaces::BaseIOMapPtr pmap) {
return true; return true;
}; };
/*
输入多个ios::NormalIO (base::Blob), 分别表示detections, seg_protos, depths, semantics, undistortVisImage
输出 ios::Detection2Ds, ios::Semantics, ios::Depth
*/
bool MultiPostProcess::Exec() { bool MultiPostProcess::Exec() {
...@@ -70,20 +75,51 @@ bool MultiPostProcess::Exec() { ...@@ -70,20 +75,51 @@ bool MultiPostProcess::Exec() {
return false; return false;
} }
// rawDetections: [1,1, rawDetectionLength, detectionStep]
for(int i = 0; i < rawDetections->data_->num_axes(); i++) {
LOG_INFO << "rawDetections shape: "<< i << ": "<< rawDetections->data_->shape(i);
}
// segProtos: [1,1,SegProtoDim, 136,240], 1/4 downscaled of input image
for(int i = 0; i < segProtos->data_->num_axes(); i++) {
LOG_INFO << "segProtos shape: "<< i << ": "<< segProtos->data_->shape(i);
}
// rawSemantics: [1,1, semanticClassNumber, 136, 240]
for(int i = 0; i < rawSemantics->data_->num_axes(); i++) {
LOG_INFO << "rawSemantics shape: "<< i << ": "<< rawSemantics->data_->shape(i);
}
// rawDepths: [1,1,1,136,240]
for(int i = 0; i < rawDepths->data_->num_axes(); i++) {
LOG_INFO << "rawDepths shape: "<< i << ": "<< rawDepths->data_->shape(i);
}
// check rawDetections validity with two same samples
int rawDetectionArea = rawDetectionLength * detectionStep;
int equal_count = 0;
for(int i = 0; i < rawDetectionArea; i++) {
equal_count += rawDetections->data_->cpu_data()[i] == rawDetections->data_->cpu_data()[i + rawDetectionArea]? 1 : 0;
}
LOG_WARN << "RawDetections equal count: " << equal_count << " / " << rawDetectionArea;
// iterate over batch
auto detection_ptr_array = std::make_shared<std::vector<ios::Detection2DsPtr>>();
auto semantic_ptr_array = std::make_shared<std::vector<ios::SemanticsPtr>>();
auto depth_ptr_array = std::make_shared<std::vector<ios::DepthPtr>>();
// filter detections 25200 x (5+9+32) -> 1000 x (5+9+32) for(int bi = 0; bi < inferBatchSize; bi++) {
// filter detections 25200 x (5+class_num+32) -> 1000 x (5+class_num+32)
// reset output_length=0, otherwise, it will increase after every inference. // reset output_length=0, otherwise, it will increase after every inference.
output_length_ptr->mutable_cpu_data()[0] = 0; output_length_ptr->mutable_cpu_data()[bi] = 0;
multitask_instance_filter( multitask_instance_filter(
rawDetections->data_->gpu_data(), rawDetectionLength, rawDetections->data_->gpu_data() + bi * rawDetectionLength * detectionStep,
bboxes_ptr->mutable_gpu_data(), rawDetectionLength,
output_length_ptr->mutable_gpu_data(), bboxes_ptr->mutable_gpu_data() + bi * maxOutputNum * detectionStep,
output_length_ptr->mutable_gpu_data() + bi,
scoreThreshold, detectionStep, maxOutputNum scoreThreshold, detectionStep, maxOutputNum
); );
auto outputLength = output_length_ptr->cpu_data(); auto outputLength = output_length_ptr->cpu_data() + bi;
auto outputBoxes = bboxes_ptr->mutable_cpu_data(); auto outputBoxes = bboxes_ptr->mutable_cpu_data() + bi * maxOutputNum * detectionStep;
auto proto = segProtos->data_->mutable_cpu_data(); LOG_INFO << "Num of boxes after filter (outputLength[0]): " << outputLength[0];
LOG_INFO << "Elem of boxes after filter (outputBoxes[0]): " << outputBoxes[0];
// Detection // Detection
float img_width = float(inputImage->img_ptr_->cols()); float img_width = float(inputImage->img_ptr_->cols());
...@@ -138,6 +174,8 @@ bool MultiPostProcess::Exec() { ...@@ -138,6 +174,8 @@ bool MultiPostProcess::Exec() {
// Instance Mask // Instance Mask
int mask_width = inputWidth / instanceDownScale; int mask_width = inputWidth / instanceDownScale;
int mask_height = inputHeight / instanceDownScale; int mask_height = inputHeight / instanceDownScale;
auto proto = segProtos->data_->mutable_cpu_data() + bi * mask_width * mask_height * segProtoDim;
for(auto det: dets->detections){ for(auto det: dets->detections){
int x1 = round(det->x1 * scalex / instanceDownScale); // scale to output mask level. int x1 = round(det->x1 * scalex / instanceDownScale); // scale to output mask level.
int x2 = round(det->x2 * scalex / instanceDownScale); int x2 = round(det->x2 * scalex / instanceDownScale);
...@@ -168,18 +206,22 @@ bool MultiPostProcess::Exec() { ...@@ -168,18 +206,22 @@ bool MultiPostProcess::Exec() {
); );
// LOG_INFO << x1 << " " << x2 << " " << y1 << " " << y2 <<", " << det->mask_ptr->rle_string; // LOG_INFO << x1 << " " << x2 << " " << y1 << " " << y2 <<", " << det->mask_ptr->rle_string;
} }
pMap->SetIOPtr(outputNames[0], dets); // pMap->SetIOPtr(outputNames[0], dets);
detection_ptr_array->emplace_back(dets);
// Semantic Mask // Semantic Mask
auto semantics = std::make_shared<ios::Semantics>(); auto semantics = std::make_shared<ios::Semantics>();
auto rawSemanticSegs = rawSemantics->data_->mutable_cpu_data(); int semanticHeight = inputHeight / semanticDownScale;
int semanticWidth = inputWidth / semanticDownScale;
auto rawSemanticSegs = rawSemantics->data_->mutable_cpu_data() + bi * semanticClassNumber * semanticHeight * semanticWidth;
for(int is=0; is < semanticClassNumber; is++){ for(int is=0; is < semanticClassNumber; is++){
auto seg = std::make_shared<ios::SemanticSeg>(); auto seg = std::make_shared<ios::SemanticSeg>();
seg->class_label = is; seg->class_label = is;
seg->class_name = semanticClassNames[is]; seg->class_name = semanticClassNames[is];
cv::Mat mask_mat = cv::Mat::zeros(inputHeight / semanticDownScale, inputWidth / semanticDownScale, CV_32FC1); cv::Mat mask_mat = cv::Mat::zeros(semanticHeight, semanticWidth, CV_32FC1);
for (int x = 0; x < inputWidth / semanticDownScale; x++) { for (int x = 0; x < semanticWidth; x++) {
for (int y = 0; y < inputHeight / semanticDownScale; y++) { for (int y = 0; y < semanticHeight; y++) {
float e = rawSemanticSegs[is * mask_mat.cols * mask_mat.rows + y * mask_mat.cols + x]; float e = rawSemanticSegs[is * mask_mat.cols * mask_mat.rows + y * mask_mat.cols + x];
// e = 1.0f / (1.0f + expf(-e)); // e = 1.0f / (1.0f + expf(-e));
mask_mat.at<float>(y, x) = e; mask_mat.at<float>(y, x) = e;
...@@ -200,14 +242,19 @@ bool MultiPostProcess::Exec() { ...@@ -200,14 +242,19 @@ bool MultiPostProcess::Exec() {
// LOG_INFO << seg->mask_ptr->rle_string; // LOG_INFO << seg->mask_ptr->rle_string;
semantics->semantic_segs.push_back(seg); semantics->semantic_segs.push_back(seg);
} }
pMap->SetIOPtr(outputNames[1], semantics); // pMap->SetIOPtr(outputNames[1], semantics);
semantic_ptr_array->emplace_back(semantics);
// Depth // Depth
auto depth = std::make_shared<ios::Depth>(); auto depth = std::make_shared<ios::Depth>();
auto rawDepth = rawDepths->data_->mutable_cpu_data(); int depthHeight = inputHeight / depthDownScale;
cv::Mat mask_mat = cv::Mat::zeros(inputHeight / depthDownScale, inputWidth / depthDownScale, CV_32FC1); int depthWidth = inputWidth / depthDownScale;
for (int x = 0; x < inputWidth / depthDownScale; x++) {
for (int y = 0; y < inputHeight / depthDownScale; y++) { cv::Mat mask_mat = cv::Mat::zeros(depthHeight, depthWidth, CV_32FC1);
auto rawDepth = rawDepths->data_->mutable_cpu_data() + bi * depthHeight * depthWidth;
for (int x = 0; x < depthWidth; x++) {
for (int y = 0; y < depthHeight; y++) {
float e = rawDepth[y * mask_mat.cols + x]; float e = rawDepth[y * mask_mat.cols + x];
// e = 1.0f / (1.0f + expf(-e)); // e = 1.0f / (1.0f + expf(-e));
mask_mat.at<float>(y, x) = e * depthDistanceScale; mask_mat.at<float>(y, x) = e * depthDistanceScale;
...@@ -223,7 +270,19 @@ bool MultiPostProcess::Exec() { ...@@ -223,7 +270,19 @@ bool MultiPostProcess::Exec() {
// cv::resize(mask_mat, mask_res, cv::Size(img_width, img_height)); // cv::resize(mask_mat, mask_res, cv::Size(img_width, img_height));
// } // }
depth->depth = mask_mat; depth->depth = mask_mat;
pMap->SetIOPtr(outputNames[2], depth); // pMap->SetIOPtr(outputNames[2], depth);
depth_ptr_array->emplace_back(depth);
}
auto batch_detection_ptr = std::make_shared<ios::BatchDetection2Ds>();
batch_detection_ptr->batch_detections = *detection_ptr_array;
auto batch_semantic_ptr = std::make_shared<ios::BatchSemantics>();
batch_semantic_ptr->batch_semantic_segs = *semantic_ptr_array;
auto batch_depth_ptr = std::make_shared<ios::BatchDepth>();
batch_depth_ptr->batch_depths = *depth_ptr_array;
pMap->SetIOPtr(outputNames[0], batch_detection_ptr);
pMap->SetIOPtr(outputNames[1], batch_semantic_ptr);
pMap->SetIOPtr(outputNames[2], batch_depth_ptr);
return true; return true;
......
/*
* @Author: yangxue && xue.yang@waytous.com
* @Date: 2025-09-05 03:06:58
* @LastEditors: yangxue xue.yang@waytous.com
* @LastEditTime: 2025-09-06 03:24:47
* @FilePath: /ubuntu/projects/deepinfer/src/libs/preprocessors/resize_norm.cpp
* @Description:
*
* Copyright (c) 2025 by ${git_name_email}, All Rights Reserved.
*/
#include "libs/preprocessors/resize_norm.h" #include "libs/preprocessors/resize_norm.h"
...@@ -17,6 +27,7 @@ bool ResizeNorm::Init(YAML::Node& node, interfaces::BaseIOMapPtr pmap){ ...@@ -17,6 +27,7 @@ bool ResizeNorm::Init(YAML::Node& node, interfaces::BaseIOMapPtr pmap){
inferBatchSize = node["inferBatchSize"].as<int>(); inferBatchSize = node["inferBatchSize"].as<int>();
fixAspectRatio = node["fixAspectRatio"].as<bool>(); fixAspectRatio = node["fixAspectRatio"].as<bool>();
LOG_INFO << "inputNames size / inferBatchSize: " << inputNames.size() << " " << inferBatchSize;
if(inputNames.size() != inferBatchSize){ if(inputNames.size() != inferBatchSize){
LOG_ERROR << "Resize norm got wrong inputs number: " << inputNames.size() << " with infer batchsize: " << inferBatchSize; LOG_ERROR << "Resize norm got wrong inputs number: " << inputNames.size() << " with infer batchsize: " << inferBatchSize;
return false; return false;
...@@ -37,7 +48,9 @@ bool ResizeNorm::Init(YAML::Node& node, interfaces::BaseIOMapPtr pmap){ ...@@ -37,7 +48,9 @@ bool ResizeNorm::Init(YAML::Node& node, interfaces::BaseIOMapPtr pmap){
return true; return true;
}; };
/*
多个ios::CameraSrcOut (base::Image8U) -> 一个 ios::NormalIO (base::Blob)
*/
bool ResizeNorm::Exec(){ bool ResizeNorm::Exec(){
for(int b=0; b < inputNames.size(); b++){ for(int b=0; b < inputNames.size(); b++){
auto inputName = inputNames[b]; auto inputName = inputNames[b];
...@@ -59,6 +72,14 @@ bool ResizeNorm::Exec(){ ...@@ -59,6 +72,14 @@ bool ResizeNorm::Exec(){
useBGR, fixAspectRatio useBGR, fixAspectRatio
); );
} }
// LOG_INFO << "resize norm done.";
// // debug: download to cpu and save image
// for(int i = 0; i < inferBatchSize; i++) {
// cv::Mat mat(inputHeight, inputWidth, CV_32FC3);
// cudaMemcpy(mat.data, dst->cpu_data() + (i * 3 * inputHeight * inputWidth), sizeof(float) * 3 * inputHeight * inputWidth, cudaMemcpyDeviceToHost);
// mat.convertTo(mat, CV_8UC3, 255.0);
// cv::imwrite("resize_norm_" + std::to_string(i) + ".jpg", mat);
// }
return true; return true;
}; };
......
/*
* @Author: yangxue && xue.yang@waytous.com
* @Date: 2025-09-05 08:58:55
* @LastEditors: yangxue xue.yang@waytous.com
* @LastEditTime: 2025-09-05 08:58:55
* @FilePath: /ubuntu/projects/deepinfer/src/libs/sources/camera_src.cpp
* @Description:
*
* Copyright (c) 2025 by ${git_name_email}, All Rights Reserved.
*/
#include "libs/sources/camera_src.h" #include "libs/sources/camera_src.h"
...@@ -8,7 +18,9 @@ namespace sources { ...@@ -8,7 +18,9 @@ namespace sources {
bool CameraSrc::Exec(){ bool CameraSrc::Exec(){
auto src_input = pMap->GetIOPtr(inputNames[0]); // 将ios::CameraSrcIn(cv::Mat) 转换为 ios::CameraSrcOut (base::Image8U)
for(int i = 0; i < inputNames.size(); i++) {
auto src_input = pMap->GetIOPtr(inputNames[i]);
if(src_input == nullptr){ if(src_input == nullptr){
LOG_ERROR << "CameraSrc input" << inputNames[0] << " haven't init"; LOG_ERROR << "CameraSrc input" << inputNames[0] << " haven't init";
return false; return false;
...@@ -16,7 +28,9 @@ bool CameraSrc::Exec(){ ...@@ -16,7 +28,9 @@ bool CameraSrc::Exec(){
auto src = std::dynamic_pointer_cast<ios::CameraSrcIn>(src_input); auto src = std::dynamic_pointer_cast<ios::CameraSrcIn>(src_input);
auto img = std::make_shared<base::Image8U>(src->cv_img_); auto img = std::make_shared<base::Image8U>(src->cv_img_);
auto dst = std::make_shared<ios::CameraSrcOut>(img); auto dst = std::make_shared<ios::CameraSrcOut>(img);
pMap->SetIOPtr(outputNames[0], dst); pMap->SetIOPtr(outputNames[i], dst);
}
return true; return true;
} }
......
...@@ -321,6 +321,7 @@ int_t _ca_dense( ...@@ -321,6 +321,7 @@ int_t _ca_dense(
/** Solve dense sparse LAP. /** Solve dense sparse LAP.
* Linear Assignment Problem (LAP) solver using the Jonker-Volgenant algorithm.
*/ */
int lapjv_internal( int lapjv_internal(
const uint_t n, cost_t *cost[], const uint_t n, cost_t *cost[],
......
/*
* @Author: yangxue && xue.yang@waytous.com
* @Date: 2025-06-04 08:18:42
* @LastEditors: yangxue xue.yang@waytous.com
* @LastEditTime: 2025-06-04 08:18:42
* @FilePath: /ubuntu/projects/deepinfer/src/libs/trackers/tracker_datatype.h
* @Description:
*
* Copyright (c) 2025 by ${git_name_email}, All Rights Reserved.
*/
#ifndef WAYTOUS_DEEPINFER_TRACKER_DATATYPE_H_ #ifndef WAYTOUS_DEEPINFER_TRACKER_DATATYPE_H_
#define WAYTOUS_DEEPINFER_TRACKER_DATATYPE_H_ #define WAYTOUS_DEEPINFER_TRACKER_DATATYPE_H_
...@@ -13,6 +23,7 @@ typedef Eigen::Matrix<float, 1, 4, Eigen::RowMajor> DETECTBOX; // xyah ...@@ -13,6 +23,7 @@ typedef Eigen::Matrix<float, 1, 4, Eigen::RowMajor> DETECTBOX; // xyah
typedef Eigen::Matrix<float, -1, 4, Eigen::RowMajor> DETECTBOXSS; typedef Eigen::Matrix<float, -1, 4, Eigen::RowMajor> DETECTBOXSS;
//Kalmanfilter //Kalmanfilter
// state: [cx, cy, a(w/h), h, vx, vy, va, vh]
//typedef Eigen::Matrix<float, 8, 8, Eigen::RowMajor> KAL_FILTER; //typedef Eigen::Matrix<float, 8, 8, Eigen::RowMajor> KAL_FILTER;
typedef Eigen::Matrix<float, 1, 8, Eigen::RowMajor> KAL_MEAN; typedef Eigen::Matrix<float, 1, 8, Eigen::RowMajor> KAL_MEAN;
typedef Eigen::Matrix<float, 8, 8, Eigen::RowMajor> KAL_COVA; typedef Eigen::Matrix<float, 8, 8, Eigen::RowMajor> KAL_COVA;
......
...@@ -12,6 +12,7 @@ bool CameraModel::Init(std::string& configPath) { ...@@ -12,6 +12,7 @@ bool CameraModel::Init(std::string& configPath) {
LOG_WARN << "Init CameraModel config_file "<< common::ConfigRoot::GetRootPath() << " " << cfgPath << " not exist."; LOG_WARN << "Init CameraModel config_file "<< common::ConfigRoot::GetRootPath() << " " << cfgPath << " not exist.";
return false; return false;
} }
LOG_INFO << "config path: " << cfgPath;
modelConfigNode = YAML::LoadFile(cfgPath); modelConfigNode = YAML::LoadFile(cfgPath);
if (modelConfigNode.IsNull()) { if (modelConfigNode.IsNull()) {
LOG_WARN << "Init CameraModel, Load " << configPath << " failed! please check!"; LOG_WARN << "Init CameraModel, Load " << configPath << " failed! please check!";
...@@ -19,6 +20,10 @@ bool CameraModel::Init(std::string& configPath) { ...@@ -19,6 +20,10 @@ bool CameraModel::Init(std::string& configPath) {
} }
inputNames = modelConfigNode["inputNames"].as<std::vector<std::string>>(); inputNames = modelConfigNode["inputNames"].as<std::vector<std::string>>();
outputNames = modelConfigNode["outputNames"].as<std::vector<std::string>>(); outputNames = modelConfigNode["outputNames"].as<std::vector<std::string>>();
LOG_INFO << "CameraModel input/output size: " << inputNames.size() << " " << outputNames.size();
for(auto name: outputNames) {
LOG_INFO << "CameraModel outputName: " << name;
}
modelUnitMap = std::make_shared<interfaces::BaseIOMap>(); modelUnitMap = std::make_shared<interfaces::BaseIOMap>();
......
...@@ -13,6 +13,7 @@ bool TaskMulti::Init(std::string& taskConfigPath){ ...@@ -13,6 +13,7 @@ bool TaskMulti::Init(std::string& taskConfigPath){
}; };
std::string modelName = taskNode["modelName"].as<std::string>(); std::string modelName = taskNode["modelName"].as<std::string>();
std::string modelConfigPath = taskNode["modelConfigPath"].as<std::string>(); std::string modelConfigPath = taskNode["modelConfigPath"].as<std::string>();
LOG_INFO << "modelConfigPath: " << modelConfigPath;
MulitTaskModel.reset(interfaces::BaseModelRegisterer::GetInstanceByName(modelName)); MulitTaskModel.reset(interfaces::BaseModelRegisterer::GetInstanceByName(modelName));
if(!MulitTaskModel->Init(modelConfigPath)){ if(!MulitTaskModel->Init(modelConfigPath)){
...@@ -34,12 +35,19 @@ bool TaskMulti::Exec(std::vector<cv::Mat*> inputs, std::vector<interfaces::BaseI ...@@ -34,12 +35,19 @@ bool TaskMulti::Exec(std::vector<cv::Mat*> inputs, std::vector<interfaces::BaseI
void TaskMulti::Visualize(cv::Mat* image, interfaces::BaseIOPtr outs){ void TaskMulti::Visualize(cv::Mat* image, interfaces::BaseIOPtr outs){
} }
void TaskMulti::Visualize(cv::Mat* image, std::vector<interfaces::BaseIOPtr>& outputs) {
void TaskMulti::Visualize(cv::Mat* image, std::vector<interfaces::BaseIOPtr>& outputs){ }
auto detections = std::dynamic_pointer_cast<ios::Detection2Ds>(outputs[0])->detections; void TaskMulti::Visualize(std::vector<cv::Mat*> images, std::vector<interfaces::BaseIOPtr>& outputs){
auto undistort_image = std::dynamic_pointer_cast<ios::CameraSrcOut>(outputs[outputs.size()-1])->img_ptr_->toCVMat(); std::cout << "vis 1: ...................." << std::endl;
int batch_size = images.size();
for(int i = 0; i < batch_size; i++) {
cv::Mat* image = images[i];
auto detections = std::dynamic_pointer_cast<ios::BatchDetection2Ds>(outputs[0])->batch_detections[i]->detections;
auto undistort_image = std::dynamic_pointer_cast<ios::CameraSrcOut>(outputs[outputs.size()-(batch_size-i)])->img_ptr_->toCVMat();
undistort_image.copyTo(*image); undistort_image.copyTo(*image);
// image->data = undistort_image.clone().data; // image->data = undistort_image.clone().data;
std::cout << "vis 2: ...................." << std::endl;
for(auto& obj: detections){ for(auto& obj: detections){
cv::Scalar color = get_color(obj->class_label * 100 + obj->track_id); cv::Scalar color = get_color(obj->class_label * 100 + obj->track_id);
cv::putText(*image, std::to_string(obj->class_label) + ":" + common::formatValue(obj->confidence, 2), cv::putText(*image, std::to_string(obj->class_label) + ":" + common::formatValue(obj->confidence, 2),
...@@ -61,8 +69,9 @@ void TaskMulti::Visualize(cv::Mat* image, std::vector<interfaces::BaseIOPtr>& ou ...@@ -61,8 +69,9 @@ void TaskMulti::Visualize(cv::Mat* image, std::vector<interfaces::BaseIOPtr>& ou
} }
} }
} }
std::cout << "vis 3: ...................." << std::endl;
auto semantics = std::dynamic_pointer_cast<ios::Semantics>(outputs[1])->semantic_segs; auto semantics = std::dynamic_pointer_cast<ios::BatchSemantics>(outputs[1])->batch_semantic_segs[i]->semantic_segs;
for(auto& obj: semantics){ for(auto& obj: semantics){
cv::Scalar color = get_color(obj->class_label + 100); cv::Scalar color = get_color(obj->class_label + 100);
// LOG_INFO << obj->mask_ptr->width << ", " << obj->mask_ptr->height << ", " << obj->mask_ptr->rle_string; // LOG_INFO << obj->mask_ptr->width << ", " << obj->mask_ptr->height << ", " << obj->mask_ptr->rle_string;
...@@ -78,6 +87,9 @@ void TaskMulti::Visualize(cv::Mat* image, std::vector<interfaces::BaseIOPtr>& ou ...@@ -78,6 +87,9 @@ void TaskMulti::Visualize(cv::Mat* image, std::vector<interfaces::BaseIOPtr>& ou
} }
} }
} }
}
} }
......
/*
* @Author: yangxue && xue.yang@waytous.com
* @Date: 2025-09-05 10:17:55
* @LastEditors: yangxue xue.yang@waytous.com
* @LastEditTime: 2025-09-06 02:37:51
* @FilePath: /ubuntu/projects/deepinfer/src/tasks/task_multi.h
* @Description:
*
* Copyright (c) 2025 by ${git_name_email}, All Rights Reserved.
*/
#ifndef WAYTOUS_DEEPINFER_TASK_MULTI_H_ #ifndef WAYTOUS_DEEPINFER_TASK_MULTI_H_
#define WAYTOUS_DEEPINFER_TASK_MULTI_H_ #define WAYTOUS_DEEPINFER_TASK_MULTI_H_
...@@ -20,7 +30,8 @@ public: ...@@ -20,7 +30,8 @@ public:
bool Exec(std::vector<cv::Mat*> inputs, std::vector<interfaces::BaseIOPtr>& outputs) override; bool Exec(std::vector<cv::Mat*> inputs, std::vector<interfaces::BaseIOPtr>& outputs) override;
void Visualize(cv::Mat* image, interfaces::BaseIOPtr outs) override; void Visualize(cv::Mat* image, interfaces::BaseIOPtr outs) override;
void Visualize(cv::Mat* image, std::vector<interfaces::BaseIOPtr>& outputs); void Visualize(cv::Mat* image, std::vector<interfaces::BaseIOPtr>& outputs) override;
void Visualize(std::vector<cv::Mat*> image, std::vector<interfaces::BaseIOPtr>& outputs) override;
std::string Name() override; std::string Name() override;
public: public:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment