

#include "libs/inferences/tensorrt/trt_yolov5_layer.h"


namespace nvinfer1
{

    // Builds the plugin from explicit network parameters.
    //   classCount         - number of object classes
    //   netWidth/netHeight - network input size used to scale decoded boxes
    //   maxOut             - maximum number of detections written per image
    //   numAnchorsPerLevel - anchors used at each detection level
    //   vYoloKernel        - one descriptor per detection level (copied)
    // For every level the first numAnchorsPerLevel (w, h) anchor pairs are
    // uploaded to their own device buffer; the device pointers are kept in the
    // host table mAnchor allocated with cudaMallocHost.
    YoloV5LayerPlugin::YoloV5LayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, 
        int numAnchorsPerLevel, const std::vector<waytous::deepinfer::inference::Yolo::YoloKernel> &vYoloKernel)
    {
        mYoloKernel = vYoloKernel;
        mKernelCount = vYoloKernel.size();
        mNumAnchorsPerLevel = numAnchorsPerLevel;
        mMaxOutObject = maxOut;
        mYoloV5NetHeight = netHeight;
        mYoloV5NetWidth = netWidth;
        mClassCount = classCount;

        // Table with one device pointer per detection level.
        CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*)));

        // Only the anchors actually in use are copied: two floats per anchor.
        const size_t anchorBytes = sizeof(float) * mNumAnchorsPerLevel * 2;
        for (int level = 0; level < mKernelCount; ++level)
        {
            const auto& kernel = mYoloKernel[level];
            CUDA_CHECK(cudaMalloc(&mAnchor[level], anchorBytes));
            CUDA_CHECK(cudaMemcpy(mAnchor[level], kernel.anchors, anchorBytes, cudaMemcpyHostToDevice));
        }
    }
    
    // Releases every per-level device anchor buffer and then the host table
    // that stored their pointers.
    YoloV5LayerPlugin::~YoloV5LayerPlugin()
    {
        int level = 0;
        while (level < mKernelCount)
        {
            CUDA_CHECK(cudaFree(mAnchor[level]));
            ++level;
        }
        CUDA_CHECK(cudaFreeHost(mAnchor));
    }

    // Rebuilds a plugin from the byte stream written by serialize().
    //   data   - start of the serialized blob
    //   length - blob size in bytes; only verified by an assert
    // Fields are read in the same order serialize() writes them, followed by
    // mKernelCount raw YoloKernel records. The anchors of each level are then
    // uploaded to the device exactly as in the parameter constructor.
    YoloV5LayerPlugin::YoloV5LayerPlugin(const void* data, size_t length)
    {
        using namespace waytous::deepinfer::inference::Tn;
        using YoloKernel = waytous::deepinfer::inference::Yolo::YoloKernel;

        const char* const begin = reinterpret_cast<const char*>(data);
        const char* cursor = begin;

        // Scalar header.
        read(cursor, mClassCount);
        read(cursor, mThreadCount);
        read(cursor, mKernelCount);
        read(cursor, mYoloV5NetWidth);
        read(cursor, mYoloV5NetHeight);
        read(cursor, mMaxOutObject);
        read(cursor, mNumAnchorsPerLevel);

        // Raw kernel descriptors, copied byte-wise.
        const auto kernelBytes = mKernelCount * sizeof(YoloKernel);
        mYoloKernel.resize(mKernelCount);
        memcpy(mYoloKernel.data(), cursor, kernelBytes);
        cursor += kernelBytes;

        // Device-side anchor buffers, one per level (valid anchors only).
        CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*)));
        const size_t anchorBytes = sizeof(float) * mNumAnchorsPerLevel * 2;
        for (int level = 0; level < mKernelCount; ++level)
        {
            const auto& kernel = mYoloKernel[level];
            CUDA_CHECK(cudaMalloc(&mAnchor[level], anchorBytes));
            CUDA_CHECK(cudaMemcpy(mAnchor[level], kernel.anchors, anchorBytes, cudaMemcpyHostToDevice));
        }

        // The whole blob must have been consumed.
        assert(cursor == begin + length);
    }

    // Writes the plugin state into `buffer`: the scalar header followed by the
    // raw YoloKernel records. The layout must stay in sync with the
    // deserializing constructor; the number of bytes written is asserted to
    // equal getSerializationSize().
    void YoloV5LayerPlugin::serialize(void* buffer) const TRT_NOEXCEPT
    {
        using namespace waytous::deepinfer::inference::Tn;
        using YoloKernel = waytous::deepinfer::inference::Yolo::YoloKernel;

        char* const begin = static_cast<char*>(buffer);
        char* cursor = begin;

        write(cursor, mClassCount);
        write(cursor, mThreadCount);
        write(cursor, mKernelCount);
        write(cursor, mYoloV5NetWidth);
        write(cursor, mYoloV5NetHeight);
        write(cursor, mMaxOutObject);
        write(cursor, mNumAnchorsPerLevel);

        const auto kernelBytes = mKernelCount * sizeof(YoloKernel);
        memcpy(cursor, mYoloKernel.data(), kernelBytes);
        cursor += kernelBytes;

        assert(cursor == begin + getSerializationSize());
    }

    // Number of bytes serialize() produces: the seven scalar fields plus one
    // raw YoloKernel record per detection level.
    size_t YoloV5LayerPlugin::getSerializationSize() const TRT_NOEXCEPT
    {
        using YoloKernel = waytous::deepinfer::inference::Yolo::YoloKernel;

        const size_t headerBytes = sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mKernelCount)
            + sizeof(mYoloV5NetWidth) + sizeof(mYoloV5NetHeight)
            + sizeof(mMaxOutObject) + sizeof(mNumAnchorsPerLevel);
        const size_t kernelBytes = sizeof(YoloKernel) * mYoloKernel.size();
        return headerBytes + kernelBytes;
    }

    // Nothing to set up here (the device anchor buffers are created in the
    // constructors); always returns 0.
    int YoloV5LayerPlugin::initialize() TRT_NOEXCEPT
    {
        constexpr int kSuccess = 0;
        return kSuccess;
    }

    // Per-image shape of the requested output:
    //   index 0 -> boxes,  (mMaxOutObject, 1, 4)
    //   index 1 -> scores, (mMaxOutObject, mClassCount)
    // The input dimensions are not consulted.
    Dims YoloV5LayerPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) TRT_NOEXCEPT
    {
        assert(index < 2);
        if (index != 0)
        {
            // Score tensor: one row of class scores per detection slot.
            return DimsHW(mMaxOutObject, mClassCount);
        }
        // Box tensor: four coordinates per detection slot.
        return Dims3(mMaxOutObject, 1, 4);
    }

    // Records the namespace this plugin belongs to.
    // NOTE(review): the pointer is assigned as-is; mPluginNamespace appears to
    // be a raw pointer, so the caller's string would have to outlive the
    // plugin - confirm against the header.
    void YoloV5LayerPlugin::setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT
    {
        this->mPluginNamespace = pluginNamespace;
    }

    // Returns the namespace previously stored by setPluginNamespace().
    const char* YoloV5LayerPlugin::getPluginNamespace() const TRT_NOEXCEPT
    {
        return this->mPluginNamespace;
    }

    // Every output of this plugin is reported as 32-bit float, whatever the
    // output index or the input types are.
    DataType YoloV5LayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT
    {
        constexpr DataType kOutputType = DataType::kFLOAT;
        return kOutputType;
    }

    // Outputs are never broadcast across the batch: always returns false.
    bool YoloV5LayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT
    {
        constexpr bool kOutputIsBroadcast = false;
        return kOutputIsBroadcast;
    }

    // The plugin does not accept batch-broadcast inputs without replication:
    // always returns false.
    bool YoloV5LayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT
    {
        constexpr bool kAcceptsBroadcastInput = false;
        return kAcceptsBroadcastInput;
    }

    // Configuration hook; this plugin ignores the tensor descriptions.
    void YoloV5LayerPlugin::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) TRT_NOEXCEPT
    {
        // Intentionally empty.
    }

    // Hook invoked when the plugin is attached to an execution context. None of
    // the offered resources (cuDNN, cuBLAS, GPU allocator) are used here.
    void YoloV5LayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT
    {
        // Intentionally empty.
    }

    // Hook invoked when the plugin is detached from its execution context;
    // there is nothing to release.
    void YoloV5LayerPlugin::detachFromContext() TRT_NOEXCEPT
    {
        // Intentionally empty.
    }

    // Type name of this plugin; identical to the string returned by
    // YoloV5PluginCreator::getPluginName().
    const char* YoloV5LayerPlugin::getPluginType() const TRT_NOEXCEPT
    {
        static constexpr const char* kPluginType = "YoloV5Layer_TRT";
        return kPluginType;
    }

    // Version string of this plugin; identical to the string returned by
    // YoloV5PluginCreator::getPluginVersion().
    const char* YoloV5LayerPlugin::getPluginVersion() const TRT_NOEXCEPT
    {
        static constexpr const char* kPluginVersion = "1";
        return kPluginVersion;
    }

    // Deletes this plugin object. The object must have been allocated with
    // `new` (as clone(), createPlugin() and deserializePlugin() do) and must
    // not be used after this call.
    void YoloV5LayerPlugin::destroy() TRT_NOEXCEPT
    {
        delete this;
    }

    // Returns a new heap-allocated plugin built from this one's parameters and
    // kernel descriptors, with the same namespace. The copy allocates its own
    // device anchor buffers through the parameter constructor, which does not
    // receive mThreadCount.
    IPluginV2IOExt* YoloV5LayerPlugin::clone() const TRT_NOEXCEPT
    {
        auto* copy = new YoloV5LayerPlugin(
            mClassCount, mYoloV5NetWidth, mYoloV5NetHeight,
            mMaxOutObject, mNumAnchorsPerLevel, mYoloKernel);
        copy->setPluginNamespace(mPluginNamespace);
        return copy;
    }

    // Logistic sigmoid, 1 / (1 + e^(-data)), evaluated in single precision.
    __device__ float Logist(float data)
    {
        const float expNeg = expf(-data);
        return 1.0f / (1.0f + expNeg);
    }

    // Decodes one YOLOv5 detection level. One thread handles one grid cell of
    // one image and loops over that cell's anchors.
    //
    //   input       - raw head output; the indexing below treats it as
    //                 [image][anchor][channel][cell] with 5 + classes channels
    //                 (x, y, w, h, objectness, class scores...)
    //   bboxData    - output boxes, maxoutobject * 4 floats per image (x1, y1, x2, y2)
    //   scoreData   - output scores, maxoutobject * classes floats per image
    //   countData   - one int counter per image, incremented atomically
    //   noElements  - total number of threads with work (cells * images)
    //   netwidth/netheight   - network input size used to scale box centres
    //   yoloWidth/yoloHeight - grid size of this level
    //   anchors     - (w, h) pairs for this level, numAnchorsPerLevel of them
    //
    // A candidate is kept when sigmoid(objectness) >= 0.1. Each kept candidate
    // claims the next slot through the counter; once the counter has reached
    // maxoutobject the thread stops (the counter itself keeps growing).
    __global__ void CalDetection(const float *input, float *bboxData, float *scoreData, int *countData, int noElements,
        const int netwidth, const int netheight, int maxoutobject, int yoloWidth, int yoloHeight, const float* anchors, int classes, int numAnchorsPerLevel)
    {
        const int globalIdx = threadIdx.x + blockDim.x * blockIdx.x;
        if (globalIdx >= noElements) return;

        // Split the flat thread index into (image, cell).
        const int cellsPerLevel = yoloWidth * yoloHeight;
        const int imageIdx = globalIdx / cellsPerLevel;
        const int cellIdx = globalIdx - cellsPerLevel * imageIdx;
        const int channelsPerAnchor = 5 + classes;   // e.g. 85 for 80 classes
        const float* imageInput = input + imageIdx * (channelsPerAnchor * cellsPerLevel * numAnchorsPerLevel);

        // Grid coordinates of this cell.
        const int row = cellIdx / yoloWidth;
        const int col = cellIdx % yoloWidth;

        int* imageCount = countData + imageIdx;

        for (int anchorIdx = 0; anchorIdx < numAnchorsPerLevel; ++anchorIdx) {
            // Channel c of this anchor at this cell lives at cellData[c * cellsPerLevel].
            const float* cellData = imageInput + (cellIdx + anchorIdx * channelsPerAnchor * cellsPerLevel);

            const float objectness = Logist(cellData[4 * cellsPerLevel]);
            if (objectness < 0.1) continue;   // drop low-objectness candidates

            // Reserve an output slot; stop once the per-image budget is used up.
            const int slot = atomicAdd(imageCount, 1);
            if (slot >= maxoutobject) return;

            float* outBox = bboxData + imageIdx * maxoutobject * 4 + slot * 4;
            float* outScore = scoreData + imageIdx * maxoutobject * classes + slot * classes;

            // Per-class confidence = sigmoid(class logit) * objectness.
            for (int cls = 0; cls < classes; ++cls)
            {
                const float classProb = Logist(cellData[(cls + 5) * cellsPerLevel]);
                outScore[cls] = classProb * objectness;
            }

            // Box centre, following the YOLOv5 PyTorch decode:
            //   y = x[i].sigmoid()
            //   y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + grid) * stride
            // where stride is expressed here as net size / grid size.
            const float cx = (col - 0.5f + 2.0f * Logist(cellData[0 * cellsPerLevel])) * netwidth / yoloWidth;
            const float cy = (row - 0.5f + 2.0f * Logist(cellData[1 * cellsPerLevel])) * netheight / yoloHeight;

            // Box size: y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * anchor
            // (see https://github.com/ultralytics/yolov5/issues/471).
            float w = 2.0f * Logist(cellData[2 * cellsPerLevel]);
            w = w * w * anchors[2 * anchorIdx];
            float h = 2.0f * Logist(cellData[3 * cellsPerLevel]);
            h = h * h * anchors[2 * anchorIdx + 1];

            // Convert (cx, cy, w, h) to corner form (x1, y1, x2, y2).
            outBox[0] = cx - 0.5 * w;
            outBox[1] = cy - 0.5 * h;
            outBox[2] = cx + 0.5 * w;
            outBox[3] = cy + 0.5 * h;
        }
    }

    void YoloV5LayerPlugin::forwardGpu(const float* const* inputs, void** outputs, void* workspace, cudaStream_t stream, int batchSize)
    {
        float *bboxData = (float *)outputs[0];
        float *scoreData = (float *)outputs[1];
        int *countData = (int *)workspace;


        CUDA_CHECK(cudaMemset(countData, 0, sizeof(int) * batchSize));
        CUDA_CHECK(cudaMemset(bboxData, 0, sizeof(float) * mMaxOutObject * 4 * batchSize));
        CUDA_CHECK(cudaMemset(scoreData, 0, sizeof(float) * mMaxOutObject * mClassCount * batchSize));

        int numElem = 0;
        for (unsigned int i = 0; i < mYoloKernel.size(); ++i){
            const auto& yolo = mYoloKernel[i];
            numElem = yolo.width * yolo.height * batchSize;
            if (numElem < mThreadCount) mThreadCount = numElem;
            CalDetection<<< (numElem + mThreadCount - 1) / mThreadCount, mThreadCount>>>
                (inputs[i], bboxData, scoreData, countData, numElem, mYoloV5NetWidth, mYoloV5NetHeight, mMaxOutObject, 
                 yolo.width, yolo.height, (float*)mAnchor[i], mClassCount, mNumAnchorsPerLevel);
        }
    }


    // Execution entry point: reinterprets the inputs as float tensors, forwards
    // everything to forwardGpu() and returns 0.
    int YoloV5LayerPlugin::enqueue(int batchSize, const void *const *inputs, void ** outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT
    {
        const float* const* floatInputs = reinterpret_cast<const float* const*>(inputs);
        forwardGpu(floatInputs, outputs, workspace, stream, batchSize);
        return 0;
    }

    // Static storage shared by all creator instances: the (empty) attribute
    // list and the field collection that points at it.
    PluginFieldCollection YoloV5PluginCreator::mFC{};
    std::vector<PluginField> YoloV5PluginCreator::mPluginAttributes;

    // Resets the attribute list to empty and points the field collection at
    // it, so getFieldNames() reports zero fields.
    YoloV5PluginCreator::YoloV5PluginCreator()
    {
        mPluginAttributes.clear();

        mFC.fields = mPluginAttributes.data();
        mFC.nbFields = mPluginAttributes.size();
    }

    // Name of the plugin this creator produces; identical to the string
    // returned by YoloV5LayerPlugin::getPluginType().
    const char* YoloV5PluginCreator::getPluginName() const TRT_NOEXCEPT
    {
        static constexpr const char* kPluginName = "YoloV5Layer_TRT";
        return kPluginName;
    }

    // Version of the plugin this creator produces; identical to the string
    // returned by YoloV5LayerPlugin::getPluginVersion().
    const char* YoloV5PluginCreator::getPluginVersion() const TRT_NOEXCEPT
    {
        static constexpr const char* kPluginVersion = "1";
        return kPluginVersion;
    }

    // Returns the shared static field collection, which the constructor leaves
    // with zero fields.
    const PluginFieldCollection* YoloV5PluginCreator::getFieldNames() TRT_NOEXCEPT
    {
        const PluginFieldCollection* fields = &mFC;
        return fields;
    }

    // Creates a plugin from a field collection that must contain exactly two
    // fields, in this order:
    //   "netinfo" - int array read as {class count, input width, input height,
    //               max output objects, anchors per level}
    //   "kernels" - array of Yolo::YoloKernel; `length` is the element count
    // The plugin gets this creator's namespace.
    //
    // Returns the new heap-allocated plugin, or nullptr when the collection is
    // malformed. Debug builds additionally assert on malformed input; release
    // builds previously ran with no validation at all.
    IPluginV2IOExt* YoloV5PluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT
    {
        using YoloKernel = waytous::deepinfer::inference::Yolo::YoloKernel;

        const bool validFields =
            fc != nullptr && fc->nbFields == 2 && fc->fields != nullptr
            && fc->fields[0].name != nullptr && strcmp(fc->fields[0].name, "netinfo") == 0
            && fc->fields[0].data != nullptr
            && fc->fields[1].name != nullptr && strcmp(fc->fields[1].name, "kernels") == 0
            && fc->fields[1].length >= 0
            && (fc->fields[1].length == 0 || fc->fields[1].data != nullptr);
        assert(validFields && "createPlugin expects the fields {netinfo, kernels}");
        if (!validFields)
        {
            return nullptr;
        }

        const int* netInfo = static_cast<const int*>(fc->fields[0].data);
        const int classCount = netInfo[0];
        const int inputWidth = netInfo[1];
        const int inputHeight = netInfo[2];
        const int maxOutputObjectCount = netInfo[3];
        const int numAnchorsPerLevel = netInfo[4];

        // Copy the kernel descriptors; skip memcpy for an empty list so that no
        // element of an empty vector is ever addressed.
        std::vector<YoloKernel> kernels(static_cast<size_t>(fc->fields[1].length));
        if (!kernels.empty())
        {
            memcpy(kernels.data(), fc->fields[1].data, kernels.size() * sizeof(YoloKernel));
        }

        YoloV5LayerPlugin* plugin = new YoloV5LayerPlugin(classCount, inputWidth, inputHeight,
            maxOutputObjectCount, numAnchorsPerLevel, kernels);
        plugin->setPluginNamespace(mNamespace.c_str());
        return plugin;
    }

    // Reconstructs a plugin from a blob written by YoloV5LayerPlugin::serialize()
    // and gives it this creator's namespace. The returned object is
    // heap-allocated; per the original note it is released through
    // YoloV5LayerPlugin::destroy() when the network is destroyed.
    IPluginV2IOExt* YoloV5PluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT
    {
        auto* plugin = new YoloV5LayerPlugin(serialData, serialLength);
        plugin->setPluginNamespace(mNamespace.c_str());
        return plugin;
    }
}









