Commit b6c19984 authored by dengjb

update
target_sources(${PROJECT_NAME}
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/layers.cpp
${CMAKE_CURRENT_SOURCE_DIR}/poolingLayerRT.h
${CMAKE_CURRENT_SOURCE_DIR}/poolingLayerRT.cpp
)
#include <limits>
#include <vector>
#include <iostream>
#include "fastrt/utils.h"
#include "fastrt/layers.h"
namespace trtxapi {
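/* Lower-bound clamp via kCLIP: alpha is the floor and beta is set to
 * float max, leaving the upper bound effectively open. */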
IActivationLayer* addMinClamp(INetworkDefinition* network, ITensor& input, const float min) {
IActivationLayer* clip = network->addActivation(input, ActivationType::kCLIP);
TRTASSERT(clip);
clip->setAlpha(min);
clip->setBeta(std::numeric_limits<float>::max());
return clip;
}
ITensor* addDiv255(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor* input, const std::string lname) {
Weights Div_255{ DataType::kFLOAT, nullptr, 3 };
float *wgt = reinterpret_cast<float*>(malloc(sizeof(float) * 3));
std::fill_n(wgt, 3, 255.0f);
Div_255.values = wgt;
weightMap[lname + ".div"] = Div_255;
IConstantLayer* d = network->addConstant(Dims3{ 3, 1, 1 }, Div_255);
IElementWiseLayer* div255 = network->addElementWise(*input, *d->getOutput(0), ElementWiseOperation::kDIV);
return div255->getOutput(0);
}
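/* Per-channel standardization: optionally divide by 255 first, subtract the
 * mean, and divide by std when one is provided. */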
ITensor* addMeanStd(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor* input, const std::string lname, const float* mean, const float* std, const bool div255) {
ITensor* tensor_holder{input};
if (div255) {
tensor_holder = addDiv255(network, weightMap, input, lname);
}
Weights Mean{ DataType::kFLOAT, nullptr, 3 };
Mean.values = mean;
IConstantLayer* m = network->addConstant(Dims3{ 3, 1, 1 }, Mean);
IElementWiseLayer* sub_mean = network->addElementWise(*tensor_holder, *m->getOutput(0), ElementWiseOperation::kSUB);
if (std != nullptr) {
Weights Std{ DataType::kFLOAT, nullptr, 3 };
Std.values = std;
IConstantLayer* s = network->addConstant(Dims3{ 3, 1, 1 }, Std);
IElementWiseLayer* std_mean = network->addElementWise(*sub_mean->getOutput(0), *s->getOutput(0), ElementWiseOperation::kDIV);
return std_mean->getOutput(0);
} else {
return sub_mean->getOutput(0);
}
}
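/* Fold BatchNorm into a single per-channel IScaleLayer:
 *   y = gamma * (x - mean) / sqrt(var + eps) + beta
 *     = scale * x + shift,
 * with scale = gamma / sqrt(var + eps) and shift = beta - mean * scale. */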
IScaleLayer* addBatchNorm2d(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input, const std::string lname, const float eps) {
float *gamma = (float*)weightMap[lname + ".weight"].values;
float *beta = (float*)weightMap[lname + ".bias"].values;
float *mean = (float*)weightMap[lname + ".running_mean"].values;
float *var = (float*)weightMap[lname + ".running_var"].values;
int len = weightMap[lname + ".running_var"].count;
float *scval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
for (int i = 0; i < len; i++) {
scval[i] = gamma[i] / sqrt(var[i] + eps);
}
Weights wscale{DataType::kFLOAT, scval, len};
float *shval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
for (int i = 0; i < len; i++) {
shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps);
}
Weights wshift{DataType::kFLOAT, shval, len};
float *pval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
for (int i = 0; i < len; i++) {
pval[i] = 1.0;
}
Weights wpower{DataType::kFLOAT, pval, len};
weightMap[lname + ".scale"] = wscale;
weightMap[lname + ".shift"] = wshift;
weightMap[lname + ".power"] = wpower;
IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, wshift, wscale, wpower);
TRTASSERT(scale_1);
return scale_1;
}
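/* InstanceNorm2d composed from primitive layers (reduce-axes bitmask 6 = H,W):
 * reduce1 takes the per-channel spatial mean, ew1 computes x - mean,
 * scale1 squares it, reduce2 averages to the variance, scale2 applies
 * sqrt(var + eps), ew2 divides, and scale3 applies the learned gamma/beta. */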
IScaleLayer* addInstanceNorm2d(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input, const std::string lname, const float eps) {
int len = weightMap[lname + ".weight"].count;
IReduceLayer* reduce1 = network->addReduce(input,
ReduceOperation::kAVG,
6,
true);
TRTASSERT(reduce1);
IElementWiseLayer* ew1 = network->addElementWise(input,
*reduce1->getOutput(0),
ElementWiseOperation::kSUB);
TRTASSERT(ew1);
const static float pval1[3]{0.0, 1.0, 2.0};
Weights wshift1{DataType::kFLOAT, pval1, 1};
Weights wscale1{DataType::kFLOAT, pval1+1, 1};
Weights wpower1{DataType::kFLOAT, pval1+2, 1};
IScaleLayer* scale1 = network->addScale(
*ew1->getOutput(0),
ScaleMode::kUNIFORM,
wshift1,
wscale1,
wpower1);
TRTASSERT(scale1);
IReduceLayer* reduce2 = network->addReduce(
*scale1->getOutput(0),
ReduceOperation::kAVG,
6,
true);
TRTASSERT(reduce2);
/* Note: not `static`, so a later call with a different eps is honored; the
 * buffer must outlive the engine build and is tracked in weightMap for cleanup. */
float* pval2 = reinterpret_cast<float*>(malloc(sizeof(float) * 3));
pval2[0] = eps; pval2[1] = 1.0f; pval2[2] = 0.5f;
weightMap[lname + ".var_eps"] = Weights{DataType::kFLOAT, pval2, 3};
Weights wshift2{DataType::kFLOAT, pval2, 1};
Weights wscale2{DataType::kFLOAT, pval2+1, 1};
Weights wpower2{DataType::kFLOAT, pval2+2, 1};
IScaleLayer* scale2 = network->addScale(
*reduce2->getOutput(0),
ScaleMode::kUNIFORM,
wshift2,
wscale2,
wpower2);
TRTASSERT(scale2);
IElementWiseLayer* ew2 = network->addElementWise(*ew1->getOutput(0),
*scale2->getOutput(0),
ElementWiseOperation::kDIV);
TRTASSERT(ew2);
float* pval3 = reinterpret_cast<float*>(malloc(sizeof(float) * len));
std::fill_n(pval3, len, 1.0);
Weights wpower3{DataType::kFLOAT, pval3, len};
weightMap[lname + ".power3"] = wpower3;
IScaleLayer* scale3 = network->addScale(
*ew2->getOutput(0),
ScaleMode::kCHANNEL,
weightMap[lname + ".bias"],
weightMap[lname + ".weight"],
wpower3);
TRTASSERT(scale3);
return scale3;
}
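/* IBN-a block: split the channels in half, run InstanceNorm on the first
 * half and BatchNorm on the second, then concatenate the results. */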
IConcatenationLayer* addIBN(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input, const std::string lname) {
Dims splitDims = input.getDimensions();
ISliceLayer *split1 = network->addSlice(input,
Dims3{0, 0, 0},
Dims3{splitDims.d[0]/2, splitDims.d[1], splitDims.d[2]},
Dims3{1, 1, 1});
TRTASSERT(split1);
ISliceLayer *split2 = network->addSlice(input,
Dims3{splitDims.d[0]/2, 0, 0},
Dims3{splitDims.d[0]/2, splitDims.d[1], splitDims.d[2]},
Dims3{1, 1, 1});
TRTASSERT(split2);
auto in1 = addInstanceNorm2d(network, weightMap, *split1->getOutput(0), lname + "IN", 1e-5);
auto bn1 = addBatchNorm2d(network, weightMap, *split2->getOutput(0), lname + "BN", 1e-5);
ITensor* tensor1[] = {in1->getOutput(0), bn1->getOutput(0)};
auto cat1 = network->addConcatenation(tensor1, 2);
TRTASSERT(cat1);
return cat1;
}
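/* ResNet BasicBlock (3x3 -> 3x3) with optional IBN:
 * ibn == "a" swaps bn1 for the IBN block above;
 * ibn == "b" adds an InstanceNorm after the residual sum, before the ReLU. */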
IActivationLayer* basicBlock_ibn(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, const int inch, const int outch, const int stride, const std::string lname, const std::string ibn) {
Weights emptywts{DataType::kFLOAT, nullptr, 0};
IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{3, 3}, weightMap[lname + "conv1.weight"], emptywts);
TRTASSERT(conv1);
conv1->setStrideNd(DimsHW{stride, stride});
conv1->setPaddingNd(DimsHW{1, 1});
ILayer* bn1{conv1};
if (ibn == "a") {
bn1 = addIBN(network, weightMap, *conv1->getOutput(0), lname + "bn1.");
} else {
bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + "bn1", 1e-5);
}
IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU);
TRTASSERT(relu1);
IConvolutionLayer* conv2 = network->addConvolutionNd(*relu1->getOutput(0), outch, DimsHW{3, 3}, weightMap[lname + "conv2.weight"], emptywts);
TRTASSERT(conv2);
conv2->setPaddingNd(DimsHW{1, 1});
IScaleLayer* bn2 = addBatchNorm2d(network, weightMap, *conv2->getOutput(0), lname + "bn2", 1e-5);
IElementWiseLayer* ew1;
if (inch != outch) {
IConvolutionLayer* conv3 = network->addConvolutionNd(input, outch, DimsHW{1, 1}, weightMap[lname + "downsample.0.weight"], emptywts);
TRTASSERT(conv3);
conv3->setStrideNd(DimsHW{stride, stride});
IScaleLayer* bn3 = addBatchNorm2d(network, weightMap, *conv3->getOutput(0), lname + "downsample.1", 1e-5);
ew1 = network->addElementWise(*bn3->getOutput(0), *bn2->getOutput(0), ElementWiseOperation::kSUM);
} else {
ew1 = network->addElementWise(input, *bn2->getOutput(0), ElementWiseOperation::kSUM);
}
ILayer* in1{ew1};
if (ibn == "b") {
in1 = addInstanceNorm2d(network, weightMap, *ew1->getOutput(0), lname + "IN", 1e-5);
}
IActivationLayer* relu2 = network->addActivation(*in1->getOutput(0), ActivationType::kRELU);
TRTASSERT(relu2);
return relu2;
}
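/* ResNet Bottleneck (1x1 -> 3x3 -> 1x1, expansion 4) with the same optional
 * IBN placement as basicBlock_ibn. */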
IActivationLayer* bottleneck_ibn(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input, const int inch, const int outch, const int stride, const std::string lname, const std::string ibn) {
Weights emptywts{DataType::kFLOAT, nullptr, 0};
IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{1, 1}, weightMap[lname + "conv1.weight"], emptywts);
TRTASSERT(conv1);
ILayer* bn1{conv1};
if (ibn == "a") {
bn1 = addIBN(network, weightMap, *conv1->getOutput(0), lname + "bn1.");
} else {
bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + "bn1", 1e-5);
}
IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU);
TRTASSERT(relu1);
IConvolutionLayer* conv2 = network->addConvolutionNd(*relu1->getOutput(0), outch, DimsHW{3, 3}, weightMap[lname + "conv2.weight"], emptywts);
TRTASSERT(conv2);
conv2->setStrideNd(DimsHW{stride, stride});
conv2->setPaddingNd(DimsHW{1, 1});
IScaleLayer* bn2 = addBatchNorm2d(network, weightMap, *conv2->getOutput(0), lname + "bn2", 1e-5);
IActivationLayer* relu2 = network->addActivation(*bn2->getOutput(0), ActivationType::kRELU);
TRTASSERT(relu2);
IConvolutionLayer* conv3 = network->addConvolutionNd(*relu2->getOutput(0), outch * 4, DimsHW{1, 1}, weightMap[lname + "conv3.weight"], emptywts);
TRTASSERT(conv3);
IScaleLayer* bn3 = addBatchNorm2d(network, weightMap, *conv3->getOutput(0), lname + "bn3", 1e-5);
IElementWiseLayer* ew1;
if (stride != 1 || inch != outch * 4) {
IConvolutionLayer* conv4 = network->addConvolutionNd(input, outch * 4, DimsHW{1, 1}, weightMap[lname + "downsample.0.weight"], emptywts);
TRTASSERT(conv4);
conv4->setStrideNd(DimsHW{stride, stride});
IScaleLayer* bn4 = addBatchNorm2d(network, weightMap, *conv4->getOutput(0), lname + "downsample.1", 1e-5);
ew1 = network->addElementWise(*bn4->getOutput(0), *bn3->getOutput(0), ElementWiseOperation::kSUM);
} else {
ew1 = network->addElementWise(input, *bn3->getOutput(0), ElementWiseOperation::kSUM);
}
ILayer* in1{ew1};
if (ibn == "b") {
in1 = addInstanceNorm2d(network, weightMap, *ew1->getOutput(0), lname + "IN", 1e-5);
}
IActivationLayer* relu3 = network->addActivation(*in1->getOutput(0), ActivationType::kRELU);
TRTASSERT(relu3);
return relu3;
}
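/* Pre-activation ("distill") variants of the two blocks above: ReLU is
 * applied to the incoming tensor, and the residual sum is returned without a
 * trailing activation, which the next block supplies. */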
ILayer* distill_basicBlock_ibn(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, const int inch, const int outch, const int stride, const std::string lname, const std::string ibn) {
Weights emptywts{DataType::kFLOAT, nullptr, 0};
IActivationLayer* relu_identity = network->addActivation(input, ActivationType::kRELU);
TRTASSERT(relu_identity);
IConvolutionLayer* conv1 = network->addConvolutionNd(*relu_identity->getOutput(0), outch, DimsHW{3, 3}, weightMap[lname + "conv1.weight"], emptywts);
TRTASSERT(conv1);
conv1->setStrideNd(DimsHW{stride, stride});
conv1->setPaddingNd(DimsHW{1, 1});
ILayer* bn1{conv1};
if (ibn == "a") {
bn1 = addIBN(network, weightMap, *conv1->getOutput(0), lname + "bn1.");
} else {
bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + "bn1", 1e-5);
}
IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU);
TRTASSERT(relu1);
IConvolutionLayer* conv2 = network->addConvolutionNd(*relu1->getOutput(0), outch, DimsHW{3, 3}, weightMap[lname + "conv2.weight"], emptywts);
TRTASSERT(conv2);
conv2->setPaddingNd(DimsHW{1, 1});
IScaleLayer* bn2 = addBatchNorm2d(network, weightMap, *conv2->getOutput(0), lname + "bn2", 1e-5);
IElementWiseLayer* ew1;
if (inch != outch) {
IConvolutionLayer* conv3 = network->addConvolutionNd(*relu_identity->getOutput(0), outch, DimsHW{1, 1}, weightMap[lname + "downsample.0.weight"], emptywts);
TRTASSERT(conv3);
conv3->setStrideNd(DimsHW{stride, stride});
IScaleLayer* bn3 = addBatchNorm2d(network, weightMap, *conv3->getOutput(0), lname + "downsample.1", 1e-5);
ew1 = network->addElementWise(*bn3->getOutput(0), *bn2->getOutput(0), ElementWiseOperation::kSUM);
} else {
ew1 = network->addElementWise(*relu_identity->getOutput(0), *bn2->getOutput(0), ElementWiseOperation::kSUM);
}
ILayer* in1{ew1};
if (ibn == "b") {
in1 = addInstanceNorm2d(network, weightMap, *ew1->getOutput(0), lname + "IN", 1e-5);
}
return in1;
}
ILayer* distill_bottleneck_ibn(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input, const int inch, const int outch, const int stride, const std::string lname, const std::string ibn) {
Weights emptywts{DataType::kFLOAT, nullptr, 0};
IActivationLayer* relu_identity = network->addActivation(input, ActivationType::kRELU);
TRTASSERT(relu_identity);
IConvolutionLayer* conv1 = network->addConvolutionNd(*relu_identity->getOutput(0), outch, DimsHW{1, 1}, weightMap[lname + "conv1.weight"], emptywts);
TRTASSERT(conv1);
ILayer* bn1{conv1};
if (ibn == "a") {
bn1 = addIBN(network, weightMap, *conv1->getOutput(0), lname + "bn1.");
} else {
bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + "bn1", 1e-5);
}
IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU);
TRTASSERT(relu1);
IConvolutionLayer* conv2 = network->addConvolutionNd(*relu1->getOutput(0), outch, DimsHW{3, 3}, weightMap[lname + "conv2.weight"], emptywts);
TRTASSERT(conv2);
conv2->setStrideNd(DimsHW{stride, stride});
conv2->setPaddingNd(DimsHW{1, 1});
IScaleLayer* bn2 = addBatchNorm2d(network, weightMap, *conv2->getOutput(0), lname + "bn2", 1e-5);
IActivationLayer* relu2 = network->addActivation(*bn2->getOutput(0), ActivationType::kRELU);
TRTASSERT(relu2);
IConvolutionLayer* conv3 = network->addConvolutionNd(*relu2->getOutput(0), outch * 4, DimsHW{1, 1}, weightMap[lname + "conv3.weight"], emptywts);
TRTASSERT(conv3);
IScaleLayer* bn3 = addBatchNorm2d(network, weightMap, *conv3->getOutput(0), lname + "bn3", 1e-5);
IElementWiseLayer* ew1;
if (stride != 1 || inch != outch * 4) {
IConvolutionLayer* conv4 = network->addConvolutionNd(*relu_identity->getOutput(0), outch * 4, DimsHW{1, 1}, weightMap[lname + "downsample.0.weight"], emptywts);
TRTASSERT(conv4);
conv4->setStrideNd(DimsHW{stride, stride});
IScaleLayer* bn4 = addBatchNorm2d(network, weightMap, *conv4->getOutput(0), lname + "downsample.1", 1e-5);
ew1 = network->addElementWise(*bn4->getOutput(0), *bn3->getOutput(0), ElementWiseOperation::kSUM);
} else {
ew1 = network->addElementWise(*relu_identity->getOutput(0), *bn3->getOutput(0), ElementWiseOperation::kSUM);
}
ILayer* in1{ew1};
if (ibn == "b") {
in1 = addInstanceNorm2d(network, weightMap, *ew1->getOutput(0), lname + "IN", 1e-5);
}
return in1;
}
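/* Reshape/transpose helper: reshape_first chooses reshape -> transpose,
 * otherwise transpose -> reshape. */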
IShuffleLayer* addShuffle2(INetworkDefinition* network, ITensor& input, const Dims dims, const Permutation pmt, const bool reshape_first) {
IShuffleLayer* shuffleLayer = network->addShuffle(input);
TRTASSERT(shuffleLayer);
if (reshape_first) {
shuffleLayer->setReshapeDimensions(dims);
shuffleLayer->setSecondTranspose(pmt);
} else {
shuffleLayer->setFirstTranspose(pmt);
shuffleLayer->setReshapeDimensions(dims);
}
return shuffleLayer;
}
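/* Non-local (dot-product attention) block: theta/phi/g are 1x1 convolutions;
 * f = theta^T * phi forms an (HW x HW) affinity matrix, normalized by N = HW
 * (the f_div_C step); y = (f/N) * g is reshaped back to CHW, projected by the
 * W conv + BN, and added to the input as a residual. */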
IElementWiseLayer* Non_local(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input, const std::string lname, const int reduc_ratio) {
int in_channel = input.getDimensions().d[0];
/* Hint: fast-reid used "in_channel / reduc_ratio" between Sep 10, 2020 and Dec 7, 2020 */
//int inter_channels = in_channel / reduc_ratio;
int inter_channels = 1;
std::cout << "[Non_local] inter_channels: " << inter_channels << std::endl;
IConvolutionLayer* g = network->addConvolutionNd(input, inter_channels, DimsHW{1, 1}, weightMap[ lname + "g.weight"], weightMap[lname + "g.bias"]);
TRTASSERT(g);
auto g_permute = addShuffle2(network, *g->getOutput(0), Dims2{g->getOutput(0)->getDimensions().d[0], -1}, Permutation{1, 0}, true);
IConvolutionLayer* theta = network->addConvolutionNd(input, inter_channels, DimsHW{1, 1}, weightMap[lname + "theta.weight"], weightMap[lname + "theta.bias"]);
TRTASSERT(theta);
auto theta_permute = addShuffle2(network, *theta->getOutput(0), Dims2{theta->getOutput(0)->getDimensions().d[0], -1}, Permutation{1, 0}, true);
IConvolutionLayer* phi = network->addConvolutionNd(input, inter_channels, DimsHW{1, 1}, weightMap[lname + "phi.weight"], weightMap[lname + "phi.bias"]);
TRTASSERT(phi);
IShuffleLayer* phi_view = network->addShuffle(*phi->getOutput(0));
TRTASSERT(phi_view);
phi_view->setReshapeDimensions(Dims2{phi->getOutput(0)->getDimensions().d[0], -1});
IMatrixMultiplyLayer *f = network->addMatrixMultiply(*theta_permute->getOutput(0), MatrixOperation::kNONE, *phi_view->getOutput(0), MatrixOperation::kNONE);
int N = f->getOutput(0)->getDimensions().d[f->getOutput(0)->getDimensions().nbDims-1];
float* pval = reinterpret_cast<float*>(malloc(sizeof(float) * N * N));
std::fill_n(pval, N*N, N);
Weights dem{DataType::kFLOAT, pval, N*N};
weightMap[lname + ".dem"] = dem;
auto dem_n = network->addConstant(Dims2(N, N), dem);
IElementWiseLayer* f_div_C = network->addElementWise(*f->getOutput(0),
*dem_n->getOutput(0),
ElementWiseOperation::kDIV);
TRTASSERT(f_div_C);
IMatrixMultiplyLayer *y = network->addMatrixMultiply(*f_div_C->getOutput(0), MatrixOperation::kNONE, *g_permute->getOutput(0), MatrixOperation::kNONE);
IShuffleLayer* y_permute = addShuffle2(network, *y->getOutput(0), Dims3{inter_channels, input.getDimensions().d[1], input.getDimensions().d[2]}, Permutation{1, 0}, false);
TRTASSERT(y_permute);
IConvolutionLayer* w_conv = network->addConvolutionNd(*y_permute->getOutput(0), in_channel, DimsHW{1, 1}, weightMap[lname + "W.0.weight"], weightMap[lname + "W.0.bias"]);
TRTASSERT(w_conv);
IScaleLayer* w_bn = addBatchNorm2d(network, weightMap, *w_conv->getOutput(0), lname + "W.1", 1e-5);
TRTASSERT(w_bn);
// z = W_y + x
IElementWiseLayer* z = network->addElementWise(*w_bn->getOutput(0),
input,
ElementWiseOperation::kSUM);
TRTASSERT(z);
return z;
}
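/* PyTorch-style adaptive average pooling for static input shapes: stride and
 * kernel size are derived from the input/output ratio (formulas below). */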
IPoolingLayer* addAdaptiveAvgPool2d(INetworkDefinition* network, ITensor& input, const DimsHW output_dim) {
Dims input_dims = input.getDimensions();
TRTASSERT((input_dims.nbDims == 3));
// stride_dim = floor(input_dim/output_dim)
DimsHW stride_dims{(int)(input_dims.d[1]/output_dim.h()),
(int)(input_dims.d[2]/output_dim.w())};
// kernel_dims = input_dim -(output_dim-1)*stride_dim
DimsHW kernel_dims{input_dims.d[1] - (output_dim.h()-1) * stride_dims.h(),
input_dims.d[2] - (output_dim.w()-1) * stride_dims.w()};
IPoolingLayer* avgpool = network->addPoolingNd(input, PoolingType::kAVERAGE, kernel_dims);
TRTASSERT(avgpool);
avgpool->setStrideNd(stride_dims);
return avgpool;
}
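/* Generalized-mean (GeM) pooling: y = (avgpool(clamp(x, eps)^p))^(1/p),
 * with p = norm. p = 1 recovers average pooling; large p approaches max
 * pooling. */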
IScaleLayer* addGeneralizedMeanPooling(INetworkDefinition* network, ITensor& input, const float norm, const DimsHW output_dim, const float eps) {
TRTASSERT((norm > 0.f));
// x = x.clamp(min=eps)
IActivationLayer* clamp1 = addMinClamp(network, input, eps);
// (x)^norm
/* Avoid `static` here: a later call with a different norm would otherwise
 * silently reuse the first call's coefficients. The small allocation must
 * outlive the engine build and is intentionally not freed. */
float* pval1 = reinterpret_cast<float*>(malloc(sizeof(float) * 3));
pval1[0] = 0.f; pval1[1] = 1.f; pval1[2] = norm;
Weights wshift1{DataType::kFLOAT, pval1, 1};
Weights wscale1{DataType::kFLOAT, pval1+1, 1};
Weights wpower1{DataType::kFLOAT, pval1+2, 1};
IScaleLayer* scale1 = network->addScale(
*clamp1->getOutput(0),
ScaleMode::kUNIFORM,
wshift1,
wscale1,
wpower1);
TRTASSERT(scale1);
IPoolingLayer* ada_avg_pool = addAdaptiveAvgPool2d(network, *scale1->getOutput(0), output_dim);
TRTASSERT(ada_avg_pool);
// (ada_avg_pool)^(1/norm)
/* Same lifetime note as above: allocated per call, not static. */
float* pval2 = reinterpret_cast<float*>(malloc(sizeof(float) * 3));
pval2[0] = 0.f; pval2[1] = 1.f; pval2[2] = 1.f/norm;
Weights wshift2{DataType::kFLOAT, pval2, 1};
Weights wscale2{DataType::kFLOAT, pval2+1, 1};
Weights wpower2{DataType::kFLOAT, pval2+2, 1};
IScaleLayer* scale2 = network->addScale(
*ada_avg_pool->getOutput(0),
ScaleMode::kUNIFORM,
wshift2,
wscale2,
wpower2);
TRTASSERT(scale2);
return scale2;
}
}
#include <iostream>
#include "fastrt/layers.h"
#include "poolingLayerRT.h"
namespace fastrt {
ILayer* MaxPool::addPooling(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input) {
/* Global max pooling over the full HxW plane. addPoolingNd already returns
 * an IPoolingLayer*, so no downcast is needed. */
const DimsHW pool_dims{input.getDimensions().d[1], input.getDimensions().d[2]};
IPoolingLayer* pooling = network->addPoolingNd(input, PoolingType::kMAX, pool_dims);
if (pooling) pooling->setStrideNd(pool_dims);
else std::cout << "addPoolingNd failed." << std::endl;
return pooling;
}
ILayer* AvgPool::addPooling(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input) {
/* Global average pooling over the full HxW plane. */
const DimsHW pool_dims{input.getDimensions().d[1], input.getDimensions().d[2]};
IPoolingLayer* pooling = network->addPoolingNd(input, PoolingType::kAVERAGE, pool_dims);
if (pooling) pooling->setStrideNd(pool_dims);
else std::cout << "addPoolingNd failed." << std::endl;
return pooling;
}
ILayer* GemPool::addPooling(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input) {
return trtxapi::addGeneralizedMeanPooling(network, input);
}
ILayer* GemPoolP::addPooling(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input) {
return trtxapi::addGeneralizedMeanPooling(network, input, *(float*)weightMap["heads.pool_layer.p"].values);
}
}
#include "NvInfer.h"
#include "fastrt/IPoolingLayerRT.h"
using namespace nvinfer1;
namespace fastrt {
class MaxPool : public IPoolingLayerRT {
public:
MaxPool() = default;
~MaxPool() = default;
ILayer* addPooling(INetworkDefinition *network,
std::map<std::string, Weights>& weightMap,
ITensor& input) override;
};
class AvgPool : public IPoolingLayerRT {
public:
AvgPool() = default;
~AvgPool() = default;
ILayer* addPooling(INetworkDefinition *network,
std::map<std::string, Weights>& weightMap,
ITensor& input) override;
};
class GemPool : public IPoolingLayerRT {
public:
GemPool() = default;
~GemPool() = default;
ILayer* addPooling(INetworkDefinition *network,
std::map<std::string, Weights>& weightMap,
ITensor& input) override;
};
class GemPoolP : public IPoolingLayerRT {
public:
GemPoolP() = default;
~GemPoolP() = default;
ILayer* addPooling(INetworkDefinition *network,
std::map<std::string, Weights>& weightMap,
ITensor& input) override;
};
}
target_sources(${PROJECT_NAME}
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/model.cpp
${CMAKE_CURRENT_SOURCE_DIR}/baseline.cpp
)
#include "fastrt/layers.h"
#include "fastrt/baseline.h"
namespace fastrt {
Baseline::Baseline(const trt::ModelConfig &modelcfg, const std::string input_name, const std::string output_name)
: Model(modelcfg, input_name, output_name) {}
void Baseline::preprocessing_cpu(const cv::Mat& img, float* const data, const std::size_t stride) {
/* HWC uint8 BGR -> planar CHW float RGB; mean/std standardization happens on the GPU side. Assumes a continuous cv::Mat. */
for (std::size_t i = 0; i < stride; ++i) {
data[i] = img.at<cv::Vec3b>(i)[2];
data[i + stride] = img.at<cv::Vec3b>(i)[1];
data[i + (stride<<1)] = img.at<cv::Vec3b>(i)[0];
}
}
ITensor* Baseline::preprocessing_gpu(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor* input) {
/* Standardization */
static const float mean[3] = {123.675f, 116.28f, 103.53f};
static const float std[3] = {58.395f, 57.12f, 57.375f};
return addMeanStd(network, weightMap, input, "", mean, std, false); // pass true to also divide by 255 first
}
}
#include "fastrt/model.h"
#include "fastrt/calibrator.h"
#ifdef BUILD_INT8
#include "fastrt/config.h"
#endif
namespace fastrt {
Model::Model(const trt::ModelConfig &modelcfg, const std::string input_name, const std::string output_name) {
_engineCfg.weights_path = modelcfg.weights_path;
_engineCfg.max_batch_size = modelcfg.max_batch_size;
_engineCfg.input_h = modelcfg.input_h;
_engineCfg.input_w = modelcfg.input_w;
_engineCfg.output_size = modelcfg.output_size;
_engineCfg.device_id = modelcfg.device_id;
_engineCfg.input_name = input_name;
_engineCfg.output_name = output_name;
_engineCfg.trtModelStream = nullptr;
_engineCfg.stream_size = 0;
}
bool Model::serializeEngine(const std::string engine_file, const std::initializer_list<std::unique_ptr<Module>>& modules) {
/* Create builder */
auto builder = make_holder(createInferBuilder(gLogger));
/* Create model to populate the network, then set the outputs and create an engine */
auto engine = createEngine(builder.get(), modules);
TRTASSERT(engine.get());
/* Serialize the engine */
auto modelStream = make_holder(engine->serialize());
TRTASSERT(modelStream.get());
std::ofstream p(engine_file, std::ios::binary | std::ios::out);
if (!p) {
std::cerr << "could not open plan output file" << std::endl;
return false;
}
p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
std::cout << "[Save serialized engine]: " << engine_file << std::endl;
return true;
}
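/* Build pipeline: define the network, run optional GPU-side preprocessing,
 * chain the given modules in order, mark the final output, apply precision
 * flags, and build. Host-side weight buffers are freed once the engine
 * exists. */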
TensorRTHolder<ICudaEngine> Model::createEngine(IBuilder* builder, const std::initializer_list<std::unique_ptr<Module>>& modules) {
auto network = make_holder(builder->createNetworkV2(0U));
auto config = make_holder(builder->createBuilderConfig());
auto data = network->addInput(_engineCfg.input_name.c_str(), _dt, Dims3{3, _engineCfg.input_h, _engineCfg.input_w});
TRTASSERT(data);
auto weightMap = loadWeights(_engineCfg.weights_path);
/* Preprocessing */
auto input = preprocessing_gpu(network.get(), weightMap, data);
if (!input) input = data;
/* Modeling */
ILayer* output{nullptr};
for(auto& sequential_module: modules) {
output = sequential_module->topology(network.get(), weightMap, *input);
TRTASSERT(output);
input = output->getOutput(0);
}
/* Set output */
output->getOutput(0)->setName(_engineCfg.output_name.c_str());
network->markOutput(*output->getOutput(0));
/* Build engine */
builder->setMaxBatchSize(_engineCfg.max_batch_size);
config->setMaxWorkspaceSize(1 << 20);
#if defined(BUILD_FP16) && defined(BUILD_INT8)
#error "Flag conflict! BUILD_FP16 and BUILD_INT8 can't both be enabled."
#endif
#if defined(BUILD_FP16)
std::cout << "[Build fp16]" << std::endl;
config->setFlag(BuilderFlag::kFP16);
#elif defined(BUILD_INT8)
std::cout << "[Build int8]" << std::endl;
std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
TRTASSERT(builder->platformHasFastInt8());
config->setFlag(BuilderFlag::kINT8);
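/* Note: TensorRT only stores a pointer to the calibrator, so it must stay
 * alive until buildEngineWithConfig returns; it is not freed here. */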
Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, _engineCfg.input_w, _engineCfg.input_h,
INT8_CALIBRATE_DATASET_PATH.c_str(), "int8calib.table", _engineCfg.input_name.c_str());
config->setInt8Calibrator(calibrator);
#endif
auto engine = make_holder(builder->buildEngineWithConfig(*network, *config));
std::cout << "[TRT engine build out]" << std::endl;
for (auto& mem : weightMap) {
free((void*) (mem.second.values));
}
return engine;
}
bool Model::deserializeEngine(const std::string engine_file) {
std::ifstream file(engine_file, std::ios::binary | std::ios::in);
if (file.good()) {
file.seekg(0, file.end);
_engineCfg.stream_size = file.tellg();
file.seekg(0, file.beg);
_engineCfg.trtModelStream = std::shared_ptr<char>( new char[_engineCfg.stream_size], []( char* ptr ){ delete [] ptr; } );
TRTASSERT(_engineCfg.trtModelStream.get());
file.read(_engineCfg.trtModelStream.get(), _engineCfg.stream_size);
file.close();
_inferEngine = make_unique<trt::InferenceEngine>(_engineCfg);
return true;
}
return false;
}
bool Model::inference(std::vector<cv::Mat> &input) {
if (_inferEngine != nullptr) {
const std::size_t stride = _engineCfg.input_h * _engineCfg.input_w;
return _inferEngine.get()->doInference(input.size(),
[&](float* data) {
for(const auto &img : input) {
preprocessing_cpu(img, data, stride);
data += 3 * stride;
}
}
);
} else {
return false;
}
}
float* Model::getOutput() {
if(_inferEngine != nullptr)
return _inferEngine.get()->getOutput();
return nullptr;
}
int Model::getOutputSize() {
return _engineCfg.output_size;
}
int Model::getDeviceID() {
return _engineCfg.device_id;
}
}
#pragma once
#include <map>
#include "struct.h"
#include "NvInfer.h"
using namespace nvinfer1;
namespace fastrt {
class IPoolingLayerRT {
public:
IPoolingLayerRT() = default;
virtual ~IPoolingLayerRT() = default;
virtual ILayer* addPooling(INetworkDefinition *network,
std::map<std::string, Weights>& weightMap,
ITensor& input) = 0;
};
}
/************************************************************************************
* Handle memory pre-alloc both on host(pinned memory, allow CUDA DMA) & device
* Author: Darren Hsieh
* Date: 2020/07/07
*************************************************************************************/
#pragma once
#include <thread>
#include <chrono>
#include <memory>
#include <functional>
#include <opencv2/opencv.hpp>
#include "utils.h"
#include "struct.h"
#include "holder.h"
#include "logging.h"
#include "NvInfer.h"
#include "cuda_runtime_api.h"
static Logger gLogger;
namespace trt {
class InferenceEngine {
public:
InferenceEngine(const EngineConfig &enginecfg);
InferenceEngine(InferenceEngine &&other) noexcept;
~InferenceEngine();
InferenceEngine(const InferenceEngine &) = delete;
InferenceEngine& operator=(const InferenceEngine &) = delete;
InferenceEngine& operator=(InferenceEngine && other) = delete;
bool doInference(const int inference_batch_size, std::function<void(float*)> preprocessing);
float* getOutput() { return _output; }
std::thread::id getThreadID() { return std::this_thread::get_id(); }
private:
EngineConfig _engineCfg;
float* _input{nullptr};
float* _output{nullptr};
// Pointers to input and output device buffers to pass to engine.
// Engine requires exactly IEngine::getNbBindings() number of buffers.
void* _buffers[2];
// In order to bind the buffers, we need to know the names of the input and output tensors.
// Note that indices are guaranteed to be less than IEngine::getNbBindings()
int _inputIndex;
int _outputIndex;
int _inputSize;
int _outputSize;
static constexpr std::size_t _depth{sizeof(float)};
TensorRTHolder<nvinfer1::IRuntime> _runtime{nullptr};
TensorRTHolder<nvinfer1::ICudaEngine> _engine{nullptr};
TensorRTHolder<nvinfer1::IExecutionContext> _context{nullptr};
std::shared_ptr<cudaStream_t> _streamptr;
};
}
#pragma once
#include "model.h"
#include "struct.h"
#include <memory>
#include <opencv2/opencv.hpp>
using namespace trtxapi;
namespace fastrt {
class Baseline : public Model {
public:
Baseline(const trt::ModelConfig &modelcfg,
const std::string input_name = "data",
const std::string output_name = "reid_embd");
~Baseline() = default;
private:
void preprocessing_cpu(const cv::Mat& img, float* const data, const std::size_t stride);
ITensor* preprocessing_gpu(INetworkDefinition* network,
std::map<std::string, Weights>& weightMap,
ITensor* input);
};
}
#ifndef ENTROPY_CALIBRATOR_H
#define ENTROPY_CALIBRATOR_H
#include "NvInfer.h"
#include <string>
#include <vector>
//! \class Int8EntropyCalibrator2
//!
//! \brief Implements Entropy calibrator 2.
//! CalibrationAlgoType is kENTROPY_CALIBRATION_2.
//!
class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2
{
public:
Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache = true);
virtual ~Int8EntropyCalibrator2();
int getBatchSize() const override;
bool getBatch(void* bindings[], const char* names[], int nbBindings) override;
const void* readCalibrationCache(size_t& length) override;
void writeCalibrationCache(const void* cache, size_t length) override;
private:
int batchsize_;
int input_w_;
int input_h_;
int img_idx_;
std::string img_dir_;
std::vector<std::string> img_files_;
size_t input_count_;
std::string calib_table_name_;
const char* input_blob_name_;
bool read_cache_;
void* device_input_;
std::vector<char> calib_cache_;
};
#endif // ENTROPY_CALIBRATOR_H
#pragma once
#ifdef BUILD_INT8
#include <string>
const std::string INT8_CALIBRATE_DATASET_PATH = "@INT8_CALIBRATE_DATASET_PATH@";
#endif
#ifndef TRTX_CUDA_UTILS_H_
#define TRTX_CUDA_UTILS_H_
#include <cuda_runtime_api.h>
#include <cassert>
#include <iostream>
#ifndef CUDA_CHECK
#define CUDA_CHECK(callstr)\
{\
cudaError_t error_code = callstr;\
if (error_code != cudaSuccess) {\
std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__;\
assert(0);\
}\
}
#endif // CUDA_CHECK
#endif // TRTX_CUDA_UTILS_H_
#pragma once
#include <map>
#include "NvInfer.h"
#include "fastrt/module.h"
#include "fastrt/struct.h"
#include "fastrt/factory.h"
using namespace nvinfer1;
namespace fastrt {
class embedding_head : public Module {
private:
FastreidConfig& _modelCfg;
std::unique_ptr<LayerFactory> _layerFactory;
public:
embedding_head(FastreidConfig& modelCfg);
embedding_head(FastreidConfig& modelCfg, std::unique_ptr<LayerFactory> layerFactory);
~embedding_head() = default;
ILayer* topology(INetworkDefinition *network,
std::map<std::string, Weights>& weightMap,
ITensor& input) override;
};
}
#pragma once
#include "struct.h"
#include "module.h"
#include "IPoolingLayerRT.h"
namespace fastrt {
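/* Factories translate FastreidConfig enums into concrete Module and
 * IPoolingLayerRT implementations, keeping model assembly config-driven. */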
class ModuleFactory {
public:
ModuleFactory() = default;
~ModuleFactory() = default;
std::unique_ptr<Module> createBackbone(FastreidConfig& modelCfg);
std::unique_ptr<Module> createHead(FastreidConfig& modelCfg);
};
class LayerFactory {
public:
LayerFactory() = default;
~LayerFactory() = default;
std::unique_ptr<IPoolingLayerRT> createPoolingLayer(const FastreidPoolingType& pooltype);
};
}
#pragma once
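/* RAII wrapper for TensorRT objects that are released via destroy()
 * (the pre-TensorRT-8 ownership model); movable, non-copyable. */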
template <typename T>
class TensorRTHolder {
T* holder;
public:
explicit TensorRTHolder(T* holder_) : holder(holder_) {}
~TensorRTHolder() {
if (holder)
holder->destroy();
}
TensorRTHolder(const TensorRTHolder&) = delete;
TensorRTHolder& operator=(const TensorRTHolder&) = delete;
TensorRTHolder(TensorRTHolder&& rhs) noexcept : holder(rhs.holder) {
rhs.holder = nullptr;
}
TensorRTHolder& operator=(TensorRTHolder&& rhs) noexcept {
if (this == &rhs) {
return *this;
}
if (holder) holder->destroy();
holder = rhs.holder;
rhs.holder = nullptr;
return *this;
}
T* operator->() {
return holder;
}
T* get() { return holder; }
explicit operator bool() { return holder != nullptr; }
T& operator*() noexcept { return *holder; }
};
template <typename T>
TensorRTHolder<T> make_holder(T* holder) {
return TensorRTHolder<T>(holder);
}
template <typename T>
using TensorRTNonHolder = T*;
#pragma once
#include <map>
#include <math.h>
#include <assert.h>
#include "NvInfer.h"
#include "cuda_runtime_api.h"
using namespace nvinfer1;
namespace trtxapi {
IActivationLayer* addMinClamp(INetworkDefinition* network,
ITensor& input,
const float min);
ITensor* addDiv255(INetworkDefinition* network,
std::map<std::string, Weights>& weightMap,
ITensor* input,
const std::string lname);
ITensor* addMeanStd(INetworkDefinition* network,
std::map<std::string, Weights>& weightMap,
ITensor* input,
const std::string lname,
const float* mean,
const float* std,
const bool div255);
IScaleLayer* addBatchNorm2d(INetworkDefinition* network,
std::map<std::string, Weights>& weightMap,
ITensor& input,
const std::string lname,
const float eps);
IScaleLayer* addInstanceNorm2d(INetworkDefinition* network,
std::map<std::string, Weights>& weightMap,
ITensor& input,
const std::string lname,
const float eps);
IConcatenationLayer* addIBN(INetworkDefinition* network,
std::map<std::string, Weights>& weightMap,
ITensor& input,
const std::string lname);
IActivationLayer* basicBlock_ibn(INetworkDefinition* network,
std::map<std::string, Weights>& weightMap,
ITensor& input,
const int inch,
const int outch,
const int stride,
const std::string lname,
const std::string ibn);
IActivationLayer* bottleneck_ibn(INetworkDefinition* network,
std::map<std::string, Weights>& weightMap,
ITensor& input,
const int inch,
const int outch,
const int stride,
const std::string lname,
const std::string ibn);
ILayer* distill_basicBlock_ibn(INetworkDefinition* network,
std::map<std::string, Weights>& weightMap,
ITensor& input,
const int inch,
const int outch,
const int stride,
const std::string lname,
const std::string ibn);
ILayer* distill_bottleneck_ibn(INetworkDefinition* network,
std::map<std::string, Weights>& weightMap,
ITensor& input,
const int inch,
const int outch,
const int stride,
const std::string lname,
const std::string ibn);
IShuffleLayer* addShuffle2(INetworkDefinition* network,
ITensor& input,
const Dims dims,
const Permutation pmt,
const bool reshape_first);
IElementWiseLayer* Non_local(INetworkDefinition* network,
std::map<std::string, Weights>& weightMap,
ITensor& input,
const std::string lname,
const int reduc_ratio = 2);
IPoolingLayer* addAdaptiveAvgPool2d(INetworkDefinition* network,
ITensor& input,
const DimsHW output_dim = DimsHW{1,1});
IScaleLayer* addGeneralizedMeanPooling(INetworkDefinition* network,
ITensor& input,
const float norm = 3.f,
const DimsHW output_dim = DimsHW{1,1},
const float eps = 1e-6);
}
/*
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H
#include "NvInferRuntimeCommon.h"
#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>
using Severity = nvinfer1::ILogger::Severity;
class LogStreamConsumerBuffer : public std::stringbuf
{
public:
LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
: mOutput(stream)
, mPrefix(prefix)
, mShouldLog(shouldLog)
{
}
LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
: mOutput(other.mOutput)
{
}
~LogStreamConsumerBuffer()
{
// std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
// std::streambuf::pptr() gives a pointer to the current position of the output sequence
// if the pointer to the beginning is not equal to the pointer to the current position,
// call putOutput() to log the output to the stream
if (pbase() != pptr())
{
putOutput();
}
}
// synchronizes the stream buffer and returns 0 on success
// synchronizing the stream buffer consists of inserting the buffer contents into the stream,
// resetting the buffer and flushing the stream
virtual int sync()
{
putOutput();
return 0;
}
void putOutput()
{
if (mShouldLog)
{
// prepend timestamp
std::time_t timestamp = std::time(nullptr);
tm* tm_local = std::localtime(&timestamp);
std::cout << "[";
std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/";
std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/";
std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-";
std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":";
std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":";
std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] ";
// std::stringbuf::str() gets the string contents of the buffer
// insert the buffer contents pre-appended by the appropriate prefix into the stream
mOutput << mPrefix << str();
// set the buffer to empty
str("");
// flush the stream
mOutput.flush();
}
}
void setShouldLog(bool shouldLog)
{
mShouldLog = shouldLog;
}
private:
std::ostream& mOutput;
std::string mPrefix;
bool mShouldLog;
};
//!
//! \class LogStreamConsumerBase
//! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer
//!
class LogStreamConsumerBase
{
public:
LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
: mBuffer(stream, prefix, shouldLog)
{
}
protected:
LogStreamConsumerBuffer mBuffer;
};
//!
//! \class LogStreamConsumer
//! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages.
//! Order of base classes is LogStreamConsumerBase and then std::ostream.
//! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field
//! in LogStreamConsumer and then the address of the buffer is passed to std::ostream.
//! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream.
//! Please do not change the order of the parent classes.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream
{
public:
//! \brief Creates a LogStreamConsumer which logs messages with level severity.
//! Reportable severity determines if the messages are severe enough to be logged.
LogStreamConsumer(Severity reportableSeverity, Severity severity)
: LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity)
, std::ostream(&mBuffer) // links the stream buffer with the stream
, mShouldLog(severity <= reportableSeverity)
, mSeverity(severity)
{
}
LogStreamConsumer(LogStreamConsumer&& other)
: LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog)
, std::ostream(&mBuffer) // links the stream buffer with the stream
, mShouldLog(other.mShouldLog)
, mSeverity(other.mSeverity)
{
}
void setReportableSeverity(Severity reportableSeverity)
{
mShouldLog = mSeverity <= reportableSeverity;
mBuffer.setShouldLog(mShouldLog);
}
private:
static std::ostream& severityOstream(Severity severity)
{
return severity >= Severity::kINFO ? std::cout : std::cerr;
}
static std::string severityPrefix(Severity severity)
{
switch (severity)
{
case Severity::kINTERNAL_ERROR: return "[F] ";
case Severity::kERROR: return "[E] ";
case Severity::kWARNING: return "[W] ";
case Severity::kINFO: return "[I] ";
case Severity::kVERBOSE: return "[V] ";
default: assert(0); return "";
}
}
bool mShouldLog;
Severity mSeverity;
};
//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.
class Logger : public nvinfer1::ILogger
{
public:
Logger(Severity severity = Severity::kWARNING)
: mReportableSeverity(severity)
{
}
//!
//! \enum TestResult
//! \brief Represents the state of a given test
//!
enum class TestResult
{
kRUNNING, //!< The test is running
kPASSED, //!< The test passed
kFAILED, //!< The test failed
kWAIVED //!< The test was waived
};
//!
//! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger
//! \return The nvinfer1::ILogger associated with this Logger
//!
//! TODO Once all samples are updated to use this method to register the logger with TensorRT,
//! we can eliminate the inheritance of Logger from ILogger
//!
nvinfer1::ILogger& getTRTLogger()
{
return *this;
}
//!
//! \brief Implementation of the nvinfer1::ILogger::log() virtual method
//!
//! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
//! inheritance from nvinfer1::ILogger
//!
void log(Severity severity, const char* msg) override
{
LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl;
}
//!
//! \brief Method for controlling the verbosity of logging output
//!
//! \param severity The logger will only emit messages that have severity of this level or higher.
//!
void setReportableSeverity(Severity severity)
{
mReportableSeverity = severity;
}
//!
//! \brief Opaque handle that holds logging information for a particular test
//!
//! This object is an opaque handle to information used by the Logger to print test results.
//! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
//! with Logger::reportTest{Start,End}().
//!
class TestAtom
{
public:
TestAtom(TestAtom&&) = default;
private:
friend class Logger;
TestAtom(bool started, const std::string& name, const std::string& cmdline)
: mStarted(started)
, mName(name)
, mCmdline(cmdline)
{
}
bool mStarted;
std::string mName;
std::string mCmdline;
};
//!
//! \brief Define a test for logging
//!
//! \param[in] name The name of the test. This should be a string starting with
//! "TensorRT" and containing dot-separated strings containing
//! the characters [A-Za-z0-9_].
//! For example, "TensorRT.sample_googlenet"
//! \param[in] cmdline The command line used to reproduce the test
//
//! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
//!
static TestAtom defineTest(const std::string& name, const std::string& cmdline)
{
return TestAtom(false, name, cmdline);
}
//!
//! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
//! as input
//!
//! \param[in] name The name of the test
//! \param[in] argc The number of command-line arguments
//! \param[in] argv The array of command-line arguments (given as C strings)
//!
//! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
static TestAtom defineTest(const std::string& name, int argc, char const* const* argv)
{
auto cmdline = genCmdlineString(argc, argv);
return defineTest(name, cmdline);
}
//!
//! \brief Report that a test has started.
//!
//! \pre reportTestStart() has not been called yet for the given testAtom
//!
//! \param[in] testAtom The handle to the test that has started
//!
static void reportTestStart(TestAtom& testAtom)
{
reportTestResult(testAtom, TestResult::kRUNNING);
assert(!testAtom.mStarted);
testAtom.mStarted = true;
}
//!
//! \brief Report that a test has ended.
//!
//! \pre reportTestStart() has been called for the given testAtom
//!
//! \param[in] testAtom The handle to the test that has ended
//! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
//! TestResult::kFAILED, TestResult::kWAIVED
//!
static void reportTestEnd(const TestAtom& testAtom, TestResult result)
{
assert(result != TestResult::kRUNNING);
assert(testAtom.mStarted);
reportTestResult(testAtom, result);
}
static int reportPass(const TestAtom& testAtom)
{
reportTestEnd(testAtom, TestResult::kPASSED);
return EXIT_SUCCESS;
}
static int reportFail(const TestAtom& testAtom)
{
reportTestEnd(testAtom, TestResult::kFAILED);
return EXIT_FAILURE;
}
static int reportWaive(const TestAtom& testAtom)
{
reportTestEnd(testAtom, TestResult::kWAIVED);
return EXIT_SUCCESS;
}
static int reportTest(const TestAtom& testAtom, bool pass)
{
return pass ? reportPass(testAtom) : reportFail(testAtom);
}
Severity getReportableSeverity() const
{
return mReportableSeverity;
}
private:
//!
//! \brief returns an appropriate string for prefixing a log message with the given severity
//!
static const char* severityPrefix(Severity severity)
{
switch (severity)
{
case Severity::kINTERNAL_ERROR: return "[F] ";
case Severity::kERROR: return "[E] ";
case Severity::kWARNING: return "[W] ";
case Severity::kINFO: return "[I] ";
case Severity::kVERBOSE: return "[V] ";
default: assert(0); return "";
}
}
//!
//! \brief returns an appropriate string for prefixing a test result message with the given result
//!
static const char* testResultString(TestResult result)
{
switch (result)
{
case TestResult::kRUNNING: return "RUNNING";
case TestResult::kPASSED: return "PASSED";
case TestResult::kFAILED: return "FAILED";
case TestResult::kWAIVED: return "WAIVED";
default: assert(0); return "";
}
}
//!
//! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
//!
static std::ostream& severityOstream(Severity severity)
{
return severity >= Severity::kINFO ? std::cout : std::cerr;
}
//!
//! \brief method that implements logging test results
//!
static void reportTestResult(const TestAtom& testAtom, TestResult result)
{
severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
<< testAtom.mCmdline << std::endl;
}
//!
//! \brief generate a command line string from the given (argc, argv) values
//!
static std::string genCmdlineString(int argc, char const* const* argv)
{
std::stringstream ss;
for (int i = 0; i < argc; i++)
{
if (i > 0)
ss << " ";
ss << argv[i];
}
return ss.str();
}
Severity mReportableSeverity;
};
namespace
{
//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE
//!
//! Example usage:
//!
//! LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger)
{
return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE);
}
//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO
//!
//! Example usage:
//!
//! LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger)
{
return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO);
}
//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING
//!
//! Example usage:
//!
//! LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger)
{
return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING);
}
//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR
//!
//! Example usage:
//!
//! LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger)
{
return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR);
}
//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR
// ("fatal" severity)
//!
//! Example usage:
//!
//! LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger)
{
return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR);
}
} // anonymous namespace
#endif // TENSORRT_LOGGING_H
#pragma once
#include "module.h"
#include "utils.h"
#include "holder.h"
#include "layers.h"
#include "struct.h"
#include "InferenceEngine.h"
#include <memory>
#include <vector>
#include <opencv2/opencv.hpp>
extern Logger gLogger;
using namespace trt;
using namespace trtxapi;
namespace fastrt {
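/*
 * Typical flow, as a sketch (the `factory` object and config values are
 * assumed from factory.h / struct.h, not shown in use here):
 *   fastrt::Baseline baseline{modelcfg};
 *   baseline.serializeEngine("reid.engine",
 *       {factory.createBackbone(reidCfg), factory.createHead(reidCfg)});
 *   baseline.deserializeEngine("reid.engine");
 *   baseline.inference(imgs);            // imgs: std::vector<cv::Mat>
 *   float* embedding = baseline.getOutput();
 */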
class Model {
public:
Model(const trt::ModelConfig &modelcfg,
const std::string input_name="input",
const std::string output_name="output");
virtual ~Model() = default;
/*
* Serialize TRT Engine
* @engine_file: save serialized engine as engine_file
* @modules: sequential modules (variadic length), e.g. backbone1 + backbone2 + head, backbone + head, or backbone alone
*/
bool serializeEngine(const std::string engine_file,
const std::initializer_list<std::unique_ptr<Module>>& modules);
bool deserializeEngine(const std::string engine_file);
/* Support batch inference */
bool inference(std::vector<cv::Mat> &input);
/*
* Access the memory allocated by cudaMallocHost. (It's on CPU side)
* Use this after each inference.
*/
float* getOutput();
/*
* Output buffer size
*/
int getOutputSize();
/*
* Cuda device id
* You may need this in multi-thread/multi-engine inference
*/
int getDeviceID();
private:
TensorRTHolder<ICudaEngine> createEngine(IBuilder* builder,
const std::initializer_list<std::unique_ptr<Module>>& modules);
virtual void preprocessing_cpu(const cv::Mat& img, float* const data, const std::size_t stride) = 0;
virtual ITensor* preprocessing_gpu(INetworkDefinition* network,
std::map<std::string, Weights>& weightMap,
ITensor* input) { return nullptr; };
private:
DataType _dt{DataType::kFLOAT};
trt::EngineConfig _engineCfg;
std::unique_ptr<trt::InferenceEngine> _inferEngine{nullptr};
};
}
#pragma once
#include <map>
#include "struct.h"
#include "NvInfer.h"
using namespace nvinfer1;
namespace fastrt {
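/* A Module appends one sub-graph (e.g., a backbone or a head) to the network
 * and returns its tail layer, so modules can be chained sequentially. */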
class Module {
public:
Module() = default;
virtual ~Module() = default;
virtual ILayer* topology(INetworkDefinition *network,
std::map<std::string, Weights>& weightMap,
ITensor& input) = 0;
};
}