Commit b6c19984 authored by dengjb's avatar dengjb
Browse files

update

parents
# encoding: utf-8
"""Helper for evaluation on the Labeled Faces in the Wild dataset
"""
# MIT License
#
# Copyright (c) 2016 David Sandberg
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import numpy as np

import sklearn
import sklearn.preprocessing
from scipy import interpolate
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
def calculate_roc(thresholds, embeddings1, embeddings2, actual_issame, nrof_folds=10, pca=0):
    """Compute ROC statistics and accuracy for face verification via K-fold CV.

    For each fold, the threshold with the highest accuracy is selected on the
    training split; TPR/FPR per threshold and accuracy at the best threshold
    are then measured on the test split.

    :param thresholds: 1-D sequence of candidate distance thresholds.
    :param embeddings1: (N, D) embeddings for the first image of each pair.
    :param embeddings2: (N, D) embeddings for the second image of each pair.
    :param actual_issame: length-N ground-truth same/different labels.
    :param nrof_folds: number of cross-validation folds.
    :param pca: if > 0, fit a PCA with this many components on each fold's
        training embeddings before computing distances.
    :return: tuple (tpr, fpr, accuracy, best_thresholds) — tpr/fpr averaged
        over folds per threshold; accuracy and best threshold per fold.
    """
    assert (embeddings1.shape[0] == embeddings2.shape[0])
    assert (embeddings1.shape[1] == embeddings2.shape[1])
    nrof_pairs = min(len(actual_issame), embeddings1.shape[0])
    nrof_thresholds = len(thresholds)
    k_fold = KFold(n_splits=nrof_folds, shuffle=False)
    tprs = np.zeros((nrof_folds, nrof_thresholds))
    fprs = np.zeros((nrof_folds, nrof_thresholds))
    accuracy = np.zeros((nrof_folds))
    best_thresholds = np.zeros((nrof_folds))
    indices = np.arange(nrof_pairs)
    if pca == 0:
        # Without PCA the squared-L2 pair distances are fold-independent,
        # so they are computed once up front.
        diff = np.subtract(embeddings1, embeddings2)
        dist = np.sum(np.square(diff), 1)
    for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)):
        # print('train_set', train_set)
        # print('test_set', test_set)
        if pca > 0:
            # PCA must be refit on each fold's training pairs only, so the
            # distances are recomputed inside the loop in this branch.
            print('doing pca on', fold_idx)
            embed1_train = embeddings1[train_set]
            embed2_train = embeddings2[train_set]
            _embed_train = np.concatenate((embed1_train, embed2_train), axis=0)
            # print(_embed_train.shape)
            pca_model = PCA(n_components=pca)
            pca_model.fit(_embed_train)
            embed1 = pca_model.transform(embeddings1)
            embed2 = pca_model.transform(embeddings2)
            embed1 = sklearn.preprocessing.normalize(embed1)
            embed2 = sklearn.preprocessing.normalize(embed2)
            # print(embed1.shape, embed2.shape)
            diff = np.subtract(embed1, embed2)
            dist = np.sum(np.square(diff), 1)
        # Find the best threshold for the fold
        acc_train = np.zeros((nrof_thresholds))
        for threshold_idx, threshold in enumerate(thresholds):
            _, _, acc_train[threshold_idx] = calculate_accuracy(threshold, dist[train_set], actual_issame[train_set])
        best_threshold_index = np.argmax(acc_train)
        # print('best_threshold_index', best_threshold_index, acc_train[best_threshold_index])
        best_thresholds[fold_idx] = thresholds[best_threshold_index]
        # Evaluate every threshold on the held-out split for the ROC curve.
        for threshold_idx, threshold in enumerate(thresholds):
            tprs[fold_idx, threshold_idx], fprs[fold_idx, threshold_idx], _ = calculate_accuracy(threshold,
                                                                                                 dist[test_set],
                                                                                                 actual_issame[test_set])
        # Fold accuracy is measured at the threshold chosen on the train split.
        _, _, accuracy[fold_idx] = calculate_accuracy(thresholds[best_threshold_index], dist[test_set],
                                                      actual_issame[test_set])
    tpr = np.mean(tprs, 0)
    fpr = np.mean(fprs, 0)
    return tpr, fpr, accuracy, best_thresholds
def calculate_accuracy(threshold, dist, actual_issame):
    """Evaluate the decision rule ``dist < threshold`` against ground truth.

    :param threshold: distance cutoff; pairs closer than this are predicted "same".
    :param dist: 1-D array of pair distances.
    :param actual_issame: boolean array of ground-truth same/different labels.
    :return: tuple (tpr, fpr, accuracy).
    """
    pred_same = np.less(dist, threshold)
    pred_diff = np.logical_not(pred_same)
    actual_diff = np.logical_not(actual_issame)
    # Confusion-matrix counts.
    tp = np.sum(np.logical_and(pred_same, actual_issame))
    fp = np.sum(np.logical_and(pred_same, actual_diff))
    tn = np.sum(np.logical_and(pred_diff, actual_diff))
    fn = np.sum(np.logical_and(pred_diff, actual_issame))
    # Guard the degenerate cases where one class is absent.
    tpr = float(tp) / float(tp + fn) if (tp + fn) else 0
    fpr = float(fp) / float(fp + tn) if (fp + tn) else 0
    acc = float(tp + tn) / dist.size
    return tpr, fpr, acc
def calculate_val(thresholds, embeddings1, embeddings2, actual_issame, far_target, nrof_folds=10):
    """Estimate the validation rate (VAL) at a target false-accept rate (FAR).

    Copy from [insightface](https://github.com/deepinsight/insightface)

    :param thresholds: candidate distance thresholds to sweep.
    :param embeddings1: (N, D) embeddings for the first image of each pair.
    :param embeddings2: (N, D) embeddings for the second image of each pair.
    :param actual_issame: ground-truth same/different labels.
    :param far_target: FAR value at which VAL is reported.
    :param nrof_folds: number of cross-validation folds.
    :return: tuple (mean VAL, std VAL, mean FAR) over the folds.
    """
    assert (embeddings1.shape[0] == embeddings2.shape[0])
    assert (embeddings1.shape[1] == embeddings2.shape[1])
    nrof_pairs = min(len(actual_issame), embeddings1.shape[0])
    splitter = KFold(n_splits=nrof_folds, shuffle=False)
    val = np.zeros(nrof_folds)
    far = np.zeros(nrof_folds)
    # Squared L2 distance per pair (fold-independent).
    dist = np.sum(np.square(np.subtract(embeddings1, embeddings2)), 1)
    indices = np.arange(nrof_pairs)
    for fold_idx, (train_set, test_set) in enumerate(splitter.split(indices)):
        # Sweep thresholds on the train split to find the one whose FAR
        # matches far_target.
        far_train = np.array(
            [calculate_val_far(t, dist[train_set], actual_issame[train_set])[1]
             for t in thresholds])
        if np.max(far_train) >= far_target:
            # Invert the (monotone) FAR-vs-threshold curve by interpolation.
            f = interpolate.interp1d(far_train, thresholds, kind='slinear')
            threshold = f(far_target)
        else:
            threshold = 0.0
        val[fold_idx], far[fold_idx] = calculate_val_far(threshold, dist[test_set], actual_issame[test_set])
    return np.mean(val), np.std(val), np.mean(far)
def calculate_val_far(threshold, dist, actual_issame):
    """Compute VAL (true-accept rate) and FAR (false-accept rate) at a threshold.

    :param threshold: distance cutoff; pairs closer than this are accepted.
    :param dist: 1-D array of pair distances.
    :param actual_issame: boolean array of ground-truth same/different labels.
    :return: tuple (val, far).
    """
    predict_issame = np.less(dist, threshold)
    true_accept = np.sum(np.logical_and(predict_issame, actual_issame))
    false_accept = np.sum(np.logical_and(predict_issame, np.logical_not(actual_issame)))
    n_same = np.sum(actual_issame)
    n_diff = np.sum(np.logical_not(actual_issame))
    # Guard empty classes (mirrors calculate_accuracy): the original raised
    # ZeroDivisionError when a split contained only same or only diff pairs.
    val = float(true_accept) / float(n_same) if n_same else 0
    far = float(false_accept) / float(n_diff) if n_diff else 0
    return val, far
def evaluate(embeddings, actual_issame, nrof_folds=10, pca=0):
    """Run the full verification evaluation over interleaved pair embeddings.

    :param embeddings: (2N, D) array where rows 2k and 2k+1 form pair k.
    :param actual_issame: length-N ground-truth same/different labels.
    :param nrof_folds: number of cross-validation folds.
    :param pca: forwarded to :func:`calculate_roc` (0 disables PCA).
    :return: tuple (tpr, fpr, accuracy, best_thresholds).
    """
    # Sweep thresholds over the usual squared-distance range [0, 4).
    thresholds = np.arange(0, 4, 0.01)
    # De-interleave: even rows vs. odd rows form the pairs.
    first = embeddings[0::2]
    second = embeddings[1::2]
    return calculate_roc(thresholds, first, second,
                         np.asarray(actual_issame), nrof_folds=nrof_folds, pca=pca)
#!/usr/bin/env python
# encoding: utf-8
"""
@author: sherlock
@contact: sherlockliao01@gmail.com
"""
import sys
sys.path.append('.')
from fastreid.config import get_cfg
from fastreid.engine import default_argument_parser, default_setup, launch
from fastreid.utils.checkpoint import Checkpointer
from fastface import *
def setup(args):
    """
    Create configs and perform basic setups.

    :param args: parsed command-line namespace (needs ``config_file`` and ``opts``).
    :return: a frozen fastreid config node.
    """
    cfg = get_cfg()
    # Register the face-recognition-specific config keys before merging.
    add_face_cfg(cfg)
    cfg.merge_from_file(args.config_file)
    # Command-line key/value overrides take precedence over the file.
    cfg.merge_from_list(args.opts)
    cfg.freeze()
    # Standard fastreid setup: logging, output dir, seeding, etc.
    default_setup(cfg, args)
    return cfg
def main(args):
    """Entry point per worker: either evaluate a trained model or train.

    :param args: parsed command-line namespace.
    :return: test results dict in eval-only mode, otherwise the train() result.
    """
    cfg = setup(args)
    if args.eval_only:
        # Evaluation-only path: disable backbone pretraining (weights come
        # from the checkpoint) and run the test loop.
        cfg.defrost()
        cfg.MODEL.BACKBONE.PRETRAIN = False
        model = FaceTrainer.build_model(cfg)
        Checkpointer(model).load(cfg.MODEL.WEIGHTS)  # load trained model
        res = FaceTrainer.test(cfg, model)
        return res
    trainer = FaceTrainer(cfg)
    trainer.resume_or_load(resume=args.resume)
    return trainer.train()
if __name__ == "__main__":
    args = default_argument_parser().parse_args()
    print("Command Line Args:", args)
    # Launch `main` on the requested number of GPUs/machines; `launch`
    # handles (possibly distributed) worker spawning.
    launch(
        main,
        args.num_gpus,
        num_machines=args.num_machines,
        machine_rank=args.machine_rank,
        dist_url=args.dist_url,
        args=(args,),
    )
*.wts
.vscode/
libs/
build/
data/
\ No newline at end of file
cmake_minimum_required(VERSION 2.6)
# Library identity and version. NOTE(review): "LIBARARY" is a misspelling of
# "LIBRARY", but these are user-facing cache variable names — renaming them
# would break existing -D overrides, so they are kept as-is.
set(LIBARARY_NAME "FastRT" CACHE STRING "The Fastreid-tensorrt library name")
set(LIBARARY_VERSION_MAJOR "0")
set(LIBARARY_VERSION_MINOR "0")
set(LIBARARY_VERSION_SINOR "5")
set(LIBARARY_SOVERSION "0")
set(LIBARARY_VERSION "${LIBARARY_VERSION_MAJOR}.${LIBARARY_VERSION_MINOR}.${LIBARARY_VERSION_SINOR}")
project(${LIBARARY_NAME}${LIBARARY_VERSION})
# Global compiler settings: C++11, aggressive optimization, fail fast on errors.
add_definitions(-std=c++11)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/")
set(CMAKE_BUILD_TYPE Release)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_C_LINK_EXECUTABLE ${CMAKE_CXX_LINK_EXECUTABLE})
# option for shared or static
set(TARGET "SHARED" CACHE STRING "SHARED or STATIC" FORCE)
if("${TARGET}" STREQUAL "SHARED")
    # Position-independent code is required for a shared library.
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
    message("Build Engine as shared library")
else()
    message("Build Engine as static library")
endif()
# Feature toggles for the build (precision mode, demo, python bindings, ...).
option(CUDA_USE_STATIC_CUDA_RUNTIME "Use Static CUDA" OFF)
option(BUILD_FASTRT_ENGINE "Build FastRT Engine" ON)
option(BUILD_DEMO "Build DEMO" ON)
option(BUILD_FP16 "Build Engine as FP16" OFF)
option(BUILD_INT8 "Build Engine as INT8" OFF)
option(USE_CNUMPY "Include CNPY libs" OFF)
option(BUILD_PYTHON_INTERFACE "Build Python Interface" OFF)
set(SOLUTION_DIR ${CMAKE_CURRENT_SOURCE_DIR})
message("CMAKE_CURRENT_SOURCE_DIR: " ${SOLUTION_DIR})
if(USE_CNUMPY)
    add_definitions(-DUSE_CNUMPY)
endif()
# Precision selection: INT8 takes priority over FP16; default is FP32.
if(BUILD_INT8)
    add_definitions(-DBUILD_INT8)
    message("Build Engine as INT8")
    # INT8 needs a calibration image set; the path is baked into config.h.
    set(INT8_CALIBRATE_DATASET_PATH "/data/Market-1501-v15.09.15/bounding_box_test/" CACHE STRING "Path to calibrate dataset(end with /)")
    message("INT8_CALIBRATE_DATASET_PATH: " ${INT8_CALIBRATE_DATASET_PATH})
    configure_file(${SOLUTION_DIR}/include/fastrt/config.h.in ${SOLUTION_DIR}/include/fastrt/config.h @ONLY)
elseif(BUILD_FP16)
    add_definitions(-DBUILD_FP16)
    message("Build Engine as FP16")
else()
    message("Build Engine as FP32")
endif()
# Sub-projects: the engine library, the demo executable, and python bindings.
if(BUILD_FASTRT_ENGINE)
    add_subdirectory(fastrt)
    message(STATUS "BUILD_FASTREID_ENGINE: ON")
else()
    message(STATUS "BUILD_FASTREID_ENGINE: OFF")
endif()
if(BUILD_DEMO)
    add_subdirectory(demo)
    message(STATUS "BUILD_DEMO: ON")
else()
    message(STATUS "BUILD_DEMO: OFF")
endif()
if(BUILD_PYTHON_INTERFACE)
    add_subdirectory(pybind_interface)
    message(STATUS "BUILD_PYTHON_INTERFACE: ON")
else()
    message(STATUS "BUILD_PYTHON_INTERFACE: OFF")
endif()
\ No newline at end of file
# C++ FastReID-TensorRT
Implementation of reid model with TensorRT network definition APIs to build the whole network.
So we don't use any parsers here.
### How to Run
1. Generate '.wts' file from pytorch with `model_best.pth`
See [How_to_Generate.md](tools/How_to_Generate.md)
2. Config your model
See [Tensorrt Model Config](#ConfigSection)
3. (Optional) Build <a name="step3"></a>`third party` libs
See [Build third_party section](#third_party)
4. Build <a name="step4"></a>`fastrt` execute file
```
mkdir build
cd build
cmake -DBUILD_FASTRT_ENGINE=ON \
-DBUILD_DEMO=ON \
-DUSE_CNUMPY=ON ..
make
```
5. Run <a name="step5"></a>`fastrt`
put `model_best.wts` into `FastRT/`
```
./demo/fastrt -s // serialize model & save as 'xxx.engine' file
```
```
./demo/fastrt -d // deserialize 'xxx.engine' file and run inference
```
6. Verify the output with pytorch
7. (Optional) Once you verify the result, you can set FP16 for speed up
```
mkdir build
cd build
cmake -DBUILD_FASTRT_ENGINE=ON \
-DBUILD_DEMO=ON \
-DBUILD_FP16=ON ..
make
```
then go to [step 5](#step5)
8. (Optional) You can use INT8 quantization for speed up
Prepare the calibration dataset and set its path via cmake (the path must end with `/`).
```
mkdir build
cd build
cmake -DBUILD_FASTRT_ENGINE=ON \
-DBUILD_DEMO=ON \
-DBUILD_INT8=ON \
-DINT8_CALIBRATE_DATASET_PATH="/data/Market-1501-v15.09.15/bounding_box_test/" ..
make
```
then go to [step 5](#step5)
9. (Optional) Build tensorrt model as shared libs
```
mkdir build
cd build
cmake -DBUILD_FASTRT_ENGINE=ON \
-DBUILD_DEMO=OFF \
-DBUILD_FP16=ON ..
make
make install
```
You should find libs in `FastRT/libs/FastRTEngine/`
Now build your application execute file
```
cmake -DBUILD_FASTRT_ENGINE=OFF -DBUILD_DEMO=ON ..
make
```
then go to [step 5](#step5)
10. (Optional) Build tensorrt model with python interface, then you can use FastRT model in python.
```
mkdir build
cd build
cmake -DBUILD_FASTRT_ENGINE=ON \
-DBUILD_DEMO=ON \
-DBUILD_PYTHON_INTERFACE=ON ..
make
```
You should get a shared library (`.so`) file: `FastRT/build/pybind_interface/ReID.cpython-37m-x86_64-linux-gnu.so`.
Then go to [step 5](#step5) to create engine file.
After that you can import this so file in python, and deserialize engine file to infer in python.
You can find use example in `pybind_interface/test.py` and `pybind_interface/market_benchmark.py`.
```
from PATH_TO_SO_FILE import ReID
model = ReID(GPU_ID)
model.build(PATH_TO_YOUR_ENGINEFILE)
numpy_feature = np.array([model.infer(CV2_FRAME)])
```
* `pybind_interface/test.py` use `pybind_interface/docker/trt7cu100/Dockerfile` (without pytorch installed)
* `pybind_interface/market_benchmark.py` use `pybind_interface/docker/trt7cu102_torch160/Dockerfile` (with pytorch installed)
### <a name="ConfigSection"></a>`Tensorrt Model Config`
Edit `FastRT/demo/inference.cpp`, according to your model config
The config is related to [How_to_Generate.md](tools/How_to_Generate.md)
+ Ex1. `sbs_R50-ibn`
```
static const std::string WEIGHTS_PATH = "../sbs_R50-ibn.wts";
static const std::string ENGINE_PATH = "./sbs_R50-ibn.engine";
static const int MAX_BATCH_SIZE = 4;
static const int INPUT_H = 384;
static const int INPUT_W = 128;
static const int OUTPUT_SIZE = 2048;
static const int DEVICE_ID = 0;
static const FastreidBackboneType BACKBONE = FastreidBackboneType::r50;
static const FastreidHeadType HEAD = FastreidHeadType::EmbeddingHead;
static const FastreidPoolingType HEAD_POOLING = FastreidPoolingType::gempoolP;
static const int LAST_STRIDE = 1;
static const bool WITH_IBNA = true;
static const bool WITH_NL = true;
static const int EMBEDDING_DIM = 0;
```
+ Ex2. `sbs_R50`
```
static const std::string WEIGHTS_PATH = "../sbs_R50.wts";
static const std::string ENGINE_PATH = "./sbs_R50.engine";
static const int MAX_BATCH_SIZE = 4;
static const int INPUT_H = 384;
static const int INPUT_W = 128;
static const int OUTPUT_SIZE = 2048;
static const int DEVICE_ID = 0;
static const FastreidBackboneType BACKBONE = FastreidBackboneType::r50;
static const FastreidHeadType HEAD = FastreidHeadType::EmbeddingHead;
static const FastreidPoolingType HEAD_POOLING = FastreidPoolingType::gempoolP;
static const int LAST_STRIDE = 1;
static const bool WITH_IBNA = false;
static const bool WITH_NL = true;
static const int EMBEDDING_DIM = 0;
```
+ Ex3. `sbs_r34_distill`
```
static const std::string WEIGHTS_PATH = "../sbs_r34_distill.wts";
static const std::string ENGINE_PATH = "./sbs_r34_distill.engine";
static const int MAX_BATCH_SIZE = 4;
static const int INPUT_H = 384;
static const int INPUT_W = 128;
static const int OUTPUT_SIZE = 512;
static const int DEVICE_ID = 0;
static const FastreidBackboneType BACKBONE = FastreidBackboneType::r34_distill;
static const FastreidHeadType HEAD = FastreidHeadType::EmbeddingHead;
static const FastreidPoolingType HEAD_POOLING = FastreidPoolingType::gempoolP;
static const int LAST_STRIDE = 1;
static const bool WITH_IBNA = false;
static const bool WITH_NL = false;
static const int EMBEDDING_DIM = 0;
```
+ Ex4.`kd-r34-r101_ibn`
```
static const std::string WEIGHTS_PATH = "../kd_r34_distill.wts";
static const std::string ENGINE_PATH = "./kd_r34_distill.engine";
static const int MAX_BATCH_SIZE = 4;
static const int INPUT_H = 384;
static const int INPUT_W = 128;
static const int OUTPUT_SIZE = 512;
static const int DEVICE_ID = 0;
static const FastreidBackboneType BACKBONE = FastreidBackboneType::r34_distill;
static const FastreidHeadType HEAD = FastreidHeadType::EmbeddingHead;
static const FastreidPoolingType HEAD_POOLING = FastreidPoolingType::gempoolP;
static const int LAST_STRIDE = 1;
static const bool WITH_IBNA = false;
static const bool WITH_NL = false;
static const int EMBEDDING_DIM = 0;
```
+ Ex5.`kd-r18-r101_ibn`
```
static const std::string WEIGHTS_PATH = "../kd-r18-r101_ibn.wts";
static const std::string ENGINE_PATH = "./kd_r18_distill.engine";
static const int MAX_BATCH_SIZE = 16;
static const int INPUT_H = 384;
static const int INPUT_W = 128;
static const int OUTPUT_SIZE = 512;
static const int DEVICE_ID = 1;
static const FastreidBackboneType BACKBONE = FastreidBackboneType::r18_distill;
static const FastreidHeadType HEAD = FastreidHeadType::EmbeddingHead;
static const FastreidPoolingType HEAD_POOLING = FastreidPoolingType::gempoolP;
static const int LAST_STRIDE = 1;
static const bool WITH_IBNA = true;
static const bool WITH_NL = false;
static const int EMBEDDING_DIM = 0;
```
### Supported conversion
* Backbone: resnet50, resnet34, distill-resnet50, distill-resnet34, distill-resnet18
* Heads: embedding_head
* Plugin layers: ibn, non-local
* Pooling layers: maxpool, avgpool, GeneralizedMeanPooling, GeneralizedMeanPoolingP
### Benchmark
| Model | Engine | Batch size | Image size | Embd | Time |
|:-:|:-:|:-:|:-:|:-:|:-:|
| Vanilla R34 | Python/Pytorch1.6 fp32 | 1 | 256x128 | 512 | 6.49ms |
| Vanilla R34 | Python/Pytorch1.6 fp32 | 4 | 256x128 | 512 | 7.16ms |
| Vanilla R34 | C++/trt7 fp32 | 1 | 256x128 | 512 | 2.34ms |
| Vanilla R34 | C++/trt7 fp32 | 4 | 256x128 | 512 | 3.99ms |
| Vanilla R34 | C++/trt7 fp16 | 1 | 256x128 | 512 | 1.83ms |
| Vanilla R34 | C++/trt7 fp16 | 4 | 256x128 | 512 | 2.38ms |
| Distill R34 | Python/Pytorch1.6 fp32 | 1 | 256x128 | 512 | 5.68ms |
| Distill R34 | Python/Pytorch1.6 fp32 | 4 | 256x128 | 512 | 6.26ms |
| Distill R34 | C++/trt7 fp32 | 1 | 256x128 | 512 | 2.36ms |
| Distill R34 | C++/trt7 fp32 | 4 | 256x128 | 512 | 4.05ms |
| Distill R34 | C++/trt7 fp16 | 1 | 256x128 | 512 | 1.86ms |
| Distill R34 | C++/trt7 fp16 | 4 | 256x128 | 512 | 2.68ms |
| R50-NL-IBN | Python/Pytorch1.6 fp32 | 1 | 256x128 | 2048 | 14.86ms |
| R50-NL-IBN | Python/Pytorch1.6 fp32 | 4 | 256x128 | 2048 | 15.14ms |
| R50-NL-IBN | C++/trt7 fp32 | 1 | 256x128 | 2048 | 4.67ms |
| R50-NL-IBN | C++/trt7 fp32 | 4 | 256x128 | 2048 | 6.15ms |
| R50-NL-IBN | C++/trt7 fp16 | 1 | 256x128 | 2048 | 2.87ms |
| R50-NL-IBN | C++/trt7 fp16 | 4 | 256x128 | 2048 | 3.81ms |
* Time: preprocessing(normalization) + inference (100 times average)
* GPU: GTX 2080 TI
### Test Environment
1. fastreid v1.0.0 / 2080TI / Ubuntu18.04 / Nvidia driver 435 / cuda10.0 / cudnn7.6.5 / trt7.0.0 / nvinfer7.0.0 / opencv3.2
2. fastreid v1.0.0 / 2080TI / Ubuntu18.04 / Nvidia driver 450 / cuda10.2 / cudnn7.6.5 / trt7.0.0 / nvinfer7.0.0 / opencv3.2
### Installation
* Set up with Docker
for cuda10.0
```
cd docker/trt7cu100
sudo docker build -t trt7:cuda100 .
sudo docker run --gpus all -it --name fastrt -v /home/YOURID/workspace:/workspace -d trt7:cuda100
// then put the repo into `/home/YOURID/workspace/` before you enter the container
```
for cuda10.2
```
cd docker/trt7cu102
sudo docker build -t trt7:cuda102 .
sudo docker run --gpus all -it --name fastrt -v /home/YOURID/workspace:/workspace -d trt7:cuda102
// then put the repo into `/home/YOURID/workspace/` before you enter the container
```
* [Installation reference](https://github.com/wang-xinyu/tensorrtx/blob/master/tutorials/install.md)
### Build <a name="third_party"></a> third party
* for read/write numpy
```
cd third_party/cnpy
cmake -DCMAKE_INSTALL_PREFIX=../../libs/cnpy -DENABLE_STATIC=OFF . && make -j4 && make install
```
\ No newline at end of file
# Demo executable: links the FastRT engine (in-tree or prebuilt) with TensorRT.
SET(APP_PROJECT_NAME fastrt)
find_package(CUDA REQUIRED)
# include and link dirs of cuda and tensorrt, you need adapt them if yours are different
# cuda
include_directories(/usr/local/cuda/include)
link_directories(/usr/local/cuda/lib64)
# tensorrt
include_directories(/usr/include/x86_64-linux-gnu/)
link_directories(/usr/lib/x86_64-linux-gnu/)
include_directories(${SOLUTION_DIR}/include)
add_executable(${APP_PROJECT_NAME} inference.cpp)
# numpy
if(USE_CNUMPY)
    # cnpy is used to dump feature embeddings as .npy for verification.
    include_directories(${SOLUTION_DIR}/libs/cnpy/include)
    SET(CNPY_LIB ${SOLUTION_DIR}/libs/cnpy/lib/libcnpy.so)
else()
    SET(CNPY_LIB)
endif()
# OpenCV
find_package(OpenCV)
target_include_directories(${APP_PROJECT_NAME}
    PUBLIC
    ${OpenCV_INCLUDE_DIRS}
)
target_link_libraries(${APP_PROJECT_NAME}
    PUBLIC
    ${OpenCV_LIBS}
)
# Link against the in-tree engine target when both are built together,
# otherwise fall back to the pre-installed shared library.
if(BUILD_FASTRT_ENGINE AND BUILD_DEMO)
    SET(FASTRTENGINE_LIB FastRTEngine)
else()
    SET(FASTRTENGINE_LIB ${SOLUTION_DIR}/libs/FastRTEngine/libFastRTEngine.so)
endif()
target_link_libraries(${APP_PROJECT_NAME}
    PRIVATE
    ${FASTRTENGINE_LIB}
    nvinfer
    ${CNPY_LIB}
)
\ No newline at end of file
#include <iostream>
#include <opencv2/opencv.hpp>
#include "fastrt/utils.h"
#include "fastrt/baseline.h"
#include "fastrt/factory.h"
using namespace fastrt;
using namespace nvinfer1;
#ifdef USE_CNUMPY
#include "cnpy.h"
#endif

/* Ex1. sbs_R50-ibn */
// Model configuration for the demo build. These must match the weights
// exported from fastreid (see tools/How_to_Generate.md); alternative
// configurations are listed in the README.
static const std::string WEIGHTS_PATH = "../sbs_R50-ibn.wts";  // serialized weights input
static const std::string ENGINE_PATH = "./sbs_R50-ibn.engine"; // TensorRT engine output
static const int MAX_BATCH_SIZE = 4;
static const int INPUT_H = 384;
static const int INPUT_W = 128;
static const int OUTPUT_SIZE = 2048;   // embedding length produced by the head
static const int DEVICE_ID = 0;        // CUDA device to run on
static const FastreidBackboneType BACKBONE = FastreidBackboneType::r50;
static const FastreidHeadType HEAD = FastreidHeadType::EmbeddingHead;
static const FastreidPoolingType HEAD_POOLING = FastreidPoolingType::gempoolP;
static const int LAST_STRIDE = 1;
static const bool WITH_IBNA = true;    // instance-batch-norm (IBN-a) variant
static const bool WITH_NL = true;      // non-local blocks
static const int EMBEDDING_DIM = 0;    // 0 = use the backbone's native dim
/**
 * Demo entry point.
 *   -s  build the TensorRT network from .wts weights and serialize an engine.
 *   -d  deserialize the engine and run inference (verification or real images).
 * Returns 0 on success, -1 on any failure.
 */
int main(int argc, char** argv) {
    trt::ModelConfig modelCfg {
        WEIGHTS_PATH,
        MAX_BATCH_SIZE,
        INPUT_H,
        INPUT_W,
        OUTPUT_SIZE,
        DEVICE_ID};
    FastreidConfig reidCfg {
        BACKBONE,
        HEAD,
        HEAD_POOLING,
        LAST_STRIDE,
        WITH_IBNA,
        WITH_NL,
        EMBEDDING_DIM};
    std::cout << "[ModelConfig]: \n" << modelCfg
        << "\n[FastreidConfig]: \n" << reidCfg << std::endl;
    Baseline baseline{modelCfg};
    if (argc == 2 && std::string(argv[1]) == "-s") {
        /* Build backbone + head modules and serialize the engine to disk. */
        ModuleFactory moduleFactory;
        std::cout << "[Serializling Engine]" << std::endl;
        if (!baseline.serializeEngine(ENGINE_PATH,
            {std::move(moduleFactory.createBackbone(reidCfg)),
            std::move(moduleFactory.createHead(reidCfg))})) {
            std::cout << "SerializeEngine Failed." << std::endl;
            return -1;
        }
        return 0;
    } else if (argc == 2 && std::string(argv[1]) == "-d") {
        std::cout << "[Deserializling Engine]" << std::endl;
        if(!baseline.deserializeEngine(ENGINE_PATH)) {
            std::cout << "DeserializeEngine Failed." << std::endl;
            return -1;
        }
        /* comment out(//#define VERIFY) for real images usage */
#define VERIFY
#ifdef VERIFY
        /* support batch input data */
        std::vector<cv::Mat> input;
        input.emplace_back(cv::Mat(INPUT_H, INPUT_W, CV_8UC3, cv::Scalar(255,255,255))); // batch size = 1
        //input.emplace_back(cv::Mat(INPUT_H, INPUT_W, CV_8UC3, cv::Scalar(255,255,255))); // batch size = 2, ...
        /* run inference repeatedly to get a stable timing average */
        TimePoint start_infer, end_infer;
        int LOOP_TIMES = 100;
        start_infer = Time::now();
        for (int times = 0; times < LOOP_TIMES; ++times) {
            if(!baseline.inference(input)) {
                std::cout << "Inference Failed." << std::endl;
                return -1;
            }
        }
        end_infer = Time::now();
        /* get output from cudaMallocHost */
        float* feat_embedding = baseline.getOutput();
#ifdef USE_CNUMPY
        /* save as numpy. shape = (OUTPUT_SIZE,) */
        cnpy::npy_save("./feat_embedding.npy", feat_embedding, {OUTPUT_SIZE}, "w");
#endif
        /* print output */
        TRTASSERT(feat_embedding);
        for (size_t img_idx = 0; img_idx < input.size(); ++img_idx) {
            for (int dim = 0; dim < baseline.getOutputSize(); ++dim) {
                // BUGFIX: the original indexed feat_embedding[img_idx + dim],
                // which overlaps consecutive images for batch sizes > 1.
                // Each image's embedding occupies a contiguous stretch of
                // getOutputSize() floats in the host output buffer.
                std::cout << feat_embedding[img_idx * baseline.getOutputSize() + dim] << " ";
                if ((dim+1) % 10 == 0) {
                    std::cout << std::endl;
                }
            }
        }
        std::cout << std::endl;
        /* Not including image resizing */
        std::cout << "[Preprocessing+Inference]: " <<
            std::chrono::duration_cast<std::chrono::milliseconds>(end_infer - start_infer).count()/static_cast<float>(LOOP_TIMES) << "ms" << std::endl;
#else
        /* get jpg filenames */
        auto filenames = io::fileGlob("../data/*.jpg");
        std::cout << "#filenames: " << filenames.size() << std::endl;
        std::vector<cv::Mat> input;
        /* process images in batches of max_batch_size */
        for (size_t batch_start = 0; batch_start < filenames.size(); batch_start+=modelCfg.max_batch_size) {
            input.clear();
            /* collect batch */
            for (int img_idx = 0; img_idx < modelCfg.max_batch_size; ++img_idx) {
                if ( (batch_start + img_idx) >= filenames.size() ) continue;
                std::cout << "Image: " << filenames[batch_start + img_idx] << std::endl;
                cv::Mat resizeImg(modelCfg.input_h, modelCfg.input_w, CV_8UC3);
                cv::resize(cv::imread(filenames[batch_start + img_idx]), resizeImg, resizeImg.size(), 0, 0, cv::INTER_CUBIC); /* cv::INTER_LINEAR */
                cv::imwrite("./file_idx[" + std::to_string(batch_start + img_idx) + "].jpg", resizeImg); /* Visualize resize image */
                input.emplace_back(resizeImg);
            }
            if(!baseline.inference(input)) {
                std::cout << "Inference Failed." << std::endl;
                return -1;
            }
        }
#endif
        return 0;
    } else {
        std::cerr << "arguments not right!" << std::endl;
        std::cerr << "./demo/fastrt -s  // serialize model to .engine file" << std::endl;
        std::cerr << "./demo/fastrt -d  // deserialize .engine file and run inference" << std::endl;
        return -1;
    }
}
# cuda10.0
# Base image already contains TensorRT 7 + CUDA 10.0.
FROM fineyu/tensorrt7:0.0.1

# Install OpenCV 3.4 from a PPA plus cmake, then trim apt caches to keep
# the image small.
RUN add-apt-repository -y ppa:timsc/opencv-3.4 && \
    apt-get update && \
    apt-get install -y cmake \
        libopencv-dev \
        libopencv-dnn-dev \
        libopencv-shape3.4-dbg && \
    apt-get clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
# cuda10.2
# NVIDIA's TensorRT 20.03 image ships TensorRT 7 + CUDA 10.2.
FROM nvcr.io/nvidia/tensorrt:20.03-py3

# Build dependencies for OpenCV and the project itself.
RUN apt-get update && apt-get dist-upgrade -y && \
    apt-get install -y \
    software-properties-common \
    build-essential \
    cmake \
    git \
    libgtk2.0-dev pkg-config libavcodec-dev libavformat-dev libswscale-dev \
    python-dev python-numpy libtbb2 libtbb-dev libjpeg-dev libpng-dev libtiff-dev \
    libdc1394-22-dev libgl1-mesa-glx && \
    apt-get clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

# Build OpenCV 3.4 (+contrib) from source, then remove the source tree.
# BUGFIX: the original passed -DCMAKE_BUILD_TYPE twice with conflicting
# values (RelWithDebInfo, then RELEASE); only the last took effect, so the
# redundant RelWithDebInfo flag is dropped.
RUN mkdir opencv34 && cd opencv34 && \
    git clone -b 3.4 https://github.com/opencv/opencv && \
    git clone -b 3.4 https://github.com/opencv/opencv_contrib && \
    mkdir build && cd build && \
    cmake -DCMAKE_INSTALL_PREFIX=/usr/local/opencv \
        -DCMAKE_BUILD_TYPE=RELEASE \
        -DBUILD_opencv_xfeatures2d=OFF \
        -DOPENCV_EXTRA_MODULES_PATH=../opencv_contrib/modules ../opencv && \
    make -j12 && \
    make install && \
    ldconfig && \
    cd ../.. \
    && rm -rf opencv34
# Builds the FastRTEngine library (shared or static per the TARGET cache var).
project(FastRTEngine)
# Common sources shared by all engine sub-modules.
file(GLOB_RECURSE COMMON_SRC_FILES
    ${CMAKE_CURRENT_SOURCE_DIR}/common/utils.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/common/calibrator.cpp
)
find_package(CUDA REQUIRED)
# include and link dirs of cuda and tensorrt, you need adapt them if yours are different
# cuda
include_directories(/usr/local/cuda/include)
link_directories(/usr/local/cuda/lib64)
# tensorrt
include_directories(/usr/include/x86_64-linux-gnu/)
link_directories(/usr/lib/x86_64-linux-gnu/)
# build engine as library
add_library(${PROJECT_NAME} ${TARGET} ${COMMON_SRC_FILES})
target_include_directories(${PROJECT_NAME}
    PUBLIC
    ../include
)
find_package(OpenCV)
target_include_directories(${PROJECT_NAME}
    PUBLIC
    ${OpenCV_INCLUDE_DIRS}
)
target_link_libraries(${PROJECT_NAME}
    nvinfer
    cudart
    ${OpenCV_LIBS}
)
# Version the shared library per the top-level LIBARARY_* variables.
SET_TARGET_PROPERTIES(${PROJECT_NAME}
    PROPERTIES
    SOVERSION ${LIBARARY_SOVERSION}
    VERSION ${LIBARARY_VERSION}
)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
# `make install` places the library where the demo's fallback link path expects it.
install(TARGETS ${PROJECT_NAME}
    LIBRARY DESTINATION ${SOLUTION_DIR}/libs/${PROJECT_NAME})
# Sub-modules append their sources to this target via target_sources().
add_subdirectory(layers)
add_subdirectory(engine)
add_subdirectory(heads)
add_subdirectory(backbones)
add_subdirectory(meta_arch)
add_subdirectory(factory)
\ No newline at end of file
# Contribute the backbone implementation to the parent FastRTEngine target.
target_sources(${PROJECT_NAME}
    PRIVATE
    ${CMAKE_CURRENT_SOURCE_DIR}/sbs_resnet.cpp
)
\ No newline at end of file
#include <vector>
#include <iostream>
#include "fastrt/utils.h"
#include "fastrt/layers.h"
#include "fastrt/sbs_resnet.h"
using namespace trtxapi;
namespace fastrt {
/**
 * Build the distilled ResNet-18 backbone as a TensorRT network.
 *
 * Weight tensors are looked up by their fastreid state-dict names
 * ("backbone.conv1.weight", "backbone.layerX.Y. ..."), so the key strings
 * must match the exported .wts file exactly.
 *
 * @param network   network definition being assembled
 * @param weightMap name -> Weights parsed from the .wts file
 * @param input     input image tensor
 * @return final layer (ReLU on top of layer4), whose output feeds the head
 */
ILayer* backbone_sbsR18_distill::topology(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input) {
    std::string ibn{""};
    if(_modelCfg.with_ibna) {
        ibn = "a";
    }
    // Per-block IBN selection, indexed by basic-block position. The "" entry
    // uses the vector(count, value) constructor: 16 empty strings = no IBN.
    std::map<std::string, std::vector<std::string>> ibn_layers{
        {"a", {"a","a","a","a","a","a","",""}},
        {"b", {"","","b","","","","b","","","","","","","","","",}},
        {"", {16,""}}};
    Weights emptywts{DataType::kFLOAT, nullptr, 0};
    // Stem: 7x7 stride-2 conv (bias-free; BN provides the affine part).
    IConvolutionLayer* conv1 = network->addConvolutionNd(input, 64, DimsHW{7, 7}, weightMap["backbone.conv1.weight"], emptywts);
    TRTASSERT(conv1);
    conv1->setStrideNd(DimsHW{2, 2});
    conv1->setPaddingNd(DimsHW{3, 3});
    IScaleLayer* bn1{nullptr};
    if (ibn == "b") {
        // IBN-b replaces the stem BN with instance normalization.
        bn1 = addInstanceNorm2d(network, weightMap, *conv1->getOutput(0), "backbone.bn1", 1e-5);
    } else {
        bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), "backbone.bn1", 1e-5);
    }
    IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU);
    TRTASSERT(relu1);
    // pytorch: nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True)
    IPoolingLayer* pool1 = network->addPoolingNd(*relu1->getOutput(0), PoolingType::kMAX, DimsHW{3, 3});
    TRTASSERT(pool1);
    pool1->setStrideNd(DimsHW{2, 2});
    // ROUND_UP padding mode reproduces PyTorch's ceil_mode=True.
    pool1->setPaddingMode(PaddingMode::kEXPLICIT_ROUND_UP);
    // ResNet-18 layout: 2 basic blocks per stage; channels 64/128/256/512.
    // The first block of stages 2-4 downsamples; layer4 uses the configured
    // last_stride (1 in fastreid to keep a larger final feature map).
    ILayer* x = distill_basicBlock_ibn(network, weightMap, *pool1->getOutput(0), 64, 64, 1, "backbone.layer1.0.", ibn_layers[ibn][0]);
    x = distill_basicBlock_ibn(network, weightMap, *x->getOutput(0), 64, 64, 1, "backbone.layer1.1.", ibn_layers[ibn][1]);
    x = distill_basicBlock_ibn(network, weightMap, *x->getOutput(0), 64, 128, 2, "backbone.layer2.0.", ibn_layers[ibn][2]);
    x = distill_basicBlock_ibn(network, weightMap, *x->getOutput(0), 128, 128, 1, "backbone.layer2.1.", ibn_layers[ibn][3]);
    x = distill_basicBlock_ibn(network, weightMap, *x->getOutput(0), 128, 256, 2, "backbone.layer3.0.", ibn_layers[ibn][4]);
    x = distill_basicBlock_ibn(network, weightMap, *x->getOutput(0), 256, 256, 1, "backbone.layer3.1.", ibn_layers[ibn][5]);
    x = distill_basicBlock_ibn(network, weightMap, *x->getOutput(0), 256, 512, _modelCfg.last_stride, "backbone.layer4.0.", ibn_layers[ibn][6]);
    x = distill_basicBlock_ibn(network, weightMap, *x->getOutput(0), 512, 512, 1, "backbone.layer4.1.", ibn_layers[ibn][7]);
    // The distill variant applies a final ReLU after layer4.
    IActivationLayer* relu2 = network->addActivation(*x->getOutput(0), ActivationType::kRELU);
    TRTASSERT(relu2);
    return relu2;
}
/* Build the TensorRT layer graph for the fastreid "sbs" ResNet34 backbone,
 * distillation variant.
 * Stem: 7x7 stride-2 conv -> norm -> relu -> 3x3 stride-2 ceil-mode maxpool,
 * followed by 16 basic blocks arranged [3, 4, 6, 3]. Weight tensors are
 * looked up in weightMap under their fastreid state_dict names ("backbone.*").
 * Returns the final activation layer (backbone feature map). */
ILayer* backbone_sbsR34_distill::topology(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input) {
    // IBN flavour selector: "a" (IBN-a), "b" (IBN-b) or "" (plain BN).
    std::string ibn{""};
    if(_modelCfg.with_ibna) {
        ibn = "a";
    }
    // Per-residual-block IBN tag, indexed 0..15 in network order.
    // NOTE: {16,""} resolves to the (count, value) vector constructor,
    // i.e. sixteen empty strings -> plain BN for every block.
    std::map<std::string, std::vector<std::string>> ibn_layers{
        {"a", {"a","a","a","a","a","a","a","a","a","a","a","a","a","","",""}},
        {"b", {"","","b","","","","b","","","","","","","","","",}},
        {"", {16,""}}};
    Weights emptywts{DataType::kFLOAT, nullptr, 0};  // convolutions carry no bias
    IConvolutionLayer* conv1 = network->addConvolutionNd(input, 64, DimsHW{7, 7}, weightMap["backbone.conv1.weight"], emptywts);
    TRTASSERT(conv1);
    conv1->setStrideNd(DimsHW{2, 2});
    conv1->setPaddingNd(DimsHW{3, 3});
    // IBN-b replaces the stem batch-norm with instance-norm.
    IScaleLayer* bn1{nullptr};
    if (ibn == "b") {
        bn1 = addInstanceNorm2d(network, weightMap, *conv1->getOutput(0), "backbone.bn1", 1e-5);
    } else {
        bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), "backbone.bn1", 1e-5);
    }
    IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU);
    TRTASSERT(relu1);
    // pytorch: nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True)
    IPoolingLayer* pool1 = network->addPoolingNd(*relu1->getOutput(0), PoolingType::kMAX, DimsHW{3, 3});
    TRTASSERT(pool1);
    pool1->setStrideNd(DimsHW{2, 2});
    pool1->setPaddingMode(PaddingMode::kEXPLICIT_ROUND_UP);  // emulates ceil_mode=True
    // layer1: 3 blocks @64ch, layer2: 4 @128, layer3: 6 @256, layer4: 3 @512.
    ILayer* x = distill_basicBlock_ibn(network, weightMap, *pool1->getOutput(0), 64, 64, 1, "backbone.layer1.0.", ibn_layers[ibn][0]);
    x = distill_basicBlock_ibn(network, weightMap, *x->getOutput(0), 64, 64, 1, "backbone.layer1.1.", ibn_layers[ibn][1]);
    x = distill_basicBlock_ibn(network, weightMap, *x->getOutput(0), 64, 64, 1, "backbone.layer1.2.", ibn_layers[ibn][2]);
    x = distill_basicBlock_ibn(network, weightMap, *x->getOutput(0), 64, 128, 2, "backbone.layer2.0.", ibn_layers[ibn][3]);
    x = distill_basicBlock_ibn(network, weightMap, *x->getOutput(0), 128, 128, 1, "backbone.layer2.1.", ibn_layers[ibn][4]);
    x = distill_basicBlock_ibn(network, weightMap, *x->getOutput(0), 128, 128, 1, "backbone.layer2.2.", ibn_layers[ibn][5]);
    x = distill_basicBlock_ibn(network, weightMap, *x->getOutput(0), 128, 128, 1, "backbone.layer2.3.", ibn_layers[ibn][6]);
    x = distill_basicBlock_ibn(network, weightMap, *x->getOutput(0), 128, 256, 2, "backbone.layer3.0.", ibn_layers[ibn][7]);
    x = distill_basicBlock_ibn(network, weightMap, *x->getOutput(0), 256, 256, 1, "backbone.layer3.1.", ibn_layers[ibn][8]);
    x = distill_basicBlock_ibn(network, weightMap, *x->getOutput(0), 256, 256, 1, "backbone.layer3.2.", ibn_layers[ibn][9]);
    x = distill_basicBlock_ibn(network, weightMap, *x->getOutput(0), 256, 256, 1, "backbone.layer3.3.", ibn_layers[ibn][10]);
    x = distill_basicBlock_ibn(network, weightMap, *x->getOutput(0), 256, 256, 1, "backbone.layer3.4.", ibn_layers[ibn][11]);
    x = distill_basicBlock_ibn(network, weightMap, *x->getOutput(0), 256, 256, 1, "backbone.layer3.5.", ibn_layers[ibn][12]);
    // last_stride comes from the config (stride-1 keeps a larger final feature map).
    x = distill_basicBlock_ibn(network, weightMap, *x->getOutput(0), 256, 512, _modelCfg.last_stride, "backbone.layer4.0.", ibn_layers[ibn][13]);
    x = distill_basicBlock_ibn(network, weightMap, *x->getOutput(0), 512, 512, 1, "backbone.layer4.1.", ibn_layers[ibn][14]);
    x = distill_basicBlock_ibn(network, weightMap, *x->getOutput(0), 512, 512, 1, "backbone.layer4.2.", ibn_layers[ibn][15]);
    // Final activation. NOTE(review): its presence here (absent in the
    // non-distill R34 below) suggests distill blocks end pre-activation --
    // confirm against distill_basicBlock_ibn.
    IActivationLayer* relu2 = network->addActivation(*x->getOutput(0), ActivationType::kRELU);
    TRTASSERT(relu2);
    return relu2;
}
/* Build the TensorRT layer graph for the fastreid "sbs" ResNet50 backbone,
 * distillation variant.
 * Stem: 7x7 stride-2 conv -> norm -> relu -> 3x3 stride-2 ceil-mode maxpool,
 * then 16 bottleneck blocks arranged [3, 4, 6, 3], with optional Non-local
 * attention blocks (2 after stage 2, 3 inside stage 3) when with_nl is set.
 * Weight tensors come from weightMap under fastreid names ("backbone.*"). */
ILayer* backbone_sbsR50_distill::topology(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input) {
    // IBN flavour selector: "a" (IBN-a), "b" (IBN-b) or "" (plain BN).
    std::string ibn{""};
    if(_modelCfg.with_ibna) {
        ibn = "a";
    }
    // Per-block IBN tag, indexed 0..15. {16,""} -> sixteen empty strings (plain BN).
    std::map<std::string, std::vector<std::string>> ibn_layers{
        {"a", {"a","a","a","a","a","a","a","a","a","a","a","a","a","","",""}},
        {"b", {"","","b","","","","b","","","","","","","","","",}},
        {"", {16,""}}};
    Weights emptywts{DataType::kFLOAT, nullptr, 0};  // convolutions carry no bias
    IConvolutionLayer* conv1 = network->addConvolutionNd(input, 64, DimsHW{7, 7}, weightMap["backbone.conv1.weight"], emptywts);
    TRTASSERT(conv1);
    conv1->setStrideNd(DimsHW{2, 2});
    conv1->setPaddingNd(DimsHW{3, 3});
    // IBN-b replaces the stem batch-norm with instance-norm.
    IScaleLayer* bn1{nullptr};
    if (ibn == "b") {
        bn1 = addInstanceNorm2d(network, weightMap, *conv1->getOutput(0), "backbone.bn1", 1e-5);
    } else {
        bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), "backbone.bn1", 1e-5);
    }
    IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU);
    TRTASSERT(relu1);
    // pytorch: nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True)
    IPoolingLayer* pool1 = network->addPoolingNd(*relu1->getOutput(0), PoolingType::kMAX, DimsHW{3, 3});
    TRTASSERT(pool1);
    pool1->setStrideNd(DimsHW{2, 2});
    pool1->setPaddingMode(PaddingMode::kEXPLICIT_ROUND_UP);  // emulates ceil_mode=True
    // Bottleneck args: (inCh, midCh, stride, weight prefix, ibn tag); outCh = 4*midCh.
    ILayer* x = distill_bottleneck_ibn(network, weightMap, *pool1->getOutput(0), 64, 64, 1, "backbone.layer1.0.", ibn_layers[ibn][0]);
    x = distill_bottleneck_ibn(network, weightMap, *x->getOutput(0), 256, 64, 1, "backbone.layer1.1.", ibn_layers[ibn][1]);
    x = distill_bottleneck_ibn(network, weightMap, *x->getOutput(0), 256, 64, 1, "backbone.layer1.2.", ibn_layers[ibn][2]);
    x = distill_bottleneck_ibn(network, weightMap, *x->getOutput(0), 256, 128, 2, "backbone.layer2.0.", ibn_layers[ibn][3]);
    x = distill_bottleneck_ibn(network, weightMap, *x->getOutput(0), 512, 128, 1, "backbone.layer2.1.", ibn_layers[ibn][4]);
    x = distill_bottleneck_ibn(network, weightMap, *x->getOutput(0), 512, 128, 1, "backbone.layer2.2.", ibn_layers[ibn][5]);
    // Optional Non-local attention interleaved with the remaining stage-2 blocks.
    ILayer* _layer{x};
    if(_modelCfg.with_nl) {
        _layer = Non_local(network, weightMap, *x->getOutput(0), "backbone.NL_2.0.");
    }
    x = distill_bottleneck_ibn(network, weightMap, *_layer->getOutput(0), 512, 128, 1, "backbone.layer2.3.", ibn_layers[ibn][6]);
    _layer = x;
    if(_modelCfg.with_nl) {
        _layer = Non_local(network, weightMap, *x->getOutput(0), "backbone.NL_2.1.");
    }
    x = distill_bottleneck_ibn(network, weightMap, *_layer->getOutput(0), 512, 256, 2, "backbone.layer3.0.", ibn_layers[ibn][7]);
    x = distill_bottleneck_ibn(network, weightMap, *x->getOutput(0), 1024, 256, 1, "backbone.layer3.1.", ibn_layers[ibn][8]);
    x = distill_bottleneck_ibn(network, weightMap, *x->getOutput(0), 1024, 256, 1, "backbone.layer3.2.", ibn_layers[ibn][9]);
    x = distill_bottleneck_ibn(network, weightMap, *x->getOutput(0), 1024, 256, 1, "backbone.layer3.3.", ibn_layers[ibn][10]);
    // Three more Non-local blocks interleaved with the tail of stage 3.
    _layer = x;
    if(_modelCfg.with_nl) {
        _layer = Non_local(network, weightMap, *x->getOutput(0), "backbone.NL_3.0.");
    }
    x = distill_bottleneck_ibn(network, weightMap, *_layer->getOutput(0), 1024, 256, 1, "backbone.layer3.4.", ibn_layers[ibn][11]);
    _layer = x;
    if(_modelCfg.with_nl) {
        _layer = Non_local(network, weightMap, *x->getOutput(0), "backbone.NL_3.1.");
    }
    x = distill_bottleneck_ibn(network, weightMap, *_layer->getOutput(0), 1024, 256, 1, "backbone.layer3.5.", ibn_layers[ibn][12]);
    _layer = x;
    if(_modelCfg.with_nl) {
        _layer = Non_local(network, weightMap, *x->getOutput(0), "backbone.NL_3.2.");
    }
    // last_stride comes from the config (stride-1 keeps a larger final feature map).
    x = distill_bottleneck_ibn(network, weightMap, *_layer->getOutput(0), 1024, 512, _modelCfg.last_stride, "backbone.layer4.0.", ibn_layers[ibn][13]);
    x = distill_bottleneck_ibn(network, weightMap, *x->getOutput(0), 2048, 512, 1, "backbone.layer4.1.", ibn_layers[ibn][14]);
    x = distill_bottleneck_ibn(network, weightMap, *x->getOutput(0), 2048, 512, 1, "backbone.layer4.2.", ibn_layers[ibn][15]);
    // Final activation (see the R34-distill note: distill blocks appear to end
    // pre-activation -- confirm against distill_bottleneck_ibn).
    IActivationLayer* relu2 = network->addActivation(*x->getOutput(0), ActivationType::kRELU);
    TRTASSERT(relu2);
    return relu2;
}
/* Build the TensorRT layer graph for the fastreid "sbs" ResNet34 backbone
 * (non-distill Baseline meta-architecture).
 * Same layout as the distill variant ([3, 4, 6, 3] basic blocks) but built
 * from basicBlock_ibn, and no extra trailing relu is added here: each block
 * already yields an IActivationLayer*, which is returned directly. */
ILayer* backbone_sbsR34::topology(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input) {
    // IBN flavour selector: "a" (IBN-a), "b" (IBN-b) or "" (plain BN).
    std::string ibn{""};
    if(_modelCfg.with_ibna) {
        ibn = "a";
    }
    // Per-block IBN tag, indexed 0..15. {16,""} -> sixteen empty strings (plain BN).
    std::map<std::string, std::vector<std::string>> ibn_layers{
        {"a", {"a","a","a","a","a","a","a","a","a","a","a","a","a","","",""}}, /* resnet34-ibna */
        {"b", {"","","b","","","","b","","","","","","","","","",}}, /* resnet34-ibnb */
        {"", {16,""}}}; /* vanilla resnet34 */
    Weights emptywts{DataType::kFLOAT, nullptr, 0};  // convolutions carry no bias
    IConvolutionLayer* conv1 = network->addConvolutionNd(input, 64, DimsHW{7, 7}, weightMap["backbone.conv1.weight"], emptywts);
    TRTASSERT(conv1);
    conv1->setStrideNd(DimsHW{2, 2});
    conv1->setPaddingNd(DimsHW{3, 3});
    // IBN-b replaces the stem batch-norm with instance-norm.
    IScaleLayer* bn1{nullptr};
    if (ibn == "b") {
        bn1 = addInstanceNorm2d(network, weightMap, *conv1->getOutput(0), "backbone.bn1", 1e-5);
    } else {
        bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), "backbone.bn1", 1e-5);
    }
    IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU);
    TRTASSERT(relu1);
    // pytorch: nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True)
    IPoolingLayer* pool1 = network->addPoolingNd(*relu1->getOutput(0), PoolingType::kMAX, DimsHW{3, 3});
    TRTASSERT(pool1);
    pool1->setStrideNd(DimsHW{2, 2});
    pool1->setPaddingMode(PaddingMode::kEXPLICIT_ROUND_UP);  // emulates ceil_mode=True
    // layer1: 3 blocks @64ch, layer2: 4 @128, layer3: 6 @256, layer4: 3 @512.
    IActivationLayer* x = basicBlock_ibn(network, weightMap, *pool1->getOutput(0), 64, 64, 1, "backbone.layer1.0.", ibn_layers[ibn][0]);
    x = basicBlock_ibn(network, weightMap, *x->getOutput(0), 64, 64, 1, "backbone.layer1.1.", ibn_layers[ibn][1]);
    x = basicBlock_ibn(network, weightMap, *x->getOutput(0), 64, 64, 1, "backbone.layer1.2.", ibn_layers[ibn][2]);
    x = basicBlock_ibn(network, weightMap, *x->getOutput(0), 64, 128, 2, "backbone.layer2.0.", ibn_layers[ibn][3]);
    x = basicBlock_ibn(network, weightMap, *x->getOutput(0), 128, 128, 1, "backbone.layer2.1.", ibn_layers[ibn][4]);
    x = basicBlock_ibn(network, weightMap, *x->getOutput(0), 128, 128, 1, "backbone.layer2.2.", ibn_layers[ibn][5]);
    x = basicBlock_ibn(network, weightMap, *x->getOutput(0), 128, 128, 1, "backbone.layer2.3.", ibn_layers[ibn][6]);
    x = basicBlock_ibn(network, weightMap, *x->getOutput(0), 128, 256, 2, "backbone.layer3.0.", ibn_layers[ibn][7]);
    x = basicBlock_ibn(network, weightMap, *x->getOutput(0), 256, 256, 1, "backbone.layer3.1.", ibn_layers[ibn][8]);
    x = basicBlock_ibn(network, weightMap, *x->getOutput(0), 256, 256, 1, "backbone.layer3.2.", ibn_layers[ibn][9]);
    x = basicBlock_ibn(network, weightMap, *x->getOutput(0), 256, 256, 1, "backbone.layer3.3.", ibn_layers[ibn][10]);
    x = basicBlock_ibn(network, weightMap, *x->getOutput(0), 256, 256, 1, "backbone.layer3.4.", ibn_layers[ibn][11]);
    x = basicBlock_ibn(network, weightMap, *x->getOutput(0), 256, 256, 1, "backbone.layer3.5.", ibn_layers[ibn][12]);
    // last_stride comes from the config (stride-1 keeps a larger final feature map).
    x = basicBlock_ibn(network, weightMap, *x->getOutput(0), 256, 512, _modelCfg.last_stride, "backbone.layer4.0.", ibn_layers[ibn][13]);
    x = basicBlock_ibn(network, weightMap, *x->getOutput(0), 512, 512, 1, "backbone.layer4.1.", ibn_layers[ibn][14]);
    x = basicBlock_ibn(network, weightMap, *x->getOutput(0), 512, 512, 1, "backbone.layer4.2.", ibn_layers[ibn][15]);
    return x;
}
/* Build the TensorRT layer graph for the fastreid "sbs" ResNet50 backbone
 * (non-distill Baseline meta-architecture): [3, 4, 6, 3] bottleneck blocks
 * with optional IBN and Non-local attention; weights are looked up by their
 * fastreid state_dict names ("backbone.*"). */
ILayer* backbone_sbsR50::topology(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input) {
    /*
     * Reference: https://github.com/JDAI-CV/fast-reid/blob/master/fastreid/modeling/backbones/resnet.py
     * NL layers follow by: nl_layers_per_stage = {'50x': [0, 2, 3, 0],}[depth]
     * for nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True) => pool1->setPaddingMode(PaddingMode::kEXPLICIT_ROUND_UP);
     * for nn.MaxPool2d(kernel_size=3, stride=2, padding=1) replace with => pool1->setPaddingNd(DimsHW{1, 1});
     */
    // IBN flavour selector: "a" (IBN-a), "b" (IBN-b) or "" (plain BN).
    std::string ibn{""};
    if(_modelCfg.with_ibna) {
        ibn = "a";
    }
    // Per-block IBN tag, indexed 0..15. {16,""} -> sixteen empty strings (plain BN).
    std::map<std::string, std::vector<std::string>> ibn_layers{
        {"a", {"a","a","a","a","a","a","a","a","a","a","a","a","a","","",""}}, /* resnet50-ibna */
        {"b", {"","","b","","","","b","","","","","","","","","",}}, /* resnet50-ibnb(not used in fastreid) */
        {"", {16,""}}}; /* vanilla resnet50 */
    Weights emptywts{DataType::kFLOAT, nullptr, 0};  // convolutions carry no bias
    IConvolutionLayer* conv1 = network->addConvolutionNd(input, 64, DimsHW{7, 7}, weightMap["backbone.conv1.weight"], emptywts);
    TRTASSERT(conv1);
    conv1->setStrideNd(DimsHW{2, 2});
    conv1->setPaddingNd(DimsHW{3, 3});
    // IBN-b replaces the stem batch-norm with instance-norm.
    IScaleLayer* bn1{nullptr};
    if (ibn == "b") {
        bn1 = addInstanceNorm2d(network, weightMap, *conv1->getOutput(0), "backbone.bn1", 1e-5);
    } else {
        bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), "backbone.bn1", 1e-5);
    }
    IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU);
    TRTASSERT(relu1);
    IPoolingLayer* pool1 = network->addPoolingNd(*relu1->getOutput(0), PoolingType::kMAX, DimsHW{3, 3});
    TRTASSERT(pool1);
    pool1->setStrideNd(DimsHW{2, 2});
    pool1->setPaddingMode(PaddingMode::kEXPLICIT_ROUND_UP);  // emulates ceil_mode=True
    // Bottleneck args: (inCh, midCh, stride, weight prefix, ibn tag); outCh = 4*midCh.
    IActivationLayer* x = bottleneck_ibn(network, weightMap, *pool1->getOutput(0), 64, 64, 1, "backbone.layer1.0.", ibn_layers[ibn][0]);
    x = bottleneck_ibn(network, weightMap, *x->getOutput(0), 256, 64, 1, "backbone.layer1.1.", ibn_layers[ibn][1]);
    x = bottleneck_ibn(network, weightMap, *x->getOutput(0), 256, 64, 1, "backbone.layer1.2.", ibn_layers[ibn][2]);
    x = bottleneck_ibn(network, weightMap, *x->getOutput(0), 256, 128, 2, "backbone.layer2.0.", ibn_layers[ibn][3]);
    x = bottleneck_ibn(network, weightMap, *x->getOutput(0), 512, 128, 1, "backbone.layer2.1.", ibn_layers[ibn][4]);
    x = bottleneck_ibn(network, weightMap, *x->getOutput(0), 512, 128, 1, "backbone.layer2.2.", ibn_layers[ibn][5]);
    // Optional Non-local attention interleaved with the remaining stage-2 blocks.
    ILayer* _layer{x};
    if(_modelCfg.with_nl) {
        _layer = Non_local(network, weightMap, *x->getOutput(0), "backbone.NL_2.0.");
    }
    x = bottleneck_ibn(network, weightMap, *_layer->getOutput(0), 512, 128, 1, "backbone.layer2.3.", ibn_layers[ibn][6]);
    _layer = x;
    if(_modelCfg.with_nl) {
        _layer = Non_local(network, weightMap, *x->getOutput(0), "backbone.NL_2.1.");
    }
    x = bottleneck_ibn(network, weightMap, *_layer->getOutput(0), 512, 256, 2, "backbone.layer3.0.", ibn_layers[ibn][7]);
    x = bottleneck_ibn(network, weightMap, *x->getOutput(0), 1024, 256, 1, "backbone.layer3.1.", ibn_layers[ibn][8]);
    x = bottleneck_ibn(network, weightMap, *x->getOutput(0), 1024, 256, 1, "backbone.layer3.2.", ibn_layers[ibn][9]);
    x = bottleneck_ibn(network, weightMap, *x->getOutput(0), 1024, 256, 1, "backbone.layer3.3.", ibn_layers[ibn][10]);
    // Three more Non-local blocks interleaved with the tail of stage 3.
    _layer = x;
    if(_modelCfg.with_nl) {
        _layer = Non_local(network, weightMap, *x->getOutput(0), "backbone.NL_3.0.");
    }
    x = bottleneck_ibn(network, weightMap, *_layer->getOutput(0), 1024, 256, 1, "backbone.layer3.4.", ibn_layers[ibn][11]);
    _layer = x;
    if(_modelCfg.with_nl) {
        _layer = Non_local(network, weightMap, *x->getOutput(0), "backbone.NL_3.1.");
    }
    x = bottleneck_ibn(network, weightMap, *_layer->getOutput(0), 1024, 256, 1, "backbone.layer3.5.", ibn_layers[ibn][12]);
    _layer = x;
    if(_modelCfg.with_nl) {
        _layer = Non_local(network, weightMap, *x->getOutput(0), "backbone.NL_3.2.");
    }
    // last_stride comes from the config (stride-1 keeps a larger final feature map).
    x = bottleneck_ibn(network, weightMap, *_layer->getOutput(0), 1024, 512, _modelCfg.last_stride, "backbone.layer4.0.", ibn_layers[ibn][13]);
    x = bottleneck_ibn(network, weightMap, *x->getOutput(0), 2048, 512, 1, "backbone.layer4.1.", ibn_layers[ibn][14]);
    x = bottleneck_ibn(network, weightMap, *x->getOutput(0), 2048, 512, 1, "backbone.layer4.2.", ibn_layers[ibn][15]);
    return x;
}
}
\ No newline at end of file
#include <iostream>
#include <iterator>
#include <fstream>
#include <opencv2/opencv.hpp>
#include <opencv2/dnn/dnn.hpp>
#include "fastrt/calibrator.h"
#include "fastrt/cuda_utils.h"
#include "fastrt/utils.h"
/* INT8 entropy calibrator: streams batches of images from img_dir into
 * TensorRT during calibration, and reads/writes the calibration table file.
 * input_count_ is the float-element count of one full batch (N x 3 x W x H). */
Int8EntropyCalibrator2::Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache)
    : batchsize_(batchsize)
    , input_w_(input_w)
    , input_h_(input_h)
    , img_idx_(0)                           // cursor into img_files_
    , img_dir_(img_dir)
    , calib_table_name_(calib_table_name)
    , input_blob_name_(input_blob_name)
    , read_cache_(read_cache)               // allow reuse of an existing table
{
    input_count_ = 3 * input_w * input_h * batchsize;
    // Device-side staging buffer for one calibration batch (float32).
    CUDA_CHECK(cudaMalloc(&device_input_, input_count_ * sizeof(float)));
    // NOTE(review): the result of read_files_in_dir is ignored -- an unreadable
    // directory silently yields zero calibration batches; confirm intended.
    read_files_in_dir(img_dir, img_files_);
}
/* Release the device-side staging buffer allocated in the constructor. */
Int8EntropyCalibrator2::~Int8EntropyCalibrator2()
{
    CUDA_CHECK(cudaFree(device_input_));
}
/* Batch size used for every calibration batch (TensorRT calibrator API). */
int Int8EntropyCalibrator2::getBatchSize() const
{
    return batchsize_;
}
/* Supply the next calibration batch to TensorRT.
 * Returns false once fewer than one full batch of images remains (TensorRT
 * then stops requesting batches) or when an image fails to load.
 * blobFromImages: scale=1.0, no mean subtraction, swapRB=true (BGR->RGB),
 * crop=false -- NOTE(review): this must match inference-time preprocessing
 * for the calibration statistics to be valid; confirm against the caller. */
bool Int8EntropyCalibrator2::getBatch(void* bindings[], const char* names[], int nbBindings)
{
    if (img_idx_ + batchsize_ > (int)img_files_.size()) {
        return false;
    }
    std::vector<cv::Mat> input_imgs_;
    for (int i = img_idx_; i < img_idx_ + batchsize_; i++) {
        std::cout << img_dir_ + img_files_[i] << " " << i << std::endl;
        cv::Mat temp = cv::imread(img_dir_ + img_files_[i]);
        if (temp.empty()){
            std::cerr << "Fatal error: image cannot open!" << std::endl;
            return false;
        }
        input_imgs_.push_back(temp);
    }
    img_idx_ += batchsize_;
    // Resize + layout-convert the batch to NCHW float32 in one call.
    cv::Mat blob = cv::dnn::blobFromImages(input_imgs_, 1.0, cv::Size(input_w_, input_h_), cv::Scalar(0, 0, 0), true, false);
    CUDA_CHECK(cudaMemcpy(device_input_, blob.ptr<float>(0), input_count_ * sizeof(float), cudaMemcpyHostToDevice));
    // TensorRT passes binding names; we only serve the single input blob.
    assert(!strcmp(names[0], input_blob_name_));
    bindings[0] = device_input_;
    return true;
}
/* Return a previously written calibration table so TensorRT can skip
 * recalibration. Returns nullptr (and length 0) when caching is disabled or
 * no table file exists, which forces a fresh calibration run. */
const void* Int8EntropyCalibrator2::readCalibrationCache(size_t& length)
{
    std::cout << "reading calib cache: " << calib_table_name_ << std::endl;
    calib_cache_.clear();
    std::ifstream input(calib_table_name_, std::ios::binary);
    input >> std::noskipws;  // keep whitespace bytes -- the cache is binary data
    if (read_cache_ && input.good())
    {
        std::copy(std::istream_iterator<char>(input), std::istream_iterator<char>(), std::back_inserter(calib_cache_));
    }
    length = calib_cache_.size();
    return length ? calib_cache_.data() : nullptr;
}
/* Persist the calibration table produced by TensorRT so later runs can reuse
 * it via readCalibrationCache. Overwrites any existing table file. */
void Int8EntropyCalibrator2::writeCalibrationCache(const void* cache, size_t length)
{
    std::cout << "writing calib cache: " << calib_table_name_ << " size: " << length << std::endl;
    std::ofstream output(calib_table_name_, std::ios::binary);
    output.write(reinterpret_cast<const char*>(cache), length);
}
#include <glob.h>
#include <vector>
#include "fastrt/utils.h"
namespace io {
    /* Expand a shell-style glob pattern (with '~' expansion via GLOB_TILDE)
     * into the list of matching paths.
     * Returns an empty vector when nothing matches or when glob() fails. */
    std::vector<std::string> fileGlob(const std::string& pattern){
        glob_t glob_result{};
        std::vector<std::string> files;
        // BUG FIX: the previous version ignored glob()'s return value. On
        // GLOB_NOMATCH / GLOB_ABORTED / GLOB_NOSPACE the struct's contents are
        // not guaranteed meaningful, so only read gl_pathv on success.
        if (glob(pattern.c_str(), GLOB_TILDE, NULL, &glob_result) == 0) {
            files.reserve(glob_result.gl_pathc);
            for (size_t i = 0; i < glob_result.gl_pathc; ++i){
                files.push_back(std::string(glob_result.gl_pathv[i]));
            }
        }
        globfree(&glob_result);
        return files;
    }
}
namespace trt {
/* Load a ".wts" weight file into name -> nvinfer1::Weights.
 * Format: first line is the blob count; each following line is
 * "<name> <element_count> <hex word> <hex word> ...", one 32-bit word per
 * float. The allocated buffers intentionally outlive this call: the engine
 * builder reads them through the returned map; the caller frees them. */
std::map<std::string, nvinfer1::Weights> loadWeights(const std::string file) {
    std::cout << "[Loading weights]: " << file << std::endl;
    std::map<std::string, nvinfer1::Weights> weightMap;
    // Open weights file
    std::ifstream input(file);
    if(!input.is_open()) throw std::runtime_error("Unable to load weight file.");
    // Read number of weight blobs
    int32_t count;
    input >> count;
    if(count <= 0) throw std::runtime_error("Invalid weight map file.");
    while (count--) {
        nvinfer1::Weights wt{nvinfer1::DataType::kFLOAT, nullptr, 0};
        uint32_t size;
        // Read name and element count of the blob
        std::string name;
        input >> name >> std::dec >> size;
        wt.type = nvinfer1::DataType::kFLOAT;
        // Load blob.
        // BUG FIX: was `malloc(sizeof(val) * size)` -- sizeof(val) is the size
        // of a uint32_t* (8 bytes on LP64), silently allocating twice the
        // needed memory. Each stored value is one 32-bit word.
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
        if(!val) throw std::runtime_error("Out of memory while loading weights.");
        for (uint32_t x = 0, y = size; x < y; ++x) {
            input >> std::hex >> val[x];
        }
        wt.values = val;  // ownership passes to the returned map
        wt.count = size;
        weightMap[name] = wt;
    }
    return weightMap;
}
/* Pretty-print an engine ModelConfig, one tab-indented field per line. */
std::ostream& operator<<(std::ostream& os, const ModelConfig& modelCfg) {
    os << "\tweights_path: " << modelCfg.weights_path << "\n\t";
    os << "max_batch_size: " << modelCfg.max_batch_size << "\n\t";
    os << "input_h: " << modelCfg.input_h << "\n\t";
    os << "input_w: " << modelCfg.input_w << "\n\t";
    os << "output_size: " << modelCfg.output_size << "\n\t";
    os << "device_id: " << modelCfg.device_id << "\n";
    return os;
}
}
namespace fastrt {
    /* The *TypetoString helpers below expand the project's X-macro tables
     * (FASTBACKBONE_TABLE / FASTHEAD_TABLE / FASTPOOLING_TABLE): with
     * X(a, b) defined as `b,`, each table expands to its string column only,
     * producing a name vector indexed by the enum value.
     * NOTE(review): table[value] is unchecked -- an out-of-range enum value
     * is undefined behavior; confirm enum values always come from the table. */
    const std::string BackboneTypetoString(FastreidBackboneType value) {
#define X(a, b) b,
        static std::vector<std::string> table{ FASTBACKBONE_TABLE };
#undef X
        return table[value];
    }
    const std::string HeadTypetoString(FastreidHeadType value) {
#define X(a, b) b,
        static std::vector<std::string> table{ FASTHEAD_TABLE };
#undef X
        return table[value];
    }
    const std::string PoolingTypetoString(FastreidPoolingType value) {
#define X(a, b) b,
        static std::vector<std::string> table{ FASTPOOLING_TABLE };
#undef X
        return table[value];
    }
    /* Pretty-print a FastreidConfig, one tab-indented field per line. */
    std::ostream& operator<<(std::ostream& os, const FastreidConfig& fastreidCfg) {
        os << "\tbackbone: " << BackboneTypetoString(fastreidCfg.backbone) << "\n\t"
            << "head: " << HeadTypetoString(fastreidCfg.head) << "\n\t"
            << "pooling: " << PoolingTypetoString(fastreidCfg.pooling) << "\n\t"
            << "last_stride: " << fastreidCfg.last_stride << "\n\t"
            << "with_ibna: " << fastreidCfg.with_ibna << "\n\t"
            << "with_nl: " << fastreidCfg.with_nl << "\n\t"
            << "embedding_dim: " << fastreidCfg.embedding_dim << "\n";
        return os;
    }
}
\ No newline at end of file
# Attach this directory's implementation file to the main project target.
target_sources(${PROJECT_NAME}
    PRIVATE
        ${CMAKE_CURRENT_SOURCE_DIR}/InferenceEngine.cpp
)
\ No newline at end of file
#include "fastrt/utils.h"
#include "fastrt/InferenceEngine.h"
namespace trt {
/* Deserialize the serialized TensorRT engine carried in enginecfg and set up
 * everything needed for inference: runtime, engine, execution context, pinned
 * host buffers, device buffers, and a dedicated CUDA stream.
 * NOTE(review): _depth appears to be the per-element byte size (declared in
 * the header) since it scales byte counts passed to cudaMallocHost -- confirm.
 * After construction, _inputSize/_outputSize hold PER-SAMPLE byte counts
 * (divided by max_batch_size at the end) so doInference can scale by batch. */
InferenceEngine::InferenceEngine(const EngineConfig &enginecfg): _engineCfg(enginecfg) {
    TRTASSERT((_engineCfg.max_batch_size > 0));
    CHECK(cudaSetDevice(_engineCfg.device_id));
    _runtime = make_holder(nvinfer1::createInferRuntime(gLogger));
    TRTASSERT(_runtime.get());
    _engine = make_holder(_runtime->deserializeCudaEngine(_engineCfg.trtModelStream.get(), _engineCfg.stream_size));
    TRTASSERT(_engine.get());
    _context = make_holder(_engine->createExecutionContext());
    TRTASSERT(_context.get());
    // Whole-batch buffer sizes in bytes.
    _inputSize = _engineCfg.max_batch_size * 3 * _engineCfg.input_h * _engineCfg.input_w * _depth;
    _outputSize = _engineCfg.max_batch_size * _engineCfg.output_size * _depth;
    // Pinned host memory enables the async H2D/D2H copies in doInference.
    CHECK(cudaMallocHost((void**)&_input, _inputSize));
    CHECK(cudaMallocHost((void**)&_output, _outputSize));
    _streamptr = std::shared_ptr<cudaStream_t>( new cudaStream_t,
        [](cudaStream_t* ptr){
            // BUG FIX: the old deleter called cudaStreamDestroy(*ptr) BEFORE
            // the null check, dereferencing ptr first and making the check
            // useless. Test, then destroy and delete.
            if(ptr != nullptr){
                cudaStreamDestroy(*ptr);
                delete ptr;
            }
        });
    CHECK(cudaStreamCreate(&*_streamptr.get()));
    // Pointers to input and output device buffers to pass to engine.
    // Engine requires exactly IEngine::getNbBindings() number of buffers.
    TRTASSERT((_engine->getNbBindings() == 2));
    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    _inputIndex = _engine->getBindingIndex(_engineCfg.input_name.c_str());
    _outputIndex = _engine->getBindingIndex(_engineCfg.output_name.c_str());
    // Create GPU buffers on device
    CHECK(cudaMalloc(&_buffers[_inputIndex], _inputSize));
    CHECK(cudaMalloc(&_buffers[_outputIndex], _outputSize));
    // From here on, sizes are per-sample (see doInference).
    _inputSize /= _engineCfg.max_batch_size;
    _outputSize /= _engineCfg.max_batch_size;
}
/* Run one inference batch: the caller-supplied preprocessing callback fills
 * the pinned input buffer, then async H2D copy -> enqueue -> async D2H copy,
 * all on the private stream, followed by a stream sync so _output is ready
 * when this returns. Returns the enqueue status. */
bool InferenceEngine::doInference(const int inference_batch_size, std::function<void(float*)> preprocessing) {
    TRTASSERT(( inference_batch_size <= _engineCfg.max_batch_size && inference_batch_size > 0));
    // Callback writes the preprocessed batch directly into pinned host memory.
    preprocessing(_input);
    CHECK(cudaSetDevice(_engineCfg.device_id));
    // _inputSize/_outputSize are per-sample byte counts (see constructor).
    CHECK(cudaMemcpyAsync(_buffers[_inputIndex], _input, inference_batch_size * _inputSize, cudaMemcpyHostToDevice, *_streamptr));
    auto status = _context->enqueue(inference_batch_size, _buffers, *_streamptr, nullptr);
    CHECK(cudaMemcpyAsync(_output, _buffers[_outputIndex], inference_batch_size * _outputSize, cudaMemcpyDeviceToHost, *_streamptr));
    CHECK(cudaStreamSynchronize(*_streamptr));
    return status;
}
/* Move constructor: takes over all device/host resources.
 * The moved-from object's raw pointers are nulled so that its destructor's
 * cudaFreeHost/cudaFree calls operate on nullptr (a documented no-op in the
 * CUDA runtime) rather than double-freeing. */
InferenceEngine::InferenceEngine(InferenceEngine &&other) noexcept:
    _engineCfg(other._engineCfg)
    , _input(other._input)
    , _output(other._output)
    , _inputIndex(other._inputIndex)
    , _outputIndex(other._outputIndex)
    , _inputSize(other._inputSize)
    , _outputSize(other._outputSize)
    , _runtime(std::move(other._runtime))
    , _engine(std::move(other._engine))
    , _context(std::move(other._context))
    , _streamptr(other._streamptr) {
    _buffers[0] = other._buffers[0];
    _buffers[1] = other._buffers[1];
    // Drop the source's stream reference; this object now holds the last one.
    other._streamptr.reset();
    // Null out raw resource pointers so other's destructor becomes a no-op.
    other._input = nullptr;
    other._output = nullptr;
    other._buffers[0] = nullptr;
    other._buffers[1] = nullptr;
}
/* Free pinned host buffers and device buffers. Safe after a move: the move
 * constructor nulls these pointers, and cudaFreeHost/cudaFree accept nullptr. */
InferenceEngine::~InferenceEngine() {
    CHECK(cudaFreeHost(_input));
    CHECK(cudaFreeHost(_output));
    CHECK(cudaFree(_buffers[_inputIndex]));
    CHECK(cudaFree(_buffers[_outputIndex]));
}
}
\ No newline at end of file
# Attach the factory implementation (and its pooling-layer header) to the
# main project target.
target_sources(${PROJECT_NAME}
    PRIVATE
        ${CMAKE_CURRENT_SOURCE_DIR}/factory.cpp
        ${CMAKE_SOURCE_DIR}/fastrt/layers/poolingLayerRT.h
)
\ No newline at end of file
#include <iostream>
#include "fastrt/utils.h"
#include "fastrt/sbs_resnet.h"
#include "fastrt/factory.h"
#include "fastrt/embedding_head.h"
#include "../layers/poolingLayerRT.h"
namespace fastrt {
/* Instantiate the backbone module matching the configured type; logs the
 * chosen module to stdout and returns nullptr (with a stderr message) for
 * unsupported types. */
std::unique_ptr<Module> ModuleFactory::createBackbone(FastreidConfig& modelCfg) {
    // Shared logging helper -- every supported case announces itself the same way.
    const auto announce = [](const char* moduleName) {
        std::cout << "[createBackboneModule]: " << moduleName << std::endl;
    };
    switch(modelCfg.backbone) {
        case FastreidBackboneType::r50:
            /* cfg.MODEL.META_ARCHITECTURE: Baseline */
            /* cfg.MODEL.BACKBONE.DEPTH: 50x */
            announce("backbone_sbsR50");
            return make_unique<backbone_sbsR50>(modelCfg);
        case FastreidBackboneType::r50_distill:
            /* cfg.MODEL.META_ARCHITECTURE: Distiller */
            /* cfg.MODEL.BACKBONE.DEPTH: 50x */
            announce("backbone_sbsR50_distill");
            return make_unique<backbone_sbsR50_distill>(modelCfg);
        case FastreidBackboneType::r34:
            /* cfg.MODEL.META_ARCHITECTURE: Baseline */
            /* cfg.MODEL.BACKBONE.DEPTH: 34x */
            announce("backbone_sbsR34");
            return make_unique<backbone_sbsR34>(modelCfg);
        case FastreidBackboneType::r34_distill:
            /* cfg.MODEL.META_ARCHITECTURE: Distiller */
            /* cfg.MODEL.BACKBONE.DEPTH: 34x */
            announce("backbone_sbsR34_distill");
            return make_unique<backbone_sbsR34_distill>(modelCfg);
        case FastreidBackboneType::r18_distill:
            /* cfg.MODEL.META_ARCHITECTURE: Distiller */
            /* cfg.MODEL.BACKBONE.DEPTH: 18x */
            announce("backbone_sbsR18_distill");
            return make_unique<backbone_sbsR18_distill>(modelCfg);
        default:
            std::cerr << "[Backbone is not supported.]" << std::endl;
            return nullptr;
    }
}
/* Instantiate the head module matching the configured type; returns nullptr
 * (with a stderr message) for unsupported types. */
std::unique_ptr<Module> ModuleFactory::createHead(FastreidConfig& modelCfg) {
    if (modelCfg.head == FastreidHeadType::EmbeddingHead) {
        /* cfg.MODEL.HEADS.NAME: EmbeddingHead */
        std::cout << "[createHeadModule]: EmbeddingHead" << std::endl;
        return make_unique<embedding_head>(modelCfg);
    }
    std::cerr << "[Head is not supported.]" << std::endl;
    return nullptr;
}
/* Instantiate the requested pooling implementation for the embedding head;
 * returns nullptr (with a stderr message) for unsupported types. */
std::unique_ptr<IPoolingLayerRT> LayerFactory::createPoolingLayer(const FastreidPoolingType& pooltype) {
    if (pooltype == FastreidPoolingType::maxpool) {
        std::cout << "[createPoolingLayer]: maxpool" << std::endl;
        return make_unique<MaxPool>();
    }
    if (pooltype == FastreidPoolingType::avgpool) {
        std::cout << "[createPoolingLayer]: avgpool" << std::endl;
        return make_unique<AvgPool>();
    }
    if (pooltype == FastreidPoolingType::gempool) {
        std::cout << "[createPoolingLayer]: gempool" << std::endl;
        return make_unique<GemPool>();
    }
    if (pooltype == FastreidPoolingType::gempoolP) {
        std::cout << "[createPoolingLayer]: gempoolP" << std::endl;
        return make_unique<GemPoolP>();
    }
    std::cerr << "[Pooling layer is not supported.]" << std::endl;
    return nullptr;
}
}
\ No newline at end of file
# Attach the embedding-head implementation to the main project target.
target_sources(${PROJECT_NAME}
    PRIVATE
        ${CMAKE_CURRENT_SOURCE_DIR}/embedding_head.cpp
)
\ No newline at end of file
#include <iostream>
#include "fastrt/utils.h"
#include "fastrt/layers.h"
#include "fastrt/embedding_head.h"
namespace fastrt {
/* Construct with the default LayerFactory for pooling-layer creation. */
embedding_head::embedding_head(FastreidConfig& modelCfg) :
    _modelCfg(modelCfg), _layerFactory(make_unique<LayerFactory>()) {}
/* Construct with a caller-supplied layer factory. */
embedding_head::embedding_head(FastreidConfig& modelCfg,
    std::unique_ptr<LayerFactory> layerFactory) : _modelCfg(modelCfg), _layerFactory(std::move(layerFactory)) {}
/* Build the embedding head: pooling -> optional bias-free 1x1 conv
 * "reduction neck" (when embedding_dim > 0) -> batch-norm bottleneck.
 * Returns the bottleneck layer whose output is the final embedding. */
ILayer* embedding_head::topology(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input) {
    /*
     * Reference: https://github.com/JDAI-CV/fast-reid/blob/master/fastreid/modeling/heads/embedding_head.py
     */
    ILayer* pooling = _layerFactory->createPoolingLayer(_modelCfg.pooling)->addPooling(network, weightMap, input);
    TRTASSERT(pooling);
    // Hint: It's used to be "heads.bnneck.0" before Sep 10, 2020. (JDAI-CV/fast-reid)
    std::string bnneck_lname = "heads.bottleneck.0";
    ILayer* reduction_neck{pooling};
    if(_modelCfg.embedding_dim > 0) {
        // Dimensionality reduction: a bias-free 1x1 conv sits at index 0 of
        // the heads.bottleneck Sequential, pushing the BN to index 1.
        Weights emptywts{DataType::kFLOAT, nullptr, 0};
        reduction_neck = network->addConvolutionNd(*pooling->getOutput(0),
            _modelCfg.embedding_dim,
            DimsHW{1, 1},
            weightMap["heads.bottleneck.0.weight"],
            emptywts);
        TRTASSERT(reduction_neck);
        // Rewrite the trailing '0' -> '1' so BN weights are read from index 1.
        bnneck_lname[bnneck_lname.size()-1] = '1';
    }
    IScaleLayer* bottleneck = trtxapi::addBatchNorm2d(network, weightMap, *reduction_neck->getOutput(0), bnneck_lname, 1e-5);
    TRTASSERT(bottleneck);
    return bottleneck;
}
}
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment