#pragma once
#include <map>
#include "struct.h"
#include "module.h"
#include "NvInfer.h"
using namespace nvinfer1;
namespace fastrt {
class backbone_sbsR18_distill : public Module {
private:
FastreidConfig& _modelCfg;
public:
backbone_sbsR18_distill(FastreidConfig& modelCfg) : _modelCfg(modelCfg){}
~backbone_sbsR18_distill() = default;
ILayer* topology(INetworkDefinition *network,
std::map<std::string, Weights>& weightMap,
ITensor& input) override;
};
class backbone_sbsR34_distill : public Module {
private:
FastreidConfig& _modelCfg;
public:
backbone_sbsR34_distill(FastreidConfig& modelCfg) : _modelCfg(modelCfg) {}
~backbone_sbsR34_distill() = default;
ILayer* topology(INetworkDefinition *network,
std::map<std::string, Weights>& weightMap,
ITensor& input) override;
};
class backbone_sbsR50_distill : public Module {
private:
FastreidConfig& _modelCfg;
public:
backbone_sbsR50_distill(FastreidConfig& modelCfg) : _modelCfg(modelCfg) {}
~backbone_sbsR50_distill() = default;
ILayer* topology(INetworkDefinition *network,
std::map<std::string, Weights>& weightMap,
ITensor& input) override;
};
class backbone_sbsR34 : public Module {
private:
FastreidConfig& _modelCfg;
public:
backbone_sbsR34(FastreidConfig& modelCfg) : _modelCfg(modelCfg) {}
~backbone_sbsR34() = default;
ILayer* topology(INetworkDefinition *network,
std::map<std::string, Weights>& weightMap,
ITensor& input) override;
};
class backbone_sbsR50 : public Module {
private:
FastreidConfig& _modelCfg;
public:
backbone_sbsR50(FastreidConfig& modelCfg) : _modelCfg(modelCfg) {}
~backbone_sbsR50() = default;
ILayer* topology(INetworkDefinition *network,
std::map<std::string, Weights>& weightMap,
ITensor& input) override;
};
}
#pragma once
#include <memory>
#include <string>
namespace trt {
struct ModelConfig {
std::string weights_path;
int max_batch_size;
int input_h; /* cfg.INPUT.SIZE_TRAIN[0] */
int input_w; /* cfg.INPUT.SIZE_TRAIN[1] */
int output_size; /* final embedding dim: cfg.MODEL.BACKBONE.FEAT_DIM, or cfg.MODEL.HEADS.EMBEDDING_DIM if you changed it (default = 0) */
int device_id; /* cuda device id (0, 1, 2, ...) */
};
struct EngineConfig : ModelConfig {
std::string input_name;
std::string output_name;
std::shared_ptr<char> trtModelStream; /* serialized TensorRT engine bytes */
int stream_size; /* size of trtModelStream in bytes */
};
}
namespace fastrt {
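/* X-macro tables: each X(enum_name, "string_name") row expands to an enum
   value below; the string column is retained so the same table can also
   drive string conversions. */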
#define FASTBACKBONE_TABLE \
X(r50, "r50") \
X(r50_distill, "r50_distill") \
X(r34, "r34") \
X(r34_distill, "r34_distill") \
X(r18_distill, "r18_distill")
#define X(a, b) a,
enum FastreidBackboneType { FASTBACKBONE_TABLE };
#undef X
#define FASTHEAD_TABLE \
X(EmbeddingHead, "EmbeddingHead")
#define X(a, b) a,
enum FastreidHeadType { FASTHEAD_TABLE };
#undef X
#define FASTPOOLING_TABLE \
X(maxpool, "maxpool") \
X(avgpool, "avgpool") \
X(gempool, "gempool") \
X(gempoolP, "gempoolP")
#define X(a, b) a,
enum FastreidPoolingType { FASTPOOLING_TABLE };
#undef X
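/* Illustration (not part of the original header): the same table can
 * generate a matching name lookup, e.g.
 *
 *   #define X(a, b) case a: return b;
 *   inline const char* backboneName(FastreidBackboneType t) {
 *       switch (t) { FASTBACKBONE_TABLE default: return "unknown"; }
 *   }
 *   #undef X
 */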
struct FastreidConfig {
FastreidBackboneType backbone; /* cfg.MODEL.BACKBONE.DEPTH and cfg.MODEL.META_ARCHITECTURE */
FastreidHeadType head; /* cfg.MODEL.HEADS.NAME */
FastreidPoolingType pooling; /* cfg.MODEL.HEADS.POOL_LAYER */
int last_stride; /* cfg.MODEL.BACKBONE.LAST_STRIDE */
bool with_ibna; /* cfg.MODEL.BACKBONE.WITH_IBN */
bool with_nl; /* cfg.MODEL.BACKBONE.WITH_NL */
int embedding_dim; /* cfg.MODEL.HEADS.EMBEDDING_DIM (Default = 0) */
};
}
#pragma once
#include <map>
#include <chrono>
#include <memory>
#include <vector>
#include <fstream>
#include <iostream>
#include <cassert>
#include <string.h>
#include <dirent.h>
#include "NvInfer.h"
#include "cuda_runtime_api.h"
#include "fastrt/struct.h"
#define CHECK(status) \
do \
{ \
auto ret = (status); \
if (ret != 0) \
{ \
std::cout << "Cuda failure: " << ret; \
abort(); \
} \
} while (0)
#define TRTASSERT assert
using Time = std::chrono::high_resolution_clock;
using TimePoint = std::chrono::time_point<std::chrono::high_resolution_clock>;
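/* Minimal make_unique for pre-C++14 toolchains; mirrors std::make_unique. */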
template<typename T, typename... Args>
std::unique_ptr<T> make_unique(Args&&... args) {
return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
}
namespace io {
std::vector<std::string> fileGlob(const std::string& pattern);
}
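/* Lists the entries of a directory (non-recursive), skipping "." and "..".
   Returns 0 on success, -1 if the directory cannot be opened. */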
static inline int read_files_in_dir(const char *p_dir_name, std::vector<std::string> &file_names) {
DIR *p_dir = opendir(p_dir_name);
if (p_dir == nullptr) {
return -1;
}
struct dirent* p_file = nullptr;
while ((p_file = readdir(p_dir)) != nullptr) {
if (strcmp(p_file->d_name, ".") != 0 &&
strcmp(p_file->d_name, "..") != 0) {
std::string cur_file_name(p_file->d_name);
file_names.push_back(cur_file_name);
}
}
closedir(p_dir);
return 0;
}
namespace trt {
/*
* Load weights from files shared with TensorRT samples.
* TensorRT weight files have a simple space delimited format:
* [type] [size] <data x size in hex>
*/
std::map<std::string, nvinfer1::Weights> loadWeights(const std::string file);
std::ostream& operator<<(std::ostream& os, const ModelConfig& modelCfg);
}
namespace fastrt {
std::ostream& operator<<(std::ostream& os, const FastreidConfig& fastreidCfg);
}
SET(APP_PROJECT_NAME ReID)
# pybind
find_package(pybind11)
find_package(CUDA REQUIRED)
# include and link dirs of cuda and tensorrt; adapt them if yours differ
# cuda
include_directories(/usr/local/cuda/include)
link_directories(/usr/local/cuda/lib64)
# tensorrt
include_directories(/usr/include/x86_64-linux-gnu/)
link_directories(/usr/lib/x86_64-linux-gnu/)
include_directories(${SOLUTION_DIR}/include)
pybind11_add_module(${APP_PROJECT_NAME} ${PROJECT_SOURCE_DIR}/pybind_interface/reid.cpp)
# OpenCV
find_package(OpenCV)
target_include_directories(${APP_PROJECT_NAME}
PUBLIC
${OpenCV_INCLUDE_DIRS}
)
target_link_libraries(${APP_PROJECT_NAME}
PUBLIC
${OpenCV_LIBS}
)
if(BUILD_FASTRT_ENGINE AND BUILD_PYTHON_INTERFACE)
SET(FASTRTENGINE_LIB FastRTEngine)
else()
SET(FASTRTENGINE_LIB ${SOLUTION_DIR}/libs/FastRTEngine/libFastRTEngine.so)
endif()
target_link_libraries(${APP_PROJECT_NAME}
PRIVATE
${FASTRTENGINE_LIB}
nvinfer
)
# cuda10.0
FROM fineyu/tensorrt7:0.0.1
RUN apt-get update && apt-get install -y \
build-essential \
software-properties-common \
cmake \
wget \
python3.7-dev python3-pip
RUN add-apt-repository -y ppa:timsc/opencv-3.4 && \
apt-get update && \
apt-get install -y \
libopencv-dev \
libopencv-dnn-dev \
libopencv-shape3.4-dbg && \
apt-get clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
RUN wget https://bootstrap.pypa.io/get-pip.py && \
python3 get-pip.py --force-reinstall && \
rm get-pip.py
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.7 1 && \
update-alternatives --set python3 /usr/bin/python3.7
RUN pip install pytest opencv-python
RUN cd /usr/local/src && \
wget https://github.com/pybind/pybind11/archive/v2.2.3.tar.gz && \
tar xvf v2.2.3.tar.gz && \
cd pybind11-2.2.3 && \
mkdir build && \
cd build && \
cmake .. && \
make -j12 && \
make install && \
cd ../.. && \
rm -rf pybind11-2.2.3 && \
rm -rf v2.2.3.tar.gz
# cuda10.2
FROM darrenhsieh1717/trt7-cu102-cv34:pybind
RUN pip install torch==1.6.0 torchvision==0.7.0
RUN pip install opencv-python tensorboard cython yacs termcolor scikit-learn tabulate gdown gpustat ipdb h5py fs faiss-gpu
RUN git clone https://github.com/NVIDIA/apex && \
cd apex && \
python3 setup.py install
import sys
import numpy as np
import cv2
import torch
import torchvision.transforms as T
sys.path.append('../../..')
sys.path.append('../')
from fastreid.config import get_cfg
from fastreid.modeling.meta_arch import build_model
from fastreid.utils.file_io import PathManager
from fastreid.utils.checkpoint import Checkpointer
from fastreid.utils.logger import setup_logger
from fastreid.data import build_reid_train_loader, build_reid_test_loader
from fastreid.evaluation.rank import eval_market1501
from build.pybind_interface.ReID import ReID
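# Evaluate the TensorRT engine on Market1501: extract one embedding per image
# through the pybind ReID wrapper, then score mAP/rank-1 with fastreid's
# market1501 evaluator.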
FEATURE_DIM = 2048
GPU_ID = 0
def compute_map(model):
    cfg = get_cfg()
    test_loader, num_query = build_reid_test_loader(cfg, "Market1501", T.Compose([]))
    feats = []
    pids = []
    camids = []
    for batch in test_loader:
        for image_path in batch["img_paths"]:
            t = torch.Tensor(np.array([model.infer(cv2.imread(image_path))]))
            # keep features on CPU: the original `t.to(device)` discarded its
            # result, and the evaluation below runs in numpy anyway
            feats.append(t)
        pids.extend(batch["targets"].numpy())
        camids.extend(batch["camids"].numpy())
    feats = torch.cat(feats, dim=0)
    q_feat = feats[:num_query]
    g_feat = feats[num_query:]
    q_pids = np.asarray(pids[:num_query])
    g_pids = np.asarray(pids[num_query:])
    q_camids = np.asarray(camids[:num_query])
    g_camids = np.asarray(camids[num_query:])
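    # distance = 1 - inner product, i.e. cosine distance assuming the engine
    # outputs L2-normalized embeddings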
    distmat = 1 - torch.mm(q_feat, g_feat.t())
    distmat = distmat.numpy()
    all_cmc, all_AP, all_INP = eval_market1501(distmat, q_pids, g_pids, q_camids, g_camids, 5)
    mAP = np.mean(all_AP)
    print("mAP {}, rank-1 {}".format(mAP, all_cmc[0]))


if __name__ == '__main__':
    infer = ReID(GPU_ID)
    infer.build("../build/sbs_R50-ibn.engine")
    compute_map(infer)
#include <iostream>
#include <opencv2/opencv.hpp>
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include <pybind11/stl.h>
#include "fastrt/utils.h"
#include "fastrt/baseline.h"
#include "fastrt/factory.h"
using namespace fastrt;
using namespace nvinfer1;
namespace py = pybind11;
/* Ex1. sbs_R50-ibn */
static const std::string WEIGHTS_PATH = "../sbs_R50-ibn.wts";
static const std::string ENGINE_PATH = "./sbs_R50-ibn.engine";
static const int MAX_BATCH_SIZE = 4;
static const int INPUT_H = 384;
static const int INPUT_W = 128;
static const int OUTPUT_SIZE = 2048;
static const int DEVICE_ID = 0;
static const FastreidBackboneType BACKBONE = FastreidBackboneType::r50;
static const FastreidHeadType HEAD = FastreidHeadType::EmbeddingHead;
static const FastreidPoolingType HEAD_POOLING = FastreidPoolingType::gempoolP;
static const int LAST_STRIDE = 1;
static const bool WITH_IBNA = true;
static const bool WITH_NL = true;
static const int EMBEDDING_DIM = 0;
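/* NOTE: the constants above must match the fastreid config used to export
   the .wts weights; a mismatch can build a wrong or failing engine. */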
FastreidConfig reidCfg {
BACKBONE,
HEAD,
HEAD_POOLING,
LAST_STRIDE,
WITH_IBNA,
WITH_NL,
EMBEDDING_DIM};
class ReID
{
private:
int device; // GPU id
fastrt::Baseline baseline;
public:
ReID(int device);
int build(const std::string &engine_file);
// std::list<float> infer_test(const std::string &image_file);
std::list<float> infer(py::array_t<uint8_t>&);
std::list<std::list<float>> batch_infer(std::list<py::array_t<uint8_t>>&);
~ReID();
};
ReID::ReID(int device): device(device), baseline(trt::ModelConfig {
WEIGHTS_PATH,
MAX_BATCH_SIZE,
INPUT_H,
INPUT_W,
OUTPUT_SIZE,
device})
{
std::cout << "Init on device " << device << std::endl;
}
int ReID::build(const std::string &engine_file)
{
if(!baseline.deserializeEngine(engine_file)) {
std::cout << "DeserializeEngine Failed." << std::endl;
return -1;
}
return 0;
}
ReID::~ReID()
{
std::cout << "Destroy engine succeed" << std::endl;
}
std::list<float> ReID::infer(py::array_t<uint8_t>& img)
{
auto rows = img.shape(0);
auto cols = img.shape(1);
auto type = CV_8UC3; // assumes a 3-channel uint8 (BGR) image
cv::Mat img2(rows, cols, type, (unsigned char*)img.data());
cv::Mat re(INPUT_H, INPUT_W, CV_8UC3);
// std::cout << (int)img2.data[0] << std::endl;
cv::resize(img2, re, re.size(), 0, 0, cv::INTER_CUBIC); /* cv::INTER_LINEAR */
std::vector<cv::Mat> input;
input.emplace_back(re);
if(!baseline.inference(input)) {
std::cout << "Inference Failed." << std::endl;
}
std::list<float> feature;
float* feat_embedding = baseline.getOutput();
TRTASSERT(feat_embedding);
for (int dim = 0; dim < baseline.getOutputSize(); ++dim) {
feature.push_back(feat_embedding[dim]);
}
return feature;
}
std::list<std::list<float>> ReID::batch_infer(std::list<py::array_t<uint8_t>>& imgs)
{
// auto t1 = Time::now();
std::vector<cv::Mat> input;
int count = 0;
while(!imgs.empty()){
py::array_t<uint8_t>& img = imgs.front();
imgs.pop_front();
// parse to cvmat
auto rows = img.shape(0);
auto cols = img.shape(1);
auto type = CV_8UC3; // assumes a 3-channel uint8 (BGR) image
cv::Mat img2(rows, cols, type, (unsigned char*)img.data());
cv::Mat re(INPUT_H, INPUT_W, CV_8UC3);
// std::cout << (int)img2.data[0] << std::endl;
cv::resize(img2, re, re.size(), 0, 0, cv::INTER_CUBIC); /* cv::INTER_LINEAR */
input.emplace_back(re);
count += 1;
}
// auto t2 = Time::now();
if(!baseline.inference(input)) {
std::cout << "Inference Failed." << std::endl;
}
std::list<std::list<float>> result;
float* feat_embedding = baseline.getOutput();
TRTASSERT(feat_embedding);
// auto t3 = Time::now();
for (int index = 0; index < count; index++)
{
std::list<float> feature;
for (int dim = 0; dim < baseline.getOutputSize(); ++dim) {
feature.push_back(feat_embedding[index * baseline.getOutputSize() + dim]);
}
result.push_back(feature);
}
// std::cout << "[Preprocessing]: " << std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count() << "ms"
// << "[Infer]: " << std::chrono::duration_cast<std::chrono::milliseconds>(t3 - t2).count() << "ms"
// << "[Cast]: " << std::chrono::duration_cast<std::chrono::milliseconds>(Time::now() - t3).count() << "ms"
// << std::endl;
return result;
}
PYBIND11_MODULE(ReID, m) {
m.doc() = R"pbdoc(
Pybind11 example plugin
)pbdoc";
py::class_<ReID>(m, "ReID")
.def(py::init<int>())
.def("build", &ReID::build)
.def("infer", &ReID::infer, py::return_value_policy::automatic)
.def("batch_infer", &ReID::batch_infer, py::return_value_policy::automatic)
;
#ifdef VERSION_INFO
m.attr("__version__") = VERSION_INFO;
#else
m.attr("__version__") = "dev";
#endif
}
import sys
sys.path.append("../")
from build.pybind_interface.ReID import ReID
import cv2
import time
if __name__ == '__main__':
    iter_ = 10
    m = ReID(0)
    m.build("../build/sbs_R50-ibn.engine")
    print("build done")
    frame = cv2.imread("../data/Market-1501-v15.09.15/calib_set/-1_c1s2_009916_03.jpg")
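    # warm-up run: the first inference pays one-time CUDA/TensorRT setup costs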
    m.infer(frame)
    t0 = time.time()
    for i in range(iter_):
        m.infer(frame)
    total = time.time() - t0
    print("CPP API fps is {:.1f}, avg infer time is {:.2f}ms".format(iter_ / total, total / iter_ * 1000))
CMAKE_MINIMUM_REQUIRED(VERSION 3.0 FATAL_ERROR)
if(COMMAND cmake_policy)
cmake_policy(SET CMP0003 NEW)
endif(COMMAND cmake_policy)
project(CNPY)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
option(ENABLE_STATIC "Build static (.a) library" ON)
find_package(ZLIB REQUIRED)
include_directories(${ZLIB_INCLUDE_DIRS})
add_library(cnpy SHARED "cnpy.cpp")
target_link_libraries(cnpy ${ZLIB_LIBRARIES})
install(TARGETS "cnpy" LIBRARY DESTINATION lib PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE)
if(ENABLE_STATIC)
add_library(cnpy-static STATIC "cnpy.cpp")
set_target_properties(cnpy-static PROPERTIES OUTPUT_NAME "cnpy")
install(TARGETS "cnpy-static" ARCHIVE DESTINATION lib)
endif(ENABLE_STATIC)
install(FILES "cnpy.h" DESTINATION include)
install(FILES "mat2npz" "npy2mat" "npz2mat" DESTINATION bin PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE)
add_executable(example1 example1.cpp)
target_link_libraries(example1 cnpy)
The MIT License
Copyright (c) Carl Rogers, 2011
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
# Purpose:
NumPy offers the `save` function for writing a single array to a .npy file, and `savez` for zipping multiple .npy arrays together into a .npz file.
`cnpy` lets you read and write these formats from C++.
The motivation comes from scientific programming, where large amounts of data are generated in C++ and analyzed in Python.
Writing to .npy has the advantage of using low-level C++ I/O (fread and fwrite) for speed, and a binary format for size.
The .npy file header specifies the size, shape, and data type of the array, so there is no need to describe the data format separately.
Loading data written in numpy formats into C++ is equally simple, but requires you to cast the loaded data to the type of your choice.
# Installation:
Default installation directory is /usr/local.
To specify a different directory, add `-DCMAKE_INSTALL_PREFIX=/path/to/install/dir` to the cmake invocation in step 4.
1. get [cmake](https://cmake.org)
2. create a build directory, say $HOME/build
3. cd $HOME/build
4. cmake /path/to/cnpy
5. make
6. make install
# Using:
To use, `#include"cnpy.h"` in your source code. Compile the source code mycode.cpp as
```bash
g++ -o mycode mycode.cpp -L/path/to/install/dir -lcnpy -lz --std=c++11
```
# Description:
There are two functions for writing data: `npy_save` and `npz_save`.
There are 3 functions for reading:
- `npy_load` will load a .npy file.
- `npz_load(fname)` will load a .npz and return a dictionary of NpyArray structures.
- `npz_load(fname,varname)` will load and return the NpyArray for data varname from the specified .npz file.
The data structure for loaded data is below.
Data is accessed via the `data<T>()` method, which returns a pointer of the specified type (which must match the underlying datatype of the data).
The array shape and word size are read from the npy header.
```c++
struct NpyArray {
std::vector<size_t> shape;
size_t word_size;
template<typename T> T* data();
};
```
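A minimal save/load round trip (a condensed sketch of [example1.cpp](example1.cpp); the file name `test.npy` is arbitrary):
```c++
#include"cnpy.h"
#include<vector>
#include<cassert>

int main() {
    std::vector<double> data(12, 3.14);
    //write a 3x4 array of doubles; the shape is given as {rows, cols}
    cnpy::npy_save("test.npy", &data[0], {3, 4}, "w");
    //load it back; data<T>() must match the stored element type
    cnpy::NpyArray arr = cnpy::npy_load("test.npy");
    assert(arr.shape.size() == 2 && arr.shape[0] == 3 && arr.shape[1] == 4);
    const double* loaded = arr.data<double>();
    assert(loaded[0] == data[0]);
}
```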
See [example1.cpp](example1.cpp) for more examples of how to use the library. example1 is also built as part of the cmake build.
//Copyright (C) 2011 Carl Rogers
//Released under MIT License
//license available in LICENSE file, or at http://www.opensource.org/licenses/mit-license.php
#include"cnpy.h"
#include<complex>
#include<cstdlib>
#include<algorithm>
#include<cstring>
#include<iomanip>
#include<stdint.h>
#include<stdexcept>
#include <regex>
char cnpy::BigEndianTest() {
int x = 1;
return (((char *)&x)[0]) ? '<' : '>';
}
char cnpy::map_type(const std::type_info& t)
{
if(t == typeid(float) ) return 'f';
if(t == typeid(double) ) return 'f';
if(t == typeid(long double) ) return 'f';
if(t == typeid(int) ) return 'i';
if(t == typeid(char) ) return 'i';
if(t == typeid(short) ) return 'i';
if(t == typeid(long) ) return 'i';
if(t == typeid(long long) ) return 'i';
if(t == typeid(unsigned char) ) return 'u';
if(t == typeid(unsigned short) ) return 'u';
if(t == typeid(unsigned long) ) return 'u';
if(t == typeid(unsigned long long) ) return 'u';
if(t == typeid(unsigned int) ) return 'u';
if(t == typeid(bool) ) return 'b';
if(t == typeid(std::complex<float>) ) return 'c';
if(t == typeid(std::complex<double>) ) return 'c';
if(t == typeid(std::complex<long double>) ) return 'c';
else return '?';
}
template<> std::vector<char>& cnpy::operator+=(std::vector<char>& lhs, const std::string rhs) {
lhs.insert(lhs.end(),rhs.begin(),rhs.end());
return lhs;
}
template<> std::vector<char>& cnpy::operator+=(std::vector<char>& lhs, const char* rhs) {
//write in little endian
size_t len = strlen(rhs);
lhs.reserve(lhs.size()+len);
for(size_t byte = 0; byte < len; byte++) {
lhs.push_back(rhs[byte]);
}
return lhs;
}
void cnpy::parse_npy_header(unsigned char* buffer,size_t& word_size, std::vector<size_t>& shape, bool& fortran_order) {
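//npy preamble: "\x93NUMPY" magic (6 bytes), 1-byte major version, 1-byte minor
//version, 2-byte little-endian header length, then the python-dict header text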
//std::string magic_string(buffer,6);
uint8_t major_version = *reinterpret_cast<uint8_t*>(buffer+6);
uint8_t minor_version = *reinterpret_cast<uint8_t*>(buffer+7);
uint16_t header_len = *reinterpret_cast<uint16_t*>(buffer+8);
std::string header(reinterpret_cast<char*>(buffer+9),header_len);
size_t loc1, loc2;
//fortran order
loc1 = header.find("fortran_order")+16;
fortran_order = (header.substr(loc1,4) == "True" ? true : false);
//shape
loc1 = header.find("(");
loc2 = header.find(")");
std::regex num_regex("[0-9][0-9]*");
std::smatch sm;
shape.clear();
std::string str_shape = header.substr(loc1+1,loc2-loc1-1);
while(std::regex_search(str_shape, sm, num_regex)) {
shape.push_back(std::stoi(sm[0].str()));
str_shape = sm.suffix().str();
}
//endian, word size, data type
//byte order code | stands for not applicable.
//not sure when this applies except for byte array
loc1 = header.find("descr")+9;
bool littleEndian = (header[loc1] == '<' || header[loc1] == '|' ? true : false);
assert(littleEndian);
//char type = header[loc1+1];
//assert(type == map_type(T));
std::string str_ws = header.substr(loc1+2);
loc2 = str_ws.find("'");
word_size = atoi(str_ws.substr(0,loc2).c_str());
}
void cnpy::parse_npy_header(FILE* fp, size_t& word_size, std::vector<size_t>& shape, bool& fortran_order) {
char buffer[256];
size_t res = fread(buffer,sizeof(char),11,fp);
if(res != 11)
throw std::runtime_error("parse_npy_header: failed fread");
std::string header = fgets(buffer,256,fp);
assert(header[header.size()-1] == '\n');
size_t loc1, loc2;
//fortran order
loc1 = header.find("fortran_order");
if (loc1 == std::string::npos)
throw std::runtime_error("parse_npy_header: failed to find header keyword: 'fortran_order'");
loc1 += 16;
fortran_order = (header.substr(loc1,4) == "True" ? true : false);
//shape
loc1 = header.find("(");
loc2 = header.find(")");
if (loc1 == std::string::npos || loc2 == std::string::npos)
throw std::runtime_error("parse_npy_header: failed to find header keyword: '(' or ')'");
std::regex num_regex("[0-9][0-9]*");
std::smatch sm;
shape.clear();
std::string str_shape = header.substr(loc1+1,loc2-loc1-1);
while(std::regex_search(str_shape, sm, num_regex)) {
shape.push_back(std::stoi(sm[0].str()));
str_shape = sm.suffix().str();
}
//endian, word size, data type
//byte order code | stands for not applicable.
//not sure when this applies except for byte array
loc1 = header.find("descr");
if (loc1 == std::string::npos)
throw std::runtime_error("parse_npy_header: failed to find header keyword: 'descr'");
loc1 += 9;
bool littleEndian = (header[loc1] == '<' || header[loc1] == '|' ? true : false);
assert(littleEndian);
//char type = header[loc1+1];
//assert(type == map_type(T));
std::string str_ws = header.substr(loc1+2);
loc2 = str_ws.find("'");
word_size = atoi(str_ws.substr(0,loc2).c_str());
}
void cnpy::parse_zip_footer(FILE* fp, uint16_t& nrecs, size_t& global_header_size, size_t& global_header_offset)
{
std::vector<char> footer(22);
fseek(fp,-22,SEEK_END);
size_t res = fread(&footer[0],sizeof(char),22,fp);
if(res != 22)
throw std::runtime_error("parse_zip_footer: failed fread");
uint16_t disk_no, disk_start, nrecs_on_disk, comment_len;
disk_no = *(uint16_t*) &footer[4];
disk_start = *(uint16_t*) &footer[6];
nrecs_on_disk = *(uint16_t*) &footer[8];
nrecs = *(uint16_t*) &footer[10];
global_header_size = *(uint32_t*) &footer[12];
global_header_offset = *(uint32_t*) &footer[16];
comment_len = *(uint16_t*) &footer[20];
assert(disk_no == 0);
assert(disk_start == 0);
assert(nrecs_on_disk == nrecs);
assert(comment_len == 0);
}
cnpy::NpyArray load_the_npy_file(FILE* fp) {
std::vector<size_t> shape;
size_t word_size;
bool fortran_order;
cnpy::parse_npy_header(fp,word_size,shape,fortran_order);
cnpy::NpyArray arr(shape, word_size, fortran_order);
size_t nread = fread(arr.data<char>(),1,arr.num_bytes(),fp);
if(nread != arr.num_bytes())
throw std::runtime_error("load_the_npy_file: failed fread");
return arr;
}
cnpy::NpyArray load_the_npz_array(FILE* fp, uint32_t compr_bytes, uint32_t uncompr_bytes) {
std::vector<unsigned char> buffer_compr(compr_bytes);
std::vector<unsigned char> buffer_uncompr(uncompr_bytes);
size_t nread = fread(&buffer_compr[0],1,compr_bytes,fp);
if(nread != compr_bytes)
throw std::runtime_error("load_the_npy_file: failed fread");
int err;
z_stream d_stream;
d_stream.zalloc = Z_NULL;
d_stream.zfree = Z_NULL;
d_stream.opaque = Z_NULL;
d_stream.avail_in = 0;
d_stream.next_in = Z_NULL;
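//zip entries store a raw deflate stream (no zlib header), hence -MAX_WBITS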
err = inflateInit2(&d_stream, -MAX_WBITS);
d_stream.avail_in = compr_bytes;
d_stream.next_in = &buffer_compr[0];
d_stream.avail_out = uncompr_bytes;
d_stream.next_out = &buffer_uncompr[0];
err = inflate(&d_stream, Z_FINISH);
err = inflateEnd(&d_stream);
std::vector<size_t> shape;
size_t word_size;
bool fortran_order;
cnpy::parse_npy_header(&buffer_uncompr[0],word_size,shape,fortran_order);
cnpy::NpyArray array(shape, word_size, fortran_order);
size_t offset = uncompr_bytes - array.num_bytes();
memcpy(array.data<unsigned char>(),&buffer_uncompr[0]+offset,array.num_bytes());
return array;
}
cnpy::npz_t cnpy::npz_load(std::string fname) {
FILE* fp = fopen(fname.c_str(),"rb");
if(!fp) {
throw std::runtime_error("npz_load: Error! Unable to open file "+fname+"!");
}
cnpy::npz_t arrays;
while(1) {
std::vector<char> local_header(30);
size_t headerres = fread(&local_header[0],sizeof(char),30,fp);
if(headerres != 30)
throw std::runtime_error("npz_load: failed fread");
//if we've reached the global header, stop reading
if(local_header[2] != 0x03 || local_header[3] != 0x04) break;
//read in the variable name
uint16_t name_len = *(uint16_t*) &local_header[26];
std::string varname(name_len,' ');
size_t vname_res = fread(&varname[0],sizeof(char),name_len,fp);
if(vname_res != name_len)
throw std::runtime_error("npz_load: failed fread");
//erase the lagging .npy
varname.erase(varname.end()-4,varname.end());
//read in the extra field
uint16_t extra_field_len = *(uint16_t*) &local_header[28];
if(extra_field_len > 0) {
std::vector<char> buff(extra_field_len);
size_t efield_res = fread(&buff[0],sizeof(char),extra_field_len,fp);
if(efield_res != extra_field_len)
throw std::runtime_error("npz_load: failed fread");
}
uint16_t compr_method = *reinterpret_cast<uint16_t*>(&local_header[0]+8);
uint32_t compr_bytes = *reinterpret_cast<uint32_t*>(&local_header[0]+18);
uint32_t uncompr_bytes = *reinterpret_cast<uint32_t*>(&local_header[0]+22);
if(compr_method == 0) {arrays[varname] = load_the_npy_file(fp);}
else {arrays[varname] = load_the_npz_array(fp,compr_bytes,uncompr_bytes);}
}
fclose(fp);
return arrays;
}
cnpy::NpyArray cnpy::npz_load(std::string fname, std::string varname) {
FILE* fp = fopen(fname.c_str(),"rb");
if(!fp) throw std::runtime_error("npz_load: Unable to open file "+fname);
while(1) {
std::vector<char> local_header(30);
size_t header_res = fread(&local_header[0],sizeof(char),30,fp);
if(header_res != 30)
throw std::runtime_error("npz_load: failed fread");
//if we've reached the global header, stop reading
if(local_header[2] != 0x03 || local_header[3] != 0x04) break;
//read in the variable name
uint16_t name_len = *(uint16_t*) &local_header[26];
std::string vname(name_len,' ');
size_t vname_res = fread(&vname[0],sizeof(char),name_len,fp);
if(vname_res != name_len)
throw std::runtime_error("npz_load: failed fread");
vname.erase(vname.end()-4,vname.end()); //erase the lagging .npy
//read in the extra field
uint16_t extra_field_len = *(uint16_t*) &local_header[28];
fseek(fp,extra_field_len,SEEK_CUR); //skip past the extra field
uint16_t compr_method = *reinterpret_cast<uint16_t*>(&local_header[0]+8);
uint32_t compr_bytes = *reinterpret_cast<uint32_t*>(&local_header[0]+18);
uint32_t uncompr_bytes = *reinterpret_cast<uint32_t*>(&local_header[0]+22);
if(vname == varname) {
NpyArray array = (compr_method == 0) ? load_the_npy_file(fp) : load_the_npz_array(fp,compr_bytes,uncompr_bytes);
fclose(fp);
return array;
}
else {
//skip past the data
uint32_t size = *(uint32_t*) &local_header[22];
fseek(fp,size,SEEK_CUR);
}
}
fclose(fp);
//if we get here, we haven't found the variable in the file
throw std::runtime_error("npz_load: Variable name "+varname+" not found in "+fname);
}
cnpy::NpyArray cnpy::npy_load(std::string fname) {
FILE* fp = fopen(fname.c_str(), "rb");
if(!fp) throw std::runtime_error("npy_load: Unable to open file "+fname);
NpyArray arr = load_the_npy_file(fp);
fclose(fp);
return arr;
}
//Copyright (C) 2011 Carl Rogers
//Released under MIT License
//license available in LICENSE file, or at http://www.opensource.org/licenses/mit-license.php
#ifndef LIBCNPY_H_
#define LIBCNPY_H_
#include<string>
#include<stdexcept>
#include<sstream>
#include<vector>
#include<cstdio>
#include<typeinfo>
#include<iostream>
#include<cassert>
#include<zlib.h>
#include<map>
#include<memory>
#include<stdint.h>
#include<numeric>
namespace cnpy {
struct NpyArray {
NpyArray(const std::vector<size_t>& _shape, size_t _word_size, bool _fortran_order) :
shape(_shape), word_size(_word_size), fortran_order(_fortran_order)
{
num_vals = 1;
for(size_t i = 0;i < shape.size();i++) num_vals *= shape[i];
data_holder = std::shared_ptr<std::vector<char>>(
new std::vector<char>(num_vals * word_size));
}
NpyArray() : shape(0), word_size(0), fortran_order(0), num_vals(0) { }
template<typename T>
T* data() {
return reinterpret_cast<T*>(&(*data_holder)[0]);
}
template<typename T>
const T* data() const {
return reinterpret_cast<T*>(&(*data_holder)[0]);
}
template<typename T>
std::vector<T> as_vec() const {
const T* p = data<T>();
return std::vector<T>(p, p+num_vals);
}
size_t num_bytes() const {
return data_holder->size();
}
std::shared_ptr<std::vector<char>> data_holder;
std::vector<size_t> shape;
size_t word_size;
bool fortran_order;
size_t num_vals;
};
using npz_t = std::map<std::string, NpyArray>;
char BigEndianTest();
char map_type(const std::type_info& t);
template<typename T> std::vector<char> create_npy_header(const std::vector<size_t>& shape);
void parse_npy_header(FILE* fp,size_t& word_size, std::vector<size_t>& shape, bool& fortran_order);
void parse_npy_header(unsigned char* buffer,size_t& word_size, std::vector<size_t>& shape, bool& fortran_order);
void parse_zip_footer(FILE* fp, uint16_t& nrecs, size_t& global_header_size, size_t& global_header_offset);
npz_t npz_load(std::string fname);
NpyArray npz_load(std::string fname, std::string varname);
NpyArray npy_load(std::string fname);
template<typename T> std::vector<char>& operator+=(std::vector<char>& lhs, const T rhs) {
//write in little endian
for(size_t byte = 0; byte < sizeof(T); byte++) {
char val = *((char*)&rhs+byte);
lhs.push_back(val);
}
return lhs;
}
template<> std::vector<char>& operator+=(std::vector<char>& lhs, const std::string rhs);
template<> std::vector<char>& operator+=(std::vector<char>& lhs, const char* rhs);
template<typename T> void npy_save(std::string fname, const T* data, const std::vector<size_t> shape, std::string mode = "w") {
FILE* fp = NULL;
std::vector<size_t> true_data_shape; //if appending, the shape of existing + new data
if(mode == "a") fp = fopen(fname.c_str(),"r+b");
if(fp) {
//file exists. we need to append to it. read the header, modify the array size
size_t word_size;
bool fortran_order;
parse_npy_header(fp,word_size,true_data_shape,fortran_order);
assert(!fortran_order);
if(word_size != sizeof(T)) {
std::cout<<"libnpy error: "<<fname<<" has word size "<<word_size<<" but npy_save appending data sized "<<sizeof(T)<<"\n";
assert( word_size == sizeof(T) );
}
if(true_data_shape.size() != shape.size()) {
std::cout<<"libnpy error: npy_save attempting to append misdimensioned data to "<<fname<<"\n";
assert(true_data_shape.size() == shape.size());
}
for(size_t i = 1; i < shape.size(); i++) {
if(shape[i] != true_data_shape[i]) {
std::cout<<"libnpy error: npy_save attempting to append misshaped data to "<<fname<<"\n";
assert(shape[i] == true_data_shape[i]);
}
}
true_data_shape[0] += shape[0];
}
else {
fp = fopen(fname.c_str(),"wb");
true_data_shape = shape;
}
std::vector<char> header = create_npy_header<T>(true_data_shape);
size_t nels = std::accumulate(shape.begin(),shape.end(),size_t(1),std::multiplies<size_t>());
fseek(fp,0,SEEK_SET);
fwrite(&header[0],sizeof(char),header.size(),fp);
fseek(fp,0,SEEK_END);
fwrite(data,sizeof(T),nels,fp);
fclose(fp);
}
template<typename T> void npz_save(std::string zipname, std::string fname, const T* data, const std::vector<size_t>& shape, std::string mode = "w")
{
//first, append a .npy to the fname
fname += ".npy";
//now, on with the show
FILE* fp = NULL;
uint16_t nrecs = 0;
size_t global_header_offset = 0;
std::vector<char> global_header;
if(mode == "a") fp = fopen(zipname.c_str(),"r+b");
if(fp) {
//zip file exists. we need to add a new npy file to it.
//first read the footer. this gives us the offset and size of the global header
//then read and store the global header.
//below, we will write the new data at the start of the global header, then append the updated global header and footer after it
size_t global_header_size;
parse_zip_footer(fp,nrecs,global_header_size,global_header_offset);
fseek(fp,global_header_offset,SEEK_SET);
global_header.resize(global_header_size);
size_t res = fread(&global_header[0],sizeof(char),global_header_size,fp);
if(res != global_header_size){
throw std::runtime_error("npz_save: header read error while adding to existing zip");
}
fseek(fp,global_header_offset,SEEK_SET);
}
else {
fp = fopen(zipname.c_str(),"wb");
}
std::vector<char> npy_header = create_npy_header<T>(shape);
size_t nels = std::accumulate(shape.begin(),shape.end(),size_t(1),std::multiplies<size_t>());
size_t nbytes = nels*sizeof(T) + npy_header.size();
//get the CRC of the data to be added
uint32_t crc = crc32(0L,(uint8_t*)&npy_header[0],npy_header.size());
crc = crc32(crc,(uint8_t*)data,nels*sizeof(T));
//build the local header
std::vector<char> local_header;
local_header += "PK"; //first part of sig
local_header += (uint16_t) 0x0403; //second part of sig
local_header += (uint16_t) 20; //min version to extract
local_header += (uint16_t) 0; //general purpose bit flag
local_header += (uint16_t) 0; //compression method
local_header += (uint16_t) 0; //file last mod time
local_header += (uint16_t) 0; //file last mod date
local_header += (uint32_t) crc; //crc
local_header += (uint32_t) nbytes; //compressed size
local_header += (uint32_t) nbytes; //uncompressed size
local_header += (uint16_t) fname.size(); //fname length
local_header += (uint16_t) 0; //extra field length
local_header += fname;
//build global header
global_header += "PK"; //first part of sig
global_header += (uint16_t) 0x0201; //second part of sig
global_header += (uint16_t) 20; //version made by
global_header.insert(global_header.end(),local_header.begin()+4,local_header.begin()+30);
global_header += (uint16_t) 0; //file comment length
global_header += (uint16_t) 0; //disk number where file starts
global_header += (uint16_t) 0; //internal file attributes
global_header += (uint32_t) 0; //external file attributes
global_header += (uint32_t) global_header_offset; //relative offset of local file header, since it begins where the global header used to begin
global_header += fname;
//build footer
std::vector<char> footer;
footer += "PK"; //first part of sig
footer += (uint16_t) 0x0605; //second part of sig
footer += (uint16_t) 0; //number of this disk
footer += (uint16_t) 0; //disk where footer starts
footer += (uint16_t) (nrecs+1); //number of records on this disk
footer += (uint16_t) (nrecs+1); //total number of records
footer += (uint32_t) global_header.size(); //nbytes of global headers
footer += (uint32_t) (global_header_offset + nbytes + local_header.size()); //offset of start of global headers, since global header now starts after newly written array
footer += (uint16_t) 0; //zip file comment length
//write everything
fwrite(&local_header[0],sizeof(char),local_header.size(),fp);
fwrite(&npy_header[0],sizeof(char),npy_header.size(),fp);
fwrite(data,sizeof(T),nels,fp);
fwrite(&global_header[0],sizeof(char),global_header.size(),fp);
fwrite(&footer[0],sizeof(char),footer.size(),fp);
fclose(fp);
}
template<typename T> void npy_save(std::string fname, const std::vector<T> data, std::string mode = "w") {
std::vector<size_t> shape;
shape.push_back(data.size());
npy_save(fname, &data[0], shape, mode);
}
template<typename T> void npz_save(std::string zipname, std::string fname, const std::vector<T> data, std::string mode = "w") {
std::vector<size_t> shape;
shape.push_back(data.size());
npz_save(zipname, fname, &data[0], shape, mode);
}
template<typename T> std::vector<char> create_npy_header(const std::vector<size_t>& shape) {
std::vector<char> dict;
dict += "{'descr': '";
dict += BigEndianTest();
dict += map_type(typeid(T));
dict += std::to_string(sizeof(T));
dict += "', 'fortran_order': False, 'shape': (";
dict += std::to_string(shape[0]);
for(size_t i = 1;i < shape.size();i++) {
dict += ", ";
dict += std::to_string(shape[i]);
}
if(shape.size() == 1) dict += ",";
dict += "), }";
//pad with spaces so that preamble+dict is modulo 16 bytes. preamble is 10 bytes. dict needs to end with \n
int remainder = 16 - (10 + dict.size()) % 16;
dict.insert(dict.end(),remainder,' ');
dict.back() = '\n';
std::vector<char> header;
header += (char) 0x93;
header += "NUMPY";
header += (char) 0x01; //major version of numpy format
header += (char) 0x00; //minor version of numpy format
header += (uint16_t) dict.size();
header.insert(header.end(),dict.begin(),dict.end());
return header;
}
}
#endif
#include"cnpy.h"
#include<complex>
#include<cstdlib>
#include<iostream>
#include<map>
#include<string>
const int Nx = 128;
const int Ny = 64;
const int Nz = 32;
int main()
{
//set random seed so that result is reproducible (for testing)
srand(0);
//create random data
std::vector<std::complex<double>> data(Nx*Ny*Nz);
for(int i = 0;i < Nx*Ny*Nz;i++) data[i] = std::complex<double>(rand(),rand());
//save it to file
cnpy::npy_save("arr1.npy",&data[0],{Nz,Ny,Nx},"w");
//load it into a new array
cnpy::NpyArray arr = cnpy::npy_load("arr1.npy");
std::complex<double>* loaded_data = arr.data<std::complex<double>>();
//make sure the loaded data matches the saved data
assert(arr.word_size == sizeof(std::complex<double>));
assert(arr.shape.size() == 3 && arr.shape[0] == Nz && arr.shape[1] == Ny && arr.shape[2] == Nx);
for(int i = 0; i < Nx*Ny*Nz;i++) assert(data[i] == loaded_data[i]);
//append the same data to file
//npy array on file now has shape (Nz+Nz,Ny,Nx)
cnpy::npy_save("arr1.npy",&data[0],{Nz,Ny,Nx},"a");
//now write to an npz file
//non-array variables are treated as 1D arrays with 1 element
double myVar1 = 1.2;
char myVar2 = 'a';
cnpy::npz_save("out.npz","myVar1",&myVar1,{1},"w"); //"w" overwrites any existing file
cnpy::npz_save("out.npz","myVar2",&myVar2,{1},"a"); //"a" appends to the file we created above
cnpy::npz_save("out.npz","arr1",&data[0],{Nz,Ny,Nx},"a"); //"a" appends to the file we created above
//load a single var from the npz file
cnpy::NpyArray arr2 = cnpy::npz_load("out.npz","arr1");
//load the entire npz file
cnpy::npz_t my_npz = cnpy::npz_load("out.npz");
//check that the loaded myVar1 matches myVar1
cnpy::NpyArray arr_mv1 = my_npz["myVar1"];
double* mv1 = arr_mv1.data<double>();
assert(arr_mv1.shape.size() == 1 && arr_mv1.shape[0] == 1);
assert(mv1[0] == myVar1);
}
#!/usr/bin/env python
import sys
from numpy import savez
from scipy.io import loadmat
assert len(sys.argv) > 1
files = sys.argv[1:]
for f in files:
    mat_vars = loadmat(f)
    mat_vars.pop('__version__')
    mat_vars.pop('__header__')
    mat_vars.pop('__globals__')
    fn = f.replace('.mat', '.npz')
    savez(fn, **mat_vars)
#!/usr/bin/env python
import sys
from numpy import load
from scipy.io import savemat
assert len(sys.argv) > 1
files = sys.argv[1:]
for f in files:
    data = load(f)
    fn = f.replace('.npy', '')
    fn = fn.replace('.', '_')
    savemat(fn, {fn: data})
#!/usr/bin/env python
import sys
from numpy import load
from scipy.io import savemat
assert len(sys.argv) > 1
files = sys.argv[1:]
for f in files:
    data = load(f)
    fn = f.replace('.npz', '')
    fn = fn.replace('.', '_')  # matlab can't handle dots
    savemat(fn, data)
# Fastreid Model Deployment
The `gen_wts.py` script converts a fastreid model into a [.wts format](https://github.com/wang-xinyu/tensorrtx/blob/master/tutorials/getting_started.md#the-wts-content-format) file, which is then consumed directly by [FastRT](https://github.com/JDAI-CV/fast-reid/blob/master/projects/FastRT).
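For reference, the generated file is plain text: the first line holds the number of tensors, and each following line is `<name> <num_floats>` followed by each float encoded as 4 big-endian bytes in hex (see `gen_wts.py` below). An illustrative fragment (the names and values here are made up):
```
171
backbone.conv1.weight 9408 3e99999a bd4ccccd ...
heads.weight 2048 3f800000 ...
```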
### Convert Environment
* Same as fastreid.
### How to Generate
This is a general example of converting a fastreid model to a TensorRT model. We use `FastRT` to build the network with the NVIDIA TensorRT APIs.
First convert the pytorch model to a '.wts' file using `gen_wts.py`, following the instructions below.
1. Run the command line below to generate the '.wts' file from the pytorch model.
The arguments work the same way as in fastreid.
```bash
python projects/FastRT/tools/gen_wts.py --config-file='config/you/use/in/fastreid/xxx.yml' \
--verify --show_model --wts_path='outputs/trt_model_file/xxx.wts' \
MODEL.WEIGHTS '/path/to/checkpoint_file/model_best.pth' MODEL.DEVICE "cuda:0"
```
2. Check the generated TensorRT weights file `outputs/trt_model_file/xxx.wts`.
3. Copy `outputs/trt_model_file/xxx.wts` to [FastRT](https://github.com/JDAI-CV/fast-reid/blob/master/projects/FastRT)
### More convert examples
+ Ex1. `sbs_R50-ibn`
- [x] resnet50, ibn, non-local, gempoolp
```bash
python projects/FastRT/tools/gen_wts.py --config-file='configs/DukeMTMC/sbs_R50-ibn.yml' \
--verify --show_model --wts_path='outputs/trt_model_file/sbs_R50-ibn.wts' \
MODEL.WEIGHTS '/path/to/checkpoint_file/model_best.pth' MODEL.DEVICE "cuda:0"
```
+ Ex2. `sbs_R50`
- [x] resnet50, gempoolp
```bash
python projects/FastRT/tools/gen_wts.py --config-file='configs/DukeMTMC/sbs_R50.yml' \
--verify --show_model --wts_path='outputs/trt_model_file/sbs_R50.wts' \
MODEL.WEIGHTS '/path/to/checkpoint_file/model_best.pth' MODEL.DEVICE "cuda:0"
```
+ Ex3. `sbs_r34_distill`
- [x] standalone-trained distill-r34 (note: the distill resnet variant differs slightly from plain resnet34), gempoolp
```bash
python projects/FastRT/tools/gen_wts.py --config-file='projects/FastDistill/configs/sbs_r34.yml' \
--verify --show_model --wts_path='outputs/to/trt_model_file/sbs_r34_distill.wts' \
MODEL.WEIGHTS '/path/to/checkpoint_file/model_best.pth' MODEL.DEVICE "cuda:0"
```
+ Ex4. `kd-r34-r101_ibn`
- [x] teacher model (r101_ibn), student model (distill-r34); the student is the one deployed, gempoolp
```bash
python projects/FastRT/tools/gen_wts.py --config-file='projects/FastDistill/configs/kd-sbs_r101ibn-sbs_r34.yml' \
--verify --show_model --wts_path='outputs/to/trt_model_file/kd_r34_distill.wts' \
MODEL.WEIGHTS '/path/to/checkpoint_file/model_best.pth' MODEL.DEVICE "cuda:0"
```
## Acknowledgements
Thanks to [tensorrtx](https://github.com/wang-xinyu/tensorrtx) for demonstrating the usage of trt network definition APIs.
# encoding: utf-8
import sys
import time
import struct
import argparse
sys.path.append('.')
import torch
import torchvision
#from torchsummary import summary
from fastreid.config import get_cfg
from fastreid.modeling.meta_arch import build_model
from fastreid.utils.checkpoint import Checkpointer
sys.path.append('./projects/FastDistill')
from fastdistill import *
def setup_cfg(args):
    # load config from file and command-line arguments
    cfg = get_cfg()
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    return cfg
def get_parser():
    parser = argparse.ArgumentParser(description="Encode pytorch weights for tensorrt.")
    parser.add_argument(
        "--config-file",
        metavar="FILE",
        help="path to config file",
    )
    parser.add_argument(
        "--wts_path",
        default='./trt_demo',
        help='path to save the tensorrt weights file (.wts)'
    )
    parser.add_argument(
        "--show_model",
        action='store_true',
        help='print model architecture'
    )
    parser.add_argument(
        "--verify",
        action='store_true',
        help='print model output for verification'
    )
    parser.add_argument(
        "--benchmark",
        action='store_true',
        help='measure preprocessing + inference time'
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line 'KEY VALUE' pairs",
        default=[],
        nargs=argparse.REMAINDER,
    )
    return parser
def gen_wts(model, args):
    """
    Serialize model weights in the .wts text format.
    Thanks to https://github.com/wang-xinyu/tensorrtx
    """
    print("Wait for it: {} ...".format(args.wts_path))
    with open(args.wts_path, 'w') as f:
        f.write("{}\n".format(len(model.state_dict().keys())))
        # one line per tensor: "<name> <num_floats>" followed by each float
        # encoded as 4 big-endian bytes in hex
        for k, v in model.state_dict().items():
            vr = v.reshape(-1).cpu().numpy()
            f.write("{} {}".format(k, len(vr)))
            for vv in vr:
                f.write(" ")
                f.write(struct.pack(">f", float(vv)).hex())
            f.write("\n")
if __name__ == '__main__':
    args = get_parser().parse_args()
    cfg = setup_cfg(args)
    cfg.MODEL.BACKBONE.PRETRAIN = False
    print("[Config]: \n", cfg)
    model = build_model(cfg)
    if args.show_model:
        print('[Model]: \n', model)
        #summary(model, (3, cfg.INPUT.SIZE_TEST[0], cfg.INPUT.SIZE_TEST[1]))
    print("Load model from: ", cfg.MODEL.WEIGHTS)
    Checkpointer(model).load(cfg.MODEL.WEIGHTS)
    model = model.to(cfg.MODEL.DEVICE)
    model.eval()
    if args.verify:
        input = torch.ones(1, 3, cfg.INPUT.SIZE_TEST[0], cfg.INPUT.SIZE_TEST[1]).to(cfg.MODEL.DEVICE) * 255.
        out = model(input).view(-1).cpu().detach().numpy()
        print('[Model output]: \n', out)
    if args.benchmark:
        input = torch.ones(1, 3, cfg.INPUT.SIZE_TEST[0], cfg.INPUT.SIZE_TEST[1]).to(cfg.MODEL.DEVICE) * 255.
        start_time = time.time()
        for i in range(100):
            out = model(input).view(-1).cpu().detach()
        print("--- %s seconds ---" % ((time.time() - start_time) / 100.))
    gen_wts(model, args)