提交RetinaFace C++示例

4d3d722b · Your Name · 18da8bf0 · 4d3d722b · 4d3d722b · 4d3d722b
Commit 4d3d722b authored May 29, 2023 by Your Name
20 changed files
--- a/weights/mobilenet0.25_Final.pth
+++ b/weights/mobilenet0.25_Final.pth
--- a/widerface_evaluate/README.md
+++ b/widerface_evaluate/README.md
--- a/widerface_evaluate/box_overlaps.pyx
+++ b/widerface_evaluate/box_overlaps.pyx
--- a/widerface_evaluate/evaluation.py
+++ b/widerface_evaluate/evaluation.py
--- a/widerface_evaluate/ground_truth/wider_easy_val.mat
+++ b/widerface_evaluate/ground_truth/wider_easy_val.mat
--- a/widerface_evaluate/ground_truth/wider_face_val.mat
+++ b/widerface_evaluate/ground_truth/wider_face_val.mat
--- a/widerface_evaluate/ground_truth/wider_hard_val.mat
+++ b/widerface_evaluate/ground_truth/wider_hard_val.mat
--- a/widerface_evaluate/ground_truth/wider_medium_val.mat
+++ b/widerface_evaluate/ground_truth/wider_medium_val.mat
--- a/widerface_evaluate/setup.py
+++ b/widerface_evaluate/setup.py
--- a/README.md
+++ b/README.md
@@ -12,35 +12,89 @@ RetinaFace模型 有几个主要特点：
 - 引入 SSH 算法的 Context Modeling；
 - 多任务训练，提供额外的监督信息。

-## 推理
+## 构建安装
+
+在光源可拉取推理的docker镜像，RetinaFace工程推荐的镜像如下：
+
+```python
+docker pull image.sourcefind.cn:5000/dcu/admin/base/custom:ort1.14.0_migraphx3.0.0-dtk22.10.1
+```
+
+### 安装Opencv依赖
+
+```python
+cd <path_to_migraphx_samples>
+sh ./3rdParty/InstallOpenCVDependences.sh
+```
+
+### 修改CMakeLists.txt

-### 环境配置
+- 如果使用ubuntu系统，需要修改CMakeLists.txt中依赖库路径：
+  将"${CMAKE_CURRENT_SOURCE_DIR}/depend/lib64/"修改为"${CMAKE_CURRENT_SOURCE_DIR}/depend/lib/"

-在[光源](https://www.sourcefind.cn/#/image/dcu/custom)可拉取用于推理的docker镜像，RetinaFace 模型推理推荐的镜像如下：
+- **MIGraphX2.3.0及以上版本需要c++17**
+
+
+### 安装OpenCV并构建工程

 ```
-docker pull image.sourcefind.cn:5000/dcu/admin/base/custom:ort_dcu_1.14.0_migraphx2.5.2_dtk22.10.1
+rbuild build -d depend
 ```

-在[光合开发者社区](https://cancon.hpccube.com:65024/4/main/)可下载MIGraphX安装包，python依赖安装：
+### 设置环境变量
+
+将依赖库依赖加入环境变量LD_LIBRARY_PATH，在~/.bashrc中添加如下语句：
+
+**Centos**:

 ```
-pip install -r requirements.txt
+export LD_LIBRARY_PATH=<path_to_migraphx_samples>/depend/lib64/:$LD_LIBRARY_PATH
+```
+
+**Ubuntu**:
+
 ```
+export LD_LIBRARY_PATH=<path_to_migraphx_samples>/depend/lib/:$LD_LIBRARY_PATH
+```
+
+然后执行:
+
+```
+source ~/.bashrc
+```
+
+## 推理
+
+### C++版本推理

-安装DTK版的Pytorch和torchvision，下载地址：https://cancon.hpccube.com:65024/4/main/pytorch，https://cancon.hpccube.com:65024/4/main/vision
+成功编译RetinaFace工程后，在build目录下输入如下命令运行该示例：

-### 运行示例
+```
+./MIGraphX_Samples 0
+```
+
+程序运行结束会在build目录生成RetinaFace人脸检测结果图像。
+
+<img src="./Resource/Images/Result_1.jpg" alt="Result" style="zoom:67%;" />

-RetinaFace模型的推理示例程序是RetinaFace_infer_migraphx.py，使用如下命令运行该推理示例：
+### python版本推理
+
+RetinaFace模型的推理示例程序是RetinaFace_infer_migraphx.py，进入python文件夹使用如下命令运行该推理示例：

 ```
+# 进入python示例目录
+cd ./Python
+
+# 安装依赖
+pip install -r requirements.txt
+
+# 运行程序
 python RetinaFace_infer_migraphx.py 
 ```

 程序运行结束会在当前目录生成RetinaFace检测结果图像。

-<img src="./curve/Result.jpg" alt="Result" style="zoom: 50%;" />
+<img src="./Resource/Images/Result_2.jpg" alt="Result_2" style="zoom:67%;" />

 ## 历史版本


--- a/Resource/Configuration.xml
+++ b/Resource/Configuration.xml
+<?xml version="1.0" encoding="GB2312"?>
+<opencv_storage>
+
+	<!--RetinaFace检测器-->
+	<DetectorRetinaFace>
+		<ModelPath>"../Resource/Models/Detector/RetinaFace/mobilenet0.25_Final.onnx"</ModelPath>
+        <Scale>1.0</Scale><!--缩放尺度-->
+		<MeanValue1>104</MeanValue1><!--均值，顺序为bgr-->
+		<MeanValue2>117</MeanValue2>
+		<MeanValue3>123</MeanValue3>
+		<SwapRB>0</SwapRB>
+		<Crop>0</Crop>
+		<UseInt8>0</UseInt8><!--是否使用int8,不支持-->
+		<UseFP16>0</UseFP16><!--是否使用FP16-->
+
+		<!--////////////////// RetinaFace检测器参数 ////////////////// -->
+		<!--priorbox层的个数-->
+		<PriorBoxLayerNumber>3</PriorBoxLayerNumber>
+
+		<!--每个priorbox层的minisize和maxSize(需要与输出检测层顺序保持一致，下面涉及每个priorbox层参数的都需要保持顺序一致)-->
+		<MinSize11>16</MinSize11>
+		<MinSize12>32</MinSize12>
+		<MinSize21>64</MinSize21>
+		<MinSize22>128</MinSize22>
+		<MinSize31>256</MinSize31>
+		<MinSize32>512</MinSize32>
+
+		<!--每个priorbox层的Flip和Clip(使用0,1表示)-->
+		<Flip1>0</Flip1>
+		<Flip2>0</Flip2>
+		<Flip3>0</Flip3>
+
+		<Clip1>0</Clip1>
+		<Clip2>0</Clip2>
+		<Clip3>0</Clip3>
+
+		<!--每个priorbox层的宽高比(由于RetinaFace只包含宽高比为1的anchor，所以这里不需要设置宽高比)-->
+		<!-- <AspectRatio11>0.3333</AspectRatio11>
+		<AspectRatio12>0.25</AspectRatio12>
+		<AspectRatio21>0.3333</AspectRatio21>
+		<AspectRatio22>0.25</AspectRatio22>
+		<AspectRatio31>0.3333</AspectRatio31>
+		<AspectRatio32>0.25</AspectRatio32>
+		<AspectRatio41>0.3333</AspectRatio41>
+		<AspectRatio42>0.25</AspectRatio42> -->
+
+		<!--每个priorbox层的step-->
+		<PriorBoxStepWidth1>8</PriorBoxStepWidth1><!--第一个priorbox层的step的width-->
+		<PriorBoxStepWidth2>16</PriorBoxStepWidth2>
+		<PriorBoxStepWidth3>32</PriorBoxStepWidth3>
+
+		<PriorBoxStepHeight1>8</PriorBoxStepHeight1><!--第一个priorbox层的step的height-->
+		<PriorBoxStepHeight2>16</PriorBoxStepHeight2>
+		<PriorBoxStepHeight3>32</PriorBoxStepHeight3>
+
+		<!--priorbox层中的offset-->
+		<Offset>0.5</Offset>
+
+		<!--DetectionOutput参数-->
+		<ClassNumber>2</ClassNumber>
+		<TopK>400</TopK>
+		<KeepTopK>200</KeepTopK>
+		<NMSThreshold>0.3</NMSThreshold>
+		<ConfidenceThreshold>0.9</ConfidenceThreshold>
+	</DetectorRetinaFace>
+</opencv_storage>
--- a/Resource/Images/FaceDetect.jpg
+++ b/Resource/Images/FaceDetect.jpg
--- a/Resource/Images/Result_1.jpg
+++ b/Resource/Images/Result_1.jpg
--- a/Resource/Images/Result_2.jpg
+++ b/Resource/Images/Result_2.jpg
--- a/Resource/Models/Detector/RetinaFace/mobilenet0.25_Final.onnx
+++ b/Resource/Models/Detector/RetinaFace/mobilenet0.25_Final.onnx
--- a/Src/RetinaFace/DetectorRetinaFace.cpp
+++ b/Src/RetinaFace/DetectorRetinaFace.cpp
+#include <DetectorRetinaFace.h>
+#include <migraphx/onnx.hpp>
+#include <migraphx/gpu/target.hpp>
+#include <migraphx/gpu/hip.hpp>
+#include <migraphx/generate.hpp>
+#include <migraphx/quantization.hpp>
+#include <opencv2/dnn.hpp>
+#include <CommonUtility.h>
+#include <Filesystem.h>
+#include <SimpleLog.h>
+
+using namespace cv::dnn;
+
+namespace migraphxSamples
+{
+
+#define SSD_QUANT_BASE     4096 // 基数
+#define SSD_COORDI_NUM     4  // 坐标个数(x1,y1,x2,y2)
+#define SSD_PROPOSAL_WIDTH 6
+#define SSD_HALF           0.5
+#define SSD_ASPECT_RATIO_NUM 6 // 默认最大的宽高比个数
+
+#define SSD_MAX(a,b)    (((a) > (b)) ? (a) : (b))
+#define SSD_MIN(a,b)    (((a) < (b)) ? (a) : (b))
+
+// 16字节对齐
+#define SSD_ALIGN_16 16
+#define SSD_ALIGN16(number) ((number + SSD_ALIGN_16-1) / SSD_ALIGN_16*SSD_ALIGN_16)
+
+DetectorRetinaFace::DetectorRetinaFace():logFile(NULL)
+{
+}
+
+DetectorRetinaFace::~DetectorRetinaFace()
+{
+    configurationFile.release();
+    
+    // 释放SSD参数的内存空间
+    delete[] ssdParameter.buffer;
+}
+
+ErrorCode DetectorRetinaFace::Initialize(InitializationParameterOfDetector initializationParameterOfDetector)
+{
+    // 初始化(获取日志文件,加载配置文件等)
+    ErrorCode errorCode=DoCommonInitialization(initializationParameterOfDetector);
+    if(errorCode!=SUCCESS)
+    {
+        LOG_ERROR(logFile,"fail to DoCommonInitialization\n");
+        return errorCode;
+    }
+    LOG_INFO(logFile,"succeed to DoCommonInitialization\n");
+    
+    // 获取配置文件参数
+    FileNode netNode = configurationFile["DetectorRetinaFace"];
+    string modelPath=initializationParameter.parentPath+(string)netNode["ModelPath"];
+    scale=(float)netNode["Scale"];
+    meanValue.val[0]=(float)netNode["MeanValue1"];
+    meanValue.val[1]=(float)netNode["MeanValue2"];
+    meanValue.val[2]=(float)netNode["MeanValue3"];
+    swapRB=(bool)(int)netNode["SwapRB"];
+    crop=(bool)(int)netNode["Crop"];
+    useInt8=(bool)(int)netNode["UseInt8"];
+    useFP16=(bool)(int)netNode["UseFP16"];
+
+    // 加载模型
+    if(Exists(modelPath)==false)
+    {
+        LOG_ERROR(logFile,"%s not exist!\n",modelPath.c_str());
+        return MODEL_NOT_EXIST;
+    }
+    net = migraphx::parse_onnx(modelPath);
+    LOG_INFO(logFile,"succeed to load model: %s\n",GetFileName(modelPath).c_str());
+
+    // 获取模型输入属性
+    std::pair<std::string, shape> inputAttribute=*(net.get_parameter_shapes().begin());
+    inputName=inputAttribute.first;
+    inputShape=inputAttribute.second;
+    inputSize=Size(inputShape.lens()[3],inputShape.lens()[2]);
+    
+    // 设置模型为GPU模式
+    migraphx::target gpuTarget = migraphx::gpu::target{};
+
+    // 量化
+    if(useInt8)
+    {
+        // 创建量化校准数据,建议使用测试集中的多张典型图像
+        cv::Mat srcImage=imread("../Resource/Images/FaceDetect_2.jpg",1);
+        std::vector<cv::Mat> srcImages;
+        for(int i=0;i<inputShape.lens()[0];++i)
+        {
+            srcImages.push_back(srcImage);
+        }
+        cv::Mat inputBlob;
+        blobFromImages(srcImages,
+                        inputBlob,
+                        scale,
+                        inputSize,
+                        meanValue,
+                        swapRB,
+                        false);
+        migraphx::parameter_map inputData;
+        inputData[inputName]= migraphx::argument{inputShape, (float*)inputBlob.data};
+        std::vector<migraphx::parameter_map> calibrationData = {inputData};
+
+        // INT8量化
+        migraphx::quantize_int8(net, gpuTarget, calibrationData);
+    }
+    if(useFP16)
+    {
+        migraphx::quantize_fp16(net);
+    }
+
+    // 编译模型
+    migraphx::compile_options options;
+    options.device_id=0; // 设置GPU设备，默认为0号设备
+    options.offload_copy=true; // 设置offload_copy
+    net.compile(gpuTarget,options);
+    LOG_INFO(logFile,"succeed to compile model: %s\n",GetFileName(modelPath).c_str());
+
+    // Run once by itself
+    migraphx::parameter_map inputData;
+    inputData[inputName]=migraphx::generate_argument(inputShape);
+    net.eval(inputData);
+
+    // log
+    LOG_INFO(logFile,"InputSize:%dx%d\n",inputSize.width,inputSize.height);
+    LOG_INFO(logFile,"InputName:%s\n",inputName.c_str());
+    LOG_INFO(logFile,"Scale:%.6f\n",scale);
+    LOG_INFO(logFile,"Mean:%.2f,%.2f,%.2f\n",meanValue.val[0],meanValue.val[1],meanValue.val[2]);
+    LOG_INFO(logFile,"SwapRB:%d\n",(int)swapRB);
+    LOG_INFO(logFile,"Crop:%d\n",(int)crop);
+    LOG_INFO(logFile,"UseInt8:%d\n",(int)useInt8);
+    LOG_INFO(logFile,"UseFP16:%d\n",(int)useFP16);
+
+    // 读取SSD 参数
+    GetSSDParameter();
+
+    return SUCCESS;
+
+}
+ErrorCode DetectorRetinaFace::DoCommonInitialization(InitializationParameterOfDetector initializationParameterOfDetector)
+{
+    initializationParameter=initializationParameterOfDetector;
+
+    // 获取日志文件
+    logFile=LogManager::GetInstance()->GetLogFile(initializationParameter.logName);
+
+    // 加载配置文件
+    std::string configFilePath=initializationParameter.configFilePath;
+    if(!Exists(configFilePath))
+    {
+        LOG_ERROR(logFile, "no configuration file!\n");
+        return CONFIG_FILE_NOT_EXIST;
+    }
+    if(!configurationFile.open(configFilePath, FileStorage::READ))
+    {
+       LOG_ERROR(logFile, "fail to open configuration file\n");
+       return FAIL_TO_OPEN_CONFIG_FILE;
+    }
+    LOG_INFO(logFile, "succeed to open configuration file\n");
+
+    // 修改父路径
+    std::string &parentPath = initializationParameter.parentPath;
+    if (!parentPath.empty())
+    {
+        if(!IsPathSeparator(parentPath[parentPath.size() - 1]))
+        {
+           parentPath+=PATH_SEPARATOR;
+        }
+    }
+
+    return SUCCESS;
+
+}
+
+ErrorCode DetectorRetinaFace::Detect(const cv::Mat &srcImage,std::vector<ResultOfDetection> &resultsOfDetection)
+{
+
+    if(srcImage.empty()||srcImage.type()!=CV_8UC3)
+    {
+        LOG_ERROR(logFile, "image error!\n");
+        return IMAGE_ERROR;
+    }
+
+    // 预处理并转换为NCHW
+    cv::Mat inputBlob;
+    blobFromImage(srcImage,
+                    inputBlob,
+                    scale,
+                    inputSize,
+                    meanValue,
+                    swapRB,
+                    false);
+ 
+    // 输入数据
+    migraphx::parameter_map inputData;
+    inputData[inputName]= migraphx::argument{inputShape, (float*)inputBlob.data};
+
+    // 推理
+    std::vector<migraphx::argument> inferenceResults=net.eval(inputData);
+    vector<vector<float>> regressions;
+    vector<vector<float>> classifications;
+    for(int i=0;i<ssdParameter.numberOfPriorBoxLayer;++i) // 执行Permute操作
+    {
+        int numberOfPriorBox=ssdParameter.detectInputChn[i]/(4*(ssdParameter.priorBoxHeight[i] * ssdParameter.priorBoxWidth[i]));
+
+        // BboxHead
+        std::vector<float> regression;
+        migraphx::argument result0  = inferenceResults[2*i]; 
+        result0.visit([&](auto output) { regression.assign(output.begin(), output.end()); });
+        regression=PermuteLayer(regression,ssdParameter.priorBoxWidth[i],ssdParameter.priorBoxHeight[i],numberOfPriorBox*4);
+        regressions.push_back(regression);
+        
+        // ClassHead
+        std::vector<float> classification;
+        migraphx::argument result1  = inferenceResults[2*i+1]; 
+        result1.visit([&](auto output) { classification.assign(output.begin(), output.end()); });
+        classification=PermuteLayer(classification,ssdParameter.priorBoxWidth[i],ssdParameter.priorBoxHeight[i],numberOfPriorBox*ssdParameter.classNum);
+        classifications.push_back(classification);
+    }
+
+    // 对推理结果进行处理，得到最后SSD检测的结果
+    GetResult(classifications,regressions,resultsOfDetection);
+
+    // 转换到原图坐标
+    for(int i=0;i<resultsOfDetection.size();++i)
+    {
+        float ratioOfWidth=(1.0*srcImage.cols)/inputSize.width;
+        float ratioOfHeight=(1.0*srcImage.rows)/inputSize.height;
+
+        resultsOfDetection[i].boundingBox.x*=ratioOfWidth;
+        resultsOfDetection[i].boundingBox.width*=ratioOfWidth;
+        resultsOfDetection[i].boundingBox.y*=ratioOfHeight;
+        resultsOfDetection[i].boundingBox.height*=ratioOfHeight;
+    }
+
+    // 按照置信度排序
+    sort(resultsOfDetection.begin(), resultsOfDetection.end(),CompareConfidence);
+
+    return SUCCESS;
+
+}
+
+void DetectorRetinaFace::GetSSDParameter()
+{
+    FileNode rootNode = configurationFile["DetectorRetinaFace"];
+    ssdParameter.numberOfPriorBoxLayer=(int)rootNode["PriorBoxLayerNumber"];
+
+    ssdParameter.srcImageHeight = inputSize.height;
+    ssdParameter.srcImageWidth = inputSize.width;
+
+    // MinSize,MaxSize
+    ssdParameter.priorBoxMinSize.resize(ssdParameter.numberOfPriorBoxLayer);
+    ssdParameter.priorBoxMaxSize.resize(ssdParameter.numberOfPriorBoxLayer);
+    for (int i = 0; i < ssdParameter.numberOfPriorBoxLayer; ++i)
+	{
+		char nodeName[256] = { 0 };
+
+		// miniSize
+        {
+            int j=0;
+            while(true)
+            {
+               sprintf(nodeName, "MinSize%d%d", (i + 1),++j);
+               FileNode miniSizeNode = rootNode[nodeName];
+               if(miniSizeNode.empty())
+               {
+                   break;
+               }
+               else
+               {
+                 ssdParameter.priorBoxMinSize[i].push_back((float)rootNode[nodeName]);
+               }
+            }
+        }
+
+		// maxSize
+        {
+            int j=0;
+            while(true)
+            {
+               sprintf(nodeName, "MaxSize%d%d", (i + 1),++j);
+               FileNode maxSizeNode = rootNode[nodeName];
+               if(maxSizeNode.empty())
+               {
+                   break;
+               }
+               else
+               {
+                 ssdParameter.priorBoxMaxSize[i].push_back((float)rootNode[nodeName]);
+               }
+            }
+        }
+	}
+
+    // MinSizeNumber,MaxSizeNumber
+    for (int i = 0; i < ssdParameter.numberOfPriorBoxLayer; ++i)
+    {
+        ssdParameter.minSizeNum[i] = ssdParameter.priorBoxMinSize[i].size();
+        ssdParameter.maxSizeNum[i] = ssdParameter.priorBoxMaxSize[i].size();;
+    }
+
+    // Flip,Clip
+    for (int i = 0; i < ssdParameter.numberOfPriorBoxLayer; ++i)
+	{
+		char nodeName[256] = { 0 };
+
+		// Flip
+		sprintf(nodeName, "Flip%d", i + 1);
+		int flip = (int)rootNode[nodeName];
+		ssdParameter.flip[i] = flip;
+
+		// Clip
+		sprintf(nodeName, "Clip%d", i + 1);
+		int clip = (int)rootNode[nodeName];
+		ssdParameter.clip[i] = clip;
+	}
+
+    // AspectRatio
+    ssdParameter.priorBoxAspectRatio.resize(ssdParameter.numberOfPriorBoxLayer);
+    for (int i = 0; i < ssdParameter.numberOfPriorBoxLayer; ++i)
+	{
+        char nodeName[256] = { 0 };
+        int j=0;
+        while(true)
+        {
+           sprintf(nodeName, "AspectRatio%d%d", (i + 1),++j);
+           FileNode aspectRatioNode = rootNode[nodeName];
+           if(aspectRatioNode.empty())
+           {
+               break;
+           }
+           else
+           {
+             ssdParameter.priorBoxAspectRatio[i].push_back((float)rootNode[nodeName]);
+           }
+        }
+    }
+
+    // aspect ratio number
+    for (int i = 0; i < ssdParameter.numberOfPriorBoxLayer; ++i)
+    {
+        ssdParameter.inputAspectRatioNum[i] = ssdParameter.priorBoxAspectRatio[i].size();
+    }
+
+    // PriorBoxStep
+    for (int i = 0; i < ssdParameter.numberOfPriorBoxLayer; ++i)
+	{
+		char nodeName[256] = { 0 };
+
+		// width
+		sprintf(nodeName, "PriorBoxStepWidth%d", i + 1);
+		int width = (int)rootNode[nodeName];
+		ssdParameter.priorBoxStepWidth[i] = width;
+
+		// height
+		sprintf(nodeName, "PriorBoxStepHeight%d", i + 1);
+		int height = (int)rootNode[nodeName];
+		ssdParameter.priorBoxStepHeight[i] = height;
+
+	}
+
+    // PriorBoxWidth,PriorBoxHeight
+    for (int i = 0; i < ssdParameter.numberOfPriorBoxLayer; ++i)
+    {
+        ssdParameter.priorBoxWidth[i] = ssdParameter.srcImageWidth/ssdParameter.priorBoxStepWidth[i];
+        ssdParameter.priorBoxHeight[i] = ssdParameter.srcImageHeight/ssdParameter.priorBoxStepHeight[i];
+    }
+
+    ssdParameter.offset = (float)rootNode["Offset"];
+
+    ssdParameter.priorBoxVar[0] = (int)(0.1f*SSD_QUANT_BASE);
+    ssdParameter.priorBoxVar[1] = (int)(0.1f*SSD_QUANT_BASE);
+    ssdParameter.priorBoxVar[2] = (int)(0.2f*SSD_QUANT_BASE);
+    ssdParameter.priorBoxVar[3] = (int)(0.2f*SSD_QUANT_BASE);
+
+    int classNumber = (int)rootNode["ClassNumber"];
+    ssdParameter.softMaxInHeight = classNumber;
+
+    ssdParameter.concatNum = ssdParameter.numberOfPriorBoxLayer;
+    ssdParameter.softMaxOutWidth = 1;
+    ssdParameter.softMaxOutHeight = classNumber;
+
+    int totalSizeOfClasReg=0;// 分类和回归一共需要的内存空间大小
+	for (int i = 0; i < ssdParameter.numberOfPriorBoxLayer; ++i)
+	{
+        int priorBoxNumber=0;
+        priorBoxNumber+=1;// aspect ratio=1
+        for (int j = 0; j < ssdParameter.inputAspectRatioNum[i]; j++)
+        {
+            ++priorBoxNumber;
+            if (ssdParameter.flip[j]==1)
+            {
+                ++priorBoxNumber;
+            }
+        }
+        priorBoxNumber = ssdParameter.minSizeNum[i] * priorBoxNumber + ssdParameter.maxSizeNum[i];
+
+		int totalPriorBoxNumber = priorBoxNumber*ssdParameter.priorBoxHeight[i] * ssdParameter.priorBoxWidth[i];
+		ssdParameter.softMaxInChn[i] = totalPriorBoxNumber * classNumber;
+		ssdParameter.softMaxOutChn += totalPriorBoxNumber;
+		ssdParameter.detectInputChn[i] = totalPriorBoxNumber * 4;
+
+        totalSizeOfClasReg+=(ssdParameter.softMaxInChn[i]+ssdParameter.detectInputChn[i]);
+
+	}
+
+    // DetectionOut
+    ssdParameter.classNum = classNumber;
+    ssdParameter.topK = (int)rootNode["TopK"];;
+    ssdParameter.keepTopK = (int)rootNode["KeepTopK"];
+    ssdParameter.NMSThresh = (int)((float)rootNode["NMSThreshold"]* SSD_QUANT_BASE);
+    ssdParameter.confThresh=(int)((float)rootNode["ConfidenceThreshold"]*SSD_QUANT_BASE);
+    
+    for (int i = 0; i < ssdParameter.numberOfPriorBoxLayer ; i++)
+    {
+        int numberOfPriorBox=ssdParameter.detectInputChn[i]/(4*(ssdParameter.priorBoxHeight[i] * ssdParameter.priorBoxWidth[i]));
+
+        ssdParameter.convHeight[2*i]=ssdParameter.priorBoxHeight[i];
+        ssdParameter.convWidth[2*i]=ssdParameter.priorBoxWidth[i];
+        ssdParameter.convChannel[2*i]=numberOfPriorBox*4;
+
+        ssdParameter.convHeight[2*i+1]=ssdParameter.priorBoxHeight[i];
+        ssdParameter.convWidth[2*i+1]=ssdParameter.priorBoxWidth[i];
+        ssdParameter.convChannel[2*i+1]=numberOfPriorBox*ssdParameter.classNum;
+
+        ssdParameter.convStride[i] = SSD_ALIGN16(ssdParameter.convChannel[2*i+1] * sizeof(int)) / sizeof(int);
+    }
+
+    // 计算softMaxOutputData内存空间大小
+    int softMaxSize=0;
+    for(int i = 0; i < ssdParameter.concatNum; i++)
+    {
+        softMaxSize += ssdParameter.softMaxInChn[i];
+    }
+
+    // 计算getResultBuffer内存空间大小
+    int priorNum = 0;
+    int detectionSize = 0;
+    for(int i = 0; i < ssdParameter.concatNum; i++)
+    {
+        priorNum+=ssdParameter.detectInputChn[i]/SSD_COORDI_NUM;
+    }
+    detectionSize+=priorNum*SSD_COORDI_NUM;
+    detectionSize+=priorNum*SSD_PROPOSAL_WIDTH*2;
+    detectionSize+=priorNum*2;
+
+    // 计算dstRoi,classRoiNum,dstScore内存空间大小
+    int dstRoiSize = 0;
+    int dstScoreSize = 0;
+    int classRoiNumSize = 0;
+    dstRoiSize = SSD_ALIGN16(ssdParameter.classNum*ssdParameter.topK*SSD_COORDI_NUM);
+    dstScoreSize = SSD_ALIGN16(ssdParameter.classNum*ssdParameter.topK);
+    classRoiNumSize = SSD_ALIGN16(ssdParameter.classNum);
+
+    // 申请内存，并分配
+    int totalSize=totalSizeOfClasReg+SSD_COORDI_NUM*2*ssdParameter.softMaxOutChn+softMaxSize+detectionSize+dstRoiSize+classRoiNumSize+dstScoreSize;
+    ssdParameter.buffer=new int[totalSize];
+    int *data=ssdParameter.buffer;
+    memset(data,0,totalSize*sizeof(int));// 初始化0
+    int offset=0;
+    for (int i = 0; i < ssdParameter.numberOfPriorBoxLayer; ++i)
+	{
+        
+        int *dataOfClasReg=data+offset;
+        ssdParameter.classification[i]=dataOfClasReg;
+        ssdParameter.regression[i]=dataOfClasReg+ssdParameter.softMaxInChn[i];
+
+        offset+=(ssdParameter.softMaxInChn[i]+ssdParameter.detectInputChn[i]);
+
+    }
+    ssdParameter.priorboxOutputData=data+totalSizeOfClasReg;
+    ssdParameter.softMaxOutputData=ssdParameter.priorboxOutputData+SSD_COORDI_NUM*2*ssdParameter.softMaxOutChn;
+    ssdParameter.getResultBuffer=ssdParameter.softMaxOutputData+softMaxSize;
+    ssdParameter.dstRoi=ssdParameter.getResultBuffer+detectionSize;
+    ssdParameter.classRoiNum=ssdParameter.dstRoi+dstRoiSize;
+    ssdParameter.dstScore=ssdParameter.classRoiNum+classRoiNumSize;
+
+}
+
+void DetectorRetinaFace::GetResult(const vector<vector<float>> &classifications,const vector<vector<float>> &regressions,vector<ResultOfDetection> &resultsOfDetection)
+{
+    int numberOfPriorBoxLayer=ssdParameter.numberOfPriorBoxLayer;
+
+    // 类型转换
+    for(int i = 0; i < numberOfPriorBoxLayer; i++)
+    {
+        // 分类
+        vector<float> classificationOfEachLayer=classifications[i];
+        for(int j=0;j<classificationOfEachLayer.size();++j)
+        {
+            (ssdParameter.classification[i])[j]=classificationOfEachLayer[j]*SSD_QUANT_BASE;
+        }
+
+        // 回归
+        vector<float> regressionOfEachLayer=regressions[i];
+        for(int j=0;j<regressionOfEachLayer.size();++j)
+        {
+            (ssdParameter.regression[i])[j]=regressionOfEachLayer[j]*SSD_QUANT_BASE;
+        }
+
+    }
+
+    int* priorboxOutputData[SSD_MAX_PRIORBOX_LAYER_NUM];
+    int* softMaxInputData[SSD_MAX_PRIORBOX_LAYER_NUM];
+    int* detectionLocData[SSD_MAX_PRIORBOX_LAYER_NUM];
+    int* softMaxOutputData = NULL;
+    int* detectionOutTmpBuf = NULL;
+    int  softMaxWidth[SSD_MAX_PRIORBOX_LAYER_NUM];
+    int size = 0;
+    int i = 0;
+
+    /////////////////////////////////// PriorBoxLayer：生成所有priorbox ///////////////////////////////////
+    // 分配priorboxOutputData内存空间
+    priorboxOutputData[0] = ssdParameter.priorboxOutputData;
+    for (i = 1; i < numberOfPriorBoxLayer; i++)
+    {
+        size=ssdParameter.softMaxInChn[i-1]/ssdParameter.classNum*SSD_COORDI_NUM*2;
+        priorboxOutputData[i] = priorboxOutputData[i - 1] + size;
+    }
+    for (i = 0; i < numberOfPriorBoxLayer; i++)
+    {
+        PriorBoxLayer(i,priorboxOutputData[i]);
+    }
+
+    /////////////////////////////////// SoftmaxLayer：计算所有priorbox的置信度 ///////////////////////////////////
+    // 分配softMaxOutputData内存空间
+    softMaxOutputData =ssdParameter.softMaxOutputData;
+    for(i = 0; i < numberOfPriorBoxLayer; i++)
+    {
+        softMaxInputData[i] = ssdParameter.classification[i];
+        softMaxWidth[i] = ssdParameter.convChannel[i*2+1];
+    }
+    SoftmaxLayer(softMaxWidth,softMaxInputData, softMaxOutputData);
+
+    /////////////////////////////////// DetectionOutputLayer：对网络输出值解码并经过NMS得到最后的检测结果 ///////////////////////////////////
+    // 分配DetectionOut内存空间
+    detectionOutTmpBuf = ssdParameter.getResultBuffer;
+    for(i = 0; i < numberOfPriorBoxLayer; i++)
+    {
+        detectionLocData[i] = ssdParameter.regression[i];
+    }
+    DetectionOutputLayer(detectionLocData, priorboxOutputData, softMaxOutputData,detectionOutTmpBuf);
+
+    // 获取最后的检测结果
+    CreateDetectionResults(resultsOfDetection);
+}
+
+
+void DetectorRetinaFace::PriorBoxLayer(int indexOfLayer,int* priorboxOutputData)
+{
+    // 参数赋值
+    int priorBoxWidth=ssdParameter.priorBoxWidth[indexOfLayer];
+    int priorBoxHeight=ssdParameter.priorBoxHeight[indexOfLayer];
+    int srcImageWidth=ssdParameter.srcImageWidth;
+    int srcImageHeight=ssdParameter.srcImageHeight;
+    vector<float> priorBoxMinSize=ssdParameter.priorBoxMinSize[indexOfLayer];
+    int minSizeNum=ssdParameter.minSizeNum[indexOfLayer];
+    vector<float> priorBoxMaxSize=ssdParameter.priorBoxMaxSize[indexOfLayer];
+    int maxSizeNum=ssdParameter.maxSizeNum[indexOfLayer];
+    int flip=ssdParameter.flip[indexOfLayer];
+    int clip=ssdParameter.clip[indexOfLayer];
+    int inputAspectRatioNum=ssdParameter.inputAspectRatioNum[indexOfLayer];
+    vector<float> priorBoxAspectRatio=ssdParameter.priorBoxAspectRatio[indexOfLayer];
+    float priorBoxStepWidth=ssdParameter.priorBoxStepWidth[indexOfLayer];
+    float priorBoxStepHeight= ssdParameter.priorBoxStepHeight[indexOfLayer];
+    float offset=ssdParameter.offset;
+    int *priorBoxVar=ssdParameter.priorBoxVar;
+
+    int aspectRatioNum = 0;
+    int index = 0;
+    float aspectRatio[SSD_ASPECT_RATIO_NUM] = { 0 };
+    int numPrior = 0;
+    float centerX = 0;
+    float centerY = 0;
+    float boxHeight = 0;
+    float boxWidth = 0;
+    float maxBoxWidth = 0;
+    int i = 0;
+    int j = 0;
+    int n = 0;
+    int h = 0;
+    int w = 0;
+
+    aspectRatioNum = 0;
+    aspectRatio[0] = 1;
+    aspectRatioNum++;
+    for (i = 0; i < inputAspectRatioNum; i++)
+    {
+        aspectRatio[aspectRatioNum++] = priorBoxAspectRatio[i];
+        if (flip)
+        {
+            aspectRatio[aspectRatioNum++] = 1.0f / priorBoxAspectRatio[i];
+        }
+    }
+    numPrior = minSizeNum * aspectRatioNum + maxSizeNum;
+
+    index = 0;
+    for (h = 0; h < priorBoxHeight; h++)
+    {
+        for (w = 0; w < priorBoxWidth; w++)
+        {
+            centerX = (w + offset) * priorBoxStepWidth;
+            centerY = (h + offset) * priorBoxStepHeight;
+            for (n = 0; n < minSizeNum; n++)
+            {
+                // 首先产生宽高比为1的priorbox
+                boxHeight = priorBoxMinSize[n];
+                boxWidth = priorBoxMinSize[n];
+                priorboxOutputData[index++] = (int)(centerX - boxWidth * SSD_HALF);
+                priorboxOutputData[index++] = (int)(centerY - boxHeight * SSD_HALF);
+                priorboxOutputData[index++] = (int)(centerX + boxWidth * SSD_HALF);
+                priorboxOutputData[index++] = (int)(centerY + boxHeight * SSD_HALF);
+
+                // 对于max_size,生成宽高比为1的priorbox,宽高为sqrt(min_size * max_size)
+                if(maxSizeNum>0)
+                {
+                    maxBoxWidth = sqrt(priorBoxMinSize[n] * priorBoxMaxSize[n]);
+                    boxHeight = maxBoxWidth;
+                    boxWidth = maxBoxWidth;
+                    priorboxOutputData[index++] = (int)(centerX - boxWidth * SSD_HALF);
+                    priorboxOutputData[index++] = (int)(centerY - boxHeight * SSD_HALF);
+                    priorboxOutputData[index++] = (int)(centerX + boxWidth * SSD_HALF);
+                    priorboxOutputData[index++] = (int)(centerY + boxHeight * SSD_HALF);
+                }
+
+                // 剩下的priorbox
+                for (i = 1; i < aspectRatioNum; i++)
+                {
+                    boxWidth = (float)(priorBoxMinSize[n] * sqrt( aspectRatio[i] ));
+                    boxHeight = (float)(priorBoxMinSize[n]/sqrt( aspectRatio[i] ));
+
+                    priorboxOutputData[index++] = (int)(centerX - boxWidth * SSD_HALF);
+                    priorboxOutputData[index++] = (int)(centerY - boxHeight * SSD_HALF);
+                    priorboxOutputData[index++] = (int)(centerX + boxWidth * SSD_HALF);
+                    priorboxOutputData[index++] = (int)(centerY + boxHeight * SSD_HALF);
+                }
+            }
+        }
+    }
+
+    // 越界处理 [0, srcImageWidth] & [0, srcImageHeight] 
+    if (clip)
+    {
+        for (i = 0; i < (int)(priorBoxWidth * priorBoxHeight * SSD_COORDI_NUM*numPrior / 2); i++)
+        {
+            priorboxOutputData[2 * i] = SSD_MIN((int)SSD_MAX(priorboxOutputData[2 * i], 0), srcImageWidth);
+            priorboxOutputData[2 * i + 1] = SSD_MIN((int)SSD_MAX(priorboxOutputData[2 * i + 1], 0), srcImageHeight);
+        }
+    }
+    // var
+    for (h = 0; h < priorBoxHeight; h++)
+    {
+        for (w = 0; w < priorBoxWidth; w++)
+        {
+            for (i = 0; i < numPrior; i++)
+            {
+                for (j = 0; j < SSD_COORDI_NUM; j++)
+                {
+                    priorboxOutputData[index++] = (int)priorBoxVar[j];
+                }
+            }
+        }
+    }
+}
+
+void DetectorRetinaFace::SoftmaxLayer(int softMaxWidth[],int* softMaxInputData[], int* softMaxOutputData)
+{
+
+    // 参数赋值
+    int softMaxInHeight=ssdParameter.softMaxInHeight;
+    int *softMaxInChn=ssdParameter.softMaxInChn;
+    int concatNum=ssdParameter.concatNum;
+    int *convStride=ssdParameter.convStride;
+
+    int* inputData = NULL;
+    int* outputTmp = NULL;
+    int outerNum = 0;
+    int innerNum = 0;
+    int inputChannel = 0;
+    int i = 0;
+    int concatCnt = 0;
+    int stride = 0;
+    int skip = 0;
+    int left = 0;
+    outputTmp = softMaxOutputData;
+    for (concatCnt = 0; concatCnt < concatNum; concatCnt++)
+    {
+        inputData = softMaxInputData[concatCnt];
+        stride = convStride[concatCnt];
+        inputChannel = softMaxInChn[concatCnt];
+        outerNum = inputChannel / softMaxInHeight;
+        innerNum = softMaxInHeight;
+        skip = softMaxWidth[concatCnt] / innerNum;
+        left = stride - softMaxWidth[concatCnt];
+        for (i = 0; i < outerNum; i++)
+        {
+            ComputeSoftMax(inputData, (int)innerNum,outputTmp);
+            inputData += innerNum;
+            outputTmp += innerNum;
+        }
+    }
+}
+
+void DetectorRetinaFace::ComputeSoftMax(int* src, int size, int* dst)
+{
+    int max = 0;
+    int sum = 0;
+    int i = 0;
+    for (i = 0; i < size; ++i)
+    {
+        if (max < src[i])
+        {
+            max = src[i];
+        }
+    }
+    for (i = 0; i < size; ++i)
+    {
+        dst[i] = (int)(SSD_QUANT_BASE* exp((float)(src[i] - max) / SSD_QUANT_BASE));
+        sum += dst[i];
+    }
+    for (i = 0; i < size; ++i)
+    {
+        dst[i] = (int)(((float)dst[i] / (float)sum) * SSD_QUANT_BASE);
+    }
+
+}
+
+void DetectorRetinaFace::DetectionOutputLayer(int* allLocPreds[], int* allPriorBoxes[],int* confScores, int* assistMemPool)
+{
+
+    // 参数赋值
+    int concatNum=ssdParameter.concatNum;
+    int confThresh=ssdParameter.confThresh;
+    int classNum=ssdParameter.classNum;
+    int topK=ssdParameter.topK;
+    int keepTopK=ssdParameter.keepTopK;
+    int NMSThresh=ssdParameter.NMSThresh;
+    int *detectInputChn=ssdParameter.detectInputChn;
+    int* dstScoreSrc=ssdParameter.dstScore;
+    int* dstBboxSrc=ssdParameter.dstRoi;
+    int* roiOutCntSrc=ssdParameter.classRoiNum;
+
+    int* locPreds = NULL;
+    int* priorBoxes = NULL;
+    int* priorVar = NULL;
+    int* allDecodeBoxes = NULL;
+    int* dstScore = NULL;
+    int* dstBbox = NULL;
+    int* classRoiNum = NULL;
+    int roiOutCnt = 0;
+    int* singleProposal = NULL;
+    int* afterTopK = NULL;
+    QuickSortStack* stack = NULL;
+    int priorNum = 0;
+    int numPredsPerClass = 0;
+    float priorWidth = 0;
+    float priorHeight = 0;
+    float priorCenterX = 0;
+    float priorCenterY = 0;
+    float decodeBoxCenterX = 0;
+    float decodeBoxCenterY = 0;
+    float decodeBoxWidth = 0;
+    float decodeBoxHeight = 0;
+    int srcIdx = 0;
+    int afterFilter = 0;
+    int afterTopK2 = 0;
+    int keepCnt = 0;
+    int i = 0;
+    int j = 0;
+    int offset = 0;
+    priorNum = 0;
+    for (i = 0; i < concatNum; i++)
+    {
+        priorNum += detectInputChn[i] / SSD_COORDI_NUM;
+    }
+
+    // 缓存
+    allDecodeBoxes = assistMemPool;
+    singleProposal = allDecodeBoxes + priorNum * SSD_COORDI_NUM;
+    afterTopK = singleProposal + SSD_PROPOSAL_WIDTH * priorNum;
+    stack = (QuickSortStack*)(afterTopK + priorNum * SSD_PROPOSAL_WIDTH);
+    srcIdx = 0;
+    for (i = 0; i < concatNum; i++)
+    {
+        // 回归预测值
+        locPreds = allLocPreds[i];
+        numPredsPerClass = detectInputChn[i] / SSD_COORDI_NUM;
+
+        // 获取priorbox
+        priorBoxes = allPriorBoxes[i];
+        priorVar = priorBoxes + numPredsPerClass*SSD_COORDI_NUM;
+        for (j = 0; j < numPredsPerClass; j++)
+        {
+            priorWidth = (float)(priorBoxes[j*SSD_COORDI_NUM+2] - priorBoxes[j*SSD_COORDI_NUM]);
+            priorHeight = (float)(priorBoxes[j*SSD_COORDI_NUM+3] - priorBoxes[j*SSD_COORDI_NUM + 1]);
+            priorCenterX = (priorBoxes[j*SSD_COORDI_NUM+2] + priorBoxes[j*SSD_COORDI_NUM])*SSD_HALF;
+            priorCenterY = (priorBoxes[j*SSD_COORDI_NUM+3] + priorBoxes[j*SSD_COORDI_NUM+1])*SSD_HALF;
+
+            decodeBoxCenterX = ((float)priorVar[j*SSD_COORDI_NUM]/SSD_QUANT_BASE)*
+                ((float)locPreds[j*SSD_COORDI_NUM]/SSD_QUANT_BASE)*priorWidth+priorCenterX;
+
+            decodeBoxCenterY = ((float)priorVar[j*SSD_COORDI_NUM+1]/SSD_QUANT_BASE)*
+                ((float)locPreds[j*SSD_COORDI_NUM+1]/SSD_QUANT_BASE)*priorHeight+priorCenterY;
+
+            decodeBoxWidth = exp(((float)priorVar[j*SSD_COORDI_NUM+2]/SSD_QUANT_BASE)*
+                ((float)locPreds[j*SSD_COORDI_NUM+2]/SSD_QUANT_BASE))*priorWidth;
+
+            decodeBoxHeight = exp(((float)priorVar[j*SSD_COORDI_NUM+3]/SSD_QUANT_BASE)*
+                ((float)locPreds[j*SSD_COORDI_NUM+3]/SSD_QUANT_BASE))*priorHeight;
+
+            allDecodeBoxes[srcIdx++] = (int)(decodeBoxCenterX - decodeBoxWidth * SSD_HALF);
+            allDecodeBoxes[srcIdx++] = (int)(decodeBoxCenterY - decodeBoxHeight * SSD_HALF);
+            allDecodeBoxes[srcIdx++] = (int)(decodeBoxCenterX + decodeBoxWidth * SSD_HALF);
+            allDecodeBoxes[srcIdx++] = (int)(decodeBoxCenterY + decodeBoxHeight * SSD_HALF);
+        }
+    }
+
+    // 对每一类做NMS
+    afterTopK2 = 0;
+    for (i = 0; i < classNum; i++)
+    {
+        if(i==0)
+            continue;
+
+        for (j = 0; j < priorNum; j++)
+        {
+            singleProposal[j * SSD_PROPOSAL_WIDTH] = allDecodeBoxes[j * SSD_COORDI_NUM];
+            singleProposal[j * SSD_PROPOSAL_WIDTH + 1] = allDecodeBoxes[j * SSD_COORDI_NUM + 1];
+            singleProposal[j * SSD_PROPOSAL_WIDTH + 2] = allDecodeBoxes[j * SSD_COORDI_NUM + 2];
+            singleProposal[j * SSD_PROPOSAL_WIDTH + 3] = allDecodeBoxes[j * SSD_COORDI_NUM + 3];
+            singleProposal[j * SSD_PROPOSAL_WIDTH + 4] = confScores[j*classNum + i];
+            singleProposal[j * SSD_PROPOSAL_WIDTH + 5] = 0;
+        }
+        QuickSort(singleProposal, 0, priorNum - 1, stack,topK);
+        afterFilter = (priorNum < topK) ? priorNum : topK;
+        NonMaxSuppression(singleProposal, afterFilter, NMSThresh, afterFilter);
+        roiOutCnt = 0;
+        dstScore = (int*)dstScoreSrc;
+        dstBbox = (int*)dstBboxSrc;
+        classRoiNum = (int*)roiOutCntSrc;
+        dstScore += (int)afterTopK2;
+        dstBbox += (int)(afterTopK2 * SSD_COORDI_NUM);
+        for (j = 0; j < topK; j++)
+        {
+            if (singleProposal[j * SSD_PROPOSAL_WIDTH + 5] == 0 &&
+                singleProposal[j * SSD_PROPOSAL_WIDTH + 4] > (int)confThresh)
+            {
+                dstScore[roiOutCnt] = singleProposal[j * 6 + 4];
+                dstBbox[roiOutCnt * SSD_COORDI_NUM] = singleProposal[j * SSD_PROPOSAL_WIDTH];
+                dstBbox[roiOutCnt * SSD_COORDI_NUM + 1] = singleProposal[j * SSD_PROPOSAL_WIDTH + 1];
+                dstBbox[roiOutCnt * SSD_COORDI_NUM + 2] = singleProposal[j * SSD_PROPOSAL_WIDTH + 2];
+                dstBbox[roiOutCnt * SSD_COORDI_NUM + 3] = singleProposal[j * SSD_PROPOSAL_WIDTH + 3];
+                roiOutCnt++;
+            }
+        }
+        classRoiNum[i] = (int)roiOutCnt;
+        afterTopK2 += roiOutCnt;
+    }
+
+    keepCnt = 0;
+    offset = 0;
+    if (afterTopK2 > keepTopK)
+    {
+        offset = classRoiNum[0];
+        for (i = 1; i < classNum; i++)
+        {
+            dstScore = (int*)dstScoreSrc;
+            dstBbox = (int*)dstBboxSrc;
+            classRoiNum = (int*)roiOutCntSrc;
+            dstScore += (int)(offset);
+            dstBbox += (int)(offset * SSD_COORDI_NUM);
+            for (j = 0; j < (int)classRoiNum[i]; j++)
+            {
+                afterTopK[keepCnt * SSD_PROPOSAL_WIDTH] = dstBbox[j * SSD_COORDI_NUM];
+                afterTopK[keepCnt * SSD_PROPOSAL_WIDTH + 1] = dstBbox[j * SSD_COORDI_NUM + 1];
+                afterTopK[keepCnt * SSD_PROPOSAL_WIDTH + 2] = dstBbox[j * SSD_COORDI_NUM + 2];
+                afterTopK[keepCnt * SSD_PROPOSAL_WIDTH + 3] = dstBbox[j * SSD_COORDI_NUM + 3];
+                afterTopK[keepCnt * SSD_PROPOSAL_WIDTH + 4] = dstScore[j];
+                afterTopK[keepCnt * SSD_PROPOSAL_WIDTH + 5] = i;
+                keepCnt++;
+            }
+            offset = offset + classRoiNum[i];
+        }
+        QuickSort(afterTopK, 0, keepCnt - 1, stack,keepCnt);
+
+        offset = 0;
+        offset = classRoiNum[0];
+        for (i = 1; i < classNum; i++)
+        {
+            roiOutCnt = 0;
+            dstScore = (int*)dstScoreSrc;
+            dstBbox = (int*)dstBboxSrc;
+            classRoiNum = (int*)roiOutCntSrc;
+            dstScore += (int)(offset);
+            dstBbox += (int)(offset * SSD_COORDI_NUM);
+            for (j = 0; j < keepTopK; j++)
+            {
+                if (afterTopK[j * SSD_PROPOSAL_WIDTH + 5] == i)
+                {
+                    dstScore[roiOutCnt] = afterTopK[j * SSD_PROPOSAL_WIDTH + 4];
+                    dstBbox[roiOutCnt * SSD_COORDI_NUM] = afterTopK[j * SSD_PROPOSAL_WIDTH];
+                    dstBbox[roiOutCnt * SSD_COORDI_NUM + 1] = afterTopK[j * SSD_PROPOSAL_WIDTH + 1];
+                    dstBbox[roiOutCnt * SSD_COORDI_NUM + 2] = afterTopK[j * SSD_PROPOSAL_WIDTH + 2];
+                    dstBbox[roiOutCnt * SSD_COORDI_NUM + 3] = afterTopK[j * SSD_PROPOSAL_WIDTH + 3];
+                    roiOutCnt++;
+                }
+            }
+            classRoiNum[i] = (int)roiOutCnt;
+            offset += roiOutCnt;
+        }
+    }
+}
+
+vector<float> DetectorRetinaFace::PermuteLayer(const vector<float> &data,int width,int height,int channels)
+{
+    vector<float> result(data.size());
+    int index=0;
+    int channelStep=width*height;
+    for(int h=0; h<height;h++)
+    {
+        for(int w=0;w<width;w++)
+        {
+            for(int c = 0;c < channels;c++)
+            {
+                result[index++] = data[c*channelStep + h*width + w];
+            }
+        }
+    }
+    return result;
+
+}
+
+void DetectorRetinaFace::QuickSort(int* src,int low, int high, QuickSortStack *stack,int maxNum)
+{
+    int i = low;
+    int j = high;
+    int top = 0;
+    int keyConfidence = src[SSD_PROPOSAL_WIDTH * low + 4];
+    stack[top].min = low;
+    stack[top].max = high;
+
+    while(top > -1)
+    {
+        low = stack[top].min;
+        high = stack[top].max;
+        i = low;
+        j = high;
+        top--;
+
+        keyConfidence = src[SSD_PROPOSAL_WIDTH * low + 4];
+
+        while(i < j)
+        {
+            while((i < j) && (keyConfidence > src[j * SSD_PROPOSAL_WIDTH + 4]))
+            {
+                j--;
+            }
+            if(i < j)
+            {
+                Swap(&src[i*SSD_PROPOSAL_WIDTH], &src[j*SSD_PROPOSAL_WIDTH]);
+                i++;
+            }
+
+            while((i < j) && (keyConfidence < src[i*SSD_PROPOSAL_WIDTH + 4]))
+            {
+                i++;
+            }
+            if(i < j)
+            {
+                Swap(&src[i*SSD_PROPOSAL_WIDTH], &src[j*SSD_PROPOSAL_WIDTH]);
+                j--;
+            }
+        }
+
+        if(low <= maxNum)
+        {
+                if(low < i-1)
+                {
+                    top++;
+                    stack[top].min = low;
+                    stack[top].max = i-1;
+                }
+
+                if(high > i+1)
+                {
+                    top++;
+                    stack[top].min = i+1;
+                    stack[top].max = high;
+                }
+        }
+    }
+}
+
+void DetectorRetinaFace::NonMaxSuppression( int* proposals, int anchorsNum,int NMSThresh,int maxRoiNum)
+{
+    int xMin1 = 0;
+    int yMin1 = 0;
+    int xMax1 = 0;
+    int yMax1 = 0;
+    int xMin2 = 0;
+    int yMin2 = 0;
+    int xMax2 = 0;
+    int yMax2 = 0;
+    int areaTotal = 0;
+    int areaInter = 0;
+    int i = 0;
+    int j = 0;
+    int num = 0;
+    int NoOverlap  = 1;
+    for (i = 0; i < anchorsNum && num < maxRoiNum; i++)
+    {
+        if( proposals[SSD_PROPOSAL_WIDTH*i+5] == 0 )
+        {
+            num++;
+            xMin1 =  proposals[SSD_PROPOSAL_WIDTH*i];
+            yMin1 =  proposals[SSD_PROPOSAL_WIDTH*i+1];
+            xMax1 =  proposals[SSD_PROPOSAL_WIDTH*i+2];
+            yMax1 =  proposals[SSD_PROPOSAL_WIDTH*i+3];
+            for(j= i+1;j< anchorsNum; j++)
+            {
+                if( proposals[SSD_PROPOSAL_WIDTH*j+5] == 0 )
+                {
+                    xMin2 = proposals[SSD_PROPOSAL_WIDTH*j];
+                    yMin2 = proposals[SSD_PROPOSAL_WIDTH*j+1];
+                    xMax2 = proposals[SSD_PROPOSAL_WIDTH*j+2];
+                    yMax2 = proposals[SSD_PROPOSAL_WIDTH*j+3];
+                    NoOverlap = (xMin2>xMax1)||(xMax2<xMin1)||(yMin2>yMax1)||(yMax2<yMin1);
+                    if(NoOverlap)
+                    {
+                        continue;
+                    }
+                    ComputeOverlap(xMin1, yMin1, xMax1, yMax1, xMin2, yMin2, xMax2, yMax2, &areaTotal, &areaInter);
+                    if(areaInter*SSD_QUANT_BASE > ((int)NMSThresh*areaTotal))
+                    {
+                        if( proposals[SSD_PROPOSAL_WIDTH*i+4] >= proposals[SSD_PROPOSAL_WIDTH*j+4] )
+                        {
+                            proposals[SSD_PROPOSAL_WIDTH*j+5] = 1;
+                        }
+                        else
+                        {
+                            proposals[SSD_PROPOSAL_WIDTH*i+5] = 1;
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+void DetectorRetinaFace::ComputeOverlap(int xMin1, int yMin1, int xMax1, int yMax1, int xMin2,
+    int yMin2, int xMax2, int yMax2,  int* areaSum, int* areaInter)
+{
+    
+    int inter = 0;
+    int s32Total = 0;
+    int xMin = 0;
+    int yMin = 0;
+    int xMax = 0;
+    int yMax = 0;
+    int area1 = 0;
+    int area2 = 0;
+    int interWidth = 0;
+    int interHeight = 0;
+
+    xMin = SSD_MAX(xMin1, xMin2);
+    yMin = SSD_MAX(yMin1, yMin2);
+    xMax = SSD_MIN(xMax1, xMax2);
+    yMax = SSD_MIN(yMax1, yMax2);
+
+    interWidth = xMax - xMin + 1;
+    interHeight = yMax - yMin + 1;
+
+    interWidth = ( interWidth >= 0 ) ? interWidth : 0;
+    interHeight = ( interHeight >= 0 ) ? interHeight : 0;
+
+    inter = interWidth * interHeight;
+    area1 = (xMax1 - xMin1 + 1) * (yMax1 - yMin1 + 1);
+    area2 = (xMax2 - xMin2 + 1) * (yMax2 - yMin2 + 1);
+
+    s32Total = area1 + area2 - inter;
+
+    *areaSum = s32Total;
+    *areaInter = inter;
+}
+
+void DetectorRetinaFace::Swap(int* src1, int* src2)
+{
+    int i = 0;
+    int temp = 0;
+    for( i = 0; i < SSD_PROPOSAL_WIDTH; i++ )
+    {
+        temp = src1[i];
+        src1[i] = src2[i];
+        src2[i] = temp;
+    }
+}
+
+void DetectorRetinaFace::CreateDetectionResults(std::vector<ResultOfDetection> &resultsOfDetection)
+{
+    // 参数赋值
+    int* score=ssdParameter.dstScore;
+    int* roi=ssdParameter.dstRoi;
+    int* classRoiNum=ssdParameter.classRoiNum;
+    float printResultThresh=((float)ssdParameter.confThresh)/SSD_QUANT_BASE;
+    int classNum=ssdParameter.classNum;
+
+    int i = 0, j = 0;
+    int roiNumBias = 0;
+    int scoreBias = 0;
+    int bboxBias = 0;
+    float score2 = 0.0f;
+    int xMin = 0,yMin= 0,xMax = 0,yMax = 0;
+
+    roiNumBias += classRoiNum[0];
+    for (i = 1; i < classNum; i++)
+    {
+        scoreBias = roiNumBias;
+        bboxBias = roiNumBias * SSD_COORDI_NUM;
+
+        if((float)score[scoreBias] / SSD_QUANT_BASE >=
+            printResultThresh && classRoiNum[i]!=0)
+        {
+            //printf("==== The %d th class box info====\n", i);
+        }
+        for (j = 0; j < (int)classRoiNum[i]; j++)
+        {
+            score2 = (float)score[scoreBias + j] / SSD_QUANT_BASE;
+            if (score2 < printResultThresh)
+            {
+                break;
+            }
+            xMin = roi[bboxBias + j*SSD_COORDI_NUM];
+            yMin = roi[bboxBias + j*SSD_COORDI_NUM + 1];
+            xMax = roi[bboxBias + j*SSD_COORDI_NUM + 2];
+            yMax = roi[bboxBias + j*SSD_COORDI_NUM + 3];
+
+            ResultOfDetection result;
+            result.boundingBox.x=xMin;
+            result.boundingBox.y=yMin;
+            result.boundingBox.width=xMax-xMin+1;
+            result.boundingBox.height=yMax-yMin+1;
+            result.classID=i;
+            result.confidence=score2;
+            resultsOfDetection.push_back(result);
+        }
+        roiNumBias += classRoiNum[i];
+    }
+}
+
+
+
+}
--- a/Src/RetinaFace/DetectorRetinaFace.h
+++ b/Src/RetinaFace/DetectorRetinaFace.h
+#ifndef __DETECTOR_RETINAFACE_H__
+#define __DETECTOR_RETINAFACE_H__
+
+#include <string>
+#include <migraphx/program.hpp>
+#include <opencv2/opencv.hpp>
+#include <CommonDefinition.h>
+#include <SSDDefinition.h>
+
+using namespace std;
+using namespace cv;
+using namespace migraphx;
+
+namespace migraphxSamples
+{
+
+class DetectorRetinaFace
+{
+public:
+    DetectorRetinaFace();
+    
+    ~DetectorRetinaFace();
+
+    ErrorCode Initialize(InitializationParameterOfDetector initializationParameterOfDetector);
+
+    ErrorCode Detect(const cv::Mat &srcImage,std::vector<ResultOfDetection> &resultsOfDetection);
+
+private:
+    ErrorCode DoCommonInitialization(InitializationParameterOfDetector initializationParameterOfDetector);
+    void GetSSDParameter();
+    void GetResult(const std::vector<std::vector<float>> &classification,const std::vector<std::vector<float>> &regression,std::vector<ResultOfDetection> &resultsOfDetection);
+
+    std::vector<float> PermuteLayer(const std::vector<float> &data,int width,int height,int channels);
+    void PriorBoxLayer(int indexOfLayer,int* priorboxOutputData);
+    void SoftmaxLayer(int softMaxWidth[],int* softMaxInputData[], int* softMaxOutputData);
+    void DetectionOutputLayer(int* allLocPreds[], int* allPriorBoxes[],int* confScores, int* assistMemPool);
+
+    void ComputeSoftMax(int* src, int size, int* dst);
+    void QuickSort(int* src,int low, int high, QuickSortStack *stack,int maxNum);
+    void NonMaxSuppression( int* proposals, int anchorsNum,int NMSThresh,int maxRoiNum);
+    void Swap(int* src1, int* src2);
+    void ComputeOverlap(int xMin1, int yMin1, int xMax1, int yMax1, int xMin2,int yMin2, int xMax2, int yMax2,  int* areaSum, int* areaInter);
+    void CreateDetectionResults(std::vector<ResultOfDetection> &resultsOfDetection);
+
+private:
+    cv::FileStorage configurationFile;
+    InitializationParameterOfDetector initializationParameter;
+    FILE *logFile;
+    
+    migraphx::program net;
+    cv::Size inputSize;
+    string inputName;
+    migraphx::shape inputShape;
+
+    float scale;
+    cv::Scalar meanValue;
+    bool swapRB;
+    bool crop;
+    bool useInt8;
+    bool useFP16;
+
+    SSDParameter ssdParameter;
+
+};
+
+}
+
+#endif
+
--- a/Src/RetinaFace/SSDDefinition.h
+++ b/Src/RetinaFace/SSDDefinition.h
+// SSD定义
+
+#ifndef __SSD_DEFINITION_H__
+#define __SSD_DEFINITION_H__
+
+#include <string>
+#include <vector>
+
+using namespace std;
+
+namespace migraphxSamples
+{
+
+#define SSD_MAX_PRIORBOX_LAYER_NUM      10 // 能够支持的最大检测层数量
+
+// SSD参数
+typedef struct _SSDParameter
+{
+    int numberOfPriorBoxLayer; // 检测层数量
+
+    // Model Parameters
+    int convHeight[SSD_MAX_PRIORBOX_LAYER_NUM*2];
+    int convWidth[SSD_MAX_PRIORBOX_LAYER_NUM*2];
+    int convChannel[SSD_MAX_PRIORBOX_LAYER_NUM*2];
+
+    // PriorBoxLayer Parameters
+    int priorBoxWidth[SSD_MAX_PRIORBOX_LAYER_NUM]; // 每个检测层priorbox的宽
+    int priorBoxHeight[SSD_MAX_PRIORBOX_LAYER_NUM];// 每个检测层priorbox的高
+    std::vector<std::vector<float>> priorBoxMinSize; // 每个检测层priorbox的minsize
+    std::vector<std::vector<float>> priorBoxMaxSize; // 每个检测层priorbox的maxsize
+    int minSizeNum[SSD_MAX_PRIORBOX_LAYER_NUM]; // 每个检测层priorbox的minsize数量
+    int maxSizeNum[SSD_MAX_PRIORBOX_LAYER_NUM]; // 每个检测层priorbox的maxsize数量
+    int srcImageHeight;// 原图大小
+    int srcImageWidth;
+    int inputAspectRatioNum[SSD_MAX_PRIORBOX_LAYER_NUM];// 每个检测层宽高比的数量 
+    std::vector<std::vector<float>> priorBoxAspectRatio;// 每个检测层的宽高比
+    float priorBoxStepWidth[SSD_MAX_PRIORBOX_LAYER_NUM];// 每个检测层步长的宽
+    float priorBoxStepHeight[SSD_MAX_PRIORBOX_LAYER_NUM];// 每个检测层步长的高
+    float offset;
+    int flip[SSD_MAX_PRIORBOX_LAYER_NUM];
+    int clip[SSD_MAX_PRIORBOX_LAYER_NUM];
+    int priorBoxVar[4];
+
+    // SoftmaxLayer Parameters
+    int softMaxInChn[SSD_MAX_PRIORBOX_LAYER_NUM];
+    int softMaxInHeight;
+    int concatNum;
+    int softMaxOutWidth;
+    int softMaxOutHeight;
+    int softMaxOutChn;
+
+    // DetectionOutLayer Parameters
+    int classNum;// 类别数(包含背景类)
+    int topK;
+    int keepTopK;
+    int NMSThresh;
+    int confThresh;
+    int detectInputChn[SSD_MAX_PRIORBOX_LAYER_NUM];
+    int convStride[SSD_MAX_PRIORBOX_LAYER_NUM];
+
+    // buffer
+    int *buffer;
+    int *classification[SSD_MAX_PRIORBOX_LAYER_NUM];// 分类数据
+    int *regression[SSD_MAX_PRIORBOX_LAYER_NUM];// 回归
+    int *priorboxOutputData;
+    int *softMaxOutputData;
+    int *getResultBuffer;
+    int *dstScore;
+    int *dstRoi;
+    int *classRoiNum;
+    _SSDParameter():srcImageHeight(0),
+                    srcImageWidth(0),
+                    offset(0.0),
+                    softMaxInHeight(0),
+                    concatNum(0),
+                    softMaxOutWidth(0),
+                    softMaxOutHeight(0),
+                    softMaxOutChn(0),
+                    buffer(NULL),
+                    priorboxOutputData(NULL),
+                    softMaxOutputData(NULL),
+                    getResultBuffer(NULL),
+                    dstScore(NULL),
+                    dstRoi(NULL),
+                    classRoiNum(NULL){}
+}SSDParameter;
+
+typedef struct _QuickSortStack
+{
+    int min;
+    int max;
+}QuickSortStack;
+
+}
+
+#endif
+
--- a/Src/Sample.cpp
+++ b/Src/Sample.cpp
+#include <Sample.h>
+#include <opencv2/dnn.hpp>
+#include <SimpleLog.h>
+#include <Filesystem.h>
+#include <DetectorRetinaFace.h>
+#include <fstream>
+
+
+using namespace std;
+using namespace cv;
+using namespace cv::dnn;
+using namespace migraphx;
+using namespace migraphxSamples;
+
+
+void Sample_DetectorRetinaFace()
+{
+    // 创建RetinaFace检测器
+    DetectorRetinaFace detector;
+    InitializationParameterOfDetector initParamOfDetectorRetinaFace;
+    initParamOfDetectorRetinaFace.parentPath = "";
+    initParamOfDetectorRetinaFace.configFilePath = CONFIG_FILE;
+    initParamOfDetectorRetinaFace.logName = "";
+    ErrorCode errorCode=detector.Initialize(initParamOfDetectorRetinaFace);
+    if(errorCode!=SUCCESS)
+    {
+        LOG_ERROR(stdout, "fail to initialize detector!\n");
+        exit(-1);
+    }
+    LOG_INFO(stdout, "succeed to initialize detector\n");
+
+    // 读取测试图片
+    Mat srcImage=imread("../Resource/Images/FaceDetect.jpg",1);
+
+    // 推理
+    std::vector<ResultOfDetection> predictions;
+    double time1 = getTickCount();
+    detector.Detect(srcImage,predictions);
+    double time2 = getTickCount();
+    double elapsedTime = (time2 - time1)*1000 / getTickFrequency();
+    LOG_INFO(stdout, "inference time:%f ms\n", elapsedTime);
+
+    // 获取推理结果
+    LOG_INFO(stdout,"========== Detection Results ==========\n");
+    for(int i=0;i<predictions.size();++i)
+    {
+        ResultOfDetection result=predictions[i];
+        cv::rectangle(srcImage,result.boundingBox,Scalar(0,255,255),2);
+        
+        LOG_INFO(stdout,"box:%d %d %d %d,label:%d,confidence:%f\n",predictions[i].boundingBox.x,
+        predictions[i].boundingBox.y,predictions[i].boundingBox.width,predictions[i].boundingBox.height,predictions[i].classID,predictions[i].confidence);
+    }
+    imwrite("Result.jpg",srcImage);
+    LOG_INFO(stdout,"Detection results have been saved to ./Result.jpg\n");
+}
\ No newline at end of file
--- a/Src/Sample.h
+++ b/Src/Sample.h
+// 示例程序
+
+#ifndef __SAMPLE_H__
+#define __SAMPLE_H__
+
+// RetinaFace sample
+void Sample_DetectorRetinaFace();
+
+#endif
\ No newline at end of file