支持GPU(cuda)

c08438d8 · benjaminwan · 1587a9f5 · c08438d8 · c08438d8 · c08438d8
Commit c08438d8 authored Oct 19, 2022 by benjaminwan
20 changed files
--- a/.gitignore
+++ b/.gitignore
@@ -136,7 +136,17 @@ cmake-build-debug/
 build/
 build-lib/
 /models/*.onnx
-onnxruntime-shared/
+# A file cannot be re-included while its parent directory is ignored, so each
+# platform directory is re-included first and its contents are filtered again.
+onnxruntime-gpu/*
+!onnxruntime-gpu/README.md
+!onnxruntime-gpu/OnnxRuntimeWrapper.cmake
+!onnxruntime-gpu/windows-x64/
+onnxruntime-gpu/windows-x64/*
+!onnxruntime-gpu/windows-x64/OnnxRuntimeConfig.cmake
+!onnxruntime-gpu/linux/
+onnxruntime-gpu/linux/*
+!onnxruntime-gpu/linux/OnnxRuntimeConfig.cmake
 onnxruntime-static/*
 !/onnxruntime-static/OnnxRuntimeWrapper.cmake
 opencv-static/*

--- a/BUILD.md
+++ b/BUILD.md
@@ -8,7 +8,7 @@
 * 把压缩包解压到项目根目录，windows平台需要注意目录层次，解压后目录结构如下
 * windows平台分为mt和md版，mt代表静态链接CRT，md代表动态链接CRT
 ```
-OcrLiteNcnn/opencv-static
+opencv-static
 ├── OpenCVWrapperConfig.cmake
 ├── linux
 ├── macos
@@ -24,7 +24,7 @@ OcrLiteNcnn/opencv-static
 * 把压缩包解压到项目根目录，windows平台需要注意目录层次，解压后目录结构如下
 * windows平台分为mt和md版，mt代表静态链接CRT，md代表动态链接CRT
 ```
-OcrLiteNcnn/onnxruntime-static
+onnxruntime-static
 ├── OnnxRuntimeWrapper.cmake
 ├── linux
 ├── macos

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -16,6 +16,12 @@ option(OCR_BENCHMARK "build benchmark" ON)
 set(OCR_BENCHMARK ON)
 #set(OCR_OUTPUT "BIN")

+if (NOT DEFINED OCR_ONNX)
+    set(OCR_ONNX "CPU")
+    message(STATUS "No OCR_ONNX, defaulting to CPU")
+endif ()
+#set(OCR_ONNX "CUDA")
+
 set(CMAKE_CXX_STANDARD 11)
 add_definitions(-DUNICODE -D_UNICODE)
 if (CMAKE_BUILD_TYPE STREQUAL "Debug")
@@ -25,7 +31,13 @@ else ()
 endif ()

 # OnnxRuntime
-include(${CMAKE_CURRENT_SOURCE_DIR}/onnxruntime-static/OnnxRuntimeWrapper.cmake)
+if (OCR_ONNX STREQUAL "CPU")
+    include(${CMAKE_CURRENT_SOURCE_DIR}/onnxruntime-static/OnnxRuntimeWrapper.cmake)
+elseif (OCR_ONNX STREQUAL "CUDA") # CUDA
+    include(${CMAKE_CURRENT_SOURCE_DIR}/onnxruntime-gpu/OnnxRuntimeWrapper.cmake)
+else () # neither wrapper is included for any other value, so stop here with a clear message
+    message(FATAL_ERROR "Unsupported OCR_ONNX: \"${OCR_ONNX}\", expected \"CPU\" or \"CUDA\"")
+endif ()
 find_package(OnnxRuntime REQUIRED)
 if (OnnxRuntime_FOUND)
    message(STATUS "OnnxRuntime_LIBS: ${OnnxRuntime_LIBS}")
@@ -85,6 +95,10 @@ if (OCR_OUTPUT STREQUAL "CLIB") # CLIB
    install(FILES ${OCR_INCLUDE} DESTINATION include)
 endif ()

+if (OCR_ONNX STREQUAL "CUDA")
+    target_compile_definitions(RapidOcrOnnx PRIVATE __CUDA__)
+endif ()
+
 # benchmark
 if (OCR_BENCHMARK AND (OCR_OUTPUT STREQUAL "BIN"))
    add_executable(benchmark benchmark/benchmark.cpp
@@ -98,6 +112,9 @@ if (OCR_BENCHMARK AND (OCR_OUTPUT STREQUAL "BIN"))
    target_link_libraries(benchmark ${OnnxRuntime_LIBS} ${OpenCV_LIBS})
    target_compile_definitions(benchmark PRIVATE __EXEC__)

+    if (OCR_ONNX STREQUAL "CUDA")
+        target_compile_definitions(benchmark PRIVATE __CUDA__)
+    endif ()
    install(TARGETS benchmark EXPORT benchmark
            ARCHIVE DESTINATION staticlib
            LIBRARY DESTINATION sharedlib

--- a/benchmark/benchmark.cpp
+++ b/benchmark/benchmark.cpp
@@ -43,10 +43,11 @@ int main(int argc, char **argv) {
    int flagDoAngle = 1;
    bool mostAngle = true;
    int flagMostAngle = 1;
+    int flagGpu = -1;

    int opt;
    int optionIndex = 0;
-    while ((opt = getopt_long(argc, argv, "d:1:2:3:4:i:t:p:s:b:o:u:a:A:v:h:l", long_options, &optionIndex)) != -1) {
+    while ((opt = getopt_long(argc, argv, "d:1:2:3:4:i:t:p:s:b:o:u:a:A:G:vhl:", long_options, &optionIndex)) != -1) { // -v/-h are flags (no_argument); -l reads optarg, so it needs ':'
        //printf("option(-%c)=%s\n", opt, optarg);
        switch (opt) {
            case 'd':
@@ -123,6 +124,9 @@ int main(int argc, char **argv) {
            case 'h':
                printHelp(stdout, argv[0]);
                return 0;
+            case 'G':
+                flagGpu = (int) strtol(optarg, NULL, 10);
+                break;
            case 'l':
                loopCount = (int) strtol(optarg, NULL, 10);
                //printf("loopCount=%d\n", loopCount);
@@ -164,9 +168,11 @@ int main(int argc, char **argv) {
            false);//isOutputResultImg

    //ocrLite.enableResultTxt(imgDir.c_str(), imgName.c_str());
+    ocrLite.setGpuIndex(flagGpu);
    printf("=====Input Params=====\n");
-    printf("numThread(%d),padding(%d),maxSideLen(%d),boxScoreThresh(%f),boxThresh(%f),unClipRatio(%f),doAngle(%d),mostAngle(%d)\n",
-            numThread, padding, maxSideLen, boxScoreThresh, boxThresh, unClipRatio, doAngle, mostAngle);
+    printf("numThread(%d),padding(%d),maxSideLen(%d),boxScoreThresh(%f),boxThresh(%f),unClipRatio(%f),doAngle(%d),mostAngle(%d),GPU(%d)\n",
+            numThread, padding, maxSideLen, boxScoreThresh, boxThresh, unClipRatio, doAngle, mostAngle,
+            flagGpu);
    bool initModelsRet = ocrLite.initModels(modelDetPath, modelClsPath, modelRecPath, keysPath);
    if (!initModelsRet) return -1;
    printf("=====Warmup 2 cycles=====\n");

--- a/build-default.bat
+++ b/build-default.bat
@@ -3,56 +3,62 @@ chcp 65001
 cls
 @SETLOCAL

-mkdir win-BIN-x64
-pushd win-BIN-x64
+mkdir win-BIN-CPU-x64
+pushd win-BIN-CPU-x64
 cmake -T "v142,host=x64" -A "x64" ^
  -DCMAKE_INSTALL_PREFIX=install ^
-  -DCMAKE_BUILD_TYPE=Release -DOCR_OUTPUT="BIN" -DOCR_BUILD_CRT="True" ..
+  -DCMAKE_BUILD_TYPE=Release -DOCR_OUTPUT="BIN" ^
+  -DOCR_BUILD_CRT="True" -DOCR_ONNX="CPU" ..
 cmake --build . --config Release -j %NUMBER_OF_PROCESSORS%
 cmake --build . --config Release --target install
 popd

-mkdir win-BIN-Win32
-pushd win-BIN-Win32
+mkdir win-BIN-CPU-Win32
+pushd win-BIN-CPU-Win32
 cmake -T "v142,host=x64" -A "Win32" ^
  -DCMAKE_INSTALL_PREFIX=install ^
-  -DCMAKE_BUILD_TYPE=Release -DOCR_OUTPUT="BIN" -DOCR_BUILD_CRT="True" ..
+  -DCMAKE_BUILD_TYPE=Release -DOCR_OUTPUT="BIN" ^
+  -DOCR_BUILD_CRT="True" -DOCR_ONNX="CPU" ..
 cmake --build . --config Release -j %NUMBER_OF_PROCESSORS%
 cmake --build . --config Release --target install
 popd

-mkdir win-JNI-x64
-pushd win-JNI-x64
+mkdir win-JNI-CPU-x64
+pushd win-JNI-CPU-x64
 cmake -T "v142,host=x64" -A "x64" ^
  -DCMAKE_INSTALL_PREFIX=install ^
-  -DCMAKE_BUILD_TYPE=Release -DOCR_OUTPUT="JNI" -DOCR_BUILD_CRT="True" ..
+  -DCMAKE_BUILD_TYPE=Release -DOCR_OUTPUT="JNI" ^
+  -DOCR_BUILD_CRT="True" -DOCR_ONNX="CPU" ..
 cmake --build . --config Release -j %NUMBER_OF_PROCESSORS%
 cmake --build . --config Release --target install
 popd

-mkdir win-JNI-Win32
-pushd win-JNI-Win32
+mkdir win-JNI-CPU-Win32
+pushd win-JNI-CPU-Win32
 cmake -T "v142,host=x64" -A "Win32" ^
  -DCMAKE_INSTALL_PREFIX=install ^
-  -DCMAKE_BUILD_TYPE=Release -DOCR_OUTPUT="JNI" -DOCR_BUILD_CRT="True" ..
+  -DCMAKE_BUILD_TYPE=Release -DOCR_OUTPUT="JNI" ^
+  -DOCR_BUILD_CRT="True" -DOCR_ONNX="CPU" ..
 cmake --build . --config Release -j %NUMBER_OF_PROCESSORS%
 cmake --build . --config Release --target install
 popd

-mkdir win-CLIB-x64
-pushd win-CLIB-x64
+mkdir win-CLIB-CPU-x64
+pushd win-CLIB-CPU-x64
 cmake -T "v142,host=x64" -A "x64" ^
  -DCMAKE_INSTALL_PREFIX=install ^
-  -DCMAKE_BUILD_TYPE=Release -DOCR_OUTPUT="CLIB" -DOCR_BUILD_CRT="True" ..
+  -DCMAKE_BUILD_TYPE=Release -DOCR_OUTPUT="CLIB" ^
+  -DOCR_BUILD_CRT="True" -DOCR_ONNX="CPU" ..
 cmake --build . --config Release -j %NUMBER_OF_PROCESSORS%
 cmake --build . --config Release --target install
 popd

-mkdir win-CLIB-Win32
-pushd win-CLIB-Win32
+mkdir win-CLIB-CPU-Win32
+pushd win-CLIB-CPU-Win32
 cmake -T "v142,host=x64" -A "Win32" ^
  -DCMAKE_INSTALL_PREFIX=install ^
-  -DCMAKE_BUILD_TYPE=Release -DOCR_OUTPUT="CLIB" -DOCR_BUILD_CRT="True" ..
+  -DCMAKE_BUILD_TYPE=Release -DOCR_OUTPUT="CLIB" ^
+  -DOCR_BUILD_CRT="True" -DOCR_ONNX="CPU" ..
 cmake --build . --config Release -j %NUMBER_OF_PROCESSORS%
 cmake --build . --config Release --target install
 popd

--- a/build-default.sh
+++ b/build-default.sh
@@ -12,26 +12,23 @@ else
  echo "Other OS: $sysOS"
 fi

-mkdir -p ${sysOS}-BIN
-pushd ${sysOS}-BIN
-cmake -DCMAKE_INSTALL_PREFIX=install -DCMAKE_BUILD_TYPE=Release -DOCR_OUTPUT="BIN" ..
+mkdir -p ${sysOS}-BIN-CPU
+pushd ${sysOS}-BIN-CPU
+cmake -DCMAKE_INSTALL_PREFIX=install -DCMAKE_BUILD_TYPE=Release -DOCR_OUTPUT="BIN" -DOCR_ONNX="CPU" ..
 cmake --build . --config Release -j $NUM_THREADS
 cmake --build . --config Release --target install
 popd

-
-mkdir -p ${sysOS}-JNI
-pushd ${sysOS}-JNI
-cmake -DCMAKE_INSTALL_PREFIX=install -DCMAKE_BUILD_TYPE=Release -DOCR_OUTPUT="JNI" ..
+mkdir -p ${sysOS}-JNI-CPU
+pushd ${sysOS}-JNI-CPU
+cmake -DCMAKE_INSTALL_PREFIX=install -DCMAKE_BUILD_TYPE=Release -DOCR_OUTPUT="JNI" -DOCR_ONNX="CPU" ..
 cmake --build . --config Release -j $NUM_THREADS
 cmake --build . --config Release --target install
 popd

-
-mkdir -p ${sysOS}-CLIB
-pushd ${sysOS}-CLIB
-cmake -DCMAKE_INSTALL_PREFIX=install -DCMAKE_BUILD_TYPE=Release -DOCR_OUTPUT="CLIB" ..
+mkdir -p ${sysOS}-CLIB-CPU
+pushd ${sysOS}-CLIB-CPU
+cmake -DCMAKE_INSTALL_PREFIX=install -DCMAKE_BUILD_TYPE=Release -DOCR_OUTPUT="CLIB" -DOCR_ONNX="CPU" ..
 cmake --build . --config Release -j $NUM_THREADS
 cmake --build . --config Release --target install
-popd
-
+popd
\ No newline at end of file
--- a/build.bat
+++ b/build.bat
@@ -33,6 +33,14 @@ if %flag% == 1 (
 else (set MT_ENABLED="False")
 echo.

+echo "onnxruntime: 1)CPU(默认), 2)GPU(cuda)"
+echo "注意：范例工程默认集成CPU版，CUDA版仅支持x64且需下载"
+set "flag=1" & set /p flag=
+if %flag% == 1 (set ONNX_TYPE="CPU")^
+else if %flag% == 2 (set ONNX_TYPE="CUDA")^
+else (echo 输入错误！Input Error! & set ONNX_TYPE="CPU")
+echo.
+
 echo "VS版本: 1)vs2019-x64, 2)vs2019-x86"
 set BUILD_CMAKE_T="v142"
 set BUILD_CMAKE_A="x64"
@@ -48,13 +56,13 @@ else if %flag% == 2 (
 else (echo 输入错误！Input Error!)
 echo.

-mkdir win-%BUILD_OUTPUT%-%BUILD_CMAKE_A%
-pushd win-%BUILD_OUTPUT%-%BUILD_CMAKE_A%
+mkdir win-%BUILD_OUTPUT%-%ONNX_TYPE%-%BUILD_CMAKE_A%
+pushd win-%BUILD_OUTPUT%-%ONNX_TYPE%-%BUILD_CMAKE_A%

 cmake -T "%BUILD_CMAKE_T%,host=x64" -A %BUILD_CMAKE_A% ^
  -DCMAKE_INSTALL_PREFIX=install ^
  -DCMAKE_BUILD_TYPE=%BUILD_TYPE% -DOCR_OUTPUT=%BUILD_OUTPUT% ^
-  -DOCR_BUILD_CRT=%MT_ENABLED% ..
+  -DOCR_BUILD_CRT=%MT_ENABLED% -DOCR_ONNX=%ONNX_TYPE% ..
 cmake --build . --config %BUILD_TYPE% -j %NUMBER_OF_PROCESSORS%
 cmake --build . --config %BUILD_TYPE% --target install


--- a/build.sh
+++ b/build.sh
@@ -14,8 +14,8 @@ else
  echo -e "输入错误！Input Error!"
 fi

-echo "请注意：如果选择2)JNI动态库时，必须安装配置Oracle JDK"
 echo "请选择编译输出类型并回车: 1)BIN可执行文件，2)JNI动态库，3)C动态库"
+echo "请注意：如果选择2)JNI动态库时，必须安装配置Oracle JDK"
 read -p "" BUILD_OUTPUT
 if [ $BUILD_OUTPUT == 1 ]; then
  BUILD_OUTPUT="BIN"
@@ -27,6 +27,17 @@ else
  echo -e "输入错误！Input Error!"
 fi

+echo "onnxruntime: 1)CPU(默认), 2)GPU(cuda)"
+echo "注意：范例工程默认集成CPU版，CUDA版仅支持Linux64且需下载"
+read -p "" ONNX_TYPE
+case "$ONNX_TYPE" in # empty input selects the advertised default (CPU)
+  1 | "") ONNX_TYPE="CPU" ;;
+  2) ONNX_TYPE="CUDA" ;;
+  *)
+    echo -e "输入错误！Input Error!"
+    ONNX_TYPE="CPU" ;; # fall back to CPU so the directory name and -DOCR_ONNX stay valid
+esac
+
 sysOS=$(uname -s)
 NUM_THREADS=1
 if [ $sysOS == "Darwin" ]; then
@@ -39,11 +50,11 @@ else
  echo "Other OS: $sysOS"
 fi

-mkdir -p $sysOS-$BUILD_OUTPUT
-pushd $sysOS-$BUILD_OUTPUT
+mkdir -p $sysOS-$BUILD_OUTPUT-$ONNX_TYPE # same <OS>-<OUTPUT>-<ONNX> layout as build-default.sh
+pushd $sysOS-$BUILD_OUTPUT-$ONNX_TYPE

-echo "cmake -DCMAKE_INSTALL_PREFIX=install -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DOCR_OUTPUT=$BUILD_OUTPUT .."
-cmake -DCMAKE_INSTALL_PREFIX=install -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DOCR_OUTPUT=$BUILD_OUTPUT ..
+echo "cmake -DCMAKE_INSTALL_PREFIX=install -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DOCR_OUTPUT=$BUILD_OUTPUT -DOCR_ONNX=$ONNX_TYPE .."
+cmake -DCMAKE_INSTALL_PREFIX=install -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DOCR_OUTPUT=$BUILD_OUTPUT -DOCR_ONNX=$ONNX_TYPE ..
 cmake --build . --config $BUILD_TYPE -j $NUM_THREADS
 cmake --build . --config $BUILD_TYPE --target install
 popd
--- a/generate-vs-project.bat
+++ b/generate-vs-project.bat
@@ -33,6 +33,14 @@ if %flag% == 1 (
 else (set MT_ENABLED="False")
 echo.

+echo "onnxruntime: 1)CPU(默认), 2)GPU(cuda)"
+echo "注意：范例工程默认集成CPU版，CUDA版仅支持x64且需下载"
+set "flag=1" & set /p flag=
+if %flag% == 1 (set ONNX_TYPE="CPU")^
+else if %flag% == 2 (set ONNX_TYPE="CUDA")^
+else (echo 输入错误！Input Error! & set ONNX_TYPE="CPU")
+echo.
+
 echo "请输入选项并回车: 0)ALL, 1)vs2019-x86, 2)vs2019-x64:"
 set /p flag=
 if %flag% == 0 (call :buildALL)^
@@ -61,8 +69,8 @@ popd
 GOTO:EOF

 :cmakeParams
-echo cmake -G "%~1" -A "%~2" -DOCR_OUTPUT=%BUILD_OUTPUT% -DOCR_BUILD_CRT=%MT_ENABLED% ..
-cmake -G "%~1" -A "%~2" -DOCR_OUTPUT=%BUILD_OUTPUT% -DOCR_BUILD_CRT=%MT_ENABLED% ..
+echo cmake -G "%~1" -A "%~2" -DOCR_OUTPUT=%BUILD_OUTPUT% -DOCR_BUILD_CRT=%MT_ENABLED% -DOCR_ONNX=%ONNX_TYPE% ..
+cmake -G "%~1" -A "%~2" -DOCR_OUTPUT=%BUILD_OUTPUT% -DOCR_BUILD_CRT=%MT_ENABLED% -DOCR_ONNX=%ONNX_TYPE% ..
 GOTO:EOF

 @ENDLOCAL
--- a/include/AngleNet.h
+++ b/include/AngleNet.h
@@ -7,12 +7,13 @@

 class AngleNet {
 public:
-    AngleNet();

    ~AngleNet();

    void setNumThread(int numOfThread);

+    void setGpuIndex(int gpuIndex);
+
    void initModel(const std::string &pathStr);

    std::vector<Angle> getAngles(std::vector<cv::Mat> &partImgs, const char *path,

--- a/include/CrnnNet.h
+++ b/include/CrnnNet.h
@@ -8,12 +8,12 @@
 class CrnnNet {
 public:

-    CrnnNet();
-
    ~CrnnNet();

    void setNumThread(int numOfThread);

+    void setGpuIndex(int gpuIndex);
+
    void initModel(const std::string &pathStr, const std::string &keysPath);

    std::vector<TextLine> getTextLines(std::vector<cv::Mat> &partImg, const char *path, const char *imgName);

--- a/include/DbNet.h
+++ b/include/DbNet.h
@@ -7,12 +7,12 @@

 class DbNet {
 public:
-    DbNet();
-
    ~DbNet();

    void setNumThread(int numOfThread);

+    void setGpuIndex(int gpuIndex);
+
    void initModel(const std::string &pathStr);

    std::vector<TextBox> getTextBoxes(cv::Mat &src, ScaleParam &s, float boxScoreThresh,

--- a/include/OcrLite.h
+++ b/include/OcrLite.h
@@ -20,6 +20,8 @@ public:

    void enableResultTxt(const char *path, const char *imgName);

+    void setGpuIndex(int gpuIndex);
+
    bool initModels(const std::string &detPath, const std::string &clsPath,
                    const std::string &recPath, const std::string &keysPath);


--- a/include/main.h
+++ b/include/main.h
@@ -21,13 +21,14 @@ static const struct option long_options[] = {
        {"version",        no_argument,       NULL, 'v'},
        {"help",           no_argument,       NULL, 'h'},
        {"loopCount",      required_argument, NULL, 'l'},
+        {"GPU",            required_argument, NULL, 'G'},
        {NULL,             no_argument,       NULL, 0}
 };

 const char *usageMsg = "(-d --models) (-1 --det) (-2 --cls) (-3 --rec) (-4 --keys) (-i --image)\n"\
                       "[-t --numThread] [-p --padding] [-s --maxSideLen]\n" \
                       "[-b --boxScoreThresh] [-o --boxThresh] [-u --unClipRatio]\n" \
-                       "[-a --noAngle] [-A --mostAngle]\n\n";
+                       "[-a --noAngle] [-A --mostAngle] [-G --GPU]\n\n";

 const char *requiredMsg = "-d --models: models directory.\n" \
                          "-1 --det: model file name of det.\n" \
@@ -43,12 +44,13 @@ const char *optionalMsg = "-t --numThread: value of numThread(int), default: 4\n
                          "-o --boxThresh: value of boxThresh(float), default: 0.3\n" \
                          "-u --unClipRatio: value of unClipRatio(float), default: 1.6\n" \
                          "-a --doAngle: Enable(1)/Disable(0) Angle Net, default: Enable\n" \
-                          "-A --mostAngle: Enable(1)/Disable(0) Most Possible AngleIndex, default: Enable\n\n";
+                          "-A --mostAngle: Enable(1)/Disable(0) Most Possible AngleIndex, default: Enable\n" \
+                          "-G --GPU: Disable(-1)/GPU0(0)/GPU1(1)/... Use CUDA GPU acceleration, default: Disable(-1)\n\n";

 const char *otherMsg = "-v --version: show version\n" \
                       "-h --help: print this help\n\n";

-const char *example1Msg = "Example1: %s --models models --det det.onnx --cls cls.onnx --rec rec.onnx --keys keys.txt --image 1.jpg\n";
-const char *example2Msg = "Example2: %s -d models -1 det.onnx -2 cls.onnx -3 rec.onnx -4 keys.txt -i 1.jpg -t 4 -p 50 -s 0 -b 0.6 -o 0.3 -u 2.0 -a 1 -A 1\n";
+const char *example1Msg = "Example1: %s --models models --det det.onnx --cls cls.onnx --rec rec.onnx --keys keys.txt --image 1.jpg  --GPU 0\n";
+const char *example2Msg = "Example2: %s -d models -1 det.onnx -2 cls.onnx -3 rec.onnx -4 keys.txt -i 1.jpg -t 4 -p 50 -s 0 -b 0.6 -o 0.3 -u 2.0 -a 1 -A 1 -G 0\n";

 #endif //__MAIN_H__
--- a/onnxruntime-gpu/OnnxRuntimeWrapper.cmake
+++ b/onnxruntime-gpu/OnnxRuntimeWrapper.cmake
+# Picks the platform directory that holds OnnxRuntimeConfig.cmake for the GPU (CUDA)
+# OnnxRuntime package and stores it in OnnxRuntime_DIR, so that find_package(OnnxRuntime)
+# in the including CMakeLists.txt can locate it.
+# README.md in this directory lists only Linux and Windows x64 as supported; every other
+# platform stops the configure step with an explicit error instead of pointing
+# OnnxRuntime_DIR at a directory that is never provided.
+if (APPLE)
+    message(FATAL_ERROR "OnnxRuntime GPU(CUDA) is not available for macOS, configure with -DOCR_ONNX=\"CPU\"")
+elseif (WIN32)
+    if (CMAKE_CL_64)
+        message("配置WINDOWS OnnxRuntime x64 路径: ${CMAKE_CURRENT_LIST_DIR}/windows-x64")
+        set(OnnxRuntime_DIR "${CMAKE_CURRENT_LIST_DIR}/windows-x64")
+    else ()
+        # CMAKE_CL_64 is only set for 64-bit MSVC-style compilers.
+        message(FATAL_ERROR "OnnxRuntime GPU(CUDA) is only available for Windows x64 (MSVC), use -A x64 or -DOCR_ONNX=\"CPU\"")
+    endif ()
+elseif (UNIX)
+    message("配置Linux OnnxRuntime 路径: ${CMAKE_CURRENT_LIST_DIR}/linux")
+    set(OnnxRuntime_DIR "${CMAKE_CURRENT_LIST_DIR}/linux")
+else ()
+    message(FATAL_ERROR "OnnxRuntime GPU(CUDA) is not available for this platform, configure with -DOCR_ONNX=\"CPU\"")
+endif ()
+
--- a/onnxruntime-gpu/README.md
+++ b/onnxruntime-gpu/README.md
+# 编译说明
+
+### onnxruntime gpu(cuda)下载
+
+1. [下载地址](https://github.com/microsoft/onnxruntime/releases)
+
+* 仅支持Linux和Windows，仅支持x64
+* Windows包名: onnxruntime-win-x64-gpu-版本号.zip
+* Linux包名: onnxruntime-linux-x64-gpu-版本号.tgz
+* Windows平台：把压缩包内的lib文件夹解压到windows-x64文件夹里
+* Linux平台：把压缩包内的lib文件夹解压到linux文件夹里
+* 创建include/onnxruntime/core/session，把压缩包内的所有.h文件解压到session文件夹里
+* 目录结构如下
+
+```
+onnxruntime-gpu
+├── linux
+│   ├── include
+│   │   └── onnxruntime
+│   │       └── core
+│   │           └── session
+│   │               ├── cpu_provider_factory.h
+│   │               ├── onnxruntime_c_api.h
+│   │               ├── onnxruntime_cxx_api.h
+│   │               ├── onnxruntime_cxx_inline.h
+│   │               ├── onnxruntime_run_options_config_keys.h
+│   │               ├── onnxruntime_session_options_config_keys.h
+│   │               ├── provider_options.h
+│   │               └── tensorrt_provider_factory.h
+│   ├── lib
+│   │   ├── libonnxruntime_providers_cuda.so
+│   │   ├── libonnxruntime_providers_shared.so
+│   │   ├── libonnxruntime_providers_tensorrt.so
+│   │   ├── libonnxruntime.so -> libonnxruntime.so.1.12.1
+│   │   └── libonnxruntime.so.1.12.1
+│   └── OnnxRuntimeConfig.cmake
+└── windows-x64
+    ├── include
+    │   └── onnxruntime
+    │       └── core
+    │           └── session
+    │               ├── cpu_provider_factory.h
+    │               ├── onnxruntime_c_api.h
+    │               ├── onnxruntime_cxx_api.h
+    │               ├── onnxruntime_cxx_inline.h
+    │               ├── onnxruntime_run_options_config_keys.h
+    │               ├── onnxruntime_session_options_config_keys.h
+    │               ├── provider_options.h
+    │               └── tensorrt_provider_factory.h
+    ├── lib
+    │   ├── onnxruntime.dll
+    │   ├── onnxruntime.lib
+    │   ├── onnxruntime.pdb
+    │   ├── onnxruntime_providers_cuda.dll
+    │   ├── onnxruntime_providers_cuda.lib
+    │   ├── onnxruntime_providers_cuda.pdb
+    │   ├── onnxruntime_providers_shared.dll
+    │   ├── onnxruntime_providers_shared.lib
+    │   ├── onnxruntime_providers_shared.pdb
+    │   ├── onnxruntime_providers_tensorrt.dll
+    │   ├── onnxruntime_providers_tensorrt.lib
+    │   └── onnxruntime_providers_tensorrt.pdb
+    └── OnnxRuntimeConfig.cmake
+
+```
+
+2. CUDA和cuDNN
+
+* [CUDA下载地址](https://developer.nvidia.com/downloads)
+* 捷径 https://developer.download.nvidia.com/compute/cuda/11.4.0/local_installers/cuda_11.4.0_470.42.01_linux.run
+* [cuDNN下载地址](https://developer.nvidia.com/rdp/cudnn-archive)
+* 根据[onnxruntime官方文档](https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html)：
+* onnxruntime v1.12，需要CUDA 11.4和cuDNN 8.2.4 (Linux) 8.2.2.26 (Windows)
\ No newline at end of file
--- a/onnxruntime-gpu/linux/OnnxRuntimeConfig.cmake
+++ b/onnxruntime-gpu/linux/OnnxRuntimeConfig.cmake
+# OnnxRuntime GPU(CUDA) package description for Linux. CMakeLists.txt reaches this file through
+# find_package(OnnxRuntime) once OnnxRuntimeWrapper.cmake has pointed OnnxRuntime_DIR at this directory.
+# Headers are taken from ./include and libraries from ./lib next to this file.
+set(OnnxRuntime_LIBS
+    onnxruntime
+    onnxruntime_providers_cuda
+    onnxruntime_providers_shared
+    onnxruntime_providers_tensorrt)
+set(OnnxRuntime_INCLUDE_DIRS "${CMAKE_CURRENT_LIST_DIR}/include")
+link_directories(${CMAKE_CURRENT_LIST_DIR}/lib)
+include_directories(${OnnxRuntime_INCLUDE_DIRS})
--- a/onnxruntime-gpu/windows-x64/OnnxRuntimeConfig.cmake
+++ b/onnxruntime-gpu/windows-x64/OnnxRuntimeConfig.cmake
+# OnnxRuntime GPU(CUDA) package description for Windows x64. CMakeLists.txt reaches this file through
+# find_package(OnnxRuntime) once OnnxRuntimeWrapper.cmake has pointed OnnxRuntime_DIR at this directory.
+# Headers are taken from ./include and import libraries from ./lib next to this file.
+set(OnnxRuntime_LIBS
+    onnxruntime
+    onnxruntime_providers_cuda
+    onnxruntime_providers_shared
+    onnxruntime_providers_tensorrt)
+set(OnnxRuntime_INCLUDE_DIRS "${CMAKE_CURRENT_LIST_DIR}/include")
+link_directories(${CMAKE_CURRENT_LIST_DIR}/lib)
+include_directories(${OnnxRuntime_INCLUDE_DIRS})
--- a/run-benchmark.bat
+++ b/run-benchmark.bat
@@ -6,10 +6,11 @@ echo "Setting the Number of Threads=%NUMBER_OF_PROCESSORS% Using an OpenMP Envir
 set OMP_NUM_THREADS=%NUMBER_OF_PROCESSORS%

 :MainExec
-echo "请输入测试选项并回车: 1)CPU-x64, 2)CPU-x86"
+echo "请输入测试选项并回车: 1)CPU-x64, 2)CPU-x86, 3)CUDA-x64"
 set /p flag=
 if %flag% == 1 (call :PrepareCpuX64)^
 else if %flag% == 2 (call :PrepareCpuX86)^
+else if %flag% == 3 (call :PrepareCudaX64)^
 else (echo 输入错误！Input Error!)

 echo "请输入循环次数:"
@@ -41,6 +42,7 @@ SET EXE_PATH=%EXE_PATH%\install\bin
 --unClipRatio 1.6 ^
 --doAngle 1 ^
 --mostAngle 1 ^
+--GPU %GPU_INDEX% ^
 --loopCount %LOOP_COUNT%

 popd
@@ -48,13 +50,18 @@ echo.
 GOTO:MainExec

 :PrepareCpuX64
-set EXE_PATH=win-BIN-x64
+set EXE_PATH=win-BIN-CPU-x64
 set GPU_INDEX=-1
 GOTO:EOF

 :PrepareCpuX86
-set EXE_PATH=win-BIN-Win32
+set EXE_PATH=win-BIN-CPU-Win32
 set GPU_INDEX=-1
 GOTO:EOF

+:PrepareCudaX64
+set EXE_PATH=win-BIN-CUDA-x64
+set GPU_INDEX=0
+GOTO:EOF
+
 @ENDLOCAL
--- a/run-test.bat
+++ b/run-test.bat
@@ -6,10 +6,11 @@ echo "Setting the Number of Threads=%NUMBER_OF_PROCESSORS% Using an OpenMP Envir
 set OMP_NUM_THREADS=%NUMBER_OF_PROCESSORS%

 :MainExec
-echo "请输入测试选项并回车: 1)CPU-x64, 2)CPU-x86"
+echo "请输入测试选项并回车: 1)CPU-x64, 2)CPU-x86, 3)CUDA-x64"
 set /p flag=
 if %flag% == 1 (call :PrepareCpuX64)^
 else if %flag% == 2 (call :PrepareCpuX86)^
+else if %flag% == 3 (call :PrepareCudaX64)^
 else (echo 输入错误！Input Error!)

 SET TARGET_IMG=images/1.jpg
@@ -37,17 +38,25 @@ SET EXE_PATH=%EXE_PATH%\install\bin
 --boxThresh 0.3 ^
 --unClipRatio 1.6 ^
 --doAngle 1 ^
--mostAngle 1
+--mostAngle 1 ^
+--GPU %GPU_INDEX%

 echo.
 GOTO:MainExec

 :PrepareCpuX64
-set EXE_PATH=win-BIN-x64
+set EXE_PATH=win-BIN-CPU-x64
+set GPU_INDEX=-1
 GOTO:EOF

 :PrepareCpuX86
-set EXE_PATH=win-BIN-Win32
+set EXE_PATH=win-BIN-CPU-Win32
+set GPU_INDEX=-1
+GOTO:EOF
+
+:PrepareCudaX64
+set EXE_PATH=win-BIN-CUDA-x64
+set GPU_INDEX=0
 GOTO:EOF

 @ENDLOCAL