Commit c0705977 authored by wangkaixiong's avatar wangkaixiong 🚴🏼
Browse files

init

parent d3982d85
import requests
import os
import base64
import json

# Client script: upload test.pdf to a local PDF-parsing API, then save the
# returned Markdown and any referenced images to disk.

# 1. Prepare the request.
url = "http://127.0.0.1:8000/parse/pdf"  # your API endpoint
data = {'output_format': 'json'}  # request JSON so image details are included

# 2. Send the request.
# FIX: the original opened test.pdf inline in the `files` dict and never closed
# it; a `with` block guarantees the handle is released.
with open('test.pdf', 'rb') as pdf_file:
    response = requests.post(url, files={'file': pdf_file}, data=data)
# FIX: fail fast on HTTP errors instead of trying to parse an error page.
response.raise_for_status()
result = response.json()

# 3. Process the result (adjust to the API's actual response structure).
# Assumes the response carries the markdown text and a list of images —
# TODO confirm against the real API schema.
md_content = result.get('markdown', '')
images = result.get('images', [])

# 4. Save images.
os.makedirs("output_images", exist_ok=True)
for img in images:
    img_name = img.get('name')
    img_data = img.get('data')  # may be a base64 data-URI or a URL
    if not img_name or not img_data:
        # FIX: the original called .startswith() on a possibly-None value.
        continue
    if img_data.startswith('data:image'):
        # Base64 data-URI: decode the payload after the comma.
        img_bytes = base64.b64decode(img_data.split(',')[1])
        with open(os.path.join("output_images", img_name), 'wb') as f:
            f.write(img_bytes)
    elif img_data.startswith('http'):
        # Remote URL: download it in a second request.
        img_resp = requests.get(img_data)
        img_resp.raise_for_status()
        with open(os.path.join("output_images", img_name), 'wb') as f:
            f.write(img_resp.content)

# 5. Save the Markdown.
with open("output.md", "w", encoding="utf-8") as f:
    f.write(md_content)
print("处理完成,图片和Markdown已保存。")
\ No newline at end of file
# Easy_Tools
各种环境问题的解决方案。
\ No newline at end of file
#!/bin/bash
# 2
# Prepend the local and MPI library directories to the dynamic-linker search
# path. Affects only the current shell session (nothing is persisted).
export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/opt/mpi/lib:$LD_LIBRARY_PATH
\ No newline at end of file
#!/bin/bash
# 1
# Disable CentOS automatic kernel / release updates:
# edit the yum config file /etc/yum.conf and, under the [main] section,
# add exclude=kernel* and exclude=centos-release*
\ No newline at end of file
#!/bin/bash
# Build and install CMake 3.24.2 from source.
# prerequisites —
# CentOS / RHEL:
# sudo yum install -y gcc gcc-c++ make openssl-devel
# Ubuntu / Debian:
sudo apt-get update && sudo apt-get install -y build-essential libssl-dev
# Replace with the actual download link for the version you want.
wget https://cmake.org/files/v3.24/cmake-3.24.2.tar.gz
# Adjust the extraction command to match the downloaded version.
tar -zxvf cmake-3.24.2.tar.gz
# Enter the extracted directory.
cd cmake-3.24.2
# ./bootstrap --prefix=/usr/local
./bootstrap --prefix=/usr/local --parallel=32
# FIX: ${nproc} is an undefined shell VARIABLE — it expanded to an empty
# string, turning the line into `make -j` (unlimited parallel jobs, which can
# exhaust memory). $(nproc) runs the nproc COMMAND to get the CPU count.
make -j"$(nproc)"
sudo make install
#!/bin/bash
# Install Docker CE on Ubuntu from the Aliyun mirror.
# Remove any previously installed Docker packages first.
sudo apt-get remove docker docker-engine docker-ce docker.io
# Trust the mirror's GPG key, then register its apt repository.
curl -fsSL http://mirrors.aliyun.com/docker-ce/linux/ubuntu/gpg | sudo apt-key add -
sudo add-apt-repository "deb [arch=amd64] http://mirrors.aliyun.com/docker-ce/linux/ubuntu $(lsb_release -cs) stable"
sudo apt-get update
sudo apt-get install -y docker-ce docker-ce-cli containerd.io docker-compose-plugin
# Sanity check: prints client/server versions when the install succeeded.
docker version
\ No newline at end of file
#!/bin/bash
# 3 Install git lfs (all steps kept commented — run the block for your distro).
# centos:
# curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.rpm.sh | sudo bash
# sudo yum install git-lfs
# git lfs install
# ubuntu:
# curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash
# sudo apt-get install git-lfs
# git lfs install
# To watch the download progress of large files pulled by git lfs:
# cd want_repo
# watch -n 1 du --max-depth=1
#!/bin/bash
# Unpack the bundled MPI 3.1 toolchain into /opt and register its bin/lib
# directories in ~/.bashrc (takes effect in new shells, not the current one).
workspace=`pwd`
tar xvf ${workspace}/../tools/mpi3.1.tar.gz -C /opt
echo 'export LD_LIBRARY_PATH=/opt/mpi/lib:$LD_LIBRARY_PATH' >> ~/.bashrc
echo 'export PATH=/opt/mpi/bin:$PATH' >> ~/.bashrc
# export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/mpi/lib
# export PATH=$PATH:/opt/mpi/bin
#!/bin/bash
# Download the Miniconda installer and install the modelscope package.
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
# NOTE(review): the installer is downloaded and made executable but never
# executed — confirm whether `bash Miniconda3-latest-Linux-x86_64.sh` should
# follow. Also, 777 is broader than needed; `chmod +x` would suffice.
chmod 777 Miniconda3-latest-Linux-x86_64.sh
pip3 install modelscope -i https://mirrors.aliyun.com/pypi/simple
#!/bin/bash
# Build OpenSSL 1.1.1 from source. Needed when pip pulls urllib3 v2 on a
# system whose OpenSSL is too old, failing with:
# ImportError: urllib3 v2 only supports OpenSSL 1.1.1+, currently the 'ssl' mo
# Fetch the 1.1.1 OpenSSL sources.
wget https://www.openssl.org/source/openssl-1.1.1.tar.gz
# FIX: the original never extracted the tarball nor entered the source tree
# before running ./config, so the build could not start.
tar -zxvf openssl-1.1.1.tar.gz
cd openssl-1.1.1
# Configure shared libraries, installed under /usr/local/ssl.
./config shared --prefix=/usr/local/ssl --openssldir=/usr/local/ssl
# FIX: $(nproc) (command substitution), not ${nproc} (undefined variable).
make -j"$(nproc)" && make install
# Make the new libraries visible to the dynamic linker.
echo "/usr/local/ssl/lib" >>/etc/ld.so.conf
ldconfig
export LD_LIBRARY_PATH=/usr/local/ssl/lib:$LD_LIBRARY_PATH
# Reinstall urllib3 pinned below v2 so it works with this OpenSSL.
# FIX: -y so the uninstall does not block on an interactive prompt.
pip3 uninstall -y urllib3
pip3 install urllib3==1.26.0 -i https://mirrors.aliyun.com/pypi/simple
\ No newline at end of file
#!/bin/bash
# Detect the Linux distribution name and version.
# Sets the globals SYS_NAME and SYS_VERSION from /etc/os-release (with a
# CentOS-specific fallback to /etc/redhat-release for the detailed version).
get_os_info() {
    SYS_NAME=""
    SYS_VERSION=""
    if [[ -f /etc/os-release ]]; then
        source /etc/os-release
        case "$ID" in
            ubuntu|ubuntukylin)
                SYS_NAME="$NAME"
                SYS_VERSION="$VERSION_ID"
                ;;
            kylin)
                SYS_NAME="$PRETTY_NAME"
                SYS_VERSION="$VERSION_ID"
                ;;
            anolis) # Anolis OS
                SYS_NAME="$PRETTY_NAME"
                SYS_VERSION="$VERSION_ID"
                ;;
            uos) # UnionTech UOS
                SYS_NAME="$PRETTY_NAME"
                SYS_VERSION="$VERSION"
                ;;
            centos)
                # For CentOS, read /etc/redhat-release to get the full version.
                if [[ -f /etc/redhat-release ]]; then
                    local centos_release=$(</etc/redhat-release)
                    SYS_NAME="CentOS"
                    # Extract the version via regex, e.g.
                    # "CentOS Linux release 7.6.1810 (Core)" -> 7.6.1810
                    if [[ $centos_release =~ release\ ([0-9]+\.[0-9]+(\.[0-9]+)?)\ .* ]]; then
                        SYS_VERSION="${BASH_REMATCH[1]}"
                    else
                        SYS_VERSION="未知"
                    fi
                fi
                ;;
            *)
                # Other / unrecognized distributions: leave both fields empty.
                ;;
        esac
    else
        echo "无法找到 /etc/os-release 文件"
    fi
}
# Gather the info.
get_os_info
# Print the results ("未知" = unknown when a field could not be determined).
echo "系统名称: ${SYS_NAME:-未知}"
echo "版本号: ${SYS_VERSION:-未知}"
# Makefile: builds the EXEC project with hipcc (AMD/DCU HIP toolchain).
# Produces either the executable workspace/pro or the shared lib libs/alg/libEXEC.so.
cc := hipcc
name := pro
workdir := workspace
srcdir := src
# export CUDA_VISIBLE_DEVICES=0
objdir := objs
stdcpp := c++17
dst_so_path := libs/alg/libEXEC.so
project_name := EXEC
demodir := demo
defined :=
pwd := $(abspath .)
# run arg passed to the binary: 0 = static inference, 1 = dynamic inference
run_args := 0
nvcc :=
cuda_arch :=
export pwd workdir srcdir objdir demodir
# Source discovery: demo/*.cpp plus src/*.cpp, *.cc, *.cu; object paths are
# remapped into $(objdir).
demo_srcs := $(shell find $(demodir) -name "*.cpp")
demo_objs := $(demo_srcs:.cpp=.o)
demo_objs := $(demo_objs:$(demodir)/%=$(objdir)/%)
cpp_srcs := $(shell find $(srcdir) -name "*.cpp")
cpp_objs := $(cpp_srcs:.cpp=.o)
cpp_objs := $(cpp_objs:$(srcdir)/%=$(objdir)/%)
cc_srcs := $(shell find $(srcdir) -name "*.cc")
cc_objs := $(cc_srcs:.cc=.o)
cc_objs := $(cc_objs:$(srcdir)/%=$(objdir)/%)
cu_srcs := $(shell find $(srcdir) -name "*.cu")
cu_objs := $(cu_srcs:.cu=.cu.o)
cu_objs := $(cu_objs:$(srcdir)/%=objs/%)
# Locate the installed DTK toolkit root (e.g. /opt/dtk-xx.xx).
dtk_path := $(shell find /opt -name "dtk-*")
include_paths := $(dtk_path)/cuda/include \
	$(pwd)/3rdparty/opencv_4.5_lean/include/opencv4 \
	$(srcdir) \
	. \
	$(pwd)/3rdparty/ppl/include \
	$(dtk_path)/include \
	$(srcdir)/Utility
library_paths := \
	$(pwd)/3rdparty/opencv_4.5_lean/lib \
	$(pwd)/3rdparty/opencv_4.5_lean/lib64 \
	$(pwd)/3rdparty/opencv_4.5_lean/lib/opencv4/3rdparty \
	$(pwd)/3rdparty/opencv_4.5_lean/lib64/opencv4/3rdparty \
	$(pwd)/3rdparty/ppl/lib \
	$(dtk_path)/cuda/lib64 \
	$(dtk_path)/hip/lib \
	$(dtk_path)/lib
empty :=
# Join the library dirs with ':' (LD_LIBRARY_PATH form, printed by `make debug`).
library_path_export := $(subst $(empty) $(empty),:,$(library_paths))
link_opencv := opencv_core opencv_imgproc opencv_imgcodecs opencv_dnn
# link_3rd := libprotobuf libtiff libjpeg-turbo IlmImf libpng libwebp quirc zlib ade ittnotify libopenjp2
link_cuda := cudart cuda cublas
link_trt := migraphx migraphx_gpu migraphx_onnx
link_sys := m stdc++ dl rt
link_cudnn :=
link_librarys := $(link_opencv) $(link_trt) $(link_cuda) $(link_sys)
# Expand the lists into -Wl,-rpath / -I / -L / -l / -D flags.
run_paths := $(foreach item,$(library_paths),-Wl,-rpath=$(item))
include_paths := $(foreach item,$(include_paths),-I$(item))
library_paths := $(foreach item,$(library_paths),-L$(item))
link_librarys := $(foreach item,$(link_librarys),-l$(item))
defined := $(foreach item,$(defined),-D$(item))
link_librarys += -pthread -fopenmp
# cpp_compile_flags := -std=$(stdcpp) -fPIC -m64 -O3 -w $(defined) -fuse-ld=gold -fuse-ld=gold --offload-arch=gfx926 --offload-arch=gfx906
# cpp_compile_flags += $(include_paths)
# cu_compile_flags := -std=c++17 -w -O1 -Xcompiler "$(cpp_compile_flags)" $(cuda_arch) $(support_define) --gpu-max-threads-per-block=1024 --offload-arch=gfx926 --offload-arch=gfx906
# cu_compile_flags += $(include_paths)
cpp_compile_flags := -std=$(stdcpp) -fPIC -m64 -O3 -w $(defined) --gpu-max-threads-per-block=1024
cpp_compile_flags += $(include_paths)
cu_compile_flags := -std=c++17 -w -O3 -Xcompiler "$(cpp_compile_flags)" $(cuda_arch) $(support_define) --gpu-max-threads-per-block=1024
cu_compile_flags += $(include_paths)
link_flags += $(library_paths) $(link_librarys) $(run_paths)
# `make pro` builds workspace/pro
$(name) : $(workdir)/$(name)
all : $(name)
run : $(name)
	@echo Run_start
	@cd $(workdir) && ./$(name) $(run_args)
build_so : $(dst_so_path)
$(dst_so_path) : $(cpp_objs) $(cc_objs) $(cu_objs)
	@echo Link $@
	@mkdir -p $(dir $@)
	@$(cc) $^ -shared -o $@ $(link_flags)
# Build the shared library, compile the demo against it, link, and run.
run_by_so : $(dst_so_path)
	@echo Compile demo.cpp
	@$(cc) -c $(demo_srcs) -o objs/$(project_name).o $(cpp_compile_flags)
	@echo Link dst_so
	@$(cc) objs/$(project_name).o -o $(workdir)/$(name) -Llibs/alg -l$(project_name) $(link_flags)
	@./$(workdir)/$(name) $(run_args)
$(workdir)/$(name) : $(cpp_objs) $(cc_objs) $(cu_objs)
	@echo Link $@
	@mkdir -p $(dir $@)
	@$(cc) $^ -o $@ $(link_flags)
$(objdir)/%.o : $(srcdir)/%.cpp
	@echo Compile CXX $<
	@mkdir -p $(dir $@)
	@$(cc) -c $< -o $@ $(cpp_compile_flags)
$(objdir)/%.o : $(srcdir)/%.cc
	@echo Compile CUDA $<
	@mkdir -p $(dir $@)
	@$(cc) -c $< -o $@ $(cpp_compile_flags)
$(objdir)/%.cu.o : $(srcdir)/%.cu
	@echo Compile CUDA $<
	@mkdir -p $(dir $@)
	@$(nvcc) -c $< -o $@ $(cu_compile_flags)
$(objdir)/%.o : $(demodir)/%.cpp
	@echo Compile DEMO $<
	@mkdir -p $(dir $@)
	@$(cc) -c $< -o $@ $(cpp_compile_flags)
# Clean rule: removes objects, binaries, built libs and core dumps.
clean :
	@rm -rf $(objdir) $(workdir)/$(name)
	@rm -f libs/alg/* ./*.o pro src/*.o demo/*.o workspace/core.* core.*
debug :
	@echo $(LD_LIBRARY_PATH):$(library_path_export)
.PHONY : clean run $(name) all build_so run_by_so
# hip_tools_and_kernel
hipcc 编译遇到 hip_runtime、 hipMalloc、 hipMemcpy 之类的,会自动添加-I\-L\-l
## Getting started
To make it easy for you to get started with GitLab, here's a list of recommended next steps.
Already a pro? Just edit this README.md and make it your own. Want to make it easy? [Use the template at the bottom](#editing-this-readme)!
## Add your files
- [ ] [Create](https://docs.gitlab.com/ee/user/project/repository/web_editor.html#create-a-file) or [upload](https://docs.gitlab.com/ee/user/project/repository/web_editor.html#upload-a-file) files
- [ ] [Add files using the command line](https://docs.gitlab.com/ee/gitlab-basics/add-file.html#add-a-file-using-the-command-line) or push an existing Git repository with the following command:
```
cd existing_repo
git remote add origin http://10.0.15.207/wangkaixiong/hip_tools_and_kernel.git
git branch -M main
git push -uf origin main
```
## Integrate with your tools
- [ ] [Set up project integrations](http://10.0.15.207/wangkaixiong/hip_tools_and_kernel/-/settings/integrations)
## Collaborate with your team
- [ ] [Invite team members and collaborators](https://docs.gitlab.com/ee/user/project/members/)
- [ ] [Create a new merge request](https://docs.gitlab.com/ee/user/project/merge_requests/creating_merge_requests.html)
- [ ] [Automatically close issues from merge requests](https://docs.gitlab.com/ee/user/project/issues/managing_issues.html#closing-issues-automatically)
- [ ] [Enable merge request approvals](https://docs.gitlab.com/ee/user/project/merge_requests/approvals/)
- [ ] [Automatically merge when pipeline succeeds](https://docs.gitlab.com/ee/user/project/merge_requests/merge_when_pipeline_succeeds.html)
## Test and Deploy
Use the built-in continuous integration in GitLab.
- [ ] [Get started with GitLab CI/CD](https://docs.gitlab.com/ee/ci/quick_start/index.html)
- [ ] [Analyze your code for known vulnerabilities with Static Application Security Testing(SAST)](https://docs.gitlab.com/ee/user/application_security/sast/)
- [ ] [Deploy to Kubernetes, Amazon EC2, or Amazon ECS using Auto Deploy](https://docs.gitlab.com/ee/topics/autodevops/requirements.html)
- [ ] [Use pull-based deployments for improved Kubernetes management](https://docs.gitlab.com/ee/user/clusters/agent/)
- [ ] [Set up protected environments](https://docs.gitlab.com/ee/ci/environments/protected_environments.html)
***
# Editing this README
When you're ready to make this README your own, just edit this file and use the handy template below (or feel free to structure it however you want - this is just a starting point!). Thank you to [makeareadme.com](https://www.makeareadme.com/) for this template.
## Suggestions for a good README
Every project is different, so consider which of these sections apply to yours. The sections used in the template are suggestions for most open source projects. Also keep in mind that while a README can be too long and detailed, too long is better than too short. If you think your README is too long, consider utilizing another form of documentation rather than cutting out information.
## Name
Choose a self-explaining name for your project.
## Description
Let people know what your project can do specifically. Provide context and add a link to any reference visitors might be unfamiliar with. A list of Features or a Background subsection can also be added here. If there are alternatives to your project, this is a good place to list differentiating factors.
## Badges
On some READMEs, you may see small images that convey metadata, such as whether or not all the tests are passing for the project. You can use Shields to add some to your README. Many services also have instructions for adding a badge.
## Visuals
Depending on what you are making, it can be a good idea to include screenshots or even a video (you'll frequently see GIFs rather than actual videos). Tools like ttygif can help, but check out Asciinema for a more sophisticated method.
## Installation
Within a particular ecosystem, there may be a common way of installing things, such as using Yarn, NuGet, or Homebrew. However, consider the possibility that whoever is reading your README is a novice and would like more guidance. Listing specific steps helps remove ambiguity and gets people to using your project as quickly as possible. If it only runs in a specific context like a particular programming language version or operating system or has dependencies that have to be installed manually, also add a Requirements subsection.
## Usage
Use examples liberally, and show the expected output if you can. It's helpful to have inline the smallest example of usage that you can demonstrate, while providing links to more sophisticated examples if they are too long to reasonably include in the README.
## Support
Tell people where they can go to for help. It can be any combination of an issue tracker, a chat room, an email address, etc.
## Roadmap
If you have ideas for releases in the future, it is a good idea to list them in the README.
## Contributing
State if you are open to contributions and what your requirements are for accepting them.
For people who want to make changes to your project, it's helpful to have some documentation on how to get started. Perhaps there is a script that they should run or some environment variables that they need to set. Make these steps explicit. These instructions could also be useful to your future self.
You can also document commands to lint the code or run tests. These steps help to ensure high code quality and reduce the likelihood that the changes inadvertently break something. Having instructions for running tests is especially helpful if it requires external setup, such as starting a Selenium server for testing in a browser.
## Authors and acknowledgment
Show your appreciation to those who have contributed to the project.
## License
For open source projects, say how it is licensed.
## Project status
If you have run out of energy or time for your project, put a note at the top of the README saying that development has slowed down or stopped completely. Someone may choose to fork your project or volunteer to step in as a maintainer or owner, allowing your project to keep going. You can also make an explicit request for maintainers.
/*
 * HIP utility functions: runtime-error checking, device validation,
 * launch-dimension helpers, and device information queries.
 */
#include "hip_tools.hpp"
namespace sg{

// bool check_driver(CUresult e, const char* call, int line, const char *file) {
// if (e != HIP_SUCCESS) {
// const char* message = nullptr;
// const char* name = nullptr;
// cuGetErrorString(e, &message);
// cuGetErrorName(e, &name);
// printf("CUDA Driver error %s # %s, code = %s [ %d ] in file %s:%d\n", call, message, name, e, file, line);
// return false;
// }
// return true;
// }

// Prints diagnostics and returns false when a HIP runtime call failed;
// returns true on success. Used via the checkHipRuntime() macro.
// NOTE(review): the message text says "CUDA Runtime error" although this is
// the HIP runtime — kept as-is (runtime string).
bool check_runtime(hipError_t e, const char* call, int line, const char *file){
    if (e != HIP_SUCCESS) {
        printf("CUDA Runtime error %s # %s, code = %s [ %d ] in file %s:%d\n", call, hipGetErrorString(e), hipGetErrorName(e), e, file, line);
        return false;
    }
    return true;
}

// Returns true when device_id is a valid index in [0, device_count).
bool check_device_id(int device_id){
    int device_count = -1;
    checkHipRuntime(hipGetDeviceCount(&device_count));
    if(device_id < 0 || device_id >= device_count){
        printf("Invalid device id: %d, count = %d\n", device_id, device_count);
        return false;
    }
    return true;
}

// Returns the device id currently active on this thread.
int current_device_id(){
    int device_id = 0;
    checkHipRuntime(hipGetDevice(&device_id));
    return device_id;
}

// 1-D grid size: ceil(numJobs / blockThreads) blocks.
dim3 grid_dims(int numJobs) {
    int numBlockThreads = numJobs < GPU_BLOCK_THREADS ? numJobs : GPU_BLOCK_THREADS;
    return dim3(((numJobs + numBlockThreads - 1) / (float)numBlockThreads));
}

// 1-D block size: min(numJobs, GPU_BLOCK_THREADS) threads.
dim3 block_dims(int numJobs) {
    return numJobs < GPU_BLOCK_THREADS ? numJobs : GPU_BLOCK_THREADS;
}

// std::string device_capability(int device_id){
// hipDeviceProp_t prop;
// checkHipRuntime(hipGetDeviceProperties(&prop, device_id));
// return iLogger::format("%d.%d", prop.major, prop.minor);
// }

// Returns the marketing name of the given device.
std::string device_name(int device_id){
    hipDeviceProp_t prop;
    checkHipRuntime(hipGetDeviceProperties(&prop, device_id));
    return prop.name;
}

// One-line summary of the current device: id, name, arch major.minor, and
// free/total global memory in GB.
std::string device_description(){
    hipDeviceProp_t prop;
    size_t free_mem, total_mem;
    int device_id = 0;
    checkHipRuntime(hipGetDevice(&device_id));
    checkHipRuntime(hipGetDeviceProperties(&prop, device_id));
    checkHipRuntime(hipMemGetInfo(&free_mem, &total_mem));
    return iLogger::format(
        "[ID %d]<%s>[arch %d.%d][GMEM %.2f GB/%.2f GB]",
        device_id, prop.name, prop.major, prop.minor,
        free_mem / 1024.0f / 1024.0f / 1024.0f,
        total_mem / 1024.0f / 1024.0f / 1024.0f
    );
}

// Scoped device switch: remembers the current device, switches to device_id,
// and restores the previous device in the destructor.
AutoDevice::AutoDevice(int device_id){
    hipGetDevice(&old_);
    checkHipRuntime(hipSetDevice(device_id));
}
AutoDevice::~AutoDevice(){
    checkHipRuntime(hipSetDevice(old_));
}
}; // namespace sg
\ No newline at end of file
#ifndef HIP_TOOLS_HPP
#define HIP_TOOLS_HPP
/*
 * HIP utility declarations: error-check macros, launch-dimension helpers,
 * and device-information queries.
 */
#include <hip/hip_runtime_api.h>
#include <string>
#include "ilogger.hpp"
// Default 1-D block size used by grid_dims()/block_dims().
#define GPU_BLOCK_THREADS 512
// Kernel prologue: computes the global thread index into `position` and
// early-returns past the end. Expects a variable named `edge` in scope.
#define KernelPositionBlock \
int position = (blockDim.x * blockIdx.x + threadIdx.x); \
if (position >= (edge)) return;
// #define checkHipDriver(call) sg::check_driver(call, #call, __LINE__, __FILE__)
// Wraps a HIP runtime call; logs file/line diagnostics on failure.
#define checkHipRuntime(call) sg::check_runtime(call, #call, __LINE__, __FILE__)
// Inline variant: prints the error but does not propagate it.
#define CHECK(call) \
{ \
const hipError_t error = call; \
if (error != hipSuccess) \
{ \
printf("Error: %s:%d, ", __FILE__, __LINE__); \
printf("code: %d, reason: %s\n", error, hipGetErrorString(error)); \
} \
}
// Runs a kernel launch expression, then reports any launch error via
// hipPeekAtLastError (does not synchronize).
#define checkHipKernel(...) \
__VA_ARGS__; \
do{hipError_t cudaStatus = hipPeekAtLastError(); \
if (cudaStatus != hipSuccess){ \
printf("launch failed: %s\n", hipGetErrorString(cudaStatus)); \
}} while(0);
// Soft assert: prints the failed condition instead of aborting.
#define Assert(op) \
do{ \
bool cond = !(!(op)); \
if(!cond){ \
printf("Assert failed, \n" #op); \
} \
}while(false)
// Opaque driver-API handle aliases (forward declarations only).
struct CUctx_st;
struct CUstream_st;
typedef CUstream_st* ICUStream;
typedef CUctx_st* ICUContext;
typedef void* ICUDeviceptr;
typedef int DeviceID;
namespace sg{
// bool check_driver(CUresult e, const char* call, int iLine, const char *szFile);
// Logs and returns false on HIP runtime errors (see checkHipRuntime macro).
bool check_runtime(hipError_t e, const char* call, int iLine, const char *szFile);
// True when device_id is in [0, device count).
bool check_device_id(int device_id);
// Device id currently active on this thread.
int current_device_id();
// 1-D launch helpers sized from a job count (see GPU_BLOCK_THREADS).
dim3 grid_dims(int numJobs);
dim3 block_dims(int numJobs);
// return 8.6 etc.
std::string device_capability(int device_id);
std::string device_name(int device_id);
std::string device_description();
// Scoped device switch: restores the previous device on destruction.
class AutoDevice{
public:
AutoDevice(int device_id = 0);
virtual ~AutoDevice();
private:
int old_ = -1;  // device active before the switch
};
}
#endif // HIP_TOOLS_HPP
\ No newline at end of file
#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>
#include <hipcub/block/block_radix_sort.hpp>
#include <hipcub/warp/warp_reduce.hpp>
#include <hipcub/block/block_load.hpp>
#include <hipcub/block/block_discontinuity.hpp>
#include <hipcub/block/block_store.hpp>
#include <hipcub/block/block_reduce.hpp>
#include <hip/hip_math_constants.h>
#include "preprocess_kernel.cuh"
namespace AIKernel{
// Factory: Norm configured for (pixel * alpha - mean[c]) / std[c] normalization.
Norm Norm::mean_std(const float mean[3], const float std[3], float alpha, ChannelType channel_type){
    Norm result;
    result.type = NormType::MeanStd;
    result.alpha = alpha;
    result.channel_type = channel_type;
    for(int c = 0; c < 3; ++c){
        result.mean[c] = mean[c];
        result.std[c]  = std[c];
    }
    return result;
}
// Factory: Norm configured for pixel * alpha + beta normalization.
Norm Norm::alpha_beta(float alpha, float beta, ChannelType channel_type){
    Norm result;
    result.type = NormType::AlphaBeta;
    result.channel_type = channel_type;
    result.alpha = alpha;
    result.beta = beta;
    return result;
}
// Factory: default-constructed Norm (no normalization applied).
Norm Norm::None(){
    return {};
}
// Fixed-point scale used for bilinear interpolation weights (matches OpenCV's
// resize: weights are scaled by 2^11, products carry 2*11 fractional bits).
#define INTER_RESIZE_COEF_BITS 11
#define INTER_RESIZE_COEF_SCALE (1 << INTER_RESIZE_COEF_BITS)
#define CAST_BITS (INTER_RESIZE_COEF_BITS << 1)
// Clamp value into [low, high].
template<typename _T>
static __inline__ __device__ _T limit(_T value, _T low, _T high){
    return value < low ? low : (value > high ? high : value);
}
// Round a fixed-point interpolation sum back to an integer pixel value.
static __inline__ __device__ int resize_cast(int value){
    return (value + (1 << (CAST_BITS - 1))) >> CAST_BITS;
}
// One thread per pixel: converts the 3-byte BGR pixel to a rounded gray value
// (BT.601 luma weights) and atomically accumulates it into *sum. The caller
// divides by the pixel count to obtain the average brightness.
__global__ void computeAvgBright(
    const uint8_t* src, int* sum, int edge)
{
    int position = blockDim.x * blockIdx.x + threadIdx.x;
    if (position >= edge) return;
    // puDst = pointer to this pixel (3 interleaved bytes per pixel)
    const uint8_t* puDst = src + position * 3;
    // BGR -> gray; the + 0.5 rounds to nearest
    // Y = 0.299 R + 0.587 G + 0.114 B
    // uint8_t temp = uint8_t(puDst[0]*0.114f + puDst[1]*0.587f + puDst[2]*0.299f + 0.5f);
    atomicAdd(sum, int(puDst[0]*0.114f + puDst[1]*0.587f + puDst[2]*0.299f + 0.5f));
}
// One thread per destination pixel: maps (dx,dy) through the 3x3 perspective
// (homography) matrix, bilinearly samples the interleaved 3-channel source
// (border pixels fall back to const_value_st), optionally swaps channels 0/2
// and normalizes per `norm`, then writes planar (CHW) float output to dst.
__global__ void perspective_kernel(uint8_t* src, int src_line_size, int src_width, int src_height, float* dst, int dst_width, int dst_height,
    uint8_t const_value_st, float* warp_affine_matrix_3_3, Norm norm, int edge){
    int position = blockDim.x * blockIdx.x + threadIdx.x;
    if (position >= edge) return;
    // Row-major 3x3 matrix entries.
    float m_x1 = warp_affine_matrix_3_3[0];
    float m_y1 = warp_affine_matrix_3_3[1];
    float m_z1 = warp_affine_matrix_3_3[2];
    float m_x2 = warp_affine_matrix_3_3[3];
    float m_y2 = warp_affine_matrix_3_3[4];
    float m_z2 = warp_affine_matrix_3_3[5];
    float m_x3 = warp_affine_matrix_3_3[6];
    float m_y3 = warp_affine_matrix_3_3[7];
    float m_z3 = warp_affine_matrix_3_3[8];
    int dx = position % dst_width;
    int dy = position / dst_width;
    // Source-image coordinates (with perspective divide).
    float src_x = (m_x1 * dx + m_y1 * dy + m_z1)/(m_x3 * dx + m_y3 * dy + m_z3);
    float src_y = (m_x2 * dx + m_y2 * dy + m_z2)/(m_x3 * dx + m_y3 * dy + m_z3);
    float c0, c1, c2;
    if(src_x <= -1 || src_x >= src_width || src_y <= -1 || src_y >= src_height){
        // out of range
        c0 = const_value_st;
        c1 = const_value_st;
        c2 = const_value_st;
    }else{
        // Bilinear sample of the four neighbors; out-of-bounds neighbors read
        // from const_value instead.
        int y_low = floorf(src_y);
        int x_low = floorf(src_x);
        int y_high = y_low + 1;
        int x_high = x_low + 1;
        uint8_t const_value[] = {const_value_st, const_value_st, const_value_st};
        float ly = src_y - y_low;
        float lx = src_x - x_low;
        float hy = 1 - ly;
        float hx = 1 - lx;
        float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
        uint8_t* v1 = const_value;
        uint8_t* v2 = const_value;
        uint8_t* v3 = const_value;
        uint8_t* v4 = const_value;
        if(y_low >= 0){
            if (x_low >= 0)
                v1 = src + y_low * src_line_size + x_low * 3;
            if (x_high < src_width)
                v2 = src + y_low * src_line_size + x_high * 3;
        }
        if(y_high < src_height){
            if (x_low >= 0)
                v3 = src + y_high * src_line_size + x_low * 3;
            if (x_high < src_width)
                v4 = src + y_high * src_line_size + x_high * 3;
        }
        // same to opencv
        c0 = floorf(w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0] + 0.5f);
        c1 = floorf(w1 * v1[1] + w2 * v2[1] + w3 * v3[1] + w4 * v4[1] + 0.5f);
        c2 = floorf(w1 * v1[2] + w2 * v2[2] + w3 * v3[2] + w4 * v4[2] + 0.5f);
    }
    // Optional channel swap (e.g. BGR <-> RGB).
    if(norm.channel_type == ChannelType::Invert){
        float t = c2;
        c2 = c0; c0 = t;
    }
    if(norm.type == NormType::MeanStd){
        c0 = (c0 * norm.alpha - norm.mean[0]) / norm.std[0];
        c1 = (c1 * norm.alpha - norm.mean[1]) / norm.std[1];
        c2 = (c2 * norm.alpha - norm.mean[2]) / norm.std[2];
    }else if(norm.type == NormType::AlphaBeta){
        c0 = c0 * norm.alpha + norm.beta;
        c1 = c1 * norm.alpha + norm.beta;
        c2 = c2 * norm.alpha + norm.beta;
    }
    // Planar (CHW) store: one plane of `area` floats per channel.
    int area = dst_width * dst_height;
    float* pdst_c0 = dst + dy * dst_width + dx;
    float* pdst_c1 = pdst_c0 + area;
    float* pdst_c2 = pdst_c1 + area;
    *pdst_c0 = c0;
    *pdst_c1 = c1;
    *pdst_c2 = c2;
    // Debug aid: to visually verify the warp, comment out the normalization
    // and planar-store code above (from the second-to-last `if` down to here)
    // and enable the interleaved dump below instead.
    // float* buf_pdst_c0 = dst + position * 3;
    // float* buf_pdst_c1 = buf_pdst_c0 + 1;
    // float* buf_pdst_c2 = buf_pdst_c1 + 1;
    // *buf_pdst_c0 = c0;
    // *buf_pdst_c1 = c1;
    // *buf_pdst_c2 = c2;
}
// same to opencv
// reference: https://github.com/opencv/opencv/blob/24fcb7f8131f707717a9f1871b17d95e7cf519ee/modules/imgproc/src/resize.cpp
// reference: https://github.com/openppl-public/ppl.cv/blob/04ef4ca48262601b99f1bb918dcd005311f331da/src/ppl/cv/cuda/resize.cu
/*
 A resize function with the same implementation could be used for training;
 Python code: tools/test_resize.py
*/
// One thread per destination pixel: OpenCV-compatible fixed-point bilinear
// resize of the interleaved 3-channel source by scale (sx, sy), followed by
// optional channel swap / normalization, written as planar (CHW) floats.
__global__ void resize_bilinear_and_normalize_kernel(
    uint8_t* src, int src_line_size, int src_width, int src_height, float* dst, int dst_width, int dst_height,
    float sx, float sy, Norm norm, int edge
){
    int position = blockDim.x * blockIdx.x + threadIdx.x;
    if (position >= edge) return;
    // Destination pixel coordinates (just the coordinates).
    int dx = position % dst_width;
    int dy = position / dst_width;
    // Corresponding source coordinates (pixel-center aligned).
    float src_x = (dx + 0.5f) * sx - 0.5f;
    float src_y = (dy + 0.5f) * sy - 0.5f;
    float c0, c1, c2;
    int y_low = floorf(src_y);
    int x_low = floorf(src_x);
    int y_high = limit(y_low + 1, 0, src_height - 1);
    int x_high = limit(x_low + 1, 0, src_width - 1);
    y_low = limit(y_low, 0, src_height - 1);
    x_low = limit(x_low, 0, src_width - 1);
    // Fixed-point bilinear weights (scaled by INTER_RESIZE_COEF_SCALE).
    int ly = rint((src_y - y_low) * INTER_RESIZE_COEF_SCALE);
    int lx = rint((src_x - x_low) * INTER_RESIZE_COEF_SCALE);
    int hy = INTER_RESIZE_COEF_SCALE - ly;
    int hx = INTER_RESIZE_COEF_SCALE - lx;
    int w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
    // float* pdst = dst + dy * dst_width + dx * 3;
    uint8_t* v1 = src + y_low * src_line_size + x_low * 3;
    uint8_t* v2 = src + y_low * src_line_size + x_high * 3;
    uint8_t* v3 = src + y_high * src_line_size + x_low * 3;
    uint8_t* v4 = src + y_high * src_line_size + x_high * 3;
    c0 = resize_cast(w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0]);
    c1 = resize_cast(w1 * v1[1] + w2 * v2[1] + w3 * v3[1] + w4 * v4[1]);
    c2 = resize_cast(w1 * v1[2] + w2 * v2[2] + w3 * v3[2] + w4 * v4[2]);
    // Optional channel swap (e.g. BGR <-> RGB).
    if(norm.channel_type == ChannelType::Invert){
        float t = c2;
        c2 = c0; c0 = t;
    }
    if(norm.type == NormType::MeanStd){
        c0 = (c0 * norm.alpha - norm.mean[0]) / norm.std[0];
        c1 = (c1 * norm.alpha - norm.mean[1]) / norm.std[1];
        c2 = (c2 * norm.alpha - norm.mean[2]) / norm.std[2];
    }else if(norm.type == NormType::AlphaBeta){
        c0 = c0 * norm.alpha + norm.beta;
        c1 = c1 * norm.alpha + norm.beta;
        c2 = c2 * norm.alpha + norm.beta;
    }
    // Planar (CHW) store: one plane of `area` floats per channel.
    int area = dst_width * dst_height;
    float* pdst_c0 = dst + dy * dst_width + dx;
    float* pdst_c1 = pdst_c0 + area;
    float* pdst_c2 = pdst_c1 + area;
    *pdst_c0 = c0;
    *pdst_c1 = c1;
    *pdst_c2 = c2;
    // Debug aid: interleaved dump of the resized pixels.
    // float* buf_pdst_c0 = dst + position * 3;
    // float* buf_pdst_c1 = buf_pdst_c0 + 1;
    // float* buf_pdst_c2 = buf_pdst_c1 + 1;
    // *buf_pdst_c0 = c0;
    // *buf_pdst_c1 = c1;
    // *buf_pdst_c2 = c2;
}
// Variant of resize_bilinear_and_normalize_kernel that additionally writes the
// raw (pre-normalization) resized pixels as interleaved floats into
// resize_buffer, while still producing the normalized planar output in dst.
__global__ void resize_bilinear_and_normalize_kernel_two_steps(
    uint8_t* src, float* resize_buffer, int src_line_size, int src_width, int src_height, float* dst, int dst_width, int dst_height,
    float sx, float sy, Norm norm, int edge
){
    int position = blockDim.x * blockIdx.x + threadIdx.x;
    if (position >= edge) return;
    // Row/column of this destination pixel.
    int dx = position % dst_width;
    int dy = position / dst_width;
    float src_x = (dx + 0.5f) * sx - 0.5f;
    float src_y = (dy + 0.5f) * sy - 0.5f;
    float c0, c1, c2;
    int y_low = floorf(src_y);
    int x_low = floorf(src_x);
    int y_high = limit(y_low + 1, 0, src_height - 1);
    int x_high = limit(x_low + 1, 0, src_width - 1);
    y_low = limit(y_low, 0, src_height - 1);
    x_low = limit(x_low, 0, src_width - 1);
    int ly = rint((src_y - y_low) * INTER_RESIZE_COEF_SCALE);
    int lx = rint((src_x - x_low) * INTER_RESIZE_COEF_SCALE);
    int hy = INTER_RESIZE_COEF_SCALE - ly;
    int hx = INTER_RESIZE_COEF_SCALE - lx;
    // w* are the fixed-point bilinear interpolation weights.
    int w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
    // float* temp_pdst = resize_buffer + dy * dst_width + dx * 3;
    // float* pdst = dst + dy * dst_width + dx * 3;
    uint8_t* v1 = src + y_low * src_line_size + x_low * 3;
    uint8_t* v2 = src + y_low * src_line_size + x_high * 3;
    uint8_t* v3 = src + y_high * src_line_size + x_low * 3;
    uint8_t* v4 = src + y_high * src_line_size + x_high * 3;
    c0 = resize_cast(w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0]);
    c1 = resize_cast(w1 * v1[1] + w2 * v2[1] + w3 * v3[1] + w4 * v4[1]);
    c2 = resize_cast(w1 * v1[2] + w2 * v2[2] + w3 * v3[2] + w4 * v4[2]);
    int area = dst_width * dst_height;
    // Step 1: store the raw resized pixel (interleaved) before normalization.
    // float* buf_pdst_c0 = resize_buffer + dy * dst_width + dx;
    float* buf_pdst_c0 = resize_buffer + position * 3;
    float* buf_pdst_c1 = buf_pdst_c0 + 1;
    float* buf_pdst_c2 = buf_pdst_c1 + 1;
    *buf_pdst_c0 = c0;
    *buf_pdst_c1 = c1;
    *buf_pdst_c2 = c2;
    // Step 2: optional channel swap and normalization, as in the one-step kernel.
    if(norm.channel_type == ChannelType::Invert){
        float t = c2;
        c2 = c0; c0 = t;
    }
    if(norm.type == NormType::MeanStd){
        c0 = (c0 * norm.alpha - norm.mean[0]) / norm.std[0];
        c1 = (c1 * norm.alpha - norm.mean[1]) / norm.std[1];
        c2 = (c2 * norm.alpha - norm.mean[2]) / norm.std[2];
    }else if(norm.type == NormType::AlphaBeta){
        c0 = c0 * norm.alpha + norm.beta;
        c1 = c1 * norm.alpha + norm.beta;
        c2 = c2 * norm.alpha + norm.beta;
    }
    // Normalized planar (CHW) output.
    float* pdst_c0 = dst + dy * dst_width + dx;
    float* pdst_c1 = pdst_c0 + area;
    float* pdst_c2 = pdst_c1 + area;
    *pdst_c0 = c0;
    *pdst_c1 = c1;
    *pdst_c2 = c2;
}
// One thread per destination pixel: maps (dx,dy) through the 2x3 affine
// matrix, bilinearly samples the interleaved 3-channel source (out-of-range
// pixels become const_value_st), optionally swaps channels / normalizes,
// and writes planar (CHW) float output.
__global__ void warp_affine_bilinear_and_normalize_plane_kernel(uint8_t* src, int src_line_size, int src_width, int src_height, float* dst, int dst_width, int dst_height,
    uint8_t const_value_st, float* warp_affine_matrix_2_3, Norm norm, int edge){
    int position = blockDim.x * blockIdx.x + threadIdx.x;
    if (position >= edge) return;
    // Row-major 2x3 affine matrix entries.
    float m_x1 = warp_affine_matrix_2_3[0];
    float m_y1 = warp_affine_matrix_2_3[1];
    float m_z1 = warp_affine_matrix_2_3[2];
    float m_x2 = warp_affine_matrix_2_3[3];
    float m_y2 = warp_affine_matrix_2_3[4];
    float m_z2 = warp_affine_matrix_2_3[5];
    int dx = position % dst_width;
    int dy = position / dst_width;
    float src_x = m_x1 * dx + m_y1 * dy + m_z1;
    float src_y = m_x2 * dx + m_y2 * dy + m_z2;
    float c0, c1, c2;
    if(src_x <= -1 || src_x >= src_width || src_y <= -1 || src_y >= src_height){
        // out of range
        c0 = const_value_st;
        c1 = const_value_st;
        c2 = const_value_st;
    }else{
        // Bilinear sample; neighbors outside the image read const_value.
        int y_low = floorf(src_y);
        int x_low = floorf(src_x);
        int y_high = y_low + 1;
        int x_high = x_low + 1;
        uint8_t const_value[] = {const_value_st, const_value_st, const_value_st};
        float ly = src_y - y_low;
        float lx = src_x - x_low;
        float hy = 1 - ly;
        float hx = 1 - lx;
        float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
        uint8_t* v1 = const_value;
        uint8_t* v2 = const_value;
        uint8_t* v3 = const_value;
        uint8_t* v4 = const_value;
        if(y_low >= 0){
            if (x_low >= 0)
                v1 = src + y_low * src_line_size + x_low * 3;
            if (x_high < src_width)
                v2 = src + y_low * src_line_size + x_high * 3;
        }
        if(y_high < src_height){
            if (x_low >= 0)
                v3 = src + y_high * src_line_size + x_low * 3;
            if (x_high < src_width)
                v4 = src + y_high * src_line_size + x_high * 3;
        }
        // same to opencv
        c0 = floorf(w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0] + 0.5f);
        c1 = floorf(w1 * v1[1] + w2 * v2[1] + w3 * v3[1] + w4 * v4[1] + 0.5f);
        c2 = floorf(w1 * v1[2] + w2 * v2[2] + w3 * v3[2] + w4 * v4[2] + 0.5f);
    }
    // Optional channel swap (e.g. BGR <-> RGB).
    if(norm.channel_type == ChannelType::Invert){
        float t = c2;
        c2 = c0; c0 = t;
    }
    if(norm.type == NormType::MeanStd){
        c0 = (c0 * norm.alpha - norm.mean[0]) / norm.std[0];
        c1 = (c1 * norm.alpha - norm.mean[1]) / norm.std[1];
        c2 = (c2 * norm.alpha - norm.mean[2]) / norm.std[2];
    }else if(norm.type == NormType::AlphaBeta){
        c0 = c0 * norm.alpha + norm.beta;
        c1 = c1 * norm.alpha + norm.beta;
        c2 = c2 * norm.alpha + norm.beta;
    }
    // Planar (CHW) store: one plane of `area` floats per channel.
    int area = dst_width * dst_height;
    float* pdst_c0 = dst + dy * dst_width + dx;
    float* pdst_c1 = pdst_c0 + area;
    float* pdst_c2 = pdst_c1 + area;
    *pdst_c0 = c0;
    *pdst_c1 = c1;
    *pdst_c2 = c2;
    // Debug aid: interleaved dump of the warped pixels.
    // float* buf_pdst_c0 = dst + position * 3;
    // float* buf_pdst_c1 = buf_pdst_c0 + 1;
    // float* buf_pdst_c2 = buf_pdst_c1 + 1;
    // *buf_pdst_c0 = c0;
    // *buf_pdst_c1 = c1;
    // *buf_pdst_c2 = c2;
}
// One thread per destination pixel: 2x3 affine warp with bilinear sampling,
// but WITHOUT normalization — output stays interleaved uint8 BGR (an image,
// not a network input tensor).
__global__ void warp_affine_kernel(uint8_t* src, int src_line_size, int src_width, int src_height, uint8_t* dst, int dst_width, int dst_height,
    uint8_t const_value_st, float* warp_affine_matrix_2_3, int edge){
    int position = blockDim.x * blockIdx.x + threadIdx.x;
    if (position >= edge) return;
    // Row-major 2x3 affine matrix entries.
    float m_x1 = warp_affine_matrix_2_3[0];
    float m_y1 = warp_affine_matrix_2_3[1];
    float m_z1 = warp_affine_matrix_2_3[2];
    float m_x2 = warp_affine_matrix_2_3[3];
    float m_y2 = warp_affine_matrix_2_3[4];
    float m_z2 = warp_affine_matrix_2_3[5];
    int dx = position % dst_width;
    int dy = position / dst_width;
    float src_x = m_x1 * dx + m_y1 * dy + m_z1;
    float src_y = m_x2 * dx + m_y2 * dy + m_z2;
    float c0, c1, c2;
    if(src_x <= -1 || src_x >= src_width || src_y <= -1 || src_y >= src_height){
        // out of range
        c0 = const_value_st;
        c1 = const_value_st;
        c2 = const_value_st;
    }else{
        // Bilinear sample; neighbors outside the image read const_value.
        int y_low = floorf(src_y);
        int x_low = floorf(src_x);
        int y_high = y_low + 1;
        int x_high = x_low + 1;
        uint8_t const_value[] = {const_value_st, const_value_st, const_value_st};
        float ly = src_y - y_low;
        float lx = src_x - x_low;
        float hy = 1 - ly;
        float hx = 1 - lx;
        float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
        uint8_t* v1 = const_value;
        uint8_t* v2 = const_value;
        uint8_t* v3 = const_value;
        uint8_t* v4 = const_value;
        if(y_low >= 0){
            if (x_low >= 0)
                v1 = src + y_low * src_line_size + x_low * 3;
            if (x_high < src_width)
                v2 = src + y_low * src_line_size + x_high * 3;
        }
        if(y_high < src_height){
            if (x_low >= 0)
                v3 = src + y_high * src_line_size + x_low * 3;
            if (x_high < src_width)
                v4 = src + y_high * src_line_size + x_high * 3;
        }
        // same to opencv
        c0 = floorf(w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0] + 0.5f);
        c1 = floorf(w1 * v1[1] + w2 * v2[1] + w3 * v3[1] + w4 * v4[1] + 0.5f);
        c2 = floorf(w1 * v1[2] + w2 * v2[2] + w3 * v3[2] + w4 * v4[2] + 0.5f);
    }
    // Interleaved uint8 store (3 bytes per pixel).
    uint8_t* buf_pdst_c0 = dst + position * 3;
    uint8_t* buf_pdst_c1 = buf_pdst_c0 + 1;
    uint8_t* buf_pdst_c2 = buf_pdst_c1 + 1;
    *buf_pdst_c0 = (uint8_t)c0;
    *buf_pdst_c1 = (uint8_t)c1;
    *buf_pdst_c2 = (uint8_t)c2;
}
// Warp-affine restricted to uniform scale + translation, given as a 1x3 vector
// [k, b0, b1]: src_x = k*dx + b0, src_y = k*dy + b1. Bilinear sampling, then
// normalization, then a Focus-style 2x2 space-to-depth output layout:
// 4*3 = 12 planes of size (dst_height/2) x (dst_width/2).
// One thread per destination pixel; edge = dst_width * dst_height.
__global__ void warp_affine_bilinear_and_normalize_focus_kernel(uint8_t* src, int src_line_size, int src_width, int src_height, float* dst, int dst_width, int dst_height,
uint8_t const_value_st, float* warp_affine_matrix_1_3, Norm norm, int edge){
int position = blockDim.x * blockIdx.x + threadIdx.x;
if (position >= edge) return;
// Unpack [scale, x-offset, y-offset].
float m_k = *warp_affine_matrix_1_3++;
float m_b0 = *warp_affine_matrix_1_3++;
float m_b1 = *warp_affine_matrix_1_3++;
int dx = position % dst_width;
int dy = position / dst_width;
// Back-project the destination pixel into source coordinates.
float src_x = m_k * dx + m_b0;
float src_y = m_k * dy + m_b1;
float c0, c1, c2;
if(src_x <= -1 || src_x >= src_width || src_y <= -1 || src_y >= src_height){
// Entirely out of range: fill with the constant border value.
c0 = const_value_st;
c1 = const_value_st;
c2 = const_value_st;
}else{
// Bilinear interpolation over the 2x2 neighborhood; neighbors that fall
// outside the image sample the constant border value instead.
int y_low = floorf(src_y);
int x_low = floorf(src_x);
int y_high = y_low + 1;
int x_high = x_low + 1;
uint8_t const_value[] = {const_value_st, const_value_st, const_value_st};
float ly = src_y - y_low;
float lx = src_x - x_low;
float hy = 1 - ly;
float hx = 1 - lx;
float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
uint8_t* v1 = const_value;
uint8_t* v2 = const_value;
uint8_t* v3 = const_value;
uint8_t* v4 = const_value;
if(y_low >= 0){
if (x_low >= 0)
v1 = src + y_low * src_line_size + x_low * 3;
if (x_high < src_width)
v2 = src + y_low * src_line_size + x_high * 3;
}
if(y_high < src_height){
if (x_low >= 0)
v3 = src + y_high * src_line_size + x_low * 3;
if (x_high < src_width)
v4 = src + y_high * src_line_size + x_high * 3;
}
// Round to nearest, matching OpenCV's behavior.
c0 = floorf(w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0] + 0.5f);
c1 = floorf(w1 * v1[1] + w2 * v2[1] + w3 * v3[1] + w4 * v4[1] + 0.5f);
c2 = floorf(w1 * v1[2] + w2 * v2[2] + w3 * v3[2] + w4 * v4[2] + 0.5f);
}
// Optionally swap channels 0 and 2 (e.g. BGR <-> RGB) before normalization.
if(norm.channel_type == ChannelType::Invert){
float t = c2;
c2 = c0; c0 = t;
}
// Apply the selected normalization.
if(norm.type == NormType::MeanStd){
c0 = (c0 * norm.alpha - norm.mean[0]) / norm.std[0];
c1 = (c1 * norm.alpha - norm.mean[1]) / norm.std[1];
c2 = (c2 * norm.alpha - norm.mean[2]) / norm.std[2];
}else if(norm.type == NormType::AlphaBeta){
c0 = c0 * norm.alpha + norm.beta;
c1 = c1 * norm.alpha + norm.beta;
c2 = c2 * norm.alpha + norm.beta;
}
int after_focus_width = dst_width / 2;
int after_focus_height = dst_height / 2;
int fdx = dx / 2;
int fdy = dy / 2;
// Which of the four 2x2 sub-grids this pixel belongs to.
int fc = ((dx % 2) << 1) | (dy % 2);
/**
 * Focus (space-to-depth) rearrangement, matching the slices
 * x[..., ::2, ::2], x[..., 1::2, ::2], x[..., ::2, 1::2], x[..., 1::2, 1::2].
 * Output is laid out as [4 (fc)][3 (channel)][after_focus_height (fdy)][after_focus_width (fdx)],
 * flattened row-major: multiply by the inner extent, then add the inner index.
 **/
float* pdst_c0 = dst + ((fc * 3 + 0) * after_focus_height + fdy) * after_focus_width + fdx;
float* pdst_c1 = dst + ((fc * 3 + 1) * after_focus_height + fdy) * after_focus_width + fdx;
float* pdst_c2 = dst + ((fc * 3 + 2) * after_focus_height + fdy) * after_focus_width + fdx;
*pdst_c0 = c0;
*pdst_c1 = c1;
*pdst_c2 = c2;
}
// L2-normalize each row of feature_array (num_feature rows x feature_length cols)
// in place: row /= sqrt(sum(row^2)).
// Requires the launch setup from norm_feature(): num_feature * sizeof(float) bytes
// of dynamic shared memory for the per-row squared-sum accumulators, and
// edge == num_feature * feature_length so no thread in a live block early-returns
// before the __syncthreads() barriers.
__global__ void normalize_feature_kernel(float* feature_array, int num_feature, int feature_length, int edge){
/*
 * Launch configuration (see norm_feature):
 *   grid  = (num_feature, 1, 1)
 *   block = (feature_length / 32, 32, 1)   -> feature_length threads per block
 * NOTE(review): with that layout, irow works out to blockIdx.x for every thread
 * in a block, so each block only ever touches shared slot [blockIdx.x] even
 * though num_feature floats are allocated — verify before shrinking the size.
 */
int position = (blockIdx.x * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;
if (position >= edge) return;
extern __shared__ float l2_norm[];
int irow = position / feature_length;
int icol = position % feature_length;
// One thread per row zeroes the accumulator before the atomic adds.
if(icol == 0)
l2_norm[irow] = 0;
__syncthreads();
float value = feature_array[position];
atomicAdd(l2_norm + irow, value * value);
__syncthreads();
if(icol == 0)
l2_norm[irow] = sqrt(l2_norm[irow]);
__syncthreads();
// NOTE(review): an all-zero row yields l2_norm == 0 and divides by zero (NaN);
// confirm upstream guarantees non-zero feature vectors.
feature_array[position] = value / l2_norm[irow];
}
// Saturate a float into the uint8 range: values below 0 clamp to 0, values
// above 255 clamp to 255, everything else truncates via the conversion.
static __device__ uint8_t cast(float value){
    if(value < 0) return 0;
    if(value > 255) return 255;
    return value;
}
// Convert one NV12 pixel (full-res Y plane + half-res interleaved UV plane) to
// packed BGR. One thread per pixel; edge = width * height. `linesize` is the
// row stride in bytes of both the Y plane and the UV plane.
static __global__ void convert_nv12_to_bgr_kernel(const uint8_t* y, const uint8_t* uv, int width, int height, int linesize, uint8_t* dst_bgr, int edge){
    int position = blockDim.x * blockIdx.x + threadIdx.x;
    if (position >= edge) return;
    int ox = position % width;
    int oy = position / width;
    const uint8_t& yvalue = y[oy * linesize + ox];
    // UV is subsampled 2x2; clearing the low bit of ox lands on the U byte of the pair.
    int offset_uv = (oy >> 1) * linesize + (ox & 0xFFFFFFFE);
    const uint8_t& u = uv[offset_uv + 0];
    const uint8_t& v = uv[offset_uv + 1];
    // BT.601 limited-range YUV -> BGR. The expressions can leave [0, 255] for
    // saturated colors, and converting an out-of-range float to uint8_t is
    // undefined behavior — clamp through cast() (defined above, previously unused).
    dst_bgr[position * 3 + 0] = cast(1.164f * (yvalue - 16.0f) + 2.018f * (u - 128.0f));
    dst_bgr[position * 3 + 1] = cast(1.164f * (yvalue - 16.0f) - 0.813f * (v - 128.0f) - 0.391f * (u - 128.0f));
    dst_bgr[position * 3 + 2] = cast(1.164f * (yvalue - 16.0f) + 1.596f * (v - 128.0f));
}
// Copy the ROI [x_start, x_start+roi_width) x [y_start, y_start+roi_height) of a
// 3-channel interleaved uint8 image into a tightly packed dst buffer of
// roi_width * roi_height * 3 bytes.
// One thread per SOURCE pixel; edge = src_width * src_height.
// NOTE(review): source pixels are addressed as position * 3, i.e. a tightly
// packed image is assumed and src_line_step is unused — confirm with callers.
__global__ void roi_kernel(uint8_t* src, int src_line_step, int src_width,
int src_height, uint8_t* dst, int x_start, int y_start, int roi_width, int roi_height, int edge)
{
    int position = blockDim.x * blockIdx.x + threadIdx.x;
    if (position >= edge) return;
    int sx = position % src_width;
    int sy = position / src_width;
    // Strict bounds. The original used inclusive upper bounds (<=), which copied
    // one extra column into the next row's slot and, on the row just below the
    // ROI, wrote past the end of dst (out-of-bounds write).
    if (sx < x_start || sx >= x_start + roi_width) return;
    if (sy < y_start || sy >= y_start + roi_height) return;
    int dx = sx - x_start;
    int dy = sy - y_start;
    const uint8_t* psrc = src + position * 3;
    uint8_t* pdst = dst + (dy * roi_width + dx) * 3;
    pdst[0] = psrc[0];
    pdst[1] = psrc[1];
    pdst[2] = psrc[2];
}
/////////////////////////////////////////////////////////////////////////
void convert_nv12_to_bgr_invoke(
const uint8_t* y, const uint8_t* uv, int width, int height, int linesize, uint8_t* dst, hipStream_t stream){
int total = width * height;
dim3 grid = sg::grid_dims(total);
dim3 block = sg::block_dims(total);
checkHipKernel(convert_nv12_to_bgr_kernel<<<grid, block, 0, stream>>>(
y, uv, width, height, linesize,
dst, total
));
}
// Warp-affine (2x3 matrix) + bilinear sampling + normalization into a planar
// (channel-major) float destination; one thread per destination pixel.
void warp_affine_bilinear_and_normalize_plane(
    uint8_t* src, int src_line_size, int src_width, int src_height, float* dst, int dst_width, int dst_height,
    float* matrix_2_3, uint8_t const_value, const Norm& norm,
    hipStream_t stream) {
    int pixel_count = dst_width * dst_height;
    auto grid_dim = sg::grid_dims(pixel_count);
    auto block_dim = sg::block_dims(pixel_count);
    checkHipKernel(warp_affine_bilinear_and_normalize_plane_kernel<<<grid_dim, block_dim, 0, stream>>>(
        src, src_line_size, src_width, src_height,
        dst, dst_width, dst_height,
        const_value, matrix_2_3, norm, pixel_count));
}
// Plain affine warp to a uint8 interleaved destination; no normalization step.
void warp_affine(
    uint8_t* src, int src_line_size, int src_width, int src_height, uint8_t* dst, int dst_width, int dst_height,
    float* matrix_2_3, uint8_t const_value, hipStream_t& stream) {
    int pixel_count = dst_width * dst_height;
    auto grid_dim = sg::grid_dims(pixel_count);
    auto block_dim = sg::block_dims(pixel_count);
    checkHipKernel(warp_affine_kernel<<<grid_dim, block_dim, 0, stream>>>(
        src, src_line_size, src_width, src_height,
        dst, dst_width, dst_height,
        const_value, matrix_2_3, pixel_count));
}
// Affine warp (1x3 vector: uniform scale + translation) + normalization with a
// Focus-style 2x2 space-to-depth output layout; one thread per destination pixel.
void warp_affine_bilinear_and_normalize_focus(
    uint8_t* src, int src_line_size, int src_width, int src_height,
    float* dst , int dst_width, int dst_height,
    float* matrix_1_3, uint8_t const_value, const Norm& norm,
    hipStream_t stream){
    int pixel_count = dst_width * dst_height;
    auto grid_dim = sg::grid_dims(pixel_count);
    auto block_dim = sg::block_dims(pixel_count);
    checkHipKernel(warp_affine_bilinear_and_normalize_focus_kernel<<<grid_dim, block_dim, 0, stream>>>(
        src, src_line_size, src_width, src_height,
        dst, dst_width, dst_height,
        const_value, matrix_1_3, norm, pixel_count));
}
void resize_bilinear_and_normalize(
uint8_t* src, int src_line_size, int src_width, int src_height, float* dst, int dst_width, int dst_height,
const Norm& norm) {
int jobs = dst_width * dst_height;
auto grid = sg::grid_dims(jobs);
auto block = sg::block_dims(jobs);
resize_bilinear_and_normalize_kernel <<<grid, block, 0, nullptr >>> (
src, src_line_size,
src_width, src_height, dst,
dst_width, dst_height, src_width/(float)dst_width, src_height/(float)dst_height, norm, jobs
);
}
// Two-step variant of the bilinear resize + normalization: the kernel stages
// through resize_buffer and runs on the caller-supplied stream.
void resize_bilinear_and_normalize_two_steps(
    uint8_t* src, float* resize_buffer, int src_line_size, int src_width, int src_height, float* dst, int dst_width, int dst_height,
    const Norm& norm,
    hipStream_t stream)
{
    int pixel_count = dst_width * dst_height;
    auto grid_dim = sg::grid_dims(pixel_count);
    auto block_dim = sg::block_dims(pixel_count);
    float scale_x = src_width / (float)dst_width;
    float scale_y = src_height / (float)dst_height;
    checkHipKernel(resize_bilinear_and_normalize_kernel_two_steps<<<grid_dim, block_dim, 0, stream>>>(
        src, resize_buffer, src_line_size,
        src_width, src_height, dst,
        dst_width, dst_height, scale_x, scale_y, norm, pixel_count));
}
// L2-normalize each row of feature_array [num_feature x feature_length] in place.
void norm_feature(
    float* feature_array, int num_feature, int feature_length,
    hipStream_t stream
){
    // The kernel splits each row across a (feature_length/32, 32) block.
    Assert(feature_length % 32 == 0);
    int total = num_feature * feature_length;
    auto grid_dim = dim3(num_feature);                     // one block per feature row
    auto block_dim = dim3(feature_length / 32, 32);        // feature_length threads per block
    size_t shared_bytes = num_feature * sizeof(float);     // per-row accumulator slots
    checkHipKernel(normalize_feature_kernel<<<grid_dim, block_dim, shared_bytes, stream>>>(
        feature_array, num_feature, feature_length, total));
}
/************************************************************************
Function:  GetImgAvgBright
Purpose:   Compute the average image brightness on the GPU.
Params:    src       [IN]  image address (currently a CPU-visible address);
           width     [IN]  image width;
           height    [IN]  image height;
           linesize  [IN]  bytes per image row (NOTE(review): not forwarded to
                           the kernel — confirm computeAvgBright assumes a
                           tightly packed image);
           avgBright [OUT] average brightness, normalized by 255;
Returns:   none
************************************************************************/
void GetImgAvgBright(
    const uint8_t* src, int width, int height, int linesize, float& avgBright
)
{
    int total = width * height;
    if (total <= 0){
        // Empty image: avoid a zero-sized launch and a division by zero below.
        avgBright = 0.0f;
        return;
    }
    dim3 grid = sg::grid_dims(total);
    dim3 block = sg::block_dims(total);

    // Device-side accumulator for the summed pixel brightness.
    int* sum_dev = nullptr;
    checkHipRuntime(hipMalloc(&sum_dev, sizeof(int)));
    checkHipRuntime(hipMemset(sum_dev, 0, sizeof(int)));

    computeAvgBright<<<grid, block, 0, nullptr>>> (
        src, sum_dev, total
    );

    // A stack variable replaces the original heap-allocated single int, and the
    // copy is now error-checked like every other runtime call in this file.
    // hipMemcpy on the null stream also synchronizes with the kernel above.
    int sum_host = 0;
    checkHipRuntime(hipMemcpy(&sum_host, sum_dev, sizeof(int), hipMemcpyDeviceToHost));

    // Divide in float: the original `total * 255` overflowed int for images
    // larger than ~8.4 megapixels.
    avgBright = float(sum_host) / (float(total) * 255.0f);

    checkHipRuntime(hipFree(sum_dev));
}
// Perspective warp (3x3 matrix) + normalization into a planar float destination,
// launched on the default stream; one thread per destination pixel.
void Perspective_gpu(
    uint8_t* src, int src_line_size, int src_width, int src_height, float* dst, int dst_width, int dst_height,
    float* matrix_3_3, uint8_t const_value, const Norm& norm
)
{
    int pixel_count = dst_width * dst_height;
    auto grid_dim = sg::grid_dims(pixel_count);
    auto block_dim = sg::block_dims(pixel_count);
    checkHipKernel(perspective_kernel<<<grid_dim, block_dim, 0, nullptr>>>(
        src, src_line_size,
        src_width, src_height, dst,
        dst_width, dst_height, const_value, matrix_3_3, norm, pixel_count));
}
// Extract the dst_width x dst_height ROI starting at (x, y) into dst.
// One thread is launched per SOURCE pixel; each thread copies its pixel only if
// it falls inside the ROI.
void obtain_roi(
    uint8_t* src, int src_line_step, int src_width, int src_height, uint8_t* dst, int x, int y, int dst_width, int dst_height,
    hipStream_t stream)
{
    int total = src_width * src_height;
    auto grid_dim = sg::grid_dims(total);
    auto block_dim = sg::block_dims(total);
    checkHipKernel(roi_kernel<<<grid_dim, block_dim, 0, stream>>>(
        src, src_line_step,
        src_width, src_height, dst,
        x, y, dst_width,
        dst_height, total));
}
};
#ifndef PREPROCESS_KERNEL_CUH
#define PREPROCESS_KERNEL_CUH
#include "preprocess_kernel/hip_tools.hpp"
#include <hip/hip_runtime_api.h>
namespace AIKernel{
// Pixel-value normalization mode applied by the preprocessing kernels.
enum class NormType : int{
None = 0,      // leave values unchanged
MeanStd = 1,   // out = (x * alpha - mean) / std
AlphaBeta = 2  // out = x * alpha + beta
};
// Channel-order handling: Invert swaps channels 0 and 2 (e.g. BGR <-> RGB)
// before normalization is applied.
enum class ChannelType : int{
None = 0,
Invert = 1
};
// Normalization parameters passed by value to the preprocessing kernels.
// Build instances via the static factories below rather than filling fields by hand.
struct Norm{
float mean[3];   // per-channel mean, used when type == MeanStd
float std[3];    // per-channel std deviation, used when type == MeanStd
float alpha, beta;
NormType type = NormType::None;
ChannelType channel_type = ChannelType::None;
// out = (x * alpha - mean) / std
static Norm mean_std(const float mean[3], const float std[3], float alpha = 1/255.0f, ChannelType channel_type=ChannelType::None);
// out = x * alpha + beta
static Norm alpha_beta(float alpha, float beta = 0, ChannelType channel_type=ChannelType::None);
// No normalization (NormType::None).
static Norm None();
};
/************************************************************************
Function:  GetImgAvgBright
Purpose:   Compute the average image brightness on the GPU.
Params:    src       [IN]  image address;
           width     [IN]  image width;
           height    [IN]  image height;
           linesize  [IN]  bytes per image row;
           avgBright [OUT] average brightness;
Returns:   none
************************************************************************/
void GetImgAvgBright(
const uint8_t* src, int width, int height, int linesize, float& avgBright
);
//
// Usage example for Perspective_gpu:
//
// float* pfMatrix_dev = nullptr;
// size_t matrix_bytes = 3 * 3 * sizeof(f32);
// checkCudaRuntime(cudaMalloc(&pfMatrix_dev, matrix_bytes));
// checkCudaRuntime(cudaMemset(pfMatrix_dev, 0, matrix_bytes));
//
// // Source-image corner points: top-left, top-right, bottom-right, bottom-left
// cv::Point2f src_points[] = {
//     vctvctPoints[nImageIdx][0],
//     vctvctPoints[nImageIdx][1],
//     vctvctPoints[nImageIdx][2],
//     vctvctPoints[nImageIdx][3]};
//
// // Destination corner points in Z order: top-left, top-right, bottom-left, bottom-right
// cv::Point2f dst_points[] = {
//     cv::Point2f(0, 0),
//     cv::Point2f(nw, 0),
//     cv::Point2f(0, nh),
//     cv::Point2f(nw, nh) };
// // Obtain the transform matrix with OpenCV
// cv::Mat tfMatrix_org = cv::getPerspectiveTransform(src_points, dst_points);
// cv::Mat tfMatrix_I;
// // Invert the transform matrix
// cv::invert(tfMatrix_org, tfMatrix_I);
// tfMatrix_I.convertTo(tfMatrix_I, CV_32FC1);
// // Copy to the GPU
// checkCudaRuntime(cudaMemcpy(pfMatrix_dev, tfMatrix_I.data, matrix_bytes, cudaMemcpyHostToDevice));
// Perspective warp (3x3 matrix) + normalization, planar float output, default stream.
void Perspective_gpu(
uint8_t* src, int src_line_size, int src_width, int src_height, float* dst, int dst_width, int dst_height,
float* matrix_3_3, uint8_t const_value, const Norm& norm
);
// Copy the dst_width x dst_height ROI starting at (x, y) into dst.
void obtain_roi(
uint8_t* src, int src_line_step, int src_width, int src_height, uint8_t* dst, int x, int y, int dst_width, int dst_height,
hipStream_t stream);
// Bilinear resize + normalization, planar float output, default stream.
void resize_bilinear_and_normalize(
uint8_t* src, int src_line_size, int src_width, int src_height, float* dst, int dst_width, int dst_height,
const Norm& norm);
// Same as above, staging through resize_buffer on an explicit stream.
void resize_bilinear_and_normalize_two_steps(
uint8_t* src, float* resize_buffer, int src_line_size, int src_width, int src_height, float* dst, int dst_width, int dst_height,
const Norm& norm,
hipStream_t stream);
// Warp-affine (2x3 matrix) + bilinear sampling + normalization, planar float output.
void warp_affine_bilinear_and_normalize_plane(
uint8_t* src, int src_line_size, int src_width, int src_height,
float* dst , int dst_width, int dst_height,
float* matrix_2_3, uint8_t const_value, const Norm& norm,
hipStream_t stream);
// Warp-affine only; uint8 interleaved output, no normalization.
void warp_affine(
uint8_t* src, int src_line_size, int src_width, int src_height,
uint8_t* dst , int dst_width, int dst_height,
float* matrix_2_3, uint8_t const_value, hipStream_t& stream);
// Warp-affine + normalization with Focus (2x2 space-to-depth) output layout.
// matrix_1_3 is a 1x3 vector [k, b0, b1] (uniform scale + translation), matching
// the definition — the parameter was previously misnamed matrix_2_3 here.
void warp_affine_bilinear_and_normalize_focus(
uint8_t* src, int src_line_size, int src_width, int src_height,
float* dst , int dst_width, int dst_height,
float* matrix_1_3, uint8_t const_value, const Norm& norm,
hipStream_t stream);
// L2-normalize each row of feature_array [num_feature x feature_length] in place.
void norm_feature(
float* feature_array, int num_feature, int feature_length,
hipStream_t stream
);
// Convert an NV12 image (Y plane + interleaved half-res UV plane) to packed BGR.
void convert_nv12_to_bgr_invoke(
const uint8_t* y, const uint8_t* uv, int width, int height,
int linesize, uint8_t* dst,
hipStream_t stream);
};
#endif // PREPROCESS_KERNEL_CUH
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment