Commit b3e3ac37 authored by ghfund4_b52

Update custom_op_library.cc.o, rocm_ops.hip, rocm_ops.hip.o, libcustom_op_library.so, rocm_ops.cc.o, compile_rdc.sh, custom_op_library.h, benchmark.py, compile.sh, custom_op_library.cc, cuda_utils.py, fp16.py, rocm_ops.h, node_utils.py, rocm_ops.cc, readme.md, rocm_ops.hip.cpp files
import onnxruntime as ort
import numpy as np
import os
import sys
import onnx
from onnx import numpy_helper
from node_utils import INPUT_TYPE_TO_NP_TYPE_MAP
from cuda_utils import set_batchsize, model_setbs
from scipy import spatial
import argparse
import time
def save_input_output_data(save_path, data_dict, isInput=True):
if not os.path.isdir(save_path):
os.makedirs(save_path)
keys = list(data_dict.keys())
data_prefix = 'input'
if not isInput:
data_prefix = 'output'
for j in range(len(data_dict)):
with open(os.path.join(save_path, '{}_{}.pb'.format(data_prefix, j)), 'wb') as f:
f.write(numpy_helper.from_array(
data_dict[keys[j]], keys[j]).SerializeToString())
def load_pb_data(pb_path):
with open(pb_path, 'rb') as f:
input_content = f.read()
tensor = onnx.TensorProto()
tensor.ParseFromString(input_content)
f.close()
return numpy_helper.to_array(tensor)
def get_cosine(gpu_array, cpu_array):
gpu_array = gpu_array.astype(np.float64)
cpu_array = cpu_array.astype(np.float64)
gpu_array = gpu_array.reshape([-1])
cpu_array = cpu_array.reshape([-1])
cosine = spatial.distance.cosine(cpu_array, gpu_array)
return cosine
def get_snr(gpu_array, cpu_array):
cpu_array = cpu_array.astype(np.float64)
gpu_array = gpu_array.astype(np.float64)
diff_array = cpu_array - gpu_array
x = diff_array * diff_array
x = np.sum(x)
y = cpu_array * cpu_array
y = np.sum(y)
snr = (x) / (y + 1e-7)
snr = np.mean(snr)
return snr
def accuracy_check_run(args):
EP_List = ['ROCMExecutionProvider']
model = onnx.load(args.input_model)
model = set_batchsize(model, args.batchsize)
so = ort.SessionOptions()
so.enable_profiling = True
so.register_custom_ops_library("/public/home/kj_gauss/All_test/libcustom_op_library.so")
so.intra_op_num_threads = 1
so.inter_op_num_threads = 1
cuda_session = ort.InferenceSession(
model.SerializeToString(), sess_options=so, providers=EP_List
)
#cuda_session = ort.InferenceSession(model.SerializeToString(), providers=EP_List)
inputs = cuda_session.get_inputs()
outputs = cuda_session.get_outputs()
file_list = os.listdir(args.datapath)
input_list = []
output_list = []
for file in file_list:
if file[:5] == 'input':
input_list.append(file)
elif file[:6] == 'output':
output_list.append(file)
input_traits = [int(i[6:-3]) for i in input_list]
input_traits = sorted(input_traits)
input_list = [os.path.join(args.datapath, "input_{}.pb".format(i)) for i in input_traits]
output_traits = [int(i[7:-3]) for i in output_list]
output_traits = sorted(output_traits)
output_list = [os.path.join(args.datapath, "output_{}.pb".format(i)) for i in output_traits]
input_dict = {}
for input, input_file in zip(inputs, input_list):
input_dict[input.name] = load_pb_data(input_file)
if input_dict[input.name].shape[0] != args.batchsize:
print("Batchsize error! input data batchsize is {} but your input batchsize is {}, Please fix!".format(input_dict[input_file[0].name].shape[0], args.batchsize))
sys.exit()
gt_dict = {}
for output, gt_file in zip(outputs, output_list):
gt_dict[output.name] = load_pb_data(gt_file)
output_names = [x.name for x in cuda_session.get_outputs()]
output_data = cuda_session.run(output_names, input_dict)
for idx, output_name in enumerate(output_names):
print("output {}".format(output_name))
print("SNR IS : {}".format(get_snr(gt_dict[output_name], output_data[idx])))
print("COSINE IS : {}\n".format(get_cosine(gt_dict[output_name], output_data[idx])))
def generate_golden_data_run(args):
import os
import time
import onnx
import onnxruntime as ort
import numpy as np
from onnx import numpy_helper
from node_utils import INPUT_TYPE_TO_NP_TYPE_MAP
def save_input_output_data(save_path, data_dict, isInput=True):
if not os.path.isdir(save_path):
os.makedirs(save_path)
prefix = 'input' if isInput else 'output'
for idx, (name, data) in enumerate(data_dict.items()):
with open(os.path.join(save_path, f'{prefix}_{idx}.pb'), 'wb') as f:
f.write(numpy_helper.from_array(data, name).SerializeToString())
model = onnx.load(args.input_model)
orig_shapes = {}
for vi in model.graph.input:
dims = []
for d in vi.type.tensor_type.shape.dim:
dims.append(d.dim_value if d.dim_value > 0 else None)
orig_shapes[vi.name] = dims
so = ort.SessionOptions()
so.register_custom_ops_library("libcustom_op_library.so")
so.intra_op_num_threads = 1
so.inter_op_num_threads = 1
providers = ['ROCMExecutionProvider']
t0 = time.time()
session = ort.InferenceSession(model.SerializeToString(),
sess_options=so,
providers=providers)
t1 = time.time()
print(f"Initialize ROCM session cost {(t1-t0)*1000:.2f} ms")
input_dict = {}
for inp in session.get_inputs():
name = inp.name
dtype_str = inp.type
shape = []
for d in orig_shapes[name]:
if d is None:
shape.append(args.batchsize)
else:
shape.append(d)
print(f"[INFO] {name} <- shape {shape}, type={dtype_str}")
data = np.random.rand(*shape)
if 'uint8' in dtype_str:
data = data * 255
elif 'int8' in dtype_str:
data = data * 255 - 128
data = data.astype(INPUT_TYPE_TO_NP_TYPE_MAP[dtype_str])
input_dict[name] = data
if args.saveIOdata == 1:
save_input_output_data(args.datapath, input_dict, isInput=True)
output_names = [o.name for o in session.get_outputs()]
# 6. Warm-up
if args.warmup > 0:
for _ in range(args.warmup):
session.run(output_names, input_dict)
t_start = time.time()
for _ in range(args.runnum):
outputs = session.run(output_names, input_dict)
t_end = time.time()
latency_ms = (t_end - t_start) * 1000 / (args.runnum * args.batchsize)
print(f"Inference cost per sample: {latency_ms:.3f} ms | FPS: {1000/latency_ms:.2f}")
if args.saveIOdata == 1:
out_dict = {n: o for n, o in zip(output_names, outputs)}
save_input_output_data(args.datapath, out_dict, isInput=False)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input_model",
type=str,
required=True,
default="",
help="input model file")
parser.add_argument("-b", "--batchsize",
type=int,
required=False,
default=1,
help="batchsize")
parser.add_argument("-c", "--checkresult",
type=bool,
required=False,
default=False,
help="check output accuracy")
parser.add_argument("-d", "--datapath",
type=str,
required=True,
help="data path for saving golden data or checking output accuracy")
parser.add_argument("-w", "--warmup",
type=int,
required=False,
default=50,
help="input warm up iterations")
parser.add_argument("-n", "--runnum",
type=int,
required=False,
default=100,
help="input run model iterations")
parser.add_argument("-s", "--inputshape",
type=int,
required=False,
default=-1,
help="bert input shape")
parser.add_argument("-t", "--saveIOdata",
type=int,
required=False,
default=1,
help="save golden data")
ARGS = parser.parse_args()
if ARGS.checkresult:
accuracy_check_run(ARGS)
else:
generate_golden_data_run(ARGS)
/opt/dtk/hip/bin/hipcc --offload-arch=gfx906 -I/opt/dtk-25.04/include -fPIC -x hip -o rocm_ops.hip.o -c rocm_ops.hip.cpp
/usr/bin/c++ -DUSE_ROCM=1 -I ./include/onnxruntime/ -fPIC "-D__HIP_PLATFORM_AMD__=1 -D__HIP_PLATFORM_HCC__=1" -o rocm_ops.cc.o -c rocm_ops.cc
/usr/bin/c++ -I./include/onnxruntime/ -fPIC -o custom_op_library.cc.o -c custom_op_library.cc
/opt/dtk/llvm/bin/clang++ -fPIC -shared -Wl,-soname,libcustom_op_library.so -o libcustom_op_library.so rocm_ops.hip.o custom_op_library.cc.o rocm_ops.cc.o -L/opt/dtk/lib -Wl,-rpath,/opt/dtk/lib:/opt/dtk/hip/lib /opt/dtk/hip/lib/libgalaxyhip.so.5.2.25085.1211-205b0686 /opt/dtk/llvm/lib/clang/15.0.0/lib/linux/libclang_rt.builtins-x86_64.a -lstdc++ -lm -lgcc_s -lgcc -lc -lgcc_s -lgcc
/opt/dtk/hip/bin/hipcc \
--offload-arch=gfx906 \
-shared \
-o libcustom_op_library.so \
rocm_ops.hip.cpp custom_op_library.cc.o rocm_ops.cc.o \
-L/opt/dtk/lib -L/opt/dtk/hip/lib \
-Wl,-rpath,/opt/dtk/lib:/opt/dtk/hip/lib \
-lgalaxyhip \
/opt/dtk/llvm/lib/clang/15.0.0/lib/linux/libclang_rt.builtins-x86_64.a \
-lstdc++ -lm -lgcc_s -lgcc -lc
import onnxruntime as ort
import onnx
import numpy as np
import time
from node_utils import node_utils, INPUT_TYPE_TO_NP_TYPE_MAP
import sys
def set_batchsize(model, batchSize):
for node in model.graph.node:
if node.op_type in ['Reshape', 'Split', 'Transpose']:
return model
del model.graph.value_info[:]
for input in model.graph.input:
if len(input.type.tensor_type.shape.dim) > 1:
input.type.tensor_type.shape.dim[0].dim_value = batchSize
for output in model.graph.output:
if len(output.type.tensor_type.shape.dim) > 1:
output.type.tensor_type.shape.dim[0].dim_value = batchSize
return model
def model_setbs(model, batchSize):
del model.graph.value_info[:]
for input in model.graph.input:
if len(input.type.tensor_type.shape.dim) > 1:
input.type.tensor_type.shape.dim[0].dim_value = batchSize
for output in model.graph.output:
if len(output.type.tensor_type.shape.dim) > 1:
output.type.tensor_type.shape.dim[0].dim_value = batchSize
return model
def model_run(modelPath, batchSize=None):
model = onnx.load(modelPath)
if batchSize is not None:
model = set_batchsize(model, batchSize)
session_options = ort.SessionOptions()
session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_DISABLE_ALL
EP_list = ['CUDAExecutionProvider']
start = time.time()
cuda_session = ort.InferenceSession(model.SerializeToString(), providers=EP_list, sess_options=session_options)
end = time.time()
duration = (end - start) * 1000
print("Initialize Session cost {} ms".format(duration))
inputs = cuda_session.get_inputs()
outputs = cuda_session.get_outputs()
input_dict = {}
for input in inputs:
shape = [s for s in input.shape]
for idx in range(len(shape)):
if shape[idx] is None:
print("[ERROR] Input shape invalid,please Check")
return -1
input_data = np.random.random(shape)
if input.type.find('int') > 0:
input_data = input_data*10
input_data = input_data.astype(INPUT_TYPE_TO_NP_TYPE_MAP[input.type])
input_dict[input.name] = ort.OrtValue.ortvalue_from_numpy(input_data, 'cuda_pinned', 0)
outputs_names = []
for output in outputs:
outputs_names.append(output.name)
io_binding = cuda_session.io_binding()
for key, ortValue in input_dict.items():
io_binding.bind_ortvalue_input(key, ortValue)
for out_name in outputs_names:
io_binding.bind_output(out_name, 'cuda_pinned', device_id=0)
# warm up
warm_up_num = 20
start = time.time()
for i in range(warm_up_num):
cuda_session.run_with_iobinding(io_binding)
end = time.time()
duration = (end - start) / (batchSize * warm_up_num) * 1000
print("Warm up cost {} ms".format(duration))
run_num = 50
start = time.time()
for i in range(run_num):
cuda_session.run_with_iobinding(io_binding)
end = time.time()
duration = (end - start) * 1000 / (run_num * batchSize)
print("Current inference cost {} ms".format(duration))
print("FPS is {:.2f}".format(1000/duration))
del cuda_session
return duration
if __name__ == '__main__':
if len(sys.argv) != 3:
print(len(sys.argv))
print("Input parameter error...")
print("python cudaRun.py modelPath batchSize")
sys.exit()
modelPath = sys.argv[1]
batchSize = int(sys.argv[2])
print(modelPath)
model_run(modelPath, batchSize)
#include "custom_op_library.h"
#define ORT_API_MANUAL_INIT
#include "onnxruntime_cxx_api.h"
#undef ORT_API_MANUAL_INIT
#include <vector>
#include <cmath>
#include <mutex>
#include <system_error>
#include "core/common/common.h"
#include "core/framework/ortdevice.h"
#include "core/framework/ortmemoryinfo.h"
#include "rocm_ops.h"
#include "onnxruntime_lite_custom_op.h"
// static const char* c_OpDomain = "test.customop";
static const char* c_OpDomain = "";
static void AddOrtCustomOpDomainToContainer(Ort::CustomOpDomain&& domain) {
static std::vector<Ort::CustomOpDomain> ort_custom_op_domain_container;
static std::mutex ort_custom_op_domain_mutex;
std::lock_guard<std::mutex> lock(ort_custom_op_domain_mutex);
ort_custom_op_domain_container.push_back(std::move(domain));
}
OrtStatus* ORT_API_CALL RegisterCustomOps(OrtSessionOptions* options, const OrtApiBase* api) {
Ort::Global<void>::api_ = api->GetApi(ORT_API_VERSION);
OrtStatus* result = nullptr;
ORT_TRY {
Ort::CustomOpDomain domain{c_OpDomain};
Rocm::RegisterOps(domain);
Ort::UnownedSessionOptions session_options(options);
session_options.Add(domain);
AddOrtCustomOpDomainToContainer(std::move(domain));
}
ORT_CATCH(const std::exception& e) {
ORT_HANDLE_EXCEPTION([&]() {
Ort::Status status{e};
result = status.release();
});
}
return result;
}
OrtStatus* ORT_API_CALL RegisterCustomOpsAltName(OrtSessionOptions* options, const OrtApiBase* api) {
return RegisterCustomOps(options, api);
}
#pragma once
#include "onnxruntime_c_api.h"
#ifdef __cplusplus
extern "C" {
#endif
ORT_EXPORT OrtStatus* ORT_API_CALL RegisterCustomOps(OrtSessionOptions* options, const OrtApiBase* api);
// alternative name to test registration by function name
ORT_EXPORT OrtStatus* ORT_API_CALL RegisterCustomOpsAltName(OrtSessionOptions* options, const OrtApiBase* api);
#ifdef __cplusplus
}
#endif
import numpy as np
import onnx
from onnx import helper, numpy_helper
from onnx import onnx_pb as onnx_proto
def _npfloat16_to_int(np_list):
return [int(bin(_.view('H'))[2:].zfill(16), 2) for _ in np_list]
def convert_np_to_float16(np_array, min_positive_val=1e-7, max_finite_val=1e4):
def between(a, b, c):
return np.logical_and(a < b, b < c)
np_array = np.where(between(0, np_array, min_positive_val), min_positive_val, np_array)
np_array = np.where(between(-min_positive_val, np_array, 0), -min_positive_val, np_array)
np_array = np.where(between(max_finite_val, np_array, float('inf')), max_finite_val, np_array)
np_array = np.where(between(float('-inf'), np_array, -max_finite_val), -max_finite_val, np_array)
return np.float16(np_array)
def convert_tensor_float_to_float16(tensor, min_positive_val=1e-7, max_finite_val=1e4):
if not isinstance(tensor, onnx_proto.TensorProto):
raise ValueError('Expected input type is an ONNX TensorProto but got %s' % type(tensor))
if tensor.data_type == onnx_proto.TensorProto.FLOAT:
tensor.data_type = onnx_proto.TensorProto.FLOAT16
if tensor.float_data:
float16_data = convert_np_to_float16(np.array(tensor.float_data),
min_positive_val, max_finite_val)
int_list = _npfloat16_to_int(float16_data)
tensor.int32_data[:] = int_list
tensor.float_data[:] = []
if tensor.raw_data:
float32_list = np.frombuffer(tensor.raw_data, dtype='float32')
float16_list = convert_np_to_float16(float32_list, min_positive_val, max_finite_val)
tensor.raw_data = float16_list.tobytes()
return tensor
def make_value_info_from_tensor(tensor):
shape = numpy_helper.to_array(tensor).shape
return helper.make_tensor_value_info(tensor.name, tensor.data_type, shape)
# Custom Op ONNXRuntime
## Purpose
Add a custom operator implementation and register it in ONNX Runtime.
## Environment Setup
### Docker (Option 1)
Pull the image:
```plaintext
docker pull image.sourcefind.cn:5000/dcu/admin/base/custom:2.4.1-ubuntu22.04-dtk25.04-py3.10-fixpy-onnx1.19.2
```
Create and start the container:
```plaintext
docker run --shm-size 16g --network=host --name=test --privileged --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -v $PWD/customop_onnxruntime:/home/customop -it <Your Image ID> /bin/bash
```
### Dockerfile (Option 2)
```
cd ./docker
docker build --no-cache -t customop:test .
docker run --shm-size 16g --network=host --name=video_ort --privileged --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -v $PWD/customop_onnxruntime:/home/customop -it <Your Image ID> /bin/bash
```
## Usage
### Build the project
```
git clone http://developer.sourcefind.cn/codes/modelzoo/customop_onnxruntime.git
cd customop_onnxruntime
python model.py  # generate the model
bash compile.sh  # build the custom operator library
```
### Run the example
1. Directory layout
```
rocm_custom_op
├── compile.sh // build script
├── custom_op_library.cc // registers the custom operators
├── custom_op_library.h
├── docker
├── include
├── model.py // creates the custom model
├── readme.md
├── rocm_ops.cc // invokes the custom operators
├── rocm_ops.h
├── rocm_ops.hip // custom operator implementations
└── benchmark.py // benchmarks the operators
```
2. Run steps (a minimal usage sketch follows the block below)
```
python test.py  # test the model
```
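test.py itself is not included in this diff. As a rough, hypothetical sketch (file and path names are assumptions), the snippet below mirrors the registration flow in benchmark.py: register the compiled custom op library, create a session on the ROCm execution provider, and run the model once.
```python
# Hypothetical sketch, not the actual test.py: register the custom op
# library built by compile.sh and run add.onnx on the ROCm provider.
import numpy as np
import onnxruntime as ort

so = ort.SessionOptions()
so.register_custom_ops_library("./libcustom_op_library.so")  # path is an assumption

session = ort.InferenceSession("add.onnx", sess_options=so,
                               providers=["ROCMExecutionProvider"])

# Feed random float32 data; symbolic dims (if any) are assumed to be batch size 1.
feeds = {}
for inp in session.get_inputs():
    shape = [d if isinstance(d, int) else 1 for d in inp.shape]
    feeds[inp.name] = np.random.rand(*shape).astype(np.float32)

outputs = session.run([o.name for o in session.get_outputs()], feeds)
print(outputs[0].shape)
```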
3. Notes
```
1. model.py defines an add.onnx model (a minimal sketch appears after these notes). To define a different model, modify that file.
2. In compile.sh, replace gfx906 in the --offload-arch=gfx906 option with the ROCm architecture of your machine (check it with rocminfo | grep gfx).
3. To test a different custom operator, modify the following:
3.1 rocm_ops.hip: re-implement the custom operator
3.2 rocm_ops.cc: update the call to the custom operator
3.3 model.py: update the creation of the custom model
3.4 benchmark.py: update the input data used to run the model
```
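model.py is referenced above but not included in this diff. A minimal sketch, assuming it builds the single-Add graph described in note 1 (the shapes below are placeholder assumptions):
```python
# Minimal sketch of a model.py that emits add.onnx; the real file is not
# shown in this diff, and the [1, 4] shapes are placeholder assumptions.
import onnx
from onnx import TensorProto, helper

X = helper.make_tensor_value_info("X", TensorProto.FLOAT, [1, 4])
Y = helper.make_tensor_value_info("Y", TensorProto.FLOAT, [1, 4])
Z = helper.make_tensor_value_info("Z", TensorProto.FLOAT, [1, 4])

add_node = helper.make_node("Add", inputs=["X", "Y"], outputs=["Z"])
graph = helper.make_graph([add_node], "add_graph", [X, Y], [Z])
model = helper.make_model(graph)

onnx.checker.check_model(model)
onnx.save(model, "add.onnx")
```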
## Results
### Accuracy
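As implemented in benchmark.py (run with `-c`), each output is compared against the saved golden data using two metrics: a normalized squared error, reported as SNR = Σ(golden − output)² / (Σ output² + 1e-7), and the cosine distance from `scipy.spatial.distance.cosine`. Both values should be close to 0 when the custom operators are numerically correct.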
## Application Scenarios
### Features
Add ORT custom ops
## Source Repository and Feedback
- http://developer.sourcefind.cn/codes/modelzoo/customop_onnxruntime.git
## References
- https://github.com/microsoft/onnxruntime
- https://onnxruntime.ai/docs/extensions/add-op.html
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#ifdef USE_ROCM
#define ORT_API_MANUAL_INIT
#include "onnxruntime_cxx_api.h"
#undef ORT_API_MANUAL_INIT
#include "core/providers/rocm/rocm_context.h"
#include "onnxruntime_lite_custom_op.h"
//Concat
void rocm_concat(int axis, int M1, int N1, const float* X1, int M2, int N2, const float* X2, float* Z, hipStream_t stream);
//Gemm
void rocm_gemm(bool transA, bool transB, int M, int N, int K, float alpha, const float* A, const float* B, float beta, float* C, hipStream_t stream);
extern "C"{
//LeakyRelu
void rocm_leaky_relu(
int64_t size,
const float* d_X,
float* d_Y,
float alpha,
hipStream_t stream);
//Attention
void rocm_attention(int B, int S, int H,
const float* Q, const float* K, const float* V,
float* Out, hipStream_t stream);
//BatchNormalization
void rocm_batch_norm(int64_t N, int64_t C, int64_t H, int64_t W,
const float* X,
const float* gamma,
const float* beta,
const float* mean,
const float* var,
float epsilon,
float* Y,
hipStream_t stream);
//Cast
void rocm_cast(
int64_t N, // batch size
int64_t C, // channels (or the first non-batch dimension)
int64_t H, // height (or the second dimension)
int64_t W, // width (or the third dimension)
const float* X, // input pointer
int32_t* Y, // output pointer
hipStream_t stream);
//Softmax
void rocm_softmax(int64_t M, int64_t N,
const float* X, float* Y,
hipStream_t compute_stream);
//Celu
void rocm_celu(int64_t, const float*, float*, float, hipStream_t);
//Relu
void rocm_relu(
int64_t size,
const float* X,
float* Y,
hipStream_t stream
);
// Conv
void rocm_conv2d(const float* input,
const float* weight,
const float* bias,
float* output,
int N, int C_in, int H, int W,
int C_out, int K_h, int K_w,
int out_H, int out_W,
hipStream_t stream);
}
using namespace Ort::Custom;
#define CUSTOM_ENFORCE(cond, msg) \
do { \
if (!(cond)) { \
throw std::runtime_error(msg); \
} \
} while (0)
namespace Rocm {
void rocm_leaky_relu_forward(
const RocmContext& ctx,
const Tensor<float>& X,
Tensor<float>& Y) {
CUSTOM_ENFORCE(ctx.hip_stream, "No HIP stream available");
int64_t size = X.NumberOfElement();
const float alpha = 0.01f;
auto* y_ptr = Y.Allocate(X.Shape());
rocm_leaky_relu(size, X.Data(), y_ptr, alpha, ctx.hip_stream);
}
void rocm_relu_forward(
const Ort::Custom::RocmContext& rocm_ctx,
const Ort::Custom::Tensor<float>& X,
Ort::Custom::Tensor<float>& Y
) {
CUSTOM_ENFORCE(rocm_ctx.hip_stream, "failed to fetch hip stream");
auto input_shape = X.Shape();
int64_t size = X.NumberOfElement();
auto* y_data = Y.Allocate(input_shape);
rocm_relu(size, X.Data(), y_data, rocm_ctx.hip_stream);
}
void rocm_celu_forward(const Ort::Custom::RocmContext& ctx,
const Ort::Custom::Tensor<float>& X,
Ort::Custom::Tensor<float>& Y) {
CUSTOM_ENFORCE(ctx.hip_stream, "failed to fetch hip stream");
auto shape = X.Shape();
int64_t size = X.NumberOfElement();
float alpha = 1.0f; // or fetch from attribute
auto* y_ptr = Y.Allocate(shape);
rocm_celu(size, X.Data(), y_ptr, alpha, ctx.hip_stream);
}
/* softmax */
void KernelSoftmax(const Ort::Custom::RocmContext& rocm_ctx,
const Ort::Custom::Tensor<float>& X,
Ort::Custom::Tensor<float>& Z) {
auto input_shape = X.Shape();
CUSTOM_ENFORCE(rocm_ctx.hip_stream, "failed to fetch hip stream");
// Expecting 2D input: [M, N]
CUSTOM_ENFORCE(input_shape.size() == 2, "Softmax only supports 2D input");
int64_t M = static_cast<int64_t>(input_shape[0]);
int64_t N = static_cast<int64_t>(input_shape[1]);
auto z_raw = Z.Allocate(input_shape);
// Call ROCm implementation
rocm_softmax(M, N, X.Data(), z_raw, rocm_ctx.hip_stream);
}
void rocm_cast_forward(
const Ort::Custom::RocmContext& rocm_ctx,
const Ort::Custom::Tensor<float>& X,
Ort::Custom::Tensor<int32_t>& Y) {
CUSTOM_ENFORCE(rocm_ctx.hip_stream, "failed to fetch hip stream");
// Only 4D tensors [N,C,H,W] are assumed to be supported
auto shape = X.Shape();
CUSTOM_ENFORCE(shape.size() == 4, "Cast only supports 4D tensor [N,C,H,W]");
int64_t N = shape[0];
int64_t C = shape[1];
int64_t H = shape[2];
int64_t W = shape[3];
// allocate the output
auto* y_ptr = Y.Allocate({N, C, H, W});
// correct call: 7 arguments
rocm_cast(
N, C, H, W,
X.Data(),
y_ptr,
rocm_ctx.hip_stream);
}
// BatchNormalization
void rocm_batchnorm_forward(const Ort::Custom::RocmContext& rocm_ctx,
const Ort::Custom::Tensor<float>& X,
const Ort::Custom::Tensor<float>& scale,
const Ort::Custom::Tensor<float>& B,
const Ort::Custom::Tensor<float>& mean,
const Ort::Custom::Tensor<float>& var,
Ort::Custom::Tensor<float>& Y) {
CUSTOM_ENFORCE(rocm_ctx.hip_stream, "failed to fetch hip stream");
auto shape = X.Shape(); // expects [N, C, H, W]
CUSTOM_ENFORCE(shape.size() == 4, "Input must be 4D tensor [N, C, H, W]");
int64_t N = shape[0];
int64_t C = shape[1];
int64_t H = shape[2];
int64_t W = shape[3];
// Allocate output
auto* y_ptr = Y.Allocate({N, C, H, W});
// Epsilon attribute: retrieve via custom API or hardcode default
float epsilon = 1e-5f;
// If epsilon comes from attribute, fetch it here.
rocm_batch_norm(N, C, H, W,
X.Data(), scale.Data(), B.Data(), mean.Data(), var.Data(),
epsilon, y_ptr, rocm_ctx.hip_stream);
}
// attention
void rocm_attention_forward(const Ort::Custom::RocmContext& rocm_ctx,
const Ort::Custom::Tensor<float>& Q,
const Ort::Custom::Tensor<float>& K,
const Ort::Custom::Tensor<float>& V,
Ort::Custom::Tensor<float>& Out) {
CUSTOM_ENFORCE(rocm_ctx.hip_stream, "failed to fetch hip stream");
auto shape = Q.Shape(); // expected to be [B, S, H]
CUSTOM_ENFORCE(shape.size() == 3, "Input must be 3D tensor [B, S, H]");
int B = shape[0];
int S = shape[1];
int H = shape[2];
auto* out_ptr = Out.Allocate({B, S, H});
rocm_attention(B, S, H, Q.Data(), K.Data(), V.Data(), out_ptr, rocm_ctx.hip_stream);
}
// -------------------------------
// Concat
// -------------------------------
void rocm_concat_forward(const Ort::Custom::RocmContext& rocm_ctx,
const Ort::Custom::Tensor<float>& X1,
const Ort::Custom::Tensor<float>& X2,
Ort::Custom::Tensor<float>& Y) {
CUSTOM_ENFORCE(rocm_ctx.hip_stream, "failed to fetch hip stream");
auto shape1 = X1.Shape();
auto shape2 = X2.Shape();
// supports concatenating 2D tensors along columns (axis=1)
CUSTOM_ENFORCE(shape1.size() == 2 && shape2.size() == 2, "Inputs must be 2D tensors.");
CUSTOM_ENFORCE(shape1[0] == shape2[0], "Row dimensions must match for concat on axis 1.");
int M1 = shape1[0], N1 = shape1[1];
int M2 = shape2[0], N2 = shape2[1];
auto* y_data = Y.Allocate({M1, N1 + N2}); // output is the concatenated matrix
rocm_concat(1, M1, N1, X1.Data(), M2, N2, X2.Data(), y_data, rocm_ctx.hip_stream);
}
/******conv******/
void rocm_conv_forward(const RocmContext& ctx,
const Tensor<float>& input,
const Tensor<float>& weight,
const Tensor<float>& bias,
Tensor<float>& output) {
CUSTOM_ENFORCE(ctx.hip_stream, "no HIP stream");
const auto& input_shape = input.Shape(); // [N, C_in, H, W]
const auto& weight_shape = weight.Shape(); // [C_out, C_in, K_h, K_w]
const int64_t N = input_shape[0];
const int64_t C_in = input_shape[1];
const int64_t H = input_shape[2];
const int64_t W = input_shape[3];
const int64_t C_out = weight_shape[0];
const int64_t K_h = weight_shape[2];
const int64_t K_w = weight_shape[3];
const int64_t out_H = (H - K_h) / 1 + 1;
const int64_t out_W = (W - K_w) / 1 + 1;
auto* y_ptr = output.Allocate({N, C_out, out_H, out_W});
rocm_conv2d(input.Data(), weight.Data(), bias.Data(), y_ptr,
N, C_in, H, W, C_out, K_h, K_w, out_H, out_W,
ctx.hip_stream);
}
// -------------------------------
// Gemm
// -------------------------------
void rocm_gemm_forward(const Ort::Custom::RocmContext& rocm_ctx,
const Ort::Custom::Tensor<float>& A,
const Ort::Custom::Tensor<float>& B,
const Ort::Custom::Tensor<float>& C,
Ort::Custom::Tensor<float>& Y) {
CUSTOM_ENFORCE(rocm_ctx.hip_stream, "failed to fetch hip stream");
auto shapeA = A.Shape();
auto shapeB = B.Shape();
auto shapeC = C.Shape();
CUSTOM_ENFORCE(shapeA.size() == 2 && shapeB.size() == 2 && shapeC.size() == 2, "Inputs must be 2D tensors.");
int M = shapeA[0];
int K = shapeA[1];
int N = shapeB[1];
CUSTOM_ENFORCE(shapeB[0] == K, "Inner dimension mismatch between A and B in Gemm.");
CUSTOM_ENFORCE(shapeC[0] == M && shapeC[1] == N, "Output tensor shape mismatch in Gemm.");
auto* y_data = Y.Allocate({M, N});
rocm_gemm(false, false, M, N, K, 1.0f, A.Data(), B.Data(), 1.0f, y_data, rocm_ctx.hip_stream);
}
void RegisterOps(Ort::CustomOpDomain& domain) {
// Register the Attention operator
static const std::unique_ptr<OrtLiteCustomOp> c_CustomOpAttention{Ort::Custom::CreateLiteCustomOp("Attention", "ROCMExecutionProvider", rocm_attention_forward)};
domain.Add(c_CustomOpAttention.get());
// Register the BatchNormalization operator
static const std::unique_ptr<OrtLiteCustomOp> c_CustomOpBatchNorm{Ort::Custom::CreateLiteCustomOp("BatchNormalization", "ROCMExecutionProvider", rocm_batchnorm_forward)};
domain.Add(c_CustomOpBatchNorm.get());
// Register the Concat operator
static const std::unique_ptr<OrtLiteCustomOp> c_CustomOpConcat{Ort::Custom::CreateLiteCustomOp("Concat", "ROCMExecutionProvider", rocm_concat_forward)};
domain.Add(c_CustomOpConcat.get());
// Register the Gemm operator
static const std::unique_ptr<OrtLiteCustomOp> c_CustomOpGemm{Ort::Custom::CreateLiteCustomOp("Gemm", "ROCMExecutionProvider", rocm_gemm_forward)};
domain.Add(c_CustomOpGemm.get());
// Register the Cast operator
static const std::unique_ptr<OrtLiteCustomOp> c_CustomOpCast{Ort::Custom::CreateLiteCustomOp("Cast", "ROCMExecutionProvider", rocm_cast_forward)};
domain.Add(c_CustomOpCast.get());
// Register the Softmax operator
static const std::unique_ptr<OrtLiteCustomOp> c_CustomOpSoftmax{Ort::Custom::CreateLiteCustomOp("Softmax","ROCMExecutionProvider", KernelSoftmax)};
domain.Add(c_CustomOpSoftmax.get());
// Register the Celu operator
static const std::unique_ptr<OrtLiteCustomOp> c_CeluOp{Ort::Custom::CreateLiteCustomOp("Celu", "ROCMExecutionProvider", rocm_celu_forward)};
domain.Add(c_CeluOp.get());
// Register the Relu operator
static const std::unique_ptr<OrtLiteCustomOp> c_CustomOpRelu{
Ort::Custom::CreateLiteCustomOp("Relu", "ROCMExecutionProvider", rocm_relu_forward)};
domain.Add(c_CustomOpRelu.get());
// Register the LeakyRelu operator
static const std::unique_ptr<OrtLiteCustomOp> c_LeakyReLU{
Ort::Custom::CreateLiteCustomOp(
"LeakyRelu", "ROCMExecutionProvider", rocm_leaky_relu_forward)};
domain.Add(c_LeakyReLU.get());
// Register the Conv operator
static const std::unique_ptr<OrtLiteCustomOp> c_Conv{
Ort::Custom::CreateLiteCustomOp("Conv", "ROCMExecutionProvider", rocm_conv_forward)};
domain.Add(c_Conv.get());
}
} // namespace Rocm
#endif
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
namespace Rocm {
#ifdef USE_ROCM
void RegisterOps(Ort::CustomOpDomain& domain);
#else
inline void RegisterOps(Ort::CustomOpDomain&) {}
#endif
} // namespace Rocm