Commit b3e3ac37 authored by ghfund4_b52


Update custom_op_library.cc.o, rocm_ops.hip, rocm_ops.hip.o, libcustom_op_library.so, rocm_ops.cc.o, compile_rdc.sh, custom_op_library.h, benchmark.py, compile.sh, custom_op_library.cc, cuda_utils.py, fp16.py, rocm_ops.h, node_utils.py, rocm_ops.cc, readme.md, rocm_ops.hip.cpp files
import onnxruntime as ort
import numpy as np
import os
import sys
import onnx
from onnx import numpy_helper
from node_utils import INPUT_TYPE_TO_NP_TYPE_MAP
from cuda_utils import set_batchsize, model_setbs
from scipy import spatial
import argparse
import time
def save_input_output_data(save_path, data_dict, isInput=True):
if not os.path.isdir(save_path):
os.makedirs(save_path)
keys = list(data_dict.keys())
data_prefix = 'input'
if not isInput:
data_prefix = 'output'
for j in range(len(data_dict)):
with open(os.path.join(save_path, '{}_{}.pb'.format(data_prefix, j)), 'wb') as f:
f.write(numpy_helper.from_array(
data_dict[keys[j]], keys[j]).SerializeToString())
def load_pb_data(pb_path):
with open(pb_path, 'rb') as f:
input_content = f.read()
tensor = onnx.TensorProto()
tensor.ParseFromString(input_content)
f.close()
return numpy_helper.to_array(tensor)
def get_cosine(gpu_array, cpu_array):
gpu_array = gpu_array.astype(np.float64)
cpu_array = cpu_array.astype(np.float64)
gpu_array = gpu_array.reshape([-1])
cpu_array = cpu_array.reshape([-1])
cosine = spatial.distance.cosine(cpu_array, gpu_array)
return cosine
def get_snr(gpu_array, cpu_array):
cpu_array = cpu_array.astype(np.float64)
gpu_array = gpu_array.astype(np.float64)
diff_array = cpu_array - gpu_array
x = diff_array * diff_array
x = np.sum(x)
y = cpu_array * cpu_array
y = np.sum(y)
snr = (x) / (y + 1e-7)
snr = np.mean(snr)
return snr
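# Illustrative usage sketch (not part of the original script): sanity-check the
# two metrics above on toy arrays. For nearly identical arrays, the
# noise-to-signal ratio from get_snr and the cosine distance from get_cosine
# are both close to 0.
def _example_metrics_usage():
    cpu = np.array([1.0, 2.0, 3.0], dtype=np.float32)
    gpu = np.array([1.0, 2.0, 3.001], dtype=np.float32)
    print("snr    :", get_snr(gpu, cpu))
    print("cosine :", get_cosine(gpu, cpu))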
def accuracy_check_run(args):
EP_List = ['ROCMExecutionProvider']
model = onnx.load(args.input_model)
model = set_batchsize(model, args.batchsize)
so = ort.SessionOptions()
so.enable_profiling = True
so.register_custom_ops_library("/public/home/kj_gauss/All_test/libcustom_op_library.so")
so.intra_op_num_threads = 1
so.inter_op_num_threads = 1
cuda_session = ort.InferenceSession(
model.SerializeToString(), sess_options=so, providers=EP_List
)
#cuda_session = ort.InferenceSession(model.SerializeToString(), providers=EP_List)
inputs = cuda_session.get_inputs()
outputs = cuda_session.get_outputs()
file_list = os.listdir(args.datapath)
input_list = []
output_list = []
for file in file_list:
if file[:5] == 'input':
input_list.append(file)
elif file[:6] == 'output':
output_list.append(file)
input_traits = [int(i[6:-3]) for i in input_list]
input_traits = sorted(input_traits)
input_list = [os.path.join(args.datapath, "input_{}.pb".format(i)) for i in input_traits]
output_traits = [int(i[7:-3]) for i in output_list]
output_traits = sorted(output_traits)
output_list = [os.path.join(args.datapath, "output_{}.pb".format(i)) for i in output_traits]
input_dict = {}
for input, input_file in zip(inputs, input_list):
input_dict[input.name] = load_pb_data(input_file)
if input_dict[input.name].shape[0] != args.batchsize:
print("Batchsize error! input data batchsize is {} but your input batchsize is {}, Please fix!".format(input_dict[input_file[0].name].shape[0], args.batchsize))
sys.exit()
gt_dict = {}
for output, gt_file in zip(outputs, output_list):
gt_dict[output.name] = load_pb_data(gt_file)
output_names = [x.name for x in cuda_session.get_outputs()]
output_data = cuda_session.run(output_names, input_dict)
for idx, output_name in enumerate(output_names):
print("output {}".format(output_name))
print("SNR IS : {}".format(get_snr(gt_dict[output_name], output_data[idx])))
print("COSINE IS : {}\n".format(get_cosine(gt_dict[output_name], output_data[idx])))
def generate_golden_data_run(args):
import os
import time
import onnx
import onnxruntime as ort
import numpy as np
from onnx import numpy_helper
from node_utils import INPUT_TYPE_TO_NP_TYPE_MAP
def save_input_output_data(save_path, data_dict, isInput=True):
if not os.path.isdir(save_path):
os.makedirs(save_path)
prefix = 'input' if isInput else 'output'
for idx, (name, data) in enumerate(data_dict.items()):
with open(os.path.join(save_path, f'{prefix}_{idx}.pb'), 'wb') as f:
f.write(numpy_helper.from_array(data, name).SerializeToString())
model = onnx.load(args.input_model)
orig_shapes = {}
for vi in model.graph.input:
dims = []
for d in vi.type.tensor_type.shape.dim:
dims.append(d.dim_value if d.dim_value > 0 else None)
orig_shapes[vi.name] = dims
so = ort.SessionOptions()
so.register_custom_ops_library("libcustom_op_library.so")
so.intra_op_num_threads = 1
so.inter_op_num_threads = 1
providers = ['ROCMExecutionProvider']
t0 = time.time()
session = ort.InferenceSession(model.SerializeToString(),
sess_options=so,
providers=providers)
t1 = time.time()
print(f"Initialize ROCM session cost {(t1-t0)*1000:.2f} ms")
input_dict = {}
for inp in session.get_inputs():
name = inp.name
dtype_str = inp.type
shape = []
for d in orig_shapes[name]:
if d is None:
shape.append(args.batchsize)
else:
shape.append(d)
print(f"[INFO] {name} <- shape {shape}, type={dtype_str}")
data = np.random.rand(*shape)
if 'uint8' in dtype_str:
data = data * 255
elif 'int8' in dtype_str:
data = data * 255 - 128
data = data.astype(INPUT_TYPE_TO_NP_TYPE_MAP[dtype_str])
input_dict[name] = data
if args.saveIOdata == 1:
save_input_output_data(args.datapath, input_dict, isInput=True)
output_names = [o.name for o in session.get_outputs()]
# 6. Warm-up
if args.warmup > 0:
for _ in range(args.warmup):
session.run(output_names, input_dict)
t_start = time.time()
for _ in range(args.runnum):
outputs = session.run(output_names, input_dict)
t_end = time.time()
latency_ms = (t_end - t_start) * 1000 / (args.runnum * args.batchsize)
print(f"Inference cost per sample: {latency_ms:.3f} ms | FPS: {1000/latency_ms:.2f}")
if args.saveIOdata == 1:
out_dict = {n: o for n, o in zip(output_names, outputs)}
save_input_output_data(args.datapath, out_dict, isInput=False)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input_model",
type=str,
required=True,
default="",
help="input model file")
parser.add_argument("-b", "--batchsize",
type=int,
required=False,
default=1,
help="batchsize")
parser.add_argument("-c", "--checkresult",
type=bool,
required=False,
default=False,
help="check output accuracy")
parser.add_argument("-d", "--datapath",
type=str,
required=True,
help="data path for saving golden data or checking output accuracy")
parser.add_argument("-w", "--warmup",
type=int,
required=False,
default=50,
help="input warm up iterations")
parser.add_argument("-n", "--runnum",
type=int,
required=False,
default=100,
help="input run model iterations")
parser.add_argument("-s", "--inputshape",
type=int,
required=False,
default=-1,
help="bert input shape")
parser.add_argument("-t", "--saveIOdata",
type=int,
required=False,
default=1,
help="save golden data")
ARGS = parser.parse_args()
if ARGS.checkresult:
accuracy_check_run(ARGS)
else:
generate_golden_data_run(ARGS)
\ No newline at end of file
/opt/dtk/hip/bin/hipcc --offload-arch=gfx906 -I/opt/dtk-25.04/include -fPIC -x hip -o rocm_ops.hip.o -c rocm_ops.hip.cpp
/usr/bin/c++ -DUSE_ROCM=1 -I ./include/onnxruntime/ -fPIC "-D__HIP_PLATFORM_AMD__=1 -D__HIP_PLATFORM_HCC__=1" -o rocm_ops.cc.o -c rocm_ops.cc
/usr/bin/c++ -I./include/onnxruntime/ -fPIC -o custom_op_library.cc.o -c custom_op_library.cc
/opt/dtk/llvm/bin/clang++ -fPIC -shared -Wl,-soname,libcustom_op_library.so -o libcustom_op_library.so rocm_ops.hip.o custom_op_library.cc.o rocm_ops.cc.o -L/opt/dtk/lib -Wl,-rpath,/opt/dtk/lib:/opt/dtk/hip/lib /opt/dtk/hip/lib/libgalaxyhip.so.5.2.25085.1211-205b0686 /opt/dtk/llvm/lib/clang/15.0.0/lib/linux/libclang_rt.builtins-x86_64.a -lstdc++ -lm -lgcc_s -lgcc -lc -lgcc_s -lgcc
/opt/dtk/hip/bin/hipcc --offload-arch=gfx906 -I/opt/dtk-25.04/include -fPIC -x hip -o rocm_ops.hip.o -c rocm_ops.hip.cpp
/usr/bin/c++ -DUSE_ROCM=1 -I ./include/onnxruntime/ -fPIC "-D__HIP_PLATFORM_AMD__=1 -D__HIP_PLATFORM_HCC__=1" -o rocm_ops.cc.o -c rocm_ops.cc
/usr/bin/c++ -I./include/onnxruntime/ -fPIC -o custom_op_library.cc.o -c custom_op_library.cc
/opt/dtk/llvm/bin/clang++ -fPIC -shared -Wl,-soname,libcustom_op_library.so -o libcustom_op_library.so rocm_ops.hip.o custom_op_library.cc.o rocm_ops.cc.o -L/opt/dtk/lib -Wl,-rpath,/opt/dtk/lib:/opt/dtk/hip/lib /opt/dtk/hip/lib/libgalaxyhip.so.5.2.25085.1211-205b0686 /opt/dtk/llvm/lib/clang/15.0.0/lib/linux/libclang_rt.builtins-x86_64.a -lstdc++ -lm -lgcc_s -lgcc -lc -lgcc_s -lgcc
\ No newline at end of file
/opt/dtk/hip/bin/hipcc \
--offload-arch=gfx906 \
-shared \
-o libcustom_op_library.so \
rocm_ops.hip.cpp custom_op_library.cc.o rocm_ops.cc.o \
-L/opt/dtk/lib -L/opt/dtk/hip/lib \
-Wl,-rpath,/opt/dtk/lib:/opt/dtk/hip/lib \
-lgalaxyhip \
/opt/dtk/llvm/lib/clang/15.0.0/lib/linux/libclang_rt.builtins-x86_64.a \
-lstdc++ -lm -lgcc_s -lgcc -lc
import onnxruntime as ort
import onnx
import numpy as np
import time
from node_utils import node_utils, INPUT_TYPE_TO_NP_TYPE_MAP
import sys
def set_batchsize(model, batchSize):
for node in model.graph.node:
if node.op_type in ['Reshape', 'Split', 'Transpose']:
return model
del model.graph.value_info[:]
for input in model.graph.input:
if len(input.type.tensor_type.shape.dim) > 1:
input.type.tensor_type.shape.dim[0].dim_value = batchSize
for output in model.graph.output:
if len(output.type.tensor_type.shape.dim) > 1:
output.type.tensor_type.shape.dim[0].dim_value = batchSize
return model
def model_setbs(model, batchSize):
del model.graph.value_info[:]
for input in model.graph.input:
if len(input.type.tensor_type.shape.dim) > 1:
input.type.tensor_type.shape.dim[0].dim_value = batchSize
for output in model.graph.output:
if len(output.type.tensor_type.shape.dim) > 1:
output.type.tensor_type.shape.dim[0].dim_value = batchSize
return model
def model_run(modelPath, batchSize=None):
model = onnx.load(modelPath)
if batchSize is not None:
model = set_batchsize(model, batchSize)
session_options = ort.SessionOptions()
session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_DISABLE_ALL
EP_list = ['CUDAExecutionProvider']
start = time.time()
cuda_session = ort.InferenceSession(model.SerializeToString(), providers=EP_list, sess_options=session_options)
end = time.time()
duration = (end - start) * 1000
print("Initialize Session cost {} ms".format(duration))
inputs = cuda_session.get_inputs()
outputs = cuda_session.get_outputs()
input_dict = {}
for input in inputs:
shape = [s for s in input.shape]
for idx in range(len(shape)):
if shape[idx] is None:
print("[ERROR] Input shape invalid,please Check")
return -1
input_data = np.random.random(shape)
if input.type.find('int') > 0:
input_data = input_data*10
input_data = input_data.astype(INPUT_TYPE_TO_NP_TYPE_MAP[input.type])
input_dict[input.name] = ort.OrtValue.ortvalue_from_numpy(input_data, 'cuda_pinned', 0)
outputs_names = []
for output in outputs:
outputs_names.append(output.name)
io_binding = cuda_session.io_binding()
for key, ortValue in input_dict.items():
io_binding.bind_ortvalue_input(key, ortValue)
for out_name in outputs_names:
io_binding.bind_output(out_name, 'cuda_pinned', device_id=0)
# warm up
warm_up_num = 20
start = time.time()
for i in range(warm_up_num):
cuda_session.run_with_iobinding(io_binding)
end = time.time()
duration = (end - start) / (batchSize * warm_up_num) * 1000
print("Warm up cost {} ms".format(duration))
run_num = 50
start = time.time()
for i in range(run_num):
cuda_session.run_with_iobinding(io_binding)
end = time.time()
duration = (end - start) * 1000 / (run_num * batchSize)
print("Current inference cost {} ms".format(duration))
print("FPS is {:.2f}".format(1000/duration))
del cuda_session
return duration
if __name__ == '__main__':
if len(sys.argv) != 3:
print(len(sys.argv))
print("Input parameter error...")
print("python cudaRun.py modelPath batchSize")
sys.exit()
modelPath = sys.argv[1]
batchSize = int(sys.argv[2])
print(modelPath)
model_run(modelPath, batchSize)
#include "custom_op_library.h"
#define ORT_API_MANUAL_INIT
#include "onnxruntime_cxx_api.h"
#undef ORT_API_MANUAL_INIT
#include <vector>
#include <cmath>
#include <mutex>
#include <system_error>
#include "core/common/common.h"
#include "core/framework/ortdevice.h"
#include "core/framework/ortmemoryinfo.h"
#include "rocm_ops.h"
#include "onnxruntime_lite_custom_op.h"
// static const char* c_OpDomain = "test.customop";
static const char* c_OpDomain = "";
static void AddOrtCustomOpDomainToContainer(Ort::CustomOpDomain&& domain) {
static std::vector<Ort::CustomOpDomain> ort_custom_op_domain_container;
static std::mutex ort_custom_op_domain_mutex;
std::lock_guard<std::mutex> lock(ort_custom_op_domain_mutex);
ort_custom_op_domain_container.push_back(std::move(domain));
}
OrtStatus* ORT_API_CALL RegisterCustomOps(OrtSessionOptions* options, const OrtApiBase* api) {
Ort::Global<void>::api_ = api->GetApi(ORT_API_VERSION);
OrtStatus* result = nullptr;
ORT_TRY {
Ort::CustomOpDomain domain{c_OpDomain};
Rocm::RegisterOps(domain);
Ort::UnownedSessionOptions session_options(options);
session_options.Add(domain);
AddOrtCustomOpDomainToContainer(std::move(domain));
}
ORT_CATCH(const std::exception& e) {
ORT_HANDLE_EXCEPTION([&]() {
Ort::Status status{e};
result = status.release();
});
}
return result;
}
OrtStatus* ORT_API_CALL RegisterCustomOpsAltName(OrtSessionOptions* options, const OrtApiBase* api) {
return RegisterCustomOps(options, api);
}
#pragma once
#include "onnxruntime_c_api.h"
#ifdef __cplusplus
extern "C" {
#endif
ORT_EXPORT OrtStatus* ORT_API_CALL RegisterCustomOps(OrtSessionOptions* options, const OrtApiBase* api);
// alternative name to test registration by function name
ORT_EXPORT OrtStatus* ORT_API_CALL RegisterCustomOpsAltName(OrtSessionOptions* options, const OrtApiBase* api);
#ifdef __cplusplus
}
#endif
import numpy as np
import onnx
from onnx import helper, numpy_helper
from onnx import onnx_pb as onnx_proto
def _npfloat16_to_int(np_list):
return [int(bin(_.view('H'))[2:].zfill(16), 2) for _ in np_list]
def convert_np_to_float16(np_array, min_positive_val=1e-7, max_finite_val=1e4):
def between(a, b, c):
return np.logical_and(a < b, b < c)
np_array = np.where(between(0, np_array, min_positive_val), min_positive_val, np_array)
np_array = np.where(between(-min_positive_val, np_array, 0), -min_positive_val, np_array)
np_array = np.where(between(max_finite_val, np_array, float('inf')), max_finite_val, np_array)
np_array = np.where(between(float('-inf'), np_array, -max_finite_val), -max_finite_val, np_array)
return np.float16(np_array)
def convert_tensor_float_to_float16(tensor, min_positive_val=1e-7, max_finite_val=1e4):
if not isinstance(tensor, onnx_proto.TensorProto):
raise ValueError('Expected input type is an ONNX TensorProto but got %s' % type(tensor))
if tensor.data_type == onnx_proto.TensorProto.FLOAT:
tensor.data_type = onnx_proto.TensorProto.FLOAT16
if tensor.float_data:
float16_data = convert_np_to_float16(np.array(tensor.float_data),
min_positive_val, max_finite_val)
int_list = _npfloat16_to_int(float16_data)
tensor.int32_data[:] = int_list
tensor.float_data[:] = []
if tensor.raw_data:
float32_list = np.frombuffer(tensor.raw_data, dtype='float32')
float16_list = convert_np_to_float16(float32_list, min_positive_val, max_finite_val)
tensor.raw_data = float16_list.tobytes()
return tensor
def make_value_info_from_tensor(tensor):
shape = numpy_helper.to_array(tensor).shape
return helper.make_tensor_value_info(tensor.name, tensor.data_type, shape)
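# Illustrative usage sketch (not part of the original module): convert a float32
# initializer to float16 with the helpers above. Values whose magnitude is below
# min_positive_val or above max_finite_val are clamped before the cast so they
# neither underflow to zero nor overflow to inf in fp16.
def _example_fp16_conversion():
    t = numpy_helper.from_array(np.array([1e-9, 0.5, 2e5], dtype=np.float32), 'w')
    t16 = convert_tensor_float_to_float16(t)
    # data_type 10 is FLOAT16; the values come back as approximately [1e-7, 0.5, 1e4]
    print(t16.data_type, numpy_helper.to_array(t16))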
import onnx
from onnx import shape_inference, helper, TensorProto
import numpy as np
import os
import fp16
DEBUG = False
INPUT_TYPE_TO_NP_TYPE_MAP = {
'tensor(float)': np.dtype('float32'),
'tensor(uint8)': np.dtype('uint8'),
'tensor(int8)': np.dtype('int8'),
'tensor(uint16)': np.dtype('uint16'),
'tensor(int16)': np.dtype('int16'),
'tensor(int32)': np.dtype('int32'),
'tensor(int64)': np.dtype('int64'),
'tensor(bool)': np.dtype('bool'),
'tensor(float16)': np.dtype('float16'),
'tensor(float64)': np.dtype('float64'),
'tensor(complex64)': np.dtype('complex64'),
'tensor(complex128)': np.dtype('complex128'),
'tensor(string)': np.dtype(np.str_),
'tensor(float8e5m2)': np.dtype('int8'),
'tensor(float8e5m2fnuz)': np.dtype('int8'),
'tensor(float8e4m3fnuz)': np.dtype('int8'),
'tensor(float8e4m3fn)': np.dtype('int8'),
'seq(tensor(complex64))': np.dtype('complex64'),
'seq(tensor(complex128))': np.dtype('complex128'),
'seq(tensor(uint8))': np.dtype('uint8'),
'seq(tensor(int8))': np.dtype('int8'),
'seq(tensor(int16))': np.dtype('int16'),
'seq(tensor(uint16))': np.dtype('uint16'),
'seq(tensor(int32))': np.dtype('int32'),
'seq(tensor(uint32))': np.dtype('uint32'),
'seq(tensor(int64))': np.dtype('int64'),
'seq(tensor(uint64))': np.dtype('uint64'),
'seq(tensor(float))': np.dtype('float32'),
'seq(tensor(float16))': np.dtype('float16'),
'seq(tensor(double))': np.dtype('double'),
'seq(tensor(bool))': np.dtype('bool'),
'seq(tensor(string))': np.dtype(np.str_)
}
NP_TYPE_TO_ONNX_TYPE_MAP = {
np.dtype('float32') : 1,
np.dtype('uint8') : 2,
np.dtype('int8') : 3,
np.dtype('uint16') : 4,
np.dtype('int16') : 5,
np.dtype('int32') : 6,
np.dtype('int64') : 7,
np.dtype(np.str_) : 8,
np.dtype('bool') : 9,
np.dtype('float16') : 10,
np.dtype('float64') : 11,
np.dtype('uint32') : 12,
np.dtype('uint64') : 13,
np.dtype('complex64') : 14,
np.dtype('complex128') : 15,
}
Attribute_TYPE_MAP = {
0 : 'UNDEFINED',
1 : 'f',
2 : 'i',
3 : 's',
4 : 't',
5 : 'g',
6 : 'floats',
7 : 'ints',
8 : 'strings',
9 : 'tensors',
10 : 'graphs'
}
class node_utils:
def __init__(self, modelPath, batchSize=1, isINT8Model=True, inputShape=None):
print('Process {}'.format(modelPath))
self.modelPath = modelPath
self.batchSize = batchSize
self.isINT8Model = isINT8Model
self.filter_op_type = ['QuantizeLinear', 'DequantizeLinear', 'Squeeze', 'Unsqueeze', 'Shape']
self.select_input_op_type = ['Shape', 'Squeeze', 'Unsqueeze']
self.quant_op_list = ['Conv', 'ConvTranspose', 'Gemm', 'GlobalAveragePool', 'MaxPool', 'Add', 'Sub', 'Div', 'Mul',
'MatMul', 'Transpose', 'Reshape', 'Flatten', 'Soft', 'Pad', 'Pow', 'Concat', 'LeakyRelu', 'Relu',
'PRelu', 'Clip', 'Resize', 'AveragePool', 'GlobalMaxPool', 'Split', 'Slice', 'Sigmoid',
'ReduceMean', 'Softplus', 'Tanh', 'ReduceSum', 'Gather', 'Swish', 'HardSigmoid', 'Mish']
self.model_op_list = ['Swish','LayerNorm','Mish']
self.input_node_dict = {}
self.output_node_dict = {}
self.name_node_dict = {}
self.initializer_dict = {}
self.value_info_dict = {}
self.node_type_dict = {}
self.model = onnx.load(modelPath)
if inputShape is not None:
for input in self.model.graph.input:
if input.name not in inputShape:
print('[ERROR]: input {} with no input shape, please check!'.format(input.name))
exit()
shape = inputShape[input.name]
for idx, s in enumerate(shape):
input.type.tensor_type.shape.dim[idx].dim_value = s
self.initModelState()
def RefreshState(self):
self.input_node_dict = {}
self.output_node_dict = {}
self.name_node_dict = {}
self.initializer_dict = {}
self.value_info_dict = {}
self.initModelState()
def initModelState(self):
model = self.model
for node in model.graph.node:
if not node.name in self.name_node_dict:
self.name_node_dict[node.name] = node
for input in node.input:
if not input in self.input_node_dict:
self.input_node_dict[input] = [node]
else:
self.input_node_dict[input].append(node)
for output in node.output:
if not output in self.output_node_dict:
self.output_node_dict[output] = [node]
else:
self.output_node_dict[output].append(node)
for value_info in model.graph.value_info:
self.value_info_dict[value_info.name] = value_info
for initializer in model.graph.initializer:
self.initializer_dict[initializer.name] = initializer
def get_relate_qdq_node(self, name_list, is_front=True):
qdq_list = []
for name in name_list:
if is_front and name in self.output_node_dict:
front_node_list = self.output_node_dict[name]
for node in front_node_list:
if node.op_type == 'DequantizeLinear':
qdq_list.append(node)
elif not is_front and name in self.input_node_dict:
back_node_list = self.input_node_dict[name]
for node in back_node_list:
if node.op_type == 'QuantizeLinear':
qdq_list.append(node)
return qdq_list
def correct_value_info_by_ort(self):
import onnxruntime as ort
import time
import copy
def setBatchSize(model, batchSize):
for input in model.graph.input:
input.type.tensor_type.shape.dim[0].dim_value = batchSize
for output in model.graph.output:
output.type.tensor_type.shape.dim[0].dim_value = batchSize
return model
def needORTInfer(value_info):
if len(value_info.type.tensor_type.shape.dim) == 0:
return True
if firstDimIsBatch(value_info) and value_info.type.tensor_type.shape.dim[0].dim_value == -1:
return False
for dim in value_info.type.tensor_type.shape.dim:
if dim.dim_value <= 0:
return True
return False
def canInferShapeByCPU(opset_import):
for detail in opset_import:
if detail.domain not in ['', 'ai.onnx', 'com.microsoft']:
return False
return True
def firstDimIsBatch(value_info):
node = self.output_node_dict[value_info.name][0]
if node.op_type == 'Concat' and len(value_info.type.tensor_type.shape.dim) == 1:
return False
return True
model = self.model
model = setBatchSize(model, self.batchSize)
if canInferShapeByCPU(model.opset_import):
try:
del model.graph.value_info[:]
if self.modelPath.find('bert') < 0:
model = shape_inference.infer_shapes(model, check_type=True, strict_mode=True, data_prop=True)
else:
print("skip shape inference for bert...")
self.model = model
self.RefreshState()
except:
print("shape inference failed use onnxruntime infer later")
batchsize = self.batchSize
output_backup = copy.deepcopy(model.graph.output)
session_options = ort.SessionOptions()
session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_DISABLE_ALL
if self.modelPath.find('bert') < 0 and canInferShapeByCPU(model.opset_import):
EP_list = ['CPUExecutionProvider']
else:
EP_list = ['CUDAExecutionProvider']
all_tensor_dict = {}
for node in model.graph.node:
for output in node.output:
if output not in all_tensor_dict:
all_tensor_dict[output] = 1
else:
all_tensor_dict[output] += 1
value_info_list = []
value_info_dict = {}
value_info_list_back = []
for tensor_name in list(all_tensor_dict.keys()):
if DEBUG:
print(tensor_name)
if tensor_name not in self.value_info_dict:
if DEBUG:
print('tensor {} not in self.value_info_dict and need infer by ORT'.format(tensor_name))
value_info_list.append(tensor_name)
value_info_dict[tensor_name] = onnx.ValueInfoProto(name=tensor_name)
continue
value_info = self.value_info_dict[tensor_name]
if self.output_node_dict[tensor_name][0].op_type in self.filter_shape_correct_op_type:
value_info_list_back.append(copy.deepcopy(value_info))
if DEBUG:
print('add tensor {} to backup list'.format(tensor_name))
continue
if needORTInfer(value_info):
value_info_list.append(tensor_name)
value_info_dict[tensor_name] = value_info
if DEBUG:
print('tensor {} need infer by ORT'.format(tensor_name))
else:
if len(value_info.type.tensor_type.shape.dim) > 0 and firstDimIsBatch(value_info):
value_info.type.tensor_type.shape.dim[0].dim_value = batchsize
value_info_list_back.append(copy.deepcopy(value_info))
if DEBUG:
print('modify tensor {} batchsize and add to backup list'.format(tensor_name))
if len(value_info_list) > 0:
step = 20
infer_times = (len(value_info_list) + step - 1) // step
if DEBUG:
print('infer_times %d' % infer_times)
for i in range(infer_times):
output_name_list = []
if i == infer_times - 1:
output_name_list = value_info_list[step*i:]
else:
output_name_list = value_info_list[step*i:step*(i+1)]
if DEBUG:
print(i, output_name_list)
del model.graph.output[:]
for output_name in output_name_list:
model.graph.output.extend([onnx.ValueInfoProto(name=output_name)])
del model.graph.value_info[:]
start = time.time()
cuda_session = ort.InferenceSession(model.SerializeToString(), providers=EP_list)
end = time.time()
duration = (end - start) * 1000
if DEBUG:
print("Initialize Session cost {} ms\n".format(duration))
inputs = cuda_session.get_inputs()
input_dict = {}
for input in inputs:
shape = []
for s in input.shape:
if isinstance(s, int):
shape.append(s)
else:
print('[ERROR]: input shape must be an integer but got {}'.format(type(s)))
exit()
for idx in range(len(shape)):
if shape[idx] is None:
shape[idx] = 1
input_data = np.random.random(shape)
if input.type.find('int') > 0:
input_data = input_data*10
input_data = input_data.astype(INPUT_TYPE_TO_NP_TYPE_MAP[input.type])
input_dict[input.name] = input_data
outputs = cuda_session.run(output_name_list, input_dict)
for idx, output_name in enumerate(output_name_list):
if DEBUG:
print('tensor {} ort infered shape is {}'.format(output_name, outputs[idx].shape))
infered_shape = list(outputs[idx].shape)
new_tmp_value_info = helper.make_tensor_value_info(output_name, NP_TYPE_TO_ONNX_TYPE_MAP[outputs[idx].dtype], infered_shape)
value_info_dict[output_name] = new_tmp_value_info
del cuda_session
del model.graph.value_info[:]
model.graph.value_info.extend(list(value_info_dict.values()))
model.graph.value_info.extend(value_info_list_back)
del model.graph.output[:]
model.graph.output.extend(output_backup)
else:
del value_info_list_back
self.model = setBatchSize(model, batchsize)
def get_node_unique_info(self, node):
unique_str = '{}-'.format(node.op_type)
for input in node.input:
if input in self.value_info_dict:
value_info = self.value_info_dict[input]
start_idx = 0
for dim in value_info.type.tensor_type.shape.dim[start_idx:]:
unique_str += '{},'.format(dim.dim_value)
unique_str = unique_str[:-1]
unique_str += '_elemType_{}_'.format(value_info.type.tensor_type.elem_type)
elif input in self.initializer_dict:
initial_tensor = self.initializer_dict[input]
start_idx = 0
for dim in initial_tensor.dims[start_idx:]:
unique_str += '{},'.format(dim)
unique_str = unique_str[:-1]
unique_str += '_elemType_{}_'.format(initial_tensor.data_type)
else:
continue
unique_str = unique_str[:-1]
for attr in node.attribute:
unique_str += '_{}_'.format(attr.name)
attr_type = Attribute_TYPE_MAP[attr.type]
value = getattr(attr, attr_type)
if attr.type in [1, 2, 3]:
unique_str += '{}'.format(value)
elif attr.type in [6, 7, 8]:
for v in value:
unique_str += '{},'.format(v)
unique_str = unique_str[:-1]
return unique_str
def modify_FP32_to_FP16(self, graph):
for input in graph.input:
if input.type.tensor_type.elem_type == 1:
input.type.tensor_type.elem_type = 10
for output in graph.output:
if output.type.tensor_type.elem_type == 1:
output.type.tensor_type.elem_type = 10
new_initializer = []
for initializer in graph.initializer:
if initializer.data_type == 1:
new_initializer.append(fp16.convert_tensor_float_to_float16(initializer))
else:
new_initializer.append(initializer)
del graph.initializer[:]
graph.initializer.extend(new_initializer)
return graph
def extractNodeAndSave(self, save_path):
self.correct_value_info_by_ort()
self.RefreshState()
model = self.model
model_list = []
node_list = []
for node in model.graph.node:
if node.op_type in self.filter_op_type:
continue
node_list.append(node)
for idx, node in enumerate(node_list):
if DEBUG:
print(node.name, node.op_type)
unique_str = self.get_node_unique_info(node=node)
if unique_str not in self.node_type_dict:
self.node_type_dict[unique_str] = [node.name]
else:
self.node_type_dict[unique_str].append(node.name)
if DEBUG:
print('Repeated node, skip...')
continue
input_list = []
output_list = []
new_node_list = [node]
new_initializer_list = []
if self.isINT8Model:
if node.op_type in self.quant_op_list:
dq_node = self.get_relate_qdq_node(node.input)
q_node = self.get_relate_qdq_node(node.output, False)
new_node_list.extend(dq_node)
new_node_list.extend(q_node)
for new_node in new_node_list:
input_list.extend(new_node.input)
output_list.extend(new_node.output)
inter_tensors = []
for output in output_list:
if output in input_list:
inter_tensors.append(output)
pure_input_list = []
pure_output_list = []
for input in input_list:
if input not in inter_tensors:
pure_input_list.append(input)
for output in output_list:
if output not in inter_tensors:
pure_output_list.append(output)
new_graph_input = []
for input in pure_input_list:
if input in self.initializer_dict:
new_initializer_list.append(self.initializer_dict[input])
else:
new_graph_input.append(input)
graph_input_proto_list = []
for graph_input in new_graph_input:
if graph_input in self.value_info_dict:
input_value_info = self.value_info_dict[graph_input]
else:
if DEBUG:
print('tensor {} not in self.value_info_dict'.format(graph_input))
for input in model.graph.input:
if input.name == graph_input:
input_value_info = input
if DEBUG:
print('get tensor {} from model.graph.input'.format(graph_input))
graph_input_proto_list.append(input_value_info)
graph_output_proto_list = []
for graph_output in pure_output_list:
if graph_output in self.value_info_dict:
output_value_info = self.value_info_dict[graph_output]
else:
if DEBUG:
print('tensor {} not in self.value_info_dict'.format(graph_output))
for output in model.graph.output:
if output.name == graph_output:
output_value_info = output
if DEBUG:
print('get tensor {} from model.graph.output'.format(graph_output))
graph_output_proto_list.append(output_value_info)
if node.op_type in ['Reshape', 'Expand']:
output_shape = [dim.dim_value for dim in graph_output_proto_list[0].type.tensor_type.shape.dim]
Reshape_input_shape = helper.make_tensor(node.input[1], TensorProto.INT64, [len(output_shape)], np.array(output_shape))
if len(graph_input_proto_list) == 2 and node.op_type == 'Reshape':
new_initializer_list.append(Reshape_input_shape)
remove_input_idx = 0
for input_idx in range(len(graph_input_proto_list)):
if graph_input_proto_list[input_idx].name == node.input[1]:
remove_input_idx = input_idx
break
del graph_input_proto_list[remove_input_idx]
elif node.op_type == 'Expand':
new_initializer_list.append(Reshape_input_shape)
remove_input_idx = 0
for input_idx in range(len(graph_input_proto_list)):
if graph_input_proto_list[input_idx].name == node.input[0]:
remove_input_idx = input_idx
break
del graph_input_proto_list[remove_input_idx]
if new_node_list[0].op_type == 'Cast':
for input, output in zip(graph_input_proto_list, graph_output_proto_list):
assert len(input.type.tensor_type.shape.dim) == len(output.type.tensor_type.shape.dim)
for idx in range(len(input.type.tensor_type.shape.dim)):
if input.type.tensor_type.shape.dim[idx].dim_value != output.type.tensor_type.shape.dim[idx].dim_value:
input.type.tensor_type.shape.dim[idx].dim_value = output.type.tensor_type.shape.dim[idx].dim_value
single_node_graph = helper.make_graph(
new_node_list,
'{}_single'.format(node.name),
graph_input_proto_list,
graph_output_proto_list,
new_initializer_list
)
if len(new_node_list) == 1 and new_node_list[0].op_type != 'Cast':
single_node_graph = self.modify_FP32_to_FP16(single_node_graph)
if node.op_type in self.model_op_list:
single_node_model = helper.make_model(
single_node_graph,
producer_name='model-single_op_model',
opset_imports=[helper.make_opsetid("", 13)]
)
else:
single_node_model = helper.make_model(
single_node_graph,
producer_name='model-single_op_model',
opset_imports=[helper.make_opsetid("", 13)]
)
single_node_model.ir_version = model.ir_version
try:
single_node_model = shape_inference.infer_shapes(single_node_model)
except:
print('single node model infer shape error... save original model')
new_path = '{}/{}/{}_{}.onnx'.format(save_path, node.op_type, node.op_type, idx)
if not os.path.exists(os.path.split(new_path)[0]):
os.makedirs(os.path.split(new_path)[0])
onnx.save_model(single_node_model, new_path)
model_name = os.path.basename(self.modelPath).split('.')[0]
model_info = {"name": model_name, "path": new_path, "node_detail": unique_str}
model_list.append(model_info)
return model_list
if __name__ == '__main__':
modelPath = 'models/modelzoo1.0/detection/ox_yolov4_dy/ox_yolov4_dy_int8.onnx'
extractNode = node_utils(modelPath, 1)
save_path = './yolov4_node_model'
modelPathList = extractNode.extractNodeAndSave(save_path)
for nodeModelPath in modelPathList:
print(nodeModelPath['name'], nodeModelPath['path'])
# Custom Op ONNXRuntime
## Purpose
Add a custom operator implementation and register it with ONNX Runtime.
## Environment Setup
### Docker (Option 1)
Pull the image:
```plaintext
docker pull image.sourcefind.cn:5000/dcu/admin/base/custom:2.4.1-ubuntu22.04-dtk25.04-py3.10-fixpy-onnx1.19.2
```
Create and start the container:
```plaintext
docker run --shm-size 16g --network=host --name=test --privileged --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -v $PWD/customop_onnxruntime:/home/customop -it <Your Image ID> /bin/bash
```
### Dockerfile (Option 2)
```
cd ./docker
docker build --no-cache -t customop:test .
docker run --shm-size 16g --network=host --name=video_ort --privileged --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -v $PWD/customop_onnxruntime:/home/customop -it <Your Image ID> /bin/bash
```
## Usage
### Build the project
```
git clone http://developer.sourcefind.cn/codes/modelzoo/customop_onnxruntime.git
cd customop_onnxruntime
python model.py // generate the model
bash compile.sh // build the custom operator library
```
### Run the example
1. Directory layout
```
rocm_custom_op
├── compile.sh // build script
├── custom_op_library.cc // registers the custom operators
├── custom_op_library.h
├── docker
├── include
├── model.py // creates the custom model
├── readme.md
├── rocm_ops.cc // invokes the custom operators
├── rocm_ops.h
├── rocm_ops.hip // custom operator implementations
└── benchmark.py // tests the operators
```
2. Run
```
python test.py // test the model
```
3. Notes
```
1. model.py defines an add.onnx model. To define a different model, modify that file (a minimal sketch is given below).
2. In compile.sh, replace gfx906 in the option --offload-arch=gfx906 with the ROCm architecture of your machine (check with rocminfo | grep gfx).
3. To test a different custom operator, modify the following:
3.1 Modify rocm_ops.hip to re-implement the custom operator.
3.2 Modify rocm_ops.cc to change how the custom operator is invoked.
3.3 Modify model.py to change how the custom model is created.
3.4 Modify benchmark.py to change the input data used when running the model.
```
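As referenced in note 1, here is a minimal sketch of how a single-operator model such as add.onnx can be built with the onnx helper API. It assumes one Add node in the default domain with fixed 1x4 inputs; the actual model.py in this repository may differ.
```python
import onnx
from onnx import helper, TensorProto

# declare the graph inputs/outputs
A = helper.make_tensor_value_info('A', TensorProto.FLOAT, [1, 4])
B = helper.make_tensor_value_info('B', TensorProto.FLOAT, [1, 4])
C = helper.make_tensor_value_info('C', TensorProto.FLOAT, [1, 4])

# one Add node in the default ("") domain, which is where the custom ops are registered
node = helper.make_node('Add', inputs=['A', 'B'], outputs=['C'])
graph = helper.make_graph([node], 'add_graph', [A, B], [C])
model = helper.make_model(graph, opset_imports=[helper.make_opsetid('', 13)])

onnx.checker.check_model(model)
onnx.save(model, 'add.onnx')
```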
## Result
### Accuracy
## Application Scenarios
### Function
Add ONNX Runtime custom operators.
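A minimal Python sketch of how the built library is loaded into an ONNX Runtime session. The library path and the input names A/B follow the add.onnx sketch above and are assumptions; benchmark.py shows the full flow used in this repository.
```python
import numpy as np
import onnxruntime as ort

so = ort.SessionOptions()
# point this at the libcustom_op_library.so produced by compile.sh
so.register_custom_ops_library("./libcustom_op_library.so")

sess = ort.InferenceSession("add.onnx", sess_options=so,
                            providers=["ROCMExecutionProvider"])
x = np.random.rand(1, 4).astype(np.float32)
print(sess.run(None, {"A": x, "B": x})[0])
```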
## Source Repository and Issue Feedback
- http://developer.sourcefind.cn/codes/modelzoo/customop_onnxruntime.git
## References
- https://github.com/microsoft/onnxruntime
- https://onnxruntime.ai/docs/extensions/add-op.html
\ No newline at end of file
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#ifdef USE_ROCM
#define ORT_API_MANUAL_INIT
#include "onnxruntime_cxx_api.h"
#undef ORT_API_MANUAL_INIT
#include "core/providers/rocm/rocm_context.h"
#include "onnxruntime_lite_custom_op.h"
//Concat
void rocm_concat(int axis, int M1, int N1, const float* X1, int M2, int N2, const float* X2, float* Z, hipStream_t stream);
//Gemm
void rocm_gemm(bool transA, bool transB, int M, int N, int K, float alpha, const float* A, const float* B, float beta, float* C, hipStream_t stream);
extern "C"{
//LeakyRelu
void rocm_leaky_relu(
int64_t size,
const float* d_X,
float* d_Y,
float alpha,
hipStream_t stream);
//Attention
void rocm_attention(int B, int S, int H,
const float* Q, const float* K, const float* V,
float* Out, hipStream_t stream);
//BatchNormalization
void rocm_batch_norm(int64_t N, int64_t C, int64_t H, int64_t W,
const float* X,
const float* gamma,
const float* beta,
const float* mean,
const float* var,
float epsilon,
float* Y,
hipStream_t stream);
//Cast
void rocm_cast(
int64_t N, // batch size
int64_t C, // channels (or the first non-batch dimension)
int64_t H, // height (or the second dimension)
int64_t W, // width (or the third dimension)
const float* X, // input pointer
int32_t* Y, // output pointer
hipStream_t stream);
//Softmax
void rocm_softmax(int64_t M, int64_t N,
const float* X, float* Y,
hipStream_t compute_stream);
//Celu
void rocm_celu(int64_t, const float*, float*, float, hipStream_t);
//Relu
void rocm_relu(
int64_t size,
const float* X,
float* Y,
hipStream_t stream
);
// Conv
void rocm_conv2d(const float* input,
const float* weight,
const float* bias,
float* output,
int N, int C_in, int H, int W,
int C_out, int K_h, int K_w,
int out_H, int out_W,
hipStream_t stream);
}
using namespace Ort::Custom;
#define CUSTOM_ENFORCE(cond, msg) \
if (!(cond)) { \
throw std::runtime_error(msg); \
}
namespace Rocm {
void rocm_leaky_relu_forward(
const RocmContext& ctx,
const Tensor<float>& X,
Tensor<float>& Y) {
CUSTOM_ENFORCE(ctx.hip_stream, "No HIP stream available");
int64_t size = X.NumberOfElement();
const float alpha = 0.01f;
auto* y_ptr = Y.Allocate(X.Shape());
rocm_leaky_relu(size, X.Data(), y_ptr, alpha, ctx.hip_stream);
}
void rocm_relu_forward(
const Ort::Custom::RocmContext& rocm_ctx,
const Ort::Custom::Tensor<float>& X,
Ort::Custom::Tensor<float>& Y
) {
CUSTOM_ENFORCE(rocm_ctx.hip_stream, "failed to fetch hip stream");
auto input_shape = X.Shape();
int64_t size = X.NumberOfElement();
auto* y_data = Y.Allocate(input_shape);
rocm_relu(size, X.Data(), y_data, rocm_ctx.hip_stream);
}
void rocm_celu_forward(const Ort::Custom::RocmContext& ctx,
const Ort::Custom::Tensor<float>& X,
Ort::Custom::Tensor<float>& Y) {
CUSTOM_ENFORCE(ctx.hip_stream, "failed to fetch hip stream");
auto shape = X.Shape();
int64_t size = X.NumberOfElement();
float alpha = 1.0f; // or fetch from attribute
auto* y_ptr = Y.Allocate(shape);
rocm_celu(size, X.Data(), y_ptr, alpha, ctx.hip_stream);
}
/* softmax */
void KernelSoftmax(const Ort::Custom::RocmContext& rocm_ctx,
const Ort::Custom::Tensor<float>& X,
Ort::Custom::Tensor<float>& Z) {
auto input_shape = X.Shape();
CUSTOM_ENFORCE(rocm_ctx.hip_stream, "failed to fetch hip stream");
// Expecting 2D input: [M, N]
CUSTOM_ENFORCE(input_shape.size() == 2, "Softmax only supports 2D input");
int64_t M = static_cast<int64_t>(input_shape[0]);
int64_t N = static_cast<int64_t>(input_shape[1]);
auto z_raw = Z.Allocate(input_shape);
// Call ROCm implementation
rocm_softmax(M, N, X.Data(), z_raw, rocm_ctx.hip_stream);
}
void rocm_cast_forward(
const Ort::Custom::RocmContext& rocm_ctx,
const Ort::Custom::Tensor<float>& X,
Ort::Custom::Tensor<int32_t>& Y) {
CUSTOM_ENFORCE(rocm_ctx.hip_stream, "failed to fetch hip stream");
// assumes only 4D tensors [N,C,H,W] are supported
auto shape = X.Shape();
CUSTOM_ENFORCE(shape.size() == 4, "Cast only supports 4D tensor [N,C,H,W]");
int64_t N = shape[0];
int64_t C = shape[1];
int64_t H = shape[2];
int64_t W = shape[3];
// allocate the output
auto* y_ptr = Y.Allocate({N, C, H, W});
// call the ROCm kernel with all 7 arguments
rocm_cast(
N, C, H, W,
X.Data(),
y_ptr,
rocm_ctx.hip_stream);
}
// BatchNormalization
void rocm_batchnorm_forward(const Ort::Custom::RocmContext& rocm_ctx,
const Ort::Custom::Tensor<float>& X,
const Ort::Custom::Tensor<float>& scale,
const Ort::Custom::Tensor<float>& B,
const Ort::Custom::Tensor<float>& mean,
const Ort::Custom::Tensor<float>& var,
Ort::Custom::Tensor<float>& Y) {
CUSTOM_ENFORCE(rocm_ctx.hip_stream, "failed to fetch hip stream");
auto shape = X.Shape(); // expects [N, C, H, W]
CUSTOM_ENFORCE(shape.size() == 4, "Input must be 4D tensor [N, C, H, W]");
int64_t N = shape[0];
int64_t C = shape[1];
int64_t H = shape[2];
int64_t W = shape[3];
// Allocate output
auto* y_ptr = Y.Allocate({N, C, H, W});
// Epsilon attribute: retrieve via custom API or hardcode default
float epsilon = 1e-5f;
// If epsilon comes from attribute, fetch it here.
rocm_batch_norm(N, C, H, W,
X.Data(), scale.Data(), B.Data(), mean.Data(), var.Data(),
epsilon, y_ptr, rocm_ctx.hip_stream);
}
// attention
void rocm_attention_forward(const Ort::Custom::RocmContext& rocm_ctx,
const Ort::Custom::Tensor<float>& Q,
const Ort::Custom::Tensor<float>& K,
const Ort::Custom::Tensor<float>& V,
Ort::Custom::Tensor<float>& Out) {
CUSTOM_ENFORCE(rocm_ctx.hip_stream, "failed to fetch hip stream");
auto shape = Q.Shape(); // expected to be [B, S, H]
CUSTOM_ENFORCE(shape.size() == 3, "Input must be 3D tensor [B, S, H]");
int B = shape[0];
int S = shape[1];
int H = shape[2];
auto* out_ptr = Out.Allocate({B, S, H});
rocm_attention(B, S, H, Q.Data(), K.Data(), V.Data(), out_ptr, rocm_ctx.hip_stream);
}
// -------------------------------
// Concat
// -------------------------------
void rocm_concat_forward(const Ort::Custom::RocmContext& rocm_ctx,
const Ort::Custom::Tensor<float>& X1,
const Ort::Custom::Tensor<float>& X2,
Ort::Custom::Tensor<float>& Y) {
CUSTOM_ENFORCE(rocm_ctx.hip_stream, "failed to fetch hip stream");
auto shape1 = X1.Shape();
auto shape2 = X2.Shape();
// supports concatenating 2D tensors along columns (axis = 1)
CUSTOM_ENFORCE(shape1.size() == 2 && shape2.size() == 2, "Inputs must be 2D tensors.");
CUSTOM_ENFORCE(shape1[0] == shape2[0], "Row dimensions must match for concat on axis 1.");
int M1 = shape1[0], N1 = shape1[1];
int M2 = shape2[0], N2 = shape2[1];
auto* y_data = Y.Allocate({M1, N1 + N2}); // output is the concatenated matrix
rocm_concat(1, M1, N1, X1.Data(), M2, N2, X2.Data(), y_data, rocm_ctx.hip_stream);
}
/******conv******/
void rocm_conv_forward(const RocmContext& ctx,
const Tensor<float>& input,
const Tensor<float>& weight,
const Tensor<float>& bias,
Tensor<float>& output) {
CUSTOM_ENFORCE(ctx.hip_stream, "no HIP stream");
const auto& input_shape = input.Shape(); // [N, C_in, H, W]
const auto& weight_shape = weight.Shape(); // [C_out, C_in, K_h, K_w]
const int64_t N = input_shape[0];
const int64_t C_in = input_shape[1];
const int64_t H = input_shape[2];
const int64_t W = input_shape[3];
const int64_t C_out = weight_shape[0];
const int64_t K_h = weight_shape[2];
const int64_t K_w = weight_shape[3];
const int64_t out_H = (H - K_h) / 1 + 1;
const int64_t out_W = (W - K_w) / 1 + 1;
auto* y_ptr = output.Allocate({N, C_out, out_H, out_W});
rocm_conv2d(input.Data(), weight.Data(), bias.Data(), y_ptr,
N, C_in, H, W, C_out, K_h, K_w, out_H, out_W,
ctx.hip_stream);
}
// -------------------------------
// Gemm
// -------------------------------
void rocm_gemm_forward(const Ort::Custom::RocmContext& rocm_ctx,
const Ort::Custom::Tensor<float>& A,
const Ort::Custom::Tensor<float>& B,
const Ort::Custom::Tensor<float>& C,
Ort::Custom::Tensor<float>& Y) {
CUSTOM_ENFORCE(rocm_ctx.hip_stream, "failed to fetch hip stream");
auto shapeA = A.Shape();
auto shapeB = B.Shape();
auto shapeC = C.Shape();
CUSTOM_ENFORCE(shapeA.size() == 2 && shapeB.size() == 2 && shapeC.size() == 2, "Inputs must be 2D tensors.");
int M = shapeA[0];
int K = shapeA[1];
int N = shapeB[1];
CUSTOM_ENFORCE(shapeB[0] == K, "Inner dimension mismatch between A and B in Gemm.");
CUSTOM_ENFORCE(shapeC[0] == M && shapeC[1] == N, "Output tensor shape mismatch in Gemm.");
auto* y_data = Y.Allocate({M, N});
rocm_gemm(false, false, M, N, K, 1.0f, A.Data(), B.Data(), 1.0f, y_data, rocm_ctx.hip_stream);
}
void RegisterOps(Ort::CustomOpDomain& domain) {
// register the Attention op
static const std::unique_ptr<OrtLiteCustomOp> c_CustomOpAttention{Ort::Custom::CreateLiteCustomOp("Attention", "ROCMExecutionProvider", rocm_attention_forward)};
domain.Add(c_CustomOpAttention.get());
// register the BatchNormalization op
static const std::unique_ptr<OrtLiteCustomOp> c_CustomOpBatchNorm{Ort::Custom::CreateLiteCustomOp("BatchNormalization", "ROCMExecutionProvider", rocm_batchnorm_forward)};
domain.Add(c_CustomOpBatchNorm.get());
// register the Concat op
static const std::unique_ptr<OrtLiteCustomOp> c_CustomOpConcat{Ort::Custom::CreateLiteCustomOp("Concat", "ROCMExecutionProvider", rocm_concat_forward)};
domain.Add(c_CustomOpConcat.get());
// register the Gemm op
static const std::unique_ptr<OrtLiteCustomOp> c_CustomOpGemm{Ort::Custom::CreateLiteCustomOp("Gemm", "ROCMExecutionProvider", rocm_gemm_forward)};
domain.Add(c_CustomOpGemm.get());
// register the Cast op
static const std::unique_ptr<OrtLiteCustomOp> c_CustomOpCast{Ort::Custom::CreateLiteCustomOp("Cast", "ROCMExecutionProvider", rocm_cast_forward)};
domain.Add(c_CustomOpCast.get());
// register the Softmax op
static const std::unique_ptr<OrtLiteCustomOp> c_CustomOpSoftmax{Ort::Custom::CreateLiteCustomOp("Softmax","ROCMExecutionProvider", KernelSoftmax)};
domain.Add(c_CustomOpSoftmax.get());
// register the Celu op
static const std::unique_ptr<OrtLiteCustomOp> c_CeluOp{Ort::Custom::CreateLiteCustomOp("Celu", "ROCMExecutionProvider", rocm_celu_forward)};
domain.Add(c_CeluOp.get());
// register the Relu op
static const std::unique_ptr<OrtLiteCustomOp> c_CustomOpRelu{
Ort::Custom::CreateLiteCustomOp("Relu", "ROCMExecutionProvider", rocm_relu_forward)};
domain.Add(c_CustomOpRelu.get());
// register the LeakyRelu op
static const std::unique_ptr<OrtLiteCustomOp> c_LeakyReLU{
Ort::Custom::CreateLiteCustomOp(
"LeakyRelu", "ROCMExecutionProvider", rocm_leaky_relu_forward)};
domain.Add(c_LeakyReLU.get());
// register the Conv op
static const std::unique_ptr<OrtLiteCustomOp> c_Conv{
Ort::Custom::CreateLiteCustomOp("Conv", "ROCMExecutionProvider", rocm_conv_forward)};
domain.Add(c_Conv.get());
}
} // namespace Rocm
#endif
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
namespace Rocm {
#ifdef USE_ROCM
void RegisterOps(Ort::CustomOpDomain& domain);
#else
inline void RegisterOps(Ort::CustomOpDomain&) {}
#endif
} // namespace Rocm
#include "hip/hip_runtime.h"
#include <stdio.h>
#include <math.h>
__global__ void _Add(long long sz, float* Z, const float* X, const float* Y) {
long long offset = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
if (offset < sz) {
Z[offset] = X[offset] + Y[offset];
}
return;
}
void rocm_add(int64_t sz, float* Z, const float* X, const float* Y, hipStream_t compute_stream) {
float *d_X, *d_Y, *d_Z;
// allocate device memory for the arrays and check each allocation
hipError_t err;
err = hipMalloc((void**)&d_X, sz * sizeof(float));
if (err != hipSuccess) {
fprintf(stderr, "hipMalloc for d_X failed: %s\n", hipGetErrorString(err));
return;
}
err = hipMalloc((void**)&d_Y, sz * sizeof(float));
if (err != hipSuccess) {
fprintf(stderr, "hipMalloc for d_Y failed: %s\n", hipGetErrorString(err));
hipFree(d_X);
return;
}
err = hipMalloc((void**)&d_Z, sz * sizeof(float));
if (err != hipSuccess) {
fprintf(stderr, "hipMalloc for d_Z failed: %s\n", hipGetErrorString(err));
hipFree(d_X);
hipFree(d_Y);
return;
}
// copy the host X and Y arrays to the device
err = hipMemcpyAsync(d_X, X, sz * sizeof(float), hipMemcpyHostToDevice, compute_stream);
if (err != hipSuccess) {
fprintf(stderr, "hipMemcpyAsync for d_X failed: %s\n", hipGetErrorString(err));
hipFree(d_X);
hipFree(d_Y);
hipFree(d_Z);
return;
}
err = hipMemcpyAsync(d_Y, Y, sz * sizeof(float), hipMemcpyHostToDevice, compute_stream);
if (err != hipSuccess) {
fprintf(stderr, "hipMemcpyAsync for d_Y failed: %s\n", hipGetErrorString(err));
hipFree(d_X);
hipFree(d_Y);
hipFree(d_Z);
return;
}
// launch the kernel with enough blocks to cover all elements
int blocks = static_cast<int>((sz + 255) / 256);
_Add<<<blocks, 256, 0, compute_stream>>>(static_cast<long long>(sz), d_Z, d_X, d_Y);
err = hipGetLastError();
if (err != hipSuccess) {
fprintf(stderr, "Kernel launch failed: %s\n", hipGetErrorString(err));
hipFree(d_X);
hipFree(d_Y);
hipFree(d_Z);
return;
}
// copy the result from the device back to the host
err = hipMemcpyAsync(Z, d_Z, sz * sizeof(float), hipMemcpyDeviceToHost, compute_stream);
if (err != hipSuccess) {
fprintf(stderr, "hipMemcpyAsync for Z failed: %s\n", hipGetErrorString(err));
hipFree(d_X);
hipFree(d_Y);
hipFree(d_Z);
return;
}
// synchronize the stream to make sure all operations have finished
err = hipStreamSynchronize(compute_stream);
if (err != hipSuccess) {
fprintf(stderr, "hipStreamSynchronize failed: %s\n", hipGetErrorString(err));
hipFree(d_X);
hipFree(d_Y);
hipFree(d_Z);
return;
}
// free device memory
hipFree(d_X);
hipFree(d_Y);
hipFree(d_Z);
}
// Concat
__global__ void _Concat2D(int axis,
int M1, int N1, const float* X1,
int M2, int N2, const float* X2,
float* Z) {
int row = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
int col = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
if (axis == 0) { // concatenate along rows
if (row < M1 && col < N1) {
Z[row * N1 + col] = X1[row * N1 + col];
} else if (row >= M1 && row < M1 + M2 && col < N2) {
Z[row * N2 + col] = X2[(row - M1) * N2 + col];
}
} else if (axis == 1) { // concatenate along columns
if (row < M1 && col < N1) {
Z[row * (N1 + N2) + col] = X1[row * N1 + col];
} else if (row < M2 && col >= N1 && col < N1 + N2) {
Z[row * (N1 + N2) + col] = X2[row * N2 + (col - N1)];
}
}
return;
}
void rocm_concat(int axis,
int M1, int N1, const float* X1,
int M2, int N2, const float* X2,
float* Z,
hipStream_t compute_stream) {
dim3 blockDim(16, 16);
dim3 gridDim(((axis == 0 ? N1 : N1 + N2) + 15) / 16, ((axis == 0 ? M1 + M2 : M1) + 15) / 16);
float *d_X1, *d_X2, *d_Z;
hipError_t err;
size_t size1 = M1 * N1 * sizeof(float);
size_t size2 = M2 * N2 * sizeof(float);
size_t sizeZ = (axis == 0 ? (M1 + M2) * N1 : M1 * (N1 + N2)) * sizeof(float);
// allocate device memory
err = hipMalloc(&d_X1, size1); if (err != hipSuccess) { return; }
err = hipMalloc(&d_X2, size2); if (err != hipSuccess) { hipFree(d_X1); return; }
err = hipMalloc(&d_Z, sizeZ); if (err != hipSuccess) { hipFree(d_X1); hipFree(d_X2); return; }
// copy the inputs to the device
hipMemcpyAsync(d_X1, X1, size1, hipMemcpyHostToDevice, compute_stream);
hipMemcpyAsync(d_X2, X2, size2, hipMemcpyHostToDevice, compute_stream);
// launch the kernel
_Concat2D<<<gridDim, blockDim, 0, compute_stream>>>(axis, M1, N1, d_X1, M2, N2, d_X2, d_Z);
// copy the result back to the host
hipMemcpyAsync(Z, d_Z, sizeZ, hipMemcpyDeviceToHost, compute_stream);
// synchronize the stream
hipStreamSynchronize(compute_stream);
// free resources
hipFree(d_X1); hipFree(d_X2); hipFree(d_Z);
return;
}
//gemm
#include <hip/hip_runtime.h>
__global__ void _Gemm(bool transA, bool transB,
int M, int N, int K,
float alpha,
const float* A,
const float* B,
float beta,
float* C) {
int row = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
int col = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
if (row >= M || col >= N) return;
float sum = 0.0f;
for (int k = 0; k < K; ++k) {
float a = transA ? A[k * M + row] : A[row * K + k];
float b = transB ? B[col * K + k] : B[k * N + col];
sum += a * b;
}
C[row * N + col] = alpha * sum + beta * C[row * N + col];
return;
}
void rocm_gemm(bool transA, bool transB,
int M, int N, int K,
float alpha,
const float* A,
const float* B,
float beta,
float* C,
hipStream_t compute_stream) {
dim3 blockDim(16, 16);
dim3 gridDim((N + 15) / 16, (M + 15) / 16);
float *d_A = nullptr, *d_B = nullptr, *d_C = nullptr;
hipError_t err;
size_t sizeA = transA ? K * M * sizeof(float) : M * K * sizeof(float);
size_t sizeB = transB ? N * K * sizeof(float) : K * N * sizeof(float);
size_t sizeC = M * N * sizeof(float);
// allocate device memory; on failure jump to the error label, which frees any non-null pointers
err = hipMalloc(&d_A, sizeA); if (err != hipSuccess) { goto error; }
err = hipMalloc(&d_B, sizeB); if (err != hipSuccess) { goto error; }
err = hipMalloc(&d_C, sizeC); if (err != hipSuccess) { goto error; }
// host -> device copies
hipMemcpyAsync(d_A, A, sizeA, hipMemcpyHostToDevice, compute_stream);
hipMemcpyAsync(d_B, B, sizeB, hipMemcpyHostToDevice, compute_stream);
hipMemcpyAsync(d_C, C, sizeC, hipMemcpyHostToDevice, compute_stream);
// launch the kernel
_Gemm<<<gridDim, blockDim, 0, compute_stream>>>(transA, transB, M, N, K, alpha, d_A, d_B, beta, d_C);
// device -> host copy
hipMemcpyAsync(C, d_C, sizeC, hipMemcpyDeviceToHost, compute_stream);
// synchronize the stream
hipStreamSynchronize(compute_stream);
// free resources
hipFree(d_A);
hipFree(d_B);
hipFree(d_C);
return;
error:
fprintf(stderr, "HIP memory allocation or memcpy failed in rocm_gemm\n");
if (d_A) hipFree(d_A);
if (d_B) hipFree(d_B);
if (d_C) hipFree(d_C);
}
//GroupNormalization
#include <math.h>
__global__ void _GroupNorm(
int64_t N, int64_t C, int64_t H, int64_t W, int64_t G,
float eps, const float* X, float* Y,
const float* gamma, const float* beta
) {
// compute the current group and sample indices
int64_t group_idx = hipBlockIdx_x;
int64_t n = hipBlockIdx_y;
int64_t channels_per_group = C / G;
int64_t c_start = group_idx * channels_per_group;
int64_t c_end = c_start + channels_per_group;
// total number of elements in the group
int64_t group_size = channels_per_group * H * W;
// shared memory for the reduction sums
__shared__ float shared_sum[256];
__shared__ float shared_sum_sq[256];
// each thread accumulates a partial sum and sum of squares
float sum = 0.0f, sum_sq = 0.0f;
for (int64_t idx = hipThreadIdx_x; idx < group_size; idx += hipBlockDim_x) {
int64_t c = c_start + idx / (H * W);
int64_t hw = idx % (H * W);
int64_t h = hw / W;
int64_t w = hw % W;
int64_t linear_idx = n * C * H * W + c * H * W + h * W + w;
float val = X[linear_idx];
sum += val;
sum_sq += val * val;
}
shared_sum[hipThreadIdx_x] = sum;
shared_sum_sq[hipThreadIdx_x] = sum_sq;
__syncthreads();
// tree reduction to get the global sums
for (int s = hipBlockDim_x / 2; s > 0; s >>= 1) {
if (hipThreadIdx_x < s) {
shared_sum[hipThreadIdx_x] += shared_sum[hipThreadIdx_x + s];
shared_sum_sq[hipThreadIdx_x] += shared_sum_sq[hipThreadIdx_x + s];
}
__syncthreads();
}
// compute mean and variance
float mean = shared_sum[0] / group_size;
float var = shared_sum_sq[0] / group_size - mean * mean;
// normalize and apply the affine transform
for (int64_t idx = hipThreadIdx_x; idx < group_size; idx += hipBlockDim_x) {
int64_t c = c_start + idx / (H * W);
int64_t hw = idx % (H * W);
int64_t h = hw / W;
int64_t w = hw % W;
int64_t linear_idx = n * C * H * W + c * H * W + h * W + w;
float val = (X[linear_idx] - mean) / sqrtf(var + eps);
Y[linear_idx] = gamma[c] * val + beta[c];
}
return;
}
void rocm_group_norm(
int64_t N, int64_t C, int64_t H, int64_t W, int64_t G,
float eps, const float* X, float* Y,
const float* gamma, const float* beta,
hipStream_t compute_stream
) {
dim3 block_dim(256); // 256 threads per block
dim3 grid_dim(G, N); // one block per (group, sample) pair
// parameter validation
if (C % G != 0) {
fprintf(stderr, "Error: Channels must be divisible by groups.\n");
return;
}
// allocate device memory
float *d_X, *d_Y, *d_gamma, *d_beta;
hipError_t err;
size_t input_size = N * C * H * W * sizeof(float);
size_t param_size = C * sizeof(float);
err = hipMalloc(&d_X, input_size);
if (err != hipSuccess) { return; }
err = hipMalloc(&d_Y, input_size);
if (err != hipSuccess) { hipFree(d_X); return; }
err = hipMalloc(&d_gamma, param_size);
if (err != hipSuccess) { hipFree(d_X); hipFree(d_Y); return; }
err = hipMalloc(&d_beta, param_size);
if (err != hipSuccess) { hipFree(d_X); hipFree(d_Y); hipFree(d_gamma); return; }
// copy data to the device
hipMemcpyAsync(d_X, X, input_size, hipMemcpyHostToDevice, compute_stream);
hipMemcpyAsync(d_gamma, gamma, param_size, hipMemcpyHostToDevice, compute_stream);
hipMemcpyAsync(d_beta, beta, param_size, hipMemcpyHostToDevice, compute_stream);
// launch the kernel
_GroupNorm<<<grid_dim, block_dim, 0, compute_stream>>>(
N, C, H, W, G, eps, d_X, d_Y, d_gamma, d_beta
);
// copy the result back to the host
hipMemcpyAsync(Y, d_Y, input_size, hipMemcpyDeviceToHost, compute_stream);
// synchronize the stream and free resources
hipStreamSynchronize(compute_stream);
hipFree(d_X); hipFree(d_Y); hipFree(d_gamma); hipFree(d_beta);
}
//LogSoftmax
__global__ void _LogSoftmax(int64_t N, int64_t D, const float* X, float* Y) {
int64_t n = hipBlockIdx_x; // one thread block per sample
int tid = hipThreadIdx_x;
// shared memory for the per-block maximum and exponential sum
__shared__ float shared_max[256];
__shared__ float shared_sum[256];
// step 1: find the maximum within the sample
float max_val = -INFINITY;
for (int64_t i = tid; i < D; i += hipBlockDim_x) {
max_val = fmaxf(max_val, X[n * D + i]);
}
shared_max[tid] = max_val;
__syncthreads();
// reduce to the global maximum
for (int s = hipBlockDim_x / 2; s > 0; s >>= 1) {
if (tid < s && shared_max[tid + s] > shared_max[tid]) {
shared_max[tid] = shared_max[tid + s];
}
__syncthreads();
}
float global_max = shared_max[0];
// step 2: compute the sum of exponentials
float exp_sum = 0.0f;
for (int64_t i = tid; i < D; i += hipBlockDim_x) {
exp_sum += expf(X[n * D + i] - global_max);
}
shared_sum[tid] = exp_sum;
__syncthreads();
// reduce to the global exponential sum
for (int s = hipBlockDim_x / 2; s > 0; s >>= 1) {
if (tid < s) {
shared_sum[tid] += shared_sum[tid + s];
}
__syncthreads();
}
float global_sum = shared_sum[0];
// step 3: compute LogSoftmax
for (int64_t i = tid; i < D; i += hipBlockDim_x) {
Y[n * D + i] = (X[n * D + i] - global_max) - logf(global_sum);
}
return;
}
void rocm_log_softmax(
int64_t N, int64_t D, const float* X, float* Y, hipStream_t compute_stream
) {
dim3 block_dim(256); // 256 threads per block
dim3 grid_dim(N); // one thread block per sample
// allocate device memory
float *d_X, *d_Y;
hipError_t err;
size_t input_size = N * D * sizeof(float);
err = hipMalloc(&d_X, input_size);
if (err != hipSuccess) { return; }
err = hipMalloc(&d_Y, input_size);
if (err != hipSuccess) { hipFree(d_X); return; }
// copy data to the device
hipMemcpyAsync(d_X, X, input_size, hipMemcpyHostToDevice, compute_stream);
// launch the kernel
_LogSoftmax<<<grid_dim, block_dim, 0, compute_stream>>>(N, D, d_X, d_Y);
// copy the result back to the host
hipMemcpyAsync(Y, d_Y, input_size, hipMemcpyDeviceToHost, compute_stream);
// synchronize the stream and free resources
hipStreamSynchronize(compute_stream);
hipFree(d_X); hipFree(d_Y);
}
//attention
__global__ void _DotProductAttention(int B, int S, int H,
const float* Q, const float* K, const float* V,
float scaling,
float* output) {
int b = blockIdx.z;
int i = blockIdx.y * blockDim.y + threadIdx.y; // query index
int j = blockIdx.x * blockDim.x + threadIdx.x; // hidden dim
if (b >= B || i >= S || j >= H) return;
// compute Q·K^T[i, k]
float scores[128]; // assumes seq_len <= 128
for (int k = 0; k < S; ++k) {
float dot = 0.f;
for (int h = 0; h < H; ++h) {
dot += Q[(b * S + i) * H + h] * K[(b * S + k) * H + h];
}
scores[k] = dot / scaling;
}
// softmax over scores
float max_val = scores[0];
for (int k = 1; k < S; ++k) max_val = fmaxf(max_val, scores[k]);
float sum = 0.f;
for (int k = 0; k < S; ++k) {
scores[k] = expf(scores[k] - max_val);
sum += scores[k];
}
for (int k = 0; k < S; ++k) scores[k] /= sum;
// output = softmax * V
float result = 0.f;
for (int k = 0; k < S; ++k) {
result += scores[k] * V[(b * S + k) * H + j];
}
output[(b * S + i) * H + j] = result;
}
extern "C" void rocm_attention(int B, int S, int H,
const float* Q, const float* K, const float* V,
float* Out, hipStream_t stream) {
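    // The kernel keeps each query's score row in a fixed-size local array, so S must be <= 128.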
dim3 blockDim(16, 16);
dim3 gridDim((H + 15) / 16, (S + 15) / 16, B);
float *d_Q, *d_K, *d_V, *d_Out;
size_t size = B * S * H * sizeof(float);
hipMalloc(&d_Q, size);
hipMalloc(&d_K, size);
hipMalloc(&d_V, size);
hipMalloc(&d_Out, size);
hipMemcpyAsync(d_Q, Q, size, hipMemcpyHostToDevice, stream);
hipMemcpyAsync(d_K, K, size, hipMemcpyHostToDevice, stream);
hipMemcpyAsync(d_V, V, size, hipMemcpyHostToDevice, stream);
    float scale = sqrtf((float)H);  // softmax scaling factor sqrt(d_k)
_DotProductAttention<<<gridDim, blockDim, 0, stream>>>(B, S, H, d_Q, d_K, d_V, scale, d_Out);
hipMemcpyAsync(Out, d_Out, size, hipMemcpyDeviceToHost, stream);
hipStreamSynchronize(stream);
hipFree(d_Q); hipFree(d_K); hipFree(d_V); hipFree(d_Out);
return;
}
// BatchNormalization
__global__ void _BatchNormalization(
int N, int C, int H, int W,
const float* X,
const float* gamma,
const float* beta,
const float* mean,
const float* var,
float epsilon,
float* Y) {
// global thread index
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int total = N * C * H * W;
if (idx >= total) return;
    // Decompose the linear index into (n, c, h, w)
int w = idx % W;
int tmp = idx / W;
int h = tmp % H;
tmp = tmp / H;
int c = tmp % C;
int n = tmp / C;
    // Y = gamma[c] * (X - mean[c]) / sqrt(var[c] + eps) + beta[c]
int offset = ((n * C + c) * H + h) * W + w;
float x = X[offset];
float m = mean[c];
float v = var[c];
float inv_std = rsqrtf(v + epsilon);
Y[offset] = gamma[c] * ((x - m) * inv_std) + beta[c];
}
// Host API: rocm_batch_norm
extern "C" void rocm_batch_norm(
int64_t N, int64_t C, int64_t H, int64_t W,
const float* X,
const float* gamma,
const float* beta,
const float* mean,
const float* var,
float epsilon,
float* Y,
hipStream_t stream) {
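    // Inference-mode BatchNormalization: mean and var are the precomputed per-channel statistics.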
size_t total = (size_t)N * C * H * W;
    // Allocate device buffers and copy X, gamma, beta, mean, var to the device
float *d_X, *d_gamma, *d_beta, *d_mean, *d_var, *d_Y;
hipMalloc(&d_X, total * sizeof(float));
hipMalloc(&d_Y, total * sizeof(float));
hipMalloc(&d_gamma, C * sizeof(float));
hipMalloc(&d_beta, C * sizeof(float));
hipMalloc(&d_mean, C * sizeof(float));
hipMalloc(&d_var, C * sizeof(float));
hipMemcpyAsync(d_X, X, total * sizeof(float), hipMemcpyHostToDevice, stream);
hipMemcpyAsync(d_gamma, gamma, C * sizeof(float), hipMemcpyHostToDevice, stream);
hipMemcpyAsync(d_beta, beta, C * sizeof(float), hipMemcpyHostToDevice, stream);
hipMemcpyAsync(d_mean, mean, C * sizeof(float), hipMemcpyHostToDevice, stream);
hipMemcpyAsync(d_var, var, C * sizeof(float), hipMemcpyHostToDevice, stream);
    // Launch the kernel with a 1-D thread layout
int threads = 256;
int blocks = (total + threads - 1) / threads;
_BatchNormalization<<<blocks, threads, 0, stream>>>(
N, C, H, W,
d_X, d_gamma, d_beta, d_mean, d_var,
epsilon,
d_Y);
    // Copy the result back to the host
hipMemcpyAsync(Y, d_Y, total * sizeof(float), hipMemcpyDeviceToHost, stream);
hipStreamSynchronize(stream);
    // Free device memory
hipFree(d_X);
hipFree(d_Y);
hipFree(d_gamma);
hipFree(d_beta);
hipFree(d_mean);
hipFree(d_var);
return;
}
// Cast Operator: float to int32
// Device kernel: cast each element
__global__ void _Cast(
int total,
const float* X,
int* Y) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= total) return;
// cast float to int32
Y[idx] = static_cast<int>(X[idx]);
}
// Host API: rocm_cast
extern "C" void rocm_cast(
int64_t N, int64_t C, int64_t H, int64_t W,
const float* X,
int* Y,
hipStream_t stream) {
// total elements
size_t total = (size_t)N * C * H * W;
// allocate device memory
float* d_X;
int* d_Y;
hipMalloc(&d_X, total * sizeof(float));
hipMalloc(&d_Y, total * sizeof(int));
// copy input to device
hipMemcpyAsync(d_X, X, total * sizeof(float), hipMemcpyHostToDevice, stream);
// launch kernel
int threads = 256;
int blocks = (total + threads - 1) / threads;
_Cast<<<blocks, threads, 0, stream>>>(
total,
d_X,
d_Y);
// copy result back
hipMemcpyAsync(Y, d_Y, total * sizeof(int), hipMemcpyDeviceToHost, stream);
hipStreamSynchronize(stream);
// free device memory
hipFree(d_X);
hipFree(d_Y);
return;
}
extern "C" __global__
void SoftmaxKernel(const float* X, float* Y, int M, int N) {
// M = batch_size, N = feature_size
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= M) return;
const float* x_row = X + row * N;
float* y_row = Y + row * N;
    // 1) Find the row maximum for numerical stability
float m = x_row[0];
for (int j = 1; j < N; ++j) {
m = fmaxf(m, x_row[j]);
}
    // 2) Compute exp(x - m) and accumulate the sum
float sum = 0.f;
for (int j = 0; j < N; ++j) {
float e = expf(x_row[j] - m);
y_row[j] = e;
sum += e;
}
    // 3) Normalize
for (int j = 0; j < N; ++j) {
y_row[j] /= sum;
}
}
// Called by ONNX Runtime; this entry point replaces the original rocm_add
extern "C"
void rocm_softmax(int64_t M, int64_t N,
const float* X, float* Y,
hipStream_t stream) {
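    // X and Y are used directly by the kernel, so they are assumed to already be device pointers
    // (e.g. the buffers handed over by the ONNX Runtime custom-op kernel).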
    // One thread per row, 128 threads per block
const int threads = 128;
const int blocks = static_cast<int>((M + threads - 1) / threads);
hipLaunchKernelGGL(
SoftmaxKernel,
dim3(blocks), dim3(threads),
0, // shared mem
stream, // hip stream
X, Y, static_cast<int>(M), static_cast<int>(N)
);
return;
}
template <typename T>
__global__ void _CeluKernel(const T* X, T* Y, int64_t size, T alpha) {
int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < size) {
T v = X[idx];
T pos = v > T(0) ? v : T(0);
T neg = v <= T(0) ? alpha * (exp(v / alpha) - T(1)) : T(0);
Y[idx] = pos + neg;
}
return;
}
extern "C" void rocm_celu(int64_t size,
const float* X,
float* Y,
float alpha,
hipStream_t stream) {
float *d_X, *d_Y;
hipMalloc(&d_X, size * sizeof(float));
hipMalloc(&d_Y, size * sizeof(float));
hipMemcpyAsync(d_X, X, size * sizeof(float), hipMemcpyHostToDevice, stream);
int threads = 256;
int blocks = (size + threads - 1) / threads;
_CeluKernel<float><<<blocks, threads, 0, stream>>>(d_X, d_Y, size, alpha);
hipMemcpyAsync(Y, d_Y, size * sizeof(float), hipMemcpyDeviceToHost, stream);
hipStreamSynchronize(stream);
hipFree(d_X); hipFree(d_Y);
return;
}
//relu
template <typename T>
__global__ void _rocm_relu_kernel(float* input, float* output, int64_t size) {
int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= size) return;
output[idx] = fmaxf(0.0f, input[idx]);
}
extern "C" void rocm_relu(
int64_t size,
const float* X,
float* Y,
hipStream_t stream
) {
size_t input_size = size * sizeof(float);
float *d_X, *d_Y;
hipMalloc(&d_X, input_size);
hipMalloc(&d_Y, input_size);
hipMemcpyAsync(d_X, X, input_size, hipMemcpyHostToDevice, stream);
int threads = 256;
int blocks = (size + threads - 1) / threads;
_rocm_relu_kernel<float><<<blocks, threads, 0, stream>>>(d_X, d_Y, size);
hipMemcpyAsync(Y, d_Y, input_size, hipMemcpyDeviceToHost, stream);
hipStreamSynchronize(stream);
hipFree(d_X);
hipFree(d_Y);
return;
}
// -------------------------------
// TopK
// -------------------------------
extern "C" __global__
void TopKKernel(
const float* __restrict__ X, // [M * N]
float* __restrict__ values, // [M * K]
int64_t* __restrict__ indices, // [M * K]
int M,
int N,
int K
) {
    // One block per row; shared memory holds that row's running top-K values and indices.
    int row = blockIdx.x;
    if (row >= M) return;
    const float* x_row = X + size_t(row) * N;
    float* v_row = values + size_t(row) * K;
    int64_t* i_row = indices + size_t(row) * K;
    // Dynamic shared memory layout: K floats for the top-K values, followed by K ints for their indices
    extern __shared__ char smem[];
    float* shared_vals = (float*)smem;
    int* shared_idx = (int*)(smem + K * sizeof(float));
    // Initialize: shared_vals = -INF, shared_idx = -1
    for (int t = threadIdx.x; t < K; t += blockDim.x) {
        shared_vals[t] = -INFINITY;
        shared_idx[t] = -1;
    }
    __syncthreads();
    // Thread 0 scans the whole row, maintaining the K largest values seen so far
    // (a linear scan for the current minimum instead of a heap), then sorts them in place.
    if (threadIdx.x == 0) {
        for (int j = 0; j < N; ++j) {
            float v = x_row[j];
            // find the position of the current minimum
            float min_val = shared_vals[0];
            int min_pos = 0;
            for (int t = 1; t < K; ++t) {
                if (shared_vals[t] < min_val) {
                    min_val = shared_vals[t];
                    min_pos = t;
                }
            }
            // replace the minimum if the new value is larger
            if (v > min_val) {
                shared_vals[min_pos] = v;
                shared_idx[min_pos] = j;
            }
        }
        // Simple descending sort of the K elements; K is usually small
        for (int i = 0; i < K; ++i) {
            for (int j = i + 1; j < K; ++j) {
                if (shared_vals[j] > shared_vals[i]) {
                    // swap value
                    float tv = shared_vals[i];
                    shared_vals[i] = shared_vals[j];
                    shared_vals[j] = tv;
                    // swap idx
                    int ti = shared_idx[i];
                    shared_idx[i] = shared_idx[j];
                    shared_idx[j] = ti;
                }
            }
        }
    }
    __syncthreads();
    // Write the results back to global memory
    for (int t = threadIdx.x; t < K; t += blockDim.x) {
        v_row[t] = shared_vals[t];
        i_row[t] = (int64_t)shared_idx[t];
    }
}
extern "C"
void rocm_topk(
int64_t M,
int64_t N,
int64_t K,
const float* X,
float* values,
int64_t* indices,
hipStream_t stream
) {
    // Allocate device memory (pointers start as nullptr so the error path only frees what was allocated)
    size_t sizeX = size_t(M) * N * sizeof(float);
    size_t sizeOutVal = size_t(M) * K * sizeof(float);
    size_t sizeOutIdx = size_t(M) * K * sizeof(int64_t);
    float* d_X = nullptr;
    float* d_vals = nullptr;
    int64_t* d_idx = nullptr;
if (hipMalloc(&d_X, sizeX ) != hipSuccess ||
hipMalloc(&d_vals,sizeOutVal) != hipSuccess ||
hipMalloc(&d_idx, sizeOutIdx) != hipSuccess) {
fprintf(stderr, "HIP malloc failed in rocm_topk\n");
if (d_X) hipFree(d_X);
if (d_vals) hipFree(d_vals);
if (d_idx) hipFree(d_idx);
return;
}
    // Copy the input to the device
    hipMemcpyAsync(d_X, X, sizeX, hipMemcpyHostToDevice, stream);
    // Launch the kernel: one block per row; dynamic shared memory = K * (sizeof(float) + sizeof(int))
    dim3 blockDim(128);
    dim3 gridDim(M);
    size_t shared_bytes = K * (sizeof(float) + sizeof(int));
hipLaunchKernelGGL(
TopKKernel,
gridDim, blockDim, shared_bytes, stream,
d_X, d_vals, d_idx,
int(M), int(N), int(K)
);
    // Copy results back to the host
hipMemcpyAsync(values, d_vals, sizeOutVal, hipMemcpyDeviceToHost, stream);
hipMemcpyAsync(indices, d_idx, sizeOutIdx, hipMemcpyDeviceToHost, stream);
hipStreamSynchronize(stream);
    // Free device memory
hipFree(d_X);
hipFree(d_vals);
hipFree(d_idx);
}
#include "hip/hip_runtime.h"
#include <stdio.h>
#include <math.h>
__global__ void _Add(long long sz, float* Z, const float* X, const float* Y) {
long long offset = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
if (offset < sz) {
Z[offset] = X[offset] + Y[offset];
}
return;
}
void rocm_add(int64_t sz, float* Z, const float* X, const float* Y, hipStream_t compute_stream) {
float *d_X, *d_Y, *d_Z;
    // Allocate device buffers and check each allocation
hipError_t err;
err = hipMalloc((void**)&d_X, sz * sizeof(float));
if (err != hipSuccess) {
fprintf(stderr, "hipMalloc for d_X failed: %s\n", hipGetErrorString(err));
return;
}
err = hipMalloc((void**)&d_Y, sz * sizeof(float));
if (err != hipSuccess) {
fprintf(stderr, "hipMalloc for d_Y failed: %s\n", hipGetErrorString(err));
hipFree(d_X);
return;
}
err = hipMalloc((void**)&d_Z, sz * sizeof(float));
if (err != hipSuccess) {
fprintf(stderr, "hipMalloc for d_Z failed: %s\n", hipGetErrorString(err));
hipFree(d_X);
hipFree(d_Y);
return;
}
    // Copy the host arrays X and Y to the device
err = hipMemcpyAsync(d_X, X, sz * sizeof(float), hipMemcpyHostToDevice, compute_stream);
if (err != hipSuccess) {
fprintf(stderr, "hipMemcpyAsync for d_X failed: %s\n", hipGetErrorString(err));
hipFree(d_X);
hipFree(d_Y);
hipFree(d_Z);
return;
}
err = hipMemcpyAsync(d_Y, Y, sz * sizeof(float), hipMemcpyHostToDevice, compute_stream);
if (err != hipSuccess) {
fprintf(stderr, "hipMemcpyAsync for d_Y failed: %s\n", hipGetErrorString(err));
hipFree(d_X);
hipFree(d_Y);
hipFree(d_Z);
return;
}
    // Launch the kernel with enough blocks to cover all sz elements
    int blocks = static_cast<int>((sz + 255) / 256);
    _Add<<<blocks, 256, 0, compute_stream>>>(static_cast<long long>(sz), d_Z, d_X, d_Y);
err = hipGetLastError();
if (err != hipSuccess) {
fprintf(stderr, "Kernel launch failed: %s\n", hipGetErrorString(err));
hipFree(d_X);
hipFree(d_Y);
hipFree(d_Z);
return;
}
    // Copy the result from the device back to the host
err = hipMemcpyAsync(Z, d_Z, sz * sizeof(float), hipMemcpyDeviceToHost, compute_stream);
if (err != hipSuccess) {
fprintf(stderr, "hipMemcpyAsync for Z failed: %s\n", hipGetErrorString(err));
hipFree(d_X);
hipFree(d_Y);
hipFree(d_Z);
return;
}
    // Synchronize the stream to make sure all operations have completed
err = hipStreamSynchronize(compute_stream);
if (err != hipSuccess) {
fprintf(stderr, "hipStreamSynchronize failed: %s\n", hipGetErrorString(err));
hipFree(d_X);
hipFree(d_Y);
hipFree(d_Z);
return;
}
    // Free device memory
hipFree(d_X);
hipFree(d_Y);
hipFree(d_Z);
}
// Concat
__global__ void _Concat2D(int axis,
int M1, int N1, const float* X1,
int M2, int N2, const float* X2,
float* Z) {
int row = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
int col = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
    if (axis == 0) { // concatenate along rows (axis 0)
if (row < M1 && col < N1) {
Z[row * N1 + col] = X1[row * N1 + col];
} else if (row >= M1 && row < M1 + M2 && col < N2) {
Z[row * N2 + col] = X2[(row - M1) * N2 + col];
}
    } else if (axis == 1) { // concatenate along columns (axis 1)
if (row < M1 && col < N1) {
Z[row * (N1 + N2) + col] = X1[row * N1 + col];
} else if (row < M2 && col >= N1 && col < N1 + N2) {
Z[row * (N1 + N2) + col] = X2[row * N2 + (col - N1)];
}
}
return;
}
void rocm_concat(int axis,
int M1, int N1, const float* X1,
int M2, int N2, const float* X2,
float* Z,
hipStream_t compute_stream) {
    dim3 blockDim(16, 16);
    dim3 gridDim(((axis == 0 ? N1 : N1 + N2) + 15) / 16, ((axis == 0 ? M1 + M2 : M1) + 15) / 16);
float *d_X1, *d_X2, *d_Z;
hipError_t err;
size_t size1 = M1 * N1 * sizeof(float);
size_t size2 = M2 * N2 * sizeof(float);
size_t sizeZ = (axis == 0 ? (M1 + M2) * N1 : M1 * (N1 + N2)) * sizeof(float);
    // Allocate device memory
    err = hipMalloc(&d_X1, size1); if (err != hipSuccess) { fprintf(stderr, "hipMalloc failed in rocm_concat\n"); return; }
err = hipMalloc(&d_X2, size2); if (err != hipSuccess) { hipFree(d_X1); return; }
err = hipMalloc(&d_Z, sizeZ); if (err != hipSuccess) { hipFree(d_X1); hipFree(d_X2); return; }
    // Copy inputs to the device
    hipMemcpyAsync(d_X1, X1, size1, hipMemcpyHostToDevice, compute_stream);
    hipMemcpyAsync(d_X2, X2, size2, hipMemcpyHostToDevice, compute_stream);
    // Launch the kernel
    _Concat2D<<<gridDim, blockDim, 0, compute_stream>>>(axis, M1, N1, d_X1, M2, N2, d_X2, d_Z);
    // Copy the result back to the host
    hipMemcpyAsync(Z, d_Z, sizeZ, hipMemcpyDeviceToHost, compute_stream);
    // Synchronize the stream
    hipStreamSynchronize(compute_stream);
    // Free device memory
hipFree(d_X1); hipFree(d_X2); hipFree(d_Z);
return;
}
//gemm
#include <hip/hip_runtime.h>
__global__ void _Gemm(bool transA, bool transB,
int M, int N, int K,
float alpha,
const float* A,
const float* B,
float beta,
float* C) {
int row = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
int col = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
if (row >= M || col >= N) return;
float sum = 0.0f;
for (int k = 0; k < K; ++k) {
float a = transA ? A[k * M + row] : A[row * K + k];
float b = transB ? B[col * K + k] : B[k * N + col];
sum += a * b;
}
C[row * N + col] = alpha * sum + beta * C[row * N + col];
return;
}
void rocm_gemm(bool transA, bool transB,
int M, int N, int K,
float alpha,
const float* A,
const float* B,
float beta,
float* C,
hipStream_t compute_stream) {
dim3 blockDim(16, 16);
dim3 gridDim((N + 15) / 16, (M + 15) / 16);
    float *d_A = nullptr, *d_B = nullptr, *d_C = nullptr;
    hipError_t err;
    size_t sizeA = transA ? K * M * sizeof(float) : M * K * sizeof(float);
    size_t sizeB = transB ? N * K * sizeof(float) : K * N * sizeof(float);
    size_t sizeC = M * N * sizeof(float);
    // Allocate device memory (pointers start as nullptr so the error path only frees what was allocated)
err = hipMalloc(&d_A, sizeA); if (err != hipSuccess) { goto error; }
err = hipMalloc(&d_B, sizeB); if (err != hipSuccess) { hipFree(d_A); goto error; }
err = hipMalloc(&d_C, sizeC); if (err != hipSuccess) { hipFree(d_A); hipFree(d_B); goto error; }
    // Host -> device copies
    hipMemcpyAsync(d_A, A, sizeA, hipMemcpyHostToDevice, compute_stream);
    hipMemcpyAsync(d_B, B, sizeB, hipMemcpyHostToDevice, compute_stream);
    hipMemcpyAsync(d_C, C, sizeC, hipMemcpyHostToDevice, compute_stream);
    // Launch the kernel
    _Gemm<<<gridDim, blockDim, 0, compute_stream>>>(transA, transB, M, N, K, alpha, d_A, d_B, beta, d_C);
    // Device -> host copy
    hipMemcpyAsync(C, d_C, sizeC, hipMemcpyDeviceToHost, compute_stream);
    // Synchronize the stream
    hipStreamSynchronize(compute_stream);
    // Release device memory
hipFree(d_A);
hipFree(d_B);
hipFree(d_C);
return;
error:
fprintf(stderr, "HIP memory allocation or memcpy failed in rocm_gemm\n");
if (d_A) hipFree(d_A);
if (d_B) hipFree(d_B);
if (d_C) hipFree(d_C);
}
//GroupNormalization
#include <math.h>
__global__ void _GroupNorm(
int64_t N, int64_t C, int64_t H, int64_t W, int64_t G,
float eps, const float* X, float* Y,
const float* gamma, const float* beta
) {
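    // Note: the shared arrays and the tree reduction below assume a power-of-two block size of at
    // most 256 threads (the launch in rocm_group_norm uses 256).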
    // Current group and sample indices
int64_t group_idx = hipBlockIdx_x;
int64_t n = hipBlockIdx_y;
int64_t channels_per_group = C / G;
int64_t c_start = group_idx * channels_per_group;
int64_t c_end = c_start + channels_per_group;
    // Total number of elements in the group
int64_t group_size = channels_per_group * H * W;
    // Shared memory for the sum and sum-of-squares reductions
__shared__ float shared_sum[256];
__shared__ float shared_sum_sq[256];
    // Each thread accumulates a partial sum and sum of squares
float sum = 0.0f, sum_sq = 0.0f;
for (int64_t idx = hipThreadIdx_x; idx < group_size; idx += hipBlockDim_x) {
int64_t c = c_start + idx / (H * W);
int64_t hw = idx % (H * W);
int64_t h = hw / W;
int64_t w = hw % W;
int64_t linear_idx = n * C * H * W + c * H * W + h * W + w;
float val = X[linear_idx];
sum += val;
sum_sq += val * val;
}
shared_sum[hipThreadIdx_x] = sum;
shared_sum_sq[hipThreadIdx_x] = sum_sq;
__syncthreads();
    // Tree reduction for the group-wide sums
for (int s = hipBlockDim_x / 2; s > 0; s >>= 1) {
if (hipThreadIdx_x < s) {
shared_sum[hipThreadIdx_x] += shared_sum[hipThreadIdx_x + s];
shared_sum_sq[hipThreadIdx_x] += shared_sum_sq[hipThreadIdx_x + s];
}
__syncthreads();
}
    // Compute the mean and variance
float mean = shared_sum[0] / group_size;
float var = shared_sum_sq[0] / group_size - mean * mean;
    // Normalize and apply the affine transform
for (int64_t idx = hipThreadIdx_x; idx < group_size; idx += hipBlockDim_x) {
int64_t c = c_start + idx / (H * W);
int64_t hw = idx % (H * W);
int64_t h = hw / W;
int64_t w = hw % W;
int64_t linear_idx = n * C * H * W + c * H * W + h * W + w;
float val = (X[linear_idx] - mean) / sqrtf(var + eps);
Y[linear_idx] = gamma[c] * val + beta[c];
}
return;
}
void rocm_group_norm(
int64_t N, int64_t C, int64_t H, int64_t W, int64_t G,
float eps, const float* X, float* Y,
const float* gamma, const float* beta,
hipStream_t compute_stream
) {
    dim3 block_dim(256);  // 256 threads per block
    dim3 grid_dim(G, N);  // one block per (group, sample) pair
    // Parameter check
if (C % G != 0) {
fprintf(stderr, "Error: Channels must be divisible by groups.\n");
return;
}
    // Allocate device memory
float *d_X, *d_Y, *d_gamma, *d_beta;
hipError_t err;
size_t input_size = N * C * H * W * sizeof(float);
size_t param_size = C * sizeof(float);
err = hipMalloc(&d_X, input_size);
    if (err != hipSuccess) { fprintf(stderr, "hipMalloc failed in rocm_group_norm\n"); return; }
err = hipMalloc(&d_Y, input_size);
if (err != hipSuccess) { hipFree(d_X); return; }
err = hipMalloc(&d_gamma, param_size);
if (err != hipSuccess) { hipFree(d_X); hipFree(d_Y); return; }
err = hipMalloc(&d_beta, param_size);
if (err != hipSuccess) { hipFree(d_X); hipFree(d_Y); hipFree(d_gamma); return; }
    // Copy inputs to the device
    hipMemcpyAsync(d_X, X, input_size, hipMemcpyHostToDevice, compute_stream);
    hipMemcpyAsync(d_gamma, gamma, param_size, hipMemcpyHostToDevice, compute_stream);
    hipMemcpyAsync(d_beta, beta, param_size, hipMemcpyHostToDevice, compute_stream);
    // Launch the kernel: one block per (group, sample) pair, 256 threads per block
    _GroupNorm<<<grid_dim, block_dim, 0, compute_stream>>>(
        N, C, H, W, G, eps, d_X, d_Y, d_gamma, d_beta
    );
    // Copy the result back to the host
    hipMemcpyAsync(Y, d_Y, input_size, hipMemcpyDeviceToHost, compute_stream);
    // Synchronize the stream and release device memory
hipStreamSynchronize(compute_stream);
hipFree(d_X); hipFree(d_Y); hipFree(d_gamma); hipFree(d_beta);
}
//LogSoftmax
__global__ void _LogSoftmax(int64_t N, int64_t D, const float* X, float* Y) {
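    // Note: the shared arrays and reductions below assume a power-of-two block size of at most
    // 256 threads (the launch in rocm_log_softmax uses 256).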
    int64_t n = hipBlockIdx_x;  // one block per sample
    int tid = hipThreadIdx_x;
    // Shared memory for the row maximum and the sum of exponentials
    __shared__ float shared_max[256];
    __shared__ float shared_sum[256];
    // Step 1: find the maximum value within the sample
float max_val = -INFINITY;
for (int64_t i = tid; i < D; i += hipBlockDim_x) {
max_val = fmaxf(max_val, X[n * D + i]);
}
shared_max[tid] = max_val;
__syncthreads();
    // Tree reduction for the row maximum
for (int s = hipBlockDim_x / 2; s > 0; s >>= 1) {
if (tid < s && shared_max[tid + s] > shared_max[tid]) {
shared_max[tid] = shared_max[tid + s];
}
__syncthreads();
}
float global_max = shared_max[0];
    // Step 2: accumulate the sum of exponentials
float exp_sum = 0.0f;
for (int64_t i = tid; i < D; i += hipBlockDim_x) {
exp_sum += expf(X[n * D + i] - global_max);
}
shared_sum[tid] = exp_sum;
__syncthreads();
    // Tree reduction for the row sum
for (int s = hipBlockDim_x / 2; s > 0; s >>= 1) {
if (tid < s) {
shared_sum[tid] += shared_sum[tid + s];
}
__syncthreads();
}
float global_sum = shared_sum[0];
    // Step 3: compute LogSoftmax
for (int64_t i = tid; i < D; i += hipBlockDim_x) {
Y[n * D + i] = (X[n * D + i] - global_max) - logf(global_sum);
}
return;
}
void rocm_log_softmax(
int64_t N, int64_t D, const float* X, float* Y, hipStream_t compute_stream
) {
    dim3 block_dim(256);  // 256 threads per block
    dim3 grid_dim(N);     // one block per sample
    // Allocate device memory
float *d_X, *d_Y;
hipError_t err;
size_t input_size = N * D * sizeof(float);
err = hipMalloc(&d_X, input_size);
    if (err != hipSuccess) { fprintf(stderr, "hipMalloc failed in rocm_log_softmax\n"); return; }
err = hipMalloc(&d_Y, input_size);
if (err != hipSuccess) { hipFree(d_X); return; }
    // Copy the input to the device
    hipMemcpyAsync(d_X, X, input_size, hipMemcpyHostToDevice, compute_stream);
    // Launch the kernel
_LogSoftmax<<<grid_dim, block_dim, 0, compute_stream>>>(N, D, d_X, d_Y);
    // Copy the result back to the host
hipMemcpyAsync(Y, d_Y, input_size, hipMemcpyDeviceToHost, compute_stream);
    // Synchronize the stream and release device memory
hipStreamSynchronize(compute_stream);
hipFree(d_X); hipFree(d_Y);
}
//attention
__global__ void _DotProductAttention(int B, int S, int H,
const float* Q, const float* K, const float* V,
float scaling,
float* output) {
int b = blockIdx.z;
int i = blockIdx.y * blockDim.y + threadIdx.y; // query index
int j = blockIdx.x * blockDim.x + threadIdx.x; // hidden dim
if (b >= B || i >= S || j >= H) return;
    // Compute the scaled dot-product scores Q·K^T for this query row
    float scores[128];  // assumes seq_len S <= 128
for (int k = 0; k < S; ++k) {
float dot = 0.f;
for (int h = 0; h < H; ++h) {
dot += Q[(b * S + i) * H + h] * K[(b * S + k) * H + h];
}
scores[k] = dot / scaling;
}
// softmax over scores
float max_val = scores[0];
for (int k = 1; k < S; ++k) max_val = fmaxf(max_val, scores[k]);
float sum = 0.f;
for (int k = 0; k < S; ++k) {
scores[k] = expf(scores[k] - max_val);
sum += scores[k];
}
for (int k = 0; k < S; ++k) scores[k] /= sum;
// output = softmax * V
float result = 0.f;
for (int k = 0; k < S; ++k) {
result += scores[k] * V[(b * S + k) * H + j];
}
output[(b * S + i) * H + j] = result;
}
extern "C" void rocm_attention(int B, int S, int H,
const float* Q, const float* K, const float* V,
float* Out, hipStream_t stream) {
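    // The kernel keeps each query's score row in a fixed-size local array, so S must be <= 128.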
dim3 blockDim(16, 16);
dim3 gridDim((H + 15) / 16, (S + 15) / 16, B);
float *d_Q, *d_K, *d_V, *d_Out;
size_t size = B * S * H * sizeof(float);
hipMalloc(&d_Q, size);
hipMalloc(&d_K, size);
hipMalloc(&d_V, size);
hipMalloc(&d_Out, size);
hipMemcpyAsync(d_Q, Q, size, hipMemcpyHostToDevice, stream);
hipMemcpyAsync(d_K, K, size, hipMemcpyHostToDevice, stream);
hipMemcpyAsync(d_V, V, size, hipMemcpyHostToDevice, stream);
    float scale = sqrtf((float)H);  // softmax scaling factor sqrt(d_k)
_DotProductAttention<<<gridDim, blockDim, 0, stream>>>(B, S, H, d_Q, d_K, d_V, scale, d_Out);
hipMemcpyAsync(Out, d_Out, size, hipMemcpyDeviceToHost, stream);
hipStreamSynchronize(stream);
hipFree(d_Q); hipFree(d_K); hipFree(d_V); hipFree(d_Out);
return;
}
// BatchNormalization
__global__ void _BatchNormalization(
int N, int C, int H, int W,
const float* X,
const float* gamma,
const float* beta,
const float* mean,
const float* var,
float epsilon,
float* Y) {
// global thread index
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int total = N * C * H * W;
if (idx >= total) return;
    // Decompose the linear index into (n, c, h, w)
int w = idx % W;
int tmp = idx / W;
int h = tmp % H;
tmp = tmp / H;
int c = tmp % C;
int n = tmp / C;
    // Y = gamma[c] * (X - mean[c]) / sqrt(var[c] + eps) + beta[c]
int offset = ((n * C + c) * H + h) * W + w;
float x = X[offset];
float m = mean[c];
float v = var[c];
float inv_std = rsqrtf(v + epsilon);
Y[offset] = gamma[c] * ((x - m) * inv_std) + beta[c];
}
// Host API: rocm_batch_norm
extern "C" void rocm_batch_norm(
int64_t N, int64_t C, int64_t H, int64_t W,
const float* X,
const float* gamma,
const float* beta,
const float* mean,
const float* var,
float epsilon,
float* Y,
hipStream_t stream) {
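    // Inference-mode BatchNormalization: mean and var are the precomputed per-channel statistics.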
size_t total = (size_t)N * C * H * W;
    // Allocate device buffers and copy X, gamma, beta, mean, var to the device
float *d_X, *d_gamma, *d_beta, *d_mean, *d_var, *d_Y;
hipMalloc(&d_X, total * sizeof(float));
hipMalloc(&d_Y, total * sizeof(float));
hipMalloc(&d_gamma, C * sizeof(float));
hipMalloc(&d_beta, C * sizeof(float));
hipMalloc(&d_mean, C * sizeof(float));
hipMalloc(&d_var, C * sizeof(float));
hipMemcpyAsync(d_X, X, total * sizeof(float), hipMemcpyHostToDevice, stream);
hipMemcpyAsync(d_gamma, gamma, C * sizeof(float), hipMemcpyHostToDevice, stream);
hipMemcpyAsync(d_beta, beta, C * sizeof(float), hipMemcpyHostToDevice, stream);
hipMemcpyAsync(d_mean, mean, C * sizeof(float), hipMemcpyHostToDevice, stream);
hipMemcpyAsync(d_var, var, C * sizeof(float), hipMemcpyHostToDevice, stream);
    // Launch the kernel with a 1-D thread layout
int threads = 256;
int blocks = (total + threads - 1) / threads;
_BatchNormalization<<<blocks, threads, 0, stream>>>(
N, C, H, W,
d_X, d_gamma, d_beta, d_mean, d_var,
epsilon,
d_Y);
    // Copy the result back to the host
hipMemcpyAsync(Y, d_Y, total * sizeof(float), hipMemcpyDeviceToHost, stream);
hipStreamSynchronize(stream);
    // Free device memory
hipFree(d_X);
hipFree(d_Y);
hipFree(d_gamma);
hipFree(d_beta);
hipFree(d_mean);
hipFree(d_var);
return;
}
// Cast Operator: float to int32
// Device kernel: cast each element
__global__ void _Cast(
int total,
const float* X,
int* Y) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= total) return;
// cast float to int32
Y[idx] = static_cast<int>(X[idx]);
}
// Host API: rocm_cast
extern "C" void rocm_cast(
int64_t N, int64_t C, int64_t H, int64_t W,
const float* X,
int* Y,
hipStream_t stream) {
// total elements
size_t total = (size_t)N * C * H * W;
// allocate device memory
float* d_X;
int* d_Y;
hipMalloc(&d_X, total * sizeof(float));
hipMalloc(&d_Y, total * sizeof(int));
// copy input to device
hipMemcpyAsync(d_X, X, total * sizeof(float), hipMemcpyHostToDevice, stream);
// launch kernel
int threads = 256;
int blocks = (total + threads - 1) / threads;
_Cast<<<blocks, threads, 0, stream>>>(
total,
d_X,
d_Y);
// copy result back
hipMemcpyAsync(Y, d_Y, total * sizeof(int), hipMemcpyDeviceToHost, stream);
hipStreamSynchronize(stream);
// free device memory
hipFree(d_X);
hipFree(d_Y);
return;
}
extern "C" __global__
void SoftmaxKernel(const float* X, float* Y, int M, int N) {
// M = batch_size, N = feature_size
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= M) return;
const float* x_row = X + row * N;
float* y_row = Y + row * N;
    // 1) Find the row maximum for numerical stability
float m = x_row[0];
for (int j = 1; j < N; ++j) {
m = fmaxf(m, x_row[j]);
}
    // 2) Compute exp(x - m) and accumulate the sum
float sum = 0.f;
for (int j = 0; j < N; ++j) {
float e = expf(x_row[j] - m);
y_row[j] = e;
sum += e;
}
    // 3) Normalize
for (int j = 0; j < N; ++j) {
y_row[j] /= sum;
}
}
// Called by ONNX Runtime; this entry point replaces the original rocm_add
extern "C"
void rocm_softmax(int64_t M, int64_t N,
const float* X, float* Y,
hipStream_t stream) {
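    // X and Y are used directly by the kernel, so they are assumed to already be device pointers
    // (e.g. the buffers handed over by the ONNX Runtime custom-op kernel).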
    // One thread per row, 128 threads per block
const int threads = 128;
const int blocks = static_cast<int>((M + threads - 1) / threads);
hipLaunchKernelGGL(
SoftmaxKernel,
dim3(blocks), dim3(threads),
0, // shared mem
stream, // hip stream
X, Y, static_cast<int>(M), static_cast<int>(N)
);
return;
}
template <typename T>
__global__ void _CeluKernel(const T* X, T* Y, int64_t size, T alpha) {
int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < size) {
T v = X[idx];
T pos = v > T(0) ? v : T(0);
T neg = v <= T(0) ? alpha * (exp(v / alpha) - T(1)) : T(0);
Y[idx] = pos + neg;
}
return;
}
extern "C" void rocm_celu(int64_t size,
const float* X,
float* Y,
float alpha,
hipStream_t stream) {
float *d_X, *d_Y;
hipMalloc(&d_X, size * sizeof(float));
hipMalloc(&d_Y, size * sizeof(float));
hipMemcpyAsync(d_X, X, size * sizeof(float), hipMemcpyHostToDevice, stream);
int threads = 256;
int blocks = (size + threads - 1) / threads;
_CeluKernel<float><<<blocks, threads, 0, stream>>>(d_X, d_Y, size, alpha);
hipMemcpyAsync(Y, d_Y, size * sizeof(float), hipMemcpyDeviceToHost, stream);
hipStreamSynchronize(stream);
hipFree(d_X); hipFree(d_Y);
return;
}
//relu
template <typename T>
__global__ void _rocm_relu_kernel(float* input, float* output, int64_t size) {
int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= size) return;
output[idx] = fmaxf(0.0f, input[idx]);
}
extern "C" void rocm_relu(
int64_t size,
const float* X,
float* Y,
hipStream_t stream
) {
size_t input_size = size * sizeof(float);
float *d_X, *d_Y;
hipMalloc(&d_X, input_size);
hipMalloc(&d_Y, input_size);
hipMemcpyAsync(d_X, X, input_size, hipMemcpyHostToDevice, stream);
int threads = 256;
int blocks = (size + threads - 1) / threads;
_rocm_relu_kernel<float><<<blocks, threads, 0, stream>>>(d_X, d_Y, size);
hipMemcpyAsync(Y, d_Y, input_size, hipMemcpyDeviceToHost, stream);
hipStreamSynchronize(stream);
hipFree(d_X);
hipFree(d_Y);
return;
}
// -------------------------------
// TopK
// -------------------------------
extern "C"
__global__ void TopKKernel(
const float* __restrict__ X, // [M * N]
float* __restrict__ values, // [M * K]
int64_t* __restrict__ indices, // [M * K]
int M,
int N,
int K
) {
    // One block per row; shared memory holds that row's running top-K values and indices.
    int row = blockIdx.x;
    if (row >= M) return;
    const float* x_row = X + size_t(row) * N;
    float* v_row = values + size_t(row) * K;
    int64_t* i_row = indices + size_t(row) * K;
    // Dynamic shared memory layout: K floats for the top-K values, followed by K ints for their indices
    extern __shared__ char smem[];
    float* shared_vals = (float*)smem;
    int* shared_idx = (int*)(smem + K * sizeof(float));
    // Initialize: shared_vals = -INF, shared_idx = -1
    for (int t = threadIdx.x; t < K; t += blockDim.x) {
        shared_vals[t] = -INFINITY;
        shared_idx[t] = -1;
    }
    __syncthreads();
    // Thread 0 scans the whole row, maintaining the K largest values seen so far
    // (a linear scan for the current minimum instead of a heap), then sorts them in place.
    if (threadIdx.x == 0) {
        for (int j = 0; j < N; ++j) {
            float v = x_row[j];
            // find the position of the current minimum
            float min_val = shared_vals[0];
            int min_pos = 0;
            for (int t = 1; t < K; ++t) {
                if (shared_vals[t] < min_val) {
                    min_val = shared_vals[t];
                    min_pos = t;
                }
            }
            // replace the minimum if the new value is larger
            if (v > min_val) {
                shared_vals[min_pos] = v;
                shared_idx[min_pos] = j;
            }
        }
        // Simple descending sort of the K elements; K is usually small
        for (int i = 0; i < K; ++i) {
            for (int j = i + 1; j < K; ++j) {
                if (shared_vals[j] > shared_vals[i]) {
                    // swap value
                    float tv = shared_vals[i];
                    shared_vals[i] = shared_vals[j];
                    shared_vals[j] = tv;
                    // swap idx
                    int ti = shared_idx[i];
                    shared_idx[i] = shared_idx[j];
                    shared_idx[j] = ti;
                }
            }
        }
    }
    __syncthreads();
    // Write the results back to global memory
    for (int t = threadIdx.x; t < K; t += blockDim.x) {
        v_row[t] = shared_vals[t];
        i_row[t] = (int64_t)shared_idx[t];
    }
}
extern "C"
void rocm_topk(
int64_t M,
int64_t N,
int64_t K,
const float* X,
float* values,
int64_t* indices,
hipStream_t stream
) {
    // Allocate device memory (pointers start as nullptr so the error path only frees what was allocated)
    size_t sizeX = size_t(M) * N * sizeof(float);
    size_t sizeOutVal = size_t(M) * K * sizeof(float);
    size_t sizeOutIdx = size_t(M) * K * sizeof(int64_t);
    float* d_X = nullptr;
    float* d_vals = nullptr;
    int64_t* d_idx = nullptr;
if (hipMalloc(&d_X, sizeX ) != hipSuccess ||
hipMalloc(&d_vals,sizeOutVal) != hipSuccess ||
hipMalloc(&d_idx, sizeOutIdx) != hipSuccess) {
fprintf(stderr, "HIP malloc failed in rocm_topk\n");
if (d_X) hipFree(d_X);
if (d_vals) hipFree(d_vals);
if (d_idx) hipFree(d_idx);
return;
}
    // Copy the input to the device
    hipMemcpyAsync(d_X, X, sizeX, hipMemcpyHostToDevice, stream);
    // Launch the kernel: one block per row; dynamic shared memory = K * (sizeof(float) + sizeof(int))
    dim3 blockDim(128);
    dim3 gridDim(M);
    size_t shared_bytes = K * (sizeof(float) + sizeof(int));
hipLaunchKernelGGL(
TopKKernel,
gridDim, blockDim, shared_bytes, stream,
d_X, d_vals, d_idx,
int(M), int(N), int(K)
);
    // Copy results back to the host
hipMemcpyAsync(values, d_vals, sizeOutVal, hipMemcpyDeviceToHost, stream);
hipMemcpyAsync(indices, d_idx, sizeOutIdx, hipMemcpyDeviceToHost, stream);
hipStreamSynchronize(stream);
    // Free device memory
hipFree(d_X);
hipFree(d_vals);
hipFree(d_idx);
}
//ReduceLogSum
template <typename T>
__global__ void _rocm_reduce_log_sum_kernel(
const T* input,
T* output,
int64_t N,
int64_t C,
int64_t H,
int64_t W,
int64_t axis,
bool keep_dims
) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    // Number of output elements: the reduced axis collapses to length 1.
    // keep_dims only changes the logical rank of the output, not the element count.
    int64_t total;
    switch (axis) {
        case 0: total = C * H * W; break;
        case 1: total = N * H * W; break;
        case 2: total = N * C * W; break;
        case 3: total = N * C * H; break;
        default: total = N * C * H * W; break;
    }
    if (idx >= total) return;
    // Decompose the output index into (n, c, h, w); the reduced coordinate stays 0
    int64_t out_n = 0, out_c = 0, out_h = 0, out_w = 0;
    switch (axis) {
        case 0: {
            out_c = idx / (H * W);
            int64_t rem = idx % (H * W);
            out_h = rem / W;
            out_w = rem % W;
            break;
        }
        case 1: {
            out_n = idx / (H * W);
            int64_t rem = idx % (H * W);
            out_h = rem / W;
            out_w = rem % W;
            break;
        }
        case 2: {
            out_n = idx / (C * W);
            int64_t rem = idx % (C * W);
            out_c = rem / W;
            out_w = rem % W;
            break;
        }
        case 3: {
            out_n = idx / (C * H);
            int64_t rem = idx % (C * H);
            out_c = rem / H;
            out_h = rem % H;
            break;
        }
        default: {
            out_n = idx / (C * H * W);
            int64_t rem = idx % (C * H * W);
            out_c = rem / (H * W);
            rem %= H * W;
            out_h = rem / W;
            out_w = rem % W;
            break;
        }
    }
    // Reduction range: start at (out_n, out_c, out_h, out_w) and step along the reduced axis
    int64_t start = out_n * C * H * W + out_c * H * W + out_h * W + out_w;
    int64_t step, count;
    switch (axis) {
        case 0: step = C * H * W; count = N; break;
        case 1: step = H * W;     count = C; break;
        case 2: step = W;         count = H; break;
        case 3: step = 1;         count = W; break;
        default: step = 1;        count = 1; break;
    }
    int64_t end = start + step * count;
    // Sum along the reduced axis
    T sum = T(0);
    for (int64_t i = start; i < end; i += step) {
        sum += input[i];
    }
    // Take the natural logarithm of the sum
    output[idx] = log(sum);
}
extern "C" void rocm_reduce_log_sum(
int64_t N,
int64_t C,
int64_t H,
int64_t W,
const float* X,
float* Y,
int64_t axis,
bool keep_dims,
hipStream_t stream
) {
    // Output shape: the reduced axis has length 1 (keep_dims only affects the logical rank)
    int64_t out_N = (axis == 0) ? 1 : N;
    int64_t out_C = (axis == 1) ? 1 : C;
    int64_t out_H = (axis == 2) ? 1 : H;
    int64_t out_W = (axis == 3) ? 1 : W;
size_t input_size = N * C * H * W * sizeof(float);
size_t output_size = out_N * out_C * out_H * out_W * sizeof(float);
    // Allocate device memory
float *d_X, *d_Y;
hipMalloc(&d_X, input_size);
hipMalloc(&d_Y, output_size);
    // Asynchronously copy the input to the device
hipMemcpyAsync(d_X, X, input_size, hipMemcpyHostToDevice, stream);
    // Kernel launch configuration
int total_threads = out_N * out_C * out_H * out_W;
int block_size = 256;
int grid_size = (total_threads + block_size - 1) / block_size;
    // Launch the kernel
_rocm_reduce_log_sum_kernel<float><<<grid_size, block_size, 0, stream>>>(
d_X, d_Y, N, C, H, W, axis, keep_dims
);
    // Asynchronously copy the result back to the host
hipMemcpyAsync(Y, d_Y, output_size, hipMemcpyDeviceToHost, stream);
hipStreamSynchronize(stream);
    // Free device memory
hipFree(d_X);
hipFree(d_Y);
return;
}
__global__ void _RoiAlignKernel(
const float* X, int N, int C, int H, int W,
const float* rois, const int64_t* batch_inds,
int num_rois, int out_h, int out_w,
int sampling_ratio, float spatial_scale,
float* Y) {
int rid = blockIdx.x; // ROI index
int c = blockIdx.y; // channel
int ph = threadIdx.y; // pooled y
int pw = threadIdx.x; // pooled x
if (rid >= num_rois || c >= C || ph >= out_h || pw >= out_w) return;
    // Read the ROI box and scale it to feature-map coordinates
const float* roi_ptr = rois + rid * 4;
float x1 = roi_ptr[0] * spatial_scale;
float y1 = roi_ptr[1] * spatial_scale;
float x2 = roi_ptr[2] * spatial_scale;
float y2 = roi_ptr[3] * spatial_scale;
int batch_id = static_cast<int>(batch_inds[rid]);
float roi_w = max(x2 - x1, 1.0f);
float roi_h = max(y2 - y1, 1.0f);
float bin_w = roi_w / static_cast<float>(out_w);
float bin_h = roi_h / static_cast<float>(out_h);
int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : ceilf(roi_h / out_h);
int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceilf(roi_w / out_w);
float count = static_cast<float>(roi_bin_grid_h * roi_bin_grid_w);
float sum = 0.0f;
    // Average the sampled values within each bin
for (int iy = 0; iy < roi_bin_grid_h; ++iy) {
float y = y1 + ph * bin_h + (iy + 0.5f) * bin_h / roi_bin_grid_h;
for (int ix = 0; ix < roi_bin_grid_w; ++ix) {
float x = x1 + pw * bin_w + (ix + 0.5f) * bin_w / roi_bin_grid_w;
    // Bilinear interpolation of the four neighboring pixels
int x0 = floorf(x), x1i = min(x0 + 1, W - 1);
int y0 = floorf(y), y1i = min(y0 + 1, H - 1);
float lx = x - x0, ly = y - y0;
const float* fmap = X + (batch_id * C + c) * H * W;
float v00 = fmap[y0 * W + x0];
float v01 = fmap[y0 * W + x1i];
float v10 = fmap[y1i * W + x0];
float v11 = fmap[y1i * W + x1i];
float w00 = (1 - lx) * (1 - ly);
float w01 = lx * (1 - ly);
float w10 = (1 - lx) * ly;
float w11 = lx * ly;
sum += v00 * w00 + v01 * w01 + v10 * w10 + v11 * w11;
}
}
float* out_ptr = Y + ((rid * C + c) * out_h + ph) * out_w + pw;
*out_ptr = sum / count;
}
extern "C" void rocm_roi_align(
const float* X, int64_t N, int64_t C, int64_t H, int64_t W,
const float* rois, const int64_t* batch_inds,
int64_t num_rois, int64_t out_h, int64_t out_w,
int64_t sampling_ratio, float spatial_scale,
float* Y, hipStream_t stream) {
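    // Note: unlike the wrappers above, X, rois, batch_inds and Y are passed straight to the kernel,
    // so they are assumed to already reside in device memory. The launch below also assumes
    // out_w * out_h does not exceed the per-block thread limit.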
dim3 grid(num_rois, C);
dim3 block(out_w, out_h);
_RoiAlignKernel<<<grid, block, 0, stream>>>(
X, N, C, H, W,
rois, batch_inds,
num_rois, out_h, out_w,
sampling_ratio, spatial_scale,
Y);
hipStreamSynchronize(stream);
}
// LeakyReLU kernel
__global__ void _LeakyReLUKernel(const float* X, float* Y, int64_t size, float alpha) {
int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= size) return;
float v = X[idx];
Y[idx] = (v >= 0.0f) ? v : alpha * v;
}
extern "C" void rocm_leaky_relu(
int64_t size,
const float* d_X,
float* d_Y,
float alpha,
hipStream_t stream) {
const int threads = 256;
int blocks = static_cast<int>((size + threads - 1) / threads);
_LeakyReLUKernel<<<blocks, threads, 0, stream>>>(d_X, d_Y, size, alpha);
}
//Conv
__global__ void _Conv2dKernel(const float* input,
const float* weight,
const float* bias,
float* output,
int N, int C_in, int H, int W,
int C_out, int K_h, int K_w,
int out_H, int out_W) {
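    // Naive direct convolution: stride 1, no padding. One block per (sample, output channel)
    // and one thread per output pixel, so out_H * out_W must fit within a single thread block.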
int n = blockIdx.x;
int oc = blockIdx.y;
int oh = threadIdx.y;
int ow = threadIdx.x;
if (oh >= out_H || ow >= out_W) return;
float sum = bias[oc];
for (int ic = 0; ic < C_in; ++ic) {
for (int kh = 0; kh < K_h; ++kh) {
for (int kw = 0; kw < K_w; ++kw) {
int ih = oh + kh;
int iw = ow + kw;
float val = input[n * (C_in * H * W) + ic * (H * W) + ih * W + iw];
float w = weight[oc * (C_in * K_h * K_w) + ic * (K_h * K_w) + kh * K_w + kw];
sum += val * w;
}
}
}
output[n * (C_out * out_H * out_W) + oc * (out_H * out_W) + oh * out_W + ow] = sum;
}
extern "C" void rocm_conv2d(const float* input,
const float* weight,
const float* bias,
float* output,
int N, int C_in, int H, int W,
int C_out, int K_h, int K_w,
int out_H, int out_W,
hipStream_t stream) {
dim3 blocks(N, C_out);
dim3 threads(out_W, out_H);
_Conv2dKernel<<<blocks, threads, 0, stream>>>(
input, weight, bias, output,
N, C_in, H, W, C_out, K_h, K_w, out_H, out_W);
hipError_t err = hipGetLastError();
if (err != hipSuccess) {
fprintf(stderr, "Conv2D kernel launch failed: %s\n", hipGetErrorString(err));
}
}