Commit b3e3ac37 authored by ghfund4_b52

Update custom_op_library.cc.o, rocm_ops.hip, rocm_ops.hip.o, libcustom_op_library.so, rocm_ops.cc.o, compile_rdc.sh, custom_op_library.h, benchmark.py, compile.sh, custom_op_library.cc, cuda_utils.py, fp16.py, rocm_ops.h, node_utils.py, rocm_ops.cc, readme.md, rocm_ops.hip.cpp files
import onnxruntime as ort
import numpy as np
import os
import sys
import onnx
from onnx import numpy_helper
from node_utils import INPUT_TYPE_TO_NP_TYPE_MAP
from cuda_utils import set_batchsize, model_setbs
from scipy import spatial
import argparse
import time
def save_input_output_data(save_path, data_dict, isInput=True):
if not os.path.isdir(save_path):
os.makedirs(save_path)
keys = list(data_dict.keys())
data_prefix = 'input'
if not isInput:
data_prefix = 'output'
for j in range(len(data_dict)):
with open(os.path.join(save_path, '{}_{}.pb'.format(data_prefix, j)), 'wb') as f:
f.write(numpy_helper.from_array(
data_dict[keys[j]], keys[j]).SerializeToString())
def load_pb_data(pb_path):
with open(pb_path, 'rb') as f:
input_content = f.read()
tensor = onnx.TensorProto()
tensor.ParseFromString(input_content)
f.close()
return numpy_helper.to_array(tensor)
def get_cosine(gpu_array, cpu_array):
gpu_array = gpu_array.astype(np.float64)
cpu_array = cpu_array.astype(np.float64)
gpu_array = gpu_array.reshape([-1])
cpu_array = cpu_array.reshape([-1])
cosine = spatial.distance.cosine(cpu_array, gpu_array)
return cosine
def get_snr(gpu_array, cpu_array):
cpu_array = cpu_array.astype(np.float64)
gpu_array = gpu_array.astype(np.float64)
diff_array = cpu_array - gpu_array
x = diff_array * diff_array
x = np.sum(x)
y = cpu_array * cpu_array
y = np.sum(y)
snr = (x) / (y + 1e-7)
snr = np.mean(snr)
return snr
def accuracy_check_run(args):
EP_List = ['ROCMExecutionProvider']
model = onnx.load(args.input_model)
model = set_batchsize(model, args.batchsize)
so = ort.SessionOptions()
so.enable_profiling = True
so.register_custom_ops_library("/public/home/kj_gauss/All_test/libcustom_op_library.so")
so.intra_op_num_threads = 1
so.inter_op_num_threads = 1
cuda_session = ort.InferenceSession(
model.SerializeToString(), sess_options=so, providers=EP_List
)
#cuda_session = ort.InferenceSession(model.SerializeToString(), providers=EP_List)
inputs = cuda_session.get_inputs()
outputs = cuda_session.get_outputs()
file_list = os.listdir(args.datapath)
input_list = []
output_list = []
for file in file_list:
if file[:5] == 'input':
input_list.append(file)
elif file[:6] == 'output':
output_list.append(file)
input_traits = [int(i[6:-3]) for i in input_list]
input_traits = sorted(input_traits)
input_list = [os.path.join(args.datapath, "input_{}.pb".format(i)) for i in input_traits]
output_traits = [int(i[7:-3]) for i in output_list]
output_traits = sorted(output_traits)
output_list = [os.path.join(args.datapath, "output_{}.pb".format(i)) for i in output_traits]
input_dict = {}
for input, input_file in zip(inputs, input_list):
input_dict[input.name] = load_pb_data(input_file)
if input_dict[input.name].shape[0] != args.batchsize:
print("Batchsize error! input data batchsize is {} but your input batchsize is {}, Please fix!".format(input_dict[input_file[0].name].shape[0], args.batchsize))
sys.exit()
gt_dict = {}
for output, gt_file in zip(outputs, output_list):
gt_dict[output.name] = load_pb_data(gt_file)
output_names = [x.name for x in cuda_session.get_outputs()]
output_data = cuda_session.run(output_names, input_dict)
for idx, output_name in enumerate(output_names):
print("output {}".format(output_name))
print("SNR IS : {}".format(get_snr(gt_dict[output_name], output_data[idx])))
print("COSINE IS : {}\n".format(get_cosine(gt_dict[output_name], output_data[idx])))
def generate_golden_data_run(args):
import os
import time
import onnx
import onnxruntime as ort
import numpy as np
from onnx import numpy_helper
from node_utils import INPUT_TYPE_TO_NP_TYPE_MAP
def save_input_output_data(save_path, data_dict, isInput=True):
if not os.path.isdir(save_path):
os.makedirs(save_path)
prefix = 'input' if isInput else 'output'
for idx, (name, data) in enumerate(data_dict.items()):
with open(os.path.join(save_path, f'{prefix}_{idx}.pb'), 'wb') as f:
f.write(numpy_helper.from_array(data, name).SerializeToString())
model = onnx.load(args.input_model)
orig_shapes = {}
for vi in model.graph.input:
dims = []
for d in vi.type.tensor_type.shape.dim:
dims.append(d.dim_value if d.dim_value > 0 else None)
orig_shapes[vi.name] = dims
so = ort.SessionOptions()
so.register_custom_ops_library("libcustom_op_library.so")
so.intra_op_num_threads = 1
so.inter_op_num_threads = 1
providers = ['ROCMExecutionProvider']
t0 = time.time()
session = ort.InferenceSession(model.SerializeToString(),
sess_options=so,
providers=providers)
t1 = time.time()
print(f"Initialize ROCM session cost {(t1-t0)*1000:.2f} ms")
input_dict = {}
for inp in session.get_inputs():
name = inp.name
dtype_str = inp.type
shape = []
for d in orig_shapes[name]:
if d is None:
shape.append(args.batchsize)
else:
shape.append(d)
print(f"[INFO] {name} <- shape {shape}, type={dtype_str}")
data = np.random.rand(*shape)
if 'uint8' in dtype_str:
data = data * 255
elif 'int8' in dtype_str:
data = data * 255 - 128
data = data.astype(INPUT_TYPE_TO_NP_TYPE_MAP[dtype_str])
input_dict[name] = data
if args.saveIOdata == 1:
save_input_output_data(args.datapath, input_dict, isInput=True)
output_names = [o.name for o in session.get_outputs()]
# 6. Warm-up
if args.warmup > 0:
for _ in range(args.warmup):
session.run(output_names, input_dict)
t_start = time.time()
for _ in range(args.runnum):
outputs = session.run(output_names, input_dict)
t_end = time.time()
latency_ms = (t_end - t_start) * 1000 / (args.runnum * args.batchsize)
print(f"Inference cost per sample: {latency_ms:.3f} ms | FPS: {1000/latency_ms:.2f}")
if args.saveIOdata == 1:
out_dict = {n: o for n, o in zip(output_names, outputs)}
save_input_output_data(args.datapath, out_dict, isInput=False)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input_model",
type=str,
required=True,
default="",
help="input model file")
parser.add_argument("-b", "--batchsize",
type=int,
required=False,
default=1,
help="batchsize")
parser.add_argument("-c", "--checkresult",
type=bool,
required=False,
default=False,
help="check output accuracy")
parser.add_argument("-d", "--datapath",
type=str,
required=True,
help="data path for saving golden data or checking output accuracy")
parser.add_argument("-w", "--warmup",
type=int,
required=False,
default=50,
help="input warm up iterations")
parser.add_argument("-n", "--runnum",
type=int,
required=False,
default=100,
help="input run model iterations")
parser.add_argument("-s", "--inputshape",
type=int,
required=False,
default=-1,
help="bert input shape")
parser.add_argument("-t", "--saveIOdata",
type=int,
required=False,
default=1,
help="save golden data")
ARGS = parser.parse_args()
if ARGS.checkresult:
accuracy_check_run(ARGS)
else:
generate_golden_data_run(ARGS)
/opt/dtk/hip/bin/hipcc --offload-arch=gfx906 -I/opt/dtk-25.04/include -fPIC -x hip -o rocm_ops.hip.o -c rocm_ops.hip.cpp
/usr/bin/c++ -DUSE_ROCM=1 -I ./include/onnxruntime/ -fPIC "-D__HIP_PLATFORM_AMD__=1 -D__HIP_PLATFORM_HCC__=1" -o rocm_ops.cc.o -c rocm_ops.cc
/usr/bin/c++ -I./include/onnxruntime/ -fPIC -o custom_op_library.cc.o -c custom_op_library.cc
/opt/dtk/llvm/bin/clang++ -fPIC -shared -Wl,-soname,libcustom_op_library.so -o libcustom_op_library.so rocm_ops.hip.o custom_op_library.cc.o rocm_ops.cc.o -L/opt/dtk/lib -Wl,-rpath,/opt/dtk/lib:/opt/dtk/hip/lib /opt/dtk/hip/lib/libgalaxyhip.so.5.2.25085.1211-205b0686 /opt/dtk/llvm/lib/clang/15.0.0/lib/linux/libclang_rt.builtins-x86_64.a -lstdc++ -lm -lgcc_s -lgcc -lc -lgcc_s -lgcc
/opt/dtk/hip/bin/hipcc \
--offload-arch=gfx906 \
-shared \
-o libcustom_op_library.so \
rocm_ops.hip.cpp custom_op_library.cc.o rocm_ops.cc.o \
-L/opt/dtk/lib -L/opt/dtk/hip/lib \
-Wl,-rpath,/opt/dtk/lib:/opt/dtk/hip/lib \
-lgalaxyhip \
/opt/dtk/llvm/lib/clang/15.0.0/lib/linux/libclang_rt.builtins-x86_64.a \
-lstdc++ -lm -lgcc_s -lgcc -lc
import onnxruntime as ort
import onnx
import numpy as np
import time
from node_utils import node_utils, INPUT_TYPE_TO_NP_TYPE_MAP
import sys
def set_batchsize(model, batchSize):
for node in model.graph.node:
if node.op_type in ['Reshape', 'Split', 'Transpose']:
return model
del model.graph.value_info[:]
for input in model.graph.input:
if len(input.type.tensor_type.shape.dim) > 1:
input.type.tensor_type.shape.dim[0].dim_value = batchSize
for output in model.graph.output:
if len(output.type.tensor_type.shape.dim) > 1:
output.type.tensor_type.shape.dim[0].dim_value = batchSize
return model
def model_setbs(model, batchSize):
del model.graph.value_info[:]
for input in model.graph.input:
if len(input.type.tensor_type.shape.dim) > 1:
input.type.tensor_type.shape.dim[0].dim_value = batchSize
for output in model.graph.output:
if len(output.type.tensor_type.shape.dim) > 1:
output.type.tensor_type.shape.dim[0].dim_value = batchSize
return model
def model_run(modelPath, batchSize=None):
model = onnx.load(modelPath)
if batchSize is not None:
model = set_batchsize(model, batchSize)
session_options = ort.SessionOptions()
session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_DISABLE_ALL
EP_list = ['CUDAExecutionProvider']
start = time.time()
cuda_session = ort.InferenceSession(model.SerializeToString(), providers=EP_list, sess_options=session_options)
end = time.time()
duration = (end - start) * 1000
print("Initialize Session cost {} ms".format(duration))
inputs = cuda_session.get_inputs()
outputs = cuda_session.get_outputs()
input_dict = {}
for input in inputs:
shape = [s for s in input.shape]
for idx in range(len(shape)):
if shape[idx] is None:
print("[ERROR] Input shape invalid,please Check")
return -1
input_data = np.random.random(shape)
if input.type.find('int') > 0:
input_data = input_data*10
input_data = input_data.astype(INPUT_TYPE_TO_NP_TYPE_MAP[input.type])
input_dict[input.name] = ort.OrtValue.ortvalue_from_numpy(input_data, 'cuda_pinned', 0)
outputs_names = []
for output in outputs:
outputs_names.append(output.name)
io_binding = cuda_session.io_binding()
for key, ortValue in input_dict.items():
io_binding.bind_ortvalue_input(key, ortValue)
for out_name in outputs_names:
io_binding.bind_output(out_name, 'cuda_pinned', device_id=0)
# warm up
warm_up_num = 20
start = time.time()
for i in range(warm_up_num):
cuda_session.run_with_iobinding(io_binding)
end = time.time()
duration = (end - start) / (batchSize * warm_up_num) * 1000
print("Warm up cost {} ms".format(duration))
run_num = 50
start = time.time()
for i in range(run_num):
cuda_session.run_with_iobinding(io_binding)
end = time.time()
duration = (end - start) * 1000 / (run_num * batchSize)
print("Current inference cost {} ms".format(duration))
print("FPS is {:.2f}".format(1000/duration))
del cuda_session
return duration
if __name__ == '__main__':
if len(sys.argv) != 3:
print(len(sys.argv))
print("Input parameter error...")
print("python cudaRun.py modelPath batchSize")
sys.exit()
modelPath = sys.argv[1]
batchSize = int(sys.argv[2])
print(modelPath)
model_run(modelPath, batchSize)
#include "custom_op_library.h"
#define ORT_API_MANUAL_INIT
#include "onnxruntime_cxx_api.h"
#undef ORT_API_MANUAL_INIT
#include <vector>
#include <cmath>
#include <mutex>
#include <system_error>
#include "core/common/common.h"
#include "core/framework/ortdevice.h"
#include "core/framework/ortmemoryinfo.h"
#include "rocm_ops.h"
#include "onnxruntime_lite_custom_op.h"
// static const char* c_OpDomain = "test.customop";
static const char* c_OpDomain = "";
static void AddOrtCustomOpDomainToContainer(Ort::CustomOpDomain&& domain) {
static std::vector<Ort::CustomOpDomain> ort_custom_op_domain_container;
static std::mutex ort_custom_op_domain_mutex;
std::lock_guard<std::mutex> lock(ort_custom_op_domain_mutex);
ort_custom_op_domain_container.push_back(std::move(domain));
}
OrtStatus* ORT_API_CALL RegisterCustomOps(OrtSessionOptions* options, const OrtApiBase* api) {
Ort::Global<void>::api_ = api->GetApi(ORT_API_VERSION);
OrtStatus* result = nullptr;
ORT_TRY {
Ort::CustomOpDomain domain{c_OpDomain};
Rocm::RegisterOps(domain);
Ort::UnownedSessionOptions session_options(options);
session_options.Add(domain);
AddOrtCustomOpDomainToContainer(std::move(domain));
}
ORT_CATCH(const std::exception& e) {
ORT_HANDLE_EXCEPTION([&]() {
Ort::Status status{e};
result = status.release();
});
}
return result;
}
OrtStatus* ORT_API_CALL RegisterCustomOpsAltName(OrtSessionOptions* options, const OrtApiBase* api) {
return RegisterCustomOps(options, api);
}
#pragma once
#include "onnxruntime_c_api.h"
#ifdef __cplusplus
extern "C" {
#endif
ORT_EXPORT OrtStatus* ORT_API_CALL RegisterCustomOps(OrtSessionOptions* options, const OrtApiBase* api);
// alternative name to test registration by function name
ORT_EXPORT OrtStatus* ORT_API_CALL RegisterCustomOpsAltName(OrtSessionOptions* options, const OrtApiBase* api);
#ifdef __cplusplus
}
#endif
import numpy as np
import onnx
from onnx import helper, numpy_helper
from onnx import onnx_pb as onnx_proto
def _npfloat16_to_int(np_list):
return [int(bin(_.view('H'))[2:].zfill(16), 2) for _ in np_list]
def convert_np_to_float16(np_array, min_positive_val=1e-7, max_finite_val=1e4):
def between(a, b, c):
return np.logical_and(a < b, b < c)
np_array = np.where(between(0, np_array, min_positive_val), min_positive_val, np_array)
np_array = np.where(between(-min_positive_val, np_array, 0), -min_positive_val, np_array)
np_array = np.where(between(max_finite_val, np_array, float('inf')), max_finite_val, np_array)
np_array = np.where(between(float('-inf'), np_array, -max_finite_val), -max_finite_val, np_array)
return np.float16(np_array)
def convert_tensor_float_to_float16(tensor, min_positive_val=1e-7, max_finite_val=1e4):
if not isinstance(tensor, onnx_proto.TensorProto):
raise ValueError('Expected input type is an ONNX TensorProto but got %s' % type(tensor))
if tensor.data_type == onnx_proto.TensorProto.FLOAT:
tensor.data_type = onnx_proto.TensorProto.FLOAT16
if tensor.float_data:
float16_data = convert_np_to_float16(np.array(tensor.float_data),
min_positive_val, max_finite_val)
int_list = _npfloat16_to_int(float16_data)
tensor.int32_data[:] = int_list
tensor.float_data[:] = []
if tensor.raw_data:
float32_list = np.frombuffer(tensor.raw_data, dtype='float32')
float16_list = convert_np_to_float16(float32_list, min_positive_val, max_finite_val)
tensor.raw_data = float16_list.tobytes()
return tensor
def make_value_info_from_tensor(tensor):
shape = numpy_helper.to_array(tensor).shape
return helper.make_tensor_value_info(tensor.name, tensor.data_type, shape)
# Custom Op ONNXRuntime
## Purpose
Add a custom operator implementation and register it in ONNX Runtime.
## Environment Setup
### Docker (Option 1)
Pull the image:
```plaintext
docker pull image.sourcefind.cn:5000/dcu/admin/base/custom:2.4.1-ubuntu22.04-dtk25.04-py3.10-fixpy-onnx1.19.2
```
Create and start the container:
```plaintext
docker run --shm-size 16g --network=host --name=test --privileged --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -v $PWD/customop_onnxruntime:/home/customop -it <Your Image ID> /bin/bash
```
### Dockerfile (Option 2)
```
cd ./docker
docker build --no-cache -t customop:test .
docker run --shm-size 16g --network=host --name=video_ort --privileged --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -v $PWD/customop_onnxruntime:/home/customop -it <Your Image ID> /bin/bash
```
## Usage
### Build the project
```
git clone http://developer.sourcefind.cn/codes/modelzoo/customop_onnxruntime.git
cd customop_onnxruntime
python model.py  # generate the model
bash compile.sh  # build the custom operator library
```
### Run the example
1. Directory layout
```
rocm_custom_op
├── compile.sh // build script
├── custom_op_library.cc // registers the custom operators
├── custom_op_library.h
├── docker
├── include
├── model.py // creates the custom model
├── readme.md
├── rocm_ops.cc // invokes the custom operators
├── rocm_ops.h
├── rocm_ops.hip // custom operator implementations
└── benchmark.py // benchmarks the operators
```
2. Run steps (a minimal usage sketch follows the block below)
```
python test.py  # test the model
```
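test.py itself is not included in this diff. As a rough, hypothetical sketch (file and path names are assumptions), the snippet below mirrors the registration flow in benchmark.py: register the compiled custom op library, create a session on the ROCm execution provider, and run the model once.
```python
# Hypothetical sketch, not the actual test.py: register the custom op
# library built by compile.sh and run add.onnx on the ROCm provider.
import numpy as np
import onnxruntime as ort

so = ort.SessionOptions()
so.register_custom_ops_library("./libcustom_op_library.so")  # path is an assumption

session = ort.InferenceSession("add.onnx", sess_options=so,
                               providers=["ROCMExecutionProvider"])

# Feed random float32 data; symbolic dims (if any) are assumed to be batch size 1.
feeds = {}
for inp in session.get_inputs():
    shape = [d if isinstance(d, int) else 1 for d in inp.shape]
    feeds[inp.name] = np.random.rand(*shape).astype(np.float32)

outputs = session.run([o.name for o in session.get_outputs()], feeds)
print(outputs[0].shape)
```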
3. Notes
```
1. model.py defines an add.onnx model (a minimal sketch appears after these notes). To define a different model, modify that file.
2. In compile.sh, replace gfx906 in the --offload-arch=gfx906 option with the ROCm architecture of your machine (check it with rocminfo | grep gfx).
3. To test a different custom operator, modify the following:
3.1 rocm_ops.hip: re-implement the custom operator
3.2 rocm_ops.cc: update the call to the custom operator
3.3 model.py: update the creation of the custom model
3.4 benchmark.py: update the input data used to run the model
```
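model.py is referenced above but not included in this diff. A minimal sketch, assuming it builds the single-Add graph described in note 1 (the shapes below are placeholder assumptions):
```python
# Minimal sketch of a model.py that emits add.onnx; the real file is not
# shown in this diff, and the [1, 4] shapes are placeholder assumptions.
import onnx
from onnx import TensorProto, helper

X = helper.make_tensor_value_info("X", TensorProto.FLOAT, [1, 4])
Y = helper.make_tensor_value_info("Y", TensorProto.FLOAT, [1, 4])
Z = helper.make_tensor_value_info("Z", TensorProto.FLOAT, [1, 4])

add_node = helper.make_node("Add", inputs=["X", "Y"], outputs=["Z"])
graph = helper.make_graph([add_node], "add_graph", [X, Y], [Z])
model = helper.make_model(graph)

onnx.checker.check_model(model)
onnx.save(model, "add.onnx")
```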
## Results
### Accuracy
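As implemented in benchmark.py (run with `-c`), each output is compared against the saved golden data using two metrics: a normalized squared error, reported as SNR = Σ(golden − output)² / (Σ output² + 1e-7), and the cosine distance from `scipy.spatial.distance.cosine`. Both values should be close to 0 when the custom operators are numerically correct.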
## Application Scenarios
### Features
Add ORT custom ops
## Source Repository and Feedback
- http://developer.sourcefind.cn/codes/modelzoo/customop_onnxruntime.git
## References
- https://github.com/microsoft/onnxruntime
- https://onnxruntime.ai/docs/extensions/add-op.html
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#ifdef USE_ROCM
#define ORT_API_MANUAL_INIT
#include "onnxruntime_cxx_api.h"
#undef ORT_API_MANUAL_INIT
#include "core/providers/rocm/rocm_context.h"
#include "onnxruntime_lite_custom_op.h"
//Concat
void rocm_concat(int axis, int M1, int N1, const float* X1, int M2, int N2, const float* X2, float* Z, hipStream_t stream);
//Gemm
void rocm_gemm(bool transA, bool transB, int M, int N, int K, float alpha, const float* A, const float* B, float beta, float* C, hipStream_t stream);
extern "C"{
//LeakyRelu
void rocm_leaky_relu(
int64_t size,
const float* d_X,
float* d_Y,
float alpha,
hipStream_t stream);
//Attention
void rocm_attention(int B, int S, int H,
const float* Q, const float* K, const float* V,
float* Out, hipStream_t stream);
//BatchNormalization
void rocm_batch_norm(int64_t N, int64_t C, int64_t H, int64_t W,
const float* X,
const float* gamma,
const float* beta,
const float* mean,
const float* var,
float epsilon,
float* Y,
hipStream_t stream);
//Cast
void rocm_cast(
int64_t N, // batch size
int64_t C, // channels (or the first non-batch dimension)
int64_t H, // height (or the second dimension)
int64_t W, // width (or the third dimension)
const float* X, // input pointer
int32_t* Y, // output pointer
hipStream_t stream);
//Softmax
void rocm_softmax(int64_t M, int64_t N,
const float* X, float* Y,
hipStream_t compute_stream);
//Celu
void rocm_celu(int64_t, const float*, float*, float, hipStream_t);
//Relu
void rocm_relu(
int64_t size,
const float* X,
float* Y,
hipStream_t stream
);
// Conv
void rocm_conv2d(const float* input,
const float* weight,
const float* bias,
float* output,
int N, int C_in, int H, int W,
int C_out, int K_h, int K_w,
int out_H, int out_W,
hipStream_t stream);
}
using namespace Ort::Custom;
#define CUSTOM_ENFORCE(cond, msg) \
do { \
if (!(cond)) { \
throw std::runtime_error(msg); \
} \
} while (0)
namespace Rocm {
void rocm_leaky_relu_forward(
const RocmContext& ctx,
const Tensor<float>& X,
Tensor<float>& Y) {
CUSTOM_ENFORCE(ctx.hip_stream, "No HIP stream available");
int64_t size = X.NumberOfElement();
const float alpha = 0.01f;
auto* y_ptr = Y.Allocate(X.Shape());
rocm_leaky_relu(size, X.Data(), y_ptr, alpha, ctx.hip_stream);
}
void rocm_relu_forward(
const Ort::Custom::RocmContext& rocm_ctx,
const Ort::Custom::Tensor<float>& X,
Ort::Custom::Tensor<float>& Y
) {
CUSTOM_ENFORCE(rocm_ctx.hip_stream, "failed to fetch hip stream");
auto input_shape = X.Shape();
int64_t size = X.NumberOfElement();
auto* y_data = Y.Allocate(input_shape);
rocm_relu(size, X.Data(), y_data, rocm_ctx.hip_stream);
}
void rocm_celu_forward(const Ort::Custom::RocmContext& ctx,
const Ort::Custom::Tensor<float>& X,
Ort::Custom::Tensor<float>& Y) {
CUSTOM_ENFORCE(ctx.hip_stream, "failed to fetch hip stream");
auto shape = X.Shape();
int64_t size = X.NumberOfElement();
float alpha = 1.0f; // or fetch from attribute
auto* y_ptr = Y.Allocate(shape);
rocm_celu(size, X.Data(), y_ptr, alpha, ctx.hip_stream);
}
/* softmax */
void KernelSoftmax(const Ort::Custom::RocmContext& rocm_ctx,
const Ort::Custom::Tensor<float>& X,
Ort::Custom::Tensor<float>& Z) {
auto input_shape = X.Shape();
CUSTOM_ENFORCE(rocm_ctx.hip_stream, "failed to fetch hip stream");
// Expecting 2D input: [M, N]
CUSTOM_ENFORCE(input_shape.size() == 2, "Softmax only supports 2D input");
int64_t M = static_cast<int64_t>(input_shape[0]);
int64_t N = static_cast<int64_t>(input_shape[1]);
auto z_raw = Z.Allocate(input_shape);
// Call ROCm implementation
rocm_softmax(M, N, X.Data(), z_raw, rocm_ctx.hip_stream);
}
void rocm_cast_forward(
const Ort::Custom::RocmContext& rocm_ctx,
const Ort::Custom::Tensor<float>& X,
Ort::Custom::Tensor<int32_t>& Y) {
CUSTOM_ENFORCE(rocm_ctx.hip_stream, "failed to fetch hip stream");
// Only 4D tensors [N,C,H,W] are assumed to be supported
auto shape = X.Shape();
CUSTOM_ENFORCE(shape.size() == 4, "Cast only supports 4D tensor [N,C,H,W]");
int64_t N = shape[0];
int64_t C = shape[1];
int64_t H = shape[2];
int64_t W = shape[3];
// allocate the output
auto* y_ptr = Y.Allocate({N, C, H, W});
// correct call: 7 arguments
rocm_cast(
N, C, H, W,
X.Data(),
y_ptr,
rocm_ctx.hip_stream);
}
// BatchNormalization
void rocm_batchnorm_forward(const Ort::Custom::RocmContext& rocm_ctx,
const Ort::Custom::Tensor<float>& X,
const Ort::Custom::Tensor<float>& scale,
const Ort::Custom::Tensor<float>& B,
const Ort::Custom::Tensor<float>& mean,
const Ort::Custom::Tensor<float>& var,
Ort::Custom::Tensor<float>& Y) {
CUSTOM_ENFORCE(rocm_ctx.hip_stream, "failed to fetch hip stream");
auto shape = X.Shape(); // expects [N, C, H, W]
CUSTOM_ENFORCE(shape.size() == 4, "Input must be 4D tensor [N, C, H, W]");
int64_t N = shape[0];
int64_t C = shape[1];
int64_t H = shape[2];
int64_t W = shape[3];
// Allocate output
auto* y_ptr = Y.Allocate({N, C, H, W});
// Epsilon attribute: retrieve via custom API or hardcode default
float epsilon = 1e-5f;
// If epsilon comes from attribute, fetch it here.
rocm_batch_norm(N, C, H, W,
X.Data(), scale.Data(), B.Data(), mean.Data(), var.Data(),
epsilon, y_ptr, rocm_ctx.hip_stream);
}
// attention
void rocm_attention_forward(const Ort::Custom::RocmContext& rocm_ctx,
const Ort::Custom::Tensor<float>& Q,
const Ort::Custom::Tensor<float>& K,
const Ort::Custom::Tensor<float>& V,
Ort::Custom::Tensor<float>& Out) {
CUSTOM_ENFORCE(rocm_ctx.hip_stream, "failed to fetch hip stream");
auto shape = Q.Shape(); // expected to be [B, S, H]
CUSTOM_ENFORCE(shape.size() == 3, "Input must be 3D tensor [B, S, H]");
int B = shape[0];
int S = shape[1];
int H = shape[2];
auto* out_ptr = Out.Allocate({B, S, H});
rocm_attention(B, S, H, Q.Data(), K.Data(), V.Data(), out_ptr, rocm_ctx.hip_stream);
}
// -------------------------------
// Concat
// -------------------------------
void rocm_concat_forward(const Ort::Custom::RocmContext& rocm_ctx,
const Ort::Custom::Tensor<float>& X1,
const Ort::Custom::Tensor<float>& X2,
Ort::Custom::Tensor<float>& Y) {
CUSTOM_ENFORCE(rocm_ctx.hip_stream, "failed to fetch hip stream");
auto shape1 = X1.Shape();
auto shape2 = X2.Shape();
// supports concatenating 2D tensors along columns (axis=1)
CUSTOM_ENFORCE(shape1.size() == 2 && shape2.size() == 2, "Inputs must be 2D tensors.");
CUSTOM_ENFORCE(shape1[0] == shape2[0], "Row dimensions must match for concat on axis 1.");
int M1 = shape1[0], N1 = shape1[1];
int M2 = shape2[0], N2 = shape2[1];
auto* y_data = Y.Allocate({M1, N1 + N2}); // output is the concatenated matrix
rocm_concat(1, M1, N1, X1.Data(), M2, N2, X2.Data(), y_data, rocm_ctx.hip_stream);
}
/******conv******/
void rocm_conv_forward(const RocmContext& ctx,
const Tensor<float>& input,
const Tensor<float>& weight,
const Tensor<float>& bias,
Tensor<float>& output) {
CUSTOM_ENFORCE(ctx.hip_stream, "no HIP stream");
const auto& input_shape = input.Shape(); // [N, C_in, H, W]
const auto& weight_shape = weight.Shape(); // [C_out, C_in, K_h, K_w]
const int64_t N = input_shape[0];
const int64_t C_in = input_shape[1];
const int64_t H = input_shape[2];
const int64_t W = input_shape[3];
const int64_t C_out = weight_shape[0];
const int64_t K_h = weight_shape[2];
const int64_t K_w = weight_shape[3];
const int64_t out_H = (H - K_h) / 1 + 1;
const int64_t out_W = (W - K_w) / 1 + 1;
auto* y_ptr = output.Allocate({N, C_out, out_H, out_W});
rocm_conv2d(input.Data(), weight.Data(), bias.Data(), y_ptr,
N, C_in, H, W, C_out, K_h, K_w, out_H, out_W,
ctx.hip_stream);
}
// -------------------------------
// Gemm
// -------------------------------
void rocm_gemm_forward(const Ort::Custom::RocmContext& rocm_ctx,
const Ort::Custom::Tensor<float>& A,
const Ort::Custom::Tensor<float>& B,
const Ort::Custom::Tensor<float>& C,
Ort::Custom::Tensor<float>& Y) {
CUSTOM_ENFORCE(rocm_ctx.hip_stream, "failed to fetch hip stream");
auto shapeA = A.Shape();
auto shapeB = B.Shape();
auto shapeC = C.Shape();
CUSTOM_ENFORCE(shapeA.size() == 2 && shapeB.size() == 2 && shapeC.size() == 2, "Inputs must be 2D tensors.");
int M = shapeA[0];
int K = shapeA[1];
int N = shapeB[1];
CUSTOM_ENFORCE(shapeB[0] == K, "Inner dimension mismatch between A and B in Gemm.");
CUSTOM_ENFORCE(shapeC[0] == M && shapeC[1] == N, "Output tensor shape mismatch in Gemm.");
auto* y_data = Y.Allocate({M, N});
rocm_gemm(false, false, M, N, K, 1.0f, A.Data(), B.Data(), 1.0f, y_data, rocm_ctx.hip_stream);
}
void RegisterOps(Ort::CustomOpDomain& domain) {
// Register the Attention operator
static const std::unique_ptr<OrtLiteCustomOp> c_CustomOpAttention{Ort::Custom::CreateLiteCustomOp("Attention", "ROCMExecutionProvider", rocm_attention_forward)};
domain.Add(c_CustomOpAttention.get());
// Register the BatchNormalization operator
static const std::unique_ptr<OrtLiteCustomOp> c_CustomOpBatchNorm{Ort::Custom::CreateLiteCustomOp("BatchNormalization", "ROCMExecutionProvider", rocm_batchnorm_forward)};
domain.Add(c_CustomOpBatchNorm.get());
// Register the Concat operator
static const std::unique_ptr<OrtLiteCustomOp> c_CustomOpConcat{Ort::Custom::CreateLiteCustomOp("Concat", "ROCMExecutionProvider", rocm_concat_forward)};
domain.Add(c_CustomOpConcat.get());
// Register the Gemm operator
static const std::unique_ptr<OrtLiteCustomOp> c_CustomOpGemm{Ort::Custom::CreateLiteCustomOp("Gemm", "ROCMExecutionProvider", rocm_gemm_forward)};
domain.Add(c_CustomOpGemm.get());
// Register the Cast operator
static const std::unique_ptr<OrtLiteCustomOp> c_CustomOpCast{Ort::Custom::CreateLiteCustomOp("Cast", "ROCMExecutionProvider", rocm_cast_forward)};
domain.Add(c_CustomOpCast.get());
// Register the Softmax operator
static const std::unique_ptr<OrtLiteCustomOp> c_CustomOpSoftmax{Ort::Custom::CreateLiteCustomOp("Softmax","ROCMExecutionProvider", KernelSoftmax)};
domain.Add(c_CustomOpSoftmax.get());
// Register the Celu operator
static const std::unique_ptr<OrtLiteCustomOp> c_CeluOp{Ort::Custom::CreateLiteCustomOp("Celu", "ROCMExecutionProvider", rocm_celu_forward)};
domain.Add(c_CeluOp.get());
// Register the Relu operator
static const std::unique_ptr<OrtLiteCustomOp> c_CustomOpRelu{
Ort::Custom::CreateLiteCustomOp("Relu", "ROCMExecutionProvider", rocm_relu_forward)};
domain.Add(c_CustomOpRelu.get());
// Register the LeakyRelu operator
static const std::unique_ptr<OrtLiteCustomOp> c_LeakyReLU{
Ort::Custom::CreateLiteCustomOp(
"LeakyRelu", "ROCMExecutionProvider", rocm_leaky_relu_forward)};
domain.Add(c_LeakyReLU.get());
// Register the Conv operator
static const std::unique_ptr<OrtLiteCustomOp> c_Conv{
Ort::Custom::CreateLiteCustomOp("Conv", "ROCMExecutionProvider", rocm_conv_forward)};
domain.Add(c_Conv.get());
}
} // namespace Rocm
#endif
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
namespace Rocm {
#ifdef USE_ROCM
void RegisterOps(Ort::CustomOpDomain& domain);
#else
inline void RegisterOps(Ort::CustomOpDomain&) {}
#endif
} // namespace Rocm