Commit b3e3ac37 authored by ghfund4_b52


Update custom_op_library.cc.o, rocm_ops.hip, rocm_ops.hip.o, libcustom_op_library.so, rocm_ops.cc.o, compile_rdc.sh, custom_op_library.h, benchmark.py, compile.sh, custom_op_library.cc, cuda_utils.py, fp16.py, rocm_ops.h, node_utils.py, rocm_ops.cc, readme.md, rocm_ops.hip.cpp files
import onnxruntime as ort
import numpy as np
import os
import sys
import onnx
from onnx import numpy_helper
from node_utils import INPUT_TYPE_TO_NP_TYPE_MAP
from cuda_utils import set_batchsize, model_setbs
from scipy import spatial
import argparse
import time
def save_input_output_data(save_path, data_dict, isInput=True):
if not os.path.isdir(save_path):
os.makedirs(save_path)
keys = list(data_dict.keys())
data_prefix = 'input'
if not isInput:
data_prefix = 'output'
for j in range(len(data_dict)):
with open(os.path.join(save_path, '{}_{}.pb'.format(data_prefix, j)), 'wb') as f:
f.write(numpy_helper.from_array(
data_dict[keys[j]], keys[j]).SerializeToString())
def load_pb_data(pb_path):
with open(pb_path, 'rb') as f:
input_content = f.read()
tensor = onnx.TensorProto()
tensor.ParseFromString(input_content)
f.close()
return numpy_helper.to_array(tensor)
def get_cosine(gpu_array, cpu_array):
gpu_array = gpu_array.astype(np.float64)
cpu_array = cpu_array.astype(np.float64)
gpu_array = gpu_array.reshape([-1])
cpu_array = cpu_array.reshape([-1])
cosine = spatial.distance.cosine(cpu_array, gpu_array)
return cosine
def get_snr(gpu_array, cpu_array):
cpu_array = cpu_array.astype(np.float64)
gpu_array = gpu_array.astype(np.float64)
diff_array = cpu_array - gpu_array
x = diff_array * diff_array
x = np.sum(x)
y = cpu_array * cpu_array
y = np.sum(y)
snr = (x) / (y + 1e-7)
snr = np.mean(snr)
return snr
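# Illustrative usage sketch (not part of the original script): sanity-check the
# two metrics above on toy arrays. For nearly identical arrays, the
# noise-to-signal ratio from get_snr and the cosine distance from get_cosine
# are both close to 0.
def _example_metrics_usage():
    cpu = np.array([1.0, 2.0, 3.0], dtype=np.float32)
    gpu = np.array([1.0, 2.0, 3.001], dtype=np.float32)
    print("snr    :", get_snr(gpu, cpu))
    print("cosine :", get_cosine(gpu, cpu))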
def accuracy_check_run(args):
EP_List = ['ROCMExecutionProvider']
model = onnx.load(args.input_model)
model = set_batchsize(model, args.batchsize)
so = ort.SessionOptions()
so.enable_profiling = True
so.register_custom_ops_library("/public/home/kj_gauss/All_test/libcustom_op_library.so")
so.intra_op_num_threads = 1
so.inter_op_num_threads = 1
cuda_session = ort.InferenceSession(
model.SerializeToString(), sess_options=so, providers=EP_List
)
#cuda_session = ort.InferenceSession(model.SerializeToString(), providers=EP_List)
inputs = cuda_session.get_inputs()
outputs = cuda_session.get_outputs()
file_list = os.listdir(args.datapath)
input_list = []
output_list = []
for file in file_list:
if file[:5] == 'input':
input_list.append(file)
elif file[:6] == 'output':
output_list.append(file)
input_traits = [int(i[6:-3]) for i in input_list]
input_traits = sorted(input_traits)
input_list = [os.path.join(args.datapath, "input_{}.pb".format(i)) for i in input_traits]
output_traits = [int(i[7:-3]) for i in output_list]
output_traits = sorted(output_traits)
output_list = [os.path.join(args.datapath, "output_{}.pb".format(i)) for i in output_traits]
input_dict = {}
for input, input_file in zip(inputs, input_list):
input_dict[input.name] = load_pb_data(input_file)
if input_dict[input.name].shape[0] != args.batchsize:
print("Batchsize error! input data batchsize is {} but your input batchsize is {}, Please fix!".format(input_dict[input_file[0].name].shape[0], args.batchsize))
sys.exit()
gt_dict = {}
for output, gt_file in zip(outputs, output_list):
gt_dict[output.name] = load_pb_data(gt_file)
output_names = [x.name for x in cuda_session.get_outputs()]
output_data = cuda_session.run(output_names, input_dict)
for idx, output_name in enumerate(output_names):
print("output {}".format(output_name))
print("SNR IS : {}".format(get_snr(gt_dict[output_name], output_data[idx])))
print("COSINE IS : {}\n".format(get_cosine(gt_dict[output_name], output_data[idx])))
def generate_golden_data_run(args):
import os
import time
import onnx
import onnxruntime as ort
import numpy as np
from onnx import numpy_helper
from node_utils import INPUT_TYPE_TO_NP_TYPE_MAP
def save_input_output_data(save_path, data_dict, isInput=True):
if not os.path.isdir(save_path):
os.makedirs(save_path)
prefix = 'input' if isInput else 'output'
for idx, (name, data) in enumerate(data_dict.items()):
with open(os.path.join(save_path, f'{prefix}_{idx}.pb'), 'wb') as f:
f.write(numpy_helper.from_array(data, name).SerializeToString())
model = onnx.load(args.input_model)
orig_shapes = {}
for vi in model.graph.input:
dims = []
for d in vi.type.tensor_type.shape.dim:
dims.append(d.dim_value if d.dim_value > 0 else None)
orig_shapes[vi.name] = dims
so = ort.SessionOptions()
so.register_custom_ops_library("libcustom_op_library.so")
so.intra_op_num_threads = 1
so.inter_op_num_threads = 1
providers = ['ROCMExecutionProvider']
t0 = time.time()
session = ort.InferenceSession(model.SerializeToString(),
sess_options=so,
providers=providers)
t1 = time.time()
print(f"Initialize ROCM session cost {(t1-t0)*1000:.2f} ms")
input_dict = {}
for inp in session.get_inputs():
name = inp.name
dtype_str = inp.type
shape = []
for d in orig_shapes[name]:
if d is None:
shape.append(args.batchsize)
else:
shape.append(d)
print(f"[INFO] {name} <- shape {shape}, type={dtype_str}")
data = np.random.rand(*shape)
if 'uint8' in dtype_str:
data = data * 255
elif 'int8' in dtype_str:
data = data * 255 - 128
data = data.astype(INPUT_TYPE_TO_NP_TYPE_MAP[dtype_str])
input_dict[name] = data
if args.saveIOdata == 1:
save_input_output_data(args.datapath, input_dict, isInput=True)
output_names = [o.name for o in session.get_outputs()]
# 6. Warm-up
if args.warmup > 0:
for _ in range(args.warmup):
session.run(output_names, input_dict)
t_start = time.time()
for _ in range(args.runnum):
outputs = session.run(output_names, input_dict)
t_end = time.time()
latency_ms = (t_end - t_start) * 1000 / (args.runnum * args.batchsize)
print(f"Inference cost per sample: {latency_ms:.3f} ms | FPS: {1000/latency_ms:.2f}")
if args.saveIOdata == 1:
out_dict = {n: o for n, o in zip(output_names, outputs)}
save_input_output_data(args.datapath, out_dict, isInput=False)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input_model",
type=str,
required=True,
default="",
help="input model file")
parser.add_argument("-b", "--batchsize",
type=int,
required=False,
default=1,
help="batchsize")
parser.add_argument("-c", "--checkresult",
type=bool,
required=False,
default=False,
help="check output accuracy")
parser.add_argument("-d", "--datapath",
type=str,
required=True,
help="data path for saving golden data or checking output accuracy")
parser.add_argument("-w", "--warmup",
type=int,
required=False,
default=50,
help="input warm up iterations")
parser.add_argument("-n", "--runnum",
type=int,
required=False,
default=100,
help="input run model iterations")
parser.add_argument("-s", "--inputshape",
type=int,
required=False,
default=-1,
help="bert input shape")
parser.add_argument("-t", "--saveIOdata",
type=int,
required=False,
default=1,
help="save golden data")
ARGS = parser.parse_args()
if ARGS.checkresult:
accuracy_check_run(ARGS)
else:
generate_golden_data_run(ARGS)
\ No newline at end of file
/opt/dtk/hip/bin/hipcc --offload-arch=gfx906 -I/opt/dtk-25.04/include -fPIC -x hip -o rocm_ops.hip.o -c rocm_ops.hip.cpp
/usr/bin/c++ -DUSE_ROCM=1 -I ./include/onnxruntime/ -fPIC "-D__HIP_PLATFORM_AMD__=1 -D__HIP_PLATFORM_HCC__=1" -o rocm_ops.cc.o -c rocm_ops.cc
/usr/bin/c++ -I./include/onnxruntime/ -fPIC -o custom_op_library.cc.o -c custom_op_library.cc
/opt/dtk/llvm/bin/clang++ -fPIC -shared -Wl,-soname,libcustom_op_library.so -o libcustom_op_library.so rocm_ops.hip.o custom_op_library.cc.o rocm_ops.cc.o -L/opt/dtk/lib -Wl,-rpath,/opt/dtk/lib:/opt/dtk/hip/lib /opt/dtk/hip/lib/libgalaxyhip.so.5.2.25085.1211-205b0686 /opt/dtk/llvm/lib/clang/15.0.0/lib/linux/libclang_rt.builtins-x86_64.a -lstdc++ -lm -lgcc_s -lgcc -lc -lgcc_s -lgcc
/opt/dtk/hip/bin/hipcc --offload-arch=gfx906 -I/opt/dtk-25.04/include -fPIC -x hip -o rocm_ops.hip.o -c rocm_ops.hip.cpp
/usr/bin/c++ -DUSE_ROCM=1 -I ./include/onnxruntime/ -fPIC "-D__HIP_PLATFORM_AMD__=1 -D__HIP_PLATFORM_HCC__=1" -o rocm_ops.cc.o -c rocm_ops.cc
/usr/bin/c++ -I./include/onnxruntime/ -fPIC -o custom_op_library.cc.o -c custom_op_library.cc
/opt/dtk/llvm/bin/clang++ -fPIC -shared -Wl,-soname,libcustom_op_library.so -o libcustom_op_library.so rocm_ops.hip.o custom_op_library.cc.o rocm_ops.cc.o -L/opt/dtk/lib -Wl,-rpath,/opt/dtk/lib:/opt/dtk/hip/lib /opt/dtk/hip/lib/libgalaxyhip.so.5.2.25085.1211-205b0686 /opt/dtk/llvm/lib/clang/15.0.0/lib/linux/libclang_rt.builtins-x86_64.a -lstdc++ -lm -lgcc_s -lgcc -lc -lgcc_s -lgcc
\ No newline at end of file
/opt/dtk/hip/bin/hipcc \
--offload-arch=gfx906 \
-shared \
-o libcustom_op_library.so \
rocm_ops.hip.cpp custom_op_library.cc.o rocm_ops.cc.o \
-L/opt/dtk/lib -L/opt/dtk/hip/lib \
-Wl,-rpath,/opt/dtk/lib:/opt/dtk/hip/lib \
-lgalaxyhip \
/opt/dtk/llvm/lib/clang/15.0.0/lib/linux/libclang_rt.builtins-x86_64.a \
-lstdc++ -lm -lgcc_s -lgcc -lc
import onnxruntime as ort
import onnx
import numpy as np
import time
from node_utils import node_utils, INPUT_TYPE_TO_NP_TYPE_MAP
import sys
def set_batchsize(model, batchSize):
for node in model.graph.node:
if node.op_type in ['Reshape', 'Split', 'Transpose']:
return model
del model.graph.value_info[:]
for input in model.graph.input:
if len(input.type.tensor_type.shape.dim) > 1:
input.type.tensor_type.shape.dim[0].dim_value = batchSize
for output in model.graph.output:
if len(output.type.tensor_type.shape.dim) > 1:
output.type.tensor_type.shape.dim[0].dim_value = batchSize
return model
def model_setbs(model, batchSize):
del model.graph.value_info[:]
for input in model.graph.input:
if len(input.type.tensor_type.shape.dim) > 1:
input.type.tensor_type.shape.dim[0].dim_value = batchSize
for output in model.graph.output:
if len(output.type.tensor_type.shape.dim) > 1:
output.type.tensor_type.shape.dim[0].dim_value = batchSize
return model
def model_run(modelPath, batchSize=None):
model = onnx.load(modelPath)
if batchSize is not None:
model = set_batchsize(model, batchSize)
session_options = ort.SessionOptions()
session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_DISABLE_ALL
EP_list = ['CUDAExecutionProvider']
start = time.time()
cuda_session = ort.InferenceSession(model.SerializeToString(), providers=EP_list, sess_options=session_options)
end = time.time()
duration = (end - start) * 1000
print("Initialize Session cost {} ms".format(duration))
inputs = cuda_session.get_inputs()
outputs = cuda_session.get_outputs()
input_dict = {}
for input in inputs:
shape = [s for s in input.shape]
for idx in range(len(shape)):
if shape[idx] is None:
print("[ERROR] Input shape invalid,please Check")
return -1
input_data = np.random.random(shape)
if input.type.find('int') > 0:
input_data = input_data*10
input_data = input_data.astype(INPUT_TYPE_TO_NP_TYPE_MAP[input.type])
input_dict[input.name] = ort.OrtValue.ortvalue_from_numpy(input_data, 'cuda_pinned', 0)
outputs_names = []
for output in outputs:
outputs_names.append(output.name)
io_binding = cuda_session.io_binding()
for key, ortValue in input_dict.items():
io_binding.bind_ortvalue_input(key, ortValue)
for out_name in outputs_names:
io_binding.bind_output(out_name, 'cuda_pinned', device_id=0)
# warm up
warm_up_num = 20
start = time.time()
for i in range(warm_up_num):
cuda_session.run_with_iobinding(io_binding)
end = time.time()
duration = (end - start) / (batchSize * warm_up_num) * 1000
print("Warm up cost {} ms".format(duration))
run_num = 50
start = time.time()
for i in range(run_num):
cuda_session.run_with_iobinding(io_binding)
end = time.time()
duration = (end - start) * 1000 / (run_num * batchSize)
print("Current inference cost {} ms".format(duration))
print("FPS is {:.2f}".format(1000/duration))
del cuda_session
return duration
if __name__ == '__main__':
if len(sys.argv) != 3:
print(len(sys.argv))
print("Input parameter error...")
print("python cudaRun.py modelPath batchSize")
sys.exit()
modelPath = sys.argv[1]
batchSize = int(sys.argv[2])
print(modelPath)
model_run(modelPath, batchSize)
#include "custom_op_library.h"
#define ORT_API_MANUAL_INIT
#include "onnxruntime_cxx_api.h"
#undef ORT_API_MANUAL_INIT
#include <vector>
#include <cmath>
#include <mutex>
#include <system_error>
#include "core/common/common.h"
#include "core/framework/ortdevice.h"
#include "core/framework/ortmemoryinfo.h"
#include "rocm_ops.h"
#include "onnxruntime_lite_custom_op.h"
// static const char* c_OpDomain = "test.customop";
static const char* c_OpDomain = "";
static void AddOrtCustomOpDomainToContainer(Ort::CustomOpDomain&& domain) {
static std::vector<Ort::CustomOpDomain> ort_custom_op_domain_container;
static std::mutex ort_custom_op_domain_mutex;
std::lock_guard<std::mutex> lock(ort_custom_op_domain_mutex);
ort_custom_op_domain_container.push_back(std::move(domain));
}
OrtStatus* ORT_API_CALL RegisterCustomOps(OrtSessionOptions* options, const OrtApiBase* api) {
Ort::Global<void>::api_ = api->GetApi(ORT_API_VERSION);
OrtStatus* result = nullptr;
ORT_TRY {
Ort::CustomOpDomain domain{c_OpDomain};
Rocm::RegisterOps(domain);
Ort::UnownedSessionOptions session_options(options);
session_options.Add(domain);
AddOrtCustomOpDomainToContainer(std::move(domain));
}
ORT_CATCH(const std::exception& e) {
ORT_HANDLE_EXCEPTION([&]() {
Ort::Status status{e};
result = status.release();
});
}
return result;
}
OrtStatus* ORT_API_CALL RegisterCustomOpsAltName(OrtSessionOptions* options, const OrtApiBase* api) {
return RegisterCustomOps(options, api);
}
#pragma once
#include "onnxruntime_c_api.h"
#ifdef __cplusplus
extern "C" {
#endif
ORT_EXPORT OrtStatus* ORT_API_CALL RegisterCustomOps(OrtSessionOptions* options, const OrtApiBase* api);
// alternative name to test registration by function name
ORT_EXPORT OrtStatus* ORT_API_CALL RegisterCustomOpsAltName(OrtSessionOptions* options, const OrtApiBase* api);
#ifdef __cplusplus
}
#endif
import numpy as np
import onnx
from onnx import helper, numpy_helper
from onnx import onnx_pb as onnx_proto
def _npfloat16_to_int(np_list):
return [int(bin(_.view('H'))[2:].zfill(16), 2) for _ in np_list]
def convert_np_to_float16(np_array, min_positive_val=1e-7, max_finite_val=1e4):
def between(a, b, c):
return np.logical_and(a < b, b < c)
np_array = np.where(between(0, np_array, min_positive_val), min_positive_val, np_array)
np_array = np.where(between(-min_positive_val, np_array, 0), -min_positive_val, np_array)
np_array = np.where(between(max_finite_val, np_array, float('inf')), max_finite_val, np_array)
np_array = np.where(between(float('-inf'), np_array, -max_finite_val), -max_finite_val, np_array)
return np.float16(np_array)
def convert_tensor_float_to_float16(tensor, min_positive_val=1e-7, max_finite_val=1e4):
if not isinstance(tensor, onnx_proto.TensorProto):
raise ValueError('Expected input type is an ONNX TensorProto but got %s' % type(tensor))
if tensor.data_type == onnx_proto.TensorProto.FLOAT:
tensor.data_type = onnx_proto.TensorProto.FLOAT16
if tensor.float_data:
float16_data = convert_np_to_float16(np.array(tensor.float_data),
min_positive_val, max_finite_val)
int_list = _npfloat16_to_int(float16_data)
tensor.int32_data[:] = int_list
tensor.float_data[:] = []
if tensor.raw_data:
float32_list = np.frombuffer(tensor.raw_data, dtype='float32')
float16_list = convert_np_to_float16(float32_list, min_positive_val, max_finite_val)
tensor.raw_data = float16_list.tobytes()
return tensor
def make_value_info_from_tensor(tensor):
shape = numpy_helper.to_array(tensor).shape
return helper.make_tensor_value_info(tensor.name, tensor.data_type, shape)
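# Illustrative usage sketch (not part of the original module): convert a float32
# initializer to float16 with the helpers above. Values whose magnitude is below
# min_positive_val or above max_finite_val are clamped before the cast so they
# neither underflow to zero nor overflow to inf in fp16.
def _example_fp16_conversion():
    t = numpy_helper.from_array(np.array([1e-9, 0.5, 2e5], dtype=np.float32), 'w')
    t16 = convert_tensor_float_to_float16(t)
    # data_type 10 is FLOAT16; the values come back as approximately [1e-7, 0.5, 1e4]
    print(t16.data_type, numpy_helper.to_array(t16))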
import onnx
from onnx import shape_inference, helper, TensorProto
import numpy as np
import os
import fp16
DEBUG = False
INPUT_TYPE_TO_NP_TYPE_MAP = {
'tensor(float)': np.dtype('float32'),
'tensor(uint8)': np.dtype('uint8'),
'tensor(int8)': np.dtype('int8'),
'tensor(uint16)': np.dtype('uint16'),
'tensor(int16)': np.dtype('int16'),
'tensor(int32)': np.dtype('int32'),
'tensor(int64)': np.dtype('int64'),
'tensor(bool)': np.dtype('bool'),
'tensor(float16)': np.dtype('float16'),
'tensor(float64)': np.dtype('float64'),
'tensor(complex64)': np.dtype('complex64'),
'tensor(complex128)': np.dtype('complex128'),
'tensor(string)': np.dtype(np.str_),
'tensor(float8e5m2)': np.dtype('int8'),
'tensor(float8e5m2fnuz)': np.dtype('int8'),
'tensor(float8e4m3fnuz)': np.dtype('int8'),
'tensor(float8e4m3fn)': np.dtype('int8'),
'seq(tensor(complex64))': np.dtype('complex64'),
'seq(tensor(complex128))': np.dtype('complex128'),
'seq(tensor(uint8))': np.dtype('uint8'),
'seq(tensor(int8))': np.dtype('int8'),
'seq(tensor(int16))': np.dtype('int16'),
'seq(tensor(uint16))': np.dtype('uint16'),
'seq(tensor(int32))': np.dtype('int32'),
'seq(tensor(uint32))': np.dtype('uint32'),
'seq(tensor(int64))': np.dtype('int64'),
'seq(tensor(uint64))': np.dtype('uint64'),
'seq(tensor(float))': np.dtype('float32'),
'seq(tensor(float16))': np.dtype('float16'),
'seq(tensor(double))': np.dtype('double'),
'seq(tensor(bool))': np.dtype('bool'),
'seq(tensor(string))': np.dtype(np.str_)
}
NP_TYPE_TO_ONNX_TYPE_MAP = {
np.dtype('float32') : 1,
np.dtype('uint8') : 2,
np.dtype('int8') : 3,
np.dtype('uint16') : 4,
np.dtype('int16') : 5,
np.dtype('int32') : 6,
np.dtype('int64') : 7,
np.dtype(np.str_) : 8,
np.dtype('bool') : 9,
np.dtype('float16') : 10,
np.dtype('float64') : 11,
np.dtype('uint32') : 12,
np.dtype('uint64') : 13,
np.dtype('complex64') : 14,
np.dtype('complex128') : 15,
}
Attribute_TYPE_MAP = {
0 : 'UNDEFINED',
1 : 'f',
2 : 'i',
3 : 's',
4 : 't',
5 : 'g',
6 : 'floats',
7 : 'ints',
8 : 'strings',
9 : 'tensors',
10 : 'graphs'
}
class node_utils:
def __init__(self, modelPath, batchSize=1, isINT8Model=True, inputShape=None):
print('Process {}'.format(modelPath))
self.modelPath = modelPath
self.batchSize = batchSize
self.isINT8Model = isINT8Model
self.filter_op_type = ['QuantizeLinear', 'DequantizeLinear', 'Squeeze', 'Unsqueeze', 'Shape']
self.select_input_op_type = ['Shape', 'Squeeze', 'Unsqueeze']
self.quant_op_list = ['Conv', 'ConvTranspose', 'Gemm', 'GlobalAveragePool', 'MaxPool', 'Add', 'Sub', 'Div', 'Mul',
'MatMul', 'Transpose', 'Reshape', 'Flatten', 'Soft', 'Pad', 'Pow', 'Concat', 'LeakyRelu', 'Relu',
'PRelu', 'Clip', 'Resize', 'AveragePool', 'GlobalMaxPool', 'Split', 'Slice', 'Sigmoid',
'ReduceMean', 'Softplus', 'Tanh', 'ReduceSum', 'Gather', 'Swish', 'HardSigmoid', 'Mish']
self.model_op_list = ['Swish','LayerNorm','Mish']
self.input_node_dict = {}
self.output_node_dict = {}
self.name_node_dict = {}
self.initializer_dict = {}
self.value_info_dict = {}
self.node_type_dict = {}
self.model = onnx.load(modelPath)
if inputShape is not None:
for input in self.model.graph.input:
if input.name not in inputShape:
print('[ERROR]: input {} with no input shape, please check!'.format(input.name))
exit()
shape = inputShape[input.name]
for idx, s in enumerate(shape):
input.type.tensor_type.shape.dim[idx].dim_value = s
self.initModelState()
def RefreshState(self):
self.input_node_dict = {}
self.output_node_dict = {}
self.name_node_dict = {}
self.initializer_dict = {}
self.value_info_dict = {}
self.initModelState()
def initModelState(self):
model = self.model
for node in model.graph.node:
if not node.name in self.name_node_dict:
self.name_node_dict[node.name] = node
for input in node.input:
if not input in self.input_node_dict:
self.input_node_dict[input] = [node]
else:
self.input_node_dict[input].append(node)
for output in node.output:
if not output in self.output_node_dict:
self.output_node_dict[output] = [node]
else:
self.output_node_dict[output].append(node)
for value_info in model.graph.value_info:
self.value_info_dict[value_info.name] = value_info
for initializer in model.graph.initializer:
self.initializer_dict[initializer.name] = initializer
def get_relate_qdq_node(self, name_list, is_front=True):
qdq_list = []
for name in name_list:
if is_front and name in self.output_node_dict:
front_node_list = self.output_node_dict[name]
for node in front_node_list:
if node.op_type == 'DequantizeLinear':
qdq_list.append(node)
elif not is_front and name in self.input_node_dict:
back_node_list = self.input_node_dict[name]
for node in back_node_list:
if node.op_type == 'QuantizeLinear':
qdq_list.append(node)
return qdq_list
def correct_value_info_by_ort(self):
import onnxruntime as ort
import time
import copy
def setBatchSize(model, batchSize):
for input in model.graph.input:
input.type.tensor_type.shape.dim[0].dim_value = batchSize
for output in model.graph.output:
output.type.tensor_type.shape.dim[0].dim_value = batchSize
return model
def needORTInfer(value_info):
if len(value_info.type.tensor_type.shape.dim) == 0:
return True
if firstDimIsBatch(value_info) and value_info.type.tensor_type.shape.dim[0].dim_value == -1:
return False
for dim in value_info.type.tensor_type.shape.dim:
if dim.dim_value <= 0:
return True
return False
def canInferShapeByCPU(opset_import):
for detail in opset_import:
if detail.domain not in ['', 'ai.onnx', 'com.microsoft']:
return False
return True
def firstDimIsBatch(value_info):
node = self.output_node_dict[value_info.name][0]
if node.op_type == 'Concat' and len(value_info.type.tensor_type.shape.dim) == 1:
return False
return True
model = self.model
model = setBatchSize(model, self.batchSize)
if canInferShapeByCPU(model.opset_import):
try:
del model.graph.value_info[:]
if self.modelPath.find('bert') < 0:
model = shape_inference.infer_shapes(model, check_type=True, strict_mode=True, data_prop=True)
else:
print("skip shape inference for bert...")
self.model = model
self.RefreshState()
except:
print("shape inference failed use onnxruntime infer later")
batchsize = self.batchSize
output_backup = copy.deepcopy(model.graph.output)
session_options = ort.SessionOptions()
session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_DISABLE_ALL
if self.modelPath.find('bert') < 0 and canInferShapeByCPU(model.opset_import):
EP_list = ['CPUExecutionProvider']
else:
EP_list = ['CUDAExecutionProvider']
all_tensor_dict = {}
for node in model.graph.node:
for output in node.output:
if output not in all_tensor_dict:
all_tensor_dict[output] = 1
else:
all_tensor_dict[output] += 1
value_info_list = []
value_info_dict = {}
value_info_list_back = []
for tensor_name in list(all_tensor_dict.keys()):
if DEBUG:
print(tensor_name)
if tensor_name not in self.value_info_dict:
if DEBUG:
print('tensor {} not in self.value_info_dict and need infer by ORT'.format(tensor_name))
value_info_list.append(tensor_name)
value_info_dict[tensor_name] = onnx.ValueInfoProto(name=tensor_name)
continue
value_info = self.value_info_dict[tensor_name]
if self.output_node_dict[tensor_name][0].op_type in self.filter_shape_correct_op_type:
value_info_list_back.append(copy.deepcopy(value_info))
if DEBUG:
print('add tensor {} to backup list'.format(tensor_name))
continue
if needORTInfer(value_info):
value_info_list.append(tensor_name)
value_info_dict[tensor_name] = value_info
if DEBUG:
print('tensor {} need infer by ORT'.format(tensor_name))
else:
if len(value_info.type.tensor_type.shape.dim) > 0 and firstDimIsBatch(value_info):
value_info.type.tensor_type.shape.dim[0].dim_value = batchsize
value_info_list_back.append(copy.deepcopy(value_info))
if DEBUG:
print('modify tensor {} batchsize and add to backup list'.format(tensor_name))
if len(value_info_list) > 0:
step = 20
infer_times = (len(value_info_list) + step - 1) // step
if DEBUG:
print('infer_times %d' % infer_times)
for i in range(infer_times):
output_name_list = []
if i == infer_times - 1:
output_name_list = value_info_list[step*i:]
else:
output_name_list = value_info_list[step*i:step*(i+1)]
if DEBUG:
print(i, output_name_list)
del model.graph.output[:]
for output_name in output_name_list:
model.graph.output.extend([onnx.ValueInfoProto(name=output_name)])
del model.graph.value_info[:]
start = time.time()
cuda_session = ort.InferenceSession(model.SerializeToString(), providers=EP_list)
end = time.time()
duration = (end - start) * 1000
if DEBUG:
print("Initialize Session cost {} ms\n".format(duration))
inputs = cuda_session.get_inputs()
input_dict = {}
for input in inputs:
shape = []
for s in input.shape:
if isinstance(s, int):
shape.append(s)
else:
print('[ERROR]: input shape must be an integer but got {}'.format(type(s)))
exit()
for idx in range(len(shape)):
if shape[idx] is None:
shape[idx] = 1
input_data = np.random.random(shape)
if input.type.find('int') > 0:
input_data = input_data*10
input_data = input_data.astype(INPUT_TYPE_TO_NP_TYPE_MAP[input.type])
input_dict[input.name] = input_data
outputs = cuda_session.run(output_name_list, input_dict)
for idx, output_name in enumerate(output_name_list):
if DEBUG:
print('tensor {} ort infered shape is {}'.format(output_name, outputs[idx].shape))
infered_shape = list(outputs[idx].shape)
new_tmp_value_info = helper.make_tensor_value_info(output_name, NP_TYPE_TO_ONNX_TYPE_MAP[outputs[idx].dtype], infered_shape)
value_info_dict[output_name] = new_tmp_value_info
del cuda_session
del model.graph.value_info[:]
model.graph.value_info.extend(list(value_info_dict.values()))
model.graph.value_info.extend(value_info_list_back)
del model.graph.output[:]
model.graph.output.extend(output_backup)
else:
del value_info_list_back
self.model = setBatchSize(model, batchsize)
def get_node_unique_info(self, node):
unique_str = '{}-'.format(node.op_type)
for input in node.input:
if input in self.value_info_dict:
value_info = self.value_info_dict[input]
start_idx = 0
for dim in value_info.type.tensor_type.shape.dim[start_idx:]:
unique_str += '{},'.format(dim.dim_value)
unique_str = unique_str[:-1]
unique_str += '_elemType_{}_'.format(value_info.type.tensor_type.elem_type)
elif input in self.initializer_dict:
initial_tensor = self.initializer_dict[input]
start_idx = 0
for dim in initial_tensor.dims[start_idx:]:
unique_str += '{},'.format(dim)
unique_str = unique_str[:-1]
unique_str += '_elemType_{}_'.format(initial_tensor.data_type)
else:
continue
unique_str = unique_str[:-1]
for attr in node.attribute:
unique_str += '_{}_'.format(attr.name)
attr_type = Attribute_TYPE_MAP[attr.type]
value = getattr(attr, attr_type)
if attr.type in [1, 2, 3]:
unique_str += '{}'.format(value)
elif attr.type in [6, 7, 8]:
for v in value:
unique_str += '{},'.format(v)
unique_str = unique_str[:-1]
return unique_str
def modify_FP32_to_FP16(self, graph):
for input in graph.input:
if input.type.tensor_type.elem_type == 1:
input.type.tensor_type.elem_type = 10
for output in graph.output:
if output.type.tensor_type.elem_type == 1:
output.type.tensor_type.elem_type = 10
new_initializer = []
for initializer in graph.initializer:
if initializer.data_type == 1:
new_initializer.append(fp16.convert_tensor_float_to_float16(initializer))
else:
new_initializer.append(initializer)
del graph.initializer[:]
graph.initializer.extend(new_initializer)
return graph
def extractNodeAndSave(self, save_path):
self.correct_value_info_by_ort()
self.RefreshState()
model = self.model
model_list = []
node_list = []
for node in model.graph.node:
if node.op_type in self.filter_op_type:
continue
node_list.append(node)
for idx, node in enumerate(node_list):
if DEBUG:
print(node.name, node.op_type)
unique_str = self.get_node_unique_info(node=node)
if unique_str not in self.node_type_dict:
self.node_type_dict[unique_str] = [node.name]
else:
self.node_type_dict[unique_str].append(node.name)
if DEBUG:
print('Repeated node, skip...')
continue
input_list = []
output_list = []
new_node_list = [node]
new_initializer_list = []
if self.isINT8Model:
if node.op_type in self.quant_op_list:
dq_node = self.get_relate_qdq_node(node.input)
q_node = self.get_relate_qdq_node(node.output, False)
new_node_list.extend(dq_node)
new_node_list.extend(q_node)
for new_node in new_node_list:
input_list.extend(new_node.input)
output_list.extend(new_node.output)
inter_tensors = []
for output in output_list:
if output in input_list:
inter_tensors.append(output)
pure_input_list = []
pure_output_list = []
for input in input_list:
if input not in inter_tensors:
pure_input_list.append(input)
for output in output_list:
if output not in inter_tensors:
pure_output_list.append(output)
new_graph_input = []
for input in pure_input_list:
if input in self.initializer_dict:
new_initializer_list.append(self.initializer_dict[input])
else:
new_graph_input.append(input)
graph_input_proto_list = []
for graph_input in new_graph_input:
if graph_input in self.value_info_dict:
input_value_info = self.value_info_dict[graph_input]
else:
if DEBUG:
print('tensor {} not in self.value_info_dict'.format(graph_input))
for input in model.graph.input:
if input.name == graph_input:
input_value_info = input
if DEBUG:
print('get tensor {} from model.graph.input'.format(graph_input))
graph_input_proto_list.append(input_value_info)
graph_output_proto_list = []
for graph_output in pure_output_list:
if graph_output in self.value_info_dict:
output_value_info = self.value_info_dict[graph_output]
else:
if DEBUG:
print('tensor {} not in self.value_info_dict'.format(graph_output))
for output in model.graph.output:
if output.name == graph_output:
output_value_info = output
if DEBUG:
print('get tensor {} from model.graph.output'.format(graph_output))
graph_output_proto_list.append(output_value_info)
if node.op_type in ['Reshape', 'Expand']:
output_shape = [dim.dim_value for dim in graph_output_proto_list[0].type.tensor_type.shape.dim]
Reshape_input_shape = helper.make_tensor(node.input[1], TensorProto.INT64, [len(output_shape)], np.array(output_shape))
if len(graph_input_proto_list) == 2 and node.op_type == 'Reshape':
new_initializer_list.append(Reshape_input_shape)
remove_input_idx = 0
for input_idx in range(len(graph_input_proto_list)):
if graph_input_proto_list[input_idx].name == node.input[1]:
remove_input_idx = input_idx
break
del graph_input_proto_list[remove_input_idx]
elif node.op_type == 'Expand':
new_initializer_list.append(Reshape_input_shape)
remove_input_idx = 0
for input_idx in range(len(graph_input_proto_list)):
if graph_input_proto_list[input_idx].name == node.input[0]:
remove_input_idx = input_idx
break
del graph_input_proto_list[remove_input_idx]
if new_node_list[0].op_type == 'Cast':
for input, output in zip(graph_input_proto_list, graph_output_proto_list):
assert len(input.type.tensor_type.shape.dim) == len(output.type.tensor_type.shape.dim)
for idx in range(len(input.type.tensor_type.shape.dim)):
if input.type.tensor_type.shape.dim[idx].dim_value != output.type.tensor_type.shape.dim[idx].dim_value:
input.type.tensor_type.shape.dim[idx].dim_value = output.type.tensor_type.shape.dim[idx].dim_value
single_node_graph = helper.make_graph(
new_node_list,
'{}_single'.format(node.name),
graph_input_proto_list,
graph_output_proto_list,
new_initializer_list
)
if len(new_node_list) == 1 and new_node_list[0].op_type != 'Cast':
single_node_graph = self.modify_FP32_to_FP16(single_node_graph)
if node.op_type in self.model_op_list:
single_node_model = helper.make_model(
single_node_graph,
producer_name='model-single_op_model',
opset_imports=[helper.make_opsetid("", 13)]
)
else:
single_node_model = helper.make_model(
single_node_graph,
producer_name='model-single_op_model',
opset_imports=[helper.make_opsetid("", 13)]
)
single_node_model.ir_version = model.ir_version
try:
single_node_model = shape_inference.infer_shapes(single_node_model)
except:
print('single node model infer shape error... save original model')
new_path = '{}/{}/{}_{}.onnx'.format(save_path, node.op_type, node.op_type, idx)
if not os.path.exists(os.path.split(new_path)[0]):
os.makedirs(os.path.split(new_path)[0])
onnx.save_model(single_node_model, new_path)
model_name = os.path.basename(self.modelPath).split('.')[0]
model_info = {"name": model_name, "path": new_path, "node_detail": unique_str}
model_list.append(model_info)
return model_list
if __name__ == '__main__':
modelPath = 'models/modelzoo1.0/detection/ox_yolov4_dy/ox_yolov4_dy_int8.onnx'
extractNode = node_utils(modelPath, 1)
save_path = './yolov4_node_model'
modelPathList = extractNode.extractNodeAndSave(save_path)
for nodeModelPath in modelPathList:
print(nodeModelPath['name'], nodeModelPath['path'])
# Custom Op ONNXRuntime
## Purpose
Add a custom operator implementation and register it with ONNX Runtime.
## Environment Setup
### Docker (Option 1)
Pull the image:
```plaintext
docker pull image.sourcefind.cn:5000/dcu/admin/base/custom:2.4.1-ubuntu22.04-dtk25.04-py3.10-fixpy-onnx1.19.2
```
Create and start the container:
```plaintext
docker run --shm-size 16g --network=host --name=test --privileged --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -v $PWD/customop_onnxruntime:/home/customop -it <Your Image ID> /bin/bash
```
### Dockerfile (Option 2)
```
cd ./docker
docker build --no-cache -t customop:test .
docker run --shm-size 16g --network=host --name=video_ort --privileged --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -v $PWD/customop_onnxruntime:/home/customop -it <Your Image ID> /bin/bash
```
## Usage
### Build the project
```
git clone http://developer.sourcefind.cn/codes/modelzoo/customop_onnxruntime.git
cd customop_onnxruntime
python model.py // generate the model
bash compile.sh // build the custom operator library
```
### Run the example
1. Directory layout
```
rocm_custom_op
├── compile.sh // build script
├── custom_op_library.cc // registers the custom operators
├── custom_op_library.h
├── docker
├── include
├── model.py // creates the custom model
├── readme.md
├── rocm_ops.cc // invokes the custom operators
├── rocm_ops.h
├── rocm_ops.hip // custom operator implementations
└── benchmark.py // tests the operators
```
2. Run
```
python test.py // test the model
```
3. Notes
```
1. model.py defines an add.onnx model. To define a different model, modify that file (a minimal sketch is given below).
2. In compile.sh, replace gfx906 in the option --offload-arch=gfx906 with the ROCm architecture of your machine (check with rocminfo | grep gfx).
3. To test a different custom operator, modify the following:
3.1 Modify rocm_ops.hip to re-implement the custom operator.
3.2 Modify rocm_ops.cc to change how the custom operator is invoked.
3.3 Modify model.py to change how the custom model is created.
3.4 Modify benchmark.py to change the input data used when running the model.
```
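As referenced in note 1, here is a minimal sketch of how a single-operator model such as add.onnx can be built with the onnx helper API. It assumes one Add node in the default domain with fixed 1x4 inputs; the actual model.py in this repository may differ.
```python
import onnx
from onnx import helper, TensorProto

# declare the graph inputs/outputs
A = helper.make_tensor_value_info('A', TensorProto.FLOAT, [1, 4])
B = helper.make_tensor_value_info('B', TensorProto.FLOAT, [1, 4])
C = helper.make_tensor_value_info('C', TensorProto.FLOAT, [1, 4])

# one Add node in the default ("") domain, which is where the custom ops are registered
node = helper.make_node('Add', inputs=['A', 'B'], outputs=['C'])
graph = helper.make_graph([node], 'add_graph', [A, B], [C])
model = helper.make_model(graph, opset_imports=[helper.make_opsetid('', 13)])

onnx.checker.check_model(model)
onnx.save(model, 'add.onnx')
```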
## Result
### Accuracy
## Application Scenarios
### Function
Add ONNX Runtime custom operators.
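A minimal Python sketch of how the built library is loaded into an ONNX Runtime session. The library path and the input names A/B follow the add.onnx sketch above and are assumptions; benchmark.py shows the full flow used in this repository.
```python
import numpy as np
import onnxruntime as ort

so = ort.SessionOptions()
# point this at the libcustom_op_library.so produced by compile.sh
so.register_custom_ops_library("./libcustom_op_library.so")

sess = ort.InferenceSession("add.onnx", sess_options=so,
                            providers=["ROCMExecutionProvider"])
x = np.random.rand(1, 4).astype(np.float32)
print(sess.run(None, {"A": x, "B": x})[0])
```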
## Source Repository and Issue Feedback
- http://developer.sourcefind.cn/codes/modelzoo/customop_onnxruntime.git
## References
- https://github.com/microsoft/onnxruntime
- https://onnxruntime.ai/docs/extensions/add-op.html
\ No newline at end of file
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#ifdef USE_ROCM
#define ORT_API_MANUAL_INIT
#include "onnxruntime_cxx_api.h"
#undef ORT_API_MANUAL_INIT
#include "core/providers/rocm/rocm_context.h"
#include "onnxruntime_lite_custom_op.h"
//Concat
void rocm_concat(int axis, int M1, int N1, const float* X1, int M2, int N2, const float* X2, float* Z, hipStream_t stream);
//Gemm
void rocm_gemm(bool transA, bool transB, int M, int N, int K, float alpha, const float* A, const float* B, float beta, float* C, hipStream_t stream);
extern "C"{
//LeakyRelu
void rocm_leaky_relu(
int64_t size,
const float* d_X,
float* d_Y,
float alpha,
hipStream_t stream);
//Attention
void rocm_attention(int B, int S, int H,
const float* Q, const float* K, const float* V,
float* Out, hipStream_t stream);
//BatchNormalization
void rocm_batch_norm(int64_t N, int64_t C, int64_t H, int64_t W,
const float* X,
const float* gamma,
const float* beta,
const float* mean,
const float* var,
float epsilon,
float* Y,
hipStream_t stream);
//Cast
void rocm_cast(
int64_t N, // batch size
int64_t C, // channels (or the first non-batch dimension)
int64_t H, // height (or the second dimension)
int64_t W, // width (or the third dimension)
const float* X, // input pointer
int32_t* Y, // output pointer
hipStream_t stream);
//Softmax
void rocm_softmax(int64_t M, int64_t N,
const float* X, float* Y,
hipStream_t compute_stream);
//Celu
void rocm_celu(int64_t, const float*, float*, float, hipStream_t);
//Relu
void rocm_relu(
int64_t size,
const float* X,
float* Y,
hipStream_t stream
);
// Conv
void rocm_conv2d(const float* input,
const float* weight,
const float* bias,
float* output,
int N, int C_in, int H, int W,
int C_out, int K_h, int K_w,
int out_H, int out_W,
hipStream_t stream);
}
using namespace Ort::Custom;
#define CUSTOM_ENFORCE(cond, msg) \
if (!(cond)) { \
throw std::runtime_error(msg); \
}
namespace Rocm {
void rocm_leaky_relu_forward(
const RocmContext& ctx,
const Tensor<float>& X,
Tensor<float>& Y) {
CUSTOM_ENFORCE(ctx.hip_stream, "No HIP stream available");
int64_t size = X.NumberOfElement();
const float alpha = 0.01f;
auto* y_ptr = Y.Allocate(X.Shape());
rocm_leaky_relu(size, X.Data(), y_ptr, alpha, ctx.hip_stream);
}
void rocm_relu_forward(
const Ort::Custom::RocmContext& rocm_ctx,
const Ort::Custom::Tensor<float>& X,
Ort::Custom::Tensor<float>& Y
) {
CUSTOM_ENFORCE(rocm_ctx.hip_stream, "failed to fetch hip stream");
auto input_shape = X.Shape();
int64_t size = X.NumberOfElement();
auto* y_data = Y.Allocate(input_shape);
rocm_relu(size, X.Data(), y_data, rocm_ctx.hip_stream);
}
void rocm_celu_forward(const Ort::Custom::RocmContext& ctx,
const Ort::Custom::Tensor<float>& X,
Ort::Custom::Tensor<float>& Y) {
CUSTOM_ENFORCE(ctx.hip_stream, "failed to fetch hip stream");
auto shape = X.Shape();
int64_t size = X.NumberOfElement();
float alpha = 1.0f; // or fetch from attribute
auto* y_ptr = Y.Allocate(shape);
rocm_celu(size, X.Data(), y_ptr, alpha, ctx.hip_stream);
}
/* softmax */
void KernelSoftmax(const Ort::Custom::RocmContext& rocm_ctx,
const Ort::Custom::Tensor<float>& X,
Ort::Custom::Tensor<float>& Z) {
auto input_shape = X.Shape();
CUSTOM_ENFORCE(rocm_ctx.hip_stream, "failed to fetch hip stream");
// Expecting 2D input: [M, N]
CUSTOM_ENFORCE(input_shape.size() == 2, "Softmax only supports 2D input");
int64_t M = static_cast<int64_t>(input_shape[0]);
int64_t N = static_cast<int64_t>(input_shape[1]);
auto z_raw = Z.Allocate(input_shape);
// Call ROCm implementation
rocm_softmax(M, N, X.Data(), z_raw, rocm_ctx.hip_stream);
}
void rocm_cast_forward(
const Ort::Custom::RocmContext& rocm_ctx,
const Ort::Custom::Tensor<float>& X,
Ort::Custom::Tensor<int32_t>& Y) {
CUSTOM_ENFORCE(rocm_ctx.hip_stream, "failed to fetch hip stream");
// assumes only 4D tensors [N,C,H,W] are supported
auto shape = X.Shape();
CUSTOM_ENFORCE(shape.size() == 4, "Cast only supports 4D tensor [N,C,H,W]");
int64_t N = shape[0];
int64_t C = shape[1];
int64_t H = shape[2];
int64_t W = shape[3];
// allocate the output
auto* y_ptr = Y.Allocate({N, C, H, W});
// call the ROCm kernel with all 7 arguments
rocm_cast(
N, C, H, W,
X.Data(),
y_ptr,
rocm_ctx.hip_stream);
}
// BatchNormalization
void rocm_batchnorm_forward(const Ort::Custom::RocmContext& rocm_ctx,
const Ort::Custom::Tensor<float>& X,
const Ort::Custom::Tensor<float>& scale,
const Ort::Custom::Tensor<float>& B,
const Ort::Custom::Tensor<float>& mean,
const Ort::Custom::Tensor<float>& var,
Ort::Custom::Tensor<float>& Y) {
CUSTOM_ENFORCE(rocm_ctx.hip_stream, "failed to fetch hip stream");
auto shape = X.Shape(); // expects [N, C, H, W]
CUSTOM_ENFORCE(shape.size() == 4, "Input must be 4D tensor [N, C, H, W]");
int64_t N = shape[0];
int64_t C = shape[1];
int64_t H = shape[2];
int64_t W = shape[3];
// Allocate output
auto* y_ptr = Y.Allocate({N, C, H, W});
// Epsilon attribute: retrieve via custom API or hardcode default
float epsilon = 1e-5f;
// If epsilon comes from attribute, fetch it here.
rocm_batch_norm(N, C, H, W,
X.Data(), scale.Data(), B.Data(), mean.Data(), var.Data(),
epsilon, y_ptr, rocm_ctx.hip_stream);
}
// attention
void rocm_attention_forward(const Ort::Custom::RocmContext& rocm_ctx,
const Ort::Custom::Tensor<float>& Q,
const Ort::Custom::Tensor<float>& K,
const Ort::Custom::Tensor<float>& V,
Ort::Custom::Tensor<float>& Out) {
CUSTOM_ENFORCE(rocm_ctx.hip_stream, "failed to fetch hip stream");
auto shape = Q.Shape(); // expected to be [B, S, H]
CUSTOM_ENFORCE(shape.size() == 3, "Input must be 3D tensor [B, S, H]");
int B = shape[0];
int S = shape[1];
int H = shape[2];
auto* out_ptr = Out.Allocate({B, S, H});
rocm_attention(B, S, H, Q.Data(), K.Data(), V.Data(), out_ptr, rocm_ctx.hip_stream);
}
// -------------------------------
// Concat
// -------------------------------
void rocm_concat_forward(const Ort::Custom::RocmContext& rocm_ctx,
const Ort::Custom::Tensor<float>& X1,
const Ort::Custom::Tensor<float>& X2,
Ort::Custom::Tensor<float>& Y) {
CUSTOM_ENFORCE(rocm_ctx.hip_stream, "failed to fetch hip stream");
auto shape1 = X1.Shape();
auto shape2 = X2.Shape();
// supports concatenating 2D tensors along columns (axis = 1)
CUSTOM_ENFORCE(shape1.size() == 2 && shape2.size() == 2, "Inputs must be 2D tensors.");
CUSTOM_ENFORCE(shape1[0] == shape2[0], "Row dimensions must match for concat on axis 1.");
int M1 = shape1[0], N1 = shape1[1];
int M2 = shape2[0], N2 = shape2[1];
auto* y_data = Y.Allocate({M1, N1 + N2}); // output is the concatenated matrix
rocm_concat(1, M1, N1, X1.Data(), M2, N2, X2.Data(), y_data, rocm_ctx.hip_stream);
}
/******conv******/
void rocm_conv_forward(const RocmContext& ctx,
const Tensor<float>& input,
const Tensor<float>& weight,
const Tensor<float>& bias,
Tensor<float>& output) {
CUSTOM_ENFORCE(ctx.hip_stream, "no HIP stream");
const auto& input_shape = input.Shape(); // [N, C_in, H, W]
const auto& weight_shape = weight.Shape(); // [C_out, C_in, K_h, K_w]
const int64_t N = input_shape[0];
const int64_t C_in = input_shape[1];
const int64_t H = input_shape[2];
const int64_t W = input_shape[3];
const int64_t C_out = weight_shape[0];
const int64_t K_h = weight_shape[2];
const int64_t K_w = weight_shape[3];
const int64_t out_H = (H - K_h) / 1 + 1;
const int64_t out_W = (W - K_w) / 1 + 1;
auto* y_ptr = output.Allocate({N, C_out, out_H, out_W});
rocm_conv2d(input.Data(), weight.Data(), bias.Data(), y_ptr,
N, C_in, H, W, C_out, K_h, K_w, out_H, out_W,
ctx.hip_stream);
}
// -------------------------------
// Gemm
// -------------------------------
void rocm_gemm_forward(const Ort::Custom::RocmContext& rocm_ctx,
const Ort::Custom::Tensor<float>& A,
const Ort::Custom::Tensor<float>& B,
const Ort::Custom::Tensor<float>& C,
Ort::Custom::Tensor<float>& Y) {
CUSTOM_ENFORCE(rocm_ctx.hip_stream, "failed to fetch hip stream");
auto shapeA = A.Shape();
auto shapeB = B.Shape();
auto shapeC = C.Shape();
CUSTOM_ENFORCE(shapeA.size() == 2 && shapeB.size() == 2 && shapeC.size() == 2, "Inputs must be 2D tensors.");
int M = shapeA[0];
int K = shapeA[1];
int N = shapeB[1];
CUSTOM_ENFORCE(shapeB[0] == K, "Inner dimension mismatch between A and B in Gemm.");
CUSTOM_ENFORCE(shapeC[0] == M && shapeC[1] == N, "Output tensor shape mismatch in Gemm.");
auto* y_data = Y.Allocate({M, N});
rocm_gemm(false, false, M, N, K, 1.0f, A.Data(), B.Data(), 1.0f, y_data, rocm_ctx.hip_stream);
}
void RegisterOps(Ort::CustomOpDomain& domain) {
// register the Attention op
static const std::unique_ptr<OrtLiteCustomOp> c_CustomOpAttention{Ort::Custom::CreateLiteCustomOp("Attention", "ROCMExecutionProvider", rocm_attention_forward)};
domain.Add(c_CustomOpAttention.get());
// register the BatchNormalization op
static const std::unique_ptr<OrtLiteCustomOp> c_CustomOpBatchNorm{Ort::Custom::CreateLiteCustomOp("BatchNormalization", "ROCMExecutionProvider", rocm_batchnorm_forward)};
domain.Add(c_CustomOpBatchNorm.get());
// register the Concat op
static const std::unique_ptr<OrtLiteCustomOp> c_CustomOpConcat{Ort::Custom::CreateLiteCustomOp("Concat", "ROCMExecutionProvider", rocm_concat_forward)};
domain.Add(c_CustomOpConcat.get());
// register the Gemm op
static const std::unique_ptr<OrtLiteCustomOp> c_CustomOpGemm{Ort::Custom::CreateLiteCustomOp("Gemm", "ROCMExecutionProvider", rocm_gemm_forward)};
domain.Add(c_CustomOpGemm.get());
// register the Cast op
static const std::unique_ptr<OrtLiteCustomOp> c_CustomOpCast{Ort::Custom::CreateLiteCustomOp("Cast", "ROCMExecutionProvider", rocm_cast_forward)};
domain.Add(c_CustomOpCast.get());
// register the Softmax op
static const std::unique_ptr<OrtLiteCustomOp> c_CustomOpSoftmax{Ort::Custom::CreateLiteCustomOp("Softmax","ROCMExecutionProvider", KernelSoftmax)};
domain.Add(c_CustomOpSoftmax.get());
// register the Celu op
static const std::unique_ptr<OrtLiteCustomOp> c_CeluOp{Ort::Custom::CreateLiteCustomOp("Celu", "ROCMExecutionProvider", rocm_celu_forward)};
domain.Add(c_CeluOp.get());
// register the Relu op
static const std::unique_ptr<OrtLiteCustomOp> c_CustomOpRelu{
Ort::Custom::CreateLiteCustomOp("Relu", "ROCMExecutionProvider", rocm_relu_forward)};
domain.Add(c_CustomOpRelu.get());
// register the LeakyRelu op
static const std::unique_ptr<OrtLiteCustomOp> c_LeakyReLU{
Ort::Custom::CreateLiteCustomOp(
"LeakyRelu", "ROCMExecutionProvider", rocm_leaky_relu_forward)};
domain.Add(c_LeakyReLU.get());
// register the Conv op
static const std::unique_ptr<OrtLiteCustomOp> c_Conv{
Ort::Custom::CreateLiteCustomOp("Conv", "ROCMExecutionProvider", rocm_conv_forward)};
domain.Add(c_Conv.get());
}
} // namespace Rocm
#endif
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
namespace Rocm {
#ifdef USE_ROCM
void RegisterOps(Ort::CustomOpDomain& domain);
#else
inline void RegisterOps(Ort::CustomOpDomain&) {}
#endif
} // namespace Rocm
#include "hip/hip_runtime.h"
#include <stdio.h>
#include <math.h>
__global__ void _Add(long long sz, float* Z, const float* X, const float* Y) {
long long offset = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
if (offset < sz) {
Z[offset] = X[offset] + Y[offset];
}
return;
}
void rocm_add(int64_t sz, float* Z, const float* X, const float* Y, hipStream_t compute_stream) {
float *d_X, *d_Y, *d_Z;
// allocate device memory for the arrays and check each allocation
hipError_t err;
err = hipMalloc((void**)&d_X, sz * sizeof(float));
if (err != hipSuccess) {
fprintf(stderr, "hipMalloc for d_X failed: %s\n", hipGetErrorString(err));
return;
}
err = hipMalloc((void**)&d_Y, sz * sizeof(float));
if (err != hipSuccess) {
fprintf(stderr, "hipMalloc for d_Y failed: %s\n", hipGetErrorString(err));
hipFree(d_X);
return;
}
err = hipMalloc((void**)&d_Z, sz * sizeof(float));
if (err != hipSuccess) {
fprintf(stderr, "hipMalloc for d_Z failed: %s\n", hipGetErrorString(err));
hipFree(d_X);
hipFree(d_Y);
return;
}
// copy the host X and Y arrays to the device
err = hipMemcpyAsync(d_X, X, sz * sizeof(float), hipMemcpyHostToDevice, compute_stream);
if (err != hipSuccess) {
fprintf(stderr, "hipMemcpyAsync for d_X failed: %s\n", hipGetErrorString(err));
hipFree(d_X);
hipFree(d_Y);
hipFree(d_Z);
return;
}
err = hipMemcpyAsync(d_Y, Y, sz * sizeof(float), hipMemcpyHostToDevice, compute_stream);
if (err != hipSuccess) {
fprintf(stderr, "hipMemcpyAsync for d_Y failed: %s\n", hipGetErrorString(err));
hipFree(d_X);
hipFree(d_Y);
hipFree(d_Z);
return;
}
// launch the kernel with enough blocks to cover all elements
int blocks = static_cast<int>((sz + 255) / 256);
_Add<<<blocks, 256, 0, compute_stream>>>(static_cast<long long>(sz), d_Z, d_X, d_Y);
err = hipGetLastError();
if (err != hipSuccess) {
fprintf(stderr, "Kernel launch failed: %s\n", hipGetErrorString(err));
hipFree(d_X);
hipFree(d_Y);
hipFree(d_Z);
return;
}
// copy the result from the device back to the host
err = hipMemcpyAsync(Z, d_Z, sz * sizeof(float), hipMemcpyDeviceToHost, compute_stream);
if (err != hipSuccess) {
fprintf(stderr, "hipMemcpyAsync for Z failed: %s\n", hipGetErrorString(err));
hipFree(d_X);
hipFree(d_Y);
hipFree(d_Z);
return;
}
// synchronize the stream to make sure all operations have finished
err = hipStreamSynchronize(compute_stream);
if (err != hipSuccess) {
fprintf(stderr, "hipStreamSynchronize failed: %s\n", hipGetErrorString(err));
hipFree(d_X);
hipFree(d_Y);
hipFree(d_Z);
return;
}
// free device memory
hipFree(d_X);
hipFree(d_Y);
hipFree(d_Z);
}
// Concat
__global__ void _Concat2D(int axis,
int M1, int N1, const float* X1,
int M2, int N2, const float* X2,
float* Z) {
int row = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
int col = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
if (axis == 0) { // concatenate along rows
if (row < M1 && col < N1) {
Z[row * N1 + col] = X1[row * N1 + col];
} else if (row >= M1 && row < M1 + M2 && col < N2) {
Z[row * N2 + col] = X2[(row - M1) * N2 + col];
}
} else if (axis == 1) { // concatenate along columns
if (row < M1 && col < N1) {
Z[row * (N1 + N2) + col] = X1[row * N1 + col];
} else if (row < M2 && col >= N1 && col < N1 + N2) {
Z[row * (N1 + N2) + col] = X2[row * N2 + (col - N1)];
}
}
return;
}
void rocm_concat(int axis,
int M1, int N1, const float* X1,
int M2, int N2, const float* X2,
float* Z,
hipStream_t compute_stream) {
dim3 blockDim(16, 16);
dim3 gridDim(((axis == 0 ? N1 : N1 + N2) + 15) / 16, ((axis == 0 ? M1 + M2 : M1) + 15) / 16);
float *d_X1, *d_X2, *d_Z;
hipError_t err;
size_t size1 = M1 * N1 * sizeof(float);
size_t size2 = M2 * N2 * sizeof(float);
size_t sizeZ = (axis == 0 ? (M1 + M2) * N1 : M1 * (N1 + N2)) * sizeof(float);
// allocate device memory
err = hipMalloc(&d_X1, size1); if (err != hipSuccess) { return; }
err = hipMalloc(&d_X2, size2); if (err != hipSuccess) { hipFree(d_X1); return; }
err = hipMalloc(&d_Z, sizeZ); if (err != hipSuccess) { hipFree(d_X1); hipFree(d_X2); return; }
// copy the inputs to the device
hipMemcpyAsync(d_X1, X1, size1, hipMemcpyHostToDevice, compute_stream);
hipMemcpyAsync(d_X2, X2, size2, hipMemcpyHostToDevice, compute_stream);
// launch the kernel
_Concat2D<<<gridDim, blockDim, 0, compute_stream>>>(axis, M1, N1, d_X1, M2, N2, d_X2, d_Z);
// copy the result back to the host
hipMemcpyAsync(Z, d_Z, sizeZ, hipMemcpyDeviceToHost, compute_stream);
// synchronize the stream
hipStreamSynchronize(compute_stream);
// free resources
hipFree(d_X1); hipFree(d_X2); hipFree(d_Z);
return;
}
//gemm
#include <hip/hip_runtime.h>
__global__ void _Gemm(bool transA, bool transB,
int M, int N, int K,
float alpha,
const float* A,
const float* B,
float beta,
float* C) {
int row = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
int col = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
if (row >= M || col >= N) return;
float sum = 0.0f;
for (int k = 0; k < K; ++k) {
float a = transA ? A[k * M + row] : A[row * K + k];
float b = transB ? B[col * K + k] : B[k * N + col];
sum += a * b;
}
C[row * N + col] = alpha * sum + beta * C[row * N + col];
return;
}
void rocm_gemm(bool transA, bool transB,
int M, int N, int K,
float alpha,
const float* A,
const float* B,
float beta,
float* C,
hipStream_t compute_stream) {
dim3 blockDim(16, 16);
dim3 gridDim((N + 15) / 16, (M + 15) / 16);
float *d_A = nullptr, *d_B = nullptr, *d_C = nullptr;
hipError_t err;
size_t sizeA = transA ? K * M * sizeof(float) : M * K * sizeof(float);
size_t sizeB = transB ? N * K * sizeof(float) : K * N * sizeof(float);
size_t sizeC = M * N * sizeof(float);
// allocate device memory; on failure jump to the error label, which frees any non-null pointers
err = hipMalloc(&d_A, sizeA); if (err != hipSuccess) { goto error; }
err = hipMalloc(&d_B, sizeB); if (err != hipSuccess) { goto error; }
err = hipMalloc(&d_C, sizeC); if (err != hipSuccess) { goto error; }
// host -> device copies
hipMemcpyAsync(d_A, A, sizeA, hipMemcpyHostToDevice, compute_stream);
hipMemcpyAsync(d_B, B, sizeB, hipMemcpyHostToDevice, compute_stream);
hipMemcpyAsync(d_C, C, sizeC, hipMemcpyHostToDevice, compute_stream);
// launch the kernel
_Gemm<<<gridDim, blockDim, 0, compute_stream>>>(transA, transB, M, N, K, alpha, d_A, d_B, beta, d_C);
// device -> host copy
hipMemcpyAsync(C, d_C, sizeC, hipMemcpyDeviceToHost, compute_stream);
// synchronize the stream
hipStreamSynchronize(compute_stream);
// free resources
hipFree(d_A);
hipFree(d_B);
hipFree(d_C);
return;
error:
fprintf(stderr, "HIP memory allocation or memcpy failed in rocm_gemm\n");
if (d_A) hipFree(d_A);
if (d_B) hipFree(d_B);
if (d_C) hipFree(d_C);
}
//GroupNormalization
#include <math.h>
__global__ void _GroupNorm(
int64_t N, int64_t C, int64_t H, int64_t W, int64_t G,
float eps, const float* X, float* Y,
const float* gamma, const float* beta
) {
// compute the current group and sample indices
int64_t group_idx = hipBlockIdx_x;
int64_t n = hipBlockIdx_y;
int64_t channels_per_group = C / G;
int64_t c_start = group_idx * channels_per_group;
int64_t c_end = c_start + channels_per_group;
// total number of elements in the group
int64_t group_size = channels_per_group * H * W;
// shared memory for the reduction sums
__shared__ float shared_sum[256];
__shared__ float shared_sum_sq[256];
// each thread accumulates a partial sum and sum of squares
float sum = 0.0f, sum_sq = 0.0f;
for (int64_t idx = hipThreadIdx_x; idx < group_size; idx += hipBlockDim_x) {
int64_t c = c_start + idx / (H * W);
int64_t hw = idx % (H * W);
int64_t h = hw / W;
int64_t w = hw % W;
int64_t linear_idx = n * C * H * W + c * H * W + h * W + w;
float val = X[linear_idx];
sum += val;
sum_sq += val * val;
}
shared_sum[hipThreadIdx_x] = sum;
shared_sum_sq[hipThreadIdx_x] = sum_sq;
__syncthreads();
// tree reduction to get the global sums
for (int s = hipBlockDim_x / 2; s > 0; s >>= 1) {
if (hipThreadIdx_x < s) {
shared_sum[hipThreadIdx_x] += shared_sum[hipThreadIdx_x + s];
shared_sum_sq[hipThreadIdx_x] += shared_sum_sq[hipThreadIdx_x + s];
}
__syncthreads();
}
// compute mean and variance
float mean = shared_sum[0] / group_size;
float var = shared_sum_sq[0] / group_size - mean * mean;
// normalize and apply the affine transform
for (int64_t idx = hipThreadIdx_x; idx < group_size; idx += hipBlockDim_x) {
int64_t c = c_start + idx / (H * W);
int64_t hw = idx % (H * W);
int64_t h = hw / W;
int64_t w = hw % W;
int64_t linear_idx = n * C * H * W + c * H * W + h * W + w;
float val = (X[linear_idx] - mean) / sqrtf(var + eps);
Y[linear_idx] = gamma[c] * val + beta[c];
}
return;
}
void rocm_group_norm(
int64_t N, int64_t C, int64_t H, int64_t W, int64_t G,
float eps, const float* X, float* Y,
const float* gamma, const float* beta,
hipStream_t compute_stream
) {
dim3 block_dim(256); // 256 threads per block
dim3 grid_dim(G, N); // one block per (group, sample) pair
// parameter validation
if (C % G != 0) {
fprintf(stderr, "Error: Channels must be divisible by groups.\n");
return;
}
// allocate device memory
float *d_X, *d_Y, *d_gamma, *d_beta;
hipError_t err;
size_t input_size = N * C * H * W * sizeof(float);
size_t param_size = C * sizeof(float);
err = hipMalloc(&d_X, input_size);
if (err != hipSuccess) { return; }
err = hipMalloc(&d_Y, input_size);
if (err != hipSuccess) { hipFree(d_X); return; }
err = hipMalloc(&d_gamma, param_size);
if (err != hipSuccess) { hipFree(d_X); hipFree(d_Y); return; }
err = hipMalloc(&d_beta, param_size);
if (err != hipSuccess) { hipFree(d_X); hipFree(d_Y); hipFree(d_gamma); return; }
// copy data to the device
hipMemcpyAsync(d_X, X, input_size, hipMemcpyHostToDevice, compute_stream);
hipMemcpyAsync(d_gamma, gamma, param_size, hipMemcpyHostToDevice, compute_stream);
hipMemcpyAsync(d_beta, beta, param_size, hipMemcpyHostToDevice, compute_stream);
// launch the kernel
_GroupNorm<<<grid_dim, block_dim, 0, compute_stream>>>(
N, C, H, W, G, eps, d_X, d_Y, d_gamma, d_beta
);
// copy the result back to the host
hipMemcpyAsync(Y, d_Y, input_size, hipMemcpyDeviceToHost, compute_stream);
// synchronize the stream and free resources
hipStreamSynchronize(compute_stream);
hipFree(d_X); hipFree(d_Y); hipFree(d_gamma); hipFree(d_beta);
}
//LogSoftmax
__global__ void _LogSoftmax(int64_t N, int64_t D, const float* X, float* Y) {
int64_t n = hipBlockIdx_x; // one thread block per sample
int tid = hipThreadIdx_x;
// shared memory for the per-block maximum and exponential sum
__shared__ float shared_max[256];
__shared__ float shared_sum[256];
// step 1: find the maximum within the sample
float max_val = -INFINITY;
for (int64_t i = tid; i < D; i += hipBlockDim_x) {
max_val = fmaxf(max_val, X[n * D + i]);
}
shared_max[tid] = max_val;
__syncthreads();
// reduce to the global maximum
for (int s = hipBlockDim_x / 2; s > 0; s >>= 1) {
if (tid < s && shared_max[tid + s] > shared_max[tid]) {
shared_max[tid] = shared_max[tid + s];
}
__syncthreads();
}
float global_max = shared_max[0];
// step 2: compute the sum of exponentials
float exp_sum = 0.0f;
for (int64_t i = tid; i < D; i += hipBlockDim_x) {
exp_sum += expf(X[n * D + i] - global_max);
}
shared_sum[tid] = exp_sum;
__syncthreads();
// reduce to the global exponential sum
for (int s = hipBlockDim_x / 2; s > 0; s >>= 1) {
if (tid < s) {
shared_sum[tid] += shared_sum[tid + s];
}
__syncthreads();
}
float global_sum = shared_sum[0];
// step 3: compute LogSoftmax
for (int64_t i = tid; i < D; i += hipBlockDim_x) {
Y[n * D + i] = (X[n * D + i] - global_max) - logf(global_sum);
}
return;
}
void rocm_log_softmax(
int64_t N, int64_t D, const float* X, float* Y, hipStream_t compute_stream
) {
dim3 block_dim(256); // 256 threads per block
dim3 grid_dim(N); // one thread block per sample
// allocate device memory
float *d_X, *d_Y;
hipError_t err;
size_t input_size = N * D * sizeof(float);
err = hipMalloc(&d_X, input_size);
if (err != hipSuccess) { return; }
err = hipMalloc(&d_Y, input_size);
if (err != hipSuccess) { hipFree(d_X); return; }
// copy data to the device
hipMemcpyAsync(d_X, X, input_size, hipMemcpyHostToDevice, compute_stream);
// launch the kernel
_LogSoftmax<<<grid_dim, block_dim, 0, compute_stream>>>(N, D, d_X, d_Y);
// copy the result back to the host
hipMemcpyAsync(Y, d_Y, input_size, hipMemcpyDeviceToHost, compute_stream);
// synchronize the stream and free resources
hipStreamSynchronize(compute_stream);
hipFree(d_X); hipFree(d_Y);
}
//attention
__global__ void _DotProductAttention(int B, int S, int H,
const float* Q, const float* K, const float* V,
float scaling,
float* output) {
int b = blockIdx.z;
int i = blockIdx.y * blockDim.y + threadIdx.y; // query index
int j = blockIdx.x * blockDim.x + threadIdx.x; // hidden dim
if (b >= B || i >= S || j >= H) return;
// compute Q·K^T[i, k]
float scores[128]; // assumes seq_len <= 128
for (int k = 0; k < S; ++k) {
float dot = 0.f;
for (int h = 0; h < H; ++h) {
dot += Q[(b * S + i) * H + h] * K[(b * S + k) * H + h];
}
scores[k] = dot / scaling;
}
// softmax over scores
float max_val = scores[0];
for (int k = 1; k < S; ++k) max_val = fmaxf(max_val, scores[k]);
float sum = 0.f;
for (int k = 0; k < S; ++k) {
scores[k] = expf(scores[k] - max_val);
sum += scores[k];
}
for (int k = 0; k < S; ++k) scores[k] /= sum;
// output = softmax * V
float result = 0.f;
for (int k = 0; k < S; ++k) {
result += scores[k] * V[(b * S + k) * H + j];
}
output[(b * S + i) * H + j] = result;
}
extern "C" void rocm_attention(int B, int S, int H,
const float* Q, const float* K, const float* V,
float* Out, hipStream_t stream) {
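    // The kernel keeps each query's score row in a fixed-size local array, so S must be <= 128.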
dim3 blockDim(16, 16);
dim3 gridDim((H + 15) / 16, (S + 15) / 16, B);
float *d_Q, *d_K, *d_V, *d_Out;
size_t size = B * S * H * sizeof(float);
hipMalloc(&d_Q, size);
hipMalloc(&d_K, size);
hipMalloc(&d_V, size);
hipMalloc(&d_Out, size);
hipMemcpyAsync(d_Q, Q, size, hipMemcpyHostToDevice, stream);
hipMemcpyAsync(d_K, K, size, hipMemcpyHostToDevice, stream);
hipMemcpyAsync(d_V, V, size, hipMemcpyHostToDevice, stream);
    float scale = sqrtf((float)H);  // softmax scaling factor sqrt(d_k)
_DotProductAttention<<<gridDim, blockDim, 0, stream>>>(B, S, H, d_Q, d_K, d_V, scale, d_Out);
hipMemcpyAsync(Out, d_Out, size, hipMemcpyDeviceToHost, stream);
hipStreamSynchronize(stream);
hipFree(d_Q); hipFree(d_K); hipFree(d_V); hipFree(d_Out);
return;
}
// BatchNormalization
__global__ void _BatchNormalization(
int N, int C, int H, int W,
const float* X,
const float* gamma,
const float* beta,
const float* mean,
const float* var,
float epsilon,
float* Y) {
// global thread index
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int total = N * C * H * W;
if (idx >= total) return;
    // Decompose the linear index into (n, c, h, w)
int w = idx % W;
int tmp = idx / W;
int h = tmp % H;
tmp = tmp / H;
int c = tmp % C;
int n = tmp / C;
    // Y = gamma[c] * (X - mean[c]) / sqrt(var[c] + eps) + beta[c]
int offset = ((n * C + c) * H + h) * W + w;
float x = X[offset];
float m = mean[c];
float v = var[c];
float inv_std = rsqrtf(v + epsilon);
Y[offset] = gamma[c] * ((x - m) * inv_std) + beta[c];
}
// Host API: rocm_batch_norm
extern "C" void rocm_batch_norm(
int64_t N, int64_t C, int64_t H, int64_t W,
const float* X,
const float* gamma,
const float* beta,
const float* mean,
const float* var,
float epsilon,
float* Y,
hipStream_t stream) {
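    // Inference-mode BatchNormalization: mean and var are the precomputed per-channel statistics.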
size_t total = (size_t)N * C * H * W;
    // Allocate device buffers and copy X, gamma, beta, mean, var to the device
float *d_X, *d_gamma, *d_beta, *d_mean, *d_var, *d_Y;
hipMalloc(&d_X, total * sizeof(float));
hipMalloc(&d_Y, total * sizeof(float));
hipMalloc(&d_gamma, C * sizeof(float));
hipMalloc(&d_beta, C * sizeof(float));
hipMalloc(&d_mean, C * sizeof(float));
hipMalloc(&d_var, C * sizeof(float));
hipMemcpyAsync(d_X, X, total * sizeof(float), hipMemcpyHostToDevice, stream);
hipMemcpyAsync(d_gamma, gamma, C * sizeof(float), hipMemcpyHostToDevice, stream);
hipMemcpyAsync(d_beta, beta, C * sizeof(float), hipMemcpyHostToDevice, stream);
hipMemcpyAsync(d_mean, mean, C * sizeof(float), hipMemcpyHostToDevice, stream);
hipMemcpyAsync(d_var, var, C * sizeof(float), hipMemcpyHostToDevice, stream);
    // Launch the kernel with a 1-D thread layout
int threads = 256;
int blocks = (total + threads - 1) / threads;
_BatchNormalization<<<blocks, threads, 0, stream>>>(
N, C, H, W,
d_X, d_gamma, d_beta, d_mean, d_var,
epsilon,
d_Y);
    // Copy the result back to the host
hipMemcpyAsync(Y, d_Y, total * sizeof(float), hipMemcpyDeviceToHost, stream);
hipStreamSynchronize(stream);
    // Free device memory
hipFree(d_X);
hipFree(d_Y);
hipFree(d_gamma);
hipFree(d_beta);
hipFree(d_mean);
hipFree(d_var);
return;
}
// Cast Operator: float to int32
// Device kernel: cast each element
__global__ void _Cast(
int total,
const float* X,
int* Y) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= total) return;
// cast float to int32
Y[idx] = static_cast<int>(X[idx]);
}
// Host API: rocm_cast
extern "C" void rocm_cast(
int64_t N, int64_t C, int64_t H, int64_t W,
const float* X,
int* Y,
hipStream_t stream) {
// total elements
size_t total = (size_t)N * C * H * W;
// allocate device memory
float* d_X;
int* d_Y;
hipMalloc(&d_X, total * sizeof(float));
hipMalloc(&d_Y, total * sizeof(int));
// copy input to device
hipMemcpyAsync(d_X, X, total * sizeof(float), hipMemcpyHostToDevice, stream);
// launch kernel
int threads = 256;
int blocks = (total + threads - 1) / threads;
_Cast<<<blocks, threads, 0, stream>>>(
total,
d_X,
d_Y);
// copy result back
hipMemcpyAsync(Y, d_Y, total * sizeof(int), hipMemcpyDeviceToHost, stream);
hipStreamSynchronize(stream);
// free device memory
hipFree(d_X);
hipFree(d_Y);
return;
}
extern "C" __global__
void SoftmaxKernel(const float* X, float* Y, int M, int N) {
// M = batch_size, N = feature_size
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= M) return;
const float* x_row = X + row * N;
float* y_row = Y + row * N;
    // 1) Find the row maximum for numerical stability
float m = x_row[0];
for (int j = 1; j < N; ++j) {
m = fmaxf(m, x_row[j]);
}
    // 2) Compute exp(x - m) and accumulate the sum
float sum = 0.f;
for (int j = 0; j < N; ++j) {
float e = expf(x_row[j] - m);
y_row[j] = e;
sum += e;
}
    // 3) Normalize
for (int j = 0; j < N; ++j) {
y_row[j] /= sum;
}
}
// Called by ONNX Runtime; this entry point replaces the original rocm_add
extern "C"
void rocm_softmax(int64_t M, int64_t N,
const float* X, float* Y,
hipStream_t stream) {
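    // X and Y are used directly by the kernel, so they are assumed to already be device pointers
    // (e.g. the buffers handed over by the ONNX Runtime custom-op kernel).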
    // One thread per row, 128 threads per block
const int threads = 128;
const int blocks = static_cast<int>((M + threads - 1) / threads);
hipLaunchKernelGGL(
SoftmaxKernel,
dim3(blocks), dim3(threads),
0, // shared mem
stream, // hip stream
X, Y, static_cast<int>(M), static_cast<int>(N)
);
return;
}
template <typename T>
__global__ void _CeluKernel(const T* X, T* Y, int64_t size, T alpha) {
int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < size) {
T v = X[idx];
T pos = v > T(0) ? v : T(0);
T neg = v <= T(0) ? alpha * (exp(v / alpha) - T(1)) : T(0);
Y[idx] = pos + neg;
}
return;
}
extern "C" void rocm_celu(int64_t size,
const float* X,
float* Y,
float alpha,
hipStream_t stream) {
float *d_X, *d_Y;
hipMalloc(&d_X, size * sizeof(float));
hipMalloc(&d_Y, size * sizeof(float));
hipMemcpyAsync(d_X, X, size * sizeof(float), hipMemcpyHostToDevice, stream);
int threads = 256;
int blocks = (size + threads - 1) / threads;
_CeluKernel<float><<<blocks, threads, 0, stream>>>(d_X, d_Y, size, alpha);
hipMemcpyAsync(Y, d_Y, size * sizeof(float), hipMemcpyDeviceToHost, stream);
hipStreamSynchronize(stream);
hipFree(d_X); hipFree(d_Y);
return;
}
//relu
template <typename T>
__global__ void _rocm_relu_kernel(float* input, float* output, int64_t size) {
int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= size) return;
output[idx] = fmaxf(0.0f, input[idx]);
}
extern "C" void rocm_relu(
int64_t size,
const float* X,
float* Y,
hipStream_t stream
) {
size_t input_size = size * sizeof(float);
float *d_X, *d_Y;
hipMalloc(&d_X, input_size);
hipMalloc(&d_Y, input_size);
hipMemcpyAsync(d_X, X, input_size, hipMemcpyHostToDevice, stream);
int threads = 256;
int blocks = (size + threads - 1) / threads;
_rocm_relu_kernel<float><<<blocks, threads, 0, stream>>>(d_X, d_Y, size);
hipMemcpyAsync(Y, d_Y, input_size, hipMemcpyDeviceToHost, stream);
hipStreamSynchronize(stream);
hipFree(d_X);
hipFree(d_Y);
return;
}
// -------------------------------
// TopK
// -------------------------------
extern "C" __global__
void TopKKernel(
const float* __restrict__ X, // [M * N]
float* __restrict__ values, // [M * K]
int64_t* __restrict__ indices, // [M * K]
int M,
int N,
int K
) {
    // One block per row; shared memory holds that row's running top-K values and indices.
    int row = blockIdx.x;
    if (row >= M) return;
    const float* x_row = X + size_t(row) * N;
    float* v_row = values + size_t(row) * K;
    int64_t* i_row = indices + size_t(row) * K;
    // Dynamic shared memory layout: K floats for the top-K values, followed by K ints for their indices
    extern __shared__ char smem[];
    float* shared_vals = (float*)smem;
    int* shared_idx = (int*)(smem + K * sizeof(float));
    // Initialize: shared_vals = -INF, shared_idx = -1
    for (int t = threadIdx.x; t < K; t += blockDim.x) {
        shared_vals[t] = -INFINITY;
        shared_idx[t] = -1;
    }
    __syncthreads();
    // Thread 0 scans the whole row, maintaining the K largest values seen so far
    // (a linear scan for the current minimum instead of a heap), then sorts them in place.
    if (threadIdx.x == 0) {
        for (int j = 0; j < N; ++j) {
            float v = x_row[j];
            // find the position of the current minimum
            float min_val = shared_vals[0];
            int min_pos = 0;
            for (int t = 1; t < K; ++t) {
                if (shared_vals[t] < min_val) {
                    min_val = shared_vals[t];
                    min_pos = t;
                }
            }
            // replace the minimum if the new value is larger
            if (v > min_val) {
                shared_vals[min_pos] = v;
                shared_idx[min_pos] = j;
            }
        }
        // Simple descending sort of the K elements; K is usually small
        for (int i = 0; i < K; ++i) {
            for (int j = i + 1; j < K; ++j) {
                if (shared_vals[j] > shared_vals[i]) {
                    // swap value
                    float tv = shared_vals[i];
                    shared_vals[i] = shared_vals[j];
                    shared_vals[j] = tv;
                    // swap idx
                    int ti = shared_idx[i];
                    shared_idx[i] = shared_idx[j];
                    shared_idx[j] = ti;
                }
            }
        }
    }
    __syncthreads();
    // Write the results back to global memory
    for (int t = threadIdx.x; t < K; t += blockDim.x) {
        v_row[t] = shared_vals[t];
        i_row[t] = (int64_t)shared_idx[t];
    }
}
extern "C"
void rocm_topk(
int64_t M,
int64_t N,
int64_t K,
const float* X,
float* values,
int64_t* indices,
hipStream_t stream
) {
    // Allocate device memory (pointers start as nullptr so the error path only frees what was allocated)
    size_t sizeX = size_t(M) * N * sizeof(float);
    size_t sizeOutVal = size_t(M) * K * sizeof(float);
    size_t sizeOutIdx = size_t(M) * K * sizeof(int64_t);
    float* d_X = nullptr;
    float* d_vals = nullptr;
    int64_t* d_idx = nullptr;
if (hipMalloc(&d_X, sizeX ) != hipSuccess ||
hipMalloc(&d_vals,sizeOutVal) != hipSuccess ||
hipMalloc(&d_idx, sizeOutIdx) != hipSuccess) {
fprintf(stderr, "HIP malloc failed in rocm_topk\n");
if (d_X) hipFree(d_X);
if (d_vals) hipFree(d_vals);
if (d_idx) hipFree(d_idx);
return;
}
    // Copy the input to the device
    hipMemcpyAsync(d_X, X, sizeX, hipMemcpyHostToDevice, stream);
    // Launch the kernel: one block per row; dynamic shared memory = K * (sizeof(float) + sizeof(int))
    dim3 blockDim(128);
    dim3 gridDim(M);
    size_t shared_bytes = K * (sizeof(float) + sizeof(int));
hipLaunchKernelGGL(
TopKKernel,
gridDim, blockDim, shared_bytes, stream,
d_X, d_vals, d_idx,
int(M), int(N), int(K)
);
    // Copy results back to the host
hipMemcpyAsync(values, d_vals, sizeOutVal, hipMemcpyDeviceToHost, stream);
hipMemcpyAsync(indices, d_idx, sizeOutIdx, hipMemcpyDeviceToHost, stream);
hipStreamSynchronize(stream);
    // Free device memory
hipFree(d_X);
hipFree(d_vals);
hipFree(d_idx);
}
#include "hip/hip_runtime.h"
#include <stdio.h>
#include <math.h>
__global__ void _Add(long long sz, float* Z, const float* X, const float* Y) {
long long offset = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
if (offset < sz) {
Z[offset] = X[offset] + Y[offset];
}
return;
}
void rocm_add(int64_t sz, float* Z, const float* X, const float* Y, hipStream_t compute_stream) {
float *d_X, *d_Y, *d_Z;
    // Allocate device buffers and check each allocation
hipError_t err;
err = hipMalloc((void**)&d_X, sz * sizeof(float));
if (err != hipSuccess) {
fprintf(stderr, "hipMalloc for d_X failed: %s\n", hipGetErrorString(err));
return;
}
err = hipMalloc((void**)&d_Y, sz * sizeof(float));
if (err != hipSuccess) {
fprintf(stderr, "hipMalloc for d_Y failed: %s\n", hipGetErrorString(err));
hipFree(d_X);
return;
}
err = hipMalloc((void**)&d_Z, sz * sizeof(float));
if (err != hipSuccess) {
fprintf(stderr, "hipMalloc for d_Z failed: %s\n", hipGetErrorString(err));
hipFree(d_X);
hipFree(d_Y);
return;
}
    // Copy the host arrays X and Y to the device
err = hipMemcpyAsync(d_X, X, sz * sizeof(float), hipMemcpyHostToDevice, compute_stream);
if (err != hipSuccess) {
fprintf(stderr, "hipMemcpyAsync for d_X failed: %s\n", hipGetErrorString(err));
hipFree(d_X);
hipFree(d_Y);
hipFree(d_Z);
return;
}
err = hipMemcpyAsync(d_Y, Y, sz * sizeof(float), hipMemcpyHostToDevice, compute_stream);
if (err != hipSuccess) {
fprintf(stderr, "hipMemcpyAsync for d_Y failed: %s\n", hipGetErrorString(err));
hipFree(d_X);
hipFree(d_Y);
hipFree(d_Z);
return;
}
    // Launch the kernel with enough blocks to cover all sz elements
    int blocks = static_cast<int>((sz + 255) / 256);
    _Add<<<blocks, 256, 0, compute_stream>>>(static_cast<long long>(sz), d_Z, d_X, d_Y);
err = hipGetLastError();
if (err != hipSuccess) {
fprintf(stderr, "Kernel launch failed: %s\n", hipGetErrorString(err));
hipFree(d_X);
hipFree(d_Y);
hipFree(d_Z);
return;
}
    // Copy the result from the device back to the host
err = hipMemcpyAsync(Z, d_Z, sz * sizeof(float), hipMemcpyDeviceToHost, compute_stream);
if (err != hipSuccess) {
fprintf(stderr, "hipMemcpyAsync for Z failed: %s\n", hipGetErrorString(err));
hipFree(d_X);
hipFree(d_Y);
hipFree(d_Z);
return;
}
    // Synchronize the stream to make sure all operations have completed
err = hipStreamSynchronize(compute_stream);
if (err != hipSuccess) {
fprintf(stderr, "hipStreamSynchronize failed: %s\n", hipGetErrorString(err));
hipFree(d_X);
hipFree(d_Y);
hipFree(d_Z);
return;
}
    // Free device memory
hipFree(d_X);
hipFree(d_Y);
hipFree(d_Z);
}
// Concat
__global__ void _Concat2D(int axis,
int M1, int N1, const float* X1,
int M2, int N2, const float* X2,
float* Z) {
int row = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
int col = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
    if (axis == 0) { // concatenate along rows (axis 0)
if (row < M1 && col < N1) {
Z[row * N1 + col] = X1[row * N1 + col];
} else if (row >= M1 && row < M1 + M2 && col < N2) {
Z[row * N2 + col] = X2[(row - M1) * N2 + col];
}
    } else if (axis == 1) { // concatenate along columns (axis 1)
if (row < M1 && col < N1) {
Z[row * (N1 + N2) + col] = X1[row * N1 + col];
} else if (row < M2 && col >= N1 && col < N1 + N2) {
Z[row * (N1 + N2) + col] = X2[row * N2 + (col - N1)];
}
}
return;
}
void rocm_concat(int axis,
int M1, int N1, const float* X1,
int M2, int N2, const float* X2,
float* Z,
hipStream_t compute_stream) {
    dim3 blockDim(16, 16);
    dim3 gridDim(((axis == 0 ? N1 : N1 + N2) + 15) / 16, ((axis == 0 ? M1 + M2 : M1) + 15) / 16);
float *d_X1, *d_X2, *d_Z;
hipError_t err;
size_t size1 = M1 * N1 * sizeof(float);
size_t size2 = M2 * N2 * sizeof(float);
size_t sizeZ = (axis == 0 ? (M1 + M2) * N1 : M1 * (N1 + N2)) * sizeof(float);
    // Allocate device memory
    err = hipMalloc(&d_X1, size1); if (err != hipSuccess) { fprintf(stderr, "hipMalloc failed in rocm_concat\n"); return; }
err = hipMalloc(&d_X2, size2); if (err != hipSuccess) { hipFree(d_X1); return; }
err = hipMalloc(&d_Z, sizeZ); if (err != hipSuccess) { hipFree(d_X1); hipFree(d_X2); return; }
    // Copy inputs to the device
    hipMemcpyAsync(d_X1, X1, size1, hipMemcpyHostToDevice, compute_stream);
    hipMemcpyAsync(d_X2, X2, size2, hipMemcpyHostToDevice, compute_stream);
    // Launch the kernel
    _Concat2D<<<gridDim, blockDim, 0, compute_stream>>>(axis, M1, N1, d_X1, M2, N2, d_X2, d_Z);
    // Copy the result back to the host
    hipMemcpyAsync(Z, d_Z, sizeZ, hipMemcpyDeviceToHost, compute_stream);
    // Synchronize the stream
    hipStreamSynchronize(compute_stream);
    // Free device memory
hipFree(d_X1); hipFree(d_X2); hipFree(d_Z);
return;
}
//gemm
#include <hip/hip_runtime.h>
__global__ void _Gemm(bool transA, bool transB,
int M, int N, int K,
float alpha,
const float* A,
const float* B,
float beta,
float* C) {
int row = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
int col = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
if (row >= M || col >= N) return;
float sum = 0.0f;
for (int k = 0; k < K; ++k) {
float a = transA ? A[k * M + row] : A[row * K + k];
float b = transB ? B[col * K + k] : B[k * N + col];
sum += a * b;
}
C[row * N + col] = alpha * sum + beta * C[row * N + col];
return;
}
void rocm_gemm(bool transA, bool transB,
int M, int N, int K,
float alpha,
const float* A,
const float* B,
float beta,
float* C,
hipStream_t compute_stream) {
dim3 blockDim(16, 16);
dim3 gridDim((N + 15) / 16, (M + 15) / 16);
    float *d_A = nullptr, *d_B = nullptr, *d_C = nullptr;
    hipError_t err;
    size_t sizeA = transA ? K * M * sizeof(float) : M * K * sizeof(float);
    size_t sizeB = transB ? N * K * sizeof(float) : K * N * sizeof(float);
    size_t sizeC = M * N * sizeof(float);
    // Allocate device memory (pointers start as nullptr so the error path only frees what was allocated)
err = hipMalloc(&d_A, sizeA); if (err != hipSuccess) { goto error; }
err = hipMalloc(&d_B, sizeB); if (err != hipSuccess) { hipFree(d_A); goto error; }
err = hipMalloc(&d_C, sizeC); if (err != hipSuccess) { hipFree(d_A); hipFree(d_B); goto error; }
    // Host -> device copies
    hipMemcpyAsync(d_A, A, sizeA, hipMemcpyHostToDevice, compute_stream);
    hipMemcpyAsync(d_B, B, sizeB, hipMemcpyHostToDevice, compute_stream);
    hipMemcpyAsync(d_C, C, sizeC, hipMemcpyHostToDevice, compute_stream);
    // Launch the kernel
    _Gemm<<<gridDim, blockDim, 0, compute_stream>>>(transA, transB, M, N, K, alpha, d_A, d_B, beta, d_C);
    // Device -> host copy
    hipMemcpyAsync(C, d_C, sizeC, hipMemcpyDeviceToHost, compute_stream);
    // Synchronize the stream
    hipStreamSynchronize(compute_stream);
    // Release device memory
hipFree(d_A);
hipFree(d_B);
hipFree(d_C);
return;
error:
fprintf(stderr, "HIP memory allocation or memcpy failed in rocm_gemm\n");
if (d_A) hipFree(d_A);
if (d_B) hipFree(d_B);
if (d_C) hipFree(d_C);
}
//GroupNormalization
#include <math.h>
__global__ void _GroupNorm(
int64_t N, int64_t C, int64_t H, int64_t W, int64_t G,
float eps, const float* X, float* Y,
const float* gamma, const float* beta
) {
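    // Note: the shared arrays and the tree reduction below assume a power-of-two block size of at
    // most 256 threads (the launch in rocm_group_norm uses 256).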
    // Current group and sample indices
int64_t group_idx = hipBlockIdx_x;
int64_t n = hipBlockIdx_y;
int64_t channels_per_group = C / G;
int64_t c_start = group_idx * channels_per_group;
int64_t c_end = c_start + channels_per_group;
    // Total number of elements in the group
int64_t group_size = channels_per_group * H * W;
    // Shared memory for the sum and sum-of-squares reductions
__shared__ float shared_sum[256];
__shared__ float shared_sum_sq[256];
    // Each thread accumulates a partial sum and sum of squares
float sum = 0.0f, sum_sq = 0.0f;
for (int64_t idx = hipThreadIdx_x; idx < group_size; idx += hipBlockDim_x) {
int64_t c = c_start + idx / (H * W);
int64_t hw = idx % (H * W);
int64_t h = hw / W;
int64_t w = hw % W;
int64_t linear_idx = n * C * H * W + c * H * W + h * W + w;
float val = X[linear_idx];
sum += val;
sum_sq += val * val;
}
shared_sum[hipThreadIdx_x] = sum;
shared_sum_sq[hipThreadIdx_x] = sum_sq;
__syncthreads();
    // Tree reduction for the group-wide sums
for (int s = hipBlockDim_x / 2; s > 0; s >>= 1) {
if (hipThreadIdx_x < s) {
shared_sum[hipThreadIdx_x] += shared_sum[hipThreadIdx_x + s];
shared_sum_sq[hipThreadIdx_x] += shared_sum_sq[hipThreadIdx_x + s];
}
__syncthreads();
}
    // Compute the mean and variance
float mean = shared_sum[0] / group_size;
float var = shared_sum_sq[0] / group_size - mean * mean;
    // Normalize and apply the affine transform
for (int64_t idx = hipThreadIdx_x; idx < group_size; idx += hipBlockDim_x) {
int64_t c = c_start + idx / (H * W);
int64_t hw = idx % (H * W);
int64_t h = hw / W;
int64_t w = hw % W;
int64_t linear_idx = n * C * H * W + c * H * W + h * W + w;
float val = (X[linear_idx] - mean) / sqrtf(var + eps);
Y[linear_idx] = gamma[c] * val + beta[c];
}
return;
}
void rocm_group_norm(
int64_t N, int64_t C, int64_t H, int64_t W, int64_t G,
float eps, const float* X, float* Y,
const float* gamma, const float* beta,
hipStream_t compute_stream
) {
    dim3 block_dim(256);  // 256 threads per block
    dim3 grid_dim(G, N);  // one block per (group, sample) pair
    // Parameter check
if (C % G != 0) {
fprintf(stderr, "Error: Channels must be divisible by groups.\n");
return;
}
    // Allocate device memory
float *d_X, *d_Y, *d_gamma, *d_beta;
hipError_t err;
size_t input_size = N * C * H * W * sizeof(float);
size_t param_size = C * sizeof(float);
err = hipMalloc(&d_X, input_size);
    if (err != hipSuccess) { fprintf(stderr, "hipMalloc failed in rocm_group_norm\n"); return; }
err = hipMalloc(&d_Y, input_size);
if (err != hipSuccess) { hipFree(d_X); return; }
err = hipMalloc(&d_gamma, param_size);
if (err != hipSuccess) { hipFree(d_X); hipFree(d_Y); return; }
err = hipMalloc(&d_beta, param_size);
if (err != hipSuccess) { hipFree(d_X); hipFree(d_Y); hipFree(d_gamma); return; }
    // Copy inputs to the device
    hipMemcpyAsync(d_X, X, input_size, hipMemcpyHostToDevice, compute_stream);
    hipMemcpyAsync(d_gamma, gamma, param_size, hipMemcpyHostToDevice, compute_stream);
    hipMemcpyAsync(d_beta, beta, param_size, hipMemcpyHostToDevice, compute_stream);
    // Launch the kernel: one block per (group, sample) pair, 256 threads per block
    _GroupNorm<<<grid_dim, block_dim, 0, compute_stream>>>(
        N, C, H, W, G, eps, d_X, d_Y, d_gamma, d_beta
    );
    // Copy the result back to the host
    hipMemcpyAsync(Y, d_Y, input_size, hipMemcpyDeviceToHost, compute_stream);
    // Synchronize the stream and release device memory
hipStreamSynchronize(compute_stream);
hipFree(d_X); hipFree(d_Y); hipFree(d_gamma); hipFree(d_beta);
}
//LogSoftmax
__global__ void _LogSoftmax(int64_t N, int64_t D, const float* X, float* Y) {
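    // Note: the shared arrays and reductions below assume a power-of-two block size of at most
    // 256 threads (the launch in rocm_log_softmax uses 256).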
    int64_t n = hipBlockIdx_x;  // one block per sample
    int tid = hipThreadIdx_x;
    // Shared memory for the row maximum and the sum of exponentials
    __shared__ float shared_max[256];
    __shared__ float shared_sum[256];
    // Step 1: find the maximum value within the sample
float max_val = -INFINITY;
for (int64_t i = tid; i < D; i += hipBlockDim_x) {
max_val = fmaxf(max_val, X[n * D + i]);
}
shared_max[tid] = max_val;
__syncthreads();
    // Tree reduction for the row maximum
for (int s = hipBlockDim_x / 2; s > 0; s >>= 1) {
if (tid < s && shared_max[tid + s] > shared_max[tid]) {
shared_max[tid] = shared_max[tid + s];
}
__syncthreads();
}
float global_max = shared_max[0];
    // Step 2: accumulate the sum of exponentials
float exp_sum = 0.0f;
for (int64_t i = tid; i < D; i += hipBlockDim_x) {
exp_sum += expf(X[n * D + i] - global_max);
}
shared_sum[tid] = exp_sum;
__syncthreads();
    // Tree reduction for the row sum
for (int s = hipBlockDim_x / 2; s > 0; s >>= 1) {
if (tid < s) {
shared_sum[tid] += shared_sum[tid + s];
}
__syncthreads();
}
float global_sum = shared_sum[0];
    // Step 3: compute LogSoftmax
for (int64_t i = tid; i < D; i += hipBlockDim_x) {
Y[n * D + i] = (X[n * D + i] - global_max) - logf(global_sum);
}
return;
}
void rocm_log_softmax(
int64_t N, int64_t D, const float* X, float* Y, hipStream_t compute_stream
) {
    dim3 block_dim(256);  // 256 threads per block
    dim3 grid_dim(N);     // one block per sample
    // Allocate device memory
float *d_X, *d_Y;
hipError_t err;
size_t input_size = N * D * sizeof(float);
err = hipMalloc(&d_X, input_size);
    if (err != hipSuccess) { fprintf(stderr, "hipMalloc failed in rocm_log_softmax\n"); return; }
err = hipMalloc(&d_Y, input_size);
if (err != hipSuccess) { hipFree(d_X); return; }
    // Copy the input to the device
    hipMemcpyAsync(d_X, X, input_size, hipMemcpyHostToDevice, compute_stream);
    // Launch the kernel
_LogSoftmax<<<grid_dim, block_dim, 0, compute_stream>>>(N, D, d_X, d_Y);
    // Copy the result back to the host
hipMemcpyAsync(Y, d_Y, input_size, hipMemcpyDeviceToHost, compute_stream);
    // Synchronize the stream and release device memory
hipStreamSynchronize(compute_stream);
hipFree(d_X); hipFree(d_Y);
}
//attention
__global__ void _DotProductAttention(int B, int S, int H,
const float* Q, const float* K, const float* V,
float scaling,
float* output) {
int b = blockIdx.z;
int i = blockIdx.y * blockDim.y + threadIdx.y; // query index
int j = blockIdx.x * blockDim.x + threadIdx.x; // hidden dim
if (b >= B || i >= S || j >= H) return;
    // Compute the scaled dot-product scores Q·K^T for this query row
    float scores[128];  // assumes seq_len S <= 128
for (int k = 0; k < S; ++k) {
float dot = 0.f;
for (int h = 0; h < H; ++h) {
dot += Q[(b * S + i) * H + h] * K[(b * S + k) * H + h];
}
scores[k] = dot / scaling;
}
// softmax over scores
float max_val = scores[0];
for (int k = 1; k < S; ++k) max_val = fmaxf(max_val, scores[k]);
float sum = 0.f;
for (int k = 0; k < S; ++k) {
scores[k] = expf(scores[k] - max_val);
sum += scores[k];
}
for (int k = 0; k < S; ++k) scores[k] /= sum;
// output = softmax * V
float result = 0.f;
for (int k = 0; k < S; ++k) {
result += scores[k] * V[(b * S + k) * H + j];
}
output[(b * S + i) * H + j] = result;
}
extern "C" void rocm_attention(int B, int S, int H,
const float* Q, const float* K, const float* V,
float* Out, hipStream_t stream) {
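    // The kernel keeps each query's score row in a fixed-size local array, so S must be <= 128.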
dim3 blockDim(16, 16);
dim3 gridDim((H + 15) / 16, (S + 15) / 16, B);
float *d_Q, *d_K, *d_V, *d_Out;
size_t size = B * S * H * sizeof(float);
hipMalloc(&d_Q, size);
hipMalloc(&d_K, size);
hipMalloc(&d_V, size);
hipMalloc(&d_Out, size);
hipMemcpyAsync(d_Q, Q, size, hipMemcpyHostToDevice, stream);
hipMemcpyAsync(d_K, K, size, hipMemcpyHostToDevice, stream);
hipMemcpyAsync(d_V, V, size, hipMemcpyHostToDevice, stream);
    float scale = sqrtf((float)H);  // softmax scaling factor sqrt(d_k)
_DotProductAttention<<<gridDim, blockDim, 0, stream>>>(B, S, H, d_Q, d_K, d_V, scale, d_Out);
hipMemcpyAsync(Out, d_Out, size, hipMemcpyDeviceToHost, stream);
hipStreamSynchronize(stream);
hipFree(d_Q); hipFree(d_K); hipFree(d_V); hipFree(d_Out);
return;
}
// BatchNormalization
__global__ void _BatchNormalization(
int N, int C, int H, int W,
const float* X,
const float* gamma,
const float* beta,
const float* mean,
const float* var,
float epsilon,
float* Y) {
// global thread index
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int total = N * C * H * W;
if (idx >= total) return;
    // Decompose the linear index into (n, c, h, w)
int w = idx % W;
int tmp = idx / W;
int h = tmp % H;
tmp = tmp / H;
int c = tmp % C;
int n = tmp / C;
    // Y = gamma[c] * (X - mean[c]) / sqrt(var[c] + eps) + beta[c]
int offset = ((n * C + c) * H + h) * W + w;
float x = X[offset];
float m = mean[c];
float v = var[c];
float inv_std = rsqrtf(v + epsilon);
Y[offset] = gamma[c] * ((x - m) * inv_std) + beta[c];
}
// Host API: rocm_batch_norm
extern "C" void rocm_batch_norm(
int64_t N, int64_t C, int64_t H, int64_t W,
const float* X,
const float* gamma,
const float* beta,
const float* mean,
const float* var,
float epsilon,
float* Y,
hipStream_t stream) {
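    // Inference-mode BatchNormalization: mean and var are the precomputed per-channel statistics.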
size_t total = (size_t)N * C * H * W;
    // Allocate device buffers and copy X, gamma, beta, mean, var to the device
float *d_X, *d_gamma, *d_beta, *d_mean, *d_var, *d_Y;
hipMalloc(&d_X, total * sizeof(float));
hipMalloc(&d_Y, total * sizeof(float));
hipMalloc(&d_gamma, C * sizeof(float));
hipMalloc(&d_beta, C * sizeof(float));
hipMalloc(&d_mean, C * sizeof(float));
hipMalloc(&d_var, C * sizeof(float));
hipMemcpyAsync(d_X, X, total * sizeof(float), hipMemcpyHostToDevice, stream);
hipMemcpyAsync(d_gamma, gamma, C * sizeof(float), hipMemcpyHostToDevice, stream);
hipMemcpyAsync(d_beta, beta, C * sizeof(float), hipMemcpyHostToDevice, stream);
hipMemcpyAsync(d_mean, mean, C * sizeof(float), hipMemcpyHostToDevice, stream);
hipMemcpyAsync(d_var, var, C * sizeof(float), hipMemcpyHostToDevice, stream);
    // Launch the kernel with a 1-D thread layout
int threads = 256;
int blocks = (total + threads - 1) / threads;
_BatchNormalization<<<blocks, threads, 0, stream>>>(
N, C, H, W,
d_X, d_gamma, d_beta, d_mean, d_var,
epsilon,
d_Y);
    // Copy the result back to the host
hipMemcpyAsync(Y, d_Y, total * sizeof(float), hipMemcpyDeviceToHost, stream);
hipStreamSynchronize(stream);
    // Free device memory
hipFree(d_X);
hipFree(d_Y);
hipFree(d_gamma);
hipFree(d_beta);
hipFree(d_mean);
hipFree(d_var);
return;
}
// Cast Operator: float to int32
// Device kernel: cast each element
__global__ void _Cast(
int total,
const float* X,
int* Y) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= total) return;
// cast float to int32
Y[idx] = static_cast<int>(X[idx]);
}
// Host API: rocm_cast
extern "C" void rocm_cast(
int64_t N, int64_t C, int64_t H, int64_t W,
const float* X,
int* Y,
hipStream_t stream) {
// total elements
size_t total = (size_t)N * C * H * W;
// allocate device memory
float* d_X;
int* d_Y;
hipMalloc(&d_X, total * sizeof(float));
hipMalloc(&d_Y, total * sizeof(int));
// copy input to device
hipMemcpyAsync(d_X, X, total * sizeof(float), hipMemcpyHostToDevice, stream);
// launch kernel
int threads = 256;
int blocks = (total + threads - 1) / threads;
_Cast<<<blocks, threads, 0, stream>>>(
total,
d_X,
d_Y);
// copy result back
hipMemcpyAsync(Y, d_Y, total * sizeof(int), hipMemcpyDeviceToHost, stream);
hipStreamSynchronize(stream);
// free device memory
hipFree(d_X);
hipFree(d_Y);
return;
}
extern "C" __global__
void SoftmaxKernel(const float* X, float* Y, int M, int N) {
// M = batch_size, N = feature_size
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= M) return;
const float* x_row = X + row * N;
float* y_row = Y + row * N;
    // 1) Find the row maximum for numerical stability
float m = x_row[0];
for (int j = 1; j < N; ++j) {
m = fmaxf(m, x_row[j]);
}
    // 2) Compute exp(x - m) and accumulate the sum
float sum = 0.f;
for (int j = 0; j < N; ++j) {
float e = expf(x_row[j] - m);
y_row[j] = e;
sum += e;
}
    // 3) Normalize
for (int j = 0; j < N; ++j) {
y_row[j] /= sum;
}
}
// Called by ONNX Runtime; this entry point replaces the original rocm_add
extern "C"
void rocm_softmax(int64_t M, int64_t N,
const float* X, float* Y,
hipStream_t stream) {
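    // X and Y are used directly by the kernel, so they are assumed to already be device pointers
    // (e.g. the buffers handed over by the ONNX Runtime custom-op kernel).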
    // One thread per row, 128 threads per block
const int threads = 128;
const int blocks = static_cast<int>((M + threads - 1) / threads);
hipLaunchKernelGGL(
SoftmaxKernel,
dim3(blocks), dim3(threads),
0, // shared mem
stream, // hip stream
X, Y, static_cast<int>(M), static_cast<int>(N)
);
return;
}
template <typename T>
__global__ void _CeluKernel(const T* X, T* Y, int64_t size, T alpha) {
int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < size) {
T v = X[idx];
T pos = v > T(0) ? v : T(0);
T neg = v <= T(0) ? alpha * (exp(v / alpha) - T(1)) : T(0);
Y[idx] = pos + neg;
}
return;
}
extern "C" void rocm_celu(int64_t size,
const float* X,
float* Y,
float alpha,
hipStream_t stream) {
float *d_X, *d_Y;
hipMalloc(&d_X, size * sizeof(float));
hipMalloc(&d_Y, size * sizeof(float));
hipMemcpyAsync(d_X, X, size * sizeof(float), hipMemcpyHostToDevice, stream);
int threads = 256;
int blocks = (size + threads - 1) / threads;
_CeluKernel<float><<<blocks, threads, 0, stream>>>(d_X, d_Y, size, alpha);
hipMemcpyAsync(Y, d_Y, size * sizeof(float), hipMemcpyDeviceToHost, stream);
hipStreamSynchronize(stream);
hipFree(d_X); hipFree(d_Y);
return;
}
//relu
template <typename T>
__global__ void _rocm_relu_kernel(float* input, float* output, int64_t size) {
int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= size) return;
output[idx] = fmaxf(0.0f, input[idx]);
}
extern "C" void rocm_relu(
int64_t size,
const float* X,
float* Y,
hipStream_t stream
) {
size_t input_size = size * sizeof(float);
float *d_X, *d_Y;
hipMalloc(&d_X, input_size);
hipMalloc(&d_Y, input_size);
hipMemcpyAsync(d_X, X, input_size, hipMemcpyHostToDevice, stream);
int threads = 256;
int blocks = (size + threads - 1) / threads;
_rocm_relu_kernel<float><<<blocks, threads, 0, stream>>>(d_X, d_Y, size);
hipMemcpyAsync(Y, d_Y, input_size, hipMemcpyDeviceToHost, stream);
hipStreamSynchronize(stream);
hipFree(d_X);
hipFree(d_Y);
return;
}
// -------------------------------
// TopK
// -------------------------------
extern "C"
__global__ void TopKKernel(
const float* __restrict__ X, // [M * N]
float* __restrict__ values, // [M * K]
int64_t* __restrict__ indices, // [M * K]
int M,
int N,
int K
) {
    // One block per row; shared memory holds that row's running top-K values and indices.
    int row = blockIdx.x;
    if (row >= M) return;
    const float* x_row = X + size_t(row) * N;
    float* v_row = values + size_t(row) * K;
    int64_t* i_row = indices + size_t(row) * K;
    // Dynamic shared memory layout: K floats for the top-K values, followed by K ints for their indices
    extern __shared__ char smem[];
    float* shared_vals = (float*)smem;
    int* shared_idx = (int*)(smem + K * sizeof(float));
    // Initialize: shared_vals = -INF, shared_idx = -1
    for (int t = threadIdx.x; t < K; t += blockDim.x) {
        shared_vals[t] = -INFINITY;
        shared_idx[t] = -1;
    }
    __syncthreads();
    // Thread 0 scans the whole row, maintaining the K largest values seen so far
    // (a linear scan for the current minimum instead of a heap), then sorts them in place.
    if (threadIdx.x == 0) {
        for (int j = 0; j < N; ++j) {
            float v = x_row[j];
            // find the position of the current minimum
            float min_val = shared_vals[0];
            int min_pos = 0;
            for (int t = 1; t < K; ++t) {
                if (shared_vals[t] < min_val) {
                    min_val = shared_vals[t];
                    min_pos = t;
                }
            }
            // replace the minimum if the new value is larger
            if (v > min_val) {
                shared_vals[min_pos] = v;
                shared_idx[min_pos] = j;
            }
        }
        // Simple descending sort of the K elements; K is usually small
        for (int i = 0; i < K; ++i) {
            for (int j = i + 1; j < K; ++j) {
                if (shared_vals[j] > shared_vals[i]) {
                    // swap value
                    float tv = shared_vals[i];
                    shared_vals[i] = shared_vals[j];
                    shared_vals[j] = tv;
                    // swap idx
                    int ti = shared_idx[i];
                    shared_idx[i] = shared_idx[j];
                    shared_idx[j] = ti;
                }
            }
        }
    }
    __syncthreads();
    // Write the results back to global memory
    for (int t = threadIdx.x; t < K; t += blockDim.x) {
        v_row[t] = shared_vals[t];
        i_row[t] = (int64_t)shared_idx[t];
    }
}
extern "C"
void rocm_topk(
int64_t M,
int64_t N,
int64_t K,
const float* X,
float* values,
int64_t* indices,
hipStream_t stream
) {
    // Allocate device memory (pointers start as nullptr so the error path only frees what was allocated)
    size_t sizeX = size_t(M) * N * sizeof(float);
    size_t sizeOutVal = size_t(M) * K * sizeof(float);
    size_t sizeOutIdx = size_t(M) * K * sizeof(int64_t);
    float* d_X = nullptr;
    float* d_vals = nullptr;
    int64_t* d_idx = nullptr;
if (hipMalloc(&d_X, sizeX ) != hipSuccess ||
hipMalloc(&d_vals,sizeOutVal) != hipSuccess ||
hipMalloc(&d_idx, sizeOutIdx) != hipSuccess) {
fprintf(stderr, "HIP malloc failed in rocm_topk\n");
if (d_X) hipFree(d_X);
if (d_vals) hipFree(d_vals);
if (d_idx) hipFree(d_idx);
return;
}
    // Copy the input to the device
    hipMemcpyAsync(d_X, X, sizeX, hipMemcpyHostToDevice, stream);
    // Launch the kernel: one block per row; dynamic shared memory = K * (sizeof(float) + sizeof(int))
    dim3 blockDim(128);
    dim3 gridDim(M);
    size_t shared_bytes = K * (sizeof(float) + sizeof(int));
hipLaunchKernelGGL(
TopKKernel,
gridDim, blockDim, shared_bytes, stream,
d_X, d_vals, d_idx,
int(M), int(N), int(K)
);
    // Copy results back to the host
hipMemcpyAsync(values, d_vals, sizeOutVal, hipMemcpyDeviceToHost, stream);
hipMemcpyAsync(indices, d_idx, sizeOutIdx, hipMemcpyDeviceToHost, stream);
hipStreamSynchronize(stream);
    // Free device memory
hipFree(d_X);
hipFree(d_vals);
hipFree(d_idx);
}
//ReduceLogSum
template <typename T>
__global__ void _rocm_reduce_log_sum_kernel(
const T* input,
T* output,
int64_t N,
int64_t C,
int64_t H,
int64_t W,
int64_t axis,
bool keep_dims
) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    // Number of output elements: the reduced axis collapses to length 1.
    // keep_dims only changes the logical rank of the output, not the element count.
    int64_t total;
    switch (axis) {
        case 0: total = C * H * W; break;
        case 1: total = N * H * W; break;
        case 2: total = N * C * W; break;
        case 3: total = N * C * H; break;
        default: total = N * C * H * W; break;
    }
    if (idx >= total) return;
    // Decompose the output index into (n, c, h, w); the reduced coordinate stays 0
    int64_t out_n = 0, out_c = 0, out_h = 0, out_w = 0;
    switch (axis) {
        case 0: {
            out_c = idx / (H * W);
            int64_t rem = idx % (H * W);
            out_h = rem / W;
            out_w = rem % W;
            break;
        }
        case 1: {
            out_n = idx / (H * W);
            int64_t rem = idx % (H * W);
            out_h = rem / W;
            out_w = rem % W;
            break;
        }
        case 2: {
            out_n = idx / (C * W);
            int64_t rem = idx % (C * W);
            out_c = rem / W;
            out_w = rem % W;
            break;
        }
        case 3: {
            out_n = idx / (C * H);
            int64_t rem = idx % (C * H);
            out_c = rem / H;
            out_h = rem % H;
            break;
        }
        default: {
            out_n = idx / (C * H * W);
            int64_t rem = idx % (C * H * W);
            out_c = rem / (H * W);
            rem %= H * W;
            out_h = rem / W;
            out_w = rem % W;
            break;
        }
    }
    // Reduction range: start at (out_n, out_c, out_h, out_w) and step along the reduced axis
    int64_t start = out_n * C * H * W + out_c * H * W + out_h * W + out_w;
    int64_t step, count;
    switch (axis) {
        case 0: step = C * H * W; count = N; break;
        case 1: step = H * W;     count = C; break;
        case 2: step = W;         count = H; break;
        case 3: step = 1;         count = W; break;
        default: step = 1;        count = 1; break;
    }
    int64_t end = start + step * count;
    // Sum along the reduced axis
    T sum = T(0);
    for (int64_t i = start; i < end; i += step) {
        sum += input[i];
    }
    // Take the natural logarithm of the sum
    output[idx] = log(sum);
}
extern "C" void rocm_reduce_log_sum(
int64_t N,
int64_t C,
int64_t H,
int64_t W,
const float* X,
float* Y,
int64_t axis,
bool keep_dims,
hipStream_t stream
) {
    // Output shape: the reduced axis has length 1 (keep_dims only affects the logical rank)
    int64_t out_N = (axis == 0) ? 1 : N;
    int64_t out_C = (axis == 1) ? 1 : C;
    int64_t out_H = (axis == 2) ? 1 : H;
    int64_t out_W = (axis == 3) ? 1 : W;
size_t input_size = N * C * H * W * sizeof(float);
size_t output_size = out_N * out_C * out_H * out_W * sizeof(float);
    // Allocate device memory
float *d_X, *d_Y;
hipMalloc(&d_X, input_size);
hipMalloc(&d_Y, output_size);
    // Asynchronously copy the input to the device
hipMemcpyAsync(d_X, X, input_size, hipMemcpyHostToDevice, stream);
    // Kernel launch configuration
int total_threads = out_N * out_C * out_H * out_W;
int block_size = 256;
int grid_size = (total_threads + block_size - 1) / block_size;
    // Launch the kernel
_rocm_reduce_log_sum_kernel<float><<<grid_size, block_size, 0, stream>>>(
d_X, d_Y, N, C, H, W, axis, keep_dims
);
    // Asynchronously copy the result back to the host
hipMemcpyAsync(Y, d_Y, output_size, hipMemcpyDeviceToHost, stream);
hipStreamSynchronize(stream);
    // Free device memory
hipFree(d_X);
hipFree(d_Y);
return;
}
__global__ void _RoiAlignKernel(
const float* X, int N, int C, int H, int W,
const float* rois, const int64_t* batch_inds,
int num_rois, int out_h, int out_w,
int sampling_ratio, float spatial_scale,
float* Y) {
int rid = blockIdx.x; // ROI index
int c = blockIdx.y; // channel
int ph = threadIdx.y; // pooled y
int pw = threadIdx.x; // pooled x
if (rid >= num_rois || c >= C || ph >= out_h || pw >= out_w) return;
    // Read the ROI box and scale it to feature-map coordinates
const float* roi_ptr = rois + rid * 4;
float x1 = roi_ptr[0] * spatial_scale;
float y1 = roi_ptr[1] * spatial_scale;
float x2 = roi_ptr[2] * spatial_scale;
float y2 = roi_ptr[3] * spatial_scale;
int batch_id = static_cast<int>(batch_inds[rid]);
float roi_w = max(x2 - x1, 1.0f);
float roi_h = max(y2 - y1, 1.0f);
float bin_w = roi_w / static_cast<float>(out_w);
float bin_h = roi_h / static_cast<float>(out_h);
int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : ceilf(roi_h / out_h);
int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceilf(roi_w / out_w);
float count = static_cast<float>(roi_bin_grid_h * roi_bin_grid_w);
float sum = 0.0f;
    // Average the sampled values within each bin
for (int iy = 0; iy < roi_bin_grid_h; ++iy) {
float y = y1 + ph * bin_h + (iy + 0.5f) * bin_h / roi_bin_grid_h;
for (int ix = 0; ix < roi_bin_grid_w; ++ix) {
float x = x1 + pw * bin_w + (ix + 0.5f) * bin_w / roi_bin_grid_w;
    // Bilinear interpolation of the four neighboring pixels
int x0 = floorf(x), x1i = min(x0 + 1, W - 1);
int y0 = floorf(y), y1i = min(y0 + 1, H - 1);
float lx = x - x0, ly = y - y0;
const float* fmap = X + (batch_id * C + c) * H * W;
float v00 = fmap[y0 * W + x0];
float v01 = fmap[y0 * W + x1i];
float v10 = fmap[y1i * W + x0];
float v11 = fmap[y1i * W + x1i];
float w00 = (1 - lx) * (1 - ly);
float w01 = lx * (1 - ly);
float w10 = (1 - lx) * ly;
float w11 = lx * ly;
sum += v00 * w00 + v01 * w01 + v10 * w10 + v11 * w11;
}
}
float* out_ptr = Y + ((rid * C + c) * out_h + ph) * out_w + pw;
*out_ptr = sum / count;
}
extern "C" void rocm_roi_align(
const float* X, int64_t N, int64_t C, int64_t H, int64_t W,
const float* rois, const int64_t* batch_inds,
int64_t num_rois, int64_t out_h, int64_t out_w,
int64_t sampling_ratio, float spatial_scale,
float* Y, hipStream_t stream) {
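    // Note: unlike the wrappers above, X, rois, batch_inds and Y are passed straight to the kernel,
    // so they are assumed to already reside in device memory. The launch below also assumes
    // out_w * out_h does not exceed the per-block thread limit.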
dim3 grid(num_rois, C);
dim3 block(out_w, out_h);
_RoiAlignKernel<<<grid, block, 0, stream>>>(
X, N, C, H, W,
rois, batch_inds,
num_rois, out_h, out_w,
sampling_ratio, spatial_scale,
Y);
hipStreamSynchronize(stream);
}
// LeakyReLU kernel
__global__ void _LeakyReLUKernel(const float* X, float* Y, int64_t size, float alpha) {
int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= size) return;
float v = X[idx];
Y[idx] = (v >= 0.0f) ? v : alpha * v;
}
extern "C" void rocm_leaky_relu(
int64_t size,
const float* d_X,
float* d_Y,
float alpha,
hipStream_t stream) {
const int threads = 256;
int blocks = static_cast<int>((size + threads - 1) / threads);
_LeakyReLUKernel<<<blocks, threads, 0, stream>>>(d_X, d_Y, size, alpha);
}
//Conv
__global__ void _Conv2dKernel(const float* input,
const float* weight,
const float* bias,
float* output,
int N, int C_in, int H, int W,
int C_out, int K_h, int K_w,
int out_H, int out_W) {
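    // Naive direct convolution: stride 1, no padding. One block per (sample, output channel)
    // and one thread per output pixel, so out_H * out_W must fit within a single thread block.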
int n = blockIdx.x;
int oc = blockIdx.y;
int oh = threadIdx.y;
int ow = threadIdx.x;
if (oh >= out_H || ow >= out_W) return;
float sum = bias[oc];
for (int ic = 0; ic < C_in; ++ic) {
for (int kh = 0; kh < K_h; ++kh) {
for (int kw = 0; kw < K_w; ++kw) {
int ih = oh + kh;
int iw = ow + kw;
float val = input[n * (C_in * H * W) + ic * (H * W) + ih * W + iw];
float w = weight[oc * (C_in * K_h * K_w) + ic * (K_h * K_w) + kh * K_w + kw];
sum += val * w;
}
}
}
output[n * (C_out * out_H * out_W) + oc * (out_H * out_W) + oh * out_W + ow] = sum;
}
extern "C" void rocm_conv2d(const float* input,
const float* weight,
const float* bias,
float* output,
int N, int C_in, int H, int W,
int C_out, int K_h, int K_w,
int out_H, int out_W,
hipStream_t stream) {
dim3 blocks(N, C_out);
dim3 threads(out_W, out_H);
_Conv2dKernel<<<blocks, threads, 0, stream>>>(
input, weight, bias, output,
N, C_in, H, W, C_out, K_h, K_w, out_H, out_W);
hipError_t err = hipGetLastError();
if (err != hipSuccess) {
fprintf(stderr, "Conv2D kernel launch failed: %s\n", hipGetErrorString(err));
}
}