Unverified Commit 85bc98ac authored by qinyiqun's avatar qinyiqun Committed by GitHub
Browse files

ISSUE/628 适配QY C610 GPU,增加编译选项,适配已有算子。添加bge类模型所需的算子, (#629)



* ISSUE/628 适配QY C610 GPU,增加编译选项,适配已有算子。添加bge类模型所需的算子,包括gelu,layer_norm,lp_norm(支持l1,l2 norm),relu,softmax,tanh。

---------
Co-authored-by: default avatarxgqdut2016 <kenan_gewei@163.com>
Co-authored-by: default avatarxgqdut2016 <140036308+xgqdut2016@users.noreply.github.com>
parent 7c397dd2
#ifndef __TANH_CUDA_H__
#define __TANH_CUDA_H__
#include <cmath>
#include <cuda_bf16.h>
#include <cuda_fp16.h>
namespace op::tanh::cuda {
/**
 * Element-wise hyperbolic tangent functor for the shared CUDA-style
 * elementwise kernels.
 *
 * Reduced-precision inputs (half, half2 and the bf16 scalar/pair types) are
 * widened to float, evaluated with tanhf, and converted back with the `_rn`
 * conversion intrinsics. float uses tanhf directly; every other type
 * (including double) goes through std::tanh.
 *
 * NOTE(review): cuda_bfloat16 / cuda_bfloat162 are not declared in this
 * header -- presumably project aliases for the bf16 types; confirm that the
 * including translation unit provides them.
 */
typedef struct TanhOp {
    // Unary operator: exactly one input per output element.
    static constexpr size_t num_inputs = 1;

    // Single-precision tanh shared by all reduced-precision paths.
    __device__ __forceinline__ float tanh_f32_func(float x) const {
        return tanhf(x);
    }

    // Applies tanh to one element (or one packed pair) of type T.
    template <typename T>
    __device__ __forceinline__ T operator()(const T &input) const {
        if constexpr (std::is_same_v<T, half2>) {
            // Packed pair of halves: widen both lanes, evaluate, repack.
            const float2 wide = __half22float2(input);
            return __float22half2_rn(
                make_float2(tanh_f32_func(wide.x), tanh_f32_func(wide.y)));
        } else if constexpr (std::is_same_v<T, half>) {
            return __float2half_rn(tanh_f32_func(__half2float(input)));
        } else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
            // Packed pair of bf16 values: handle the low and high lanes separately.
            const float lo = tanh_f32_func(__bfloat162float(__low2bfloat16(input)));
            const float hi = tanh_f32_func(__bfloat162float(__high2bfloat16(input)));
            return __floats2bfloat162_rn(lo, hi);
        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
            return __float2bfloat16_rn(tanh_f32_func(__bfloat162float(input)));
        } else if constexpr (std::is_same_v<T, float>) {
            return tanh_f32_func(input);
        } else {
            // double and any remaining type rely on the std::tanh overload set.
            return std::tanh(input);
        }
    }
} TanhOp;
} // namespace op::tanh::cuda
#endif // __TANH_CUDA_H__
#ifndef __TANH_METAX_API_H__
#define __TANH_METAX_API_H__
#include "../../../elementwise/metax/elementwise_metax_api.h"
ELEMENTWISE_DESCRIPTOR(tanh, metax)
#endif // __TANH_METAX_API_H__
#include "tanh_metax.h"
#include "../../../elementwise/metax/elementwise_metax.h"
#include "../cuda/kernel.cuh"
namespace op::tanh::metax {
Descriptor::~Descriptor() = default;
/**
 * Validates the tensor descriptors of a tanh operation and builds the METAX
 * elementwise descriptor for it.
 *
 * @param handle_         Opaque library handle; reinterpreted as a METAX device handle.
 * @param desc_ptr        Output location for the new descriptor (presumably
 *                        filled in by CREATE_ELEMENTWISE_METAX_DESCRIPTOR -- confirm
 *                        against the macro definition).
 * @param out_desc        Descriptor of the output tensor; its dtype selects the kernel.
 * @param input_desc_vec  Input descriptors; only element 0 is inspected here.
 * @return INFINI_STATUS_SUCCESS on success, INFINI_STATUS_BAD_TENSOR_DTYPE when
 *         the input dtype differs from the output dtype, or whatever status the
 *         CHECK_* / CREATE_* macros return on failure.
 */
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {

    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
    auto dtype = out_desc->dtype();

    const auto &input_desc = input_desc_vec.at(0);
    const auto &output_shape = out_desc->shape();
    const auto &input_shape = input_desc->shape();

    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);

    // calculate() instantiates the kernel with a single element type taken from
    // the output dtype, so an input of a different dtype would be read with the
    // wrong element type. Reject the mismatch up front.
    if (input_desc->dtype() != dtype) {
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }

    CHECK_SAME_SHAPE(output_shape, input_shape);

    // create METAX elementwise descriptor
    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)

    return INFINI_STATUS_SUCCESS;
}
/**
 * Runs tanh on the METAX backend for the dtype recorded in this descriptor.
 *
 * @param workspace       Scratch buffer forwarded to the elementwise launcher.
 * @param workspace_size  Size of `workspace`; must be at least `_workspace_size`.
 * @param output          Destination buffer.
 * @param inputs          Input buffers, forwarded unchanged to the launcher.
 * @param stream          Opaque stream handle, forwarded unchanged.
 * @return INFINI_STATUS_INSUFFICIENT_WORKSPACE if the workspace is too small,
 *         INFINI_STATUS_BAD_TENSOR_DTYPE for a dtype without a kernel,
 *         otherwise the status returned by the launcher.
 */
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {

    const bool workspace_too_small = workspace_size < _workspace_size;
    if (workspace_too_small) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }

    // The first template argument (256) is passed through to the elementwise
    // launcher -- presumably the thread-block size; confirm against its definition.
    if (_dtype == INFINI_DTYPE_F16) {
        return _device_info->calculate<256, cuda::TanhOp, half>(_info, workspace, output, inputs, stream);
    }
    if (_dtype == INFINI_DTYPE_BF16) {
        return _device_info->calculate<256, cuda::TanhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
    }
    if (_dtype == INFINI_DTYPE_F32) {
        return _device_info->calculate<256, cuda::TanhOp, float>(_info, workspace, output, inputs, stream);
    }
    if (_dtype == INFINI_DTYPE_F64) {
        return _device_info->calculate<256, cuda::TanhOp, double>(_info, workspace, output, inputs, stream);
    }

    // No kernel is instantiated for any other dtype.
    return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
} // namespace op::tanh::metax
#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
#include "../cuda/kernel.cuh"
#include "tanh_nvidia.cuh"
namespace op::tanh::nvidia {
Descriptor::~Descriptor() = default;
/**
 * Validates the tensor descriptors of a tanh operation and builds the
 * NVIDIA-backend elementwise descriptor for it.
 *
 * @param handle_         Opaque library handle; reinterpreted as an NVIDIA device handle.
 * @param desc_ptr        Output location for the new descriptor (presumably
 *                        filled in by CREATE_ELEMENTWISE_CUDA_DESCRIPTOR -- confirm
 *                        against the macro definition).
 * @param out_desc        Descriptor of the output tensor; its dtype selects the kernel.
 * @param input_desc_vec  Input descriptors; only element 0 is inspected here.
 * @return INFINI_STATUS_SUCCESS on success, INFINI_STATUS_BAD_TENSOR_DTYPE when
 *         the input dtype differs from the output dtype, or whatever status the
 *         CHECK_* / CREATE_* macros return on failure.
 */
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {

    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
    auto dtype = out_desc->dtype();

    const auto &input_desc = input_desc_vec.at(0);
    const auto &output_shape = out_desc->shape();
    const auto &input_shape = input_desc->shape();

    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);

    // calculate() instantiates the kernel with a single element type taken from
    // the output dtype, so an input of a different dtype would be read with the
    // wrong element type. Reject the mismatch up front.
    if (input_desc->dtype() != dtype) {
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }

    CHECK_SAME_SHAPE(output_shape, input_shape);

    // create CUDA elementwise descriptor
    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)

    return INFINI_STATUS_SUCCESS;
}
/**
 * Runs tanh on the NVIDIA backend for the dtype recorded in this descriptor.
 *
 * @param workspace       Scratch buffer forwarded to the elementwise launcher.
 * @param workspace_size  Size of `workspace`; must be at least `_workspace_size`.
 * @param output          Destination buffer.
 * @param inputs          Input buffers, forwarded unchanged to the launcher.
 * @param stream          Opaque stream handle, forwarded unchanged.
 * @return INFINI_STATUS_INSUFFICIENT_WORKSPACE if the workspace is too small,
 *         INFINI_STATUS_BAD_TENSOR_DTYPE for a dtype without a kernel,
 *         otherwise the status returned by the launcher.
 */
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {

    const bool workspace_too_small = workspace_size < _workspace_size;
    if (workspace_too_small) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }

    // The first template argument (256) is passed through to the elementwise
    // launcher -- presumably the thread-block size; confirm against its definition.
    if (_dtype == INFINI_DTYPE_F16) {
        return _device_info->calculate<256, cuda::TanhOp, half>(_info, workspace, output, inputs, stream);
    }
    if (_dtype == INFINI_DTYPE_BF16) {
        return _device_info->calculate<256, cuda::TanhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
    }
    if (_dtype == INFINI_DTYPE_F32) {
        return _device_info->calculate<256, cuda::TanhOp, float>(_info, workspace, output, inputs, stream);
    }
    if (_dtype == INFINI_DTYPE_F64) {
        return _device_info->calculate<256, cuda::TanhOp, double>(_info, workspace, output, inputs, stream);
    }

    // No kernel is instantiated for any other dtype.
    return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
} // namespace op::tanh::nvidia
#ifndef __TANH_CUDA_API_H__
#define __TANH_CUDA_API_H__
#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
ELEMENTWISE_DESCRIPTOR(tanh, nvidia)
#endif // __TANH_CUDA_API_H__
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/tanh.h"
#ifdef ENABLE_CPU_API
#include "cpu/tanh_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
#include "nvidia/tanh_nvidia.cuh"
#endif
#ifdef ENABLE_METAX_API
#include "metax/tanh_metax.h"
#endif
/**
 * C entry point that creates a tanh descriptor on the backend matching
 * `handle->device`.
 *
 * Iluvatar and QY devices are routed to the `nvidia` backend implementation.
 * Returns INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED when the device has no
 * backend compiled in; otherwise returns the backend's create() status.
 */
__C infiniStatus_t infiniopCreateTanhDescriptor(
    infiniopHandle_t handle,
    infiniopTanhDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t output_desc,
    infiniopTensorDescriptor_t input_desc) {

// One switch arm per device: forward to that backend's Descriptor::create,
// wrapping the single input descriptor in a one-element list.
#define TANH_CREATE_CASE(DEVICE, BACKEND)                                   \
    case DEVICE: {                                                          \
        using BackendDescriptor = op::tanh::BACKEND::Descriptor;            \
        return BackendDescriptor::create(                                   \
            handle,                                                         \
            reinterpret_cast<BackendDescriptor **>(desc_ptr),               \
            output_desc,                                                    \
            {input_desc});                                                  \
    }

    switch (handle->device) {
#ifdef ENABLE_CPU_API
        TANH_CREATE_CASE(INFINI_DEVICE_CPU, cpu)
#endif
#ifdef ENABLE_NVIDIA_API
        TANH_CREATE_CASE(INFINI_DEVICE_NVIDIA, nvidia)
#endif
#ifdef ENABLE_ILUVATAR_API
        TANH_CREATE_CASE(INFINI_DEVICE_ILUVATAR, nvidia)
#endif
#ifdef ENABLE_QY_API
        TANH_CREATE_CASE(INFINI_DEVICE_QY, nvidia)
#endif
#ifdef ENABLE_METAX_API
        TANH_CREATE_CASE(INFINI_DEVICE_METAX, metax)
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef TANH_CREATE_CASE
}
/**
 * C entry point that writes the workspace size required by a tanh descriptor
 * into `*size`.
 *
 * Dispatches on `desc->device_type`; Iluvatar and QY descriptors are treated
 * as `nvidia` backend descriptors. Returns INFINI_STATUS_SUCCESS after writing
 * the size, or INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED (leaving `*size`
 * untouched) when the device has no backend compiled in.
 */
__C infiniStatus_t infiniopGetTanhWorkspaceSize(infiniopTanhDescriptor_t desc, size_t *size) {

// One switch arm per device: view the opaque descriptor as the backend type
// and report its workspace requirement.
#define TANH_WORKSPACE_CASE(DEVICE, BACKEND)                                        \
    case DEVICE: {                                                                  \
        auto *typed_desc = reinterpret_cast<op::tanh::BACKEND::Descriptor *>(desc); \
        *size = typed_desc->workspaceSize();                                        \
        return INFINI_STATUS_SUCCESS;                                               \
    }

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        TANH_WORKSPACE_CASE(INFINI_DEVICE_CPU, cpu)
#endif
#ifdef ENABLE_NVIDIA_API
        TANH_WORKSPACE_CASE(INFINI_DEVICE_NVIDIA, nvidia)
#endif
#ifdef ENABLE_ILUVATAR_API
        TANH_WORKSPACE_CASE(INFINI_DEVICE_ILUVATAR, nvidia)
#endif
#ifdef ENABLE_QY_API
        TANH_WORKSPACE_CASE(INFINI_DEVICE_QY, nvidia)
#endif
#ifdef ENABLE_METAX_API
        TANH_WORKSPACE_CASE(INFINI_DEVICE_METAX, metax)
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef TANH_WORKSPACE_CASE
}
/**
 * C entry point that executes tanh through a previously created descriptor.
 *
 * Dispatches on `desc->device_type` and forwards all arguments to the
 * backend's calculate(), passing `input` as a one-element input list.
 * Iluvatar and QY descriptors are treated as `nvidia` backend descriptors.
 * Returns INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED when the device has no
 * backend compiled in; otherwise returns the backend's status.
 */
__C infiniStatus_t infiniopTanh(
    infiniopTanhDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *output,
    const void *input,
    void *stream) {

// One switch arm per device: view the opaque descriptor as the (const)
// backend type and run its calculate().
#define TANH_CALCULATE_CASE(DEVICE, BACKEND)                                                    \
    case DEVICE: {                                                                              \
        const auto *typed_desc = reinterpret_cast<const op::tanh::BACKEND::Descriptor *>(desc); \
        return typed_desc->calculate(workspace, workspace_size, output, {input}, stream);       \
    }

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        TANH_CALCULATE_CASE(INFINI_DEVICE_CPU, cpu)
#endif
#ifdef ENABLE_NVIDIA_API
        TANH_CALCULATE_CASE(INFINI_DEVICE_NVIDIA, nvidia)
#endif
#ifdef ENABLE_ILUVATAR_API
        TANH_CALCULATE_CASE(INFINI_DEVICE_ILUVATAR, nvidia)
#endif
#ifdef ENABLE_QY_API
        TANH_CALCULATE_CASE(INFINI_DEVICE_QY, nvidia)
#endif
#ifdef ENABLE_METAX_API
        TANH_CALCULATE_CASE(INFINI_DEVICE_METAX, metax)
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef TANH_CALCULATE_CASE
}
/**
 * C entry point that destroys a tanh descriptor.
 *
 * Dispatches on `desc->device_type`, deletes the descriptor through its
 * backend type and returns INFINI_STATUS_SUCCESS. Iluvatar and QY descriptors
 * are treated as `nvidia` backend descriptors. Returns
 * INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED (without deleting anything) when the
 * device has no backend compiled in.
 */
__C infiniStatus_t
infiniopDestroyTanhDescriptor(infiniopTanhDescriptor_t desc) {

// One switch arm per device: delete via the concrete backend type so the
// matching destructor runs.
#define TANH_DESTROY_CASE(DEVICE, BACKEND)                                      \
    case DEVICE: {                                                              \
        delete reinterpret_cast<const op::tanh::BACKEND::Descriptor *>(desc);   \
        return INFINI_STATUS_SUCCESS;                                           \
    }

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        TANH_DESTROY_CASE(INFINI_DEVICE_CPU, cpu)
#endif
#ifdef ENABLE_NVIDIA_API
        TANH_DESTROY_CASE(INFINI_DEVICE_NVIDIA, nvidia)
#endif
#ifdef ENABLE_ILUVATAR_API
        TANH_DESTROY_CASE(INFINI_DEVICE_ILUVATAR, nvidia)
#endif
#ifdef ENABLE_QY_API
        TANH_DESTROY_CASE(INFINI_DEVICE_QY, nvidia)
#endif
#ifdef ENABLE_METAX_API
        TANH_DESTROY_CASE(INFINI_DEVICE_METAX, metax)
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef TANH_DESTROY_CASE
}
#ifdef ENABLE_NVIDIA_API
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API)
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
......
......@@ -5,7 +5,7 @@
#ifdef ENABLE_CPU_API
#include "cpu/topkrouter_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API)
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API)
#include "nvidia/topkrouter_nvidia.cuh"
#endif
......@@ -23,6 +23,9 @@ __C infiniStatus_t infiniopCreateTopkrouterDescriptor(infiniopHandle_t handle, i
#endif
#ifdef ENABLE_NVIDIA_API
CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_QY_API
CREATE(INFINI_DEVICE_QY, nvidia);
#endif
}
......@@ -43,6 +46,9 @@ __C infiniStatus_t infiniopGetTopkrouterWorkspaceSize(infiniopTopkrouterDescript
#endif
#ifdef ENABLE_NVIDIA_API
GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_QY_API
GET(INFINI_DEVICE_QY, nvidia);
#endif
}
......@@ -66,6 +72,9 @@ __C infiniStatus_t infiniopTopkrouter(infiniopTopkrouterDescriptor_t desc, void
#endif
#ifdef ENABLE_NVIDIA_API
CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_QY_API
CALCULATE(INFINI_DEVICE_QY, nvidia);
#endif
}
......@@ -86,6 +95,9 @@ __C infiniStatus_t infiniopDestroyTopkrouterDescriptor(infiniopTopkrouterDescrip
#endif
#ifdef ENABLE_NVIDIA_API
DESTROY(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_QY_API
DESTROY(INFINI_DEVICE_QY, nvidia);
#endif
}
......
......@@ -5,7 +5,7 @@
#ifdef ENABLE_CPU_API
#include "cpu/topksoftmax_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API)
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API)
#include "nvidia/topksoftmax_nvidia.cuh"
#endif
#ifdef ENABLE_METAX_API
......@@ -28,6 +28,9 @@ __C infiniStatus_t infiniopCreateTopksoftmaxDescriptor(infiniopHandle_t handle,
#ifdef ENABLE_NVIDIA_API
CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_QY_API
CREATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
CREATE(INFINI_DEVICE_METAX, metax);
#endif
......@@ -52,6 +55,9 @@ __C infiniStatus_t infiniopGetTopksoftmaxWorkspaceSize(infiniopTopksoftmaxDescri
#ifdef ENABLE_NVIDIA_API
GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_QY_API
GET(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
GET(INFINI_DEVICE_METAX, metax);
#endif
......@@ -81,6 +87,9 @@ __C infiniStatus_t infiniopTopksoftmax(infiniopTopksoftmaxDescriptor_t desc, voi
#ifdef ENABLE_NVIDIA_API
CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_QY_API
CALCULATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
......@@ -105,6 +114,9 @@ __C infiniStatus_t infiniopDestroyTopksoftmaxDescriptor(infiniopTopksoftmaxDescr
#ifdef ENABLE_NVIDIA_API
DESTROY(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_QY_API
DESTROY(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
DESTROY(INFINI_DEVICE_METAX, metax);
#endif
......
......@@ -5,7 +5,7 @@
#ifdef ENABLE_CPU_API
#include "cpu/zeros_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
#include "nvidia/zeros_nvidia.cuh"
#endif
#ifdef ENABLE_METAX_API
......@@ -40,6 +40,9 @@ __C infiniStatus_t infiniopCreateZerosDescriptor(
#ifdef ENABLE_ILUVATAR_API
CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
CREATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
CREATE(INFINI_DEVICE_METAX, metax);
#endif
......@@ -70,6 +73,9 @@ __C infiniStatus_t infiniopGetZerosWorkspaceSize(infiniopZerosDescriptor_t desc,
#ifdef ENABLE_ILUVATAR_API
GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
GET(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
GET(INFINI_DEVICE_METAX, metax);
#endif
......@@ -108,6 +114,9 @@ __C infiniStatus_t infiniopZeros(
#ifdef ENABLE_ILUVATAR_API
CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
CALCULATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
......@@ -140,6 +149,9 @@ infiniopDestroyZerosDescriptor(infiniopZerosDescriptor_t desc) {
#ifdef ENABLE_ILUVATAR_API
DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
DELETE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
DELETE(INFINI_DEVICE_METAX, metax);
#endif
......
#ifndef __INFINIOP_REDUCE_CUDA_H__
#define __INFINIOP_REDUCE_CUDA_H__
#include <cub/block/block_reduce.cuh>
/*
* Device functions for reduction operations on CUDA.
*
......
......@@ -20,6 +20,7 @@ void printUsage() {
<< " metax" << std::endl
<< " moore" << std::endl
<< " iluvatar" << std::endl
<< " qy" << std::endl
<< " kunlun" << std::endl
<< " hygon" << std::endl
<< std::endl;
......@@ -51,6 +52,7 @@ ParsedArgs parseArgs(int argc, char *argv[]) {
else PARSE_DEVICE("--metax", INFINI_DEVICE_METAX)
else PARSE_DEVICE("--moore", INFINI_DEVICE_MOORE)
else PARSE_DEVICE("--iluvatar", INFINI_DEVICE_ILUVATAR)
else PARSE_DEVICE("--qy", INFINI_DEVICE_QY)
else PARSE_DEVICE("--kunlun", INFINI_DEVICE_KUNLUN)
else PARSE_DEVICE("--hygon", INFINI_DEVICE_HYGON)
else {
......
......@@ -3,7 +3,7 @@
#include "../infinirt_impl.h"
namespace infinirt::cuda {
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API)
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API)
INFINIRT_DEVICE_API_IMPL
#else
INFINIRT_DEVICE_API_NOOP
......
......@@ -23,7 +23,7 @@ __C infiniStatus_t infinirtGetAllDeviceCount(int *count_array) {
return INFINI_STATUS_NULL_POINTER;
}
for (size_t i = 0; i < INFINI_DEVICE_TYPE_COUNT; i++) {
if (i == INFINI_DEVICE_ILUVATAR || i == INFINI_DEVICE_HYGON) {
if (i == INFINI_DEVICE_ILUVATAR || i == INFINI_DEVICE_QY || i == INFINI_DEVICE_KUNLUN || i == INFINI_DEVICE_HYGON) {
count_array[i] = 0;
continue;
}
......@@ -77,6 +77,9 @@ __C infiniStatus_t infinirtGetDevice(infiniDevice_t *device_ptr, int *device_id_
case INFINI_DEVICE_ILUVATAR: \
_status = infinirt::cuda::API PARAMS; \
break; \
case INFINI_DEVICE_QY: \
_status = infinirt::cuda::API PARAMS; \
break; \
case INFINI_DEVICE_HYGON: \
_status = infinirt::cuda::API PARAMS; \
break; \
......
......@@ -33,14 +33,14 @@ from framework import (
# Test cases - 定义不同的测试场景
_TEST_CASES = [
TestCase("basic_print", (2, 3)), # 基本打印
TestCase("binary_save", (3, 4)), # 二进制保存
TestCase("multidimensional", (2, 2, 3)), # 多维张量
TestCase("basic_print", (2, 3)), # 基本打印
TestCase("binary_save", (3, 4)), # 二进制保存
TestCase("multidimensional", (2, 2, 3)), # 多维张量
]
# 非连续内存布局测试用例 (is_contiguous=False)
_NON_CONTIGUOUS_TEST_CASES = [
TestCase("non_contiguous", (3, 4)), # 测试 transpose 等导致的非连续内存布局
TestCase("non_contiguous", (3, 4)), # 测试 transpose 等导致的非连续内存布局
]
# 大规模性能测试用例 - 一千万个数据
......@@ -68,10 +68,11 @@ _TOLERANCE_MAP = {
# Helper Functions
# ==============================================================================
def load_binary_with_torch(filename, dtype, shape):
"""使用 torch.frombuffer 读取二进制文件"""
torch_dtype = to_torch_dtype(dtype)
with open(filename, 'rb') as f:
with open(filename, "rb") as f:
data = f.read()
return torch.frombuffer(data, dtype=torch_dtype).reshape(shape)
......@@ -80,69 +81,80 @@ def load_binary_with_torch(filename, dtype, shape):
# Test Methods
# ==============================================================================
def test_basic_print(device, test_case, dtype, config):
"""测试基本的 debug 打印功能"""
test_name, shape = test_case.args
print(f"Testing Basic Print on {InfiniDeviceNames[device]} with "
f"shape:{shape}, dtype:{dtype}")
print(
f"Testing Basic Print on {InfiniDeviceNames[device]} with "
f"shape:{shape}, dtype:{dtype}"
)
device_str = torch_device_map[device]
torch_dtype = to_torch_dtype(dtype)
# 创建测试张量
torch_tensor = torch.arange(1, int(np.prod(shape)) + 1,
dtype=torch_dtype, device=device_str).reshape(shape)
torch_tensor = torch.arange(
1, int(np.prod(shape)) + 1, dtype=torch_dtype, device=device_str
).reshape(shape)
infini_tensor = create_infinicore_tensor(torch_tensor, device_str)
# 测试 debug 打印(不保存文件)
infini_tensor.debug()
print(f"✓ Basic print test passed")
def test_binary_save(device, test_case, dtype, config):
"""测试二进制格式保存"""
test_name, shape = test_case.args
print(f"Testing Binary Save on {InfiniDeviceNames[device]} with "
f"shape:{shape}, dtype:{dtype}")
print(
f"Testing Binary Save on {InfiniDeviceNames[device]} with "
f"shape:{shape}, dtype:{dtype}"
)
device_str = torch_device_map[device]
torch_dtype = to_torch_dtype(dtype)
# 创建测试张量
torch_tensor = torch.arange(1, int(np.prod(shape)) + 1,
dtype=torch_dtype, device=device_str).reshape(shape)
torch_tensor = torch.arange(
1, int(np.prod(shape)) + 1, dtype=torch_dtype, device=device_str
).reshape(shape)
infini_tensor = create_infinicore_tensor(torch_tensor, device_str)
# 保存为二进制文件
bin_file = f"/tmp/debug_test_{device}_{dtype}_binary.bin"
infini_tensor.debug(bin_file)
# 验证文件存在
assert os.path.exists(bin_file), f"Binary file not created: {bin_file}"
# 验证文件大小
expected_size = int(np.prod(shape)) * torch_tensor.element_size()
actual_size = os.path.getsize(bin_file)
assert actual_size == expected_size, \
f"Binary file size mismatch: {actual_size} vs {expected_size}"
assert (
actual_size == expected_size
), f"Binary file size mismatch: {actual_size} vs {expected_size}"
# 使用 torch.frombuffer 读取并验证
loaded_tensor = load_binary_with_torch(bin_file, dtype, shape)
# 将两个张量都移到 CPU 进行比较
torch_tensor_cpu = torch_tensor.cpu()
loaded_tensor_cpu = loaded_tensor.cpu()
tolerance = _TOLERANCE_MAP.get(dtype, {"atol": 0, "rtol": 1e-5})
assert torch.allclose(loaded_tensor_cpu, torch_tensor_cpu,
atol=tolerance["atol"], rtol=tolerance["rtol"]), \
f"Binary data mismatch"
assert torch.allclose(
loaded_tensor_cpu,
torch_tensor_cpu,
atol=tolerance["atol"],
rtol=tolerance["rtol"],
), f"Binary data mismatch"
# 清理
os.remove(bin_file)
print(f"✓ Binary save test passed")
......@@ -151,38 +163,44 @@ def test_binary_save(device, test_case, dtype, config):
def test_multidimensional(device, test_case, dtype, config):
"""测试多维张量"""
test_name, shape = test_case.args
print(f"Testing Multidimensional on {InfiniDeviceNames[device]} with "
f"shape:{shape}, dtype:{dtype}")
print(
f"Testing Multidimensional on {InfiniDeviceNames[device]} with "
f"shape:{shape}, dtype:{dtype}"
)
device_str = torch_device_map[device]
torch_dtype = to_torch_dtype(dtype)
# 创建多维张量
torch_tensor = torch.arange(1, int(np.prod(shape)) + 1,
dtype=torch_dtype, device=device_str).reshape(shape)
torch_tensor = torch.arange(
1, int(np.prod(shape)) + 1, dtype=torch_dtype, device=device_str
).reshape(shape)
infini_tensor = create_infinicore_tensor(torch_tensor, device_str)
# 测试打印
infini_tensor.debug()
# 测试保存和读取
bin_file = f"/tmp/debug_test_multidim_{device}_{dtype}.bin"
infini_tensor.debug(bin_file)
assert os.path.exists(bin_file), "Multidimensional binary file not created"
# 验证
loaded_tensor = load_binary_with_torch(bin_file, dtype, shape)
torch_tensor_cpu = torch_tensor.cpu()
loaded_tensor_cpu = loaded_tensor.cpu()
tolerance = _TOLERANCE_MAP.get(dtype, {"atol": 0, "rtol": 1e-5})
assert torch.allclose(loaded_tensor_cpu, torch_tensor_cpu,
atol=tolerance["atol"], rtol=tolerance["rtol"]), \
f"Multidimensional data mismatch"
assert torch.allclose(
loaded_tensor_cpu,
torch_tensor_cpu,
atol=tolerance["atol"],
rtol=tolerance["rtol"],
), f"Multidimensional data mismatch"
# 清理
os.remove(bin_file)
print(f"✓ Multidimensional test passed")
......@@ -191,24 +209,25 @@ def test_multidimensional(device, test_case, dtype, config):
def test_non_contiguous_stride(device, test_case, dtype, config):
"""测试非连续内存布局的情况(is_contiguous=False,例如 transpose 后的张量)"""
test_name, shape = test_case.args
print(f"\n{'='*70}")
print(f"Testing Non-Contiguous Memory Layout on {InfiniDeviceNames[device]}")
print(f" Shape: {shape}, Dtype: {dtype}")
print(f"{'='*70}")
device_str = torch_device_map[device]
torch_dtype = to_torch_dtype(dtype)
# 创建连续张量
print(f"\nStep 1: Creating contiguous tensor...")
torch_tensor_orig = torch.arange(1, int(np.prod(shape)) + 1,
dtype=torch_dtype, device=device_str).reshape(shape)
torch_tensor_orig = torch.arange(
1, int(np.prod(shape)) + 1, dtype=torch_dtype, device=device_str
).reshape(shape)
print(f" Original shape: {torch_tensor_orig.shape}")
print(f" Original stride: {torch_tensor_orig.stride()}")
print(f" Is contiguous: {torch_tensor_orig.is_contiguous()}")
print(f" Data:\n{torch_tensor_orig}")
# 进行 transpose 操作,创建非连续张量
print(f"\nStep 2: Transposing to create non-contiguous tensor...")
torch_tensor_t = torch_tensor_orig.t() # transpose
......@@ -216,78 +235,87 @@ def test_non_contiguous_stride(device, test_case, dtype, config):
print(f" Transposed stride: {torch_tensor_t.stride()}")
print(f" Is contiguous: {torch_tensor_t.is_contiguous()}")
print(f" Data:\n{torch_tensor_t}")
# 创建 InfiniCore 张量(非连续)
# 注意:from_blob 不支持 strides,所以我们使用 permute 创建非连续张量
# permute([1, 0]) 相当于 transpose,会创建非连续的内存布局
infini_tensor_orig = create_infinicore_tensor(torch_tensor_orig, device_str)
infini_tensor_t = infini_tensor_orig.as_strided(
list(torch_tensor_t.shape),
list(torch_tensor_t.stride())
list(torch_tensor_t.shape), list(torch_tensor_t.stride())
)
print(f"\nStep 3: InfiniCore tensor after permute:")
print(f" Shape: {infini_tensor_t.shape}")
print(f" Stride: {infini_tensor_t.stride()}")
print(f" Is contiguous: {infini_tensor_t.is_contiguous()}")
# ===== 测试二进制格式 =====
print(f"\n{'='*70}")
print(f"Testing Binary Format (.bin) with Non-Contiguous Memory Layout")
print(f"{'='*70}")
print(f"Note: Binary format now SUPPORTS non-contiguous memory layout!")
print(f" It automatically detects and handles stride correctly.")
bin_file = f"/tmp/debug_non_contiguous_{device}_{dtype}.bin"
infini_tensor_t.debug(bin_file)
# 验证二进制文件
assert os.path.exists(bin_file), f"Binary file not created: {bin_file}"
# 检查文件大小
actual_size = os.path.getsize(bin_file)
expected_size = int(np.prod(torch_tensor_t.shape)) * torch_tensor_t.element_size()
print(f"\nFile size check:")
print(f" Expected: {expected_size} bytes ({int(np.prod(torch_tensor_t.shape))} elements)")
print(
f" Expected: {expected_size} bytes ({int(np.prod(torch_tensor_t.shape))} elements)"
)
print(f" Actual: {actual_size} bytes")
assert actual_size == expected_size, \
f"File size mismatch: {actual_size} vs {expected_size}"
assert (
actual_size == expected_size
), f"File size mismatch: {actual_size} vs {expected_size}"
print(f" ✓ File size is correct")
# 读取并验证数据
loaded_tensor = load_binary_with_torch(bin_file, dtype, torch_tensor_t.shape)
torch_tensor_cpu = torch_tensor_t.cpu()
loaded_tensor_cpu = loaded_tensor.cpu()
tolerance = _TOLERANCE_MAP.get(dtype, {"atol": 0, "rtol": 1e-5})
print(f"\nData verification:")
print(f" Expected (first 2 rows):\n{torch_tensor_cpu[:2]}")
print(f" Got (first 2 rows):\n{loaded_tensor_cpu[:2]}")
assert torch.allclose(loaded_tensor_cpu, torch_tensor_cpu,
atol=tolerance["atol"], rtol=tolerance["rtol"]), \
f"Data verification failed: loaded data doesn't match expected"
assert torch.allclose(
loaded_tensor_cpu,
torch_tensor_cpu,
atol=tolerance["atol"],
rtol=tolerance["rtol"],
), f"Data verification failed: loaded data doesn't match expected"
print(f"\n✓ Binary format: Data matches perfectly!")
print(f" Binary format correctly handles non-contiguous memory layout using stride")
print(
f" Binary format correctly handles non-contiguous memory layout using stride"
)
# 清理
os.remove(bin_file)
print(f"\n{'='*70}")
print(f"Non-Contiguous Memory Layout Test Summary:")
print(f" ✅ Binary format (.bin): NOW supports non-contiguous memory!")
print(f" Performance: Contiguous tensors use fast path, non-contiguous use stride-based writing")
print(
f" Performance: Contiguous tensors use fast path, non-contiguous use stride-based writing"
)
print(f"{'='*70}\n")
def test_large_scale_binary_performance(device, test_case, dtype, config):
"""测试大规模数据二进制保存性能(一千万个数据)"""
test_name, shape = test_case.args
num_elements = int(np.prod(shape))
element_size_bytes = {
infinicore.float32: 4,
......@@ -296,9 +324,9 @@ def test_large_scale_binary_performance(device, test_case, dtype, config):
infinicore.int32: 4,
infinicore.int64: 8,
}
total_size_mb = (num_elements * element_size_bytes.get(dtype, 4)) / (1024 * 1024)
print(f"\n{'='*70}")
print(f"Performance Test: Large Scale Binary Save")
print(f" Device: {InfiniDeviceNames[device]}")
......@@ -307,22 +335,22 @@ def test_large_scale_binary_performance(device, test_case, dtype, config):
print(f" Dtype: {dtype}")
print(f" Expected file size: {total_size_mb:.2f} MB")
print(f"{'='*70}")
device_str = torch_device_map[device]
torch_dtype = to_torch_dtype(dtype)
# 创建大规模张量
print(f"Creating tensor with {num_elements:,} elements...")
create_start = time.time()
torch_tensor = torch.randn(shape, dtype=torch_dtype, device=device_str)
create_time = time.time() - create_start
print(f" Tensor creation time: {create_time:.4f} seconds")
infini_tensor = create_infinicore_tensor(torch_tensor, device_str)
# 测试保存性能
bin_file = f"/tmp/debug_large_scale_{device}_{dtype}.bin"
print(f"\n{'='*70}")
print(f"[1/2] Writing Binary File")
print(f"{'='*70}")
......@@ -330,24 +358,24 @@ def test_large_scale_binary_performance(device, test_case, dtype, config):
save_start = time.time()
infini_tensor.debug(bin_file)
save_time = time.time() - save_start
# 验证文件存在
assert os.path.exists(bin_file), f"Binary file not created: {bin_file}"
# 获取实际文件大小
actual_size = os.path.getsize(bin_file)
actual_size_mb = actual_size / (1024 * 1024)
# 计算写入吞吐量
write_throughput_mbps = actual_size_mb / save_time if save_time > 0 else 0
# 打印写入性能结果
print(f"\n✓ Write Performance:")
print(f" File size: {actual_size_mb:.2f} MB ({actual_size:,} bytes)")
print(f" Write time: {save_time:.4f} seconds")
print(f" Write throughput: {write_throughput_mbps:.2f} MB/s")
print(f" Elements written/sec: {num_elements/save_time:,.0f}")
# 测试读取性能
print(f"\n{'='*70}")
print(f"[2/2] Reading Binary File (for verification)")
......@@ -356,25 +384,27 @@ def test_large_scale_binary_performance(device, test_case, dtype, config):
loaded_tensor = load_binary_with_torch(bin_file, dtype, shape)
read_time = time.time() - read_start
read_throughput_mbps = actual_size_mb / read_time if read_time > 0 else 0
print(f"\n✓ Read Performance:")
print(f" Read time: {read_time:.4f} seconds")
print(f" Read throughput: {read_throughput_mbps:.2f} MB/s")
print(f" Elements read/sec: {num_elements/read_time:,.0f}")
# 简单验证前几个元素(不做完整验证以节省时间)
torch_tensor_cpu = torch_tensor.cpu()
loaded_tensor_cpu = loaded_tensor.cpu()
sample_size = min(1000, num_elements)
tolerance = _TOLERANCE_MAP.get(dtype, {"atol": 0, "rtol": 1e-5})
assert torch.allclose(loaded_tensor_cpu.flatten()[:sample_size],
torch_tensor_cpu.flatten()[:sample_size],
atol=tolerance["atol"], rtol=tolerance["rtol"]), \
f"Data verification failed (sampled first {sample_size} elements)"
assert torch.allclose(
loaded_tensor_cpu.flatten()[:sample_size],
torch_tensor_cpu.flatten()[:sample_size],
atol=tolerance["atol"],
rtol=tolerance["rtol"],
), f"Data verification failed (sampled first {sample_size} elements)"
print(f" Data verification: ✓ (sampled first {sample_size} elements)")
# 打印性能总结
print(f"\n{'='*70}")
print(f"Performance Summary")
......@@ -383,21 +413,24 @@ def test_large_scale_binary_performance(device, test_case, dtype, config):
print(f" File size: {actual_size_mb:.2f} MB")
print(f" Write time: {save_time:.4f} sec → {write_throughput_mbps:.2f} MB/s")
print(f" Read time: {read_time:.4f} sec → {read_throughput_mbps:.2f} MB/s")
print(f" Speed ratio (Read/Write): {read_throughput_mbps/write_throughput_mbps:.2f}x")
print(
f" Speed ratio (Read/Write): {read_throughput_mbps/write_throughput_mbps:.2f}x"
)
print(f"{'='*70}")
# 清理
os.remove(bin_file)
print(f"\n✓ Large scale performance test passed\n")
# ==============================================================================
# Main Execution Function
# ==============================================================================
def main():
args = get_args()
# 创建测试配置
config = TestConfig(
tensor_dtypes=_TENSOR_DTYPES,
......@@ -405,58 +438,62 @@ def main():
debug=args.debug,
bench=False, # debug 测试不需要性能测试
)
# 获取测试设备
devices = get_test_devices(args)
print("Starting debug tests...")
all_passed = True
# 为每种测试类型运行测试
test_funcs = [
("Basic Print", test_basic_print, [_TEST_CASES[0]]),
("Binary Save", test_binary_save, [_TEST_CASES[1]]),
("Multidimensional", test_multidimensional, [_TEST_CASES[2]]),
]
for test_name, test_func, test_cases in test_funcs:
print(f"\n{'='*60}")
print(f"Testing {test_name}")
print(f"{'='*60}")
runner = TestRunner(test_cases, config)
passed = runner.run_tests(devices, test_func)
all_passed = all_passed and passed
# 运行非连续内存布局测试
print(f"\n{'='*60}")
print(f"Testing Non-Contiguous Memory Layout (is_contiguous=False)")
print(f"{'='*60}")
non_contiguous_runner = TestRunner(_NON_CONTIGUOUS_TEST_CASES, config)
non_contiguous_passed = non_contiguous_runner.run_tests(devices, test_non_contiguous_stride)
non_contiguous_passed = non_contiguous_runner.run_tests(
devices, test_non_contiguous_stride
)
all_passed = all_passed and non_contiguous_passed
# 运行大规模性能测试
print(f"\n{'='*60}")
print(f"Testing Large Scale Performance (10M elements)")
print(f"{'='*60}")
large_scale_runner = TestRunner(_LARGE_SCALE_TEST_CASES, config)
large_scale_passed = large_scale_runner.run_tests(devices, test_large_scale_binary_performance)
large_scale_passed = large_scale_runner.run_tests(
devices, test_large_scale_binary_performance
)
all_passed = all_passed and large_scale_passed
# 打印总结
print(f"\n{'='*60}")
print("Test Summary")
print(f"{'='*60}")
if all_passed:
print("\033[92m✅ All debug tests passed!\033[0m")
else:
print("\033[91m❌ Some tests failed!\033[0m")
sys.exit(0 if all_passed else 1)
......
......@@ -23,6 +23,7 @@ def get_supported_hardware_platforms():
("--moore", "Moore Threads GPUs (requires torch_musa)"),
("--kunlun", "Kunlun XPUs (requires torch_xmlir)"),
("--hygon", "Hygon DCUs"),
("--qy", "QY GPUs"),
]
......@@ -194,6 +195,15 @@ def get_test_devices(args):
devices_to_test.append(InfiniDeviceEnum.HYGON)
except ImportError:
print("Warning: Hygon DCU support not available")
if args.qy:
try:
# QY GPU detection
import torch
devices_to_test.append(InfiniDeviceEnum.QY)
except ImportError:
print("Warning: QY GPU support not available")
# Default to CPU if no devices specified
if not devices_to_test:
......
......@@ -8,6 +8,7 @@ class InfiniDeviceEnum:
ILUVATAR = 6
KUNLUN = 7
HYGON = 8
QY = 9
InfiniDeviceNames = {
......@@ -18,6 +19,7 @@ InfiniDeviceNames = {
InfiniDeviceEnum.METAX: "Metax",
InfiniDeviceEnum.MOORE: "Moore",
InfiniDeviceEnum.ILUVATAR: "Iluvatar",
InfiniDeviceEnum.QY: "Qy",
InfiniDeviceEnum.KUNLUN: "Kunlun",
InfiniDeviceEnum.HYGON: "Hygon",
}
......@@ -32,4 +34,5 @@ torch_device_map = {
InfiniDeviceEnum.ILUVATAR: "cuda",
InfiniDeviceEnum.KUNLUN: "cuda",
InfiniDeviceEnum.HYGON: "cuda",
InfiniDeviceEnum.QY: "cuda",
}
......@@ -23,7 +23,6 @@ from libinfiniop import (
)
def causal_softmax(x):
type = x.dtype
mask = torch.tril(torch.ones_like(x), diagonal=-1).flip(dims=[-2, -1])
......
......@@ -96,27 +96,27 @@ NUM_ITERATIONS = 1000
def conv(x, w, stride, padding, dilation, y_tensor, bias=None):
    """Compute the PyTorch reference convolution and copy it into ``y_tensor``.

    The number of spatial dimensions is ``len(x.shape) - 2``; 1, 2 and 3 are
    dispatched to ``F.conv1d``, ``F.conv2d`` and ``F.conv3d`` respectively.

    Args:
        x: Input tensor.
        w: Weight tensor.
        stride: Forwarded as ``stride=`` to the PyTorch convolution.
        padding: Forwarded as ``padding=`` to the PyTorch convolution.
        dilation: Forwarded as ``dilation=`` to the PyTorch convolution.
        y_tensor: Destination tensor; overwritten in place via ``copy_``.
        bias: Optional bias forwarded as ``bias=``.

    Returns:
        None. For any other number of spatial dimensions an error message is
        printed and ``y_tensor`` is left untouched.
    """
    # Do not use `match` here: it causes the CI run to fail.
    ndim = len(x.shape) - 2
    conv_by_ndim = {1: F.conv1d, 2: F.conv2d, 3: F.conv3d}
    conv_fn = conv_by_ndim.get(ndim)
    if conv_fn is None:
        print("Error: Pytorch -> Unsupported tensor dimension")
        return
    y_tensor.copy_(
        conv_fn(x, w, bias=bias, stride=stride, padding=padding, dilation=dilation)
    )
# infer the shape of the output given the inputs for a N-ary convolution
......
import torch
import ctypes
from ctypes import c_uint64
from libinfiniop import (
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
from enum import Enum, auto
# ==============================================================================
#  Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules.
#
# Base test cases. Each entry is a tuple of (shape, input_stride, output_stride).
# A stride of None is forwarded as-is to TestTensor (presumably selecting the
# default contiguous layout -- confirm in TestTensor).
_TEST_CASES_ = [
    # shape, input_stride, output_stride
    ((13, 4), None, None),
    ((13, 4), (10, 1), (10, 1)),
    # Disabled: input with a zero stride.
    # NOTE(review): the reason for disabling is not recorded here.
    #((13, 4), (0, 1), None),
    ((13, 4, 4), None, None),
    ((13, 4, 4), (20, 4, 1), (20, 4, 1)),
    # Disabled: input with a zero stride (see note above).
    #((13, 4, 4), (4, 0, 1), None),
    ((16, 5632), None, None),
    ((16, 5632), (13312, 1), (13312, 1)),
    ((4, 4, 5632), None, None),
    ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)),
]
class Inplace(Enum):
    """Whether the operator writes to a separate output or reuses the input."""

    # Result goes to a separately allocated output tensor.
    OUT_OF_PLACE = 1
    # Output aliases the input tensor.
    INPLACE = 2
# Every base case in _TEST_CASES_ is run once per in-place mode listed here.
_INPLACE = [Inplace.OUT_OF_PLACE, Inplace.INPLACE]

# Final case list: each base tuple extended with one in-place mode
# (base cases outermost, in-place modes innermost).
_TEST_CASES = [(*base_case, mode) for base_case in _TEST_CASES_ for mode in _INPLACE]

# Data types exercised by the test.
_TENSOR_DTYPES = [
    InfiniDtype.BF16,
    InfiniDtype.F16,
    InfiniDtype.F32,
    InfiniDtype.F64,
]

# Absolute / relative tolerances used when comparing against PyTorch, per dtype.
_TOLERANCE_MAP = {
    InfiniDtype.BF16: dict(atol=1e-2, rtol=1e-2),
    InfiniDtype.F16: dict(atol=1e-3, rtol=1e-3),
    InfiniDtype.F32: dict(atol=1e-5, rtol=1e-5),
    InfiniDtype.F64: dict(atol=1e-6, rtol=1e-6),
}

# Runtime switches; overwritten from the command line in the __main__ section.
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
def test(
    handle,
    device,
    shape,
    input_stride=None,
    output_stride=None,
    inplace=Inplace.OUT_OF_PLACE,
    # NOTE(review): dtype is used as a key into InfiniDtypeNames and passed to
    # TestTensor, so callers presumably pass an InfiniDtype; the torch.float16
    # default looks like a leftover -- confirm before relying on it.
    dtype=torch.float16,
    sync=None,
):
    """Run a single GELU test case and compare the library against PyTorch.

    Args:
        handle: Library handle passed to the descriptor constructor.
        device: Device identifier; used to build tensors and for logging.
        shape: Shape of the input and output tensors.
        input_stride: Strides for the input tensor, or None.
        output_stride: Strides for the output tensor, or None.
        inplace: Inplace.INPLACE makes the output alias the input tensor.
        dtype: Element type of the tensors.
        sync: Optional callable invoked before the descriptor is created.

    Returns:
        None. The case is silently skipped when in-place execution is requested
        with differing input/output strides, or when the output is broadcast.

    Raises:
        AssertionError: If the library result is not within tolerance of the
            PyTorch reference.
    """
    input = TestTensor(shape, input_stride, dtype, device)
    if inplace == Inplace.INPLACE:
        # In-place execution only makes sense when both layouts agree.
        if input_stride != output_stride:
            return
        output = input
    else:
        output = TestTensor(shape, output_stride, dtype, device, mode="ones")

    if output.is_broadcast():
        return

    # The trailing space keeps "output_stride:..." and "dtype:..." from running
    # together in the log line.
    print(
        f"Testing Gelu on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} "
        f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
    )

    # Reference result computed with PyTorch.
    new_tensor = torch.nn.functional.gelu(input.torch_tensor())
    output.update_torch_tensor(new_tensor)

    if sync is not None:
        sync()

    descriptor = infiniopOperatorDescriptor_t()
    check_error(
        LIBINFINIOP.infiniopCreateGeluDescriptor(
            handle,
            ctypes.byref(descriptor),
            output.descriptor,
            input.descriptor,
        )
    )

    # From here on the descriptor exists; always release it, even when the
    # kernel call or the accuracy check raises.
    try:
        # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
        for tensor in [input, output]:
            tensor.destroy_desc()

        workspace_size = c_uint64(0)
        check_error(
            LIBINFINIOP.infiniopGetGeluWorkspaceSize(
                descriptor, ctypes.byref(workspace_size)
            )
        )
        workspace = TestWorkspace(workspace_size.value, output.device)

        def lib_gelu():
            check_error(
                LIBINFINIOP.infiniopGelu(
                    descriptor,
                    workspace.data(),
                    workspace.size(),
                    output.data(),
                    input.data(),
                    None,
                )
            )

        lib_gelu()

        atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
        if DEBUG:
            debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol)
        assert torch.allclose(
            output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol
        )

        # Profiling workflow
        if PROFILE:
            # fmt: off
            profile_operation("PyTorch", lambda: torch.nn.functional.gelu(input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
            profile_operation("    lib", lib_gelu, device, NUM_PRERUN, NUM_ITERATIONS)
            # fmt: on
    finally:
        check_error(LIBINFINIOP.infiniopDestroyGeluDescriptor(descriptor))
if __name__ == "__main__":
    # Parse the command line once and mirror its options into the module-level
    # switches read by test().
    args = get_args()
    DEBUG, PROFILE = args.debug, args.profile
    NUM_PRERUN, NUM_ITERATIONS = args.num_prerun, args.num_iterations

    # Run every test case / dtype combination on each selected device.
    for device in get_test_devices(args):
        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)

    print("\033[92mTest passed!\033[0m")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment