Commit 18773b69 authored by wooway777

Revert "Merge pull request #1069 from InfiniTensor/issue/1031_T1_1_15"

This reverts commit 21c6af2d, reversing
changes made to 99a802dd.
parent bfead271
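A merge revert of this shape is typically produced with git revert -m 1 21c6af2d, where -m 1 selects the first parent (99a802dd) as the mainline and re-applies the merge's changes in reverse.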
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/reciprocal.h"
#ifdef ENABLE_CPU_API
#include "cpu/reciprocal_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
#include "nvidia/reciprocal_nvidia.cuh"
#endif
#ifdef ENABLE_METAX_API
#include "metax/reciprocal_metax.h"
#endif
#ifdef ENABLE_KUNLUN_API
#include "kunlun/reciprocal_kunlun.h"
#endif
#ifdef ENABLE_CAMBRICON_API
#include "bang/reciprocal_bang.h"
#endif
#ifdef ENABLE_MOORE_API
#include "moore/reciprocal_moore.h"
#endif
__INFINI_C infiniStatus_t infiniopCreateReciprocalDescriptor(
infiniopHandle_t handle,
infiniopReciprocalDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc) {
#define CREATE(CASE, NAMESPACE) \
case CASE: \
return op::reciprocal::NAMESPACE::Descriptor::create( \
handle, \
reinterpret_cast<op::reciprocal::NAMESPACE::Descriptor **>(desc_ptr), \
y_desc, \
{x_desc})
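// For reference, CREATE(INFINI_DEVICE_CPU, cpu) below expands to:
//   case INFINI_DEVICE_CPU:
//       return op::reciprocal::cpu::Descriptor::create(
//           handle,
//           reinterpret_cast<op::reciprocal::cpu::Descriptor **>(desc_ptr),
//           y_desc,
//           {x_desc});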
switch (handle->device) {
#ifdef ENABLE_CPU_API
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
CREATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
CREATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
CREATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_API
CREATE(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_MOORE_API
CREATE(INFINI_DEVICE_MOORE, moore);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef CREATE
}
__INFINI_C infiniStatus_t infiniopGetReciprocalWorkspaceSize(infiniopReciprocalDescriptor_t desc, size_t *size) {
#define GET(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<op::reciprocal::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
GET(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
GET(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
GET(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_API
GET(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_MOORE_API
GET(INFINI_DEVICE_MOORE, moore);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef GET
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__INFINI_C infiniStatus_t infiniopReciprocal(
infiniopReciprocalDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *y,
const void *x,
void *stream) {
#define CALCULATE(CASE, NAMESPACE) \
case CASE: \
return reinterpret_cast<const op::reciprocal::NAMESPACE::Descriptor *>(desc) \
->calculate(workspace, workspace_size, y, {x}, stream)
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
CALCULATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_API
CALCULATE(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_MOORE_API
CALCULATE(INFINI_DEVICE_MOORE, moore);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef CALCULATE
}
__INFINI_C infiniStatus_t
infiniopDestroyReciprocalDescriptor(infiniopReciprocalDescriptor_t desc) {
#define DELETE(CASE, NAMESPACE) \
case CASE: \
delete reinterpret_cast<const op::reciprocal::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
DELETE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
DELETE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
DELETE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_API
DELETE(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_MOORE_API
DELETE(INFINI_DEVICE_MOORE, moore);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef DELETE
}
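The four entry points above form the standard infiniop descriptor lifecycle: create, query workspace size, calculate, destroy. Below is a minimal ctypes sketch of that call sequence; it is an illustration only, assuming the shared library is built as libinfiniop.so, that INFINI_STATUS_SUCCESS equals 0, and that the handle, tensor descriptors, and device pointers are obtained as in the tests further down.

import ctypes
from ctypes import byref, c_size_t, c_void_p

INFINI_STATUS_SUCCESS = 0  # assumed numeric value of the success status

lib = ctypes.CDLL("libinfiniop.so")  # hypothetical library path

def run_reciprocal(handle, y_desc, x_desc, y_ptr, x_ptr, stream=None):
    # 1. Create the operator descriptor from the tensor descriptors.
    desc = c_void_p()
    assert lib.infiniopCreateReciprocalDescriptor(
        handle, byref(desc), y_desc, x_desc) == INFINI_STATUS_SUCCESS
    # 2. Query the workspace size and allocate a buffer only if one is needed.
    size = c_size_t(0)
    assert lib.infiniopGetReciprocalWorkspaceSize(
        desc, byref(size)) == INFINI_STATUS_SUCCESS
    workspace = ctypes.create_string_buffer(size.value) if size.value else None
    # 3. Launch the computation (stream=None selects the default stream).
    assert lib.infiniopReciprocal(
        desc, workspace, size.value, y_ptr, x_ptr, stream) == INFINI_STATUS_SUCCESS
    # 4. Destroy the descriptor once the computation is done.
    assert lib.infiniopDestroyReciprocalDescriptor(desc) == INFINI_STATUS_SUCCESS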
@@ -119,9 +119,9 @@ class OpTest(BaseOperatorTest):
 def torch_operator(self, *args, **kwargs):
 return torch.addcmul(*args, **kwargs)
-def infinicore_operator(self, *args, **kwargs):
-"""InfiniCore implementation (operator not yet available)."""
-return infinicore.addcmul(*args, **kwargs)
+# def infinicore_operator(self, *args, **kwargs):
+# """InfiniCore implementation (operator not yet available)."""
+# return infinicore.addcmul(*args, **kwargs)
 def main():
......
@@ -97,9 +97,9 @@ class OpTest(BaseOperatorTest):
 def torch_operator(self, *args, **kwargs):
 return torch.atanh(*args, **kwargs)
-def infinicore_operator(self, *args, **kwargs):
-"""InfiniCore atanh implementation"""
-return infinicore.atanh(*args, **kwargs)
+# def infinicore_operator(self, *args, **kwargs):
+# """InfiniCore implementation (operator not yet available)."""
+# return infinicore.atanh(*args, **kwargs)
 def main():
......
@@ -80,9 +80,9 @@ class OpTest(BaseOperatorTest):
 def torch_operator(self, *args, **kwargs):
 return torch.nn.functional.binary_cross_entropy_with_logits(*args, **kwargs)
-def infinicore_operator(self, *args, **kwargs):
-"""InfiniCore implementation (operator not yet available)."""
-return infinicore.nn.functional.binary_cross_entropy_with_logits(*args, **kwargs)
+# def infinicore_operator(self, *args, **kwargs):
+# """InfiniCore implementation (operator not yet available)."""
+# return infinicore.nn.functional.binary_cross_entropy_with_logits(*args, **kwargs)
 def main():
......
@@ -63,9 +63,9 @@ class OpTest(BaseOperatorTest):
 def torch_operator(self, *args, **kwargs):
 return torch.cdist(*args, **kwargs)
-def infinicore_operator(self, *args, **kwargs):
-"""InfiniCore implementation (operator not yet available)."""
-return infinicore.cdist(*args, **kwargs)
+# def infinicore_operator(self, *args, **kwargs):
+# """InfiniCore implementation (operator not yet available)."""
+# return infinicore.cdist(*args, **kwargs)
 def main():
......
@@ -89,9 +89,9 @@ class OpTest(BaseOperatorTest):
 def torch_operator(self, *args, **kwargs):
 return torch.reciprocal(*args, **kwargs)
-def infinicore_operator(self, *args, **kwargs):
-"""InfiniCore implementation (operator not yet available)."""
-return infinicore.reciprocal(*args, **kwargs)
+# def infinicore_operator(self, *args, **kwargs):
+# """InfiniCore implementation (operator not yet available)."""
+# return infinicore.reciprocal(*args, **kwargs)
 def main():
......
import torch
import ctypes
from ctypes import c_uint64, c_float
from libinfiniop import (
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
from enum import Enum, auto
# ==============================================================================
# Configuration
# ==============================================================================
_TEST_CASES_ = [
# shape, input_stride, t1_stride, t2_stride
((3, 3), None, None, None),
((32, 512), None, None, None),
((32, 512), (1024, 1), (1024, 1), (1024, 1)),
((16, 32, 64), None, None, None),
((8, 1, 1024), None, None, None), # potential broadcast-shape test case
]
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32]
_TOLERANCE_MAP = {
InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-2},
InfiniDtype.BF16: {"atol": 5e-3, "rtol": 5e-2},
InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5},
}
class Inplace(Enum):
OUT_OF_PLACE = auto()
INPLACE_INPUT = auto()
_INPLACE = [Inplace.OUT_OF_PLACE, Inplace.INPLACE_INPUT]
_VALUES = [1.0, 0.5, -2.0] # exercise different value coefficients
_TEST_CASES = [
test_case + (inplace_item, value)
for test_case in _TEST_CASES_
for inplace_item in _INPLACE
for value in _VALUES
]
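# For example, ((3, 3), None, None, None) expands into six cases such as
# ((3, 3), None, None, None, Inplace.OUT_OF_PLACE, 1.0); the fields map onto
# test()'s (shape, input_stride, t1_stride, t2_stride, inplace, value) parameters.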
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 100
def test(
handle,
device,
shape,
input_stride=None,
t1_stride=None,
t2_stride=None,
inplace=Inplace.OUT_OF_PLACE,
value=1.0,
dtype=InfiniDtype.F16,
sync=None,
):
print(
f"Testing Addcmul on {InfiniDeviceNames[device]} with shape:{shape} value:{value} dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
)
# Prepare the input tensors
input_tensor = TestTensor(shape, input_stride, dtype, device)
t1 = TestTensor(shape, t1_stride, dtype, device)
t2 = TestTensor(shape, t2_stride, dtype, device)
# Compute the reference answer with PyTorch:
# out = input + value * t1 * t2
ans = torch.addcmul(input_tensor.torch_tensor(), t1.torch_tensor(), t2.torch_tensor(), value=value)
if inplace == Inplace.INPLACE_INPUT:
out = input_tensor
else:
out = TestTensor(shape, None, dtype, device)
if sync is not None:
sync()
descriptor = infiniopOperatorDescriptor_t()
# Note: per the earlier definition, the Create interface takes value
check_error(
LIBINFINIOP.infiniopCreateAddcmulDescriptor(
handle,
ctypes.byref(descriptor),
out.descriptor,
input_tensor.descriptor,
t1.descriptor,
t2.descriptor,
c_float(value)
)
)
# Destroy the temporary descriptors so the kernel cannot hold stale references
for t in [input_tensor, t1, t2, out]:
t.destroy_desc()
workspace_size = c_uint64(0)
check_error(
LIBINFINIOP.infiniopGetAddcmulWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = TestWorkspace(workspace_size.value, input_tensor.device)
def lib_addcmul():
check_error(
LIBINFINIOP.infiniopAddcmul(
descriptor,
workspace.data(),
workspace_size.value,
out.data(),
input_tensor.data(),
t1.data(),
t2.data(),
None,
)
)
lib_addcmul()
if sync is not None:
sync()
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(out.actual_tensor(), ans, atol=atol, rtol=rtol)
assert torch.allclose(out.actual_tensor(), ans, atol=atol, rtol=rtol)
if PROFILE:
profile_operation("PyTorch", lambda: torch.addcmul(input_tensor.torch_tensor(), t1.torch_tensor(), t2.torch_tensor(), value=value), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_addcmul(), device, NUM_PRERUN, NUM_ITERATIONS)
check_error(LIBINFINIOP.infiniopDestroyAddcmulDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
DEBUG = args.debug
PROFILE = args.profile
NUM_PRERUN = args.num_prerun
NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args):
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mAddcmul tests passed!\033[0m")
import torch
import ctypes
from ctypes import c_uint64
from libinfiniop import (
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
from enum import Enum, auto
# ==============================================================================
# Configuration
# ==============================================================================
_TEST_CASES_ = [
# shape, a_stride, y_stride
((13, 4), None, None),
((13, 4), (10, 1), (10, 1)),
((13, 4, 4), None, None),
((13, 4, 4), (20, 4, 1), (20, 4, 1)),
((16, 5632), None, None),
((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)),
]
class Inplace(Enum):
OUT_OF_PLACE = auto()
INPLACE_A = auto()
_INPLACE = [
Inplace.OUT_OF_PLACE,
Inplace.INPLACE_A,
]
_TEST_CASES = [
test_case + (inplace_item,)
for test_case in _TEST_CASES_
for inplace_item in _INPLACE
]
# atanh typically supports floating point types
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16]
_TOLERANCE_MAP = {
InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6},
InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2},
}
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
def atanh_torch(y, a):
torch.atanh(a, out=y)
def test(
handle,
device,
shape,
a_stride=None,
y_stride=None,
inplace=Inplace.OUT_OF_PLACE,
dtype=InfiniDtype.F16,
sync=None,
):
# Initialize input tensor
a = TestTensor(shape, a_stride, dtype, device)
# Crucial: clamp values to (-1, 1) to avoid NaN/Inf for atanh
with torch.no_grad():
a.torch_tensor().clamp_(-0.99, 0.99)
# Keep underlying data in sync for all devices (including CPU)
a.actual_tensor().copy_(a.torch_tensor())
if inplace == Inplace.INPLACE_A:
if a_stride != y_stride:
return
y = a
else:
y = TestTensor(shape, y_stride, dtype, device, mode="ones")
if y.is_broadcast():
return
print(
f"Testing Atanh on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} y_stride:{y_stride} "
f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
)
# Reference calculation
atanh_torch(y.torch_tensor(), a.torch_tensor())
if sync is not None:
sync()
# Create descriptor
descriptor = infiniopOperatorDescriptor_t()
check_error(
LIBINFINIOP.infiniopCreateAtanhDescriptor(
handle,
ctypes.byref(descriptor),
y.descriptor,
a.descriptor,
)
)
# Invalidate descriptors to ensure kernel uses its own internal state
for tensor in [a, y]:
tensor.destroy_desc()
# Workspace management
workspace_size = c_uint64(0)
check_error(
LIBINFINIOP.infiniopGetAtanhWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = TestWorkspace(workspace_size.value, y.device)
def lib_atanh():
check_error(
LIBINFINIOP.infiniopAtanh(
descriptor,
workspace.data(),
workspace.size(),
y.data(),
a.data(),
None,
)
)
# Run library function
lib_atanh()
# Validate results
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
assert torch.allclose(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
# Profiling
if PROFILE:
profile_operation("PyTorch", lambda: atanh_torch(y.torch_tensor(), a.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_atanh(), device, NUM_PRERUN, NUM_ITERATIONS)
check_error(LIBINFINIOP.infiniopDestroyAtanhDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
DEBUG = args.debug
PROFILE = args.profile
NUM_PRERUN = args.num_prerun
NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args):
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mAtanh Test passed!\033[0m")
import torch
import ctypes
from ctypes import c_uint64, c_float, c_char_p
from libinfiniop import (
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
# ==============================================================================
# Configuration
# ==============================================================================
# Test scenarios: (shape, has_weight, has_pos_weight, reduction)
_TEST_CASES_DATA = [
((4, 5), False, False, "none"),
((8, 8), True, False, "sum"),
((32, 512), False, True, "mean"),
((16, 32, 64), True, True, "mean"),
]
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32]
_TOLERANCE_MAP = {
InfiniDtype.F16: {"atol": 1e-2, "rtol": 5e-2},
InfiniDtype.BF16: {"atol": 1e-2, "rtol": 5e-2},
InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5},
}
_REDUCTIONS = ["none", "mean", "sum"]
_REDUCTION_MAP = {
"none": 0, # INFINIOP_REDUCTION_NONE
"mean": 1, # INFINIOP_REDUCTION_MEAN
"sum": 2, # INFINIOP_REDUCTION_SUM
}
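# For a (4, 5) input, reduction="none" keeps the (4, 5) output shape, while
# "mean" and "sum" reduce the loss to a scalar (shape ()), as out_shape below reflects.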
# Final test-case combinations
_TEST_CASES = _TEST_CASES_DATA
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 100
def test(
handle,
device,
shape,
has_weight=False,
has_pos_weight=False,
reduction="none",
dtype=InfiniDtype.F16,
sync=None,
):
print(
f"Testing BCEWithLogits on {InfiniDeviceNames[device]} shape:{shape} "
f"weight:{has_weight} pos_weight:{has_pos_weight} reduction:{reduction} dtype:{InfiniDtypeNames[dtype]}"
)
# 1. Prepare the input tensors
input_tensor = TestTensor(shape, None, dtype, device)
target = TestTensor(shape, None, dtype, device)
weight = TestTensor(shape, None, dtype, device) if has_weight else None
# pos_weight typically broadcasts along the last dimension, with shape (C,)
pos_weight_shape = (shape[-1],)
pos_weight = TestTensor(pos_weight_shape, None, dtype, device) if has_pos_weight else None
# 2. Compute the reference answer with PyTorch
torch_input = input_tensor.torch_tensor()
torch_target = target.torch_tensor()
torch_weight = weight.torch_tensor() if has_weight else None
torch_pos_weight = pos_weight.torch_tensor() if has_pos_weight else None
ans = torch.nn.functional.binary_cross_entropy_with_logits(
torch_input,
torch_target,
weight=torch_weight,
pos_weight=torch_pos_weight,
reduction=reduction
)
# 3. Prepare the output tensor (shape depends on reduction)
out_shape = () if reduction != "none" else shape
out = TestTensor(out_shape, None, dtype, device)
if sync is not None:
sync()
# 4. Create the descriptor and execute
descriptor = infiniopOperatorDescriptor_t()
# Emulate the C interface call
reduction_enum = _REDUCTION_MAP[reduction]
check_error(
LIBINFINIOP.infiniopCreateBCEWithLogitsDescriptor(
handle,
ctypes.byref(descriptor),
out.descriptor,
input_tensor.descriptor,
target.descriptor,
weight.descriptor if has_weight else None,
pos_weight.descriptor if has_pos_weight else None,
reduction_enum # reduction-mode enum value, corresponding to infiniopReduction_t
)
)
# Destroy the temporary descriptors
for t in [input_tensor, target, out]:
t.destroy_desc()
if weight is not None: weight.destroy_desc()
if pos_weight is not None: pos_weight.destroy_desc()
workspace_size = c_uint64(0)
check_error(LIBINFINIOP.infiniopGetBCEWithLogitsWorkspaceSize(descriptor, ctypes.byref(workspace_size)))
workspace = TestWorkspace(workspace_size.value, device)
def lib_op():
check_error(
LIBINFINIOP.infiniopBCEWithLogits(
descriptor,
workspace.data(),
workspace_size.value,
out.data(),
input_tensor.data(),
target.data(),
weight.data() if has_weight else None,
pos_weight.data() if has_pos_weight else None,
None,
)
)
lib_op()
if sync is not None:
sync()
# 5. Validate the results
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(out.actual_tensor(), ans, atol=atol, rtol=rtol)
assert torch.allclose(out.actual_tensor(), ans, atol=atol, rtol=rtol)
if PROFILE:
profile_operation("PyTorch", lambda: torch.nn.functional.binary_cross_entropy_with_logits(
torch_input, torch_target, weight=torch_weight, pos_weight=torch_pos_weight, reduction=reduction
), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lib_op, device, NUM_PRERUN, NUM_ITERATIONS)
check_error(LIBINFINIOP.infiniopDestroyBCEWithLogitsDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
DEBUG = args.debug
PROFILE = args.profile
NUM_PRERUN = args.num_prerun
NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args):
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mBCEWithLogits tests passed!\033[0m")
import torch
import ctypes
from ctypes import c_uint64, c_float, c_double
from libinfiniop import (
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
# ==============================================================================
# Configuration
# ==============================================================================
# Format: (M, N, D, x1_stride, x2_stride)
# x1: (M, D), x2: (N, D), out: (M, N)
_TEST_CASES_DATA = [
(5, 6, 3, None, None),
(32, 64, 128, None, None),
(32, 64, 128, (256, 1), (256, 1)), # strided-input case
(10, 7, 5, None, None),
]
_TENSOR_DTYPES = [InfiniDtype.F32] # cdist is precision-sensitive; start with F32
_TOLERANCE_MAP = {
InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-4},
}
_P_VALUES = [1.0, 2.0, float("inf")] # exercise different p-norms
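# Worked example: for x1 = [0, 0] and x2 = [3, 4], p=1.0 gives |3| + |4| = 7,
# p=2.0 gives sqrt(3**2 + 4**2) = 5, and p=inf gives max(3, 4) = 4.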
_TEST_CASES = [
test_case + (p_val,)
for test_case in _TEST_CASES_DATA
for p_val in _P_VALUES
]
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 100
def test(
handle,
device,
M, N, D,
x1_stride=None,
x2_stride=None,
p=2.0,
dtype=InfiniDtype.F32,
sync=None,
):
print(
f"Testing Cdist on {InfiniDeviceNames[device]} with M:{M}, N:{N}, D:{D}, p:{p}, dtype:{InfiniDtypeNames[dtype]}"
)
# 1. Prepare the input/output shapes
x1_shape = (M, D)
x2_shape = (N, D)
out_shape = (M, N)
# 2. Prepare the input tensors
x1 = TestTensor(x1_shape, x1_stride, dtype, device)
x2 = TestTensor(x2_shape, x2_stride, dtype, device)
out = TestTensor(out_shape, None, dtype, device)
# 3. Compute the reference answer with PyTorch
# torch.cdist requires inputs to be at least 2-D
ans = torch.cdist(x1.torch_tensor(), x2.torch_tensor(), p=p)
if sync is not None:
sync()
# 4. Create the operator descriptor
descriptor = infiniopOperatorDescriptor_t()
# Note: this assumes the C interface is named infiniopCreateCdistDescriptor
check_error(
LIBINFINIOP.infiniopCreateCdistDescriptor(
handle,
ctypes.byref(descriptor),
out.descriptor,
x1.descriptor,
x2.descriptor,
c_double(p) # p is usually passed as a double or float
)
)
# Destroy the temporary descriptors to avoid stale references in the kernel (same style as addcmul)
for t in [x1, x2, out]:
t.destroy_desc()
# 5. Prepare the workspace
workspace_size = c_uint64(0)
check_error(
LIBINFINIOP.infiniopGetCdistWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = TestWorkspace(workspace_size.value, x1.device)
# 6. Define the execution function
def lib_cdist():
check_error(
LIBINFINIOP.infiniopCdist(
descriptor,
workspace.data(),
workspace_size.value,
out.data(),
x1.data(),
x2.data(),
None, # stream
)
)
# 7. Run
lib_cdist()
if sync is not None:
sync()
# 8. Validate the results
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(out.actual_tensor(), ans, atol=atol, rtol=rtol)
assert torch.allclose(out.actual_tensor(), ans, atol=atol, rtol=rtol)
# 9. Profiling
if PROFILE:
profile_operation("PyTorch", lambda: torch.cdist(x1.torch_tensor(), x2.torch_tensor(), p=p), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_cdist(), device, NUM_PRERUN, NUM_ITERATIONS)
check_error(LIBINFINIOP.infiniopDestroyCdistDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
DEBUG = args.debug
PROFILE = args.profile
NUM_PRERUN = args.num_prerun
NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args):
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mCdist tests passed!\033[0m")
@@ -4,7 +4,8 @@ from .structs import (
 infiniopOperatorDescriptor_t,
 )
-from ctypes import c_int32, c_void_p, c_size_t, POINTER, c_float, c_double, c_uint64
+from ctypes import c_int32, c_void_p, c_size_t, POINTER, c_float
class OpRegister:
registry = []
@@ -19,36 +20,6 @@ class OpRegister:
for op in cls.registry:
op(lib)
@OpRegister.operator
def atanh_(lib):
lib.infiniopCreateAtanhDescriptor.restype = c_int32
lib.infiniopCreateAtanhDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopOperatorDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopGetAtanhWorkspaceSize.restype = c_int32
lib.infiniopGetAtanhWorkspaceSize.argtypes = [
infiniopOperatorDescriptor_t,
POINTER(c_size_t),
]
lib.infiniopAtanh.restype = c_int32
lib.infiniopAtanh.argtypes = [
infiniopOperatorDescriptor_t,
c_void_p, # workspace
c_size_t, # workspace_size
c_void_p, # y_data
c_void_p, # a_data
c_void_p, # stream
]
lib.infiniopDestroyAtanhDescriptor.restype = c_int32
lib.infiniopDestroyAtanhDescriptor.argtypes = [
infiniopOperatorDescriptor_t,
]
@OpRegister.operator
def add_(lib):
@@ -84,156 +55,6 @@ def add_(lib):
]
@OpRegister.operator
def addcmul_(lib):
lib.infiniopCreateAddcmulDescriptor.restype = c_int32
lib.infiniopCreateAddcmulDescriptor.argtypes = [
infiniopHandle_t, # handle
POINTER(infiniopOperatorDescriptor_t), # desc_ptr
infiniopTensorDescriptor_t, # out_desc
infiniopTensorDescriptor_t, # input_desc
infiniopTensorDescriptor_t, # t1_desc
infiniopTensorDescriptor_t, # t2_desc
c_float, # value (scalar coefficient)
]
lib.infiniopGetAddcmulWorkspaceSize.restype = c_int32
lib.infiniopGetAddcmulWorkspaceSize.argtypes = [
infiniopOperatorDescriptor_t, # descriptor
POINTER(c_size_t), # size_ptr
]
lib.infiniopAddcmul.restype = c_int32
lib.infiniopAddcmul.argtypes = [
infiniopOperatorDescriptor_t, # descriptor
c_void_p, # workspace
c_size_t, # workspace_size
c_void_p, # out_ptr
c_void_p, # input_ptr
c_void_p, # t1_ptr
c_void_p, # t2_ptr
c_void_p, # stream
]
lib.infiniopDestroyAddcmulDescriptor.restype = c_int32
lib.infiniopDestroyAddcmulDescriptor.argtypes = [
infiniopOperatorDescriptor_t, # descriptor
]
@OpRegister.operator
def cdist_(lib):
# 1. Descriptor-creation interface
# Typically takes the handle, the output desc, two input descs, and the norm order p
lib.infiniopCreateCdistDescriptor.restype = c_int32
lib.infiniopCreateCdistDescriptor.argtypes = [
infiniopHandle_t, # handle
POINTER(infiniopOperatorDescriptor_t), # desc_ptr
infiniopTensorDescriptor_t, # y_desc (output)
infiniopTensorDescriptor_t, # x1_desc
infiniopTensorDescriptor_t, # x2_desc
c_double, # p (norm order)
]
# 2. Workspace-size query interface
lib.infiniopGetCdistWorkspaceSize.restype = c_int32
lib.infiniopGetCdistWorkspaceSize.argtypes = [
infiniopOperatorDescriptor_t, # descriptor
POINTER(c_size_t), # size_ptr
]
# 3. Operator-execution interface
lib.infiniopCdist.restype = c_int32
lib.infiniopCdist.argtypes = [
infiniopOperatorDescriptor_t, # descriptor
c_void_p, # workspace
c_size_t, # workspace_size
c_void_p, # y_ptr
c_void_p, # x1_ptr
c_void_p, # x2_ptr
c_void_p, # stream
]
# 4. Descriptor-destruction interface
lib.infiniopDestroyCdistDescriptor.restype = c_int32
lib.infiniopDestroyCdistDescriptor.argtypes = [
infiniopOperatorDescriptor_t, # descriptor
]
@OpRegister.operator
def binary_cross_entropy_with_logits_(lib):
# 1. Descriptor creation
lib.infiniopCreateBCEWithLogitsDescriptor.restype = c_int32
lib.infiniopCreateBCEWithLogitsDescriptor.argtypes = [
infiniopHandle_t, # handle
POINTER(infiniopOperatorDescriptor_t), # desc_ptr
infiniopTensorDescriptor_t, # out_desc
infiniopTensorDescriptor_t, # input_desc (logits)
infiniopTensorDescriptor_t, # target_desc
infiniopTensorDescriptor_t, # weight_desc (optional; pass NULL when absent)
infiniopTensorDescriptor_t, # pos_weight_desc (optional; pass NULL when absent)
c_int32 # reduction (0:none, 1:mean, 2:sum)
]
# 2. Workspace size
lib.infiniopGetBCEWithLogitsWorkspaceSize.restype = c_int32
lib.infiniopGetBCEWithLogitsWorkspaceSize.argtypes = [
infiniopOperatorDescriptor_t, # descriptor
POINTER(c_size_t), # size_ptr
]
# 3. Execution
lib.infiniopBCEWithLogits.restype = c_int32
lib.infiniopBCEWithLogits.argtypes = [
infiniopOperatorDescriptor_t, # descriptor
c_void_p, # workspace
c_size_t, # workspace_size
c_void_p, # out_ptr
c_void_p, # input_ptr (logits)
c_void_p, # target_ptr
c_void_p, # weight_ptr (optional)
c_void_p, # pos_weight_ptr (optional)
c_void_p, # stream
]
# 4. Destruction
lib.infiniopDestroyBCEWithLogitsDescriptor.restype = c_int32
lib.infiniopDestroyBCEWithLogitsDescriptor.argtypes = [
infiniopOperatorDescriptor_t, # descriptor
]
@OpRegister.operator
def reciprocal_(lib):
lib.infiniopCreateReciprocalDescriptor.restype = c_int32
lib.infiniopCreateReciprocalDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopOperatorDescriptor_t),
infiniopTensorDescriptor_t, # Output descriptor
infiniopTensorDescriptor_t, # Input descriptor
]
# Workspace-size query interface
lib.infiniopGetReciprocalWorkspaceSize.restype = c_int32
lib.infiniopGetReciprocalWorkspaceSize.argtypes = [
infiniopOperatorDescriptor_t,
POINTER(c_size_t),
]
# The trailing c_void_p usually corresponds to a stream or other async handle; keep it consistent
lib.infiniopReciprocal.restype = c_int32
lib.infiniopReciprocal.argtypes = [
infiniopOperatorDescriptor_t,
c_void_p, # Workspace pointer
c_size_t, # Workspace size
c_void_p, # Output data pointer
c_void_p, # Input data pointer
c_void_p, # Stream pointer (optional)
]
# Descriptor-destruction interface
lib.infiniopDestroyReciprocalDescriptor.restype = c_int32
lib.infiniopDestroyReciprocalDescriptor.argtypes = [
infiniopOperatorDescriptor_t,
]
@OpRegister.operator
def equal_(lib):
# =========================================================
# 1. Register the Create function
......
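The registrations removed above all hang off the OpRegister decorator, whose body is truncated near the top of the diff. Below is a minimal sketch of that registry pattern, assuming only what the diff shows (the registry list, the operator decorator, and the op(lib) loop); the loader method's name here is hypothetical.

class OpRegister:
    registry = []

    @classmethod
    def operator(cls, fn):
        # Collect each binding function at import time; nothing touches the
        # shared library until it has actually been loaded.
        cls.registry.append(fn)
        return fn

    @classmethod
    def load_all(cls, lib):  # hypothetical name for the loop shown in the diff
        # Apply every registered binding (restype/argtypes setup) to lib.
        for op in cls.registry:
            op(lib)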
import torch
import ctypes
from ctypes import c_uint64
from libinfiniop import (
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
from enum import Enum, auto
# ==============================================================================
# Configuration
# ==============================================================================
_TEST_CASES_ = [
# shape, input_stride, output_stride
((13, 4), None, None),
((13, 4), (10, 1), (10, 1)),
((16, 5632), None, None),
((16, 5632), (13312, 1), (13312, 1)),
((13, 16, 2), (128, 4, 1), (64, 4, 1)),
((4, 4, 5632), None, None),
]
class Inplace(Enum):
OUT_OF_PLACE = auto()
INPLACE = auto()
_INPLACE = [
Inplace.OUT_OF_PLACE,
Inplace.INPLACE,
]
_TEST_CASES = [
test_case + (inplace_item,)
for test_case in _TEST_CASES_
for inplace_item in _INPLACE
]
# Reciprocal usually outputs floats; Integer types are often not supported or special-cased
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16]
_TOLERANCE_MAP = {
InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5},
InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2},
}
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
def reciprocal(y, x):
torch.reciprocal(x, out=y)
def test(
handle,
device,
shape,
in_stride=None,
out_stride=None,
inplace=Inplace.OUT_OF_PLACE,
dtype=InfiniDtype.F16,
sync=None,
):
# Initialize input 'x'
# Use 'random' mode but ensure values are not near zero to avoid infinity
x = TestTensor(shape, in_stride, dtype, device)
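# Keep inputs away from zero so 1/x stays finite (mirrors the clamp step in
# the atanh test above; assumes the same TestTensor torch_tensor()/actual_tensor() API).
with torch.no_grad():
    t = x.torch_tensor()
    t[t.abs() < 0.1] = 0.5
    x.actual_tensor().copy_(t)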
if inplace == Inplace.INPLACE:
if in_stride != out_stride:
return
y = x
else:
y = TestTensor(shape, out_stride, dtype, device)
if y.is_broadcast():
return
print(
f"Testing Reciprocal on {InfiniDeviceNames[device]} with shape:{shape} "
f"in_stride:{in_stride} out_stride:{out_stride} "
f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
)
# Calculate ground truth using PyTorch
reciprocal(y.torch_tensor(), x.torch_tensor())
if sync is not None:
sync()
# Create Descriptor
descriptor = infiniopOperatorDescriptor_t()
check_error(
LIBINFINIOP.infiniopCreateReciprocalDescriptor(
handle,
ctypes.byref(descriptor),
y.descriptor,
x.descriptor,
)
)
# Invalidate descriptors as per framework requirement
for tensor in [x, y]:
tensor.destroy_desc()
# Workspace allocation
workspace_size = c_uint64(0)
check_error(
LIBINFINIOP.infiniopGetReciprocalWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = TestWorkspace(workspace_size.value, y.device)
def lib_reciprocal():
check_error(
LIBINFINIOP.infiniopReciprocal(
descriptor,
workspace.data(),
workspace.size(),
y.data(),
x.data(),
None,
)
)
lib_reciprocal()
# Verification
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
assert torch.allclose(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
# Profiling
if PROFILE:
profile_operation("PyTorch", lambda: reciprocal(y.torch_tensor(), x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_reciprocal(), device, NUM_PRERUN, NUM_ITERATIONS)
check_error(LIBINFINIOP.infiniopDestroyReciprocalDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
DEBUG = args.debug
PROFILE = args.profile
NUM_PRERUN = args.num_prerun
NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args):
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")