Unverified commit 85bc98ac, authored by qinyiqun, committed by GitHub
Browse files

ISSUE/628 适配QY C610 GPU,增加编译选项,适配已有算子。添加bge类模型所需的算子, (#629)



* ISSUE/628 适配QY C610 GPU,增加编译选项,适配已有算子。添加bge类模型所需的算子,包括gelu,layer_norm,lp_norm(支持l1,l2 norm),relu,softmax,tanh。

---------
Co-authored-by: xgqdut2016 <kenan_gewei@163.com>
Co-authored-by: xgqdut2016 <140036308+xgqdut2016@users.noreply.github.com>
parent 7c397dd2
import torch
import ctypes
from ctypes import c_uint64
from libinfiniop import (
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
from enum import Enum, auto
_TEST_CASES_ = [
    # shape, bias_exist, eps, input_strides, output_strides, weight_strides
    ((5, 4), True, 1e-5, None, None, None),
    ((5, 4, 32, 2048), True, 1e-5, None, None, None),
    ((13, 4, 4), True, 1e-5, [30, 4, 1], [50, 4, 1], [2]),
    ((16, 5, 563), True, 1e-4, None, None, None),
    ((5, 16, 563), False, 1e-5, None, None, [10]),
    ((4, 4, 563), True, 1e-5, None, None, None),
    ((40, 40, 56), True, 1e-5, [3600, 56, 1], None, None),
    ((40, 40, 56), False, 1e-5, [3600, 56, 1], None, None),
]


class Inplace(Enum):
    """Whether the kernel writes its result back into the input tensor."""

    OUT_OF_PLACE = auto()
    INPLACE = auto()


# Inplace options applied for each test case in _TEST_CASES_
_INPLACE = [
    Inplace.OUT_OF_PLACE,
    Inplace.INPLACE,
]

# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_
_TEST_CASES = [
    test_case + (inplace_item,)
    for test_case in _TEST_CASES_
    for inplace_item in _INPLACE
]

# Data types used for testing
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16]

# Tolerance map for different data types
_TOLERANCE_MAP = {
    InfiniDtype.F16: {"atol": 5e-2, "rtol": 5e-2},
    InfiniDtype.F32: {"atol": 1e-3, "rtol": 1e-3},
    InfiniDtype.BF16: {"atol": 5e-2, "rtol": 5e-2},
}

# Runtime options; overridden from CLI arguments in __main__.
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
def torch_layer_norm(
output: torch.Tensor,
input_standardization: torch.Tensor,
input_std_deviation: torch.Tensor,
input: torch.Tensor,
weight,
bias,
eps,
bias_exist: bool,
):
normalized_shape = input.shape[-1:]
ln = torch.nn.LayerNorm(
normalized_shape=normalized_shape,
eps=eps,
dtype=torch.float,
bias=bias_exist,
device=input.device,
)
ln.weight.data = weight.type(torch.float)
if bias_exist:
ln.bias.data = bias.type(torch.float)
input = input.type(torch.float)
mean = input.mean(dim=-1, keepdim=True)
var = input.var(dim=-1, correction=0)
std = torch.sqrt(var + eps)
input_standardization.copy_(
((input - mean) / std.unsqueeze(2)).type(input_standardization.dtype)
)
input_std_deviation.copy_(std.type(input_standardization.dtype))
output.copy_(ln(input).detach().type(output.dtype))
def layer_norm(
    output: torch.Tensor, input: torch.Tensor, weight, bias, eps, bias_exist: bool
):
    """Reference LayerNorm over the last dimension, written into ``output``.

    Equivalent to running ``torch.nn.LayerNorm`` with the supplied parameters;
    calls the functional form directly instead of constructing a module.
    """
    normalized_shape = input.shape[-1:]
    result = torch.nn.functional.layer_norm(
        input,
        normalized_shape,
        weight=weight,
        bias=bias if bias_exist else None,
        eps=eps,
    )
    output.copy_(result.detach().type(output.dtype))
def test(
    handle,
    device,
    input_shape,
    bias_exist,
    eps,
    input_strides,
    output_strides,
    weight_strides,
    inplace,
    dtype,
    sync=None,
):
    """Run one LayerNorm case against the infiniop kernel.

    Builds input/weight/bias tensors, computes the PyTorch reference via
    ``layer_norm``, runs the library kernel, and compares the output, the
    standardized input, and the per-row std deviation within tolerance.
    """
    print(
        f"Testing layer_norm on {InfiniDeviceNames[device]} with input_shape:{input_shape},"
        f"bias:{bias_exist},eps:{eps},"
        f"dtype:{InfiniDtypeNames[dtype]}"
    )
    # Extra kernel outputs: (x - mean) / std, and std over the last dim.
    input_standardization = TestTensor(
        input_shape,
        None,
        dtype,
        device,
    )
    input_std_deviation = TestTensor(
        input_shape[:-1],
        None,
        dtype,
        device,
    )
    input = TestTensor(input_shape, input_strides, dtype, device, mode="zeros")
    if inplace == Inplace.INPLACE:
        # In-place only makes sense when output layout matches input layout.
        if output_strides != input_strides:
            return
        output = input
    else:
        output = TestTensor(
            input_shape,
            output_strides,
            dtype,
            device,
        )
    weight = TestTensor(
        input_shape[-1:],
        weight_strides,
        dtype,
        device,
    )
    bias = (
        TestTensor(
            input_shape[-1:],
            None,
            dtype,
            device,
        )
        if bias_exist
        else None
    )
    layer_norm(
        output.torch_tensor(),
        input.torch_tensor(),
        weight.torch_tensor(),
        bias.torch_tensor() if bias_exist else None,
        eps,
        bias_exist,
    )
    if sync is not None:
        sync()

    descriptor = infiniopOperatorDescriptor_t()
    check_error(
        LIBINFINIOP.infiniopCreateLayerNormDescriptor(
            handle,
            ctypes.byref(descriptor),
            output.descriptor,
            input_standardization.descriptor,
            input_std_deviation.descriptor,
            input.descriptor,
            weight.descriptor,
            bias.descriptor if bias_exist else None,
            eps,
        )
    )

    # Invalidate the shape and strides in the descriptor to prevent them from
    # being directly used by the kernel.
    # Fix: the original expression parsed as `(base + [bias]) if bias_exist
    # else []`, so NO descriptor was destroyed when bias_exist was False;
    # only the optional bias should be conditional.
    for tensor in [output, input_standardization, input_std_deviation, input, weight] + (
        [bias] if bias_exist else []
    ):
        tensor.destroy_desc()

    workspace_size = c_uint64(0)
    check_error(
        LIBINFINIOP.infiniopGetLayerNormWorkspaceSize(
            descriptor, ctypes.byref(workspace_size)
        )
    )
    workspace = TestWorkspace(workspace_size.value, output.device)

    def lib_layer_norm():
        check_error(
            LIBINFINIOP.infiniopLayerNorm(
                descriptor,
                workspace.data(),
                workspace.size(),
                output.data(),
                input_standardization.data(),
                input_std_deviation.data(),
                input.data(),
                weight.data(),
                bias.data() if bias_exist else None,
                None,  # stream
            )
        )

    lib_layer_norm()

    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
    if DEBUG:
        debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol)
        debug(
            input_standardization.actual_tensor(),
            input_standardization.torch_tensor(),
            atol=atol,
            rtol=rtol,
        )
        debug(
            input_std_deviation.actual_tensor(),
            input_std_deviation.torch_tensor(),
            atol=atol,
            rtol=rtol,
        )
    assert torch.allclose(
        output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol
    )
    assert torch.allclose(
        input_standardization.actual_tensor(),
        input_standardization.torch_tensor(),
        atol=atol,
        rtol=rtol,
    )
    assert torch.allclose(
        input_std_deviation.actual_tensor(),
        input_std_deviation.torch_tensor(),
        atol=atol,
        rtol=rtol,
    )

    # Profiling workflow
    if PROFILE:
        # Fix: torch_layer_norm expects torch tensors, but the TestTensor
        # wrappers were being passed directly; unwrap them here.
        # fmt: off
        profile_operation("PyTorch", lambda: torch_layer_norm(
            output.torch_tensor(), input_standardization.torch_tensor(),
            input_std_deviation.torch_tensor(), input.torch_tensor(),
            weight.torch_tensor(), bias.torch_tensor() if bias_exist else None,
            eps, bias_exist
        ), device, NUM_PRERUN, NUM_ITERATIONS)
        profile_operation(" lib", lambda: lib_layer_norm(), device, NUM_PRERUN, NUM_ITERATIONS)
        # fmt: on
    check_error(LIBINFINIOP.infiniopDestroyLayerNormDescriptor(descriptor))
if __name__ == "__main__":
    args = get_args()

    # Configure testing options from the command line.
    DEBUG = args.debug
    PROFILE = args.profile
    NUM_PRERUN = args.num_prerun
    NUM_ITERATIONS = args.num_iterations

    # Run every (case, dtype) combination on every requested device.
    for device in get_test_devices(args):
        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)

    print("\033[92mTest my layer_norm passed!\033[0m")
......@@ -8,6 +8,7 @@ class InfiniDeviceEnum:
ILUVATAR = 6
KUNLUN = 7
HYGON = 8
QY = 9
InfiniDeviceNames = {
......@@ -20,6 +21,7 @@ InfiniDeviceNames = {
InfiniDeviceEnum.ILUVATAR: "Iluvatar",
InfiniDeviceEnum.KUNLUN: "Kunlun",
InfiniDeviceEnum.HYGON: "Hygon",
InfiniDeviceEnum.QY: "QY",
}
# Mapping that maps InfiniDeviceEnum to torch device string
......@@ -33,4 +35,5 @@ torch_device_map = {
InfiniDeviceEnum.ILUVATAR: "cuda",
InfiniDeviceEnum.KUNLUN: "cuda",
InfiniDeviceEnum.HYGON: "cuda",
InfiniDeviceEnum.QY: "cuda",
}
......@@ -456,6 +456,39 @@ def sub_(lib):
]
@OpRegister.operator
def softmax_(lib):
    """Register ctypes signatures for the Softmax operator entry points."""
    lib.infiniopCreateSoftmaxDescriptor.restype = c_int32
    lib.infiniopCreateSoftmaxDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopOperatorDescriptor_t),
        infiniopTensorDescriptor_t,  # y (output)
        infiniopTensorDescriptor_t,  # x (input)
        c_int32,  # axis to reduce over
    ]
    lib.infiniopGetSoftmaxWorkspaceSize.restype = c_int32
    lib.infiniopGetSoftmaxWorkspaceSize.argtypes = [
        infiniopOperatorDescriptor_t,
        POINTER(c_size_t),
    ]
    lib.infiniopSoftmax.restype = c_int32
    lib.infiniopSoftmax.argtypes = [
        infiniopOperatorDescriptor_t,
        c_void_p,  # workspace
        c_size_t,  # workspace size
        c_void_p,  # y
        c_void_p,  # x
        c_void_p,  # stream
    ]
    lib.infiniopDestroySoftmaxDescriptor.restype = c_int32
    lib.infiniopDestroySoftmaxDescriptor.argtypes = [
        infiniopOperatorDescriptor_t,
    ]
@OpRegister.operator
def swiglu_(lib):
lib.infiniopCreateSwiGLUDescriptor.restype = c_int32
......@@ -578,7 +611,7 @@ def topksoftmax_(lib):
c_void_p,
c_void_p,
c_size_t,
c_int32,
c_int32,
c_void_p,
]
lib.infiniopDestroyTopksoftmaxDescriptor.restype = c_int32
......@@ -594,7 +627,7 @@ def topkrouter_(lib):
infiniopHandle_t,
POINTER(infiniopOperatorDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t
infiniopTensorDescriptor_t,
]
lib.infiniopGetTopkrouterWorkspaceSize.restype = c_int32
......@@ -706,6 +739,7 @@ def zeros_(lib):
infiniopOperatorDescriptor_t,
]
@OpRegister.operator
def ones_(lib):
lib.infiniopCreateOnesDescriptor.restype = c_int32
......@@ -738,6 +772,38 @@ def ones_(lib):
]
@OpRegister.operator
def gelu_(lib):
    """Register ctypes signatures for the Gelu operator entry points."""
    lib.infiniopCreateGeluDescriptor.restype = c_int32
    lib.infiniopCreateGeluDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopOperatorDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
    ]
    lib.infiniopGetGeluWorkspaceSize.restype = c_int32
    lib.infiniopGetGeluWorkspaceSize.argtypes = [
        infiniopOperatorDescriptor_t,
        POINTER(c_size_t),
    ]
    lib.infiniopGelu.restype = c_int32
    # Presumably (workspace, workspace_size, output, input, stream), mirroring
    # the other unary ops (tanh/softmax) — confirm against the C header.
    lib.infiniopGelu.argtypes = [
        infiniopOperatorDescriptor_t,
        c_void_p,
        c_size_t,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyGeluDescriptor.restype = c_int32
    lib.infiniopDestroyGeluDescriptor.argtypes = [
        infiniopOperatorDescriptor_t,
    ]
@OpRegister.operator
def silu_(lib):
lib.infiniopCreateSiluDescriptor.restype = c_int32
......@@ -768,3 +834,107 @@ def silu_(lib):
lib.infiniopDestroySiluDescriptor.argtypes = [
infiniopOperatorDescriptor_t,
]
@OpRegister.operator
def layer_norm_(lib):
    """Register ctypes signatures for the LayerNorm operator entry points."""
    lib.infiniopCreateLayerNormDescriptor.restype = c_int32
    lib.infiniopCreateLayerNormDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopOperatorDescriptor_t),
        infiniopTensorDescriptor_t,  # output
        infiniopTensorDescriptor_t,  # input_standardization
        infiniopTensorDescriptor_t,  # input_std_deviation
        infiniopTensorDescriptor_t,  # input
        infiniopTensorDescriptor_t,  # weight
        infiniopTensorDescriptor_t,  # bias (callers pass None when absent)
        c_float,  # eps
    ]
    lib.infiniopGetLayerNormWorkspaceSize.restype = c_int32
    lib.infiniopGetLayerNormWorkspaceSize.argtypes = [
        infiniopOperatorDescriptor_t,
        POINTER(c_size_t),
    ]
    lib.infiniopLayerNorm.restype = c_int32
    lib.infiniopLayerNorm.argtypes = [
        infiniopOperatorDescriptor_t,
        c_void_p,  # workspace
        c_size_t,  # workspace size
        c_void_p,  # output
        c_void_p,  # input_standardization
        c_void_p,  # input_std_deviation
        c_void_p,  # input
        c_void_p,  # weight
        c_void_p,  # bias (or None)
        c_void_p,  # stream
    ]
    lib.infiniopDestroyLayerNormDescriptor.restype = c_int32
    lib.infiniopDestroyLayerNormDescriptor.argtypes = [infiniopOperatorDescriptor_t]
@OpRegister.operator
def lp_norm_(lib):
    """Register ctypes signatures for the LPNorm operator entry points."""
    lib.infiniopCreateLPNormDescriptor.restype = c_int32
    lib.infiniopCreateLPNormDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopOperatorDescriptor_t),
        infiniopTensorDescriptor_t,  # y (output)
        infiniopTensorDescriptor_t,  # x (input)
        c_int32,  # axis
        c_int32,  # p (norm order; tests use 1 and 2)
        c_float,  # eps
    ]
    lib.infiniopGetLPNormWorkspaceSize.restype = c_int32
    lib.infiniopGetLPNormWorkspaceSize.argtypes = [
        infiniopOperatorDescriptor_t,
        POINTER(c_size_t),
    ]
    lib.infiniopLPNorm.restype = c_int32
    lib.infiniopLPNorm.argtypes = [
        infiniopOperatorDescriptor_t,
        c_void_p,  # workspace
        c_size_t,  # workspace size
        c_void_p,  # y
        c_void_p,  # x
        c_void_p,  # stream
    ]
    lib.infiniopDestroyLPNormDescriptor.restype = c_int32
    lib.infiniopDestroyLPNormDescriptor.argtypes = [
        infiniopOperatorDescriptor_t,
    ]
@OpRegister.operator
def tanh_(lib):
    """Register ctypes signatures for the Tanh operator entry points."""
    lib.infiniopCreateTanhDescriptor.restype = c_int32
    lib.infiniopCreateTanhDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopOperatorDescriptor_t),
        infiniopTensorDescriptor_t,  # output
        infiniopTensorDescriptor_t,  # input
    ]
    lib.infiniopGetTanhWorkspaceSize.restype = c_int32
    lib.infiniopGetTanhWorkspaceSize.argtypes = [
        infiniopOperatorDescriptor_t,
        POINTER(c_size_t),
    ]
    lib.infiniopTanh.restype = c_int32
    lib.infiniopTanh.argtypes = [
        infiniopOperatorDescriptor_t,
        c_void_p,  # workspace
        c_size_t,  # workspace size
        c_void_p,  # output
        c_void_p,  # input
        c_void_p,  # stream
    ]
    lib.infiniopDestroyTanhDescriptor.restype = c_int32
    lib.infiniopDestroyTanhDescriptor.argtypes = [
        infiniopOperatorDescriptor_t,
    ]
......@@ -94,9 +94,17 @@ class TestTensor(CTensor):
elif mode == "randint":
randint_low = -2000000000 if randint_low is None else randint_low
randint_high = 2000000000 if randint_high is None else randint_high
self._torch_tensor = torch.randint(randint_low,randint_high, torch_shape,dtype=to_torch_dtype(dt), device=torch_device_map[device])
self._torch_tensor = torch.randint(
randint_low,
randint_high,
torch_shape,
dtype=to_torch_dtype(dt),
device=torch_device_map[device],
)
elif mode == "float8_e4m3fn":
self._torch_tensor = torch.rand(shape, dtype=torch.float32, device=torch_device_map[device]).to(dtype=torch.float8_e4m3fn)
self._torch_tensor = torch.rand(
shape, dtype=torch.float32, device=torch_device_map[device]
).to(dtype=torch.float8_e4m3fn)
elif mode == "manual":
assert set_tensor is not None
assert torch_shape == list(set_tensor.shape)
......@@ -136,14 +144,19 @@ class TestTensor(CTensor):
def is_broadcast(self):
return self.strides is not None and 0 in self.strides
@staticmethod
def from_binary(binary_file, shape, strides, dt: InfiniDtype, device: InfiniDeviceEnum):
def from_binary(
binary_file, shape, strides, dt: InfiniDtype, device: InfiniDeviceEnum
):
data = np.fromfile(binary_file, dtype=to_numpy_dtype(dt))
base = torch.from_numpy(data)
torch_tensor = torch.as_strided(base, size=shape, stride=strides).to(torch_device_map[device])
torch_tensor = torch.as_strided(base, size=shape, stride=strides).to(
torch_device_map[device]
)
return TestTensor(
shape, strides, dt, device, mode="binary", set_tensor=torch_tensor)
shape, strides, dt, device, mode="binary", set_tensor=torch_tensor
)
@staticmethod
def from_torch(torch_tensor, dt: InfiniDtype, device: InfiniDeviceEnum):
......@@ -156,6 +169,9 @@ class TestTensor(CTensor):
def update_torch_tensor(self, new_tensor: torch.Tensor):
    """Replace the torch tensor backing this TestTensor.

    Fix: this method was defined twice verbatim (the diff added a second
    identical copy immediately after the first); keep a single definition.
    """
    self._torch_tensor = new_tensor
def to_torch_dtype(dt: InfiniDtype, compatability_mode=False):
if dt == InfiniDtype.BOOL:
......@@ -225,7 +241,6 @@ def to_numpy_dtype(dt: InfiniDtype, compatability_mode=False):
raise ValueError("Unsupported data type")
class TestWorkspace:
def __init__(self, size, device):
if size != 0:
......@@ -294,7 +309,18 @@ def rearrange_tensor(tensor, new_strides):
new_positions += offset
# Copy the original data to the new tensor
if tensor.dtype in [torch.bool, torch.uint8, torch.int8, torch.int16, torch.int32,torch.int64, torch.float16,torch.bfloat16,torch.float32,torch.float64]:
if tensor.dtype in [
torch.bool,
torch.uint8,
torch.int8,
torch.int16,
torch.int32,
torch.int64,
torch.float16,
torch.bfloat16,
torch.float32,
torch.float64,
]:
new_tensor.view(-1).index_add_(0, new_positions, tensor.view(-1))
elif tensor.dtype in [torch.uint16, torch.uint32, torch.uint64]:
new_tensor_int64 = new_tensor.to(dtype=torch.int64)
......@@ -303,12 +329,14 @@ def rearrange_tensor(tensor, new_strides):
new_tensor = new_tensor_int64.to(dtype=tensor.dtype)
elif tensor.dtype in [torch.float8_e4m3fn]:
new_tensor_float64 = new_tensor.to(dtype=torch.float64)
tensor_float64 = tensor.to(dtype=torch.float64)
new_tensor_float64.view(-1).index_add_(0, new_positions, tensor_float64.view(-1))
tensor_float64 = tensor.to(dtype=torch.float64)
new_tensor_float64.view(-1).index_add_(
0, new_positions, tensor_float64.view(-1)
)
new_tensor = new_tensor_float64.to(dtype=tensor.dtype)
else:
raise ValueError("Unsupported data type")
new_tensor.set_(new_tensor.untyped_storage(), offset, shape, tuple(new_strides))
return new_tensor
......@@ -355,6 +383,11 @@ def get_args():
action="store_true",
help="Run Iluvatar GPU test",
)
parser.add_argument(
"--qy",
action="store_true",
help="Run Qy GPU test",
)
parser.add_argument(
"--cambricon",
action="store_true",
......@@ -515,7 +548,7 @@ def print_discrepancy(
actual = actual.to("cpu")
expected = expected.to("cpu")
actual_isnan = torch.isnan(actual)
expected_isnan = torch.isnan(expected)
......@@ -525,7 +558,8 @@ def print_discrepancy(
)
diff_mask = nan_mismatch | (
torch.abs(actual.to(dtype=torch.float64) - expected.to(dtype=torch.float64)) > (atol + rtol * torch.abs(expected.to(dtype=torch.float64)))
torch.abs(actual.to(dtype=torch.float64) - expected.to(dtype=torch.float64))
> (atol + rtol * torch.abs(expected.to(dtype=torch.float64)))
)
diff_indices = torch.nonzero(diff_mask, as_tuple=False)
delta = actual.to(dtype=torch.float64) - expected.to(dtype=torch.float64)
......@@ -670,6 +704,8 @@ def get_test_devices(args):
devices_to_test.append(InfiniDeviceEnum.NVIDIA)
if args.iluvatar:
devices_to_test.append(InfiniDeviceEnum.ILUVATAR)
if args.qy:
devices_to_test.append(InfiniDeviceEnum.QY)
if args.cambricon:
import torch_mlu
......
import torch
import ctypes
from ctypes import c_uint64
from libinfiniop import (
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
from enum import Enum, auto
# ==============================================================================
#  Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
_TEST_CASES_ = [
    # shape, x_strides, y_strides, axis, p, eps
    ((2, 1, 512), [17408, 1024, 1], [17408, 1024, 1], -1, 2, 1e-12),
    ((2, 1, 1024), [17408, 1024, 1], [17408, 1024, 1], -1, 2, 1e-12),
    ((2, 1, 2048), [17408, 1024, 1], [17408, 1024, 1], -1, 2, 1e-12),
    ((2048, 2050), None, None, 0, 1, 1e-12),
    ((2048, 2050), None, None, 1, 1, 1e-12),
    ((12, 16, 512, 512), None, None, 0, 2, 1e-12),
    ((12, 16, 512, 512), None, None, 1, 2, 1e-12),
    ((12, 16, 512, 512), None, None, 2, 1, 1e-12),
    ((12, 16, 512, 512), None, None, 3, 2, 1e-12),
    ((1, 16, 512, 512), None, None, 0, 2, 1e-12),
    ((1, 16, 512, 512), None, None, 1, 1, 1e-12),
    ((1, 16, 512, 512), None, None, 2, 2, 1e-12),
    ((1, 16, 512, 512), None, None, 3, 2, 1e-12),
]

# Data types used for testing
_TENSOR_DTYPES = [InfiniDtype.BF16, InfiniDtype.F16, InfiniDtype.F32]

# Tolerance map for different data types
_TOLERANCE_MAP = {
    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-2},
    InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-2},
    InfiniDtype.F32: {"atol": 3e-5, "rtol": 1e-5},
}


class Inplace(Enum):
    """Whether y aliases x (in-place) or is a separate output tensor."""

    OUT_OF_PLACE = auto()
    INPLACE_X = auto()


# Inplace options applied for each test case in _TEST_CASES_
_INPLACE = [
    Inplace.INPLACE_X,
    Inplace.OUT_OF_PLACE,
]

# Cross product of base cases and inplace options.
_TEST_CASES = [
    test_case + (inplace_item,)
    for test_case in _TEST_CASES_
    for inplace_item in _INPLACE
]

# Runtime options; overridden from CLI arguments in __main__.
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
def lp_norm(x, axis, p, eps):
    """Reference Lp normalization along ``axis``.

    Matches torch.nn.functional.normalize: x / max(||x||_p, eps), computed in
    float32 and cast back to the input dtype.
    """
    x32 = x.to(torch.float32)
    denom = x32.norm(p, dim=axis, keepdim=True).clamp_min(eps)
    return (x32 / denom).to(x.dtype)
def test(
    handle,
    device,
    shape,
    x_strides,
    y_strides,
    axis,
    p,
    eps,
    inplace=Inplace.OUT_OF_PLACE,
    dtype=InfiniDtype.F16,
    sync=None,
):
    """Run one LPNorm case: compute the PyTorch reference, run the infiniop
    kernel, and compare within the dtype's tolerance.
    """
    print(
        f"Testing LPNorm on {InfiniDeviceNames[device]} with shape:{shape}, y_strides:{y_strides}, x_strides:{x_strides}, axis:{axis}, p:{p}, eps:{eps} dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
    )
    x = TestTensor(shape, x_strides, dtype, device)
    # Reference answer computed before the kernel runs (x may be overwritten
    # in the INPLACE_X case).
    ans = lp_norm(x.torch_tensor(), axis, p, eps)
    if inplace == Inplace.INPLACE_X:
        y = x
    else:
        y = TestTensor(shape, y_strides, dtype, device)
    if sync is not None:
        sync()

    descriptor = infiniopOperatorDescriptor_t()
    check_error(
        LIBINFINIOP.infiniopCreateLPNormDescriptor(
            handle, ctypes.byref(descriptor), y.descriptor, x.descriptor, axis, p, eps
        )
    )

    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    x.destroy_desc()
    y.destroy_desc()

    workspace_size = c_uint64(0)
    check_error(
        LIBINFINIOP.infiniopGetLPNormWorkspaceSize(
            descriptor, ctypes.byref(workspace_size)
        )
    )
    workspace = TestWorkspace(workspace_size.value, x.device)

    def lib_lp_norm():
        check_error(
            LIBINFINIOP.infiniopLPNorm(
                descriptor,
                workspace.data(),
                workspace_size.value,
                y.data(),
                x.data(),
                None,  # stream
            )
        )

    lib_lp_norm()
    if sync is not None:
        sync()

    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
    if DEBUG:
        debug(y.actual_tensor(), ans, atol=atol, rtol=rtol)
    assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol)

    # Profiling workflow
    if PROFILE:
        # fmt: off
        profile_operation("PyTorch", lambda: lp_norm(x.torch_tensor(), axis, p, eps), device, NUM_PRERUN, NUM_ITERATIONS)
        profile_operation(" lib", lambda: lib_lp_norm(), device, NUM_PRERUN, NUM_ITERATIONS)
        # fmt: on
    check_error(LIBINFINIOP.infiniopDestroyLPNormDescriptor(descriptor))
if __name__ == "__main__":
    args = get_args()

    # Configure testing options from the command line.
    DEBUG = args.debug
    PROFILE = args.profile
    NUM_PRERUN = args.num_prerun
    NUM_ITERATIONS = args.num_iterations

    # Run every (case, dtype) combination on every requested device.
    for device in get_test_devices(args):
        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)

    print("\033[92mTest passed!\033[0m")
......@@ -43,6 +43,7 @@ class Inplace(Enum):
OUT_OF_PLACE = auto()
INPLACE_X = auto()
# Inplace options applied for each test case in _TEST_CASES_
_INPLACE = [
Inplace.OUT_OF_PLACE,
......@@ -71,9 +72,11 @@ PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
def torch_sigmoid(y, x):
    """Reference implementation: write elementwise sigmoid(x) into y."""
    torch.special.expit(x, out=y)
def test(
handle,
device,
......@@ -169,4 +172,3 @@ if __name__ == "__main__":
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92m Test passed! \033[0m")
import torch
import ctypes
from ctypes import c_uint64
from libinfiniop import (
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
from enum import Enum, auto
# ==============================================================================
#  Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
_TEST_CASES_ = [
    # shape, axis
    ((4, 4), 0),
    ((12, 16, 512, 512), 0),
    ((12, 16, 512, 512), 1),
    ((12, 16, 512, 512), 2),
    ((12, 16, 512, 512), 3),
    ((1, 16, 512, 512), 0),
    ((1, 16, 512, 512), 1),
    ((1, 16, 512, 512), 2),
    ((1, 16, 512, 512), 3),
]

# Data types used for testing
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32]

# Tolerance map for different data types
_TOLERANCE_MAP = {
    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-2},
    InfiniDtype.F32: {"atol": 3e-5, "rtol": 1e-5},
}


class Inplace(Enum):
    """Whether y aliases x (in-place) or is a separate output tensor."""

    OUT_OF_PLACE = auto()
    INPLACE_X = auto()


# Inplace options applied for each test case in _TEST_CASES_
_INPLACE = [
    Inplace.INPLACE_X,
    Inplace.OUT_OF_PLACE,
]

# Cross product of base cases and inplace options.
_TEST_CASES = [
    test_case + (inplace_item,)
    for test_case in _TEST_CASES_
    for inplace_item in _INPLACE
]

# Runtime options; overridden from CLI arguments in __main__.
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
def softmax(x, axis):
    """Reference softmax along the given axis."""
    return torch.nn.functional.softmax(x, dim=axis)
def test(
    handle,
    device,
    shape,
    axis,
    inplace=Inplace.OUT_OF_PLACE,
    dtype=InfiniDtype.F16,
    sync=None,
):
    """Run one Softmax case: compute the PyTorch reference, run the infiniop
    kernel, and compare within the dtype's tolerance.
    """
    print(
        f"Testing Softmax on {InfiniDeviceNames[device]} with shape:{shape}, axis:{axis} dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
    )
    x = TestTensor(shape, None, dtype, device)
    # Reference answer computed before the kernel runs (x may be overwritten
    # in the INPLACE_X case).
    ans = softmax(x.torch_tensor(), axis)
    if inplace == Inplace.INPLACE_X:
        y = x
    else:
        y = TestTensor(shape, None, dtype, device)
    if sync is not None:
        sync()

    descriptor = infiniopOperatorDescriptor_t()
    check_error(
        LIBINFINIOP.infiniopCreateSoftmaxDescriptor(
            handle, ctypes.byref(descriptor), y.descriptor, x.descriptor, axis
        )
    )

    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    x.destroy_desc()
    y.destroy_desc()

    workspace_size = c_uint64(0)
    check_error(
        LIBINFINIOP.infiniopGetSoftmaxWorkspaceSize(
            descriptor, ctypes.byref(workspace_size)
        )
    )
    workspace = TestWorkspace(workspace_size.value, x.device)

    def lib_softmax():
        check_error(
            LIBINFINIOP.infiniopSoftmax(
                descriptor,
                workspace.data(),
                workspace_size.value,
                y.data(),
                x.data(),
                None,  # stream
            )
        )

    lib_softmax()
    if sync is not None:
        sync()

    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
    if DEBUG:
        debug(y.actual_tensor(), ans, atol=atol, rtol=rtol)
    assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol)

    # Profiling workflow
    if PROFILE:
        # fmt: off
        profile_operation("PyTorch", lambda: softmax(x.torch_tensor(), axis), device, NUM_PRERUN, NUM_ITERATIONS)
        profile_operation(" lib", lambda: lib_softmax(), device, NUM_PRERUN, NUM_ITERATIONS)
        # fmt: on
    check_error(LIBINFINIOP.infiniopDestroySoftmaxDescriptor(descriptor))
if __name__ == "__main__":
    args = get_args()

    # Configure testing options from the command line.
    DEBUG = args.debug
    PROFILE = args.profile
    NUM_PRERUN = args.num_prerun
    NUM_ITERATIONS = args.num_iterations

    # Run every (case, dtype) combination on every requested device.
    for device in get_test_devices(args):
        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)

    print("\033[92mTest passed!\033[0m")
import torch
import ctypes
from ctypes import c_uint64
from libinfiniop import (
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
test_operator,
get_args,
debug,
get_tolerance,
get_sync_func,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
from enum import Enum, auto
# ========================================================================
#  Configuration (Internal Use Only)
# ========================================================================
_TEST_CASES_ = [
    # shape, input_stride, output_stride
    ((13, 4), None, None),
    ((13, 4), (10, 1), (10, 1)),
    ((13, 4), (0, 1), None),
    ((13, 4, 4), None, None),
    ((13, 4, 4), (20, 4, 1), (20, 4, 1)),
    ((13, 4, 4), (4, 0, 1), None),
    ((16, 5632), None, None),
    ((16, 5632), (10240, 1), (10240, 1)),
    ((4, 4, 5632), None, None),
    ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)),
]


class Inplace(Enum):
    """Whether output aliases input (in-place) or is a separate tensor."""

    OUT_OF_PLACE = auto()
    INPLACE_INPUT = auto()


# Inplace options applied for each test case in _TEST_CASES_
_INPLACE = [
    Inplace.OUT_OF_PLACE,
    Inplace.INPLACE_INPUT,
]

# Cross product of base cases and inplace options.
_TEST_CASES = [
    test_case + (inplace,) for test_case in _TEST_CASES_ for inplace in _INPLACE
]

# Data types used for testing
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16]

# Tolerance map for different data types
_TOLERANCE_MAP = {
    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
    InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6},
    InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2},
}

# Runtime options; overridden from CLI arguments in __main__.
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
def tanh(output, input):
    """Reference implementation: write elementwise tanh(input) into output."""
    output.copy_(input.tanh())
def test(
    handle,
    device,
    shape,
    input_stride=None,
    output_stride=None,
    inplace=Inplace.OUT_OF_PLACE,
    dtype=InfiniDtype.F16,
    sync=None,
):
    """Run one Tanh case: compute the PyTorch reference, run the infiniop
    kernel, and compare within the dtype's tolerance.

    Fix: the default for ``dtype`` was ``torch.float16``, which is not an
    InfiniDtype key and would break ``InfiniDtypeNames[dtype]`` if the default
    were ever used; the sibling operator tests default to ``InfiniDtype.F16``.
    (``test_operator`` always passes ``dtype`` explicitly, so this is
    backward-compatible.)
    """
    input = TestTensor(shape, input_stride, dtype, device)
    if inplace == Inplace.INPLACE_INPUT:
        # In-place requires identical input/output layouts.
        if input_stride != output_stride:
            return
        output = input
    else:
        output = TestTensor(shape, output_stride, dtype, device, mode="ones")

    # Broadcast outputs (a zero stride) cannot be written to; skip.
    if output.is_broadcast():
        return

    print(
        f"Testing Tanh on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} "
        f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
    )
    tanh(output.torch_tensor(), input.torch_tensor())
    if sync is not None:
        sync()

    descriptor = infiniopOperatorDescriptor_t()
    check_error(
        LIBINFINIOP.infiniopCreateTanhDescriptor(
            handle,
            ctypes.byref(descriptor),
            output.descriptor,
            input.descriptor,
        )
    )

    # Invalidate the shape and strides in the descriptor to prevent them from
    # being directly used by the kernel.
    for tensor in [input, output]:
        tensor.destroy_desc()

    workspace_size = c_uint64(0)
    check_error(
        LIBINFINIOP.infiniopGetTanhWorkspaceSize(
            descriptor, ctypes.byref(workspace_size)
        )
    )
    workspace = TestWorkspace(workspace_size.value, output.device)

    def lib_tanh():
        check_error(
            LIBINFINIOP.infiniopTanh(
                descriptor,
                workspace.data(),
                workspace_size.value,
                output.data(),
                input.data(),
                None,  # stream
            )
        )

    lib_tanh()

    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
    if DEBUG:
        debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol)
    assert torch.allclose(
        output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol
    )

    # Profiling workflow
    if PROFILE:
        # fmt: off
        profile_operation("PyTorch", lambda: tanh(output.torch_tensor(), input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
        profile_operation(" lib", lambda: lib_tanh(), device, NUM_PRERUN, NUM_ITERATIONS)
        # fmt: on
    check_error(LIBINFINIOP.infiniopDestroyTanhDescriptor(descriptor))
if __name__ == "__main__":
    args = get_args()

    # Configure testing options from the command line.
    DEBUG = args.debug
    PROFILE = args.profile
    NUM_PRERUN = args.num_prerun
    NUM_ITERATIONS = args.num_iterations

    # Run every (case, dtype) combination on every requested device.
    for device in get_test_devices(args):
        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)

    print("\033[92mTest passed!\033[0m")
......@@ -102,6 +102,18 @@ if has_config("iluvatar-gpu") then
includes("xmake/iluvatar.lua")
end
-- qy
option("qy-gpu")
set_default(false)
set_showmenu(true)
set_description("Whether to compile implementations for Qy GPU")
option_end()
if has_config("qy-gpu") then
add_defines("ENABLE_QY_API")
includes("xmake/qy.lua")
end
-- 沐曦
option("metax-gpu")
set_default(false)
......@@ -228,6 +240,10 @@ target("infinirt")
if has_config("iluvatar-gpu") then
add_deps("infinirt-iluvatar")
end
if has_config("qy-gpu") then
add_deps("infinirt-qy")
add_files("build/.objs/infinirt-qy/rules/qy.cuda/src/infinirt/cuda/*.cu.o", {public = true})
end
if has_config("kunlun-xpu") then
add_deps("infinirt-kunlun")
end
......@@ -253,6 +269,11 @@ target("infiniop")
if has_config("iluvatar-gpu") then
add_deps("infiniop-iluvatar")
end
if has_config("qy-gpu") then
add_deps("infiniop-qy")
add_files("build/.objs/infiniop-qy/rules/qy.cuda/src/infiniop/ops/*/nvidia/*.cu.o", {public = true})
add_files("build/.objs/infiniop-qy/rules/qy.cuda/src/infiniop/devices/nvidia/*.cu.o", {public = true})
end
if has_config("cambricon-mlu") then
add_deps("infiniop-cambricon")
......@@ -303,6 +324,9 @@ target("infiniccl")
if has_config("iluvatar-gpu") then
add_deps("infiniccl-iluvatar")
end
if has_config("qy-gpu") then
add_deps("infiniccl-qy")
end
if has_config("moore-gpu") then
add_deps("infiniccl-moore")
......
-- Toolchain setup for the QY (Denglin) GPU SDK; cuDNN location is taken from
-- the environment when available.
local CUDNN_ROOT = os.getenv("CUDNN_ROOT") or os.getenv("CUDNN_HOME") or os.getenv("CUDNN_PATH")
if CUDNN_ROOT ~= nil then
    add_includedirs(CUDNN_ROOT .. "/include")
end

add_includedirs("/usr/local/denglin/sdk/include", "../include")
add_linkdirs("/usr/local/denglin/sdk/lib")
add_links("curt", "cublas", "cudnn")
set_languages("cxx17")
add_cxxflags("-std=c++17") -- explicitly pin C++17
add_cuflags("--std=c++17",{force = true}) -- make sure the CUDA compiler uses C++17 as well
-- Rule that swallows pre-built .o files so they are linked as-is.
rule("ignore.o")
    set_extensions(".o") -- keep xmake's default handling away from these files
    on_build_files(function () end)
-- Rule that compiles .cu sources with the Denglin dlcc compiler instead of
-- nvcc, registering the produced object files with the target.
rule("qy.cuda")
    set_extensions(".cu")

    on_load(function (target)
        target:add("includedirs", "/usr/local/denglin/sdk/include")
    end)

    after_load(function (target)
        -- Filter out the CUDA runtime system libraries (cudadevrt /
        -- cudart_static) that the Denglin toolchain does not provide.
        local links = target:get("syslinks") or {}
        local filtered = {}
        for _, link in ipairs(links) do
            if link ~= "cudadevrt" and link ~= "cudart_static" then
                table.insert(filtered, link)
            end
        end
        target:set("syslinks", filtered)
    end)

    on_buildcmd_file(function (target, batchcmds, sourcefile, opt)
        import("core.project.project")
        import("core.project.config")
        import("core.base.option")

        local dlcc = "/usr/local/denglin/sdk/bin/dlcc"
        local sdk_path = "/usr/local/denglin/sdk"
        local arch = "dlgput64"

        local relpath = path.relative(sourcefile, project.directory())
        local objfile = path.join(config.buildir(), ".objs", target:name(), "rules", "qy.cuda", relpath .. ".o")

        -- Force-register the .o file with the target so it gets linked.
        target:add("objectfiles", objfile)
        target:set("buildadd", true)

        local argv = {
            "-c", sourcefile,
            "-o", objfile,
            "--cuda-path=" .. sdk_path,
            "--cuda-gpu-arch=" .. arch,
            "-std=c++17", "-O2", "-fPIC"
        }
        for _, incdir in ipairs(target:get("includedirs") or {}) do
            table.insert(argv, "-I" .. incdir)
        end
        for _, def in ipairs(target:get("defines") or {}) do
            table.insert(argv, "-D" .. def)
        end

        batchcmds:mkdir(path.directory(objfile))
        batchcmds:show_progress(opt.progress, "${color.build.object}compiling.dlcu %s", relpath)
        batchcmds:vrunv(dlcc, argv)
    end)
-- Static library with the QY implementations of the infiniop operators.
-- It reuses the CUDA (nvidia) sources and compiles them with dlcc through the
-- "qy.cuda" rule defined above.
target("infiniop-qy")
    set_kind("static")
    add_deps("infini-utils")
    on_install(function (target) end)
    add_rules("qy.cuda", {override = true})
    if is_plat("windows") then
        add_cuflags("-Xcompiler=/utf-8", "--expt-relaxed-constexpr", "--allow-unsupported-compiler")
        add_cuflags("-Xcompiler=/W3", "-Xcompiler=/WX")
        add_cxxflags("/FS")
        if CUDNN_ROOT ~= nil then
            add_linkdirs(CUDNN_ROOT .. "\\lib\\x64")
        end
    else
        -- NOTE(review): these are nvcc-style flags; confirm dlcc accepts (or
        -- safely ignores) -Xcompiler/--extended-lambda before relying on them.
        add_cuflags("-Xcompiler=-Wall", "-Xcompiler=-Werror")
        add_cuflags("-Xcompiler=-fPIC")
        add_cuflags("--extended-lambda")
        add_culdflags("-Xcompiler=-fPIC")
        add_cxxflags("-fPIC")
        add_cuflags("--expt-relaxed-constexpr")
        if CUDNN_ROOT ~= nil then
            add_linkdirs(CUDNN_ROOT .. "/lib")
        end
    end
    add_cuflags("-Xcompiler=-Wno-error=deprecated-declarations")
    set_languages("cxx17")
    add_files("../src/infiniop/devices/nvidia/*.cu", "../src/infiniop/ops/*/nvidia/*.cu")
    if has_config("ninetoothed") then
        add_files("../build/ninetoothed/*.c")
    end
target_end()
-- Static library with the QY runtime layer, built from the shared CUDA
-- runtime sources via the dlcc-based "qy.cuda" rule.
target("infinirt-qy")
    set_kind("static")
    add_deps("infini-utils")
    on_install(function (target) end)
    add_rules("qy.cuda", {override = true})
    if is_plat("windows") then
        add_cuflags("-Xcompiler=/utf-8", "--expt-relaxed-constexpr", "--allow-unsupported-compiler")
        add_cxxflags("/FS")
    else
        add_cuflags("-Xcompiler=-fPIC")
        add_culdflags("-Xcompiler=-fPIC")
        add_cxflags("-fPIC")
    end
    set_languages("cxx17")
    add_files("../src/infinirt/cuda/*.cu")
target_end()
-- Static library with the QY collective-communication layer; only built when
-- the "ccl" option is enabled, and links against NCCL (NCCL_ROOT or system).
target("infiniccl-qy")
    set_kind("static")
    add_deps("infinirt")
    on_install(function (target) end)
    if has_config("ccl") then
        add_rules("qy.cuda", {override = true})
        if not is_plat("windows") then
            add_cuflags("-Xcompiler=-fPIC")
            add_culdflags("-Xcompiler=-fPIC")
            add_cxflags("-fPIC")
            local nccl_root = os.getenv("NCCL_ROOT")
            if nccl_root then
                add_includedirs(nccl_root .. "/include")
                add_links(nccl_root .. "/lib/libnccl.so")
            else
                add_links("nccl") -- Fall back to default nccl linking
            end
            add_files("../src/infiniccl/cuda/*.cu")
        else
            print("[Warning] NCCL is not supported on Windows")
        end
    end
    set_languages("cxx17")
target_end()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment