Commit c2e87202 authored by Catheriany

Merge remote-tracking branch 'origin/main' into issue/142

parents 41818f84 c203635b
......@@ -65,6 +65,7 @@ def test(
y_stride=None,
w12_stride=None,
w3_stride=None,
sync=None,
):
print(
f"Testing MLP on {torch_device} with num_tokens:{num_tokens} hidden_size:{hidden_size} intermediate_size:{intermediate_size}"
......@@ -97,6 +98,10 @@ def test(
x_tensor = to_tensor(x, lib)
w12_tensor = to_tensor(w12, lib)
w3_tensor = to_tensor(w3, lib)
if sync is not None:
sync()
descriptor = infiniopMLPDescriptor_t()
check_error(
lib.infiniopCreateMLPDescriptor(
......
import torch
import ctypes
from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64
from libinfiniop import (
infiniopHandle_t,
infiniopTensorDescriptor_t,
open_lib,
to_tensor,
get_test_devices,
check_error,
rearrange_if_needed,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
create_workspace,
)
from enum import Enum, auto
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
_TEST_CASES_ = [
# shape, a_stride, b_stride, c_stride
((13, 4), None, None, None),
((13, 4), (10, 1), (10, 1), (10, 1)),
((13, 4), (0, 1), None, None),
((13, 4, 4), None, None, None),
((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)),
((13, 4, 4), (4, 0, 1), (0, 4, 1), None),
((16, 5632), None, None, None),
((16, 5632), (13312, 1), (13312, 1), (13312, 1)),
((4, 4, 5632), None, None, None),
((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)),
]
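# Note: a 0 stride (e.g. (0, 1)) marks a broadcasted dimension: every index
# along that dimension reads the same underlying data.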
class Inplace(Enum):
OUT_OF_PLACE = auto()
INPLACE_A = auto()
INPLACE_B = auto()
# Inplace options applied for each test case in _TEST_CASES_
_INPLACE = [
Inplace.OUT_OF_PLACE,
Inplace.INPLACE_A,
Inplace.INPLACE_B,
]
# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_
_TEST_CASES = [
test_case + (inplace_item,)
for test_case in _TEST_CASES_
for inplace_item in _INPLACE
]
# Data types used for testing
_TENSOR_DTYPES = [torch.float16, torch.float32]
# Tolerance map for different data types
_TOLERANCE_MAP = {
torch.float16: {"atol": 1e-3, "rtol": 1e-3},
torch.float32: {"atol": 1e-7, "rtol": 1e-7},
}
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class MulDescriptor(Structure):
_fields_ = [("device", c_int32)]
infiniopMulDescriptor_t = POINTER(MulDescriptor)
def mul(x, y):
return torch.mul(x, y)
def process_tensors(c, c_strides, a, a_stride, b, b_stride, inplace):
"""
Rearrange the tensors if needed and apply the in-place config.
If in-place is requested but the output (i.e., c) would alias a
broadcasted input, c is reset to its original unbroadcasted strides.
"""
original_c_strides = c_strides if c_strides else c.stride()
def _rearrange(tensor, strides):
if strides and 0 in strides:
tensor.set_(tensor.untyped_storage(), 0, tensor.shape, strides)
return tensor
else:
return rearrange_if_needed(tensor, strides)
a, b, c = [
_rearrange(tensor, stride)
for tensor, stride in zip([a, b, c], [a_stride, b_stride, c_strides])
]
c = (
c
if inplace == Inplace.OUT_OF_PLACE
else (a if inplace == Inplace.INPLACE_A else b)
)
# if the in-place output ended up with broadcasted (zero) strides, reset them to the original unbroadcasted strides
if 0 in c.stride():
c.set_(c.untyped_storage(), 0, c.shape, original_c_strides)
return a, b, c
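# A minimal sketch (illustrative names) of the zero-stride aliasing this helper
# guards against: with a 0 stride, every index along that dim shares storage,
# so an in-place output over a broadcasted input must fall back to real strides.
_bcast = torch.zeros(1, 4).as_strided((13, 4), (0, 1))  # 13 logical rows, one shared buffer
assert _bcast[0].data_ptr() == _bcast[12].data_ptr()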
def test(
lib,
handle,
torch_device,
shape,
a_stride=None,
b_stride=None,
c_stride=None,
inplace=Inplace.OUT_OF_PLACE,
dtype=torch.float16,
sync=None,
):
print(
f"Testing Mul on {torch_device} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} "
f"dtype:{dtype} inplace:{inplace}"
)
a = torch.rand(shape, dtype=dtype).to(torch_device)
b = torch.rand(shape, dtype=dtype).to(torch_device)
c = torch.rand(shape, dtype=dtype).to(torch_device)
a, b, c = process_tensors(c, c_stride, a, a_stride, b, b_stride, inplace)
ans = mul(a, b)
a_tensor, b_tensor = [to_tensor(tensor, lib) for tensor in [a, b]]
c_tensor = (
to_tensor(c, lib)
if inplace == Inplace.OUT_OF_PLACE
else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor)
)
if sync is not None:
sync()
descriptor = infiniopMulDescriptor_t()
check_error(
lib.infiniopCreateMulDescriptor(
handle,
ctypes.byref(descriptor),
c_tensor.descriptor,
a_tensor.descriptor,
b_tensor.descriptor,
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
for tensor in [a_tensor, b_tensor, c_tensor]:
tensor.destroyDesc(lib)
workspace_size = c_uint64(0)
check_error(
lib.infiniopGetMulWorkspaceSize(descriptor, ctypes.byref(workspace_size))
)
workspace = create_workspace(workspace_size.value, c.device)
def lib_mul():
check_error(
lib.infiniopMul(
descriptor,
workspace.data_ptr() if workspace is not None else None,
workspace_size.value,
c_tensor.data,
a_tensor.data,
b_tensor.data,
None,
)
)
lib_mul()
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(c, ans, atol=atol, rtol=rtol)
assert torch.allclose(c, ans, atol=atol, rtol=rtol)
# Profiling workflow
if PROFILE:
# fmt: off
profile_operation("PyTorch", lambda: mul(a, b), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_mul(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on
check_error(lib.infiniopDestroyMulDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
lib = open_lib()
lib.infiniopCreateMulDescriptor.restype = c_int32
lib.infiniopCreateMulDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopMulDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopGetMulWorkspaceSize.restype = c_int32
lib.infiniopGetMulWorkspaceSize.argtypes = [
infiniopMulDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopMul.restype = c_int32
lib.infiniopMul.argtypes = [
infiniopMulDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyMulDescriptor.restype = c_int32
lib.infiniopDestroyMulDescriptor.argtypes = [
infiniopMulDescriptor_t,
]
# Configure testing options
DEBUG = args.debug
PROFILE = args.profile
NUM_PRERUN = args.num_prerun
NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args):
test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
......@@ -103,6 +103,7 @@ def test(
topk,
temperature,
dtype=torch.float16,
sync=None,
):
print(
f"Testing RandomSample on {torch_device} with voc:{voc} random_val:{random_val} topp:{topp} topk:{topk} temperature:{temperature} dtype:{dtype}"
......@@ -122,6 +123,9 @@ def test(
indices_tensor.descriptor.contents.dt = InfiniDtype.U64 # treat int64 as uint64
if sync is not None:
sync()
descriptor = infiniopRandomSampleDescriptor_t()
check_error(
lib.infiniopCreateRandomSampleDescriptor(
......
......@@ -17,19 +17,88 @@ from libinfiniop import (
profile_operation,
)
def row_major_strides(shape):
"""生成张量的行优先(C风格)stride
Args:
shape: 张量形状
Returns:
行优先strides列表
"""
# 行优先 (C风格,从最后一维到第一维)
stride = 1
strides = [1]
for dim in reversed(shape[1:]):
stride *= dim
strides.insert(0, stride)
return strides
def column_major_strides(shape):
"""生成张量的列优先(Fortran风格)stride
Args:
shape: 张量形状
Returns:
列优先strides列表
"""
# 列优先 (Fortran风格,从第一维到最后一维)
stride = 1
strides = [stride]
for dim in shape[:-1]:
stride *= dim
strides.append(stride)
return strides
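# Quick sanity check of the helpers above: for shape (2, 3, 4), row-major
# strides are [12, 4, 1] and column-major strides are [1, 2, 6].
assert row_major_strides((2, 3, 4)) == [12, 4, 1]
assert column_major_strides((2, 3, 4)) == [1, 2, 6]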
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
_TEST_CASES = [
# ((src_shape, src_stride), (dst_shape, dst_stride))
(((2, 4, 32), None), ((2, 4, 32), (256, 64, 1))),
(((32, 6, 64), (64, 2560, 1)), ((32, 6, 64), None)),
(((4, 6, 64), (64, 2560, 1)), ((4, 6, 64), (131072, 64, 1))),
(((1, 32, 64), (2048, 64, 1)), ((1, 32, 64), (2048, 64, 1))),
(((32, 1, 64), (64, 2560, 1)), ((32, 1, 64), (64, 64, 1))),
(((4, 1, 64), (64, 2560, 1)), ((4, 1, 64), (64, 11264, 1))),
(((64,), (1,)), ((64,), (1,))),
# (shape, x_stride, y_stride)
(
(2, 4, 64), # shape
(2, 4, 8), # x_stride
(512, 128, 2) # y_stride
),
(
(100, 100), # shape
(1, 100), # x_stride
(100, 1) # y_stride
),
(
(4, 4), # shape
(1, 4), # x_stride
(4, 1) # y_stride
),
(
(4, 6, 64), # shape
(64, 4*64, 1), # x_stride
(6*64, 64, 1) # y_stride
),
(
(2000, 2000), # shape
(1, 2000), # x_stride
(2000, 1) # y_stride
),
(
(2001, 2001), # shape
(1, 2001), # x_stride
(2001, 1) # y_stride
),
(
(3, 4, 7, 53, 9), # shape
row_major_strides((3, 4, 7, 53, 9)), # x_stride
column_major_strides((3, 4, 7, 53, 9)) # y_stride
),
(
(3, 4, 50, 50, 5, 7), # shape
row_major_strides((3, 4, 50, 50, 5, 7)), # x_stride
column_major_strides((3, 4, 50, 50, 5, 7)) # y_stride
),
]
# Data types used for testing
......@@ -58,24 +127,28 @@ def test(
lib,
handle,
torch_device,
x_shape,
shape,
x_stride,
y_shape,
y_stride,
dtype=torch.float16,
sync=None,
):
print(
f"Testing Rerrange on {torch_device} with x_shape:{x_shape} x_stride:{x_stride} y_shape:{y_shape} y_stride:{y_stride} dtype:{dtype}"
f"Testing Rerrange on {torch_device} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} dtype:{dtype}"
)
x = torch.rand(x_shape, dtype=dtype).to(torch_device)
y = torch.zeros(y_shape, dtype=dtype).to(torch_device)
x = torch.rand(shape, dtype=dtype).to(torch_device)
y = torch.zeros(shape, dtype=dtype).to(torch_device)
x, y = [
rearrange_if_needed(tensor, stride)
for tensor, stride in zip([x, y], [x_stride, y_stride])
]
x_tensor, y_tensor = [to_tensor(tensor, lib) for tensor in [x, y]]
if sync is not None:
sync()
descriptor = infiniopRearrangeDescriptor_t()
check_error(
......@@ -86,7 +159,7 @@ def test(
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
for tensor in [x_tensor, y_tensor]:
tensor.descriptor.contents.invalidate()
tensor.destroyDesc(lib)
def lib_rearrange():
check_error(
......
......@@ -55,6 +55,7 @@ def test(
tensor_shape,
tensor_dtype=torch.float16,
inplace=Inplace.OUT_OF_PLACE,
sync=None,
):
print(
f"Testing Relu on {torch_device} with tensor_shape:{tensor_shape} dtype:{tensor_dtype} inplace: {inplace.name}"
......@@ -78,8 +79,11 @@ def test(
x_tensor = to_tensor(x, lib)
y_tensor = to_tensor(y, lib) if inplace == Inplace.OUT_OF_PLACE else x_tensor
descriptor = infiniopReluDescriptor_t()
if sync is not None:
sync()
descriptor = infiniopReluDescriptor_t()
check_error(
lib.infiniopCreateReluDescriptor(
handle,
......
......@@ -72,6 +72,7 @@ def test(
x_stride,
w_dtype=torch.float16,
dtype=torch.float16,
sync=None,
):
print(
f"Testing RMS_Norm on {torch_device} with y_shape:{y_shape} x_shape:{x_shape} w_shape:{w_shape}"
......@@ -89,9 +90,11 @@ def test(
rearrange_if_needed(tensor, stride)
for tensor, stride in zip([x, y], [x_stride, y_stride])
]
x_tensor, y_tensor, w_tensor = [to_tensor(tensor, lib) for tensor in [x, y, w]]
if sync is not None:
sync()
descriptor = infiniopRMSNormDescriptor_t()
check_error(
......
import torch
import ctypes
from ctypes import POINTER, Structure, c_int32, c_size_t, c_uint64, c_void_p, c_float
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p
from libinfiniop import (
InfiniDtype,
infiniopHandle_t,
infiniopTensorDescriptor_t,
open_lib,
......@@ -18,30 +17,49 @@ from libinfiniop import (
profile_operation,
synchronize_device,
)
from enum import Enum, auto
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
_TEST_CASES = [
# (t_shape, t_strides)
((1, 32, 128), None),
((1, 32, 64), None),
_TEST_CASES_ = [
# (shape, x_strides, y_strides)
((1, 32, 128), None, None),
((10, 32, 64), None, None),
# Ascend does not yet pass this case: a last dimension <= 32 fails, likely
# due to the internal implementation of its core GatherMask interface;
# last dimensions of 48, 64, and 128 are currently all supported
((4, 1, 32), None),
((1, 32, 128), None),
((3, 32, 128), (8000, 200, 1)),
((4, 1, 32), (64, 64, 1), None),
((11, 33, 128), None, (8000, 200, 1)),
((3, 32, 128), (8000, 200, 1), (7000, 128, 1)),
]
# Data types used for testing
_TENSOR_DTYPES = [torch.float16]
_TENSOR_DTYPES = [torch.float16, torch.float32]
# Tolerance map for different data types
_TOLERANCE_MAP = {
torch.float16: {"atol": 1e-4, "rtol": 1e-2},
torch.float16: {"atol": 1e-3, "rtol": 1e-2},
torch.float32: {"atol": 1e-4, "rtol": 1e-3},
}
class Inplace(Enum):
OUT_OF_PLACE = auto()
INPLACE_X = auto()
_INPLACE = [
Inplace.OUT_OF_PLACE,
Inplace.INPLACE_X,
]
_TEST_CASES = [
test_case + (inplace_item,)
for test_case in _TEST_CASES_
for inplace_item in _INPLACE
]
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
......@@ -55,23 +73,21 @@ class RoPEDescriptor(Structure):
infiniopRoPEDescriptor_t = POINTER(RoPEDescriptor)
def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
ndim = x.ndim
assert ndim > 1
assert freqs_cis.shape == (x.shape[0], x.shape[-1])
shape = [d if i == 0 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
return freqs_cis.view(*shape)
def rotary_embedding(t, pos, theta, torch_device):
def rotary_embedding(t, sin, cos, torch_device):
dh = t.shape[2]
dt = t.dtype
assert dh % 2 == 0, "Embedding dimension must be even."
t_even = t[..., 0::2] # [seq_len, n_head, dh // 2]
t_odd = t[..., 1::2] # [seq_len, n_head, dh // 2]
freqs = (1.0 / (theta ** (torch.arange(0, dh, 2).float() / dh))).to(torch_device)
freqs = torch.outer(pos, freqs) # [seq_len, dh // 2]
cos = torch.cos(freqs).unsqueeze(1) # [seq_len, 1, dh // 2]
sin = torch.sin(freqs).unsqueeze(1) # [seq_len, 1, dh // 2]
cos = cos.unsqueeze(1) # [seq_len, 1, dh // 2]
sin = sin.unsqueeze(1) # [seq_len, 1, dh // 2]
if torch_device == "cpu":
(t_even, t_odd, cos, sin) = (
t_even.float(),
t_odd.float(),
cos.float(),
sin.float(),
)
t_out_even = t_even * cos - t_odd * sin
t_out_odd = t_even * sin + t_odd * cos
......@@ -80,60 +96,67 @@ def rotary_embedding(t, pos, theta, torch_device):
t_out[..., 0::2] = t_out_even
t_out[..., 1::2] = t_out_odd
return t_out
return t_out.to(dt).to(torch_device)
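# Sanity sketch (illustrative names): the even/odd pairing above is the complex
# rotation (t_even + i*t_odd) * (cos + i*sin); a tiny CPU check of that equivalence:
_t = torch.rand(2, 1, 4)
_angles = torch.outer(torch.arange(2.0), torch.ones(2))
_ref = rotary_embedding(_t, torch.sin(_angles), torch.cos(_angles), "cpu")
_rot = torch.view_as_complex(torch.stack((_t[..., 0::2], _t[..., 1::2]), -1))
_rot = _rot * torch.polar(torch.ones_like(_angles), _angles).unsqueeze(1)
assert torch.allclose(_ref[..., 0::2], torch.view_as_real(_rot)[..., 0], atol=1e-5)
assert torch.allclose(_ref[..., 1::2], torch.view_as_real(_rot)[..., 1], atol=1e-5)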
def sin_cos_table(max_seq_len, dim, torch_device, theta):
pos = torch.arange(
0, max_seq_len, dtype=torch.float32, device=torch.device(torch_device)
)
def sin_cos_table(pos, dim, torch_device, theta, dtype):
assert dim % 2 == 0, "Embedding dimension must be even."
freqs = (1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))).to(
torch_device
)
# (a0, a1, a2) -> (a0, a0, a1, a1, a2, a2)
freqs = torch.repeat_interleave(freqs, repeats=2)
angles = torch.outer(pos, freqs)
return torch.sin(angles), torch.cos(angles)
def test(lib, handle, torch_device, shape, strides=None, dtype=torch.float16):
return torch.sin(angles).to(dtype), torch.cos(angles).to(dtype)
def test(
lib,
handle,
torch_device,
shape,
x_strides=None,
y_strides=None,
inplace=Inplace.OUT_OF_PLACE,
dtype=torch.float32,
sync=None,
):
if inplace == Inplace.INPLACE_X:
y_strides = x_strides
print(
f"Testing Rotary Positional Embedding on {torch_device} with shape:{shape} strides:{strides} and dtype:{dtype}"
f"Testing Rotary Positional Embedding on {torch_device} with shape:{shape} x_strides:{x_strides} y_strides:{y_strides} and dtype:{dtype} inplace:{inplace}"
)
t = torch.rand(shape, dtype=dtype)
x = torch.rand(shape, dtype=dtype).to(torch_device)
x = rearrange_if_needed(x, x_strides)
if inplace == Inplace.INPLACE_X:
y = x
else:
y = torch.rand(shape, dtype=dtype).to(torch_device)
y = rearrange_if_needed(y, y_strides)
theta = 1e5
pos = torch.arange(0, x.shape[0], dtype=torch.int32).to(torch_device)
sin_table, cos_table = sin_cos_table(pos, x.shape[2], x.device, theta, dtype)
t = rearrange_if_needed(t, strides)
posTmp = torch.arange(0, t.shape[0]).to(torch_device)
pos = torch.zeros(2 * posTmp.shape[0], dtype=torch.int32)
for i in range(posTmp.shape[0]):
pos[2 * i] = posTmp[i]
pos[2 * i + 1] = 0
pos = pos.to(torch_device)
theta = 1e4
ans = rotary_embedding(t, posTmp, theta, torch_device)
ans = rotary_embedding(x, sin_table, cos_table, torch_device)
descriptor = infiniopRoPEDescriptor_t()
# 2x table length for test
sin_table, cos_table = sin_cos_table(t.shape[0] * 2, t.shape[2], t.device, theta)
t_tensor, sin_table_tensor, cos_table_tensor = [
to_tensor(tensor, lib) for tensor in [t, sin_table, cos_table]
x_tensor, pos_tensor, sin_table_tensor, cos_table_tensor = [
to_tensor(tensor, lib, force_unsigned=True)
for tensor in [x, pos, sin_table, cos_table]
]
if inplace == Inplace.INPLACE_X:
y_tensor = x_tensor
else:
y_tensor = to_tensor(y, lib)
pos_tensor = to_tensor(pos[: t.shape[0]], lib)
pos_tensor.descriptor.contents.dtype = InfiniDtype.U64
if torch_device == "npu":
synchronize_device(torch_device)
if sync is not None:
sync()
check_error(
lib.infiniopCreateRoPEDescriptor(
handle,
ctypes.byref(descriptor),
t_tensor.descriptor,
y_tensor.descriptor,
x_tensor.descriptor,
pos_tensor.descriptor,
sin_table_tensor.descriptor,
cos_table_tensor.descriptor,
......@@ -141,14 +164,14 @@ def test(lib, handle, torch_device, shape, strides=None, dtype=torch.float16):
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
for tensor in [t_tensor, pos_tensor, sin_table_tensor, cos_table_tensor]:
tensor.descriptor.contents.invalidate()
for tensor in [y_tensor, x_tensor, pos_tensor, sin_table_tensor, cos_table_tensor]:
tensor.destroyDesc(lib)
workspace_size = c_uint64(0)
check_error(
lib.infiniopGetRoPEWorkspaceSize(descriptor, ctypes.byref(workspace_size))
)
workspace = create_workspace(workspace_size.value, t.device)
workspace = create_workspace(workspace_size.value, x.device)
def lib_rope():
check_error(
......@@ -156,7 +179,8 @@ def test(lib, handle, torch_device, shape, strides=None, dtype=torch.float16):
descriptor,
workspace.data_ptr() if workspace is not None else None,
workspace_size.value,
t_tensor.data,
y_tensor.data,
x_tensor.data,
pos_tensor.data,
sin_table_tensor.data,
cos_table_tensor.data,
......@@ -165,16 +189,19 @@ def test(lib, handle, torch_device, shape, strides=None, dtype=torch.float16):
)
lib_rope()
if sync is not None:
sync()
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(t, ans, atol=atol, rtol=rtol)
assert torch.allclose(t, ans, atol=atol, rtol=rtol)
debug(y, ans, atol=atol, rtol=rtol)
assert torch.allclose(y, ans, atol=atol, rtol=rtol)
if PROFILE:
profile_operation(
"PyTorch",
lambda: rotary_embedding(t, posTmp, theta, torch_device),
lambda: rotary_embedding(x, sin_table, cos_table, torch_device),
torch_device,
NUM_PRERUN,
NUM_ITERATIONS,
......@@ -232,5 +259,5 @@ if __name__ == "__main__":
# Execute tests
for device in get_test_devices(args):
test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
import torch
import ctypes
from ctypes import POINTER, Structure, c_int32, c_void_p
from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64
from libinfiniop import (
infiniopHandle_t,
infiniopTensorDescriptor_t,
......@@ -14,6 +14,7 @@ from libinfiniop import (
debug,
get_tolerance,
profile_operation,
create_workspace,
)
from enum import Enum, auto
......@@ -25,8 +26,10 @@ _TEST_CASES_ = [
# shape, a_stride, b_stride, c_stride
((13, 4), None, None, None),
((13, 4), (10, 1), (10, 1), (10, 1)),
((13, 4), (0, 1), None, None),
((13, 4, 4), None, None, None),
((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)),
((13, 4, 4), (4, 0, 1), (0, 4, 1), None),
((16, 5632), None, None, None),
((16, 5632), (13312, 1), (13312, 1), (13312, 1)),
((4, 4, 5632), None, None, None),
......@@ -58,7 +61,8 @@ _TENSOR_DTYPES = [torch.float16, torch.float32]
# Tolerance map for different data types
_TOLERANCE_MAP = {
torch.float16: {"atol": 1e-4, "rtol": 1e-2},
torch.float16: {"atol": 1e-3, "rtol": 1e-3},
torch.float32: {"atol": 2e-7, "rtol": 1e-7},
}
DEBUG = False
......@@ -76,6 +80,38 @@ infiniopSwiGLUDescriptor_t = POINTER(SwiGLUDescriptor)
def swiglu(a, b):
return a * b / (1 + torch.exp(-b.float()).to(b.dtype))
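# Sanity sketch: since b / (1 + exp(-b)) == b * sigmoid(b) == silu(b), the
# reference above is equivalent to a * F.silu(b) for float32 inputs:
_sa, _sb = torch.rand(8), torch.rand(8)
assert torch.allclose(swiglu(_sa, _sb), _sa * torch.nn.functional.silu(_sb), atol=1e-6)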
def process_tensors(c, c_strides, a, a_stride, b, b_stride, inplace):
"""
Rearrange the tensors if needed and apply the in-place config.
If in-place is requested but the output (i.e., c) would alias a
broadcasted input, c is reset to its original unbroadcasted strides.
"""
original_c_strides = c_strides if c_strides else c.stride()
def _rearrange(tensor, strides):
if strides and 0 in strides:
tensor.set_(tensor.untyped_storage(), 0, tensor.shape, strides)
return tensor
else:
return rearrange_if_needed(tensor, strides)
a, b, c = [
_rearrange(tensor, stride)
for tensor, stride in zip([a, b, c], [a_stride, b_stride, c_strides])
]
c = (
c
if inplace == Inplace.OUT_OF_PLACE
else (a if inplace == Inplace.INPLACE_A else b)
)
# if the in-place output ended up with broadcasted (zero) strides, reset them to the original unbroadcasted strides
if 0 in c.stride():
c.set_(c.untyped_storage(), 0, c.shape, original_c_strides)
return a, b, c
def test(
......@@ -98,18 +134,10 @@ def test(
a = torch.rand(shape, dtype=dtype).to(torch_device)
b = torch.rand(shape, dtype=dtype).to(torch_device)
c = torch.rand(shape, dtype=dtype).to(torch_device)
a, b, c = process_tensors(c, c_stride, a, a_stride, b, b_stride, inplace)
ans = swiglu(a, b)
a, b, c = [
rearrange_if_needed(tensor, stride)
for tensor, stride in zip([a, b, c], [a_stride, b_stride, c_stride])
]
c = (
c
if inplace == Inplace.OUT_OF_PLACE
else (a if inplace == Inplace.INPLACE_A else b)
)
a_tensor, b_tensor = [to_tensor(tensor, lib) for tensor in [a, b]]
c_tensor = (
to_tensor(c, lib)
......@@ -134,10 +162,19 @@ def test(
for tensor in [a_tensor, b_tensor, c_tensor]:
tensor.destroyDesc(lib)
workspace_size = c_uint64(0)
check_error(
lib.infiniopGetSwiGLUWorkspaceSize(descriptor, ctypes.byref(workspace_size))
)
workspace = create_workspace(workspace_size.value, c.device)
def lib_swiglu():
check_error(
lib.infiniopSwiGLU(
descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None
descriptor,
workspace.data_ptr() if workspace is not None else None,
workspace_size.value,
c_tensor.data, a_tensor.data, b_tensor.data, None
)
)
......@@ -170,10 +207,18 @@ if __name__ == "__main__":
infiniopTensorDescriptor_t,
]
lib.infiniopGetSwiGLUWorkspaceSize.restype = c_int32
lib.infiniopGetSwiGLUWorkspaceSize.argtypes = [
infiniopSwiGLUDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopSwiGLU.restype = c_int32
lib.infiniopSwiGLU.argtypes = [
infiniopSwiGLUDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
......
......@@ -4,9 +4,10 @@ local GREEN = '\27[0;32m'
local YELLOW = '\27[1;33m'
local NC = '\27[0m' -- No Color
add_includedirs("include")
set_encodings("utf-8")
add_includedirs("include")
if is_mode("debug") then
add_defines("DEBUG_MODE")
end
......@@ -117,6 +118,18 @@ if has_config("kunlun-xpu") then
includes("xmake/kunlun.lua")
end
-- InfiniCCL
option("ccl")
set_default(false)
set_showmenu(true)
set_description("Whether to compile implementations for InfiniCCL")
option_end()
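-- The option is toggled at configure time, e.g. `xmake f --ccl=y`.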
if has_config("ccl") then
add_defines("ENABLE_CCL")
end
target("infini-utils")
set_kind("static")
on_install(function (target) end)
......@@ -149,6 +162,9 @@ target("infinirt")
if has_config("nv-gpu") then
add_deps("infinirt-cuda")
end
if has_config("cambricon-mlu") then
add_deps("infinirt-cambricon")
end
if has_config("ascend-npu") then
add_deps("infinirt-ascend")
end
......@@ -219,10 +235,25 @@ target("infiniop")
add_installfiles("include/infinicore.h", {prefixdir = "include"})
target_end()
target("infiniccl")
set_kind("shared")
add_deps("infinirt")
if has_config("nv-gpu") then
add_deps("infiniccl-cuda")
end
set_languages("cxx17")
add_files("src/infiniccl/*.cc")
add_installfiles("include/infiniccl.h", {prefixdir = "include"})
set_installdir(os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini"))
target_end()
target("all")
set_kind("phony")
add_deps("infiniop", "infinirt")
add_deps("infiniop", "infinirt", "infiniccl")
after_build(function (target) print(YELLOW .. "[Congratulations!] Now you can install the libraries with \"xmake install\"" .. NC) end)
target_end()
......
......@@ -50,9 +50,8 @@ target("infiniop-ascend")
add_files("$(projectdir)/src/infiniop/devices/ascend/*.cc", "$(projectdir)/src/infiniop/ops/*/ascend/*.cc")
-- Add operator
-- TODO: add it back after ascend-kernels is fixed
-- add_rules("ascend-kernels")
-- add_links(builddir.."/libascend_kernels.a")
add_rules("ascend-kernels")
add_links(builddir.."/libascend_kernels.a")
target_end()
target("infinirt-ascend")
......
......@@ -50,3 +50,13 @@ target("infiniop-cambricon")
add_files(mlu_files, {rule = "mlu"})
end
target_end()
target("infinirt-cambricon")
set_kind("static")
add_deps("infini-utils")
set_languages("cxx17")
on_install(function (target) end)
-- Add include dirs
add_files("../src/infinirt/bang/*.cc")
add_cxflags("-lstdc++ -Wall -Werror -fPIC")
target_end()
......@@ -28,6 +28,7 @@ target("infiniop-cuda")
else
add_cuflags("-Xcompiler=-Wall", "-Xcompiler=-Werror")
add_cuflags("-Xcompiler=-fPIC")
add_cuflags("--extended-lambda")
add_culdflags("-Xcompiler=-fPIC")
add_cxxflags("-fPIC")
end
......@@ -57,3 +58,34 @@ target("infinirt-cuda")
set_languages("cxx17")
add_files("../src/infinirt/cuda/*.cu")
target_end()
target("infiniccl-cuda")
set_kind("static")
add_deps("infinirt")
on_install(function (target) end)
if has_config("ccl") then
set_policy("build.cuda.devlink", true)
set_toolchains("cuda")
add_links("cudart")
if not is_plat("windows") then
add_cuflags("-Xcompiler=-fPIC")
add_culdflags("-Xcompiler=-fPIC")
add_cxflags("-fPIC")
local nccl_root = os.getenv("NCCL_ROOT")
if nccl_root then
add_includedirs(nccl_root .. "/include")
add_links(nccl_root .. "/lib/libnccl.so")
else
add_links("nccl") -- Fall back to default nccl linking
end
add_files("../src/infiniccl/cuda/*.cu")
else
print("[Warning] NCCL is not supported on Windows")
end
end
set_languages("cxx17")
target_end()
add_defines("ENABLE_KUNLUN_API")
local KUNLUN_HOME = os.getenv("KUNLUN_HOME")
local XTDK_DIR = path.join(KUNLUN_HOME, "XTDK")
-- Add include dirs
add_includedirs(path.join(KUNLUN_HOME, "include"), {public=true})
......@@ -7,6 +8,55 @@ add_linkdirs(path.join(KUNLUN_HOME, "lib64"))
add_links("xpurt")
add_links("xpuapi")
rule("xpu")
set_extensions(".xpu")
on_load(function (target)
target:add("includedirs", path.join(os.projectdir(), "include"))
end)
on_build_file(function (target, sourcefile)
local objectfile = target:objectfile(sourcefile)
local basename = objectfile:gsub("%.o$", "")
os.mkdir(path.directory(objectfile))
local cc = path.join(XTDK_DIR, "bin/clang++")
local includedirs = table.concat(target:get("includedirs"), " ")
local arch_map = {
["x86_64"] = "x86_64-linux-gnu",
["arm64"] = "aarch64-linux-gnu"
}
local args = {
"--sysroot=/",
"--target=" .. arch_map[os.arch()],
"-fPIC",
"-pie",
"--xpu-arch=xpu2",
"--basename", basename,
"-std=c++11",
"-O2",
"-fno-builtin",
"-g",
"-c", sourcefile,
"-v"
}
for _, includedir in ipairs(target:get("includedirs")) do
table.insert(args, "-I" .. includedir)
end
-- print(args)
os.execv(cc, args)
table.insert(target:objectfiles(), objectfile)
table.insert(target:objectfiles(), basename .. ".device.bin.o")
print(target:objectfiles())
end)
rule_end()
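-- .xpu sources opt into this rule via add_files(..., {rule = "xpu"}), as done
-- for the kunlun operator kernels below.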
local src_dir = path.join(os.projectdir(), "src", "infiniop")
target("infiniop-kunlun")
set_kind("static")
add_deps("infini-utils")
......@@ -17,6 +67,11 @@ target("infiniop-kunlun")
set_languages("cxx17")
add_files("$(projectdir)/src/infiniop/devices/kunlun/*.cc", "$(projectdir)/src/infiniop/ops/*/kunlun/*.cc")
-- compile handwriting kernel
local xpu_files = os.files(src_dir .. "/ops/*/kunlun/*.xpu")
if #xpu_files > 0 then
add_files(xpu_files, {rule = "xpu"})
end
target_end()
target("infinirt-kunlun")
......
local MACA_ROOT = os.getenv("MACA_PATH") or os.getenv("MACA_HOME") or os.getenv("MACA_ROOT")
add_includedirs(MACA_ROOT .. "/include")
add_linkdirs(MACA_ROOT .. "/lib")
add_links("libhcdnn.so")
add_links("libhcblas.so")
add_links("libhcruntime.so")
add_links("hcdnn", "hcblas", "hcruntime")
rule("maca")
set_extensions(".maca")
......@@ -34,13 +31,11 @@ rule_end()
target("infiniop-metax")
set_kind("static")
on_install(function (target) end)
add_cxflags("-lstdc++ -Wall -fPIC")
set_languages("cxx17")
set_warnings("all")
set_warnings("all", "error")
add_cxflags("-lstdc++", "-fPIC", "-Wno-defaulted-function-deleted", "-Wno-strict-aliasing")
add_files("../src/infiniop/devices/maca/*.cc", "../src/infiniop/ops/*/maca/*.cc")
add_files("../src/infiniop/ops/*/maca/*.maca", {rule = "maca"})
target_end()
target("infinirt-metax")
......@@ -48,7 +43,7 @@ target("infinirt-metax")
set_languages("cxx17")
on_install(function (target) end)
add_deps("infini-utils")
-- Add files
add_files("$(projectdir)/src/infinirt/maca/*.cc")
add_cxflags("-lstdc++ -Wall -Werror -fPIC")
set_warnings("all", "error")
add_cxflags("-lstdc++ -fPIC")
add_files("../src/infinirt/maca/*.cc")
target_end()
......@@ -34,3 +34,20 @@ target("infiniop-test")
set_installdir(INFINI_ROOT)
target_end()
target("infiniccl-test")
set_kind("binary")
add_deps("infini-utils")
set_default(false)
set_warnings("all", "error")
set_languages("cxx17")
local INFINI_ROOT = os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini")
add_includedirs(INFINI_ROOT.."/include")
add_linkdirs(INFINI_ROOT.."/lib")
add_links("infinirt", "infiniccl")
add_files(os.projectdir().."/src/infiniccl-test/*.cpp")
set_installdir(os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini"))
target_end()