Unverified Commit 0166515c authored by PanZezhong1725's avatar PanZezhong1725 Committed by GitHub
Browse files

Merge branch 'main' into issue/300

parents f0300ff3 a23c4d13
from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64
import ctypes
import sys
import os
import time
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
)
from operatorspy.tests.test_utils import get_args
import torch
from typing import Tuple
# constant for control whether profile the pytorch and lib functions
# NOTE: need to manually add synchronization function to the lib function,
# e.g., cudaDeviceSynchronize() for CUDA
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class MaxPoolDescriptor(Structure):
    # Opaque handle mirroring the C-side MaxPool descriptor; only the
    # device id is exposed to Python.
    _fields_ = [("device", c_int32)]


infiniopMaxPoolDescriptor_t = POINTER(MaxPoolDescriptor)
def pool(x, k, padding, stride, dilation=1):
    """Reference max pooling computed with PyTorch.

    Args:
        x: input tensor of shape (N, C, *spatial); 1 to 3 spatial dims supported.
        k: kernel size.
        padding: per-dimension padding.
        stride: per-dimension stride.
        dilation: kernel dilation (default 1).

    Returns:
        The pooled tensor, or None if the spatial rank is unsupported.
    """
    pooling_layers = {
        1: torch.nn.MaxPool1d,
        2: torch.nn.MaxPool2d,
        3: torch.nn.MaxPool3d,
    }
    ndim = len(x.shape) - 2  # spatial rank (excludes batch and channel dims)
    if ndim not in pooling_layers:
        print("Error: Pytorch -> Unsupported tensor dimension")
        return None
    ans = pooling_layers[ndim](k, stride=stride, padding=padding, dilation=dilation)(x)
    if PROFILE:
        # Fix: the original called torch.cuda.synchronize() unconditionally,
        # which raises when profiling on a CPU-only build. Only synchronize
        # when CUDA is actually available.
        if torch.cuda.is_available():
            torch.cuda.synchronize()
    return ans
def inferShape(x_shape, kernel_shape, padding, strides):
    """Compute the pooled output shape for the given input and pooling params.

    The leading (batch, channel) dims are passed through; each spatial dim
    follows the standard pooling formula (dim + 2*pad - kernel) // stride + 1.
    """
    assert (
        len(x_shape) - 2 == len(kernel_shape) == len(padding) == len(strides)
    ), "kernel, pads, and strides should have the same length; the length of input x should be 2 more than that of kernel"
    pooled = tuple(
        (dim + 2 * p - k) // s + 1
        for dim, k, p, s in zip(x_shape[2:], kernel_shape, padding, strides)
    )
    return x_shape[:2] + pooled
def tuple_to_void_p(py_tuple: Tuple):
    """Pack a Python tuple of ints into a C int64 array and return it as void*."""
    arr_type = ctypes.c_int64 * len(py_tuple)
    buf = arr_type(*py_tuple)
    # ctypes.cast keeps a reference to `buf` on the returned object,
    # so the underlying memory stays alive as long as the pointer does.
    return ctypes.cast(buf, ctypes.c_void_p)
def test(
    lib,
    handle,
    torch_device,
    x_shape,
    k_shape,
    padding,
    strides,
    tensor_dtype=torch.float16,
    sync=None
):
    """Run one MaxPool case and compare the lib's output against PyTorch.

    Creates the descriptor, queries and allocates the workspace, runs the
    operator (optionally profiling both implementations), then asserts the
    results match within rtol=1e-3.
    """
    print(
        f"Testing MaxPool on {torch_device} with x_shape:{x_shape} kernel_shape:{k_shape} padding:{padding} strides:{strides} dtype:{tensor_dtype}"
    )
    x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device)
    # y is pre-allocated with the inferred output shape; the lib overwrites it.
    y = torch.rand(
        inferShape(x_shape, k_shape, padding, strides), dtype=tensor_dtype
    ).to(torch_device)
    # Warm-up runs (and the reference answer); extra iterations only when profiling.
    for i in range(NUM_PRERUN if PROFILE else 1):
        ans = pool(x, k_shape, padding, strides)
    if PROFILE:
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            _ = pool(x, k_shape, padding, strides)
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f"pytorch time: {elapsed :6f}")
    x_tensor = to_tensor(x, lib)
    y_tensor = to_tensor(y, lib)
    if sync is not None:
        sync()
    descriptor = infiniopMaxPoolDescriptor_t()
    check_error(
        lib.infiniopCreateMaxPoolDescriptor(
            handle,
            ctypes.byref(descriptor),
            y_tensor.descriptor,
            x_tensor.descriptor,
            tuple_to_void_p(k_shape),
            tuple_to_void_p(padding),
            tuple_to_void_p(strides),
            len(k_shape),
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    x_tensor.descriptor.contents.invalidate()
    y_tensor.descriptor.contents.invalidate()
    workspaceSize = ctypes.c_uint64(0)
    check_error(
        lib.infiniopGetMaxPoolWorkspaceSize(descriptor, ctypes.byref(workspaceSize))
    )
    # Workspace is a raw byte buffer allocated on the target device.
    workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(
        torch_device
    )
    workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8))
    for i in range(NUM_PRERUN if PROFILE else 1):
        check_error(
            lib.infiniopMaxPool(
                descriptor,
                workspace_ptr,
                workspaceSize,
                y_tensor.data,
                x_tensor.data,
                None,
            )
        )
    if PROFILE:
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            check_error(
                lib.infiniopMaxPool(
                    descriptor,
                    workspace_ptr,
                    workspaceSize,
                    y_tensor.data,
                    x_tensor.data,
                    None,
                )
            )
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f" lib time: {elapsed :6f}")
    # The lib writes its result into y in place; compare with the reference.
    assert torch.allclose(y, ans, atol=0, rtol=1e-3)
    check_error(lib.infiniopDestroyMaxPoolDescriptor(descriptor))
def test_cpu(lib, test_cases):
    """Run every MaxPool test case on the CPU device in fp16 and fp32."""
    device = DeviceEnum.DEVICE_CPU
    handle = create_handle(lib, device)
    for case in test_cases:
        # case = (x_shape, kernel_shape, padding, strides)
        # fmt: off
        test(lib, handle, "cpu", *case, tensor_dtype=torch.float16)
        test(lib, handle, "cpu", *case, tensor_dtype=torch.float32)
        # fmt: on
    destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
    """Run every MaxPool test case on the CUDA device in fp16 and fp32."""
    device = DeviceEnum.DEVICE_CUDA
    handle = create_handle(lib, device)
    for case in test_cases:
        # case = (x_shape, kernel_shape, padding, strides)
        # fmt: off
        test(lib, handle, "cuda", *case, tensor_dtype=torch.float16)
        test(lib, handle, "cuda", *case, tensor_dtype=torch.float32)
        # fmt: on
    destroy_handle(lib, handle)
def test_bang(lib, test_cases):
    """Run every MaxPool test case on the Cambricon BANG (MLU) device."""
    import torch_mlu  # deferred: only needed when MLU is requested

    device = DeviceEnum.DEVICE_BANG
    handle = create_handle(lib, device)
    for case in test_cases:
        # case = (x_shape, kernel_shape, padding, strides)
        # fmt: off
        test(lib, handle, "mlu", *case, tensor_dtype=torch.float16)
        test(lib, handle, "mlu", *case, tensor_dtype=torch.float32)
        # fmt: on
    destroy_handle(lib, handle)
if __name__ == "__main__":
    test_cases = [
        # x_shape, kernel_shape, padding, strides
        ((1, 1, 10), (3,), (1,), (1,)),
        ((32, 3, 224, 224), (3, 3), (1, 1), (2, 2)),
        ((1, 1, 16, 16, 16), (5, 5, 5), (2, 2, 2), (2, 2, 2)),
    ]
    args = get_args()
    lib = open_lib()
    # Declare the C signatures of the MaxPool entry points so ctypes
    # marshals arguments and return codes correctly.
    lib.infiniopCreateMaxPoolDescriptor.restype = c_int32
    lib.infiniopCreateMaxPoolDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopMaxPoolDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        c_void_p,
        c_void_p,
        c_void_p,
        c_uint64,
    ]
    lib.infiniopGetMaxPoolWorkspaceSize.restype = c_int32
    lib.infiniopGetMaxPoolWorkspaceSize.argtypes = [
        infiniopMaxPoolDescriptor_t,
        POINTER(c_uint64),
    ]
    lib.infiniopMaxPool.restype = c_int32
    lib.infiniopMaxPool.argtypes = [
        infiniopMaxPoolDescriptor_t,
        c_void_p,
        c_uint64,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyMaxPoolDescriptor.restype = c_int32
    lib.infiniopDestroyMaxPoolDescriptor.argtypes = [
        infiniopMaxPoolDescriptor_t,
    ]
    if args.cpu:
        test_cpu(lib, test_cases)
    if args.cuda:
        test_cuda(lib, test_cases)
    if args.bang:
        test_bang(lib, test_cases)
    # Default to CPU when no device flag was given.
    if not (args.cpu or args.cuda or args.bang):
        test_cpu(lib, test_cases)
    print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float, c_bool
import ctypes
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
CTensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
rearrange_tensor,
create_workspace,
)
from operatorspy.tests.test_utils import get_args
import torch
import torch.nn as nn
class MLPDescriptor(Structure):
    # Opaque handle mirroring the C-side MLP descriptor; only the
    # device id is exposed to Python.
    _fields_ = [("device", c_int32)]


infiniopMLPDescriptor_t = POINTER(MLPDescriptor)
def swiglu(a, b):
    """SwiGLU gating: a * b * sigmoid(b).

    The exponential is evaluated in fp32 for accuracy and cast back to
    b's dtype before the division.
    """
    denom = 1 + torch.exp(-b.float()).to(b.dtype)
    return a * b / denom
def mlp(y, x, w12, w3, alpha, residual):
    """Reference gated MLP.

    Splits w12 into gate (first half of columns) and up (second half),
    applies SwiGLU, projects with alpha * w3, and optionally adds the
    residual y. All matmuls run in fp32 and are cast back to x's dtype.
    """
    input_dtype = x.dtype
    intermediate_size = w3.shape[0]
    up = torch.matmul(
        x.to(torch.float32), w12[:, intermediate_size:].to(torch.float32)
    ).to(input_dtype)
    gate = torch.matmul(
        x.to(torch.float32), w12[:, 0:intermediate_size].to(torch.float32)
    ).to(input_dtype)
    # SwiGLU inlined: up * gate * sigmoid(gate), exp evaluated in fp32.
    act = up * gate / (1 + torch.exp(-gate.float()).to(gate.dtype))
    proj = torch.matmul(act.to(torch.float32), alpha * w3.to(torch.float32)).to(
        input_dtype
    )
    return proj + y if residual else proj
def test(
    lib,
    handle,
    torch_device,
    num_tokens,
    hidden_size,
    intermediate_size,
    alpha,
    residual,
    dtype=torch.float16,
    x_stride=None,
    y_stride=None,
    w12_stride=None,
    w3_stride=None,
    sync=None
):
    """Run one MLP case and compare the lib's in-place result in y against PyTorch."""
    print(
        f"Testing MLP on {torch_device} with num_tokens:{num_tokens} hidden_size:{hidden_size} intermediate_size:{intermediate_size}"
        f" alpha:{alpha} residual:{residual} dtype:{dtype} x_stride:{x_stride} y_stride:{y_stride} w12_stride:{w12_stride} w3_stride:{w3_stride}"
    )
    # Small magnitudes (* 0.01) keep the fp16 matmuls away from overflow.
    y = torch.rand([num_tokens, hidden_size], dtype=dtype).to(torch_device) * 0.01
    x = torch.rand([num_tokens, hidden_size], dtype=dtype).to(torch_device) * 0.01
    w12 = (
        torch.rand([hidden_size, 2 * intermediate_size], dtype=dtype).to(torch_device)
        * 0.01
    )
    w3 = (
        torch.rand([intermediate_size, hidden_size], dtype=dtype).to(torch_device)
        * 0.01
    )
    # Reference answer is computed before any re-striding.
    ans = mlp(y, x, w12, w3, alpha, residual)
    if x_stride is not None:
        x = rearrange_tensor(x, x_stride)
    if y_stride is not None:
        y = rearrange_tensor(y, y_stride)
    if w12_stride is not None:
        w12 = rearrange_tensor(w12, w12_stride)
    if w3_stride is not None:
        w3 = rearrange_tensor(w3, w3_stride)
    y_tensor = to_tensor(y, lib)
    x_tensor = to_tensor(x, lib)
    w12_tensor = to_tensor(w12, lib)
    w3_tensor = to_tensor(w3, lib)
    if sync is not None:
        sync()
    descriptor = infiniopMLPDescriptor_t()
    check_error(
        lib.infiniopCreateMLPDescriptor(
            handle,
            ctypes.byref(descriptor),
            y_tensor.descriptor,
            x_tensor.descriptor,
            w12_tensor.descriptor,
            w3_tensor.descriptor,
            alpha,
            residual,
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    y_tensor.descriptor.contents.invalidate()
    x_tensor.descriptor.contents.invalidate()
    w12_tensor.descriptor.contents.invalidate()
    w3_tensor.descriptor.contents.invalidate()
    workspace_size = c_uint64(0)
    check_error(
        lib.infiniopGetMLPWorkspaceSize(descriptor, ctypes.byref(workspace_size))
    )
    workspace = create_workspace(workspace_size.value, x.device)
    check_error(
        lib.infiniopMLP(
            descriptor,
            workspace.data_ptr() if workspace is not None else None,
            workspace_size.value,
            y_tensor.data,
            x_tensor.data,
            w12_tensor.data,
            w3_tensor.data,
            None,
        )
    )
    # The lib writes its result into y in place; looser rtol for fp16 matmul chains.
    assert torch.allclose(y, ans, atol=0, rtol=2e-2)
    check_error(lib.infiniopDestroyMLPDescriptor(descriptor))
def test_cpu(lib, test_cases):
    """Run every MLP test case on the CPU device."""
    device = DeviceEnum.DEVICE_CPU
    handle = create_handle(lib, device)
    for case in test_cases:
        # case = (num_tokens, hidden_size, intermediate_size, alpha,
        #         residual, dtype, x_stride, y_stride, w12_stride, w3_stride)
        test(lib, handle, "cpu", *case)
    destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
    """Run every MLP test case on the CUDA device."""
    device = DeviceEnum.DEVICE_CUDA
    handle = create_handle(lib, device)
    for case in test_cases:
        # case = (num_tokens, hidden_size, intermediate_size, alpha,
        #         residual, dtype, x_stride, y_stride, w12_stride, w3_stride)
        test(lib, handle, "cuda", *case)
    destroy_handle(lib, handle)
def test_bang(lib, test_cases):
    """Run every MLP test case on the Cambricon BANG (MLU) device."""
    import torch_mlu  # deferred: only needed when MLU is requested

    device = DeviceEnum.DEVICE_BANG
    handle = create_handle(lib, device)
    for case in test_cases:
        # case = (num_tokens, hidden_size, intermediate_size, alpha,
        #         residual, dtype, x_stride, y_stride, w12_stride, w3_stride)
        test(lib, handle, "mlu", *case)
    destroy_handle(lib, handle)
if __name__ == "__main__":
    test_cases = [
        # num_tokens, hidden_size, intermediate_size, alpha, residual, dtype, x_stride, y_stride, w12_stride, w3_stride
        (4, 4096, 11008, 1.0, True, torch.float16, None, None, None, None),
        (4, 4096, 11008, 1.0, True, torch.float16, [8192, 1], [8192, 1], None, None),
        (
            4,
            4096,
            11008,
            1.0,
            True,
            torch.float16,
            None,
            None,
            [1, 4096],
            [1, 11008],
        ),
        (4, 4096, 11008, 1.0, False, torch.float16, None, None, None, None),
        (4, 4096, 11008, 1.0, False, torch.float16, [8192, 1], [8192, 1], None, None),
    ]
    args = get_args()
    lib = open_lib()
    # Declare the C signatures of the MLP entry points so ctypes
    # marshals arguments and return codes correctly.
    lib.infiniopCreateMLPDescriptor.restype = c_int32
    lib.infiniopCreateMLPDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopMLPDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        c_float,
        c_bool,
    ]
    lib.infiniopGetMLPWorkspaceSize.restype = c_int32
    lib.infiniopGetMLPWorkspaceSize.argtypes = [
        infiniopMLPDescriptor_t,
        POINTER(c_uint64),
    ]
    lib.infiniopMLP.restype = c_int32
    lib.infiniopMLP.argtypes = [
        infiniopMLPDescriptor_t,
        c_void_p,
        c_uint64,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyMLPDescriptor.restype = c_int32
    lib.infiniopDestroyMLPDescriptor.argtypes = [
        infiniopMLPDescriptor_t,
    ]
    if args.cpu:
        test_cpu(lib, test_cases)
    if args.cuda:
        test_cuda(lib, test_cases)
    if args.bang:
        test_bang(lib, test_cases)
    # Default to CPU when no device flag was given.
    if not (args.cpu or args.cuda or args.bang):
        test_cpu(lib, test_cases)
    print("\033[92mTest passed!\033[0m")
import torch
import ctypes
from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64
from ctypes import c_uint64
from libinfiniop import (
infiniopHandle_t,
infiniopTensorDescriptor_t,
open_lib,
to_tensor,
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
rearrange_if_needed,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
create_workspace,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
from enum import Enum, auto
......@@ -58,126 +59,93 @@ _TEST_CASES = [
]
# Data types used for testing
_TENSOR_DTYPES = [torch.float16, torch.float32]
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16]
# Tolerance map for different data types
_TOLERANCE_MAP = {
torch.float16: {"atol": 1e-3, "rtol": 1e-3},
torch.float32: {"atol": 1e-7, "rtol": 1e-7},
InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7},
InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-3},
}
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class MulDescriptor(Structure):
    # Legacy opaque handle for the C-side Mul descriptor; only the
    # device id is exposed to Python.
    _fields_ = [("device", c_int32)]


infiniopMulDescriptor_t = POINTER(MulDescriptor)
def mul(x, y):
    """Element-wise product of x and y (PyTorch reference)."""
    return x * y
def process_tensors(c, c_strides, a, a_stride, b, b_stride, inplace):
    """
    rearrange the tensors if needed and apply the inplace config.
    if inplace is true and the output (i.e., c) is placed to the broadcasted input,
    the inplace config is ignored and out-of-place is used
    """
    # Remember c's unbroadcast strides so they can be restored below.
    original_c_strides = c_strides if c_strides else c.stride()

    def _rearrange(tensor, strides):
        # A 0 stride marks a broadcast dimension; install the view via set_()
        # directly, since a data copy cannot reproduce broadcast strides.
        if strides and 0 in strides:
            tensor.set_(tensor.untyped_storage(), 0, tensor.shape, strides)
            return tensor
        else:
            return rearrange_if_needed(tensor, strides)

    a, b, c = [
        _rearrange(tensor, stride)
        for tensor, stride in zip([a, b, c], [a_stride, b_stride, c_strides])
    ]
    # Apply the inplace config: the output aliases a or b when requested.
    c = (
        c
        if inplace == Inplace.OUT_OF_PLACE
        else (a if inplace == Inplace.INPLACE_A else b)
    )
    # if inplace is true and c has broadcasted config, reset it to the original unbroadcasted strides
    if 0 in c.stride():
        c.set_(c.untyped_storage(), 0, c.shape, original_c_strides)
    return a, b, c
def mul(c, a, b):
    # In-place reference: writes a * b into the pre-allocated output c.
    torch.mul(a, b, out=c)
def test(
lib,
handle,
torch_device,
device,
shape,
a_stride=None,
b_stride=None,
c_stride=None,
inplace=Inplace.OUT_OF_PLACE,
dtype=torch.float16,
dtype=InfiniDtype.F16,
sync=None,
):
a = TestTensor(shape, a_stride, dtype, device)
b = TestTensor(shape, b_stride, dtype, device)
if inplace == Inplace.INPLACE_A:
if c_stride is not None and c_stride != a_stride:
return
c = a
elif inplace == Inplace.INPLACE_B:
if c_stride is not None and c_stride != b_stride:
return
c = b
else:
c = TestTensor(shape, c_stride, dtype, device)
if c.is_broadcast():
return
print(
f"Testing Mul on {torch_device} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} "
f"dtype:{dtype} inplace:{inplace}"
f"Testing Mul on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} "
f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
)
mul(c.torch_tensor(), a.torch_tensor(), b.torch_tensor())
a = torch.rand(shape, dtype=dtype).to(torch_device)
b = torch.rand(shape, dtype=dtype).to(torch_device)
c = torch.rand(shape, dtype=dtype).to(torch_device)
a, b, c = process_tensors(c, c_stride, a, a_stride, b, b_stride, inplace)
ans = mul(a, b)
a_tensor, b_tensor = [to_tensor(tensor, lib) for tensor in [a, b]]
c_tensor = (
to_tensor(c, lib)
if inplace == Inplace.OUT_OF_PLACE
else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor)
)
if sync is not None:
sync()
descriptor = infiniopMulDescriptor_t()
descriptor = infiniopOperatorDescriptor_t()
check_error(
lib.infiniopCreateMulDescriptor(
LIBINFINIOP.infiniopCreateMulDescriptor(
handle,
ctypes.byref(descriptor),
c_tensor.descriptor,
a_tensor.descriptor,
b_tensor.descriptor,
c.descriptor,
a.descriptor,
b.descriptor,
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
for tensor in [a_tensor, b_tensor, c_tensor]:
tensor.destroyDesc(lib)
for tensor in [a, b, c]:
tensor.destroy_desc()
workspace_size = c_uint64(0)
check_error(
lib.infiniopGetMulWorkspaceSize(descriptor, ctypes.byref(workspace_size))
LIBINFINIOP.infiniopGetMulWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = create_workspace(workspace_size.value, c.device)
workspace = TestWorkspace(workspace_size.value, c.device)
def lib_mul():
check_error(
lib.infiniopMul(
LIBINFINIOP.infiniopMul(
descriptor,
workspace.data_ptr() if workspace is not None else None,
workspace.data(),
workspace_size.value,
c_tensor.data,
a_tensor.data,
b_tensor.data,
c.data(),
a.data(),
b.data(),
None,
)
)
......@@ -186,52 +154,20 @@ def test(
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(c, ans, atol=atol, rtol=rtol)
assert torch.allclose(c, ans, atol=atol, rtol=rtol)
debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol)
assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol)
# Profiling workflow
if PROFILE:
# fmt: off
profile_operation("PyTorch", lambda: mul(a, b), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_mul(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation("PyTorch", lambda: mul(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_mul(), device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on
check_error(lib.infiniopDestroyMulDescriptor(descriptor))
check_error(LIBINFINIOP.infiniopDestroyMulDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
lib = open_lib()
lib.infiniopCreateMulDescriptor.restype = c_int32
lib.infiniopCreateMulDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopMulDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopGetMulWorkspaceSize.restype = c_int32
lib.infiniopGetMulWorkspaceSize.argtypes = [
infiniopMulDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopMul.restype = c_int32
lib.infiniopMul.argtypes = [
infiniopMulDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyMulDescriptor.restype = c_int32
lib.infiniopDestroyMulDescriptor.argtypes = [
infiniopMulDescriptor_t,
]
# Configure testing options
DEBUG = args.debug
......@@ -240,7 +176,6 @@ if __name__ == "__main__":
NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args):
test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES)
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
import torch
import ctypes
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float
from ctypes import c_uint64
from libinfiniop import (
InfiniDtype,
infiniopHandle_t,
infiniopTensorDescriptor_t,
open_lib,
to_tensor,
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
create_workspace,
test_operator,
get_args,
debug_all,
get_tolerance,
profile_operation,
synchronize_device,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
# ==============================================================================
......@@ -37,11 +37,11 @@ _TEST_CASES = [
]
# Data types used for testing
_TENSOR_DTYPES = [torch.float16, torch.bfloat16]
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16]
_TOLERANCE_MAP = {
torch.float16: {"atol": 0, "rtol": 0},
torch.bfloat16: {"atol": 0, "rtol": 0},
InfiniDtype.F16: {"atol": 0, "rtol": 0},
InfiniDtype.BF16: {"atol": 0, "rtol": 0},
}
......@@ -51,13 +51,6 @@ NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class RandomSampleDescriptor(Structure):
_fields_ = [("device", c_int32)]
infiniopRandomSampleDescriptor_t = POINTER(RandomSampleDescriptor)
def random_sample(data, random_val, topp, topk, voc, temperature):
if topp > 0 and topk > 1:
sorted_vals, sorted_indices = torch.sort(data, descending=True)
......@@ -68,81 +61,83 @@ def random_sample(data, random_val, topp, topk, voc, temperature):
k_index = min(topk, voc) - 1
threshold = min(cum_probs[k_index], topp) * random_val
try:
idx = torch.searchsorted(cum_probs, threshold)
except Exception:
# Fallback for manual search if torch.searchsorted is not supported
indices = (cum_probs >= threshold).nonzero(as_tuple=True)[0]
idx = indices[0] if indices.numel() > 0 else torch.tensor(len(cum_probs)-1, device=cum_probs.device)
idx = (
indices[0]
if indices.numel() > 0
else torch.tensor(len(cum_probs) - 1, device=cum_probs.device)
)
return sorted_indices[idx]
return torch.argmax(data)
def test(
lib,
handle,
torch_device,
device,
voc,
random_val,
topp,
topk,
temperature,
dtype=torch.float16,
dtype=InfiniDtype.F16,
sync=None,
):
print(
f"Testing RandomSample on {torch_device} with voc:{voc} random_val:{random_val} topp:{topp} topk:{topk} temperature:{temperature} dtype:{dtype}"
f"Testing RandomSample on {InfiniDeviceNames[device]} with voc:{voc} random_val:{random_val} topp:{topp} topk:{topk} temperature:{temperature} dtype:{InfiniDtypeNames[dtype]}"
)
data = torch.arange(voc).float() * 0.0001
_perm = torch.randperm(voc)
data = data[_perm].to(dtype).to(torch_device)
logits = TestTensor.from_torch(
torch.arange(voc)[_perm].float() * 0.0001, dtype, device
)
ans = random_sample(
data, random_val, topp, topk, voc, temperature
logits.torch_tensor(), random_val, topp, topk, voc, temperature
).to(
torch.int32
) # 这个函数在device速度可能会很慢,可以通过data.to("cpu")方式加快计算过程
indices = torch.zeros([], dtype=torch.int64).to(torch_device)
x_tensor, indices_tensor = [to_tensor(tensor, lib) for tensor in [data, indices]]
indices_tensor.descriptor.contents.dt = InfiniDtype.U64 # treat int64 as uint64
indices = TestTensor([], None, InfiniDtype.I32, device, mode="zeros")
if sync is not None:
sync()
descriptor = infiniopRandomSampleDescriptor_t()
descriptor = infiniopOperatorDescriptor_t()
check_error(
lib.infiniopCreateRandomSampleDescriptor(
LIBINFINIOP.infiniopCreateRandomSampleDescriptor(
handle,
ctypes.byref(descriptor),
indices_tensor.descriptor,
x_tensor.descriptor,
indices.descriptor,
logits.descriptor,
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
for tensor in [x_tensor, indices_tensor]:
tensor.destroyDesc(lib)
for tensor in [logits, indices]:
tensor.destroy_desc()
workspace_size = c_uint64(0)
check_error(
lib.infiniopGetRandomSampleWorkspaceSize(
LIBINFINIOP.infiniopGetRandomSampleWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = create_workspace(workspace_size.value, torch_device)
workspace = TestWorkspace(workspace_size.value, device)
def lib_random_sample():
check_error(
lib.infiniopRandomSample(
LIBINFINIOP.infiniopRandomSample(
descriptor,
workspace.data_ptr() if workspace is not None else None,
workspace.data(),
workspace_size.value,
indices_tensor.data,
x_tensor.data,
indices.data(),
logits.data(),
random_val,
topp,
topk,
......@@ -153,66 +148,36 @@ def test(
lib_random_sample()
if torch_device == "npu":
synchronize_device(torch_device)
if sync is not None:
sync()
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug_all(
(indices.type(ans.dtype), data[indices]),
(ans, data[ans]),
(indices.actual_tensor(), logits.actual_tensor()[indices.actual_tensor()]),
(ans, logits.torch_tensor()[ans]),
"or",
atol=atol,
rtol=rtol,
)
assert indices.type(ans.dtype) == ans or data[ans] == data[indices]
assert (
indices.actual_tensor() == ans
or logits.actual_tensor()[indices.actual_tensor()] == logits.torch_tensor()[ans]
)
# Profiling workflow
if PROFILE:
# fmt: off
profile_operation("PyTorch", lambda: random_sample(
data, random_val, topp, topk, voc, temperature
), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_random_sample(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
logits.torch_tensor(), random_val, topp, topk, voc, temperature
), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_random_sample(), device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on
check_error(lib.infiniopDestroyRandomSampleDescriptor(descriptor))
check_error(LIBINFINIOP.infiniopDestroyRandomSampleDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
lib = open_lib()
lib.infiniopCreateRandomSampleDescriptor.restype = c_int32
lib.infiniopCreateRandomSampleDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopRandomSampleDescriptor_t),
infiniopTensorDescriptor_t,
]
lib.infiniopGetRandomSampleWorkspaceSize.restype = c_int32
lib.infiniopGetRandomSampleWorkspaceSize.argtypes = [
infiniopRandomSampleDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopRandomSample.restype = c_int32
lib.infiniopRandomSample.argtypes = [
infiniopRandomSampleDescriptor_t,
c_void_p,
c_uint64,
c_uint64,
c_void_p,
c_float,
c_float,
c_int32,
c_float,
c_void_p,
]
lib.infiniopDestroyRandomSampleDescriptor.restype = c_int32
lib.infiniopDestroyRandomSampleDescriptor.argtypes = [
infiniopRandomSampleDescriptor_t,
]
DEBUG = args.debug
PROFILE = args.profile
......@@ -221,6 +186,6 @@ if __name__ == "__main__":
# Execute tests
for device in get_test_devices(args):
test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES)
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
import torch
import ctypes
from ctypes import POINTER, Structure, c_int32, c_void_p
from libinfiniop import (
infiniopHandle_t,
infiniopTensorDescriptor_t,
open_lib,
to_tensor,
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
rearrange_if_needed,
rearrange_tensor,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
def row_major_strides(shape):
"""生成张量的行优先(C风格)stride
Args:
shape: 张量形状
Returns:
行优先strides列表
"""
......@@ -34,12 +34,13 @@ def row_major_strides(shape):
strides.insert(0, stride)
return strides
def column_major_strides(shape):
"""生成张量的列优先(Fortran风格)stride
Args:
shape: 张量形状
Returns:
列优先strides列表
"""
......@@ -52,62 +53,37 @@ def column_major_strides(shape):
return strides
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
_TEST_CASES = [
# (shape, x_stride, y_stride)
((100, 100), (1, 100), (100, 1)), # shape # x_stride # y_stride
((4, 4), (1, 4), (4, 1)), # shape # x_stride # y_stride
((4, 6, 64), (64, 4 * 64, 1), (6 * 64, 64, 1)), # shape # x_stride # y_stride
((2000, 2000), (1, 2000), (2000, 1)), # shape # x_stride # y_stride
((2001, 2001), (1, 2001), (2001, 1)), # shape # x_stride # y_stride
((2, 2, 2, 4), (16, 8, 4, 1), (16, 8, 1, 2)), # shape # x_stride # y_stride
(
(100, 100), # shape
(1, 100), # x_stride
(100, 1) # y_stride
),
(
(4, 4), # shape
(1, 4), # x_stride
(4, 1) # y_stride
),
(
(4, 6, 64), # shape
(64, 4*64, 1), # x_stride
(6*64, 64, 1) # y_stride
),
(
(2000, 2000), # shape
(1, 2000), # x_stride
(2000, 1) # y_stride
(3, 4, 7, 53, 9), # shape
row_major_strides((3, 4, 7, 53, 9)), # x_stride
column_major_strides((3, 4, 7, 53, 9)), # y_stride
),
(
(2001, 2001), # shape
(1, 2001), # x_stride
(2001, 1) # y_stride
),
(
(2, 2, 2, 4), # shape
(16, 8, 4, 1), # x_stride
(16, 8, 1, 2) # y_stride
),
(
(3, 4, 7, 53, 9), # shape
row_major_strides((3, 4, 7, 53, 9)), # x_stride
column_major_strides((3, 4, 7, 53, 9)) # y_stride
),
(
(3, 4, 50, 50, 5, 7), # shape
(3, 4, 50, 50, 5, 7), # shape
row_major_strides((3, 4, 50, 50, 5, 7)), # x_stride
column_major_strides((3, 4, 50, 50, 5, 7)) # y_stride
column_major_strides((3, 4, 50, 50, 5, 7)), # y_stride
),
]
# Data types used for testing
_TENSOR_DTYPES = [torch.float16, torch.float32]
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32]
# Tolerance map for different data types
_TOLERANCE_MAP = {
torch.float16: {"atol": 0, "rtol": 0},
torch.float32: {"atol": 0, "rtol": 0},
InfiniDtype.F16: {"atol": 0, "rtol": 0},
InfiniDtype.F32: {"atol": 0, "rtol": 0},
}
DEBUG = False
......@@ -116,106 +92,60 @@ NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class RearrangeDescriptor(Structure):
_fields_ = [("device", c_int32)]
infiniopRearrangeDescriptor_t = POINTER(RearrangeDescriptor)
def rearrange_torch(x, x_shape, y_stride):
    # Build a copy of x whose underlying layout uses y_stride, then fill it
    # element-wise so the values match x under the new strides.
    y_ = x.clone()
    y_.set_(y_.untyped_storage(), 0, x_shape, y_stride)
    y_[:] = x.view_as(y_)
    return y_
def rearrange_torch(y, x, x_shape, y_stride):
    # Re-stride y in place to y_stride, then copy x's values into it.
    # NOTE(review): mutates y; returns None by design.
    y.set_(y.untyped_storage(), 0, x_shape, y_stride)
    y[:] = x.view_as(y)
def test(
lib,
handle,
torch_device,
shape,
x_stride,
y_stride,
dtype=torch.float16,
sync=None
handle, torch_device, shape, x_stride, y_stride, dtype=InfiniDtype.F16, sync=None
):
print(
f"Testing Rerrange on {torch_device} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} dtype:{dtype}"
f"Testing Rerrange on {InfiniDeviceNames[torch_device]} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} dtype:{InfiniDtypeNames[dtype]}"
)
x = torch.rand(shape, dtype=dtype).to(torch_device)
y = torch.zeros(shape, dtype=dtype).to(torch_device)
x = TestTensor(shape, x_stride, dtype, device)
y = TestTensor(shape, y_stride, dtype, device, mode="ones")
rearrange_torch(x, shape, y_stride)
rearrange_torch(y.torch_tensor(), x.torch_tensor(), shape, y_stride)
x, y = [
rearrange_if_needed(tensor, stride)
for tensor, stride in zip([x, y], [x_stride, y_stride])
]
x_tensor, y_tensor = [to_tensor(tensor, lib) for tensor in [x, y]]
if sync is not None:
sync()
descriptor = infiniopRearrangeDescriptor_t()
descriptor = infiniopOperatorDescriptor_t()
check_error(
lib.infiniopCreateRearrangeDescriptor(
handle, ctypes.byref(descriptor), y_tensor.descriptor, x_tensor.descriptor
LIBINFINIOP.infiniopCreateRearrangeDescriptor(
handle, ctypes.byref(descriptor), y.descriptor, x.descriptor
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
for tensor in [x_tensor, y_tensor]:
tensor.destroyDesc(lib)
for tensor in [x, y]:
tensor.destroy_desc()
def lib_rearrange():
check_error(
lib.infiniopRearrange(descriptor, y_tensor.data, x_tensor.data, None)
)
check_error(LIBINFINIOP.infiniopRearrange(descriptor, y.data(), x.data(), None))
lib_rearrange()
# Validate results
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(x, y, atol=atol, rtol=rtol)
assert torch.allclose(x, y, atol=atol, rtol=rtol)
debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
assert torch.allclose(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
# Profiling workflow
if PROFILE:
# fmt: off
profile_operation("PyTorch", lambda: rearrange_torch(x, shape, y_stride), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_rearrange(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation("PyTorch", lambda: rearrange_torch(y.torch_tensor(), x.torch_tensor(), shape, y_stride), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_rearrange(), device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on
check_error(lib.infiniopDestroyRearrangeDescriptor(descriptor))
check_error(LIBINFINIOP.infiniopDestroyRearrangeDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
lib = open_lib()
lib.infiniopCreateRearrangeDescriptor.restype = c_int32
lib.infiniopCreateRearrangeDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopRearrangeDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopRearrange.restype = c_int32
lib.infiniopRearrange.argtypes = [
infiniopRearrangeDescriptor_t,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyRearrangeDescriptor.restype = c_int32
lib.infiniopDestroyRearrangeDescriptor.argtypes = [infiniopRearrangeDescriptor_t]
# Configure testing options
DEBUG = args.debug
PROFILE = args.profile
......@@ -224,6 +154,6 @@ if __name__ == "__main__":
# Execute tests
for device in get_test_devices(args):
test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES)
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_void_p
import ctypes
import sys
import os
import time
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
)
from operatorspy.tests.test_utils import get_args
from ctypes import c_uint64
from enum import Enum, auto
import torch
from libinfiniop import (
LIBINFINIOP,
InfiniDeviceNames,
InfiniDtype,
InfiniDtypeNames,
TestTensor,
TestWorkspace,
check_error,
debug,
get_args,
get_test_devices,
get_tolerance,
infiniopOperatorDescriptor_t,
profile_operation,
test_operator,
)
# constant for control whether profile the pytorch and lib functions
# NOTE: need to manually add synchronization function to the lib function,
# e.g., cudaDeviceSynchronize() for CUDA
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
_TEST_CASES_ = [
# tensor_shape, inplace
# TODO: Uncomment the following line.
# ((),),
((1, 3),),
((3, 3),),
((32, 20, 512),),
((33, 333, 333),),
((32, 256, 112, 112),),
((3, 3, 13, 9, 17),),
]
class Inplace(Enum):
......@@ -33,160 +42,121 @@ class Inplace(Enum):
INPLACE_X = auto()
class ReluDescriptor(Structure):
    """ctypes mirror of the library's Relu descriptor handle.

    Only the int32 `device` field is declared here; the rest of the native
    struct is opaque to Python and accessed solely through POINTER(...) handles.
    """

    _fields_ = [("device", c_int32)]
# Inplace options applied for each test case in _TEST_CASES_
_INPLACE = [
Inplace.OUT_OF_PLACE,
Inplace.INPLACE_X,
]
# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_
_TEST_CASES = [
test_case + (inplace_item,)
for test_case in _TEST_CASES_
for inplace_item in _INPLACE
]
infiniopReluDescriptor_t = POINTER(ReluDescriptor)
# Data types used for testing
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16]
# Tolerance map for different data types
_TOLERANCE_MAP = {
InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7},
InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-3},
}
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
def relu(x):
    """Reference ReLU computed with PyTorch, cast back to the input dtype.

    When the module-level PROFILE flag is set, a CUDA synchronize is issued
    after the op so wall-clock timing of the PyTorch path is accurate.
    """
    out = torch.nn.functional.relu(x).to(x.dtype)
    if PROFILE:
        torch.cuda.synchronize()
    return out
def test(
lib,
handle,
torch_device,
tensor_shape,
tensor_dtype=torch.float16,
inplace=Inplace.OUT_OF_PLACE,
sync=None
handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=torch.float16, sync=None
):
print(
f"Testing Relu on {torch_device} with tensor_shape:{tensor_shape} dtype:{tensor_dtype} inplace: {inplace.name}"
x_torch_tensor = torch.rand(shape) * 2 - 1
x = TestTensor(
shape,
x_torch_tensor.stride(),
dtype,
device,
mode="manual",
set_tensor=x_torch_tensor,
)
x = torch.rand(tensor_shape, dtype=tensor_dtype).to(torch_device) * 2 - 1
y = (
torch.rand(tensor_shape, dtype=tensor_dtype).to(torch_device)
if inplace == Inplace.OUT_OF_PLACE
else x
)
if inplace == Inplace.INPLACE_X:
y = x
else:
y = TestTensor(shape, None, dtype, device)
for i in range(NUM_PRERUN if PROFILE else 1):
ans = relu(x)
if PROFILE:
start_time = time.time()
for i in range(NUM_ITERATIONS):
_ = relu(x)
elapsed = (time.time() - start_time) / NUM_ITERATIONS
print(f"pytorch time: {elapsed :6f}")
if y.is_broadcast():
return
x_tensor = to_tensor(x, lib)
y_tensor = to_tensor(y, lib) if inplace == Inplace.OUT_OF_PLACE else x_tensor
print(
f"Testing Relu on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}"
)
ans = relu(x.torch_tensor())
if sync is not None:
sync()
sync()
descriptor = infiniopReluDescriptor_t()
descriptor = infiniopOperatorDescriptor_t()
check_error(
lib.infiniopCreateReluDescriptor(
handle,
ctypes.byref(descriptor),
y_tensor.descriptor,
x_tensor.descriptor,
LIBINFINIOP.infiniopCreateReluDescriptor(
handle, ctypes.byref(descriptor), y.descriptor, x.descriptor
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
x_tensor.descriptor.contents.invalidate()
y_tensor.descriptor.contents.invalidate()
for tensor in [x, y]:
tensor.destroy_desc()
for i in range(NUM_PRERUN if PROFILE else 1):
check_error(lib.infiniopRelu(descriptor, y_tensor.data, x_tensor.data, None))
if PROFILE:
start_time = time.time()
for i in range(NUM_ITERATIONS):
check_error(
lib.infiniopRelu(descriptor, y_tensor.data, x_tensor.data, None)
)
elapsed = (time.time() - start_time) / NUM_ITERATIONS
print(f" lib time: {elapsed :6f}")
assert torch.allclose(y, ans, atol=0, rtol=1e-3)
check_error(lib.infiniopDestroyReluDescriptor(descriptor))
def test_cpu(lib, test_cases):
device = DeviceEnum.DEVICE_CPU
handle = create_handle(lib, device)
for tensor_shape, inplace in test_cases:
# fmt: off
test(lib, handle, "cpu", tensor_shape, tensor_dtype=torch.float16, inplace=inplace)
test(lib, handle, "cpu", tensor_shape, tensor_dtype=torch.float32, inplace=inplace)
# fmt: on
destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
workspace_size = c_uint64(0)
check_error(
LIBINFINIOP.infiniopGetReluWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = TestWorkspace(workspace_size.value, y.device)
device = DeviceEnum.DEVICE_CUDA
handle = create_handle(lib, device)
for tensor_shape, inplace in test_cases:
# fmt: off
test(lib, handle, "cuda", tensor_shape, tensor_dtype=torch.float16, inplace=inplace)
test(lib, handle, "cuda", tensor_shape, tensor_dtype=torch.float32, inplace=inplace)
# fmt: on
destroy_handle(lib, handle)
def lib_relu():
LIBINFINIOP.infiniopRelu(
descriptor, workspace.data(), workspace.size(), y.data(), x.data(), None
)
lib_relu()
def test_bang(lib, test_cases):
import torch_mlu
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(y.actual_tensor(), ans, atol=atol, rtol=rtol)
assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol)
device = DeviceEnum.DEVICE_BANG
handle = create_handle(lib, device)
for tensor_shape, inplace in test_cases:
# Profiling workflow
if PROFILE:
# fmt: off
test(lib, handle, "mlu", tensor_shape, tensor_dtype=torch.float16, inplace=inplace)
test(lib, handle, "mlu", tensor_shape, tensor_dtype=torch.float32, inplace=inplace)
profile_operation("PyTorch", lambda: relu(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_relu(), device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on
destroy_handle(lib, handle)
check_error(LIBINFINIOP.infiniopDestroyReluDescriptor(descriptor))
if __name__ == "__main__":
test_cases = [
# tensor_shape, inplace
((), Inplace.OUT_OF_PLACE),
((), Inplace.INPLACE_X),
((1, 3), Inplace.OUT_OF_PLACE),
((3, 3), Inplace.OUT_OF_PLACE),
((3, 3, 13, 9, 17), Inplace.INPLACE_X),
((32, 20, 512), Inplace.INPLACE_X),
((33, 333, 333), Inplace.OUT_OF_PLACE),
((32, 256, 112, 112), Inplace.OUT_OF_PLACE),
]
args = get_args()
lib = open_lib()
lib.infiniopCreateReluDescriptor.restype = c_int32
lib.infiniopCreateReluDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopReluDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopRelu.restype = c_int32
lib.infiniopRelu.argtypes = [
infiniopReluDescriptor_t,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyReluDescriptor.restype = c_int32
lib.infiniopDestroyReluDescriptor.argtypes = [
infiniopReluDescriptor_t,
]
if args.cpu:
test_cpu(lib, test_cases)
if args.cuda:
test_cuda(lib, test_cases)
if args.bang:
test_bang(lib, test_cases)
if not (args.cpu or args.cuda or args.bang):
test_cpu(lib, test_cases)
# Configure testing options
DEBUG = args.debug
PROFILE = args.profile
NUM_PRERUN = args.num_prerun
NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args):
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float
import ctypes
import torch
import ctypes
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float
from ctypes import c_uint64
from libinfiniop import (
infiniopHandle_t,
infiniopTensorDescriptor_t,
open_lib,
to_tensor,
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
rearrange_if_needed,
create_workspace,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
# ==============================================================================
......@@ -33,23 +32,21 @@ _TEST_CASES_ = [
((16, 2048), (16, 2048), (2048,), (4096, 1), (4096, 1)),
]
# w (weight) types
# w (weight) types
# Note: 'None' means the same as input dtype
_WEIGHT_DTYPES = [None, torch.float32]
_WEIGHT_DTYPES = [None, InfiniDtype.F32]
# x types used for testing
_TENSOR_DTYPES = [torch.float16, torch.bfloat16]
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16]
# Form the test cases by appending each element of _WEIGHT_DTYPES to each tuple in _TEST_CASES_
_TEST_CASES = [
test_case + (w_dtype,)
for test_case in _TEST_CASES_
for w_dtype in _WEIGHT_DTYPES
test_case + (w_dtype,) for test_case in _TEST_CASES_ for w_dtype in _WEIGHT_DTYPES
]
# Tolerance map for different data types
_TOLERANCE_MAP = {
torch.float16: {"atol": 2e-3, "rtol": 2e-3},
torch.bfloat16: {"atol": 8e-3, "rtol": 8e-3},
InfiniDtype.F16: {"atol": 2e-3, "rtol": 2e-3},
InfiniDtype.BF16: {"atol": 8e-3, "rtol": 8e-3},
}
DEBUG = False
......@@ -58,13 +55,6 @@ NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class RMSNormDescriptor(Structure):
    """ctypes mirror of the library's RMSNorm descriptor handle.

    Only the int32 `device` field is declared; the native struct is otherwise
    opaque and passed around via POINTER(...) handles.
    """

    _fields_ = [("device", c_int32)]
infiniopRMSNormDescriptor_t = POINTER(RMSNormDescriptor)
def rms_norm(ans, x, w, eps):
torch.pow(x, 2, out=ans)
mean = torch.mean(ans, dim=-1, keepdim=True)
......@@ -75,73 +65,67 @@ def rms_norm(ans, x, w, eps):
def test(
lib,
handle,
torch_device,
device,
y_shape,
x_shape,
w_shape,
y_stride,
x_stride,
w_dtype=torch.float16,
dtype=torch.float16,
w_dtype=InfiniDtype.F32,
dtype=InfiniDtype.F16,
sync=None,
):
w_dtype = w_dtype if w_dtype else dtype
print(
f"Testing RMS_Norm on {torch_device} with y_shape:{y_shape} x_shape:{x_shape} w_shape:{w_shape}"
f" y_stride:{y_stride} x_stride:{x_stride} w_dtype:{w_dtype} dtype:{dtype}"
f"Testing RMS_Norm on {InfiniDeviceNames[device]} with y_shape:{y_shape} x_shape:{x_shape} w_shape:{w_shape}"
f" y_stride:{y_stride} x_stride:{x_stride} w_dtype:{InfiniDtypeNames[w_dtype]} dtype:{InfiniDtypeNames[dtype]}"
)
w_dtype = w_dtype if w_dtype else dtype
y = torch.zeros(y_shape, dtype=dtype).to(torch_device)
x = torch.rand(x_shape, dtype=dtype).to(torch_device)
w = torch.rand(w_shape, dtype=w_dtype).to(torch_device)
ans = torch.zeros(y_shape, dtype=dtype).to(torch_device)
eps = 1e-5
rms_norm(ans, x, w, eps)
y = TestTensor(y_shape, y_stride, dtype, device, mode="ones")
x = TestTensor(x_shape, x_stride, dtype, device, scale=0.01)
w = TestTensor(w_shape, None, w_dtype, device)
x, y = [
rearrange_if_needed(tensor, stride)
for tensor, stride in zip([x, y], [x_stride, y_stride])
]
x_tensor, y_tensor, w_tensor = [to_tensor(tensor, lib) for tensor in [x, y, w]]
eps = 1e-6
rms_norm(y.torch_tensor(), x.torch_tensor(), w.torch_tensor(), eps)
if sync is not None:
sync()
descriptor = infiniopRMSNormDescriptor_t()
descriptor = infiniopOperatorDescriptor_t()
check_error(
lib.infiniopCreateRMSNormDescriptor(
LIBINFINIOP.infiniopCreateRMSNormDescriptor(
handle,
ctypes.byref(descriptor),
y_tensor.descriptor,
x_tensor.descriptor,
w_tensor.descriptor,
y.descriptor,
x.descriptor,
w.descriptor,
eps,
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
for tensor in [x_tensor, y_tensor, w_tensor]:
tensor.destroyDesc(lib)
for tensor in [x, y, w]:
tensor.destroy_desc()
workspace_size = c_uint64(0)
check_error(
lib.infiniopGetRMSNormWorkspaceSize(descriptor, ctypes.byref(workspace_size))
LIBINFINIOP.infiniopGetRMSNormWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = create_workspace(workspace_size.value, y.device)
workspace = TestWorkspace(workspace_size.value, y.device)
def lib_rms_norm():
check_error(
lib.infiniopRMSNorm(
LIBINFINIOP.infiniopRMSNorm(
descriptor,
workspace.data_ptr() if workspace is not None else None,
workspace.data(),
workspace_size.value,
y_tensor.data,
x_tensor.data,
w_tensor.data,
y.data(),
x.data(),
w.data(),
None,
)
)
......@@ -150,53 +134,20 @@ def test(
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(y, ans, atol=atol, rtol=rtol)
assert torch.allclose(y, ans, atol=atol, rtol=rtol)
debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
assert torch.allclose(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
# Profiling workflow
if PROFILE:
# fmt: off
profile_operation("PyTorch", lambda: rms_norm(ans, x, w, eps), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_rms_norm(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation("PyTorch", lambda: rms_norm(y.torch_tensor(), x.torch_tensor(), w.torch_tensor(), eps), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_rms_norm(), device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on
check_error(lib.infiniopDestroyRMSNormDescriptor(descriptor))
check_error(LIBINFINIOP.infiniopDestroyRMSNormDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
lib = open_lib()
lib.infiniopCreateRMSNormDescriptor.restype = c_int32
lib.infiniopCreateRMSNormDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopRMSNormDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
c_float,
]
lib.infiniopGetRMSNormWorkspaceSize.restype = c_int32
lib.infiniopGetRMSNormWorkspaceSize.argtypes = [
infiniopRMSNormDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopRMSNorm.restype = c_int32
lib.infiniopRMSNorm.argtypes = [
infiniopRMSNormDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyRMSNormDescriptor.restype = c_int32
lib.infiniopDestroyRMSNormDescriptor.argtypes = [
infiniopRMSNormDescriptor_t,
]
# Configure testing options
DEBUG = args.debug
......@@ -206,6 +157,6 @@ if __name__ == "__main__":
# Execute tests
for device in get_test_devices(args):
test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES)
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
import torch
import ctypes
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p
from ctypes import c_uint64
from libinfiniop import (
infiniopHandle_t,
infiniopTensorDescriptor_t,
open_lib,
to_tensor,
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
rearrange_if_needed,
create_workspace,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
synchronize_device,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceEnum,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
from enum import Enum, auto
......@@ -35,13 +36,13 @@ _TEST_CASES_ = [
]
# Data types used for testing
_TENSOR_DTYPES = [torch.float16, torch.bfloat16, torch.float32]
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32]
# Tolerance map for different data types
_TOLERANCE_MAP = {
torch.float16: {"atol": 1e-3, "rtol": 1e-2},
torch.bfloat16: {"atol": 5e-3, "rtol": 5e-2},
torch.float32: {"atol": 1e-4, "rtol": 1e-3},
InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-2},
InfiniDtype.BF16: {"atol": 5e-3, "rtol": 5e-2},
InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-3},
}
......@@ -67,14 +68,7 @@ NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class RoPEDescriptor(Structure):
_fields_ = [("device", c_int32)]
infiniopRoPEDescriptor_t = POINTER(RoPEDescriptor)
def rotary_embedding(t, sin, cos, torch_device):
def rotary_embedding(ans, t, sin, cos, device):
dh = t.shape[2]
dt = t.dtype
assert dh % 2 == 0, "Embedding dimension must be even."
......@@ -82,7 +76,7 @@ def rotary_embedding(t, sin, cos, torch_device):
t_odd = t[..., 1::2] # [seq_len, n_head, dh // 2]
cos = cos.unsqueeze(1) # [seq_len, 1, dh // 2]
sin = sin.unsqueeze(1) # [seq_len, 1, dh // 2]
if torch_device == "cpu":
if device == InfiniDeviceEnum.CPU:
(t_even, t_odd, cos, sin) = (
t_even.float(),
t_odd.float(),
......@@ -93,26 +87,23 @@ def rotary_embedding(t, sin, cos, torch_device):
t_out_even = t_even * cos - t_odd * sin
t_out_odd = t_even * sin + t_odd * cos
t_out = torch.empty_like(t)
t_out[..., 0::2] = t_out_even
t_out[..., 1::2] = t_out_odd
return t_out.to(dt).to(torch_device)
ans[..., 0::2] = t_out_even.to(dt)
ans[..., 1::2] = t_out_odd.to(dt)
def sin_cos_table(pos, dim, torch_device, theta, dtype):
def sin_cos_table(pos, dim, device, theta, dtype):
assert dim % 2 == 0, "Embedding dimension must be even."
freqs = (1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))).to(
torch_device
freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
angles = torch.outer(pos.cpu(), freqs)
return (
TestTensor.from_torch(torch.sin(angles), dtype, device),
TestTensor.from_torch(torch.cos(angles), dtype, device),
)
angles = torch.outer(pos, freqs)
return torch.sin(angles).to(dtype), torch.cos(angles).to(dtype)
def test(
lib,
handle,
torch_device,
device,
shape,
x_strides=None,
y_strides=None,
......@@ -120,71 +111,71 @@ def test(
dtype=torch.float32,
sync=None,
):
x = TestTensor(shape, x_strides, dtype, device)
if inplace == Inplace.INPLACE_X:
y_strides = x_strides
print(
f"Testing Rotary Positional Embedding on {torch_device} with shape:{shape} x_strides:{x_strides} y_strides:{y_strides} and dtype:{dtype} inplace:{inplace}"
)
x = torch.rand(shape, dtype=dtype).to(torch_device)
x = rearrange_if_needed(x, x_strides)
if inplace == Inplace.INPLACE_X:
if x_strides != y_strides:
return
y = x
else:
y = torch.rand(shape, dtype=dtype).to(torch_device)
y = rearrange_if_needed(y, y_strides)
y = TestTensor(shape, y_strides, dtype, device)
print(
f"Testing Rotary Positional Embedding on {InfiniDeviceNames[device]} with shape:{shape} x_strides:{x_strides} y_strides:{y_strides} and dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
)
theta = 1e5
pos = torch.arange(0, x.shape[0], dtype=torch.int32).to(torch_device)
sin_table, cos_table = sin_cos_table(pos, x.shape[2], x.device, theta, dtype)
pos = TestTensor.from_torch(torch.arange(0, x.shape[0]), InfiniDtype.I32, device)
sin_table, cos_table = sin_cos_table(
pos.torch_tensor(), x.shape[2], x.device, theta, dtype
)
ans = rotary_embedding(x, sin_table, cos_table, torch_device)
rotary_embedding(
y.torch_tensor(),
x.torch_tensor(),
sin_table.torch_tensor(),
cos_table.torch_tensor(),
device,
)
descriptor = infiniopRoPEDescriptor_t()
x_tensor, pos_tensor, sin_table_tensor, cos_table_tensor = [
to_tensor(tensor, lib, force_unsigned=True)
for tensor in [x, pos, sin_table, cos_table]
]
if inplace == Inplace.INPLACE_X:
y_tensor = x_tensor
else:
y_tensor = to_tensor(y, lib)
descriptor = infiniopOperatorDescriptor_t()
if sync is not None:
sync()
check_error(
lib.infiniopCreateRoPEDescriptor(
LIBINFINIOP.infiniopCreateRoPEDescriptor(
handle,
ctypes.byref(descriptor),
y_tensor.descriptor,
x_tensor.descriptor,
pos_tensor.descriptor,
sin_table_tensor.descriptor,
cos_table_tensor.descriptor,
y.descriptor,
x.descriptor,
pos.descriptor,
sin_table.descriptor,
cos_table.descriptor,
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
for tensor in [y_tensor, x_tensor, pos_tensor, sin_table_tensor, cos_table_tensor]:
tensor.destroyDesc(lib)
for tensor in [y, x, pos, sin_table, cos_table]:
tensor.destroy_desc()
workspace_size = c_uint64(0)
check_error(
lib.infiniopGetRoPEWorkspaceSize(descriptor, ctypes.byref(workspace_size))
LIBINFINIOP.infiniopGetRoPEWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = create_workspace(workspace_size.value, x.device)
workspace = TestWorkspace(workspace_size.value, x.device)
def lib_rope():
check_error(
lib.infiniopRoPE(
LIBINFINIOP.infiniopRoPE(
descriptor,
workspace.data_ptr() if workspace is not None else None,
workspace.data(),
workspace_size.value,
y_tensor.data,
x_tensor.data,
pos_tensor.data,
sin_table_tensor.data,
cos_table_tensor.data,
y.data(),
x.data(),
pos.data(),
sin_table.data(),
cos_table.data(),
None,
)
)
......@@ -196,60 +187,32 @@ def test(
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(y, ans, atol=atol, rtol=rtol)
assert torch.allclose(y, ans, atol=atol, rtol=rtol)
debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
assert torch.allclose(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
if PROFILE:
profile_operation(
"PyTorch",
lambda: rotary_embedding(x, sin_table, cos_table, torch_device),
torch_device,
lambda: rotary_embedding(
y.torch_tensor(),
x.torch_tensor(),
sin_table.torch_tensor(),
cos_table.torch_tensor(),
device,
),
device,
NUM_PRERUN,
NUM_ITERATIONS,
)
profile_operation(
" lib", lambda: lib_rope(), torch_device, NUM_PRERUN, NUM_ITERATIONS
" lib", lambda: lib_rope(), device, NUM_PRERUN, NUM_ITERATIONS
)
check_error(lib.infiniopDestroyRoPEDescriptor(descriptor))
check_error(LIBINFINIOP.infiniopDestroyRoPEDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
lib = open_lib()
lib.infiniopCreateRoPEDescriptor.restype = c_int32
lib.infiniopCreateRoPEDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopRoPEDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopGetRoPEWorkspaceSize.restype = c_int32
lib.infiniopGetRoPEWorkspaceSize.argtypes = [
infiniopRoPEDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopRoPE.restype = c_int32
lib.infiniopRoPE.argtypes = [
infiniopRoPEDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyRoPEDescriptor.restype = c_int32
lib.infiniopDestroyRoPEDescriptor.argtypes = [
infiniopRoPEDescriptor_t,
]
# Configure testing options
DEBUG = args.debug
......@@ -259,6 +222,6 @@ if __name__ == "__main__":
# Execute tests
for device in get_test_devices(args):
test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES)
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
import torch
import ctypes
from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64
from ctypes import c_uint64
from libinfiniop import (
infiniopHandle_t,
infiniopTensorDescriptor_t,
open_lib,
to_tensor,
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
rearrange_if_needed,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
create_workspace,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
from enum import Enum, auto
......@@ -58,12 +59,13 @@ _TEST_CASES = [
]
# Data types used for testing
_TENSOR_DTYPES = [torch.float16, torch.float32]
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16]
# Tolerance map for different data types
_TOLERANCE_MAP = {
torch.float16: {"atol": 1e-3, "rtol": 1e-3},
torch.float32: {"atol": 1e-7, "rtol": 1e-7},
InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7},
InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-3},
}
DEBUG = False
......@@ -72,111 +74,78 @@ NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class SubDescriptor(Structure):
    """ctypes mirror of the library's Sub descriptor handle.

    Only the int32 `device` field is declared; the native struct is otherwise
    opaque and passed around via POINTER(...) handles.
    """

    _fields_ = [("device", c_int32)]
infiniopSubDescriptor_t = POINTER(SubDescriptor)
def sub(x, y):
    """Reference elementwise subtraction: return x - y as a new tensor."""
    return x - y
def process_tensors(c, c_strides, a, a_stride, b, b_stride, inplace):
    """
    Rearrange a, b, c to the requested strides and apply the inplace config.

    Strides containing 0 are treated as broadcast views and installed directly
    on the tensor's storage via set_(); other strides go through
    rearrange_if_needed. When inplace is requested, c is aliased to a or b; if
    the chosen output then carries broadcast (zero) strides, they are reset to
    the original unbroadcasted c strides so the output stays writable.

    Returns the (possibly rearranged / aliased) tensors as (a, b, c).
    """
    # Remember c's unbroadcasted strides so they can be restored below.
    original_c_strides = c_strides if c_strides else c.stride()

    def _rearrange(tensor, strides):
        # Zero strides denote broadcasting: install them in-place on the raw
        # storage, since a copy-based rearrange cannot realize zero strides.
        if strides and 0 in strides:
            tensor.set_(tensor.untyped_storage(), 0, tensor.shape, strides)
            return tensor
        else:
            return rearrange_if_needed(tensor, strides)

    a, b, c = [
        _rearrange(tensor, stride)
        for tensor, stride in zip([a, b, c], [a_stride, b_stride, c_strides])
    ]
    # Alias the output onto an input when an inplace mode is selected.
    c = (
        c
        if inplace == Inplace.OUT_OF_PLACE
        else (a if inplace == Inplace.INPLACE_A else b)
    )
    # if inplace is true and c has broadcasted config, reset it to the original unbroadcasted strides
    if 0 in c.stride():
        c.set_(c.untyped_storage(), 0, c.shape, original_c_strides)
    return a, b, c
def sub(c, a, b):
    """Compute a - b elementwise, writing the result into c; returns c."""
    return torch.subtract(a, b, out=c)
def test(
lib,
handle,
torch_device,
device,
shape,
a_stride=None,
b_stride=None,
c_stride=None,
inplace=Inplace.OUT_OF_PLACE,
dtype=torch.float16,
dtype=InfiniDtype.F16,
sync=None,
):
a = TestTensor(shape, a_stride, dtype, device)
b = TestTensor(shape, b_stride, dtype, device)
if inplace == Inplace.INPLACE_A:
if c_stride is not None and c_stride != a_stride:
return
c = a
elif inplace == Inplace.INPLACE_B:
if c_stride is not None and c_stride != b_stride:
return
c = b
else:
c = TestTensor(shape, c_stride, dtype, device)
if c.is_broadcast():
return
print(
f"Testing Sub on {torch_device} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} "
f"dtype:{dtype} inplace:{inplace}"
f"Testing Sub on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} "
f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
)
sub(c.torch_tensor(), a.torch_tensor(), b.torch_tensor())
a = torch.rand(shape, dtype=dtype).to(torch_device)
b = torch.rand(shape, dtype=dtype).to(torch_device)
c = torch.rand(shape, dtype=dtype).to(torch_device)
a, b, c = process_tensors(c, c_stride, a, a_stride, b, b_stride, inplace)
ans = sub(a, b)
a_tensor, b_tensor = [to_tensor(tensor, lib) for tensor in [a, b]]
c_tensor = (
to_tensor(c, lib)
if inplace == Inplace.OUT_OF_PLACE
else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor)
)
if sync is not None:
sync()
descriptor = infiniopSubDescriptor_t()
descriptor = infiniopOperatorDescriptor_t()
check_error(
lib.infiniopCreateSubDescriptor(
LIBINFINIOP.infiniopCreateSubDescriptor(
handle,
ctypes.byref(descriptor),
c_tensor.descriptor,
a_tensor.descriptor,
b_tensor.descriptor,
c.descriptor,
a.descriptor,
b.descriptor,
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
for tensor in [a_tensor, b_tensor, c_tensor]:
tensor.destroyDesc(lib)
for tensor in [a, b, c]:
tensor.destroy_desc()
workspace_size = c_uint64(0)
check_error(
lib.infiniopGetSubWorkspaceSize(descriptor, ctypes.byref(workspace_size))
LIBINFINIOP.infiniopGetSubWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = create_workspace(workspace_size.value, c.device)
workspace = TestWorkspace(workspace_size.value, device)
def lib_sub():
check_error(
lib.infiniopSub(
LIBINFINIOP.infiniopSub(
descriptor,
workspace.data_ptr() if workspace is not None else None,
workspace.data(),
workspace_size.value,
c_tensor.data,
a_tensor.data,
b_tensor.data,
c.data(),
a.data(),
b.data(),
None,
)
)
......@@ -185,53 +154,20 @@ def test(
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(c, ans, atol=atol, rtol=rtol)
assert torch.allclose(c, ans, atol=atol, rtol=rtol)
debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol)
assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol)
# Profiling workflow
if PROFILE:
# fmt: off
profile_operation("PyTorch", lambda: sub(a, b), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_sub(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation("PyTorch", lambda: sub(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_sub(), device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on
check_error(lib.infiniopDestroySubDescriptor(descriptor))
check_error(LIBINFINIOP.infiniopDestroySubDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
lib = open_lib()
lib.infiniopCreateSubDescriptor.restype = c_int32
lib.infiniopCreateSubDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopSubDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopGetSubWorkspaceSize.restype = c_int32
lib.infiniopGetSubWorkspaceSize.argtypes = [
infiniopSubDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopSub.restype = c_int32
lib.infiniopSub.argtypes = [
infiniopSubDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroySubDescriptor.restype = c_int32
lib.infiniopDestroySubDescriptor.argtypes = [
infiniopSubDescriptor_t,
]
# Configure testing options
DEBUG = args.debug
PROFILE = args.profile
......@@ -239,6 +175,6 @@ if __name__ == "__main__":
NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args):
test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES)
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
import torch
import ctypes
from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64
from ctypes import c_uint64
from libinfiniop import (
infiniopHandle_t,
infiniopTensorDescriptor_t,
open_lib,
to_tensor,
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
rearrange_if_needed,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
create_workspace,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
from enum import Enum, auto
......@@ -58,13 +59,13 @@ _TEST_CASES = [
]
# Data types used for testing
_TENSOR_DTYPES = [torch.float16, torch.bfloat16, torch.float32]
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32]
# Tolerance map for different data types
_TOLERANCE_MAP = {
torch.float16: {"atol": 1e-3, "rtol": 1e-3},
torch.bfloat16: {"atol": 5e-3, "rtol": 5e-3},
torch.float32: {"atol": 2e-7, "rtol": 1e-7},
InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
InfiniDtype.BF16: {"atol": 5e-3, "rtol": 5e-3},
InfiniDtype.F32: {"atol": 2e-7, "rtol": 1e-7},
}
DEBUG = False
......@@ -73,111 +74,79 @@ NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class SwiGLUDescriptor(Structure):
    """ctypes mirror of the library's SwiGLU descriptor handle.

    Only the int32 `device` field is declared; the native struct is otherwise
    opaque and passed around via POINTER(...) handles.
    """

    _fields_ = [("device", c_int32)]
infiniopSwiGLUDescriptor_t = POINTER(SwiGLUDescriptor)
def swiglu(a, b):
    """Reference SwiGLU: a * b / (1 + exp(-b)), i.e. a * b * sigmoid(b).

    The exponential is evaluated in float32 and cast back to b's dtype
    before the division.
    """
    exp_neg_b = torch.exp(-b.float()).to(b.dtype)
    return a * b / (1 + exp_neg_b)
def process_tensors(c, c_strides, a, a_stride, b, b_stride, inplace):
    """
    Rearrange a, b, c to the requested strides and apply the inplace config.

    Strides containing 0 are treated as broadcast views and installed directly
    on the tensor's storage via set_(); other strides go through
    rearrange_if_needed. When inplace is requested, c is aliased to a or b; if
    the chosen output then carries broadcast (zero) strides, they are reset to
    the original unbroadcasted c strides so the output stays writable.

    Returns the (possibly rearranged / aliased) tensors as (a, b, c).
    """
    # Remember c's unbroadcasted strides so they can be restored below.
    original_c_strides = c_strides if c_strides else c.stride()

    def _rearrange(tensor, strides):
        # Zero strides denote broadcasting: install them in-place on the raw
        # storage, since a copy-based rearrange cannot realize zero strides.
        if strides and 0 in strides:
            tensor.set_(tensor.untyped_storage(), 0, tensor.shape, strides)
            return tensor
        else:
            return rearrange_if_needed(tensor, strides)

    a, b, c = [
        _rearrange(tensor, stride)
        for tensor, stride in zip([a, b, c], [a_stride, b_stride, c_strides])
    ]
    # Alias the output onto an input when an inplace mode is selected.
    c = (
        c
        if inplace == Inplace.OUT_OF_PLACE
        else (a if inplace == Inplace.INPLACE_A else b)
    )
    # if inplace is true and c has broadcasted config, reset it to the original unbroadcasted strides
    if 0 in c.stride():
        c.set_(c.untyped_storage(), 0, c.shape, original_c_strides)
    return a, b, c
def test(
lib,
handle,
torch_device,
device,
shape,
a_stride=None,
b_stride=None,
c_stride=None,
inplace=Inplace.OUT_OF_PLACE,
dtype=torch.float16,
dtype=InfiniDtype.F16,
sync=None,
):
a = TestTensor(shape, a_stride, dtype, device)
b = TestTensor(shape, b_stride, dtype, device)
if inplace == Inplace.INPLACE_A:
if c_stride is not None and c_stride != a_stride:
return
c = a
elif inplace == Inplace.INPLACE_B:
if c_stride is not None and c_stride != b_stride:
return
c = b
else:
c = TestTensor(shape, c_stride, dtype, device)
if c.is_broadcast():
return
print(
f"Testing SwiGLU on {torch_device} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} "
f"dtype:{dtype} inplace:{inplace}"
f"Testing SwiGLU on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} "
f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
)
a = torch.rand(shape, dtype=dtype).to(torch_device)
b = torch.rand(shape, dtype=dtype).to(torch_device)
c = torch.rand(shape, dtype=dtype).to(torch_device)
a, b, c = process_tensors(c, c_stride, a, a_stride, b, b_stride, inplace)
ans = swiglu(a.torch_tensor(), b.torch_tensor())
ans = swiglu(a, b)
a_tensor, b_tensor = [to_tensor(tensor, lib) for tensor in [a, b]]
c_tensor = (
to_tensor(c, lib)
if inplace == Inplace.OUT_OF_PLACE
else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor)
)
if sync is not None:
sync()
descriptor = infiniopSwiGLUDescriptor_t()
descriptor = infiniopOperatorDescriptor_t()
check_error(
lib.infiniopCreateSwiGLUDescriptor(
LIBINFINIOP.infiniopCreateSwiGLUDescriptor(
handle,
ctypes.byref(descriptor),
c_tensor.descriptor,
a_tensor.descriptor,
b_tensor.descriptor,
c.descriptor,
a.descriptor,
b.descriptor,
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
for tensor in [a_tensor, b_tensor, c_tensor]:
tensor.destroyDesc(lib)
for tensor in [a, b, c]:
tensor.destroy_desc()
workspace_size = c_uint64(0)
check_error(
lib.infiniopGetSwiGLUWorkspaceSize(descriptor, ctypes.byref(workspace_size))
LIBINFINIOP.infiniopGetSwiGLUWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = create_workspace(workspace_size.value, c.device)
workspace = TestWorkspace(workspace_size.value, c.device)
def lib_swiglu():
check_error(
lib.infiniopSwiGLU(
LIBINFINIOP.infiniopSwiGLU(
descriptor,
workspace.data_ptr() if workspace is not None else None,
workspace.data(),
workspace_size.value,
c_tensor.data,
a_tensor.data,
b_tensor.data,
c.data(),
a.data(),
b.data(),
None,
)
)
......@@ -186,52 +155,20 @@ def test(
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(c, ans, atol=atol, rtol=rtol)
assert torch.allclose(c, ans, atol=atol, rtol=rtol)
debug(c.actual_tensor(), ans, atol=atol, rtol=rtol)
assert torch.allclose(c.actual_tensor(), ans, atol=atol, rtol=rtol)
# Profiling workflow
if PROFILE:
# fmt: off
profile_operation("PyTorch", lambda: swiglu(a, b), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_swiglu(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation("PyTorch", lambda: swiglu(a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_swiglu(), device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on
check_error(lib.infiniopDestroySwiGLUDescriptor(descriptor))
check_error(LIBINFINIOP.infiniopDestroySwiGLUDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
lib = open_lib()
lib.infiniopCreateSwiGLUDescriptor.restype = c_int32
lib.infiniopCreateSwiGLUDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopSwiGLUDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopGetSwiGLUWorkspaceSize.restype = c_int32
lib.infiniopGetSwiGLUWorkspaceSize.argtypes = [
infiniopSwiGLUDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopSwiGLU.restype = c_int32
lib.infiniopSwiGLU.argtypes = [
infiniopSwiGLUDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroySwiGLUDescriptor.restype = c_int32
lib.infiniopDestroySwiGLUDescriptor.argtypes = [
infiniopSwiGLUDescriptor_t,
]
# Configure testing options
DEBUG = args.debug
......@@ -240,6 +177,6 @@ if __name__ == "__main__":
NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args):
test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES)
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
......@@ -12,11 +12,17 @@ if is_mode("debug") then
add_defines("DEBUG_MODE")
end
if is_plat("windows") then
set_runtimes("MD")
add_ldflags("/utf-8", {force = true})
add_cxflags("/utf-8", {force = true})
end
-- CPU
option("cpu")
set_default(true)
set_showmenu(true)
set_description("Whether to complie implementations for CPU")
set_description("Whether to compile implementations for CPU")
option_end()
option("omp")
......@@ -38,32 +44,29 @@ end
option("nv-gpu")
set_default(false)
set_showmenu(true)
set_description("Whether to complie implementations for Nvidia GPU")
set_description("Whether to compile implementations for Nvidia GPU")
option_end()
if has_config("nv-gpu") then
add_defines("ENABLE_CUDA_API")
includes("xmake/cuda.lua")
add_defines("ENABLE_NVIDIA_API")
includes("xmake/nvidia.lua")
end
-- 天数智芯
option("iluvatar-gpu")
set_default(false)
option("cudnn")
set_default(true)
set_showmenu(true)
set_description("Whether to complie implementations for Iluvatar GPU")
set_description("Whether to compile cudnn for Nvidia GPU")
option_end()
if has_config("iluvatar-gpu") then
add_defines("ENABLE_CUDA_API")
add_defines("ENABLE_ILUVATAR_CUDA_API")
includes("xmake/iluvatar.lua")
if has_config("cudnn") then
add_defines("ENABLE_CUDNN_API")
end
-- 寒武纪
option("cambricon-mlu")
set_default(false)
set_showmenu(true)
set_description("Whether to complie implementations for Cambricon MLU")
set_description("Whether to compile implementations for Cambricon MLU")
option_end()
if has_config("cambricon-mlu") then
......@@ -75,7 +78,7 @@ end
option("ascend-npu")
set_default(false)
set_showmenu(true)
set_description("Whether to complie implementations for Huawei Ascend NPU")
set_description("Whether to compile implementations for Huawei Ascend NPU")
option_end()
if has_config("ascend-npu") then
......@@ -83,23 +86,35 @@ if has_config("ascend-npu") then
includes("xmake/ascend.lua")
end
-- 天数智芯
option("iluvatar-gpu")
set_default(false)
set_showmenu(true)
set_description("Whether to compile implementations for Iluvatar GPU")
option_end()
if has_config("iluvatar-gpu") then
add_defines("ENABLE_ILUVATAR_API")
includes("xmake/iluvatar.lua")
end
-- 沐曦
option("metax-gpu")
set_default(false)
set_showmenu(true)
set_description("Whether to complie implementations for MetaX GPU")
set_description("Whether to compile implementations for MetaX GPU")
option_end()
if has_config("metax-gpu") then
add_defines("ENABLE_METAX_API")
includes("xmake/maca.lua")
includes("xmake/metax.lua")
end
-- 摩尔线程
option("moore-gpu")
set_default(false)
set_showmenu(true)
set_description("Whether to complie implementations for Moore Threads GPU")
set_description("Whether to compile implementations for Moore Threads GPU")
option_end()
if has_config("moore-gpu") then
......@@ -111,11 +126,10 @@ end
option("sugon-dcu")
set_default(false)
set_showmenu(true)
set_description("Whether to complie implementations for Sugon DCU")
set_description("Whether to compile implementations for Sugon DCU")
option_end()
if has_config("sugon-dcu") then
add_defines("ENABLE_CUDA_API")
add_defines("ENABLE_SUGON_CUDA_API")
end
......@@ -131,12 +145,22 @@ if has_config("kunlun-xpu") then
includes("xmake/kunlun.lua")
end
-- 九齿
option("ninetoothed")
set_default(false)
set_showmenu(true)
set_description("Whether to complie NineToothed implementations")
option_end()
if has_config("ninetoothed") then
add_defines("ENABLE_NINETOOTHED")
end
-- InfiniCCL
option("ccl")
set_default(false)
set_default(false)
set_showmenu(true)
set_description("Wether to complie implementations for InfiniCCL")
set_description("Wether to compile implementations for InfiniCCL")
option_end()
if has_config("ccl") then
......@@ -159,7 +183,7 @@ target("infini-utils")
add_cxflags("-fPIC", "-Wno-unknown-pragmas")
if has_config("omp") then
add_cxflags("-fopenmp")
add_ldflags("-fopenmp")
add_ldflags("-fopenmp", {force = true})
end
end
......@@ -173,7 +197,7 @@ target("infinirt")
add_deps("infinirt-cpu")
end
if has_config("nv-gpu") then
add_deps("infinirt-cuda")
add_deps("infinirt-nvidia")
end
if has_config("cambricon-mlu") then
add_deps("infinirt-cambricon")
......@@ -207,7 +231,7 @@ target("infiniop")
add_deps("infiniop-cpu")
end
if has_config("nv-gpu") then
add_deps("infiniop-cuda")
add_deps("infiniop-nvidia")
end
if has_config("iluvatar-gpu") then
add_deps("infiniop-iluvatar")
......@@ -221,9 +245,9 @@ target("infiniop")
)
add_shflags("-s", "-shared", "-fPIC")
add_links("cublas", "cudnn", "cudadevrt", "cudart_static", "rt", "pthread", "dl")
-- Using -linfiniop-cuda will fail, manually link the target using full path
-- Using -linfiniop-nvidia will fail, manually link the target using full path
add_deps("nv-gpu", {inherit = false})
add_links(builddir.."/libinfiniop-cuda.a")
add_links(builddir.."/libinfiniop-nvidia.a")
set_toolchains("sugon-dcu-linker")
end
......@@ -259,7 +283,7 @@ target("infiniccl")
add_deps("infinirt")
if has_config("nv-gpu") then
add_deps("infiniccl-cuda")
add_deps("infiniccl-nvidia")
end
if has_config("ascend-npu") then
add_deps("infiniccl-ascend")
......@@ -270,6 +294,9 @@ target("infiniccl")
if has_config("metax-gpu") then
add_deps("infiniccl-metax")
end
if has_config("iluvatar-gpu") then
add_deps("infiniccl-iluvatar")
end
set_languages("cxx17")
......
......@@ -42,13 +42,17 @@ target("infiniop-iluvatar")
add_links("cudart", "cublas", "cudnn")
set_warnings("all", "error")
add_cuflags("-Wno-error=unused-private-field")
add_cuflags("-fPIC", "-x", "ivcore", "-std=c++17", {force = true})
add_cuflags("-fPIC")
add_culdflags("-fPIC")
add_cxflags("-fPIC")
-- set_languages("cxx17") 天数似乎不能用这个配置
add_files("../src/infiniop/devices/cuda/*.cu", "../src/infiniop/ops/*/cuda/*.cu")
add_files("../src/infiniop/devices/nvidia/*.cu", "../src/infiniop/ops/*/nvidia/*.cu")
if has_config("ninetoothed") then
add_files("../build/ninetoothed/*.c", {cxflags = {"-Wno-return-type"}})
end
target_end()
target("infinirt-iluvatar")
......@@ -64,10 +68,39 @@ target("infinirt-iluvatar")
set_warnings("all", "error")
add_cuflags("-fPIC", "-x", "ivcore", "-std=c++17", {force = true})
add_cuflags("-fPIC")
add_culdflags("-fPIC")
add_cxflags("-fPIC")
-- set_languages("cxx17") 天数似乎不能用这个配置
add_files("../src/infinirt/cuda/*.cu")
target_end()
target("infiniccl-iluvatar")
set_kind("static")
add_deps("infinirt")
on_install(function (target) end)
if has_config("ccl") then
set_toolchains("iluvatar.toolchain")
add_rules("iluvatar.env")
set_values("cuda.rdc", false)
add_links("cudart")
set_warnings("all", "error")
add_cuflags("-fPIC", "-x", "ivcore", "-std=c++17", {force = true})
add_culdflags("-fPIC")
add_cxflags("-fPIC")
local nccl_root = os.getenv("NCCL_ROOT")
if nccl_root then
add_includedirs(nccl_root .. "/include")
add_links(nccl_root .. "/lib/libnccl.so")
else
add_links("nccl") -- Fall back to default nccl linking
end
-- set_languages("cxx17") 天数似乎不能用这个配置
add_files("../src/infiniccl/cuda/*.cu")
end
target_end()
......@@ -23,6 +23,11 @@ rule("maca")
table.insert(args, "-I" .. includedir)
end
local defines = target:get("defines")
for _, define in ipairs(defines) do
table.insert(args, "-D" .. define)
end
os.execv(htcc, args)
table.insert(target:objectfiles(), objectfile)
end)
......@@ -34,8 +39,12 @@ target("infiniop-metax")
set_languages("cxx17")
set_warnings("all", "error")
add_cxflags("-lstdc++", "-fPIC", "-Wno-defaulted-function-deleted", "-Wno-strict-aliasing")
add_files("../src/infiniop/devices/maca/*.cc", "../src/infiniop/ops/*/maca/*.cc")
add_files("../src/infiniop/ops/*/maca/*.maca", {rule = "maca"})
add_files("../src/infiniop/devices/metax/*.cc", "../src/infiniop/ops/*/metax/*.cc")
add_files("../src/infiniop/ops/*/metax/*.maca", {rule = "maca"})
if has_config("ninetoothed") then
add_files("../build/ninetoothed/*.c", {cxflags = {"-include stdlib.h", "-Wno-return-type"}})
end
target_end()
target("infinirt-metax")
......@@ -45,7 +54,7 @@ target("infinirt-metax")
add_deps("infini-utils")
set_warnings("all", "error")
add_cxflags("-lstdc++ -fPIC")
add_files("../src/infinirt/maca/*.cc")
add_files("../src/infinirt/metax/*.cc")
target_end()
target("infiniccl-metax")
......@@ -58,8 +67,8 @@ target("infiniccl-metax")
end
if has_config("ccl") then
add_links("libhccl.so")
add_files("../src/infiniccl/maca/*.cc")
add_files("../src/infiniccl/metax/*.cc")
end
set_languages("cxx17")
target_end()
local CUDA_ROOT = os.getenv("CUDA_ROOT") or os.getenv("CUDA_HOME") or os.getenv("CUDA_PATH")
local CUDNN_ROOT = os.getenv("CUDNN_ROOT") or os.getenv("CUDNN_HOME") or os.getenv("CUDNN_PATH")
if CUDA_ROOT ~= nil then
add_includedirs(CUDA_ROOT .. "/include")
end
if CUDNN_ROOT ~= nil then
add_includedirs(CUDNN_ROOT .. "/include")
end
target("infiniop-cuda")
target("infiniop-nvidia")
set_kind("static")
add_deps("infini-utils")
on_install(function (target) end)
set_policy("build.cuda.devlink", true)
set_toolchains("cuda")
add_links("cublas", "cudnn")
add_links("cudart", "cublas")
if has_config("cudnn") then
add_links("cudnn")
end
add_cugencodes("native")
on_load(function (target)
import("lib.detect.find_tool")
local nvcc = find_tool("nvcc")
if nvcc ~= nil then
if is_plat("windows") then
nvcc_path = os.iorun("where nvcc"):match("(.-)\r?\n")
else
nvcc_path = nvcc.program
end
target:add("linkdirs", path.directory(path.directory(nvcc_path)) .. "/lib64/stubs")
target:add("links", "cuda")
end
end)
if is_plat("windows") then
add_cuflags("-Xcompiler=/utf-8", "--expt-relaxed-constexpr", "--allow-unsupported-compiler")
add_cuflags("-Xcompiler=/W3", "-Xcompiler=/WX")
......@@ -31,13 +44,23 @@ target("infiniop-cuda")
add_cuflags("--extended-lambda")
add_culdflags("-Xcompiler=-fPIC")
add_cxxflags("-fPIC")
add_cuflags("--expt-relaxed-constexpr")
if CUDNN_ROOT ~= nil then
add_linkdirs(CUDNN_ROOT .. "/lib")
end
end
add_cuflags("-Xcompiler=-Wno-error=deprecated-declarations")
set_languages("cxx17")
add_files("../src/infiniop/devices/cuda/*.cu", "../src/infiniop/ops/*/cuda/*.cu")
add_files("../src/infiniop/devices/nvidia/*.cu", "../src/infiniop/ops/*/nvidia/*.cu")
if has_config("ninetoothed") then
add_files("../build/ninetoothed/*.c")
end
target_end()
target("infinirt-cuda")
target("infinirt-nvidia")
set_kind("static")
add_deps("infini-utils")
on_install(function (target) end)
......@@ -59,7 +82,7 @@ target("infinirt-cuda")
add_files("../src/infinirt/cuda/*.cu")
target_end()
target("infiniccl-cuda")
target("infiniccl-nvidia")
set_kind("static")
add_deps("infinirt")
on_install(function (target) end)
......@@ -87,5 +110,5 @@ target("infiniccl-cuda")
end
end
set_languages("cxx17")
target_end()
......@@ -51,3 +51,15 @@ target("infiniccl-test")
set_installdir(os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini"))
target_end()
target("infinirt-test")
set_kind("binary")
add_deps("infinirt")
on_install(function (target) end)
set_languages("cxx17")
set_warnings("all", "error")
add_files(os.projectdir().."/src/infinirt-test/*.cc")
set_installdir(os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini"))
target_end()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment