Unverified Commit 0166515c authored by PanZezhong1725's avatar PanZezhong1725 Committed by GitHub
Browse files

Merge branch 'main' into issue/300

parents f0300ff3 a23c4d13
#include "infinirt_maca.h"
#include "infinirt_metax.h"
#include "../../utils.h"
#include <hcr/hc_runtime.h>
#include <hcr/hc_runtime_api.h>
#define CHECK_MACART(RT_API) CHECK_INTERNAL(RT_API, hcSuccess)
namespace infinirt::maca {
namespace infinirt::metax {
infiniStatus_t getDeviceCount(int *count) {
CHECK_MACART(hcGetDeviceCount(count));
return INFINI_STATUS_SUCCESS;
......@@ -124,4 +124,4 @@ infiniStatus_t freeAsync(void *ptr, infinirtStream_t stream) {
CHECK_MACART(hcFreeAsync(ptr, (hcStream_t)stream));
return INFINI_STATUS_SUCCESS;
}
} // namespace infinirt::maca
} // namespace infinirt::metax
......@@ -2,12 +2,12 @@
#define __INFINIRT_MACA_H__
#include "../infinirt_impl.h"
namespace infinirt::maca {
namespace infinirt::metax {
#ifdef ENABLE_METAX_API
INFINIRT_DEVICE_API_IMPL
#else
INFINIRT_DEVICE_API_NOOP
#endif
} // namespace infinirt::maca
} // namespace infinirt::metax
#endif // __INFINIRT_MACA_H__
......@@ -65,11 +65,11 @@ Name: test.0.ans, NDims: 2, Shape: [6, 4], DataType: F64, DataOffset: 320
- `Meta` 中必须包含 `test_count` ,表示测例数量。
- 每个测例的 `Meta` 和 `Tensor` 名字以 `test.[id].` 开头,后接具体信息名称。数字 `[id]` 表示测例编号,编号必须为 0 到 test_count-1。
- `Tensor` 名字接 `.strides` 表示步长,若没有则默认为连续。
- 注意:gguf 中的 shape 和 stride 的存储方向是反向的,第一个数代表最后一维。
### GGUF测例构建要求
不参与计算的 `Tensor` 不应存储数据,避免 `GGUF` 文件中出现冗余内容。
此类 `Tensor` 应使用 `np.empty(tuple(0 for _ in shape), dtype=dtype)` 构造其数据字段, 且 `GGUF` 需存储此张量的形状数据 `.shape`、步长数据 `.strides`,否则无法成功构建,可使用 `contiguous_gguf_strides(shape)` 计算步长数据。
对于 `Elementwise` 算子,需包含零步长(zero-stride)测试。对于步长为0的张量,`GGUF` 不应存储冗余广播数据,可使用 `process_zero_stride_tensor`进行冗余数据移除,同时必须在 `GGUF` 中提供此张量的实际形状数据 `.shape`,否则无法成功构建。
\ No newline at end of file
对于 `Elementwise` 算子,需包含零步长(zero-stride)测试。对于步长为0的张量,`GGUF` 不应存储冗余广播数据,可使用 `process_zero_stride_tensor`进行冗余数据移除,同时必须在 `GGUF` 中提供此张量的实际形状数据 `.shape`,否则无法成功构建。
import gguf
from typing import List
import gguf
import numpy as np
from gguf import GGMLQuantizationType
from ml_dtypes import bfloat16
def np_dtype_to_ggml(tensor_dtype: np.dtype):
if tensor_dtype == np.float16:
if tensor_dtype == bfloat16:
return GGMLQuantizationType.BF16
elif tensor_dtype == np.float16:
return GGMLQuantizationType.F16
elif tensor_dtype == np.float32:
return GGMLQuantizationType.F32
elif tensor_dtype == np.float64:
return GGMLQuantizationType.F64
elif tensor_dtype == np.bool:
return GGMLQuantizationType.Q8_K
elif tensor_dtype == np.int8:
return GGMLQuantizationType.I8
elif tensor_dtype == np.int16:
......@@ -21,7 +27,7 @@ def np_dtype_to_ggml(tensor_dtype: np.dtype):
return GGMLQuantizationType.I64
else:
raise ValueError(
"Only F16, F32, F64, I8, I16, I32, I64 tensors are supported for now"
"Only BF16, F16, F32, F64, BOOL, I8, I16, I32, I64 tensors are supported for now"
)
......@@ -37,6 +43,7 @@ def contiguous_gguf_strides(shape: tuple[int, ...]) -> list[int]:
acc *= size
return strides[::-1]
def process_zero_stride_tensor(tensor, stride=None):
if stride:
slices = tuple(slice(0, 1) if s == 0 else slice(None) for s in stride)
......@@ -44,6 +51,7 @@ def process_zero_stride_tensor(tensor, stride=None):
else:
return tensor
class InfiniopTestCase:
op_name: str
......
import torch
import ctypes
from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64
from ctypes import c_uint64
from libinfiniop import (
infiniopHandle_t,
infiniopTensorDescriptor_t,
open_lib,
to_tensor,
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
rearrange_if_needed,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
create_workspace,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
from enum import Enum, auto
......@@ -58,12 +59,13 @@ _TEST_CASES = [
]
# Data types used for testing
_TENSOR_DTYPES = [torch.float16, torch.float32]
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16]
# Tolerance map for different data types
_TOLERANCE_MAP = {
torch.float16: {"atol": 1e-3, "rtol": 1e-3},
torch.float32: {"atol": 1e-7, "rtol": 1e-7},
InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7},
InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-3},
}
DEBUG = False
......@@ -72,52 +74,13 @@ NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class AddDescriptor(Structure):
    # Opaque ctypes mirror of the C library's Add descriptor.  Only the
    # leading device field is declared; the rest of the struct is owned and
    # managed by the library, so Python never touches it.
    _fields_ = [("device", c_int32)]


# Pointer type passed across the C ABI for all Add descriptor calls.
infiniopAddDescriptor_t = POINTER(AddDescriptor)
def add(ans, x, y):
    """Reference elementwise addition: write ``x + y`` into the preallocated ``ans``."""
    torch.add(input=x, other=y, out=ans)
def process_tensors(c, c_strides, a, a_stride, b, b_stride, inplace):
    """
    rearrange the tensors if needed and apply the inplace config.
    if inplace is true and the output (i.e., c) is placed to the broadcasted input,
    the inplace config is ignored and out-of-place is used
    """
    # Remember the unbroadcasted output strides so they can be restored below
    # after the output may have been redirected to a broadcasted input.
    original_c_strides = c_strides if c_strides else c.stride()

    def _rearrange(tensor, strides):
        # A 0 in the strides encodes broadcasting; a broadcast layout cannot be
        # produced by rearranging data, so rebind the existing storage with
        # set_ instead of copying.
        if strides and 0 in strides:
            tensor.set_(tensor.untyped_storage(), 0, tensor.shape, strides)
            return tensor
        else:
            return rearrange_if_needed(tensor, strides)

    a, b, c = [
        _rearrange(tensor, stride)
        for tensor, stride in zip([a, b, c], [a_stride, b_stride, c_strides])
    ]
    # Redirect the output to one of the inputs when an inplace mode is requested.
    c = (
        c
        if inplace == Inplace.OUT_OF_PLACE
        else (a if inplace == Inplace.INPLACE_A else b)
    )
    # if inplace is true and c has broadcasted config, reset it to the original unbroadcasted strides
    if 0 in c.stride():
        c.set_(c.untyped_storage(), 0, c.shape, original_c_strides)
    return a, b, c
def add(c, a, b):
    """Reference elementwise addition: store ``a + b`` into the preallocated ``c``."""
    torch.add(input=a, other=b, out=c)
def test(
lib,
handle,
torch_device,
device,
shape,
a_stride=None,
b_stride=None,
......@@ -126,58 +89,64 @@ def test(
dtype=torch.float16,
sync=None,
):
a = TestTensor(shape, a_stride, dtype, device)
b = TestTensor(shape, b_stride, dtype, device)
if inplace == Inplace.INPLACE_A:
if a_stride != c_stride:
return
c = a
elif inplace == Inplace.INPLACE_B:
if c_stride != b_stride:
return
c = b
else:
c = TestTensor(shape, c_stride, dtype, device, mode="ones")
if c.is_broadcast():
return
print(
f"Testing Add on {torch_device} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} "
f"dtype:{dtype} inplace:{inplace}"
f"Testing Add on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} "
f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
)
a = torch.rand(shape, dtype=dtype).to(torch_device)
b = torch.rand(shape, dtype=dtype).to(torch_device)
c = torch.rand(shape, dtype=dtype).to(torch_device)
ans = torch.zeros(shape, dtype=dtype).to(torch_device)
a, b, c = process_tensors(c, c_stride, a, a_stride, b, b_stride, inplace)
add(c.torch_tensor(), a.torch_tensor(), b.torch_tensor())
add(ans, a, b)
a_tensor, b_tensor = [to_tensor(tensor, lib) for tensor in [a, b]]
c_tensor = (
to_tensor(c, lib)
if inplace == Inplace.OUT_OF_PLACE
else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor)
)
if sync is not None:
sync()
descriptor = infiniopAddDescriptor_t()
descriptor = infiniopOperatorDescriptor_t()
check_error(
lib.infiniopCreateAddDescriptor(
LIBINFINIOP.infiniopCreateAddDescriptor(
handle,
ctypes.byref(descriptor),
c_tensor.descriptor,
a_tensor.descriptor,
b_tensor.descriptor,
c.descriptor,
a.descriptor,
b.descriptor,
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
for tensor in [a_tensor, b_tensor, c_tensor]:
tensor.destroyDesc(lib)
for tensor in [a, b, c]:
tensor.destroy_desc()
workspace_size = c_uint64(0)
check_error(
lib.infiniopGetAddWorkspaceSize(descriptor, ctypes.byref(workspace_size))
LIBINFINIOP.infiniopGetAddWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = create_workspace(workspace_size.value, c.device)
workspace = TestWorkspace(workspace_size.value, c.device)
def lib_add():
check_error(
lib.infiniopAdd(
LIBINFINIOP.infiniopAdd(
descriptor,
workspace.data_ptr() if workspace is not None else None,
workspace_size.value,
c_tensor.data,
a_tensor.data,
b_tensor.data,
workspace.data(),
workspace.size(),
c.data(),
a.data(),
b.data(),
None,
)
)
......@@ -186,52 +155,20 @@ def test(
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(c, ans, atol=atol, rtol=rtol)
assert torch.allclose(c, ans, atol=atol, rtol=rtol)
debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol)
assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol)
# Profiling workflow
if PROFILE:
# fmt: off
profile_operation("PyTorch", lambda: add(ans, a, b), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_add(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation("PyTorch", lambda: add(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_add(), device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on
check_error(lib.infiniopDestroyAddDescriptor(descriptor))
check_error(LIBINFINIOP.infiniopDestroyAddDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
lib = open_lib()
lib.infiniopCreateAddDescriptor.restype = c_int32
lib.infiniopCreateAddDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopAddDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopGetAddWorkspaceSize.restype = c_int32
lib.infiniopGetAddWorkspaceSize.argtypes = [
infiniopAddDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopAdd.restype = c_int32
lib.infiniopAdd.argtypes = [
infiniopAddDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyAddDescriptor.restype = c_int32
lib.infiniopDestroyAddDescriptor.argtypes = [
infiniopAddDescriptor_t,
]
# Configure testing options
DEBUG = args.debug
......@@ -240,6 +177,6 @@ if __name__ == "__main__":
NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args):
test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES)
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p
from ctypes import c_uint64
import ctypes
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from libinfiniop import (
open_lib,
to_tensor,
infiniopHandle_t,
infiniopTensorDescriptor_t,
check_error,
rearrange_tensor,
create_workspace,
get_args,
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
import torch
class AttentionDescriptor(Structure):
    # Opaque ctypes mirror of the C library's Attention descriptor.  Only the
    # leading device field is declared; the remaining layout is owned by the
    # library and never accessed from Python.
    _fields_ = [("device", c_int32)]


# Pointer type passed across the C ABI for all Attention descriptor calls.
infiniopAttentionDescriptor_t = POINTER(AttentionDescriptor)
def causal_softmax(x):
type = x.dtype
mask = torch.tril(torch.ones_like(x), diagonal=-1).flip(dims=[-2, -1])
......@@ -85,9 +79,8 @@ def attention(q, k, v, k_cache, v_cache, pos):
def test(
lib,
handle,
torch_device,
device,
n_q_head,
n_kv_head,
seq_len,
......@@ -100,94 +93,79 @@ def test(
v_stride=None,
k_cache_stride=None,
v_cache_stride=None,
dtype=torch.float16,
dtype=InfiniDtype.F16,
sync=None,
):
print(
f"Testing Attention on {torch_device} with n_q_head:{n_q_head} n_kv_head:{n_kv_head} seq_len:{seq_len} head_dim:{head_dim} pos:{pos} "
f"dtype:{dtype} q_stride:{q_stride} k_stride:{k_stride} v_stride:{v_stride} k_cache_stride:{k_cache_stride} v_cache_stride:{v_cache_stride}"
f"Testing Attention on {InfiniDeviceNames[device]} with n_q_head:{n_q_head} n_kv_head:{n_kv_head} seq_len:{seq_len} head_dim:{head_dim} pos:{pos} "
f"dtype:{InfiniDtypeNames[dtype]} q_stride:{q_stride} k_stride:{k_stride} v_stride:{v_stride} k_cache_stride:{k_cache_stride} v_cache_stride:{v_cache_stride}"
)
out = torch.zeros([seq_len, n_q_head, head_dim], dtype=dtype, device=torch_device)
q = torch.rand([n_q_head, seq_len, head_dim], dtype=dtype).to(torch_device) * 0.1
k = torch.rand([n_kv_head, seq_len, head_dim], dtype=dtype).to(torch_device) * 0.1
v = torch.rand([n_kv_head, seq_len, head_dim], dtype=dtype).to(torch_device) * 0.1
k_cache = (
torch.rand([n_kv_head, k_cache_buf_len, head_dim], dtype=dtype).to(torch_device)
* 0.1
out = TestTensor([seq_len, n_q_head, head_dim], None, dtype, device, mode="zeros")
q = TestTensor([n_q_head, seq_len, head_dim], q_stride, dtype, device, scale=0.1)
k = TestTensor([n_kv_head, seq_len, head_dim], k_stride, dtype, device, scale=0.1)
v = TestTensor([n_kv_head, seq_len, head_dim], v_stride, dtype, device, scale=0.1)
k_cache = TestTensor(
[n_kv_head, k_cache_buf_len, head_dim], k_cache_stride, dtype, device, scale=0.1
)
v_cache = (
torch.rand([n_kv_head, v_cache_buf_len, head_dim], dtype=dtype).to(torch_device)
* 0.1
v_cache = TestTensor(
[n_kv_head, v_cache_buf_len, head_dim], v_cache_stride, dtype, device, scale=0.1
)
ans = attention(q, k, v, k_cache, v_cache, pos)
if q_stride is not None:
q = rearrange_tensor(q, q_stride)
if k_stride is not None:
k = rearrange_tensor(k, k_stride)
if v_stride is not None:
v = rearrange_tensor(v, v_stride)
if k_cache_stride is not None:
k_cache = rearrange_tensor(k_cache, k_cache_stride)
if v_cache_stride is not None:
v_cache = rearrange_tensor(v_cache, v_cache_stride)
out_tensor = to_tensor(out, lib)
q_tensor = to_tensor(q, lib)
k_tensor = to_tensor(k, lib)
v_tensor = to_tensor(v, lib)
k_cache_tensor = to_tensor(k_cache, lib)
v_cache_tensor = to_tensor(v_cache, lib)
def torch_attention():
return attention(
q.torch_tensor(),
k.torch_tensor(),
v.torch_tensor(),
k_cache.torch_tensor(),
v_cache.torch_tensor(),
pos,
)
ans = torch_attention()
if sync is not None:
sync()
descriptor = infiniopAttentionDescriptor_t()
descriptor = infiniopOperatorDescriptor_t()
check_error(
lib.infiniopCreateAttentionDescriptor(
LIBINFINIOP.infiniopCreateAttentionDescriptor(
handle,
ctypes.byref(descriptor),
out_tensor.descriptor,
q_tensor.descriptor,
k_tensor.descriptor,
v_tensor.descriptor,
k_cache_tensor.descriptor,
v_cache_tensor.descriptor,
out.descriptor,
q.descriptor,
k.descriptor,
v.descriptor,
k_cache.descriptor,
v_cache.descriptor,
pos,
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
for tensor in [
out_tensor,
q_tensor,
k_tensor,
v_tensor,
k_cache_tensor,
v_cache_tensor,
]:
tensor.destroyDesc(lib)
for tensor in [out, q, k, v, k_cache, v_cache]:
tensor.destroy_desc()
workspace_size = c_uint64(0)
check_error(
lib.infiniopGetAttentionWorkspaceSize(descriptor, ctypes.byref(workspace_size))
LIBINFINIOP.infiniopGetAttentionWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = create_workspace(workspace_size.value, out.device)
workspace = TestWorkspace(workspace_size.value, out.device)
def lib_attention():
check_error(
lib.infiniopAttention(
LIBINFINIOP.infiniopAttention(
descriptor,
workspace.data_ptr() if workspace is not None else None,
workspace.data(),
workspace_size.value,
out_tensor.data,
q_tensor.data,
k_tensor.data,
v_tensor.data,
k_cache_tensor.data,
v_cache_tensor.data,
out.data(),
q.data(),
k.data(),
v.data(),
k_cache.data(),
v_cache.data(),
None,
)
)
......@@ -197,25 +175,25 @@ def test(
# Validate results
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(out, ans, atol=atol, rtol=rtol)
assert torch.allclose(out, ans, atol=atol, rtol=rtol)
debug(out.actual_tensor(), ans, atol=atol, rtol=rtol)
assert torch.allclose(out.actual_tensor(), ans, atol=atol, rtol=rtol)
# Profiling workflow
if PROFILE:
# fmt: off
profile_operation("PyTorch", lambda: attention(q, k, v, k_cache, v_cache, pos), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_attention(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation("PyTorch", lambda: torch_attention(), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_attention(), device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on
check_error(lib.infiniopDestroyAttentionDescriptor(descriptor))
check_error(LIBINFINIOP.infiniopDestroyAttentionDescriptor(descriptor))
if __name__ == "__main__":
_TENSOR_DTYPES = [torch.float16, torch.float32]
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32]
# Tolerance map for different data types
_TOLERANCE_MAP = {
torch.float16: {"atol": 1e-4, "rtol": 1e-2},
torch.float32: {"atol": 1e-5, "rtol": 1e-3},
InfiniDtype.F16: {"atol": 1e-4, "rtol": 1e-2},
InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-3},
}
DEBUG = False
......@@ -284,45 +262,6 @@ if __name__ == "__main__":
),
]
args = get_args()
lib = open_lib()
lib.infiniopCreateAttentionDescriptor.restype = c_int32
lib.infiniopCreateAttentionDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopAttentionDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
c_uint64,
]
lib.infiniopGetAttentionWorkspaceSize.restype = c_int32
lib.infiniopGetAttentionWorkspaceSize.argtypes = [
infiniopAttentionDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopAttention.restype = c_int32
lib.infiniopAttention.argtypes = [
infiniopAttentionDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyAttentionDescriptor.restype = c_int32
lib.infiniopDestroyAttentionDescriptor.argtypes = [
infiniopAttentionDescriptor_t,
]
# Configure testing options
DEBUG = args.debug
......@@ -332,5 +271,5 @@ if __name__ == "__main__":
# Execute tests
for device in get_test_devices(args):
test_operator(lib, device, test, test_cases, _TENSOR_DTYPES)
test_operator(device, test, test_cases, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64
import ctypes
import sys
import os
import time
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
)
from operatorspy.tests.test_utils import get_args
import torch
from typing import Tuple
# constant for control whether profile the pytorch and lib functions
# NOTE: need to manually add synchronization function to the lib function,
# e.g., cudaDeviceSynchronize() for CUDA
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class AvgPoolDescriptor(Structure):
    # Opaque ctypes mirror of the C library's AvgPool descriptor.  Only the
    # leading device field is declared; the rest of the struct is owned by
    # the library.
    _fields_ = [("device", c_int32)]


# Pointer type passed across the C ABI for all AvgPool descriptor calls.
infiniopAvgPoolDescriptor_t = POINTER(AvgPoolDescriptor)
def pool(x, k, padding, stride, dilation=1):
    """Compute the PyTorch average-pooling reference for ``x``.

    Dispatches to AvgPool1d/2d/3d based on the number of spatial dims
    (``x`` is assumed to be batch x channels x spatial...).  Returns None
    (after printing an error) for unsupported ranks.  ``dilation`` is kept
    for signature compatibility but is not used by torch's AvgPool layers.
    """
    layer_by_ndim = {
        1: torch.nn.AvgPool1d,
        2: torch.nn.AvgPool2d,
        3: torch.nn.AvgPool3d,
    }
    spatial_ndim = len(x.shape) - 2
    layer_cls = layer_by_ndim.get(spatial_ndim)
    if layer_cls is None:
        print("Error: Pytorch -> Unsupported tensor dimension")
        return None
    pool_layer = layer_cls(k, stride=stride, padding=padding)
    if spatial_ndim == 3 and x.dtype == torch.float16:
        # 3-D fp16 pooling is computed in fp32 and cast back.
        ans = pool_layer(x.to(torch.float32)).to(torch.float16)
    else:
        ans = pool_layer(x)
    if PROFILE:
        torch.cuda.synchronize()
    return ans
def inferShape(x_shape, kernel_shape, padding, strides):
    """Infer the pooled output shape from the input shape and pooling params.

    The first two dims (batch, channels) are passed through; each spatial dim
    is reduced by the standard pooling formula (dim + 2*pad - k) // stride + 1.
    """
    assert (
        len(x_shape) - 2 == len(kernel_shape) == len(padding) == len(strides)
    ), "kernel, pads, and strides should have the same length; the length of input x should be 2 more than that of kernel"
    spatial = [
        (dim + 2 * p - k) // s + 1
        for dim, k, p, s in zip(x_shape[2:], kernel_shape, padding, strides)
    ]
    return x_shape[:2] + tuple(spatial)
# convert a python tuple to a ctype void pointer
def tuple_to_void_p(py_tuple: Tuple):
    """Convert a python tuple of ints to a ctypes void pointer (c_int64 array).

    The backing array is attached to the returned pointer object so it stays
    alive as long as the pointer does.  In the original version the array was
    a local that could be garbage-collected as soon as the function returned,
    leaving the C side with a dangling pointer.
    """
    data_array = (ctypes.c_int64 * len(py_tuple))(*py_tuple)
    ptr = ctypes.cast(data_array, ctypes.c_void_p)
    # Keep a reference so the buffer outlives this call.
    ptr._keepalive = data_array
    return ptr
def test(
    lib,
    handle,
    torch_device,
    x_shape,
    k_shape,
    padding,
    strides,
    tensor_dtype=torch.float16,
    sync=None
):
    """Run one AvgPool case and compare the library kernel against PyTorch.

    Computes a PyTorch reference with `pool`, creates the library descriptor,
    queries and allocates the workspace, launches the kernel, and asserts
    the two outputs match within tolerance.  When PROFILE is set, both the
    PyTorch and library paths are additionally timed.
    """
    print(
        f"Testing AvgPool on {torch_device} with x_shape:{x_shape} kernel_shape:{k_shape} padding:{padding} strides:{strides} dtype:{tensor_dtype}"
    )

    x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device)
    # y receives the library output; its shape follows from the pooling params.
    y = torch.rand(
        inferShape(x_shape, k_shape, padding, strides), dtype=tensor_dtype
    ).to(torch_device)

    # PyTorch reference (warm-up iterations only matter when profiling).
    for i in range(NUM_PRERUN if PROFILE else 1):
        ans = pool(x, k_shape, padding, strides)
    if PROFILE:
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            _ = pool(x, k_shape, padding, strides)
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f"pytorch time: {elapsed :6f}")

    x_tensor = to_tensor(x, lib)
    y_tensor = to_tensor(y, lib)
    if sync is not None:
        sync()

    descriptor = infiniopAvgPoolDescriptor_t()
    check_error(
        lib.infiniopCreateAvgPoolDescriptor(
            handle,
            ctypes.byref(descriptor),
            y_tensor.descriptor,
            x_tensor.descriptor,
            tuple_to_void_p(k_shape),
            tuple_to_void_p(padding),
            tuple_to_void_p(strides),
            len(k_shape),
        )
    )

    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    x_tensor.descriptor.contents.invalidate()
    y_tensor.descriptor.contents.invalidate()

    # Query the kernel's scratch requirement and back it with a uint8 tensor
    # on the same device.
    workspaceSize = ctypes.c_uint64(0)
    check_error(
        lib.infiniopGetAvgPoolWorkspaceSize(descriptor, ctypes.byref(workspaceSize))
    )
    workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(
        torch_device
    )
    workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8))

    # Library kernel (again, extra iterations only when profiling).
    for i in range(NUM_PRERUN if PROFILE else 1):
        check_error(
            lib.infiniopAvgPool(
                descriptor,
                workspace_ptr,
                workspaceSize,
                y_tensor.data,
                x_tensor.data,
                None,
            )
        )
    if PROFILE:
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            check_error(
                lib.infiniopAvgPool(
                    descriptor,
                    workspace_ptr,
                    workspaceSize,
                    y_tensor.data,
                    x_tensor.data,
                    None,
                )
            )
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f" lib time: {elapsed :6f}")

    # Relative-tolerance-only comparison against the PyTorch reference.
    assert torch.allclose(y, ans, atol=0, rtol=1e-3)
    check_error(lib.infiniopDestroyAvgPoolDescriptor(descriptor))
def test_cpu(lib, test_cases):
    """Run every AvgPool test case on the CPU device in fp16 and fp32."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CPU)
    for x_shape, kernel_shape, padding, strides in test_cases:
        for dt in (torch.float16, torch.float32):
            test(lib, handle, "cpu", x_shape, kernel_shape, padding, strides, tensor_dtype=dt)
    destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
    """Run every AvgPool test case on the CUDA device in fp16 and fp32."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CUDA)
    for x_shape, kernel_shape, padding, strides in test_cases:
        for dt in (torch.float16, torch.float32):
            test(lib, handle, "cuda", x_shape, kernel_shape, padding, strides, tensor_dtype=dt)
    destroy_handle(lib, handle)
def test_bang(lib, test_cases):
    """Run every AvgPool test case on the BANG (MLU) device in fp16 and fp32."""
    import torch_mlu

    handle = create_handle(lib, DeviceEnum.DEVICE_BANG)
    for x_shape, kernel_shape, padding, strides in test_cases:
        for dt in (torch.float16, torch.float32):
            test(lib, handle, "mlu", x_shape, kernel_shape, padding, strides, tensor_dtype=dt)
    destroy_handle(lib, handle)
if __name__ == "__main__":
    # Test configurations covering 1-D, 2-D and 3-D average pooling.
    test_cases = [
        # fmt: off
        # x_shape, kernel_shape, padding, strides
        ((1, 1, 10), (3,), (1,), (1,)),
        ((32, 3, 224, 224), (3, 3), (1, 1), (2, 2)),
        ((1, 1, 16, 16, 16), (5, 5, 5), (2, 2, 2), (2, 2, 2)),
        # fmt: on
    ]
    args = get_args()
    lib = open_lib()

    # Declare ctypes prototypes for the AvgPool C API so arguments are
    # marshalled (and lightly checked) on every call.
    lib.infiniopCreateAvgPoolDescriptor.restype = c_int32
    lib.infiniopCreateAvgPoolDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopAvgPoolDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        c_void_p,
        c_void_p,
        c_void_p,
        c_uint64,
    ]
    lib.infiniopGetAvgPoolWorkspaceSize.restype = c_int32
    lib.infiniopGetAvgPoolWorkspaceSize.argtypes = [
        infiniopAvgPoolDescriptor_t,
        POINTER(c_uint64),
    ]
    lib.infiniopAvgPool.restype = c_int32
    lib.infiniopAvgPool.argtypes = [
        infiniopAvgPoolDescriptor_t,
        c_void_p,
        c_uint64,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyAvgPoolDescriptor.restype = c_int32
    lib.infiniopDestroyAvgPoolDescriptor.argtypes = [
        infiniopAvgPoolDescriptor_t,
    ]

    # Run on each device selected by the CLI flags; default to CPU when no
    # device flag is given.
    if args.cpu:
        test_cpu(lib, test_cases)
    if args.cuda:
        test_cuda(lib, test_cases)
    if args.bang:
        test_bang(lib, test_cases)
    if not (args.cpu or args.cuda or args.bang):
        test_cpu(lib, test_cases)
    print("\033[92mTest passed!\033[0m")
import torch
import ctypes
from ctypes import POINTER, Structure, c_int32, c_size_t, c_uint64, c_void_p, c_float
from ctypes import c_uint64
from libinfiniop import (
infiniopHandle_t,
infiniopTensorDescriptor_t,
open_lib,
to_tensor,
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
rearrange_if_needed,
create_workspace,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
from enum import Enum, auto
......@@ -34,13 +35,13 @@ _TEST_CASES_ = [
]
# Data types used for testing
_TENSOR_DTYPES = [torch.float16, torch.bfloat16, torch.float32]
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32]
# Tolerance map for different data types
_TOLERANCE_MAP = {
torch.float16: {"atol": 1e-3, "rtol": 1e-2},
torch.bfloat16: {"atol": 5e-3, "rtol": 5e-2},
torch.float32: {"atol": 1e-5, "rtol": 1e-5},
InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-2},
InfiniDtype.BF16: {"atol": 5e-3, "rtol": 5e-2},
InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5},
}
......@@ -66,13 +67,6 @@ NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class CausalSoftmaxDescriptor(Structure):
    # Opaque ctypes mirror of the C library's CausalSoftmax descriptor.
    # Only the leading device field is declared; the rest of the struct is
    # owned by the library.
    _fields_ = [("device", c_int32)]


# Pointer type passed across the C ABI for all CausalSoftmax descriptor calls.
infiniopCausalSoftmaxDescriptor_t = POINTER(CausalSoftmaxDescriptor)
def causal_softmax(x):
type = x.dtype
mask = torch.tril(torch.ones_like(x), diagonal=-1).flip(dims=[-2, -1])
......@@ -81,66 +75,57 @@ def causal_softmax(x):
def test(
lib,
handle,
torch_device,
device,
shape,
x_stride=None,
y_stride=None,
inplace=Inplace.OUT_OF_PLACE,
dtype=torch.float16,
dtype=InfiniDtype.F16,
sync=None,
):
print(
f"Testing CausalSoftmax on {torch_device} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} dtype:{dtype} inplace:{inplace}"
f"Testing CausalSoftmax on {InfiniDeviceNames[device]} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
)
x = torch.rand(shape, dtype=dtype).to(torch_device)
mask = torch.tril(torch.ones_like(x), diagonal=-1).flip(dims=[-2, -1])
x = torch.where(mask == 1, torch.full_like(x, torch.finfo(x.dtype).max), x)
ans = causal_softmax(x)
x = rearrange_if_needed(x, x_stride)
x_tensor = to_tensor(x, lib)
x = TestTensor(shape, x_stride, dtype, device)
ans = causal_softmax(x.torch_tensor())
if inplace == Inplace.INPLACE_X:
y = x
y_tensor = x_tensor
else:
y = torch.zeros(shape, dtype=dtype).to(torch_device)
y = rearrange_if_needed(y, y_stride)
y_tensor = to_tensor(y, lib)
y = TestTensor(shape, x_stride, dtype, device)
if sync is not None:
sync()
descriptor = infiniopCausalSoftmaxDescriptor_t()
descriptor = infiniopOperatorDescriptor_t()
check_error(
lib.infiniopCreateCausalSoftmaxDescriptor(
handle, ctypes.byref(descriptor), y_tensor.descriptor, x_tensor.descriptor
LIBINFINIOP.infiniopCreateCausalSoftmaxDescriptor(
handle, ctypes.byref(descriptor), y.descriptor, x.descriptor
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
x_tensor.destroyDesc(lib)
x.destroy_desc()
y.destroy_desc()
workspace_size = c_uint64(0)
check_error(
lib.infiniopGetCausalSoftmaxWorkspaceSize(
LIBINFINIOP.infiniopGetCausalSoftmaxWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = create_workspace(workspace_size.value, x.device)
workspace = TestWorkspace(workspace_size.value, x.device)
def lib_causal_softmax():
check_error(
lib.infiniopCausalSoftmax(
LIBINFINIOP.infiniopCausalSoftmax(
descriptor,
workspace.data_ptr() if workspace is not None else None,
workspace.data(),
workspace_size.value,
y_tensor.data,
x_tensor.data,
y.data(),
x.data(),
None,
)
)
......@@ -152,49 +137,21 @@ def test(
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(y, ans, atol=atol, rtol=rtol)
assert torch.allclose(y, ans, atol=atol, rtol=rtol)
debug(y.actual_tensor(), ans, atol=atol, rtol=rtol)
assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol)
# Profiling workflow
if PROFILE:
# fmt: off
profile_operation("PyTorch", lambda: causal_softmax(x), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_causal_softmax(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation("PyTorch", lambda: causal_softmax(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_causal_softmax(), device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on
check_error(lib.infiniopDestroyCausalSoftmaxDescriptor(descriptor))
check_error(LIBINFINIOP.infiniopDestroyCausalSoftmaxDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
lib = open_lib()
lib.infiniopCreateCausalSoftmaxDescriptor.restype = c_int32
lib.infiniopCreateCausalSoftmaxDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopCausalSoftmaxDescriptor_t),
infiniopTensorDescriptor_t,
]
lib.infiniopGetCausalSoftmaxWorkspaceSize.restype = c_int32
lib.infiniopGetCausalSoftmaxWorkspaceSize.argtypes = [
infiniopCausalSoftmaxDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopCausalSoftmax.restype = c_int32
lib.infiniopCausalSoftmax.argtypes = [
infiniopCausalSoftmaxDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
]
lib.infiniopDestroyCausalSoftmaxDescriptor.restype = c_int32
lib.infiniopDestroyCausalSoftmaxDescriptor.argtypes = [
infiniopCausalSoftmaxDescriptor_t,
]
# Configure testing options
DEBUG = args.debug
......@@ -203,6 +160,6 @@ if __name__ == "__main__":
NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args):
test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES)
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
......@@ -2,21 +2,22 @@
import torch
import ctypes
from ctypes import POINTER, Structure, c_int32, c_size_t, c_uint64, c_void_p, c_float
from ctypes import c_uint64
from libinfiniop import (
infiniopHandle_t,
infiniopTensorDescriptor_t,
open_lib,
to_tensor,
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
rearrange_if_needed,
create_workspace,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
from enum import Enum, auto
......@@ -38,29 +39,26 @@ _TEST_CASES_ = [
((5, 10), None, None, -2.0, 0.0),
((2, 3, 4), None, None, -2.0, 0.0),
# 奇怪形状测试
((7, 13), None, None, -1.0, 1.0), # 质数维度
((3, 5, 7), None, None, -1.0, 1.0), # 三维质数
((7, 13), None, None, -1.0, 1.0), # 质数维度
((3, 5, 7), None, None, -1.0, 1.0), # 三维质数
# 非标准形状测试
((1, 1), None, None, -1.0, 1.0), # 最小形状
((100, 100), None, None, -1.0, 1.0), # 大形状
((16, 16, 16), None, None, -1.0, 1.0), # 大三维
((1, 1), None, None, -1.0, 1.0), # 最小形状
((100, 100), None, None, -1.0, 1.0), # 大形状
((16, 16, 16), None, None, -1.0, 1.0), # 大三维
# 极端值测试
((10,), None, None, -1000.0, 1000.0), # 大范围
((10,), None, None, -0.001, 0.001), # 小范围
((10,), None, None, 0.0, 0.0), # min=max
# 特殊形状测试
((0,), None, None, -1.0, 1.0), # 空张量
((1, 0), None, None, -1.0, 1.0), # 空维度
((10,), None, None, -0.001, 0.001), # 小范围
((10,), None, None, 0.0, 0.0), # min=max
]
_TENSOR_DTYPES = [torch.float16, torch.float32]
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16]
_TOLERANCE_MAP = {
torch.float16: {"atol": 1e-3, "rtol": 1e-3},
torch.float32: {"atol": 1e-7, "rtol": 1e-6},
InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-6},
InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-3},
}
......@@ -86,154 +84,108 @@ NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class ClipDescriptor(Structure):
    # Opaque ctypes mirror of the C library's Clip descriptor.  Unlike the
    # other descriptors in this suite it declares both a device type and a
    # device id field; the rest of the struct is owned by the library.
    _fields_ = [("device_type", c_int32), ("device_id", c_int32)]


# Pointer type passed across the C ABI for all Clip descriptor calls.
infiniopClipDescriptor_t = POINTER(ClipDescriptor)
def clip(x, min_val, max_val):
    """Return ``x`` with every element clamped into ``[min_val, max_val]``."""
    return torch.clamp(input=x, min=min_val, max=max_val)
def create_tensor_with_stride(shape, stride, dtype, device):
    """Create a random tensor with values in [-2, 2] and a best-effort stride.

    Only 2-D layouts are honored explicitly:
      * row-major ``(shape[1], 1)`` — a plain contiguous tensor;
      * column-major ``(1, shape[0])`` — storage transposed, then viewed back.
    Any other requested stride (or non-2-D shape) falls back to a contiguous
    tensor, matching the original behavior.
    """
    x = torch.rand(shape, dtype=dtype, device=device) * 4.0 - 2.0  # range [-2, 2]
    if stride is None:
        return x
    if len(shape) == 2 and len(stride) == 2:
        if stride == (shape[1], 1):
            # Row-major request: torch.rand already yields this layout.
            return x.contiguous()
        elif stride == (1, shape[0]):
            # Column-major request: materialize transposed storage, view back.
            return x.transpose(0, 1).contiguous().transpose(0, 1)
        else:
            # Arbitrary strides are unsupported. The original copied every
            # element in a Python double loop (O(n^2)) and then called
            # .contiguous() — equivalent to a contiguous clone, done here
            # in a single C-level copy.
            return x.clone().contiguous()
    return x
def clip(y, x, min_val, max_val):
    """Clamp ``x`` element-wise into [min_val, max_val], storing the result in ``y``.

    Returns None; ``y`` is mutated in place as the reference output.
    """
    y.copy_(x.clamp(min_val, max_val))
def test(
lib,
handle,
torch_device,
device,
shape,
x_stride=None,
y_stride=None,
min_val=-1.0,
max_val=1.0,
inplace=Inplace.OUT_OF_PLACE,
dtype=torch.float32,
dtype=InfiniDtype.F32,
sync=None,
):
print(
f"Testing Clip on {torch_device} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} "
f"min_val:{min_val} max_val:{max_val} dtype:{dtype} inplace:{inplace}"
x = TestTensor(shape, x_stride, dtype, device)
min_ = TestTensor(
shape, [0 for _ in shape], dtype, device, mode="zeros", bias=min_val
)
max_ = TestTensor(
shape, [0 for _ in shape], dtype, device, mode="zeros", bias=max_val
)
x = create_tensor_with_stride(shape, x_stride, dtype, torch_device)
ans = clip(x, min_val, max_val)
x = rearrange_if_needed(x, x_stride)
x_tensor = to_tensor(x, lib)
if inplace == Inplace.INPLACE_X:
if x_stride != y_stride:
return
y = x
y_tensor = x_tensor
else:
y = torch.zeros(shape, dtype=dtype).to(torch_device)
y = rearrange_if_needed(y, y_stride)
y_tensor = to_tensor(y, lib)
descriptor = infiniopClipDescriptor_t()
y = TestTensor(shape, y_stride, dtype, device)
if y.is_broadcast():
return
print(
f"Testing Clip on {InfiniDeviceNames[device]} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} "
f"min_val:{min_val} max_val:{max_val} dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
)
clip(y.torch_tensor(), x.torch_tensor(), min_val, max_val)
if sync is not None:
sync()
descriptor = infiniopOperatorDescriptor_t()
check_error(
lib.infiniopCreateClipDescriptor(
handle, ctypes.byref(descriptor), y_tensor.descriptor, x_tensor.descriptor
LIBINFINIOP.infiniopCreateClipDescriptor(
handle,
ctypes.byref(descriptor),
y.descriptor,
x.descriptor,
min_.descriptor,
max_.descriptor,
)
)
workspace_size = c_uint64(0)
check_error(
lib.infiniopGetClipWorkspaceSize(
LIBINFINIOP.infiniopGetClipWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = create_workspace(workspace_size.value, x.device)
workspace = TestWorkspace(workspace_size.value, x.device)
def lib_clip():
check_error(
lib.infiniopClip(
LIBINFINIOP.infiniopClip(
descriptor,
workspace.data_ptr() if workspace is not None else None,
workspace.data() if workspace is not None else None,
workspace_size.value,
y_tensor.data,
x_tensor.data,
c_float(min_val),
c_float(max_val),
y.data(),
x.data(),
min_.data(),
max_.data(),
None,
)
)
lib_clip()
# Now we can destroy the tensor descriptors
x_tensor.destroyDesc(lib)
if inplace != Inplace.INPLACE_X:
y_tensor.destroyDesc(lib)
# Destroy the tensor descriptors
for tensor in [x, y, min_, max_]:
tensor.destroy_desc()
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG or not torch.allclose(y, ans, atol=atol, rtol=rtol):
print("\nExpected:")
print(ans)
print("\nActual:")
print(y)
print("\nDifference:")
print(torch.abs(y - ans))
print("\nMax difference:", torch.max(torch.abs(y - ans)).item())
debug(y, ans, atol=atol, rtol=rtol)
assert torch.allclose(y, ans, atol=atol, rtol=rtol)
if DEBUG:
debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
assert torch.allclose(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
# Profiling workflow
if PROFILE:
# fmt: off
profile_operation("PyTorch", lambda: clip(x, min_val, max_val), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_clip(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation("PyTorch", lambda: clip(y.torch_tensor(), x.torch_tensor(), min_val, max_val), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_clip(), device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on
check_error(lib.infiniopDestroyClipDescriptor(descriptor))
check_error(LIBINFINIOP.infiniopDestroyClipDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
lib = open_lib()
lib.infiniopCreateClipDescriptor.restype = c_int32
lib.infiniopCreateClipDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopClipDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopGetClipWorkspaceSize.restype = c_int32
lib.infiniopGetClipWorkspaceSize.argtypes = [
infiniopClipDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopClip.restype = c_int32
lib.infiniopClip.argtypes = [
infiniopClipDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_float,
c_float,
c_void_p,
]
lib.infiniopDestroyClipDescriptor.restype = c_int32
lib.infiniopDestroyClipDescriptor.argtypes = [
infiniopClipDescriptor_t,
]
# Configure testing options
DEBUG = args.debug
PROFILE = args.profile
......@@ -241,6 +193,6 @@ if __name__ == "__main__":
NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args):
test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES)
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p
import torch
import ctypes
import sys
import os
import time
from ctypes import c_uint64
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
from libinfiniop import (
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
from operatorspy.tests.test_utils import get_args
import torch
from enum import Enum, auto
from typing import List, Tuple
import math
import ctypes
from torch.nn import functional as F
from typing import List, Tuple
# constant for control whether profile the pytorch and lib functions
# NOTE: need to manually add synchronization function to the lib function,
......@@ -29,36 +29,104 @@ from typing import List, Tuple
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
_TEST_CASES = [
# x_shape, x_stride, w_shape, w_stride, pads, strides, dilations, x_strides
(
(32, 3, 4),
(12, 4, 1),
(32, 3, 5),
(15, 5, 1),
(1,),
(1,),
(1,),
),
(
(1, 3, 4, 4),
(48, 16, 4, 1),
(2, 3, 3, 3),
(27, 9, 3, 1),
(1, 1),
(1, 2),
(2, 1),
),
(
(32, 3, 32, 32),
(32 * 32 * 3, 32 * 32, 32, 1),
(64, 3, 5, 5),
(75, 25, 5, 1),
(2, 2),
(2, 2),
(1, 1),
),
(
(1, 1, 4, 4, 4),
(64, 64, 16, 4, 1),
(1, 1, 5, 5, 5),
(125, 125, 25, 5, 1),
(1, 1, 1),
(1, 1, 1),
(1, 1, 1),
),
(
(32, 3, 32, 32, 32),
(32 * 32 * 32 * 3, 32 * 32 * 32, 32 * 32, 32, 1),
(64, 3, 5, 5, 5),
(375, 125, 25, 5, 1),
(3, 2, 2),
(4, 3, 3),
(2, 2, 1),
),
]
# Data types used for testing
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16]
# Tolerance map for different data types
_TOLERANCE_MAP = {
InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5},
InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-2},
}
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
# ctypes stand-in for the C library's conv operator descriptor; treated as an
# opaque handle on the Python side (created via infiniopCreateConvDescriptor).
class ConvDescriptor(Structure):
    # NOTE(review): single field named "device" — presumably the device
    # ordinal/type the descriptor was built for; confirm against the C header.
    _fields_ = [("device", c_int32)]
# Handle type passed across the C ABI: a pointer to ConvDescriptor.
infiniopConvDescriptor_t = POINTER(ConvDescriptor)
def conv(x, w, stride, padding, dilation):
def conv(x, w, stride, padding, dilation, y_tensor, bias=None):
match len(x.shape) - 2:
case 1:
return F.conv1d(x, w, stride=stride, padding=padding, dilation=dilation)
y_tensor.copy_(
F.conv1d(
x, w, bias=bias, stride=stride, padding=padding, dilation=dilation
)
)
case 2:
return F.conv2d(x, w, stride=stride, padding=padding, dilation=dilation)
y_tensor.copy_(
F.conv2d(
x, w, bias=bias, stride=stride, padding=padding, dilation=dilation
)
)
case 3:
return F.conv3d(x, w, stride=stride, padding=padding, dilation=dilation)
y_tensor.copy_(
F.conv3d(
x, w, bias=bias, stride=stride, padding=padding, dilation=dilation
)
)
case _:
print("Error: Pytorch -> Unsupported tensor dimension")
return None
# infer the shape of the output given the inputs for a N-ary convolution
def inferShape(
def inferShapeStride(
x_shape: List[int],
w_shape: List[int],
pads: List[int],
strides: List[int],
dilations: List[int],
) -> Tuple[int, ...]:
) -> Tuple[Tuple[int, ...], Tuple[int, ...]]:
assert (
len(x_shape)
== len(w_shape)
......@@ -74,7 +142,12 @@ def inferShape(
)
for i in range(len(pads))
]
return (x_shape[0], w_shape[0]) + tuple(output_dims)
output_shape = (x_shape[0], w_shape[0]) + tuple(output_dims)
output_strides = [1]
for s in reversed(output_shape[1:]):
output_strides.insert(0, output_strides[0] * s)
output_strides = tuple(output_strides)
return output_shape, output_strides
# convert a python tuple to a ctype void pointer
......@@ -85,52 +158,54 @@ def tuple_to_void_p(py_tuple: Tuple):
def test(
lib,
handle,
torch_device,
device,
x_shape,
x_stride,
w_shape,
w_stride,
pads,
strides,
dilations,
tensor_stride=None,
tensor_dtype=torch.float16,
sync=None
tensor_dtype=InfiniDtype.F16,
sync=None,
):
assert len(pads) == len(strides) == len(dilations)
x = TestTensor(x_shape, x_stride, dt=tensor_dtype, device=device, scale=0.01)
w = TestTensor(w_shape, w_stride, dt=tensor_dtype, device=device, scale=0.01)
y_shape, y_stride = inferShapeStride(x_shape, w_shape, pads, strides, dilations)
y = TestTensor(y_shape, y_stride, dt=tensor_dtype, device=device)
b = (
TestTensor((w.shape[0],), (1,), dt=tensor_dtype, device=device, scale=0.01)
if w.shape[0] > 1
else None
)
print(
f"Testing Conv on {torch_device} with x_shape: {x_shape}, w_shape: {w_shape}, b_shape: {w_shape[0]}, pads: {pads}, strides: {strides}, dilations: {dilations}, x_stride: {tensor_stride} dtype:{tensor_dtype}"
f"Testing Conv on {InfiniDeviceNames[device]} with x_shape: {x_shape}, w_shape: {w_shape}, b_shape: {w_shape[0]}, pads: {pads}, strides: {strides}, dilations: {dilations}, x_stride: {x_stride} dtype:{InfiniDtypeNames[tensor_dtype]}"
)
conv(
x.torch_tensor(),
w.torch_tensor(),
strides,
pads,
dilations,
y.torch_tensor(),
b.torch_tensor() if b is not None else None,
)
x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device)
w = torch.rand(w_shape, dtype=tensor_dtype).to(torch_device)
y = torch.zeros(
inferShape(x.shape, w.shape, pads, strides, dilations), dtype=tensor_dtype
).to(torch_device)
for i in range(NUM_PRERUN if PROFILE else 1):
ans = conv(x, w, strides, pads, dilations)
if PROFILE:
start_time = time.time()
for i in range(NUM_ITERATIONS):
_ = conv(x, w, strides, pads, dilations)
elapsed = (time.time() - start_time) / NUM_ITERATIONS
print(f"pytorch time: {elapsed :6f}")
x_tensor = to_tensor(x, lib)
w_tensor = to_tensor(w, lib)
y_tensor = to_tensor(y, lib)
if sync is not None:
sync()
descriptor = infiniopConvDescriptor_t()
descriptor = infiniopOperatorDescriptor_t()
check_error(
lib.infiniopCreateConvDescriptor(
LIBINFINIOP.infiniopCreateConvDescriptor(
handle,
ctypes.byref(descriptor),
y_tensor.descriptor,
x_tensor.descriptor,
w_tensor.descriptor,
y.descriptor,
x.descriptor,
w.descriptor,
b.descriptor if b is not None else None,
tuple_to_void_p(pads),
tuple_to_void_p(strides),
tuple_to_void_p(dilations),
......@@ -139,169 +214,56 @@ def test(
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
x_tensor.descriptor.contents.invalidate()
w_tensor.descriptor.contents.invalidate()
y_tensor.descriptor.contents.invalidate()
for tensor in [x, y, w, b]:
if tensor is not None:
tensor.destroy_desc()
workspaceSize = ctypes.c_uint64(0)
workspace_size = ctypes.c_uint64(0)
check_error(
lib.infiniopGetConvWorkspaceSize(descriptor, ctypes.byref(workspaceSize))
)
workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(
torch_device
LIBINFINIOP.infiniopGetConvWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8))
workspace = TestWorkspace(workspace_size.value, y.device)
for i in range(NUM_PRERUN if PROFILE else 1):
def lib_conv():
check_error(
lib.infiniopConv(
LIBINFINIOP.infiniopConv(
descriptor,
workspace_ptr,
workspaceSize,
y_tensor.data,
x_tensor.data,
w_tensor.data,
workspace.data(),
workspace_size.value,
y.data(),
x.data(),
w.data(),
b.data() if b is not None else None,
None,
)
)
if PROFILE:
start_time = time.time()
for i in range(NUM_ITERATIONS):
check_error(
lib.infiniopConv(
descriptor,
workspace_ptr,
workspaceSize,
y_tensor.data,
x_tensor.data,
w_tensor.data,
None,
)
)
elapsed = (time.time() - start_time) / NUM_ITERATIONS
print(f" lib time: {elapsed :6f}")
if tensor_dtype == torch.float16:
assert torch.allclose(y, ans, atol=0, rtol=1e-2)
else:
assert torch.allclose(y, ans, atol=0, rtol=1e-3)
check_error(lib.infiniopDestroyConvDescriptor(descriptor))
def test_cpu(lib, test_cases):
device = DeviceEnum.DEVICE_CPU
handle = create_handle(lib, device)
for x_shape, w_shape, pads, strides, dilations, x_strides in test_cases:
# fmt: off
test(lib, handle, "cpu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float16)
test(lib, handle, "cpu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float32)
# fmt: on
destroy_handle(lib, handle)
lib_conv()
atol, rtol = get_tolerance(_TOLERANCE_MAP, tensor_dtype)
if DEBUG:
debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
assert torch.allclose(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
def test_cuda(lib, test_cases):
device = DeviceEnum.DEVICE_CUDA
handle = create_handle(lib, device)
for x_shape, w_shape, pads, strides, dilations, x_strides in test_cases:
# fmt: off
test(lib, handle, "cuda", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float16)
test(lib, handle, "cuda", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float32)
# fmt: on
destroy_handle(lib, handle)
def test_bang(lib, test_cases):
import torch_mlu
device = DeviceEnum.DEVICE_BANG
handle = create_handle(lib, device)
for x_shape, w_shape, pads, strides, dilations, x_strides in test_cases:
# Profiling workflow
if PROFILE:
# fmt: off
test(lib, handle, "mlu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float16)
test(lib, handle, "mlu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float32)
profile_operation("PyTorch", lambda: conv(x.torch_tensor(), w.torch_tensor(), strides, pads, dilations, b.torch_tensor() if b is not None else None), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_conv(), device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on
destroy_handle(lib, handle)
check_error(LIBINFINIOP.infiniopDestroyConvDescriptor(descriptor))
if __name__ == "__main__":
test_cases = [
# x_shape, w_shape, pads, strides, dilations, x_strides
(
(32, 3, 4),
(32, 3, 5),
(1,),
(1,),
(1,),
None,
),
(
(1, 3, 4, 4),
(2, 3, 3, 3),
(1, 1),
(1, 2),
(2, 1),
None,
),
(
(32, 3, 128, 128),
(64, 3, 5, 5),
(2, 2),
(2, 2),
(1, 1),
None,
),
(
(1, 1, 4, 4, 4),
(1, 1, 5, 5, 5),
(1, 1, 1),
(1, 1, 1),
(1, 1, 1),
None,
),
(
(32, 3, 32, 32, 32),
(64, 3, 5, 5, 5),
(3, 2, 2),
(4, 3, 3),
(2, 2, 1),
None,
),
]
args = get_args()
lib = open_lib()
lib.infiniopCreateConvDescriptor.restype = c_int32
lib.infiniopCreateConvDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopConvDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
c_void_p,
c_void_p,
c_void_p,
c_uint64,
]
lib.infiniopConv.restype = c_int32
lib.infiniopConv.argtypes = [
infiniopConvDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyConvDescriptor.restype = c_int32
lib.infiniopDestroyConvDescriptor.argtypes = [
infiniopConvDescriptor_t,
]
if args.cpu:
test_cpu(lib, test_cases)
if args.cuda:
test_cuda(lib, test_cases)
if args.bang:
test_bang(lib, test_cases)
if not (args.cpu or args.cuda or args.bang):
test_cpu(lib, test_cases)
# Configure testing options
DEBUG = args.debug
PROFILE = args.profile
NUM_PRERUN = args.num_prerun
NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args):
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_void_p
import ctypes
import sys
import os
import time
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
rearrange_tensor,
)
from operatorspy.tests.test_utils import get_args
import torch
# constant for control whether profile the pytorch and lib functions
# NOTE: need to manually add synchronization function to the lib function,
# e.g., cudaDeviceSynchronize() for CUDA
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
# ctypes stand-in for the C library's expand operator descriptor; an opaque
# handle created via infiniopCreateExpandDescriptor.
class ExpandDescriptor(Structure):
    # NOTE(review): "device" presumably identifies the target device — field
    # name only; confirm against the C header.
    _fields_ = [("device", c_int32)]
# Handle type passed across the C ABI: a pointer to ExpandDescriptor.
infiniopExpandDescriptor_t = POINTER(ExpandDescriptor)
def expand(x, y):
    """Reference implementation: broadcast ``x`` to ``y``'s shape.

    When profiling, the expanded view is cloned (forcing materialization)
    and the CUDA device is synchronized so wall-clock timing is meaningful.

    Fix: the original called torch.cuda.synchronize() unconditionally under
    PROFILE, which raises on CPU-only builds/machines even though this
    harness also runs on the "cpu" device — guard on CUDA availability.
    """
    if PROFILE:
        ans = x.expand_as(y).clone()
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        return ans
    return x.expand_as(y)
def test(
lib,
handle,
torch_device,
y_shape,
x_shape,
y_stride=None,
x_stride=None,
tensor_dtype=torch.float16,
sync=None
):
print(
f"Testing Expand on {torch_device} with x_shape:{x_shape} y_shape:{y_shape} x_stride:{x_stride} y_stride:{y_stride} dtype:{tensor_dtype}"
)
x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device)
y = torch.rand(y_shape, dtype=tensor_dtype).to(torch_device)
if x_stride is not None:
x = rearrange_tensor(x, x_stride)
if y_stride is not None:
y = rearrange_tensor(y, y_stride)
for i in range(NUM_PRERUN if PROFILE else 1):
ans = expand(x, y)
if PROFILE:
start_time = time.time()
for i in range(NUM_ITERATIONS):
_ = expand(x, y)
elapsed = (time.time() - start_time) / NUM_ITERATIONS
print(f"pytorch time: {elapsed :6f}")
x_tensor = to_tensor(x, lib)
y_tensor = to_tensor(y, lib)
if sync is not None:
sync()
descriptor = infiniopExpandDescriptor_t()
check_error(
lib.infiniopCreateExpandDescriptor(
handle,
ctypes.byref(descriptor),
y_tensor.descriptor,
x_tensor.descriptor,
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
x_tensor.descriptor.contents.invalidate()
y_tensor.descriptor.contents.invalidate()
for i in range(NUM_PRERUN if PROFILE else 1):
check_error(lib.infiniopExpand(descriptor, y_tensor.data, x_tensor.data, None))
if PROFILE:
start_time = time.time()
for i in range(NUM_ITERATIONS):
check_error(
lib.infiniopExpand(descriptor, y_tensor.data, x_tensor.data, None)
)
elapsed = (time.time() - start_time) / NUM_ITERATIONS
print(f" lib time: {elapsed :6f}")
assert torch.allclose(y, ans, atol=0, rtol=1e-3)
check_error(lib.infiniopDestroyExpandDescriptor(descriptor))
def test_cpu(lib, test_cases):
device = DeviceEnum.DEVICE_CPU
handle = create_handle(lib, device)
for y_shape, x_shape, y_stride, x_stride in test_cases:
# fmt: off
test(lib, handle, "cpu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float16)
test(lib, handle, "cpu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float32)
# fmt: on
destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
device = DeviceEnum.DEVICE_CUDA
handle = create_handle(lib, device)
for y_shape, x_shape, y_stride, x_stride in test_cases:
# fmt: off
test(lib, handle, "cuda", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float16)
test(lib, handle, "cuda", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float32)
# fmt: on
destroy_handle(lib, handle)
def test_bang(lib, test_cases):
import torch_mlu
device = DeviceEnum.DEVICE_BANG
handle = create_handle(lib, device)
for y_shape, x_shape, y_stride, x_stride in test_cases:
# fmt: off
test(lib, handle, "mlu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float16)
test(lib, handle, "mlu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float32)
# fmt: on
destroy_handle(lib, handle)
if __name__ == "__main__":
test_cases = [
# fmt: off
# y_shape, x_shape, y_stride, x_stride
((), (), None, None),
((3, 3), (1,), None, None),
((5, 4, 3), (4, 3,), None, (6, 1)),
((99, 111), (111,), None, None),
((2, 4, 3), (1, 3), None, None),
((2, 20, 3), (2, 1, 3), None, None),
((2, 3, 4, 5), (5,), None, None),
((3, 2, 4, 5), (3, 2, 1, 1), None, None),
((32, 256, 112, 112), (32, 256, 112, 1), None, None),
# fmt: on
]
args = get_args()
lib = open_lib()
lib.infiniopCreateExpandDescriptor.restype = c_int32
lib.infiniopCreateExpandDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopExpandDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopExpand.restype = c_int32
lib.infiniopExpand.argtypes = [
infiniopExpandDescriptor_t,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyExpandDescriptor.restype = c_int32
lib.infiniopDestroyExpandDescriptor.argtypes = [
infiniopExpandDescriptor_t,
]
if args.cpu:
test_cpu(lib, test_cases)
if args.cuda:
test_cuda(lib, test_cases)
if args.bang:
test_bang(lib, test_cases)
if not (args.cpu or args.cuda or args.bang):
test_cpu(lib, test_cases)
print("\033[92mTest passed!\033[0m")
import torch
import ctypes
from ctypes import POINTER, Structure, c_int32, c_size_t, c_uint64, c_void_p, c_float
from ctypes import c_uint64
from libinfiniop import (
infiniopHandle_t,
infiniopTensorDescriptor_t,
open_lib,
to_tensor,
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
rearrange_if_needed,
create_workspace,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
# ==============================================================================
......@@ -31,13 +32,13 @@ _TEST_CASES = [
]
# Data types used for testing
_TENSOR_DTYPES = [torch.float16, torch.float32, torch.bfloat16]
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32]
# Tolerance map for different data types
_TOLERANCE_MAP = {
torch.float16: {"atol": 0, "rtol": 1e-2},
torch.float32: {"atol": 0, "rtol": 1e-3},
torch.bfloat16: {"atol": 0, "rtol": 5e-2},
InfiniDtype.F16: {"atol": 0, "rtol": 1e-2},
InfiniDtype.F32: {"atol": 0, "rtol": 1e-3},
InfiniDtype.BF16: {"atol": 0, "rtol": 5e-2},
}
DEBUG = False
......@@ -46,16 +47,6 @@ NUM_PRERUN = 10
NUM_ITERATIONS = 1000
# ==============================================================================
# Definitions
# ==============================================================================
# ctypes stand-in for the C library's GEMM operator descriptor; an opaque
# handle created via infiniopCreateGemmDescriptor.
class GemmDescriptor(Structure):
    # NOTE(review): "device" presumably identifies the target device — field
    # name only; confirm against the C header.
    _fields_ = [("device", c_int32)]
# Handle type passed across the C ABI: a pointer to GemmDescriptor.
infiniopGemmDescriptor_t = POINTER(GemmDescriptor)
# PyTorch implementation for matrix multiplication
def gemm(d, _c, beta, _a, _b, alpha):
try:
......@@ -73,9 +64,8 @@ def gemm(d, _c, beta, _a, _b, alpha):
# The argument list should be (lib, handle, torch_device, <param list>, dtype)
# The <param list> should keep the same order as the one specified in _TEST_CASES
def test(
lib,
handle,
torch_device,
device,
alpha,
beta,
a_shape,
......@@ -84,65 +74,71 @@ def test(
a_stride=None,
b_stride=None,
c_stride=None,
dtype=torch.float16,
dtype=InfiniDtype.F16,
sync=None,
):
print(
f"Testing Gemm on {torch_device} with alpha:{alpha}, beta:{beta},"
f"Testing Gemm on {InfiniDeviceNames[device]} with alpha:{alpha}, beta:{beta},"
f" a_shape:{a_shape}, b_shape:{b_shape}, c_shape:{c_shape},"
f" a_stride:{a_stride}, b_stride:{b_stride}, c_stride:{c_stride}, dtype:{dtype}"
f" a_stride:{a_stride}, b_stride:{b_stride}, c_stride:{c_stride}, dtype:{InfiniDtypeNames[dtype]}"
)
# Initialize tensors
a = torch.rand(a_shape, dtype=dtype).to(torch_device)
b = torch.rand(b_shape, dtype=dtype).to(torch_device)
c = torch.ones(c_shape, dtype=dtype).to(torch_device)
ans = torch.zeros(c_shape, dtype=dtype).to(torch_device)
a = TestTensor(a_shape, a_stride, dtype, device)
b = TestTensor(b_shape, b_stride, dtype, device)
c = TestTensor(c_shape, c_stride, dtype, device, mode="ones")
ans = TestTensor(c_shape, c_stride, dtype, device, mode="zeros")
# Compute the PyTorch reference result
gemm(ans, c, beta, a, b, alpha)
def torch_gemm():
gemm(
ans.torch_tensor(),
c.torch_tensor(),
beta,
a.torch_tensor(),
b.torch_tensor(),
alpha,
)
a, b, c = [
rearrange_if_needed(tensor, stride)
for tensor, stride in zip([a, b, c], [a_stride, b_stride, c_stride])
]
a_tensor, b_tensor, c_tensor = [to_tensor(tensor, lib) for tensor in [a, b, c]]
torch_gemm()
if sync is not None:
sync()
descriptor = infiniopGemmDescriptor_t()
descriptor = infiniopOperatorDescriptor_t()
check_error(
lib.infiniopCreateGemmDescriptor(
LIBINFINIOP.infiniopCreateGemmDescriptor(
handle,
ctypes.byref(descriptor),
c_tensor.descriptor,
a_tensor.descriptor,
b_tensor.descriptor,
c.descriptor,
a.descriptor,
b.descriptor,
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
for tensor in [a_tensor, b_tensor, c_tensor]:
tensor.destroyDesc(lib)
for tensor in [a, b, c]:
tensor.destroy_desc()
# Get workspace size and create workspace
workspace_size = c_uint64(0)
check_error(
lib.infiniopGetGemmWorkspaceSize(descriptor, ctypes.byref(workspace_size))
LIBINFINIOP.infiniopGetGemmWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = create_workspace(workspace_size.value, a.device)
workspace = TestWorkspace(workspace_size.value, device)
# Execute infiniop gemm operator
def lib_gemm():
check_error(
lib.infiniopGemm(
LIBINFINIOP.infiniopGemm(
descriptor,
workspace.data_ptr() if workspace is not None else None,
workspace.data(),
workspace_size.value,
c_tensor.data,
a_tensor.data,
b_tensor.data,
c.data(),
a.data(),
b.data(),
alpha,
beta,
None,
......@@ -155,17 +151,17 @@ def test(
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(c, ans, atol=atol, rtol=rtol)
debug(c.actual_tensor(), ans.torch_tensor(), atol=atol, rtol=rtol)
assert torch.allclose(c, ans, atol=atol, rtol=rtol)
assert torch.allclose(c.actual_tensor(), ans.torch_tensor(), atol=atol, rtol=rtol)
# Profiling workflow
if PROFILE:
# fmt: off
profile_operation("PyTorch", lambda: gemm(ans, c, beta, a, b, alpha), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_gemm(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation("PyTorch", lambda: torch_gemm(), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_gemm(), device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on
check_error(lib.infiniopDestroyGemmDescriptor(descriptor))
check_error(LIBINFINIOP.infiniopDestroyGemmDescriptor(descriptor))
# ==============================================================================
......@@ -173,40 +169,6 @@ def test(
# ==============================================================================
if __name__ == "__main__":
args = get_args()
lib = open_lib()
lib.infiniopCreateGemmDescriptor.restype = c_int32
lib.infiniopCreateGemmDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopGemmDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopGetGemmWorkspaceSize.restype = c_int32
lib.infiniopGetGemmWorkspaceSize.argtypes = [
infiniopGemmDescriptor_t,
POINTER(c_size_t),
]
lib.infiniopGemm.restype = c_int32
lib.infiniopGemm.argtypes = [
infiniopGemmDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_void_p,
c_float,
c_float,
c_void_p,
]
lib.infiniopDestroyGemmDescriptor.restype = c_int32
lib.infiniopDestroyGemmDescriptor.argtypes = [
infiniopGemmDescriptor_t,
]
# Configure testing options
DEBUG = args.debug
......@@ -216,6 +178,6 @@ if __name__ == "__main__":
# Execute tests
for device in get_test_devices(args):
test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES)
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64
import ctypes
import sys
import os
import time
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
)
from operatorspy.tests.test_utils import get_args
import torch, time
# constant for control whether profile the pytorch and lib functions
# NOTE: need to manually add synchronization function to the lib function,
# e.g., cudaDeviceSynchronize() for CUDA
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
# ctypes stand-in for the C library's global-average-pool descriptor; an
# opaque handle created via infiniopCreateGlobalAvgPoolDescriptor.
class GlobalAvgPoolDescriptor(Structure):
    # NOTE(review): "device" presumably identifies the target device — field
    # name only; confirm against the C header.
    _fields_ = [("device", c_int32)]
# Handle type passed across the C ABI: a pointer to GlobalAvgPoolDescriptor.
infiniopGlobalAvgPoolDescriptor_t = POINTER(GlobalAvgPoolDescriptor)
def inferShape(x):
    """Output shape of a global pooling over ``x``: keep (N, C), collapse
    every spatial dimension to 1."""
    n_spatial = x.dim() - 2
    return x.shape[:2] + (1,) * n_spatial
def globalAvgPool(x):
    """Reference global average pooling: mean over all spatial dims of ``x``,
    reshaped to the keepdim output shape given by inferShape.

    Fix: the original called torch.cuda.synchronize() unconditionally under
    PROFILE, which raises on CPU-only builds/machines even though this
    harness also runs on the "cpu" device — guard on CUDA availability.
    """
    y = torch.mean(x, dim=tuple(range(2, x.dim())), keepdim=True)
    if PROFILE and torch.cuda.is_available():
        torch.cuda.synchronize()
    return y.view(*inferShape(x))
def test(
lib,
handle,
torch_device,
x_shape,
tensor_dtype=torch.float16,
sync=None
):
print(
f"Testing GlobalAvgPool on {torch_device} with input tensor_shape: {x_shape} dtype: {tensor_dtype}"
)
x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device)
y = torch.zeros(inferShape(x), dtype=tensor_dtype).to(torch_device)
for i in range(NUM_PRERUN if PROFILE else 1):
ans = globalAvgPool(x)
if PROFILE:
start_time = time.time()
for i in range(NUM_ITERATIONS):
_ = globalAvgPool(x)
elapsed = (time.time() - start_time) / NUM_ITERATIONS
print(f"pytorch time: {elapsed :6f}")
x_tensor = to_tensor(x, lib)
y_tensor = to_tensor(y, lib)
if sync is not None:
sync()
descriptor = infiniopGlobalAvgPoolDescriptor_t()
check_error(
lib.infiniopCreateGlobalAvgPoolDescriptor(
handle,
ctypes.byref(descriptor),
y_tensor.descriptor,
x_tensor.descriptor,
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
x_tensor.descriptor.contents.invalidate()
y_tensor.descriptor.contents.invalidate()
workspaceSize = ctypes.c_uint64(0)
check_error(
lib.infiniopGetGlobalAvgPoolWorkspaceSize(
descriptor, ctypes.byref(workspaceSize)
)
)
workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(
torch_device
)
workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8))
for i in range(NUM_PRERUN if PROFILE else 1):
check_error(
lib.infiniopGlobalAvgPool(
descriptor,
workspace_ptr,
workspaceSize,
y_tensor.data,
x_tensor.data,
None,
)
)
if PROFILE:
start_time = time.time()
for i in range(NUM_ITERATIONS):
check_error(
lib.infiniopGlobalAvgPool(
descriptor,
workspace_ptr,
workspaceSize,
y_tensor.data,
x_tensor.data,
None,
)
)
elapsed = (time.time() - start_time) / NUM_ITERATIONS
print(f" lib time: {elapsed :6f}")
assert torch.allclose(y, ans, atol=0, rtol=1e-3)
check_error(lib.infiniopDestroyGlobalAvgPoolDescriptor(descriptor))
def test_cpu(lib, test_cases):
device = DeviceEnum.DEVICE_CPU
handle = create_handle(lib, device)
for x_shape in test_cases:
test(lib, handle, "cpu", x_shape, tensor_dtype=torch.float16)
test(lib, handle, "cpu", x_shape, tensor_dtype=torch.float32)
destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
device = DeviceEnum.DEVICE_CUDA
handle = create_handle(lib, device)
for x_shape in test_cases:
test(lib, handle, "cuda", x_shape, tensor_dtype=torch.float16)
test(lib, handle, "cuda", x_shape, tensor_dtype=torch.float32)
destroy_handle(lib, handle)
def test_bang(lib, test_cases):
    """Run every test case on the Cambricon MLU backend in float16 and float32."""
    import torch_mlu  # side-effect import: registers the "mlu" torch device
    handle = create_handle(lib, DeviceEnum.DEVICE_BANG)
    for shape in test_cases:
        for dtype in (torch.float16, torch.float32):
            test(lib, handle, "mlu", shape, tensor_dtype=dtype)
    destroy_handle(lib, handle)
if __name__ == "__main__":
    # Shapes to exercise: small tensors, higher-rank tensors, non-power-of-two
    # trailing dimensions, and a few large cases.
    test_cases = [
        # x_shape
        ((1, 3, 3)),
        ((1, 3, 1, 1, 3)),
        ((1, 3, 1, 1, 257)),
        ((1, 2, 1, 1, 514)),
        ((1, 3, 1, 1, 1025)),
        ((32, 256, 1, 112, 112)),
        ((2, 3, 2048000)),
        ((2, 1, 10243)),
        ((2, 20, 100)),
        ((3, 33, 333)),
        ((32, 20, 512)),
        ((3, 3, 11, 11, 11, 3, 2)),
        # NOTE(review): duplicate of the earlier (32, 256, 1, 112, 112) case —
        # confirm whether the repetition is intentional.
        ((32, 256, 1, 112, 112)),
        ((32, 256, 112, 112)),
    ]
    args = get_args()
    lib = open_lib()
    # Declare the ctypes signatures for the GlobalAvgPool C API before use;
    # every entry point returns a 32-bit status code.
    lib.infiniopCreateGlobalAvgPoolDescriptor.restype = c_int32
    lib.infiniopCreateGlobalAvgPoolDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopGlobalAvgPoolDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
    ]
    lib.infiniopGetGlobalAvgPoolWorkspaceSize.restype = c_int32
    lib.infiniopGetGlobalAvgPoolWorkspaceSize.argtypes = [
        infiniopGlobalAvgPoolDescriptor_t,
        POINTER(c_uint64),
    ]
    lib.infiniopGlobalAvgPool.restype = c_int32
    lib.infiniopGlobalAvgPool.argtypes = [
        infiniopGlobalAvgPoolDescriptor_t,
        c_void_p,
        c_uint64,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyGlobalAvgPoolDescriptor.restype = c_int32
    lib.infiniopDestroyGlobalAvgPoolDescriptor.argtypes = [
        infiniopGlobalAvgPoolDescriptor_t,
    ]
    # Run on each backend selected via command-line flags; default to CPU
    # when no backend flag is given.
    if args.cpu:
        test_cpu(lib, test_cases)
    if args.cuda:
        test_cuda(lib, test_cases)
    if args.bang:
        test_bang(lib, test_cases)
    if not (args.cpu or args.cuda or args.bang):
        test_cpu(lib, test_cases)
    print("\033[92mTest passed!\033[0m")
......@@ -4,10 +4,11 @@ import sys
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".")))
from .liboperators import (
open_lib,
CTensor,
infiniopHandle_t,
infiniopTensorDescriptor_t,
LIBINFINIOP,
)
from .devices import *
from .utils import *
from .datatypes import *
from .structs import *
......@@ -19,3 +19,27 @@ class InfiniDtype:
C32 = 17
C64 = 18
BF16 = 19
# Human-readable name for each InfiniDtype code, keyed by the enum value.
# The attribute name and its display name are identical, so build the map
# from a single list of names.
InfiniDtypeNames = {
    getattr(InfiniDtype, _dtype_name): _dtype_name
    for _dtype_name in (
        "INVALID",
        "BYTE",
        "BOOL",
        "I8",
        "I16",
        "I32",
        "I64",
        "U8",
        "U16",
        "U32",
        "U64",
        "F8",
        "F16",
        "F32",
        "F64",
        "C8",
        "C16",
        "C32",
        "C64",
        "BF16",
    )
}
......@@ -10,8 +10,20 @@ class InfiniDeviceEnum:
SUGON = 8
# Display name for each InfiniDeviceEnum value (note: display names are not
# always the uppercase attribute name, e.g. CAMBRICON -> "Cambricon").
InfiniDeviceNames = dict(
    (
        (InfiniDeviceEnum.CPU, "CPU"),
        (InfiniDeviceEnum.NVIDIA, "NVIDIA"),
        (InfiniDeviceEnum.CAMBRICON, "Cambricon"),
        (InfiniDeviceEnum.ASCEND, "Ascend"),
        (InfiniDeviceEnum.METAX, "Metax"),
        (InfiniDeviceEnum.MOORE, "Moore"),
        (InfiniDeviceEnum.ILUVATAR, "Iluvatar"),
        (InfiniDeviceEnum.KUNLUN, "Kunlun"),
        (InfiniDeviceEnum.SUGON, "Sugon"),
    )
)
# Mapping that maps InfiniDeviceEnum to torch device string
infiniDeviceEnum_str_map = {
torch_device_map = {
InfiniDeviceEnum.CPU: "cpu",
InfiniDeviceEnum.NVIDIA: "cuda",
InfiniDeviceEnum.CAMBRICON: "mlu",
......
import os
import platform
import ctypes
from ctypes import c_int, c_int64, c_uint64, Structure, POINTER
from ctypes import c_int, c_int64, c_uint64, POINTER
from .datatypes import *
from .devices import *
from .op_register import OpRegister
from pathlib import Path
Device = c_int
Optype = c_int
from .structs import *
INFINI_ROOT = os.getenv("INFINI_ROOT") or str(Path.home() / ".infini")
class TensorDescriptor(Structure):
    """Opaque ctypes stand-in for the C-side tensor descriptor struct."""
    # No fields declared: the descriptor is only ever handled through a pointer.
    _fields_ = []
infiniopTensorDescriptor_t = ctypes.POINTER(TensorDescriptor)
class CTensor:
    """Pairs an infiniop tensor descriptor with the torch tensor backing it."""
    def __init__(self, desc, torch_tensor):
        self.descriptor = desc
        # Keep a reference to the torch tensor so its storage outlives this wrapper.
        self.torch_tensor_ = torch_tensor
        self.data = torch_tensor.data_ptr()
    def destroyDesc(self, lib_):
        """Destroy the descriptor through the given library and clear the handle."""
        lib_.infiniopDestroyTensorDescriptor(self.descriptor)
        self.descriptor = None
class Handle(Structure):
    """ctypes mirror of the C infiniop handle header (device kind + device id)."""
    _fields_ = [("device", c_int), ("device_id", c_int)]
infiniopHandle_t = POINTER(Handle)
class InfiniLib:
def __init__(self, librt, libop):
self.librt = librt
......@@ -98,4 +72,9 @@ def open_lib():
lib.infinirtSetDevice.argtypes = [c_int, c_int]
lib.infinirtSetDevice.restype = c_int
OpRegister.register_lib(lib)
return lib
LIBINFINIOP = open_lib()
from .structs import (
infiniopHandle_t,
infiniopTensorDescriptor_t,
infiniopOperatorDescriptor_t,
)
from ctypes import c_int32, c_void_p, c_size_t, POINTER, c_float
class OpRegister:
    """Collects operator ctypes-binding functions and applies them to a library.

    Binding functions are gathered at import time via the ``operator``
    decorator; ``register_lib`` later runs each of them against a loaded
    ctypes library handle.
    """

    registry = []

    @classmethod
    def operator(cls, op):
        """Decorator: remember *op* for later registration and return it unchanged."""
        cls.registry.append(op)
        return op

    @classmethod
    def register_lib(cls, lib):
        """Run every collected binding function against *lib*."""
        for register in cls.registry:
            register(lib)
@OpRegister.operator
def add_(lib):
    """Declare ctypes signatures for the Add operator's C entry points."""
    tdesc = infiniopTensorDescriptor_t
    odesc = infiniopOperatorDescriptor_t
    # (symbol name, argument types); every entry point returns a 32-bit status.
    bindings = (
        (
            "infiniopCreateAddDescriptor",
            [infiniopHandle_t, POINTER(odesc), tdesc, tdesc, tdesc],
        ),
        ("infiniopGetAddWorkspaceSize", [odesc, POINTER(c_size_t)]),
        (
            "infiniopAdd",
            [odesc, c_void_p, c_size_t, c_void_p, c_void_p, c_void_p, c_void_p],
        ),
        ("infiniopDestroyAddDescriptor", [odesc]),
    )
    for symbol, argtypes in bindings:
        fn = getattr(lib, symbol)
        fn.restype = c_int32
        fn.argtypes = argtypes
@OpRegister.operator
def attention_(lib):
    """Declare ctypes signatures for the Attention operator's C entry points."""
    lib.infiniopCreateAttentionDescriptor.restype = c_int32
    lib.infiniopCreateAttentionDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopOperatorDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        c_size_t,
    ]
    lib.infiniopGetAttentionWorkspaceSize.restype = c_int32
    lib.infiniopGetAttentionWorkspaceSize.argtypes = [
        infiniopOperatorDescriptor_t,
        POINTER(c_size_t),
    ]
    lib.infiniopAttention.restype = c_int32
    lib.infiniopAttention.argtypes = [
        infiniopOperatorDescriptor_t,
        c_void_p,
        c_size_t,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyAttentionDescriptor.restype = c_int32
    lib.infiniopDestroyAttentionDescriptor.argtypes = [
        infiniopOperatorDescriptor_t,
    ]
@OpRegister.operator
def causal_softmax_(lib):
    """Declare ctypes signatures for the CausalSoftmax operator's C entry points."""
    lib.infiniopCreateCausalSoftmaxDescriptor.restype = c_int32
    lib.infiniopCreateCausalSoftmaxDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopOperatorDescriptor_t),
        infiniopTensorDescriptor_t,
    ]
    lib.infiniopGetCausalSoftmaxWorkspaceSize.restype = c_int32
    lib.infiniopGetCausalSoftmaxWorkspaceSize.argtypes = [
        infiniopOperatorDescriptor_t,
        POINTER(c_size_t),
    ]
    lib.infiniopCausalSoftmax.restype = c_int32
    lib.infiniopCausalSoftmax.argtypes = [
        infiniopOperatorDescriptor_t,
        c_void_p,
        c_size_t,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyCausalSoftmaxDescriptor.restype = c_int32
    lib.infiniopDestroyCausalSoftmaxDescriptor.argtypes = [
        infiniopOperatorDescriptor_t,
    ]
@OpRegister.operator
def clip_(lib):
    """Declare ctypes signatures for the Clip operator's C entry points."""
    lib.infiniopCreateClipDescriptor.restype = c_int32
    lib.infiniopCreateClipDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopOperatorDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
    ]
    lib.infiniopGetClipWorkspaceSize.restype = c_int32
    lib.infiniopGetClipWorkspaceSize.argtypes = [
        infiniopOperatorDescriptor_t,
        POINTER(c_size_t),
    ]
    lib.infiniopClip.restype = c_int32
    lib.infiniopClip.argtypes = [
        infiniopOperatorDescriptor_t,
        c_void_p,
        c_size_t,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyClipDescriptor.restype = c_int32
    lib.infiniopDestroyClipDescriptor.argtypes = [
        infiniopOperatorDescriptor_t,
    ]
@OpRegister.operator
def conv_(lib):
    # NOTE(review): dead stub — a second, full `conv_` is defined later in this
    # module; it shadows this name and is also registered, so this placeholder
    # only adds a no-op call during register_lib(). Consider removing it.
    pass
@OpRegister.operator
def gemm_(lib):
    """Declare ctypes signatures for the Gemm operator's C entry points."""
    lib.infiniopCreateGemmDescriptor.restype = c_int32
    lib.infiniopCreateGemmDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopOperatorDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
    ]
    lib.infiniopGetGemmWorkspaceSize.restype = c_int32
    lib.infiniopGetGemmWorkspaceSize.argtypes = [
        infiniopOperatorDescriptor_t,
        POINTER(c_size_t),
    ]
    lib.infiniopGemm.restype = c_int32
    lib.infiniopGemm.argtypes = [
        infiniopOperatorDescriptor_t,
        c_void_p,
        c_size_t,
        c_void_p,
        c_void_p,
        c_void_p,
        c_float,  # presumably alpha — confirm against the C header
        c_float,  # presumably beta — confirm against the C header
        c_void_p,
    ]
    lib.infiniopDestroyGemmDescriptor.restype = c_int32
    lib.infiniopDestroyGemmDescriptor.argtypes = [
        infiniopOperatorDescriptor_t,
    ]
@OpRegister.operator
def mul_(lib):
    """Declare ctypes signatures for the Mul operator's C entry points."""
    tdesc = infiniopTensorDescriptor_t
    odesc = infiniopOperatorDescriptor_t
    # (symbol name, argument types); every entry point returns a 32-bit status.
    bindings = (
        (
            "infiniopCreateMulDescriptor",
            [infiniopHandle_t, POINTER(odesc), tdesc, tdesc, tdesc],
        ),
        ("infiniopGetMulWorkspaceSize", [odesc, POINTER(c_size_t)]),
        (
            "infiniopMul",
            [odesc, c_void_p, c_size_t, c_void_p, c_void_p, c_void_p, c_void_p],
        ),
        ("infiniopDestroyMulDescriptor", [odesc]),
    )
    for symbol, argtypes in bindings:
        fn = getattr(lib, symbol)
        fn.restype = c_int32
        fn.argtypes = argtypes
@OpRegister.operator
def random_sample_(lib):
    """Declare ctypes signatures for the RandomSample operator's C entry points."""
    lib.infiniopCreateRandomSampleDescriptor.restype = c_int32
    lib.infiniopCreateRandomSampleDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopOperatorDescriptor_t),
        infiniopTensorDescriptor_t,
    ]
    lib.infiniopGetRandomSampleWorkspaceSize.restype = c_int32
    lib.infiniopGetRandomSampleWorkspaceSize.argtypes = [
        infiniopOperatorDescriptor_t,
        POINTER(c_size_t),
    ]
    lib.infiniopRandomSample.restype = c_int32
    lib.infiniopRandomSample.argtypes = [
        infiniopOperatorDescriptor_t,
        c_void_p,
        c_size_t,
        c_size_t,
        c_void_p,
        c_float,  # sampling parameters — names not visible here; confirm
        c_float,  # against the C header (likely temperature/top-p style args)
        c_int32,
        c_float,
        c_void_p,
    ]
    lib.infiniopDestroyRandomSampleDescriptor.restype = c_int32
    lib.infiniopDestroyRandomSampleDescriptor.argtypes = [
        infiniopOperatorDescriptor_t,
    ]
@OpRegister.operator
def rearrange_(lib):
    """Declare ctypes signatures for the Rearrange operator's C entry points.

    Note: unlike most operators here, Rearrange exposes no workspace-size query.
    """
    lib.infiniopCreateRearrangeDescriptor.restype = c_int32
    lib.infiniopCreateRearrangeDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopOperatorDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
    ]
    lib.infiniopRearrange.restype = c_int32
    lib.infiniopRearrange.argtypes = [
        infiniopOperatorDescriptor_t,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyRearrangeDescriptor.restype = c_int32
    lib.infiniopDestroyRearrangeDescriptor.argtypes = [infiniopOperatorDescriptor_t]
@OpRegister.operator
def relu_(lib):
    """Declare ctypes signatures for the Relu operator's C entry points."""
    lib.infiniopCreateReluDescriptor.restype = c_int32
    lib.infiniopCreateReluDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopOperatorDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
    ]
    lib.infiniopRelu.restype = c_int32
    lib.infiniopRelu.argtypes = [
        infiniopOperatorDescriptor_t,
        c_void_p,
        c_size_t,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyReluDescriptor.restype = c_int32
    lib.infiniopDestroyReluDescriptor.argtypes = [infiniopOperatorDescriptor_t]
@OpRegister.operator
def rms_norm_(lib):
    """Declare ctypes signatures for the RMSNorm operator's C entry points."""
    lib.infiniopCreateRMSNormDescriptor.restype = c_int32
    lib.infiniopCreateRMSNormDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopOperatorDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        c_float,  # presumably epsilon — confirm against the C header
    ]
    lib.infiniopGetRMSNormWorkspaceSize.restype = c_int32
    lib.infiniopGetRMSNormWorkspaceSize.argtypes = [
        infiniopOperatorDescriptor_t,
        POINTER(c_size_t),
    ]
    lib.infiniopRMSNorm.restype = c_int32
    lib.infiniopRMSNorm.argtypes = [
        infiniopOperatorDescriptor_t,
        c_void_p,
        c_size_t,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyRMSNormDescriptor.restype = c_int32
    lib.infiniopDestroyRMSNormDescriptor.argtypes = [
        infiniopOperatorDescriptor_t,
    ]
@OpRegister.operator
def rope_(lib):
    """Declare ctypes signatures for the RoPE operator's C entry points."""
    lib.infiniopCreateRoPEDescriptor.restype = c_int32
    lib.infiniopCreateRoPEDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopOperatorDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
    ]
    lib.infiniopGetRoPEWorkspaceSize.restype = c_int32
    lib.infiniopGetRoPEWorkspaceSize.argtypes = [
        infiniopOperatorDescriptor_t,
        POINTER(c_size_t),
    ]
    lib.infiniopRoPE.restype = c_int32
    lib.infiniopRoPE.argtypes = [
        infiniopOperatorDescriptor_t,
        c_void_p,
        c_size_t,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyRoPEDescriptor.restype = c_int32
    lib.infiniopDestroyRoPEDescriptor.argtypes = [
        infiniopOperatorDescriptor_t,
    ]
@OpRegister.operator
def sub_(lib):
    """Declare ctypes signatures for the Sub operator's C entry points."""
    tdesc = infiniopTensorDescriptor_t
    odesc = infiniopOperatorDescriptor_t
    # (symbol name, argument types); every entry point returns a 32-bit status.
    bindings = (
        (
            "infiniopCreateSubDescriptor",
            [infiniopHandle_t, POINTER(odesc), tdesc, tdesc, tdesc],
        ),
        ("infiniopGetSubWorkspaceSize", [odesc, POINTER(c_size_t)]),
        (
            "infiniopSub",
            [odesc, c_void_p, c_size_t, c_void_p, c_void_p, c_void_p, c_void_p],
        ),
        ("infiniopDestroySubDescriptor", [odesc]),
    )
    for symbol, argtypes in bindings:
        fn = getattr(lib, symbol)
        fn.restype = c_int32
        fn.argtypes = argtypes
@OpRegister.operator
def swiglu_(lib):
    """Declare ctypes signatures for the SwiGLU operator's C entry points."""
    lib.infiniopCreateSwiGLUDescriptor.restype = c_int32
    lib.infiniopCreateSwiGLUDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopOperatorDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
    ]
    lib.infiniopGetSwiGLUWorkspaceSize.restype = c_int32
    lib.infiniopGetSwiGLUWorkspaceSize.argtypes = [
        infiniopOperatorDescriptor_t,
        POINTER(c_size_t),
    ]
    lib.infiniopSwiGLU.restype = c_int32
    lib.infiniopSwiGLU.argtypes = [
        infiniopOperatorDescriptor_t,
        c_void_p,
        c_size_t,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroySwiGLUDescriptor.restype = c_int32
    lib.infiniopDestroySwiGLUDescriptor.argtypes = [
        infiniopOperatorDescriptor_t,
    ]
@OpRegister.operator
def conv_(lib):
    """Declare ctypes signatures for the Conv operator's C entry points.

    NOTE(review): this is the second `conv_` in the module; an earlier no-op
    stub with the same name is also registered. This full version wins the
    name binding, but both run during register_lib().
    """
    lib.infiniopCreateConvDescriptor.restype = c_int32
    lib.infiniopCreateConvDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopOperatorDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        c_void_p,  # presumably pads/strides/dilations arrays — confirm against the C header
        c_void_p,
        c_void_p,
        c_size_t,
    ]
    lib.infiniopGetConvWorkspaceSize.restype = c_int32
    lib.infiniopGetConvWorkspaceSize.argtypes = [
        infiniopOperatorDescriptor_t,
        POINTER(c_size_t),
    ]
    lib.infiniopConv.restype = c_int32
    lib.infiniopConv.argtypes = [
        infiniopOperatorDescriptor_t,
        c_void_p,
        c_size_t,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyConvDescriptor.restype = c_int32
    lib.infiniopDestroyConvDescriptor.argtypes = [
        infiniopOperatorDescriptor_t,
    ]
from ctypes import c_int, Structure, POINTER
class TensorDescriptor(Structure):
    """Opaque ctypes stand-in for the C tensor descriptor; used only via pointer."""
    _fields_ = []
infiniopTensorDescriptor_t = POINTER(TensorDescriptor)
class Handle(Structure):
    """ctypes mirror of the C infiniop handle header (device kind + device id)."""
    _fields_ = [("device", c_int), ("device_id", c_int)]
infiniopHandle_t = POINTER(Handle)
class OpDescriptor(Structure):
    """ctypes mirror of the generic operator descriptor header (device kind + device id)."""
    _fields_ = [("device", c_int), ("device_id", c_int)]
infiniopOperatorDescriptor_t = POINTER(OpDescriptor)
from typing import Sequence
import torch
import ctypes
from .datatypes import *
from .devices import *
from typing import Sequence
from .liboperators import infiniopTensorDescriptor_t, CTensor, infiniopHandle_t
from .liboperators import infiniopTensorDescriptor_t, LIBINFINIOP, infiniopHandle_t
def check_error(status):
......@@ -11,71 +11,173 @@ def check_error(status):
raise Exception("Error code " + str(status))
def to_tensor(tensor, lib, force_unsigned=False):
"""
Convert a PyTorch tensor to a library Tensor(descriptor, data).
"""
import torch
class CTensor:
def __init__(self, dt: InfiniDtype, shape, strides):
self.descriptor = infiniopTensorDescriptor_t()
self.dt = dt
self.ndim = len(shape)
if strides is None:
strides = [1 for _ in shape]
for i in range(self.ndim - 2, -1, -1):
strides[i] = strides[i + 1] * shape[i + 1]
assert self.ndim == len(strides)
self.c_shape = (ctypes.c_size_t * self.ndim)(*shape)
self.c_strides = (ctypes.c_ssize_t * self.ndim)(*strides)
LIBINFINIOP.infiniopCreateTensorDescriptor(
ctypes.byref(self.descriptor),
self.ndim,
self.c_shape,
self.c_strides,
self.dt,
)
ndim = tensor.ndimension()
shape = (ctypes.c_size_t * ndim)(*tensor.shape)
strides = (ctypes.c_int64 * ndim)(*(tensor.stride()))
# fmt: off
dt = (
InfiniDtype.I8 if tensor.dtype == torch.int8 else
InfiniDtype.I16 if tensor.dtype == torch.int16 else
InfiniDtype.I32 if tensor.dtype == torch.int32 else
InfiniDtype.I64 if tensor.dtype == torch.int64 else
InfiniDtype.U8 if tensor.dtype == torch.uint8 else
InfiniDtype.F16 if tensor.dtype == torch.float16 else
InfiniDtype.BF16 if tensor.dtype == torch.bfloat16 else
InfiniDtype.F32 if tensor.dtype == torch.float32 else
InfiniDtype.F64 if tensor.dtype == torch.float64 else
# TODO: These following types may not be supported by older
# versions of PyTorch.
InfiniDtype.U16 if tensor.dtype == torch.uint16 else
InfiniDtype.U32 if tensor.dtype == torch.uint32 else
InfiniDtype.U64 if tensor.dtype == torch.uint64 else
None
)
if force_unsigned:
dt = (
InfiniDtype.U8 if dt == InfiniDtype.I8 else
InfiniDtype.U16 if dt == InfiniDtype.I16 else
InfiniDtype.U32 if dt == InfiniDtype.I32 else
InfiniDtype.U64 if dt == InfiniDtype.I64 else
dt
def destroy_desc(self):
if self.descriptor is not None:
LIBINFINIOP.infiniopDestroyTensorDescriptor(self.descriptor)
self.descriptor = None
class TestTensor(CTensor):
    """Test tensor holding a torch reference tensor, a (possibly re-strided)
    data tensor, and the matching infiniop descriptor built by CTensor.__init__.
    """
    def __init__(
        self,
        shape,
        strides,
        dt: InfiniDtype,
        device: InfiniDeviceEnum,
        mode="random",
        scale=None,
        bias=None,
        set_tensor=None,
    ):
        """Create backing torch data and the infiniop descriptor.

        - shape/strides: logical shape and optional element strides; a stride
          of 0 marks a broadcast dimension.
        - mode: "random" | "zeros" | "ones" | "manual" ("manual" requires
          set_tensor with matching shape and strides).
        - scale/bias: optional affine transform applied to the generated data.
        NOTE(review): "random" uses torch.rand, which supports only floating
        dtypes — integer dts presumably need another mode; confirm.
        """
        self.dt = dt
        self.device = device
        self.shape = shape
        self.strides = strides
        # Collapse broadcast (stride-0) dims to extent 1 so torch stores no
        # redundant data; other dims keep their shape/stride as given.
        torch_shape = []
        torch_strides = [] if strides is not None else None
        for i in range(len(shape)):
            if strides is not None and strides[i] == 0:
                torch_shape.append(1)
                torch_strides.append(1)
            elif strides is not None and strides[i] != 0:
                torch_shape.append(shape[i])
                torch_strides.append(strides[i])
            else:
                torch_shape.append(shape[i])
        if mode == "random":
            self._torch_tensor = torch.rand(
                torch_shape, dtype=to_torch_dtype(dt), device=torch_device_map[device]
            )
        elif mode == "zeros":
            self._torch_tensor = torch.zeros(
                torch_shape, dtype=to_torch_dtype(dt), device=torch_device_map[device]
            )
        elif mode == "ones":
            self._torch_tensor = torch.ones(
                torch_shape, dtype=to_torch_dtype(dt), device=torch_device_map[device]
            )
        elif mode == "manual":
            # Caller-supplied tensor must already match the collapsed
            # shape/strides computed above.
            assert set_tensor is not None
            assert torch_shape == list(set_tensor.shape)
            assert torch_strides == list(set_tensor.stride())
            self._torch_tensor = set_tensor.to(to_torch_dtype(dt)).to(
                torch_device_map[device]
            )
        else:
            raise ValueError("Unsupported mode")
        if scale is not None:
            self._torch_tensor *= scale
        if bias is not None:
            self._torch_tensor += bias
        # The "actual" tensor is what the operator sees: re-laid-out to the
        # requested strides, or a contiguous clone when strides is None.
        if strides is not None:
            self._data_tensor = rearrange_tensor(self._torch_tensor, torch_strides)
        else:
            self._data_tensor = self._torch_tensor.clone()
        # Builds the infiniop tensor descriptor from the logical shape/strides.
        super().__init__(self.dt, shape, strides)
    def torch_tensor(self):
        """The reference torch tensor (pre-rearrangement)."""
        return self._torch_tensor
    def actual_tensor(self):
        """The torch tensor laid out as the operator under test sees it."""
        return self._data_tensor
    def data(self):
        """Raw device pointer to the actual tensor's storage."""
        return self._data_tensor.data_ptr()
    def is_broadcast(self):
        """True when any dimension uses a zero stride (broadcast)."""
        return self.strides is not None and 0 in self.strides
    @staticmethod
    def from_torch(torch_tensor, dt: InfiniDtype, device: InfiniDeviceEnum):
        """Wrap an existing torch tensor, preserving its shape and strides."""
        shape_ = list(torch_tensor.shape)
        strides_ = list(torch_tensor.stride())
        return TestTensor(
            shape_, strides_, dt, device, mode="manual", set_tensor=torch_tensor
        )
# fmt: on
assert dt is not None
# Create TensorDecriptor
tensor_desc = infiniopTensorDescriptor_t()
lib.infiniopCreateTensorDescriptor(
ctypes.byref(tensor_desc), ndim, shape, strides, dt
)
# Create Tensor
return CTensor(tensor_desc, tensor)
def to_torch_dtype(dt: InfiniDtype, compatability_mode=False):
    """Map an InfiniDtype code to the corresponding torch dtype.

    U16/U32/U64 may be missing on older PyTorch builds; with
    compatability_mode=True they fall back to the signed dtype of the
    same width. Raises ValueError for codes with no torch equivalent.
    """
    direct = {
        InfiniDtype.I8: torch.int8,
        InfiniDtype.I16: torch.int16,
        InfiniDtype.I32: torch.int32,
        InfiniDtype.I64: torch.int64,
        InfiniDtype.U8: torch.uint8,
        InfiniDtype.F16: torch.float16,
        InfiniDtype.BF16: torch.bfloat16,
        InfiniDtype.F32: torch.float32,
        InfiniDtype.F64: torch.float64,
    }
    if dt in direct:
        return direct[dt]
    # Keep the unsigned lookups lazy: torch.uint16/32/64 may not exist on
    # older PyTorch, and must only be touched when actually requested.
    if dt == InfiniDtype.U16:
        return torch.int16 if compatability_mode else torch.uint16
    if dt == InfiniDtype.U32:
        return torch.int32 if compatability_mode else torch.uint32
    if dt == InfiniDtype.U64:
        return torch.int64 if compatability_mode else torch.uint64
    raise ValueError("Unsupported data type")
def create_workspace(size, torch_device):
    """Allocate a zero-filled uint8 workspace of *size* bytes on *torch_device*.

    Returns None when no workspace is needed (size == 0).
    """
    print(f" - Workspace Size : {size}")
    workspace = None
    if size != 0:
        import torch

        workspace = torch.zeros(size=(size,), dtype=torch.uint8, device=torch_device)
    return workspace
class TestWorkspace:
    """Operator workspace buffer for tests; empty when the requested size is 0."""

    def __init__(self, size, device):
        self._size = size
        # A zero-byte workspace needs no backing tensor at all.
        self.tensor = (
            TestTensor((size,), None, InfiniDtype.U8, device, mode="ones")
            if size != 0
            else None
        )

    def data(self):
        """Device pointer to the buffer, or None when the workspace is empty."""
        return None if self.tensor is None else self.tensor.data()

    def size(self):
        """Workspace byte count wrapped as ctypes.c_uint64 for the C API."""
        return ctypes.c_uint64(self._size)
def create_handle():
    """Create an infiniop handle through the globally loaded library.

    Returns:
        infiniopHandle_t: the freshly created handle.
    Raises (via check_error) when the library reports a non-zero status.
    """
    # Fix: an unresolved merge left both the old `def create_handle(lib):`
    # header and the new zero-argument header stacked here, which is invalid
    # Python; keep only the new LIBINFINIOP-based version.
    handle = infiniopHandle_t()
    check_error(LIBINFINIOP.infiniopCreateHandle(ctypes.byref(handle)))
    return handle
def destroy_handle(handle):
    """Destroy a handle previously returned by create_handle().

    Fix: an unresolved merge left both the old `(lib, handle)` header and the
    new single-argument header stacked here; keep only the new
    LIBINFINIOP-based version.
    """
    check_error(LIBINFINIOP.infiniopDestroyHandle(handle))
def rearrange_tensor(tensor, new_strides):
......@@ -124,13 +226,6 @@ def rearrange_tensor(tensor, new_strides):
return new_tensor
def rearrange_if_needed(tensor, stride):
    """Return *tensor* re-laid-out to *stride*, or unchanged when stride is None."""
    if stride is None:
        return tensor
    return rearrange_tensor(tensor, stride)
def get_args():
import argparse
......@@ -167,6 +262,11 @@ def get_args():
action="store_true",
help="Run NVIDIA GPU test",
)
parser.add_argument(
"--iluvatar",
action="store_true",
help="Run Iluvatar GPU test",
)
parser.add_argument(
"--cambricon",
action="store_true",
......@@ -224,6 +324,7 @@ def debug(actual, desired, atol=0, rtol=1e-2, equal_nan=False, verbose=True):
If True, the function will print detailed information about any discrepancies between the tensors.
"""
import numpy as np
# 如果是BF16,全部转成FP32再比对
if actual.dtype == torch.bfloat16 or desired.dtype == torch.bfloat16:
actual = actual.to(torch.float32)
......@@ -308,7 +409,9 @@ def debug_all(
assert passed, "\033[31mThe condition has not been satisfied\033[0m"
def print_discrepancy(actual, expected, atol=0, rtol=1e-3, equal_nan=True, verbose=True):
def print_discrepancy(
actual, expected, atol=0, rtol=1e-3, equal_nan=True, verbose=True
):
if actual.shape != expected.shape:
raise ValueError("Tensors must have the same shape to compare.")
......@@ -321,8 +424,12 @@ def print_discrepancy(actual, expected, atol=0, rtol=1e-3, equal_nan=True, verbo
expected_isnan = torch.isnan(expected)
# Calculate the difference mask based on atol and rtol
nan_mismatch = actual_isnan ^ expected_isnan if equal_nan else actual_isnan | expected_isnan
diff_mask = nan_mismatch | (torch.abs(actual - expected) > (atol + rtol * torch.abs(expected)))
nan_mismatch = (
actual_isnan ^ expected_isnan if equal_nan else actual_isnan | expected_isnan
)
diff_mask = nan_mismatch | (
torch.abs(actual - expected) > (atol + rtol * torch.abs(expected))
)
diff_indices = torch.nonzero(diff_mask, as_tuple=False)
delta = actual - expected
......@@ -419,35 +526,33 @@ def profile_operation(desc, func, torch_device, NUM_PRERUN, NUM_ITERATIONS):
print(f" {desc} time: {elapsed * 1000 :6f} ms")
def test_operator(device, test_func, test_cases, tensor_dtypes):
    """
    Testing a specified operator on the given device with the given test function, test cases, and tensor data types.

    Fix: an unresolved merge left the old (lib-taking) and new (LIBINFINIOP)
    versions interleaved — duplicate def headers, duplicate setDevice /
    create_handle / destroy_handle lines, and a stale `lib` docstring entry.
    This is the reconstructed new version.

    Arguments:
    ----------
    - device (InfiniDeviceEnum): The device on which the operator should be tested. See device.py.
    - test_func (function): The test function to be executed for each test case.
    - test_cases (list of tuples): A list of test cases, where each test case is a tuple of parameters
      to be passed to `test_func`.
    - tensor_dtypes (list): A list of tensor data types (e.g., `torch.float32`) to test.
    """
    LIBINFINIOP.infinirtSetDevice(device, ctypes.c_int(0))
    handle = create_handle()
    # Drop dtypes the target device cannot run.
    tensor_dtypes = filter_tensor_dtypes_by_device(device, tensor_dtypes)
    try:
        for test_case in test_cases:
            for tensor_dtype in tensor_dtypes:
                test_func(
                    handle,
                    device,
                    *test_case,
                    tensor_dtype,
                    get_sync_func(device),
                )
    finally:
        # Always release the handle, even when a test raises.
        destroy_handle(handle)
def get_test_devices(args):
......@@ -466,6 +571,8 @@ def get_test_devices(args):
devices_to_test.append(InfiniDeviceEnum.CPU)
if args.nvidia:
devices_to_test.append(InfiniDeviceEnum.NVIDIA)
if args.iluvatar:
devices_to_test.append(InfiniDeviceEnum.ILUVATAR)
if args.cambricon:
import torch_mlu
......@@ -498,7 +605,7 @@ def get_test_devices(args):
def get_sync_func(device):
import torch
device_str = infiniDeviceEnum_str_map[device]
device_str = torch_device_map[device]
if device == InfiniDeviceEnum.CPU:
sync = None
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment