Commit c2e87202 authored by Catheriany

Merge remote-tracking branch 'origin/main' into issue/142

parents 41818f84 c203635b
......@@ -43,7 +43,7 @@ fp16_t _f32_to_f16(float val) {
int32_t exponent = ((f32 >> 23) & 0xFF) - 127; // Extract and de-bias the exponent
uint32_t mantissa = f32 & 0x7FFFFF; // Extract the mantissa (fraction part)
if (exponent >= 31) { // Special cases for Inf and NaN
if (exponent >= 16) { // Special cases for Inf and NaN
// NaN
if (exponent == 128 && mantissa != 0) {
return fp16_t{static_cast<uint16_t>(sign | 0x7E00)};
......
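For context on the threshold change above: the exponent has already been de-biased, and fp16's largest finite exponent is 15 (biased 30), so any value whose unbiased float32 exponent is >= 16 must map to Inf; the old comparison against 31 (the biased fp16 limit) missed overflow for exponents 16 through 30. A quick numpy check (illustrative only, not part of the patch):

```python
import numpy as np

for val in [65504.0, 131072.0, float("nan")]:
    bits = int(np.array(val, np.float32).view(np.uint32))
    exponent = ((bits >> 23) & 0xFF) - 127  # de-biased, as in _f32_to_f16
    print(val, "unbiased exponent:", exponent, "-> fp16:", np.float16(val))
# 65504.0 (exponent 15) is fp16's max finite value, 131072.0 (exponent 17)
# overflows to inf, and NaN carries exponent 128 with a nonzero mantissa.
```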
......@@ -138,4 +138,73 @@ void rearrange(
}
}
utils::Result<RearrangeMeta> RearrangeMeta::distributeUnit(const std::vector<size_t> &candidates) const {
// Get the current unit size
size_t current_unit = _meta[0];
// Find a candidate that evenly divides the current unit
size_t new_unit = 0;
for (size_t candidate : candidates) {
if (current_unit % candidate == 0) {
new_unit = candidate;
break;
}
}
// If no suitable candidate was found, return an error
if (new_unit == 0) {
return INFINI_STATUS_BAD_PARAM;
}
// If the chosen candidate equals the current unit, return a copy of this meta
if (new_unit == current_unit) {
return Result<RearrangeMeta>(_meta);
}
// Get the current number of dimensions
size_t ndim_value = this->ndim();
// Create the new layout array
std::vector<ptrdiff_t> layout(2 + (ndim_value + 1) * 3, 0);
// Set the new unit size
layout[0] = new_unit;
// Compute the expansion factor
ptrdiff_t extra = current_unit / new_unit;
// Offset of the index strides within _meta
ptrdiff_t idx_offset = 1;
// Set up the corresponding pointers into the new layout
ptrdiff_t *new_idx = layout.data() + 1;
ptrdiff_t *new_dst = layout.data() + 2 + (ndim_value + 1);
ptrdiff_t *new_src = layout.data() + 2 + (ndim_value + 1) * 2;
// Copy and rescale the index strides:
// each original index stride is multiplied by the expansion factor
for (size_t i = 0; i < ndim_value + 1; ++i) {
new_idx[i] = _meta[idx_offset + i] * extra;
}
// The stride of the new innermost dimension is 1
new_idx[ndim_value + 1] = 1;
// Copy the destination strides and append the new unit size
for (size_t i = 0; i < ndim_value; ++i) {
new_dst[i] = dst_strides()[i];
}
new_dst[ndim_value] = new_unit;
// Copy the source strides and append the new unit size
for (size_t i = 0; i < ndim_value; ++i) {
new_src[i] = src_strides()[i];
}
new_src[ndim_value] = new_unit;
return Result<RearrangeMeta>(layout);
}
} // namespace utils
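To make the new layout packing concrete, here is a minimal Python model of `distributeUnit` (a sketch under the assumed flat layout `[unit, idx_strides(ndim+2), dst_strides(ndim+1), src_strides(ndim+1)]`, not the library's API):

```python
def distribute_unit(unit, idx, dst, src, candidates):
    """Python model of RearrangeMeta::distributeUnit on flat lists."""
    new_unit = next((c for c in candidates if unit % c == 0), 0)
    if new_unit == 0:
        raise ValueError("no candidate divides the current unit")
    extra = unit // new_unit
    # Rescale the existing index strides and append 1 for the new innermost dim.
    new_idx = [s * extra for s in idx] + [1]
    # Append the new unit after the copied strides, mirroring the C++ code.
    new_dst = list(dst) + [new_unit]
    new_src = list(src) + [new_unit]
    return [new_unit] + new_idx + new_dst + new_src
```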
......@@ -28,6 +28,9 @@ public:
const ptrdiff_t *src_strides() const;
void launch(void *dst, const void *src) const;
// Split the unit into a smaller size to enable more parallelism
utils::Result<RearrangeMeta> distributeUnit(const std::vector<size_t> &candidates) const;
};
void rearrange(
......
from .infiniop_test import InfiniopTestCase, InfiniopTestWriter, np_dtype_to_ggml, gguf_strides
from .infiniop_test import InfiniopTestCase, InfiniopTestWriter, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides
......@@ -29,6 +29,14 @@ def gguf_strides(*args: int) -> list[int] | None:
return list(args)[::-1] if args else None
def contiguous_gguf_strides(shape: tuple[int, ...]) -> list[int]:
strides = []
acc = 1
for size in reversed(shape):
strides.append(acc)
acc *= size
return strides[::-1]
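For reference, `contiguous_gguf_strides` returns ordinary row-major element strides, which `gguf_strides` then flips into GGUF's reversed dimension order:

```python
assert contiguous_gguf_strides((2, 3, 4)) == [12, 4, 1]
assert gguf_strides(*contiguous_gguf_strides((2, 3, 4))) == [1, 4, 12]
```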
class InfiniopTestCase:
op_name: str
......
import numpy as np
import gguf
from typing import List
from numpy.lib.stride_tricks import as_strided
from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides
def add(
a: np.ndarray,
b: np.ndarray,
):
return a + b
def process_tensor(a, b, stride_a=None, stride_b=None):
def normalize_stride(tensor, stride):
if stride:
slices = tuple(slice(0, 1) if s == 0 else slice(None) for s in stride)
return tensor[slices]
else:
return tensor
a_unique = normalize_stride(a, stride_a)
b_unique = normalize_stride(b, stride_b)
return a_unique, b_unique
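The zero-stride handling above exists because a stride of 0 marks a broadcast dimension: the underlying buffer holds only one distinct slice along it, so slicing that axis to width 1 recovers the unique data. For example:

```python
import numpy as np

a = np.random.rand(13, 4).astype(np.float32)
b = np.random.rand(13, 4).astype(np.float32)
# a is declared with stride (0, 1): all 13 rows are broadcast copies of one row.
a_unique, b_unique = process_tensor(a, b, stride_a=(0, 1), stride_b=None)
assert a_unique.shape == (1, 4) and b_unique.shape == (13, 4)
```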
class AddTestCase(InfiniopTestCase):
def __init__(
self,
a: np.ndarray,
shape_a: List[int] | None,
stride_a: List[int] | None,
b: np.ndarray,
shape_b: List[int] | None,
stride_b: List[int] | None,
c: np.ndarray,
shape_c: List[int] | None,
stride_c: List[int] | None,
):
super().__init__("add")
self.a = a
self.shape_a = shape_a
self.stride_a = stride_a
self.b = b
self.shape_b = shape_b
self.stride_b = stride_b
self.c = c
self.shape_c = shape_c
self.stride_c = stride_c
def write_test(self, test_writer: "InfiniopTestWriter"):
super().write_test(test_writer)
if self.shape_a is not None:
test_writer.add_array(test_writer.gguf_key("a.shape"), self.shape_a)
if self.shape_b is not None:
test_writer.add_array(test_writer.gguf_key("b.shape"), self.shape_b)
if self.shape_c is not None:
test_writer.add_array(test_writer.gguf_key("c.shape"), self.shape_c)
if self.stride_a is not None:
test_writer.add_array(test_writer.gguf_key("a.strides"), gguf_strides(*self.stride_a))
if self.stride_b is not None:
test_writer.add_array(test_writer.gguf_key("b.strides"), gguf_strides(*self.stride_b))
test_writer.add_array(
test_writer.gguf_key("c.strides"),
gguf_strides(*self.stride_c if self.stride_c is not None else contiguous_gguf_strides(self.shape_c))
)
test_writer.add_tensor(
test_writer.gguf_key("a"), self.a, raw_dtype=np_dtype_to_ggml(self.a.dtype)
)
test_writer.add_tensor(
test_writer.gguf_key("b"), self.b, raw_dtype=np_dtype_to_ggml(self.b.dtype)
)
test_writer.add_tensor(
test_writer.gguf_key("c"), self.c, raw_dtype=np_dtype_to_ggml(self.c.dtype)
)
ans = add(
self.a.astype(np.float64),
self.b.astype(np.float64),
)
test_writer.add_tensor(
test_writer.gguf_key("ans"), ans, raw_dtype=gguf.GGMLQuantizationType.F64
)
if __name__ == "__main__":
test_writer = InfiniopTestWriter("add.gguf")
test_cases = []
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
_TEST_CASES_ = [
# shape, a_stride, b_stride, c_stride
((13, 4), None, None, None),
((13, 4), (10, 1), (10, 1), (10, 1)),
((13, 4), (0, 1), None, None),
((13, 4, 4), None, None, None),
((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)),
((13, 4, 4), (4, 0, 1), (0, 4, 1), None),
((16, 5632), None, None, None),
((16, 5632), (13312, 1), (13312, 1), (13312, 1)),
((4, 4, 5632), None, None, None),
((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)),
]
_TENSOR_DTYPES_ = [np.float32, np.float16]
for dtype in _TENSOR_DTYPES_:
for shape, stride_a, stride_b, stride_c in _TEST_CASES_:
a = np.random.rand(*shape).astype(dtype)
b = np.random.rand(*shape).astype(dtype)
c = np.empty(tuple(0 for _ in shape), dtype=dtype)
a, b = process_tensor(a, b, stride_a, stride_b)
if stride_c is None:
stride_c = contiguous_gguf_strides(shape)
test_case = AddTestCase(
a=a,
shape_a=shape,
stride_a=stride_a,
b=b,
shape_b=shape,
stride_b=stride_b,
c=c,
shape_c=shape,
stride_c=stride_c,
)
test_cases.append(test_case)
test_writer.add_tests(test_cases)
test_writer.save()
\ No newline at end of file
import numpy as np
import gguf
from typing import List, Optional, Tuple
from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides
def clip(
x: np.ndarray,
min_val: np.ndarray,
max_val: np.ndarray,
) -> np.ndarray:
"""
Clip the values in input tensor x to the range [min_val, max_val].
Args:
x: Input tensor
min_val: Tensor with minimum values (same shape as x)
max_val: Tensor with maximum values (same shape as x)
Returns:
Clipped tensor with the same shape as x
"""
return np.maximum(np.minimum(x, max_val), min_val)
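As a sanity check, for elementwise bounds with `min_val <= max_val` this matches `np.clip`:

```python
import numpy as np

x = np.linspace(-3.0, 3.0, 7)
lo, hi = np.full_like(x, -1.0), np.full_like(x, 1.0)
assert np.array_equal(clip(x, lo, hi), np.clip(x, lo, hi))
```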
def random_tensor(shape, dtype):
"""
Generate a random tensor with values in the range [-2, 2].
Args:
shape: Shape of the tensor
dtype: Data type of the tensor
Returns:
Random tensor with the specified shape and dtype
"""
return (np.random.rand(*shape).astype(dtype) * 4.0 - 2.0)
class ClipTestCase(InfiniopTestCase):
"""
Test case for the Clip operator.
"""
def __init__(
self,
x: np.ndarray,
x_stride: Optional[List[int]],
min_val: np.ndarray,
min_stride: Optional[List[int]],
max_val: np.ndarray,
max_stride: Optional[List[int]],
y: np.ndarray,
y_stride: Optional[List[int]],
):
super().__init__("clip")
self.x = x
self.x_stride = x_stride
self.min_val = min_val
self.min_stride = min_stride
self.max_val = max_val
self.max_stride = max_stride
self.y = y
self.y_stride = y_stride
def write_test(self, test_writer: "InfiniopTestWriter"):
super().write_test(test_writer)
# Add strides as arrays if they exist
if self.x_stride is not None:
test_writer.add_array(test_writer.gguf_key("x.strides"), self.x_stride)
if self.min_stride is not None:
test_writer.add_array(test_writer.gguf_key("min_val.strides"), self.min_stride)
if self.max_stride is not None:
test_writer.add_array(test_writer.gguf_key("max_val.strides"), self.max_stride)
if self.y_stride is not None:
test_writer.add_array(test_writer.gguf_key("y.strides"), self.y_stride)
# Add tensors to the test
test_writer.add_tensor(
test_writer.gguf_key("x"),
self.x,
raw_dtype=np_dtype_to_ggml(self.x.dtype)
)
test_writer.add_tensor(
test_writer.gguf_key("min_val"),
self.min_val,
raw_dtype=np_dtype_to_ggml(self.min_val.dtype)
)
test_writer.add_tensor(
test_writer.gguf_key("max_val"),
self.max_val,
raw_dtype=np_dtype_to_ggml(self.max_val.dtype)
)
test_writer.add_tensor(
test_writer.gguf_key("y"),
self.y,
raw_dtype=np_dtype_to_ggml(self.y.dtype)
)
# Calculate the expected result
ans = clip(
self.x.astype(np.float64),
self.min_val.astype(np.float64),
self.max_val.astype(np.float64)
)
# Add the expected result to the test
test_writer.add_tensor(
test_writer.gguf_key("ans"),
ans,
raw_dtype=gguf.GGMLQuantizationType.F64
)
if __name__ == "__main__":
test_writer = InfiniopTestWriter("clip.gguf")
# Create test cases for different shapes, strides, and data types
test_cases = []
# Test case shapes
shapes = [
(10,), # 1D tensor
(5, 10), # 2D tensor
(2, 3, 4), # 3D tensor
(7, 13), # Prime dimensions
(1, 1), # Minimum shape
(100, 100), # Large shape
(16, 16, 16), # Large 3D
]
# Test case min/max values
min_max_values = [
(-1.0, 1.0), # Standard range
(0.0, 2.0), # Positive range
(-2.0, 0.0), # Negative range
(-1000.0, 1000.0), # Large range
(-0.001, 0.001), # Small range
(0.0, 0.0), # min=max
]
# Data types to test
dtypes = [np.float16, np.float32, np.float64]
# Generate test cases with contiguous tensors
for shape in shapes:
for min_val, max_val in min_max_values:
for dtype in dtypes:
x = random_tensor(shape, dtype)
min_tensor = np.full(shape, min_val, dtype=dtype)
max_tensor = np.full(shape, max_val, dtype=dtype)
y = np.zeros(shape, dtype=dtype)
test_cases.append(
ClipTestCase(
x=x,
x_stride=None,
min_val=min_tensor,
min_stride=None,
max_val=max_tensor,
max_stride=None,
y=y,
y_stride=None
)
)
# Generate test cases with strided tensors (for 2D shapes only)
for shape in [s for s in shapes if len(s) == 2]:
for dtype in dtypes:
# Row-major stride
row_stride = gguf_strides(shape[1], 1)
# Column-major stride
col_stride = gguf_strides(1, shape[0])
# Test case with row-major input and output
x = random_tensor(shape, dtype)
min_tensor = np.full(shape, -1.0, dtype=dtype)
max_tensor = np.full(shape, 1.0, dtype=dtype)
y = np.zeros(shape, dtype=dtype)
test_cases.append(
ClipTestCase(
x=x,
x_stride=row_stride,
min_val=min_tensor,
min_stride=row_stride,
max_val=max_tensor,
max_stride=row_stride,
y=y,
y_stride=row_stride
)
)
# Test case with column-major input and output
x = random_tensor(shape, dtype)
min_tensor = np.full(shape, -1.0, dtype=dtype)
max_tensor = np.full(shape, 1.0, dtype=dtype)
y = np.zeros(shape, dtype=dtype)
test_cases.append(
ClipTestCase(
x=x,
x_stride=col_stride,
min_val=min_tensor,
min_stride=col_stride,
max_val=max_tensor,
max_stride=col_stride,
y=y,
y_stride=col_stride
)
)
# Test case with different strides for input and output
x = random_tensor(shape, dtype)
min_tensor = np.full(shape, -1.0, dtype=dtype)
max_tensor = np.full(shape, 1.0, dtype=dtype)
y = np.zeros(shape, dtype=dtype)
test_cases.append(
ClipTestCase(
x=x,
x_stride=row_stride,
min_val=min_tensor,
min_stride=row_stride,
max_val=max_tensor,
max_stride=row_stride,
y=y,
y_stride=col_stride
)
)
# Add all test cases to the writer
test_writer.add_tests(test_cases)
# Save the test cases to a GGUF file
test_writer.save()
print(f"Generated {len(test_cases)} test cases for the Clip operator")
import numpy as np
import gguf
from typing import List
from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides
def mul(
a: np.ndarray,
b: np.ndarray
):
return np.multiply(a, b)
def random_tensor(shape, dtype):
rate = 1e-3
var = 0.5 * rate  # values fall in [-5e-4, 5e-4]
return rate * np.random.rand(*shape).astype(dtype) - var
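The narrow value range keeps fp16 products well away from overflow; a quick bound check:

```python
import numpy as np

t = random_tensor((4, 4), np.float32)
assert np.all(np.abs(t) <= 5e-4)
```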
class MulTestCase(InfiniopTestCase):
def __init__(
self,
a: np.ndarray,
shape_a: List[int] | None,
stride_a: List[int] | None,
b: np.ndarray,
shape_b: List[int] | None,
stride_b: List[int] | None,
c: np.ndarray,
shape_c: List[int] | None,
stride_c: List[int] | None,
):
super().__init__("mul")
self.a = a
self.shape_a = shape_a
self.stride_a = stride_a
self.b = b
self.shape_b = shape_b
self.stride_b = stride_b
self.c = c
self.shape_c = shape_c
self.stride_c = stride_c
def write_test(self, test_writer: "InfiniopTestWriter"):
super().write_test(test_writer)
if self.shape_a is not None:
test_writer.add_array(test_writer.gguf_key("a.shape"), self.shape_a)
if self.shape_b is not None:
test_writer.add_array(test_writer.gguf_key("b.shape"), self.shape_b)
if self.shape_c is not None:
test_writer.add_array(test_writer.gguf_key("c.shape"), self.shape_c)
if self.stride_a is not None:
test_writer.add_array(test_writer.gguf_key("a.strides"), gguf_strides(*self.stride_a))
if self.stride_b is not None:
test_writer.add_array(test_writer.gguf_key("b.strides"), gguf_strides(*self.stride_b))
test_writer.add_array(
test_writer.gguf_key("c.strides"),
gguf_strides(*self.stride_c if self.stride_c is not None else contiguous_gguf_strides(self.shape_c))
)
test_writer.add_tensor(
test_writer.gguf_key("a"), self.a, raw_dtype=np_dtype_to_ggml(self.a.dtype)
)
test_writer.add_tensor(
test_writer.gguf_key("b"), self.b, raw_dtype=np_dtype_to_ggml(self.b.dtype)
)
test_writer.add_tensor(
test_writer.gguf_key("c"), self.c, raw_dtype=np_dtype_to_ggml(self.c.dtype)
)
a_fp64 = self.a.astype(np.float64)
b_fp64 = self.b.astype(np.float64)
ans_fp64 = np.multiply(a_fp64, b_fp64)
ans = mul(self.a, self.b)
test_writer.add_tensor(
test_writer.gguf_key("ans"), ans, raw_dtype=np_dtype_to_ggml(ans.dtype)
)
test_writer.add_tensor(
test_writer.gguf_key("ans_fp64"),
ans_fp64,
raw_dtype=np_dtype_to_ggml(ans_fp64.dtype),
)
if __name__ == '__main__':
test_writer = InfiniopTestWriter("mul.gguf")
test_cases = []
_TEST_CASES_ = [
((2, 3), (3, 1), (1, 2), (3, 1)),
((2, 3), (1, 2), (3, 1), (1, 2)),
((2, 3), (3, 1), (3, 1), (1, 2)),
((4, 6), (1, 4), (1, 5), (6, 1)),
((1, 2048), (1, 1), (2048, 1), (1, 1)),
((2048, 2048), None, (1, 2048), None),
((2, 4, 2048), (4 * 2048, 2048, 1), (1, 2, 8), (4 * 2048, 2048, 1)),
((2, 4, 2048), (1, 2, 8), None, (1, 2, 8)),
((2048, 2560), (2560, 1), (1, 2048), (2560, 1)),
((4, 48, 64), (64 * 48, 64, 1), (1, 4, 192), None),
((4, 48, 64), None, (1, 4, 192), (48 * 64, 64, 1)),
]
_TENSOR_DTYPES_ = [np.float32, np.float16]
for dtype in _TENSOR_DTYPES_:
for shape, stride_a, stride_b, stride_c in _TEST_CASES_:
a = random_tensor(shape, dtype)
b = random_tensor(shape, dtype)
c = np.empty(tuple(0 for _ in shape), dtype=dtype)
test_cases.append(
MulTestCase(
a=a,
shape_a=shape,
stride_a=stride_a,
b=b,
shape_b=shape,
stride_b=stride_b,
c=c,
shape_c=shape,
stride_c=stride_c,
)
)
test_writer.add_tests(test_cases)
test_writer.save()
import numpy as np
import gguf
from typing import List
from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides
def swiglu(
a: np.ndarray,
b: np.ndarray,
):
c = a * b / (1.0 + np.exp(-b))
return c
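Note that `a * b / (1 + exp(-b))` is exactly `a * silu(b)`, since `silu(b) = b * sigmoid(b) = b / (1 + exp(-b))`; a quick numeric check:

```python
import numpy as np

def silu(x):
    return x / (1.0 + np.exp(-x))

a, b = np.random.rand(4, 8), np.random.rand(4, 8)
assert np.allclose(swiglu(a, b), a * silu(b))
```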
class SwiGLUTestCase(InfiniopTestCase):
def __init__(
self,
a: np.ndarray,
shape_a: List[int] | None,
stride_a: List[int] | None,
b: np.ndarray,
shape_b: List[int] | None,
stride_b: List[int] | None,
c: np.ndarray,
shape_c: List[int] | None,
stride_c: List[int] | None,
):
super().__init__("swiglu")
self.a = a
self.shape_a = shape_a
self.stride_a = stride_a
self.b = b
self.shape_b = shape_b
self.stride_b = stride_b
self.c = c
self.shape_c = shape_c
self.stride_c = stride_c
def write_test(self, test_writer: "InfiniopTestWriter"):
super().write_test(test_writer)
if self.shape_a is not None:
test_writer.add_array(test_writer.gguf_key("a.shape"), self.shape_a)
if self.shape_b is not None:
test_writer.add_array(test_writer.gguf_key("b.shape"), self.shape_b)
if self.shape_c is not None:
test_writer.add_array(test_writer.gguf_key("c.shape"), self.shape_c)
if self.stride_a is not None:
test_writer.add_array(test_writer.gguf_key("a.strides"), gguf_strides(*self.stride_a))
if self.stride_b is not None:
test_writer.add_array(test_writer.gguf_key("b.strides"), gguf_strides(*self.stride_b))
test_writer.add_array(
test_writer.gguf_key("c.strides"),
gguf_strides(*self.stride_c if self.stride_c is not None else contiguous_gguf_strides(self.shape_c))
)
test_writer.add_tensor(
test_writer.gguf_key("a"), self.a, raw_dtype=np_dtype_to_ggml(self.a.dtype)
)
test_writer.add_tensor(
test_writer.gguf_key("b"), self.b, raw_dtype=np_dtype_to_ggml(self.b.dtype)
)
test_writer.add_tensor(
test_writer.gguf_key("c"), self.c, raw_dtype=np_dtype_to_ggml(self.c.dtype)
)
ans = swiglu(
self.a.astype(np.float64),
self.b.astype(np.float64),
)
test_writer.add_tensor(
test_writer.gguf_key("ans"), ans, raw_dtype=gguf.GGMLQuantizationType.F64
)
if __name__ == "__main__":
test_writer = InfiniopTestWriter("swiglu.gguf")
test_cases = []
_TEST_CASES_ = [
((64, 128), None, None, None),
((64, 121), None, None, None),
((15, 512), None, None, None),
((13, 4), None, None, None),
((13, 4), (10, 1), (10, 1), (10, 1)),
((13, 4, 4), None, None, None),
((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)),
((16, 5632), None, None, None),
((16, 5632), (13312, 1), (13312, 1), (13312, 1)),
((16, 5632), (5632, 1), (5632, 1), (1, 16)),
((2, 3, 400), (1200, 400, 1), (1200, 400, 1), (1, 2, 6)),
((4, 4, 5632), None, None, None),
((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)),
]
_TENSOR_DTYPES_ = [np.float32, np.float16]
for dtype in _TENSOR_DTYPES_:
for shape, stride_a, stride_b, stride_c in _TEST_CASES_:
a = np.random.rand(*shape).astype(dtype)
b = np.random.rand(*shape).astype(dtype)
c = np.empty(tuple(0 for _ in shape), dtype=dtype)
test_case = SwiGLUTestCase(
a=a,
shape_a=list(shape),
stride_a=stride_a,
b=b,
shape_b=list(shape),
stride_b=stride_b,
c=c,
shape_c=list(shape),
stride_c=stride_c,
)
test_cases.append(test_case)
test_writer.add_tests(test_cases)
test_writer.save()
from ctypes import POINTER, Structure, c_int32, c_void_p
import torch
import ctypes
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64
from libinfiniop import (
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
open_lib,
to_tensor,
get_test_devices,
check_error,
rearrange_if_needed,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
create_workspace,
)
from operatorspy.tests.test_utils import get_args
from enum import Enum, auto
import torch
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
_TEST_CASES_ = [
# shape, a_stride, b_stride, c_stride
((13, 4), None, None, None),
((13, 4), (10, 1), (10, 1), (10, 1)),
((13, 4), (0, 1), None, None),
((13, 4, 4), None, None, None),
((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)),
((13, 4, 4), (4, 0, 1), (0, 4, 1), None),
((16, 5632), None, None, None),
((16, 5632), (13312, 1), (13312, 1), (13312, 1)),
((4, 4, 5632), None, None, None),
((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)),
]
class Inplace(Enum):
......@@ -26,6 +43,35 @@ class Inplace(Enum):
INPLACE_B = auto()
# Inplace options applied for each test case in _TEST_CASES_
_INPLACE = [
Inplace.OUT_OF_PLACE,
Inplace.INPLACE_A,
Inplace.INPLACE_B,
]
# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_
_TEST_CASES = [
test_case + (inplace_item,)
for test_case in _TEST_CASES_
for inplace_item in _INPLACE
]
# Data types used for testing
_TENSOR_DTYPES = [torch.float16, torch.float32]
# Tolerance map for different data types
_TOLERANCE_MAP = {
torch.float16: {"atol": 1e-3, "rtol": 1e-3},
torch.float32: {"atol": 1e-7, "rtol": 1e-7},
}
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class AddDescriptor(Structure):
_fields_ = [("device", c_int32)]
......@@ -37,42 +83,71 @@ def add(x, y):
return torch.add(x, y)
def process_tensors(c, c_strides, a, a_stride, b, b_stride, inplace):
"""
Rearrange the tensors if needed and apply the inplace config.
If inplace is requested and the chosen output (i.e., c) ends up with a
broadcast (zero) stride, it is reset to the original non-broadcast strides.
"""
original_c_strides = c_strides if c_strides else c.stride()
def _rearrange(tensor, strides):
if strides and 0 in strides:
tensor.set_(tensor.untyped_storage(), 0, tensor.shape, strides)
return tensor
else:
return rearrange_if_needed(tensor, strides)
a, b, c = [
_rearrange(tensor, stride)
for tensor, stride in zip([a, b, c], [a_stride, b_stride, c_strides])
]
c = (
c
if inplace == Inplace.OUT_OF_PLACE
else (a if inplace == Inplace.INPLACE_A else b)
)
# if inplace is true and c has broadcasted config, reset it to the original unbroadcasted strides
if 0 in c.stride():
c.set_(c.untyped_storage(), 0, c.shape, original_c_strides)
return a, b, c
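For illustration, the zero-stride branch above materializes a broadcast view in place via `Tensor.set_` (a standalone sketch of the same PyTorch storage trick):

```python
import torch

t = torch.arange(4.0)                               # one physical row
view = torch.empty(3, 4)
view.set_(t.untyped_storage(), 0, (3, 4), (0, 1))   # stride 0 => rows alias
assert view.stride() == (0, 1)
assert torch.equal(view[0], view[2])                # all rows share storage
```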
def test(
lib,
handle,
torch_device,
c_shape,
a_shape,
b_shape,
tensor_dtype=torch.float16,
shape,
a_stride=None,
b_stride=None,
c_stride=None,
inplace=Inplace.OUT_OF_PLACE,
dtype=torch.float16,
sync=None,
):
print(
f"Testing Add on {torch_device} with c_shape:{c_shape} a_shape:{a_shape} b_shape:{b_shape} dtype:{tensor_dtype} inplace: {inplace.name}"
f"Testing Add on {torch_device} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} "
f"dtype:{dtype} inplace:{inplace}"
)
if a_shape != b_shape and inplace != Inplace.OUT_OF_PLACE:
print("Unsupported test: broadcasting does not support in-place")
return
a = torch.rand(a_shape, dtype=tensor_dtype).to(torch_device)
b = torch.rand(b_shape, dtype=tensor_dtype).to(torch_device)
c = (
torch.rand(c_shape, dtype=tensor_dtype).to(torch_device)
if inplace == Inplace.OUT_OF_PLACE
else (a if inplace == Inplace.INPLACE_A else b)
)
a = torch.rand(shape, dtype=dtype).to(torch_device)
b = torch.rand(shape, dtype=dtype).to(torch_device)
c = torch.rand(shape, dtype=dtype).to(torch_device)
a, b, c = process_tensors(c, c_stride, a, a_stride, b, b_stride, inplace)
ans = add(a, b)
a_tensor = to_tensor(a, lib)
b_tensor = to_tensor(b, lib)
a_tensor, b_tensor = [to_tensor(tensor, lib) for tensor in [a, b]]
c_tensor = (
to_tensor(c, lib)
if inplace == Inplace.OUT_OF_PLACE
else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor)
)
descriptor = infiniopAddDescriptor_t()
if sync is not None:
sync()
descriptor = infiniopAddDescriptor_t()
check_error(
lib.infiniopCreateAddDescriptor(
handle,
......@@ -84,74 +159,48 @@ def test(
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
c_tensor.descriptor.contents.invalidate()
a_tensor.descriptor.contents.invalidate()
b_tensor.descriptor.contents.invalidate()
for tensor in [a_tensor, b_tensor, c_tensor]:
tensor.destroyDesc(lib)
workspace_size = c_uint64(0)
check_error(
lib.infiniopAdd(descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None)
lib.infiniopGetAddWorkspaceSize(descriptor, ctypes.byref(workspace_size))
)
assert torch.allclose(c, ans, atol=0, rtol=1e-3)
check_error(lib.infiniopDestroyAddDescriptor(descriptor))
def test_cpu(lib, test_cases):
device = DeviceEnum.DEVICE_CPU
handle = create_handle(lib, device)
for c_shape, a_shape, b_shape, inplace in test_cases:
# fmt: off
test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace)
test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace)
# fmt: on
destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
device = DeviceEnum.DEVICE_CUDA
handle = create_handle(lib, device)
for c_shape, a_shape, b_shape, inplace in test_cases:
# fmt: off
test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace)
test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace)
# fmt: on
destroy_handle(lib, handle)
workspace = create_workspace(workspace_size.value, c.device)
def lib_add():
check_error(
lib.infiniopAdd(
descriptor,
workspace.data_ptr() if workspace is not None else None,
workspace_size.value,
c_tensor.data,
a_tensor.data,
b_tensor.data,
None,
)
)
lib_add()
def test_bang(lib, test_cases):
import torch_mlu
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(c, ans, atol=atol, rtol=rtol)
assert torch.allclose(c, ans, atol=atol, rtol=rtol)
device = DeviceEnum.DEVICE_BANG
handle = create_handle(lib, device)
for c_shape, a_shape, b_shape, inplace in test_cases:
# Profiling workflow
if PROFILE:
# fmt: off
test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace)
test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace)
profile_operation("PyTorch", lambda: add(a, b), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_add(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on
destroy_handle(lib, handle)
check_error(lib.infiniopDestroyAddDescriptor(descriptor))
if __name__ == "__main__":
test_cases = [
# fmt: off
# c_shape, a_shape, b_shape, inplace
# ((32, 150, 512000), (32, 150, 512000), (32, 150, 512000), Inplace.OUT_OF_PLACE),
# ((32, 150, 51200), (32, 150, 51200), (32, 150, 1), Inplace.OUT_OF_PLACE),
# ((32, 150, 51200), (32, 150, 51200), (32, 150, 51200), Inplace.OUT_OF_PLACE),
((1, 3), (1, 3), (1, 3), Inplace.OUT_OF_PLACE),
((), (), (), Inplace.OUT_OF_PLACE),
((3, 3), (3, 3), (3, 3), Inplace.OUT_OF_PLACE),
((2, 20, 3), (2, 1, 3), (2, 20, 3), Inplace.OUT_OF_PLACE),
((32, 20, 512), (32, 20, 512), (32, 20, 512), Inplace.INPLACE_A),
((32, 20, 512), (32, 20, 512), (32, 20, 512), Inplace.INPLACE_B),
((32, 256, 112, 112), (32, 256, 112, 1), (32, 256, 112, 112), Inplace.OUT_OF_PLACE),
((32, 256, 112, 112), (32, 256, 112, 112), (32, 256, 112, 112), Inplace.OUT_OF_PLACE),
((2, 4, 3), (2, 1, 3), (4, 3), Inplace.OUT_OF_PLACE),
((2, 3, 4, 5), (2, 3, 4, 5), (5,), Inplace.OUT_OF_PLACE),
((3, 2, 4, 5), (4, 5), (3, 2, 1, 1), Inplace.OUT_OF_PLACE),
# fmt: on
]
args = get_args()
lib = open_lib()
lib.infiniopCreateAddDescriptor.restype = c_int32
lib.infiniopCreateAddDescriptor.argtypes = [
infiniopHandle_t,
......@@ -160,25 +209,36 @@ if __name__ == "__main__":
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopGetAddWorkspaceSize.restype = c_int32
lib.infiniopGetAddWorkspaceSize.argtypes = [
infiniopAddDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopAdd.restype = c_int32
lib.infiniopAdd.argtypes = [
infiniopAddDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyAddDescriptor.restype = c_int32
lib.infiniopDestroyAddDescriptor.argtypes = [
infiniopAddDescriptor_t,
]
if args.cpu:
test_cpu(lib, test_cases)
if args.cuda:
test_cuda(lib, test_cases)
if args.bang:
test_bang(lib, test_cases)
if not (args.cpu or args.cuda or args.bang):
test_cpu(lib, test_cases)
# Configure testing options
DEBUG = args.debug
PROFILE = args.profile
NUM_PRERUN = args.num_prerun
NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args):
test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float, c_bool
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p
import ctypes
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
from libinfiniop import (
open_lib,
to_tensor,
CTensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
rearrange_tensor,
create_workspace,
get_args,
get_test_devices,
test_operator,
debug,
get_tolerance,
profile_operation,
)
from operatorspy.tests.test_utils import get_args
import torch
import torch.nn.functional as F
class AttentionDescriptor(Structure):
......@@ -95,12 +95,13 @@ def test(
pos,
k_cache_buf_len,
v_cache_buf_len,
dtype=torch.float16,
q_stride=None,
k_stride=None,
v_stride=None,
k_cache_stride=None,
v_cache_stride=None,
dtype=torch.float16,
sync=None,
):
print(
f"Testing Attention on {torch_device} with n_q_head:{n_q_head} n_kv_head:{n_kv_head} seq_len:{seq_len} head_dim:{head_dim} pos:{pos} "
......@@ -140,6 +141,9 @@ def test(
k_cache_tensor = to_tensor(k_cache, lib)
v_cache_tensor = to_tensor(v_cache, lib)
if sync is not None:
sync()
descriptor = infiniopAttentionDescriptor_t()
check_error(
lib.infiniopCreateAttentionDescriptor(
......@@ -156,12 +160,15 @@ def test(
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
out_tensor.descriptor.contents.invalidate()
q_tensor.descriptor.contents.invalidate()
k_tensor.descriptor.contents.invalidate()
v_tensor.descriptor.contents.invalidate()
k_cache_tensor.descriptor.contents.invalidate()
v_cache_tensor.descriptor.contents.invalidate()
for tensor in [
out_tensor,
q_tensor,
k_tensor,
v_tensor,
k_cache_tensor,
v_cache_tensor,
]:
tensor.destroyDesc(lib)
workspace_size = c_uint64(0)
check_error(
......@@ -169,152 +176,52 @@ def test(
)
workspace = create_workspace(workspace_size.value, out.device)
check_error(
lib.infiniopAttention(
descriptor,
workspace.data_ptr() if workspace is not None else None,
workspace_size.value,
out_tensor.data,
q_tensor.data,
k_tensor.data,
v_tensor.data,
k_cache_tensor.data,
v_cache_tensor.data,
None,
)
)
assert torch.allclose(out, ans, atol=1e-4, rtol=1e-2)
check_error(lib.infiniopDestroyAttentionDescriptor(descriptor))
def test_cpu(lib, test_cases):
device = DeviceEnum.DEVICE_CPU
handle = create_handle(lib, device)
for (
n_q_head,
n_kv_head,
seq_len,
head_dim,
pos,
k_cache_buf_len,
v_cache_buf_len,
dtype,
q_stride,
k_stride,
v_stride,
k_cache_stride,
v_cache_stride,
) in test_cases:
test(
lib,
handle,
"cpu",
n_q_head,
n_kv_head,
seq_len,
head_dim,
pos,
k_cache_buf_len,
v_cache_buf_len,
dtype,
q_stride,
k_stride,
v_stride,
k_cache_stride,
v_cache_stride,
def lib_attention():
check_error(
lib.infiniopAttention(
descriptor,
workspace.data_ptr() if workspace is not None else None,
workspace_size.value,
out_tensor.data,
q_tensor.data,
k_tensor.data,
v_tensor.data,
k_cache_tensor.data,
v_cache_tensor.data,
None,
)
)
destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
device = DeviceEnum.DEVICE_CUDA
handle = create_handle(lib, device)
for (
n_q_head,
n_kv_head,
seq_len,
head_dim,
pos,
k_cache_buf_len,
v_cache_buf_len,
dtype,
q_stride,
k_stride,
v_stride,
k_cache_stride,
v_cache_stride,
) in test_cases:
test(
lib,
handle,
"cuda",
n_q_head,
n_kv_head,
seq_len,
head_dim,
pos,
k_cache_buf_len,
v_cache_buf_len,
dtype,
q_stride,
k_stride,
v_stride,
k_cache_stride,
v_cache_stride,
)
lib_attention()
destroy_handle(lib, handle)
def test_bang(lib, test_cases):
import torch_mlu
device = DeviceEnum.DEVICE_BANG
handle = create_handle(lib, device)
for (
n_q_head,
n_kv_head,
seq_len,
head_dim,
pos,
k_cache_buf_len,
v_cache_buf_len,
dtype,
q_stride,
k_stride,
v_stride,
k_cache_stride,
v_cache_stride,
) in test_cases:
test(
lib,
handle,
"mlu",
n_q_head,
n_kv_head,
seq_len,
head_dim,
pos,
k_cache_buf_len,
v_cache_buf_len,
dtype,
q_stride,
k_stride,
v_stride,
k_cache_stride,
v_cache_stride,
)
# Validate results
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(out, ans, atol=atol, rtol=rtol)
assert torch.allclose(out, ans, atol=atol, rtol=rtol)
destroy_handle(lib, handle)
# Profiling workflow
if PROFILE:
# fmt: off
profile_operation("PyTorch", lambda: attention(q, k, v, k_cache, v_cache, pos), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_attention(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on
check_error(lib.infiniopDestroyAttentionDescriptor(descriptor))
if __name__ == "__main__":
_TENSOR_DTYPES = [torch.float16, torch.float32]
# Tolerance map for different data types
_TOLERANCE_MAP = {
torch.float16: {"atol": 1e-4, "rtol": 1e-2},
torch.float32: {"atol": 1e-5, "rtol": 1e-3},
}
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
test_cases = [
# prefill
(
......@@ -325,7 +232,6 @@ if __name__ == "__main__":
0, # pos
2048, # k_cache_buf_len
2048, # v_cache_buf_len
torch.float16, # dtype
[64, 2560, 1], # q_stride
[64, 2560, 1], # k_stride
[64, 2560, 1], # v_stride
......@@ -341,7 +247,6 @@ if __name__ == "__main__":
3, # pos
2048, # k_cache_buf_len
2048, # v_cache_buf_len
torch.float16, # dtype
[64, 2560, 1], # q_stride
[64, 2560, 1], # k_stride
[64, 2560, 1], # v_stride
......@@ -357,13 +262,26 @@ if __name__ == "__main__":
1, # pos
8, # k_cache_buf_len
8, # v_cache_buf_len
torch.float16, # dtype
None, # q_stride
None, # k_stride
None, # v_stride
None, # k_cache_stride
None, # v_cache_stride
),
(
28, # n_q_head
28, # n_kv_head
15, # seq_len
128, # head_dim
0, # pos
2048, # k_cache_buf_len
2048, # v_cache_buf_len
[128, 10752, 1], # q_stride
[128, 10752, 1], # k_stride
[128, 10752, 1], # v_stride
[128, 3584, 1], # k_cache_stride
[128, 3584, 1], # v_cache_stride
),
]
args = get_args()
lib = open_lib()
......@@ -406,12 +324,13 @@ if __name__ == "__main__":
infiniopAttentionDescriptor_t,
]
if args.cpu:
test_cpu(lib, test_cases)
if args.cuda:
test_cuda(lib, test_cases)
if args.bang:
test_bang(lib, test_cases)
if not (args.cpu or args.cuda or args.bang):
test_cpu(lib, test_cases)
# Configure testing options
DEBUG = args.debug
PROFILE = args.profile
NUM_PRERUN = args.num_prerun
NUM_ITERATIONS = args.num_iterations
# Execute tests
for device in get_test_devices(args):
test_operator(lib, device, test, test_cases, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
......@@ -88,6 +88,7 @@ def test(
padding,
strides,
tensor_dtype=torch.float16,
sync=None
):
print(
f"Testing AvgPool on {torch_device} with x_shape:{x_shape} kernel_shape:{k_shape} padding:{padding} strides:{strides} dtype:{tensor_dtype}"
......@@ -109,6 +110,10 @@ def test(
x_tensor = to_tensor(x, lib)
y_tensor = to_tensor(y, lib)
if sync is not None:
sync()
descriptor = infiniopAvgPoolDescriptor_t()
check_error(
......
......@@ -16,18 +16,20 @@ from libinfiniop import (
get_tolerance,
profile_operation,
)
from enum import Enum, auto
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
_TEST_CASES = [
# x_shape, x_stride
((32, 512), None),
((32, 512), (1024, 1)),
((32, 5, 5), None),
((32, 20, 512), None),
((32, 20, 512), (20480, 512, 1)),  # Ascend does not yet support non-contiguous layouts
_TEST_CASES_ = [
# shape, x_stride, y_stride
((3, 3), None, None),
((32, 512), None, None),
((32, 512), (1024, 1), (1024, 1)),
((32, 5, 5), None, None),
((32, 20, 512), None, None),
((32, 20, 512), (20480, 512, 1), None),
]
# Data types used for testing
......@@ -35,9 +37,26 @@ _TENSOR_DTYPES = [torch.float16]
# Tolerance map for different data types
_TOLERANCE_MAP = {
torch.float16: {"atol": 0, "rtol": 1e-2},
torch.float16: {"atol": 1e-3, "rtol": 1e-2},
}
class Inplace(Enum):
OUT_OF_PLACE = auto()
INPLACE_X = auto()
_INPLACE = [
Inplace.INPLACE_X,
Inplace.OUT_OF_PLACE,
]
_TEST_CASES = [
test_case + (inplace_item,)
for test_case in _TEST_CASES_
for inplace_item in _INPLACE
]
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
......@@ -59,12 +78,22 @@ def causal_softmax(x):
return torch.nn.functional.softmax(masked, dim=-1).to(type)
def test(lib, handle, torch_device, x_shape, x_stride=None, dtype=torch.float16):
def test(
lib,
handle,
torch_device,
shape,
x_stride=None,
y_stride=None,
inplace=Inplace.OUT_OF_PLACE,
dtype=torch.float16,
sync=None
):
print(
f"Testing CausalSoftmax on {torch_device} with x_shape:{x_shape} x_stride:{x_stride} dtype:{dtype}"
f"Testing CausalSoftmax on {torch_device} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} dtype:{dtype} inplace:{inplace}"
)
x = torch.rand(x_shape, dtype=dtype).to(torch_device)
x = torch.rand(shape, dtype=dtype).to(torch_device)
ans = causal_softmax(x)
......@@ -72,10 +101,21 @@ def test(lib, handle, torch_device, x_shape, x_stride=None, dtype=torch.float16)
x_tensor = to_tensor(x, lib)
if inplace == Inplace.INPLACE_X:
y = x
y_tensor = x_tensor
else:
y = torch.zeros(shape, dtype=dtype).to(torch_device)
y = rearrange_if_needed(y, y_stride)
y_tensor = to_tensor(y, lib)
if sync is not None:
sync()
descriptor = infiniopCausalSoftmaxDescriptor_t()
check_error(
lib.infiniopCreateCausalSoftmaxDescriptor(
handle, ctypes.byref(descriptor), x_tensor.descriptor
handle, ctypes.byref(descriptor), y_tensor.descriptor, x_tensor.descriptor
)
)
......@@ -96,17 +136,21 @@ def test(lib, handle, torch_device, x_shape, x_stride=None, dtype=torch.float16)
descriptor,
workspace.data_ptr() if workspace is not None else None,
workspace_size.value,
y_tensor.data,
x_tensor.data,
None,
)
)
lib_causal_softmax()
if sync is not None:
sync()
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(x, ans, atol=atol, rtol=rtol)
assert torch.allclose(x, ans, atol=atol, rtol=rtol)
debug(y, ans, atol=atol, rtol=rtol)
assert torch.allclose(y, ans, atol=atol, rtol=rtol)
# Profiling workflow
if PROFILE:
......
#!/usr/bin/env python3
import torch
import ctypes
from ctypes import POINTER, Structure, c_int32, c_size_t, c_uint64, c_void_p, c_float
from libinfiniop import (
infiniopHandle_t,
infiniopTensorDescriptor_t,
open_lib,
to_tensor,
get_test_devices,
check_error,
rearrange_if_needed,
create_workspace,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
)
from enum import Enum, auto
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
_TEST_CASES_ = [
# shape, x_stride, y_stride, min_val, max_val
# Basic shapes
((10,), None, None, -1.0, 1.0),
((5, 10), None, None, -1.0, 1.0),
((2, 3, 4), None, None, -1.0, 1.0),
# Different min_val and max_val ranges
((10,), None, None, 0.0, 2.0),
((5, 10), None, None, 0.0, 2.0),
((2, 3, 4), None, None, 0.0, 2.0),
((10,), None, None, -2.0, 0.0),
((5, 10), None, None, -2.0, 0.0),
((2, 3, 4), None, None, -2.0, 0.0),
# Odd shapes
((7, 13), None, None, -1.0, 1.0),  # prime dimensions
((3, 5, 7), None, None, -1.0, 1.0),  # 3-D prime dimensions
# Non-standard shapes
((1, 1), None, None, -1.0, 1.0),  # minimal shape
((100, 100), None, None, -1.0, 1.0),  # large shape
((16, 16, 16), None, None, -1.0, 1.0),  # large 3-D
# Extreme ranges
((10,), None, None, -1000.0, 1000.0),  # wide range
((10,), None, None, -0.001, 0.001),  # narrow range
((10,), None, None, 0.0, 0.0),  # min == max
# Special shapes
((0,), None, None, -1.0, 1.0),  # empty tensor
((1, 0), None, None, -1.0, 1.0),  # empty dimension
]
_TENSOR_DTYPES = [torch.float16, torch.float32]
_TOLERANCE_MAP = {
torch.float16: {"atol": 1e-3, "rtol": 1e-3},
torch.float32: {"atol": 1e-7, "rtol": 1e-6},
}
class Inplace(Enum):
OUT_OF_PLACE = auto()
INPLACE_X = auto()
_INPLACE = [
Inplace.INPLACE_X,
Inplace.OUT_OF_PLACE,
]
_TEST_CASES = [
test_case + (inplace_item,)
for test_case in _TEST_CASES_
for inplace_item in _INPLACE
]
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class ClipDescriptor(Structure):
_fields_ = [("device_type", c_int32), ("device_id", c_int32)]
infiniopClipDescriptor_t = POINTER(ClipDescriptor)
def clip(x, min_val, max_val):
return torch.clamp(x, min_val, max_val)
def create_tensor_with_stride(shape, stride, dtype, device):
"""Create a tensor with specific stride without using view() that might cause errors."""
x = torch.rand(shape, dtype=dtype, device=device) * 4.0 - 2.0 # Range: [-2, 2]
if stride is None:
return x
if len(shape) == 2 and len(stride) == 2:
if stride == (shape[1], 1):
return x.contiguous()
elif stride == (1, shape[0]):
return x.transpose(0, 1).contiguous().transpose(0, 1)
else:
y = torch.zeros(shape, dtype=dtype, device=device)
for i in range(shape[0]):
for j in range(shape[1]):
y[i, j] = x[i, j]
return y.contiguous()
return x
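As an aside, a strided 2-D test tensor can also be produced more directly; a sketch of the column-major case (not what the script uses):

```python
import torch

base = torch.rand(5, 10)
col_major = base.t().contiguous().t()  # shape (5, 10), stride (1, 5)
assert col_major.stride() == (1, 5)
```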
def test(
lib,
handle,
torch_device,
shape,
x_stride=None,
y_stride=None,
min_val=-1.0,
max_val=1.0,
inplace=Inplace.OUT_OF_PLACE,
dtype=torch.float32,
):
print(
f"Testing Clip on {torch_device} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} "
f"min_val:{min_val} max_val:{max_val} dtype:{dtype} inplace:{inplace}"
)
x = create_tensor_with_stride(shape, x_stride, dtype, torch_device)
ans = clip(x, min_val, max_val)
x = rearrange_if_needed(x, x_stride)
x_tensor = to_tensor(x, lib)
if inplace == Inplace.INPLACE_X:
y = x
y_tensor = x_tensor
else:
y = torch.zeros(shape, dtype=dtype).to(torch_device)
y = rearrange_if_needed(y, y_stride)
y_tensor = to_tensor(y, lib)
descriptor = infiniopClipDescriptor_t()
check_error(
lib.infiniopCreateClipDescriptor(
handle, ctypes.byref(descriptor), y_tensor.descriptor, x_tensor.descriptor
)
)
workspace_size = c_uint64(0)
check_error(
lib.infiniopGetClipWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = create_workspace(workspace_size.value, x.device)
def lib_clip():
check_error(
lib.infiniopClip(
descriptor,
workspace.data_ptr() if workspace is not None else None,
workspace_size.value,
y_tensor.data,
x_tensor.data,
c_float(min_val),
c_float(max_val),
None,
)
)
lib_clip()
# Now we can destroy the tensor descriptors
x_tensor.destroyDesc(lib)
if inplace != Inplace.INPLACE_X:
y_tensor.destroyDesc(lib)
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG or not torch.allclose(y, ans, atol=atol, rtol=rtol):
print("\nExpected:")
print(ans)
print("\nActual:")
print(y)
print("\nDifference:")
print(torch.abs(y - ans))
print("\nMax difference:", torch.max(torch.abs(y - ans)).item())
debug(y, ans, atol=atol, rtol=rtol)
assert torch.allclose(y, ans, atol=atol, rtol=rtol)
# Profiling workflow
if PROFILE:
# fmt: off
profile_operation("PyTorch", lambda: clip(x, min_val, max_val), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_clip(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on
check_error(lib.infiniopDestroyClipDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
lib = open_lib()
lib.infiniopCreateClipDescriptor.restype = c_int32
lib.infiniopCreateClipDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopClipDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopGetClipWorkspaceSize.restype = c_int32
lib.infiniopGetClipWorkspaceSize.argtypes = [
infiniopClipDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopClip.restype = c_int32
lib.infiniopClip.argtypes = [
infiniopClipDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_float,
c_float,
c_void_p,
]
lib.infiniopDestroyClipDescriptor.restype = c_int32
lib.infiniopDestroyClipDescriptor.argtypes = [
infiniopClipDescriptor_t,
]
# Configure testing options
DEBUG = args.debug
PROFILE = args.profile
NUM_PRERUN = args.num_prerun
NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args):
test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
......@@ -95,6 +95,7 @@ def test(
dilations,
tensor_stride=None,
tensor_dtype=torch.float16,
sync=None
):
assert len(pads) == len(strides) == len(dilations)
print(
......@@ -118,8 +119,11 @@ def test(
x_tensor = to_tensor(x, lib)
w_tensor = to_tensor(w, lib)
y_tensor = to_tensor(y, lib)
descriptor = infiniopConvDescriptor_t()
if sync is not None:
sync()
descriptor = infiniopConvDescriptor_t()
check_error(
lib.infiniopCreateConvDescriptor(
handle,
......
......@@ -52,6 +52,7 @@ def test(
y_stride=None,
x_stride=None,
tensor_dtype=torch.float16,
sync=None
):
print(
f"Testing Expand on {torch_device} with x_shape:{x_shape} y_shape:{y_shape} x_stride:{x_stride} y_stride:{y_stride} dtype:{tensor_dtype}"
......@@ -76,8 +77,11 @@ def test(
x_tensor = to_tensor(x, lib)
y_tensor = to_tensor(y, lib)
descriptor = infiniopExpandDescriptor_t()
if sync is not None:
sync()
descriptor = infiniopExpandDescriptor_t()
check_error(
lib.infiniopCreateExpandDescriptor(
handle,
......
......@@ -83,6 +83,7 @@ def test(
b_stride=None,
c_stride=None,
dtype=torch.float16,
sync=None
):
print(
f"Testing Gemm on {torch_device} with alpha:{alpha}, beta:{beta},"
......@@ -104,6 +105,9 @@ def test(
]
a_tensor, b_tensor, c_tensor = [to_tensor(tensor, lib) for tensor in [a, b, c]]
if sync is not None:
sync()
descriptor = infiniopGemmDescriptor_t()
check_error(
lib.infiniopCreateGemmDescriptor(
......
......@@ -51,6 +51,7 @@ def test(
torch_device,
x_shape,
tensor_dtype=torch.float16,
sync=None
):
print(
f"Testing GlobalAvgPool on {torch_device} with input tensor_shape: {x_shape} dtype: {tensor_dtype}"
......@@ -70,8 +71,11 @@ def test(
x_tensor = to_tensor(x, lib)
y_tensor = to_tensor(y, lib)
descriptor = infiniopGlobalAvgPoolDescriptor_t()
if sync is not None:
sync()
descriptor = infiniopGlobalAvgPoolDescriptor_t()
check_error(
lib.infiniopCreateGlobalAvgPoolDescriptor(
handle,
......
......@@ -10,7 +10,7 @@ def check_error(status):
raise Exception("Error code " + str(status))
def to_tensor(tensor, lib):
def to_tensor(tensor, lib, force_unsigned=False):
"""
Convert a PyTorch tensor to a library Tensor(descriptor, data).
"""
......@@ -37,6 +37,16 @@ def to_tensor(tensor, lib):
InfiniDtype.U64 if tensor.dtype == torch.uint64 else
None
)
if force_unsigned:
dt = (
InfiniDtype.U8 if dt == InfiniDtype.I8 else
InfiniDtype.U16 if dt == InfiniDtype.I16 else
InfiniDtype.U32 if dt == InfiniDtype.I32 else
InfiniDtype.U64 if dt == InfiniDtype.I64 else
dt
)
# fmt: on
assert dt is not None
# Create TensorDescriptor
......@@ -413,6 +423,7 @@ def test_operator(lib, device, test_func, test_cases, tensor_dtypes):
infiniDeviceEnum_str_map[device],
*test_case,
tensor_dtype,
get_sync_func(device)
)
finally:
destroy_handle(lib, handle)
......@@ -461,3 +472,15 @@ def get_test_devices(args):
devices_to_test = [InfiniDeviceEnum.CPU]
return devices_to_test
def get_sync_func(device):
import torch
device_str = infiniDeviceEnum_str_map[device]
if device == InfiniDeviceEnum.CPU:
sync = None
else:
sync = getattr(torch, device_str).synchronize
return sync
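Accelerator kernels launch asynchronously, so `test_operator` hands each test a per-device synchronize hook; for the CPU no hook is needed. A small usage sketch:

```python
assert get_sync_func(InfiniDeviceEnum.CPU) is None
# For a GPU device the hook resolves to e.g. torch.cuda.synchronize and
# should be called before validating results or stopping a timer.
```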
......@@ -83,6 +83,7 @@ def test(
padding,
strides,
tensor_dtype=torch.float16,
sync=None
):
print(
f"Testing MaxPool on {torch_device} with x_shape:{x_shape} kernel_shape:{k_shape} padding:{padding} strides:{strides} dtype:{tensor_dtype}"
......@@ -104,8 +105,11 @@ def test(
x_tensor = to_tensor(x, lib)
y_tensor = to_tensor(y, lib)
descriptor = infiniopMaxPoolDescriptor_t()
if sync is not None:
sync()
descriptor = infiniopMaxPoolDescriptor_t()
check_error(
lib.infiniopCreateMaxPoolDescriptor(
handle,
......