Commit 9b32b4b1 authored by Catheriany

Merge remote-tracking branch 'origin/main' into issue/150

parents 15bcbdfc 4799ddbf
import numpy as np
import gguf
from typing import List
from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides
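# Reference implementation; write_test stores its float64 result as the "ans" tensor.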
def add(
a: np.ndarray,
b: np.ndarray,
):
return a + b
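# A stride of 0 marks a broadcast dimension: every index reads the same element,
# so normalize_stride slices that dimension down to size 1 to keep only the unique data.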
def process_tensor(a, b, stride_a=None, stride_b=None):
def normalize_stride(tensor, stride):
if stride:
slices = tuple(slice(0, 1) if s == 0 else slice(None) for s in stride)
return tensor[slices]
else:
return tensor
a_unique = normalize_stride(a, stride_a)
b_unique = normalize_stride(b, stride_b)
return a_unique, b_unique
class AddTestCase(InfiniopTestCase):
def __init__(
self,
a: np.ndarray,
shape_a: List[int] | None,
stride_a: List[int] | None,
b: np.ndarray,
shape_b: List[int] | None,
stride_b: List[int] | None,
c: np.ndarray,
shape_c: List[int] | None,
stride_c: List[int] | None,
):
super().__init__("add")
self.a = a
self.shape_a = shape_a
self.stride_a = stride_a
self.b = b
self.shape_b = shape_b
self.stride_b = stride_b
self.c = c
self.shape_c = shape_c
self.stride_c = stride_c
def write_test(self, test_writer: "InfiniopTestWriter"):
super().write_test(test_writer)
if self.shape_a is not None:
test_writer.add_array(test_writer.gguf_key("a.shape"), self.shape_a)
if self.shape_b is not None:
test_writer.add_array(test_writer.gguf_key("b.shape"), self.shape_b)
if self.shape_c is not None:
test_writer.add_array(test_writer.gguf_key("c.shape"), self.shape_c)
if self.stride_a is not None:
test_writer.add_array(test_writer.gguf_key("a.strides"), gguf_strides(*self.stride_a))
if self.stride_b is not None:
test_writer.add_array(test_writer.gguf_key("b.strides"), gguf_strides(*self.stride_b))
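# c.strides is always written, falling back to contiguous strides when none are given.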
test_writer.add_array(
test_writer.gguf_key("c.strides"),
gguf_strides(*self.stride_c if self.stride_c is not None else contiguous_gguf_strides(self.shape_c))
)
test_writer.add_tensor(
test_writer.gguf_key("a"), self.a, raw_dtype=np_dtype_to_ggml(self.a.dtype)
)
test_writer.add_tensor(
test_writer.gguf_key("b"), self.b, raw_dtype=np_dtype_to_ggml(self.b.dtype)
)
test_writer.add_tensor(
test_writer.gguf_key("c"), self.c, raw_dtype=np_dtype_to_ggml(self.c.dtype)
)
ans = add(
self.a.astype(np.float64),
self.b.astype(np.float64),
)
test_writer.add_tensor(
test_writer.gguf_key("ans"), ans, raw_dtype=gguf.GGMLQuantizationType.F64
)
if __name__ == "__main__":
test_writer = InfiniopTestWriter("add.gguf")
test_cases = []
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
_TEST_CASES_ = [
# shape, a_stride, b_stride, c_stride
((13, 4), None, None, None),
((13, 4), (10, 1), (10, 1), (10, 1)),
((13, 4), (0, 1), None, None),
((13, 4, 4), None, None, None),
((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)),
((13, 4, 4), (4, 0, 1), (0, 4, 1), None),
((16, 5632), None, None, None),
((16, 5632), (13312, 1), (13312, 1), (13312, 1)),
((4, 4, 5632), None, None, None),
((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)),
]
_TENSOR_DTYPES_ = [np.float32, np.float16]
for dtype in _TENSOR_DTYPES_:
for shape, stride_a, stride_b, stride_c in _TEST_CASES_:
a = np.random.rand(*shape).astype(dtype)
b = np.random.rand(*shape).astype(dtype)
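# c is written as a zero-sized placeholder; the test runner presumably allocates
# the real output buffer from c.shape and c.strides.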
c = np.empty(tuple(0 for _ in shape), dtype=dtype)
a, b = process_tensor(a, b, stride_a, stride_b)
if stride_c is None:
stride_c = contiguous_gguf_strides(shape)
test_case = AddTestCase(
a=a,
shape_a=shape,
stride_a=stride_a,
b=b,
shape_b=shape,
stride_b=stride_b,
c=c,
shape_c=shape,
stride_c=stride_c,
)
test_cases.append(test_case)
test_writer.add_tests(test_cases)
test_writer.save()
import numpy as np
import gguf
from typing import List, Optional, Tuple
from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides
def clip(
x: np.ndarray,
min_val: np.ndarray,
max_val: np.ndarray,
) -> np.ndarray:
"""
Clip the values in input tensor x to the range [min_val, max_val].
Args:
x: Input tensor
min_val: Tensor with minimum values (same shape as x)
max_val: Tensor with maximum values (same shape as x)
Returns:
Clipped tensor with the same shape as x
"""
return np.maximum(np.minimum(x, max_val), min_val)
def random_tensor(shape, dtype):
"""
Generate a random tensor with values in the range [-2, 2].
Args:
shape: Shape of the tensor
dtype: Data type of the tensor
Returns:
Random tensor with the specified shape and dtype
"""
return (np.random.rand(*shape).astype(dtype) * 4.0 - 2.0)
class ClipTestCase(InfiniopTestCase):
"""
Test case for the Clip operator.
"""
def __init__(
self,
x: np.ndarray,
x_stride: Optional[List[int]],
min_val: np.ndarray,
min_stride: Optional[List[int]],
max_val: np.ndarray,
max_stride: Optional[List[int]],
y: np.ndarray,
y_stride: Optional[List[int]],
):
super().__init__("clip")
self.x = x
self.x_stride = x_stride
self.min_val = min_val
self.min_stride = min_stride
self.max_val = max_val
self.max_stride = max_stride
self.y = y
self.y_stride = y_stride
def write_test(self, test_writer: "InfiniopTestWriter"):
super().write_test(test_writer)
# Add strides as arrays if they exist
if self.x_stride is not None:
test_writer.add_array(test_writer.gguf_key("x.strides"), self.x_stride)
if self.min_stride is not None:
test_writer.add_array(test_writer.gguf_key("min_val.strides"), self.min_stride)
if self.max_stride is not None:
test_writer.add_array(test_writer.gguf_key("max_val.strides"), self.max_stride)
if self.y_stride is not None:
test_writer.add_array(test_writer.gguf_key("y.strides"), self.y_stride)
# Add tensors to the test
test_writer.add_tensor(
test_writer.gguf_key("x"),
self.x,
raw_dtype=np_dtype_to_ggml(self.x.dtype)
)
test_writer.add_tensor(
test_writer.gguf_key("min_val"),
self.min_val,
raw_dtype=np_dtype_to_ggml(self.min_val.dtype)
)
test_writer.add_tensor(
test_writer.gguf_key("max_val"),
self.max_val,
raw_dtype=np_dtype_to_ggml(self.max_val.dtype)
)
test_writer.add_tensor(
test_writer.gguf_key("y"),
self.y,
raw_dtype=np_dtype_to_ggml(self.y.dtype)
)
# Calculate the expected result
ans = clip(
self.x.astype(np.float64),
self.min_val.astype(np.float64),
self.max_val.astype(np.float64)
)
# Add the expected result to the test
test_writer.add_tensor(
test_writer.gguf_key("ans"),
ans,
raw_dtype=gguf.GGMLQuantizationType.F64
)
if __name__ == "__main__":
test_writer = InfiniopTestWriter("clip.gguf")
# Create test cases for different shapes, strides, and data types
test_cases = []
# Test case shapes
shapes = [
(10,), # 1D tensor
(5, 10), # 2D tensor
(2, 3, 4), # 3D tensor
(7, 13), # Prime dimensions
(1, 1), # Minimum shape
(100, 100), # Large shape
(16, 16, 16), # Large 3D
]
# Test case min/max values
min_max_values = [
(-1.0, 1.0), # Standard range
(0.0, 2.0), # Positive range
(-2.0, 0.0), # Negative range
(-1000.0, 1000.0), # Large range
(-0.001, 0.001), # Small range
(0.0, 0.0), # min=max
]
# Data types to test
dtypes = [np.float16, np.float32, np.float64]
# Generate test cases with contiguous tensors
for shape in shapes:
for min_val, max_val in min_max_values:
for dtype in dtypes:
x = random_tensor(shape, dtype)
min_tensor = np.full(shape, min_val, dtype=dtype)
max_tensor = np.full(shape, max_val, dtype=dtype)
y = np.zeros(shape, dtype=dtype)
test_cases.append(
ClipTestCase(
x=x,
x_stride=None,
min_val=min_tensor,
min_stride=None,
max_val=max_tensor,
max_stride=None,
y=y,
y_stride=None
)
)
# Generate test cases with strided tensors (for 2D shapes only)
for shape in [s for s in shapes if len(s) == 2]:
for dtype in dtypes:
# Row-major stride
row_stride = gguf_strides(shape[1], 1)
# Column-major stride
col_stride = gguf_strides(1, shape[0])
# Test case with row-major input and output
x = random_tensor(shape, dtype)
min_tensor = np.full(shape, -1.0, dtype=dtype)
max_tensor = np.full(shape, 1.0, dtype=dtype)
y = np.zeros(shape, dtype=dtype)
test_cases.append(
ClipTestCase(
x=x,
x_stride=row_stride,
min_val=min_tensor,
min_stride=row_stride,
max_val=max_tensor,
max_stride=row_stride,
y=y,
y_stride=row_stride
)
)
# Test case with column-major input and output
x = random_tensor(shape, dtype)
min_tensor = np.full(shape, -1.0, dtype=dtype)
max_tensor = np.full(shape, 1.0, dtype=dtype)
y = np.zeros(shape, dtype=dtype)
test_cases.append(
ClipTestCase(
x=x,
x_stride=col_stride,
min_val=min_tensor,
min_stride=col_stride,
max_val=max_tensor,
max_stride=col_stride,
y=y,
y_stride=col_stride
)
)
# Test case with different strides for input and output
x = random_tensor(shape, dtype)
min_tensor = np.full(shape, -1.0, dtype=dtype)
max_tensor = np.full(shape, 1.0, dtype=dtype)
y = np.zeros(shape, dtype=dtype)
test_cases.append(
ClipTestCase(
x=x,
x_stride=row_stride,
min_val=min_tensor,
min_stride=row_stride,
max_val=max_tensor,
max_stride=row_stride,
y=y,
y_stride=col_stride
)
)
# Add all test cases to the writer
test_writer.add_tests(test_cases)
# Save the test cases to a GGUF file
test_writer.save()
print(f"Generated {len(test_cases)} test cases for the Clip operator")
@@ -2,7 +2,7 @@ import numpy as np
import gguf
from typing import List
from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides
from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides
def mul(
a: np.ndarray,
@@ -19,28 +19,44 @@ class MulTestCase(InfiniopTestCase):
def __init__(
self,
a: np.ndarray,
shape_a: List[int] | None,
stride_a: List[int] | None,
b: np.ndarray,
shape_b: List[int] | None,
stride_b: List[int] | None,
c: np.ndarray,
shape_c: List[int] | None,
stride_c: List[int] | None,
):
super().__init__("mul")
self.a = a
self.shape_a = shape_a
self.stride_a = stride_a
self.b = b
self.shape_b = shape_b
self.stride_b = stride_b
self.c = c
self.shape_c = shape_c
self.stride_c = stride_c
def write_test(self, test_writer: "InfiniopTestWriter"):
super().write_test(test_writer)
if self.shape_a is not None:
test_writer.add_array(test_writer.gguf_key("a.shape"), self.shape_a)
if self.shape_b is not None:
test_writer.add_array(test_writer.gguf_key("b.shape"), self.shape_b)
if self.shape_c is not None:
test_writer.add_array(test_writer.gguf_key("c.shape"), self.shape_c)
if self.stride_a is not None:
test_writer.add_array(test_writer.gguf_key("a.strides"), self.stride_a)
test_writer.add_array(test_writer.gguf_key("a.strides"), gguf_strides(*self.stride_a))
if self.stride_b is not None:
test_writer.add_array(test_writer.gguf_key("b.strides"), self.stride_b)
if self.stride_c is not None:
test_writer.add_array(test_writer.gguf_key("c.strides"), self.stride_c)
test_writer.add_array(test_writer.gguf_key("b.strides"), gguf_strides(*self.stride_b))
test_writer.add_array(
test_writer.gguf_key("c.strides"),
gguf_strides(*self.stride_c if self.stride_c is not None else contiguous_gguf_strides(self.shape_c))
)
test_writer.add_tensor(
test_writer.gguf_key("a"), self.a, raw_dtype=np_dtype_to_ggml(self.a.dtype)
)
@@ -52,6 +68,7 @@ class MulTestCase(InfiniopTestCase):
)
a_fp64 = self.a.astype(np.float64)
b_fp64 = self.b.astype(np.float64)
ans_fp64 = np.multiply(a_fp64, b_fp64)
ans = mul(self.a, self.b)
test_writer.add_tensor(
@@ -65,95 +82,43 @@ class MulTestCase(InfiniopTestCase):
if __name__ == '__main__':
test_writer = InfiniopTestWriter("mul.gguf")
test_cases = [
MulTestCase(
random_tensor((2, 3), np.float32),
gguf_strides(3, 1),
random_tensor((2, 3), np.float32),
gguf_strides(1, 2),
random_tensor((2, 3), np.float32),
gguf_strides(3, 1),
),
MulTestCase(
random_tensor((2, 3), np.float16),
gguf_strides(1, 2),
random_tensor((2, 3), np.float16),
gguf_strides(3, 1),
random_tensor((2, 3), np.float16),
gguf_strides(1, 2),
),
MulTestCase(
random_tensor((2, 3), np.float64),
gguf_strides(3, 1),
random_tensor((2, 3), np.float64),
gguf_strides(3, 1),
random_tensor((2, 3), np.float64),
gguf_strides(1, 2),
),
MulTestCase(
random_tensor((4, 6), np.float16),
gguf_strides(1, 4),
random_tensor((4, 6), np.float16),
gguf_strides(1, 5),
random_tensor((4, 6), np.float16),
gguf_strides(6, 1),
),
MulTestCase(
random_tensor((1, 2048), np.float16),
gguf_strides(1, 1),
random_tensor((1, 2048), np.float16),
gguf_strides(2048, 1),
random_tensor((1, 2048), np.float16),
gguf_strides(1, 1),
),
MulTestCase(
random_tensor((2048, 2048), np.float32),
None,
random_tensor((2048, 2048), np.float32),
gguf_strides(1, 2048),
random_tensor((2048, 2048), np.float32),
None,
),
MulTestCase(
random_tensor((2, 4, 2048), np.float16),
gguf_strides(4 * 2048, 2048, 1),
random_tensor((2, 4, 2048), np.float16),
gguf_strides(1, 2, 2 * 4),
random_tensor((2, 4, 2048), np.float16),
gguf_strides(4 * 2048, 2048, 1),
),
MulTestCase(
random_tensor((2, 4, 2048), np.float32),
gguf_strides(1, 2, 2 * 4),
random_tensor((2, 4, 2048), np.float32),
None,
random_tensor((2, 4, 2048), np.float32),
gguf_strides(1, 2, 2 * 4),
),
MulTestCase(
random_tensor((2048, 2560), np.float32),
gguf_strides(2560, 1),
random_tensor((2048, 2560), np.float32),
gguf_strides(1, 2048),
random_tensor((2048, 2560), np.float32),
gguf_strides(2560, 1),
),
MulTestCase(
random_tensor((4, 48, 64), np.float16),
gguf_strides(64 * 48, 64, 1),
random_tensor((4, 48, 64), np.float16),
gguf_strides(1, 4, 4 * 48),
random_tensor((4, 48, 64), np.float16),
None
),
MulTestCase(
random_tensor((4, 48, 64), np.float32),
None,
random_tensor((4, 48, 64), np.float32),
gguf_strides(1, 4, 4 * 48),
random_tensor((4, 48, 64), np.float32),
gguf_strides(48 * 64, 64, 1),
)
]
test_cases = []
_TEST_CASES_ = [
((2, 3), (3, 1), (1, 2), (3, 1)),
((2, 3), (1, 2), (3, 1), (1, 2)),
((2, 3), (3, 1), (3, 1), (1, 2)),
((4, 6), (1, 4), (1, 5), (6, 1)),
((1, 2048), (1, 1), (2048, 1), (1, 1)),
((2048, 2048), None, (1, 2048), None),
((2, 4, 2048), (4 * 2048, 2048, 1), (1, 2, 8), (4 * 2048, 2048, 1)),
((2, 4, 2048), (1, 2, 8), None, (1, 2, 8)),
((2048, 2560), (2560, 1), (1, 2048), (2560, 1)),
((4, 48, 64), (64 * 48, 64, 1), (1, 4, 192), None),
((4, 48, 64), None, (1, 4, 192), (48 * 64, 64, 1)),
]
_TENSOR_DTYPES_ = [np.float32, np.float16]
for dtype in _TENSOR_DTYPES_:
for shape, stride_a, stride_b, stride_c in _TEST_CASES_:
a = random_tensor(shape, dtype)
b = random_tensor(shape, dtype)
c = np.empty(tuple(0 for _ in shape), dtype=dtype)
test_cases.append(
MulTestCase(
a=a,
shape_a=shape,
stride_a=stride_a,
b=b,
shape_b=shape,
stride_b=stride_b,
c=c,
shape_c=shape,
stride_c=stride_c,
)
)
test_writer.add_tests(test_cases)
test_writer.save()
import numpy as np
import gguf
from typing import List
from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides
def swiglu(
a: np.ndarray,
b: np.ndarray,
):
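    # SwiGLU: c = a * SiLU(b), with SiLU(b) = b * sigmoid(b) = b / (1 + exp(-b)).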
c = a * b / (1.0 + np.exp(-b))
return c
class SwiGLUTestCase(InfiniopTestCase):
def __init__(
self,
a: np.ndarray,
shape_a: List[int] | None,
stride_a: List[int] | None,
b: np.ndarray,
shape_b: List[int] | None,
stride_b: List[int] | None,
c: np.ndarray,
shape_c: List[int] | None,
stride_c: List[int] | None,
):
super().__init__("swiglu")
self.a = a
self.shape_a = shape_a
self.stride_a = stride_a
self.b = b
self.shape_b = shape_b
self.stride_b = stride_b
self.c = c
self.shape_c = shape_c
self.stride_c = stride_c
def write_test(self, test_writer: "InfiniopTestWriter"):
super().write_test(test_writer)
if self.shape_a is not None:
test_writer.add_array(test_writer.gguf_key("a.shape"), self.shape_a)
if self.shape_b is not None:
test_writer.add_array(test_writer.gguf_key("b.shape"), self.shape_b)
if self.shape_c is not None:
test_writer.add_array(test_writer.gguf_key("c.shape"), self.shape_c)
if self.stride_a is not None:
test_writer.add_array(test_writer.gguf_key("a.strides"), gguf_strides(*self.stride_a))
if self.stride_b is not None:
test_writer.add_array(test_writer.gguf_key("b.strides"), gguf_strides(*self.stride_b))
test_writer.add_array(
test_writer.gguf_key("c.strides"),
gguf_strides(*self.stride_c if self.stride_c is not None else contiguous_gguf_strides(self.shape_c))
)
test_writer.add_tensor(
test_writer.gguf_key("a"), self.a, raw_dtype=np_dtype_to_ggml(self.a.dtype)
)
test_writer.add_tensor(
test_writer.gguf_key("b"), self.b, raw_dtype=np_dtype_to_ggml(self.b.dtype)
)
test_writer.add_tensor(
test_writer.gguf_key("c"), self.c, raw_dtype=np_dtype_to_ggml(self.c.dtype)
)
ans = swiglu(
self.a.astype(np.float64),
self.b.astype(np.float64),
)
test_writer.add_tensor(
test_writer.gguf_key("ans"), ans, raw_dtype=gguf.GGMLQuantizationType.F64
)
if __name__ == "__main__":
test_writer = InfiniopTestWriter("swiglu.gguf")
test_cases = []
_TEST_CASES_ = [
((64, 128), None, None, None),
((64, 121), None, None, None),
((15, 512), None, None, None),
((13, 4), None, None, None),
((13, 4), (10, 1), (10, 1), (10, 1)),
((13, 4, 4), None, None, None),
((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)),
((16, 5632), None, None, None),
((16, 5632), (13312, 1), (13312, 1), (13312, 1)),
((16, 5632), (5632, 1), (5632, 1), (1, 16)),
((2, 3, 400), (1200, 400, 1), (1200, 400, 1), (1, 2, 6)),
((4, 4, 5632), None, None, None),
((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)),
]
_TENSOR_DTYPES_ = [np.float32, np.float16]
for dtype in _TENSOR_DTYPES_:
for shape, stride_a, stride_b, stride_c in _TEST_CASES_:
a = np.random.rand(*shape).astype(dtype)
b = np.random.rand(*shape).astype(dtype)
c = np.empty(tuple(0 for _ in shape), dtype=dtype)
test_case = SwiGLUTestCase(
a=a,
shape_a=list(shape),
stride_a=stride_a,
b=b,
shape_b=list(shape),
stride_b=stride_b,
c=c,
shape_c=list(shape),
stride_c=stride_c,
)
test_cases.append(test_case)
test_writer.add_tests(test_cases)
test_writer.save()
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float, c_bool
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p
import ctypes
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
from libinfiniop import (
open_lib,
to_tensor,
CTensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
rearrange_tensor,
create_workspace,
get_args,
get_test_devices,
test_operator,
debug,
get_tolerance,
profile_operation,
)
from operatorspy.tests.test_utils import get_args
import torch
import torch.nn.functional as F
class AttentionDescriptor(Structure):
@@ -95,12 +95,13 @@ def test(
pos,
k_cache_buf_len,
v_cache_buf_len,
dtype=torch.float16,
q_stride=None,
k_stride=None,
v_stride=None,
k_cache_stride=None,
v_cache_stride=None,
dtype=torch.float16,
sync=None,
):
print(
f"Testing Attention on {torch_device} with n_q_head:{n_q_head} n_kv_head:{n_kv_head} seq_len:{seq_len} head_dim:{head_dim} pos:{pos} "
@@ -140,6 +141,9 @@ def test(
k_cache_tensor = to_tensor(k_cache, lib)
v_cache_tensor = to_tensor(v_cache, lib)
if sync is not None:
sync()
descriptor = infiniopAttentionDescriptor_t()
check_error(
lib.infiniopCreateAttentionDescriptor(
@@ -156,12 +160,15 @@ def test(
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
out_tensor.descriptor.contents.invalidate()
q_tensor.descriptor.contents.invalidate()
k_tensor.descriptor.contents.invalidate()
v_tensor.descriptor.contents.invalidate()
k_cache_tensor.descriptor.contents.invalidate()
v_cache_tensor.descriptor.contents.invalidate()
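# Destroying the descriptors serves the same purpose as invalidating them:
# the kernel must not read shapes or strides from them after creation.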
for tensor in [
out_tensor,
q_tensor,
k_tensor,
v_tensor,
k_cache_tensor,
v_cache_tensor,
]:
tensor.destroyDesc(lib)
workspace_size = c_uint64(0)
check_error(
@@ -169,152 +176,52 @@ def test(
)
workspace = create_workspace(workspace_size.value, out.device)
check_error(
lib.infiniopAttention(
descriptor,
workspace.data_ptr() if workspace is not None else None,
workspace_size.value,
out_tensor.data,
q_tensor.data,
k_tensor.data,
v_tensor.data,
k_cache_tensor.data,
v_cache_tensor.data,
None,
)
)
assert torch.allclose(out, ans, atol=1e-4, rtol=1e-2)
check_error(lib.infiniopDestroyAttentionDescriptor(descriptor))
def test_cpu(lib, test_cases):
device = DeviceEnum.DEVICE_CPU
handle = create_handle(lib, device)
for (
n_q_head,
n_kv_head,
seq_len,
head_dim,
pos,
k_cache_buf_len,
v_cache_buf_len,
dtype,
q_stride,
k_stride,
v_stride,
k_cache_stride,
v_cache_stride,
) in test_cases:
test(
lib,
handle,
"cpu",
n_q_head,
n_kv_head,
seq_len,
head_dim,
pos,
k_cache_buf_len,
v_cache_buf_len,
dtype,
q_stride,
k_stride,
v_stride,
k_cache_stride,
v_cache_stride,
def lib_attention():
check_error(
lib.infiniopAttention(
descriptor,
workspace.data_ptr() if workspace is not None else None,
workspace_size.value,
out_tensor.data,
q_tensor.data,
k_tensor.data,
v_tensor.data,
k_cache_tensor.data,
v_cache_tensor.data,
None,
)
)
destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
device = DeviceEnum.DEVICE_CUDA
handle = create_handle(lib, device)
for (
n_q_head,
n_kv_head,
seq_len,
head_dim,
pos,
k_cache_buf_len,
v_cache_buf_len,
dtype,
q_stride,
k_stride,
v_stride,
k_cache_stride,
v_cache_stride,
) in test_cases:
test(
lib,
handle,
"cuda",
n_q_head,
n_kv_head,
seq_len,
head_dim,
pos,
k_cache_buf_len,
v_cache_buf_len,
dtype,
q_stride,
k_stride,
v_stride,
k_cache_stride,
v_cache_stride,
)
lib_attention()
destroy_handle(lib, handle)
def test_bang(lib, test_cases):
import torch_mlu
device = DeviceEnum.DEVICE_BANG
handle = create_handle(lib, device)
for (
n_q_head,
n_kv_head,
seq_len,
head_dim,
pos,
k_cache_buf_len,
v_cache_buf_len,
dtype,
q_stride,
k_stride,
v_stride,
k_cache_stride,
v_cache_stride,
) in test_cases:
test(
lib,
handle,
"mlu",
n_q_head,
n_kv_head,
seq_len,
head_dim,
pos,
k_cache_buf_len,
v_cache_buf_len,
dtype,
q_stride,
k_stride,
v_stride,
k_cache_stride,
v_cache_stride,
)
# Validate results
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(out, ans, atol=atol, rtol=rtol)
assert torch.allclose(out, ans, atol=atol, rtol=rtol)
destroy_handle(lib, handle)
# Profiling workflow
if PROFILE:
# fmt: off
profile_operation("PyTorch", lambda: attention(q, k, v, k_cache, v_cache, pos), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_attention(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on
check_error(lib.infiniopDestroyAttentionDescriptor(descriptor))
if __name__ == "__main__":
_TENSOR_DTYPES = [torch.float16, torch.float32]
# Tolerance map for different data types
_TOLERANCE_MAP = {
torch.float16: {"atol": 1e-4, "rtol": 1e-2},
torch.float32: {"atol": 1e-5, "rtol": 1e-3},
}
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
test_cases = [
# prefill
(
@@ -325,7 +232,6 @@ if __name__ == "__main__":
0, # pos
2048, # k_cache_buf_len
2048, # v_cache_buf_len
torch.float16, # dtype
[64, 2560, 1], # q_stride
[64, 2560, 1], # k_stride
[64, 2560, 1], # v_stride
@@ -341,7 +247,6 @@ if __name__ == "__main__":
3, # pos
2048, # k_cache_buf_len
2048, # v_cache_buf_len
torch.float16, # dtype
[64, 2560, 1], # q_stride
[64, 2560, 1], # k_stride
[64, 2560, 1], # v_stride
@@ -357,13 +262,26 @@ if __name__ == "__main__":
1, # pos
8, # k_cache_buf_len
8, # v_cache_buf_len
torch.float16, # dtype
None, # q_stride
None, # k_stride
None, # v_stride
None, # k_cache_stride
None, # v_cache_stride
),
(
28, # n_q_head
28, # n_kv_head
15, # seq_len
128, # head_dim
0, # pos
2048, # k_cache_buf_len
2048, # v_cache_buf_len
[128, 10752, 1], # q_stride
[128, 10752, 1], # k_stride
[128, 10752, 1], # v_stride
[128, 3584, 1], # k_cache_stride
[128, 3584, 1], # v_cache_stride
),
]
args = get_args()
lib = open_lib()
@@ -406,12 +324,13 @@ if __name__ == "__main__":
infiniopAttentionDescriptor_t,
]
if args.cpu:
test_cpu(lib, test_cases)
if args.cuda:
test_cuda(lib, test_cases)
if args.bang:
test_bang(lib, test_cases)
if not (args.cpu or args.cuda or args.bang):
test_cpu(lib, test_cases)
# Configure testing options
DEBUG = args.debug
PROFILE = args.profile
NUM_PRERUN = args.num_prerun
NUM_ITERATIONS = args.num_iterations
# Execute tests
for device in get_test_devices(args):
test_operator(lib, device, test, test_cases, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
@@ -88,6 +88,7 @@ def test(
padding,
strides,
tensor_dtype=torch.float16,
sync=None
):
print(
f"Testing AvgPool on {torch_device} with x_shape:{x_shape} kernel_shape:{k_shape} padding:{padding} strides:{strides} dtype:{tensor_dtype}"
@@ -109,6 +110,10 @@ def test(
x_tensor = to_tensor(x, lib)
y_tensor = to_tensor(y, lib)
if sync is not None:
sync()
descriptor = infiniopAvgPoolDescriptor_t()
check_error(
......
@@ -37,7 +37,7 @@ _TENSOR_DTYPES = [torch.float16]
# Tolerance map for different data types
_TOLERANCE_MAP = {
torch.float16: {"atol": 0, "rtol": 1e-2},
torch.float16: {"atol": 1e-3, "rtol": 1e-2},
}
@@ -87,6 +87,7 @@ def test(
y_stride=None,
inplace=Inplace.OUT_OF_PLACE,
dtype=torch.float16,
sync=None
):
print(
f"Testing CausalSoftmax on {torch_device} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} dtype:{dtype} inplace:{inplace}"
@@ -107,6 +108,9 @@ def test(
y = torch.zeros(shape, dtype=dtype).to(torch_device)
y = rearrange_if_needed(y, y_stride)
y_tensor = to_tensor(y, lib)
if sync is not None:
sync()
descriptor = infiniopCausalSoftmaxDescriptor_t()
check_error(
@@ -139,6 +143,9 @@ def test(
)
lib_causal_softmax()
if sync is not None:
sync()
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
......
#!/usr/bin/env python3
import torch
import ctypes
from ctypes import POINTER, Structure, c_int32, c_size_t, c_uint64, c_void_p, c_float
from libinfiniop import (
infiniopHandle_t,
infiniopTensorDescriptor_t,
open_lib,
to_tensor,
get_test_devices,
check_error,
rearrange_if_needed,
create_workspace,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
)
from enum import Enum, auto
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
_TEST_CASES_ = [
# shape, x_stride, y_stride, min_val, max_val
# Basic shape tests
((10,), None, None, -1.0, 1.0),
((5, 10), None, None, -1.0, 1.0),
((2, 3, 4), None, None, -1.0, 1.0),
# Different min_val/max_val ranges
((10,), None, None, 0.0, 2.0),
((5, 10), None, None, 0.0, 2.0),
((2, 3, 4), None, None, 0.0, 2.0),
((10,), None, None, -2.0, 0.0),
((5, 10), None, None, -2.0, 0.0),
((2, 3, 4), None, None, -2.0, 0.0),
# Odd shape tests
((7, 13), None, None, -1.0, 1.0), # prime dimensions
((3, 5, 7), None, None, -1.0, 1.0), # 3D prime dimensions
# Non-standard shape tests
((1, 1), None, None, -1.0, 1.0), # minimal shape
((100, 100), None, None, -1.0, 1.0), # large shape
((16, 16, 16), None, None, -1.0, 1.0), # large 3D
# Extreme value tests
((10,), None, None, -1000.0, 1000.0), # large range
((10,), None, None, -0.001, 0.001), # small range
((10,), None, None, 0.0, 0.0), # min == max
# Special shape tests
((0,), None, None, -1.0, 1.0), # empty tensor
((1, 0), None, None, -1.0, 1.0), # empty dimension
]
_TENSOR_DTYPES = [torch.float16, torch.float32]
_TOLERANCE_MAP = {
torch.float16: {"atol": 1e-3, "rtol": 1e-3},
torch.float32: {"atol": 1e-7, "rtol": 1e-6},
}
class Inplace(Enum):
OUT_OF_PLACE = auto()
INPLACE_X = auto()
_INPLACE = [
Inplace.INPLACE_X,
Inplace.OUT_OF_PLACE,
]
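# Cross every base test case with each inplace mode.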
_TEST_CASES = [
test_case + (inplace_item,)
for test_case in _TEST_CASES_
for inplace_item in _INPLACE
]
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class ClipDescriptor(Structure):
_fields_ = [("device_type", c_int32), ("device_id", c_int32)]
infiniopClipDescriptor_t = POINTER(ClipDescriptor)
def clip(x, min_val, max_val):
return torch.clamp(x, min_val, max_val)
def create_tensor_with_stride(shape, stride, dtype, device):
"""Create a tensor with specific stride without using view() that might cause errors."""
x = torch.rand(shape, dtype=dtype, device=device) * 4.0 - 2.0 # Range: [-2, 2]
if stride is None:
return x
if len(shape) == 2 and len(stride) == 2:
if stride == (shape[1], 1):
return x.contiguous()
elif stride == (1, shape[0]):
return x.transpose(0, 1).contiguous().transpose(0, 1)
else:
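# Fallback: element-wise copy; the result is still contiguous, so arbitrary
# strides are not actually honored in this branch.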
y = torch.zeros(shape, dtype=dtype, device=device)
for i in range(shape[0]):
for j in range(shape[1]):
y[i, j] = x[i, j]
return y.contiguous()
return x
def test(
lib,
handle,
torch_device,
shape,
x_stride=None,
y_stride=None,
min_val=-1.0,
max_val=1.0,
inplace=Inplace.OUT_OF_PLACE,
dtype=torch.float32,
):
print(
f"Testing Clip on {torch_device} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} "
f"min_val:{min_val} max_val:{max_val} dtype:{dtype} inplace:{inplace}"
)
x = create_tensor_with_stride(shape, x_stride, dtype, torch_device)
ans = clip(x, min_val, max_val)
x = rearrange_if_needed(x, x_stride)
x_tensor = to_tensor(x, lib)
if inplace == Inplace.INPLACE_X:
y = x
y_tensor = x_tensor
else:
y = torch.zeros(shape, dtype=dtype).to(torch_device)
y = rearrange_if_needed(y, y_stride)
y_tensor = to_tensor(y, lib)
descriptor = infiniopClipDescriptor_t()
check_error(
lib.infiniopCreateClipDescriptor(
handle, ctypes.byref(descriptor), y_tensor.descriptor, x_tensor.descriptor
)
)
workspace_size = c_uint64(0)
check_error(
lib.infiniopGetClipWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = create_workspace(workspace_size.value, x.device)
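# create_workspace presumably returns None when no workspace is needed, hence the guard below.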
def lib_clip():
check_error(
lib.infiniopClip(
descriptor,
workspace.data_ptr() if workspace is not None else None,
workspace_size.value,
y_tensor.data,
x_tensor.data,
c_float(min_val),
c_float(max_val),
None,
)
)
lib_clip()
# Now we can destroy the tensor descriptors
x_tensor.destroyDesc(lib)
if inplace != Inplace.INPLACE_X:
y_tensor.destroyDesc(lib)
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG or not torch.allclose(y, ans, atol=atol, rtol=rtol):
print("\nExpected:")
print(ans)
print("\nActual:")
print(y)
print("\nDifference:")
print(torch.abs(y - ans))
print("\nMax difference:", torch.max(torch.abs(y - ans)).item())
debug(y, ans, atol=atol, rtol=rtol)
assert torch.allclose(y, ans, atol=atol, rtol=rtol)
# Profiling workflow
if PROFILE:
# fmt: off
profile_operation("PyTorch", lambda: clip(x, min_val, max_val), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_clip(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on
check_error(lib.infiniopDestroyClipDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
lib = open_lib()
lib.infiniopCreateClipDescriptor.restype = c_int32
lib.infiniopCreateClipDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopClipDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopGetClipWorkspaceSize.restype = c_int32
lib.infiniopGetClipWorkspaceSize.argtypes = [
infiniopClipDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopClip.restype = c_int32
lib.infiniopClip.argtypes = [
infiniopClipDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_float,
c_float,
c_void_p,
]
lib.infiniopDestroyClipDescriptor.restype = c_int32
lib.infiniopDestroyClipDescriptor.argtypes = [
infiniopClipDescriptor_t,
]
# Configure testing options
DEBUG = args.debug
PROFILE = args.profile
NUM_PRERUN = args.num_prerun
NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args):
test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
@@ -95,6 +95,7 @@ def test(
dilations,
tensor_stride=None,
tensor_dtype=torch.float16,
sync=None
):
assert len(pads) == len(strides) == len(dilations)
print(
@@ -118,8 +119,11 @@ def test(
x_tensor = to_tensor(x, lib)
w_tensor = to_tensor(w, lib)
y_tensor = to_tensor(y, lib)
descriptor = infiniopConvDescriptor_t()
if sync is not None:
sync()
descriptor = infiniopConvDescriptor_t()
check_error(
lib.infiniopCreateConvDescriptor(
handle,
......
@@ -52,6 +52,7 @@ def test(
y_stride=None,
x_stride=None,
tensor_dtype=torch.float16,
sync=None
):
print(
f"Testing Expand on {torch_device} with x_shape:{x_shape} y_shape:{y_shape} x_stride:{x_stride} y_stride:{y_stride} dtype:{tensor_dtype}"
@@ -76,8 +77,11 @@ def test(
x_tensor = to_tensor(x, lib)
y_tensor = to_tensor(y, lib)
descriptor = infiniopExpandDescriptor_t()
if sync is not None:
sync()
descriptor = infiniopExpandDescriptor_t()
check_error(
lib.infiniopCreateExpandDescriptor(
handle,
......
@@ -83,6 +83,7 @@ def test(
b_stride=None,
c_stride=None,
dtype=torch.float16,
sync=None
):
print(
f"Testing Gemm on {torch_device} with alpha:{alpha}, beta:{beta},"
@@ -104,6 +105,9 @@ def test(
]
a_tensor, b_tensor, c_tensor = [to_tensor(tensor, lib) for tensor in [a, b, c]]
if sync is not None:
sync()
descriptor = infiniopGemmDescriptor_t()
check_error(
lib.infiniopCreateGemmDescriptor(
......
@@ -51,6 +51,7 @@ def test(
torch_device,
x_shape,
tensor_dtype=torch.float16,
sync=None
):
print(
f"Testing GlobalAvgPool on {torch_device} with input tensor_shape: {x_shape} dtype: {tensor_dtype}"
@@ -70,8 +71,11 @@ def test(
x_tensor = to_tensor(x, lib)
y_tensor = to_tensor(y, lib)
descriptor = infiniopGlobalAvgPoolDescriptor_t()
if sync is not None:
sync()
descriptor = infiniopGlobalAvgPoolDescriptor_t()
check_error(
lib.infiniopCreateGlobalAvgPoolDescriptor(
handle,
......
@@ -423,6 +423,7 @@ def test_operator(lib, device, test_func, test_cases, tensor_dtypes):
infiniDeviceEnum_str_map[device],
*test_case,
tensor_dtype,
get_sync_func(device)
)
finally:
destroy_handle(lib, handle)
@@ -471,3 +472,15 @@ def get_test_devices(args):
devices_to_test = [InfiniDeviceEnum.CPU]
return devices_to_test
def get_sync_func(device):
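# Resolve the torch synchronize function for a device (e.g. torch.cuda.synchronize);
# the CPU needs no explicit synchronization.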
import torch
device_str = infiniDeviceEnum_str_map[device]
if device == InfiniDeviceEnum.CPU:
sync = None
else:
sync = getattr(torch, device_str).synchronize
return sync
@@ -83,6 +83,7 @@ def test(
padding,
strides,
tensor_dtype=torch.float16,
sync=None
):
print(
f"Testing MaxPool on {torch_device} with x_shape:{x_shape} kernel_shape:{k_shape} padding:{padding} strides:{strides} dtype:{tensor_dtype}"
@@ -104,8 +105,11 @@ def test(
x_tensor = to_tensor(x, lib)
y_tensor = to_tensor(y, lib)
descriptor = infiniopMaxPoolDescriptor_t()
if sync is not None:
sync()
descriptor = infiniopMaxPoolDescriptor_t()
check_error(
lib.infiniopCreateMaxPoolDescriptor(
handle,
......
@@ -65,6 +65,7 @@ def test(
y_stride=None,
w12_stride=None,
w3_stride=None,
sync=None
):
print(
f"Testing MLP on {torch_device} with num_tokens:{num_tokens} hidden_size:{hidden_size} intermediate_size:{intermediate_size}"
@@ -97,6 +98,10 @@ def test(
x_tensor = to_tensor(x, lib)
w12_tensor = to_tensor(w12, lib)
w3_tensor = to_tensor(w3, lib)
if sync is not None:
sync()
descriptor = infiniopMLPDescriptor_t()
check_error(
lib.infiniopCreateMLPDescriptor(
......
@@ -103,6 +103,7 @@ def test(
topk,
temperature,
dtype=torch.float16,
sync=None
):
print(
f"Testing RandomSample on {torch_device} with voc:{voc} random_val:{random_val} topp:{topp} topk:{topk} temperature:{temperature} dtype:{dtype}"
@@ -122,6 +123,9 @@ def test(
indices_tensor.descriptor.contents.dt = InfiniDtype.U64 # treat int64 as uint64
if sync is not None:
sync()
descriptor = infiniopRandomSampleDescriptor_t()
check_error(
lib.infiniopCreateRandomSampleDescriptor(
......
@@ -131,6 +131,7 @@ def test(
x_stride,
y_stride,
dtype=torch.float16,
sync=None
):
print(
f"Testing Rerrange on {torch_device} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} dtype:{dtype}"
@@ -145,6 +146,9 @@ def test(
]
x_tensor, y_tensor = [to_tensor(tensor, lib) for tensor in [x, y]]
if sync is not None:
sync()
descriptor = infiniopRearrangeDescriptor_t()
check_error(
......
@@ -55,6 +55,7 @@ def test(
tensor_shape,
tensor_dtype=torch.float16,
inplace=Inplace.OUT_OF_PLACE,
sync=None
):
print(
f"Testing Relu on {torch_device} with tensor_shape:{tensor_shape} dtype:{tensor_dtype} inplace: {inplace.name}"
@@ -78,8 +79,11 @@ def test(
x_tensor = to_tensor(x, lib)
y_tensor = to_tensor(y, lib) if inplace == Inplace.OUT_OF_PLACE else x_tensor
descriptor = infiniopReluDescriptor_t()
if sync is not None:
sync()
descriptor = infiniopReluDescriptor_t()
check_error(
lib.infiniopCreateReluDescriptor(
handle,
......
@@ -72,6 +72,7 @@ def test(
x_stride,
w_dtype=torch.float16,
dtype=torch.float16,
sync=None
):
print(
f"Testing RMS_Norm on {torch_device} with y_shape:{y_shape} x_shape:{x_shape} w_shape:{w_shape}"
@@ -89,9 +90,11 @@ def test(
rearrange_if_needed(tensor, stride)
for tensor, stride in zip([x, y], [x_stride, y_stride])
]
x_tensor, y_tensor, w_tensor = [to_tensor(tensor, lib) for tensor in [x, y, w]]
if sync is not None:
sync()
descriptor = infiniopRMSNormDescriptor_t()
check_error(
......
@@ -117,6 +117,7 @@ def test(
y_strides=None,
inplace=Inplace.OUT_OF_PLACE,
dtype=torch.float32,
sync=None
):
if inplace == Inplace.INPLACE_X:
y_strides = x_strides
@@ -147,8 +148,8 @@ def test(
else:
y_tensor = to_tensor(y, lib)
if torch_device == "npu":
synchronize_device(torch_device)
if sync is not None:
sync()
check_error(
lib.infiniopCreateRoPEDescriptor(
@@ -188,6 +189,9 @@ def test(
)
lib_rope()
if sync is not None:
sync()
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
......