Unverified Commit 8d09630a authored by gongchensu's avatar gongchensu Committed by GitHub
Browse files

Merge branch 'demo131' into Issue/862

parents ab52dead 012df56c
......@@ -4,10 +4,15 @@ import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import torch
from framework.base import BaseOperatorTest, TensorSpec, TestCase
from framework.runner import GenericTestRunner
from framework.tensor import TensorInitializer
from framework.utils import convert_infinicore_to_torch
from framework import (
BaseOperatorTest,
GenericTestRunner,
TensorInitializer,
TensorSpec,
TestCase,
convert_infinicore_to_torch,
)
import infinicore
......@@ -109,14 +114,9 @@ class OpTest(BaseOperatorTest):
def infinicore_operator(self, x, weight):
"""InfiniCore nn.Embedding implementation"""
if x.device.type != "cpu":
# 将 input的数据 转移到 cpu 上
x_torch = convert_infinicore_to_torch(x)
x_torch_cpu = x_torch.contiguous().cpu()
x = infinicore.from_torch(x_torch_cpu)
# Note: embedding now supports device-side input for graph recording
# No need to convert to CPU anymore - the implementation handles both CPU and device inputs
num_embeddings, embedding_dim = weight.shape
model = infinicore.nn.Embedding(
......
......@@ -4,8 +4,13 @@ import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import torch
from framework.base import BaseOperatorTest, TensorSpec, TestCase
from framework.runner import GenericTestRunner
from framework import (
BaseOperatorTest,
GenericTestRunner,
TensorSpec,
TestCase
)
import infinicore
......
......@@ -4,8 +4,13 @@ import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import torch
from framework.base import BaseOperatorTest, TensorSpec, TestCase
from framework.runner import GenericTestRunner
from framework import (
BaseOperatorTest,
GenericTestRunner,
TensorSpec,
TestCase
)
import infinicore
......
......@@ -4,8 +4,13 @@ import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import torch
from framework.base import BaseOperatorTest, TensorSpec, TestCase
from framework.runner import GenericTestRunner
from framework import (
BaseOperatorTest,
GenericTestRunner,
TensorSpec,
TestCase
)
from infinicore.nn.functional import RopeAlgo
import infinicore
......
......@@ -7,6 +7,7 @@ import torch
import infinicore
from framework import (
BaseOperatorTest,
CaseResult,
TensorSpec,
TestCase,
GenericTestRunner,
......@@ -76,7 +77,7 @@ class OpTest(BaseOperatorTest):
and isinstance(test_case.inputs[0], TensorSpec)
and test_case.inputs[0].strides is not None
):
return TestResult(
return CaseResult(
success=False,
return_code=-2,
test_case=test_case,
......
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import torch
import infinicore
from framework import (
BaseOperatorTest,
TensorSpec,
TestCase,
GenericTestRunner,
is_broadcast,
)
# ==============================================================================
# Operator-specific configuration
# ==============================================================================
# Test cases format: (y_shape, a_shape, b_shape, w_shape, y_strides, a_strides, b_strides)
_TEST_CASES_DATA = [
# Basic cases
((1, 4), (1, 4), (1, 4), (4,), None, None, None),
((2, 4), (2, 4), (2, 4), (4,), None, None, None),
((2, 2, 4), (2, 2, 4), (2, 2, 4), (4,), None, None, None),
# Strided cases
((2, 2, 4), (2, 2, 4), (2, 2, 4), (4,), (12, 8, 1), (12, 8, 1), (12, 8, 1)),
# Large tensors
((16, 2048), (16, 2048), (16, 2048), (2048,), None, None, None),
((16, 2048), (16, 2048), (16, 2048), (2048,), (4096, 1), (4096, 1), (4096, 1)),
((15, 3584), (15, 3584), (15, 3584), (3584,), None, None, None),
((4, 4, 2048), (4, 4, 2048), (4, 4, 2048), (2048,), None, None, None),
(
(4, 4, 2048),
(4, 4, 2048),
(4, 4, 2048),
(2048,),
(2048, 8192, 1),
(2048, 8192, 1),
(2048, 8192, 1),
),
(
(4, 4, 2048),
(4, 4, 2048),
(4, 4, 2048),
(2048,),
(16384, 4096, 1),
(16384, 4096, 1),
(16384, 4096, 1),
),
]
# Tolerance configuration
_TOLERANCE_MAP = {
infinicore.float16: {"atol": 2e-3, "rtol": 2e-3},
infinicore.bfloat16: {"atol": 1e-2, "rtol": 1e-2},
infinicore.float32: {"atol": 1e-5, "rtol": 1e-4},
}
# Data types for individual tensors
_INPUT_DTYPES = [infinicore.float16, infinicore.bfloat16]
_WEIGHT_DTYPES = [infinicore.float16, infinicore.bfloat16, infinicore.float32]
# EPSILON constant for AddRMSNorm
_EPSILON = 1e-5
def parse_test_cases():
    """
    Parse AddRMSNorm test case data and return list of TestCase objects.

    Format: (y_shape, a_shape, b_shape, w_shape, y_strides, a_strides, b_strides)
    """
    test_cases = []
    for data in _TEST_CASES_DATA:
        y_shape = data[0]  # Output shape
        a_shape = data[1]  # First input shape
        b_shape = data[2]  # Second input shape
        w_shape = data[3]  # Weight shape (1D)
        y_strides = data[4] if len(data) > 4 else None
        a_strides = data[5] if len(data) > 5 else None
        b_strides = data[6] if len(data) > 6 else None
        # In-place support flags are only consumed by the disabled INPLACE
        # case below; kept so it can be re-enabled without re-deriving them.
        a_supports_inplace = not is_broadcast(a_strides)
        b_supports_inplace = not is_broadcast(b_strides)
        y_supports_inplace = not is_broadcast(y_strides)
        # Generate test cases for all dtype combinations
        for input_dtype in _INPUT_DTYPES:
            for weight_dtype in _WEIGHT_DTYPES:
                # Use input dtype tolerance for output
                tolerance = _TOLERANCE_MAP.get(
                    input_dtype, {"atol": 1e-5, "rtol": 1e-4}
                )
                # Create typed tensor specs
                a_spec = TensorSpec.from_tensor(a_shape, a_strides, input_dtype)
                b_spec = TensorSpec.from_tensor(b_shape, b_strides, input_dtype)
                w_spec = TensorSpec.from_tensor(
                    w_shape, None, weight_dtype
                )  # Weight is always contiguous
                # y_spec is only needed by the disabled INPLACE case below.
                y_spec = TensorSpec.from_tensor(y_shape, y_strides, input_dtype)
                # Test Case 1: Out-of-place (return value) - returns (normalized_result, add_result)
                test_cases.append(
                    TestCase(
                        inputs=[a_spec, b_spec, w_spec],
                        kwargs={"epsilon": _EPSILON},
                        output_specs=None,  # Two outputs
                        comparison_target=None,
                        tolerance=tolerance,
                        output_count=2,  # Two outputs: normalized_result and add_result
                        description="AddRMSNorm - OUT_OF_PLACE",
                    )
                )
                # Test Case 2 (disabled): in-place with explicit output tensors
                # (add_rms_norm_(y, residual_out, a, b, w))
                # if y_supports_inplace:
                #     residual_out_spec = TensorSpec.from_tensor(
                #         a_shape, a_strides, input_dtype
                #     )
                #     test_cases.append(
                #         TestCase(
                #             inputs=[a_spec, b_spec, w_spec],
                #             kwargs={
                #                 "epsilon": _EPSILON,
                #                 "out": y_spec,
                #                 "residual": residual_out_spec,
                #             },
                #             output_specs=[y_spec, residual_out_spec],  # Two outputs
                #             comparison_target="out",
                #             tolerance=tolerance,
                #             output_count=2,
                #             description=f"AddRMSNorm - INPLACE(out)",
                #         )
                #     )
    return test_cases
class OpTest(BaseOperatorTest):
    """AddRMSNorm operator test with simplified implementation"""

    def __init__(self):
        super().__init__("AddRMSNorm")

    def get_test_cases(self):
        return parse_test_cases()

    def torch_operator(
        self, a, b, weight, epsilon=_EPSILON, out=None, residual=None, **kwargs
    ):
        """PyTorch reference - returns (normalized_result, add_result)."""
        original_dtype = a.dtype
        # Residual add and normalization run in fp32 for accuracy.
        added = a.to(torch.float32) + b.to(torch.float32)
        w32 = weight.to(torch.float32)
        # RMSNorm over the last dimension: x * rsqrt(mean(x^2) + eps) * w
        mean_sq = added.pow(2).mean(-1, keepdim=True)
        normed = added * torch.rsqrt(mean_sq + epsilon) * w32
        # Convert back to the input dtype before returning/copying out.
        normed = normed.to(original_dtype)
        added_out = added.to(original_dtype)
        # Optionally write into caller-provided buffers.
        if out is not None:
            out.copy_(normed)
        if residual is not None:
            residual.copy_(added_out)
        return (normed, added_out)

    def infinicore_operator(
        self, a, b, weight, epsilon=_EPSILON, out=None, residual=None, **kwargs
    ):
        """InfiniCore AddRMSNorm - returns (normalized_result, add_result)."""
        return infinicore.add_rms_norm(
            a, b, weight, epsilon, out=out, residual=residual
        )
def main():
    """Main entry point: run the AddRMSNorm suite and exit with its status."""
    GenericTestRunner(OpTest).run_and_exit()


if __name__ == "__main__":
    main()
......@@ -6,7 +6,7 @@ sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import torch
from framework import BaseOperatorTest, TensorSpec, TestCase, GenericTestRunner
from framework.tensor import TensorInitializer
from framework.utils import (
from framework.utils.tensor_utils import (
convert_infinicore_to_torch,
infinicore_tensor_from_torch,
to_torch_dtype,
......@@ -102,23 +102,9 @@ class OpTest(BaseOperatorTest):
def infinicore_operator(self, input, weight, out=None, **kwargs):
"""InfiniCore Embedding implementation"""
if input.device.type == "cpu":
input_cpu = input
else:
# 将 input的数据 转移到 cpu 上
torch_reference = torch.zeros(
input.shape,
dtype=to_torch_dtype(input.dtype),
device="cpu" if "cpu" == input.device.type else "cuda",
)
torch_reference = convert_infinicore_to_torch(input)
torch_reference = torch_reference.contiguous().cpu()
# 创建cpu的 input
input_cpu = infinicore_tensor_from_torch(torch_reference)
return infinicore.nn.functional.embedding(input_cpu, weight, out=out)
# Note: embedding now supports device-side input for graph recording
# No need to convert to CPU anymore - the implementation handles both CPU and device inputs
return infinicore.nn.functional.embedding(input, weight, out=out)
def main():
......
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import torch
import infinicore
from framework import (
BaseOperatorTest,
TensorSpec,
TensorInitializer,
TestCase,
GenericTestRunner,
)
# Test cases format: (q_shape, k_shape, v_shape, attn_mask_or_None, dropout_p, is_causal)
# q/k/v typically have shape (..., seq_len, head_dim) or (batch, seq_len, num_heads, head_dim)
_TEST_CASES_DATA = [
((1, 1, 2, 16), (1, 1, 8, 16), (1, 1, 8, 16), None, 0.0, False),
((1, 2, 128, 16), (1, 2, 256, 16), (1, 2, 256, 16), None, 0.0, False),
((1, 1, 4, 32), (1, 1, 32, 32), (1, 1, 32, 32), None, 0.0, True),
((1, 8, 256, 16), (1, 8, 512, 16), (1, 8, 512, 16), None, 0.0, True),
((1, 8, 4, 16), (1, 8, 64, 16), (1, 8, 64, 16), None, 0.0, False),
((8, 28, 256, 128), (8, 28, 512, 128), (8, 28, 512, 128), None, 0.0, True),
]
_TOLERANCE_MAP = {
infinicore.float16: {"atol": 1e-2, "rtol": 1e-2},
infinicore.bfloat16: {"atol": 1e-2, "rtol": 1e-2},
infinicore.float32: {"atol": 1e-3, "rtol": 1e-3},
}
_TENSOR_DTYPES = [infinicore.float16, infinicore.bfloat16, infinicore.float32]
def parse_test_cases():
    """Build one flash-attention TestCase per (shape, dtype) combination."""
    import random

    test_cases = []
    for q_shape, k_shape, v_shape, attn_mask, dropout_p, is_causal in _TEST_CASES_DATA:
        for dtype in _TENSOR_DTYPES:
            tolerance = _TOLERANCE_MAP[dtype]
            q_spec = TensorSpec.from_tensor(q_shape, None, dtype)
            k_spec = TensorSpec.from_tensor(k_shape, None, dtype)
            v_spec = TensorSpec.from_tensor(v_shape, None, dtype)
            # Draw an effective KV length in [1, kv_seq_len]; the int64 length
            # tensor is filled with exactly this value (RANDINT over [low, high)).
            effective_len = random.randint(1, k_shape[2])
            kv_len_spec = TensorSpec.from_tensor(
                (q_shape[0],),
                None,
                infinicore.int64,
                init_mode=TensorInitializer.RANDINT,
                low=effective_len,
                high=effective_len + 1,
            )
            # Drop None-valued kwargs (e.g. a missing attn_mask) before forwarding.
            call_kwargs = {
                name: value
                for name, value in {
                    "attn_mask": attn_mask,
                    "dropout_p": dropout_p,
                    "is_causal": is_causal,
                }.items()
                if value is not None
            }
            test_cases.append(
                TestCase(
                    inputs=[q_spec, k_spec, v_spec, kv_len_spec, effective_len],
                    kwargs=call_kwargs,
                    output_spec=None,
                    comparison_target=None,
                    tolerance=tolerance,
                    description="Flash Attention",
                )
            )
    return test_cases
def torch_flash_attn(q, k, v, total_kv_len, cheat, **kwargs):
    """Reference attention over only the first `cheat` KV positions.

    `total_kv_len` is unused here: the reference "cheats" by slicing K/V
    directly with the known effective length instead of masking.
    """
    return torch.nn.functional.scaled_dot_product_attention(
        q, k[..., :cheat, :], v[..., :cheat, :], **kwargs
    )
def infini_flash_attn(q, k, v, total_kv_len, cheat, **kwargs):
    # InfiniCore path: pass the KV-length tensor to the kernel; `cheat`
    # (the raw python int used by the torch reference) is intentionally unused.
    return infinicore.nn.functional.flash_attention(q, k, v, total_kv_len, **kwargs)
class OpTest(BaseOperatorTest):
    """ScaledDotProductAttention operator test with simplified implementation"""

    def __init__(self):
        super().__init__("ScaledDotProductAttention")

    def get_test_cases(self):
        # Cases are regenerated (with fresh random KV lengths) on each call.
        return parse_test_cases()

    def torch_operator(self, *args, **kwargs):
        # Delegates to the sliced-SDPA reference implementation.
        return torch_flash_attn(*args, **kwargs)

    def infinicore_operator(self, *args, **kwargs):
        # Delegates to the InfiniCore flash_attention wrapper.
        return infini_flash_attn(*args, **kwargs)
def main():
    """Main entry point: run the flash-attention suite and exit."""
    GenericTestRunner(OpTest).run_and_exit()


if __name__ == "__main__":
    main()
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import torch
import infinicore
from framework import (
BaseOperatorTest,
TensorSpec,
TensorInitializer,
TestCase,
GenericTestRunner,
is_broadcast,
)
# ==============================================================================
# Operator-specific configuration
# ==============================================================================
# Test cases format: (shape (bs, nkvh, seq_len, hd), strides)
_TEST_CASES_DATA = [
((1, 1, 8, 1), None),
((1, 8, 32, 32), None),
((8, 8, 64, 32), None),
((1, 32, 8, 64), (32768, 1024, 64, 1)),
((4, 8, 32, 16), (65536, 8192, 256, 16)),
((8, 16, 64, 128), (8388608, 524288, 8192, 1)),
]
# Tolerance configuration
_TOLERANCE_MAP = {
infinicore.float16: {"atol": 0, "rtol": 0},
infinicore.bfloat16: {"atol": 0, "rtol": 0},
infinicore.float32: {"atol": 0, "rtol": 0},
}
# Data types to test
_TENSOR_DTYPES = [infinicore.float16, infinicore.bfloat16, infinicore.float32]
def parse_test_cases():
    """Build KV-caching TestCase objects.

    For each cache shape, draws a random new-KV sequence length and a random
    past length such that past_len + kv_len fits in the cache, then emits one
    case per dtype.
    """
    # Hoisted out of the loop: importing inside the loop body was a smell.
    import random

    test_cases = []
    for data in _TEST_CASES_DATA:
        cache_shape = data[0]
        strides = data[1]
        # New K/V to append: random sequence length in [1, cache capacity].
        kv_shape = (
            cache_shape[0],
            cache_shape[1],
            random.randint(1, cache_shape[2]),
            cache_shape[3],
        )
        past_shape = (cache_shape[0],)
        # Past length chosen so the appended tokens still fit in the cache.
        past_length = random.randint(0, cache_shape[2] - kv_shape[2])
        for dtype in _TENSOR_DTYPES:
            tolerance = _TOLERANCE_MAP.get(dtype, {"atol": 0, "rtol": 0})
            cache_spec = TensorSpec.from_tensor(cache_shape, strides, dtype)
            kv_spec = TensorSpec.from_tensor(kv_shape, None, dtype)
            # int64 tensor holding exactly past_length (RANDINT over [low, high)).
            past_kv_lengths_spec = TensorSpec.from_tensor(
                past_shape,
                None,
                infinicore.int64,
                init_mode=TensorInitializer.RANDINT,
                low=past_length,
                high=past_length + 1,
            )
            test_cases.append(
                TestCase(
                    inputs=[
                        cache_spec,
                        cache_spec,
                        kv_spec,
                        kv_spec,
                        past_kv_lengths_spec,
                    ],
                    kwargs={},
                    output_spec=None,
                    comparison_target=[0, 1],
                    tolerance=tolerance,
                    description="KV Caching",
                )
            )
    return test_cases
def torch_kv_caching(k_cache, v_cache, k, v, past_kv_lengths):
    """Reference KV-cache append.

    Writes k/v (shape [bs, nkvh, seq_len, hd]) into the caches at positions
    [past_len, past_len + seq_len) along the sequence axis, per batch row.
    Mutates k_cache/v_cache in place and returns them.
    """
    batch_size = k_cache.shape[0]
    seq_len = k.shape[2]
    for b in range(batch_size):
        past_len = past_kv_lengths[b].item()
        # All heads are written at once; the per-head Python loop was redundant.
        k_cache[b, :, past_len : past_len + seq_len, :] = k[b]
        v_cache[b, :, past_len : past_len + seq_len, :] = v[b]
    return k_cache, v_cache
def infinicore_kv_caching(k_cache, v_cache, k, v, past_kv_lengths):
    # The kernel writes into the caches in place; return them so the framework
    # can compare both caches against the reference (comparison_target=[0, 1]).
    infinicore.kv_caching(k_cache, v_cache, k, v, past_kv_lengths)
    return k_cache, v_cache
class OpTest(BaseOperatorTest):
    """KV Caching operator test: appends new K/V tokens into batched caches."""

    def __init__(self):
        super().__init__("KV Caching")

    def get_test_cases(self):
        # Cases use random lengths; regenerated per call.
        return parse_test_cases()

    def torch_operator(self, *args, **kwargs):
        # Pure-PyTorch reference append.
        return torch_kv_caching(*args, **kwargs)

    def infinicore_operator(self, *args, **kwargs):
        # InfiniCore kernel under test.
        return infinicore_kv_caching(*args, **kwargs)
def main():
    """Main entry point: run the KV-caching suite and exit."""
    GenericTestRunner(OpTest).run_and_exit()


if __name__ == "__main__":
    main()
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import torch
import infinicore
from framework import (
BaseOperatorTest,
TensorSpec,
TestCase,
GenericTestRunner,
is_broadcast,
TensorInitializer,
)
# ==============================================================================
# Operator-specific configuration
# ==============================================================================
# Test cases format:
_TEST_CASES_DATA = [
# (num_seqs, num_heads, num_kv_heads, head_size, block_size, max_seq_len, use_alibi)
(1, 1, 1, 128, 16, 15, False),
(4, 40, 40, 128, 16, 1024, False),
(6, 40, 40, 128, 16, 1024, False),
(3, 8, 8, 128, 16, 1024, False),
(3, 8, 8, 64, 16, 1024, False),
(8, 64, 8, 128, 16, 2048, False),
]
# Tolerance configuration
_TOLERANCE_MAP = {
infinicore.float16: {"atol": 0, "rtol": 1e-2},
infinicore.float32: {"atol": 1e-4, "rtol": 1e-3},
infinicore.bfloat16: {"atol": 0, "rtol": 5e-2},
}
# Data types to test
_TENSOR_DTYPES = [infinicore.float16, infinicore.bfloat16]
# ==============================================================================
# Reference Implementation
# ==============================================================================
def parse_test_cases():
    """
    Parse test case data and return list of TestCase objects for paged_attention operation.
    Each test case contains all necessary information for execution and validation.
    """
    test_cases = []
    for (
        num_seqs,
        num_heads,
        num_kv_heads,
        head_size,
        block_size,
        max_seq_len,
        _use_alibi,  # currently unused: alibi_slopes is always passed as None
    ) in _TEST_CASES_DATA:
        scale = 1.0 / (head_size**0.5)
        max_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
        num_blocks = num_seqs * max_blocks_per_seq  # enough blocks for every sequence
        # Random per-sequence context lengths in [1, max_seq_len).
        cache_lens_torch = torch.randint(1, max_seq_len, (num_seqs,), dtype=torch.int64)
        # Each sequence owns a contiguous run of block ids.
        block_tables = torch.arange(
            0, num_seqs * max_blocks_per_seq, dtype=torch.int64
        ).view(num_seqs, max_blocks_per_seq)
        q_shape = (num_seqs, num_heads, head_size)
        k_cache_shape = (num_blocks, num_kv_heads, block_size, head_size)
        v_cache_shape = (num_blocks, num_kv_heads, block_size, head_size)
        block_tables_shape = block_tables.shape
        cache_lens_shape = cache_lens_torch.shape
        # Generate test cases for all data types
        for dtype in _TENSOR_DTYPES:
            tolerance = _TOLERANCE_MAP.get(dtype, {"atol": 0, "rtol": 1e-3})
            # Create typed tensor specs
            q_spec = TensorSpec.from_tensor(q_shape, None, dtype)
            k_cache_spec = TensorSpec.from_tensor(k_cache_shape, None, dtype)
            v_cache_spec = TensorSpec.from_tensor(v_cache_shape, None, dtype)
            block_tables_spec = TensorSpec.from_tensor(
                block_tables_shape,
                init_mode=TensorInitializer.MANUAL,
                set_tensor=block_tables,
                dtype=infinicore.int64,
            )
            cache_lens_spec = TensorSpec.from_tensor(
                cache_lens_shape,
                init_mode=TensorInitializer.MANUAL,
                set_tensor=cache_lens_torch,
                dtype=infinicore.int64,
            )
            # NOTE: the previous out_shape/out_spec locals were dead code
            # (output_spec is None below) and have been removed.
            test_cases.append(
                TestCase(
                    inputs=[
                        q_spec,
                        k_cache_spec,
                        v_cache_spec,
                        block_tables_spec,
                        cache_lens_spec,
                    ],
                    kwargs={"alibi_slopes": None, "scale": scale},
                    output_spec=None,
                    comparison_target=0,
                    tolerance=tolerance,
                    description="PagedAttention",
                )
            )
    return test_cases
def ref_masked_attention(query, key, value, scale, attn_mask=None):
    """Reference masked attention for a single query step.

    query: [q, h, d]; key/value: [k, h, d]; returns [q, h, d].
    """
    # Logits in fp32 for numerical stability.
    logits = scale * torch.einsum("qhd,khd->hqk", query, key).float()
    if attn_mask is not None:
        logits = logits + attn_mask.float()
    probs = torch.nn.functional.softmax(logits, dim=-1).to(value.dtype)
    return torch.einsum("hqk,khd->qhd", probs, value)
def ref_single_query_cached_kv_attention(
    query, key_cache, value_cache, block_tables, cache_lens, alibi_slopes, scale
):
    # Reference implementation for paged attention, iterating through each sequence.
    #
    # query:            [num_seqs, num_query_heads, head_size] (one token per sequence)
    # key/value_cache:  [num_blocks, num_kv_heads, block_size, head_size]
    # block_tables:     per-sequence block ids; cache_lens: valid length per sequence
    output = torch.empty_like(query)
    num_query_heads, num_kv_heads = query.shape[1], value_cache.shape[1]
    num_queries_per_kv = num_query_heads // num_kv_heads
    head_size, block_size = value_cache.shape[3], value_cache.shape[2]
    num_seqs = query.shape[0]
    for i in range(num_seqs):
        q = query[i].unsqueeze(0)
        seq_len = cache_lens[i].item()
        block_table = block_tables[i]
        keys_lst, values_lst = [], []
        # Gather the K/V vectors for every valid position from the paged cache.
        for j in range(seq_len):
            block_num = block_table[j // block_size].item()
            block_off = j % block_size
            k = key_cache[block_num, :, block_off, :]
            v = value_cache[block_num, :, block_off, :]
            keys_lst.append(k)
            values_lst.append(v)
        keys = torch.stack(keys_lst, dim=0)
        values = torch.stack(values_lst, dim=0)
        if num_queries_per_kv > 1:
            # Grouped-query attention: replicate each KV head for its query group.
            keys = torch.repeat_interleave(keys, num_queries_per_kv, dim=1)
            values = torch.repeat_interleave(values, num_queries_per_kv, dim=1)
        alibi_bias = None
        if alibi_slopes is not None:
            # ALiBi: per-head linear distance penalty (0 at the newest position).
            pos = torch.arange(seq_len, device=query.device).int()
            alibi_bias = (pos - seq_len + 1).float()
            alibi_bias = alibi_slopes.view(-1, 1, 1) * alibi_bias.view(1, 1, -1)
        out = ref_masked_attention(q, keys, values, scale, alibi_bias)
        output[i] = out.view(num_query_heads, head_size)
    return output
class OpTest(BaseOperatorTest):
    """PagedAttention operator test with simplified implementation"""

    def __init__(self):
        super().__init__("PagedAttention")

    def get_test_cases(self):
        return parse_test_cases()

    def torch_operator(self, *args, **kwargs):
        """PyTorch paged_attention reference implementation"""
        return ref_single_query_cached_kv_attention(*args, **kwargs)

    def infinicore_operator(self, *args, **kwargs):
        """InfiniCore paged_attention implementation"""
        out = infinicore.paged_attention(*args, **kwargs)
        # Block until the kernel completes so the comparison reads final data.
        infinicore.sync_stream()
        return out
def main():
    """Main entry point: run the paged-attention suite and exit."""
    GenericTestRunner(OpTest).run_and_exit()


if __name__ == "__main__":
    main()
import os
import sys
import torch
import infinicore
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from framework import (
BaseOperatorTest,
GenericTestRunner,
TensorInitializer,
TensorSpec,
TestCase,
)
# Test Cases: (num_seqs, num_heads, num_kv_heads, head_size, block_size, max_step_len, num_rounds)
_TEST_CASES_DATA = [
(1, 1, 1, 128, 8, 16, 1),
(1, 4, 4, 128, 8, 16, 4),
(2, 8, 8, 128, 16, 32, 2),
(4, 16, 16, 128, 8, 64, 3),
(8, 64, 64, 128, 8, 16, 5),
]
_TOLERANCE_MAP = {
infinicore.float16: {"atol": 1e-2, "rtol": 1e-2},
infinicore.float32: {"atol": 1e-4, "rtol": 1e-4}, # float32 调优容限
infinicore.bfloat16: {"atol": 2e-2, "rtol": 2e-2},
}
_TENSOR_DTYPES = [infinicore.float16, infinicore.bfloat16]
class SimpleCacheManager:
    """Minimal block allocator mimicking a paged-KV cache scheduler.

    Tracks, per request id, the list of owned block ids and the total
    number of tokens stored so far.
    """

    def __init__(self, num_blocks, block_size):
        self.num_blocks = num_blocks
        self.block_size = block_size
        self.free_blocks = list(range(num_blocks))
        self.request_to_blocks = {}
        self.request_to_len = {}

    def allocate_slots(self, request_id, num_new_tokens):
        """Reserve blocks for `num_new_tokens` more tokens of `request_id`.

        Returns (block_ids, new_total_len). Blocks are taken from the head
        of the free list, so allocation order is deterministic.
        """
        if request_id not in self.request_to_len:
            # First time we see this request: start from an empty allocation.
            self.request_to_len[request_id] = 0
            self.request_to_blocks[request_id] = []
        new_total_len = self.request_to_len[request_id] + num_new_tokens
        # Ceil-divide to find how many blocks the grown request needs.
        needed_blocks = -(-new_total_len // self.block_size)
        owned = self.request_to_blocks[request_id]
        while len(owned) < needed_blocks:
            owned.append(self.free_blocks.pop(0))
        self.request_to_len[request_id] = new_total_len
        return owned, new_total_len
def parse_test_cases():
    """Build multi-round prefill test cases.

    Simulates `num_rounds` prefill turns per configuration: each round appends
    freshly generated K/V tokens into persistent paged caches (allocated via
    SimpleCacheManager), then snapshots query/caches/tables/lengths into one
    TestCase per dtype.
    """
    test_cases = []
    for (
        num_seqs,
        num_heads,
        num_kv_heads,
        head_size,
        block_size,
        max_step_len,
        num_rounds,
    ) in _TEST_CASES_DATA:
        scale = head_size**-0.5
        num_blocks = 8192  # large fixed pool shared by all rounds
        manager = SimpleCacheManager(num_blocks, block_size)
        kv_lens = torch.zeros(num_seqs, dtype=torch.int64)
        # Persistent cache pools carried across rounds (fp32 master copies).
        persistent_k = torch.zeros((num_blocks, num_kv_heads, block_size, head_size))
        persistent_v = torch.zeros((num_blocks, num_kv_heads, block_size, head_size))
        for r in range(num_rounds):
            # Random number of new query tokens per sequence this round.
            q_lens = torch.randint(1, max_step_len + 1, (num_seqs,), dtype=torch.int64)
            kv_lens = kv_lens + q_lens
            total_q_tokens = q_lens.sum().item()
            # Exclusive prefix sums delimiting each sequence's packed queries.
            cum_seqlens_q = torch.zeros(num_seqs + 1, dtype=torch.int64)
            cum_seqlens_q[1:] = torch.cumsum(q_lens, dim=0)
            query_base = torch.randn((total_q_tokens, num_heads, head_size))
            round_block_tables_list = []
            for i in range(num_seqs):
                p_blocks, total_len = manager.allocate_slots(i, q_lens[i].item())
                round_block_tables_list.append(p_blocks)
                # History length = total length minus this round's new tokens.
                h_len = kv_lens[i].item() - q_lens[i].item()
                # Write this round's new K/V tokens into their paged slots.
                for t in range(q_lens[i].item()):
                    logical_pos = h_len + t
                    b_id = p_blocks[logical_pos // block_size]
                    off = logical_pos % block_size
                    persistent_k[b_id, :, off, :] = torch.randn(num_kv_heads, head_size)
                    persistent_v[b_id, :, off, :] = torch.randn(num_kv_heads, head_size)
            # Pad every sequence's block table to the same width with block 0.
            max_blks = max(len(t) for t in round_block_tables_list)
            padded_tables = torch.tensor(
                [t + [0] * (max_blks - len(t)) for t in round_block_tables_list]
            )
            # One TestCase per dtype; tensors are cloned so later rounds do not
            # mutate the snapshots captured here.
            for dtype in _TENSOR_DTYPES:
                tolerance = _TOLERANCE_MAP.get(dtype)
                test_cases.append(
                    TestCase(
                        inputs=[
                            TensorSpec.from_tensor(
                                query_base.shape,
                                init_mode=TensorInitializer.MANUAL,
                                set_tensor=query_base.clone(),
                                dtype=dtype,
                            ),
                            TensorSpec.from_tensor(
                                persistent_k.shape,
                                init_mode=TensorInitializer.MANUAL,
                                set_tensor=persistent_k.clone(),
                                dtype=dtype,
                            ),
                            TensorSpec.from_tensor(
                                persistent_v.shape,
                                init_mode=TensorInitializer.MANUAL,
                                set_tensor=persistent_v.clone(),
                                dtype=dtype,
                            ),
                            TensorSpec.from_tensor(
                                padded_tables.shape,
                                init_mode=TensorInitializer.MANUAL,
                                set_tensor=padded_tables.clone(),
                                dtype=infinicore.int64,
                            ),
                            TensorSpec.from_tensor(
                                kv_lens.shape,
                                init_mode=TensorInitializer.MANUAL,
                                set_tensor=kv_lens.clone(),
                                dtype=infinicore.int64,
                            ),
                            TensorSpec.from_tensor(
                                cum_seqlens_q.shape,
                                init_mode=TensorInitializer.MANUAL,
                                set_tensor=cum_seqlens_q.clone(),
                                dtype=infinicore.int64,
                            ),
                        ],
                        kwargs={"scale": scale},
                        tolerance=tolerance,
                        description=f"PagedAttentionPrefill_Round_{r}_{str(dtype).split('.')[-1]}",
                    )
                )
    return test_cases
def ref_paged_attention_multi_turn(
    query, k_cache, v_cache, block_tables, kv_lens, cum_seqlens_q, scale
):
    # Pure-PyTorch reference for varlen causal prefill over paged caches.
    #
    # query:         [total_q_tokens, num_heads, head_size], packed across sequences
    # kv_lens:       total context length (history + new tokens) per sequence
    # cum_seqlens_q: exclusive prefix sums delimiting each sequence's queries
    output = torch.zeros_like(query)
    num_seqs = len(kv_lens)
    block_size = k_cache.shape[2]
    for i in range(num_seqs):
        q_start, q_end = cum_seqlens_q[i].item(), cum_seqlens_q[i + 1].item()
        cur_q = query[q_start:q_end]
        q_len = q_end - q_start
        # History length = tokens cached before this round's queries.
        h_len = kv_lens[i].item() - q_len
        total_len = h_len + q_len
        table = block_tables[i]
        keys, values = [], []
        # Gather all cached K/V for this sequence from its paged blocks.
        for j in range(total_len):
            b_id = table[j // block_size].item()
            off = j % block_size
            keys.append(k_cache[b_id, :, off, :])
            values.append(v_cache[b_id, :, off, :])
        K = torch.stack(keys, dim=0)
        V = torch.stack(values, dim=0)
        # Scores in fp32 for stability.
        scores = torch.einsum("qhd,khd->hqk", cur_q.float(), K.float()) * scale
        # Causal mask: query t may attend to all history plus positions <= t.
        mask = torch.full((q_len, total_len), float("-inf"), device=query.device)
        for t in range(q_len):
            mask[t, : h_len + t + 1] = 0.0
        attn = torch.softmax(scores + mask.unsqueeze(0), dim=-1).to(query.dtype)
        output[q_start:q_end] = torch.einsum("hqk,khd->qhd", attn, V)
    return output
class OpTest(BaseOperatorTest):
    """PagedAttentionPrefill operator test (multi-round, varlen prefill)."""

    def __init__(self):
        super().__init__("PagedAttentionPrefill")

    def get_test_cases(self):
        return parse_test_cases()

    def torch_operator(
        self,
        query,
        k_cache,
        v_cache,
        block_tables,
        kv_lens,
        cum_seqlens_q,
        scale=1.0,
    ):
        # Pure-PyTorch causal reference over the paged caches.
        return ref_paged_attention_multi_turn(
            query, k_cache, v_cache, block_tables, kv_lens, cum_seqlens_q, scale
        )

    def infinicore_operator(
        self,
        query,
        k_cache,
        v_cache,
        block_tables,
        kv_lens,
        cum_seqlens_q,
        scale=1.0,
    ):
        out = infinicore.paged_attention_prefill(
            query,
            k_cache,
            v_cache,
            block_tables,
            kv_lens,
            cum_seqlens_q,
            alibi_slopes=None,
            scale=scale,
        )
        # Block until the kernel completes so the comparison reads final data.
        infinicore.sync_stream()
        return out
def main():
    """Main entry point: run the paged-attention prefill suite and exit."""
    GenericTestRunner(OpTest).run_and_exit()


if __name__ == "__main__":
    main()
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import torch
import infinicore
from framework import (
BaseOperatorTest,
TensorSpec,
TestCase,
GenericTestRunner,
is_broadcast,
TensorInitializer,
)
# ==============================================================================
# Operator-specific configuration
# ==============================================================================
# Test cases format: (num_seqs, max_seq_len, num_kv_heads, head_size, block_size)
_TEST_CASES_DATA = [
(1, 128, 8, 128, 16),
(5, 512, 40, 128, 16),
(16, 1024, 8, 64, 32),
(10, 1024, 40, 64, 32),
]
# Tolerance configuration
_TOLERANCE_MAP = {
infinicore.float16: {"atol": 0, "rtol": 1e-5},
infinicore.float32: {"atol": 0, "rtol": 1e-5},
infinicore.bfloat16: {"atol": 0, "rtol": 1e-5},
}
# Data types to test
_TENSOR_DTYPES = [infinicore.float16, infinicore.bfloat16, infinicore.float32]
# ==============================================================================
# Reference Implementation
# ==============================================================================
def ref_paged_caching(key_cache_pool, value_cache_pool, key, value, slot_mapping):
    """
    Reference implementation for the paged_caching operator.

    Scatters each token's K/V vectors into the paged cache pools at the flat
    slot given by slot_mapping. NOTE: the pools are updated IN PLACE (nothing
    is cloned here), and the mutated pools are returned.

    Args:
        key_cache_pool (torch.Tensor): K cache pool, shape [num_blocks, nkvh, block_size, dh]
        value_cache_pool (torch.Tensor): V cache pool, shape [num_blocks, nkvh, block_size, dh]
        key (torch.Tensor): Keys, shape [ntok, nkvh, dh]
        value (torch.Tensor): Values, shape [ntok, nkvh, dh]
        slot_mapping (torch.Tensor): Slot mapping, shape [ntok]
    """
    block_size = key_cache_pool.shape[2]
    for token_idx, slot_tensor in enumerate(slot_mapping):
        # Flat slot -> (block index, offset within block).
        block_idx, block_off = divmod(slot_tensor.item(), block_size)
        key_cache_pool[block_idx, :, block_off, :] = key[token_idx]
        value_cache_pool[block_idx, :, block_off, :] = value[token_idx]
    return key_cache_pool, value_cache_pool
def parse_test_cases():
    """
    Parse test case data and return list of TestCase objects for paged_caching operation.
    Each test case contains all necessary information for execution and validation.
    """
    test_cases = []
    for num_seqs, max_seq_len, num_kv_heads, head_size, block_size in _TEST_CASES_DATA:
        num_blocks = 4096  # A reasonably large cache pool for testing
        # Variable context lengths for each sequence in the batch.
        context_lens_torch = torch.randint(
            1, max_seq_len + 1, (num_seqs,), dtype=torch.int64
        )
        ntok = torch.sum(context_lens_torch).item()
        # Simulate the scheduler: give each sequence a contiguous run of slots.
        slot_mapping_list = []
        current_slot = 0
        for length in context_lens_torch:
            slot_mapping_list.extend(range(current_slot, current_slot + length.item()))
            current_slot += length.item()
        # Ensure we don't exceed the total number of slots in the cache.
        assert current_slot <= num_blocks * block_size, (
            "Not enough blocks in the cache pool for this test case"
        )
        slot_mapping = torch.tensor(slot_mapping_list, dtype=torch.int64)
        slot_mapping_shape = slot_mapping.shape
        k_shape = (ntok, num_kv_heads, head_size)
        v_shape = (ntok, num_kv_heads, head_size)
        k_cache_shape = (num_blocks, num_kv_heads, block_size, head_size)
        v_cache_shape = (num_blocks, num_kv_heads, block_size, head_size)
        # Generate test cases for all data types
        for dtype in _TENSOR_DTYPES:
            tolerance = _TOLERANCE_MAP.get(dtype, {"atol": 0, "rtol": 1e-3})
            # Create typed tensor specs
            k_spec = TensorSpec.from_tensor(k_shape, None, dtype)
            v_spec = TensorSpec.from_tensor(v_shape, None, dtype)
            # Caches start zeroed so untouched slots remain comparable.
            k_cache_spec = TensorSpec.from_tensor(
                k_cache_shape, None, dtype, init_mode=TensorInitializer.ZEROS
            )
            v_cache_spec = TensorSpec.from_tensor(
                v_cache_shape, None, dtype, init_mode=TensorInitializer.ZEROS
            )
            slot_mapping_spec = TensorSpec.from_tensor(
                slot_mapping_shape,
                init_mode=TensorInitializer.MANUAL,
                set_tensor=slot_mapping,
                dtype=infinicore.int64,
            )
            # In-place operation: modifies k_cache (input 0) and v_cache (input 1)
            test_cases.append(
                TestCase(
                    inputs=[
                        k_cache_spec,
                        v_cache_spec,
                        k_spec,
                        v_spec,
                        slot_mapping_spec,
                    ],
                    kwargs=None,
                    output_spec=None,
                    comparison_target=0,  # Only compare k_cache
                    tolerance=tolerance,
                    description="PagedCaching",
                )
            )
    return test_cases
class OpTest(BaseOperatorTest):
    """PagedCaching operator test with simplified implementation"""

    def __init__(self):
        super().__init__("PagedCaching")

    def get_test_cases(self):
        # Cases use random context lengths; regenerated per call.
        return parse_test_cases()

    def torch_operator(self, *args, **kwargs):
        """PyTorch paged_caching implementation"""
        return ref_paged_caching(*args, **kwargs)

    def infinicore_operator(self, *args, **kwargs):
        """InfiniCore paged_caching implementation"""
        return infinicore.paged_caching(*args, **kwargs)
def main():
    """Main entry point: run the paged-caching suite and exit."""
    GenericTestRunner(OpTest).run_and_exit()


if __name__ == "__main__":
    main()
......@@ -222,8 +222,8 @@ class OpTest(BaseOperatorTest):
# Re-run operations with the same logits to get results for comparison
# prepare_pytorch_inputs_and_kwargs will reuse self._current_logits if it exists
from framework.base import TestResult
from framework.utils import (
from framework.results import CaseResult
from framework.utils.tensor_utils import (
convert_infinicore_to_torch,
infinicore_tensor_from_torch,
)
......@@ -268,8 +268,8 @@ class OpTest(BaseOperatorTest):
# Check if indices are equal (standard case)
if ic_idx == ref_idx:
# Return a successful TestResult object
return TestResult(
# Return a successful CaseResult object
return CaseResult(
success=True,
return_code=0,
test_case=test_case,
......@@ -283,8 +283,8 @@ class OpTest(BaseOperatorTest):
logits_ic = logits_tensor[ic_idx].item()
if logits_ic == logits_ref:
# Valid: different indices but same logits value
# Return a successful TestResult object
return TestResult(
# Return a successful CaseResult object
return CaseResult(
success=True,
return_code=0,
test_case=test_case,
......
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import torch
import infinicore
from framework import (
BaseOperatorTest,
TensorSpec,
TestCase,
GenericTestRunner,
is_broadcast,
)
# ==============================================================================
# Operator-specific configuration
# ==============================================================================
# Test cases format: (input_shape)
# The operator splits the last dimension: Input (..., 2*d) -> Output (..., d)
_TEST_CASES_DATA = [
    (2, 4),  # minimal 2-D case -> output (2, 2)
    (1024, 1024),
    (2, 4, 8),
    (1, 22016),  # typical MLP intermediate width
    (2, 4, 256),
    (2, 4, 16, 256),
]
# Tolerance configuration for different precisions
# (looser bounds for the low-precision float formats)
_TOLERANCE_MAP = {
    infinicore.float16: {"atol": 1e-3, "rtol": 1e-3},
    infinicore.float32: {"atol": 1e-5, "rtol": 1e-5},
    infinicore.bfloat16: {"atol": 5e-3, "rtol": 1e-2},
}
# Every shape above is exercised in each of these dtypes.
_TENSOR_DTYPES = [infinicore.float16, infinicore.bfloat16, infinicore.float32]
def parse_test_cases():
    """
    Build the SiLUAndMul test case list.

    The operator maps an input of shape [..., 2*d] to an output of shape
    [..., d], so in-place execution is impossible and is not exercised.
    For every (shape, dtype) pair two cases are emitted: a functional one
    (operator allocates the result) and one writing into a caller-provided
    output buffer.
    """
    cases = []
    for shape in _TEST_CASES_DATA:
        # Halve the trailing dimension to obtain the SwiGLU output shape.
        out_shape = (*shape[:-1], shape[-1] // 2)
        for dtype in _TENSOR_DTYPES:
            tol = _TOLERANCE_MAP.get(dtype, {"atol": 1e-5, "rtol": 1e-4})
            in_spec = TensorSpec.from_tensor(shape, None, dtype)
            out_spec = TensorSpec.from_tensor(out_shape, None, dtype)
            # Functional style: operator allocates new memory for the output.
            cases.append(
                TestCase(
                    inputs=[in_spec],
                    kwargs={},
                    output_spec=None,
                    comparison_target=None,
                    tolerance=tol,
                    description=f"SiLUAndMul_Functional_{dtype}",
                )
            )
            # Out-parameter style: result written into a pre-allocated buffer.
            cases.append(
                TestCase(
                    inputs=[in_spec],
                    kwargs=None,
                    output_spec=out_spec,
                    comparison_target="out",
                    tolerance=tol,
                    description=f"SiLUAndMul_OutParam_{dtype}",
                )
            )
    return cases
class OpTest(BaseOperatorTest):
    """SiLUAndMul (SwiGLU activation) operator test."""

    def __init__(self):
        super().__init__("SiLUAndMul")

    def get_test_cases(self):
        return parse_test_cases()

    def torch_operator(self, input, out=None, **kwargs):
        """
        PyTorch reference: SiLU(gate) * up, where the last dimension of
        *input* is split evenly into [gate, up].
        """
        half = input.shape[-1] // 2
        gate = input[..., :half]
        up = input[..., half:]
        activated = torch.nn.functional.silu(gate) * up
        if out is None:
            return activated
        out.copy_(activated)
        return out

    def infinicore_operator(self, input, out=None, **kwargs):
        """InfiniCore implementation via infinicore.nn.functional.silu_and_mul."""
        import infinicore.nn.functional as F

        return F.silu_and_mul(input, out=out)
def main():
    """Entry point: run the SiLUAndMul tests and exit with their status."""
    GenericTestRunner(OpTest).run_and_exit()


if __name__ == "__main__":
    main()
......@@ -7,6 +7,7 @@ import torch
import infinicore
from framework import (
BaseOperatorTest,
CaseResult,
TensorSpec,
TestCase,
GenericTestRunner,
......@@ -180,7 +181,7 @@ class OpTest(BaseOperatorTest):
and isinstance(test_case.inputs[0], TensorSpec)
and test_case.inputs[0].strides is not None
):
return TestResult(
return CaseResult(
success=False,
return_code=-2,
test_case=test_case,
......@@ -193,7 +194,7 @@ class OpTest(BaseOperatorTest):
)
for spec in output_specs:
if isinstance(spec, TensorSpec) and spec.strides is not None:
return TestResult(
return CaseResult(
success=False,
return_code=-2,
test_case=test_case,
......
......@@ -7,6 +7,7 @@ import torch
import infinicore
from framework import (
BaseOperatorTest,
CaseResult,
TensorSpec,
TestCase,
GenericTestRunner,
......@@ -122,7 +123,7 @@ class OpTest(BaseOperatorTest):
and isinstance(test_case.inputs[0], TensorSpec)
and test_case.inputs[0].strides is not None
):
return TestResult(
return CaseResult(
success=False,
return_code=-2,
test_case=test_case,
......@@ -135,7 +136,7 @@ class OpTest(BaseOperatorTest):
and isinstance(test_case.output_spec, TensorSpec)
and test_case.output_spec.strides is not None
):
return TestResult(
return CaseResult(
success=False,
return_code=-2,
test_case=test_case,
......
import os
import sys
import argparse
import traceback
import json
import os
from pathlib import Path
import importlib.util
from framework import get_hardware_args_group, add_common_test_args
def find_ops_directory(location=None):
    """
    Locate the operator-test directory.

    Checks a single candidate directory (no upward search): either the
    given *location* or ``<this file's dir>/ops`` by default. The
    directory qualifies if it exists and contains at least one ``.py``
    file.

    Args:
        location: Candidate directory as ``str`` or ``Path``
            (default: ``ops`` next to this file).

    Returns:
        Path: Resolved path to the ops directory, or ``None`` if the
        candidate is missing or holds no Python files.
    """
    if location is None:
        location = Path(__file__).parent / "ops"
    # Accept both str and Path; the previous version crashed on str
    # inputs because plain strings have no .resolve().
    ops_dir = Path(location).resolve()
    if ops_dir.exists() and any(ops_dir.glob("*.py")):
        return ops_dir
    return None
def get_available_operators(ops_dir):
    """
    List operator names discovered in the ops directory.

    A ``.py`` file counts as an operator test when its text mentions
    ``infinicore`` together with ``BaseOperatorTest`` or
    ``GenericTestRunner``. This script itself is excluded.

    Args:
        ops_dir: Path to the ops directory (may be None).

    Returns:
        Sorted list of operator names (file stems); empty when the
        directory is missing or holds no matching files.
    """
    if not ops_dir or not ops_dir.exists():
        return []
    current_script = Path(__file__).name
    test_files = [f for f in ops_dir.glob("*.py") if f.name != current_script]
    operators = []
    for test_file in test_files:
        try:
            content = test_file.read_text(encoding="utf-8")
        # Narrowed from a bare `except:` so real bugs (and KeyboardInterrupt)
        # are no longer silently swallowed; only unreadable files are skipped.
        except (OSError, UnicodeDecodeError):
            continue
        if "infinicore" in content and (
            "BaseOperatorTest" in content or "GenericTestRunner" in content
        ):
            operators.append(test_file.stem)
    return sorted(operators)
def import_operator_test(test_file_path):
    """
    Dynamically import an operator test module and instantiate its test class.

    The test class is recognized as the first module attribute (in dir()
    order) whose direct bases mention ``BaseOperatorTest`` — matched by
    name so no framework import is required here.

    Args:
        test_file_path: Path to the operator test file.

    Returns:
        tuple: (True, test_instance) on success, (False, error_string) otherwise.
    """
    try:
        module_name = f"op_test_{test_file_path.stem}"
        # Build a module object from the file and register it so the test
        # file's own imports resolve normally.
        spec = importlib.util.spec_from_file_location(module_name, test_file_path)
        if spec is None or spec.loader is None:
            return False, f"Could not load module from {test_file_path}"
        module = importlib.util.module_from_spec(spec)
        sys.modules[module_name] = module
        spec.loader.exec_module(module)
        # Scan module attributes for a BaseOperatorTest subclass.
        for attr_name in dir(module):
            candidate = getattr(module, attr_name)
            if not isinstance(candidate, type):
                continue
            if any("BaseOperatorTest" in str(base) for base in candidate.__bases__):
                return True, candidate()
        return False, f"No test class found in {test_file_path}"
    except Exception as e:
        return False, f"Error importing {test_file_path}: {str(e)}"
def run_all_op_tests(
    ops_dir=None,
    specific_ops=None,
    bench=False,
    bench_mode="both",
    verbose=False,
    debug=False,
):
    """
    Run all operator test scripts in the ops directory using direct import.

    Each candidate file is imported in-process (no subprocess), its test
    runner executed with stdout/stderr captured, and the outcome folded
    into a per-operator results dict plus cumulative benchmark totals.

    Args:
        ops_dir (str, optional): Path to the ops directory. If None, uses auto-detection.
        specific_ops (list, optional): List of specific operator names to test.
        bench (bool): Whether benchmarking is enabled
        bench_mode (str): Benchmark mode - "host", "device", or "both"
        verbose (bool): Whether verbose mode is enabled (stops on first failure)
        debug (bool): Print a traceback and stop on the first exception

    Returns:
        tuple: (results, cumulative_timing). `results` maps test name ->
        dict with success flag, return_code, four timing sums, error
        message and captured output. NOTE(review): the two early-error
        paths below return a bare `{}` instead of this 2-tuple — callers
        that unpack two values will raise on those paths; confirm intended.
    """
    if ops_dir is None:
        ops_dir = find_ops_directory()
    else:
        ops_dir = Path(ops_dir)
    if not ops_dir or not ops_dir.exists():
        print(f"Error: Ops directory '{ops_dir}' does not exist.")
        # NOTE(review): bare dict here, (results, timing) tuple at the end.
        return {}
    print(f"Looking for test files in: {ops_dir}")
    # Find all Python test files
    test_files = list(ops_dir.glob("*.py"))
    # Filter out this script itself and non-operator test files
    current_script = Path(__file__).name
    test_files = [f for f in test_files if f.name != current_script]
    # Filter to include only files that look like operator tests
    operator_test_files = []
    for test_file in test_files:
        try:
            with open(test_file, "r", encoding="utf-8") as f:
                content = f.read()
                # Look for characteristic patterns of operator tests
                if "infinicore" in content and (
                    "BaseOperatorTest" in content or "GenericTestRunner" in content
                ):
                    operator_test_files.append(test_file)
        except Exception as e:
            # Unreadable files are silently skipped during discovery.
            continue
    # Filter for specific operators if requested
    if specific_ops:
        filtered_files = []
        for test_file in operator_test_files:
            test_name = test_file.stem.lower()
            # Exact (case-insensitive) stem match only — no substring matching.
            if any(op.lower() == test_name for op in specific_ops):
                filtered_files.append(test_file)
        operator_test_files = filtered_files
    if not operator_test_files:
        print(f"No operator test files found in {ops_dir}")
        print(f"Available Python files: {[f.name for f in test_files]}")
        # NOTE(review): bare dict here as well (see docstring).
        return {}
    print(f"Found {len(operator_test_files)} operator test files:")
    for test_file in operator_test_files:
        print(f"  - {test_file.name}")
    results = {}
    # Benchmark totals accumulated across all successfully benchmarked operators.
    cumulative_timing = {
        "total_torch_host_time": 0.0,
        "total_torch_device_time": 0.0,
        "total_infinicore_host_time": 0.0,
        "total_infinicore_device_time": 0.0,
        "operators_tested": 0,
    }
    for test_file in operator_test_files:
        test_name = test_file.stem
        try:
            # Import and run the test directly
            success, test_instance_or_error = import_operator_test(test_file)
            if not success:
                print(f"💥 {test_name}: ERROR - {test_instance_or_error}")
                results[test_name] = {
                    "success": False,
                    "return_code": -1,
                    "torch_host_time": 0.0,
                    "torch_device_time": 0.0,
                    "infini_host_time": 0.0,
                    "infini_device_time": 0.0,
                    "error_message": test_instance_or_error,
                    "test_runner": None,
                    "stdout": "",
                    "stderr": test_instance_or_error,
                }
                continue
            # Get the test runner class from the module
            test_module = sys.modules[f"op_test_{test_file.stem}"]
            if not hasattr(test_module, "GenericTestRunner"):
                print(f"💥 {test_name}: ERROR - No GenericTestRunner found")
                results[test_name] = {
                    "success": False,
                    "return_code": -1,
                    "torch_host_time": 0.0,
                    "torch_device_time": 0.0,
                    "infini_host_time": 0.0,
                    "infini_device_time": 0.0,
                    "error_message": "No GenericTestRunner found",
                    "test_runner": None,
                    "stdout": "",
                    "stderr": "No GenericTestRunner found",
                }
                continue
            # Create and run the test runner
            test_runner_class = test_module.GenericTestRunner
            runner_instance = test_runner_class(test_instance_or_error.__class__)
            # Temporarily redirect stdout to capture output
            from io import StringIO
            stdout_capture = StringIO()
            stderr_capture = StringIO()
            old_stdout = sys.stdout
            old_stderr = sys.stderr
            sys.stdout = stdout_capture
            sys.stderr = stderr_capture
            try:
                # Run the test
                test_success, test_runner = runner_instance.run()
                # Get captured output
                stdout_output = stdout_capture.getvalue()
                stderr_output = stderr_capture.getvalue()
                # Restore stdout/stderr
                sys.stdout = old_stdout
                sys.stderr = old_stderr
                # Print the captured output
                if stdout_output:
                    print(stdout_output.rstrip())
                if stderr_output:
                    print("\nSTDERR:")
                    print(stderr_output.rstrip())
                # Analyze test results
                test_results = test_runner.get_test_results() if test_runner else []
                # Determine overall test status
                # Return-code convention: 0 passed, -1 failed, -2 skipped, -3 partial.
                if test_success:
                    return_code = 0
                    status_icon = "✅"
                    status_text = "PASSED"
                else:
                    # Check if there are any failed tests
                    has_failures = any(
                        result.return_code == -1 for result in test_results
                    )
                    has_partial = any(
                        result.return_code == -3 for result in test_results
                    )
                    has_skipped = any(
                        result.return_code == -2 for result in test_results
                    )
                    if has_failures:
                        return_code = -1
                        status_icon = "❌"
                        status_text = "FAILED"
                    elif has_partial:
                        return_code = -3
                        status_icon = "⚠️"
                        status_text = "PARTIAL"
                    elif has_skipped:
                        return_code = -2
                        status_icon = "⏭️"
                        status_text = "SKIPPED"
                    else:
                        # Unsuccessful run but no per-case codes: treat as failed.
                        return_code = -1
                        status_icon = "❌"
                        status_text = "FAILED"
                # Calculate timing for all four metrics
                torch_host_time = sum(result.torch_host_time for result in test_results)
                torch_device_time = sum(
                    result.torch_device_time for result in test_results
                )
                infini_host_time = sum(
                    result.infini_host_time for result in test_results
                )
                infini_device_time = sum(
                    result.infini_device_time for result in test_results
                )
                results[test_name] = {
                    "success": test_success,
                    "return_code": return_code,
                    "torch_host_time": torch_host_time,
                    "torch_device_time": torch_device_time,
                    "infini_host_time": infini_host_time,
                    "infini_device_time": infini_device_time,
                    "error_message": "",
                    "test_runner": test_runner,
                    "stdout": stdout_output,
                    "stderr": stderr_output,
                }
                print(
                    f"{status_icon} {test_name}: {status_text} (return code: {return_code})"
                )
                # Extract benchmark timing if in bench mode
                if bench and test_success and return_code == 0:
                    cumulative_timing["total_torch_host_time"] += torch_host_time
                    cumulative_timing["total_torch_device_time"] += torch_device_time
                    cumulative_timing["total_infinicore_host_time"] += infini_host_time
                    cumulative_timing[
                        "total_infinicore_device_time"
                    ] += infini_device_time
                    cumulative_timing["operators_tested"] += 1
            except Exception as e:
                # Restore stdout/stderr in case of exception
                sys.stdout = old_stdout
                sys.stderr = old_stderr
                # Re-raised so the outer handler records the failure below.
                raise e
            # In verbose mode, stop execution on first failure
            if verbose and not test_success and return_code != 0:
                break
        except Exception as e:
            print(f"💥 {test_name}: ERROR - {str(e)}")
            results[test_name] = {
                "success": False,
                "return_code": -1,
                "torch_host_time": 0.0,
                "torch_device_time": 0.0,
                "infini_host_time": 0.0,
                "infini_device_time": 0.0,
                "error_message": str(e),
                "test_runner": None,
                "stdout": "",
                "stderr": str(e),
            }
            # In verbose mode, stop execution on any exception
            if verbose:
                print(f"\n{'!'*60}")
                print(
                    f"VERBOSE MODE: Stopping execution due to exception in {test_name}"
                )
                print(f"{'!'*60}")
                break
            if debug:
                traceback.print_exc()
                break
    return results, cumulative_timing
def print_summary(
    results,
    verbose=False,
    total_expected_tests=0,
    cumulative_timing=None,
    bench_mode="both",
):
    """Print a comprehensive summary of test results including benchmark data.

    Args:
        results: Mapping of test name -> result dict containing a
            "return_code" entry (0 passed, -1 failed, -2 skipped, -3 partial).
        verbose: Whether verbose mode was enabled (affects hint output).
        total_expected_tests: Number of tests that were expected to run.
        cumulative_timing: Optional dict of accumulated benchmark totals.
        bench_mode: Which timings to display: "host", "device", or "both".

    Returns:
        bool: True when no test failed (skipped/partial still count as
        overall success), False when any test failed or nothing ran.
    """
    print(f"\n{'='*80}")
    print("CUMULATIVE TEST SUMMARY")
    print(f"{'='*80}")
    if not results:
        print("No tests were run.")
        return False
    # Bucket test names by outcome code.
    passed_operators = []
    failed_operators = []
    skipped_operators = []
    partial_operators = []
    for test_name, result_data in results.items():
        return_code = result_data["return_code"]
        if return_code == 0:
            passed_operators.append(test_name)
        elif return_code == -2:  # Special code for skipped tests
            skipped_operators.append(test_name)
        elif return_code == -3:  # Special code for partial tests
            partial_operators.append(test_name)
        else:
            failed_operators.append(test_name)
    passed = len(passed_operators)
    failed = len(failed_operators)
    skipped = len(skipped_operators)
    partial = len(partial_operators)
    total = len(results)
    print(f"Total tests run: {total}")
    if total_expected_tests > 0 and total < total_expected_tests:
        print(f"Total tests expected: {total_expected_tests}")
        print(f"Tests not executed: {total_expected_tests - total}")
    print(f"Passed: {passed}")
    print(f"Failed: {failed}")
    if skipped > 0:
        print(f"Skipped: {skipped}")
    if partial > 0:
        print(f"Partial: {partial}")
    # Print benchmark summary if cumulative_timing data is available
    if cumulative_timing and cumulative_timing["operators_tested"] > 0:
        print(f"{'-'*40}")
        print("BENCHMARK SUMMARY:")
        print(f"  Operators Tested: {cumulative_timing['operators_tested']}")
        # Display timing based on bench_mode
        if bench_mode in ["host", "both"]:
            print(
                f"  PyTorch Host Total Time: {cumulative_timing['total_torch_host_time']:12.3f} ms"
            )
            print(
                f"  InfiniCore Host Total Time: {cumulative_timing['total_infinicore_host_time']:12.3f} ms"
            )
        if bench_mode in ["device", "both"]:
            print(
                f"  PyTorch Device Total Time: {cumulative_timing['total_torch_device_time']:12.3f} ms"
            )
            print(
                f"  InfiniCore Device Total Time: {cumulative_timing['total_infinicore_device_time']:12.3f} ms"
            )
        print(f"{'-'*40}")
    def _print_group(icon, label, names):
        # Helper: print one outcome group, ten operator names per line.
        print(f"\n{icon} {label} ({len(names)}):")
        for i in range(0, len(names), 10):
            print("  " + ", ".join(names[i : i + 10]))
    # Passed list is always shown (with an explicit "None" placeholder).
    if passed_operators:
        _print_group("✅", "PASSED OPERATORS", passed_operators)
    else:
        print("\n✅ PASSED OPERATORS: None")
    if failed_operators:
        _print_group("❌", "FAILED OPERATORS", failed_operators)
    if skipped_operators:
        _print_group("⏭️", "SKIPPED OPERATORS", skipped_operators)
    if partial_operators:
        _print_group("⚠️", "PARTIAL OPERATORS", partial_operators)
    if total > 0:
        # Success rate counts only executed tests (skipped tests excluded).
        executed_tests = passed + failed + partial
        if executed_tests > 0:
            success_rate = passed / executed_tests * 100
            print(f"\nSuccess rate: {success_rate:.1f}%")
        if verbose and total < total_expected_tests:
            print("\n💡 Verbose mode: Execution stopped after first failure")
            print(f"   {total_expected_tests - total} tests were not executed")
    if failed == 0:
        if skipped > 0 or partial > 0:
            print("\n⚠️  Tests completed with some operators not implemented")
            print(f"   - {skipped} tests skipped (both operators not implemented)")
            print(f"   - {partial} tests partial (one operator not implemented)")
            # BUG FIX: this branch previously fell through and implicitly
            # returned None (falsy), so the caller's "not fully implemented"
            # note could never trigger. No failures means overall success.
            return True
        print("\n🎉 All tests passed!")
        return True
    print(f"\n{failed} tests failed")
    return False
def list_available_tests(ops_dir=None):
    """Print every discoverable operator test, or diagnostics when none exist."""
    ops_dir = find_ops_directory() if ops_dir is None else Path(ops_dir)
    if not ops_dir or not ops_dir.exists():
        print(f"Error: Ops directory '{ops_dir}' does not exist.")
        return
    operators = get_available_operators(ops_dir)
    if not operators:
        print(f"No operator test files found in {ops_dir}")
        # Help debugging: show Python files that exist but were not recognized.
        current_script = Path(__file__).name
        other_files = [f for f in ops_dir.glob("*.py") if f.name != current_script]
        if other_files:
            print(f"Available Python files: {[f.name for f in other_files]}")
        return
    print(f"Available operator test files in {ops_dir}:")
    for operator in operators:
        print(f"  - {operator}")
    print(f"\nTotal: {len(operators)} operators")
from framework import (
get_hardware_args_group,
add_common_test_args,
InfiniDeviceEnum,
InfiniDeviceNames,
)
from framework.test_manager import TestCollector, TestManager
def generate_help_epilog(ops_dir):
def generate_help_epilog(ops_dir=None):
"""
Generate dynamic help epilog with available operators and hardware platforms.
Args:
ops_dir: Path to ops directory
Returns:
str: Formatted help text
Generate dynamic help epilog containing available operators and hardware platforms.
Maintains the original output format for backward compatibility.
"""
# Get available operators
operators = get_available_operators(ops_dir)
# === Adapter: Use TestCollector to get operator list ===
# Temporarily instantiate a Collector just to fetch the list
collector = TestCollector(ops_dir)
operators = collector.get_available_operators()
# Build epilog text
# Build epilog text (fully replicating original logic)
epilog_parts = []
# Examples section
......@@ -627,18 +89,142 @@ def generate_help_epilog(ops_dir):
return "\n".join(epilog_parts)
def main():
"""Main entry point with comprehensive command line argument parsing."""
# First, find ops directory for dynamic help generation
ops_dir = find_ops_directory()
def fill_defaults_for_local_mode(args):
    """
    Fill in default argument values for Local Scan mode.

    Parser defaults are intentionally None (so load mode can detect which
    options the user actually passed); local mode needs the real defaults.
    They are applied to a copy so the caller's namespace stays untouched.
    """
    # Shallow copy: Namespace -> dict -> fresh Namespace.
    local_args = argparse.Namespace(**vars(args))
    # Numeric options and their local-mode defaults.
    for option, default in (("num_prerun", 10), ("num_iterations", 1000)):
        if getattr(local_args, option) is None:
            setattr(local_args, option, default)
    return local_args
def load_and_override_cases(load_paths, args):
    """
    Load JSON, apply CLI overrides, and handle all argument logic.

    Args:
        load_paths: Paths (files or directories) passed via --load; for a
            directory every ``*.json`` inside it is read.
        args: Parsed CLI namespace whose values override the JSON ones.

    Returns:
        list: Test-case dicts, each with a normalized ``case["args"]``
        sub-dict (device flags, bench/verbose/debug/eq_nan, prerun and
        iteration counts) ready for TestManager.
    """
    cases = []
    files_to_read = []
    # 1. Scan
    for p_str in load_paths:
        p = Path(p_str)
        if p.is_dir():
            files_to_read.extend(p.glob("*.json"))
        elif p.is_file():
            files_to_read.append(p)
    # 2. Read and Validate
    loaded_count = 0
    skipped_count = 0
    for f_path in files_to_read:
        try:
            with open(f_path, "r", encoding="utf-8") as f:
                data = json.load(f)
            # Unify as a list to handle both single dict and list of dicts
            current_batch = data if isinstance(data, list) else [data]
            valid_batch = []
            for item in current_batch:
                # We only require the 'operator' field to identify the test case.
                if isinstance(item, dict) and "operator" in item:
                    valid_batch.append(item)
                else:
                    skipped_count += 1
            if valid_batch:
                cases.extend(valid_batch)
                loaded_count += 1
        except Exception as e:
            # Log warning only; do not crash the program on bad files to ensure flow continuity.
            print(f"❌ Error loading {f_path.name}: {e}")
    if skipped_count > 0:
        print(f"ℹ️ Ignored {skipped_count} items/files (invalid format).")
    # ==================================================
    # Device Logic using InfiniDeviceEnum
    # ==================================================
    # 1. Identify active devices from CLI arguments
    cli_active_devices = []
    # Iterate through the Enum to check corresponding CLI args
    # Logic: Enum name (e.g., CAMBRICON) -> lower() -> arg name (cambricon)
    # Value: InfiniDeviceNames mapping (e.g., "Cambricon")
    # NOTE(review): assumes InfiniDeviceNames is a dict-like mapping of
    # device enum -> display name — confirm against framework definition.
    for device_enum, device_name in InfiniDeviceNames.items():
        # device_name is like "CPU", "NVIDIA", "Cambricon"
        # arg_name becomes "cpu", "nvidia", "cambricon"
        arg_name = device_name.lower()
        if getattr(args, arg_name, False):
            cli_active_devices.append(device_name)
    print(f"\n[Config Processing]")
    for case in cases:
        if "args" not in case or case["args"] is None:
            case["args"] = {}
        case_args = case["args"]
        # 2. Apply Device Overrides (CLI > JSON)
        if cli_active_devices:
            case["device"] = ",".join(cli_active_devices)
        final_dev_str = case.get("device", "").upper()  # Uppercase for easier matching
        # 3. Set Boolean flags in case_args based on final device string
        for device_enum, device_name in InfiniDeviceNames.items():
            arg_name = device_name.lower()
            # Check if the standard name (e.g., "Cambricon" or "NVIDIA") is in the device string
            # We convert both to upper to ensure case-insensitive matching
            is_active = device_name.upper() in final_dev_str
            case_args[arg_name] = is_active
        # Save path always comes from the CLI (JSON value, if any, is ignored).
        case_args["save"] = getattr(args, "save", None)
        # Standard arguments (CLI > JSON > Default)
        case_args["bench"] = (
            args.bench if args.bench is not None else case_args.get("bench")
        )
        # Boolean Flags
        case_args["verbose"] = args.verbose or case_args.get("verbose", False)
        case_args["debug"] = args.debug or case_args.get("debug", False)
        case_args["eq_nan"] = args.eq_nan or case_args.get("eq_nan", False)
        # Numeric options: CLI wins; a JSON value of 0/None falls back to default.
        case_args["num_prerun"] = (
            args.num_prerun
            if args.num_prerun is not None
            else (case_args.get("num_prerun") or 10)
        )
        case_args["num_iterations"] = (
            args.num_iterations
            if args.num_iterations is not None
            else (case_args.get("num_iterations") or 1000)
        )
    print(f"📂 Processed {len(cases)} cases ready for execution.\n")
    return cases
def main():
"""Main entry point for the InfiniCore Operator Test Runner."""
parser = argparse.ArgumentParser(
description="Run InfiniCore operator tests across multiple hardware platforms",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=generate_help_epilog(ops_dir),
epilog=generate_help_epilog(),
)
# Core options
parser.add_argument(
"--ops-dir", type=str, help="Path to the ops directory (default: auto-detect)"
)
......@@ -650,119 +236,106 @@ def main():
action="store_true",
help="List all available test files without running them",
)
# Call common method to add shared arguments (bench, debug, verbose, save...)
add_common_test_args(parser)
parser.add_argument(
"--load",
nargs="+",
help="Load test cases from JSON",
)
# Default value is None to determine if user provided input
parser.add_argument("--num_prerun", type=lambda x: max(0, int(x)), default=None)
parser.add_argument("--num_iterations", type=lambda x: max(0, int(x)), default=None)
# Add common test arguments (including --save, --bench, etc.)
add_common_test_args(parser)
get_hardware_args_group(parser)
# Parse known args first, leave the rest for the test scripts
args, unknown_args = parser.parse_known_args()
# Show what extra arguments will be passed
if unknown_args:
print(f"Passing extra arguments to test scripts: {unknown_args}")
# Handle list command
# 1. Discovery
collector = TestCollector(args.ops_dir)
if args.list:
list_available_tests(args.ops_dir)
print("Available operators:", collector.get_available_operators())
return
# Auto-detect ops directory if not provided
if args.ops_dir is None:
ops_dir = find_ops_directory()
if not ops_dir:
print(
"Error: Could not auto-detect ops directory. Please specify with --ops-dir"
)
sys.exit(1)
else:
ops_dir = Path(args.ops_dir)
if not ops_dir.exists():
print(f"Error: Ops directory '{ops_dir}' does not exist.")
# ==========================================================================
# Branch 1: Load Mode (JSON Data Driven)
# ==========================================================================
if args.load:
# 1. Load and override arguments
json_cases = load_and_override_cases(args.load, args)
if not json_cases:
sys.exit(1)
# Show what extra arguments will be passed
if unknown_args:
print(f"Passing extra arguments to test scripts: {unknown_args}")
# 2. Determine global Bench status (for Summary display)
bench = json_cases[0]["args"].get("bench")
verbose = json_cases[0]["args"].get("verbose")
# Get available operators for display
available_operators = get_available_operators(ops_dir)
if verbose:
print(
f"Verbose mode: ENABLED (will stop on first error with full traceback)"
)
print(f"InfiniCore Operator Test Runner")
print(f"Operating directory: {ops_dir}")
print(f"Available operators: {len(available_operators)}")
if bench:
print(f"Benchmark mode: {args.bench.upper()} timing")
if args.verbose:
print(f"Verbose mode: ENABLED (will stop on first error with full traceback)")
# 3. Initialize and Execute
test_manager = TestManager(
ops_dir=args.ops_dir, verbose=verbose, bench_mode=bench
)
if args.bench:
bench_mode = args.bench if args.bench != "both" else "both"
print(f"Benchmark mode: {bench_mode.upper()} timing")
success, _ = test_manager.test(json_cases_list=json_cases)
if args.ops:
# Validate requested operators
valid_ops = []
invalid_ops = []
for op in args.ops:
if op in available_operators:
valid_ops.append(op)
else:
invalid_ops.append(op)
if invalid_ops:
print(f"Warning: Unknown operators: {', '.join(invalid_ops)}")
print(f"Available operators: {', '.join(available_operators)}")
if valid_ops:
print(f"Testing operators: {', '.join(valid_ops)}")
total_expected_tests = len(valid_ops)
else:
print("No valid operators specified. Running all available tests.")
total_expected_tests = len(available_operators)
# ==========================================================================
# Branch 2: Local Scan Mode
# ==========================================================================
else:
print("Testing all available operators")
total_expected_tests = len(available_operators)
print()
# Run all tests
results, cumulative_timing = run_all_op_tests(
ops_dir=ops_dir,
specific_ops=args.ops,
bench=bool(args.bench),
bench_mode=args.bench if args.bench else "both",
verbose=args.verbose,
debug=args.debug,
)
if args.verbose:
print(
f"Verbose mode: ENABLED (will stop on first error with full traceback)"
)
# Print summary and exit with appropriate code
all_passed = print_summary(
results,
args.verbose,
total_expected_tests,
cumulative_timing,
bench_mode=args.bench if args.bench else "both",
)
if args.bench:
print(f"Benchmark mode: {args.bench.upper()} timing")
# 2. Filtering
target_ops = None
if args.ops:
available_ops = set(collector.get_available_operators())
requested_ops = set(args.ops)
valid_ops = list(requested_ops & available_ops)
invalid_ops = list(requested_ops - available_ops)
if invalid_ops:
print(f"⚠️ Warning: The following requested operators were not found:")
print(f" {', '.join(invalid_ops)}")
print(f" (Use --list to see available operators)")
if not valid_ops:
# Case A: User input provided, but ALL were invalid.
print(f"⚠️ No valid operators remained from your list.")
print(f"🔄 Fallback: Proceeding to run ALL available tests...")
else:
# Case B: At least some valid operators found.
print(f"🎯 Targeted operators: {', '.join(valid_ops)}")
target_ops = valid_ops
# Check if there were any tests with missing implementations
has_missing_implementations = any(
result_data["return_code"] in [-2, -3] for result_data in results.values()
)
# 3. Execution Preparation
# Fill defaults for local mode (since parser default is None)
global_exec_args = fill_defaults_for_local_mode(args)
if all_passed and has_missing_implementations:
print(f"\n⚠️ Note: Some operators are not fully implemented")
print(f" Run individual tests for details on missing implementations")
# 4. Initialize API & Execute
test_manager = TestManager(
ops_dir=args.ops_dir, verbose=args.verbose, bench_mode=args.bench
)
if args.verbose and not all_passed:
print(
f"\n💡 Verbose mode tip: Use individual test commands for detailed debugging:"
success, _ = test_manager.test(
target_ops=target_ops, global_exec_args=global_exec_args
)
failed_ops = [
name
for name, result_data in results.items()
if result_data["return_code"] == -1
]
for op in failed_ops[:3]: # Show first 3 failed operators
print(f" python {ops_dir / (op + '.py')} --verbose")
sys.exit(0 if all_passed else 1)
sys.exit(0 if success else 1)
if __name__ == "__main__":
......
......@@ -5,9 +5,15 @@ sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import torch
import infinicore
from framework.base import BaseOperatorTest, TensorSpec, TestCase
from framework.runner import GenericTestRunner
from framework.utils import is_broadcast
from framework import (
BaseOperatorTest,
GenericTestRunner,
is_broadcast,
TensorSpec,
TestCase
)
# ==============================================================================
# Operator-specific configuration
......
......@@ -5,9 +5,15 @@ sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import torch
import infinicore
from framework.base import BaseOperatorTest, TensorSpec, TestCase
from framework.runner import GenericTestRunner
from framework.utils import is_broadcast
from framework import (
BaseOperatorTest,
GenericTestRunner,
TensorSpec,
TestCase,
is_broadcast
)
# ==============================================================================
# Operator-specific configuration
......
......@@ -5,9 +5,15 @@ sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import torch
import infinicore
from framework.base import BaseOperatorTest, TensorSpec, TestCase
from framework.runner import GenericTestRunner
from framework.utils import is_broadcast
from framework import (
BaseOperatorTest,
GenericTestRunner,
TensorSpec,
TestCase,
is_broadcast
)
# ==============================================================================
# Operator-specific configuration
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment