Unverified Commit 8d09630a authored by gongchensu's avatar gongchensu Committed by GitHub
Browse files

Merge branch 'demo131' into Issue/862

parents ab52dead 012df56c
......@@ -4,10 +4,15 @@ import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import torch
from framework.base import BaseOperatorTest, TensorSpec, TestCase
from framework.runner import GenericTestRunner
from framework.tensor import TensorInitializer
from framework.utils import convert_infinicore_to_torch
from framework import (
BaseOperatorTest,
GenericTestRunner,
TensorInitializer,
TensorSpec,
TestCase,
convert_infinicore_to_torch,
)
import infinicore
......@@ -109,14 +114,9 @@ class OpTest(BaseOperatorTest):
def infinicore_operator(self, x, weight):
"""InfiniCore nn.Embedding implementation"""
if x.device.type != "cpu":
# 将 input的数据 转移到 cpu 上
x_torch = convert_infinicore_to_torch(x)
x_torch_cpu = x_torch.contiguous().cpu()
x = infinicore.from_torch(x_torch_cpu)
# Note: embedding now supports device-side input for graph recording
# No need to convert to CPU anymore - the implementation handles both CPU and device inputs
num_embeddings, embedding_dim = weight.shape
model = infinicore.nn.Embedding(
......
......@@ -4,8 +4,13 @@ import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import torch
from framework.base import BaseOperatorTest, TensorSpec, TestCase
from framework.runner import GenericTestRunner
from framework import (
BaseOperatorTest,
GenericTestRunner,
TensorSpec,
TestCase
)
import infinicore
......
......@@ -4,8 +4,13 @@ import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import torch
from framework.base import BaseOperatorTest, TensorSpec, TestCase
from framework.runner import GenericTestRunner
from framework import (
BaseOperatorTest,
GenericTestRunner,
TensorSpec,
TestCase
)
import infinicore
......
......@@ -4,8 +4,13 @@ import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import torch
from framework.base import BaseOperatorTest, TensorSpec, TestCase
from framework.runner import GenericTestRunner
from framework import (
BaseOperatorTest,
GenericTestRunner,
TensorSpec,
TestCase
)
from infinicore.nn.functional import RopeAlgo
import infinicore
......
......@@ -7,6 +7,7 @@ import torch
import infinicore
from framework import (
BaseOperatorTest,
CaseResult,
TensorSpec,
TestCase,
GenericTestRunner,
......@@ -76,7 +77,7 @@ class OpTest(BaseOperatorTest):
and isinstance(test_case.inputs[0], TensorSpec)
and test_case.inputs[0].strides is not None
):
return TestResult(
return CaseResult(
success=False,
return_code=-2,
test_case=test_case,
......
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import torch
import infinicore
from framework import (
BaseOperatorTest,
TensorSpec,
TestCase,
GenericTestRunner,
is_broadcast,
)
# ==============================================================================
# Operator-specific configuration
# ==============================================================================
# Test cases format: (y_shape, a_shape, b_shape, w_shape, y_strides, a_strides, b_strides)
_TEST_CASES_DATA = [
# Basic cases
((1, 4), (1, 4), (1, 4), (4,), None, None, None),
((2, 4), (2, 4), (2, 4), (4,), None, None, None),
((2, 2, 4), (2, 2, 4), (2, 2, 4), (4,), None, None, None),
# Strided cases
((2, 2, 4), (2, 2, 4), (2, 2, 4), (4,), (12, 8, 1), (12, 8, 1), (12, 8, 1)),
# Large tensors
((16, 2048), (16, 2048), (16, 2048), (2048,), None, None, None),
((16, 2048), (16, 2048), (16, 2048), (2048,), (4096, 1), (4096, 1), (4096, 1)),
((15, 3584), (15, 3584), (15, 3584), (3584,), None, None, None),
((4, 4, 2048), (4, 4, 2048), (4, 4, 2048), (2048,), None, None, None),
(
(4, 4, 2048),
(4, 4, 2048),
(4, 4, 2048),
(2048,),
(2048, 8192, 1),
(2048, 8192, 1),
(2048, 8192, 1),
),
(
(4, 4, 2048),
(4, 4, 2048),
(4, 4, 2048),
(2048,),
(16384, 4096, 1),
(16384, 4096, 1),
(16384, 4096, 1),
),
]
# Tolerance configuration
_TOLERANCE_MAP = {
infinicore.float16: {"atol": 2e-3, "rtol": 2e-3},
infinicore.bfloat16: {"atol": 1e-2, "rtol": 1e-2},
infinicore.float32: {"atol": 1e-5, "rtol": 1e-4},
}
# Data types for individual tensors
_INPUT_DTYPES = [infinicore.float16, infinicore.bfloat16]
_WEIGHT_DTYPES = [infinicore.float16, infinicore.bfloat16, infinicore.float32]
# EPSILON constant for AddRMSNorm
_EPSILON = 1e-5
def parse_test_cases():
    """
    Parse AddRMSNorm test case data and return list of TestCase objects.

    Format: (y_shape, a_shape, b_shape, w_shape, y_strides, a_strides, b_strides)
    """
    test_cases = []
    for data in _TEST_CASES_DATA:
        y_shape = data[0]  # Output shape
        a_shape = data[1]  # First input shape
        b_shape = data[2]  # Second input shape
        w_shape = data[3]  # Weight shape (1D)
        y_strides = data[4] if len(data) > 4 else None
        a_strides = data[5] if len(data) > 5 else None
        b_strides = data[6] if len(data) > 6 else None
        # In-place support flags are only consumed by the disabled INPLACE
        # case below; kept so it can be re-enabled without re-deriving them.
        a_supports_inplace = not is_broadcast(a_strides)
        b_supports_inplace = not is_broadcast(b_strides)
        y_supports_inplace = not is_broadcast(y_strides)
        # Generate test cases for all dtype combinations
        for input_dtype in _INPUT_DTYPES:
            for weight_dtype in _WEIGHT_DTYPES:
                # Use input dtype tolerance for output
                tolerance = _TOLERANCE_MAP.get(
                    input_dtype, {"atol": 1e-5, "rtol": 1e-4}
                )
                # Create typed tensor specs
                a_spec = TensorSpec.from_tensor(a_shape, a_strides, input_dtype)
                b_spec = TensorSpec.from_tensor(b_shape, b_strides, input_dtype)
                w_spec = TensorSpec.from_tensor(
                    w_shape, None, weight_dtype
                )  # Weight is always contiguous
                # y_spec is only needed by the disabled INPLACE case below.
                y_spec = TensorSpec.from_tensor(y_shape, y_strides, input_dtype)
                # Test Case 1: Out-of-place (return value) - returns (normalized_result, add_result)
                test_cases.append(
                    TestCase(
                        inputs=[a_spec, b_spec, w_spec],
                        kwargs={"epsilon": _EPSILON},
                        output_specs=None,  # Two outputs
                        comparison_target=None,
                        tolerance=tolerance,
                        output_count=2,  # Two outputs: normalized_result and add_result
                        description="AddRMSNorm - OUT_OF_PLACE",
                    )
                )
                # Test Case 2 (disabled): in-place with explicit output tensors
                # (add_rms_norm_(y, residual_out, a, b, w))
                # if y_supports_inplace:
                #     residual_out_spec = TensorSpec.from_tensor(
                #         a_shape, a_strides, input_dtype
                #     )
                #     test_cases.append(
                #         TestCase(
                #             inputs=[a_spec, b_spec, w_spec],
                #             kwargs={
                #                 "epsilon": _EPSILON,
                #                 "out": y_spec,
                #                 "residual": residual_out_spec,
                #             },
                #             output_specs=[y_spec, residual_out_spec],  # Two outputs
                #             comparison_target="out",
                #             tolerance=tolerance,
                #             output_count=2,
                #             description=f"AddRMSNorm - INPLACE(out)",
                #         )
                #     )
    return test_cases
class OpTest(BaseOperatorTest):
    """AddRMSNorm operator test with simplified implementation"""

    def __init__(self):
        super().__init__("AddRMSNorm")

    def get_test_cases(self):
        return parse_test_cases()

    def torch_operator(
        self, a, b, weight, epsilon=_EPSILON, out=None, residual=None, **kwargs
    ):
        """PyTorch reference - returns (normalized_result, add_result)."""
        original_dtype = a.dtype
        # Residual add and normalization run in fp32 for accuracy.
        added = a.to(torch.float32) + b.to(torch.float32)
        w32 = weight.to(torch.float32)
        # RMSNorm over the last dimension: x * rsqrt(mean(x^2) + eps) * w
        mean_sq = added.pow(2).mean(-1, keepdim=True)
        normed = added * torch.rsqrt(mean_sq + epsilon) * w32
        # Convert back to the input dtype before returning/copying out.
        normed = normed.to(original_dtype)
        added_out = added.to(original_dtype)
        # Optionally write into caller-provided buffers.
        if out is not None:
            out.copy_(normed)
        if residual is not None:
            residual.copy_(added_out)
        return (normed, added_out)

    def infinicore_operator(
        self, a, b, weight, epsilon=_EPSILON, out=None, residual=None, **kwargs
    ):
        """InfiniCore AddRMSNorm - returns (normalized_result, add_result)."""
        return infinicore.add_rms_norm(
            a, b, weight, epsilon, out=out, residual=residual
        )
def main():
    """Main entry point: run the AddRMSNorm suite and exit with its status."""
    GenericTestRunner(OpTest).run_and_exit()


if __name__ == "__main__":
    main()
......@@ -6,7 +6,7 @@ sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import torch
from framework import BaseOperatorTest, TensorSpec, TestCase, GenericTestRunner
from framework.tensor import TensorInitializer
from framework.utils import (
from framework.utils.tensor_utils import (
convert_infinicore_to_torch,
infinicore_tensor_from_torch,
to_torch_dtype,
......@@ -102,23 +102,9 @@ class OpTest(BaseOperatorTest):
def infinicore_operator(self, input, weight, out=None, **kwargs):
"""InfiniCore Embedding implementation"""
if input.device.type == "cpu":
input_cpu = input
else:
# 将 input的数据 转移到 cpu 上
torch_reference = torch.zeros(
input.shape,
dtype=to_torch_dtype(input.dtype),
device="cpu" if "cpu" == input.device.type else "cuda",
)
torch_reference = convert_infinicore_to_torch(input)
torch_reference = torch_reference.contiguous().cpu()
# 创建cpu的 input
input_cpu = infinicore_tensor_from_torch(torch_reference)
return infinicore.nn.functional.embedding(input_cpu, weight, out=out)
# Note: embedding now supports device-side input for graph recording
# No need to convert to CPU anymore - the implementation handles both CPU and device inputs
return infinicore.nn.functional.embedding(input, weight, out=out)
def main():
......
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import torch
import infinicore
from framework import (
BaseOperatorTest,
TensorSpec,
TensorInitializer,
TestCase,
GenericTestRunner,
)
# Test cases format: (q_shape, k_shape, v_shape, attn_mask_or_None, dropout_p, is_causal)
# q/k/v typically have shape (..., seq_len, head_dim) or (batch, seq_len, num_heads, head_dim)
_TEST_CASES_DATA = [
((1, 1, 2, 16), (1, 1, 8, 16), (1, 1, 8, 16), None, 0.0, False),
((1, 2, 128, 16), (1, 2, 256, 16), (1, 2, 256, 16), None, 0.0, False),
((1, 1, 4, 32), (1, 1, 32, 32), (1, 1, 32, 32), None, 0.0, True),
((1, 8, 256, 16), (1, 8, 512, 16), (1, 8, 512, 16), None, 0.0, True),
((1, 8, 4, 16), (1, 8, 64, 16), (1, 8, 64, 16), None, 0.0, False),
((8, 28, 256, 128), (8, 28, 512, 128), (8, 28, 512, 128), None, 0.0, True),
]
_TOLERANCE_MAP = {
infinicore.float16: {"atol": 1e-2, "rtol": 1e-2},
infinicore.bfloat16: {"atol": 1e-2, "rtol": 1e-2},
infinicore.float32: {"atol": 1e-3, "rtol": 1e-3},
}
_TENSOR_DTYPES = [infinicore.float16, infinicore.bfloat16, infinicore.float32]
def parse_test_cases():
    """Build one flash-attention TestCase per (shape, dtype) combination."""
    import random

    test_cases = []
    for q_shape, k_shape, v_shape, attn_mask, dropout_p, is_causal in _TEST_CASES_DATA:
        for dtype in _TENSOR_DTYPES:
            tolerance = _TOLERANCE_MAP[dtype]
            q_spec = TensorSpec.from_tensor(q_shape, None, dtype)
            k_spec = TensorSpec.from_tensor(k_shape, None, dtype)
            v_spec = TensorSpec.from_tensor(v_shape, None, dtype)
            # Draw an effective KV length in [1, kv_seq_len]; the int64 length
            # tensor is filled with exactly this value (RANDINT over [low, high)).
            effective_len = random.randint(1, k_shape[2])
            kv_len_spec = TensorSpec.from_tensor(
                (q_shape[0],),
                None,
                infinicore.int64,
                init_mode=TensorInitializer.RANDINT,
                low=effective_len,
                high=effective_len + 1,
            )
            # Drop None-valued kwargs (e.g. a missing attn_mask) before forwarding.
            call_kwargs = {
                name: value
                for name, value in {
                    "attn_mask": attn_mask,
                    "dropout_p": dropout_p,
                    "is_causal": is_causal,
                }.items()
                if value is not None
            }
            test_cases.append(
                TestCase(
                    inputs=[q_spec, k_spec, v_spec, kv_len_spec, effective_len],
                    kwargs=call_kwargs,
                    output_spec=None,
                    comparison_target=None,
                    tolerance=tolerance,
                    description="Flash Attention",
                )
            )
    return test_cases
def torch_flash_attn(q, k, v, total_kv_len, cheat, **kwargs):
    """Reference attention over only the first `cheat` KV positions.

    `total_kv_len` is unused here: the reference "cheats" by slicing K/V
    directly with the known effective length instead of masking.
    """
    return torch.nn.functional.scaled_dot_product_attention(
        q, k[..., :cheat, :], v[..., :cheat, :], **kwargs
    )
def infini_flash_attn(q, k, v, total_kv_len, cheat, **kwargs):
    # InfiniCore path: pass the KV-length tensor to the kernel; `cheat`
    # (the raw python int used by the torch reference) is intentionally unused.
    return infinicore.nn.functional.flash_attention(q, k, v, total_kv_len, **kwargs)
class OpTest(BaseOperatorTest):
    """ScaledDotProductAttention operator test with simplified implementation"""

    def __init__(self):
        super().__init__("ScaledDotProductAttention")

    def get_test_cases(self):
        # Cases are regenerated (with fresh random KV lengths) on each call.
        return parse_test_cases()

    def torch_operator(self, *args, **kwargs):
        # Delegates to the sliced-SDPA reference implementation.
        return torch_flash_attn(*args, **kwargs)

    def infinicore_operator(self, *args, **kwargs):
        # Delegates to the InfiniCore flash_attention wrapper.
        return infini_flash_attn(*args, **kwargs)
def main():
    """Main entry point: run the flash-attention suite and exit."""
    GenericTestRunner(OpTest).run_and_exit()


if __name__ == "__main__":
    main()
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import torch
import infinicore
from framework import (
BaseOperatorTest,
TensorSpec,
TensorInitializer,
TestCase,
GenericTestRunner,
is_broadcast,
)
# ==============================================================================
# Operator-specific configuration
# ==============================================================================
# Test cases format: (shape (bs, nkvh, seq_len, hd), strides)
_TEST_CASES_DATA = [
((1, 1, 8, 1), None),
((1, 8, 32, 32), None),
((8, 8, 64, 32), None),
((1, 32, 8, 64), (32768, 1024, 64, 1)),
((4, 8, 32, 16), (65536, 8192, 256, 16)),
((8, 16, 64, 128), (8388608, 524288, 8192, 1)),
]
# Tolerance configuration
_TOLERANCE_MAP = {
infinicore.float16: {"atol": 0, "rtol": 0},
infinicore.bfloat16: {"atol": 0, "rtol": 0},
infinicore.float32: {"atol": 0, "rtol": 0},
}
# Data types to test
_TENSOR_DTYPES = [infinicore.float16, infinicore.bfloat16, infinicore.float32]
def parse_test_cases():
    """Build KV-caching TestCase objects.

    For each cache shape, draws a random new-KV sequence length and a random
    past length such that past_len + kv_len fits in the cache, then emits one
    case per dtype.
    """
    # Hoisted out of the loop: importing inside the loop body was a smell.
    import random

    test_cases = []
    for data in _TEST_CASES_DATA:
        cache_shape = data[0]
        strides = data[1]
        # New K/V to append: random sequence length in [1, cache capacity].
        kv_shape = (
            cache_shape[0],
            cache_shape[1],
            random.randint(1, cache_shape[2]),
            cache_shape[3],
        )
        past_shape = (cache_shape[0],)
        # Past length chosen so the appended tokens still fit in the cache.
        past_length = random.randint(0, cache_shape[2] - kv_shape[2])
        for dtype in _TENSOR_DTYPES:
            tolerance = _TOLERANCE_MAP.get(dtype, {"atol": 0, "rtol": 0})
            cache_spec = TensorSpec.from_tensor(cache_shape, strides, dtype)
            kv_spec = TensorSpec.from_tensor(kv_shape, None, dtype)
            # int64 tensor holding exactly past_length (RANDINT over [low, high)).
            past_kv_lengths_spec = TensorSpec.from_tensor(
                past_shape,
                None,
                infinicore.int64,
                init_mode=TensorInitializer.RANDINT,
                low=past_length,
                high=past_length + 1,
            )
            test_cases.append(
                TestCase(
                    inputs=[
                        cache_spec,
                        cache_spec,
                        kv_spec,
                        kv_spec,
                        past_kv_lengths_spec,
                    ],
                    kwargs={},
                    output_spec=None,
                    comparison_target=[0, 1],
                    tolerance=tolerance,
                    description="KV Caching",
                )
            )
    return test_cases
def torch_kv_caching(k_cache, v_cache, k, v, past_kv_lengths):
    """Reference KV-cache append.

    Writes k/v (shape [bs, nkvh, seq_len, hd]) into the caches at positions
    [past_len, past_len + seq_len) along the sequence axis, per batch row.
    Mutates k_cache/v_cache in place and returns them.
    """
    batch_size = k_cache.shape[0]
    seq_len = k.shape[2]
    for b in range(batch_size):
        past_len = past_kv_lengths[b].item()
        # All heads are written at once; the per-head Python loop was redundant.
        k_cache[b, :, past_len : past_len + seq_len, :] = k[b]
        v_cache[b, :, past_len : past_len + seq_len, :] = v[b]
    return k_cache, v_cache
def infinicore_kv_caching(k_cache, v_cache, k, v, past_kv_lengths):
    # The kernel writes into the caches in place; return them so the framework
    # can compare both caches against the reference (comparison_target=[0, 1]).
    infinicore.kv_caching(k_cache, v_cache, k, v, past_kv_lengths)
    return k_cache, v_cache
class OpTest(BaseOperatorTest):
    """KV Caching operator test: appends new K/V tokens into batched caches."""

    def __init__(self):
        super().__init__("KV Caching")

    def get_test_cases(self):
        # Cases use random lengths; regenerated per call.
        return parse_test_cases()

    def torch_operator(self, *args, **kwargs):
        # Pure-PyTorch reference append.
        return torch_kv_caching(*args, **kwargs)

    def infinicore_operator(self, *args, **kwargs):
        # InfiniCore kernel under test.
        return infinicore_kv_caching(*args, **kwargs)
def main():
    """Main entry point: run the KV-caching suite and exit."""
    GenericTestRunner(OpTest).run_and_exit()


if __name__ == "__main__":
    main()
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import torch
import infinicore
from framework import (
BaseOperatorTest,
TensorSpec,
TestCase,
GenericTestRunner,
is_broadcast,
TensorInitializer,
)
# ==============================================================================
# Operator-specific configuration
# ==============================================================================
# Test cases format:
_TEST_CASES_DATA = [
# (num_seqs, num_heads, num_kv_heads, head_size, block_size, max_seq_len, use_alibi)
(1, 1, 1, 128, 16, 15, False),
(4, 40, 40, 128, 16, 1024, False),
(6, 40, 40, 128, 16, 1024, False),
(3, 8, 8, 128, 16, 1024, False),
(3, 8, 8, 64, 16, 1024, False),
(8, 64, 8, 128, 16, 2048, False),
]
# Tolerance configuration
_TOLERANCE_MAP = {
infinicore.float16: {"atol": 0, "rtol": 1e-2},
infinicore.float32: {"atol": 1e-4, "rtol": 1e-3},
infinicore.bfloat16: {"atol": 0, "rtol": 5e-2},
}
# Data types to test
_TENSOR_DTYPES = [infinicore.float16, infinicore.bfloat16]
# ==============================================================================
# Reference Implementation
# ==============================================================================
def parse_test_cases():
    """
    Parse test case data and return list of TestCase objects for paged_attention operation.
    Each test case contains all necessary information for execution and validation.
    """
    test_cases = []
    for (
        num_seqs,
        num_heads,
        num_kv_heads,
        head_size,
        block_size,
        max_seq_len,
        _use_alibi,  # currently unused: alibi_slopes is always passed as None
    ) in _TEST_CASES_DATA:
        scale = 1.0 / (head_size**0.5)
        max_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
        num_blocks = num_seqs * max_blocks_per_seq  # enough blocks for every sequence
        # Random per-sequence context lengths in [1, max_seq_len).
        cache_lens_torch = torch.randint(1, max_seq_len, (num_seqs,), dtype=torch.int64)
        # Each sequence owns a contiguous run of block ids.
        block_tables = torch.arange(
            0, num_seqs * max_blocks_per_seq, dtype=torch.int64
        ).view(num_seqs, max_blocks_per_seq)
        q_shape = (num_seqs, num_heads, head_size)
        k_cache_shape = (num_blocks, num_kv_heads, block_size, head_size)
        v_cache_shape = (num_blocks, num_kv_heads, block_size, head_size)
        block_tables_shape = block_tables.shape
        cache_lens_shape = cache_lens_torch.shape
        # Generate test cases for all data types
        for dtype in _TENSOR_DTYPES:
            tolerance = _TOLERANCE_MAP.get(dtype, {"atol": 0, "rtol": 1e-3})
            # Create typed tensor specs
            q_spec = TensorSpec.from_tensor(q_shape, None, dtype)
            k_cache_spec = TensorSpec.from_tensor(k_cache_shape, None, dtype)
            v_cache_spec = TensorSpec.from_tensor(v_cache_shape, None, dtype)
            block_tables_spec = TensorSpec.from_tensor(
                block_tables_shape,
                init_mode=TensorInitializer.MANUAL,
                set_tensor=block_tables,
                dtype=infinicore.int64,
            )
            cache_lens_spec = TensorSpec.from_tensor(
                cache_lens_shape,
                init_mode=TensorInitializer.MANUAL,
                set_tensor=cache_lens_torch,
                dtype=infinicore.int64,
            )
            # NOTE: the previous out_shape/out_spec locals were dead code
            # (output_spec is None below) and have been removed.
            test_cases.append(
                TestCase(
                    inputs=[
                        q_spec,
                        k_cache_spec,
                        v_cache_spec,
                        block_tables_spec,
                        cache_lens_spec,
                    ],
                    kwargs={"alibi_slopes": None, "scale": scale},
                    output_spec=None,
                    comparison_target=0,
                    tolerance=tolerance,
                    description="PagedAttention",
                )
            )
    return test_cases
def ref_masked_attention(query, key, value, scale, attn_mask=None):
    """Reference masked attention for a single query step.

    query: [q, h, d]; key/value: [k, h, d]; returns [q, h, d].
    """
    # Logits in fp32 for numerical stability.
    logits = scale * torch.einsum("qhd,khd->hqk", query, key).float()
    if attn_mask is not None:
        logits = logits + attn_mask.float()
    probs = torch.nn.functional.softmax(logits, dim=-1).to(value.dtype)
    return torch.einsum("hqk,khd->qhd", probs, value)
def ref_single_query_cached_kv_attention(
    query, key_cache, value_cache, block_tables, cache_lens, alibi_slopes, scale
):
    # Reference implementation for paged attention, iterating through each sequence.
    #
    # query:            [num_seqs, num_query_heads, head_size] (one token per sequence)
    # key/value_cache:  [num_blocks, num_kv_heads, block_size, head_size]
    # block_tables:     per-sequence block ids; cache_lens: valid length per sequence
    output = torch.empty_like(query)
    num_query_heads, num_kv_heads = query.shape[1], value_cache.shape[1]
    num_queries_per_kv = num_query_heads // num_kv_heads
    head_size, block_size = value_cache.shape[3], value_cache.shape[2]
    num_seqs = query.shape[0]
    for i in range(num_seqs):
        q = query[i].unsqueeze(0)
        seq_len = cache_lens[i].item()
        block_table = block_tables[i]
        keys_lst, values_lst = [], []
        # Gather the K/V vectors for every valid position from the paged cache.
        for j in range(seq_len):
            block_num = block_table[j // block_size].item()
            block_off = j % block_size
            k = key_cache[block_num, :, block_off, :]
            v = value_cache[block_num, :, block_off, :]
            keys_lst.append(k)
            values_lst.append(v)
        keys = torch.stack(keys_lst, dim=0)
        values = torch.stack(values_lst, dim=0)
        if num_queries_per_kv > 1:
            # Grouped-query attention: replicate each KV head for its query group.
            keys = torch.repeat_interleave(keys, num_queries_per_kv, dim=1)
            values = torch.repeat_interleave(values, num_queries_per_kv, dim=1)
        alibi_bias = None
        if alibi_slopes is not None:
            # ALiBi: per-head linear distance penalty (0 at the newest position).
            pos = torch.arange(seq_len, device=query.device).int()
            alibi_bias = (pos - seq_len + 1).float()
            alibi_bias = alibi_slopes.view(-1, 1, 1) * alibi_bias.view(1, 1, -1)
        out = ref_masked_attention(q, keys, values, scale, alibi_bias)
        output[i] = out.view(num_query_heads, head_size)
    return output
class OpTest(BaseOperatorTest):
    """PagedAttention operator test with simplified implementation"""

    def __init__(self):
        super().__init__("PagedAttention")

    def get_test_cases(self):
        return parse_test_cases()

    def torch_operator(self, *args, **kwargs):
        """PyTorch paged_attention reference implementation"""
        return ref_single_query_cached_kv_attention(*args, **kwargs)

    def infinicore_operator(self, *args, **kwargs):
        """InfiniCore paged_attention implementation"""
        out = infinicore.paged_attention(*args, **kwargs)
        # Block until the kernel completes so the comparison reads final data.
        infinicore.sync_stream()
        return out
def main():
    """Main entry point: run the paged-attention suite and exit."""
    GenericTestRunner(OpTest).run_and_exit()


if __name__ == "__main__":
    main()
import os
import sys
import torch
import infinicore
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from framework import (
BaseOperatorTest,
GenericTestRunner,
TensorInitializer,
TensorSpec,
TestCase,
)
# Test Cases: (num_seqs, num_heads, num_kv_heads, head_size, block_size, max_step_len, num_rounds)
_TEST_CASES_DATA = [
(1, 1, 1, 128, 8, 16, 1),
(1, 4, 4, 128, 8, 16, 4),
(2, 8, 8, 128, 16, 32, 2),
(4, 16, 16, 128, 8, 64, 3),
(8, 64, 64, 128, 8, 16, 5),
]
_TOLERANCE_MAP = {
infinicore.float16: {"atol": 1e-2, "rtol": 1e-2},
infinicore.float32: {"atol": 1e-4, "rtol": 1e-4}, # float32 调优容限
infinicore.bfloat16: {"atol": 2e-2, "rtol": 2e-2},
}
_TENSOR_DTYPES = [infinicore.float16, infinicore.bfloat16]
class SimpleCacheManager:
    """Minimal block allocator mimicking a paged-KV cache scheduler.

    Tracks, per request id, the list of owned block ids and the total
    number of tokens stored so far.
    """

    def __init__(self, num_blocks, block_size):
        self.num_blocks = num_blocks
        self.block_size = block_size
        self.free_blocks = list(range(num_blocks))
        self.request_to_blocks = {}
        self.request_to_len = {}

    def allocate_slots(self, request_id, num_new_tokens):
        """Reserve blocks for `num_new_tokens` more tokens of `request_id`.

        Returns (block_ids, new_total_len). Blocks are taken from the head
        of the free list, so allocation order is deterministic.
        """
        if request_id not in self.request_to_len:
            # First time we see this request: start from an empty allocation.
            self.request_to_len[request_id] = 0
            self.request_to_blocks[request_id] = []
        new_total_len = self.request_to_len[request_id] + num_new_tokens
        # Ceil-divide to find how many blocks the grown request needs.
        needed_blocks = -(-new_total_len // self.block_size)
        owned = self.request_to_blocks[request_id]
        while len(owned) < needed_blocks:
            owned.append(self.free_blocks.pop(0))
        self.request_to_len[request_id] = new_total_len
        return owned, new_total_len
def parse_test_cases():
    """Build multi-round prefill test cases.

    Simulates `num_rounds` prefill turns per configuration: each round appends
    freshly generated K/V tokens into persistent paged caches (allocated via
    SimpleCacheManager), then snapshots query/caches/tables/lengths into one
    TestCase per dtype.
    """
    test_cases = []
    for (
        num_seqs,
        num_heads,
        num_kv_heads,
        head_size,
        block_size,
        max_step_len,
        num_rounds,
    ) in _TEST_CASES_DATA:
        scale = head_size**-0.5
        num_blocks = 8192  # large fixed pool shared by all rounds
        manager = SimpleCacheManager(num_blocks, block_size)
        kv_lens = torch.zeros(num_seqs, dtype=torch.int64)
        # Persistent cache pools carried across rounds (fp32 master copies).
        persistent_k = torch.zeros((num_blocks, num_kv_heads, block_size, head_size))
        persistent_v = torch.zeros((num_blocks, num_kv_heads, block_size, head_size))
        for r in range(num_rounds):
            # Random number of new query tokens per sequence this round.
            q_lens = torch.randint(1, max_step_len + 1, (num_seqs,), dtype=torch.int64)
            kv_lens = kv_lens + q_lens
            total_q_tokens = q_lens.sum().item()
            # Exclusive prefix sums delimiting each sequence's packed queries.
            cum_seqlens_q = torch.zeros(num_seqs + 1, dtype=torch.int64)
            cum_seqlens_q[1:] = torch.cumsum(q_lens, dim=0)
            query_base = torch.randn((total_q_tokens, num_heads, head_size))
            round_block_tables_list = []
            for i in range(num_seqs):
                p_blocks, total_len = manager.allocate_slots(i, q_lens[i].item())
                round_block_tables_list.append(p_blocks)
                # History length = total length minus this round's new tokens.
                h_len = kv_lens[i].item() - q_lens[i].item()
                # Write this round's new K/V tokens into their paged slots.
                for t in range(q_lens[i].item()):
                    logical_pos = h_len + t
                    b_id = p_blocks[logical_pos // block_size]
                    off = logical_pos % block_size
                    persistent_k[b_id, :, off, :] = torch.randn(num_kv_heads, head_size)
                    persistent_v[b_id, :, off, :] = torch.randn(num_kv_heads, head_size)
            # Pad every sequence's block table to the same width with block 0.
            max_blks = max(len(t) for t in round_block_tables_list)
            padded_tables = torch.tensor(
                [t + [0] * (max_blks - len(t)) for t in round_block_tables_list]
            )
            # One TestCase per dtype; tensors are cloned so later rounds do not
            # mutate the snapshots captured here.
            for dtype in _TENSOR_DTYPES:
                tolerance = _TOLERANCE_MAP.get(dtype)
                test_cases.append(
                    TestCase(
                        inputs=[
                            TensorSpec.from_tensor(
                                query_base.shape,
                                init_mode=TensorInitializer.MANUAL,
                                set_tensor=query_base.clone(),
                                dtype=dtype,
                            ),
                            TensorSpec.from_tensor(
                                persistent_k.shape,
                                init_mode=TensorInitializer.MANUAL,
                                set_tensor=persistent_k.clone(),
                                dtype=dtype,
                            ),
                            TensorSpec.from_tensor(
                                persistent_v.shape,
                                init_mode=TensorInitializer.MANUAL,
                                set_tensor=persistent_v.clone(),
                                dtype=dtype,
                            ),
                            TensorSpec.from_tensor(
                                padded_tables.shape,
                                init_mode=TensorInitializer.MANUAL,
                                set_tensor=padded_tables.clone(),
                                dtype=infinicore.int64,
                            ),
                            TensorSpec.from_tensor(
                                kv_lens.shape,
                                init_mode=TensorInitializer.MANUAL,
                                set_tensor=kv_lens.clone(),
                                dtype=infinicore.int64,
                            ),
                            TensorSpec.from_tensor(
                                cum_seqlens_q.shape,
                                init_mode=TensorInitializer.MANUAL,
                                set_tensor=cum_seqlens_q.clone(),
                                dtype=infinicore.int64,
                            ),
                        ],
                        kwargs={"scale": scale},
                        tolerance=tolerance,
                        description=f"PagedAttentionPrefill_Round_{r}_{str(dtype).split('.')[-1]}",
                    )
                )
    return test_cases
def ref_paged_attention_multi_turn(
    query, k_cache, v_cache, block_tables, kv_lens, cum_seqlens_q, scale
):
    # Pure-PyTorch reference for varlen causal prefill over paged caches.
    #
    # query:         [total_q_tokens, num_heads, head_size], packed across sequences
    # kv_lens:       total context length (history + new tokens) per sequence
    # cum_seqlens_q: exclusive prefix sums delimiting each sequence's queries
    output = torch.zeros_like(query)
    num_seqs = len(kv_lens)
    block_size = k_cache.shape[2]
    for i in range(num_seqs):
        q_start, q_end = cum_seqlens_q[i].item(), cum_seqlens_q[i + 1].item()
        cur_q = query[q_start:q_end]
        q_len = q_end - q_start
        # History length = tokens cached before this round's queries.
        h_len = kv_lens[i].item() - q_len
        total_len = h_len + q_len
        table = block_tables[i]
        keys, values = [], []
        # Gather all cached K/V for this sequence from its paged blocks.
        for j in range(total_len):
            b_id = table[j // block_size].item()
            off = j % block_size
            keys.append(k_cache[b_id, :, off, :])
            values.append(v_cache[b_id, :, off, :])
        K = torch.stack(keys, dim=0)
        V = torch.stack(values, dim=0)
        # Scores in fp32 for stability.
        scores = torch.einsum("qhd,khd->hqk", cur_q.float(), K.float()) * scale
        # Causal mask: query t may attend to all history plus positions <= t.
        mask = torch.full((q_len, total_len), float("-inf"), device=query.device)
        for t in range(q_len):
            mask[t, : h_len + t + 1] = 0.0
        attn = torch.softmax(scores + mask.unsqueeze(0), dim=-1).to(query.dtype)
        output[q_start:q_end] = torch.einsum("hqk,khd->qhd", attn, V)
    return output
class OpTest(BaseOperatorTest):
    """PagedAttentionPrefill operator test (multi-round, varlen prefill)."""

    def __init__(self):
        super().__init__("PagedAttentionPrefill")

    def get_test_cases(self):
        return parse_test_cases()

    def torch_operator(
        self,
        query,
        k_cache,
        v_cache,
        block_tables,
        kv_lens,
        cum_seqlens_q,
        scale=1.0,
    ):
        # Pure-PyTorch causal reference over the paged caches.
        return ref_paged_attention_multi_turn(
            query, k_cache, v_cache, block_tables, kv_lens, cum_seqlens_q, scale
        )

    def infinicore_operator(
        self,
        query,
        k_cache,
        v_cache,
        block_tables,
        kv_lens,
        cum_seqlens_q,
        scale=1.0,
    ):
        out = infinicore.paged_attention_prefill(
            query,
            k_cache,
            v_cache,
            block_tables,
            kv_lens,
            cum_seqlens_q,
            alibi_slopes=None,
            scale=scale,
        )
        # Block until the kernel completes so the comparison reads final data.
        infinicore.sync_stream()
        return out
def main():
    """Main entry point: run the paged-attention prefill suite and exit."""
    GenericTestRunner(OpTest).run_and_exit()


if __name__ == "__main__":
    main()
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import torch
import infinicore
from framework import (
BaseOperatorTest,
TensorSpec,
TestCase,
GenericTestRunner,
is_broadcast,
TensorInitializer,
)
# ==============================================================================
# Operator-specific configuration
# ==============================================================================
# Test cases format: (num_seqs, max_seq_len, num_kv_heads, head_size, block_size)
_TEST_CASES_DATA = [
(1, 128, 8, 128, 16),
(5, 512, 40, 128, 16),
(16, 1024, 8, 64, 32),
(10, 1024, 40, 64, 32),
]
# Tolerance configuration
_TOLERANCE_MAP = {
infinicore.float16: {"atol": 0, "rtol": 1e-5},
infinicore.float32: {"atol": 0, "rtol": 1e-5},
infinicore.bfloat16: {"atol": 0, "rtol": 1e-5},
}
# Data types to test
_TENSOR_DTYPES = [infinicore.float16, infinicore.bfloat16, infinicore.float32]
# ==============================================================================
# Reference Implementation
# ==============================================================================
def ref_paged_caching(key_cache_pool, value_cache_pool, key, value, slot_mapping):
    """
    Reference implementation for the paged_caching operator.

    Scatters each token's K/V vectors into the paged cache pools at the flat
    slot given by slot_mapping. NOTE: the pools are updated IN PLACE (nothing
    is cloned here), and the mutated pools are returned.

    Args:
        key_cache_pool (torch.Tensor): K cache pool, shape [num_blocks, nkvh, block_size, dh]
        value_cache_pool (torch.Tensor): V cache pool, shape [num_blocks, nkvh, block_size, dh]
        key (torch.Tensor): Keys, shape [ntok, nkvh, dh]
        value (torch.Tensor): Values, shape [ntok, nkvh, dh]
        slot_mapping (torch.Tensor): Slot mapping, shape [ntok]
    """
    block_size = key_cache_pool.shape[2]
    for token_idx, slot_tensor in enumerate(slot_mapping):
        # Flat slot -> (block index, offset within block).
        block_idx, block_off = divmod(slot_tensor.item(), block_size)
        key_cache_pool[block_idx, :, block_off, :] = key[token_idx]
        value_cache_pool[block_idx, :, block_off, :] = value[token_idx]
    return key_cache_pool, value_cache_pool
def parse_test_cases():
    """
    Parse test case data and return list of TestCase objects for paged_caching operation.
    Each test case contains all necessary information for execution and validation.
    """
    test_cases = []
    for num_seqs, max_seq_len, num_kv_heads, head_size, block_size in _TEST_CASES_DATA:
        num_blocks = 4096  # A reasonably large cache pool for testing
        # Variable context lengths for each sequence in the batch.
        context_lens_torch = torch.randint(
            1, max_seq_len + 1, (num_seqs,), dtype=torch.int64
        )
        ntok = torch.sum(context_lens_torch).item()
        # Simulate the scheduler: give each sequence a contiguous run of slots.
        slot_mapping_list = []
        current_slot = 0
        for length in context_lens_torch:
            slot_mapping_list.extend(range(current_slot, current_slot + length.item()))
            current_slot += length.item()
        # Ensure we don't exceed the total number of slots in the cache.
        assert current_slot <= num_blocks * block_size, (
            "Not enough blocks in the cache pool for this test case"
        )
        slot_mapping = torch.tensor(slot_mapping_list, dtype=torch.int64)
        slot_mapping_shape = slot_mapping.shape
        k_shape = (ntok, num_kv_heads, head_size)
        v_shape = (ntok, num_kv_heads, head_size)
        k_cache_shape = (num_blocks, num_kv_heads, block_size, head_size)
        v_cache_shape = (num_blocks, num_kv_heads, block_size, head_size)
        # Generate test cases for all data types
        for dtype in _TENSOR_DTYPES:
            tolerance = _TOLERANCE_MAP.get(dtype, {"atol": 0, "rtol": 1e-3})
            # Create typed tensor specs
            k_spec = TensorSpec.from_tensor(k_shape, None, dtype)
            v_spec = TensorSpec.from_tensor(v_shape, None, dtype)
            # Caches start zeroed so untouched slots remain comparable.
            k_cache_spec = TensorSpec.from_tensor(
                k_cache_shape, None, dtype, init_mode=TensorInitializer.ZEROS
            )
            v_cache_spec = TensorSpec.from_tensor(
                v_cache_shape, None, dtype, init_mode=TensorInitializer.ZEROS
            )
            slot_mapping_spec = TensorSpec.from_tensor(
                slot_mapping_shape,
                init_mode=TensorInitializer.MANUAL,
                set_tensor=slot_mapping,
                dtype=infinicore.int64,
            )
            # In-place operation: modifies k_cache (input 0) and v_cache (input 1)
            test_cases.append(
                TestCase(
                    inputs=[
                        k_cache_spec,
                        v_cache_spec,
                        k_spec,
                        v_spec,
                        slot_mapping_spec,
                    ],
                    kwargs=None,
                    output_spec=None,
                    comparison_target=0,  # Only compare k_cache
                    tolerance=tolerance,
                    description="PagedCaching",
                )
            )
    return test_cases
class OpTest(BaseOperatorTest):
    """PagedCaching operator test with simplified implementation"""

    def __init__(self):
        super().__init__("PagedCaching")

    def get_test_cases(self):
        # Cases use random context lengths; regenerated per call.
        return parse_test_cases()

    def torch_operator(self, *args, **kwargs):
        """PyTorch paged_caching implementation"""
        return ref_paged_caching(*args, **kwargs)

    def infinicore_operator(self, *args, **kwargs):
        """InfiniCore paged_caching implementation"""
        return infinicore.paged_caching(*args, **kwargs)
def main():
    """Main entry point: run the paged-caching suite and exit."""
    GenericTestRunner(OpTest).run_and_exit()


if __name__ == "__main__":
    main()
......@@ -222,8 +222,8 @@ class OpTest(BaseOperatorTest):
# Re-run operations with the same logits to get results for comparison
# prepare_pytorch_inputs_and_kwargs will reuse self._current_logits if it exists
from framework.base import TestResult
from framework.utils import (
from framework.results import CaseResult
from framework.utils.tensor_utils import (
convert_infinicore_to_torch,
infinicore_tensor_from_torch,
)
......@@ -268,8 +268,8 @@ class OpTest(BaseOperatorTest):
# Check if indices are equal (standard case)
if ic_idx == ref_idx:
# Return a successful TestResult object
return TestResult(
# Return a successful CaseResult object
return CaseResult(
success=True,
return_code=0,
test_case=test_case,
......@@ -283,8 +283,8 @@ class OpTest(BaseOperatorTest):
logits_ic = logits_tensor[ic_idx].item()
if logits_ic == logits_ref:
# Valid: different indices but same logits value
# Return a successful TestResult object
return TestResult(
# Return a successful CaseResult object
return CaseResult(
success=True,
return_code=0,
test_case=test_case,
......
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import torch
import infinicore
from framework import (
BaseOperatorTest,
TensorSpec,
TestCase,
GenericTestRunner,
is_broadcast,
)
# ==============================================================================
# Operator-specific configuration
# ==============================================================================
# Test cases format: (input_shape)
# The operator splits the last dimension: Input (..., 2*d) -> Output (..., d)
_TEST_CASES_DATA = [
    (2, 4),  # minimal 2-D case -> output (2, 2)
    (1024, 1024),
    (2, 4, 8),
    (1, 22016),  # typical MLP intermediate width
    (2, 4, 256),
    (2, 4, 16, 256),
]
# Tolerance configuration for different precisions
# (looser bounds for the low-precision float formats)
_TOLERANCE_MAP = {
    infinicore.float16: {"atol": 1e-3, "rtol": 1e-3},
    infinicore.float32: {"atol": 1e-5, "rtol": 1e-5},
    infinicore.bfloat16: {"atol": 5e-3, "rtol": 1e-2},
}
# Every shape above is exercised in each of these dtypes.
_TENSOR_DTYPES = [infinicore.float16, infinicore.bfloat16, infinicore.float32]
def parse_test_cases():
    """
    Build the SiLUAndMul test case list.

    The operator maps an input of shape [..., 2*d] to an output of shape
    [..., d], so in-place execution is impossible and is not exercised.
    For every (shape, dtype) pair two cases are emitted: a functional one
    (operator allocates the result) and one writing into a caller-provided
    output buffer.
    """
    cases = []
    for shape in _TEST_CASES_DATA:
        # Halve the trailing dimension to obtain the SwiGLU output shape.
        out_shape = (*shape[:-1], shape[-1] // 2)
        for dtype in _TENSOR_DTYPES:
            tol = _TOLERANCE_MAP.get(dtype, {"atol": 1e-5, "rtol": 1e-4})
            in_spec = TensorSpec.from_tensor(shape, None, dtype)
            out_spec = TensorSpec.from_tensor(out_shape, None, dtype)
            # Functional style: operator allocates new memory for the output.
            cases.append(
                TestCase(
                    inputs=[in_spec],
                    kwargs={},
                    output_spec=None,
                    comparison_target=None,
                    tolerance=tol,
                    description=f"SiLUAndMul_Functional_{dtype}",
                )
            )
            # Out-parameter style: result written into a pre-allocated buffer.
            cases.append(
                TestCase(
                    inputs=[in_spec],
                    kwargs=None,
                    output_spec=out_spec,
                    comparison_target="out",
                    tolerance=tol,
                    description=f"SiLUAndMul_OutParam_{dtype}",
                )
            )
    return cases
class OpTest(BaseOperatorTest):
    """SiLUAndMul (SwiGLU activation) operator test."""

    def __init__(self):
        super().__init__("SiLUAndMul")

    def get_test_cases(self):
        return parse_test_cases()

    def torch_operator(self, input, out=None, **kwargs):
        """
        PyTorch reference: SiLU(gate) * up, where the last dimension of
        *input* is split evenly into [gate, up].
        """
        half = input.shape[-1] // 2
        gate = input[..., :half]
        up = input[..., half:]
        activated = torch.nn.functional.silu(gate) * up
        if out is None:
            return activated
        out.copy_(activated)
        return out

    def infinicore_operator(self, input, out=None, **kwargs):
        """InfiniCore implementation via infinicore.nn.functional.silu_and_mul."""
        import infinicore.nn.functional as F

        return F.silu_and_mul(input, out=out)
def main():
    """Entry point: run the SiLUAndMul tests and exit with their status."""
    GenericTestRunner(OpTest).run_and_exit()


if __name__ == "__main__":
    main()
......@@ -7,6 +7,7 @@ import torch
import infinicore
from framework import (
BaseOperatorTest,
CaseResult,
TensorSpec,
TestCase,
GenericTestRunner,
......@@ -180,7 +181,7 @@ class OpTest(BaseOperatorTest):
and isinstance(test_case.inputs[0], TensorSpec)
and test_case.inputs[0].strides is not None
):
return TestResult(
return CaseResult(
success=False,
return_code=-2,
test_case=test_case,
......@@ -193,7 +194,7 @@ class OpTest(BaseOperatorTest):
)
for spec in output_specs:
if isinstance(spec, TensorSpec) and spec.strides is not None:
return TestResult(
return CaseResult(
success=False,
return_code=-2,
test_case=test_case,
......
......@@ -7,6 +7,7 @@ import torch
import infinicore
from framework import (
BaseOperatorTest,
CaseResult,
TensorSpec,
TestCase,
GenericTestRunner,
......@@ -122,7 +123,7 @@ class OpTest(BaseOperatorTest):
and isinstance(test_case.inputs[0], TensorSpec)
and test_case.inputs[0].strides is not None
):
return TestResult(
return CaseResult(
success=False,
return_code=-2,
test_case=test_case,
......@@ -135,7 +136,7 @@ class OpTest(BaseOperatorTest):
and isinstance(test_case.output_spec, TensorSpec)
and test_case.output_spec.strides is not None
):
return TestResult(
return CaseResult(
success=False,
return_code=-2,
test_case=test_case,
......
import os
import sys
import argparse
import traceback
import json
import os
from pathlib import Path
import importlib.util
from framework import get_hardware_args_group, add_common_test_args
def find_ops_directory(location=None):
    """
    Locate the operator-test directory.

    Checks a single candidate directory (no upward search): either the
    given *location* or ``<this file's dir>/ops`` by default. The
    directory qualifies if it exists and contains at least one ``.py``
    file.

    Args:
        location: Candidate directory as ``str`` or ``Path``
            (default: ``ops`` next to this file).

    Returns:
        Path: Resolved path to the ops directory, or ``None`` if the
        candidate is missing or holds no Python files.
    """
    if location is None:
        location = Path(__file__).parent / "ops"
    # Accept both str and Path; the previous version crashed on str
    # inputs because plain strings have no .resolve().
    ops_dir = Path(location).resolve()
    if ops_dir.exists() and any(ops_dir.glob("*.py")):
        return ops_dir
    return None
def get_available_operators(ops_dir):
    """
    List operator names discovered in the ops directory.

    A ``.py`` file counts as an operator test when its text mentions
    ``infinicore`` together with ``BaseOperatorTest`` or
    ``GenericTestRunner``. This script itself is excluded.

    Args:
        ops_dir: Path to the ops directory (may be None).

    Returns:
        Sorted list of operator names (file stems); empty when the
        directory is missing or holds no matching files.
    """
    if not ops_dir or not ops_dir.exists():
        return []
    current_script = Path(__file__).name
    test_files = [f for f in ops_dir.glob("*.py") if f.name != current_script]
    operators = []
    for test_file in test_files:
        try:
            content = test_file.read_text(encoding="utf-8")
        # Narrowed from a bare `except:` so real bugs (and KeyboardInterrupt)
        # are no longer silently swallowed; only unreadable files are skipped.
        except (OSError, UnicodeDecodeError):
            continue
        if "infinicore" in content and (
            "BaseOperatorTest" in content or "GenericTestRunner" in content
        ):
            operators.append(test_file.stem)
    return sorted(operators)
def import_operator_test(test_file_path):
    """
    Dynamically import an operator test module and instantiate its test class.

    The test class is recognized as the first module attribute (in dir()
    order) whose direct bases mention ``BaseOperatorTest`` — matched by
    name so no framework import is required here.

    Args:
        test_file_path: Path to the operator test file.

    Returns:
        tuple: (True, test_instance) on success, (False, error_string) otherwise.
    """
    try:
        module_name = f"op_test_{test_file_path.stem}"
        # Build a module object from the file and register it so the test
        # file's own imports resolve normally.
        spec = importlib.util.spec_from_file_location(module_name, test_file_path)
        if spec is None or spec.loader is None:
            return False, f"Could not load module from {test_file_path}"
        module = importlib.util.module_from_spec(spec)
        sys.modules[module_name] = module
        spec.loader.exec_module(module)
        # Scan module attributes for a BaseOperatorTest subclass.
        for attr_name in dir(module):
            candidate = getattr(module, attr_name)
            if not isinstance(candidate, type):
                continue
            if any("BaseOperatorTest" in str(base) for base in candidate.__bases__):
                return True, candidate()
        return False, f"No test class found in {test_file_path}"
    except Exception as e:
        return False, f"Error importing {test_file_path}: {str(e)}"
def run_all_op_tests(
    ops_dir=None,
    specific_ops=None,
    bench=False,
    bench_mode="both",
    verbose=False,
    debug=False,
):
    """
    Run all operator test scripts in the ops directory using direct import.

    Each candidate file is imported in-process (no subprocess), its test
    runner executed with stdout/stderr captured, and the outcome folded
    into a per-operator results dict plus cumulative benchmark totals.

    Args:
        ops_dir (str, optional): Path to the ops directory. If None, uses auto-detection.
        specific_ops (list, optional): List of specific operator names to test.
        bench (bool): Whether benchmarking is enabled
        bench_mode (str): Benchmark mode - "host", "device", or "both"
        verbose (bool): Whether verbose mode is enabled (stops on first failure)
        debug (bool): Print a traceback and stop on the first exception

    Returns:
        tuple: (results, cumulative_timing). `results` maps test name ->
        dict with success flag, return_code, four timing sums, error
        message and captured output. NOTE(review): the two early-error
        paths below return a bare `{}` instead of this 2-tuple — callers
        that unpack two values will raise on those paths; confirm intended.
    """
    if ops_dir is None:
        ops_dir = find_ops_directory()
    else:
        ops_dir = Path(ops_dir)
    if not ops_dir or not ops_dir.exists():
        print(f"Error: Ops directory '{ops_dir}' does not exist.")
        # NOTE(review): bare dict here, (results, timing) tuple at the end.
        return {}
    print(f"Looking for test files in: {ops_dir}")
    # Find all Python test files
    test_files = list(ops_dir.glob("*.py"))
    # Filter out this script itself and non-operator test files
    current_script = Path(__file__).name
    test_files = [f for f in test_files if f.name != current_script]
    # Filter to include only files that look like operator tests
    operator_test_files = []
    for test_file in test_files:
        try:
            with open(test_file, "r", encoding="utf-8") as f:
                content = f.read()
                # Look for characteristic patterns of operator tests
                if "infinicore" in content and (
                    "BaseOperatorTest" in content or "GenericTestRunner" in content
                ):
                    operator_test_files.append(test_file)
        except Exception as e:
            # Unreadable files are silently skipped during discovery.
            continue
    # Filter for specific operators if requested
    if specific_ops:
        filtered_files = []
        for test_file in operator_test_files:
            test_name = test_file.stem.lower()
            # Exact (case-insensitive) stem match only — no substring matching.
            if any(op.lower() == test_name for op in specific_ops):
                filtered_files.append(test_file)
        operator_test_files = filtered_files
    if not operator_test_files:
        print(f"No operator test files found in {ops_dir}")
        print(f"Available Python files: {[f.name for f in test_files]}")
        # NOTE(review): bare dict here as well (see docstring).
        return {}
    print(f"Found {len(operator_test_files)} operator test files:")
    for test_file in operator_test_files:
        print(f"  - {test_file.name}")
    results = {}
    # Benchmark totals accumulated across all successfully benchmarked operators.
    cumulative_timing = {
        "total_torch_host_time": 0.0,
        "total_torch_device_time": 0.0,
        "total_infinicore_host_time": 0.0,
        "total_infinicore_device_time": 0.0,
        "operators_tested": 0,
    }
    for test_file in operator_test_files:
        test_name = test_file.stem
        try:
            # Import and run the test directly
            success, test_instance_or_error = import_operator_test(test_file)
            if not success:
                print(f"💥 {test_name}: ERROR - {test_instance_or_error}")
                results[test_name] = {
                    "success": False,
                    "return_code": -1,
                    "torch_host_time": 0.0,
                    "torch_device_time": 0.0,
                    "infini_host_time": 0.0,
                    "infini_device_time": 0.0,
                    "error_message": test_instance_or_error,
                    "test_runner": None,
                    "stdout": "",
                    "stderr": test_instance_or_error,
                }
                continue
            # Get the test runner class from the module
            test_module = sys.modules[f"op_test_{test_file.stem}"]
            if not hasattr(test_module, "GenericTestRunner"):
                print(f"💥 {test_name}: ERROR - No GenericTestRunner found")
                results[test_name] = {
                    "success": False,
                    "return_code": -1,
                    "torch_host_time": 0.0,
                    "torch_device_time": 0.0,
                    "infini_host_time": 0.0,
                    "infini_device_time": 0.0,
                    "error_message": "No GenericTestRunner found",
                    "test_runner": None,
                    "stdout": "",
                    "stderr": "No GenericTestRunner found",
                }
                continue
            # Create and run the test runner
            test_runner_class = test_module.GenericTestRunner
            runner_instance = test_runner_class(test_instance_or_error.__class__)
            # Temporarily redirect stdout to capture output
            from io import StringIO
            stdout_capture = StringIO()
            stderr_capture = StringIO()
            old_stdout = sys.stdout
            old_stderr = sys.stderr
            sys.stdout = stdout_capture
            sys.stderr = stderr_capture
            try:
                # Run the test
                test_success, test_runner = runner_instance.run()
                # Get captured output
                stdout_output = stdout_capture.getvalue()
                stderr_output = stderr_capture.getvalue()
                # Restore stdout/stderr
                sys.stdout = old_stdout
                sys.stderr = old_stderr
                # Print the captured output
                if stdout_output:
                    print(stdout_output.rstrip())
                if stderr_output:
                    print("\nSTDERR:")
                    print(stderr_output.rstrip())
                # Analyze test results
                test_results = test_runner.get_test_results() if test_runner else []
                # Determine overall test status
                # Return-code convention: 0 passed, -1 failed, -2 skipped, -3 partial.
                if test_success:
                    return_code = 0
                    status_icon = "✅"
                    status_text = "PASSED"
                else:
                    # Check if there are any failed tests
                    has_failures = any(
                        result.return_code == -1 for result in test_results
                    )
                    has_partial = any(
                        result.return_code == -3 for result in test_results
                    )
                    has_skipped = any(
                        result.return_code == -2 for result in test_results
                    )
                    if has_failures:
                        return_code = -1
                        status_icon = "❌"
                        status_text = "FAILED"
                    elif has_partial:
                        return_code = -3
                        status_icon = "⚠️"
                        status_text = "PARTIAL"
                    elif has_skipped:
                        return_code = -2
                        status_icon = "⏭️"
                        status_text = "SKIPPED"
                    else:
                        # Unsuccessful run but no per-case codes: treat as failed.
                        return_code = -1
                        status_icon = "❌"
                        status_text = "FAILED"
                # Calculate timing for all four metrics
                torch_host_time = sum(result.torch_host_time for result in test_results)
                torch_device_time = sum(
                    result.torch_device_time for result in test_results
                )
                infini_host_time = sum(
                    result.infini_host_time for result in test_results
                )
                infini_device_time = sum(
                    result.infini_device_time for result in test_results
                )
                results[test_name] = {
                    "success": test_success,
                    "return_code": return_code,
                    "torch_host_time": torch_host_time,
                    "torch_device_time": torch_device_time,
                    "infini_host_time": infini_host_time,
                    "infini_device_time": infini_device_time,
                    "error_message": "",
                    "test_runner": test_runner,
                    "stdout": stdout_output,
                    "stderr": stderr_output,
                }
                print(
                    f"{status_icon} {test_name}: {status_text} (return code: {return_code})"
                )
                # Extract benchmark timing if in bench mode
                if bench and test_success and return_code == 0:
                    cumulative_timing["total_torch_host_time"] += torch_host_time
                    cumulative_timing["total_torch_device_time"] += torch_device_time
                    cumulative_timing["total_infinicore_host_time"] += infini_host_time
                    cumulative_timing[
                        "total_infinicore_device_time"
                    ] += infini_device_time
                    cumulative_timing["operators_tested"] += 1
            except Exception as e:
                # Restore stdout/stderr in case of exception
                sys.stdout = old_stdout
                sys.stderr = old_stderr
                # Re-raised so the outer handler records the failure below.
                raise e
            # In verbose mode, stop execution on first failure
            if verbose and not test_success and return_code != 0:
                break
        except Exception as e:
            print(f"💥 {test_name}: ERROR - {str(e)}")
            results[test_name] = {
                "success": False,
                "return_code": -1,
                "torch_host_time": 0.0,
                "torch_device_time": 0.0,
                "infini_host_time": 0.0,
                "infini_device_time": 0.0,
                "error_message": str(e),
                "test_runner": None,
                "stdout": "",
                "stderr": str(e),
            }
            # In verbose mode, stop execution on any exception
            if verbose:
                print(f"\n{'!'*60}")
                print(
                    f"VERBOSE MODE: Stopping execution due to exception in {test_name}"
                )
                print(f"{'!'*60}")
                break
            if debug:
                traceback.print_exc()
                break
    return results, cumulative_timing
def print_summary(
    results,
    verbose=False,
    total_expected_tests=0,
    cumulative_timing=None,
    bench_mode="both",
):
    """Print a comprehensive summary of test results including benchmark data.

    Args:
        results: Mapping of test name -> result dict containing a
            "return_code" entry (0 passed, -1 failed, -2 skipped, -3 partial).
        verbose: Whether verbose mode was enabled (affects hint output).
        total_expected_tests: Number of tests that were expected to run.
        cumulative_timing: Optional dict of accumulated benchmark totals.
        bench_mode: Which timings to display: "host", "device", or "both".

    Returns:
        bool: True when no test failed (skipped/partial still count as
        overall success), False when any test failed or nothing ran.
    """
    print(f"\n{'='*80}")
    print("CUMULATIVE TEST SUMMARY")
    print(f"{'='*80}")
    if not results:
        print("No tests were run.")
        return False
    # Bucket test names by outcome code.
    passed_operators = []
    failed_operators = []
    skipped_operators = []
    partial_operators = []
    for test_name, result_data in results.items():
        return_code = result_data["return_code"]
        if return_code == 0:
            passed_operators.append(test_name)
        elif return_code == -2:  # Special code for skipped tests
            skipped_operators.append(test_name)
        elif return_code == -3:  # Special code for partial tests
            partial_operators.append(test_name)
        else:
            failed_operators.append(test_name)
    passed = len(passed_operators)
    failed = len(failed_operators)
    skipped = len(skipped_operators)
    partial = len(partial_operators)
    total = len(results)
    print(f"Total tests run: {total}")
    if total_expected_tests > 0 and total < total_expected_tests:
        print(f"Total tests expected: {total_expected_tests}")
        print(f"Tests not executed: {total_expected_tests - total}")
    print(f"Passed: {passed}")
    print(f"Failed: {failed}")
    if skipped > 0:
        print(f"Skipped: {skipped}")
    if partial > 0:
        print(f"Partial: {partial}")
    # Print benchmark summary if cumulative_timing data is available
    if cumulative_timing and cumulative_timing["operators_tested"] > 0:
        print(f"{'-'*40}")
        print("BENCHMARK SUMMARY:")
        print(f"  Operators Tested: {cumulative_timing['operators_tested']}")
        # Display timing based on bench_mode
        if bench_mode in ["host", "both"]:
            print(
                f"  PyTorch Host Total Time: {cumulative_timing['total_torch_host_time']:12.3f} ms"
            )
            print(
                f"  InfiniCore Host Total Time: {cumulative_timing['total_infinicore_host_time']:12.3f} ms"
            )
        if bench_mode in ["device", "both"]:
            print(
                f"  PyTorch Device Total Time: {cumulative_timing['total_torch_device_time']:12.3f} ms"
            )
            print(
                f"  InfiniCore Device Total Time: {cumulative_timing['total_infinicore_device_time']:12.3f} ms"
            )
        print(f"{'-'*40}")
    def _print_group(icon, label, names):
        # Helper: print one outcome group, ten operator names per line.
        print(f"\n{icon} {label} ({len(names)}):")
        for i in range(0, len(names), 10):
            print("  " + ", ".join(names[i : i + 10]))
    # Passed list is always shown (with an explicit "None" placeholder).
    if passed_operators:
        _print_group("✅", "PASSED OPERATORS", passed_operators)
    else:
        print("\n✅ PASSED OPERATORS: None")
    if failed_operators:
        _print_group("❌", "FAILED OPERATORS", failed_operators)
    if skipped_operators:
        _print_group("⏭️", "SKIPPED OPERATORS", skipped_operators)
    if partial_operators:
        _print_group("⚠️", "PARTIAL OPERATORS", partial_operators)
    if total > 0:
        # Success rate counts only executed tests (skipped tests excluded).
        executed_tests = passed + failed + partial
        if executed_tests > 0:
            success_rate = passed / executed_tests * 100
            print(f"\nSuccess rate: {success_rate:.1f}%")
        if verbose and total < total_expected_tests:
            print("\n💡 Verbose mode: Execution stopped after first failure")
            print(f"   {total_expected_tests - total} tests were not executed")
    if failed == 0:
        if skipped > 0 or partial > 0:
            print("\n⚠️  Tests completed with some operators not implemented")
            print(f"   - {skipped} tests skipped (both operators not implemented)")
            print(f"   - {partial} tests partial (one operator not implemented)")
            # BUG FIX: this branch previously fell through and implicitly
            # returned None (falsy), so the caller's "not fully implemented"
            # note could never trigger. No failures means overall success.
            return True
        print("\n🎉 All tests passed!")
        return True
    print(f"\n{failed} tests failed")
    return False
def list_available_tests(ops_dir=None):
    """Print every discoverable operator test, or diagnostics when none exist."""
    ops_dir = find_ops_directory() if ops_dir is None else Path(ops_dir)
    if not ops_dir or not ops_dir.exists():
        print(f"Error: Ops directory '{ops_dir}' does not exist.")
        return
    operators = get_available_operators(ops_dir)
    if not operators:
        print(f"No operator test files found in {ops_dir}")
        # Help debugging: show Python files that exist but were not recognized.
        current_script = Path(__file__).name
        other_files = [f for f in ops_dir.glob("*.py") if f.name != current_script]
        if other_files:
            print(f"Available Python files: {[f.name for f in other_files]}")
        return
    print(f"Available operator test files in {ops_dir}:")
    for operator in operators:
        print(f"  - {operator}")
    print(f"\nTotal: {len(operators)} operators")
from framework import (
get_hardware_args_group,
add_common_test_args,
InfiniDeviceEnum,
InfiniDeviceNames,
)
from framework.test_manager import TestCollector, TestManager
def generate_help_epilog(ops_dir):
def generate_help_epilog(ops_dir=None):
"""
Generate dynamic help epilog with available operators and hardware platforms.
Args:
ops_dir: Path to ops directory
Returns:
str: Formatted help text
Generate dynamic help epilog containing available operators and hardware platforms.
Maintains the original output format for backward compatibility.
"""
# Get available operators
operators = get_available_operators(ops_dir)
# === Adapter: Use TestCollector to get operator list ===
# Temporarily instantiate a Collector just to fetch the list
collector = TestCollector(ops_dir)
operators = collector.get_available_operators()
# Build epilog text
# Build epilog text (fully replicating original logic)
epilog_parts = []
# Examples section
......@@ -627,18 +89,142 @@ def generate_help_epilog(ops_dir):
return "\n".join(epilog_parts)
def main():
"""Main entry point with comprehensive command line argument parsing."""
# First, find ops directory for dynamic help generation
ops_dir = find_ops_directory()
def fill_defaults_for_local_mode(args):
    """
    Fill in default argument values for Local Scan mode.

    Parser defaults are intentionally None (so load mode can detect which
    options the user actually passed); local mode needs the real defaults.
    They are applied to a copy so the caller's namespace stays untouched.
    """
    # Shallow copy: Namespace -> dict -> fresh Namespace.
    local_args = argparse.Namespace(**vars(args))
    # Numeric options and their local-mode defaults.
    for option, default in (("num_prerun", 10), ("num_iterations", 1000)):
        if getattr(local_args, option) is None:
            setattr(local_args, option, default)
    return local_args
def load_and_override_cases(load_paths, args):
    """
    Load JSON, apply CLI overrides, and handle all argument logic.

    Args:
        load_paths: Paths (files or directories) passed via --load; for a
            directory every ``*.json`` inside it is read.
        args: Parsed CLI namespace whose values override the JSON ones.

    Returns:
        list: Test-case dicts, each with a normalized ``case["args"]``
        sub-dict (device flags, bench/verbose/debug/eq_nan, prerun and
        iteration counts) ready for TestManager.
    """
    cases = []
    files_to_read = []
    # 1. Scan
    for p_str in load_paths:
        p = Path(p_str)
        if p.is_dir():
            files_to_read.extend(p.glob("*.json"))
        elif p.is_file():
            files_to_read.append(p)
    # 2. Read and Validate
    loaded_count = 0
    skipped_count = 0
    for f_path in files_to_read:
        try:
            with open(f_path, "r", encoding="utf-8") as f:
                data = json.load(f)
            # Unify as a list to handle both single dict and list of dicts
            current_batch = data if isinstance(data, list) else [data]
            valid_batch = []
            for item in current_batch:
                # We only require the 'operator' field to identify the test case.
                if isinstance(item, dict) and "operator" in item:
                    valid_batch.append(item)
                else:
                    skipped_count += 1
            if valid_batch:
                cases.extend(valid_batch)
                loaded_count += 1
        except Exception as e:
            # Log warning only; do not crash the program on bad files to ensure flow continuity.
            print(f"❌ Error loading {f_path.name}: {e}")
    if skipped_count > 0:
        print(f"ℹ️ Ignored {skipped_count} items/files (invalid format).")
    # ==================================================
    # Device Logic using InfiniDeviceEnum
    # ==================================================
    # 1. Identify active devices from CLI arguments
    cli_active_devices = []
    # Iterate through the Enum to check corresponding CLI args
    # Logic: Enum name (e.g., CAMBRICON) -> lower() -> arg name (cambricon)
    # Value: InfiniDeviceNames mapping (e.g., "Cambricon")
    # NOTE(review): assumes InfiniDeviceNames is a dict-like mapping of
    # device enum -> display name — confirm against framework definition.
    for device_enum, device_name in InfiniDeviceNames.items():
        # device_name is like "CPU", "NVIDIA", "Cambricon"
        # arg_name becomes "cpu", "nvidia", "cambricon"
        arg_name = device_name.lower()
        if getattr(args, arg_name, False):
            cli_active_devices.append(device_name)
    print(f"\n[Config Processing]")
    for case in cases:
        if "args" not in case or case["args"] is None:
            case["args"] = {}
        case_args = case["args"]
        # 2. Apply Device Overrides (CLI > JSON)
        if cli_active_devices:
            case["device"] = ",".join(cli_active_devices)
        final_dev_str = case.get("device", "").upper()  # Uppercase for easier matching
        # 3. Set Boolean flags in case_args based on final device string
        for device_enum, device_name in InfiniDeviceNames.items():
            arg_name = device_name.lower()
            # Check if the standard name (e.g., "Cambricon" or "NVIDIA") is in the device string
            # We convert both to upper to ensure case-insensitive matching
            is_active = device_name.upper() in final_dev_str
            case_args[arg_name] = is_active
        # Save path always comes from the CLI (JSON value, if any, is ignored).
        case_args["save"] = getattr(args, "save", None)
        # Standard arguments (CLI > JSON > Default)
        case_args["bench"] = (
            args.bench if args.bench is not None else case_args.get("bench")
        )
        # Boolean Flags
        case_args["verbose"] = args.verbose or case_args.get("verbose", False)
        case_args["debug"] = args.debug or case_args.get("debug", False)
        case_args["eq_nan"] = args.eq_nan or case_args.get("eq_nan", False)
        # Numeric options: CLI wins; a JSON value of 0/None falls back to default.
        case_args["num_prerun"] = (
            args.num_prerun
            if args.num_prerun is not None
            else (case_args.get("num_prerun") or 10)
        )
        case_args["num_iterations"] = (
            args.num_iterations
            if args.num_iterations is not None
            else (case_args.get("num_iterations") or 1000)
        )
    print(f"📂 Processed {len(cases)} cases ready for execution.\n")
    return cases
def main():
"""Main entry point for the InfiniCore Operator Test Runner."""
parser = argparse.ArgumentParser(
description="Run InfiniCore operator tests across multiple hardware platforms",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=generate_help_epilog(ops_dir),
epilog=generate_help_epilog(),
)
# Core options
parser.add_argument(
"--ops-dir", type=str, help="Path to the ops directory (default: auto-detect)"
)
......@@ -650,119 +236,106 @@ def main():
action="store_true",
help="List all available test files without running them",
)
# Call common method to add shared arguments (bench, debug, verbose, save...)
add_common_test_args(parser)
parser.add_argument(
"--load",
nargs="+",
help="Load test cases from JSON",
)
# Default value is None to determine if user provided input
parser.add_argument("--num_prerun", type=lambda x: max(0, int(x)), default=None)
parser.add_argument("--num_iterations", type=lambda x: max(0, int(x)), default=None)
# Add common test arguments (including --save, --bench, etc.)
add_common_test_args(parser)
get_hardware_args_group(parser)
# Parse known args first, leave the rest for the test scripts
args, unknown_args = parser.parse_known_args()
# Show what extra arguments will be passed
if unknown_args:
print(f"Passing extra arguments to test scripts: {unknown_args}")
# Handle list command
# 1. Discovery
collector = TestCollector(args.ops_dir)
if args.list:
list_available_tests(args.ops_dir)
print("Available operators:", collector.get_available_operators())
return
# Auto-detect ops directory if not provided
if args.ops_dir is None:
ops_dir = find_ops_directory()
if not ops_dir:
print(
"Error: Could not auto-detect ops directory. Please specify with --ops-dir"
)
sys.exit(1)
else:
ops_dir = Path(args.ops_dir)
if not ops_dir.exists():
print(f"Error: Ops directory '{ops_dir}' does not exist.")
# ==========================================================================
# Branch 1: Load Mode (JSON Data Driven)
# ==========================================================================
if args.load:
# 1. Load and override arguments
json_cases = load_and_override_cases(args.load, args)
if not json_cases:
sys.exit(1)
# Show what extra arguments will be passed
if unknown_args:
print(f"Passing extra arguments to test scripts: {unknown_args}")
# 2. Determine global Bench status (for Summary display)
bench = json_cases[0]["args"].get("bench")
verbose = json_cases[0]["args"].get("verbose")
# Get available operators for display
available_operators = get_available_operators(ops_dir)
if verbose:
print(
f"Verbose mode: ENABLED (will stop on first error with full traceback)"
)
print(f"InfiniCore Operator Test Runner")
print(f"Operating directory: {ops_dir}")
print(f"Available operators: {len(available_operators)}")
if bench:
print(f"Benchmark mode: {args.bench.upper()} timing")
if args.verbose:
print(f"Verbose mode: ENABLED (will stop on first error with full traceback)")
# 3. Initialize and Execute
test_manager = TestManager(
ops_dir=args.ops_dir, verbose=verbose, bench_mode=bench
)
if args.bench:
bench_mode = args.bench if args.bench != "both" else "both"
print(f"Benchmark mode: {bench_mode.upper()} timing")
success, _ = test_manager.test(json_cases_list=json_cases)
if args.ops:
# Validate requested operators
valid_ops = []
invalid_ops = []
for op in args.ops:
if op in available_operators:
valid_ops.append(op)
else:
invalid_ops.append(op)
if invalid_ops:
print(f"Warning: Unknown operators: {', '.join(invalid_ops)}")
print(f"Available operators: {', '.join(available_operators)}")
if valid_ops:
print(f"Testing operators: {', '.join(valid_ops)}")
total_expected_tests = len(valid_ops)
else:
print("No valid operators specified. Running all available tests.")
total_expected_tests = len(available_operators)
# ==========================================================================
# Branch 2: Local Scan Mode
# ==========================================================================
else:
print("Testing all available operators")
total_expected_tests = len(available_operators)
print()
# Run all tests
results, cumulative_timing = run_all_op_tests(
ops_dir=ops_dir,
specific_ops=args.ops,
bench=bool(args.bench),
bench_mode=args.bench if args.bench else "both",
verbose=args.verbose,
debug=args.debug,
)
if args.verbose:
print(
f"Verbose mode: ENABLED (will stop on first error with full traceback)"
)
# Print summary and exit with appropriate code
all_passed = print_summary(
results,
args.verbose,
total_expected_tests,
cumulative_timing,
bench_mode=args.bench if args.bench else "both",
)
if args.bench:
print(f"Benchmark mode: {args.bench.upper()} timing")
# 2. Filtering
target_ops = None
if args.ops:
available_ops = set(collector.get_available_operators())
requested_ops = set(args.ops)
valid_ops = list(requested_ops & available_ops)
invalid_ops = list(requested_ops - available_ops)
if invalid_ops:
print(f"⚠️ Warning: The following requested operators were not found:")
print(f" {', '.join(invalid_ops)}")
print(f" (Use --list to see available operators)")
if not valid_ops:
# Case A: User input provided, but ALL were invalid.
print(f"⚠️ No valid operators remained from your list.")
print(f"🔄 Fallback: Proceeding to run ALL available tests...")
else:
# Case B: At least some valid operators found.
print(f"🎯 Targeted operators: {', '.join(valid_ops)}")
target_ops = valid_ops
# Check if there were any tests with missing implementations
has_missing_implementations = any(
result_data["return_code"] in [-2, -3] for result_data in results.values()
)
# 3. Execution Preparation
# Fill defaults for local mode (since parser default is None)
global_exec_args = fill_defaults_for_local_mode(args)
if all_passed and has_missing_implementations:
print(f"\n⚠️ Note: Some operators are not fully implemented")
print(f" Run individual tests for details on missing implementations")
# 4. Initialize API & Execute
test_manager = TestManager(
ops_dir=args.ops_dir, verbose=args.verbose, bench_mode=args.bench
)
if args.verbose and not all_passed:
print(
f"\n💡 Verbose mode tip: Use individual test commands for detailed debugging:"
success, _ = test_manager.test(
target_ops=target_ops, global_exec_args=global_exec_args
)
failed_ops = [
name
for name, result_data in results.items()
if result_data["return_code"] == -1
]
for op in failed_ops[:3]: # Show first 3 failed operators
print(f" python {ops_dir / (op + '.py')} --verbose")
sys.exit(0 if all_passed else 1)
sys.exit(0 if success else 1)
if __name__ == "__main__":
......
......@@ -5,9 +5,15 @@ sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import torch
import infinicore
from framework.base import BaseOperatorTest, TensorSpec, TestCase
from framework.runner import GenericTestRunner
from framework.utils import is_broadcast
from framework import (
BaseOperatorTest,
GenericTestRunner,
is_broadcast,
TensorSpec,
TestCase
)
# ==============================================================================
# Operator-specific configuration
......
......@@ -5,9 +5,15 @@ sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import torch
import infinicore
from framework.base import BaseOperatorTest, TensorSpec, TestCase
from framework.runner import GenericTestRunner
from framework.utils import is_broadcast
from framework import (
BaseOperatorTest,
GenericTestRunner,
TensorSpec,
TestCase,
is_broadcast
)
# ==============================================================================
# Operator-specific configuration
......
......@@ -5,9 +5,15 @@ sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import torch
import infinicore
from framework.base import BaseOperatorTest, TensorSpec, TestCase
from framework.runner import GenericTestRunner
from framework.utils import is_broadcast
from framework import (
BaseOperatorTest,
GenericTestRunner,
TensorSpec,
TestCase,
is_broadcast
)
# ==============================================================================
# Operator-specific configuration
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment