Commit 6b8949ce authored by wooway777's avatar wooway777
Browse files

issue/556 - support more inplace cases

parent 2e5b2342
......@@ -7,3 +7,5 @@ def add(input, other, *, out=None):
return Tensor(_infinicore.add(input._underlying, other._underlying))
_infinicore.add_(out._underlying, input._underlying, other._underlying)
return out
......@@ -24,3 +24,5 @@ def attention(q, k, v, k_cache, v_cache, pos, *, out=None):
v_cache._underlying,
pos,
)
return out
......@@ -7,3 +7,5 @@ def causal_softmax(input, *, out=None):
return Tensor(_infinicore.causal_softmax(input._underlying))
_infinicore.causal_softmax_(out._underlying, input._underlying)
return out
......@@ -7,3 +7,5 @@ def matmul(input, other, *, out=None):
return Tensor(_infinicore.matmul(input._underlying, other._underlying))
_infinicore.matmul_(out._underlying, input._underlying, other._underlying)
return out
......@@ -7,3 +7,5 @@ def rearrange(input, other, *, out=None):
return Tensor(_infinicore.rearrange(input._underlying))
_infinicore.rearrange_(out._underlying, input._underlying)
return out
......@@ -11,3 +11,5 @@ def rms_norm(input, weight, epsilon=1e-5, *, out=None):
_infinicore.rms_norm_(
out._underlying, input._underlying, weight._underlying, epsilon
)
return out
......@@ -7,3 +7,5 @@ def silu(input, *, out=None):
return Tensor(_infinicore.silu(input._underlying))
_infinicore.silu_(out._underlying, input._underlying)
return out
......@@ -7,3 +7,5 @@ def swiglu(input, other, *, out=None):
return Tensor(_infinicore.swiglu(input._underlying, other._underlying))
_infinicore.swiglu_(out._underlying, input._underlying, other._underlying)
return out
# [file name]: __init__.py
# [file content begin]
from .base import TestConfig, TestRunner, TestCase, BaseOperatorTest
from .tensor import TensorSpec, TensorInitializer
from .utils import (
......@@ -16,7 +14,6 @@ from .config import get_test_devices, get_args
from .devices import InfiniDeviceEnum, InfiniDeviceNames, torch_device_map
from .datatypes import to_torch_dtype, to_infinicore_dtype
from .runner import GenericTestRunner
from .templates import BinaryOperatorTest, UnaryOperatorTest
__all__ = [
"TensorSpec",
......@@ -41,6 +38,4 @@ __all__ = [
"to_torch_dtype",
"to_infinicore_dtype",
"GenericTestRunner",
"BinaryOperatorTest",
"UnaryOperatorTest",
]
This diff is collapsed.
......@@ -20,6 +20,8 @@ def to_torch_dtype(infini_dtype):
return torch.int64
elif infini_dtype == infinicore.uint8:
return torch.uint8
elif infini_dtype == infinicore.bool:
return torch.bool
else:
raise ValueError(f"Unsupported infinicore dtype: {infini_dtype}")
......@@ -42,5 +44,7 @@ def to_infinicore_dtype(torch_dtype):
return infinicore.int64
elif torch_dtype == torch.uint8:
return infinicore.uint8
elif torch_dtype == torch.bool:
return infinicore.bool
else:
raise ValueError(f"Unsupported torch dtype: {torch_dtype}")
......@@ -20,13 +20,10 @@ class GenericTestRunner:
def run(self):
"""Execute the complete test suite"""
config = TestConfig(
tensor_dtypes=self.operator_test.tensor_dtypes,
tolerance_map=self.operator_test.tolerance_map,
debug=self.args.debug,
bench=self.args.bench,
num_prerun=self.args.num_prerun,
num_iterations=self.args.num_iterations,
dtype_combinations=self.operator_test.dtype_combinations,
)
runner = TestRunner(self.operator_test.test_cases, config)
......
This diff is collapsed.
......@@ -37,43 +37,16 @@ def profile_operation(desc, func, torch_device, num_prerun, num_iterations):
print(f" {desc} time: {elapsed * 1000 :6f} ms")
def is_integer_dtype(dtype):
    """Return True when *dtype* is one of the infinicore integer types."""
    integer_dtypes = (
        infinicore.int8,
        infinicore.int16,
        infinicore.int32,
        infinicore.int64,
        infinicore.uint8,
    )
    return dtype in integer_dtypes
def is_float_dtype(dtype):
    """Return True when *dtype* is one of the infinicore floating-point types."""
    float_dtypes = (infinicore.float16, infinicore.float32, infinicore.bfloat16)
    return dtype in float_dtypes
def debug(
actual, desired, atol=0, rtol=1e-2, equal_nan=False, verbose=True, dtype=None
):
def debug(actual, desired, atol=0, rtol=1e-2, equal_nan=False, verbose=True):
"""
Debug function to compare two tensors and print differences
"""
# Convert to float32 for bfloat16 comparison
if actual.dtype == torch.bfloat16 or desired.dtype == torch.bfloat16:
actual = actual.to(torch.float32)
desired = desired.to(torch.float32)
print_discrepancy(actual, desired, atol, rtol, equal_nan, verbose, dtype)
print_discrepancy(actual, desired, atol, rtol, equal_nan, verbose)
# Use appropriate comparison based on dtype
if dtype and is_integer_dtype(dtype):
# For integer types, require exact equality
import numpy as np
np.testing.assert_array_equal(actual.cpu(), desired.cpu())
else:
# For float types, use allclose
import numpy as np
np.testing.assert_allclose(
......@@ -82,7 +55,7 @@ def debug(
def print_discrepancy(
actual, expected, atol=0, rtol=1e-3, equal_nan=True, verbose=True, dtype=None
actual, expected, atol=0, rtol=1e-3, equal_nan=True, verbose=True
):
"""Print detailed tensor differences"""
if actual.shape != expected.shape:
......@@ -96,21 +69,13 @@ def print_discrepancy(
actual_isnan = torch.isnan(actual)
expected_isnan = torch.isnan(expected)
# Calculate difference mask based on dtype
if dtype and is_integer_dtype(dtype):
# For integer types, exact equality required
diff_mask = actual != expected
else:
# For float types, use tolerance-based comparison
# Calculate difference mask
nan_mismatch = (
actual_isnan ^ expected_isnan
if equal_nan
else actual_isnan | expected_isnan
actual_isnan ^ expected_isnan if equal_nan else actual_isnan | expected_isnan
)
diff_mask = nan_mismatch | (
torch.abs(actual - expected) > (atol + rtol * torch.abs(expected))
)
diff_indices = torch.nonzero(diff_mask, as_tuple=False)
delta = actual - expected
......@@ -142,7 +107,6 @@ def print_discrepancy(
print(f" - Actual dtype: {actual.dtype}")
print(f" - Desired dtype: {expected.dtype}")
if not (dtype and is_integer_dtype(dtype)):
print(f" - Atol: {atol}")
print(f" - Rtol: {rtol}")
print(
......@@ -166,10 +130,6 @@ def get_tolerance(tolerance_map, tensor_dtype, default_atol=0, default_rtol=1e-3
"""
Get tolerance settings based on data type
"""
# For integer types, return zero tolerance (exact match required)
if is_integer_dtype(tensor_dtype):
return 0, 0
tolerance = tolerance_map.get(
tensor_dtype, {"atol": default_atol, "rtol": default_rtol}
)
......@@ -202,6 +162,8 @@ def convert_infinicore_to_torch(infini_result, torch_reference):
Args:
infini_result: infinicore tensor result
torch_reference: PyTorch tensor reference (for shape and device)
dtype: infinicore data type
device_str: torch device string
Returns:
torch.Tensor: PyTorch tensor with infinicore data
......@@ -217,70 +179,103 @@ def convert_infinicore_to_torch(infini_result, torch_reference):
def compare_results(
infini_result, torch_result, atol=1e-5, rtol=1e-5, debug_mode=False, dtype=None
infini_result, torch_result, atol=1e-5, rtol=1e-5, debug_mode=False
):
"""
Generic function to compare infinicore result with PyTorch reference result
Supports both floating-point (with tolerance) and integer (exact) comparison
Args:
infini_result: infinicore tensor result
torch_result: PyTorch tensor reference result
atol: absolute tolerance
rtol: relative tolerance
atol: absolute tolerance (for floating-point only)
rtol: relative tolerance (for floating-point only)
debug_mode: whether to enable debug output
dtype: infinicore data type for comparison logic
Returns:
bool: True if results match within tolerance
bool: True if results match within tolerance (FP) or exactly (integer)
"""
# Convert infinicore result to PyTorch tensor for comparison
torch_result_from_infini = convert_infinicore_to_torch(infini_result, torch_result)
# Choose comparison method based on dtype
if dtype and is_integer_dtype(dtype):
# For integer types, require exact equality
result = torch.equal(torch_result_from_infini, torch_result)
# Handle scalar integer comparison
if isinstance(torch_result_from_infini, (int, float)) and isinstance(
torch_result, (int, float)
):
if isinstance(torch_result_from_infini, int) and isinstance(torch_result, int):
# Exact integer scalar comparison
result_equal = torch_result_from_infini == torch_result
if debug_mode and not result_equal:
print(
f"Integer scalar mismatch: {torch_result_from_infini} != {torch_result}"
)
return result_equal
else:
# For float types, use tolerance-based comparison
result = torch.allclose(
torch_result_from_infini, torch_result, atol=atol, rtol=rtol
# Floating-point scalar comparison with tolerance
return abs(torch_result_from_infini - torch_result) <= atol + rtol * abs(
torch_result
)
# Debug mode: detailed comparison
if debug_mode:
debug(torch_result_from_infini, torch_result, atol=atol, rtol=rtol, dtype=dtype)
return result
debug(torch_result_from_infini, torch_result, atol=atol, rtol=rtol)
# Choose comparison method based on data type
if is_integer_dtype(torch_result_from_infini.dtype) or is_integer_dtype(
torch_result.dtype
):
# Exact equality for integer types
result_equal = torch.equal(torch_result_from_infini, torch_result)
if debug_mode and not result_equal:
print("Integer tensor comparison failed - requiring exact equality")
return result_equal
else:
# Tolerance-based comparison for floating-point types
return torch.allclose(
torch_result_from_infini, torch_result, atol=atol, rtol=rtol
)
def create_test_comparator(config, dtype, tolerance_map=None, mode_name=""):
def create_test_comparator(config, atol, rtol, mode_name=""):
"""
Create a test-specific comparison function that handles test configuration
Create a test-specific comparison function
Args:
config: test configuration
dtype: infinicore data type
tolerance_map: optional tolerance map (defaults to config's tolerance_map)
atol: absolute tolerance (for floating-point only)
rtol: relative tolerance (for floating-point only)
mode_name: operation mode name for debug output
Returns:
callable: function that takes (infini_result, torch_result) and returns bool
"""
if tolerance_map is None:
tolerance_map = config.tolerance_map
atol, rtol = get_tolerance(tolerance_map, dtype)
def compare_test_results(infini_result, torch_result):
if config.debug and mode_name:
print(f"\033[94mDEBUG INFO - {mode_name}:\033[0m")
# For integer types, override tolerance to require exact equality
actual_atol = atol
actual_rtol = rtol
# Check if we're dealing with integer types
try:
# Try to get dtype from infinicore tensor
if hasattr(infini_result, "dtype"):
infini_dtype = infini_result.dtype
torch_dtype = to_torch_dtype(infini_dtype)
if is_integer_dtype(torch_dtype):
actual_atol = 0
actual_rtol = 0
except:
pass
return compare_results(
infini_result,
torch_result,
atol=atol,
rtol=rtol,
atol=actual_atol,
rtol=actual_rtol,
debug_mode=config.debug,
dtype=dtype,
)
return compare_test_results
......@@ -330,3 +325,30 @@ def rearrange_tensor(tensor, new_strides):
new_tensor.set_(new_tensor.untyped_storage(), offset, shape, tuple(new_strides))
return new_tensor
def is_broadcast(strides):
    """
    Determine whether a stride layout describes a broadcasted tensor.

    A tensor counts as broadcasted when any stride is zero, i.e. one
    dimension repeatedly reads the same memory elements.

    Args:
        strides: Tensor strides sequence, or None for a default layout

    Returns:
        bool: True if any stride equals zero; False otherwise (including None)
    """
    if strides is None:
        return False
    return 0 in tuple(strides)
def is_integer_dtype(dtype):
    """Return True for torch integer-like dtypes.

    torch.bool is deliberately included so boolean tensors get the same
    exact-equality comparison path as integer tensors.
    """
    return dtype in (
        torch.int8,
        torch.int16,
        torch.int32,
        torch.int64,
        torch.uint8,
        torch.bool,
    )
......@@ -7,93 +7,138 @@ import torch
import infinicore
from framework.base import BaseOperatorTest, TensorSpec, TestCase
from framework.runner import GenericTestRunner
from framework.utils import is_broadcast
# ==============================================================================
# Operator-specific configuration
# ==============================================================================
# Test cases format: (operation_mode, shape, a_strides, b_strides, c_strides)
# Test cases format: (shape, a_strides, b_strides, c_strides)
_TEST_CASES_DATA = [
(TestCase.BOTH, (13, 4), None, None, None),
(TestCase.BOTH, (13, 4), (10, 1), (10, 1), (10, 1)),
(TestCase.BOTH, (13, 4), (0, 1), None, None),
(TestCase.BOTH, (13, 4, 4), None, None, None),
(TestCase.BOTH, (13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)),
(TestCase.BOTH, (13, 4, 4), (4, 0, 1), (0, 4, 1), None),
(TestCase.BOTH, (16, 5632), None, None, None),
(TestCase.BOTH, (16, 5632), (13312, 1), (13312, 1), (13312, 1)),
# Basic cases
((13, 4), None, None, None),
((13, 4), (10, 1), (10, 1), None),
# Strided cases
((13, 4), None, None, (10, 1)),
((13, 4), (10, 1), (10, 1), (10, 1)),
# 3D cases
((13, 4, 4), None, None, None),
((13, 4, 4), (20, 4, 1), (20, 4, 1), None),
# Broadcast cases
((13, 4, 4), (4, 0, 1), (0, 4, 1), None),
# Large tensors
((16, 5632), None, None, None),
((16, 5632), (13312, 1), (13312, 1), None),
]
def parse_test_cases(data):
    """
    Build one TestCase from an add test-case tuple:
    (operation_mode, shape, a_strides, b_strides, c_strides)

    Trailing stride entries may be omitted; missing strides mean a
    contiguous tensor.
    """
    operation_mode, shape = data[0], data[1]
    a_strides = data[2] if len(data) > 2 else None
    b_strides = data[3] if len(data) > 3 else None
    c_strides = data[4] if len(data) > 4 else None

    def make_spec(strides):
        # Strided spec when strides are supplied, plain contiguous spec otherwise.
        if strides is not None:
            return TensorSpec.from_strided_tensor(shape, strides)
        return TensorSpec.from_tensor(shape)

    # Two same-shape inputs (a, b) and one output (c).
    inputs = [make_spec(a_strides), make_spec(b_strides)]
    output = make_spec(c_strides)
    return TestCase(operation_mode, inputs, output)
# Parse test cases
_TEST_CASES = [parse_test_cases(data) for data in _TEST_CASES_DATA]
# Data types
_TENSOR_DTYPES = [infinicore.float16, infinicore.bfloat16, infinicore.float32]
# Tolerance
# Tolerance configuration
_TOLERANCE_MAP = {
infinicore.float16: {"atol": 0, "rtol": 1e-2},
infinicore.float32: {"atol": 0, "rtol": 1e-3},
infinicore.bfloat16: {"atol": 0, "rtol": 5e-2},
}
# Data types to test
_TENSOR_DTYPES = [infinicore.float16, infinicore.bfloat16, infinicore.float32]
def parse_test_cases():
    """
    Build TestCase objects for the add operator.

    For every entry in _TEST_CASES_DATA and every dtype in _TENSOR_DTYPES,
    up to four cases are produced:
      1. out-of-place (result returned),
      2. explicit output tensor (out=c) when c is not broadcast,
      3. in-place on the first input (out=a) when a is not broadcast,
      4. in-place on the second input (out=b) when b is not broadcast.

    Returns:
        list[TestCase]: fully-specified test cases for execution/validation.
    """
    test_cases = []
    for data in _TEST_CASES_DATA:
        shape = data[0]
        a_strides = data[1] if len(data) > 1 else None
        b_strides = data[2] if len(data) > 2 else None
        c_strides = data[3] if len(data) > 3 else None

        # Broadcast (zero-stride) tensors cannot be written in place.
        a_supports_inplace = not is_broadcast(a_strides)
        b_supports_inplace = not is_broadcast(b_strides)
        c_supports_inplace = not is_broadcast(c_strides)

        for dtype in _TENSOR_DTYPES:
            tolerance = _TOLERANCE_MAP.get(dtype, {"atol": 0, "rtol": 1e-3})

            # Create typed tensor specs.
            a_spec = TensorSpec.from_tensor(shape, a_strides, dtype)
            b_spec = TensorSpec.from_tensor(shape, b_strides, dtype)
            c_spec = TensorSpec.from_tensor(shape, c_strides, dtype)

            # Test Case 1: out-of-place (return value).
            # NOTE: plain strings instead of placeholder-free f-strings (F541).
            test_cases.append(
                TestCase(
                    inputs=[a_spec, b_spec],
                    kwargs={},
                    output_spec=None,
                    comparison_target=None,
                    tolerance=tolerance,
                    description="Add - OUT_OF_PLACE",
                )
            )

            # Test Case 2: explicit output tensor, add(a, b, out=c).
            if c_supports_inplace:
                test_cases.append(
                    TestCase(
                        inputs=[a_spec, b_spec],
                        kwargs=None,
                        output_spec=c_spec,  # specify the output tensor spec
                        comparison_target="out",
                        tolerance=tolerance,
                        description="Add - INPLACE(out)",
                    )
                )

            # Test Case 3: in-place on the first input, add(a, b, out=a).
            if a_supports_inplace:
                test_cases.append(
                    TestCase(
                        inputs=[a_spec, b_spec],
                        kwargs={"out": 0},  # index of the aliased input
                        output_spec=None,
                        comparison_target=0,  # compare first input
                        tolerance=tolerance,
                        description="Add - INPLACE(a)",
                    )
                )

            # Test Case 4: in-place on the second input, add(a, b, out=b).
            if b_supports_inplace:
                test_cases.append(
                    TestCase(
                        inputs=[a_spec, b_spec],
                        kwargs={"out": 1},  # index of the aliased input
                        output_spec=None,
                        comparison_target=1,  # compare second input
                        tolerance=tolerance,
                        description="Add - INPLACE(b)",
                    )
                )

    return test_cases
class OpTest(BaseOperatorTest):
"""Add test with simplified test case parsing"""
"""Add operator test with simplified implementation"""
def __init__(self):
super().__init__("Add")
def get_test_cases(self):
return _TEST_CASES
def get_tensor_dtypes(self):
return _TENSOR_DTYPES
def get_tolerance_map(self):
return _TOLERANCE_MAP
return parse_test_cases()
def torch_operator(self, a, b, out=None, **kwargs):
return torch.add(a, b, out=out)
def torch_operator(self, *args, **kwargs):
"""PyTorch add implementation"""
return torch.add(*args, **kwargs)
def infinicore_operator(self, a, b, out=None, **kwargs):
return infinicore.add(a, b, out=out)
def infinicore_operator(self, *args, **kwargs):
"""InfiniCore add implementation"""
return infinicore.add(*args, **kwargs)
def main():
......
......@@ -11,18 +11,17 @@ import torch
import infinicore
from framework.base import BaseOperatorTest, TensorSpec, TestCase
from framework.runner import GenericTestRunner
from framework.utils import is_broadcast
# ==============================================================================
# Operator-specific configuration
# ==============================================================================
# Test cases format: (operation_mode, n_q_head, n_kv_head, seq_len, head_dim, pos,
# k_cache_buf_len, v_cache_buf_len, q_strides, k_strides, v_strides,
# k_cache_strides, v_cache_strides)
# Test cases format: (n_q_head, n_kv_head, seq_len, head_dim, pos, k_cache_buf_len, v_cache_buf_len,
# q_strides, k_strides, v_strides, k_cache_strides, v_cache_strides)
_TEST_CASES_DATA = [
# Prefill stage
(
TestCase.OUT_OF_PLACE,
32,
4,
5,
......@@ -38,7 +37,6 @@ _TEST_CASES_DATA = [
),
# Decode stage
(
TestCase.OUT_OF_PLACE,
32,
4,
1,
......@@ -53,10 +51,9 @@ _TEST_CASES_DATA = [
[64, 11264, 1],
),
# Small test case
(TestCase.OUT_OF_PLACE, 8, 4, 2, 16, 1, 8, 8, None, None, None, None, None),
(8, 4, 2, 16, 1, 8, 8, None, None, None, None, None),
# Another prefill case
(
TestCase.OUT_OF_PLACE,
28,
28,
15,
......@@ -137,124 +134,114 @@ def torch_attention(q, k, v, k_cache, v_cache, pos):
return attn_output
def parse_test_cases(data):
def parse_test_cases():
"""
Parse attention test case data according to format:
(operation_mode, n_q_head, n_kv_head, seq_len, head_dim, pos,
k_cache_buf_len, v_cache_buf_len, q_strides, k_strides, v_strides,
k_cache_strides, v_cache_strides)
(n_q_head, n_kv_head, seq_len, head_dim, pos, k_cache_buf_len, v_cache_buf_len,
q_strides, k_strides, v_strides, k_cache_strides, v_cache_strides)
"""
operation_mode = data[0]
test_cases = []
for data in _TEST_CASES_DATA:
n_q_head, n_kv_head, seq_len, head_dim, pos = (
data[0],
data[1],
data[2],
data[3],
data[4],
data[5],
)
k_cache_buf_len, v_cache_buf_len = data[6], data[7]
q_strides = data[8] if len(data) > 8 else None
k_strides = data[9] if len(data) > 9 else None
v_strides = data[10] if len(data) > 10 else None
k_cache_strides = data[11] if len(data) > 11 else None
v_cache_strides = data[12] if len(data) > 12 else None
# Create input specifications
inputs = []
# Query tensor: (n_q_head, seq_len, head_dim)
if q_strides is not None:
inputs.append(
TensorSpec.from_strided_tensor((n_q_head, seq_len, head_dim), q_strides)
)
else:
inputs.append(TensorSpec.from_tensor((n_q_head, seq_len, head_dim)))
k_cache_buf_len, v_cache_buf_len = data[5], data[6]
q_strides = data[7] if len(data) > 7 else None
k_strides = data[8] if len(data) > 8 else None
v_strides = data[9] if len(data) > 9 else None
k_cache_strides = data[10] if len(data) > 10 else None
v_cache_strides = data[11] if len(data) > 11 else None
# Check if output tensor supports in-place operations
# For attention, output shape is (seq_len, n_q_head, head_dim)
output_shape = (seq_len, n_q_head, head_dim)
output_supports_inplace = True # Output is always contiguous for attention
# Key tensor: (n_kv_head, seq_len, head_dim)
if k_strides is not None:
inputs.append(
TensorSpec.from_strided_tensor((n_kv_head, seq_len, head_dim), k_strides)
)
else:
inputs.append(TensorSpec.from_tensor((n_kv_head, seq_len, head_dim)))
# Generate test cases for all data types
for dtype in [infinicore.float16, infinicore.bfloat16, infinicore.float32]:
tolerance = {
infinicore.float16: {"atol": 1e-4, "rtol": 1e-2},
infinicore.float32: {"atol": 1e-5, "rtol": 1e-3},
infinicore.bfloat16: {"atol": 1e-3, "rtol": 5e-2},
}.get(dtype, {"atol": 1e-5, "rtol": 1e-4})
# Value tensor: (n_kv_head, seq_len, head_dim)
if v_strides is not None:
inputs.append(
TensorSpec.from_strided_tensor((n_kv_head, seq_len, head_dim), v_strides)
# Create typed tensor specs
q_spec = TensorSpec.from_tensor(
(n_q_head, seq_len, head_dim), q_strides, dtype
)
else:
inputs.append(TensorSpec.from_tensor((n_kv_head, seq_len, head_dim)))
# Key cache: (n_kv_head, k_cache_buf_len, head_dim)
if k_cache_strides is not None:
inputs.append(
TensorSpec.from_strided_tensor(
(n_kv_head, k_cache_buf_len, head_dim), k_cache_strides
k_spec = TensorSpec.from_tensor(
(n_kv_head, seq_len, head_dim), k_strides, dtype
)
v_spec = TensorSpec.from_tensor(
(n_kv_head, seq_len, head_dim), v_strides, dtype
)
else:
inputs.append(TensorSpec.from_tensor((n_kv_head, k_cache_buf_len, head_dim)))
# Value cache: (n_kv_head, v_cache_buf_len, head_dim)
if v_cache_strides is not None:
inputs.append(
TensorSpec.from_strided_tensor(
(n_kv_head, v_cache_buf_len, head_dim), v_cache_strides
k_cache_spec = TensorSpec.from_tensor(
(n_kv_head, k_cache_buf_len, head_dim), k_cache_strides, dtype
)
v_cache_spec = TensorSpec.from_tensor(
(n_kv_head, v_cache_buf_len, head_dim), v_cache_strides, dtype
)
pos_spec = TensorSpec.from_scalar(pos)
output_spec = TensorSpec.from_tensor(
output_shape, None, dtype
) # Output is always contiguous
# Inputs list
inputs = [q_spec, k_spec, v_spec, k_cache_spec, v_cache_spec, pos_spec]
# Test Case 1: Out-of-place (return value)
test_cases.append(
TestCase(
inputs=inputs,
kwargs={},
output_spec=None,
comparison_target=None,
tolerance=tolerance,
description=f"Attention - OUT_OF_PLACE",
)
)
else:
inputs.append(TensorSpec.from_tensor((n_kv_head, v_cache_buf_len, head_dim)))
# Position (scalar)
inputs.append(TensorSpec.from_scalar(pos))
# Output tensor: (seq_len, n_q_head, head_dim)
output_shape = (seq_len, n_q_head, head_dim)
output = TensorSpec.from_tensor(output_shape)
return TestCase(operation_mode, inputs, output)
# Test Case 2: In-place with explicit output tensor (attention(q, k, v, k_cache, v_cache, pos, out=output))
if output_supports_inplace:
test_cases.append(
TestCase(
inputs=inputs,
kwargs=None,
output_spec=output_spec, # Specify the output tensor spec
comparison_target="out",
tolerance=tolerance,
description=f"Attention - INPLACE(out)",
)
)
# Parse test cases
_TEST_CASES = [parse_test_cases(data) for data in _TEST_CASES_DATA]
# Data types
_TENSOR_DTYPES = [infinicore.float16, infinicore.bfloat16, infinicore.float32]
# Tolerance
_TOLERANCE_MAP = {
infinicore.float16: {"atol": 1e-4, "rtol": 1e-2},
infinicore.float32: {"atol": 1e-5, "rtol": 1e-3},
infinicore.bfloat16: {"atol": 1e-3, "rtol": 5e-2},
}
return test_cases
class OpTest(BaseOperatorTest):
"""Attention test with simplified test case parsing"""
"""Attention operator test with simplified implementation"""
def __init__(self):
super().__init__("Attention")
def get_test_cases(self):
return _TEST_CASES
def get_tensor_dtypes(self):
return _TENSOR_DTYPES
def get_tolerance_map(self):
return _TOLERANCE_MAP
return parse_test_cases()
def torch_operator(self, q, k, v, k_cache, v_cache, pos, out=None, **kwargs):
"""PyTorch attention implementation"""
result = torch_attention(q, k, v, k_cache, v_cache, pos)
if out is not None:
out.set_(result)
out.copy_(result)
return out
else:
return result
def infinicore_operator(self, q, k, v, k_cache, v_cache, pos, out=None, **kwargs):
"""InfiniCore attention implementation"""
return infinicore.attention(q, k, v, k_cache, v_cache, pos, out=out)
......
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import torch
import infinicore
from framework.base import BaseOperatorTest, TensorSpec, TestCase
from framework.runner import GenericTestRunner
from framework.utils import is_broadcast
# ==============================================================================
# Operator-specific configuration
# ==============================================================================
# Test cases format: (shape, a_strides, b_strides, c_strides)
# Test cases format: (shape, a_strides, b_strides, c_strides)
# Missing/None strides mean a contiguous tensor; zero strides mark broadcast dims.
_TEST_CASES_DATA = [
    # Basic cases
    ((13, 4), None, None, None),
    ((13, 4), (10, 1), (10, 1), None),
    # Strided cases
    ((13, 4), None, None, (10, 1)),
    ((13, 4), (10, 1), (10, 1), (10, 1)),
    # 3D cases
    ((13, 4, 4), None, None, None),
    ((13, 4, 4), (20, 4, 1), (20, 4, 1), None),
    # Broadcast cases
    ((13, 4, 4), (4, 0, 1), (0, 4, 1), None),
    # Large tensors
    ((16, 5632), None, None, None),
    ((16, 5632), (13312, 1), (13312, 1), None),
]
# Tolerance configuration - exact match required for bitwise operations
_TOLERANCE_MAP = {
    infinicore.int8: {"atol": 0, "rtol": 0},
    infinicore.int16: {"atol": 0, "rtol": 0},
    infinicore.int32: {"atol": 0, "rtol": 0},
    infinicore.int64: {"atol": 0, "rtol": 0},
    infinicore.uint8: {"atol": 0, "rtol": 0},
    infinicore.bool: {"atol": 0, "rtol": 0},
}
# Data types to test - integer types for bitwise operations
_TENSOR_DTYPES = [
    infinicore.int8,
    infinicore.int16,
    infinicore.int32,
    infinicore.int64,
    infinicore.uint8,
    infinicore.bool,  # XOR also supports boolean tensors
]
def parse_test_cases():
    """
    Build TestCase objects for the bitwise-XOR operator.

    For every entry in _TEST_CASES_DATA and every dtype in _TENSOR_DTYPES,
    up to four cases are produced:
      1. out-of-place (result returned),
      2. explicit output tensor (out=c) when c is not broadcast,
      3. in-place on the first input (out=a) when a is not broadcast,
      4. in-place on the second input (out=b) when b is not broadcast.

    Returns:
        list[TestCase]: fully-specified test cases for execution/validation.
    """
    test_cases = []
    for data in _TEST_CASES_DATA:
        shape = data[0]
        a_strides = data[1] if len(data) > 1 else None
        b_strides = data[2] if len(data) > 2 else None
        c_strides = data[3] if len(data) > 3 else None

        # Broadcast (zero-stride) tensors cannot be written in place.
        a_supports_inplace = not is_broadcast(a_strides)
        b_supports_inplace = not is_broadcast(b_strides)
        c_supports_inplace = not is_broadcast(c_strides)

        for dtype in _TENSOR_DTYPES:
            # Bitwise ops must match exactly, so the fallback tolerance is zero.
            tolerance = _TOLERANCE_MAP.get(dtype, {"atol": 0, "rtol": 0})

            # Create typed tensor specs.
            a_spec = TensorSpec.from_tensor(shape, a_strides, dtype)
            b_spec = TensorSpec.from_tensor(shape, b_strides, dtype)
            c_spec = TensorSpec.from_tensor(shape, c_strides, dtype)

            # Test Case 1: out-of-place (return value).
            # NOTE: plain strings instead of placeholder-free f-strings (F541).
            test_cases.append(
                TestCase(
                    inputs=[a_spec, b_spec],
                    kwargs={},
                    output_spec=None,
                    comparison_target=None,
                    tolerance=tolerance,
                    description="BitwiseXor - OUT_OF_PLACE",
                )
            )

            # Test Case 2: explicit output tensor, bitwise_xor(a, b, out=c).
            if c_supports_inplace:
                test_cases.append(
                    TestCase(
                        inputs=[a_spec, b_spec],
                        kwargs=None,
                        output_spec=c_spec,  # specify the output tensor spec
                        comparison_target="out",
                        tolerance=tolerance,
                        description="BitwiseXor - INPLACE(out)",
                    )
                )

            # Test Case 3: in-place on the first input, bitwise_xor(a, b, out=a).
            if a_supports_inplace:
                test_cases.append(
                    TestCase(
                        inputs=[a_spec, b_spec],
                        kwargs={"out": 0},  # index of the aliased input
                        output_spec=None,
                        comparison_target=0,  # compare first input
                        tolerance=tolerance,
                        description="BitwiseXor - INPLACE(a)",
                    )
                )

            # Test Case 4: in-place on the second input, bitwise_xor(a, b, out=b).
            if b_supports_inplace:
                test_cases.append(
                    TestCase(
                        inputs=[a_spec, b_spec],
                        kwargs={"out": 1},  # index of the aliased input
                        output_spec=None,
                        comparison_target=1,  # compare second input
                        tolerance=tolerance,
                        description="BitwiseXor - INPLACE(b)",
                    )
                )

    return test_cases
class OpTest(BaseOperatorTest):
    """Bitwise XOR operator test with simplified implementation"""
    def __init__(self):
        # Operator name passed to the framework base class.
        super().__init__("BitwiseXor")
    def get_test_cases(self):
        # Cases are regenerated on each call by parse_test_cases().
        return parse_test_cases()
    def torch_operator(self, *args, **kwargs):
        """PyTorch bitwise_xor implementation"""
        return torch.bitwise_xor(*args, **kwargs)
    # NOTE(review): the InfiniCore operator is left disabled below —
    # presumably infinicore.bitwise_xor is not available yet; confirm
    # before enabling.
    # def infinicore_operator(self, *args, **kwargs):
    #     """InfiniCore bitwise_xor implementation"""
    #     return infinicore.bitwise_xor(*args, **kwargs)
def main():
    """Build the generic runner for this operator test and execute it."""
    GenericTestRunner(OpTest).run_and_exit()


if __name__ == "__main__":
    main()
......@@ -7,6 +7,7 @@ import torch
import infinicore
from framework.base import BaseOperatorTest, TensorSpec, TestCase
from framework.runner import GenericTestRunner
from framework.utils import is_broadcast
# ==============================================================================
# Operator-specific configuration
......@@ -16,52 +17,17 @@ from framework.runner import GenericTestRunner
# Causal softmax is a single-input function that applies causal masking before softmax
_TEST_CASES_DATA = [
# Basic 2D causal softmax
(TestCase.BOTH, (3, 3), None, None),
(TestCase.BOTH, (32, 512), None, None),
((3, 3), None, None),
((32, 512), None, None),
# Strided tensors
(TestCase.BOTH, (32, 512), (1024, 1), (1024, 1)),
((32, 512), (1024, 1), (1024, 1)),
# 3D causal softmax
(TestCase.BOTH, (32, 5, 5), None, None),
(TestCase.BOTH, (32, 20, 512), None, None),
(TestCase.BOTH, (32, 20, 512), (20480, 512, 1), None),
(TestCase.BOTH, (28, 15, 15), None, None),
((32, 5, 5), None, None),
((32, 20, 512), None, None),
((32, 20, 512), (20480, 512, 1), None),
((28, 15, 15), None, None),
]
def parse_test_cases(data):
    """
    Build one TestCase from a causal_softmax test-case tuple:
    (operation_mode, shape, input_strides, output_strides)

    Trailing stride entries may be omitted; missing strides mean a
    contiguous tensor.
    """
    operation_mode, shape = data[0], data[1]
    input_strides = data[2] if len(data) > 2 else None
    output_strides = data[3] if len(data) > 3 else None

    def make_spec(strides):
        # Strided spec when strides are supplied, plain contiguous spec otherwise.
        if strides is not None:
            return TensorSpec.from_strided_tensor(shape, strides)
        return TensorSpec.from_tensor(shape)

    # Single input tensor and one output tensor of the same shape.
    inputs = [make_spec(input_strides)]
    output = make_spec(output_strides)
    return TestCase(operation_mode, inputs, output)
# Parse test cases
_TEST_CASES = [parse_test_cases(data) for data in _TEST_CASES_DATA]
# Data types
_TENSOR_DTYPES = [infinicore.float16, infinicore.bfloat16, infinicore.float32]
# Tolerance
_TOLERANCE_MAP = {
infinicore.float16: {"atol": 1e-3, "rtol": 1e-2},
......@@ -69,6 +35,74 @@ _TOLERANCE_MAP = {
infinicore.bfloat16: {"atol": 5e-3, "rtol": 5e-2},
}
# Data types
_TENSOR_DTYPES = [infinicore.float16, infinicore.bfloat16, infinicore.float32]
def parse_test_cases():
    """
    Parse causal_softmax test case data according to format:
    (shape, input_strides, output_strides)

    For every entry in _TEST_CASES_DATA and every dtype in _TENSOR_DTYPES,
    up to three cases are produced:
      1. out-of-place (result returned),
      2. explicit output tensor (out=output) when output is not broadcast,
      3. in-place on the input (out=input) when input is not broadcast.

    Returns:
        list[TestCase]: fully-specified test cases for execution/validation.
    """
    test_cases = []
    for data in _TEST_CASES_DATA:
        shape = data[0]
        input_strides = data[1] if len(data) > 1 else None
        output_strides = data[2] if len(data) > 2 else None

        # Broadcast (zero-stride) tensors cannot be written in place.
        input_supports_inplace = not is_broadcast(input_strides)
        output_supports_inplace = not is_broadcast(output_strides)

        for dtype in _TENSOR_DTYPES:
            tolerance = _TOLERANCE_MAP.get(dtype, {"atol": 0, "rtol": 1e-3})

            # Create typed tensor specs.
            input_spec = TensorSpec.from_tensor(shape, input_strides, dtype)
            output_spec = TensorSpec.from_tensor(shape, output_strides, dtype)

            # Test Case 1: out-of-place (return value).
            # NOTE: plain strings instead of placeholder-free f-strings (F541).
            test_cases.append(
                TestCase(
                    inputs=[input_spec],
                    kwargs={},
                    output_spec=None,
                    comparison_target=None,
                    tolerance=tolerance,
                    description="Causal Softmax - OUT_OF_PLACE",
                )
            )

            # Test Case 2: explicit output tensor, causal_softmax(input, out=output).
            if output_supports_inplace:
                test_cases.append(
                    TestCase(
                        inputs=[input_spec],
                        kwargs=None,
                        output_spec=output_spec,  # specify the output tensor spec
                        comparison_target="out",
                        tolerance=tolerance,
                        description="Causal Softmax - INPLACE(out)",
                    )
                )

            # Test Case 3: in-place on the input, causal_softmax(input, out=input).
            if input_supports_inplace:
                test_cases.append(
                    TestCase(
                        inputs=[input_spec],
                        kwargs={"out": 0},  # index of the aliased input
                        output_spec=None,
                        comparison_target=0,  # compare first input
                        tolerance=tolerance,
                        description="Causal Softmax - INPLACE(input)",
                    )
                )

    return test_cases
class OpTest(BaseOperatorTest):
"""CausalSoftmax test with simplified test case parsing"""
......@@ -77,15 +111,9 @@ class OpTest(BaseOperatorTest):
super().__init__("CausalSoftmax")
def get_test_cases(self):
return _TEST_CASES
return parse_test_cases()
def get_tensor_dtypes(self):
return _TENSOR_DTYPES
def get_tolerance_map(self):
return _TOLERANCE_MAP
def torch_operator(self, input, out=None, **kwargs):
def torch_causal_softmax(self, input, out=None, **kwargs):
# Causal softmax implementation: apply causal mask then softmax
dtype = input.dtype
......@@ -100,8 +128,11 @@ class OpTest(BaseOperatorTest):
return out
return result
def infinicore_operator(self, input, out=None, **kwargs):
return infinicore.causal_softmax(input, out=out)
def torch_operator(self, *args, **kwargs):
return self.torch_causal_softmax(*args, **kwargs)
def infinicore_operator(self, *args, **kwargs):
return infinicore.causal_softmax(*args, **kwargs)
def main():
......
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import torch
import infinicore
from framework.base import BaseOperatorTest, TensorSpec, TestCase
from framework.runner import GenericTestRunner
from framework.utils import is_broadcast
# ==============================================================================
# Operator-specific configuration
# ==============================================================================
# Test cases format: (shape, input_strides, alpha)
# - shape: shape of the ELU input tensor
# - input_strides: explicit strides (None = contiguous; a 0 stride marks a
#   broadcast view, which disables the in-place test variant)
# - alpha: ELU alpha parameter (None = operator default)
_TEST_CASES_DATA = [
    # Basic ELU tests without alpha (default alpha=1.0)
    ((13, 4), None, None),
    ((13, 4), (10, 1), None),
    ((13, 4), (0, 1), None),
    # 3D tensor tests
    ((13, 4, 4), None, None),
    ((13, 4, 4), (20, 4, 1), None),
    ((13, 4, 4), (4, 0, 1), None),
    # Large tensor tests
    ((16, 5632), None, None),
    ((16, 5632), (13312, 1), None),
    # ELU with different alpha values
    ((8, 4), None, 0.5),
    ((8, 4), (10, 1), 0.5),
    ((8, 4), None, 1.5),
    ((8, 4), (10, 1), 1.5),
    ((16, 8), None, 2.0),
    ((16, 8), (20, 1), 2.0),
    ((16, 8), None, 0.3),
    ((16, 8), (20, 1), 0.3),
    ((32, 16), None, 1.0),
    ((32, 16), (40, 1), 1.0),
    ((32, 16), None, 1.8),
    ((32, 16), (40, 1), 1.8),
]
# Tolerance configuration: per-dtype absolute/relative tolerances used when
# comparing InfiniCore results against the PyTorch reference.
_TOLERANCE_MAP = {
    infinicore.float16: {"atol": 1e-3, "rtol": 1e-2},
    infinicore.float32: {"atol": 1e-5, "rtol": 1e-4},
    infinicore.bfloat16: {"atol": 1e-2, "rtol": 5e-2},
}
# Data types to test: every test case row is expanded once per dtype.
_TENSOR_DTYPES = [infinicore.float16, infinicore.bfloat16, infinicore.float32]
def parse_test_cases():
    """
    Expand the ELU test-case table into concrete TestCase objects.

    Each row of ``_TEST_CASES_DATA`` has the layout
    ``(shape, input_strides, alpha)``; missing trailing entries default to
    ``None``.  Every row is exercised out-of-place for each dtype, and
    additionally in-place (via PyTorch's ``inplace=True`` keyword) whenever
    the input tensor is not a broadcast view.

    Returns:
        list[TestCase]: the expanded test cases.
    """
    cases = []
    for row in _TEST_CASES_DATA:
        # Pad short rows so shape/strides/alpha always unpack cleanly.
        padded = tuple(row) + (None, None)
        shape, input_strides, alpha = padded[0], padded[1], padded[2]

        # Zero-stride (broadcast) inputs cannot be mutated in place.
        inplace_ok = not is_broadcast(input_strides)

        for dtype in _TENSOR_DTYPES:
            tol = _TOLERANCE_MAP.get(dtype, {"atol": 1e-5, "rtol": 1e-4})
            spec = TensorSpec.from_tensor(shape, input_strides, dtype)

            # Human-readable label for the runner's report.
            label_bits = ["ELU"]
            if alpha is not None:
                label_bits.append(f"alpha={alpha}")
            if input_strides is not None:
                label_bits.append(f"input_strides={input_strides}")
            label = " - ".join(label_bits)

            # Alpha is only forwarded when the row specifies one.
            alpha_kwargs = {} if alpha is None else {"alpha": alpha}

            # Variant 1: out-of-place — the operator returns a fresh tensor.
            cases.append(
                TestCase(
                    inputs=[spec],
                    kwargs=dict(alpha_kwargs),
                    output_spec=None,
                    comparison_target=None,
                    tolerance=tol,
                    description=f"{label} - OUT_OF_PLACE",
                )
            )

            # Variant 2: in-place — torch mutates the input when inplace=True.
            if inplace_ok:
                cases.append(
                    TestCase(
                        inputs=[spec],
                        kwargs={"inplace": True, **alpha_kwargs},
                        output_spec=None,
                        comparison_target=0,  # compare the mutated first input
                        tolerance=tol,
                        description=f"{label} - INPLACE",
                    )
                )
    return cases
class OpTest(BaseOperatorTest):
    """ELU operator test backed by the PyTorch reference implementation."""

    def __init__(self):
        super().__init__("ELU")

    def get_test_cases(self):
        # Test cases are derived on demand from the module-level table.
        return parse_test_cases()

    def torch_operator(self, *args, **kwargs):
        """Reference ELU via torch.nn.functional (supports alpha/inplace)."""
        return torch.nn.functional.elu(*args, **kwargs)

    def infinicore_operator(self, x, alpha=1.0, out=None, **kwargs):
        """InfiniCore ELU implementation.

        NOTE(review): returns None — presumably the framework treats a None
        result as "operator not implemented" and skips the comparison;
        confirm against the runner before relying on this.
        """
        return None
def main():
    """Entry point: run the ELU operator test suite and exit with its status."""
    GenericTestRunner(OpTest).run_and_exit()


if __name__ == "__main__":
    main()
......@@ -7,40 +7,53 @@ import torch
import infinicore
from framework.base import BaseOperatorTest, TensorSpec, TestCase
from framework.runner import GenericTestRunner
from framework.utils import is_broadcast
# ==============================================================================
# Operator-specific configuration
# ==============================================================================
# Test cases format: (operation_mode, nbatch, m, n, k, a_strides, b_strides, c_strides)
# Test cases format: (nbatch, m, n, k, a_strides, b_strides, c_strides)
# If nbatch is None: a_shape=(m, k), b_shape=(k, n), c_shape=(m, n)
# If nbatch is provided: a_shape=(nbatch, m, k), b_shape=(nbatch, k, n), c_shape=(nbatch, m, n)
_TEST_CASES_DATA = [
# Basic 2D matmul
(TestCase.BOTH, None, 2, 4, 3, None, None, None),
(TestCase.BOTH, None, 128, 64, 256, None, None, None),
(None, 2, 4, 3, None, None, None),
(None, 128, 64, 256, None, None, None),
# Batched matmul
(TestCase.BOTH, 2, 4, 2048, 2048, None, None, None),
(TestCase.BOTH, 4, 48, 6, 64, None, None, None),
(2, 4, 2048, 2048, None, None, None),
(4, 48, 6, 64, None, None, None),
# Strided tensors
(TestCase.BOTH, None, 1, 2048, 2048, (4096, 1), (4096, 1), (4096, 1)),
(TestCase.BOTH, None, 6, 2560, 2048, (2048, 1), (1, 2048), (2560, 1)),
(None, 1, 2048, 2048, (4096, 1), (4096, 1), (4096, 1)),
(None, 6, 2560, 2048, (2048, 1), (1, 2048), (2560, 1)),
# Mixed cases
(TestCase.BOTH, 8, 16, 32, 16, None, None, None),
(8, 16, 32, 16, None, None, None),
]
# Tolerance configuration
_TOLERANCE_MAP = {
infinicore.float16: {"atol": 0, "rtol": 1e-2},
infinicore.float32: {"atol": 0, "rtol": 1e-3},
infinicore.bfloat16: {"atol": 0, "rtol": 5e-2},
}
# Data types to test
_TENSOR_DTYPES = [infinicore.float16, infinicore.bfloat16, infinicore.float32]
def parse_test_cases(data):
def parse_test_cases():
"""
Parse matmul test case data according to format:
(operation_mode, nbatch, m, n, k, a_strides, b_strides, c_strides)
Parse test case data and return list of TestCase objects for matmul operation.
Each test case contains all necessary information for execution and validation.
"""
operation_mode = data[0]
nbatch = data[1]
m, n, k = data[2], data[3], data[4]
a_strides = data[5] if len(data) > 5 else None
b_strides = data[6] if len(data) > 6 else None
c_strides = data[7] if len(data) > 7 else None
test_cases = []
for data in _TEST_CASES_DATA:
nbatch = data[0]
m, n, k = data[1], data[2], data[3]
a_strides = data[4] if len(data) > 4 else None
b_strides = data[5] if len(data) > 5 else None
c_strides = data[6] if len(data) > 6 else None
# Determine shapes based on batch dimension
if nbatch is None:
......@@ -52,64 +65,62 @@ def parse_test_cases(data):
b_shape = (nbatch, k, n)
c_shape = (nbatch, m, n)
# Create input specifications
inputs = []
# Tensor a
if a_strides is not None:
inputs.append(TensorSpec.from_strided_tensor(a_shape, a_strides))
else:
inputs.append(TensorSpec.from_tensor(a_shape))
# Tensor b
if b_strides is not None:
inputs.append(TensorSpec.from_strided_tensor(b_shape, b_strides))
else:
inputs.append(TensorSpec.from_tensor(b_shape))
# Output tensor
if c_strides is not None:
output = TensorSpec.from_strided_tensor(c_shape, c_strides)
else:
output = TensorSpec.from_tensor(c_shape)
return TestCase(operation_mode, inputs, output)
# Parse test cases
_TEST_CASES = [parse_test_cases(data) for data in _TEST_CASES_DATA]
# Data types
_TENSOR_DTYPES = [infinicore.float16, infinicore.bfloat16, infinicore.float32]
# Tolerance
_TOLERANCE_MAP = {
infinicore.float16: {"atol": 0, "rtol": 1e-2},
infinicore.float32: {"atol": 0, "rtol": 1e-3},
infinicore.bfloat16: {"atol": 0, "rtol": 5e-2},
}
# Check if tensors support in-place operations
c_supports_inplace = not is_broadcast(c_strides)
# Generate test cases for all data types
for dtype in _TENSOR_DTYPES:
tolerance = _TOLERANCE_MAP.get(dtype, {"atol": 0, "rtol": 1e-3})
# Create typed tensor specs
a_spec = TensorSpec.from_tensor(a_shape, a_strides, dtype)
b_spec = TensorSpec.from_tensor(b_shape, b_strides, dtype)
c_spec = TensorSpec.from_tensor(c_shape, c_strides, dtype)
# Test Case 1: Out-of-place (return value)
test_cases.append(
TestCase(
inputs=[a_spec, b_spec],
kwargs={},
output_spec=None,
comparison_target=None,
tolerance=tolerance,
description=f"Matmul - OUT_OF_PLACE",
)
)
# Test Case 2: In-place with explicit output tensor (matmul(a, b, out=c))
if c_supports_inplace:
test_cases.append(
TestCase(
inputs=[a_spec, b_spec],
kwargs=None,
output_spec=c_spec, # Specify the output tensor spec
comparison_target="out",
tolerance=tolerance,
description=f"Matmul - INPLACE(out)",
)
)
return test_cases
class OpTest(BaseOperatorTest):
"""Matmul test with simplified test case parsing"""
"""Matmul operator test with simplified implementation"""
def __init__(self):
super().__init__("Matmul")
def get_test_cases(self):
return _TEST_CASES
def get_tensor_dtypes(self):
return _TENSOR_DTYPES
def get_tolerance_map(self):
return _TOLERANCE_MAP
return parse_test_cases()
def torch_operator(self, a, b, out=None, **kwargs):
return torch.matmul(a, b, out=out)
def torch_operator(self, *args, **kwargs):
"""PyTorch matmul implementation"""
return torch.matmul(*args, **kwargs)
def infinicore_operator(self, a, b, out=None, **kwargs):
return infinicore.matmul(a, b, out=out)
def infinicore_operator(self, *args, **kwargs):
"""InfiniCore matmul implementation"""
return infinicore.matmul(*args, **kwargs)
def main():
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment