Commit 559e5fe2 authored by wooway777's avatar wooway777 Committed by MaYuhang
Browse files

issue/573 - supporting multiple outputs in test framework

parent 10cfd2b0
......@@ -181,7 +181,10 @@ pip install . -e
#### 运行 InfiniCore Python算子接口测试
```bash
python test/infinicore/run.py --verbose --bench [--cpu | --nvidia | --cambricon | --ascend | --iluvatar | --metax | --moore | --kunlun]
# 测试单算子
python test/infinicore/ops/[operator].py [--cpu | --nvidia | --cambricon | --ascend | --iluvatar | --metax | --moore | --kunlun | --Hygon]
# 测试全部算子
python test/infinicore/run.py [--cpu | --nvidia | --cambricon | --ascend | --iluvatar | --metax | --moore | --kunlun]
```
使用 -h 查看更多参数。
......
......@@ -9,14 +9,10 @@ from .utils import (
profile_operation,
rearrange_tensor,
convert_infinicore_to_torch,
get_operator_help_info,
print_operator_testing_tips,
)
from .config import (
get_args,
get_hardware_args_group,
get_hardware_help_text,
get_supported_hardware_platforms,
get_test_devices,
)
from .devices import InfiniDeviceEnum, InfiniDeviceNames, torch_device_map
......@@ -41,13 +37,9 @@ __all__ = [
"debug",
"get_args",
"get_hardware_args_group",
"get_hardware_help_text",
"get_operator_help_info",
"get_supported_hardware_platforms",
"get_test_devices",
"get_tolerance",
"infinicore_tensor_from_torch",
"print_operator_testing_tips",
"profile_operation",
"rearrange_tensor",
# Utility functions
......
......@@ -27,6 +27,8 @@ class TestCase:
comparison_target=None,
description="",
tolerance=None,
output_count=1,
output_specs=None,
):
"""
Initialize a test case with complete configuration
......@@ -34,10 +36,12 @@ class TestCase:
Args:
inputs: List of TensorSpec objects or scalars
kwargs: Additional keyword arguments for the operator
output_spec: TensorSpec for output tensor (for in-place operations)
output_spec: TensorSpec for output tensor (for single output operations)
output_specs: List of TensorSpec for multiple output tensors
comparison_target: Target for comparison ('out', index, or None for return value)
description: Test case description
tolerance: Tolerance settings for this test case {'atol': float, 'rtol': float}
output_count: Number of outputs (default: 1)
"""
self.inputs = []
......@@ -52,9 +56,26 @@ class TestCase:
self.kwargs = kwargs or {}
self.output_spec = output_spec
self.output_specs = output_specs
self.comparison_target = comparison_target
self.description = description
self.tolerance = tolerance or {"atol": 1e-5, "rtol": 1e-3}
self.output_count = output_count
# Validate output configuration
if self.output_count == 1:
if self.output_specs is not None:
raise ValueError("output_specs cannot be used when output_count=1")
else:
if self.output_spec is not None:
raise ValueError("output_spec cannot be used when output_count>1")
if (
self.output_specs is not None
and len(self.output_specs) != self.output_count
):
raise ValueError(
f"output_specs count ({len(self.output_specs)}) must match output_count ({self.output_count})"
)
def get_tensor_input_count(self):
"""Count the number of tensor inputs (excluding scalars)"""
......@@ -92,34 +113,56 @@ class TestCase:
base_str += f"{self.description}"
base_str += f" - inputs=[{', '.join(input_strs)}]"
if self.kwargs or self.output_spec:
if self.kwargs or self.output_spec or self.output_specs:
kwargs_strs = []
for key, value in self.kwargs.items():
if key == "out" and isinstance(value, int):
kwargs_strs.append(f"{key}={value}")
else:
kwargs_strs.append(f"{key}={value}")
output_spec = self.output_spec
if output_spec and isinstance(output_spec, TensorSpec):
dtype_str = f", {output_spec.dtype}" if output_spec.dtype else ""
# Handle output specifications
if self.output_count == 1 and self.output_spec:
dtype_str = (
f", {self.output_spec.dtype}" if self.output_spec.dtype else ""
)
init_str = (
f", init={output_spec.init_mode}"
if output_spec.init_mode != TensorInitializer.RANDOM
f", init={self.output_spec.init_mode}"
if self.output_spec.init_mode != TensorInitializer.RANDOM
else ""
)
if hasattr(output_spec, "strides") and output_spec.strides:
strides_str = f", strides={output_spec.strides}"
if hasattr(self.output_spec, "strides") and self.output_spec.strides:
strides_str = f", strides={self.output_spec.strides}"
kwargs_strs.append(
f"out=tensor{output_spec.shape}{strides_str}{dtype_str}{init_str}"
f"out=tensor{self.output_spec.shape}{strides_str}{dtype_str}{init_str}"
)
else:
kwargs_strs.append(
f"out=tensor{output_spec.shape}{dtype_str}{init_str}"
f"out=tensor{self.output_spec.shape}{dtype_str}{init_str}"
)
elif self.output_count > 1 and self.output_specs:
output_strs = []
for i, spec in enumerate(self.output_specs):
dtype_str = f", {spec.dtype}" if spec.dtype else ""
init_str = (
f", init={spec.init_mode}"
if spec.init_mode != TensorInitializer.RANDOM
else ""
)
if hasattr(spec, "strides") and spec.strides:
strides_str = f", strides={spec.strides}"
output_strs.append(
f"out_{i}=tensor{spec.shape}{strides_str}{dtype_str}{init_str}"
)
else:
output_strs.append(
f"out_{i}=tensor{spec.shape}{dtype_str}{init_str}"
)
kwargs_strs.extend(output_strs)
base_str += f", kwargs={{{', '.join(kwargs_strs)}}}"
base_str += ")"
base_str += f", outputs={self.output_count})"
return base_str
......@@ -209,10 +252,20 @@ class BaseOperatorTest(ABC):
else:
inputs.append(input_spec)
# Prepare output tensor if specified in output_spec
if test_case.output_spec is not None:
output_tensor = test_case.output_spec.create_torch_tensor(device)
kwargs["out"] = output_tensor
# Prepare output tensors based on output_count
if test_case.output_count == 1:
# Single output case
if test_case.output_spec is not None:
output_tensor = test_case.output_spec.create_torch_tensor(device)
kwargs["out"] = output_tensor
else:
# Multiple outputs case
if test_case.output_specs is not None:
# Create output tuple for in-place multiple outputs
output_tensors = tuple(
spec.create_torch_tensor(device) for spec in test_case.output_specs
)
kwargs["out"] = output_tensors
# Handle integer indices for in-place operations
if "out" in kwargs and isinstance(kwargs["out"], int):
......@@ -264,13 +317,24 @@ class BaseOperatorTest(ABC):
# Handle infinicore output
infini_kwargs = kwargs.copy()
if "out" in infini_kwargs and isinstance(infini_kwargs["out"], torch.Tensor):
if isinstance(comparison_target, int):
infini_kwargs["out"] = infini_inputs[comparison_target]
else:
cloned_out = infini_kwargs["out"].clone().detach()
torch_input_clones.append(cloned_out)
infini_kwargs["out"] = infinicore_tensor_from_torch(cloned_out)
if "out" in infini_kwargs:
out_value = infini_kwargs["out"]
if isinstance(out_value, torch.Tensor):
# Single tensor output
if isinstance(comparison_target, int):
infini_kwargs["out"] = infini_inputs[comparison_target]
else:
cloned_out = out_value.clone().detach()
torch_input_clones.append(cloned_out)
infini_kwargs["out"] = infinicore_tensor_from_torch(cloned_out)
elif isinstance(out_value, (tuple, list)):
# Multiple tensor outputs
infini_outputs = []
for tensor in out_value:
cloned_tensor = tensor.clone().detach()
torch_input_clones.append(cloned_tensor)
infini_outputs.append(infinicore_tensor_from_torch(cloned_tensor))
infini_kwargs["out"] = tuple(infini_outputs)
# Check operator implementations
torch_implemented = True
......@@ -307,86 +371,191 @@ class BaseOperatorTest(ABC):
)
if config.bench:
if torch_implemented:
self._run_benchmarking(
config,
device_str,
torch_implemented,
infini_implemented,
inputs,
kwargs,
infini_inputs,
infini_kwargs,
test_case.output_count,
comparison_target,
)
return
def torch_op():
return self.torch_operator(*inputs, **kwargs)
# ==========================================================================
# MULTIPLE OUTPUTS COMPARISON LOGIC
# ==========================================================================
if test_case.output_count > 1:
# Handle multiple outputs comparison
profile_operation(
"PyTorch ",
torch_op,
device_str,
config.num_prerun,
config.num_iterations,
)
if infini_implemented:
# Determine what to compare based on comparison_target
if comparison_target is None:
# Compare return values (out-of-place multiple outputs)
torch_comparison = torch_result
infini_comparison = infini_result
elif comparison_target == "out":
# Compare output tuple from kwargs (explicit multiple outputs)
torch_comparison = kwargs.get("out")
infini_comparison = infini_kwargs.get("out")
else:
raise ValueError(
f"Invalid comparison target for multiple outputs: {comparison_target}"
)
def infini_op():
return self.infinicore_operator(*infini_inputs, **infini_kwargs)
# Validate that we have multiple outputs to compare
if not isinstance(torch_comparison, (tuple, list)) or not isinstance(
infini_comparison, (tuple, list)
):
raise ValueError(
f"Multiple outputs expected but got single result: "
f"torch={type(torch_comparison)}, infinicore={type(infini_comparison)}"
)
profile_operation(
"InfiniCore",
infini_op,
device_str,
config.num_prerun,
config.num_iterations,
)
return
if len(torch_comparison) != len(infini_comparison):
raise ValueError(
f"Output count mismatch: torch={len(torch_comparison)}, infinicore={len(infini_comparison)}"
)
if comparison_target is None:
# Compare return values (out-of-place)
torch_comparison = torch_result
infini_comparison = infini_result
elif comparison_target == "out":
# Compare output tensor from kwargs (explicit output)
torch_comparison = kwargs.get("out")
infini_comparison = infini_kwargs.get("out")
elif isinstance(comparison_target, int):
# Compare specific input tensor (in-place operation on input)
# For in-place operations, we compare the modified input tensor
if 0 <= comparison_target < len(inputs):
torch_comparison = inputs[comparison_target]
infini_comparison = infini_inputs[comparison_target]
else:
if len(torch_comparison) != test_case.output_count:
raise ValueError(
f"Invalid comparison target index: {comparison_target}"
f"Output count mismatch: expected {test_case.output_count}, got {len(torch_comparison)}"
)
# Compare each output pair individually
all_valid = True
for i, (torch_out, infini_out) in enumerate(
zip(torch_comparison, infini_comparison)
):
atol = test_case.tolerance.get("atol", 1e-5)
rtol = test_case.tolerance.get("rtol", 1e-3)
compare_fn = create_test_comparator(
config, atol, rtol, f"{test_case.description} - output_{i}"
)
is_valid = compare_fn(infini_out, torch_out)
if not is_valid:
print(f"❌ Output {i} comparison failed")
all_valid = False
else:
print(f"✅ Output {i} comparison passed")
assert all_valid, f"Multiple outputs comparison failed for {test_case}"
# ==========================================================================
# SINGLE OUTPUT COMPARISON LOGIC
# ==========================================================================
else:
raise ValueError(f"Invalid comparison target: {comparison_target}")
# Determine comparison targets for single output
if comparison_target is None:
# Compare return values (out-of-place)
torch_comparison = torch_result
infini_comparison = infini_result
elif comparison_target == "out":
# Compare output tensor from kwargs (explicit output)
torch_comparison = kwargs.get("out")
infini_comparison = infini_kwargs.get("out")
elif isinstance(comparison_target, int):
# Compare specific input tensor (in-place operation on input)
if 0 <= comparison_target < len(inputs):
torch_comparison = inputs[comparison_target]
infini_comparison = infini_inputs[comparison_target]
else:
raise ValueError(
f"Invalid comparison target index: {comparison_target}"
)
else:
raise ValueError(f"Invalid comparison target: {comparison_target}")
# Validate comparison targets
if torch_comparison is None or infini_comparison is None:
raise ValueError("Comparison targets cannot be None")
# Validate comparison targets
if torch_comparison is None or infini_comparison is None:
raise ValueError("Comparison targets cannot be None")
# Perform comparison
atol = test_case.tolerance.get("atol", 1e-5)
rtol = test_case.tolerance.get("rtol", 1e-3)
# Perform comparison
atol = test_case.tolerance.get("atol", 1e-5)
rtol = test_case.tolerance.get("rtol", 1e-3)
compare_fn = create_test_comparator(config, atol, rtol, test_case.description)
compare_fn = create_test_comparator(
config, atol, rtol, test_case.description
)
is_valid = compare_fn(infini_comparison, torch_comparison)
assert is_valid, f"Result comparison failed for {test_case}"
is_valid = compare_fn(infini_comparison, torch_comparison)
assert is_valid, f"Result comparison failed for {test_case}"
# Benchmarking
# ==========================================================================
# UNIFIED BENCHMARKING LOGIC
# ==========================================================================
if config.bench:
if comparison_target is None:
# Out-of-place benchmarking
self._run_benchmarking(
config,
device_str,
True,
True,
inputs,
kwargs,
infini_inputs,
infini_kwargs,
test_case.output_count,
comparison_target,
)
def _run_benchmarking(
self,
config,
device_str,
torch_implemented,
infini_implemented,
inputs,
kwargs,
infini_inputs,
infini_kwargs,
output_count,
comparison_target,
):
"""
Unified benchmarking logic
"""
if torch_implemented:
if output_count > 1:
# For multiple outputs, just call the operator
def torch_op():
return self.torch_operator(*inputs, **kwargs)
else:
if comparison_target is None:
# Out-of-place benchmarking
def torch_op():
return self.torch_operator(*inputs, **kwargs)
else:
# In-place benchmarking
def torch_op():
self.torch_operator(*inputs, **kwargs)
return (
kwargs.get("out")
if "out" in kwargs
else inputs[comparison_target]
)
profile_operation(
"PyTorch ",
torch_op,
device_str,
config.num_prerun,
config.num_iterations,
)
if infini_implemented:
if comparison_target is None:
# Out-of-place benchmarking
def infini_op():
return self.infinicore_operator(*infini_inputs, **infini_kwargs)
else:
# In-place benchmarking
def torch_op():
self.torch_operator(*inputs, **kwargs)
return (
kwargs.get("out")
if "out" in kwargs
else inputs[comparison_target]
)
def infini_op():
self.infinicore_operator(*infini_inputs, **infini_kwargs)
return (
......@@ -395,13 +564,6 @@ class BaseOperatorTest(ABC):
else infini_inputs[comparison_target]
)
profile_operation(
"PyTorch ",
torch_op,
device_str,
config.num_prerun,
config.num_iterations,
)
profile_operation(
"InfiniCore",
infini_op,
......
......@@ -27,24 +27,6 @@ def get_supported_hardware_platforms():
]
def get_hardware_help_text():
    """
    Get formatted help text for hardware platforms.

    Returns:
        str: Formatted help text for argument parsers — a header line
        followed by one aligned line per supported platform.
    """
    header = "Supported Hardware Platforms:"
    # Remove leading dashes for cleaner display, then left-pad to align columns.
    rows = [
        f"  - {flag.lstrip('-').upper():<10} {description}"
        for flag, description in get_supported_hardware_platforms()
    ]
    return "\n".join([header, *rows])
def get_hardware_args_group(parser):
"""
Add hardware platform arguments to an argument parser.
......@@ -82,7 +64,6 @@ Examples:
# Run performance profiling with custom iterations
python test_operator.py --nvidia --bench --num_prerun 50 --num_iterations 5000
{get_hardware_help_text()}
""",
)
......
import torch
import time
import infinicore
import numpy as np
from .datatypes import to_infinicore_dtype, to_torch_dtype
def get_operator_help_info() -> str:
    """
    Get help information for operator testing framework

    Returns:
        str: Comprehensive help information about the testing framework
    """
    # Static, purely informational text; returned verbatim for display by callers.
    return """
InfiniCore Operator Testing Framework
This framework provides comprehensive testing for InfiniCore operators across
multiple hardware platforms with the following features:
Key Features:
-------------
1. Multi-platform Support: CPU, NVIDIA, Cambricon, Ascend, Iluvatar, Metax,
Moore, Kunlun, and Hygon devices
2. Flexible Testing: Out-of-place and in-place operations
3. Performance Benchmarking: Accurate timing with warm-up runs
4. Debug Capabilities: Detailed tensor comparison and discrepancy analysis
5. Tolerance Control: Configurable absolute and relative tolerances per data type
Usage Patterns:
--------------
Basic testing:
python test_operator.py --cpu --nvidia
With benchmarking:
python test_operator.py --nvidia --bench --num_iterations 1000
Debug mode:
python test_operator.py --cpu --debug
Multiple devices:
python test_operator.py --cpu --nvidia
Data Type Support:
-----------------
- Floating point: float16, bfloat16, float32
- Integer: int8, int16, int32, int64, uint8
- Boolean: bool
Tensor Initialization Modes:
---------------------------
- RANDOM: Random values using torch.rand
- ZEROS: All zeros using torch.zeros
- ONES: All ones using torch.ones
- RANDINT: Random integers using torch.randint
- MANUAL: Use pre-existing tensor with shape/strides validation
- BINARY: Use pre-existing tensor with shape validation only
- FROM_FILE: Load tensor data from file
For detailed examples and advanced usage, refer to the individual operator
test files and the framework documentation.
"""
def print_operator_testing_tips() -> None:
    """Print useful tips for operator testing to stdout."""
    # Static guidance text; printed verbatim (no formatting or parameters).
    tips = """
Operator Testing Tips:
---------------------
1. Start with CPU tests for basic functionality validation
2. Use --debug flag to identify precision issues in early development
3. Benchmark with sufficient iterations (--num_iterations) for stable results
4. Set appropriate tolerances for different data types (float16 needs higher tolerance)
5. Test both contiguous and non-contiguous tensor layouts
6. Validate in-place operations separately from out-of-place operations
7. Check edge cases: empty tensors, broadcasting, different tensor shapes
Common Tolerance Settings:
-------------------------
- float32: atol=1e-5, rtol=1e-3
- float16: atol=1e-3, rtol=1e-2
- bfloat16: atol=1e-2, rtol=1e-1
- Integer types: exact equality (atol=0, rtol=0)
"""
    print(tips)
def synchronize_device(torch_device):
"""Device synchronization"""
if torch_device == "cuda":
......@@ -235,13 +156,12 @@ def infinicore_tensor_from_torch(torch_tensor):
)
def convert_infinicore_to_torch(infini_result, torch_reference):
def convert_infinicore_to_torch(infini_result):
"""
Convert infinicore tensor to PyTorch tensor for comparison
Args:
infini_result: infinicore tensor result
torch_reference: PyTorch tensor reference (for shape and device)
dtype: infinicore data type
device_str: torch device string
......@@ -249,7 +169,7 @@ def convert_infinicore_to_torch(infini_result, torch_reference):
torch.Tensor: PyTorch tensor with infinicore data
"""
torch_result_from_infini = torch.zeros(
torch_reference.shape,
infini_result.shape,
dtype=to_torch_dtype(infini_result.dtype),
device=infini_result.device.type,
)
......@@ -263,38 +183,68 @@ def compare_results(
):
"""
Generic function to compare infinicore result with PyTorch reference result
Supports both floating-point (with tolerance) and integer (exact) comparison
Supports both single and multiple outputs
Args:
infini_result: infinicore tensor result
torch_result: PyTorch tensor reference result
infini_result: infinicore tensor result (single or tuple)
torch_result: PyTorch tensor reference result (single or tuple)
atol: absolute tolerance (for floating-point only)
rtol: relative tolerance (for floating-point only)
debug_mode: whether to enable debug output
Returns:
bool: True if results match within tolerance (FP) or exactly (integer)
bool: True if all results match within tolerance
"""
# Convert infinicore result to PyTorch tensor for comparison
torch_result_from_infini = convert_infinicore_to_torch(infini_result, torch_result)
# Handle scalar integer comparison
if isinstance(torch_result_from_infini, (int, float)) and isinstance(
torch_result, (int, float)
# Handle multiple outputs
if isinstance(infini_result, (tuple, list)) and isinstance(
torch_result, (tuple, list)
):
if isinstance(torch_result_from_infini, int) and isinstance(torch_result, int):
if len(infini_result) != len(torch_result):
return False
all_match = True
for i, (infini_out, torch_out) in enumerate(zip(infini_result, torch_result)):
match = compare_results(infini_out, torch_out, atol, rtol, debug_mode)
all_match = all_match and match
return all_match
# Handle scalar and bool comparisons
if not isinstance(torch_result, torch.Tensor):
is_infini_int = isinstance(infini_result, (int, np.integer))
is_torch_int = isinstance(torch_result, (int, np.integer))
if isinstance(infini_result, bool) and isinstance(torch_result, bool):
# Bool comparison
result_equal = infini_result == torch_result
if debug_mode:
status = "match" if result_equal else "mismatch"
print(
f"Boolean values {status}: {infini_result} {'==' if result_equal else '!='} {torch_result}"
)
return result_equal
elif is_infini_int and is_torch_int:
# Exact integer scalar comparison
result_equal = torch_result_from_infini == torch_result
if debug_mode and not result_equal:
result_equal = infini_result == torch_result
if debug_mode:
status = "match" if result_equal else "mismatch"
print(
f"Integer scalar mismatch: {torch_result_from_infini} != {torch_result}"
f"Integer scalar {status}: {infini_result} {'==' if result_equal else '!='} {torch_result}"
)
return result_equal
else:
# Floating-point scalar comparison with tolerance
return abs(torch_result_from_infini - torch_result) <= atol + rtol * abs(
result_equal = abs(infini_result - torch_result) <= atol + rtol * abs(
torch_result
)
if debug_mode:
status = "match" if result_equal else "mismatch"
print(
f"Floating-point scalar {status}: {infini_result} {'~=' if result_equal else '!~='} {torch_result} (tolerance: {atol + rtol * abs(torch_result)})"
)
return result_equal
# Convert infinicore result to PyTorch tensor for comparison
torch_result_from_infini = convert_infinicore_to_torch(infini_result)
# Debug mode: detailed comparison
if debug_mode:
......
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import torch
import infinicore
from framework.base import BaseOperatorTest, TensorSpec, TestCase
from framework.runner import GenericTestRunner
from framework.utils import is_broadcast
# ==============================================================================
# Operator-specific configuration for aminmax
# ==============================================================================

# Test cases format: (shape, dim, keepdim, input_strides, min_strides, max_strides)
# - dim=None means a global reduction over all elements.
# - A None strides entry means "use the default contiguous layout".
_TEST_CASES_DATA = [
    # Basic cases - out-of-place
    ((13, 4), None, False, None, None, None),
    ((13, 4), 0, False, None, None, None),
    ((13, 4), 1, False, None, None, None),
    ((13, 4), -1, False, None, None, None),
    # With keepdim - out-of-place
    ((13, 4), None, True, None, None, None),
    ((13, 4), 0, True, None, None, None),
    ((13, 4), 1, True, None, None, None),
    # 3D cases - out-of-place
    ((4, 5, 6), None, False, None, None, None),
    ((4, 5, 6), 1, False, None, None, None),
    ((4, 5, 6), 1, True, None, None, None),
    ((4, 5, 6), -1, True, None, None, None),
    # Edge cases - out-of-place
    ((10,), None, False, None, None, None),
    ((10,), 0, False, None, None, None),
    ((1, 5), None, False, None, None, None),
    # In-place cases with strided tensors
    (
        (13, 4),
        None,
        False,
        (10, 1),
        None,
        None,
    ),  # Global min/max - no strides for scalar outputs
    ((13, 4), 0, False, None, (3,), (3,)),
    ((13, 4), 1, False, (20, 1), (10,), (10,)),
    # 3D in-place cases
    ((4, 5, 6), 1, True, None, (4, 1, 6), (4, 1, 6)),
    # NOTE(review): output strides (4, 5) for a (4, 5)-shaped output look
    # unusual (non-contiguous on purpose?) — confirm against TensorSpec.
    ((4, 5, 6), -1, False, (30, 6, 1), (4, 5), (4, 5)),
]

# Tolerance configuration: lower-precision dtypes get looser bounds.
_TOLERANCE_MAP = {
    infinicore.float16: {"atol": 1e-3, "rtol": 1e-2},
    infinicore.float32: {"atol": 1e-5, "rtol": 1e-4},
    infinicore.bfloat16: {"atol": 1e-2, "rtol": 5e-2},
}

# Data types to test
_TENSOR_DTYPES = [infinicore.float16, infinicore.bfloat16, infinicore.float32]
def calculate_output_shape(input_shape, dim, keepdim):
    """
    Calculate the output shape for aminmax based on input shape, dim, and keepdim.

    Args:
        input_shape: tuple of input dimensions.
        dim: reduction dimension (may be negative), or None for a global reduction.
        keepdim: whether the reduced dimension is kept with size 1.

    Returns:
        tuple: shape of each aminmax output tensor.
    """
    if dim is None:
        # Global min/max: with keepdim the rank is preserved (all dims become 1),
        # otherwise the outputs are scalar tensors with shape ().
        return tuple(1 for _ in input_shape) if keepdim else ()

    # Reduction along one dimension: either collapse it to 1 or drop it.
    # Negative dim indexes from the end, matching Python list semantics.
    result = list(input_shape)
    if keepdim:
        result[dim] = 1
        return tuple(result)
    del result[dim]
    return tuple(result)
def parse_test_cases():
    """
    Parse aminmax test cases including both out-of-place and in-place variants

    aminmax supports: torch.aminmax(input, *, dim=None, keepdim=False, out=(min_tensor, max_tensor))

    Returns:
        list: TestCase objects, one out-of-place case per (data, dtype) pair,
        plus an in-place case when the output configuration allows it.
    """
    test_cases = []

    for data in _TEST_CASES_DATA:
        # Unpack with defaults so shorter tuples are tolerated.
        shape = data[0]
        dim = data[1] if len(data) > 1 else None
        keepdim = data[2] if len(data) > 2 else False
        input_strides = data[3] if len(data) > 3 else None
        min_strides = data[4] if len(data) > 4 else None
        max_strides = data[5] if len(data) > 5 else None

        # Generate test cases for all data types
        for dtype in _TENSOR_DTYPES:
            tolerance = _TOLERANCE_MAP.get(dtype, {"atol": 1e-5, "rtol": 1e-4})

            # Create input tensor spec
            input_spec = TensorSpec.from_tensor(shape, input_strides, dtype)

            # Build a human-readable description from the non-default parameters.
            description_parts = ["aminmax"]
            if dim is not None:
                description_parts.append(f"dim={dim}")
            if keepdim:
                description_parts.append("keepdim=True")
            if input_strides is not None:
                description_parts.append(f"input_strides={input_strides}")
            base_description = " - ".join(description_parts)

            # Prepare common kwargs (dim is only passed when explicitly set).
            kwargs = {}
            if dim is not None:
                kwargs["dim"] = dim
            kwargs["keepdim"] = keepdim

            # ==================================================================
            # Test Case 1: Out-of-place (return values)
            # ==================================================================
            test_cases.append(
                TestCase(
                    inputs=[input_spec],
                    kwargs=kwargs,
                    output_spec=None,  # No output spec for return value comparison
                    comparison_target=None,  # Compare return values
                    tolerance=tolerance,
                    description=f"{base_description} - OUT_OF_PLACE",
                    output_count=2,  # aminmax returns 2 tensors: (min, max)
                )
            )

            # ==================================================================
            # Test Case 2: In-place with explicit output tensors
            # ==================================================================
            # Only create in-place test cases if we have valid output configurations
            # For global min/max (dim=None), we need special handling
            if dim is None:
                # Global min/max - output shapes are either () or (1,1,...) depending on keepdim
                output_shape = calculate_output_shape(shape, dim, keepdim)

                # For scalar outputs, we don't use strides (they would be empty tuples)
                if output_shape == ():
                    # Scalar tensors - create without strides
                    min_spec = TensorSpec.from_tensor(output_shape, None, dtype)
                    max_spec = TensorSpec.from_tensor(output_shape, None, dtype)
                else:
                    # keepdim=True case - use provided strides or None
                    min_spec = TensorSpec.from_tensor(output_shape, min_strides, dtype)
                    max_spec = TensorSpec.from_tensor(output_shape, max_strides, dtype)

                # Check if output tensors support in-place operations
                # NOTE(review): is_broadcast presumably rejects zero/overlapping
                # strides that cannot be written in place — confirm its contract.
                min_supports_inplace = not is_broadcast(
                    getattr(min_spec, "strides", None)
                )
                max_supports_inplace = not is_broadcast(
                    getattr(max_spec, "strides", None)
                )

                if min_supports_inplace and max_supports_inplace:
                    inplace_kwargs = kwargs.copy()
                    test_cases.append(
                        TestCase(
                            inputs=[input_spec],
                            kwargs=inplace_kwargs,
                            output_specs=[
                                min_spec,
                                max_spec,
                            ],  # Multiple output specs for in-place
                            comparison_target="out",  # Compare the output tuple from kwargs
                            tolerance=tolerance,
                            description=f"{base_description} - INPLACE(out)",
                            output_count=2,  # Specify 2 outputs
                        )
                    )
            else:
                # Reduction along specific dimension: unlike the dim=None branch,
                # an in-place case is built only when BOTH output strides are given.
                if min_strides is not None and max_strides is not None:
                    output_shape = calculate_output_shape(shape, dim, keepdim)

                    # Create output tensor specs
                    min_spec = TensorSpec.from_tensor(output_shape, min_strides, dtype)
                    max_spec = TensorSpec.from_tensor(output_shape, max_strides, dtype)

                    # Check if output tensors support in-place operations
                    min_supports_inplace = not is_broadcast(min_strides)
                    max_supports_inplace = not is_broadcast(max_strides)

                    if min_supports_inplace and max_supports_inplace:
                        inplace_kwargs = kwargs.copy()
                        test_cases.append(
                            TestCase(
                                inputs=[input_spec],
                                kwargs=inplace_kwargs,
                                output_specs=[
                                    min_spec,
                                    max_spec,
                                ],  # Multiple output specs for in-place
                                comparison_target="out",  # Compare the output tuple from kwargs
                                tolerance=tolerance,
                                description=f"{base_description} - INPLACE(out)",
                                output_count=2,  # Specify 2 outputs
                            )
                        )

    return test_cases
class OpTest(BaseOperatorTest):
    """aminmax operator test with multiple outputs support"""

    def __init__(self):
        # Registers the operator under the name "aminmax" with the base framework.
        super().__init__("aminmax")

    def get_test_cases(self):
        # Full matrix of out-of-place and in-place cases (see parse_test_cases).
        return parse_test_cases()

    def torch_operator(self, x, dim=None, keepdim=False, out=None, **kwargs):
        # Reference path: torch.aminmax returns a (min, max) pair.
        # Extra framework kwargs are accepted and intentionally ignored.
        # NOTE(review): no infinicore_operator override here — presumably the
        # base class treats it as unimplemented; confirm against BaseOperatorTest.
        return torch.aminmax(x, dim=dim, keepdim=keepdim, out=out)
def main():
    """Main entry point: build the generic runner for OpTest and hand over control."""
    GenericTestRunner(OpTest).run_and_exit()
# Allow running this operator test directly as a script.
if __name__ == "__main__":
    main()
......@@ -142,21 +142,19 @@ def run_all_op_tests(ops_dir=None, specific_ops=None, extra_args=None):
if extra_args:
cmd.extend(extra_args)
# Run with captured output
result = subprocess.run(
cmd,
cwd=ops_dir,
capture_output=True,
text=True,
timeout=300, # 5 minute timeout per test
stdout=None,
stderr=None,
)
success = result.returncode == 0
results[test_name] = (
success,
result.returncode,
result.stdout,
result.stderr,
"",
"",
)
# Print the output from the test script
......@@ -173,13 +171,9 @@ def run_all_op_tests(ops_dir=None, specific_ops=None, extra_args=None):
status_icon = "✅" if success else "❌"
print(
f"\n{status_icon} {test_name}: {'PASSED' if success else 'FAILED'} (return code: {result.returncode})"
f"{status_icon} {test_name}: {'PASSED' if success else 'FAILED'} (return code: {result.returncode})"
)
except subprocess.TimeoutExpired:
print(f"⏰ {test_name}: TIMEOUT (exceeded 5 minutes)")
results[test_name] = (False, -2, "", "Test execution timed out")
except Exception as e:
print(f"💥 {test_name}: ERROR - {str(e)}")
results[test_name] = (False, -1, "", str(e))
......@@ -279,8 +273,8 @@ def generate_help_epilog(ops_dir):
epilog_parts.append(" # Run all operator tests on CPU")
epilog_parts.append(" python run.py --cpu")
epilog_parts.append("")
epilog_parts.append(" # Run specific operators with benchmarking")
epilog_parts.append(" python run.py --ops add matmul --nvidia --bench")
epilog_parts.append(" # Run specific operators")
epilog_parts.append(" python run.py --ops add matmul --nvidia")
epilog_parts.append("")
epilog_parts.append(" # Run with debug mode on multiple devices")
epilog_parts.append(" python run.py --cpu --nvidia --debug")
......@@ -288,11 +282,6 @@ def generate_help_epilog(ops_dir):
epilog_parts.append(" # List available tests without running")
epilog_parts.append(" python run.py --list")
epilog_parts.append("")
epilog_parts.append(" # Run with custom performance settings")
epilog_parts.append(
" python run.py --nvidia --bench --num_prerun 50 --num_iterations 5000"
)
epilog_parts.append("")
# Available operators section
if operators:
......@@ -315,6 +304,9 @@ def generate_help_epilog(ops_dir):
epilog_parts.append(
" - Operators are automatically discovered from the ops directory"
)
epilog_parts.append(
" - --bench option is disabled in batch mode (run individual tests for benchmarking)"
)
return "\n".join(epilog_parts)
......@@ -356,6 +348,15 @@ def main():
list_available_tests(args.ops_dir)
return
# Check for --bench option in extra arguments
for arg in unknown_args:
if arg in ["--bench"]:
print("❌ ERROR: --bench option is not allowed in batch testing mode.")
print("")
print("Solution: Run individual test scripts for benchmarking:")
print(" python path/to/individual_test.py --bench --<platform>")
sys.exit(1)
# Auto-detect ops directory if not provided
if args.ops_dir is None:
ops_dir = find_ops_directory()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment