Unverified Commit 9a05446f authored by PanZezhong1725, committed by GitHub

issue/461 InfiniCore inference runtime


Co-authored-by: Jiacheng Huang <huangjiacheng0709@outlook.com>
Co-authored-by: wooway777 <wooway777@gmail.com>
parent 37411f6d
......@@ -3,6 +3,8 @@
#include <iostream>
#include <tuple>
#include "infini_status_string.h"
#define CHECK_OR_RETURN(CONDITION, ERROR) \
do { \
if (!(CONDITION)) { \
......@@ -26,7 +28,10 @@
#define CHECK_INTERNAL(API, EXPECT) CHECK_API_OR(API, EXPECT, return INFINI_STATUS_INTERNAL_ERROR)
#define CHECK_STATUS(API) CHECK_API_OR(API, INFINI_STATUS_SUCCESS, return api_result_)
#define CHECK_STATUS(API) \
CHECK_API_OR(API, INFINI_STATUS_SUCCESS, \
std::cerr << "Error: " << infini_status_string(api_result_) << std::endl; \
return api_result_)
#define CHECK_DTYPE(DT, ...) \
do { \
......
#ifndef INFINI_STATUS_STRING_H
#define INFINI_STATUS_STRING_H
#include <infinicore.h>
inline const char *infini_status_string(infiniStatus_t status) {
switch (status) {
case INFINI_STATUS_SUCCESS:
return "Success";
case INFINI_STATUS_INTERNAL_ERROR:
return "Internal Error";
case INFINI_STATUS_NOT_IMPLEMENTED:
return "Not Implemented";
case INFINI_STATUS_BAD_PARAM:
return "Bad Parameter";
case INFINI_STATUS_NULL_POINTER:
return "Null Pointer";
case INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED:
return "Device Type Not Supported";
case INFINI_STATUS_DEVICE_NOT_FOUND:
return "Device Not Found";
case INFINI_STATUS_DEVICE_NOT_INITIALIZED:
return "Device Not Initialized";
case INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED:
return "Device Architecture Not Supported";
case INFINI_STATUS_BAD_TENSOR_DTYPE:
return "Bad Tensor Data Type";
case INFINI_STATUS_BAD_TENSOR_SHAPE:
return "Bad Tensor Shape";
case INFINI_STATUS_BAD_TENSOR_STRIDES:
return "Bad Tensor Strides";
case INFINI_STATUS_INSUFFICIENT_WORKSPACE:
return "Insufficient Workspace";
default:
return "Unknown Error";
}
}
#endif /* INFINI_STATUS_STRING_H */
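/*
 * Illustrative usage sketch (not part of the header): infini_status_string turns an
 * infiniStatus_t into a readable label, which is what the CHECK_STATUS macro above
 * prints before propagating the error. `run_step` below is a hypothetical function
 * returning infiniStatus_t, not a real InfiniCore API:
 *
 *   infiniStatus_t status = run_step();
 *   if (status != INFINI_STATUS_SUCCESS) {
 *       std::cerr << "Error: " << infini_status_string(status) << std::endl;
 *       return status;
 *   }
 */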
from .base import TestConfig, TestRunner, TestCase
from .utils import (
create_infinicore_tensor,
compare_results,
debug,
get_tolerance,
profile_operation,
rearrange_tensor,
)
from .config import get_test_devices, get_args
from .devices import InfiniDeviceEnum, InfiniDeviceNames, torch_device_map
from .datatypes import to_torch_dtype, to_infinicore_dtype
__all__ = [
"TestConfig",
"TestRunner",
"TestCase",
"create_infinicore_tensor",
"compare_results",
"debug",
"get_tolerance",
"profile_operation",
"rearrange_tensor",
"get_test_devices",
"get_args",
"InfiniDeviceEnum",
"InfiniDeviceNames",
"torch_device_map",
"to_torch_dtype",
"to_infinicore_dtype",
]
import torch
import infinicore
from .devices import InfiniDeviceNames
from .utils import synchronize_device
class TestCase:
"""Base test case class"""
def __init__(self, *args, **kwargs):
self.args = args
self.kwargs = kwargs
def __str__(self):
return f"TestCase{self.args}"
class TestConfig:
"""Test configuration"""
def __init__(
self,
tensor_dtypes,
tolerance_map,
debug=False,
bench=False,
num_prerun=10,
num_iterations=1000,
):
self.tensor_dtypes = tensor_dtypes
self.tolerance_map = tolerance_map
self.debug = debug
self.bench = bench
self.num_prerun = num_prerun
self.num_iterations = num_iterations
class TestRunner:
"""Test runner"""
def __init__(self, test_cases, test_config):
self.test_cases = test_cases
self.config = test_config
self.failed_tests = [] # Track failures
def run_tests(self, devices, test_func):
"""Run tests and track failures"""
for device in devices:
print(f"\n{'='*60}")
print(f"Testing on {InfiniDeviceNames[device]}")
print(f"{'='*60}")
# filter unsupported data types
tensor_dtypes = self._filter_tensor_dtypes_by_device(
device, self.config.tensor_dtypes
)
for test_case in self.test_cases:
for dtype in tensor_dtypes:
try:
test_func(device, test_case, dtype, self.config)
print(f"✓ {test_case} with {dtype} passed")
except Exception as e:
error_msg = f"{test_case} with {dtype} on {InfiniDeviceNames[device]}: {e}"
print(f"✗ {error_msg}")
self.failed_tests.append(error_msg)
if self.config.debug:
raise
# Return whether any tests failed
return len(self.failed_tests) == 0
def _filter_tensor_dtypes_by_device(self, device, tensor_dtypes):
"""Filter data types based on device"""
        if device in ():  # currently empty: no device needs dtype filtering yet
            # Filter out data types (e.g. bfloat16) that the listed devices do not support
return [dt for dt in tensor_dtypes if dt != infinicore.bfloat16]
else:
return tensor_dtypes
def print_summary(self):
"""Print test summary"""
if self.failed_tests:
print(f"\n\033[91m{len(self.failed_tests)} tests failed:\033[0m")
for failure in self.failed_tests:
print(f" - {failure}")
return False
else:
print("\n\033[92mAll tests passed!\033[0m")
return True
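# Minimal usage sketch (illustrative only; `my_test_func` is a hypothetical callback
# with the signature run_tests expects: (device, test_case, dtype, config), and
# InfiniDeviceEnum comes from .devices):
#
#   config = TestConfig(
#       tensor_dtypes=[infinicore.float32],
#       tolerance_map={infinicore.float32: {"atol": 0, "rtol": 1e-3}},
#   )
#   runner = TestRunner([TestCase((2, 3), (3, 4))], config)
#   ok = runner.run_tests([InfiniDeviceEnum.CPU], my_test_func)
#   runner.print_summary()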
import argparse
from .devices import InfiniDeviceEnum
def get_args():
"""Parse command line arguments"""
parser = argparse.ArgumentParser(description="Test Operator")
parser.add_argument(
"--bench",
action="store_true",
help="Whether to benchmark performance",
)
parser.add_argument(
"--num_prerun",
type=lambda x: max(0, int(x)),
default=10,
help="Set the number of pre-runs before benchmarking. Default is 10.",
)
parser.add_argument(
"--num_iterations",
type=lambda x: max(0, int(x)),
default=1000,
help="Set the number of iterations for benchmarking. Default is 1000.",
)
parser.add_argument(
"--debug",
action="store_true",
help="Whether to turn on debug mode.",
)
# Device options
device_group = parser.add_argument_group("Device options")
device_group.add_argument("--cpu", action="store_true", help="Run CPU test")
device_group.add_argument(
"--nvidia", action="store_true", help="Run NVIDIA GPU test"
)
device_group.add_argument(
"--cambricon", action="store_true", help="Run Cambricon MLU test"
)
device_group.add_argument(
"--ascend", action="store_true", help="Run ASCEND NPU test"
)
device_group.add_argument(
"--iluvatar", action="store_true", help="Run Iluvatar GPU test"
)
device_group.add_argument("--metax", action="store_true", help="Run METAX GPU test")
device_group.add_argument(
"--moore", action="store_true", help="Run MTHREADS GPU test"
)
device_group.add_argument(
"--kunlun", action="store_true", help="Run KUNLUN XPU test"
)
return parser.parse_args()
def get_test_devices(args):
"""
Determine which devices to test based on command line arguments
"""
devices_to_test = []
if args.cpu:
devices_to_test.append(InfiniDeviceEnum.CPU)
if args.nvidia:
devices_to_test.append(InfiniDeviceEnum.NVIDIA)
if args.iluvatar:
devices_to_test.append(InfiniDeviceEnum.ILUVATAR)
if args.cambricon:
try:
import torch_mlu
devices_to_test.append(InfiniDeviceEnum.CAMBRICON)
except ImportError:
print("Warning: torch_mlu not available, skipping Cambricon tests")
if args.ascend:
try:
import torch
import torch_npu
torch.npu.set_device(0) # Ascend NPU needs explicit device initialization
devices_to_test.append(InfiniDeviceEnum.ASCEND)
except ImportError:
print("Warning: torch_npu not available, skipping Ascend tests")
if args.metax:
import torch
devices_to_test.append(InfiniDeviceEnum.METAX)
if args.moore:
try:
import torch
import torch_musa
devices_to_test.append(InfiniDeviceEnum.MOORE)
except ImportError:
print("Warning: torch_musa not available, skipping Moore tests")
if args.kunlun:
try:
import torch_xmlir
devices_to_test.append(InfiniDeviceEnum.KUNLUN)
except ImportError:
print("Warning: torch_xmlir not available, skipping Kunlun tests")
# Default to CPU if no devices specified
if not devices_to_test:
devices_to_test = [InfiniDeviceEnum.CPU]
return devices_to_test
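# Example invocation (sketch): operator test scripts that call get_args/get_test_devices
# accept these flags on the command line, e.g.
#   python <operator_test>.py --cpu --nvidia --bench --num_prerun 5 --num_iterations 200
# With no device flag given, the tests fall back to CPU.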
import torch
import infinicore
def to_torch_dtype(infini_dtype):
"""Convert infinicore data type to PyTorch data type"""
if infini_dtype == infinicore.float16:
return torch.float16
elif infini_dtype == infinicore.float32:
return torch.float32
elif infini_dtype == infinicore.bfloat16:
return torch.bfloat16
elif infini_dtype == infinicore.int32:
return torch.int32
elif infini_dtype == infinicore.int64:
return torch.int64
else:
raise ValueError(f"Unsupported infinicore dtype: {infini_dtype}")
def to_infinicore_dtype(torch_dtype):
"""Convert PyTorch data type to infinicore data type"""
if torch_dtype == torch.float32:
return infinicore.float32
elif torch_dtype == torch.float16:
return infinicore.float16
elif torch_dtype == torch.bfloat16:
return infinicore.bfloat16
elif torch_dtype == torch.int32:
return infinicore.int32
elif torch_dtype == torch.int64:
return infinicore.int64
else:
raise ValueError(f"Unsupported torch dtype: {torch_dtype}")
class InfiniDeviceEnum:
CPU = 0
NVIDIA = 1
CAMBRICON = 2
ASCEND = 3
METAX = 4
MOORE = 5
ILUVATAR = 6
KUNLUN = 7
SUGON = 8
InfiniDeviceNames = {
InfiniDeviceEnum.CPU: "CPU",
InfiniDeviceEnum.NVIDIA: "NVIDIA",
InfiniDeviceEnum.CAMBRICON: "Cambricon",
InfiniDeviceEnum.ASCEND: "Ascend",
InfiniDeviceEnum.METAX: "Metax",
InfiniDeviceEnum.MOORE: "Moore",
InfiniDeviceEnum.ILUVATAR: "Iluvatar",
InfiniDeviceEnum.KUNLUN: "Kunlun",
InfiniDeviceEnum.SUGON: "Sugon",
}
# Maps an InfiniDeviceEnum value to its torch device string
torch_device_map = {
InfiniDeviceEnum.CPU: "cpu",
InfiniDeviceEnum.NVIDIA: "cuda",
InfiniDeviceEnum.CAMBRICON: "mlu",
InfiniDeviceEnum.ASCEND: "npu",
InfiniDeviceEnum.METAX: "cuda",
InfiniDeviceEnum.MOORE: "musa",
InfiniDeviceEnum.ILUVATAR: "cuda",
InfiniDeviceEnum.KUNLUN: "cuda",
InfiniDeviceEnum.SUGON: "cuda",
}
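# Lookup sketch: a device enum resolves to a display name and a torch device string, e.g.
#   InfiniDeviceNames[InfiniDeviceEnum.ASCEND]  # -> "Ascend"
#   torch_device_map[InfiniDeviceEnum.ASCEND]   # -> "npu"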
import torch
import time
import infinicore
from .datatypes import to_infinicore_dtype, to_torch_dtype
def create_infinicore_tensor(torch_tensor, device_str):
"""Create infinicore tensor from PyTorch tensor"""
infini_device = infinicore.device(device_str, 0)
return infinicore.from_blob(
torch_tensor.data_ptr(),
list(torch_tensor.shape),
dtype=to_infinicore_dtype(torch_tensor.dtype),
device=infini_device,
)
def synchronize_device(torch_device):
"""Device synchronization"""
if torch_device == "cuda":
torch.cuda.synchronize()
elif torch_device == "npu":
torch.npu.synchronize()
elif torch_device == "mlu":
torch.mlu.synchronize()
def timed_op(func, num_iterations, device):
"""Timed operation"""
synchronize_device(device)
start = time.time()
for _ in range(num_iterations):
func()
synchronize_device(device)
return (time.time() - start) / num_iterations
def profile_operation(desc, func, torch_device, num_prerun, num_iterations):
"""
Performance profiling workflow
"""
# Warm-up runs
for _ in range(num_prerun):
func()
# Timed execution
elapsed = timed_op(lambda: func(), num_iterations, torch_device)
print(f" {desc} time: {elapsed * 1000 :6f} ms")
def debug(actual, desired, atol=0, rtol=1e-2, equal_nan=False, verbose=True):
"""
Debug function to compare two tensors and print differences
"""
if actual.dtype == torch.bfloat16 or desired.dtype == torch.bfloat16:
actual = actual.to(torch.float32)
desired = desired.to(torch.float32)
print_discrepancy(actual, desired, atol, rtol, equal_nan, verbose)
import numpy as np
np.testing.assert_allclose(
actual.cpu(), desired.cpu(), rtol, atol, equal_nan, verbose=True
)
def print_discrepancy(
actual, expected, atol=0, rtol=1e-3, equal_nan=True, verbose=True
):
"""Print detailed tensor differences"""
if actual.shape != expected.shape:
raise ValueError("Tensors must have the same shape to compare.")
import torch
import sys
is_terminal = sys.stdout.isatty()
actual_isnan = torch.isnan(actual)
expected_isnan = torch.isnan(expected)
# Calculate difference mask
nan_mismatch = (
actual_isnan ^ expected_isnan if equal_nan else actual_isnan | expected_isnan
)
diff_mask = nan_mismatch | (
torch.abs(actual - expected) > (atol + rtol * torch.abs(expected))
)
diff_indices = torch.nonzero(diff_mask, as_tuple=False)
delta = actual - expected
# Display formatting
col_width = [18, 20, 20, 20]
decimal_places = [0, 12, 12, 12]
total_width = sum(col_width) + sum(decimal_places)
def add_color(text, color_code):
if is_terminal:
return f"\033[{color_code}m{text}\033[0m"
else:
return text
if verbose:
for idx in diff_indices:
index_tuple = tuple(idx.tolist())
actual_str = f"{actual[index_tuple]:<{col_width[1]}.{decimal_places[1]}f}"
expected_str = (
f"{expected[index_tuple]:<{col_width[2]}.{decimal_places[2]}f}"
)
delta_str = f"{delta[index_tuple]:<{col_width[3]}.{decimal_places[3]}f}"
print(
f" > Index: {str(index_tuple):<{col_width[0]}}"
f"actual: {add_color(actual_str, 31)}"
f"expect: {add_color(expected_str, 32)}"
f"delta: {add_color(delta_str, 33)}"
)
print(add_color(" INFO:", 35))
print(f" - Actual dtype: {actual.dtype}")
print(f" - Desired dtype: {expected.dtype}")
print(f" - Atol: {atol}")
print(f" - Rtol: {rtol}")
print(
f" - Mismatched elements: {len(diff_indices)} / {actual.numel()} ({len(diff_indices) / actual.numel() * 100}%)"
)
print(
f" - Min(actual) : {torch.min(actual):<{col_width[1]}} | Max(actual) : {torch.max(actual):<{col_width[2]}}"
)
print(
f" - Min(desired): {torch.min(expected):<{col_width[1]}} | Max(desired): {torch.max(expected):<{col_width[2]}}"
)
print(
f" - Min(delta) : {torch.min(delta):<{col_width[1]}} | Max(delta) : {torch.max(delta):<{col_width[2]}}"
)
print("-" * total_width + "\n")
return diff_indices
def get_tolerance(tolerance_map, tensor_dtype, default_atol=0, default_rtol=1e-3):
"""
Get tolerance settings based on data type
"""
tolerance = tolerance_map.get(
tensor_dtype, {"atol": default_atol, "rtol": default_rtol}
)
return tolerance["atol"], tolerance["rtol"]
def compare_results(
infini_result, torch_result, dtype, config, device_str, tolerance_map=None
):
"""
Compare infinicore result with PyTorch reference result
Args:
infini_result: infinicore tensor result
torch_result: PyTorch tensor reference result
dtype: infinicore data type
config: test config
device_str: torch device string
tolerance_map: optional tolerance map (defaults to config's tolerance_map)
Returns:
bool: True if results match within tolerance
"""
# Convert infinicore result to PyTorch tensor for comparison
torch_result_from_infini = torch.zeros(
torch_result.shape, dtype=to_torch_dtype(dtype), device=device_str
)
temp_tensor = create_infinicore_tensor(torch_result_from_infini, device_str)
temp_tensor.copy_(infini_result)
# Retrieve tolerance - use provided map or config's map
if tolerance_map is None:
tolerance_map = config.tolerance_map
atol, rtol = get_tolerance(tolerance_map, dtype)
# Debug mode: detailed comparison
if config.debug:
debug(torch_result_from_infini, torch_result, atol=atol, rtol=rtol)
# Check if results match within tolerance
return torch.allclose(torch_result_from_infini, torch_result, atol=atol, rtol=rtol)
def rearrange_tensor(tensor, new_strides):
"""
Given a PyTorch tensor and a list of new strides, return a new PyTorch tensor with the given strides.
"""
import torch
shape = tensor.shape
new_size = [0] * len(shape)
left = 0
right = 0
for i in range(len(shape)):
if new_strides[i] > 0:
new_size[i] = (shape[i] - 1) * new_strides[i] + 1
right += new_strides[i] * (shape[i] - 1)
else: # TODO: Support negative strides in the future
# new_size[i] = (shape[i] - 1) * (-new_strides[i]) + 1
# left += new_strides[i] * (shape[i] - 1)
raise ValueError("Negative strides are not supported yet")
# Create a new tensor with zeros
new_tensor = torch.zeros(
(right - left + 1,), dtype=tensor.dtype, device=tensor.device
)
# Generate indices for original tensor based on original strides
indices = [torch.arange(s) for s in shape]
mesh = torch.meshgrid(*indices, indexing="ij")
# Flatten indices for linear indexing
linear_indices = [m.flatten() for m in mesh]
# Calculate new positions based on new strides
new_positions = sum(
linear_indices[i] * new_strides[i] for i in range(len(shape))
).to(tensor.device)
offset = -left
new_positions += offset
# Copy the original data to the new tensor
new_tensor.view(-1).index_add_(0, new_positions, tensor.view(-1))
new_tensor.set_(new_tensor.untyped_storage(), offset, shape, tuple(new_strides))
return new_tensor
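# Usage sketch for rearrange_tensor (illustrative): give a contiguous (2, 3) tensor a
# padded row stride of 4 while keeping its values and logical shape.
#   t = torch.arange(6, dtype=torch.float32).reshape(2, 3)
#   padded = rearrange_tensor(t, [4, 1])
#   assert padded.stride() == (4, 1) and torch.equal(padded, t)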
import torch
import infinicore
import sys
import os
# Framework path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from framework import (
TestConfig,
TestRunner,
TestCase,
create_infinicore_tensor,
compare_results,
get_args,
get_test_devices,
profile_operation,
to_torch_dtype,
InfiniDeviceNames,
torch_device_map,
)
# ==============================================================================
# Test Setup
# ==============================================================================
# Test cases
_TEST_CASES = [
# (a_shape, b_shape, result_shape, a_stride, b_stride, c_stride)
TestCase((2, 3), (3, 4), (2, 4), None, None, None),
TestCase((128, 256), (256, 64), (128, 64), None, None, None),
TestCase((2, 4, 2048), (2, 2048, 2048), (2, 4, 2048), None, None, None),
TestCase((1, 2048), (2048, 2048), (1, 2048), (4096, 1), (4096, 1), (4096, 1)),
TestCase((6, 2048), (2048, 2560), (6, 2560), (2048, 1), (1, 2048), (2560, 1)),
TestCase((4, 8 * 6, 64), (4, 64, 6), (4, 8 * 6, 6), None, None, None),
]
# Data types - now using infinicore native types
_TENSOR_DTYPES = [infinicore.float16, infinicore.bfloat16, infinicore.float32]
# Tolerance
_TOLERANCE_MAP = {
infinicore.float16: {"atol": 0, "rtol": 1e-2},
infinicore.float32: {"atol": 0, "rtol": 1e-3},
infinicore.bfloat16: {"atol": 0, "rtol": 5e-2},
}
# ==============================================================================
# Test Method
# ==============================================================================
def test_matmul(device, test_case, dtype, config):
"""
Test matmul operation
Args:
device: device enum
test_case: test case
dtype: infinicore data type
config: test config
"""
a_shape, b_shape, result_shape, a_stride, b_stride, c_stride = test_case.args
print(
f"Testing Matmul on {InfiniDeviceNames[device]} with "
f"a_shape:{a_shape}, b_shape:{b_shape}, result_shape:{result_shape}, "
f"a_stride:{a_stride}, b_stride:{b_stride}, c_stride:{c_stride}, "
f"dtype:{dtype}"
)
# Create PyTorch tensors
device_str = torch_device_map[device]
torch_dtype = to_torch_dtype(dtype)
torch_a = torch.rand(a_shape, dtype=torch_dtype, device=device_str)
torch_b = torch.rand(b_shape, dtype=torch_dtype, device=device_str)
# Calculate PyTorch reference result
def torch_matmul():
return torch.matmul(torch_a, torch_b)
torch_result = torch_matmul()
# Create infinicore tensors
infini_a = create_infinicore_tensor(torch_a, device_str)
infini_b = create_infinicore_tensor(torch_b, device_str)
# Out-of-place matmul
def infini_matmul():
return infinicore.matmul(infini_a, infini_b)
infini_result = infini_matmul()
# Validate results using common method
is_valid = compare_results(infini_result, torch_result, dtype, config, device_str)
assert is_valid, "Matmul test failed"
# Performance test
if config.bench:
profile_operation(
"PyTorch",
torch_matmul,
device_str,
config.num_prerun,
config.num_iterations,
)
profile_operation(
"Infinicore",
infini_matmul,
device_str,
config.num_prerun,
config.num_iterations,
)
def test_matmul_inplace(device, test_case, dtype, config):
"""
Test in-place matmul operation
Args:
device: device enum
test_case: test case
dtype: infinicore data type
config: test config
"""
a_shape, b_shape, result_shape, a_stride, b_stride, c_stride = test_case.args
print(
f"Testing In-place Matmul on {InfiniDeviceNames[device]} with "
f"a_shape:{a_shape}, b_shape:{b_shape}, result_shape:{result_shape}, "
f"dtype:{dtype}"
)
device_str = torch_device_map[device]
torch_dtype = to_torch_dtype(dtype)
# Create PyTorch tensors
torch_a = torch.rand(a_shape, dtype=torch_dtype, device=device_str)
torch_b = torch.rand(b_shape, dtype=torch_dtype, device=device_str)
# Create pre-allocated result tensor
torch_preallocated = torch.zeros(result_shape, dtype=torch_dtype, device=device_str)
# Calculate PyTorch reference result using in-place operation
def torch_matmul_inplace():
torch.matmul(torch_a, torch_b, out=torch_preallocated)
# Execute in-place operation
torch_matmul_inplace()
# Create infinicore tensors
infini_a = create_infinicore_tensor(torch_a, device_str)
infini_b = create_infinicore_tensor(torch_b, device_str)
infini_c = infinicore.empty(
result_shape, dtype=dtype, device=infinicore.device(device_str, 0)
)
# Test in-place matmul
def infini_matmul_inplace():
infinicore.matmul(infini_a, infini_b, out=infini_c)
# Execute in-place operation
infini_matmul_inplace()
# Validate results using common method
is_valid = compare_results(infini_c, torch_preallocated, dtype, config, device_str)
assert is_valid, "In-place matmul test failed"
# Performance test
if config.bench:
profile_operation(
"PyTorch In-place",
torch_matmul_inplace,
device_str,
config.num_prerun,
config.num_iterations,
)
profile_operation(
"Infinicore In-place",
infini_matmul_inplace,
device_str,
config.num_prerun,
config.num_iterations,
)
# ==============================================================================
# Main Execution Function
# ==============================================================================
def main():
args = get_args()
# Create test configuration
config = TestConfig(
tensor_dtypes=_TENSOR_DTYPES,
tolerance_map=_TOLERANCE_MAP,
debug=args.debug,
bench=args.bench,
num_prerun=args.num_prerun,
num_iterations=args.num_iterations,
)
# Create test runner
runner = TestRunner(_TEST_CASES, config)
# Get test devices
devices = get_test_devices(args)
print("Starting matmul tests...")
all_passed = True
# Run out-of-place tests
print("\n--- Testing Out-of-place Matmul ---")
out_of_place_passed = runner.run_tests(devices, test_matmul)
all_passed = all_passed and out_of_place_passed
# Run in-place tests
print("\n--- Testing In-place Matmul ---")
in_place_passed = runner.run_tests(devices, test_matmul_inplace)
all_passed = all_passed and in_place_passed
runner.print_summary()
sys.exit(0 if all_passed else 1)
if __name__ == "__main__":
main()
import infinicore
import torch
def test():
shape = [2, 3, 4]
shape2 = [3, 4, 2]
torch_tensor_ans = torch.rand(shape, dtype=torch.float32, device="cpu")
torch_tensor_result = torch.zeros(shape, dtype=torch.float32, device="cpu")
t_cpu = infinicore.from_blob(
torch_tensor_ans.data_ptr(),
shape,
dtype=infinicore.float32,
device=infinicore.device("cpu", 0),
)
t_gpu = t_cpu.to(infinicore.device("cuda", 0))
t_gpu = t_gpu.permute([1, 2, 0])
t_gpu2 = infinicore.empty(
shape2, dtype=infinicore.float32, device=infinicore.device("cuda", 0)
)
t_gpu2.copy_(t_gpu)
t_gpu2 = t_gpu2.permute([2, 0, 1]).contiguous()
t_result = infinicore.from_blob(
torch_tensor_result.data_ptr(),
shape,
dtype=infinicore.float32,
device=infinicore.device("cpu", 0),
)
t_result.copy_(t_gpu2)
assert torch.equal(torch_tensor_ans, torch_tensor_result)
print("Test passed")
if __name__ == "__main__":
test()
Subproject commit f1d748e5e3edfa4b1778edea003bac94781bc7b7
add_rules("mode.debug", "mode.release")
add_requires("boost", {configs = {stacktrace = true}})
add_requires("pybind11")
-- Define color codes
......@@ -9,6 +10,7 @@ local NC = '\27[0m' -- No Color
set_encodings("utf-8")
add_includedirs("include")
add_includedirs("third_party/spdlog/include")
if is_mode("debug") then
add_defines("DEBUG_MODE")
......@@ -317,14 +319,33 @@ target("infinicore_c_api")
after_build(function (target) print(YELLOW .. "[Congratulations!] Now you can install the libraries with \"xmake install\"" .. NC) end)
target_end()
target("infinicore")
target("_infinicore")
add_packages("boost")
if is_mode("debug") then
add_defines("BOOST_STACKTRACE_USE_BACKTRACE")
add_links("backtrace")
else
add_defines("BOOST_STACKTRACE_USE_NOOP")
end
set_default(false)
add_rules("python.library", {soabi = true})
add_packages("pybind11")
set_languages("cxx17")
set_kind("shared")
add_deps("infinicore_c_api")
local INFINI_ROOT = os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini")
add_includedirs(INFINI_ROOT.."/include", { public = true })
add_linkdirs(INFINI_ROOT.."/lib")
add_links("infiniop", "infinirt", "infiniccl")
add_files("src/infinicore/*.cc")
add_files("src/infinicore/context/*.cc")
add_files("src/infinicore/context/*/*.cc")
add_files("src/infinicore/tensor/*.cc")
add_files("src/infinicore/op/*/*.cc")
add_files("src/infinicore/pybind11/**.cc")
set_installdir(os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini"))
target_end()
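-- Build/install sketch (illustrative): the Python extension target defined above is not
-- built by default (set_default(false)), so it can be built and installed explicitly with
--   xmake build _infinicore
--   xmake install _infinicore
-- The install step copies the module into the directory chosen by set_installdir above
-- (INFINI_ROOT, defaulting to ~/.infini).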
......