Commit def22a08 authored by wooway777

Revert "Merge pull request #1056 from InfiniTensor/issue/1031"

This reverts commit 7f295448, reversing
changes made to e60985dc.
parent 1795b38a
......@@ -17,6 +17,7 @@ from framework import (
_TEST_CASES_DATA = [
((13, 4), None, -1.0, 1.0),
((13, 4), (10, 1), -0.5, 0.5),
((8, 8, 8), None, -2.0, 2.0),
]
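# Each tuple above appears to be read as (shape, stride, min_val, max_val)
# for the hardtanh cases; this layout is inferred from the operator below
# and the shared test framework, so treat it as an assumption, not a spec.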
......@@ -86,11 +87,9 @@ class OpTest(BaseOperatorTest):
def torch_operator(self, *args, **kwargs):
return torch.nn.functional.hardtanh(*args, **kwargs)
def infinicore_operator(self, *args, **kwargs):
"""InfiniCore implementation."""
import infinicore.nn.functional as F
return F.hardtanh(*args, **kwargs)
# def infinicore_operator(self, *args, **kwargs):
# """InfiniCore implementation (operator not yet available)."""
# return infinicore.nn.functional.hardtanh(*args, **kwargs)
def main():
......
import ctypes
from ctypes import c_uint64
import torch
from libinfiniop import (
LIBINFINIOP,
InfiniDeviceNames,
InfiniDtype,
InfiniDtypeNames,
TestTensor,
TestWorkspace,
check_error,
debug,
get_args,
get_test_devices,
get_tolerance,
infiniopOperatorDescriptor_t,
profile_operation,
test_operator,
)
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
_TEST_CASES = [
# input_shape, x_stride, y_stride, kernel_size, stride, padding
((2, 3, 16), None, None, 3, None, 0),
((1, 4, 15), (60, 15, 1), (60, 15, 1), 5, 1, 2),
((2, 1, 32), None, (32, 16, 1), 2, 2, 0),
((3, 2, 7), (14, 7, 1), (9, 3, 1), 3, None, 1),
((4, 6, 31), None, None, 4, 2, 1),
((2, 8, 9), (72, 9, 1), (56, 7, 1), 3, 1, 0),
]
# Data types used for testing
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32]
# Tolerance map for different data types
_TOLERANCE_MAP = {
InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-2},
InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-2},
InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-4},
}
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
def _effective_stride(stride, kernel_size):
if stride in (None, 0):
return kernel_size
return stride
def _compute_output_shape(input_shape, kernel_size, stride, padding):
stride = _effective_stride(stride, kernel_size)
width = input_shape[2]
out_width = (width + 2 * padding - kernel_size) // stride + 1
return (input_shape[0], input_shape[1], out_width)
def avg_pool1d_ref(x, kernel_size, stride, padding):
stride = _effective_stride(stride, kernel_size)
out = torch.nn.functional.avg_pool1d(
x.to(torch.float32), kernel_size=kernel_size, stride=stride, padding=padding
)
return out.to(x.dtype)
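# Sanity-check sketch (not wired into the harness; shapes are illustrative):
# verifies that _compute_output_shape agrees with the PyTorch reference for
# the first test case, where stride=None falls back to kernel_size just as
# in torch.nn.functional.avg_pool1d.
def _check_output_shape_example():
    x = torch.randn(2, 3, 16)
    expected = _compute_output_shape((2, 3, 16), kernel_size=3, stride=None, padding=0)
    got = tuple(avg_pool1d_ref(x, kernel_size=3, stride=None, padding=0).shape)
    # (16 + 2*0 - 3) // 3 + 1 == 5, so both sides yield (2, 3, 5)
    assert got == expected == (2, 3, 5)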
def test(
handle,
device,
input_shape,
x_stride,
y_stride,
kernel_size,
stride,
padding,
dtype=InfiniDtype.F16,
sync=None,
):
stride_value = _effective_stride(stride, kernel_size)
out_shape = _compute_output_shape(
input_shape, kernel_size, stride_value, padding
)
print(
f"Testing AvgPool1d on {InfiniDeviceNames[device]} with input_shape:{input_shape}, "
f"output_shape:{out_shape}, kernel_size:{kernel_size}, stride:{stride_value}, "
f"padding:{padding}, dtype:{InfiniDtypeNames[dtype]}"
)
x = TestTensor(input_shape, x_stride, dtype, device)
y = TestTensor(out_shape, y_stride, dtype, device, mode="zeros")
ans = avg_pool1d_ref(x.torch_tensor(), kernel_size, stride_value, padding)
if sync is not None:
sync()
descriptor = infiniopOperatorDescriptor_t()
check_error(
LIBINFINIOP.infiniopCreateAvgPool1dDescriptor(
handle,
ctypes.byref(descriptor),
y.descriptor,
x.descriptor,
kernel_size,
stride_value,
padding,
)
)
# Invalidate descriptors in tensors after creation to make sure kernels read from arguments
x.destroy_desc()
y.destroy_desc()
workspace_size = c_uint64(0)
check_error(
LIBINFINIOP.infiniopGetAvgPool1dWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = TestWorkspace(workspace_size.value, x.device)
def lib_avg_pool1d():
check_error(
LIBINFINIOP.infiniopAvgPool1d(
descriptor,
workspace.data(),
workspace.size(),
y.data(),
x.data(),
None,
)
)
lib_avg_pool1d()
if sync is not None:
sync()
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(y.actual_tensor(), ans, atol=atol, rtol=rtol)
assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol)
if PROFILE:
# fmt: off
profile_operation(
"PyTorch",
lambda: avg_pool1d_ref(x.torch_tensor(), kernel_size, stride_value, padding),
device,
NUM_PRERUN,
NUM_ITERATIONS,
)
profile_operation(
" lib",
lambda: lib_avg_pool1d(),
device,
NUM_PRERUN,
NUM_ITERATIONS,
)
# fmt: on
check_error(LIBINFINIOP.infiniopDestroyAvgPool1dDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
DEBUG = args.debug
PROFILE = args.profile
NUM_PRERUN = args.num_prerun
NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args):
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
import torch
import ctypes
from ctypes import c_uint64
from libinfiniop import (
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
test_operator,
get_args,
get_tolerance,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
# ------------------------------------------------------------
# Test case configuration
# ------------------------------------------------------------
_TEST_CASES_ = [
((2, 4, 10), None, None), # logits shape, x_stride, y_stride
((1, 128, 32000), None, None),
((4, 512, 1000), None, None),
]
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32]
_TOLERANCE_MAP = {
InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-2},
InfiniDtype.BF16: {"atol": 1e-2, "rtol": 2e-2},
InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5},
}
# ------------------------------------------------------------
# PyTorch reference implementation
# ------------------------------------------------------------
def cross_entropy_ref(logits, target):
vocab = logits.shape[-1]
logits_flat = logits.reshape(-1, vocab).float()
target_flat = target.reshape(-1).long()
loss = torch.nn.functional.cross_entropy(logits_flat, target_flat, reduction="none")
return loss.view(target.shape).to(logits.dtype)
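# Equivalence sketch (illustrative, not used by the harness): per-token
# cross entropy with reduction="none" equals the negative log-softmax
# probability of the target class, which is the quantity the kernel is
# compared against.
def _cross_entropy_manual(logits, target):
    log_probs = torch.log_softmax(logits.float(), dim=-1)
    picked = log_probs.gather(-1, target.long().unsqueeze(-1)).squeeze(-1)
    return (-picked).to(logits.dtype)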
def test(handle, device, shape, x_stride=None, y_stride=None, dtype=InfiniDtype.F16, sync=None):
logits_shape = shape
label_shape = shape[:-1]
vocab = shape[-1]
print(f"Testing CrossEntropy on {InfiniDeviceNames[device]} logits:{logits_shape} dtype:{InfiniDtypeNames[dtype]}")
x = TestTensor(logits_shape, x_stride, dtype, device)
target = TestTensor(label_shape, None, InfiniDtype.I64, device)
# Generate valid labels
tgt = target.torch_tensor()
tgt.copy_(torch.randint(0, vocab, label_shape, dtype=torch.int64, device=tgt.device))
target.actual_tensor().copy_(tgt)
reference = cross_entropy_ref(x.torch_tensor(), target.torch_tensor())
y = TestTensor(label_shape, y_stride, dtype, device)
descriptor = infiniopOperatorDescriptor_t()
check_error(
LIBINFINIOP.infiniopCreateCrossEntropyDescriptor(
handle, ctypes.byref(descriptor), y.descriptor, x.descriptor, target.descriptor
)
)
for tensor in [x, y, target]:
tensor.destroy_desc()
workspace_size = c_uint64(0)
check_error(LIBINFINIOP.infiniopGetCrossEntropyWorkspaceSize(descriptor, ctypes.byref(workspace_size)))
workspace = TestWorkspace(workspace_size.value, x.device)
def run():
check_error(
LIBINFINIOP.infiniopCrossEntropy(
descriptor,
workspace.data(),
workspace.size(),
y.data(),
x.data(),
target.data(),
None,
)
)
run()
if sync:
sync()
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
assert torch.allclose(y.actual_tensor(), reference, atol=atol, rtol=rtol)
check_error(LIBINFINIOP.infiniopDestroyCrossEntropyDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
for device in get_test_devices(args):
test_operator(device, test, _TEST_CASES_, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
import torch
import ctypes
from ctypes import c_uint64
from libinfiniop import (
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
from enum import Enum, auto
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
_TEST_CASES_ = [
# shape, a_stride, b_stride, c_stride
((13, 4), None, None, None),
((13, 4), (10, 1), (10, 1), (10, 1)),
((13, 4), (0, 1), None, None),
((13, 4, 4), None, None, None),
((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)),
((13, 4, 4), (4, 0, 1), (0, 4, 1), None),
((16, 5632), None, None, None),
((16, 5632), (13312, 1), (13312, 1), (13312, 1)),
((13, 16, 2), (128, 4, 1), (0, 2, 1), (64, 4, 1)),
((13, 16, 2), (128, 4, 1), (2, 0, 1), (64, 4, 1)),
((4, 4, 5632), None, None, None),
((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)),
]
# The Equal operator is typically out-of-place only (float input vs. bool output differ in memory size)
class Inplace(Enum):
OUT_OF_PLACE = auto()
_INPLACE = [
Inplace.OUT_OF_PLACE,
]
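# Illustration of the size mismatch noted above (a sketch, not used by the
# harness): a float16 element occupies 2 bytes while a bool element occupies
# 1 byte, so the output buffer cannot alias a float input in place.
def _element_size_note():
    assert torch.empty((), dtype=torch.float16).element_size() == 2
    assert torch.empty((), dtype=torch.bool).element_size() == 1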
_TEST_CASES = [
test_case + (inplace_item,)
for test_case in _TEST_CASES_
for inplace_item in _INPLACE
]
# Input data types under test
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16, InfiniDtype.I32, InfiniDtype.I64]
# Tolerance settings (bool comparisons require an exact match)
_TOLERANCE_MAP = {
InfiniDtype.F16: {"atol": 0, "rtol": 0},
InfiniDtype.F32: {"atol": 0, "rtol": 0},
InfiniDtype.BF16: {"atol": 0, "rtol": 0},
InfiniDtype.I32: {"atol": 0, "rtol": 0},
InfiniDtype.I64: {"atol": 0, "rtol": 0},
InfiniDtype.BOOL: {"atol": 0, "rtol": 0},
}
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
# PyTorch reference implementation
def equal_func(c, a, b):
torch.eq(a, b, out=c)
def test(
handle,
device,
shape,
a_stride=None,
b_stride=None,
c_stride=None,
inplace=Inplace.OUT_OF_PLACE,
dtype=InfiniDtype.F16,
sync=None,
):
# Input tensors use the requested dtype (e.g. float16)
a = TestTensor(shape, a_stride, dtype, device)
b = TestTensor(shape, b_stride, dtype, device)
# [Key change] The output tensor is forced to BOOL dtype
# Note: if c_stride were computed in bytes, a bool element is typically 1 byte
c = TestTensor(shape, c_stride, InfiniDtype.BOOL, device)
if c.is_broadcast():
return
print(
f"Testing Equal on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} "
f"input_dtype:{InfiniDtypeNames[dtype]} output_dtype:BOOL"
)
# Run the PyTorch reference as the control
equal_func(c.torch_tensor(), a.torch_tensor(), b.torch_tensor())
if sync is not None:
sync()
descriptor = infiniopOperatorDescriptor_t()
# [Key change] Call the Equal descriptor-creation function
check_error(
LIBINFINIOP.infiniopCreateEqualDescriptor(
handle,
ctypes.byref(descriptor),
c.descriptor, # Output (Bool)
a.descriptor, # Input A
b.descriptor, # Input B
)
)
# Invalidate descriptors
for tensor in [a, b, c]:
tensor.destroy_desc()
workspace_size = c_uint64(0)
check_error(
LIBINFINIOP.infiniopGetEqualWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = TestWorkspace(workspace_size.value, c.device)
def lib_equal():
check_error(
LIBINFINIOP.infiniopEqual(
descriptor,
workspace.data(),
workspace.size(),
c.data(),
a.data(),
b.data(),
None,
)
)
lib_equal()
# Use the BOOL tolerance (effectively exact equality)
atol, rtol = get_tolerance(_TOLERANCE_MAP, InfiniDtype.BOOL)
if DEBUG:
debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol)
# Verify the result
assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol)
# Profiling workflow
if PROFILE:
# fmt: off
profile_operation("PyTorch", lambda: equal_func(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_equal(), device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on
check_error(LIBINFINIOP.infiniopDestroyEqualDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
# Configure testing options
DEBUG = args.debug
PROFILE = args.profile
NUM_PRERUN = args.num_prerun
NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args):
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
import torch
import ctypes
from ctypes import c_uint64
from libinfiniop import (
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
from enum import Enum, auto
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
# Reuse the same test-case configuration, since HardSwish is also element-wise
_TEST_CASES_ = [
# shape, input_stride, output_stride
((13, 4), None, None),
((13, 4), (10, 1), (10, 1)),
((13, 4), (0, 1), None),
((13, 4, 4), None, None),
((13, 4, 4), (20, 4, 1), (20, 4, 1)),
((13, 4, 4), (4, 0, 1), None),
((16, 5632), None, None),
((16, 5632), (13312, 1), (13312, 1)),
((4, 4, 5632), None, None),
((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)),
]
class Inplace(Enum):
OUT_OF_PLACE = auto()
INPLACE = auto()
_INPLACE = [
Inplace.OUT_OF_PLACE,
Inplace.INPLACE,
]
_TEST_CASES = [
test_case + (inplace_item,)
for test_case in _TEST_CASES_
for inplace_item in _INPLACE
]
_TENSOR_DTYPES = [InfiniDtype.BF16, InfiniDtype.F16, InfiniDtype.F32]
_TOLERANCE_MAP = {
InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2},
InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7},
InfiniDtype.F64: {"atol": 2.22e-15, "rtol": 2.22e-15},
}
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
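# For context, a manual restatement of the reference semantics (a sketch;
# the test itself calls torch.nn.functional.hardswish directly):
# hardswish(x) = x * relu6(x + 3) / 6.
def _hardswish_manual(x):
    return x * torch.clamp(x + 3.0, min=0.0, max=6.0) / 6.0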
def test(
handle,
device,
shape,
input_stride=None,
output_stride=None,
inplace=Inplace.OUT_OF_PLACE,
dtype=InfiniDtype.F16,
sync=None,
):
input = TestTensor(shape, input_stride, dtype, device)
if inplace == Inplace.INPLACE:
if input_stride != output_stride:
return
output = input
else:
output = TestTensor(shape, output_stride, dtype, device, mode="ones")
if output.is_broadcast():
return
print(
f"Testing HardSwish on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride}"
f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
)
new_output = torch.nn.functional.hardswish(input.torch_tensor())
output.update_torch_tensor(new_output)
if sync is not None:
sync()
descriptor = infiniopOperatorDescriptor_t()
check_error(
LIBINFINIOP.infiniopCreateHardSwishDescriptor(
handle,
ctypes.byref(descriptor),
output.descriptor,
input.descriptor,
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
for tensor in [input, output]:
tensor.destroy_desc()
workspace_size = c_uint64(0)
check_error(
LIBINFINIOP.infiniopGetHardSwishWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = TestWorkspace(workspace_size.value, output.device)
def lib_hardswish():
check_error(
LIBINFINIOP.infiniopHardSwish(
descriptor,
workspace.data(),
workspace.size(),
output.data(),
input.data(),
None,
)
)
lib_hardswish()
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol)
assert torch.allclose(
output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol
)
# Profiling workflow
if PROFILE:
# fmt: off
profile_operation("PyTorch", lambda: torch.nn.functional.hardswish(input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_hardswish(), device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on
check_error(LIBINFINIOP.infiniopDestroyHardSwishDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
# Configure testing options
DEBUG = args.debug
PROFILE = args.profile
NUM_PRERUN = args.num_prerun
NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args):
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
import torch
import ctypes
from ctypes import c_uint64, c_float
from libinfiniop import (
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
from enum import Enum, auto
# ==============================================================================
# Configuration
# ==============================================================================
_TEST_CASES_ = [
# shape, input_stride, output_stride
((13, 4), None, None),
((13, 4), (10, 1), (10, 1)),
((16, 5632), None, None),
((4, 4, 5632), None, None),
]
class Inplace(Enum):
OUT_OF_PLACE = auto()
INPLACE = auto()
_INPLACE = [
Inplace.OUT_OF_PLACE,
Inplace.INPLACE,
]
# HardTanh-specific parameter combinations (min_val, max_val)
_PARAM_CASES = [
(-1.0, 1.0),
(0.0, 6.0),  # similar to ReLU6
(-2.5, 2.5),
]
# Combine all test cases: shape + inplace + params
_TEST_CASES = [
test_case + (inplace_item, p_min, p_max)
for test_case in _TEST_CASES_
for inplace_item in _INPLACE
for p_min, p_max in _PARAM_CASES
]
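# One combined case unpacks as (shape, input_stride, output_stride, inplace,
# min_val, max_val), matching test()'s parameter order below (this assumes
# the harness splats each tuple into test() after handle and device):
# _TEST_CASES[0] == ((13, 4), None, None, Inplace.OUT_OF_PLACE, -1.0, 1.0)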
_TENSOR_DTYPES = [InfiniDtype.BF16, InfiniDtype.F16, InfiniDtype.F32]
_TOLERANCE_MAP = {
InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2},
InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7},
}
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
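# For context, a manual restatement of the reference semantics (a sketch;
# the test calls torch.nn.functional.hardtanh directly): hardtanh clamps
# element-wise to [min_val, max_val].
def _hardtanh_manual(x, min_val=-1.0, max_val=1.0):
    return torch.clamp(x, min=min_val, max=max_val)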
def test(
handle,
device,
shape,
input_stride=None,
output_stride=None,
inplace=Inplace.OUT_OF_PLACE,
min_val=-1.0,
max_val=1.0,
dtype=InfiniDtype.F16,
sync=None,
):
input = TestTensor(shape, input_stride, dtype, device)
if inplace == Inplace.INPLACE:
if input_stride != output_stride:
return
output = input
else:
output = TestTensor(shape, output_stride, dtype, device, mode="ones")
if output.is_broadcast():
return
print(
f"Testing HardTanh on {InfiniDeviceNames[device]} | shape:{shape} "
f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace} range:[{min_val}, {max_val}]"
)
# Compute the PyTorch ground truth
new_output = torch.nn.functional.hardtanh(input.torch_tensor(), min_val=min_val, max_val=max_val)
output.update_torch_tensor(new_output)
if sync is not None:
sync()
descriptor = infiniopOperatorDescriptor_t()
check_error(
LIBINFINIOP.infiniopCreateHardTanhDescriptor(
handle,
ctypes.byref(descriptor),
output.descriptor,
input.descriptor,
c_float(min_val),
c_float(max_val),
)
)
for tensor in [input, output]:
tensor.destroy_desc()
workspace_size = c_uint64(0)
check_error(
LIBINFINIOP.infiniopGetHardTanhWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = TestWorkspace(workspace_size.value, output.device)
def lib_hardtanh():
check_error(
LIBINFINIOP.infiniopHardTanh(
descriptor,
workspace.data(),
workspace.size(),
output.data(),
input.data(),
None,
)
)
lib_hardtanh()
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol)
assert torch.allclose(
output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol
)
if PROFILE:
profile_operation("PyTorch", lambda: torch.nn.functional.hardtanh(input.torch_tensor(), min_val, max_val), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_hardtanh(), device, NUM_PRERUN, NUM_ITERATIONS)
check_error(LIBINFINIOP.infiniopDestroyHardTanhDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
DEBUG = args.debug
PROFILE = args.profile
NUM_PRERUN = args.num_prerun
NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args):
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mHardTanh Test passed!\033[0m")
......@@ -54,54 +54,6 @@ def add_(lib):
infiniopOperatorDescriptor_t,
]
@OpRegister.operator
def equal_(lib):
# =========================================================
# 1. Register the Create function
# C signature: (handle, &desc, output_desc, input_a_desc, input_b_desc)
# =========================================================
lib.infiniopCreateEqualDescriptor.restype = c_int32
lib.infiniopCreateEqualDescriptor.argtypes = [
infiniopHandle_t, # handle
POINTER(infiniopOperatorDescriptor_t),  # desc_ptr (output)
infiniopTensorDescriptor_t, # output (c)
infiniopTensorDescriptor_t, # input_a
infiniopTensorDescriptor_t, # input_b
]
# =========================================================
# 2. Register the GetWorkspaceSize function
# C signature: (desc, &size)
# =========================================================
lib.infiniopGetEqualWorkspaceSize.restype = c_int32
lib.infiniopGetEqualWorkspaceSize.argtypes = [
infiniopOperatorDescriptor_t,
POINTER(c_size_t),
]
# =========================================================
# 3. Register the Execute (compute) function
# C signature: (desc, workspace, size, output_data, input_a_data, input_b_data, stream)
# =========================================================
lib.infiniopEqual.restype = c_int32
lib.infiniopEqual.argtypes = [
infiniopOperatorDescriptor_t, # desc
c_void_p, # workspace ptr
c_size_t, # workspace size
c_void_p, # output data ptr
c_void_p, # input a data ptr
c_void_p, # input b data ptr
c_void_p, # stream
]
# =========================================================
# 4. Register the Destroy function
# C signature: (desc)
# =========================================================
lib.infiniopDestroyEqualDescriptor.restype = c_int32
lib.infiniopDestroyEqualDescriptor.argtypes = [
infiniopOperatorDescriptor_t,
]
@OpRegister.operator
def attention_(lib):
......@@ -210,40 +162,6 @@ def clip_(lib):
]
@OpRegister.operator
def cross_entropy_(lib):
lib.infiniopCreateCrossEntropyDescriptor.restype = c_int32
lib.infiniopCreateCrossEntropyDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopOperatorDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopGetCrossEntropyWorkspaceSize.restype = c_int32
lib.infiniopGetCrossEntropyWorkspaceSize.argtypes = [
infiniopOperatorDescriptor_t,
POINTER(c_size_t),
]
lib.infiniopCrossEntropy.restype = c_int32
lib.infiniopCrossEntropy.argtypes = [
infiniopOperatorDescriptor_t,
c_void_p,
c_size_t,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyCrossEntropyDescriptor.restype = c_int32
lib.infiniopDestroyCrossEntropyDescriptor.argtypes = [
infiniopOperatorDescriptor_t,
]
@OpRegister.operator
def logsoftmax_(lib):
lib.infiniopCreateLogSoftmaxDescriptor.restype = c_int32
......@@ -991,112 +909,6 @@ def silu_(lib):
infiniopOperatorDescriptor_t,
]
@OpRegister.operator
def hardtanh_(lib):
# 1. Create Descriptor: note the two extra c_float parameters (min_val, max_val)
lib.infiniopCreateHardTanhDescriptor.restype = c_int32
lib.infiniopCreateHardTanhDescriptor.argtypes = [
infiniopHandle_t, # handle
POINTER(infiniopOperatorDescriptor_t), # desc_ptr
infiniopTensorDescriptor_t, # output
infiniopTensorDescriptor_t, # input
c_float, # min_val
c_float, # max_val
]
# 2. Get Workspace Size
lib.infiniopGetHardTanhWorkspaceSize.restype = c_int32
lib.infiniopGetHardTanhWorkspaceSize.argtypes = [
infiniopOperatorDescriptor_t, # desc
POINTER(c_size_t), # size
]
# 3. Execute Operator
lib.infiniopHardTanh.restype = c_int32
lib.infiniopHardTanh.argtypes = [
infiniopOperatorDescriptor_t, # desc
c_void_p, # workspace
c_size_t, # workspace_size
c_void_p, # output
c_void_p, # input
c_void_p, # stream
]
# 4. Destroy Descriptor
lib.infiniopDestroyHardTanhDescriptor.restype = c_int32
lib.infiniopDestroyHardTanhDescriptor.argtypes = [
infiniopOperatorDescriptor_t, # desc
]
@OpRegister.operator
def hardswish_(lib):
lib.infiniopCreateHardSwishDescriptor.restype = c_int32
lib.infiniopCreateHardSwishDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopOperatorDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopGetHardSwishWorkspaceSize.restype = c_int32
lib.infiniopGetHardSwishWorkspaceSize.argtypes = [
infiniopOperatorDescriptor_t,
POINTER(c_size_t),
]
lib.infiniopHardSwish.restype = c_int32
lib.infiniopHardSwish.argtypes = [
infiniopOperatorDescriptor_t,
c_void_p,
c_size_t,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyHardSwishDescriptor.restype = c_int32
lib.infiniopDestroyHardSwishDescriptor.argtypes = [
infiniopOperatorDescriptor_t,
]
@OpRegister.operator
def avg_pool1d_(lib):
# 1. Create function
# C signature: (handle, *desc, y, x, kernel_size, stride, padding)
lib.infiniopCreateAvgPool1dDescriptor.restype = c_int32
lib.infiniopCreateAvgPool1dDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopOperatorDescriptor_t),
infiniopTensorDescriptor_t, # y_desc (Output)
infiniopTensorDescriptor_t, # x_desc (Input)
c_size_t, # kernel_size
c_size_t, # stride
c_size_t, # padding
]
# 2. GetWorkspaceSize function
lib.infiniopGetAvgPool1dWorkspaceSize.restype = c_int32
lib.infiniopGetAvgPool1dWorkspaceSize.argtypes = [
infiniopOperatorDescriptor_t,
POINTER(c_size_t),
]
# 3. Execute function
lib.infiniopAvgPool1d.restype = c_int32
lib.infiniopAvgPool1d.argtypes = [
infiniopOperatorDescriptor_t,
c_void_p, # workspace
c_size_t, # workspace_size
c_void_p, # y (output pointer)
c_void_p, # x (input pointer)
c_void_p, # stream
]
# 4. Destroy function
lib.infiniopDestroyAvgPool1dDescriptor.restype = c_int32
lib.infiniopDestroyAvgPool1dDescriptor.argtypes = [
infiniopOperatorDescriptor_t,
]
@OpRegister.operator
def layer_norm_(lib):
......
......@@ -83,12 +83,8 @@ class TestTensor(CTensor):
InfiniDtype.BYTE,
InfiniDtype.BOOL,
]:
if dt == InfiniDtype.BOOL:
randint_low = 0 if randint_low is None else randint_low
randint_high = 2 if randint_high is None else randint_high
else:
randint_low = -2000000000 if randint_low is None else randint_low
randint_high = 2000000000 if randint_high is None else randint_high
randint_low = -2000000000 if randint_low is None else randint_low
randint_high = 2000000000 if randint_high is None else randint_high
self._torch_tensor = torch.randint(
randint_low,
randint_high,
......