Unverified Commit 3e5842c3 authored by PanZezhong1725's avatar PanZezhong1725 Committed by GitHub
Browse files

Merge pull request #196 from goldenfox2025/issue180

issue/180:添加clip算子
parents ce75b918 c98e68be
......@@ -6,6 +6,7 @@
#include "infiniop/ops/attention.h"
#include "infiniop/ops/avg_pool.h"
#include "infiniop/ops/causal_softmax.h"
#include "infiniop/ops/clip.h"
#include "infiniop/ops/conv.h"
#include "infiniop/ops/expand.h"
#include "infiniop/ops/gemm.h"
......
#ifndef __INFINIOP_CLIP_API_H__
#define __INFINIOP_CLIP_API_H__

#include "../operator_descriptor.h"

// Opaque handle to a clip operator descriptor.
typedef struct InfiniopDescriptor *infiniopClipDescriptor_t;

// Create a clip descriptor: y = clamp(x, min_val, max_val), elementwise.
// min_val / max_val are tensor descriptors (per-element bounds), not scalars;
// all four tensors must have the same shape.
__C __export infiniStatus_t infiniopCreateClipDescriptor(infiniopHandle_t handle,
                                                         infiniopClipDescriptor_t *desc_ptr,
                                                         infiniopTensorDescriptor_t y,
                                                         infiniopTensorDescriptor_t x,
                                                         infiniopTensorDescriptor_t min_val,
                                                         infiniopTensorDescriptor_t max_val);

// Query the scratch-memory size required by infiniopClip for this descriptor.
__C __export infiniStatus_t infiniopGetClipWorkspaceSize(infiniopClipDescriptor_t desc, size_t *size);

// Execute clip. `workspace` must be at least the size reported by
// infiniopGetClipWorkspaceSize; `stream` may be NULL for the default stream.
__C __export infiniStatus_t infiniopClip(infiniopClipDescriptor_t desc,
                                         void *workspace,
                                         size_t workspace_size,
                                         void *y,
                                         const void *x,
                                         const void *min_val,
                                         const void *max_val,
                                         void *stream);

// Release a descriptor created by infiniopCreateClipDescriptor.
__C __export infiniStatus_t infiniopDestroyClipDescriptor(infiniopClipDescriptor_t desc);

#endif
......@@ -6,10 +6,10 @@
typedef struct InfiniopDescriptor *infiniopMulDescriptor_t;
// Create a mul descriptor: c = a * b, elementwise.
// (The declaration appeared twice in the merged diff; keep a single copy.)
__C __export infiniStatus_t infiniopCreateMulDescriptor(infiniopHandle_t handle,
                                                        infiniopMulDescriptor_t *desc_ptr,
                                                        infiniopTensorDescriptor_t c,
                                                        infiniopTensorDescriptor_t a,
                                                        infiniopTensorDescriptor_t b);
__C __export infiniStatus_t infiniopGetMulWorkspaceSize(infiniopMulDescriptor_t desc, size_t *size);
......@@ -20,7 +20,7 @@ __C __export infiniStatus_t infiniopMul(infiniopMulDescriptor_t desc,
const void *a,
const void *b,
void *stream);
__C __export infiniStatus_t infiniopDestroyMulDescriptor(infiniopMulDescriptor_t desc);
#endif
......@@ -8,6 +8,7 @@
DECLARE_INFINIOP_TEST(gemm)
DECLARE_INFINIOP_TEST(random_sample)
DECLARE_INFINIOP_TEST(mul)
DECLARE_INFINIOP_TEST(clip)
DECLARE_INFINIOP_TEST(swiglu)
#define REGISTER_INFINIOP_TEST(name) \
......@@ -27,6 +28,7 @@ DECLARE_INFINIOP_TEST(swiglu)
REGISTER_INFINIOP_TEST(gemm) \
REGISTER_INFINIOP_TEST(random_sample) \
REGISTER_INFINIOP_TEST(mul) \
REGISTER_INFINIOP_TEST(clip) \
REGISTER_INFINIOP_TEST(swiglu) \
}
......
#include "ops.hpp"
#include "utils.hpp"
#include <infinirt.h>
#include <iomanip>
#include <iostream>
namespace infiniop_test::clip {
// Tensors defining one clip test case: inputs x/min_val/max_val, the output
// placeholder y, and the precomputed reference answer ans (set in build()).
struct Test::Attributes {
    std::shared_ptr<Tensor> x;
    std::shared_ptr<Tensor> min_val;  // per-element lower bound, same shape as x
    std::shared_ptr<Tensor> max_val;  // per-element upper bound, same shape as x
    std::shared_ptr<Tensor> y;
    std::shared_ptr<Tensor> ans;
};
// Construct a clip Test from the deserialized tensor map.
// Throws std::runtime_error when any required tensor is missing.
std::shared_ptr<Test> Test::build(
    std::unordered_map<std::string, std::vector<uint8_t>> attributes,
    std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors,
    double rtol, double atol) {
    auto test = std::shared_ptr<Test>(new Test(rtol, atol));
    test->_attributes = new Attributes();

    // Every operand must be present in the test file.
    for (const auto *required : {"x", "min_val", "max_val", "y", "ans"}) {
        if (tensors.find(required) == tensors.end()) {
            throw std::runtime_error("Invalid Test");
        }
    }

    test->_attributes->x = tensors["x"];
    test->_attributes->min_val = tensors["min_val"];
    test->_attributes->max_val = tensors["max_val"];
    test->_attributes->y = tensors["y"];
    test->_attributes->ans = tensors["ans"];

    return test;
}
// Run one clip test case on the given device: create the descriptor, execute
// once for correctness, then benchmark repeated executions.
// Fix: the descriptor and workspace were leaked on every failure path
// (workspace-size query, allocation, execution, and result-mismatch returns);
// they are now released before each early return.
std::shared_ptr<infiniop_test::Result> Test::run(
    infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) {
    infiniopClipDescriptor_t op_desc;
    // Copy all operand tensors to the device under test.
    auto x = _attributes->x->to(device, device_id);
    auto min_val = _attributes->min_val->to(device, device_id);
    auto max_val = _attributes->max_val->to(device, device_id);
    auto y = _attributes->y->to(device, device_id);
    CHECK_OR(infiniopCreateClipDescriptor(handle, &op_desc,
                                          y->desc(),
                                          x->desc(),
                                          min_val->desc(),
                                          max_val->desc()),
             return TEST_FAILED(OP_CREATION_FAILED, "Failed to create clip descriptor."));
    size_t workspace_size;
    CHECK_OR(infiniopGetClipWorkspaceSize(op_desc, &workspace_size),
             infiniopDestroyClipDescriptor(op_desc);
             return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size."));
    void *workspace;
    CHECK_OR(infinirtMalloc(&workspace, workspace_size),
             infiniopDestroyClipDescriptor(op_desc);
             return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace."));
    CHECK_OR(infiniopClip(op_desc, workspace, workspace_size,
                          y->data(),
                          x->data(),
                          min_val->data(),
                          max_val->data(),
                          nullptr),
             infiniopDestroyClipDescriptor(op_desc);
             infinirtFree(workspace);
             return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution."));

    // Compare the device result against the precomputed reference answer.
    try {
        allClose(y, _attributes->ans, _rtol, _atol);
    } catch (const std::exception &e) {
        infiniopDestroyClipDescriptor(op_desc);
        infinirtFree(workspace);
        return TEST_FAILED(RESULT_INCORRECT, e.what());
    }

    // Time repeated executions (warm_ups iterations excluded from the result).
    double elapsed_time = 0.;
    elapsed_time = benchmark(
        [=]() {
            infiniopClip(
                op_desc, workspace, workspace_size,
                y->data(),
                x->data(),
                min_val->data(),
                max_val->data(),
                nullptr);
        },
        warm_ups, iterations);

    infiniopDestroyClipDescriptor(op_desc);
    infinirtFree(workspace);
    return TEST_PASSED(elapsed_time);
}
// Clip carries no scalar attributes; every operand is a tensor.
std::vector<std::string> Test::attribute_names() {
    return std::vector<std::string>{};
}
// Tensor keys expected in the GGUF test file: inputs, bounds, output, answer.
std::vector<std::string> Test::tensor_names() {
    static const std::vector<std::string> kNames{"x", "min_val", "max_val", "y", "ans"};
    return kNames;
}
std::string Test::toString() const {
std::ostringstream oss;
oss << op_name() << std::endl;
oss << "- x: " << _attributes->x->info() << std::endl;
oss << "- min_val: " << _attributes->min_val->info() << std::endl;
oss << "- max_val: " << _attributes->max_val->info() << std::endl;
oss << "- y: " << _attributes->y->info() << std::endl;
oss << std::scientific << std::setprecision(2);
oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl;
return oss.str();
}
Test::~Test() {
    // _attributes is owned by this Test (allocated with new in build()).
    delete _attributes;
}
} // namespace infiniop_test::clip
#include "clip_cpu.h"
namespace op::clip::cpu {

// Defaulted out-of-line: no manually-managed resources in this descriptor.
Descriptor::~Descriptor() = default;

/// Validate operands and build a CPU clip descriptor.
/// input_desc_vec order is {x, min_val, max_val}; all shapes must equal the
/// output shape and the output dtype must be F16/F32/F64.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
    auto dtype = out_desc->dtype();

    const auto &in_desc = input_desc_vec.at(0);
    const auto &min_desc = input_desc_vec.at(1);
    const auto &max_desc = input_desc_vec.at(2);
    const auto &out_shape = out_desc->shape();
    const auto &in_shape = in_desc->shape();
    const auto &min_shape = min_desc->shape();
    const auto &max_shape = max_desc->shape();

    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
    // No broadcasting: clip requires identical shapes for all four tensors.
    CHECK_SAME_SHAPE(out_shape, in_shape);
    CHECK_SAME_SHAPE(out_shape, min_shape);
    CHECK_SAME_SHAPE(out_shape, max_shape);

    // Populates *desc_ptr (returning early on failure) via the shared
    // elementwise-descriptor macro.
    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);

    return INFINI_STATUS_SUCCESS;
}

/// Dispatch the elementwise clip computation on the dtype recorded at create().
/// NOTE(review): workspace/workspace_size are unused here and there is no size
/// check (unlike the CUDA path) — presumably the CPU kernel needs no scratch;
/// confirm against the elementwise CPU implementation.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    switch (_dtype) {
    case INFINI_DTYPE_F16:
        return _device_info->calculate<ClipOp, fp16_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<ClipOp, float>(_info, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<ClipOp, double>(_info, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    // Unreachable: every switch branch returns.
    return INFINI_STATUS_SUCCESS;
}

} // namespace op::clip::cpu
#ifndef __CLIP_CPU_H__
#define __CLIP_CPU_H__
#include "../../../elementwise/cpu/elementwise_cpu.h"
#include "infiniop/ops/clip.h"
ELEMENTWISE_DESCRIPTOR(clip, cpu)
namespace op::clip::cpu {
typedef struct ClipOp {
public:
    // Clip consumes three inputs: the value tensor plus per-element bounds.
    static constexpr size_t num_inputs = 3;

    // Saturate x into [min_val, max_val]. The upper bound is applied first,
    // so when min_val > max_val the result is min_val.
    template <typename T>
    T operator()(const T &x, const T &min_val, const T &max_val) const {
        const T upper_bounded = std::min(x, max_val);
        return std::max(upper_bounded, min_val);
    }
} ClipOp;
} // namespace op::clip::cpu
#endif // __CLIP_CPU_H__
#include "clip_cuda.cuh"
#include "clip_cuda_internal.cuh"
namespace op::clip::cuda {

// Defaulted out-of-line: no manually-managed resources in this descriptor.
Descriptor::~Descriptor() = default;

/// Validate operands and build a CUDA clip descriptor.
/// input_desc_vec order is {x, min_val, max_val}; all shapes must equal the
/// output shape and the output dtype must be F16/F32/F64.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::cuda::Handle *>(handle_);
    auto dtype = out_desc->dtype();

    const auto &in_desc = input_desc_vec.at(0);
    const auto &min_desc = input_desc_vec.at(1);
    const auto &max_desc = input_desc_vec.at(2);
    const auto &out_shape = out_desc->shape();
    const auto &in_shape = in_desc->shape();
    const auto &min_shape = min_desc->shape();
    const auto &max_shape = max_desc->shape();

    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
    // No broadcasting: clip requires identical shapes for all four tensors.
    CHECK_SAME_SHAPE(out_shape, in_shape);
    CHECK_SAME_SHAPE(out_shape, min_shape);
    CHECK_SAME_SHAPE(out_shape, max_shape);

    // Populates *desc_ptr (returning early on failure) via the shared
    // elementwise-descriptor macro.
    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);

    return INFINI_STATUS_SUCCESS;
}

/// Launch the elementwise clip kernel for the stored dtype (the 256 template
/// argument is presumably the CUDA block size — matches the elementwise API).
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    // Caller-provided workspace must cover what create() computed.
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    switch (_dtype) {
    case INFINI_DTYPE_F16:
        return _device_info->calculate<256, ClipOp, half>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<256, ClipOp, float>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<256, ClipOp, double>(_info, workspace, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    // Unreachable: every switch branch returns.
    return INFINI_STATUS_SUCCESS;
}

} // namespace op::clip::cuda
#ifndef __CLIP_CUDA_API_H__
#define __CLIP_CUDA_API_H__
#include "../../../elementwise/cuda/elementwise_cuda_api.cuh"
#include "infiniop/ops/clip.h"
ELEMENTWISE_DESCRIPTOR(clip, cuda)
#endif // __CLIP_CUDA_API_H__
#ifndef __CLIP_CUDA_H__
#define __CLIP_CUDA_H__
#include "../../../elementwise/cuda/elementwise_cuda.cuh"
#include <cuda_fp16.h>
namespace op::clip::cuda {
typedef struct ClipOp {
public:
    // Clip consumes three inputs: the value tensor plus per-element bounds.
    static constexpr size_t num_inputs = 3;

    /// Clamp x into [min_val, max_val] using the intrinsic matching T.
    /// The upper bound is applied first, so min_val wins when min_val > max_val.
    template <typename T>
    __device__ __forceinline__ T operator()(const T &x, const T &min_val, const T &max_val) const {
        if constexpr (std::is_same_v<T, half2>) {
            // Vectorized path: clamps two packed fp16 lanes per call.
            return __hmax2(__hmin2(x, max_val), min_val);
        } else if constexpr (std::is_same_v<T, half>) {
            return __hmax(__hmin(x, max_val), min_val);
        } else if constexpr (std::is_same_v<T, float>) {
            return fmaxf(fminf(x, max_val), min_val);
        } else if constexpr (std::is_same_v<T, double>) {
            return fmax(fmin(x, max_val), min_val);
        } else {
            // Generic fallback for any other comparable type.
            return std::max(std::min(x, max_val), min_val);
        }
    }
} ClipOp;
} // namespace op::clip::cuda
#endif // __CLIP_CUDA_H__
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/clip.h"
#ifdef ENABLE_CPU_API
#include "cpu/clip_cpu.h"
#endif
#ifdef ENABLE_CUDA_API
#include "cuda/clip_cuda.cuh"
#endif
/// C ABI entry: create a clip descriptor for the backend selected by `handle`.
/// Inputs are forwarded to the backend as {x, min_val, max_val}, y as output.
__C infiniStatus_t infiniopCreateClipDescriptor(
    infiniopHandle_t handle,
    infiniopClipDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t y,
    infiniopTensorDescriptor_t x,
    infiniopTensorDescriptor_t min_val,
    infiniopTensorDescriptor_t max_val) {

// Expands to one switch case per compiled backend.
#define CREATE(CASE, NAMESPACE)                                             \
    case CASE:                                                              \
        return op::clip::NAMESPACE::Descriptor::create(                     \
            handle,                                                         \
            reinterpret_cast<op::clip::NAMESPACE::Descriptor **>(desc_ptr), \
            y,                                                              \
            {x, min_val, max_val})

    switch (handle->device) {
#ifdef ENABLE_CPU_API
        CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
        CREATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CREATE
}
/// C ABI entry: report the workspace bytes infiniopClip needs for `desc`.
__C infiniStatus_t infiniopGetClipWorkspaceSize(infiniopClipDescriptor_t desc, size_t *size) {

// Expands to one switch case per compiled backend.
#define GET(CASE, NAMESPACE)                                                                \
    case CASE:                                                                              \
        *size = reinterpret_cast<op::clip::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
        return INFINI_STATUS_SUCCESS;

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        GET(INFINI_DEVICE_CPU, cpu)
#endif
#ifdef ENABLE_CUDA_API
        GET(INFINI_DEVICE_NVIDIA, cuda)
#endif
    }

#undef GET

    // Reached only when the descriptor's device has no compiled backend.
    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
/// C ABI entry: run y = clamp(x, min_val, max_val) on the descriptor's device.
/// min_val/max_val are data pointers for the per-element bound tensors.
__C infiniStatus_t infiniopClip(
    infiniopClipDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    const void *min_val,
    const void *max_val,
    void *stream) {

// Expands to one switch case per compiled backend.
#define CALCULATE(CASE, NAMESPACE)                                             \
    case CASE:                                                                 \
        return reinterpret_cast<const op::clip::NAMESPACE::Descriptor *>(desc) \
            ->calculate(workspace, workspace_size, y, {x, min_val, max_val}, stream)

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
        CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CALCULATE
}
/// C ABI entry: free a clip descriptor via its backend-specific destructor.
__C infiniStatus_t
infiniopDestroyClipDescriptor(infiniopClipDescriptor_t desc) {

// Expands to one switch case per compiled backend.
#define DELETE(CASE, NAMESPACE)                                                 \
    case CASE:                                                                  \
        delete reinterpret_cast<const op::clip::NAMESPACE::Descriptor *>(desc); \
        return INFINI_STATUS_SUCCESS;

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
        DELETE(INFINI_DEVICE_NVIDIA, cuda);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef DELETE
}
import numpy as np
import gguf
from typing import List, Optional, Tuple
from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides
def clip(
    x: np.ndarray,
    min_val: np.ndarray,
    max_val: np.ndarray,
) -> np.ndarray:
    """Reference clip: clamp every element of x into [min_val, max_val].

    The upper bound is applied first, then the lower bound, so when
    min_val > max_val the result is min_val (this intentionally differs
    from np.clip's edge behavior).

    Args:
        x: Input tensor.
        min_val: Per-element lower bounds (same shape as x).
        max_val: Per-element upper bounds (same shape as x).

    Returns:
        Clipped tensor with the same shape as x.
    """
    upper_bounded = np.minimum(x, max_val)
    return np.maximum(upper_bounded, min_val)
def random_tensor(shape, dtype):
    """Draw a uniform random tensor with values in [-2, 2).

    The cast to dtype happens before scaling (same operation order as the
    original, so seeded sequences are reproducible).

    Args:
        shape: Shape of the tensor.
        dtype: Data type of the tensor.

    Returns:
        Random tensor of the given shape and dtype.
    """
    uniform01 = np.random.rand(*shape).astype(dtype)
    return uniform01 * 4.0 - 2.0
class ClipTestCase(InfiniopTestCase):
    """One serialized test case for the Clip operator.

    Holds the input tensor, per-element min/max bound tensors, the
    (pre-filled) output placeholder, and an optional explicit stride
    for each tensor (None means contiguous).
    """
    def __init__(
        self,
        x: np.ndarray,
        x_stride: Optional[List[int]],
        min_val: np.ndarray,
        min_stride: Optional[List[int]],
        max_val: np.ndarray,
        max_stride: Optional[List[int]],
        y: np.ndarray,
        y_stride: Optional[List[int]],
    ):
        super().__init__("clip")
        self.x = x
        self.x_stride = x_stride
        self.min_val = min_val
        self.min_stride = min_stride
        self.max_val = max_val
        self.max_stride = max_stride
        self.y = y
        self.y_stride = y_stride

    def write_test(self, test_writer: "InfiniopTestWriter"):
        # Base-class hook — presumably writes shared case metadata; defined in
        # InfiniopTestCase (not visible here).
        super().write_test(test_writer)
        # Strides are serialized only when explicitly provided.
        if self.x_stride is not None:
            test_writer.add_array(test_writer.gguf_key("x.strides"), self.x_stride)
        if self.min_stride is not None:
            test_writer.add_array(test_writer.gguf_key("min_val.strides"), self.min_stride)
        if self.max_stride is not None:
            test_writer.add_array(test_writer.gguf_key("max_val.strides"), self.max_stride)
        if self.y_stride is not None:
            test_writer.add_array(test_writer.gguf_key("y.strides"), self.y_stride)
        # Operand tensors keep their native dtypes.
        test_writer.add_tensor(
            test_writer.gguf_key("x"),
            self.x,
            raw_dtype=np_dtype_to_ggml(self.x.dtype)
        )
        test_writer.add_tensor(
            test_writer.gguf_key("min_val"),
            self.min_val,
            raw_dtype=np_dtype_to_ggml(self.min_val.dtype)
        )
        test_writer.add_tensor(
            test_writer.gguf_key("max_val"),
            self.max_val,
            raw_dtype=np_dtype_to_ggml(self.max_val.dtype)
        )
        test_writer.add_tensor(
            test_writer.gguf_key("y"),
            self.y,
            raw_dtype=np_dtype_to_ggml(self.y.dtype)
        )
        # Reference answer is computed in float64 and stored as F64 regardless
        # of the operand dtype, so the checker controls the rounding.
        ans = clip(
            self.x.astype(np.float64),
            self.min_val.astype(np.float64),
            self.max_val.astype(np.float64)
        )
        test_writer.add_tensor(
            test_writer.gguf_key("ans"),
            ans,
            raw_dtype=gguf.GGMLQuantizationType.F64
        )
if __name__ == "__main__":
    test_writer = InfiniopTestWriter("clip.gguf")
    test_cases = []

    # Shapes covering 1D/2D/3D, prime dims, degenerate and large sizes.
    shapes = [
        (10,),          # 1D tensor
        (5, 10),        # 2D tensor
        (2, 3, 4),      # 3D tensor
        (7, 13),        # Prime dimensions
        (1, 1),         # Minimum shape
        (100, 100),     # Large shape
        (16, 16, 16),   # Large 3D
    ]
    # (min, max) bound pairs per case; min/max are broadcast to full tensors.
    min_max_values = [
        (-1.0, 1.0),        # Standard range
        (0.0, 2.0),         # Positive range
        (-2.0, 0.0),        # Negative range
        (-1000.0, 1000.0),  # Large range
        (-0.001, 0.001),    # Small range
        (0.0, 0.0),         # min=max
    ]
    dtypes = [np.float16, np.float32, np.float64]

    def _make_case(shape, dtype, min_value, max_value,
                   x_stride=None, min_stride=None, max_stride=None, y_stride=None):
        """Build one ClipTestCase: random x, constant bound tensors, zero y.

        Previously this construction was copy-pasted four times; factoring it
        out keeps the case layout in one place.
        """
        return ClipTestCase(
            x=random_tensor(shape, dtype),
            x_stride=x_stride,
            min_val=np.full(shape, min_value, dtype=dtype),
            min_stride=min_stride,
            max_val=np.full(shape, max_value, dtype=dtype),
            max_stride=max_stride,
            y=np.zeros(shape, dtype=dtype),
            y_stride=y_stride,
        )

    # Contiguous tensors: every shape x bound-range x dtype combination.
    for shape in shapes:
        for min_val, max_val in min_max_values:
            for dtype in dtypes:
                test_cases.append(_make_case(shape, dtype, min_val, max_val))

    # Strided layouts (2D shapes only): row-major, column-major, and
    # row-major inputs with a column-major output.
    for shape in [s for s in shapes if len(s) == 2]:
        for dtype in dtypes:
            row_stride = gguf_strides(shape[1], 1)
            col_stride = gguf_strides(1, shape[0])
            stride_combos = [
                (row_stride, row_stride),  # row-major input and output
                (col_stride, col_stride),  # column-major input and output
                (row_stride, col_stride),  # different strides for input/output
            ]
            for in_stride, out_stride in stride_combos:
                test_cases.append(
                    _make_case(
                        shape, dtype, -1.0, 1.0,
                        x_stride=in_stride,
                        min_stride=in_stride,
                        max_stride=in_stride,
                        y_stride=out_stride,
                    )
                )

    # Serialize all cases into the GGUF file.
    test_writer.add_tests(test_cases)
    test_writer.save()
    print(f"Generated {len(test_cases)} test cases for the Clip operator")
#!/usr/bin/env python3
import torch
import ctypes
from ctypes import POINTER, Structure, c_int32, c_size_t, c_uint64, c_void_p, c_float
from libinfiniop import (
infiniopHandle_t,
infiniopTensorDescriptor_t,
open_lib,
to_tensor,
get_test_devices,
check_error,
rearrange_if_needed,
create_workspace,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
)
from enum import Enum, auto
# ==============================================================================
#  Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
_TEST_CASES_ = [
    # shape, x_stride, y_stride, min_val, max_val
    # Basic shape tests
    ((10,), None, None, -1.0, 1.0),
    ((5, 10), None, None, -1.0, 1.0),
    ((2, 3, 4), None, None, -1.0, 1.0),
    # Varying min_val / max_val ranges
    ((10,), None, None, 0.0, 2.0),
    ((5, 10), None, None, 0.0, 2.0),
    ((2, 3, 4), None, None, 0.0, 2.0),
    ((10,), None, None, -2.0, 0.0),
    ((5, 10), None, None, -2.0, 0.0),
    ((2, 3, 4), None, None, -2.0, 0.0),
    # Odd shape tests
    ((7, 13), None, None, -1.0, 1.0),  # prime dimensions
    ((3, 5, 7), None, None, -1.0, 1.0),  # 3D prime dimensions
    # Non-standard shape tests
    ((1, 1), None, None, -1.0, 1.0),  # minimum shape
    ((100, 100), None, None, -1.0, 1.0),  # large shape
    ((16, 16, 16), None, None, -1.0, 1.0),  # large 3D shape
    # Extreme value tests
    ((10,), None, None, -1000.0, 1000.0),  # wide range
    ((10,), None, None, -0.001, 0.001),  # narrow range
    ((10,), None, None, 0.0, 0.0),  # min == max
    # Special shape tests
    ((0,), None, None, -1.0, 1.0),  # empty tensor
    ((1, 0), None, None, -1.0, 1.0),  # empty dimension
]

# Dtypes exercised by this harness.
_TENSOR_DTYPES = [torch.float16, torch.float32]

# Per-dtype tolerances for torch.allclose comparisons.
_TOLERANCE_MAP = {
    torch.float16: {"atol": 1e-3, "rtol": 1e-3},
    torch.float32: {"atol": 1e-7, "rtol": 1e-6},
}


class Inplace(Enum):
    # Whether y is a separate tensor or aliases x.
    OUT_OF_PLACE = auto()
    INPLACE_X = auto()


_INPLACE = [
    Inplace.INPLACE_X,
    Inplace.OUT_OF_PLACE,
]

# Cross product of the base cases with each in-place mode.
_TEST_CASES = [
    test_case + (inplace_item,)
    for test_case in _TEST_CASES_
    for inplace_item in _INPLACE
]

# Defaults; overridden from CLI flags in __main__.
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class ClipDescriptor(Structure):
    # Mirrors the C InfiniopDescriptor layout: device type + device id.
    _fields_ = [("device_type", c_int32), ("device_id", c_int32)]


# Opaque pointer type matching infiniopClipDescriptor_t in the C API.
infiniopClipDescriptor_t = POINTER(ClipDescriptor)
def clip(x, min_val, max_val):
    """Reference implementation: elementwise clamp of x into [min_val, max_val]."""
    return x.clamp(min_val, max_val)
def create_tensor_with_stride(shape, stride, dtype, device):
    """Create a random tensor in [-2, 2) laid out with the requested stride.

    Only 2D row-major and column-major strides are materialized exactly; any
    other requested stride falls back to a contiguous copy of the data.

    Args:
        shape: Tensor shape.
        stride: Desired stride tuple, or None for the default layout.
        dtype: torch dtype.
        device: torch device string/object.

    Returns:
        A tensor with the requested shape (and, where supported, stride).
    """
    x = torch.rand(shape, dtype=dtype, device=device) * 4.0 - 2.0  # Range: [-2, 2)
    if stride is None:
        return x
    if len(shape) == 2 and len(stride) == 2:
        if stride == (shape[1], 1):
            # Row-major: torch.rand output is already contiguous.
            return x.contiguous()
        elif stride == (1, shape[0]):
            # Column-major: transpose -> contiguous -> transpose back yields
            # the same values with strides (1, rows).
            return x.transpose(0, 1).contiguous().transpose(0, 1)
        else:
            # Arbitrary strides are unsupported. The original code copied the
            # tensor element-by-element in Python and then called
            # .contiguous(), which is just a contiguous copy — clone() is
            # equivalent and avoids the O(rows*cols) Python loop.
            return x.clone()
    return x
def test(
    lib,
    handle,
    torch_device,
    shape,
    x_stride=None,
    y_stride=None,
    min_val=-1.0,
    max_val=1.0,
    inplace=Inplace.OUT_OF_PLACE,
    dtype=torch.float32,
):
    """Run one clip case through the library and compare against torch.clamp."""
    print(
        f"Testing Clip on {torch_device} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} "
        f"min_val:{min_val} max_val:{max_val} dtype:{dtype} inplace:{inplace}"
    )
    x = create_tensor_with_stride(shape, x_stride, dtype, torch_device)
    # Reference answer computed before y can alias (and overwrite) x.
    ans = clip(x, min_val, max_val)
    x = rearrange_if_needed(x, x_stride)
    x_tensor = to_tensor(x, lib)
    if inplace == Inplace.INPLACE_X:
        # In-place mode: the output tensor and descriptor alias the input.
        y = x
        y_tensor = x_tensor
    else:
        y = torch.zeros(shape, dtype=dtype).to(torch_device)
        y = rearrange_if_needed(y, y_stride)
        y_tensor = to_tensor(y, lib)
    descriptor = infiniopClipDescriptor_t()
    # NOTE(review): only the y and x descriptors are passed here, but the C
    # header declares infiniopCreateClipDescriptor with min_val/max_val tensor
    # descriptors as well — confirm this matches the built library's ABI.
    check_error(
        lib.infiniopCreateClipDescriptor(
            handle, ctypes.byref(descriptor), y_tensor.descriptor, x_tensor.descriptor
        )
    )
    workspace_size = c_uint64(0)
    check_error(
        lib.infiniopGetClipWorkspaceSize(
            descriptor, ctypes.byref(workspace_size)
        )
    )
    workspace = create_workspace(workspace_size.value, x.device)

    def lib_clip():
        # NOTE(review): min/max are passed by value as c_float, while the C
        # API declares `const void *min_val/max_val` (tensor data pointers) —
        # verify against include/infiniop/ops/clip.h.
        check_error(
            lib.infiniopClip(
                descriptor,
                workspace.data_ptr() if workspace is not None else None,
                workspace_size.value,
                y_tensor.data,
                x_tensor.data,
                c_float(min_val),
                c_float(max_val),
                None,
            )
        )

    lib_clip()
    # Now we can destroy the tensor descriptors
    x_tensor.destroyDesc(lib)
    if inplace != Inplace.INPLACE_X:
        y_tensor.destroyDesc(lib)
    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
    # Dump both tensors when debugging or when the comparison fails.
    if DEBUG or not torch.allclose(y, ans, atol=atol, rtol=rtol):
        print("\nExpected:")
        print(ans)
        print("\nActual:")
        print(y)
        print("\nDifference:")
        print(torch.abs(y - ans))
        print("\nMax difference:", torch.max(torch.abs(y - ans)).item())
        debug(y, ans, atol=atol, rtol=rtol)
    assert torch.allclose(y, ans, atol=atol, rtol=rtol)
    # Optional profiling of the torch reference vs the library kernel.
    if PROFILE:
        # fmt: off
        profile_operation("PyTorch", lambda: clip(x, min_val, max_val), torch_device, NUM_PRERUN, NUM_ITERATIONS)
        profile_operation(" lib", lambda: lib_clip(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
        # fmt: on
    check_error(lib.infiniopDestroyClipDescriptor(descriptor))
if __name__ == "__main__":
    args = get_args()
    lib = open_lib()
    # ctypes prototypes for the clip C API.
    lib.infiniopCreateClipDescriptor.restype = c_int32
    # NOTE(review): the C header declares six parameters (handle, desc_ptr,
    # plus y/x/min_val/max_val tensor descriptors); only four are declared
    # here — confirm against include/infiniop/ops/clip.h.
    lib.infiniopCreateClipDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopClipDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
    ]
    lib.infiniopGetClipWorkspaceSize.restype = c_int32
    lib.infiniopGetClipWorkspaceSize.argtypes = [
        infiniopClipDescriptor_t,
        POINTER(c_uint64),
    ]
    lib.infiniopClip.restype = c_int32
    # NOTE(review): min/max declared as c_float, but the C API takes
    # `const void *` data pointers for min_val/max_val — verify.
    lib.infiniopClip.argtypes = [
        infiniopClipDescriptor_t,
        c_void_p,
        c_uint64,
        c_void_p,
        c_void_p,
        c_float,
        c_float,
        c_void_p,
    ]
    lib.infiniopDestroyClipDescriptor.restype = c_int32
    lib.infiniopDestroyClipDescriptor.argtypes = [
        infiniopClipDescriptor_t,
    ]
    # Configure testing options from the CLI.
    DEBUG = args.debug
    PROFILE = args.profile
    NUM_PRERUN = args.num_prerun
    NUM_ITERATIONS = args.num_iterations
    for device in get_test_devices(args):
        test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES)
    print("\033[92mTest passed!\033[0m")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment