Unverified Commit 3e5842c3 authored by PanZezhong1725's avatar PanZezhong1725 Committed by GitHub
Browse files

Merge pull request #196 from goldenfox2025/issue180

issue/180:添加clip算子
parents ce75b918 c98e68be
......@@ -6,6 +6,7 @@
#include "infiniop/ops/attention.h"
#include "infiniop/ops/avg_pool.h"
#include "infiniop/ops/causal_softmax.h"
#include "infiniop/ops/clip.h"
#include "infiniop/ops/conv.h"
#include "infiniop/ops/expand.h"
#include "infiniop/ops/gemm.h"
......
#ifndef __INFINIOP_CLIP_API_H__
#define __INFINIOP_CLIP_API_H__

#include "../operator_descriptor.h"

// Opaque handle to a clip operator descriptor.
typedef struct InfiniopDescriptor *infiniopClipDescriptor_t;

// Create a clip descriptor: y = clamp(x, min_val, max_val), elementwise.
// min_val / max_val are tensor descriptors (per-element bounds), not scalars;
// all four tensors must have the same shape.
__C __export infiniStatus_t infiniopCreateClipDescriptor(infiniopHandle_t handle,
                                                         infiniopClipDescriptor_t *desc_ptr,
                                                         infiniopTensorDescriptor_t y,
                                                         infiniopTensorDescriptor_t x,
                                                         infiniopTensorDescriptor_t min_val,
                                                         infiniopTensorDescriptor_t max_val);

// Query the scratch-memory size required by infiniopClip for this descriptor.
__C __export infiniStatus_t infiniopGetClipWorkspaceSize(infiniopClipDescriptor_t desc, size_t *size);

// Execute clip. `workspace` must be at least the size reported by
// infiniopGetClipWorkspaceSize; `stream` may be NULL for the default stream.
__C __export infiniStatus_t infiniopClip(infiniopClipDescriptor_t desc,
                                         void *workspace,
                                         size_t workspace_size,
                                         void *y,
                                         const void *x,
                                         const void *min_val,
                                         const void *max_val,
                                         void *stream);

// Release a descriptor created by infiniopCreateClipDescriptor.
__C __export infiniStatus_t infiniopDestroyClipDescriptor(infiniopClipDescriptor_t desc);

#endif
......@@ -6,10 +6,10 @@
typedef struct InfiniopDescriptor *infiniopMulDescriptor_t;
// Create a mul descriptor: c = a * b, elementwise.
// (The declaration appeared twice in the merged diff; keep a single copy.)
__C __export infiniStatus_t infiniopCreateMulDescriptor(infiniopHandle_t handle,
                                                        infiniopMulDescriptor_t *desc_ptr,
                                                        infiniopTensorDescriptor_t c,
                                                        infiniopTensorDescriptor_t a,
                                                        infiniopTensorDescriptor_t b);
__C __export infiniStatus_t infiniopGetMulWorkspaceSize(infiniopMulDescriptor_t desc, size_t *size);
......@@ -20,7 +20,7 @@ __C __export infiniStatus_t infiniopMul(infiniopMulDescriptor_t desc,
const void *a,
const void *b,
void *stream);
__C __export infiniStatus_t infiniopDestroyMulDescriptor(infiniopMulDescriptor_t desc);
#endif
......@@ -8,6 +8,7 @@
DECLARE_INFINIOP_TEST(gemm)
DECLARE_INFINIOP_TEST(random_sample)
DECLARE_INFINIOP_TEST(mul)
DECLARE_INFINIOP_TEST(clip)
DECLARE_INFINIOP_TEST(swiglu)
#define REGISTER_INFINIOP_TEST(name) \
......@@ -27,6 +28,7 @@ DECLARE_INFINIOP_TEST(swiglu)
REGISTER_INFINIOP_TEST(gemm) \
REGISTER_INFINIOP_TEST(random_sample) \
REGISTER_INFINIOP_TEST(mul) \
REGISTER_INFINIOP_TEST(clip) \
REGISTER_INFINIOP_TEST(swiglu) \
}
......
#include "ops.hpp"
#include "utils.hpp"
#include <infinirt.h>
#include <iomanip>
#include <iostream>
namespace infiniop_test::clip {
// Tensors defining one clip test case: inputs x/min_val/max_val, the output
// placeholder y, and the precomputed reference answer ans (set in build()).
struct Test::Attributes {
    std::shared_ptr<Tensor> x;
    std::shared_ptr<Tensor> min_val;  // per-element lower bound, same shape as x
    std::shared_ptr<Tensor> max_val;  // per-element upper bound, same shape as x
    std::shared_ptr<Tensor> y;
    std::shared_ptr<Tensor> ans;
};
// Construct a clip Test from the deserialized tensor map.
// Throws std::runtime_error when any required tensor is missing.
std::shared_ptr<Test> Test::build(
    std::unordered_map<std::string, std::vector<uint8_t>> attributes,
    std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors,
    double rtol, double atol) {
    auto test = std::shared_ptr<Test>(new Test(rtol, atol));
    test->_attributes = new Attributes();

    // Every operand must be present in the test file.
    for (const auto *required : {"x", "min_val", "max_val", "y", "ans"}) {
        if (tensors.find(required) == tensors.end()) {
            throw std::runtime_error("Invalid Test");
        }
    }

    test->_attributes->x = tensors["x"];
    test->_attributes->min_val = tensors["min_val"];
    test->_attributes->max_val = tensors["max_val"];
    test->_attributes->y = tensors["y"];
    test->_attributes->ans = tensors["ans"];

    return test;
}
// Run one clip test case on the given device: create the descriptor, execute
// once for correctness, then benchmark repeated executions.
// Fix: the descriptor and workspace were leaked on every failure path
// (workspace-size query, allocation, execution, and result-mismatch returns);
// they are now released before each early return.
std::shared_ptr<infiniop_test::Result> Test::run(
    infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) {
    infiniopClipDescriptor_t op_desc;
    // Copy all operand tensors to the device under test.
    auto x = _attributes->x->to(device, device_id);
    auto min_val = _attributes->min_val->to(device, device_id);
    auto max_val = _attributes->max_val->to(device, device_id);
    auto y = _attributes->y->to(device, device_id);
    CHECK_OR(infiniopCreateClipDescriptor(handle, &op_desc,
                                          y->desc(),
                                          x->desc(),
                                          min_val->desc(),
                                          max_val->desc()),
             return TEST_FAILED(OP_CREATION_FAILED, "Failed to create clip descriptor."));
    size_t workspace_size;
    CHECK_OR(infiniopGetClipWorkspaceSize(op_desc, &workspace_size),
             infiniopDestroyClipDescriptor(op_desc);
             return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size."));
    void *workspace;
    CHECK_OR(infinirtMalloc(&workspace, workspace_size),
             infiniopDestroyClipDescriptor(op_desc);
             return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace."));
    CHECK_OR(infiniopClip(op_desc, workspace, workspace_size,
                          y->data(),
                          x->data(),
                          min_val->data(),
                          max_val->data(),
                          nullptr),
             infiniopDestroyClipDescriptor(op_desc);
             infinirtFree(workspace);
             return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution."));

    // Compare the device result against the precomputed reference answer.
    try {
        allClose(y, _attributes->ans, _rtol, _atol);
    } catch (const std::exception &e) {
        infiniopDestroyClipDescriptor(op_desc);
        infinirtFree(workspace);
        return TEST_FAILED(RESULT_INCORRECT, e.what());
    }

    // Time repeated executions (warm_ups iterations excluded from the result).
    double elapsed_time = 0.;
    elapsed_time = benchmark(
        [=]() {
            infiniopClip(
                op_desc, workspace, workspace_size,
                y->data(),
                x->data(),
                min_val->data(),
                max_val->data(),
                nullptr);
        },
        warm_ups, iterations);

    infiniopDestroyClipDescriptor(op_desc);
    infinirtFree(workspace);
    return TEST_PASSED(elapsed_time);
}
// Clip carries no scalar attributes; every operand is a tensor.
std::vector<std::string> Test::attribute_names() {
    return std::vector<std::string>{};
}
// Tensor keys expected in the GGUF test file: inputs, bounds, output, answer.
std::vector<std::string> Test::tensor_names() {
    static const std::vector<std::string> kNames{"x", "min_val", "max_val", "y", "ans"};
    return kNames;
}
std::string Test::toString() const {
std::ostringstream oss;
oss << op_name() << std::endl;
oss << "- x: " << _attributes->x->info() << std::endl;
oss << "- min_val: " << _attributes->min_val->info() << std::endl;
oss << "- max_val: " << _attributes->max_val->info() << std::endl;
oss << "- y: " << _attributes->y->info() << std::endl;
oss << std::scientific << std::setprecision(2);
oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl;
return oss.str();
}
Test::~Test() {
    // _attributes is owned by this Test (allocated with new in build()).
    delete _attributes;
}
} // namespace infiniop_test::clip
#include "clip_cpu.h"
namespace op::clip::cpu {

// Defaulted out-of-line: no manually-managed resources in this descriptor.
Descriptor::~Descriptor() = default;

/// Validate operands and build a CPU clip descriptor.
/// input_desc_vec order is {x, min_val, max_val}; all shapes must equal the
/// output shape and the output dtype must be F16/F32/F64.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
    auto dtype = out_desc->dtype();

    const auto &in_desc = input_desc_vec.at(0);
    const auto &min_desc = input_desc_vec.at(1);
    const auto &max_desc = input_desc_vec.at(2);
    const auto &out_shape = out_desc->shape();
    const auto &in_shape = in_desc->shape();
    const auto &min_shape = min_desc->shape();
    const auto &max_shape = max_desc->shape();

    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
    // No broadcasting: clip requires identical shapes for all four tensors.
    CHECK_SAME_SHAPE(out_shape, in_shape);
    CHECK_SAME_SHAPE(out_shape, min_shape);
    CHECK_SAME_SHAPE(out_shape, max_shape);

    // Populates *desc_ptr (returning early on failure) via the shared
    // elementwise-descriptor macro.
    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);

    return INFINI_STATUS_SUCCESS;
}

/// Dispatch the elementwise clip computation on the dtype recorded at create().
/// NOTE(review): workspace/workspace_size are unused here and there is no size
/// check (unlike the CUDA path) — presumably the CPU kernel needs no scratch;
/// confirm against the elementwise CPU implementation.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    switch (_dtype) {
    case INFINI_DTYPE_F16:
        return _device_info->calculate<ClipOp, fp16_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<ClipOp, float>(_info, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<ClipOp, double>(_info, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    // Unreachable: every switch branch returns.
    return INFINI_STATUS_SUCCESS;
}

} // namespace op::clip::cpu
#ifndef __CLIP_CPU_H__
#define __CLIP_CPU_H__
#include "../../../elementwise/cpu/elementwise_cpu.h"
#include "infiniop/ops/clip.h"
ELEMENTWISE_DESCRIPTOR(clip, cpu)
namespace op::clip::cpu {
typedef struct ClipOp {
public:
    // Clip consumes three inputs: the value tensor plus per-element bounds.
    static constexpr size_t num_inputs = 3;

    // Saturate x into [min_val, max_val]. The upper bound is applied first,
    // so when min_val > max_val the result is min_val.
    template <typename T>
    T operator()(const T &x, const T &min_val, const T &max_val) const {
        const T upper_bounded = std::min(x, max_val);
        return std::max(upper_bounded, min_val);
    }
} ClipOp;
} // namespace op::clip::cpu
#endif // __CLIP_CPU_H__
#include "clip_cuda.cuh"
#include "clip_cuda_internal.cuh"
namespace op::clip::cuda {

// Defaulted out-of-line: no manually-managed resources in this descriptor.
Descriptor::~Descriptor() = default;

/// Validate operands and build a CUDA clip descriptor.
/// input_desc_vec order is {x, min_val, max_val}; all shapes must equal the
/// output shape and the output dtype must be F16/F32/F64.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::cuda::Handle *>(handle_);
    auto dtype = out_desc->dtype();

    const auto &in_desc = input_desc_vec.at(0);
    const auto &min_desc = input_desc_vec.at(1);
    const auto &max_desc = input_desc_vec.at(2);
    const auto &out_shape = out_desc->shape();
    const auto &in_shape = in_desc->shape();
    const auto &min_shape = min_desc->shape();
    const auto &max_shape = max_desc->shape();

    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
    // No broadcasting: clip requires identical shapes for all four tensors.
    CHECK_SAME_SHAPE(out_shape, in_shape);
    CHECK_SAME_SHAPE(out_shape, min_shape);
    CHECK_SAME_SHAPE(out_shape, max_shape);

    // Populates *desc_ptr (returning early on failure) via the shared
    // elementwise-descriptor macro.
    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);

    return INFINI_STATUS_SUCCESS;
}

/// Launch the elementwise clip kernel for the stored dtype (the 256 template
/// argument is presumably the CUDA block size — matches the elementwise API).
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    // Caller-provided workspace must cover what create() computed.
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    switch (_dtype) {
    case INFINI_DTYPE_F16:
        return _device_info->calculate<256, ClipOp, half>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<256, ClipOp, float>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<256, ClipOp, double>(_info, workspace, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    // Unreachable: every switch branch returns.
    return INFINI_STATUS_SUCCESS;
}

} // namespace op::clip::cuda
#ifndef __CLIP_CUDA_API_H__
#define __CLIP_CUDA_API_H__
#include "../../../elementwise/cuda/elementwise_cuda_api.cuh"
#include "infiniop/ops/clip.h"
ELEMENTWISE_DESCRIPTOR(clip, cuda)
#endif // __CLIP_CUDA_API_H__
#ifndef __CLIP_CUDA_H__
#define __CLIP_CUDA_H__
#include "../../../elementwise/cuda/elementwise_cuda.cuh"
#include <cuda_fp16.h>
namespace op::clip::cuda {
typedef struct ClipOp {
public:
    // Clip consumes three inputs: the value tensor plus per-element bounds.
    static constexpr size_t num_inputs = 3;

    /// Clamp x into [min_val, max_val] using the intrinsic matching T.
    /// The upper bound is applied first, so min_val wins when min_val > max_val.
    template <typename T>
    __device__ __forceinline__ T operator()(const T &x, const T &min_val, const T &max_val) const {
        if constexpr (std::is_same_v<T, half2>) {
            // Vectorized path: clamps two packed fp16 lanes per call.
            return __hmax2(__hmin2(x, max_val), min_val);
        } else if constexpr (std::is_same_v<T, half>) {
            return __hmax(__hmin(x, max_val), min_val);
        } else if constexpr (std::is_same_v<T, float>) {
            return fmaxf(fminf(x, max_val), min_val);
        } else if constexpr (std::is_same_v<T, double>) {
            return fmax(fmin(x, max_val), min_val);
        } else {
            // Generic fallback for any other comparable type.
            return std::max(std::min(x, max_val), min_val);
        }
    }
} ClipOp;
} // namespace op::clip::cuda
#endif // __CLIP_CUDA_H__
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/clip.h"
#ifdef ENABLE_CPU_API
#include "cpu/clip_cpu.h"
#endif
#ifdef ENABLE_CUDA_API
#include "cuda/clip_cuda.cuh"
#endif
/// C ABI entry: create a clip descriptor for the backend selected by `handle`.
/// Inputs are forwarded to the backend as {x, min_val, max_val}, y as output.
__C infiniStatus_t infiniopCreateClipDescriptor(
    infiniopHandle_t handle,
    infiniopClipDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t y,
    infiniopTensorDescriptor_t x,
    infiniopTensorDescriptor_t min_val,
    infiniopTensorDescriptor_t max_val) {

// Expands to one switch case per compiled backend.
#define CREATE(CASE, NAMESPACE)                                             \
    case CASE:                                                              \
        return op::clip::NAMESPACE::Descriptor::create(                     \
            handle,                                                         \
            reinterpret_cast<op::clip::NAMESPACE::Descriptor **>(desc_ptr), \
            y,                                                              \
            {x, min_val, max_val})

    switch (handle->device) {
#ifdef ENABLE_CPU_API
        CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
        CREATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CREATE
}
/// C ABI entry: report the workspace bytes infiniopClip needs for `desc`.
__C infiniStatus_t infiniopGetClipWorkspaceSize(infiniopClipDescriptor_t desc, size_t *size) {

// Expands to one switch case per compiled backend.
#define GET(CASE, NAMESPACE)                                                                \
    case CASE:                                                                              \
        *size = reinterpret_cast<op::clip::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
        return INFINI_STATUS_SUCCESS;

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        GET(INFINI_DEVICE_CPU, cpu)
#endif
#ifdef ENABLE_CUDA_API
        GET(INFINI_DEVICE_NVIDIA, cuda)
#endif
    }

#undef GET

    // Reached only when the descriptor's device has no compiled backend.
    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
/// C ABI entry: run y = clamp(x, min_val, max_val) on the descriptor's device.
/// min_val/max_val are data pointers for the per-element bound tensors.
__C infiniStatus_t infiniopClip(
    infiniopClipDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    const void *min_val,
    const void *max_val,
    void *stream) {

// Expands to one switch case per compiled backend.
#define CALCULATE(CASE, NAMESPACE)                                             \
    case CASE:                                                                 \
        return reinterpret_cast<const op::clip::NAMESPACE::Descriptor *>(desc) \
            ->calculate(workspace, workspace_size, y, {x, min_val, max_val}, stream)

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
        CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CALCULATE
}
/// C ABI entry: free a clip descriptor via its backend-specific destructor.
__C infiniStatus_t
infiniopDestroyClipDescriptor(infiniopClipDescriptor_t desc) {

// Expands to one switch case per compiled backend.
#define DELETE(CASE, NAMESPACE)                                                 \
    case CASE:                                                                  \
        delete reinterpret_cast<const op::clip::NAMESPACE::Descriptor *>(desc); \
        return INFINI_STATUS_SUCCESS;

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
        DELETE(INFINI_DEVICE_NVIDIA, cuda);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef DELETE
}
import numpy as np
import gguf
from typing import List, Optional, Tuple
from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides
def clip(
    x: np.ndarray,
    min_val: np.ndarray,
    max_val: np.ndarray,
) -> np.ndarray:
    """Reference clip: clamp every element of x into [min_val, max_val].

    The upper bound is applied first, then the lower bound, so when
    min_val > max_val the result is min_val (this intentionally differs
    from np.clip's edge behavior).

    Args:
        x: Input tensor.
        min_val: Per-element lower bounds (same shape as x).
        max_val: Per-element upper bounds (same shape as x).

    Returns:
        Clipped tensor with the same shape as x.
    """
    upper_bounded = np.minimum(x, max_val)
    return np.maximum(upper_bounded, min_val)
def random_tensor(shape, dtype):
    """Draw a uniform random tensor with values in [-2, 2).

    The cast to dtype happens before scaling (same operation order as the
    original, so seeded sequences are reproducible).

    Args:
        shape: Shape of the tensor.
        dtype: Data type of the tensor.

    Returns:
        Random tensor of the given shape and dtype.
    """
    uniform01 = np.random.rand(*shape).astype(dtype)
    return uniform01 * 4.0 - 2.0
class ClipTestCase(InfiniopTestCase):
    """One serialized test case for the Clip operator.

    Holds the input tensor, per-element min/max bound tensors, the
    (pre-filled) output placeholder, and an optional explicit stride
    for each tensor (None means contiguous).
    """
    def __init__(
        self,
        x: np.ndarray,
        x_stride: Optional[List[int]],
        min_val: np.ndarray,
        min_stride: Optional[List[int]],
        max_val: np.ndarray,
        max_stride: Optional[List[int]],
        y: np.ndarray,
        y_stride: Optional[List[int]],
    ):
        super().__init__("clip")
        self.x = x
        self.x_stride = x_stride
        self.min_val = min_val
        self.min_stride = min_stride
        self.max_val = max_val
        self.max_stride = max_stride
        self.y = y
        self.y_stride = y_stride

    def write_test(self, test_writer: "InfiniopTestWriter"):
        # Base-class hook — presumably writes shared case metadata; defined in
        # InfiniopTestCase (not visible here).
        super().write_test(test_writer)
        # Strides are serialized only when explicitly provided.
        if self.x_stride is not None:
            test_writer.add_array(test_writer.gguf_key("x.strides"), self.x_stride)
        if self.min_stride is not None:
            test_writer.add_array(test_writer.gguf_key("min_val.strides"), self.min_stride)
        if self.max_stride is not None:
            test_writer.add_array(test_writer.gguf_key("max_val.strides"), self.max_stride)
        if self.y_stride is not None:
            test_writer.add_array(test_writer.gguf_key("y.strides"), self.y_stride)
        # Operand tensors keep their native dtypes.
        test_writer.add_tensor(
            test_writer.gguf_key("x"),
            self.x,
            raw_dtype=np_dtype_to_ggml(self.x.dtype)
        )
        test_writer.add_tensor(
            test_writer.gguf_key("min_val"),
            self.min_val,
            raw_dtype=np_dtype_to_ggml(self.min_val.dtype)
        )
        test_writer.add_tensor(
            test_writer.gguf_key("max_val"),
            self.max_val,
            raw_dtype=np_dtype_to_ggml(self.max_val.dtype)
        )
        test_writer.add_tensor(
            test_writer.gguf_key("y"),
            self.y,
            raw_dtype=np_dtype_to_ggml(self.y.dtype)
        )
        # Reference answer is computed in float64 and stored as F64 regardless
        # of the operand dtype, so the checker controls the rounding.
        ans = clip(
            self.x.astype(np.float64),
            self.min_val.astype(np.float64),
            self.max_val.astype(np.float64)
        )
        test_writer.add_tensor(
            test_writer.gguf_key("ans"),
            ans,
            raw_dtype=gguf.GGMLQuantizationType.F64
        )
if __name__ == "__main__":
    test_writer = InfiniopTestWriter("clip.gguf")
    test_cases = []

    # Shapes covering 1D/2D/3D, prime dims, degenerate and large sizes.
    shapes = [
        (10,),          # 1D tensor
        (5, 10),        # 2D tensor
        (2, 3, 4),      # 3D tensor
        (7, 13),        # Prime dimensions
        (1, 1),         # Minimum shape
        (100, 100),     # Large shape
        (16, 16, 16),   # Large 3D
    ]
    # (min, max) bound pairs per case; min/max are broadcast to full tensors.
    min_max_values = [
        (-1.0, 1.0),        # Standard range
        (0.0, 2.0),         # Positive range
        (-2.0, 0.0),        # Negative range
        (-1000.0, 1000.0),  # Large range
        (-0.001, 0.001),    # Small range
        (0.0, 0.0),         # min=max
    ]
    dtypes = [np.float16, np.float32, np.float64]

    def _make_case(shape, dtype, min_value, max_value,
                   x_stride=None, min_stride=None, max_stride=None, y_stride=None):
        """Build one ClipTestCase: random x, constant bound tensors, zero y.

        Previously this construction was copy-pasted four times; factoring it
        out keeps the case layout in one place.
        """
        return ClipTestCase(
            x=random_tensor(shape, dtype),
            x_stride=x_stride,
            min_val=np.full(shape, min_value, dtype=dtype),
            min_stride=min_stride,
            max_val=np.full(shape, max_value, dtype=dtype),
            max_stride=max_stride,
            y=np.zeros(shape, dtype=dtype),
            y_stride=y_stride,
        )

    # Contiguous tensors: every shape x bound-range x dtype combination.
    for shape in shapes:
        for min_val, max_val in min_max_values:
            for dtype in dtypes:
                test_cases.append(_make_case(shape, dtype, min_val, max_val))

    # Strided layouts (2D shapes only): row-major, column-major, and
    # row-major inputs with a column-major output.
    for shape in [s for s in shapes if len(s) == 2]:
        for dtype in dtypes:
            row_stride = gguf_strides(shape[1], 1)
            col_stride = gguf_strides(1, shape[0])
            stride_combos = [
                (row_stride, row_stride),  # row-major input and output
                (col_stride, col_stride),  # column-major input and output
                (row_stride, col_stride),  # different strides for input/output
            ]
            for in_stride, out_stride in stride_combos:
                test_cases.append(
                    _make_case(
                        shape, dtype, -1.0, 1.0,
                        x_stride=in_stride,
                        min_stride=in_stride,
                        max_stride=in_stride,
                        y_stride=out_stride,
                    )
                )

    # Serialize all cases into the GGUF file.
    test_writer.add_tests(test_cases)
    test_writer.save()
    print(f"Generated {len(test_cases)} test cases for the Clip operator")
#!/usr/bin/env python3
import torch
import ctypes
from ctypes import POINTER, Structure, c_int32, c_size_t, c_uint64, c_void_p, c_float
from libinfiniop import (
infiniopHandle_t,
infiniopTensorDescriptor_t,
open_lib,
to_tensor,
get_test_devices,
check_error,
rearrange_if_needed,
create_workspace,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
)
from enum import Enum, auto
# ==============================================================================
#  Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
_TEST_CASES_ = [
    # shape, x_stride, y_stride, min_val, max_val
    # Basic shape tests
    ((10,), None, None, -1.0, 1.0),
    ((5, 10), None, None, -1.0, 1.0),
    ((2, 3, 4), None, None, -1.0, 1.0),
    # Varying min_val / max_val ranges
    ((10,), None, None, 0.0, 2.0),
    ((5, 10), None, None, 0.0, 2.0),
    ((2, 3, 4), None, None, 0.0, 2.0),
    ((10,), None, None, -2.0, 0.0),
    ((5, 10), None, None, -2.0, 0.0),
    ((2, 3, 4), None, None, -2.0, 0.0),
    # Odd shape tests
    ((7, 13), None, None, -1.0, 1.0),  # prime dimensions
    ((3, 5, 7), None, None, -1.0, 1.0),  # 3D prime dimensions
    # Non-standard shape tests
    ((1, 1), None, None, -1.0, 1.0),  # minimum shape
    ((100, 100), None, None, -1.0, 1.0),  # large shape
    ((16, 16, 16), None, None, -1.0, 1.0),  # large 3D shape
    # Extreme value tests
    ((10,), None, None, -1000.0, 1000.0),  # wide range
    ((10,), None, None, -0.001, 0.001),  # narrow range
    ((10,), None, None, 0.0, 0.0),  # min == max
    # Special shape tests
    ((0,), None, None, -1.0, 1.0),  # empty tensor
    ((1, 0), None, None, -1.0, 1.0),  # empty dimension
]

# Dtypes exercised by this harness.
_TENSOR_DTYPES = [torch.float16, torch.float32]

# Per-dtype tolerances for torch.allclose comparisons.
_TOLERANCE_MAP = {
    torch.float16: {"atol": 1e-3, "rtol": 1e-3},
    torch.float32: {"atol": 1e-7, "rtol": 1e-6},
}


class Inplace(Enum):
    # Whether y is a separate tensor or aliases x.
    OUT_OF_PLACE = auto()
    INPLACE_X = auto()


_INPLACE = [
    Inplace.INPLACE_X,
    Inplace.OUT_OF_PLACE,
]

# Cross product of the base cases with each in-place mode.
_TEST_CASES = [
    test_case + (inplace_item,)
    for test_case in _TEST_CASES_
    for inplace_item in _INPLACE
]

# Defaults; overridden from CLI flags in __main__.
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class ClipDescriptor(Structure):
    # Mirrors the C InfiniopDescriptor layout: device type + device id.
    _fields_ = [("device_type", c_int32), ("device_id", c_int32)]


# Opaque pointer type matching infiniopClipDescriptor_t in the C API.
infiniopClipDescriptor_t = POINTER(ClipDescriptor)
def clip(x, min_val, max_val):
    """Reference implementation: elementwise clamp of x into [min_val, max_val]."""
    return x.clamp(min_val, max_val)
def create_tensor_with_stride(shape, stride, dtype, device):
    """Create a random tensor in [-2, 2) laid out with the requested stride.

    Only 2D row-major and column-major strides are materialized exactly; any
    other requested stride falls back to a contiguous copy of the data.

    Args:
        shape: Tensor shape.
        stride: Desired stride tuple, or None for the default layout.
        dtype: torch dtype.
        device: torch device string/object.

    Returns:
        A tensor with the requested shape (and, where supported, stride).
    """
    x = torch.rand(shape, dtype=dtype, device=device) * 4.0 - 2.0  # Range: [-2, 2)
    if stride is None:
        return x
    if len(shape) == 2 and len(stride) == 2:
        if stride == (shape[1], 1):
            # Row-major: torch.rand output is already contiguous.
            return x.contiguous()
        elif stride == (1, shape[0]):
            # Column-major: transpose -> contiguous -> transpose back yields
            # the same values with strides (1, rows).
            return x.transpose(0, 1).contiguous().transpose(0, 1)
        else:
            # Arbitrary strides are unsupported. The original code copied the
            # tensor element-by-element in Python and then called
            # .contiguous(), which is just a contiguous copy — clone() is
            # equivalent and avoids the O(rows*cols) Python loop.
            return x.clone()
    return x
def test(
    lib,
    handle,
    torch_device,
    shape,
    x_stride=None,
    y_stride=None,
    min_val=-1.0,
    max_val=1.0,
    inplace=Inplace.OUT_OF_PLACE,
    dtype=torch.float32,
):
    """Run one clip case through the library and compare against torch.clamp."""
    print(
        f"Testing Clip on {torch_device} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} "
        f"min_val:{min_val} max_val:{max_val} dtype:{dtype} inplace:{inplace}"
    )
    x = create_tensor_with_stride(shape, x_stride, dtype, torch_device)
    # Reference answer computed before y can alias (and overwrite) x.
    ans = clip(x, min_val, max_val)
    x = rearrange_if_needed(x, x_stride)
    x_tensor = to_tensor(x, lib)
    if inplace == Inplace.INPLACE_X:
        # In-place mode: the output tensor and descriptor alias the input.
        y = x
        y_tensor = x_tensor
    else:
        y = torch.zeros(shape, dtype=dtype).to(torch_device)
        y = rearrange_if_needed(y, y_stride)
        y_tensor = to_tensor(y, lib)
    descriptor = infiniopClipDescriptor_t()
    # NOTE(review): only the y and x descriptors are passed here, but the C
    # header declares infiniopCreateClipDescriptor with min_val/max_val tensor
    # descriptors as well — confirm this matches the built library's ABI.
    check_error(
        lib.infiniopCreateClipDescriptor(
            handle, ctypes.byref(descriptor), y_tensor.descriptor, x_tensor.descriptor
        )
    )
    workspace_size = c_uint64(0)
    check_error(
        lib.infiniopGetClipWorkspaceSize(
            descriptor, ctypes.byref(workspace_size)
        )
    )
    workspace = create_workspace(workspace_size.value, x.device)

    def lib_clip():
        # NOTE(review): min/max are passed by value as c_float, while the C
        # API declares `const void *min_val/max_val` (tensor data pointers) —
        # verify against include/infiniop/ops/clip.h.
        check_error(
            lib.infiniopClip(
                descriptor,
                workspace.data_ptr() if workspace is not None else None,
                workspace_size.value,
                y_tensor.data,
                x_tensor.data,
                c_float(min_val),
                c_float(max_val),
                None,
            )
        )

    lib_clip()
    # Now we can destroy the tensor descriptors
    x_tensor.destroyDesc(lib)
    if inplace != Inplace.INPLACE_X:
        y_tensor.destroyDesc(lib)
    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
    # Dump both tensors when debugging or when the comparison fails.
    if DEBUG or not torch.allclose(y, ans, atol=atol, rtol=rtol):
        print("\nExpected:")
        print(ans)
        print("\nActual:")
        print(y)
        print("\nDifference:")
        print(torch.abs(y - ans))
        print("\nMax difference:", torch.max(torch.abs(y - ans)).item())
        debug(y, ans, atol=atol, rtol=rtol)
    assert torch.allclose(y, ans, atol=atol, rtol=rtol)
    # Optional profiling of the torch reference vs the library kernel.
    if PROFILE:
        # fmt: off
        profile_operation("PyTorch", lambda: clip(x, min_val, max_val), torch_device, NUM_PRERUN, NUM_ITERATIONS)
        profile_operation(" lib", lambda: lib_clip(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
        # fmt: on
    check_error(lib.infiniopDestroyClipDescriptor(descriptor))
if __name__ == "__main__":
    args = get_args()
    lib = open_lib()
    # ctypes prototypes for the clip C API.
    lib.infiniopCreateClipDescriptor.restype = c_int32
    # NOTE(review): the C header declares six parameters (handle, desc_ptr,
    # plus y/x/min_val/max_val tensor descriptors); only four are declared
    # here — confirm against include/infiniop/ops/clip.h.
    lib.infiniopCreateClipDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopClipDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
    ]
    lib.infiniopGetClipWorkspaceSize.restype = c_int32
    lib.infiniopGetClipWorkspaceSize.argtypes = [
        infiniopClipDescriptor_t,
        POINTER(c_uint64),
    ]
    lib.infiniopClip.restype = c_int32
    # NOTE(review): min/max declared as c_float, but the C API takes
    # `const void *` data pointers for min_val/max_val — verify.
    lib.infiniopClip.argtypes = [
        infiniopClipDescriptor_t,
        c_void_p,
        c_uint64,
        c_void_p,
        c_void_p,
        c_float,
        c_float,
        c_void_p,
    ]
    lib.infiniopDestroyClipDescriptor.restype = c_int32
    lib.infiniopDestroyClipDescriptor.argtypes = [
        infiniopClipDescriptor_t,
    ]
    # Configure testing options from the CLI.
    DEBUG = args.debug
    PROFILE = args.profile
    NUM_PRERUN = args.num_prerun
    NUM_ITERATIONS = args.num_iterations
    for device in get_test_devices(args):
        test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES)
    print("\033[92mTest passed!\033[0m")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment