"superbench/vscode:/vscode.git/clone" did not exist on "bc1a61b91a03aca30d24a199ca72c97f1542a263"
Unverified commit 1635fd92, authored by PanZezhong1725, committed by GitHub

issue/440 feat: add softplus operator

parent 97f9ac7e
@@ -16,6 +16,7 @@
 #include "infiniop/ops/rms_norm.h"
 #include "infiniop/ops/rope.h"
 #include "infiniop/ops/rope_v2.h"
+#include "infiniop/ops/softplus.h"
 #include "infiniop/ops/sub.h"
 #include "infiniop/ops/swiglu.h"
 #include "infiniop/ops/topkrouter.h"
#ifndef __INFINIOP_SOFTPLUS_API_H__
#define __INFINIOP_SOFTPLUS_API_H__

#include "../operator_descriptor.h"

typedef struct InfiniopDescriptor *infiniopSoftplusDescriptor_t;

__C __export infiniStatus_t infiniopCreateSoftplusDescriptor(infiniopHandle_t handle,
                                                             infiniopSoftplusDescriptor_t *desc_ptr,
                                                             infiniopTensorDescriptor_t y,
                                                             infiniopTensorDescriptor_t x);

__C __export infiniStatus_t infiniopGetSoftplusWorkspaceSize(infiniopSoftplusDescriptor_t desc, size_t *size);

__C __export infiniStatus_t infiniopSoftplus(infiniopSoftplusDescriptor_t desc,
                                             void *workspace,
                                             size_t workspace_size,
                                             void *y,
                                             const void *x,
                                             void *stream);

__C __export infiniStatus_t infiniopDestroySoftplusDescriptor(infiniopSoftplusDescriptor_t desc);

#endif // __INFINIOP_SOFTPLUS_API_H__
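For orientation, here is a minimal usage sketch of the API declared above (editor's addition, not part of this commit): create the descriptor, query the workspace, run, destroy. It assumes an already-created handle and tensor descriptors, and std::malloc is only appropriate for the CPU backend; GPU backends need a device allocation and a real stream.

// Hedged sketch; only the four calls declared in the header above are from this diff.
#include <cstdlib>

infiniStatus_t run_softplus(infiniopHandle_t handle,
                            infiniopTensorDescriptor_t y_desc,
                            infiniopTensorDescriptor_t x_desc,
                            void *y, const void *x) {
    infiniopSoftplusDescriptor_t desc = nullptr;
    infiniStatus_t st = infiniopCreateSoftplusDescriptor(handle, &desc, y_desc, x_desc);
    if (st != INFINI_STATUS_SUCCESS) {
        return st;
    }

    size_t workspace_size = 0;
    st = infiniopGetSoftplusWorkspaceSize(desc, &workspace_size);
    if (st == INFINI_STATUS_SUCCESS) {
        void *workspace = std::malloc(workspace_size); // device memory on GPU backends
        st = infiniopSoftplus(desc, workspace, workspace_size, y, x, /*stream=*/nullptr);
        std::free(workspace);
    }

    infiniopDestroySoftplusDescriptor(desc);
    return st;
}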
@@ -24,6 +24,7 @@ def run_tests(args):
         "rope.py",
         "sub.py",
         "swiglu.py",
+        "softplus.py",
     ]:
         result = subprocess.run(
             f"python {test} {args} --debug", text=True, encoding="utf-8", shell=True
#include "softplus_cpu.h"
namespace op::softplus::cpu {
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &x_desc = input_desc_vec.at(0);
const auto &y_shape = out_desc->shape();
const auto &x_shape = x_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
CHECK_SAME_SHAPE(y_shape, x_shape);
// create CPU elementwise descriptor
CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<SoftplusOp, fp16_t>(_info, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<SoftplusOp, float>(_info, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<SoftplusOp, double>(_info, output, inputs, stream);
case INFINI_DTYPE_BF16:
return _device_info->calculate<SoftplusOp, bf16_t>(_info, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
}
} // namespace op::softplus::cpu
#ifndef __SOFTPLUS_CPU_H__
#define __SOFTPLUS_CPU_H__

#include "../../../elementwise/cpu/elementwise_cpu.h"

#include <cmath>

ELEMENTWISE_DESCRIPTOR(softplus, cpu)

namespace op::softplus::cpu {

struct SoftplusOp {
    static constexpr size_t num_inputs = 1;

    template <typename T>
    T operator()(const T &x) const {
        // softplus(x) = log(1 + exp(x)); for x > 20 the result equals x to
        // working precision, and skipping exp() avoids overflow. log1p is
        // used instead of log(1 + ...) for accuracy when exp(x) is small,
        // matching the CUDA kernel below.
        if (x > T(20)) {
            return x;
        } else {
            return T(std::log1p(std::exp(x)));
        }
    }
};

} // namespace op::softplus::cpu

#endif // __SOFTPLUS_CPU_H__
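An editor's aside on the cutoff shared by the CPU and CUDA kernels (not part of the diff). Softplus can be rewritten to expose its saturation:

    \operatorname{softplus}(x) = \log(1 + e^{x}) = x + \log(1 + e^{-x}), \qquad \log(1 + e^{-20}) \approx 2.06 \times 10^{-9}

That correction term is far below the spacing of fp32 values near 20 (about 1.9e-6), so returning x for x > 20 is exact to working precision, and it also sidesteps overflow in exp(x) for large inputs.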
#ifndef __SOFTPLUS_CUDA_H__
#define __SOFTPLUS_CUDA_H__

#include <type_traits>

namespace op::softplus::cuda {

struct SoftplusOp {
    static constexpr size_t num_inputs = 1;

    template <typename T>
    __device__ __forceinline__ T operator()(const T &x) const {
        if constexpr (std::is_same_v<T, half>) {
            // promote to float for stability, then cast back
            float xf = __half2float(x);
            float out = (xf > 20.0f) ? xf : log1pf(expf(xf));
            return __float2half(out);
        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
            float xf = __bfloat162float(x);
            float out = (xf > 20.0f) ? xf : log1pf(expf(xf));
            return __float2bfloat16(out);
        } else if constexpr (std::is_same_v<T, half2>) {
            // process as two independent lanes
            float2 xf = __half22float2(x);
            xf.x = (xf.x > 20.0f) ? xf.x : log1pf(expf(xf.x));
            xf.y = (xf.y > 20.0f) ? xf.y : log1pf(expf(xf.y));
            return __floats2half2_rn(xf.x, xf.y);
        } else {
            // default path: float, double
            return (x > T(20)) ? x : log1p(exp(x));
        }
    }
};

} // namespace op::softplus::cuda

#endif // __SOFTPLUS_CUDA_H__
#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
#include "../cuda/kernel.cuh"
#include "softplus_nvidia.cuh"
namespace op::softplus::nvidia {
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &x_desc = input_desc_vec.at(0);
const auto &y_shape = out_desc->shape();
const auto &x_shape = x_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
CHECK_SAME_SHAPE(y_shape, x_shape);
// create CUDA elementwise descriptor
CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
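    // NOTE (editor): the 256 template argument below is presumably the CUDA
    // thread-block size forwarded to the elementwise launcher.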
switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<256, cuda::SoftplusOp, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_BF16:
return _device_info->calculate<256, cuda::SoftplusOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, cuda::SoftplusOp, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, cuda::SoftplusOp, double>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
}
} // namespace op::softplus::nvidia
#ifndef __SOFTPLUS_CUDA_API_H__
#define __SOFTPLUS_CUDA_API_H__
#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
ELEMENTWISE_DESCRIPTOR(softplus, nvidia)
#endif // __SOFTPLUS_CUDA_API_H__
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/softplus.h"
#ifdef ENABLE_CPU_API
#include "cpu/softplus_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
#include "nvidia/softplus_nvidia.cuh"
#endif
#ifdef ENABLE_METAX_API
#include "metax/softplus_metax.h"
#endif
__C infiniStatus_t infiniopCreateSoftplusDescriptor(
infiniopHandle_t handle,
infiniopSoftplusDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc) {
#define CREATE(CASE, NAMESPACE) \
case CASE: \
return op::softplus::NAMESPACE::Descriptor::create( \
handle, \
reinterpret_cast<op::softplus::NAMESPACE::Descriptor **>(desc_ptr), \
y_desc, \
{x_desc})
switch (handle->device) {
#ifdef ENABLE_CPU_API
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef CREATE
}
__C infiniStatus_t infiniopGetSoftplusWorkspaceSize(infiniopSoftplusDescriptor_t desc, size_t *size) {
#define GET(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<op::softplus::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef GET
}
__C infiniStatus_t infiniopSoftplus(
infiniopSoftplusDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *y,
const void *x,
void *stream) {
#define CALCULATE(CASE, NAMESPACE) \
case CASE: \
return reinterpret_cast<const op::softplus::NAMESPACE::Descriptor *>(desc) \
->calculate(workspace, workspace_size, y, {x}, stream)
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef CALCULATE
}
__C infiniStatus_t
infiniopDestroySoftplusDescriptor(infiniopSoftplusDescriptor_t desc) {
#define DELETE(CASE, NAMESPACE) \
case CASE: \
delete reinterpret_cast<const op::softplus::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef DELETE
}
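An editor's note on the dispatch pattern: each CREATE / GET / CALCULATE / DELETE invocation above is a complete case label. For example, CREATE(INFINI_DEVICE_CPU, cpu) in infiniopCreateSoftplusDescriptor expands to:

case INFINI_DEVICE_CPU:
    return op::softplus::cpu::Descriptor::create(
        handle,
        reinterpret_cast<op::softplus::cpu::Descriptor **>(desc_ptr),
        y_desc,
        {x_desc});

Supporting another backend is therefore one guarded line per entry point. The ILUVATAR cases reuse the nvidia namespace, while the metax include at the top currently has no matching case in these switches.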
@@ -490,6 +490,7 @@ def swiglu_(lib):
         infiniopOperatorDescriptor_t,
     ]
 
+
 @OpRegister.operator
 def conv_(lib):
     lib.infiniopCreateConvDescriptor.restype = c_int32
@@ -525,7 +526,8 @@ def conv_(lib):
     lib.infiniopDestroyConvDescriptor.argtypes = [
         infiniopOperatorDescriptor_t,
     ]
 
+
 @OpRegister.operator
 def topkrouter_(lib):
     lib.infiniopCreateTopkrouterDescriptor.restype = c_int32
@@ -533,7 +535,7 @@ def topkrouter_(lib):
         infiniopHandle_t,
         POINTER(infiniopOperatorDescriptor_t),
         infiniopTensorDescriptor_t,
-        infiniopTensorDescriptor_t
+        infiniopTensorDescriptor_t,
     ]
 
     lib.infiniopGetTopkrouterWorkspaceSize.restype = c_int32
@@ -562,7 +564,6 @@ def topkrouter_(lib):
     ]
 
 
-
 @OpRegister.operator
 def dequantize_(lib):
     lib.infiniopCreateDequantizeDescriptor.restype = c_int32
@@ -595,4 +596,26 @@ def dequantize_(lib):
     lib.infiniopDestroyDequantizeDescriptor.restype = c_int32
     lib.infiniopDestroyDequantizeDescriptor.argtypes = [
         infiniopOperatorDescriptor_t,
-    ]
\ No newline at end of file
+    ]
+
+
+@OpRegister.operator
+def softplus_(lib):
+    lib.infiniopCreateSoftplusDescriptor.restype = c_int32
+    lib.infiniopCreateSoftplusDescriptor.argtypes = [
+        infiniopHandle_t,
+        POINTER(infiniopOperatorDescriptor_t),
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+    ]
+    lib.infiniopSoftplus.restype = c_int32
+    lib.infiniopSoftplus.argtypes = [
+        infiniopOperatorDescriptor_t,
+        c_void_p,
+        c_size_t,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+    ]
+    lib.infiniopDestroySoftplusDescriptor.restype = c_int32
+    lib.infiniopDestroySoftplusDescriptor.argtypes = [infiniopOperatorDescriptor_t]
@OpRegister.operator
def softplus_(lib):
lib.infiniopCreateSoftplusDescriptor.restype = c_int32
lib.infiniopCreateSoftplusDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopOperatorDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopSoftplus.restype = c_int32
lib.infiniopSoftplus.argtypes = [
infiniopOperatorDescriptor_t,
c_void_p,
c_size_t,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroySoftplusDescriptor.restype = c_int32
lib.infiniopDestroySoftplusDescriptor.argtypes = [infiniopOperatorDescriptor_t]
import ctypes
from ctypes import c_uint64
from enum import Enum, auto
import torch
from libinfiniop import (
LIBINFINIOP,
InfiniDeviceNames,
InfiniDtype,
InfiniDtypeNames,
TestTensor,
TestWorkspace,
check_error,
debug,
get_args,
get_test_devices,
get_tolerance,
infiniopOperatorDescriptor_t,
profile_operation,
test_operator,
)
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
_TEST_CASES_ = [
# tensor_shape, inplace
# TODO: Uncomment the following line.
# ((),),
((1, 3),),
((3, 3),),
((32, 20, 512),),
((33, 333, 333),),
((32, 256, 112, 112),),
((3, 3, 13, 9, 17),),
]
class Inplace(Enum):
OUT_OF_PLACE = auto()
INPLACE_X = auto()
# Inplace options applied for each test case in _TEST_CASES_
_INPLACE = [
Inplace.OUT_OF_PLACE,
Inplace.INPLACE_X,
]
# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_
_TEST_CASES = [
test_case + (inplace_item,)
for test_case in _TEST_CASES_
for inplace_item in _INPLACE
]
# Data types used for testing
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16]
# Tolerance map for different data types
_TOLERANCE_MAP = {
InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7},
InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-3},
}
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
def softplus(x):
return torch.nn.functional.softplus(x).to(x.dtype)
def test(
    handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None
):
x_torch_tensor = torch.rand(shape) * 2 - 1
x = TestTensor(
shape,
x_torch_tensor.stride(),
dtype,
device,
mode="manual",
set_tensor=x_torch_tensor,
)
if inplace == Inplace.INPLACE_X:
y = x
else:
y = TestTensor(shape, None, dtype, device)
if y.is_broadcast():
return
print(
f"Testing Softplus on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}"
)
ans = softplus(x.torch_tensor())
if sync is not None:
sync()
descriptor = infiniopOperatorDescriptor_t()
check_error(
LIBINFINIOP.infiniopCreateSoftplusDescriptor(
handle, ctypes.byref(descriptor), y.descriptor, x.descriptor
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
for tensor in [x, y]:
tensor.destroy_desc()
workspace_size = c_uint64(0)
check_error(
LIBINFINIOP.infiniopGetSoftplusWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = TestWorkspace(workspace_size.value, y.device)
def lib_softplus():
    check_error(
        LIBINFINIOP.infiniopSoftplus(
            descriptor, workspace.data(), workspace.size(), y.data(), x.data(), None
        )
    )
lib_softplus()
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(y.actual_tensor(), ans, atol=atol, rtol=rtol)
assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol)
# Profiling workflow
if PROFILE:
# fmt: off
profile_operation("PyTorch", lambda: softplus(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_softplus(), device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on
check_error(LIBINFINIOP.infiniopDestroySoftplusDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
# Configure testing options
DEBUG = args.debug
PROFILE = args.profile
NUM_PRERUN = args.num_prerun
NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args):
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")