Commit 4ac6e71b authored by Jiacheng Huang's avatar Jiacheng Huang
Browse files

Add a CPU implementation of ReLU

parent a19efb54
...@@ -11,8 +11,10 @@ __C __export infiniStatus_t infiniopCreateReluDescriptor(infiniopHandle_t handle ...@@ -11,8 +11,10 @@ __C __export infiniStatus_t infiniopCreateReluDescriptor(infiniopHandle_t handle
infiniopTensorDescriptor_t x); infiniopTensorDescriptor_t x);
__C __export infiniStatus_t infiniopRelu(infiniopReluDescriptor_t desc, __C __export infiniStatus_t infiniopRelu(infiniopReluDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *y, void *y,
void const *x, const void *x,
void *stream); void *stream);
__C __export infiniStatus_t infiniopDestroyReluDescriptor(infiniopReluDescriptor_t desc); __C __export infiniStatus_t infiniopDestroyReluDescriptor(infiniopReluDescriptor_t desc);
......
#include "relu_cpu.h"
namespace op::relu::cpu {

Descriptor::~Descriptor() = default;

/// Create a CPU ReLU descriptor.
///
/// @param handle_        device handle (must be a CPU handle)
/// @param desc_ptr       receives the newly created descriptor
/// @param out_desc       descriptor of the output tensor y
/// @param input_desc_vec input descriptors; index 0 is x
/// @return INFINI_STATUS_SUCCESS, or an error status produced by the
///         dtype/shape checks or the elementwise-descriptor macro.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
    auto dtype = out_desc->dtype();
    const auto &x_desc = input_desc_vec.at(0);
    const auto &y_shape = out_desc->shape();
    const auto &x_shape = x_desc->shape();

    // ReLU is a unary elementwise op: only floating-point dtypes are
    // supported, and x and y must agree in shape.
    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
    CHECK_SAME_SHAPE(y_shape, x_shape);

    // create CPU elementwise descriptor
    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);

    return INFINI_STATUS_SUCCESS;
}

/// Apply ReLU on the CPU, dispatching on the dtype captured at create time.
/// `workspace`/`workspace_size` belong to the common operator interface but
/// are unused by the CPU elementwise path.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    switch (_dtype) {
    case INFINI_DTYPE_F16:
        return _device_info->calculate<ReluOp, fp16_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<ReluOp, float>(_info, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<ReluOp, double>(_info, output, inputs, stream);
    case INFINI_DTYPE_BF16:
        return _device_info->calculate<ReluOp, bf16_t>(_info, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    // Every switch path returns; the previous trailing
    // `return INFINI_STATUS_SUCCESS;` was unreachable and has been removed.
}
} // namespace op::relu::cpu
#ifndef __RELU_CPU_H__
#define __RELU_CPU_H__
#include <algorithm>
#include "../../../elementwise/cpu/elementwise_cpu.h"
ELEMENTWISE_DESCRIPTOR(relu, cpu)
namespace op::relu::cpu {
/// Elementwise ReLU functor: y = max(x, 0).
/// Plugged into the generic CPU elementwise kernel as its Op parameter.
/// (Plain `struct` — the C-style `typedef struct ... ReluOp;` was redundant
/// in C++, as was the `public:` access specifier inside a struct.)
struct ReluOp {
    // ReLU is a unary operator: one input tensor.
    static constexpr size_t num_inputs = 1;

    /// Apply ReLU to a single element of any comparable numeric type T.
    template <typename T>
    T operator()(const T &x) const {
        return std::max<T>(x, 0);
    }
};
} // namespace op::relu::cpu
#endif // __RELU_CPU_H__
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/relu.h"
#ifdef ENABLE_CPU_API
#include "cpu/relu_cpu.h"
#endif
// Create a ReLU operator descriptor for the device targeted by `handle`.
// `y_desc` describes the output tensor and `x_desc` the input tensor; the
// backend-specific descriptor is written through `desc_ptr`.
__C infiniStatus_t infiniopCreateReluDescriptor(
    infiniopHandle_t handle,
    infiniopReluDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc) {
    switch (handle->device) {
#ifdef ENABLE_CPU_API
    case INFINI_DEVICE_CPU:
        // Forward to the CPU backend; x is the single input.
        return op::relu::cpu::Descriptor::create(
            handle,
            reinterpret_cast<op::relu::cpu::Descriptor **>(desc_ptr),
            y_desc,
            {x_desc});
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }
}
// Query the workspace size (in bytes) that infiniopRelu requires for `desc`.
// Writes the size through `size` and returns INFINI_STATUS_SUCCESS, or
// INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED for an unknown device type.
__C infiniStatus_t infiniopGetReluWorkspaceSize(infiniopReluDescriptor_t desc, size_t *size) {
#define GET(CASE, NAMESPACE)                                                                 \
    case CASE:                                                                               \
        *size = reinterpret_cast<op::relu::NAMESPACE::Descriptor *>(desc)->workspaceSize();  \
        return INFINI_STATUS_SUCCESS;

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        GET(INFINI_DEVICE_CPU, cpu)
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }
#undef GET
    // Every switch path (including default) returns, so the duplicate
    // trailing `return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;` that used
    // to follow here was unreachable and has been removed.
}
// Execute ReLU (y = max(x, 0), elementwise) on the device recorded in `desc`.
// `workspace`/`workspace_size` supply scratch memory sized via
// infiniopGetReluWorkspaceSize; `stream` is the backend execution stream.
__C infiniStatus_t infiniopRelu(
    infiniopReluDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    void *stream) {
    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
    case INFINI_DEVICE_CPU:
        return reinterpret_cast<const op::relu::cpu::Descriptor *>(desc)
            ->calculate(workspace, workspace_size, y, {x}, stream);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }
}
// Destroy a ReLU descriptor created by infiniopCreateReluDescriptor,
// releasing the backend-specific object it points to.
__C infiniStatus_t
infiniopDestroyReluDescriptor(infiniopReluDescriptor_t desc) {
    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
    case INFINI_DEVICE_CPU:
        delete reinterpret_cast<const op::relu::cpu::Descriptor *>(desc);
        return INFINI_STATUS_SUCCESS;
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }
}
...@@ -294,6 +294,28 @@ def rearrange_(lib): ...@@ -294,6 +294,28 @@ def rearrange_(lib):
lib.infiniopDestroyRearrangeDescriptor.argtypes = [infiniopOperatorDescriptor_t] lib.infiniopDestroyRearrangeDescriptor.argtypes = [infiniopOperatorDescriptor_t]
@OpRegister.operator
def relu_(lib):
    # Declare ctypes signatures for the three ReLU entry points so Python
    # can call them safely through the shared library. All entry points
    # return an infiniStatus_t (c_int32).
    signatures = {
        "infiniopCreateReluDescriptor": [
            infiniopHandle_t,
            POINTER(infiniopOperatorDescriptor_t),
            infiniopTensorDescriptor_t,
            infiniopTensorDescriptor_t,
        ],
        # (desc, workspace, workspace_size, y, x, stream)
        "infiniopRelu": [
            infiniopOperatorDescriptor_t,
            c_void_p,
            c_size_t,
            c_void_p,
            c_void_p,
            c_void_p,
        ],
        "infiniopDestroyReluDescriptor": [infiniopOperatorDescriptor_t],
    }
    for fn_name, argtypes in signatures.items():
        fn = getattr(lib, fn_name)
        fn.restype = c_int32
        fn.argtypes = argtypes
@OpRegister.operator @OpRegister.operator
def rms_norm_(lib): def rms_norm_(lib):
lib.infiniopCreateRMSNormDescriptor.restype = c_int32 lib.infiniopCreateRMSNormDescriptor.restype = c_int32
......
from ctypes import POINTER, Structure, c_int32, c_void_p
import ctypes import ctypes
import sys from ctypes import c_uint64
import os
import time
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
)
from operatorspy.tests.test_utils import get_args
from enum import Enum, auto from enum import Enum, auto
import torch import torch
from libinfiniop import (
LIBINFINIOP,
InfiniDeviceNames,
InfiniDtype,
InfiniDtypeNames,
TestTensor,
TestWorkspace,
check_error,
debug,
get_args,
get_test_devices,
get_tolerance,
infiniopOperatorDescriptor_t,
profile_operation,
test_operator,
)
# constant for control whether profile the pytorch and lib functions # ==============================================================================
# NOTE: need to manually add synchronization function to the lib function, # Configuration (Internal Use Only)
# e.g., cudaDeviceSynchronize() for CUDA # ==============================================================================
PROFILE = False # These are not meant to be imported from other modules
NUM_PRERUN = 10 _TEST_CASES_ = [
NUM_ITERATIONS = 1000 # tensor_shape, inplace
# TODO: Uncomment the following line.
# ((),),
((1, 3),),
((3, 3),),
((32, 20, 512),),
((33, 333, 333),),
((32, 256, 112, 112),),
((3, 3, 13, 9, 17),),
]
class Inplace(Enum): class Inplace(Enum):
...@@ -33,160 +42,121 @@ class Inplace(Enum): ...@@ -33,160 +42,121 @@ class Inplace(Enum):
INPLACE_X = auto() INPLACE_X = auto()
class ReluDescriptor(Structure): # Inplace options applied for each test case in _TEST_CASES_
_fields_ = [("device", c_int32)] _INPLACE = [
Inplace.OUT_OF_PLACE,
Inplace.INPLACE_X,
]
# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_
_TEST_CASES = [
test_case + (inplace_item,)
for test_case in _TEST_CASES_
for inplace_item in _INPLACE
]
infiniopReluDescriptor_t = POINTER(ReluDescriptor) # Data types used for testing
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16]
# Tolerance map for different data types
_TOLERANCE_MAP = {
InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7},
InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-3},
}
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
def relu(x): def relu(x):
if PROFILE:
ans = torch.nn.functional.relu(x).to(x.dtype)
torch.cuda.synchronize()
return ans
return torch.nn.functional.relu(x).to(x.dtype) return torch.nn.functional.relu(x).to(x.dtype)
def test( def test(
lib, handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=torch.float16, sync=None
handle,
torch_device,
tensor_shape,
tensor_dtype=torch.float16,
inplace=Inplace.OUT_OF_PLACE,
sync=None
): ):
print( x_torch_tensor = torch.rand(shape) * 2 - 1
f"Testing Relu on {torch_device} with tensor_shape:{tensor_shape} dtype:{tensor_dtype} inplace: {inplace.name}"
x = TestTensor(
shape,
x_torch_tensor.stride(),
dtype,
device,
mode="manual",
set_tensor=x_torch_tensor,
) )
x = torch.rand(tensor_shape, dtype=tensor_dtype).to(torch_device) * 2 - 1 if inplace == Inplace.INPLACE_X:
y = ( y = x
torch.rand(tensor_shape, dtype=tensor_dtype).to(torch_device) else:
if inplace == Inplace.OUT_OF_PLACE y = TestTensor(shape, None, dtype, device)
else x
)
for i in range(NUM_PRERUN if PROFILE else 1): if y.is_broadcast():
ans = relu(x) return
if PROFILE:
start_time = time.time()
for i in range(NUM_ITERATIONS):
_ = relu(x)
elapsed = (time.time() - start_time) / NUM_ITERATIONS
print(f"pytorch time: {elapsed :6f}")
x_tensor = to_tensor(x, lib) print(
y_tensor = to_tensor(y, lib) if inplace == Inplace.OUT_OF_PLACE else x_tensor f"Testing Relu on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}"
)
ans = relu(x.torch_tensor())
if sync is not None: if sync is not None:
sync() sync()
descriptor = infiniopReluDescriptor_t() descriptor = infiniopOperatorDescriptor_t()
check_error( check_error(
lib.infiniopCreateReluDescriptor( LIBINFINIOP.infiniopCreateReluDescriptor(
handle, handle, ctypes.byref(descriptor), y.descriptor, x.descriptor
ctypes.byref(descriptor),
y_tensor.descriptor,
x_tensor.descriptor,
) )
) )
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
x_tensor.descriptor.contents.invalidate() for tensor in [x, y]:
y_tensor.descriptor.contents.invalidate() tensor.destroy_desc()
for i in range(NUM_PRERUN if PROFILE else 1): workspace_size = c_uint64(0)
check_error(lib.infiniopRelu(descriptor, y_tensor.data, x_tensor.data, None)) check_error(
if PROFILE: LIBINFINIOP.infiniopGetReluWorkspaceSize(
start_time = time.time() descriptor, ctypes.byref(workspace_size)
for i in range(NUM_ITERATIONS): )
check_error( )
lib.infiniopRelu(descriptor, y_tensor.data, x_tensor.data, None) workspace = TestWorkspace(workspace_size.value, y.device)
)
elapsed = (time.time() - start_time) / NUM_ITERATIONS
print(f" lib time: {elapsed :6f}")
assert torch.allclose(y, ans, atol=0, rtol=1e-3)
check_error(lib.infiniopDestroyReluDescriptor(descriptor))
def test_cpu(lib, test_cases):
device = DeviceEnum.DEVICE_CPU
handle = create_handle(lib, device)
for tensor_shape, inplace in test_cases:
# fmt: off
test(lib, handle, "cpu", tensor_shape, tensor_dtype=torch.float16, inplace=inplace)
test(lib, handle, "cpu", tensor_shape, tensor_dtype=torch.float32, inplace=inplace)
# fmt: on
destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
device = DeviceEnum.DEVICE_CUDA def lib_relu():
handle = create_handle(lib, device) LIBINFINIOP.infiniopRelu(
for tensor_shape, inplace in test_cases: descriptor, workspace.data(), workspace.size(), y.data(), x.data(), None
# fmt: off )
test(lib, handle, "cuda", tensor_shape, tensor_dtype=torch.float16, inplace=inplace)
test(lib, handle, "cuda", tensor_shape, tensor_dtype=torch.float32, inplace=inplace)
# fmt: on
destroy_handle(lib, handle)
lib_relu()
def test_bang(lib, test_cases): atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
import torch_mlu if DEBUG:
debug(y.actual_tensor(), ans, atol=atol, rtol=rtol)
assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol)
device = DeviceEnum.DEVICE_BANG # Profiling workflow
handle = create_handle(lib, device) if PROFILE:
for tensor_shape, inplace in test_cases:
# fmt: off # fmt: off
test(lib, handle, "mlu", tensor_shape, tensor_dtype=torch.float16, inplace=inplace) profile_operation("PyTorch", lambda: relu(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
test(lib, handle, "mlu", tensor_shape, tensor_dtype=torch.float32, inplace=inplace) profile_operation(" lib", lambda: lib_relu(), device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on # fmt: on
destroy_handle(lib, handle)
check_error(LIBINFINIOP.infiniopDestroyReluDescriptor(descriptor))
if __name__ == "__main__": if __name__ == "__main__":
test_cases = [
# tensor_shape, inplace
((), Inplace.OUT_OF_PLACE),
((), Inplace.INPLACE_X),
((1, 3), Inplace.OUT_OF_PLACE),
((3, 3), Inplace.OUT_OF_PLACE),
((3, 3, 13, 9, 17), Inplace.INPLACE_X),
((32, 20, 512), Inplace.INPLACE_X),
((33, 333, 333), Inplace.OUT_OF_PLACE),
((32, 256, 112, 112), Inplace.OUT_OF_PLACE),
]
args = get_args() args = get_args()
lib = open_lib()
lib.infiniopCreateReluDescriptor.restype = c_int32 # Configure testing options
lib.infiniopCreateReluDescriptor.argtypes = [ DEBUG = args.debug
infiniopHandle_t, PROFILE = args.profile
POINTER(infiniopReluDescriptor_t), NUM_PRERUN = args.num_prerun
infiniopTensorDescriptor_t, NUM_ITERATIONS = args.num_iterations
infiniopTensorDescriptor_t,
] for device in get_test_devices(args):
lib.infiniopRelu.restype = c_int32 test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
lib.infiniopRelu.argtypes = [
infiniopReluDescriptor_t,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyReluDescriptor.restype = c_int32
lib.infiniopDestroyReluDescriptor.argtypes = [
infiniopReluDescriptor_t,
]
if args.cpu:
test_cpu(lib, test_cases)
if args.cuda:
test_cuda(lib, test_cases)
if args.bang:
test_bang(lib, test_cases)
if not (args.cpu or args.cuda or args.bang):
test_cpu(lib, test_cases)
print("\033[92mTest passed!\033[0m") print("\033[92mTest passed!\033[0m")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment