Commit 4ac6e71b authored by Jiacheng Huang's avatar Jiacheng Huang
Browse files

Add a CPU implementation of ReLU

parent a19efb54
...@@ -11,8 +11,10 @@ __C __export infiniStatus_t infiniopCreateReluDescriptor(infiniopHandle_t handle ...@@ -11,8 +11,10 @@ __C __export infiniStatus_t infiniopCreateReluDescriptor(infiniopHandle_t handle
infiniopTensorDescriptor_t x); infiniopTensorDescriptor_t x);
__C __export infiniStatus_t infiniopRelu(infiniopReluDescriptor_t desc, __C __export infiniStatus_t infiniopRelu(infiniopReluDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *y, void *y,
void const *x, const void *x,
void *stream); void *stream);
__C __export infiniStatus_t infiniopDestroyReluDescriptor(infiniopReluDescriptor_t desc); __C __export infiniStatus_t infiniopDestroyReluDescriptor(infiniopReluDescriptor_t desc);
......
#include "relu_cpu.h"
namespace op::relu::cpu {

Descriptor::~Descriptor() = default;

/// Create a CPU ReLU descriptor.
///
/// @param handle_        device handle (must be a CPU handle)
/// @param desc_ptr       receives the newly created descriptor
/// @param out_desc       descriptor of the output tensor y
/// @param input_desc_vec input descriptors; index 0 is x
/// @return INFINI_STATUS_SUCCESS, or an error status produced by the
///         dtype/shape checks or the elementwise-descriptor macro.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
    auto dtype = out_desc->dtype();
    const auto &x_desc = input_desc_vec.at(0);
    const auto &y_shape = out_desc->shape();
    const auto &x_shape = x_desc->shape();

    // ReLU is a unary elementwise op: only floating-point dtypes are
    // supported, and x and y must agree in shape.
    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
    CHECK_SAME_SHAPE(y_shape, x_shape);

    // create CPU elementwise descriptor
    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);

    return INFINI_STATUS_SUCCESS;
}

/// Apply ReLU on the CPU, dispatching on the dtype captured at create time.
/// `workspace`/`workspace_size` belong to the common operator interface but
/// are unused by the CPU elementwise path.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    switch (_dtype) {
    case INFINI_DTYPE_F16:
        return _device_info->calculate<ReluOp, fp16_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<ReluOp, float>(_info, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<ReluOp, double>(_info, output, inputs, stream);
    case INFINI_DTYPE_BF16:
        return _device_info->calculate<ReluOp, bf16_t>(_info, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    // Every switch path returns; the previous trailing
    // `return INFINI_STATUS_SUCCESS;` was unreachable and has been removed.
}
} // namespace op::relu::cpu
#ifndef __RELU_CPU_H__
#define __RELU_CPU_H__
#include <algorithm>
#include "../../../elementwise/cpu/elementwise_cpu.h"
ELEMENTWISE_DESCRIPTOR(relu, cpu)
namespace op::relu::cpu {
/// Elementwise ReLU functor: y = max(x, 0).
/// Plugged into the generic CPU elementwise kernel as its Op parameter.
/// (Plain `struct` — the C-style `typedef struct ... ReluOp;` was redundant
/// in C++, as was the `public:` access specifier inside a struct.)
struct ReluOp {
    // ReLU is a unary operator: one input tensor.
    static constexpr size_t num_inputs = 1;

    /// Apply ReLU to a single element of any comparable numeric type T.
    template <typename T>
    T operator()(const T &x) const {
        return std::max<T>(x, 0);
    }
};
} // namespace op::relu::cpu
#endif // __RELU_CPU_H__
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/relu.h"
#ifdef ENABLE_CPU_API
#include "cpu/relu_cpu.h"
#endif
// Create a ReLU operator descriptor for the device targeted by `handle`.
// `y_desc` describes the output tensor and `x_desc` the input tensor; the
// backend-specific descriptor is written through `desc_ptr`.
__C infiniStatus_t infiniopCreateReluDescriptor(
    infiniopHandle_t handle,
    infiniopReluDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc) {
    switch (handle->device) {
#ifdef ENABLE_CPU_API
    case INFINI_DEVICE_CPU:
        // Forward to the CPU backend; x is the single input.
        return op::relu::cpu::Descriptor::create(
            handle,
            reinterpret_cast<op::relu::cpu::Descriptor **>(desc_ptr),
            y_desc,
            {x_desc});
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }
}
// Query the workspace size (in bytes) that infiniopRelu requires for `desc`.
// Writes the size through `size` and returns INFINI_STATUS_SUCCESS, or
// INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED for an unknown device type.
__C infiniStatus_t infiniopGetReluWorkspaceSize(infiniopReluDescriptor_t desc, size_t *size) {
#define GET(CASE, NAMESPACE)                                                                 \
    case CASE:                                                                               \
        *size = reinterpret_cast<op::relu::NAMESPACE::Descriptor *>(desc)->workspaceSize();  \
        return INFINI_STATUS_SUCCESS;

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        GET(INFINI_DEVICE_CPU, cpu)
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }
#undef GET
    // Every switch path (including default) returns, so the duplicate
    // trailing `return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;` that used
    // to follow here was unreachable and has been removed.
}
// Execute ReLU (y = max(x, 0), elementwise) on the device recorded in `desc`.
// `workspace`/`workspace_size` supply scratch memory sized via
// infiniopGetReluWorkspaceSize; `stream` is the backend execution stream.
__C infiniStatus_t infiniopRelu(
    infiniopReluDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    void *stream) {
    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
    case INFINI_DEVICE_CPU:
        return reinterpret_cast<const op::relu::cpu::Descriptor *>(desc)
            ->calculate(workspace, workspace_size, y, {x}, stream);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }
}
// Destroy a ReLU descriptor created by infiniopCreateReluDescriptor,
// releasing the backend-specific object it points to.
__C infiniStatus_t
infiniopDestroyReluDescriptor(infiniopReluDescriptor_t desc) {
    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
    case INFINI_DEVICE_CPU:
        delete reinterpret_cast<const op::relu::cpu::Descriptor *>(desc);
        return INFINI_STATUS_SUCCESS;
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }
}
...@@ -294,6 +294,28 @@ def rearrange_(lib): ...@@ -294,6 +294,28 @@ def rearrange_(lib):
lib.infiniopDestroyRearrangeDescriptor.argtypes = [infiniopOperatorDescriptor_t] lib.infiniopDestroyRearrangeDescriptor.argtypes = [infiniopOperatorDescriptor_t]
@OpRegister.operator
def relu_(lib):
    # Declare ctypes signatures for the three ReLU entry points so Python
    # can call them safely through the shared library. All entry points
    # return an infiniStatus_t (c_int32).
    signatures = {
        "infiniopCreateReluDescriptor": [
            infiniopHandle_t,
            POINTER(infiniopOperatorDescriptor_t),
            infiniopTensorDescriptor_t,
            infiniopTensorDescriptor_t,
        ],
        # (desc, workspace, workspace_size, y, x, stream)
        "infiniopRelu": [
            infiniopOperatorDescriptor_t,
            c_void_p,
            c_size_t,
            c_void_p,
            c_void_p,
            c_void_p,
        ],
        "infiniopDestroyReluDescriptor": [infiniopOperatorDescriptor_t],
    }
    for fn_name, argtypes in signatures.items():
        fn = getattr(lib, fn_name)
        fn.restype = c_int32
        fn.argtypes = argtypes
@OpRegister.operator @OpRegister.operator
def rms_norm_(lib): def rms_norm_(lib):
lib.infiniopCreateRMSNormDescriptor.restype = c_int32 lib.infiniopCreateRMSNormDescriptor.restype = c_int32
......
from ctypes import POINTER, Structure, c_int32, c_void_p
import ctypes import ctypes
import sys from ctypes import c_uint64
import os
import time
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
)
from operatorspy.tests.test_utils import get_args
from enum import Enum, auto from enum import Enum, auto
import torch import torch
from libinfiniop import (
LIBINFINIOP,
InfiniDeviceNames,
InfiniDtype,
InfiniDtypeNames,
TestTensor,
TestWorkspace,
check_error,
debug,
get_args,
get_test_devices,
get_tolerance,
infiniopOperatorDescriptor_t,
profile_operation,
test_operator,
)
# constant for control whether profile the pytorch and lib functions # ==============================================================================
# NOTE: need to manually add synchronization function to the lib function, # Configuration (Internal Use Only)
# e.g., cudaDeviceSynchronize() for CUDA # ==============================================================================
PROFILE = False # These are not meant to be imported from other modules
NUM_PRERUN = 10 _TEST_CASES_ = [
NUM_ITERATIONS = 1000 # tensor_shape, inplace
# TODO: Uncomment the following line.
# ((),),
((1, 3),),
((3, 3),),
((32, 20, 512),),
((33, 333, 333),),
((32, 256, 112, 112),),
((3, 3, 13, 9, 17),),
]
class Inplace(Enum): class Inplace(Enum):
...@@ -33,160 +42,121 @@ class Inplace(Enum): ...@@ -33,160 +42,121 @@ class Inplace(Enum):
INPLACE_X = auto() INPLACE_X = auto()
class ReluDescriptor(Structure): # Inplace options applied for each test case in _TEST_CASES_
_fields_ = [("device", c_int32)] _INPLACE = [
Inplace.OUT_OF_PLACE,
Inplace.INPLACE_X,
]
# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_
_TEST_CASES = [
test_case + (inplace_item,)
for test_case in _TEST_CASES_
for inplace_item in _INPLACE
]
infiniopReluDescriptor_t = POINTER(ReluDescriptor) # Data types used for testing
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16]
# Tolerance map for different data types
_TOLERANCE_MAP = {
InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7},
InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-3},
}
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
def relu(x): def relu(x):
if PROFILE:
ans = torch.nn.functional.relu(x).to(x.dtype)
torch.cuda.synchronize()
return ans
return torch.nn.functional.relu(x).to(x.dtype) return torch.nn.functional.relu(x).to(x.dtype)
def test( def test(
lib, handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=torch.float16, sync=None
handle,
torch_device,
tensor_shape,
tensor_dtype=torch.float16,
inplace=Inplace.OUT_OF_PLACE,
sync=None
): ):
print( x_torch_tensor = torch.rand(shape) * 2 - 1
f"Testing Relu on {torch_device} with tensor_shape:{tensor_shape} dtype:{tensor_dtype} inplace: {inplace.name}"
x = TestTensor(
shape,
x_torch_tensor.stride(),
dtype,
device,
mode="manual",
set_tensor=x_torch_tensor,
) )
x = torch.rand(tensor_shape, dtype=tensor_dtype).to(torch_device) * 2 - 1 if inplace == Inplace.INPLACE_X:
y = ( y = x
torch.rand(tensor_shape, dtype=tensor_dtype).to(torch_device) else:
if inplace == Inplace.OUT_OF_PLACE y = TestTensor(shape, None, dtype, device)
else x
)
for i in range(NUM_PRERUN if PROFILE else 1): if y.is_broadcast():
ans = relu(x) return
if PROFILE:
start_time = time.time()
for i in range(NUM_ITERATIONS):
_ = relu(x)
elapsed = (time.time() - start_time) / NUM_ITERATIONS
print(f"pytorch time: {elapsed :6f}")
x_tensor = to_tensor(x, lib) print(
y_tensor = to_tensor(y, lib) if inplace == Inplace.OUT_OF_PLACE else x_tensor f"Testing Relu on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}"
)
ans = relu(x.torch_tensor())
if sync is not None: if sync is not None:
sync() sync()
descriptor = infiniopReluDescriptor_t() descriptor = infiniopOperatorDescriptor_t()
check_error( check_error(
lib.infiniopCreateReluDescriptor( LIBINFINIOP.infiniopCreateReluDescriptor(
handle, handle, ctypes.byref(descriptor), y.descriptor, x.descriptor
ctypes.byref(descriptor),
y_tensor.descriptor,
x_tensor.descriptor,
) )
) )
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
x_tensor.descriptor.contents.invalidate() for tensor in [x, y]:
y_tensor.descriptor.contents.invalidate() tensor.destroy_desc()
for i in range(NUM_PRERUN if PROFILE else 1): workspace_size = c_uint64(0)
check_error(lib.infiniopRelu(descriptor, y_tensor.data, x_tensor.data, None)) check_error(
if PROFILE: LIBINFINIOP.infiniopGetReluWorkspaceSize(
start_time = time.time() descriptor, ctypes.byref(workspace_size)
for i in range(NUM_ITERATIONS): )
check_error( )
lib.infiniopRelu(descriptor, y_tensor.data, x_tensor.data, None) workspace = TestWorkspace(workspace_size.value, y.device)
)
elapsed = (time.time() - start_time) / NUM_ITERATIONS
print(f" lib time: {elapsed :6f}")
assert torch.allclose(y, ans, atol=0, rtol=1e-3)
check_error(lib.infiniopDestroyReluDescriptor(descriptor))
def test_cpu(lib, test_cases):
device = DeviceEnum.DEVICE_CPU
handle = create_handle(lib, device)
for tensor_shape, inplace in test_cases:
# fmt: off
test(lib, handle, "cpu", tensor_shape, tensor_dtype=torch.float16, inplace=inplace)
test(lib, handle, "cpu", tensor_shape, tensor_dtype=torch.float32, inplace=inplace)
# fmt: on
destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
device = DeviceEnum.DEVICE_CUDA def lib_relu():
handle = create_handle(lib, device) LIBINFINIOP.infiniopRelu(
for tensor_shape, inplace in test_cases: descriptor, workspace.data(), workspace.size(), y.data(), x.data(), None
# fmt: off )
test(lib, handle, "cuda", tensor_shape, tensor_dtype=torch.float16, inplace=inplace)
test(lib, handle, "cuda", tensor_shape, tensor_dtype=torch.float32, inplace=inplace)
# fmt: on
destroy_handle(lib, handle)
lib_relu()
def test_bang(lib, test_cases): atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
import torch_mlu if DEBUG:
debug(y.actual_tensor(), ans, atol=atol, rtol=rtol)
assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol)
device = DeviceEnum.DEVICE_BANG # Profiling workflow
handle = create_handle(lib, device) if PROFILE:
for tensor_shape, inplace in test_cases:
# fmt: off # fmt: off
test(lib, handle, "mlu", tensor_shape, tensor_dtype=torch.float16, inplace=inplace) profile_operation("PyTorch", lambda: relu(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
test(lib, handle, "mlu", tensor_shape, tensor_dtype=torch.float32, inplace=inplace) profile_operation(" lib", lambda: lib_relu(), device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on # fmt: on
destroy_handle(lib, handle)
check_error(LIBINFINIOP.infiniopDestroyReluDescriptor(descriptor))
if __name__ == "__main__": if __name__ == "__main__":
test_cases = [
# tensor_shape, inplace
((), Inplace.OUT_OF_PLACE),
((), Inplace.INPLACE_X),
((1, 3), Inplace.OUT_OF_PLACE),
((3, 3), Inplace.OUT_OF_PLACE),
((3, 3, 13, 9, 17), Inplace.INPLACE_X),
((32, 20, 512), Inplace.INPLACE_X),
((33, 333, 333), Inplace.OUT_OF_PLACE),
((32, 256, 112, 112), Inplace.OUT_OF_PLACE),
]
args = get_args() args = get_args()
lib = open_lib()
lib.infiniopCreateReluDescriptor.restype = c_int32 # Configure testing options
lib.infiniopCreateReluDescriptor.argtypes = [ DEBUG = args.debug
infiniopHandle_t, PROFILE = args.profile
POINTER(infiniopReluDescriptor_t), NUM_PRERUN = args.num_prerun
infiniopTensorDescriptor_t, NUM_ITERATIONS = args.num_iterations
infiniopTensorDescriptor_t,
] for device in get_test_devices(args):
lib.infiniopRelu.restype = c_int32 test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
lib.infiniopRelu.argtypes = [
infiniopReluDescriptor_t,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyReluDescriptor.restype = c_int32
lib.infiniopDestroyReluDescriptor.argtypes = [
infiniopReluDescriptor_t,
]
if args.cpu:
test_cpu(lib, test_cases)
if args.cuda:
test_cuda(lib, test_cases)
if args.bang:
test_bang(lib, test_cases)
if not (args.cpu or args.cuda or args.bang):
test_cpu(lib, test_cases)
print("\033[92mTest passed!\033[0m") print("\033[92mTest passed!\033[0m")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment