jerrrrry / infinicore · Commits

Commit 4ac6e71b, authored May 29, 2025 by Jiacheng Huang (parent a19efb54)

Add a CPU implementation of ReLU

Showing 6 changed files with 316 additions and 148 deletions:
- include/infiniop/ops/relu.h (+3 -1)
- src/infiniop/ops/relu/cpu/relu_cpu.cc (+52 -0)
- src/infiniop/ops/relu/cpu/relu_cpu.h (+22 -0)
- src/infiniop/ops/relu/operator.cc (+100 -0)
- test/infiniop/libinfiniop/op_register.py (+22 -0)
- test/infiniop/relu.py (+117 -147)
include/infiniop/ops/relu.h (+3 -1)

```diff
@@ -11,8 +11,10 @@ __C __export infiniStatus_t infiniopCreateReluDescriptor(infiniopHandle_t handle
                                                  infiniopTensorDescriptor_t x);
 
 __C __export infiniStatus_t infiniopRelu(infiniopReluDescriptor_t desc,
+                                         void *workspace,
+                                         size_t workspace_size,
                                          void *y,
-                                         void const *x,
+                                         const void *x,
                                          void *stream);
 
 __C __export infiniStatus_t infiniopDestroyReluDescriptor(infiniopReluDescriptor_t desc);
```
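The new `workspace`/`workspace_size` parameters change how callers invoke the operator. Below is a minimal sketch of the intended call sequence, under stated assumptions: the handle and tensor descriptors are created elsewhere (those APIs are not part of this diff), and `infiniopGetReluWorkspaceSize` (defined in operator.cc below) is declared alongside these prototypes. Error handling is elided except for the create call.

```cpp
#include <vector>
#include "infiniop/ops/relu.h"

infiniStatus_t run_relu(infiniopHandle_t handle,
                        infiniopTensorDescriptor_t y_desc,
                        infiniopTensorDescriptor_t x_desc,
                        void *y, const void *x) {
    infiniopReluDescriptor_t desc;
    infiniStatus_t status = infiniopCreateReluDescriptor(handle, &desc, y_desc, x_desc);
    if (status != INFINI_STATUS_SUCCESS) {
        return status;
    }

    // New in this commit: query the workspace size and pass a buffer through.
    size_t workspace_size = 0;
    infiniopGetReluWorkspaceSize(desc, &workspace_size);
    std::vector<char> workspace(workspace_size);

    // The workspace pair now sits between the descriptor and the tensors;
    // the stream stays last (nullptr for the CPU backend).
    status = infiniopRelu(desc, workspace.data(), workspace_size, y, x, nullptr);

    infiniopDestroyReluDescriptor(desc);
    return status;
}
```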
src/infiniop/ops/relu/cpu/relu_cpu.cc (new file, +52 -0)

```cpp
#include "relu_cpu.h"

namespace op::relu::cpu {

Descriptor::~Descriptor() = default;

infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {

    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
    auto dtype = out_desc->dtype();

    const auto &x_desc = input_desc_vec.at(0);
    const auto &y_shape = out_desc->shape();
    const auto &x_shape = x_desc->shape();

    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
    CHECK_SAME_SHAPE(y_shape, x_shape);

    // create CPU elementwise descriptor
    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);

    return INFINI_STATUS_SUCCESS;
}

infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {

    switch (_dtype) {
    case INFINI_DTYPE_F16:
        return _device_info->calculate<ReluOp, fp16_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<ReluOp, float>(_info, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<ReluOp, double>(_info, output, inputs, stream);
    case INFINI_DTYPE_BF16:
        return _device_info->calculate<ReluOp, bf16_t>(_info, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }

    return INFINI_STATUS_SUCCESS;
}

} // namespace op::relu::cpu
```
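The `calculate` switch is the usual way this codebase maps a runtime dtype tag onto a compile-time template parameter. Here is a self-contained sketch of that pattern; the names are invented for illustration, and the real code instantiates `_device_info->calculate<ReluOp, T>(...)` instead of a local kernel.

```cpp
#include <cstdio>

enum Dtype { DTYPE_F32, DTYPE_F64 };

// Stand-in for the templated kernel: the element type is fixed at compile time.
template <typename T>
void run_kernel(const char *label) {
    std::printf("%s: sizeof(T) = %zu\n", label, sizeof(T));
}

// Runtime tag -> compile-time type: each case instantiates one template.
void dispatch(Dtype dtype) {
    switch (dtype) {
    case DTYPE_F32: return run_kernel<float>("f32");
    case DTYPE_F64: return run_kernel<double>("f64");
    }
}

int main() {
    dispatch(DTYPE_F32);  // instantiates run_kernel<float>
    dispatch(DTYPE_F64);  // instantiates run_kernel<double>
}
```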
src/infiniop/ops/relu/cpu/relu_cpu.h (new file, +22 -0)

```cpp
#ifndef __RELU_CPU_H__
#define __RELU_CPU_H__

#include <algorithm>

#include "../../../elementwise/cpu/elementwise_cpu.h"

ELEMENTWISE_DESCRIPTOR(relu, cpu)

namespace op::relu::cpu {
typedef struct ReluOp {
public:
    static constexpr size_t num_inputs = 1;
    template <typename T>
    T operator()(const T &x) const {
        return std::max<T>(x, 0);
    }
} ReluOp;
} // namespace op::relu::cpu

#endif // __RELU_CPU_H__
```
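`ReluOp` is a stateless functor that the shared elementwise framework maps over the tensor. Purely as an illustration, the contiguous f32 case reduces to the loop below; the real path goes through `CREATE_ELEMENTWISE_CPU_DESCRIPTOR` and `Descriptor::calculate` and also handles strides, and `std::max<T>(x, 0)` additionally relies on the comparison/conversion operators of the `fp16_t`/`bf16_t` storage types.

```cpp
#include <algorithm>
#include <cstddef>

// Same functor as in relu_cpu.h, repeated here so the sketch stands alone.
struct ReluOp {
    static constexpr size_t num_inputs = 1;
    template <typename T>
    T operator()(const T &x) const { return std::max<T>(x, 0); }
};

// Roughly what the elementwise framework does for a contiguous buffer.
template <typename Op, typename T>
void apply_elementwise(T *out, const T *in, size_t n, Op op = {}) {
    for (size_t i = 0; i < n; ++i) {
        out[i] = op(in[i]);  // y[i] = max(x[i], 0)
    }
}

int main() {
    float x[4] = {-1.5f, 0.0f, 2.0f, -0.25f};
    float y[4];
    apply_elementwise<ReluOp>(y, x, 4);  // y = {0, 0, 2.0, 0}
}
```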
src/infiniop/ops/relu/operator.cc (new file, +100 -0)

```cpp
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/relu.h"

#ifdef ENABLE_CPU_API
#include "cpu/relu_cpu.h"
#endif

__C infiniStatus_t infiniopCreateReluDescriptor(
    infiniopHandle_t handle,
    infiniopReluDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc) {

#define CREATE(CASE, NAMESPACE)                                             \
    case CASE:                                                              \
        return op::relu::NAMESPACE::Descriptor::create(                     \
            handle,                                                         \
            reinterpret_cast<op::relu::NAMESPACE::Descriptor **>(desc_ptr), \
            y_desc,                                                         \
            {x_desc})

    switch (handle->device) {
#ifdef ENABLE_CPU_API
        CREATE(INFINI_DEVICE_CPU, cpu);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CREATE
}

__C infiniStatus_t infiniopGetReluWorkspaceSize(infiniopReluDescriptor_t desc, size_t *size) {

#define GET(CASE, NAMESPACE)                                                                \
    case CASE:                                                                              \
        *size = reinterpret_cast<op::relu::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
        return INFINI_STATUS_SUCCESS;

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        GET(INFINI_DEVICE_CPU, cpu)
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef GET

    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}

__C infiniStatus_t infiniopRelu(
    infiniopReluDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    void *stream) {

#define CALCULATE(CASE, NAMESPACE)                                             \
    case CASE:                                                                 \
        return reinterpret_cast<const op::relu::NAMESPACE::Descriptor *>(desc) \
            ->calculate(workspace, workspace_size, y, {x}, stream)

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CALCULATE
}

__C infiniStatus_t infiniopDestroyReluDescriptor(infiniopReluDescriptor_t desc) {

#define DELETE(CASE, NAMESPACE)                                                 \
    case CASE:                                                                  \
        delete reinterpret_cast<const op::relu::NAMESPACE::Descriptor *>(desc); \
        return INFINI_STATUS_SUCCESS;

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        DELETE(INFINI_DEVICE_CPU, cpu);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef DELETE
}
```
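Every public entry point repeats the same shape: a locally `#define`d case body stamped out once per compiled-in backend, then `#undef`d so the name cannot leak. This commit only wires up the CPU case; any other backend would add one `CREATE`/`GET`/`CALCULATE`/`DELETE` line under its own `#ifdef`. A self-contained sketch of just that pattern, with invented names:

```cpp
#include <cstdio>

enum Device { DEVICE_CPU, DEVICE_GPU };
enum Status { STATUS_OK, STATUS_UNSUPPORTED };

namespace cpu {
Status create() { std::puts("cpu::create"); return STATUS_OK; }
} // namespace cpu

Status create_descriptor(Device device) {
// Case body defined locally, expanded once per enabled backend, then removed.
#define CREATE(CASE, NAMESPACE) \
    case CASE:                  \
        return NAMESPACE::create()

    switch (device) {
        CREATE(DEVICE_CPU, cpu); // one line per enabled backend
    default:
        return STATUS_UNSUPPORTED;
    }
#undef CREATE
}

int main() {
    return create_descriptor(DEVICE_CPU) == STATUS_OK ? 0 : 1;
}
```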
test/infiniop/libinfiniop/op_register.py (+22 -0)

```diff
@@ -294,6 +294,28 @@ def rearrange_(lib):
     lib.infiniopDestroyRearrangeDescriptor.argtypes = [infiniopOperatorDescriptor_t]
 
 
+@OpRegister.operator
+def relu_(lib):
+    lib.infiniopCreateReluDescriptor.restype = c_int32
+    lib.infiniopCreateReluDescriptor.argtypes = [
+        infiniopHandle_t,
+        POINTER(infiniopOperatorDescriptor_t),
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+    ]
+    lib.infiniopRelu.restype = c_int32
+    lib.infiniopRelu.argtypes = [
+        infiniopOperatorDescriptor_t,
+        c_void_p,
+        c_size_t,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+    ]
+    lib.infiniopDestroyReluDescriptor.restype = c_int32
+    lib.infiniopDestroyReluDescriptor.argtypes = [infiniopOperatorDescriptor_t]
+
+
 @OpRegister.operator
 def rms_norm_(lib):
     lib.infiniopCreateRMSNormDescriptor.restype = c_int32
```
test/infiniop/relu.py (+117 -147)

The standalone operatorspy-based test is removed and the file is rewritten against the shared libinfiniop test framework (`test_operator`, `TestTensor`, `TestWorkspace`), which also exercises the new workspace API.

Old version (removed):

```python
from ctypes import POINTER, Structure, c_int32, c_void_p
import ctypes
import sys
import os
import time

sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
    open_lib,
    to_tensor,
    DeviceEnum,
    infiniopHandle_t,
    infiniopTensorDescriptor_t,
    create_handle,
    destroy_handle,
    check_error,
)

from operatorspy.tests.test_utils import get_args
from enum import Enum, auto
import torch

# constant for control whether profile the pytorch and lib functions
# NOTE: need to manually add synchronization function to the lib function,
# e.g., cudaDeviceSynchronize() for CUDA
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000


class Inplace(Enum):
    OUT_OF_PLACE = auto()
    INPLACE_X = auto()


class ReluDescriptor(Structure):
    _fields_ = [("device", c_int32)]


infiniopReluDescriptor_t = POINTER(ReluDescriptor)


def relu(x):
    if PROFILE:
        ans = torch.nn.functional.relu(x).to(x.dtype)
        torch.cuda.synchronize()
        return ans
    return torch.nn.functional.relu(x).to(x.dtype)


def test(
    lib,
    handle,
    torch_device,
    tensor_shape,
    tensor_dtype=torch.float16,
    inplace=Inplace.OUT_OF_PLACE,
    sync=None,
):
    print(
        f"Testing Relu on {torch_device} with tensor_shape: {tensor_shape} dtype: {tensor_dtype} inplace: {inplace.name}"
    )
    x = torch.rand(tensor_shape, dtype=tensor_dtype).to(torch_device) * 2 - 1
    y = (
        torch.rand(tensor_shape, dtype=tensor_dtype).to(torch_device)
        if inplace == Inplace.OUT_OF_PLACE
        else x
    )

    for i in range(NUM_PRERUN if PROFILE else 1):
        ans = relu(x)
    if PROFILE:
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            _ = relu(x)
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f"pytorch time: {elapsed :6f}")

    x_tensor = to_tensor(x, lib)
    y_tensor = to_tensor(y, lib) if inplace == Inplace.OUT_OF_PLACE else x_tensor

    if sync is not None:
        sync()

    descriptor = infiniopReluDescriptor_t()
    check_error(
        lib.infiniopCreateReluDescriptor(
            handle,
            ctypes.byref(descriptor),
            y_tensor.descriptor,
            x_tensor.descriptor,
        )
    )

    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    x_tensor.descriptor.contents.invalidate()
    y_tensor.descriptor.contents.invalidate()

    for i in range(NUM_PRERUN if PROFILE else 1):
        check_error(lib.infiniopRelu(descriptor, y_tensor.data, x_tensor.data, None))
    if PROFILE:
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            check_error(
                lib.infiniopRelu(descriptor, y_tensor.data, x_tensor.data, None)
            )
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f"    lib time: {elapsed :6f}")

    assert torch.allclose(y, ans, atol=0, rtol=1e-3)
    check_error(lib.infiniopDestroyReluDescriptor(descriptor))


def test_cpu(lib, test_cases):
    device = DeviceEnum.DEVICE_CPU
    handle = create_handle(lib, device)
    for tensor_shape, inplace in test_cases:
        # fmt: off
        test(lib, handle, "cpu", tensor_shape, tensor_dtype=torch.float16, inplace=inplace)
        test(lib, handle, "cpu", tensor_shape, tensor_dtype=torch.float32, inplace=inplace)
        # fmt: on
    destroy_handle(lib, handle)


def test_cuda(lib, test_cases):
    device = DeviceEnum.DEVICE_CUDA
    handle = create_handle(lib, device)
    for tensor_shape, inplace in test_cases:
        # fmt: off
        test(lib, handle, "cuda", tensor_shape, tensor_dtype=torch.float16, inplace=inplace)
        test(lib, handle, "cuda", tensor_shape, tensor_dtype=torch.float32, inplace=inplace)
        # fmt: on
    destroy_handle(lib, handle)


def test_bang(lib, test_cases):
    import torch_mlu

    device = DeviceEnum.DEVICE_BANG
    handle = create_handle(lib, device)
    for tensor_shape, inplace in test_cases:
        # fmt: off
        test(lib, handle, "mlu", tensor_shape, tensor_dtype=torch.float16, inplace=inplace)
        test(lib, handle, "mlu", tensor_shape, tensor_dtype=torch.float32, inplace=inplace)
        # fmt: on
    destroy_handle(lib, handle)


if __name__ == "__main__":
    test_cases = [
        # tensor_shape, inplace
        ((), Inplace.OUT_OF_PLACE),
        ((), Inplace.INPLACE_X),
        ((1, 3), Inplace.OUT_OF_PLACE),
        ((3, 3), Inplace.OUT_OF_PLACE),
        ((3, 3, 13, 9, 17), Inplace.INPLACE_X),
        ((32, 20, 512), Inplace.INPLACE_X),
        ((33, 333, 333), Inplace.OUT_OF_PLACE),
        ((32, 256, 112, 112), Inplace.OUT_OF_PLACE),
    ]
    args = get_args()
    lib = open_lib()
    lib.infiniopCreateReluDescriptor.restype = c_int32
    lib.infiniopCreateReluDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopReluDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
    ]
    lib.infiniopRelu.restype = c_int32
    lib.infiniopRelu.argtypes = [
        infiniopReluDescriptor_t,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyReluDescriptor.restype = c_int32
    lib.infiniopDestroyReluDescriptor.argtypes = [
        infiniopReluDescriptor_t,
    ]
    if args.cpu:
        test_cpu(lib, test_cases)
    if args.cuda:
        test_cuda(lib, test_cases)
    if args.bang:
        test_bang(lib, test_cases)
    if not (args.cpu or args.cuda or args.bang):
        test_cpu(lib, test_cases)
    print("\033[92mTest passed!\033[0m")
```

New version (added):

```python
import ctypes
from ctypes import c_uint64
from enum import Enum, auto

import torch
from libinfiniop import (
    LIBINFINIOP,
    InfiniDeviceNames,
    InfiniDtype,
    InfiniDtypeNames,
    TestTensor,
    TestWorkspace,
    check_error,
    debug,
    get_args,
    get_test_devices,
    get_tolerance,
    infiniopOperatorDescriptor_t,
    profile_operation,
    test_operator,
)

# ==============================================================================
#  Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
_TEST_CASES_ = [
    # tensor_shape, inplace
    # TODO: Uncomment the following line.
    # ((),),
    ((1, 3),),
    ((3, 3),),
    ((32, 20, 512),),
    ((33, 333, 333),),
    ((32, 256, 112, 112),),
    ((3, 3, 13, 9, 17),),
]


class Inplace(Enum):
    OUT_OF_PLACE = auto()
    INPLACE_X = auto()


# Inplace options applied for each test case in _TEST_CASES_
_INPLACE = [
    Inplace.OUT_OF_PLACE,
    Inplace.INPLACE_X,
]

# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_
_TEST_CASES = [
    test_case + (inplace_item,)
    for test_case in _TEST_CASES_
    for inplace_item in _INPLACE
]

# Data types used for testing
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16]

# Tolerance map for different data types
_TOLERANCE_MAP = {
    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
    InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7},
    InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-3},
}

DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000


def relu(x):
    return torch.nn.functional.relu(x).to(x.dtype)


def test(
    handle,
    device,
    shape,
    inplace=Inplace.OUT_OF_PLACE,
    dtype=torch.float16,
    sync=None,
):
    x_torch_tensor = torch.rand(shape) * 2 - 1
    x = TestTensor(
        shape,
        x_torch_tensor.stride(),
        dtype,
        device,
        mode="manual",
        set_tensor=x_torch_tensor,
    )
    if inplace == Inplace.INPLACE_X:
        y = x
    else:
        y = TestTensor(shape, None, dtype, device)

    if y.is_broadcast():
        return

    print(
        f"Testing Relu on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
    )
    ans = relu(x.torch_tensor())

    if sync is not None:
        sync()

    descriptor = infiniopOperatorDescriptor_t()
    check_error(
        LIBINFINIOP.infiniopCreateReluDescriptor(
            handle,
            ctypes.byref(descriptor),
            y.descriptor,
            x.descriptor,
        )
    )

    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    for tensor in [x, y]:
        tensor.destroy_desc()

    workspace_size = c_uint64(0)
    check_error(
        LIBINFINIOP.infiniopGetReluWorkspaceSize(
            descriptor, ctypes.byref(workspace_size)
        )
    )
    workspace = TestWorkspace(workspace_size.value, y.device)

    def lib_relu():
        LIBINFINIOP.infiniopRelu(
            descriptor,
            workspace.data(),
            workspace.size(),
            y.data(),
            x.data(),
            None,
        )

    lib_relu()

    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
    if DEBUG:
        debug(y.actual_tensor(), ans, atol=atol, rtol=rtol)
    assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol)

    # Profiling workflow
    if PROFILE:
        # fmt: off
        profile_operation("PyTorch", lambda: relu(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
        profile_operation("    lib", lambda: lib_relu(), device, NUM_PRERUN, NUM_ITERATIONS)
        # fmt: on

    check_error(LIBINFINIOP.infiniopDestroyReluDescriptor(descriptor))


if __name__ == "__main__":
    args = get_args()

    # Configure testing options
    DEBUG = args.debug
    PROFILE = args.profile
    NUM_PRERUN = args.num_prerun
    NUM_ITERATIONS = args.num_iterations

    for device in get_test_devices(args):
        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)

    print("\033[92mTest passed!\033[0m")
```
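Note that the per-device `test_cpu`/`test_cuda`/`test_bang` drivers are gone: device selection now goes through `get_test_devices(args)`. Judging from the `args` fields read above (`args.debug`, `args.profile`, `args.num_prerun`, `args.num_iterations`), an invocation would presumably look like `python test/infiniop/relu.py --cpu --debug`, with the exact flag names defined by libinfiniop's `get_args`.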