jerrrrry / infinicore

Commit 7bd656b7
authored Feb 17, 2025 by YdrMaster

issue/52: Format all Python files and mark the regions excluded from formatting

Signed-off-by: YdrMaster <ydrml@hotmail.com>
parent ec0ff893
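The "regions excluded from formatting" in the commit message are marked with Black's `# fmt: off` / `# fmt: on` comment pair, which appears throughout the hunks below. A minimal sketch of the convention (the shapes here are illustrative, not taken from this commit):

# Black leaves the lines between "# fmt: off" and "# fmt: on" untouched,
# so a hand-aligned test-case table keeps its column layout.
test_cases = [
    # fmt: off
    # c_shape,    a_shape,    b_shape
    ((2, 4, 3),  (2, 1, 3),  (4, 3)),
    ((2, 3, 4),  (2, 3, 4),  (4,)),
    # fmt: on
]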
Showing 19 changed files with 370 additions and 152 deletions (+370 −152)
test/infiniop/__init__.py                 +1   −1
test/infiniop/add.py                      +20  −4
test/infiniop/avg_pool.py                 +22  −6
test/infiniop/causal_softmax.py           +3   −1
test/infiniop/conv.py                     +19  −13
test/infiniop/expand.py                   +11  −3
test/infiniop/gemm.py                     +16  −7
test/infiniop/global_avg_pool.py          +6   −1
test/infiniop/libinfiniop/__init__.py     +8   −2
test/infiniop/libinfiniop/datatypes.py    +1   −1
test/infiniop/libinfiniop/utils.py        +74  −27
test/infiniop/matmul.py                   +45  −23
test/infiniop/max_pool.py                 +17  −5
test/infiniop/random_sample.py            +44  −27
test/infiniop/rearrange.py                +7   −5
test/infiniop/relu.py                     +13  −2
test/infiniop/rms_norm.py                 +35  −13
test/infiniop/rotary_embedding.py         +8   −6
test/infiniop/swiglu.py                   +20  −5
test/infiniop/__init__.py
-import libinfiniop
\ No newline at end of file
+import libinfiniop

test/infiniop/add.py
@@ -41,8 +41,8 @@ def test(
     lib,
     handle,
     torch_device,
-    c_shape, a_shape,
+    c_shape,
+    a_shape,
     b_shape,
     tensor_dtype=torch.float16,
     inplace=Inplace.OUT_OF_PLACE,
@@ -56,13 +56,21 @@ def test(
     a = torch.rand(a_shape, dtype=tensor_dtype).to(torch_device)
     b = torch.rand(b_shape, dtype=tensor_dtype).to(torch_device)
-    c = torch.rand(c_shape, dtype=tensor_dtype).to(torch_device) if inplace == Inplace.OUT_OF_PLACE else (a if inplace == Inplace.INPLACE_A else b)
+    c = (
+        torch.rand(c_shape, dtype=tensor_dtype).to(torch_device)
+        if inplace == Inplace.OUT_OF_PLACE
+        else (a if inplace == Inplace.INPLACE_A else b)
+    )
     ans = add(a, b)

     a_tensor = to_tensor(a, lib)
     b_tensor = to_tensor(b, lib)
-    c_tensor = to_tensor(c, lib) if inplace == Inplace.OUT_OF_PLACE else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor)
+    c_tensor = (
+        to_tensor(c, lib)
+        if inplace == Inplace.OUT_OF_PLACE
+        else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor)
+    )

     descriptor = infiniopAddDescriptor_t()
     check_error(
@@ -91,8 +99,10 @@ def test_cpu(lib, test_cases):
     device = DeviceEnum.DEVICE_CPU
     handle = create_handle(lib, device)
     for c_shape, a_shape, b_shape, inplace in test_cases:
+        # fmt: off
         test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace)
         test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace)
+        # fmt: on
     destroy_handle(lib, handle)
@@ -100,8 +110,10 @@ def test_cuda(lib, test_cases):
     device = DeviceEnum.DEVICE_CUDA
     handle = create_handle(lib, device)
     for c_shape, a_shape, b_shape, inplace in test_cases:
+        # fmt: off
         test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace)
         test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace)
+        # fmt: on
     destroy_handle(lib, handle)
@@ -111,13 +123,16 @@ def test_bang(lib, test_cases):
     device = DeviceEnum.DEVICE_BANG
     handle = create_handle(lib, device)
     for c_shape, a_shape, b_shape, inplace in test_cases:
+        # fmt: off
         test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace)
         test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace)
+        # fmt: on
     destroy_handle(lib, handle)

 if __name__ == "__main__":
     test_cases = [
+        # fmt: off
         # c_shape, a_shape, b_shape, inplace
         # ((32, 150, 512000), (32, 150, 512000), (32, 150, 512000), Inplace.OUT_OF_PLACE),
         # ((32, 150, 51200), (32, 150, 51200), (32, 150, 1), Inplace.OUT_OF_PLACE),
@@ -133,6 +148,7 @@ if __name__ == "__main__":
         ((2, 4, 3), (2, 1, 3), (4, 3), Inplace.OUT_OF_PLACE),
         ((2, 3, 4, 5), (2, 3, 4, 5), (5,), Inplace.OUT_OF_PLACE),
         ((3, 2, 4, 5), (4, 5), (3, 2, 1, 1), Inplace.OUT_OF_PLACE),
+        # fmt: on
     ]
     args = get_args()
     lib = open_lib()

test/infiniop/avg_pool.py
@@ -35,7 +35,7 @@ class AvgPoolDescriptor(Structure):
 infiniopAvgPoolDescriptor_t = POINTER(AvgPoolDescriptor)

-def pool(x, k, padding, stride, dilation = 1):
+def pool(x, k, padding, stride, dilation=1):
     pooling_layers = {
         1: torch.nn.AvgPool1d,
         2: torch.nn.AvgPool2d,
@@ -48,7 +48,9 @@ def pool(x, k, padding, stride, dilation = 1):
         return None
     if ndim == 3 and x.dtype == torch.float16:
-        ans = pooling_layers[ndim](k, stride=stride, padding=padding)(x.to(torch.float32)).to(torch.float16)
+        ans = pooling_layers[ndim](k, stride=stride, padding=padding)(
+            x.to(torch.float32)
+        ).to(torch.float16)
     else:
         ans = pooling_layers[ndim](k, stride=stride, padding=padding)(x)
     if PROFILE:
@@ -69,18 +71,20 @@ def inferShape(x_shape, kernel_shape, padding, strides):
     return x_shape[:2] + tuple(output_shape)

 # convert a python tuple to a ctype void pointer
 def tuple_to_void_p(py_tuple: Tuple):
     array = ctypes.c_int64 * len(py_tuple)
     data_array = array(*py_tuple)
     return ctypes.cast(data_array, ctypes.c_void_p)

 def test(
     lib,
     handle,
     torch_device,
-    x_shape, k_shape,
+    x_shape,
+    k_shape,
     padding,
     strides,
     tensor_dtype=torch.float16,
@@ -90,7 +94,9 @@ def test(
     )
     x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device)
-    y = torch.rand(inferShape(x_shape, k_shape, padding, strides), dtype=tensor_dtype).to(torch_device)
+    y = torch.rand(
+        inferShape(x_shape, k_shape, padding, strides), dtype=tensor_dtype
+    ).to(torch_device)

     for i in range(NUM_PRERUN if PROFILE else 1):
         ans = pool(x, k_shape, padding, strides)
@@ -126,7 +132,9 @@ def test(
     check_error(
         lib.infiniopGetAvgPoolWorkspaceSize(descriptor, ctypes.byref(workspaceSize))
     )
-    workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(torch_device)
+    workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(
+        torch_device
+    )
     workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8))

     for i in range(NUM_PRERUN if PROFILE else 1):
@@ -164,8 +172,10 @@ def test_cpu(lib, test_cases):
     device = DeviceEnum.DEVICE_CPU
     handle = create_handle(lib, device)
     for x_shape, kernel_shape, padding, strides in test_cases:
+        # fmt: off
         test(lib, handle, "cpu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16)
         test(lib, handle, "cpu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32)
+        # fmt: on
     destroy_handle(lib, handle)
@@ -173,8 +183,10 @@ def test_cuda(lib, test_cases):
     device = DeviceEnum.DEVICE_CUDA
     handle = create_handle(lib, device)
     for x_shape, kernel_shape, padding, strides in test_cases:
+        # fmt: off
         test(lib, handle, "cuda", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16)
         test(lib, handle, "cuda", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32)
+        # fmt: on
     destroy_handle(lib, handle)
@@ -184,17 +196,21 @@ def test_bang(lib, test_cases):
     device = DeviceEnum.DEVICE_BANG
     handle = create_handle(lib, device)
     for x_shape, kernel_shape, padding, strides in test_cases:
+        # fmt: off
         test(lib, handle, "mlu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16)
         test(lib, handle, "mlu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32)
+        # fmt: on
     destroy_handle(lib, handle)

 if __name__ == "__main__":
     test_cases = [
+        # fmt: off
         # x_shape, kernel_shape, padding, strides
         ((1, 1, 10), (3,), (1,), (1,)),
         ((32, 3, 224, 224), (3, 3), (1, 1), (2, 2)),
         ((1, 1, 16, 16, 16), (5, 5, 5), (2, 2, 2), (2, 2, 2)),
+        # fmt: on
     ]
     args = get_args()
     lib = open_lib()

test/infiniop/causal_softmax.py
@@ -101,6 +101,7 @@ def test_bang(lib, test_cases):
         test(lib, handle, "mlu", x_shape, x_stride)
     destroy_handle(lib, handle)

+
 def test_ascend(lib, test_cases):
     import torch_npu
@@ -111,11 +112,12 @@ def test_ascend(lib, test_cases):
     destroy_handle(lib, handle)

+
 if __name__ == "__main__":
     test_cases = [
         # x_shape, x_stride
         ((32, 20, 512), None),
-        ((32, 20, 512), (20480, 512, 1)), # Ascend does not support non-contiguous layouts yet
+        ((32, 20, 512), (20480, 512, 1)),  # Ascend does not support non-contiguous layouts yet
     ]
     args = get_args()
     lib = open_lib()

test/infiniop/conv.py
@@ -41,17 +41,11 @@ infiniopConvDescriptor_t = POINTER(ConvDescriptor)
 def conv(x, w, stride, padding, dilation):
     match len(x.shape) - 2:
         case 1:
-            return F.conv1d(
-                x, w, stride=stride, padding=padding, dilation=dilation
-            )
+            return F.conv1d(x, w, stride=stride, padding=padding, dilation=dilation)
         case 2:
-            return F.conv2d(
-                x, w, stride=stride, padding=padding, dilation=dilation
-            )
+            return F.conv2d(x, w, stride=stride, padding=padding, dilation=dilation)
         case 3:
-            return F.conv3d(
-                x, w, stride=stride, padding=padding, dilation=dilation
-            )
+            return F.conv3d(x, w, stride=stride, padding=padding, dilation=dilation)
         case _:
             print("Error: Pytorch -> Unsupported tensor dimension")
             return None
@@ -66,11 +60,15 @@ def inferShape(
     dilations: List[int],
 ) -> Tuple[int, ...]:
     assert (
-        len(x_shape) == len(w_shape) == len(pads) + 2 == len(dilations) + 2 == len(strides) + 2
+        len(x_shape)
+        == len(w_shape)
+        == len(pads) + 2
+        == len(dilations) + 2
+        == len(strides) + 2
     ), "x and w should have the same length; pads, strides, and dilatinos should have the same length; the length of pads should be that of x - 2"
     output_dims = [
         math.floor(
-            (x_shape[i+2] + 2*pads[i] - dilations[i]*(w_shape[i+2] - 1) - 1)
+            (x_shape[i + 2] + 2 * pads[i] - dilations[i] * (w_shape[i + 2] - 1) - 1)
             / strides[i]
             + 1
         )
@@ -145,7 +143,9 @@ def test(
     check_error(
         lib.infiniopGetConvWorkspaceSize(descriptor, ctypes.byref(workspaceSize))
     )
-    workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(torch_device)
+    workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(
+        torch_device
+    )
     workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8))

     for i in range(NUM_PRERUN if PROFILE else 1):
@@ -177,7 +177,7 @@ def test(
         elapsed = (time.time() - start_time) / NUM_ITERATIONS
         print(f" lib time: {elapsed:6f}")

-    if (tensor_dtype == torch.float16) :
+    if tensor_dtype == torch.float16:
         assert torch.allclose(y, ans, atol=0, rtol=1e-2)
     else:
         assert torch.allclose(y, ans, atol=0, rtol=1e-3)
@@ -188,8 +188,10 @@ def test_cpu(lib, test_cases):
     device = DeviceEnum.DEVICE_CPU
     handle = create_handle(lib, device)
     for x_shape, w_shape, pads, strides, dilations, x_strides in test_cases:
+        # fmt: off
         test(lib, handle, "cpu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float16)
         test(lib, handle, "cpu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float32)
+        # fmt: on
     destroy_handle(lib, handle)
@@ -197,8 +199,10 @@ def test_cuda(lib, test_cases):
     device = DeviceEnum.DEVICE_CUDA
     handle = create_handle(lib, device)
     for x_shape, w_shape, pads, strides, dilations, x_strides in test_cases:
+        # fmt: off
         test(lib, handle, "cuda", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float16)
         test(lib, handle, "cuda", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float32)
+        # fmt: on
     destroy_handle(lib, handle)
@@ -208,8 +212,10 @@ def test_bang(lib, test_cases):
     device = DeviceEnum.DEVICE_BANG
     handle = create_handle(lib, device)
     for x_shape, w_shape, pads, strides, dilations, x_strides in test_cases:
+        # fmt: off
         test(lib, handle, "mlu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float16)
         test(lib, handle, "mlu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float32)
+        # fmt: on
     destroy_handle(lib, handle)

test/infiniop/expand.py
@@ -47,10 +47,10 @@ def test(
     lib,
     handle,
     torch_device,
-    y_shape, x_shape,
-    y_stride=None, x_stride=None,
+    y_shape,
+    x_shape,
+    y_stride=None,
+    x_stride=None,
     tensor_dtype=torch.float16,
 ):
     print(
@@ -109,8 +109,10 @@ def test_cpu(lib, test_cases):
     device = DeviceEnum.DEVICE_CPU
     handle = create_handle(lib, device)
     for y_shape, x_shape, y_stride, x_stride in test_cases:
+        # fmt: off
         test(lib, handle, "cpu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float16)
         test(lib, handle, "cpu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float32)
+        # fmt: on
     destroy_handle(lib, handle)
@@ -118,8 +120,10 @@ def test_cuda(lib, test_cases):
     device = DeviceEnum.DEVICE_CUDA
     handle = create_handle(lib, device)
     for y_shape, x_shape, y_stride, x_stride in test_cases:
+        # fmt: off
         test(lib, handle, "cuda", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float16)
         test(lib, handle, "cuda", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float32)
+        # fmt: on
     destroy_handle(lib, handle)
@@ -129,13 +133,16 @@ def test_bang(lib, test_cases):
     device = DeviceEnum.DEVICE_BANG
     handle = create_handle(lib, device)
     for y_shape, x_shape, y_stride, x_stride in test_cases:
+        # fmt: off
         test(lib, handle, "mlu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float16)
         test(lib, handle, "mlu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float32)
+        # fmt: on
     destroy_handle(lib, handle)

 if __name__ == "__main__":
     test_cases = [
+        # fmt: off
         # y_shape, x_shape, y_stride, x_stride
         ((), (), None, None),
         ((3, 3), (1,), None, None),
@@ -146,6 +153,7 @@ if __name__ == "__main__":
         ((2, 3, 4, 5), (5,), None, None),
         ((3, 2, 4, 5), (3, 2, 1, 1), None, None),
         ((32, 256, 112, 112), (32, 256, 112, 1), None, None),
+        # fmt: on
     ]
     args = get_args()
     lib = open_lib()

test/infiniop/gemm.py
@@ -27,6 +27,7 @@ PROFILE = False
 NUM_PRERUN = 10
 NUM_ITERATIONS = 1000

+
 class GEMMDescriptor(Structure):
     _fields_ = [("device", c_int32)]
@@ -34,10 +35,15 @@ class GEMMDescriptor(Structure):
 infiniopGEMMDescriptor_t = POINTER(GEMMDescriptor)

-def gemm(A, B, C=None, transA=False, transB=False, alpha=1.0, beta=0.0, dtype=torch.float32):
+def gemm(
+    A, B, C=None, transA=False, transB=False, alpha=1.0, beta=0.0, dtype=torch.float32
+):
     A = A.T if transA else A
     B = B.T if transB else B
-    result = alpha * torch.matmul(A if dtype != torch.float16 else A.to(torch.float32), B if dtype != torch.float16 else B.to(torch.float32)).to(dtype)
+    result = alpha * torch.matmul(
+        A if dtype != torch.float16 else A.to(torch.float32),
+        B if dtype != torch.float16 else B.to(torch.float32),
+    ).to(dtype)
     if C is not None:
         result += beta * C if dtype != torch.float16 else C.to(torch.float32)
     if PROFILE:
@@ -64,7 +70,7 @@ def test(
     dtype=torch.float16,
 ):
     print(
-        f"Testing GEMM on {torch_device} with transA: {transA} transB: {transB} "
+        f"Testing GEMM on {torch_device} with transA: {transA} transB: {transB} "
         f"a_shape: {a_shape} b_shape: {b_shape} c_shape: {c_shape} y_shape: {y_shape} "
         f"a_stride: {a_stride} b_stride: {b_stride} c_stride: {c_stride} y_stride: {y_stride} dtype: {dtype}"
     )
@@ -121,9 +127,7 @@ def test(
     workspace_size = ctypes.c_uint64(0)
     check_error(
-        lib.infiniopGetGEMMWorkspaceSize(
-            descriptor, ctypes.byref(workspace_size)
-        )
+        lib.infiniopGetGEMMWorkspaceSize(descriptor, ctypes.byref(workspace_size))
     )
     workspace = torch.zeros(int(workspace_size.value), dtype=torch.uint8).to(torch_device
@@ -182,8 +186,10 @@ def test_cpu(lib, test_cases):
         c_stride,
         y_stride,
     ) in test_cases:
+        # fmt: off
         test(lib, handle, "cpu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float16)
         test(lib, handle, "cpu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float32)
+        # fmt: on
     destroy_handle(lib, handle)
@@ -204,8 +210,10 @@ def test_cuda(lib, test_cases):
         c_stride,
         y_stride,
     ) in test_cases:
+        # fmt: off
         test(lib, handle, "cuda", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float16)
         test(lib, handle, "cuda", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float32)
+        # fmt: on
     destroy_handle(lib, handle)
@@ -229,9 +237,10 @@ def test_bang(lib, test_cases):
         c_stride,
         y_stride,
     ) in test_cases:
+        # fmt: off
         test(lib, handle, "mlu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float16)
         test(lib, handle, "mlu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float32)
+        # fmt: on
     destroy_handle(lib, handle)

test/infiniop/global_avg_pool.py
@@ -99,7 +99,12 @@ def test(
     for i in range(NUM_PRERUN if PROFILE else 1):
         check_error(
             lib.infiniopGlobalAvgPool(
-                descriptor, workspace_ptr, workspaceSize, y_tensor.data, x_tensor.data, None
+                descriptor,
+                workspace_ptr,
+                workspaceSize,
+                y_tensor.data,
+                x_tensor.data,
+                None,
             )
         )
     if PROFILE:

test/infiniop/libinfiniop/__init__.py
 import os
 import sys

-sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '.')))
-from .liboperators import open_lib, CTensor, infiniopHandle_t, infiniopTensorDescriptor_t
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".")))
+from .liboperators import (
+    open_lib,
+    CTensor,
+    infiniopHandle_t,
+    infiniopTensorDescriptor_t,
+)
 from .devices import *
 from .utils import *
 from .datatypes import *

test/infiniop/libinfiniop/datatypes.py
@@ -7,7 +7,7 @@ class InfiniDtype:
     I32 = 5
     I64 = 6
     U8 = 7
-    U16 = 8
+    U16 = 8
     U32 = 9
     U64 = 10
     F8 = 11

...
test/infiniop/libinfiniop/utils.py
View file @
7bd656b7
...
...
@@ -54,6 +54,7 @@ def create_workspace(size, torch_device):
if
size
==
0
:
return
None
import
torch
return
torch
.
zeros
(
size
=
(
size
,),
dtype
=
torch
.
uint8
,
device
=
torch_device
)
...
...
@@ -172,6 +173,7 @@ def get_args():
def
synchronize_device
(
torch_device
):
import
torch
if
torch_device
==
"cuda"
:
torch
.
cuda
.
synchronize
()
elif
torch_device
==
"npu"
:
...
...
@@ -197,13 +199,24 @@ def debug(actual, desired, atol=0, rtol=1e-2, equal_nan=False, verbose=True):
If True, the function will print detailed information about any discrepancies between the tensors.
"""
import
numpy
as
np
print_discrepancy
(
actual
,
desired
,
atol
,
rtol
,
verbose
)
np
.
testing
.
assert_allclose
(
actual
.
cpu
(),
desired
.
cpu
(),
rtol
,
atol
,
equal_nan
,
verbose
=
True
,
strict
=
True
)
np
.
testing
.
assert_allclose
(
actual
.
cpu
(),
desired
.
cpu
(),
rtol
,
atol
,
equal_nan
,
verbose
=
True
,
strict
=
True
)
def
debug_all
(
actual_vals
:
Sequence
,
desired_vals
:
Sequence
,
condition
:
str
,
atol
=
0
,
rtol
=
1e-2
,
equal_nan
=
False
,
verbose
=
True
):
def
debug_all
(
actual_vals
:
Sequence
,
desired_vals
:
Sequence
,
condition
:
str
,
atol
=
0
,
rtol
=
1e-2
,
equal_nan
=
False
,
verbose
=
True
,
):
"""
Debugging function to compare two sequences of values (actual and desired) pair by pair, results
Debugging function to compare two sequences of values (actual and desired) pair by pair, results
are linked by the given logical condition, and prints discrepancies
Arguments:
----------
...
...
@@ -223,7 +236,10 @@ def debug_all(actual_vals: Sequence, desired_vals: Sequence, condition: str, ato
- AssertionError: If the specified `condition` is not 'or' or 'and'.
"""
assert
len
(
actual_vals
)
==
len
(
desired_vals
),
"Invalid Length"
assert
condition
in
{
"or"
,
"and"
},
"Invalid condition: should be either 'or' or 'and'"
assert
condition
in
{
"or"
,
"and"
,
},
"Invalid condition: should be either 'or' or 'and'"
import
numpy
as
np
passed
=
False
if
condition
==
"or"
else
True
...
...
@@ -237,14 +253,22 @@ def debug_all(actual_vals: Sequence, desired_vals: Sequence, condition: str, ato
elif
condition
==
"and"
:
if
passed
and
len
(
indices
)
!=
0
:
passed
=
False
print
(
f
"
\033
[31mThe condition has not been satisfied: Condition #
{
index
+
1
}
\033
[0m"
)
np
.
testing
.
assert_allclose
(
actual
.
cpu
(),
desired
.
cpu
(),
rtol
,
atol
,
equal_nan
,
verbose
=
True
,
strict
=
True
)
print
(
f
"
\033
[31mThe condition has not been satisfied: Condition #
{
index
+
1
}
\033
[0m"
)
np
.
testing
.
assert_allclose
(
actual
.
cpu
(),
desired
.
cpu
(),
rtol
,
atol
,
equal_nan
,
verbose
=
True
,
strict
=
True
,
)
assert
passed
,
"
\033
[31mThe condition has not been satisfied
\033
[0m"
def
print_discrepancy
(
actual
,
expected
,
atol
=
0
,
rtol
=
1e-3
,
verbose
=
True
):
def
print_discrepancy
(
actual
,
expected
,
atol
=
0
,
rtol
=
1e-3
,
verbose
=
True
):
if
actual
.
shape
!=
expected
.
shape
:
raise
ValueError
(
"Tensors must have the same shape to compare."
)
...
...
@@ -273,7 +297,9 @@ def print_discrepancy(
for
idx
in
diff_indices
:
index_tuple
=
tuple
(
idx
.
tolist
())
actual_str
=
f
"
{
actual
[
index_tuple
]:
<
{
col_width
[
1
]
}
.
{
decimal_places
[
1
]
}
f
}
"
expected_str
=
f
"
{
expected
[
index_tuple
]:
<
{
col_width
[
2
]
}
.
{
decimal_places
[
2
]
}
f
}
"
expected_str
=
(
f
"
{
expected
[
index_tuple
]:
<
{
col_width
[
2
]
}
.
{
decimal_places
[
2
]
}
f
}
"
)
delta_str
=
f
"
{
delta
[
index_tuple
]:
<
{
col_width
[
3
]
}
.
{
decimal_places
[
3
]
}
f
}
"
print
(
f
" > Index:
{
str
(
index_tuple
):
<
{
col_width
[
0
]
}}
"
...
...
@@ -287,10 +313,18 @@ def print_discrepancy(
print
(
f
" - Desired dtype:
{
expected
.
dtype
}
"
)
print
(
f
" - Atol:
{
atol
}
"
)
print
(
f
" - Rtol:
{
rtol
}
"
)
print
(
f
" - Mismatched elements:
{
len
(
diff_indices
)
}
/
{
actual
.
numel
()
}
(
{
len
(
diff_indices
)
/
actual
.
numel
()
*
100
}
%)"
)
print
(
f
" - Min(actual) :
{
torch
.
min
(
actual
):
<
{
col_width
[
1
]
}}
| Max(actual) :
{
torch
.
max
(
actual
):
<
{
col_width
[
2
]
}}
"
)
print
(
f
" - Min(desired):
{
torch
.
min
(
expected
):
<
{
col_width
[
1
]
}}
| Max(desired):
{
torch
.
max
(
expected
):
<
{
col_width
[
2
]
}}
"
)
print
(
f
" - Min(delta) :
{
torch
.
min
(
delta
):
<
{
col_width
[
1
]
}}
| Max(delta) :
{
torch
.
max
(
delta
):
<
{
col_width
[
2
]
}}
"
)
print
(
f
" - Mismatched elements:
{
len
(
diff_indices
)
}
/
{
actual
.
numel
()
}
(
{
len
(
diff_indices
)
/
actual
.
numel
()
*
100
}
%)"
)
print
(
f
" - Min(actual) :
{
torch
.
min
(
actual
):
<
{
col_width
[
1
]
}}
| Max(actual) :
{
torch
.
max
(
actual
):
<
{
col_width
[
2
]
}}
"
)
print
(
f
" - Min(desired):
{
torch
.
min
(
expected
):
<
{
col_width
[
1
]
}}
| Max(desired):
{
torch
.
max
(
expected
):
<
{
col_width
[
2
]
}}
"
)
print
(
f
" - Min(delta) :
{
torch
.
min
(
delta
):
<
{
col_width
[
1
]
}}
| Max(delta) :
{
torch
.
max
(
delta
):
<
{
col_width
[
2
]
}}
"
)
print
(
"-"
*
total_width
+
"
\n
"
)
return
diff_indices
...
...
@@ -298,14 +332,17 @@ def print_discrepancy(
def
get_tolerance
(
tolerance_map
,
tensor_dtype
,
default_atol
=
0
,
default_rtol
=
1e-3
):
"""
Returns the atol and rtol for a given tensor data type in the tolerance_map.
Returns the atol and rtol for a given tensor data type in the tolerance_map.
If the given data type is not found, it returns the provided default tolerance values.
"""
return
tolerance_map
.
get
(
tensor_dtype
,
{
'atol'
:
default_atol
,
'rtol'
:
default_rtol
}).
values
()
return
tolerance_map
.
get
(
tensor_dtype
,
{
"atol"
:
default_atol
,
"rtol"
:
default_rtol
}
).
values
()
def
timed_op
(
func
,
num_iterations
,
device
):
import
time
""" Function for timing operations with synchronization. """
synchronize_device
(
device
)
start
=
time
.
time
()
...
...
@@ -318,7 +355,7 @@ def timed_op(func, num_iterations, device):
def
profile_operation
(
desc
,
func
,
torch_device
,
NUM_PRERUN
,
NUM_ITERATIONS
):
"""
Unified profiling workflow that is used to profile the execution time of a given function.
It first performs a number of warmup runs, then performs timed execution and
It first performs a number of warmup runs, then performs timed execution and
prints the average execution time.
Arguments:
...
...
@@ -328,11 +365,11 @@ def profile_operation(desc, func, torch_device, NUM_PRERUN, NUM_ITERATIONS):
- torch_device (str): The device on which the operation runs, provided for timed execution.
- NUM_PRERUN (int): The number of warmup runs.
- NUM_ITERATIONS (int): The number of timed execution iterations, used to calculate the average execution time.
"""
"""
# Warmup runs
for
_
in
range
(
NUM_PRERUN
):
func
()
# Timed execution
elapsed
=
timed_op
(
lambda
:
func
(),
NUM_ITERATIONS
,
torch_device
)
print
(
f
"
{
desc
}
time:
{
elapsed
*
1000
:
6
f
}
ms"
)
...
...
@@ -347,7 +384,7 @@ def test_operator(lib, device, test_func, test_cases, tensor_dtypes):
- lib (ctypes.CDLL): The library object containing the operator implementations.
- device (InfiniDeviceEnum): The device on which the operator should be tested. See device.py.
- test_func (function): The test function to be executed for each test case.
- test_cases (list of tuples): A list of test cases, where each test case is a tuple of parameters
- test_cases (list of tuples): A list of test cases, where each test case is a tuple of parameters
to be passed to `test_func`.
- tensor_dtypes (list): A list of tensor data types (e.g., `torch.float32`) to test.
"""
...
...
@@ -355,7 +392,13 @@ def test_operator(lib, device, test_func, test_cases, tensor_dtypes):
try
:
for
test_case
in
test_cases
:
for
tensor_dtype
in
tensor_dtypes
:
test_func
(
lib
,
handle
,
infiniDeviceEnum_str_map
[
device
],
*
test_case
,
tensor_dtype
)
test_func
(
lib
,
handle
,
infiniDeviceEnum_str_map
[
device
],
*
test_case
,
tensor_dtype
,
)
finally
:
destroy_handle
(
lib
,
handle
)
...
...
@@ -365,22 +408,26 @@ def get_test_devices(args):
Using the given parsed Namespace to determine the devices to be tested.
Argument:
- args: the parsed Namespace object.
- args: the parsed Namespace object.
Return:
- devices_to_test: the devices that will be tested. Default is CPU.
"""
devices_to_test
=
[]
if
args
.
cpu
:
devices_to_test
.
append
(
InfiniDeviceEnum
.
CPU
)
if
args
.
nvidia
:
devices_to_test
.
append
(
InfiniDeviceEnum
.
NVIDIA
)
if
args
.
cambricon
:
if
args
.
cpu
:
devices_to_test
.
append
(
InfiniDeviceEnum
.
CPU
)
if
args
.
nvidia
:
devices_to_test
.
append
(
InfiniDeviceEnum
.
NVIDIA
)
if
args
.
cambricon
:
import
torch_mlu
devices_to_test
.
append
(
InfiniDeviceEnum
.
CAMBRICON
)
if
args
.
ascend
:
if
args
.
ascend
:
import
torch
import
torch_npu
torch
.
npu
.
set_device
(
0
)
# Ascend NPU needs explicit device initialization
torch
.
npu
.
set_device
(
0
)
# Ascend NPU needs explicit device initialization
devices_to_test
.
append
(
InfiniDeviceEnum
.
ASCEND
)
if
not
devices_to_test
:
devices_to_test
=
[
InfiniDeviceEnum
.
CPU
]
...
...
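The `get_test_devices` hunk above appends one device enum per truthy flag on the parsed `args` namespace. The flag spellings below (`--cpu`, `--nvidia`, `--cambricon`, `--ascend`) are an assumption inferred from the `args.cpu` / `args.nvidia` / `args.cambricon` / `args.ascend` attributes in the diff; the real `get_args` lives in an unchanged part of utils.py. A minimal sketch of a matching parser:

import argparse

def get_args():
    # Hypothetical parser matching the attributes used by get_test_devices();
    # each flag simply toggles whether that backend is exercised.
    parser = argparse.ArgumentParser(description="Select devices to test")
    parser.add_argument("--cpu", action="store_true", help="test on CPU")
    parser.add_argument("--nvidia", action="store_true", help="test on NVIDIA GPU")
    parser.add_argument("--cambricon", action="store_true", help="test on Cambricon MLU")
    parser.add_argument("--ascend", action="store_true", help="test on Ascend NPU")
    return parser.parse_args()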
test/infiniop/matmul.py
@@ -2,9 +2,19 @@ import torch
 import ctypes
 from ctypes import POINTER, Structure, c_int32, c_size_t, c_uint64, c_void_p, c_float
 from libinfiniop import (
-    infiniopHandle_t, infiniopTensorDescriptor_t, open_lib, to_tensor, get_test_devices,
-    check_error, rearrange_if_needed, create_workspace, test_operator, get_args,
-    debug, get_tolerance, profile_operation,
+    infiniopHandle_t,
+    infiniopTensorDescriptor_t,
+    open_lib,
+    to_tensor,
+    get_test_devices,
+    check_error,
+    rearrange_if_needed,
+    create_workspace,
+    test_operator,
+    get_args,
+    debug,
+    get_tolerance,
+    profile_operation,
 )

 # ==============================================================================
@@ -21,8 +31,8 @@ _TEST_CASES = [
     (1.0, 0.0, (1, 2048), (2048, 2048), (1, 2048), (4096, 1), (4096, 1), (4096, 1)),
-    (1.0, 1.0, (6, 2048), (2048, 2560), (6, 2560), (2048, 1), (1, 2048), (2560, 1)),
-    (1.0/8.0, 0.0, (4, 8*6, 64), (4, 64, 6), (4, 8*6, 6), None, None, None),
-    (1.0/8.0, 0.0, (4, 8*6, 64), (4, 64, 6), (4, 8*6, 6), None, None, None),
+    (1.0, 1.0, (6, 2048), (2048, 2560), (6, 2560), (2048, 1), (1, 2048), (2560, 1)),
+    (1.0 / 8.0, 0.0, (4, 8 * 6, 64), (4, 64, 6), (4, 8 * 6, 6), None, None, None),
+    (1.0 / 8.0, 0.0, (4, 8 * 6, 64), (4, 64, 6), (4, 8 * 6, 6), None, None, None),
 ]

 # Data types used for testing
@@ -30,8 +40,8 @@ _TENSOR_DTYPES = [torch.float16, torch.float32]
 # Tolerance map for different data types
 _TOLERANCE_MAP = {
-    torch.float16: {'atol': 0, 'rtol': 1e-2},
-    torch.float32: {'atol': 0, 'rtol': 1e-3},
+    torch.float16: {"atol": 0, "rtol": 1e-2},
+    torch.float32: {"atol": 0, "rtol": 1e-3},
 }

 DEBUG = False
@@ -39,6 +49,7 @@ PROFILE = False
 NUM_PRERUN = 10
 NUM_ITERATIONS = 1000

+
 # ==============================================================================
 #  Definitions
 # ==============================================================================
@@ -48,6 +59,7 @@ class MatmulDescriptor(Structure):
 infiniopMatmulDescriptor_t = POINTER(MatmulDescriptor)

+
 # PyTorch implementation for matrix multiplication
 def matmul(_c, beta, _a, _b, alpha):
     a, b, c = _a.clone(), _b.clone(), _c.clone()
@@ -55,6 +67,7 @@ def matmul(_c, beta, _a, _b, alpha):
     fp32_result = torch.matmul(a.to(torch.float32), b.to(torch.float32))
     return alpha * fp32_result.to(result_dtype) + beta * c

+
 # The argument list should be (lib, handle, torch_device, <param list>, dtype)
 # The <param list> should keep the same order as the one specified in _TEST_CASES
 def test(
@@ -85,7 +98,10 @@ def test(
     # Compute the PyTorch reference result
     ans = matmul(c, beta, a, b, alpha)

-    a, b, c = [rearrange_if_needed(tensor, stride) for tensor, stride in zip([a, b, c], [a_stride, b_stride, c_stride])]
+    a, b, c = [
+        rearrange_if_needed(tensor, stride)
+        for tensor, stride in zip([a, b, c], [a_stride, b_stride, c_stride])
+    ]

     a_tensor, b_tensor, c_tensor = [to_tensor(tensor, lib) for tensor in [a, b, c]]

     descriptor = infiniopMatmulDescriptor_t()
@@ -95,7 +111,7 @@ def test(
             ctypes.byref(descriptor),
             c_tensor.descriptor,
             a_tensor.descriptor,
-            b_tensor.descriptor
+            b_tensor.descriptor,
         )
     )
@@ -105,22 +121,27 @@ def test(
     # Get workspace size and create workspace
     workspace_size = c_uint64(0)
-    check_error(lib.infiniopGetMatmulWorkspaceSize(descriptor, ctypes.byref(workspace_size)))
+    check_error(
+        lib.infiniopGetMatmulWorkspaceSize(descriptor, ctypes.byref(workspace_size))
+    )
     workspace = create_workspace(workspace_size.value, a.device)

     # Execute infiniop matmul operator
     def lib_matmul():
-        check_error(lib.infiniopMatmul(
-            descriptor,
-            workspace.data_ptr() if workspace is not None else None,
-            workspace_size.value,
-            c_tensor.data,
-            a_tensor.data,
-            b_tensor.data,
-            alpha,
-            beta,
-            None,
-        ))
+        check_error(
+            lib.infiniopMatmul(
+                descriptor,
+                workspace.data_ptr() if workspace is not None else None,
+                workspace_size.value,
+                c_tensor.data,
+                a_tensor.data,
+                b_tensor.data,
+                alpha,
+                beta,
+                None,
+            )
+        )

     lib_matmul()

     # Validate results
@@ -131,9 +152,10 @@ def test(
     # Profiling workflow
     if PROFILE:
+        # fmt: off
         profile_operation("PyTorch", lambda: matmul(c, beta, a, b, alpha), torch_device, NUM_PRERUN, NUM_ITERATIONS)
         profile_operation(" lib", lambda: lib_matmul(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
+        # fmt: on
     check_error(lib.infiniopDestroyMatmulDescriptor(descriptor))
@@ -150,7 +172,7 @@ if __name__ == "__main__":
         POINTER(infiniopMatmulDescriptor_t),
         infiniopTensorDescriptor_t,
         infiniopTensorDescriptor_t,
-        infiniopTensorDescriptor_t
+        infiniopTensorDescriptor_t,
     ]
     lib.infiniopGetMatmulWorkspaceSize.restype = c_int32

test/infiniop/max_pool.py
@@ -35,7 +35,7 @@ class MaxPoolDescriptor(Structure):
 infiniopMaxPoolDescriptor_t = POINTER(MaxPoolDescriptor)

-def pool(x, k, padding, stride, dilation = 1):
+def pool(x, k, padding, stride, dilation=1):
     pooling_layers = {
         1: torch.nn.MaxPool1d,
         2: torch.nn.MaxPool2d,
@@ -66,18 +66,20 @@ def inferShape(x_shape, kernel_shape, padding, strides):
     return x_shape[:2] + tuple(output_shape)

 # convert a python tuple to a ctype void pointer
 def tuple_to_void_p(py_tuple: Tuple):
     array = ctypes.c_int64 * len(py_tuple)
     data_array = array(*py_tuple)
     return ctypes.cast(data_array, ctypes.c_void_p)

 def test(
     lib,
     handle,
     torch_device,
-    x_shape, k_shape,
+    x_shape,
+    k_shape,
     padding,
     strides,
     tensor_dtype=torch.float16,
@@ -87,7 +89,9 @@ def test(
     )
     x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device)
-    y = torch.rand(inferShape(x_shape, k_shape, padding, strides), dtype=tensor_dtype).to(torch_device)
+    y = torch.rand(
+        inferShape(x_shape, k_shape, padding, strides), dtype=tensor_dtype
+    ).to(torch_device)

     for i in range(NUM_PRERUN if PROFILE else 1):
         ans = pool(x, k_shape, padding, strides)
@@ -123,7 +127,9 @@ def test(
     check_error(
         lib.infiniopGetMaxPoolWorkspaceSize(descriptor, ctypes.byref(workspaceSize))
     )
-    workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(torch_device)
+    workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(
+        torch_device
+    )
     workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8))

     for i in range(NUM_PRERUN if PROFILE else 1):
@@ -161,8 +167,10 @@ def test_cpu(lib, test_cases):
     device = DeviceEnum.DEVICE_CPU
     handle = create_handle(lib, device)
     for x_shape, kernel_shape, padding, strides in test_cases:
+        # fmt: off
         test(lib, handle, "cpu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16)
         test(lib, handle, "cpu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32)
+        # fmt: on
     destroy_handle(lib, handle)
@@ -170,8 +178,10 @@ def test_cuda(lib, test_cases):
     device = DeviceEnum.DEVICE_CUDA
     handle = create_handle(lib, device)
     for x_shape, kernel_shape, padding, strides in test_cases:
+        # fmt: off
         test(lib, handle, "cuda", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16)
         test(lib, handle, "cuda", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32)
+        # fmt: on
     destroy_handle(lib, handle)
@@ -181,8 +191,10 @@ def test_bang(lib, test_cases):
     device = DeviceEnum.DEVICE_BANG
     handle = create_handle(lib, device)
     for x_shape, kernel_shape, padding, strides in test_cases:
+        # fmt: off
         test(lib, handle, "mlu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16)
         test(lib, handle, "mlu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32)
+        # fmt: on
     destroy_handle(lib, handle)

test/infiniop/random_sample.py
@@ -30,13 +30,13 @@ infiniopRandomSampleDescriptor_t = POINTER(RandomSampleDescriptor)
 def random_sample(data, random_val, topp, topk, voc, temperature, torch_device):
-    indices = torch.zeros([topk], dtype=torch.int64)
+    indices = torch.zeros([topk], dtype=torch.int64)
     dataNp = data.clone().detach()
     sorted_indices = torch.arange(voc)

     for i in range(topk):
         for j in range(i + 1, voc):
-            if (dataNp[i] < dataNp[j]) :
+            if dataNp[i] < dataNp[j]:
                 tmp = dataNp[i].clone().detach()
                 dataNp[i] = dataNp[j].clone().detach()
                 dataNp[j] = tmp
@@ -44,48 +44,60 @@ def random_sample(data, random_val, topp, topk, voc, temperature, torch_device):
                 tmpInd = sorted_indices[i].clone().detach()
                 sorted_indices[i] = sorted_indices[j].clone().detach()
                 sorted_indices[j] = tmpInd

-    #sorted_indices = torch.argsort(dataNp, descending=True)
-    indices = sorted_indices[:topk]
+    # sorted_indices = torch.argsort(dataNp, descending=True)
+    indices = sorted_indices[:topk]
     dataNp = dataNp[sorted_indices]

     globalM = dataNp[0]
     dataNp = (dataNp - globalM) / temperature
-    dataNp = torch.softmax(dataNp.float(), dim = 0)
+    dataNp = torch.softmax(dataNp.float(), dim=0)
     sum_s = 0
     for end in range(topk):
         sum_s += dataNp[end]
-        if (sum_s >= topp) :
+        if sum_s >= topp:
             break
-    if (end < topk - 1) :
+    if end < topk - 1:
         end += 1
     else:
         end = topk

     sum_s = 0
     for i in range(end):
         sum_s += dataNp[i]
     random_val *= sum_s

     sum_s = 0
     for i in range(end):
         sum_s += dataNp[i]
-        if (random_val < sum_s) :
+        if random_val < sum_s:
             return indices[i]

 def random_sample_0(data):
     return torch.argmax(data)

-def test(lib, handle, torch_device, voc, random_val, topp, topk, temperature, x_dtype=torch.float16):
-    print(f"Testing RandomSample on {torch_device} with voc: {voc} dtype: {x_dtype}")
+def test(
+    lib,
+    handle,
+    torch_device,
+    voc,
+    random_val,
+    topp,
+    topk,
+    temperature,
+    x_dtype=torch.float16,
+):
+    print(f"Testing RandomSample on {torch_device} with voc: {voc} dtype: {x_dtype}")

     data = torch.arange(voc).float() * 0.0001
     _perm = torch.randperm(voc)
     data = data[_perm].to(x_dtype).to(torch_device)
-    if (topp > 0 and topk > 1):
-        ans = random_sample(data.to("cpu"), random_val, topp, topk, voc, temperature, "cpu")
+    if topp > 0 and topk > 1:
+        ans = random_sample(
+            data.to("cpu"), random_val, topp, topk, voc, temperature, "cpu"
+        )
     else:
         ans = random_sample_0(data)
     indices = torch.zeros([1], dtype=torch.int64).to(torch_device)
@@ -96,7 +108,10 @@ def test(lib, handle, torch_device, voc, random_val, topp, topk, temperature, x_
     descriptor = infiniopRandomSampleDescriptor_t()
     check_error(
         lib.infiniopCreateRandomSampleDescriptor(
-            handle, ctypes.byref(descriptor), indices_tensor.descriptor, x_tensor.descriptor
+            handle,
+            ctypes.byref(descriptor),
+            indices_tensor.descriptor,
+            x_tensor.descriptor,
         )
     )
@@ -110,7 +125,7 @@ def test(lib, handle, torch_device, voc, random_val, topp, topk, temperature, x_
             descriptor, ctypes.byref(workspace_size)
         )
     )
-    workspace = create_workspace(workspace_size.value, torch_device)
+    workspace = create_workspace(workspace_size.value, torch_device)
     check_error(
         lib.infiniopRandomSample(
             descriptor,
@@ -131,10 +146,11 @@ def test(lib, handle, torch_device, voc, random_val, topp, topk, temperature, x_
     assert indices[0].type(ans.dtype) == ans or data[ans] == data[indices[0]]
     check_error(lib.infiniopDestroyRandomSampleDescriptor(descriptor))

+
 def test_cpu(lib, test_cases):
     device = DeviceEnum.DEVICE_CPU
     handle = create_handle(lib, device)
-    for (voc, random_val, topp, topk, temperature) in test_cases:
+    for voc, random_val, topp, topk, temperature in test_cases:
         test(lib, handle, "cpu", voc, random_val, topp, topk, temperature)
     destroy_handle(lib, handle)
@@ -142,7 +158,7 @@ def test_cpu(lib, test_cases):
 def test_cuda(lib, test_cases):
     device = DeviceEnum.DEVICE_CUDA
     handle = create_handle(lib, device)
-    for (voc, random_val, topp, topk, temperature) in test_cases:
+    for voc, random_val, topp, topk, temperature in test_cases:
         test(lib, handle, "cuda", voc, random_val, topp, topk, temperature)
     destroy_handle(lib, handle)
@@ -152,16 +168,17 @@ def test_bang(lib, test_cases):
     device = DeviceEnum.DEVICE_BANG
     handle = create_handle(lib, device)
-    for (voc, random_val, topp, topk, temperature) in test_cases:
+    for voc, random_val, topp, topk, temperature in test_cases:
         test(lib, handle, "mlu", voc, random_val, topp, topk, temperature)
     destroy_handle(lib, handle)

+
 def test_ascend(lib, test_cases):
     import torch_npu

     device = DeviceEnum.DEVICE_ASCEND
     handle = create_handle(lib, device)
-    for (voc, random_val, topp, topk, temperature) in test_cases:
+    for voc, random_val, topp, topk, temperature in test_cases:
         test(lib, handle, "npu", voc, random_val, topp, topk, temperature)
     destroy_handle(lib, handle)
@@ -180,7 +197,7 @@ if __name__ == "__main__":
         (32000, 0.08, 1.0, 25, 1.0),
         # (119696, 0.01, 1.0, 100, 1.0),
     ]
     args = get_args()
-    lib = open_lib()
+    lib = open_lib()
     lib.infiniopCreateRandomSampleDescriptor.restype = c_int32

test/infiniop/rearrange.py
@@ -61,9 +61,7 @@ def test(
     x_tensor.descriptor.contents.invalidate()
     y_tensor.descriptor.contents.invalidate()

-    check_error(
-        lib.infiniopRearrange(descriptor, y_tensor.data, x_tensor.data, None)
-    )
+    check_error(lib.infiniopRearrange(descriptor, y_tensor.data, x_tensor.data, None))

     assert torch.allclose(x, y, atol=0, rtol=1e-3)
     check_error(lib.infiniopDestroyRearrangeDescriptor(descriptor))
@@ -87,8 +85,10 @@ def test_cuda(lib, test_cases):
         test(lib, handle, "cuda", x_shape, x_stride, y_shape, y_stride)
     destroy_handle(lib, handle)

+
 def test_bang(lib, test_cases):
     import torch_mlu

+
     device = DeviceEnum.DEVICE_BANG
     handle = create_handle(lib, device)
     for test_case in test_cases:
@@ -97,6 +97,7 @@ def test_bang(lib, test_cases):
         test(lib, handle, "mlu", x_shape, x_stride, y_shape, y_stride)
     destroy_handle(lib, handle)

+
 def test_ascend(lib, test_cases):
     import torch_npu
@@ -106,7 +107,8 @@ def test_ascend(lib, test_cases):
         x_shape, x_stride = test_case[0]
         y_shape, y_stride = test_case[1]
         test(lib, handle, "npu", x_shape, x_stride, y_shape, y_stride)
-    destroy_handle(lib, handle)
+
+    destroy_handle(lib, handle)

 if __name__ == "__main__":
     args = get_args()
@@ -119,7 +121,7 @@ if __name__ == "__main__":
         (((32, 1, 64), (64, 2560, 1)), ((32, 1, 64), (64, 64, 1))),
         (((4, 1, 64), (64, 2560, 1)), ((4, 1, 64), (64, 11264, 1))),
         (((64,), (1,)), ((64,), (1,))),
-        ]
+    ]
     lib = open_lib()
     lib.infiniopCreateRearrangeDescriptor.restype = c_int32
     lib.infiniopCreateRearrangeDescriptor.argtypes = [

test/infiniop/relu.py
@@ -52,7 +52,7 @@ def test(
     lib,
     handle,
     torch_device,
-    tensor_shape,
+    tensor_shape,
     tensor_dtype=torch.float16,
     inplace=Inplace.OUT_OF_PLACE,
 ):
@@ -61,7 +61,11 @@ def test(
     )
     x = torch.rand(tensor_shape, dtype=tensor_dtype).to(torch_device) * 2 - 1
-    y = torch.rand(tensor_shape, dtype=tensor_dtype).to(torch_device) if inplace == Inplace.OUT_OF_PLACE else x
+    y = (
+        torch.rand(tensor_shape, dtype=tensor_dtype).to(torch_device)
+        if inplace == Inplace.OUT_OF_PLACE
+        else x
+    )

     for i in range(NUM_PRERUN if PROFILE else 1):
         ans = relu(x)
@@ -108,17 +112,22 @@ def test_cpu(lib, test_cases):
     device = DeviceEnum.DEVICE_CPU
     handle = create_handle(lib, device)
     for tensor_shape, inplace in test_cases:
+        # fmt: off
         test(lib, handle, "cpu", tensor_shape, tensor_dtype=torch.float16, inplace=inplace)
         test(lib, handle, "cpu", tensor_shape, tensor_dtype=torch.float32, inplace=inplace)
+        # fmt: on
     destroy_handle(lib, handle)

+
 def test_cuda(lib, test_cases):
     device = DeviceEnum.DEVICE_CUDA
     handle = create_handle(lib, device)
     for tensor_shape, inplace in test_cases:
+        # fmt: off
         test(lib, handle, "cuda", tensor_shape, tensor_dtype=torch.float16, inplace=inplace)
         test(lib, handle, "cuda", tensor_shape, tensor_dtype=torch.float32, inplace=inplace)
+        # fmt: on
     destroy_handle(lib, handle)
@@ -128,8 +137,10 @@ def test_bang(lib, test_cases):
     device = DeviceEnum.DEVICE_BANG
     handle = create_handle(lib, device)
     for tensor_shape, inplace in test_cases:
+        # fmt: off
         test(lib, handle, "mlu", tensor_shape, tensor_dtype=torch.float16, inplace=inplace)
         test(lib, handle, "mlu", tensor_shape, tensor_dtype=torch.float32, inplace=inplace)
+        # fmt: on
     destroy_handle(lib, handle)

test/infiniop/rms_norm.py
@@ -20,12 +20,14 @@ from operatorspy import (
 from operatorspy.tests.test_utils import get_args
 import torch

+
 class RMSNormDescriptor(Structure):
     _fields_ = [("device", c_int32)]

 infiniopRMSNormDescriptor_t = POINTER(RMSNormDescriptor)

+
 def rms_norm(x, w, eps):
     input_dtype = x.dtype
     hidden_states = x.to(torch.float32)
@@ -34,9 +36,20 @@ def rms_norm(x, w, eps):
     return w * hidden_states.to(input_dtype)

-def test(lib, handle, torch_device, y_shape, x_shape, w_shape, dtype=torch.float16, w_dtype=torch.float16):
-    print(f"Testing RMS_Norm on {torch_device} with y_shape: {y_shape} x_shape: {x_shape} w_shape: {w_shape}"
-          f" dtype: {dtype} w_dtype: {w_dtype}")
+
+def test(
+    lib,
+    handle,
+    torch_device,
+    y_shape,
+    x_shape,
+    w_shape,
+    dtype=torch.float16,
+    w_dtype=torch.float16,
+):
+    print(
+        f"Testing RMS_Norm on {torch_device} with y_shape: {y_shape} x_shape: {x_shape} w_shape: {w_shape}"
+        f" dtype: {dtype} w_dtype: {w_dtype}"
+    )
     y = torch.zeros(y_shape, dtype=dtype).to(torch_device)
     x = torch.rand(x_shape, dtype=dtype).to(torch_device)
@@ -50,12 +63,16 @@ def test(lib, handle, torch_device, y_shape, x_shape, w_shape, dtype=torch.float
     w_tensor = to_tensor(w, lib)
     descriptor = infiniopRMSNormDescriptor_t()

-    w_dataType = 0 if w_dtype == torch.float16 else 1
+    w_dataType = 0 if w_dtype == torch.float16 else 1

     check_error(
         lib.infiniopCreateRMSNormDescriptor(
-            handle, ctypes.byref(descriptor), y_tensor.descriptor, x_tensor.descriptor, w_tensor.descriptor, eps
+            handle,
+            ctypes.byref(descriptor),
+            y_tensor.descriptor,
+            x_tensor.descriptor,
+            w_tensor.descriptor,
+            eps,
         )
     )
@@ -66,9 +83,7 @@ def test(lib, handle, torch_device, y_shape, x_shape, w_shape, dtype=torch.float
     workspace_size = c_uint64(0)
     check_error(
-        lib.infiniopGetRMSNormWorkspaceSize(
-            descriptor, ctypes.byref(workspace_size)
-        )
+        lib.infiniopGetRMSNormWorkspaceSize(descriptor, ctypes.byref(workspace_size))
     )
     workspace = create_workspace(workspace_size.value, y.device)
     check_error(
@@ -86,37 +101,44 @@ def test(lib, handle, torch_device, y_shape, x_shape, w_shape, dtype=torch.float
     assert torch.allclose(y.to(dtype), ans.to(dtype), atol=1e-3, rtol=1e-3)
     check_error(lib.infiniopDestroyRMSNormDescriptor(descriptor))

+
 def test_cpu(lib, test_cases):
     device = DeviceEnum.DEVICE_CPU
     handle = create_handle(lib, device)
-    for (y_shape, x_shape, w_shape, dtype, w_dtype) in test_cases:
+    for y_shape, x_shape, w_shape, dtype, w_dtype in test_cases:
         test(lib, handle, "cpu", y_shape, x_shape, w_shape, dtype, w_dtype)
     destroy_handle(lib, handle)

+
 def test_cuda(lib, test_cases):
     device = DeviceEnum.DEVICE_CUDA
     handle = create_handle(lib, device)
-    for (y_shape, x_shape, w_shape, dtype, w_dtype) in test_cases:
+    for y_shape, x_shape, w_shape, dtype, w_dtype in test_cases:
         test(lib, handle, "cuda", y_shape, x_shape, w_shape, dtype, w_dtype)
     destroy_handle(lib, handle)

+
 def test_bang(lib, test_cases):
     import torch_mlu

     device = DeviceEnum.DEVICE_BANG
     handle = create_handle(lib, device)
-    for (y_shape, x_shape, w_shape, dtype, w_dtype) in test_cases:
+    for y_shape, x_shape, w_shape, dtype, w_dtype in test_cases:
         test(lib, handle, "mlu", y_shape, x_shape, w_shape, dtype, w_dtype)
     destroy_handle(lib, handle)

+
 def test_ascend(lib, test_cases):
     import torch_npu

     device = DeviceEnum.DEVICE_ASCEND
     handle = create_handle(lib, device)
-    for (y_shape, x_shape, w_shape, dtype, w_dtype) in test_cases:
+    for y_shape, x_shape, w_shape, dtype, w_dtype in test_cases:
         test(lib, handle, "npu", y_shape, x_shape, w_shape, dtype, w_dtype)
     destroy_handle(lib, handle)

 if __name__ == "__main__":
     test_cases = [
         # y_shape, x_shape, w_shape, dtype, w_dtype

test/infiniop/rotary_embedding.py
@@ -45,12 +45,13 @@ def rotary_embedding(t, pos, theta, torch_device):
     )
     freqs = torch.outer(pos, freqs)
     freqs_cis = torch.polar(torch.ones_like(freqs), freqs)
     t_ = torch.view_as_complex(t.reshape(*t.shape[:-1], -1, 2))
     freqs_cis = reshape_for_broadcast(freqs_cis, t_)
     t_out = torch.view_as_real(t_ * freqs_cis).flatten(2).to(t.dtype)
     return t_out

+
 def sin_cos_table(max_seq_len, dim, torch_device, theta):
     pos = torch.arange(0, max_seq_len, dtype=torch.float32, device=torch.device(torch_device
@@ -73,12 +74,12 @@ def test(lib, handle, torch_device, shape, strides=None, dtype=torch.float16):
     if strides is not None:
         t = rearrange_tensor(t, strides)
     posTmp = torch.arange(0, t.shape[0])
-    pos = torch.zeros(2 * posTmp.shape[0], dtype=torch.int32)
+    pos = torch.zeros(2 * posTmp.shape[0], dtype=torch.int32)
     for i in range(posTmp.shape[0]):
         pos[2 * i] = posTmp[i]
         pos[2 * i + 1] = 0
     theta = 1e4
-    if torch_device == 'mlu' or torch_device == 'npu':
+    if torch_device == "mlu" or torch_device == "npu":
         ans = rotary_embedding(t, posTmp, theta, "cpu").to(torch_device)
         pos = pos.to(torch_device)
         t = t.to(torch_device)
@@ -97,7 +98,7 @@ def test(lib, handle, torch_device, shape, strides=None, dtype=torch.float16):
     cos_table_tensor = to_tensor(cos_table, lib)

     if torch_device == "npu":
-        torch.npu.synchronize()
+        torch.npu.synchronize()
     check_error(
         lib.infiniopCreateRoPEDescriptor(
@@ -156,6 +157,7 @@ def test_cuda(lib, test_cases):
 def test_bang(lib, test_cases):
     import torch_mlu
+
     device = DeviceEnum.DEVICE_BANG
     handle = create_handle(lib, device)
     for shape, strides, dtype in test_cases:
@@ -163,7 +165,7 @@ def test_bang(lib, test_cases):
     destroy_handle(lib, handle)

-def test_ascend(lib, test_cases) :
+def test_ascend(lib, test_cases):
     import torch_npu

     device = DeviceEnum.DEVICE_ASCEND
@@ -172,6 +174,7 @@ def test_ascend(lib, test_cases) :
         test(lib, handle, "npu", shape, strides, dtype)
     destroy_handle(lib, handle)

+
 if __name__ == "__main__":
     test_cases = [
         ((1, 32, 128), None, torch.float16),
@@ -180,7 +183,6 @@ if __name__ == "__main__":
         # Related to the internal implementation of the GatherMask interface; 48, 64, and 128 are currently all supported
         ((4, 1, 32), None, torch.float16),
-        ((1, 32, 128), None, torch.float16),
         ((3, 32, 128), (8000, 200, 1), torch.float16),
     ]
     args = get_args()

test/infiniop/swiglu.py
@@ -29,9 +29,10 @@ infiniopSwiGLUDescriptor_t = POINTER(SwiGLUDescriptor)

+
 def swiglu(a, b):
     return a * b / (1 + torch.exp(-b.float()).to(b.dtype))

 def test_out_of_place(
     lib,
     handle,
@@ -223,6 +224,7 @@ def test_cuda(lib, test_cases):
 def test_bang(lib, test_cases):
     import torch_mlu
+
     device = DeviceEnum.DEVICE_BANG
     handle = create_handle(lib, device)
@@ -238,17 +240,30 @@ def test_bang(lib, test_cases):
 def test_ascend(lib, test_cases):
     import torch_npu

     device = DeviceEnum.DEVICE_ASCEND
     handle = create_handle(lib, device)

     for shape, a_stride, b_stride, c_stride, dtype in test_cases:
         test_out_of_place(
-            lib, handle, "npu", shape, a_stride, b_stride, c_stride, dtype, torch.npu.synchronize
+            lib,
+            handle,
+            "npu",
+            shape,
+            a_stride,
+            b_stride,
+            c_stride,
+            dtype,
+            torch.npu.synchronize,
         )
-        test_in_place1(lib, handle, "npu", shape, a_stride, b_stride, dtype, torch.npu.synchronize)
-        test_in_place2(lib, handle, "npu", shape, a_stride, b_stride, dtype, torch.npu.synchronize)
-    destroy_handle(lib, handle)
+        test_in_place1(
+            lib, handle, "npu", shape, a_stride, b_stride, dtype, torch.npu.synchronize
+        )
+        test_in_place2(
+            lib, handle, "npu", shape, a_stride, b_stride, dtype, torch.npu.synchronize
+        )
+
+    destroy_handle(lib, handle)

 if __name__ == "__main__":